#!/usr/bin/env python3
"""Analysis of EPISODE false positives to identify problematic patterns.

Reads detection audits (``*.audit.jsonl``) and ground-truth annotations,
lists the EPISODE detections that are not annotated (false positives),
prints a pattern breakdown, and saves a JSON summary plus recommendations.
"""

import json
import re
from collections import Counter
from pathlib import Path

# ICD-10 ("CIM-10") codes start with an uppercase letter followed by two
# digits (e.g. E11, Z95). Compiled once instead of per-value.
_CIM10_RE = re.compile(r'^[A-Z]\d{2}')


def _collect_false_positives(audit_dir: Path, annot_dir: Path) -> list:
    """Return EPISODE detections present in audits but absent from annotations.

    Each returned item is ``{'document': <doc name>, 'value': <detected text>}``.
    Documents without an annotation file contribute all their detections.
    """
    false_positives = []
    for audit_file in audit_dir.glob("*.audit.jsonl"):
        # "X.audit.jsonl" -> stem "X.audit" -> document name "X"
        doc_name = audit_file.stem.replace('.audit', '')

        # EPISODE detections for this document (one JSON object per line).
        detections = []
        with open(audit_file, 'r', encoding='utf-8') as f:
            for line in f:
                hit = json.loads(line)
                if hit['kind'] == 'EPISODE':
                    detections.append(hit['original'])

        # Ground-truth EPISODE texts; a set gives O(1) membership tests.
        annotations = set()
        annot_file = annot_dir / f"{doc_name}.json"
        if annot_file.exists():
            with open(annot_file, 'r', encoding='utf-8') as f:
                annot_data = json.load(f)
            annotations = {a['text']
                           for a in annot_data.get('annotations', [])
                           if a['label'] == 'EPISODE'}

        false_positives.extend(
            {'document': doc_name, 'value': det}
            for det in detections
            if det not in annotations
        )
    return false_positives


def _classify_patterns(values: list) -> dict:
    """Bucket detected values into heuristic pattern families.

    Buckets may overlap (a short ICD-10 code appears in both lists).
    Returns a dict mapping pattern name to the matching values.
    """
    return {
        'cim10_codes': [v for v in values if _CIM10_RE.match(v)],
        'pure_numbers': [v for v in values if v.isdigit()],
        'codes_with_dash': [v for v in values if '-' in v],
        'short_codes': [v for v in values if len(v) <= 4],
        'long_codes': [v for v in values if len(v) >= 10],
    }


def analyze_episode_fp():
    """Analyze EPISODE false positives: report, save JSON summary, recommend fixes."""
    audit_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    annot_dir = Path("tests/ground_truth/annotations")

    episode_detections = _collect_false_positives(audit_dir, annot_dir)

    print("=" * 80)
    print(f"ANALYSE DES {len(episode_detections)} FAUX POSITIFS EPISODE")
    print("=" * 80)

    if not episode_detections:
        print("\n✅ Aucun faux positif EPISODE trouvé!")
        return

    # Most frequent false-positive values.
    values = [fp['value'] for fp in episode_detections]
    value_counts = Counter(values)
    print("\n📊 Top 20 valeurs les plus fréquentes:")
    for value, count in value_counts.most_common(20):
        print(f" {value}: {count} occurrences")

    # Pattern breakdown (guaranteed non-empty here, so ratios are safe).
    patterns = _classify_patterns(values)
    cim10_codes = patterns['cim10_codes']
    pure_numbers = patterns['pure_numbers']
    codes_with_dash = patterns['codes_with_dash']
    short_codes = patterns['short_codes']
    long_codes = patterns['long_codes']
    total = len(values)

    print("\n📊 Analyse des patterns:")
    print(f" Codes CIM-10 (ex: E11, Z95): {len(cim10_codes)} ({len(cim10_codes)/total*100:.1f}%)")
    print(f" Numéros purs (que des chiffres): {len(pure_numbers)} ({len(pure_numbers)/total*100:.1f}%)")
    print(f" Codes avec tirets: {len(codes_with_dash)} ({len(codes_with_dash)/total*100:.1f}%)")
    print(f" Codes courts (≤4 chars): {len(short_codes)} ({len(short_codes)/total*100:.1f}%)")
    print(f" Codes longs (≥10 chars): {len(long_codes)} ({len(long_codes)/total*100:.1f}%)")

    print("\n📊 Exemples par pattern:")
    if cim10_codes:
        print(f" CIM-10: {', '.join(cim10_codes[:5])}")
    if pure_numbers:
        print(f" Numéros purs: {', '.join(pure_numbers[:5])}")
    if short_codes:
        print(f" Codes courts: {', '.join(short_codes[:5])}")

    # Documents contributing the most EPISODE false positives.
    doc_counts = Counter(fp['document'] for fp in episode_detections)
    print("\n📊 Documents avec le plus de FP EPISODE:")
    for doc, count in doc_counts.most_common(10):
        print(f" {doc}: {count} FP")

    # Persist the analysis as JSON (non-ASCII kept readable).
    output_file = Path("tests/ground_truth/analysis/episode_fp_analysis.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    analysis = {
        'total_fp': len(episode_detections),
        'unique_values': len(value_counts),
        'top_values': dict(value_counts.most_common(20)),
        'patterns': {name: len(vals) for name, vals in patterns.items()},
        'top_documents': dict(doc_counts.most_common(10)),
        'examples': {
            'cim10': cim10_codes[:10],
            'pure_numbers': pure_numbers[:10],
            'short_codes': short_codes[:10],
        },
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis, f, indent=2, ensure_ascii=False)
    print(f"\n📄 Analyse sauvegardée: {output_file}")

    # Recommendations based on simple ratio thresholds.
    print("\n" + "=" * 80)
    print("RECOMMANDATIONS")
    print("=" * 80)

    cim10_ratio = len(cim10_codes) / total * 100
    if cim10_ratio > 30:
        print(f"\n✅ {cim10_ratio:.1f}% des FP sont des codes CIM-10")
        print(" Recommandation: Filtrer les codes CIM-10 connus (pattern ^[A-Z]\\d{2})")

    short_ratio = len(short_codes) / total * 100
    if short_ratio > 50:
        print(f"\n✅ {short_ratio:.1f}% des FP sont des codes courts (≤4 chars)")
        print(" Recommandation: Augmenter la longueur minimale pour EPISODE (ex: ≥6 chars)")

    # "trackare" documents often concentrate false positives.
    trackare_docs = [doc for doc in doc_counts if 'trackare' in doc.lower()]
    if trackare_docs:
        trackare_fp = sum(doc_counts[doc] for doc in trackare_docs)
        print(f"\n✅ {trackare_fp} FP ({trackare_fp/len(episode_detections)*100:.1f}%) proviennent de documents trackare")
        print(" Recommandation: Filtrage spécifique pour les documents trackare")


if __name__ == "__main__":
    analyze_episode_fp()