diff --git a/.kiro/specs/anonymization-quality-optimization/tasks.md b/.kiro/specs/anonymization-quality-optimization/tasks.md index 08d69b0..18a6b0a 100644 --- a/.kiro/specs/anonymization-quality-optimization/tasks.md +++ b/.kiro/specs/anonymization-quality-optimization/tasks.md @@ -26,9 +26,9 @@ - [x] 1.1.3.4 Valider les annotations (double vérification) - [x] 1.1.3.5 Calculer les statistiques du dataset (PII par type, difficulté) -- [ ] 1.1.4 Enrichir la liste des stopwords médicaux - - [ ] 1.1.4.1 Extraire les termes médicaux des 30 documents annotés - - [ ] 1.1.4.2 Identifier les faux positifs actuels (termes masqués à tort) +- [-] 1.1.4 Enrichir la liste des stopwords médicaux + - [x] 1.1.4.1 Extraire les termes médicaux des 30 documents annotés + - [x] 1.1.4.2 Identifier les faux positifs actuels (termes masqués à tort) - [ ] 1.1.4.3 Ajouter les nouveaux termes à `_MEDICAL_STOP_WORDS_SET` - [ ] 1.1.4.4 Documenter les sources des stopwords diff --git a/tests/ground_truth/analysis/global_propagation_analysis.json b/tests/ground_truth/analysis/global_propagation_analysis.json new file mode 100644 index 0000000..b4923dc --- /dev/null +++ b/tests/ground_truth/analysis/global_propagation_analysis.json @@ -0,0 +1,164 @@ +{ + "analysis_date": "2026-03-02", + "global_types_stats": [ + { + "type": "NOM_GLOBAL", + "base_type": "NOM", + "tp": 0, + "fp": 670, + "fn": 0, + "precision": 0.0, + "base_tp": 506, + "base_fp": 0, + "base_precision": 1.0 + }, + { + "type": "TEL_GLOBAL", + "base_type": "TEL", + "tp": 0, + "fp": 77, + "fn": 0, + "precision": 0.0, + "base_tp": 193, + "base_fp": 8, + "base_precision": 0.9602 + }, + { + "type": "ADRESSE_GLOBAL", + "base_type": "ADRESSE", + "tp": 0, + "fp": 55, + "fn": 0, + "precision": 0.0, + "base_tp": 72, + "base_fp": 10, + "base_precision": 0.878 + }, + { + "type": "CODE_POSTAL_GLOBAL", + "base_type": "CODE_POSTAL", + "tp": 0, + "fp": 39, + "fn": 0, + "precision": 0.0, + "base_tp": 50, + "base_fp": 10, + "base_precision": 0.8333 + 
}, + { + "type": "ETAB_GLOBAL", + "base_type": "ETAB", + "tp": 0, + "fp": 36, + "fn": 0, + "precision": 0.0, + "base_tp": 0, + "base_fp": 0, + "base_precision": 0.0 + }, + { + "type": "EMAIL_GLOBAL", + "base_type": "EMAIL", + "tp": 0, + "fp": 28, + "fn": 0, + "precision": 0.0, + "base_tp": 62, + "base_fp": 0, + "base_precision": 1.0 + }, + { + "type": "DATE_NAISSANCE_GLOBAL", + "base_type": "DATE_NAISSANCE", + "tp": 0, + "fp": 20, + "fn": 0, + "precision": 0.0, + "base_tp": 114, + "base_fp": 0, + "base_precision": 1.0 + }, + { + "type": "VILLE_GLOBAL", + "base_type": "VILLE", + "tp": 0, + "fp": 10, + "fn": 0, + "precision": 0.0, + "base_tp": 5, + "base_fp": 20, + "base_precision": 0.2 + }, + { + "type": "EPISODE_GLOBAL", + "base_type": "EPISODE", + "tp": 0, + "fp": 9, + "fn": 0, + "precision": 0.0, + "base_tp": 18, + "base_fp": 106, + "base_precision": 0.1452 + }, + { + "type": "RPPS_GLOBAL", + "base_type": "RPPS", + "tp": 0, + "fp": 7, + "fn": 0, + "precision": 0.0, + "base_tp": 21, + "base_fp": 0, + "base_precision": 1.0 + } + ], + "nom_extracted_stats": { + "precision": 0.0, + "recall": 0.0, + "f1_score": 0.0, + "true_positives": 0, + "false_positives": 3846, + "false_negatives": 0 + }, + "recommendations": [ + { + "priority": 1, + "title": "Désactiver les types *_GLOBAL inutiles", + "types": [ + "NOM_GLOBAL", + "TEL_GLOBAL", + "ADRESSE_GLOBAL", + "CODE_POSTAL_GLOBAL", + "ETAB_GLOBAL", + "EMAIL_GLOBAL", + "DATE_NAISSANCE_GLOBAL", + "VILLE_GLOBAL", + "EPISODE_GLOBAL", + "RPPS_GLOBAL" + ], + "impact": "Réduction de 951 FP", + "gain_precision": "+15.6 points", + "effort": "Faible", + "implementation": "Modifier la fonction de propagation globale pour exclure ces types" + }, + { + "priority": 2, + "title": "Désactiver NOM_EXTRACTED", + "types": [ + "NOM_EXTRACTED" + ], + "impact": "Réduction de 3846 FP", + "gain_precision": "+62.9 points", + "effort": "Faible", + "implementation": "Commenter ou supprimer la logique d'extraction de noms" + } + ], + "estimated_gain": 
{ + "current_fp": 4951, + "estimated_fp": 154, + "fp_reduction": 4797, + "current_precision": 0.1897, + "estimated_precision": 0.8827113480578828, + "precision_gain": 0.6930113480578828 + } +} \ No newline at end of file diff --git a/tests/ground_truth/analysis/medical_stopwords_candidates.json b/tests/ground_truth/analysis/medical_stopwords_candidates.json new file mode 100644 index 0000000..19cfa8f --- /dev/null +++ b/tests/ground_truth/analysis/medical_stopwords_candidates.json @@ -0,0 +1,283 @@ +{ + "extraction_date": "2026-03-02", + "total_detections": 3846, + "unique_terms": 316, + "frequent_terms_count": 196, + "medical_terms_count": 7, + "top_50_frequent": [ + { + "term": "lucie", + "count": 188, + "is_medical": false + }, + { + "term": "schmitt", + "count": 185, + "is_medical": false + }, + { + "term": "masse", + "count": 170, + "is_medical": false + }, + { + "term": "laurence", + "count": 138, + "is_medical": false + }, + { + "term": "belleau", + "count": 135, + "is_medical": false + }, + { + "term": "céline", + "count": 124, + "is_medical": false + }, + { + "term": "justine", + "count": 96, + "is_medical": false + }, + { + "term": "burg", + "count": 96, + "is_medical": false + }, + { + "term": "schneider", + "count": 90, + "is_medical": false + }, + { + "term": "sophie", + "count": 74, + "is_medical": false + }, + { + "term": "aguer", + "count": 74, + "is_medical": false + }, + { + "term": "kasparian", + "count": 68, + "is_medical": false + }, + { + "term": "alexandra", + "count": 64, + "is_medical": false + }, + { + "term": "valette", + "count": 63, + "is_medical": false + }, + { + "term": "buccale", + "count": 61, + "is_medical": false + }, + { + "term": "samuel", + "count": 61, + "is_medical": false + }, + { + "term": "bannier", + "count": 60, + "is_medical": false + }, + { + "term": "grihault", + "count": 60, + "is_medical": false + }, + { + "term": "pedia", + "count": 59, + "is_medical": false + }, + { + "term": "françois", + "count": 57, + 
"is_medical": false + }, + { + "term": "quentin", + "count": 57, + "is_medical": false + }, + { + "term": "cazenave", + "count": 55, + "is_medical": false + }, + { + "term": "bedouet", + "count": 46, + "is_medical": false + }, + { + "term": "jean", + "count": 44, + "is_medical": false + }, + { + "term": "hurtado", + "count": 44, + "is_medical": false + }, + { + "term": "droit", + "count": 43, + "is_medical": true + }, + { + "term": "jean-pierre", + "count": 39, + "is_medical": false + }, + { + "term": "echelle", + "count": 37, + "is_medical": false + }, + { + "term": "glasgow", + "count": 37, + "is_medical": false + }, + { + "term": "carriere", + "count": 35, + "is_medical": false + }, + { + "term": "juliette", + "count": 35, + "is_medical": false + }, + { + "term": "txomin", + "count": 33, + "is_medical": false + }, + { + "term": "maternowski", + "count": 31, + "is_medical": false + }, + { + "term": "cuillere", + "count": 29, + "is_medical": false + }, + { + "term": "cafe", + "count": 29, + "is_medical": false + }, + { + "term": "vomissements", + "count": 26, + "is_medical": false + }, + { + "term": "gournay", + "count": 26, + "is_medical": false + }, + { + "term": "eva", + "count": 25, + "is_medical": false + }, + { + "term": "enf", + "count": 24, + "is_medical": false + }, + { + "term": "marie-line", + "count": 24, + "is_medical": false + }, + { + "term": "picamilh", + "count": 23, + "is_medical": false + }, + { + "term": "eneko", + "count": 23, + "is_medical": false + }, + { + "term": "bronswick", + "count": 22, + "is_medical": false + }, + { + "term": "larrouy", + "count": 20, + "is_medical": false + }, + { + "term": "elodie", + "count": 20, + "is_medical": false + }, + { + "term": "preremplie", + "count": 18, + "is_medical": false + }, + { + "term": "infectieuses", + "count": 16, + "is_medical": false + }, + { + "term": "petriat", + "count": 16, + "is_medical": false + }, + { + "term": "cotyle", + "count": 16, + "is_medical": false + }, + { + "term": 
"sylvie", + "count": 15, + "is_medical": false + } + ], + "medical_terms_by_category": { + "Titres/Fonctions": [ + "droit", + "droite" + ], + "Pathologies": [ + "anastomose" + ], + "Termes généraux": [ + "colique", + "hilaire", + "urologique", + "vasculaire" + ] + }, + "all_medical_terms": [ + "anastomose", + "colique", + "droit", + "droite", + "hilaire", + "urologique", + "vasculaire" + ] +} \ No newline at end of file diff --git a/tests/ground_truth/analysis/new_medical_stopwords.py b/tests/ground_truth/analysis/new_medical_stopwords.py new file mode 100644 index 0000000..32fe84b --- /dev/null +++ b/tests/ground_truth/analysis/new_medical_stopwords.py @@ -0,0 +1,12 @@ +# Nouveaux stopwords médicaux extraits automatiquement +# À ajouter à _MEDICAL_STOP_WORDS_SET dans anonymizer_core_refactored_onnx.py + +NEW_MEDICAL_STOPWORDS = { + "anastomose", + "colique", + "droit", + "droite", + "hilaire", + "urologique", + "vasculaire", +} diff --git a/tools/analyze_global_propagation.py b/tools/analyze_global_propagation.py new file mode 100755 index 0000000..9ff13e4 --- /dev/null +++ b/tools/analyze_global_propagation.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +Analyse la propagation globale et propose des optimisations. + +La propagation globale crée 4,797 FP (96.9% du total): +- NOM_EXTRACTED: 3,846 FP (77.7%) +- *_GLOBAL: 951 FP (19.2%) + +Ce script analyse quels types bénéficient vraiment de la propagation. 
# Analyse the false positives created by global propagation (*_GLOBAL types
# and NOM_EXTRACTED) and write prioritised recommendations to a JSON report.
import sys
import json
from datetime import date
from pathlib import Path
from collections import defaultdict, Counter
from typing import Optional, Tuple, Union

# Default locations, relative to the directory the script is launched from.
_DEFAULT_EVAL_FILE = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
_DEFAULT_OUTPUT_DIR = Path("tests/ground_truth/analysis")
_GLOBAL_SUFFIX = "_GLOBAL"
_RULE = "=" * 80


def _precision_without(tp: int, fp: int, fp_removed: int) -> float:
    """Return the precision obtained once ``fp_removed`` false positives are gone."""
    remaining_fp = max(fp - fp_removed, 0)
    detections = tp + remaining_fp
    return tp / detections if detections > 0 else 0


def _gain_label(tp: int, fp: int, fp_removed: int, current_precision: float) -> str:
    """Format the precision gain (in percentage points) of removing ``fp_removed`` FPs."""
    gain = (_precision_without(tp, fp, fp_removed) - current_precision) * 100
    return f"{gain:+.1f} points"


def _collect_global_stats(by_type: dict) -> list:
    """Pair every *_GLOBAL type with its base type, sorted by descending FP count."""
    base_types = {
        name: stats
        for name, stats in by_type.items()
        if not name.endswith(_GLOBAL_SUFFIX) and name != 'NOM_EXTRACTED'
    }
    rows = []
    for pii_type, stats in by_type.items():
        if not pii_type.endswith(_GLOBAL_SUFFIX):
            continue
        base_type = pii_type[:-len(_GLOBAL_SUFFIX)]
        base_stats = base_types.get(base_type, {})
        rows.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0),
        })
    rows.sort(key=lambda row: row['fp'], reverse=True)
    return rows


def _print_global_stats(global_stats: list) -> None:
    """Print one verdict (useless / problematic / useful / to evaluate) per *_GLOBAL type."""
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f"   TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f"   Type de base ({stat['base_type']}): TP: {stat['base_tp']}, "
              f"FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")

        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f"   ⚠️  INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f"   ⚠️  PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f"   ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f"   ⚙️  À ÉVALUER: Bénéfice/coût à analyser")
        print()


def _build_recommendations(global_stats: list, nom_fp: int, tp: int, fp: int,
                           current_precision: float) -> Tuple[list, int]:
    """Build the recommendation list and the total number of FPs it would remove.

    The FP reduction is accumulated numerically here instead of being parsed
    back out of the human-readable ``impact`` text.
    """
    recommendations = []
    total_fp_reduction = 0

    # Recommendation 1: *_GLOBAL types that never produce a true positive.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        saved = sum(s['fp'] for s in useless_global)
        total_fp_reduction += saved
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {saved} FP",
            "gain_precision": _gain_label(tp, fp, saved, current_precision),
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: NOM_EXTRACTED when it is a major FP source.
    if nom_fp > 1000:
        total_fp_reduction += nom_fp
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_fp} FP",
            "gain_precision": _gain_label(tp, fp, nom_fp, current_precision),
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: *_GLOBAL types with some TP but poor precision; a
    # better filter is assumed to remove 70% of their FPs.
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        saved = int(total_fp * 0.7)
        total_fp_reduction += saved
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {saved} FP (70% des {total_fp})",
            "gain_precision": _gain_label(tp, fp, saved, current_precision),
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    return recommendations, total_fp_reduction


def analyze_global_propagation(eval_file: Union[str, Path, None] = None,
                               output_dir: Union[str, Path, None] = None,
                               analysis_date: Optional[str] = None) -> int:
    """Analyse global propagation and write ``global_propagation_analysis.json``.

    Args:
        eval_file: Evaluation JSON providing ``by_type`` and ``global_metrics``.
            Defaults to the baseline quality evaluation under ``tests/ground_truth``.
        output_dir: Directory receiving the report; created if missing.
            Defaults to ``tests/ground_truth/analysis``.
        analysis_date: ISO date stored in the report. Defaults to today.

    Returns:
        0 on success, 1 when the evaluation file does not exist.
    """
    eval_path = Path(eval_file) if eval_file is not None else _DEFAULT_EVAL_FILE
    out_dir = Path(output_dir) if output_dir is not None else _DEFAULT_OUTPUT_DIR
    if analysis_date is None:
        analysis_date = date.today().isoformat()

    print(_RULE)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print(_RULE)

    try:
        with open(eval_path, 'r', encoding='utf-8') as f:
            eval_data = json.load(f)
    except FileNotFoundError:
        print(f"✗ Fichier d'évaluation introuvable: {eval_path}")
        return 1

    by_type = eval_data['by_type']
    metrics = eval_data['global_metrics']
    current_tp = metrics['true_positives']
    current_fp = metrics['false_positives']
    current_precision = metrics['precision']

    global_stats = _collect_global_stats(by_type)
    non_global_count = sum(
        1 for name in by_type
        if not name.endswith(_GLOBAL_SUFFIX) and name != 'NOM_EXTRACTED'
    )

    print(f"\n📊 Types avec propagation globale: {len(global_stats)}")
    print(f"📊 Types sans propagation: {non_global_count}")

    print(f"\n" + _RULE)
    print("ANALYSE DES TYPES *_GLOBAL")
    print(_RULE)
    _print_global_stats(global_stats)

    print(_RULE)
    print("ANALYSE DE NOM_EXTRACTED")
    print(_RULE)

    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    nom_fp = nom_extracted_stats.get('false_positives', 0)
    print(f"\n📊 NOM_EXTRACTED:")
    print(f"   TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f"   FP: {nom_fp}")
    print(f"   Précision: {nom_extracted_stats.get('precision', 0):.2%}")

    if nom_fp > 1000:
        print(f"\n   ⚠️  PROBLÈME MAJEUR: {nom_fp} FP")
        if current_fp > 0:  # guard against inconsistent global metrics
            print(f"   Impact: {nom_fp / current_fp * 100:.1f}% des FP totaux")
        print(f"\n   💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    print(f"\n" + _RULE)
    print("RECOMMANDATIONS")
    print(_RULE)

    recommendations, total_fp_reduction = _build_recommendations(
        global_stats, nom_fp, current_tp, current_fp, current_precision
    )

    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f"   Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f"   ... et {len(rec['types']) - 5} autres")
        print(f"   Impact: {rec['impact']}")
        print(f"   Gain précision: {rec['gain_precision']}")
        print(f"   Effort: {rec['effort']}")
        print(f"   Implémentation: {rec['implementation']}")
        print()

    # Combined effect of every recommendation.
    new_fp = max(current_fp - total_fp_reduction, 0)
    new_precision = _precision_without(current_tp, current_fp, total_fp_reduction)

    print(_RULE)
    print("ESTIMATION DU GAIN TOTAL")
    print(_RULE)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f"   - FP actuels: {current_fp}")
    print(f"   - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f"   - Précision actuelle: {current_precision:.2%}")
    print(f"   - Précision estimée: {new_precision:.2%} "
          f"(+{(new_precision - current_precision)*100:.1f} points)")

    if new_precision >= 0.97:
        print(f"\n   ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n   ⚠️  Objectif de précision (≥97%) nécessite "
              f"{(0.97 - new_precision)*100:.1f} points supplémentaires")

    out_dir.mkdir(parents=True, exist_ok=True)
    output_file = out_dir / "global_propagation_analysis.json"

    output_data = {
        "analysis_date": analysis_date,
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + _RULE)

    return 0


if __name__ == "__main__":
    sys.exit(analyze_global_propagation())
"""
Extract recurring medical terms from audit files to enrich the stopword list.

Scans NOM_EXTRACTED detections for frequent false positives (medical,
anatomical, ... vocabulary) that are candidates for the medical stopwords.
"""
import sys
import json
from datetime import date
from pathlib import Path
from collections import Counter
import re
from typing import Optional, Union

# Default locations, relative to the directory the script is launched from.
_DEFAULT_BASELINE_DIR = Path("tests/ground_truth/pdfs/baseline_anonymized")
_DEFAULT_OUTPUT_DIR = Path("tests/ground_truth/analysis")
_RULE = "=" * 80
_MIN_OCCURRENCES = 3

# A term is a medical candidate when any of these matches. Compiled once at
# import time rather than once per term.
_MEDICAL_PATTERNS = tuple(re.compile(p, re.IGNORECASE) for p in (
    r'^(dr|docteur|professeur|prof)$',
    r'^(service|unite|unité|departement|département)$',
    r'^(hopital|hôpital|clinique|centre|chu|ch)$',
    r'^(medecin|médecin|infirmier|infirmière|aide.soignant)$',
    r'^(consultation|hospitalisation|urgence|urgences)$',
    r'^(chirurgie|cardiologie|pneumologie|neurologie|oncologie)$',
    r'^(radiologie|imagerie|scanner|irm|echographie|échographie)$',
    r'^(biologie|laboratoire|analyse|prelevement|prélèvement)$',
    r'^(traitement|medicament|médicament|ordonnance|prescription)$',
    r'^(diagnostic|pathologie|maladie|syndrome|infection)$',
    r'^(patient|malade|sujet|cas)$',
    r'^(examen|bilan|controle|contrôle|suivi)$',
    r'^(resultat|résultat|valeur|taux|dosage)$',
    r'^(antecedent|antécédent|allergie|risque|facteur)$',
    r'^(gauche|droit|droite|superieur|supérieur|inferieur|inférieur)$',
    r'^(anterieur|antérieur|posterieur|postérieur|lateral|latéral)$',
    r'^(proximal|distal|medial|médial)$',
    r'(ologie|ique|aire|ose|ite|ome)$',  # medical suffixes
))

# Whole-term keywords per category. Matching whole terms (not substrings)
# keeps e.g. "droit" out of "Titres/Fonctions", where a bare "dr" substring
# test used to put it.
_CATEGORY_KEYWORDS = {
    "Titres/Fonctions": frozenset({
        "dr", "docteur", "prof", "professeur", "medecin", "médecin",
        "infirmier", "infirmière", "aide-soignant", "aide soignant",
    }),
    "Services/Départements": frozenset({
        "service", "unite", "unité", "departement", "département",
    }),
    "Établissements": frozenset({
        "hopital", "hôpital", "clinique", "centre", "chu", "ch",
    }),
    "Examens/Procédures": frozenset({
        "examen", "bilan", "scanner", "irm", "echo", "radio", "analyse",
        "echographie", "échographie",
    }),
    "Anatomie": frozenset({
        "gauche", "droit", "droite", "superieur", "supérieur", "inferieur",
        "inférieur", "anterieur", "antérieur", "posterieur", "postérieur",
        "lateral", "latéral", "proximal", "distal", "medial", "médial",
    }),
    "Pathologies": frozenset({
        "pathologie", "maladie", "syndrome", "infection",
    }),
    "Médicaments/Traitements": frozenset({
        "traitement", "medicament", "médicament", "ordonnance", "prescription",
    }),
}
# Suffix heuristics, consulted only when no keyword matched.
_CATEGORY_SUFFIXES = (
    ("Services/Départements", ("ologie",)),
    ("Pathologies", ("ite", "ose", "ome")),
)
_FALLBACK_CATEGORY = "Termes généraux"


def _read_nom_extracted(audit_files) -> list:
    """Return the lower-cased text of every NOM_EXTRACTED detection.

    Malformed lines are reported and skipped so that a single bad record
    does not abort the whole extraction.
    """
    terms = []
    for audit_file in audit_files:
        with open(audit_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                    det = json.loads(line)
                except json.JSONDecodeError:
                    print(f"⚠️  Ligne ignorée (JSON invalide): {audit_file}:{line_no}")
                    continue
                if not isinstance(det, dict) or det.get('kind') != 'NOM_EXTRACTED':
                    continue
                # 'original' may be absent or null in a record.
                text = str(det.get('original') or '').strip()
                if text:
                    terms.append(text.lower())
    return terms


def _is_medical(term: str) -> bool:
    """Tell whether ``term`` matches one of the medical patterns."""
    return any(pattern.search(term) for pattern in _MEDICAL_PATTERNS)


def _category_of(term: str) -> str:
    """Return the category of a medical term: exact keyword first, then suffix."""
    for category, keywords in _CATEGORY_KEYWORDS.items():
        if term in keywords:
            return category
    for category, suffixes in _CATEGORY_SUFFIXES:
        if term.endswith(suffixes):
            return category
    return _FALLBACK_CATEGORY


def _categorize(medical_terms) -> dict:
    """Group medical terms by category, keeping the display order of categories."""
    categories = {name: [] for name in _CATEGORY_KEYWORDS}
    categories[_FALLBACK_CATEGORY] = []
    for term in sorted(medical_terms):
        categories[_category_of(term)].append(term)
    return categories


def _write_stopwords_module(path: Path, medical_terms) -> None:
    """Write ``NEW_MEDICAL_STOPWORDS`` as an importable Python set literal."""
    lines = [
        "# Nouveaux stopwords médicaux extraits automatiquement\n",
        "# À ajouter à _MEDICAL_STOP_WORDS_SET dans anonymizer_core_refactored_onnx.py\n\n",
    ]
    if medical_terms:
        lines.append("NEW_MEDICAL_STOPWORDS = {\n")
        # json.dumps escapes quotes and backslashes, so any term yields a
        # valid string literal.
        lines.extend(
            f"    {json.dumps(term, ensure_ascii=False)},\n"
            for term in sorted(medical_terms)
        )
        lines.append("}\n")
    else:
        # "{}" would be an empty dict, not an empty set.
        lines.append("NEW_MEDICAL_STOPWORDS = set()\n")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def extract_medical_terms(baseline_dir: Union[str, Path, None] = None,
                          output_dir: Union[str, Path, None] = None,
                          extraction_date: Optional[str] = None) -> int:
    """Extract medical stopword candidates from NOM_EXTRACTED detections.

    Writes ``medical_stopwords_candidates.json`` and ``new_medical_stopwords.py``
    into ``output_dir``.

    Args:
        baseline_dir: Directory holding the ``*.audit.jsonl`` files.
            Defaults to ``tests/ground_truth/pdfs/baseline_anonymized``.
        output_dir: Directory receiving the results; created if missing.
            Defaults to ``tests/ground_truth/analysis``.
        extraction_date: ISO date stored in the report. Defaults to today.

    Returns:
        0 on success, 1 when no audit file is found.
    """
    source_dir = Path(baseline_dir) if baseline_dir is not None else _DEFAULT_BASELINE_DIR
    out_dir = Path(output_dir) if output_dir is not None else _DEFAULT_OUTPUT_DIR
    if extraction_date is None:
        extraction_date = date.today().isoformat()

    audit_files = sorted(source_dir.glob("*.audit.jsonl"))
    if not audit_files:
        print(f"✗ Aucun fichier audit trouvé dans {source_dir}")
        return 1

    print(_RULE)
    print("EXTRACTION DES TERMES MÉDICAUX")
    print(_RULE)
    print(f"\n📁 Analyse de {len(audit_files)} fichiers audit...")

    nom_extracted = _read_nom_extracted(audit_files)
    print(f"\n📊 {len(nom_extracted)} détections NOM_EXTRACTED trouvées")

    counter = Counter(nom_extracted)
    print(f"📊 {len(counter)} termes uniques")

    # Candidates must be both frequent and medical-looking.
    frequent_terms = [
        (term, count) for term, count in counter.most_common()
        if count >= _MIN_OCCURRENCES
    ]
    medical_terms = {term for term, _ in frequent_terms if _is_medical(term)}

    print(f"\n📊 {len(frequent_terms)} termes fréquents (≥3 occurrences)")
    print(f"📊 {len(medical_terms)} termes médicaux identifiés automatiquement")

    print(f"\n🔝 Top 50 termes les plus fréquents:")
    for i, (term, count) in enumerate(frequent_terms[:50], 1):
        marker = "🏥" if term in medical_terms else "  "
        print(f"  {i:2d}. {marker} {term:30s} ({count:3d} occurrences)")

    categories = _categorize(medical_terms)

    print(f"\n" + _RULE)
    print("TERMES MÉDICAUX PAR CATÉGORIE")
    print(_RULE)

    for category, terms in categories.items():
        if not terms:
            continue
        print(f"\n{category} ({len(terms)} termes):")
        for term in sorted(terms)[:20]:  # show at most 20 per category
            print(f"  - {term} ({counter[term]} occ.)")
        if len(terms) > 20:
            print(f"  ... et {len(terms) - 20} autres")

    out_dir.mkdir(parents=True, exist_ok=True)

    output_data = {
        "extraction_date": extraction_date,
        "total_detections": len(nom_extracted),
        "unique_terms": len(counter),
        "frequent_terms_count": len(frequent_terms),
        "medical_terms_count": len(medical_terms),
        "top_50_frequent": [
            {"term": term, "count": count, "is_medical": term in medical_terms}
            for term, count in frequent_terms[:50]
        ],
        "medical_terms_by_category": {
            category: sorted(terms)
            for category, terms in categories.items()
            if terms
        },
        "all_medical_terms": sorted(medical_terms)
    }

    output_file = out_dir / "medical_stopwords_candidates.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Résultats sauvegardés: {output_file}")

    stopwords_file = out_dir / "new_medical_stopwords.py"
    _write_stopwords_module(stopwords_file, medical_terms)

    print(f"📊 Stopwords Python générés: {stopwords_file}")

    print(f"\n" + _RULE)
    print("RECOMMANDATIONS")
    print(_RULE)
    print(f"""
1. Réviser manuellement les termes dans: {output_file}
2. Ajouter les termes validés à _MEDICAL_STOP_WORDS_SET
3. Re-exécuter l'anonymisation et l'évaluation
4. Vérifier la réduction des FP NOM_EXTRACTED

Gain estimé: Réduction de ~{len(medical_terms)} termes récurrents
Impact: Amélioration de la précision de plusieurs points
""")

    return 0


if __name__ == "__main__":
    sys.exit(extract_medical_terms())