feat: hospital filter to eliminate false positives

- Add config/hospital_stopwords.yml with hospital addresses/phone numbers
- Add detectors/hospital_filter.py to filter out the FPs (sketched below)
- Integrate into anonymizer_core_refactored_onnx.py
- Test on one document: 40 -> 32 detections (-8 FP)
- Eliminates: hospital addresses, CEDEX postal codes, episodes in file names
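Note: detectors/hospital_filter.py and config/hospital_stopwords.yml are referenced by this commit but do not appear in the diff below, which only adds the analysis and test tools. The following is a minimal sketch of the intended mechanism, assuming a YAML file that maps categories to lists of known hospital strings; every name and key in it is illustrative, not the actual implementation.

    # Hypothetical sketch only: the real detectors/hospital_filter.py is not in this diff.
    import yaml

    def load_hospital_stopwords(path="config/hospital_stopwords.yml"):
        # Assumed YAML shape: {"addresses": [...], "phones": [...], "cedex": [...]}.
        with open(path, "r", encoding="utf-8") as f:
            categories = yaml.safe_load(f)
        # Flatten into a single lowercase set for cheap membership tests.
        return {entry.lower() for values in categories.values() for entry in values}

    def filter_hospital_fps(detections, stopwords):
        # Keep a detection only if its text is not a known hospital string:
        # hospital addresses, CEDEX postal codes, etc. are institutional data,
        # not patient PII, so matching entries are dropped as false positives.
        return [d for d in detections
                if d.get("text", "").lower().strip() not in stopwords]

Integration would then be a single filtering pass over the detector output inside anonymizer_core_refactored_onnx.py, before the redaction step.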
tools/analyze_false_positives.py (new executable file, +87 lines)
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""
Detailed analysis of false positives to identify problematic patterns.
"""

import json
from pathlib import Path
from collections import defaultdict


def analyze_false_positives():
    """Analyze false positives by type and identify patterns."""

    # Load the evaluation results
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    if not eval_file.exists():
        print(f"❌ File not found: {eval_file}")
        return

    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    # Load the audit files to analyze the FPs
    audit_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")

    fp_examples = defaultdict(list)

    # Walk the audit files (they list all detections, not only FPs)
    for audit_file in audit_dir.glob("*.audit.jsonl"):
        with open(audit_file, 'r', encoding='utf-8') as f:
            for line in f:
                detection = json.loads(line)
                pii_type = detection.get('type', 'UNKNOWN')
                text = detection.get('text', '')

                # Collect examples of each type
                if len(fp_examples[pii_type]) < 20:  # cap at 20 examples per type
                    fp_examples[pii_type].append({
                        'text': text,
                        'file': audit_file.stem.replace('.audit', ''),
                        'page': detection.get('page', 0)
                    })

    # Print the analysis
    print("=" * 80)
    print("FALSE POSITIVE ANALYSIS")
    print("=" * 80)
    print()

    # Focus on the problematic types
    problematic_types = ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']

    for pii_type in problematic_types:
        type_metrics = eval_data['by_type'].get(pii_type, {})
        fp_count = type_metrics.get('false_positives', 0)
        precision = type_metrics.get('precision', 0)

        if fp_count == 0:
            continue

        print(f"\n{'=' * 80}")
        print(f"Type: {pii_type}")
        print(f"False positives: {fp_count}")
        print(f"Precision: {precision:.2%}")
        print(f"{'=' * 80}")

        examples = fp_examples.get(pii_type, [])
        if examples:
            print("\nExample detections (first 20):")
            for i, ex in enumerate(examples[:20], 1):
                print(f"  {i:2d}. '{ex['text']}' (page {ex['page']})")
        else:
            print("\n⚠️ No examples found in the audit files")

    # Global statistics
    print(f"\n{'=' * 80}")
    print("GLOBAL STATISTICS")
    print(f"{'=' * 80}")
    global_metrics = eval_data['global_metrics']
    print(f"Precision: {global_metrics['precision']:.2%}")
    print(f"Recall: {global_metrics['recall']:.2%}")
    print(f"F1 score: {global_metrics['f1_score']:.2%}")
    print(f"Total false positives: {global_metrics['false_positives']}")
    print()


if __name__ == "__main__":
    analyze_false_positives()
tools/extract_false_positives.py (new file, +155 lines)
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Extract false-positive examples by comparing annotations with detections.
"""

import json
from pathlib import Path
from collections import defaultdict


def load_annotations(pdf_name):
    """Load the annotations for a PDF."""
    # Try different name formats
    possible_names = [
        pdf_name,
        pdf_name.replace('.redacted_raster', ''),
        pdf_name.split('.')[0]
    ]

    for name in possible_names:
        annotation_file = Path(f"tests/ground_truth/annotations/{name}.json")
        if annotation_file.exists():
            with open(annotation_file, 'r', encoding='utf-8') as f:
                return json.load(f)

    return None


def load_detections(pdf_name):
    """Load the detections for a PDF."""
    audit_file = Path(f"tests/ground_truth/pdfs/baseline_anonymized/{pdf_name}.audit.jsonl")
    if not audit_file.exists():
        return []

    detections = []
    with open(audit_file, 'r', encoding='utf-8') as f:
        for line in f:
            detections.append(json.loads(line))
    return detections


def normalize_text(text):
    """Normalize text for comparison."""
    return text.lower().strip()


def is_match(detection, annotation, tolerance=5):
    """Check whether a detection matches an annotation."""
    # NOTE: tolerance is currently unused.
    # Same page
    if detection.get('page') != annotation.get('page'):
        return False

    # Same (or compatible) type
    det_type = detection.get('type', '')
    ann_type = annotation.get('type', '')

    # Normalize the types
    type_mapping = {
        'NOM': ['NOM', 'PRENOM'],
        'PRENOM': ['NOM', 'PRENOM'],
    }

    det_types = type_mapping.get(det_type, [det_type])
    ann_types = type_mapping.get(ann_type, [ann_type])

    if not any(dt in ann_types for dt in det_types):
        return False

    # Similar text
    det_text = normalize_text(detection.get('text', ''))
    ann_text = normalize_text(annotation.get('text', ''))

    return det_text == ann_text or det_text in ann_text or ann_text in det_text


def extract_false_positives():
    """Extract the false positives from each document."""

    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    false_positives = defaultdict(list)

    # Walk each document
    for doc_result in eval_data['per_document']:
        pdf_name = doc_result['pdf']

        # Load annotations and detections
        annotations = load_annotations(pdf_name)
        detections = load_detections(pdf_name)

        if not annotations or not detections:
            continue

        # Identify the false positives
        for detection in detections:
            # Check whether this detection matches any annotation
            is_true_positive = False
            for annotation in annotations.get('pii', []):
                if is_match(detection, annotation):
                    is_true_positive = True
                    break

            # No match means it is a false positive
            if not is_true_positive:
                pii_type = detection.get('type', 'UNKNOWN')
                false_positives[pii_type].append({
                    'text': detection.get('text', ''),
                    'page': detection.get('page', 0),
                    'file': pdf_name,
                    'method': detection.get('method', 'unknown')
                })

    # Print the results
    print("=" * 80)
    print("FALSE POSITIVE EXAMPLES")
    print("=" * 80)
    print()

    problematic_types = ['EPISODE', 'VILLE', 'CODE_POSTAL', 'ADRESSE', 'TEL']

    for pii_type in problematic_types:
        fps = false_positives.get(pii_type, [])
        if not fps:
            continue

        print(f"\n{'=' * 80}")
        print(f"Type: {pii_type} ({len(fps)} false positives)")
        print(f"{'=' * 80}")

        # Group by text to surface the patterns
        text_counts = defaultdict(int)
        for fp in fps:
            text_counts[fp['text']] += 1

        # Print the most frequent ones
        sorted_texts = sorted(text_counts.items(), key=lambda x: x[1], reverse=True)

        print("\nMost frequent texts:")
        for text, count in sorted_texts[:20]:
            print(f"  {count:3d}x '{text}'")

        # Print a few examples with context
        print("\nExamples with context:")
        for i, fp in enumerate(fps[:10], 1):
            print(f"  {i:2d}. '{fp['text']}' (page {fp['page']}, method: {fp['method']})")
            print(f"      File: {fp['file']}")

    # Save the results
    output_file = Path("tests/ground_truth/analysis/false_positives_examples.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dict(false_positives), f, indent=2, ensure_ascii=False)

    print(f"\n✅ Results saved to: {output_file}")


if __name__ == "__main__":
    extract_false_positives()
tools/show_fp_details.py (new file, +77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Show false-positive details from the evaluation results.
"""

import json
from pathlib import Path
from collections import defaultdict, Counter

# Load the evaluation
eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
with open(eval_file, 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

# Problematic types with their expected FP counts (hardcoded from the baseline evaluation)
problematic_types = {
    'EPISODE': 106,
    'VILLE': 20,
    'CODE_POSTAL': 10,
    'ADRESSE': 10,
    'TEL': 8
}

print("=" * 80)
print("FALSE POSITIVE ANALYSIS BY TYPE")
print("=" * 80)

# Collect all detection examples
all_detections = defaultdict(list)

for doc in eval_data['per_document']:
    pdf_name = doc['pdf']
    audit_file = Path(f"tests/ground_truth/pdfs/baseline_anonymized/{pdf_name}.audit.jsonl")

    if not audit_file.exists():
        continue

    with open(audit_file, 'r', encoding='utf-8') as f:
        for line in f:
            det = json.loads(line)
            kind = det.get('kind', 'UNKNOWN')
            original = det.get('original', '')
            page = det.get('page', -1)

            all_detections[kind].append({
                'text': original,
                'page': page,
                'file': pdf_name
            })

# Print the statistics for each problematic type
for pii_type, expected_fp in problematic_types.items():
    detections = all_detections.get(pii_type, [])

    print(f"\n{'=' * 80}")
    print(f"Type: {pii_type}")
    print(f"Expected false positives: {expected_fp}")
    print(f"Total detections: {len(detections)}")
    print(f"{'=' * 80}")

    # Count the occurrences
    text_counter = Counter(d['text'] for d in detections)

    print("\nMost frequent texts:")
    for text, count in text_counter.most_common(30):
        print(f"  {count:3d}x '{text}'")

    # Print a few examples with their source file
    print("\nExamples with file:")
    seen = set()
    for d in detections[:20]:
        key = (d['text'], d['file'])
        if key not in seen:
            seen.add(key)
            print(f"  '{d['text']}' (page {d['page']}) - {d['file']}")

print("\n" + "=" * 80)
tools/test_hospital_filter.py (new file, +70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
Test the hospital filter on the full dataset.
"""

import sys
sys.path.insert(0, '.')  # allow importing the anonymizer from the repo root

from pathlib import Path
import json
from collections import Counter
from anonymizer_core_refactored_onnx import process_pdf


def main():
    # Directories
    input_dir = Path("tests/ground_truth/pdfs")
    output_dir = Path("tests/ground_truth/pdfs/filtered_anonymized")
    output_dir.mkdir(exist_ok=True)

    # List the PDFs
    pdf_files = sorted(input_dir.glob("*.pdf"))
    pdf_files = [p for p in pdf_files if not p.name.startswith('.')]

    print(f"Anonymizing {len(pdf_files)} documents with the hospital filter...")
    print("=" * 80)

    total_detections = 0
    total_by_type = Counter()

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        try:
            result = process_pdf(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False
            )

            # Count the detections
            audit_file = Path(result['audit'])
            if audit_file.exists():
                detections = []
                with open(audit_file, 'r') as f:
                    for line in f:
                        det = json.loads(line)
                        detections.append(det)
                        total_by_type[det['kind']] += 1

                total_detections += len(detections)
                print(f"  ✅ {len(detections)} PII detected")
            else:
                print("  ⚠️ No audit file")

        except Exception as e:
            print(f"  ❌ Error: {e}")

    print("\n" + "=" * 80)
    print("GLOBAL RESULTS")
    print("=" * 80)
    print(f"Total PII detected: {total_detections}")
    print("\nBy type:")
    for kind, count in sorted(total_by_type.items(), key=lambda x: -x[1]):
        print(f"  {kind:20s}: {count:4d}")

    print(f"\n✅ Results saved to: {output_dir}")


if __name__ == "__main__":
    main()