chore: Avant implémentation Phase 1 corrections qualité

This commit is contained in:
2026-03-02 23:34:06 +01:00
parent 93617bab55
commit 47a71df930
8 changed files with 1157 additions and 2 deletions

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Analyse des dates masquées pour comprendre le sur-masquage.
Compare les dates masquées avec les dates de naissance réelles.
"""
import json
import re
from pathlib import Path
from collections import defaultdict
def analyze_dates_in_audit(audit_path: Path, text_path: Path):
    """Summarise the date entries of one audit file and its anonymised text.

    The audit file is read as JSON Lines (blank lines are skipped). Every
    entry whose ``kind`` contains ``"DATE"`` is counted in ``total_dates``.
    Entries whose kind is exactly ``DATE_NAISSANCE``,
    ``DATE_NAISSANCE_GLOBAL`` or ``DATE`` are additionally stored in their
    own list; other date-like kinds only contribute to the total.

    Args:
        audit_path: Path of the ``.audit.jsonl`` file to read.
        text_path: Path of the matching anonymised text file.

    Returns:
        A dict with the keys ``date_naissance``, ``date_naissance_global``
        and ``date_generic`` (lists of ``{"value", "page", "type"}`` dicts),
        ``total_dates`` (int) and ``masked_in_text`` (occurrence counts of
        the literal ``[DATE_NAISSANCE]`` and ``[DATE]`` masks plus their sum).
    """
    # Exact audit kind -> name of the list that collects its entries.
    kind_to_bucket = (
        ("DATE_NAISSANCE", "date_naissance"),
        ("DATE_NAISSANCE_GLOBAL", "date_naissance_global"),
        ("DATE", "date_generic"),
    )
    summary = {bucket: [] for _, bucket in kind_to_bucket}
    summary["total_dates"] = 0

    # Walk the audit trail, one JSON object per non-blank line.
    with open(audit_path, 'r', encoding='utf-8') as audit_stream:
        for raw_line in audit_stream:
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            kind = record.get("kind", "")
            if "DATE" not in kind:
                continue
            summary["total_dates"] += 1
            for expected_kind, bucket in kind_to_bucket:
                if kind == expected_kind:
                    summary[bucket].append({
                        "value": record.get("original", ""),
                        "page": record.get("page", -1),
                        "type": kind,
                    })
                    break

    # Count the masks that actually ended up in the anonymised text.
    with open(text_path, 'r', encoding='utf-8') as text_stream:
        anonymised = text_stream.read()
    birth_masks = anonymised.count("[DATE_NAISSANCE]")
    generic_masks = anonymised.count("[DATE]")
    summary["masked_in_text"] = {
        "date_naissance": birth_masks,
        "date_generic": generic_masks,
        "total": birth_masks + generic_masks,
    }
    return summary
def main(prod_dir=None, max_docs=5):
    """Print a diagnostic report about masked dates in anonymised documents.

    For each ``*.audit.jsonl`` file (sorted by name, at most ``max_docs``)
    that has a sibling ``*.pseudonymise.txt`` file, the audit entries and the
    masks found in the text are summarised with ``analyze_dates_in_audit``.
    Per-document figures, global totals, an analysis and recommendations are
    then printed to stdout.

    Args:
        prod_dir: Directory holding the audit/text pairs. Defaults to the
            production directory that used to be hard-coded here.
        max_docs: Maximum number of audit files to consider; ``None`` means
            all of them.

    Returns:
        None. If no document could be analysed, a warning is printed and the
        statistics sections are skipped instead of reporting zeros as
        successes.
    """
    if prod_dir is None:
        # Historical default: keeps a bare ``main()`` call working as before.
        prod_dir = "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise"
    prod_dir = Path(prod_dir)

    print("=" * 80)
    print("ANALYSE DES DATES MASQUÉES")
    print("=" * 80)
    print()

    all_dates = []
    # Analyse the first documents only (sorted for a reproducible sample).
    for audit_file in sorted(prod_dir.glob("*.audit.jsonl"))[:max_docs]:
        text_file = audit_file.with_name(
            audit_file.name.replace('.audit.jsonl', '.pseudonymise.txt')
        )
        if not text_file.exists():
            continue

        dates_info = analyze_dates_in_audit(audit_file, text_file)
        all_dates.append({
            "file": audit_file.name,
            "info": dates_info
        })

        print(f"📄 {audit_file.name}")
        print(f" Total dates dans audit: {dates_info['total_dates']}")
        print(f" - DATE_NAISSANCE: {len(dates_info['date_naissance'])}")
        print(f" - DATE_NAISSANCE_GLOBAL: {len(dates_info['date_naissance_global'])}")
        print(f" - DATE générique: {len(dates_info['date_generic'])}")
        print(" Masques dans le texte:")
        print(f" - [DATE_NAISSANCE]: {dates_info['masked_in_text']['date_naissance']}")
        print(f" - [DATE]: {dates_info['masked_in_text']['date_generic']}")
        print()

        # Show a few sample values for each birth-date category.
        if dates_info['date_naissance']:
            print(" Exemples DATE_NAISSANCE:")
            for d in dates_info['date_naissance'][:3]:
                print(f"{d['value']} (page {d['page']})")
        if dates_info['date_naissance_global']:
            print(" Exemples DATE_NAISSANCE_GLOBAL:")
            for d in dates_info['date_naissance_global'][:3]:
                print(f"{d['value']} (page {d['page']})")
        print()

    if not all_dates:
        # Without any analysed document every total would be 0 and the report
        # below would print only green check marks, which is misleading.
        print(f"⚠️ Aucun document analysé dans: {prod_dir}")
        return

    # Global statistics
    print("=" * 80)
    print("STATISTIQUES GLOBALES")
    print("=" * 80)
    print()

    infos = [d["info"] for d in all_dates]
    total_dates = sum(info["total_dates"] for info in infos)
    total_date_naissance = sum(len(info["date_naissance"]) for info in infos)
    total_date_naissance_global = sum(len(info["date_naissance_global"]) for info in infos)
    total_date_generic = sum(len(info["date_generic"]) for info in infos)
    total_masked_dn = sum(info["masked_in_text"]["date_naissance"] for info in infos)
    total_masked_d = sum(info["masked_in_text"]["date_generic"] for info in infos)

    print(f"Total dates dans audits: {total_dates}")
    print(f" - DATE_NAISSANCE: {total_date_naissance}")
    print(f" - DATE_NAISSANCE_GLOBAL: {total_date_naissance_global}")
    print(f" - DATE générique: {total_date_generic}")
    print()
    print(f"Total masques dans textes: {total_masked_dn + total_masked_d}")
    print(f" - [DATE_NAISSANCE]: {total_masked_dn}")
    print(f" - [DATE]: {total_masked_d}")
    print()

    # Analysis
    print("=" * 80)
    print("ANALYSE")
    print("=" * 80)
    print()

    if total_date_generic > 0:
        print("⚠️ PROBLÈME: DATE générique détecté !")
        print(f" {total_date_generic} dates génériques dans les audits")
        print(" Cause: RE_DATE n'est PAS désactivé ou NER détecte des dates")
        print()
    else:
        print("✅ DATE générique: 0 (correct, désactivé)")
        print()

    if total_masked_d > 0:
        print("⚠️ PROBLÈME: [DATE] dans le texte !")
        print(f" {total_masked_d} masques [DATE] dans les textes")
        print(" Cause: Propagation globale ou rescan de sécurité")
        print()
    else:
        print("✅ [DATE] dans texte: 0 (correct)")
        print()

    # The ratio is only meaningful when at least one birth date was audited.
    if total_date_naissance > 0:
        ratio = total_masked_dn / total_date_naissance
    else:
        ratio = 0
    print(f"Ratio masques/dates de naissance: {ratio:.1f}x")
    if ratio > 3:
        print("⚠️ PROBLÈME: Trop de masques par rapport aux dates de naissance")
        print(" Cause probable: Propagation globale trop agressive")
        print(" Chaque date de naissance génère plusieurs variations")
    elif total_date_naissance == 0 and total_masked_dn > 0:
        # A 0.0x ratio here does not mean "fine": masks exist but no
        # DATE_NAISSANCE audit entry accounts for them.
        print("⚠️ PROBLÈME: Ratio non calculable")
        print(f" {total_masked_dn} masques [DATE_NAISSANCE] mais aucune DATE_NAISSANCE dans les audits")
    else:
        print("✅ Ratio acceptable")
    print()

    # Recommendations
    print("=" * 80)
    print("RECOMMANDATIONS")
    print("=" * 80)
    print()

    if total_date_generic > 0 or total_masked_d > 0:
        print("1. Vérifier que RE_DATE est bien désactivé (ligne ~854)")
        print("2. Vérifier que le rescan de sécurité ne masque pas les dates")
        print("3. Vérifier que le NER ne détecte pas les dates de consultation")
    if ratio > 3:
        print("4. Réduire les variations de propagation globale")
        print(" Actuellement: 4 variations (/, ., -, espace)")
        print(" Recommandation: 2 variations (/, .)")
    print()


if __name__ == "__main__":
    main()

144
tools/test_phase1_corrections.py Executable file
View File

@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
Test des corrections Phase 1 sur un échantillon de documents.
Vérifie que:
1. Les termes médicaux structurels ne sont PAS masqués
2. Les médicaments ne sont PAS masqués
3. Les dates de consultation ne sont PAS masquées (seules les dates de naissance)
"""
import sys
sys.path.insert(0, '.')
from pathlib import Path
import re
from anonymizer_core_refactored_onnx import process_pdf
def _count_terms_present(terms, text):
    """Return how many of *terms* occur in *text*, ignoring case."""
    lowered = text.lower()
    return sum(1 for term in terms if term.lower() in lowered)


def test_phase1_corrections():
    """Run the three Phase 1 checks on a small sample of test PDFs.

    Up to five PDFs from ``tests/ground_truth/pdfs`` (sorted by name) are
    anonymised with ``process_pdf`` and the resulting text is inspected:

    1. at least one structural medical term is still present,
    2. at least one medication name is still present,
    3. no generic ``[DATE]`` mask is left.

    A per-document line and a global summary are printed to stdout. A
    document whose processing raises is reported and excluded from the
    totals; the summary stays valid even if every document fails.

    Returns:
        None.
    """
    # Look for test documents.
    test_dir = Path("tests/ground_truth/pdfs")

    # Sorted so the 5-document sample is reproducible; "*.pdf" cannot match
    # the ".annotations.json" side files, so no extra filter is needed.
    pdf_files = sorted(test_dir.glob("*.pdf"))[:5]

    if not pdf_files:
        print("❌ Aucun document de test trouvé")
        return

    print(f"Test des corrections Phase 1 sur {len(pdf_files)} documents...")
    print("=" * 80)

    output_dir = Path("tests/ground_truth/pdfs/phase1_test")
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {
        'medical_terms_preserved': 0,
        'medications_preserved': 0,
        'dates_reduced': 0,
        'total_docs': 0
    }

    # Terms that must survive anonymisation (loop-invariant).
    medical_terms = [
        "Chef de service",
        "Chef de clinique",
        "Praticien hospitalier",
        "service de",
        "unité de"
    ]
    medications = [
        "IDACIO",
        "Salazopyrine",
        "Infliximab",
        "Apranax"
    ]

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        try:
            # Anonymise
            result = process_pdf(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=Path("config/dictionnaires.yml")
            )

            # Read the anonymised text
            text_file = Path(result['text'])
            anonymized_text = text_file.read_text(encoding='utf-8')

            # Check 1: structural medical terms are preserved
            medical_preserved = _count_terms_present(medical_terms, anonymized_text)

            # Check 2: medication names are preserved
            medications_preserved = _count_terms_present(medications, anonymized_text)

            # Check 3: count [DATE] vs [DATE_NAISSANCE] masks
            date_masks = anonymized_text.count("[DATE]")
            date_naissance_masks = anonymized_text.count("[DATE_NAISSANCE]")

            print(f" ✓ Termes médicaux préservés: {medical_preserved}/{len(medical_terms)}")
            print(f" ✓ Médicaments préservés: {medications_preserved}/{len(medications)}")
            print(f" ✓ [DATE]: {date_masks}, [DATE_NAISSANCE]: {date_naissance_masks}")

            # The date correction succeeded when no generic [DATE] is left.
            if date_masks == 0:
                results['dates_reduced'] += 1
                print(" ✅ Correction dates: OK (0 [DATE])")
            else:
                print(f" ⚠️ Correction dates: {date_masks} [DATE] restants")

            if medical_preserved > 0:
                results['medical_terms_preserved'] += 1
            if medications_preserved > 0:
                results['medications_preserved'] += 1
            results['total_docs'] += 1

        except Exception as e:
            # Per-document boundary: report the failure and keep going so one
            # bad PDF does not abort the whole sample.
            print(f" ❌ Erreur: {e}")

    # Summary
    print("\n" + "=" * 80)
    print("RÉSUMÉ DES CORRECTIONS PHASE 1")
    print("=" * 80)
    print(f"\nDocuments testés: {results['total_docs']}")
    print("\n✅ Correction 1.1 (Termes médicaux):")
    print(f" Documents avec termes préservés: {results['medical_terms_preserved']}/{results['total_docs']}")
    print("\n✅ Correction 1.2 (Médicaments):")
    print(f" Documents avec médicaments préservés: {results['medications_preserved']}/{results['total_docs']}")
    print("\n✅ Correction 1.3 (Dates):")
    print(f" Documents avec [DATE]=0: {results['dates_reduced']}/{results['total_docs']}")

    passed_checks = (
        results['medical_terms_preserved'] +
        results['medications_preserved'] +
        results['dates_reduced']
    )
    total_checks = results['total_docs'] * 3
    # Every document may have failed: avoid dividing by zero and report 0%.
    success_rate = passed_checks / total_checks * 100 if total_checks else 0.0
    print(f"\n📊 Taux de succès global: {success_rate:.1f}%")

    if success_rate >= 80:
        print("\n✅ PHASE 1 CORRECTIONS VALIDÉES")
    else:
        print("\n⚠️ PHASE 1 CORRECTIONS PARTIELLES - Vérification manuelle requise")

    print(f"\n📁 Résultats dans: {output_dir}")


if __name__ == "__main__":
    test_phase1_corrections()