chore: Avant implémentation Phase 1 corrections qualité
This commit is contained in:
193
tools/analyze_date_masking.py
Normal file
193
tools/analyze_date_masking.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyse des dates masquées pour comprendre le sur-masquage.
|
||||
Compare les dates masquées avec les dates de naissance réelles.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def analyze_dates_in_audit(audit_path: Path, text_path: Path) -> dict:
    """Analyze the date entries of one audit file against its anonymized text.

    Args:
        audit_path: Path to a JSONL audit file; each non-empty line is a JSON
            object expected to carry "kind", "original" and "page" keys.
        text_path: Path to the anonymized (pseudonymized) text file produced
            alongside the audit.

    Returns:
        dict with:
            - "date_naissance", "date_naissance_global", "date_generic":
              lists of {"value", "page", "type"} entries per audit kind;
            - "total_dates": count of audit entries whose kind contains "DATE"
              (including kinds not collected in a bucket);
            - "masked_in_text": counts of "[DATE_NAISSANCE]" and "[DATE]"
              masks actually present in the text, plus their total.
    """
    # Map each audit "kind" onto its collection bucket; replaces three
    # duplicated elif branches that built identical dicts.
    kind_to_bucket = {
        "DATE_NAISSANCE": "date_naissance",
        "DATE_NAISSANCE_GLOBAL": "date_naissance_global",
        "DATE": "date_generic",
    }
    dates_info = {
        "date_naissance": [],
        "date_naissance_global": [],
        "date_generic": [],
        "total_dates": 0,
    }

    # Load the audit: one JSON object per non-empty line.
    with open(audit_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            entry = json.loads(line)
            pii_type = entry.get("kind", "")

            if "DATE" in pii_type:
                dates_info["total_dates"] += 1
                bucket = kind_to_bucket.get(pii_type)
                if bucket is not None:
                    dates_info[bucket].append({
                        "value": entry.get("original", ""),
                        "page": entry.get("page", -1),
                        "type": pii_type,
                    })

    # Load the anonymized text to count the masks actually inserted.
    # "[DATE]" is not a substring of "[DATE_NAISSANCE]", so the two
    # counts do not overlap.
    with open(text_path, 'r', encoding='utf-8') as f:
        text = f.read()

    date_naissance_count = text.count("[DATE_NAISSANCE]")
    date_count = text.count("[DATE]")

    dates_info["masked_in_text"] = {
        "date_naissance": date_naissance_count,
        "date_generic": date_count,
        "total": date_naissance_count + date_count,
    }

    return dates_info
|
||||
|
||||
def main():
    """Analyze masked dates across the first five anonymized documents.

    Reads ``*.audit.jsonl`` files from a hard-coded production directory,
    compares the audited date entries against the masks actually present in
    the companion ``*.pseudonymise.txt`` files, then prints per-document
    details, global statistics, an over-masking analysis and recommendations.
    """
    # NOTE(review): hard-coded, user-specific path — consider taking it as a
    # CLI argument so the script is usable outside this one machine.
    prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")

    print("=" * 80)
    print("ANALYSE DES DATES MASQUÉES")
    print("=" * 80)
    print()

    all_dates = []

    # Analyze the first 5 documents only.
    for audit_file in sorted(prod_dir.glob("*.audit.jsonl"))[:5]:
        text_file = audit_file.with_name(
            audit_file.name.replace('.audit.jsonl', '.pseudonymise.txt')
        )

        if not text_file.exists():
            continue

        dates_info = analyze_dates_in_audit(audit_file, text_file)
        all_dates.append({
            "file": audit_file.name,
            "info": dates_info
        })

        print(f"📄 {audit_file.name}")
        print(f" Total dates dans audit: {dates_info['total_dates']}")
        print(f" - DATE_NAISSANCE: {len(dates_info['date_naissance'])}")
        print(f" - DATE_NAISSANCE_GLOBAL: {len(dates_info['date_naissance_global'])}")
        print(f" - DATE générique: {len(dates_info['date_generic'])}")
        # The f-prefix on placeholder-free strings was dead; removed.
        print(" Masques dans le texte:")
        print(f" - [DATE_NAISSANCE]: {dates_info['masked_in_text']['date_naissance']}")
        print(f" - [DATE]: {dates_info['masked_in_text']['date_generic']}")
        print()

        # Show up to three sample dates per birth-date category.
        if dates_info['date_naissance']:
            print(" Exemples DATE_NAISSANCE:")
            for d in dates_info['date_naissance'][:3]:
                print(f" • {d['value']} (page {d['page']})")

        if dates_info['date_naissance_global']:
            print(" Exemples DATE_NAISSANCE_GLOBAL:")
            for d in dates_info['date_naissance_global'][:3]:
                print(f" • {d['value']} (page {d['page']})")

        print()

    # Global statistics across all analyzed documents.
    print("=" * 80)
    print("STATISTIQUES GLOBALES")
    print("=" * 80)
    print()

    total_dates = sum(d["info"]["total_dates"] for d in all_dates)
    total_date_naissance = sum(len(d["info"]["date_naissance"]) for d in all_dates)
    total_date_naissance_global = sum(len(d["info"]["date_naissance_global"]) for d in all_dates)
    total_date_generic = sum(len(d["info"]["date_generic"]) for d in all_dates)

    total_masked_dn = sum(d["info"]["masked_in_text"]["date_naissance"] for d in all_dates)
    total_masked_d = sum(d["info"]["masked_in_text"]["date_generic"] for d in all_dates)

    print(f"Total dates dans audits: {total_dates}")
    print(f" - DATE_NAISSANCE: {total_date_naissance}")
    print(f" - DATE_NAISSANCE_GLOBAL: {total_date_naissance_global}")
    print(f" - DATE générique: {total_date_generic}")
    print()
    print(f"Total masques dans textes: {total_masked_dn + total_masked_d}")
    print(f" - [DATE_NAISSANCE]: {total_masked_dn}")
    print(f" - [DATE]: {total_masked_d}")
    print()

    # Diagnosis: flag generic-date leakage in audits and in the text.
    print("=" * 80)
    print("ANALYSE")
    print("=" * 80)
    print()

    if total_date_generic > 0:
        print("⚠️ PROBLÈME: DATE générique détecté !")
        print(f" {total_date_generic} dates génériques dans les audits")
        print(" Cause: RE_DATE n'est PAS désactivé ou NER détecte des dates")
        print()
    else:
        print("✅ DATE générique: 0 (correct, désactivé)")
        print()

    if total_masked_d > 0:
        print("⚠️ PROBLÈME: [DATE] dans le texte !")
        print(f" {total_masked_d} masques [DATE] dans les textes")
        print(" Cause: Propagation globale ou rescan de sécurité")
        print()
    else:
        print("✅ [DATE] dans texte: 0 (correct)")
        print()

    # The conditional already guards against a zero denominator, so the
    # previous max(1, ...) clamp was dead code and has been removed.
    ratio = total_masked_dn / total_date_naissance if total_date_naissance > 0 else 0
    print(f"Ratio masques/dates de naissance: {ratio:.1f}x")
    if ratio > 3:
        print("⚠️ PROBLÈME: Trop de masques par rapport aux dates de naissance")
        print(" Cause probable: Propagation globale trop agressive")
        print(" Chaque date de naissance génère plusieurs variations")
    else:
        print("✅ Ratio acceptable")
    print()

    # Recommendations, driven by the diagnoses above.
    print("=" * 80)
    print("RECOMMANDATIONS")
    print("=" * 80)
    print()

    if total_date_generic > 0 or total_masked_d > 0:
        print("1. Vérifier que RE_DATE est bien désactivé (ligne ~854)")
        print("2. Vérifier que le rescan de sécurité ne masque pas les dates")
        print("3. Vérifier que le NER ne détecte pas les dates de consultation")

    if ratio > 3:
        print("4. Réduire les variations de propagation globale")
        print(" Actuellement: 4 variations (/, ., -, espace)")
        print(" Recommandation: 2 variations (/, .)")

    print()


if __name__ == "__main__":
    main()
|
||||
144
tools/test_phase1_corrections.py
Executable file
144
tools/test_phase1_corrections.py
Executable file
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test des corrections Phase 1 sur un échantillon de documents.
|
||||
Vérifie que:
|
||||
1. Les termes médicaux structurels ne sont PAS masqués
|
||||
2. Les médicaments ne sont PAS masqués
|
||||
3. Les dates de consultation ne sont PAS masquées (seules les dates de naissance)
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '.')
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def test_phase1_corrections():
    """Run the three Phase-1 correction checks on a sample of PDF documents.

    For up to five PDFs under ``tests/ground_truth/pdfs``, anonymizes each one
    via :func:`process_pdf` and verifies that:
      1. structural medical terms are NOT masked,
      2. medication names are NOT masked,
      3. no generic ``[DATE]`` masks remain (only ``[DATE_NAISSANCE]``),
    then prints a per-correction summary and a global success rate.

    Returns:
        None. Results are reported on stdout; anonymized outputs are written
        to ``tests/ground_truth/pdfs/phase1_test``.
    """
    # Look for test documents.
    test_dir = Path("tests/ground_truth/pdfs")

    # Select 5 documents for the test. NOTE(review): the .annotations.json
    # filter is redundant — glob("*.pdf") already excludes .json files.
    pdf_files = [f for f in test_dir.glob("*.pdf") if not f.name.endswith('.annotations.json')][:5]

    if not pdf_files:
        print("❌ Aucun document de test trouvé")
        return

    print(f"Test des corrections Phase 1 sur {len(pdf_files)} documents...")
    print("=" * 80)

    output_dir = Path("tests/ground_truth/pdfs/phase1_test")
    output_dir.mkdir(parents=True, exist_ok=True)

    results = {
        'medical_terms_preserved': 0,
        'medications_preserved': 0,
        'dates_reduced': 0,
        'total_docs': 0
    }

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        try:
            # Anonymize the document.
            result = process_pdf(
                pdf_path,
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=Path("config/dictionnaires.yml")
            )

            # Read back the anonymized text.
            text_file = Path(result['text'])
            anonymized_text = text_file.read_text(encoding='utf-8')

            # Test 1: structural medical terms must survive anonymization.
            medical_terms = [
                "Chef de service",
                "Chef de clinique",
                "Praticien hospitalier",
                "service de",
                "unité de"
            ]
            medical_preserved = sum(
                1 for term in medical_terms
                if term.lower() in anonymized_text.lower()
            )

            # Test 2: medication names must survive anonymization.
            medications = [
                "IDACIO",
                "Salazopyrine",
                "Infliximab",
                "Apranax"
            ]
            medications_preserved = sum(
                1 for med in medications
                if med.lower() in anonymized_text.lower()
            )

            # Test 3: count [DATE] vs [DATE_NAISSANCE] masks.
            date_masks = len(re.findall(r'\[DATE\]', anonymized_text))
            date_naissance_masks = len(re.findall(r'\[DATE_NAISSANCE\]', anonymized_text))

            print(f" ✓ Termes médicaux préservés: {medical_preserved}/{len(medical_terms)}")
            print(f" ✓ Médicaments préservés: {medications_preserved}/{len(medications)}")
            print(f" ✓ [DATE]: {date_masks}, [DATE_NAISSANCE]: {date_naissance_masks}")

            # Correction succeeded when no generic [DATE] mask remains.
            if date_masks == 0:
                results['dates_reduced'] += 1
                print(" ✅ Correction dates: OK (0 [DATE])")
            else:
                print(f" ⚠️ Correction dates: {date_masks} [DATE] restants")

            if medical_preserved > 0:
                results['medical_terms_preserved'] += 1

            if medications_preserved > 0:
                results['medications_preserved'] += 1

            results['total_docs'] += 1

        except Exception as e:
            # Best-effort: report the failure and keep testing the other docs.
            print(f" ❌ Erreur: {e}")

    # Summary.
    print("\n" + "=" * 80)
    print("RÉSUMÉ DES CORRECTIONS PHASE 1")
    print("=" * 80)

    print(f"\nDocuments testés: {results['total_docs']}")
    print("\n✅ Correction 1.1 (Termes médicaux):")
    print(f" Documents avec termes préservés: {results['medical_terms_preserved']}/{results['total_docs']}")

    print("\n✅ Correction 1.2 (Médicaments):")
    print(f" Documents avec médicaments préservés: {results['medications_preserved']}/{results['total_docs']}")

    print("\n✅ Correction 1.3 (Dates):")
    print(f" Documents avec [DATE]=0: {results['dates_reduced']}/{results['total_docs']}")

    # BUGFIX: if every document raised inside the loop, total_docs stays 0
    # even though pdf_files was non-empty, and the original computation
    # crashed with ZeroDivisionError. Report 0.0% instead.
    passed_checks = (
        results['medical_terms_preserved']
        + results['medications_preserved']
        + results['dates_reduced']
    )
    total_checks = results['total_docs'] * 3
    success_rate = passed_checks / total_checks * 100 if total_checks else 0.0

    print(f"\n📊 Taux de succès global: {success_rate:.1f}%")

    if success_rate >= 80:
        print("\n✅ PHASE 1 CORRECTIONS VALIDÉES")
    else:
        print("\n⚠️ PHASE 1 CORRECTIONS PARTIELLES - Vérification manuelle requise")

    print(f"\n📁 Résultats dans: {output_dir}")


if __name__ == "__main__":
    test_phase1_corrections()
|
||||
Reference in New Issue
Block a user