#!/usr/bin/env python3
"""Analyse masked dates to understand over-masking.

Compares the date entries recorded in the anonymisation audit files with
the date placeholders actually present in the pseudonymised text files.
"""

import json
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Optional

# Directory scanned when main() is called without an explicit directory.
DEFAULT_PROD_DIR = Path(
    "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise"
)

# Audit "kind" value -> key of the result list that collects those entries.
# Any other kind containing "DATE" is only counted in "total_dates".
_BUCKET_BY_KIND = {
    "DATE_NAISSANCE": "date_naissance",
    "DATE_NAISSANCE_GLOBAL": "date_naissance_global",
    "DATE": "date_generic",
}


def analyze_dates_in_audit(audit_path: Path, text_path: Path) -> dict:
    """Collect the date entries of one audit file and count masks in its text.

    Args:
        audit_path: JSON Lines audit file; each non-blank line is parsed as
            one JSON object whose "kind", "original" and "page" keys are read.
        text_path: Pseudonymised text file in which the literal placeholders
            "[DATE_NAISSANCE]" and "[DATE]" are counted.

    Returns:
        A dict with the keys "date_naissance", "date_naissance_global" and
        "date_generic" (lists of {"value", "page", "type"} dicts),
        "total_dates" (number of entries whose kind contains "DATE") and
        "masked_in_text" (placeholder counts: "date_naissance",
        "date_generic", "total").

    Raises:
        json.JSONDecodeError: If a non-blank audit line is not valid JSON.
        OSError: If either file cannot be opened.
    """
    dates_info = {
        "date_naissance": [],
        "date_naissance_global": [],
        "date_generic": [],
        "total_dates": 0,
    }

    # Load the audit, one JSON object per line.
    with open(audit_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue

            entry = json.loads(line)
            pii_type = entry.get("kind")
            # A JSON null (or any non-string) kind cannot be a date kind;
            # testing `"DATE" in None` would raise TypeError.
            if not isinstance(pii_type, str) or "DATE" not in pii_type:
                continue

            dates_info["total_dates"] += 1
            bucket = _BUCKET_BY_KIND.get(pii_type)
            if bucket is not None:
                dates_info[bucket].append({
                    "value": entry.get("original", ""),
                    "page": entry.get("page", -1),
                    "type": pii_type,
                })

    # Load the anonymised text to count the placeholders it contains.
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    date_naissance_count = text.count("[DATE_NAISSANCE]")
    date_count = text.count("[DATE]")
    dates_info["masked_in_text"] = {
        "date_naissance": date_naissance_count,
        "date_generic": date_count,
        "total": date_naissance_count + date_count,
    }

    return dates_info


def _print_banner(title: str) -> None:
    """Print a section title framed by two 80-character rules and a blank line."""
    print("=" * 80)
    print(title)
    print("=" * 80)
    print()


def _print_examples(label: str, entries: list) -> None:
    """Print up to three example entries under the given label, if any exist."""
    if not entries:
        return
    print(f" Exemples {label}:")
    for d in entries[:3]:
        print(f" • {d['value']} (page {d['page']})")


def _print_document_report(file_name: str, dates_info: dict) -> None:
    """Print the per-document counts and a few example dates."""
    masked = dates_info["masked_in_text"]
    print(f"📄 {file_name}")
    print(f" Total dates dans audit: {dates_info['total_dates']}")
    print(f" - DATE_NAISSANCE: {len(dates_info['date_naissance'])}")
    print(f" - DATE_NAISSANCE_GLOBAL: {len(dates_info['date_naissance_global'])}")
    print(f" - DATE générique: {len(dates_info['date_generic'])}")
    print(" Masques dans le texte:")
    print(f" - [DATE_NAISSANCE]: {masked['date_naissance']}")
    print(f" - [DATE]: {masked['date_generic']}")
    print()

    # Show a few example dates.
    _print_examples("DATE_NAISSANCE", dates_info["date_naissance"])
    _print_examples("DATE_NAISSANCE_GLOBAL", dates_info["date_naissance_global"])
    print()


def main(prod_dir: Optional[Path] = None, max_docs: int = 5) -> None:
    """Print a masked-date report for the first audit files of a directory.

    Args:
        prod_dir: Directory containing "*.audit.jsonl" files and their
            "*.pseudonymise.txt" counterparts. Defaults to DEFAULT_PROD_DIR.
        max_docs: Number of audit files (in sorted name order) to consider.
            Audit files without a matching text file are skipped but still
            count toward this limit.
    """
    prod_dir = DEFAULT_PROD_DIR if prod_dir is None else Path(prod_dir)

    _print_banner("ANALYSE DES DATES MASQUÉES")

    all_dates = []

    # Analyse the first `max_docs` documents.
    for audit_file in sorted(prod_dir.glob("*.audit.jsonl"))[:max_docs]:
        text_file = audit_file.with_name(
            audit_file.name.replace(".audit.jsonl", ".pseudonymise.txt")
        )
        if not text_file.exists():
            continue

        dates_info = analyze_dates_in_audit(audit_file, text_file)
        all_dates.append({"file": audit_file.name, "info": dates_info})
        _print_document_report(audit_file.name, dates_info)

    if not all_dates:
        # Without this hint the all-zero report below looks like a clean
        # result although nothing was actually checked.
        print(f"Aucun document analysé dans {prod_dir}", file=sys.stderr)

    # Global statistics.
    _print_banner("STATISTIQUES GLOBALES")

    infos = [d["info"] for d in all_dates]
    total_dates = sum(i["total_dates"] for i in infos)
    total_date_naissance = sum(len(i["date_naissance"]) for i in infos)
    total_date_naissance_global = sum(len(i["date_naissance_global"]) for i in infos)
    total_date_generic = sum(len(i["date_generic"]) for i in infos)
    total_masked_dn = sum(i["masked_in_text"]["date_naissance"] for i in infos)
    total_masked_d = sum(i["masked_in_text"]["date_generic"] for i in infos)

    print(f"Total dates dans audits: {total_dates}")
    print(f" - DATE_NAISSANCE: {total_date_naissance}")
    print(f" - DATE_NAISSANCE_GLOBAL: {total_date_naissance_global}")
    print(f" - DATE générique: {total_date_generic}")
    print()
    print(f"Total masques dans textes: {total_masked_dn + total_masked_d}")
    print(f" - [DATE_NAISSANCE]: {total_masked_dn}")
    print(f" - [DATE]: {total_masked_d}")
    print()

    # Analysis.
    _print_banner("ANALYSE")

    if total_date_generic > 0:
        print("⚠️ PROBLÈME: DATE générique détecté !")
        print(f" {total_date_generic} dates génériques dans les audits")
        print(" Cause: RE_DATE n'est PAS désactivé ou NER détecte des dates")
        print()
    else:
        print("✅ DATE générique: 0 (correct, désactivé)")
        print()

    if total_masked_d > 0:
        print("⚠️ PROBLÈME: [DATE] dans le texte !")
        print(f" {total_masked_d} masques [DATE] dans les textes")
        print(" Cause: Propagation globale ou rescan de sécurité")
        print()
    else:
        print("✅ [DATE] dans texte: 0 (correct)")
        print()

    # NOTE(review): with zero DATE_NAISSANCE audit entries the ratio is 0 even
    # if the texts contain [DATE_NAISSANCE] masks, so that case is reported as
    # acceptable — confirm this is intended.
    ratio = total_masked_dn / total_date_naissance if total_date_naissance > 0 else 0
    print(f"Ratio masques/dates de naissance: {ratio:.1f}x")

    if ratio > 3:
        print("⚠️ PROBLÈME: Trop de masques par rapport aux dates de naissance")
        print(" Cause probable: Propagation globale trop agressive")
        print(" Chaque date de naissance génère plusieurs variations")
    else:
        print("✅ Ratio acceptable")

    print()

    # Recommendations.
    _print_banner("RECOMMANDATIONS")

    if total_date_generic > 0 or total_masked_d > 0:
        print("1. Vérifier que RE_DATE est bien désactivé (ligne ~854)")
        print("2. Vérifier que le rescan de sécurité ne masque pas les dates")
        print("3. Vérifier que le NER ne détecte pas les dates de consultation")

    if ratio > 3:
        print("4. Réduire les variations de propagation globale")
        print(" Actuellement: 4 variations (/, ., -, espace)")
        print(" Recommandation: 2 variations (/, .)")

    print()


if __name__ == "__main__":
    # Optional first CLI argument: directory to analyse.
    main(Path(sys.argv[1]) if len(sys.argv) > 1 else None)