#!/usr/bin/env python3
"""Analyse the real quality of the anonymised (pseudonymised) documents.

Scans ``*.pseudonymise.txt`` files for residual PII leaks, summarises the
``*.audit.jsonl`` detection logs, and reports a masking/readability ratio.
"""

import json
import re
from collections import Counter, defaultdict
from pathlib import Path

# Directory holding the pseudonymised documents and their audit logs.
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")

# Critical leak patterns, compiled once at import time so repeated calls to
# analyze_leaks() do not recompile every regex for every file.
LEAK_PATTERNS = {
    "date_naissance_context": re.compile(
        r"(?:n[ée]+\s+le|DDN|date\s+de\s+naissance)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}",
        re.IGNORECASE,
    ),
    "nom_propre": re.compile(
        r"\b[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]{2,}\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}\b"
    ),
    "telephone": re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b"),
    "email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
    "adresse": re.compile(
        r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]",
        re.IGNORECASE,
    ),
    "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
}


def analyze_leaks(txt_file):
    """Detect potential PII leaks remaining in a pseudonymised text file.

    Args:
        txt_file: path to a UTF-8 text file.

    Returns:
        A list of dicts, one per pattern that matched, each with keys
        ``"type"`` (pattern name), ``"count"`` (number of matches) and
        ``"examples"`` (up to the first 3 matched strings).
    """
    content = Path(txt_file).read_text(encoding="utf-8")

    leaks = []
    for pattern_name, pattern in LEAK_PATTERNS.items():
        matches = pattern.findall(content)
        if matches:
            leaks.append({
                "type": pattern_name,
                "count": len(matches),
                "examples": matches[:3],  # keep only the first 3 examples
            })
    return leaks


def analyze_audit(audit_file):
    """Summarise an audit JSONL file (one JSON detection record per line).

    Malformed JSON lines are skipped.  A parsed record missing the
    ``"kind"`` key still raises ``KeyError`` (unchanged behaviour).

    Returns:
        Dict with ``"total"`` (number of parsed detections), ``"by_type"``
        (detection counts per kind, as a plain dict) and ``"detections"``
        (the parsed records themselves).
    """
    detections = []
    with open(audit_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                detections.append(json.loads(line))
            except json.JSONDecodeError:
                # Was a bare `except:` — narrowed so real failures
                # (KeyboardInterrupt, MemoryError, ...) are not swallowed.
                continue

    type_counts = Counter(d["kind"] for d in detections)
    return {
        "total": len(detections),
        "by_type": dict(type_counts),
        "detections": detections,
    }


def _report_leaks(txt_files):
    """Print the residual-leak report for the first 10 text files."""
    print("=" * 80)
    print("🔍 ANALYSE DES FUITES")
    print("=" * 80)

    total_leaks = defaultdict(int)
    files_with_leaks = []

    for txt_file in txt_files[:10]:  # analyse only the first 10 files
        leaks = analyze_leaks(txt_file)
        if leaks:
            files_with_leaks.append({"file": txt_file.name, "leaks": leaks})
            for leak in leaks:
                total_leaks[leak["type"]] += leak["count"]

    if files_with_leaks:
        print(f"\n⚠️ {len(files_with_leaks)} fichiers avec fuites potentielles:")
        for file_info in files_with_leaks:
            print(f"\n 📄 {file_info['file']}")
            for leak in file_info['leaks']:
                print(f" - {leak['type']}: {leak['count']} occurrences")
                if leak['examples']:
                    print(f" Exemples: {leak['examples'][:2]}")
    else:
        print("✅ Aucune fuite détectée dans les 10 premiers fichiers")

    print("\n📊 Total fuites par type:")
    for leak_type, count in sorted(total_leaks.items(), key=lambda x: x[1], reverse=True):
        print(f" - {leak_type}: {count}")


def _report_detections(audit_files):
    """Print aggregate PII-detection statistics for the first 10 audit files."""
    print("\n" + "=" * 80)
    print("📊 ANALYSE DES DÉTECTIONS")
    print("=" * 80)

    all_detections = Counter()
    total_docs = 0
    for audit_file in audit_files[:10]:  # analyse only the first 10 files
        audit_data = analyze_audit(audit_file)
        all_detections.update(audit_data["by_type"])
        total_docs += 1

    # Guard: the original raised ZeroDivisionError when no audit file existed.
    if total_docs == 0:
        print("\n⚠️ Aucun fichier audit à analyser")
        return

    total_pii = sum(all_detections.values())  # hoisted: was recomputed per loop iteration
    print(f"\n📈 Détections sur {total_docs} documents:")
    print(f" Total: {total_pii} PII détectés")
    print(f" Moyenne: {total_pii / total_docs:.1f} PII/document")
    print()
    print(" Par type:")
    for pii_type, count in sorted(all_detections.items(), key=lambda x: x[1], reverse=True):
        pct = (count / total_pii) * 100
        print(f" - {pii_type}: {count} ({pct:.1f}%)")


def _report_readability(txt_files):
    """Print the masking-ratio (readability) report for the first 3 text files."""
    print("\n" + "=" * 80)
    print("📖 ANALYSE DE LA LISIBILITÉ")
    print("=" * 80)

    for txt_file in txt_files[:3]:  # analyse only the first 3 files
        content = Path(txt_file).read_text(encoding="utf-8")

        # Placeholders look like [NOM], [DATE_NAISSANCE], ...
        placeholder_count = len(re.findall(r'\[([A-Z_]+)\]', content))
        word_count = len(re.findall(r'\b\w+\b', content))

        # Share of tokens that were replaced by a placeholder.
        mask_ratio = (placeholder_count / word_count) * 100 if word_count > 0 else 0

        print(f"\n 📄 {txt_file.name}")
        print(f" - Mots: {word_count}")
        print(f" - Placeholders: {placeholder_count}")
        print(f" - Ratio masquage: {mask_ratio:.1f}%")

        if mask_ratio > 30:
            print(" ⚠️ Ratio de masquage élevé (>30%) - lisibilité compromise")
        elif mask_ratio > 20:
            print(" ⚠️ Ratio de masquage modéré (>20%)")
        else:
            print(" ✅ Ratio de masquage acceptable (<20%)")


def analyze_quality():
    """Run the full quality analysis: leaks, detections, then readability."""
    txt_files = list(ANON_DIR.glob("*.pseudonymise.txt"))
    audit_files = list(ANON_DIR.glob("*.audit.jsonl"))

    print(f"📁 Répertoire: {ANON_DIR}")
    print(f"📄 Fichiers texte: {len(txt_files)}")
    print(f"📋 Fichiers audit: {len(audit_files)}")
    print()

    _report_leaks(txt_files)
    _report_detections(audit_files)
    _report_readability(txt_files)


if __name__ == "__main__":
    analyze_quality()