anonymisation/tools/analyze_real_quality.py
#!/usr/bin/env python3
"""Analyse de la qualité réelle des documents anonymisés."""
import json
import re
from pathlib import Path
from collections import Counter, defaultdict
# Directory containing the anonymised documents
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")


def analyze_leaks(txt_file):
    """Detect potential PII leaks in a plain-text file."""
    with open(txt_file, 'r', encoding='utf-8') as f:
        content = f.read()

    leaks = []
    # Regex patterns for critical leak types
    patterns = {
        "date_naissance_context": re.compile(r"(?:n[ée]+\s+le|DDN|date\s+de\s+naissance)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
        "nom_propre": re.compile(r"\b[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]{2,}\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}\b"),
        "telephone": re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b"),
        "email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
        "adresse": re.compile(r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]", re.IGNORECASE),
        "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
    }

    for pattern_name, pattern in patterns.items():
        matches = pattern.findall(content)
        if matches:
            leaks.append({
                "type": pattern_name,
                "count": len(matches),
                "examples": matches[:3]  # Keep only the first 3 examples
            })
    return leaks
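

# A minimal self-check sketch, added here as an assumption (it is not part of
# the original tool): the sample sentence and temporary file below are made up
# purely to exercise the leak regexes in analyze_leaks() before pointing the
# script at real data.
def _smoke_test_analyze_leaks():
    """Write a tiny sample file and check that analyze_leaks() flags it."""
    import tempfile
    sample = "Patient né le 01/02/1990, joignable au 06 12 34 56 78."
    with tempfile.TemporaryDirectory() as tmp:
        sample_file = Path(tmp) / "exemple.pseudonymise.txt"
        sample_file.write_text(sample, encoding="utf-8")
        leaks = analyze_leaks(sample_file)
        # Both the date-of-birth and telephone patterns should fire here
        assert {leak["type"] for leak in leaks} >= {"date_naissance_context", "telephone"}
        return leaks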


def analyze_audit(audit_file):
    """Parse the JSONL audit file to see what was detected."""
    detections = []
    with open(audit_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                detections.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip malformed audit lines rather than aborting the analysis
                continue

    # Count detections by PII type
    type_counts = Counter(d['kind'] for d in detections)
    return {
        "total": len(detections),
        "by_type": dict(type_counts),
        "detections": detections,
    }


def analyze_quality():
    """Analyse the overall quality of the anonymised documents."""
    txt_files = list(ANON_DIR.glob("*.pseudonymise.txt"))
    audit_files = list(ANON_DIR.glob("*.audit.jsonl"))

    print(f"📁 Directory: {ANON_DIR}")
    print(f"📄 Text files: {len(txt_files)}")
    print(f"📋 Audit files: {len(audit_files)}")
    print()

    # Leak analysis
    print("=" * 80)
    print("🔍 LEAK ANALYSIS")
    print("=" * 80)
    total_leaks = defaultdict(int)
    files_with_leaks = []
    for txt_file in txt_files[:10]:  # Analyse the first 10 files
        leaks = analyze_leaks(txt_file)
        if leaks:
            files_with_leaks.append({
                "file": txt_file.name,
                "leaks": leaks
            })
            for leak in leaks:
                total_leaks[leak["type"]] += leak["count"]

    if files_with_leaks:
        print(f"\n⚠️ {len(files_with_leaks)} files with potential leaks:")
        for file_info in files_with_leaks:
            print(f"\n 📄 {file_info['file']}")
            for leak in file_info['leaks']:
                print(f" - {leak['type']}: {leak['count']} occurrences")
                if leak['examples']:
                    print(f" Examples: {leak['examples'][:2]}")
    else:
        print("✅ No leaks detected in the first 10 files")

    print("\n📊 Total leaks by type:")
    for leak_type, count in sorted(total_leaks.items(), key=lambda x: x[1], reverse=True):
        print(f" - {leak_type}: {count}")

    # Detection analysis
    print("\n" + "=" * 80)
    print("📊 DETECTION ANALYSIS")
    print("=" * 80)
    all_detections = Counter()
    total_docs = 0
    for audit_file in audit_files[:10]:  # Analyse the first 10 files
        audit_data = analyze_audit(audit_file)
        all_detections.update(audit_data["by_type"])
        total_docs += 1

    total_detected = sum(all_detections.values())
    if total_docs:
        print(f"\n📈 Detections across {total_docs} documents:")
        print(f" Total: {total_detected} PII detected")
        print(f" Average: {total_detected / total_docs:.1f} PII/document")
        print()
        print(" By type:")
        for pii_type, count in sorted(all_detections.items(), key=lambda x: x[1], reverse=True):
            pct = (count / total_detected) * 100 if total_detected else 0
            print(f" - {pii_type}: {count} ({pct:.1f}%)")
    else:
        print("\n⚠️ No audit files found")

    # Readability analysis
    print("\n" + "=" * 80)
    print("📖 READABILITY ANALYSIS")
    print("=" * 80)
    for txt_file in txt_files[:3]:  # Analyse the first 3 files
        with open(txt_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count anonymisation placeholders of the form [UPPER_CASE]
        placeholders = re.findall(r'\[([A-Z_]+)\]', content)
        placeholder_count = len(placeholders)
        # Count words
        words = re.findall(r'\b\w+\b', content)
        word_count = len(words)
        # Masking ratio: placeholders per 100 words
        mask_ratio = (placeholder_count / word_count) * 100 if word_count > 0 else 0

        print(f"\n 📄 {txt_file.name}")
        print(f" - Words: {word_count}")
        print(f" - Placeholders: {placeholder_count}")
        print(f" - Masking ratio: {mask_ratio:.1f}%")

        # Check whether the text is still readable
        if mask_ratio > 30:
            print(" ⚠️ High masking ratio (>30%) - readability compromised")
        elif mask_ratio > 20:
            print(" ⚠️ Moderate masking ratio (>20%)")
        else:
            print(" ✅ Acceptable masking ratio (<20%)")


if __name__ == "__main__":
    analyze_quality()
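
# Usage sketch (the anonymised corpus path is hard-coded in ANON_DIR above):
#   python3 anonymisation/tools/analyze_real_quality.py
# The script prints three report sections: leaks, detections, readability.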