analysis: Analyse réelle de la qualité - Identification des faux positifs médicaux

This commit is contained in:
2026-03-02 22:41:14 +01:00
parent 85e19af655
commit eb797a4761
3 changed files with 519 additions and 0 deletions

160
tools/analyze_real_quality.py Executable file
View File

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""Analysis of the real quality of anonymized documents."""
import json
import re
from pathlib import Path
from collections import Counter, defaultdict
# Directory holding the anonymized documents (hard-coded local path).
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
# Critical-leak patterns, compiled once at import time instead of on every call.
_LEAK_PATTERNS = {
    "date_naissance_context": re.compile(r"(?:n[ée]+\s+le|DDN|date\s+de\s+naissance)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
    "nom_propre": re.compile(r"\b[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]{2,}\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}\b"),
    "telephone": re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b"),
    "email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
    "adresse": re.compile(r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]", re.IGNORECASE),
    "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
}


def analyze_leaks(txt_file):
    """Detect potential PII leaks in a text file.

    Args:
        txt_file: path to a UTF-8 text file (str or Path).

    Returns:
        list[dict]: one entry per leak pattern that matched, with keys
        ``"type"`` (pattern name), ``"count"`` (number of matches) and
        ``"examples"`` (up to the first 3 matches).
    """
    content = Path(txt_file).read_text(encoding="utf-8")
    leaks = []
    for pattern_name, pattern in _LEAK_PATTERNS.items():
        matches = pattern.findall(content)
        if matches:
            leaks.append({
                "type": pattern_name,
                "count": len(matches),
                "examples": matches[:3],  # first 3 examples only
            })
    return leaks
def analyze_audit(audit_file):
    """Parse a JSONL audit file and summarise the detections it records.

    Args:
        audit_file: path to a JSON-lines file, one detection object
            (with at least a ``"kind"`` key) per line.

    Returns:
        dict with keys ``"total"`` (number of parsed detections),
        ``"by_type"`` (mapping kind -> count) and ``"detections"``
        (the parsed records themselves).
    """
    detections = []
    with open(audit_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip malformed lines rather than aborting the whole analysis,
            # but only swallow JSON errors — anything else is a real bug
            # (the original bare `except` hid every failure).
            try:
                detections.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    # Count detections per kind
    type_counts = Counter(d['kind'] for d in detections)
    return {
        "total": len(detections),
        "by_type": dict(type_counts),
        "detections": detections,
    }
def analyze_quality():
    """Analyse the overall quality of the anonymized documents.

    Prints three report sections to stdout: potential PII leaks,
    audit detections, and readability (placeholder/masking ratio).
    Only samples the first few files of each kind to keep it fast.
    """
    txt_files = list(ANON_DIR.glob("*.pseudonymise.txt"))
    audit_files = list(ANON_DIR.glob("*.audit.jsonl"))
    print(f"📁 Répertoire: {ANON_DIR}")
    print(f"📄 Fichiers texte: {len(txt_files)}")
    print(f"📋 Fichiers audit: {len(audit_files)}")
    print()
    # --- Leak analysis ---
    print("=" * 80)
    print("🔍 ANALYSE DES FUITES")
    print("=" * 80)
    total_leaks = defaultdict(int)
    files_with_leaks = []
    for txt_file in txt_files[:10]:  # only the first 10 files
        leaks = analyze_leaks(txt_file)
        if leaks:
            files_with_leaks.append({
                "file": txt_file.name,
                "leaks": leaks
            })
            for leak in leaks:
                total_leaks[leak["type"]] += leak["count"]
    if files_with_leaks:
        print(f"\n⚠️ {len(files_with_leaks)} fichiers avec fuites potentielles:")
        for file_info in files_with_leaks:
            print(f"\n 📄 {file_info['file']}")
            for leak in file_info['leaks']:
                print(f" - {leak['type']}: {leak['count']} occurrences")
                if leak['examples']:
                    print(f" Exemples: {leak['examples'][:2]}")
    else:
        print("✅ Aucune fuite détectée dans les 10 premiers fichiers")
    print(f"\n📊 Total fuites par type:")
    for leak_type, count in sorted(total_leaks.items(), key=lambda x: x[1], reverse=True):
        print(f" - {leak_type}: {count}")
    # --- Detection analysis ---
    print("\n" + "=" * 80)
    print("📊 ANALYSE DES DÉTECTIONS")
    print("=" * 80)
    all_detections = Counter()
    total_docs = 0
    for audit_file in audit_files[:10]:  # only the first 10 files
        audit_data = analyze_audit(audit_file)
        all_detections.update(audit_data["by_type"])
        total_docs += 1
    # Guard against ZeroDivisionError when no audit file is present
    # (the original crashed on the per-document average in that case).
    if total_docs:
        total_pii = sum(all_detections.values())  # hoisted: was recomputed per type
        print(f"\n📈 Détections sur {total_docs} documents:")
        print(f" Total: {total_pii} PII détectés")
        print(f" Moyenne: {total_pii / total_docs:.1f} PII/document")
        print()
        print(" Par type:")
        # If total_pii is 0 the loop body never runs, so the division is safe.
        for pii_type, count in sorted(all_detections.items(), key=lambda x: x[1], reverse=True):
            pct = (count / total_pii) * 100
            print(f" - {pii_type}: {count} ({pct:.1f}%)")
    # --- Readability analysis ---
    print("\n" + "=" * 80)
    print("📖 ANALYSE DE LA LISIBILITÉ")
    print("=" * 80)
    for txt_file in txt_files[:3]:  # only the first 3 files
        content = txt_file.read_text(encoding='utf-8')
        # Count anonymization placeholders like [NOM], [DATE_NAISSANCE]
        placeholder_count = len(re.findall(r'\[([A-Z_]+)\]', content))
        # Count words
        word_count = len(re.findall(r'\b\w+\b', content))
        # Share of tokens that are placeholders, as a percentage
        mask_ratio = (placeholder_count / word_count) * 100 if word_count > 0 else 0
        print(f"\n 📄 {txt_file.name}")
        print(f" - Mots: {word_count}")
        print(f" - Placeholders: {placeholder_count}")
        print(f" - Ratio masquage: {mask_ratio:.1f}%")
        # Flag documents whose masking ratio compromises readability
        if mask_ratio > 30:
            print(f" ⚠️ Ratio de masquage élevé (>{30}%) - lisibilité compromise")
        elif mask_ratio > 20:
            print(f" ⚠️ Ratio de masquage modéré (>{20}%)")
        else:
            print(f" ✅ Ratio de masquage acceptable (<{20}%)")
# Run the full quality report when executed as a script.
if __name__ == "__main__":
    analyze_quality()

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Compare an original document with its anonymized version."""
import sys
from pathlib import Path
# Make the project root importable (this script lives in a subdirectory).
sys.path.insert(0, str(Path(__file__).parent.parent))
import pdfplumber
# Original document and its anonymized counterpart (hard-coded local paths).
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/102_23056463/CRH 23056364.pdf")
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")
print("="*80)
print("COMPARAISON ORIGINAL vs ANONYMISÉ")
print("="*80)
# Extract the original text page by page; extract_text() can return None
# for image-only pages, hence the `or ""` fallback.
print("\n📄 Extraction du texte original...")
with pdfplumber.open(original_pdf) as pdf:
    original_text = "\n".join([page.extract_text() or "" for page in pdf.pages])
# Read the anonymized text
with open(anonymized_txt, 'r', encoding='utf-8') as f:
    anonymized_text = f.read()
print(f"\n📊 Longueur texte original: {len(original_text)} caractères")
print(f"📊 Longueur texte anonymisé: {len(anonymized_text)} caractères")
# Print the first 100 lines of each version for a side-by-side eyeball check
print("\n" + "="*80)
print("TEXTE ORIGINAL (100 premières lignes):")
print("="*80)
print("\n".join(original_text.split('\n')[:100]))
print("\n" + "="*80)
print("TEXTE ANONYMISÉ (100 premières lignes):")
print("="*80)
print("\n".join(anonymized_text.split('\n')[:100]))