analysis: Analyse réelle de la qualité - Identification des faux positifs médicaux
This commit is contained in:
160
tools/analyze_real_quality.py
Executable file
160
tools/analyze_real_quality.py
Executable file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyse de la qualité réelle des documents anonymisés."""
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
# Directory containing the anonymized output documents.
# NOTE(review): hard-coded absolute user path — confirm it matches the machine this runs on.
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
|
||||
def analyze_leaks(txt_file):
    """Scan an anonymized text file for potential PII leaks.

    Opens *txt_file*, runs a fixed battery of leak-detection regexes over
    its full contents, and reports every pattern that matched at least once.

    Args:
        txt_file: Path (or path string) of the UTF-8 text file to scan.

    Returns:
        A list of dicts, one per matching pattern, each with keys
        ``type`` (pattern name), ``count`` (number of matches) and
        ``examples`` (up to the first three matched strings).
    """
    with open(txt_file, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # Critical leak patterns, as (name, compiled regex) pairs.
    leak_patterns = (
        ("date_naissance_context",
         re.compile(r"(?:n[ée]+\s+le|DDN|date\s+de\s+naissance)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE)),
        ("nom_propre",
         re.compile(r"\b[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]{2,}\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}\b")),
        ("telephone",
         re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b")),
        ("email",
         re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")),
        ("adresse",
         re.compile(r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]", re.IGNORECASE)),
        ("chcb",
         re.compile(r"\bCHCB\b", re.IGNORECASE)),
    )

    findings = []
    for name, regex in leak_patterns:
        hits = regex.findall(text)
        if not hits:
            continue
        findings.append({
            "type": name,
            "count": len(hits),
            "examples": hits[:3],  # keep only the first three samples
        })
    return findings
|
||||
|
||||
def analyze_audit(audit_file):
    """Parse a JSONL audit file and summarise what was detected.

    Each line of *audit_file* is expected to be a JSON object describing
    one PII detection with at least a ``kind`` field. Lines that are not
    valid JSON (blank or truncated lines) are skipped.

    Args:
        audit_file: Path (or path string) of the UTF-8 JSONL audit file.

    Returns:
        Dict with keys ``total`` (number of parsed detections),
        ``by_type`` (mapping of detection kind -> count) and
        ``detections`` (the raw list of parsed objects).
    """
    detections = []
    with open(audit_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                detections.append(json.loads(line))
            except json.JSONDecodeError:
                # BUG FIX: was a bare `except: pass`, which also swallowed
                # KeyboardInterrupt/MemoryError and hid real bugs. Only
                # malformed JSON lines should be skipped.
                continue

    # Tally detections per kind.
    type_counts = Counter(d['kind'] for d in detections)

    return {
        "total": len(detections),
        "by_type": dict(type_counts),
        "detections": detections,
    }
|
||||
|
||||
def analyze_quality(anon_dir=None):
    """Print a quality report for the anonymized documents.

    Three sections are printed: potential PII leaks in the first ten text
    files, detection statistics aggregated from the first ten audit files,
    and a readability estimate (placeholder/word ratio) for the first
    three text files.

    Args:
        anon_dir: Optional directory containing the ``*.pseudonymise.txt``
            and ``*.audit.jsonl`` files. Defaults to the module-level
            ``ANON_DIR`` (backward compatible: calling with no argument
            behaves exactly as before).
    """
    base_dir = Path(anon_dir) if anon_dir is not None else ANON_DIR

    txt_files = list(base_dir.glob("*.pseudonymise.txt"))
    audit_files = list(base_dir.glob("*.audit.jsonl"))

    print(f"📁 Répertoire: {base_dir}")
    print(f"📄 Fichiers texte: {len(txt_files)}")
    print(f"📋 Fichiers audit: {len(audit_files)}")
    print()

    # --- Leak analysis -------------------------------------------------
    print("=" * 80)
    print("🔍 ANALYSE DES FUITES")
    print("=" * 80)

    total_leaks = defaultdict(int)
    files_with_leaks = []

    for txt_file in txt_files[:10]:  # analyze only the first 10 files
        leaks = analyze_leaks(txt_file)
        if leaks:
            files_with_leaks.append({
                "file": txt_file.name,
                "leaks": leaks
            })
            for leak in leaks:
                total_leaks[leak["type"]] += leak["count"]

    if files_with_leaks:
        print(f"\n⚠️ {len(files_with_leaks)} fichiers avec fuites potentielles:")
        for file_info in files_with_leaks:
            print(f"\n 📄 {file_info['file']}")
            for leak in file_info['leaks']:
                print(f" - {leak['type']}: {leak['count']} occurrences")
                if leak['examples']:
                    print(f" Exemples: {leak['examples'][:2]}")
    else:
        print("✅ Aucune fuite détectée dans les 10 premiers fichiers")

    print(f"\n📊 Total fuites par type:")
    for leak_type, count in sorted(total_leaks.items(), key=lambda x: x[1], reverse=True):
        print(f" - {leak_type}: {count}")

    # --- Detection statistics ------------------------------------------
    print("\n" + "=" * 80)
    print("📊 ANALYSE DES DÉTECTIONS")
    print("=" * 80)

    all_detections = Counter()
    total_docs = 0

    for audit_file in audit_files[:10]:  # analyze only the first 10 files
        audit_data = analyze_audit(audit_file)
        all_detections.update(audit_data["by_type"])
        total_docs += 1

    # Hoisted: the original recomputed sum(all_detections.values()) in the loop.
    total_detected = sum(all_detections.values())

    # BUG FIX: the original divided by total_docs unconditionally and
    # crashed with ZeroDivisionError whenever no audit file was found
    # (e.g. ANON_DIR missing — Path.glob then yields nothing).
    if total_docs:
        print(f"\n📈 Détections sur {total_docs} documents:")
        print(f" Total: {total_detected} PII détectés")
        print(f" Moyenne: {total_detected / total_docs:.1f} PII/document")
        print()
        print(" Par type:")
        for pii_type, count in sorted(all_detections.items(), key=lambda x: x[1], reverse=True):
            pct = (count / total_detected) * 100
            print(f" - {pii_type}: {count} ({pct:.1f}%)")
    else:
        print("\n📈 Détections sur 0 documents:")

    # --- Readability ---------------------------------------------------
    print("\n" + "=" * 80)
    print("📖 ANALYSE DE LA LISIBILITÉ")
    print("=" * 80)

    for txt_file in txt_files[:3]:  # analyze only the first 3 files
        with open(txt_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count [PLACEHOLDER] tokens left by the anonymizer.
        placeholders = re.findall(r'\[([A-Z_]+)\]', content)
        placeholder_count = len(placeholders)

        # Count words.
        words = re.findall(r'\b\w+\b', content)
        word_count = len(words)

        # Masking ratio: share of tokens that are placeholders.
        mask_ratio = (placeholder_count / word_count) * 100 if word_count > 0 else 0

        print(f"\n 📄 {txt_file.name}")
        print(f" - Mots: {word_count}")
        print(f" - Placeholders: {placeholder_count}")
        print(f" - Ratio masquage: {mask_ratio:.1f}%")

        # Flag files whose masking ratio hurts readability.
        if mask_ratio > 30:
            print(f" ⚠️ Ratio de masquage élevé (>{30}%) - lisibilité compromise")
        elif mask_ratio > 20:
            print(f" ⚠️ Ratio de masquage modéré (>{20}%)")
        else:
            print(f" ✅ Ratio de masquage acceptable (<{20}%)")


if __name__ == "__main__":
    analyze_quality()
|
||||
39
tools/compare_original_vs_anonymized.py
Normal file
39
tools/compare_original_vs_anonymized.py
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
"""Compare an original document with its anonymized version."""

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import pdfplumber

# Hard-coded input documents (original PDF and its anonymized text output).
# NOTE(review): absolute user-specific paths — assumes this machine's layout.
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/102_23056463/CRH 23056364.pdf")
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")


def main():
    """Print both texts side by side for manual comparison."""
    print("="*80)
    print("COMPARAISON ORIGINAL vs ANONYMISÉ")
    print("="*80)

    # Extract original text page by page; extract_text() may return None
    # for empty/scanned pages, hence the `or ""` fallback.
    print("\n📄 Extraction du texte original...")
    with pdfplumber.open(original_pdf) as pdf:
        original_text = "\n".join([page.extract_text() or "" for page in pdf.pages])

    # Read the anonymized text.
    with open(anonymized_txt, 'r', encoding='utf-8') as f:
        anonymized_text = f.read()

    print(f"\n📊 Longueur texte original: {len(original_text)} caractères")
    print(f"📊 Longueur texte anonymisé: {len(anonymized_text)} caractères")

    # Show the first 100 lines of each version.
    print("\n" + "="*80)
    print("TEXTE ORIGINAL (100 premières lignes):")
    print("="*80)
    print("\n".join(original_text.split('\n')[:100]))

    print("\n" + "="*80)
    print("TEXTE ANONYMISÉ (100 premières lignes):")
    print("="*80)
    print("\n".join(anonymized_text.split('\n')[:100]))


# Consistency fix with tools/analyze_real_quality.py: guard the entry point
# so importing this module no longer triggers PDF extraction and printing.
if __name__ == "__main__":
    main()
||||
Reference in New Issue
Block a user