docs: Analyse complète de la régression de qualité - Causes racines identifiées
This commit is contained in:
261
tools/deep_quality_regression_analysis.py
Normal file
261
tools/deep_quality_regression_analysis.py
Normal file
@@ -0,0 +1,261 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Analyse approfondie de la régression de qualité
|
||||
Comparaison détaillée entre documents originaux et anonymisés
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
import pdfplumber
|
||||
|
||||
def extract_original_text(pdf_path: str) -> str:
    """Extract and return the full text of the original PDF, one page per line."""
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)
|
||||
def load_anonymized_text(txt_path: str) -> str:
    """Read the anonymized text file at *txt_path* (UTF-8) and return its content."""
    with open(txt_path, 'r', encoding='utf-8') as handle:
        return handle.read()
|
||||
def load_audit(audit_path: str) -> List[Dict]:
    """Parse a JSON-Lines audit file into a list of dicts, skipping blank lines."""
    with open(audit_path, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for line in handle if line.strip()]
||||
|
||||
def analyze_masking_quality(original: str, anonymized: str, audit: List[Dict]) -> Dict:
    """Compare original and anonymized texts and collect masking-quality issues.

    Args:
        original: Raw text extracted from the source PDF.
        anonymized: Text produced by the anonymization pipeline.
        audit: Audit entries, one dict per masked hit, each with a 'kind' field.

    Returns:
        A dict of issue categories: OCR artifacts, over-masked medical terms /
        medications / dates / cities, audit-based false positives, and an
        overall text-quality-degradation summary.
    """

    issues = {
        "ocr_artifacts": [],
        "over_masked_medical_terms": [],
        "over_masked_medications": [],
        "over_masked_dates": [],
        "over_masked_cities": [],
        "legitimate_masking": [],
        "false_positives": [],
        "text_quality_degradation": []
    }

    # 1. Detect OCR artifacts (characters separated by stray spaces).
    ocr_patterns = [
        r'P Nr °a t Ric Pi Pen S',
        r'[A-Z]\s[a-z]\s[a-z]\s[a-z]',  # spaced-out letters
        r'\d\s\d\s\d\s\d',  # spaced-out digits
    ]
    for pattern in ocr_patterns:
        for match in re.finditer(pattern, anonymized):
            issues["ocr_artifacts"].append({
                "text": match.group(0),
                "position": match.start(),
                "context": anonymized[max(0, match.start()-30):match.end()+30]
            })

    # 2. Detect over-masked medical terms (job titles / exam names that were
    # wrongly treated as person names or institutions).
    medical_terms_masked = [
        ("Chef de service", "Chef de [MASK]"),
        ("Chef de Clinique", "Chef de [ETABLISSEMENT]"),
        ("Note IDE", "[NOM] IDE"),
        ("Avis ORL", "[NOM] ORL"),
        ("Examen ORL", "[NOM] ORL"),
    ]
    for original_term, masked_term in medical_terms_masked:
        if masked_term in anonymized and original_term in original:
            issues["over_masked_medical_terms"].append({
                "original": original_term,
                "masked": masked_term,
                "count": anonymized.count(masked_term)
            })

    # 3. Detect over-masked medications ("[NOM] 500 mg"-style patterns).
    medication_pattern = r'\[NOM\]\s+\d+\s*mg'
    for match in re.finditer(medication_pattern, anonymized):
        # Keep surrounding context so the original drug name can be identified.
        context_start = max(0, match.start() - 50)
        context_end = min(len(anonymized), match.end() + 50)
        context = anonymized[context_start:context_end]
        issues["over_masked_medications"].append({
            "masked": match.group(0),
            "context": context
        })

    # 4. Analyze masked dates: only birth dates should be masked, so a high
    # [DATE]-to-[DATE_NAISSANCE] ratio indicates over-masking.
    date_masks = re.findall(r'\[DATE\]', anonymized)
    date_naissance_masks = re.findall(r'\[DATE_NAISSANCE\]', anonymized)

    # Count the dates present in the original text.
    date_pattern = r'\b\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}\b'
    original_dates = re.findall(date_pattern, original)

    issues["over_masked_dates"] = {
        "total_date_masks": len(date_masks),
        "date_naissance_masks": len(date_naissance_masks),
        "original_dates_count": len(original_dates),
        "ratio": len(date_masks) / max(1, len(original_dates)),
        "problem": len(date_masks) > len(date_naissance_masks) * 5  # >5x more dates than birth dates
    }

    # 5. Analyze masked cities, with context for each occurrence.
    ville_masks = re.findall(r'\[VILLE\]', anonymized)
    issues["over_masked_cities"] = {
        "count": len(ville_masks),
        "contexts": []
    }
    for match in re.finditer(r'\[VILLE\]', anonymized):
        context_start = max(0, match.start() - 30)
        context_end = min(len(anonymized), match.end() + 30)
        issues["over_masked_cities"]["contexts"].append(
            anonymized[context_start:context_end]
        )

    # 6. Mine the audit trail for likely false positives.
    nom_count = sum(1 for h in audit if h['kind'] == 'NOM')
    nom_global_count = sum(1 for h in audit if h['kind'] == 'NOM_GLOBAL')

    issues["false_positives"] = {
        "nom_count": nom_count,
        "nom_global_count": nom_global_count,
        "suspicious": nom_count > 50  # >50 names in one document is suspect
    }

    # 7. Compare overall text quality (whitespace-normalized).
    original_clean = re.sub(r'\s+', ' ', original).strip()
    anonymized_clean = re.sub(r'\s+', ' ', anonymized).strip()

    # Ratio of preserved characters, mask tokens excluded.
    # BUG FIX: the previous version used len(re.findall(...)), which counts the
    # NUMBER of mask tokens, not their character length — the ratio did not
    # actually measure preserved characters. Sum the token lengths instead.
    mask_chars = sum(len(m) for m in re.findall(r'\[.*?\]', anonymized_clean))
    preserved_ratio = (len(anonymized_clean) - mask_chars) / max(1, len(original_clean))

    issues["text_quality_degradation"] = {
        "original_length": len(original_clean),
        "anonymized_length": len(anonymized_clean),
        "preserved_ratio": preserved_ratio,
        "degraded": preserved_ratio < 0.7  # <70% of the text preserved
    }

    return issues
||||
|
||||
def generate_report(issues: Dict, doc_name: str) -> str:
    """Render the issues collected by analyze_masking_quality as a text report.

    Args:
        issues: Issue dict produced by analyze_masking_quality.
        doc_name: Display name of the analyzed document.

    Returns:
        A multi-line report string; sections are emitted only when the
        corresponding issue category is non-empty / flagged.
    """
    report = []
    report.append(f"\n{'='*80}")
    report.append(f"ANALYSE DE RÉGRESSION - {doc_name}")
    report.append(f"{'='*80}\n")

    # OCR artifacts (show at most the first 3).
    if issues["ocr_artifacts"]:
        report.append(f"⚠️ ARTEFACTS OCR DÉTECTÉS: {len(issues['ocr_artifacts'])}")
        for i, artifact in enumerate(issues["ocr_artifacts"][:3], 1):
            report.append(f"  {i}. '{artifact['text']}'")
            report.append(f"     Contexte: ...{artifact['context']}...")
        report.append("")

    # Over-masked medical terms.
    if issues["over_masked_medical_terms"]:
        report.append(f"⚠️ TERMES MÉDICAUX SUR-MASQUÉS: {len(issues['over_masked_medical_terms'])}")
        for term in issues["over_masked_medical_terms"]:
            report.append(f"  • '{term['original']}' → '{term['masked']}' ({term['count']}x)")
        report.append("")

    # Over-masked medications (show at most the first 3).
    if issues["over_masked_medications"]:
        report.append(f"⚠️ MÉDICAMENTS SUR-MASQUÉS: {len(issues['over_masked_medications'])}")
        for i, med in enumerate(issues["over_masked_medications"][:3], 1):
            report.append(f"  {i}. {med['masked']}")
            report.append(f"     Contexte: ...{med['context']}...")
        report.append("")

    # Over-masked dates.
    if issues["over_masked_dates"]["problem"]:
        report.append(f"⚠️ DATES SUR-MASQUÉES:")
        report.append(f"  • Total [DATE]: {issues['over_masked_dates']['total_date_masks']}")
        report.append(f"  • [DATE_NAISSANCE]: {issues['over_masked_dates']['date_naissance_masks']}")
        report.append(f"  • Dates originales: {issues['over_masked_dates']['original_dates_count']}")
        report.append(f"  • Ratio: {issues['over_masked_dates']['ratio']:.1f}x")
        report.append(f"  • PROBLÈME: Toutes les dates sont masquées, pas seulement les dates de naissance!")
        report.append("")

    # Over-masked cities (show at most the first 3 contexts).
    if issues["over_masked_cities"]["count"] > 0:
        report.append(f"⚠️ VILLES SUR-MASQUÉES: {issues['over_masked_cities']['count']}")
        for i, ctx in enumerate(issues["over_masked_cities"]["contexts"][:3], 1):
            report.append(f"  {i}. ...{ctx}...")
        report.append("")

    # Suspected false positives.
    if issues["false_positives"]["suspicious"]:
        report.append(f"⚠️ FAUX POSITIFS SUSPECTS:")
        report.append(f"  • NOM détectés: {issues['false_positives']['nom_count']}")
        # BUG FIX: this line previously read issues['false_positifs'] (typo),
        # raising KeyError whenever this branch was reached.
        report.append(f"  • NOM_GLOBAL: {issues['false_positives']['nom_global_count']}")
        report.append(f"  • PROBLÈME: Trop de noms détectés (>50), probablement des termes médicaux")
        report.append("")

    # Text-quality degradation.
    if issues["text_quality_degradation"]["degraded"]:
        report.append(f"⚠️ DÉGRADATION QUALITÉ TEXTE:")
        report.append(f"  • Longueur originale: {issues['text_quality_degradation']['original_length']}")
        report.append(f"  • Longueur anonymisée: {issues['text_quality_degradation']['anonymized_length']}")
        report.append(f"  • Ratio préservé: {issues['text_quality_degradation']['preserved_ratio']:.1%}")
        report.append(f"  • PROBLÈME: Moins de 70% du texte préservé")
        report.append("")

    return "\n".join(report)
|
||||
|
||||
def main():
    """Analyze a sample of documents and save a regression report."""

    # Input locations (original PDFs vs anonymized outputs).
    original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
    anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")

    # Documents to analyze: (relative path of the original PDF, base name).
    test_docs = [
        ("102_23056463/CRH 23056364.pdf", "CRH 23056364"),
    ]

    all_reports = []
    for rel_path, doc_name in test_docs:
        print(f"\n🔍 Analyse de {doc_name}...")

        pdf_path = original_dir / rel_path
        txt_path = anonymized_dir / f"{doc_name}.pseudonymise.txt"
        audit_path = anonymized_dir / f"{doc_name}.audit.jsonl"

        # Skip the document on the first missing input file.
        missing = None
        for label, candidate in (("Original", pdf_path),
                                 ("Anonymisé", txt_path),
                                 ("Audit", audit_path)):
            if not candidate.exists():
                missing = (label, candidate)
                break
        if missing is not None:
            print(f"  ❌ {missing[0]} non trouvé: {missing[1]}")
            continue

        # Extract, analyze and report.
        report = generate_report(
            analyze_masking_quality(
                extract_original_text(str(pdf_path)),
                load_anonymized_text(str(txt_path)),
                load_audit(str(audit_path)),
            ),
            doc_name,
        )
        all_reports.append(report)
        print(report)

    # Persist the combined report.
    out_path = Path(".kiro/specs/anonymization-quality-optimization/DEEP_REGRESSION_ANALYSIS.md")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n\n".join(all_reports), encoding='utf-8')

    print(f"\n✅ Rapport sauvegardé: {out_path}")


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user