chore(rgpd): replace CHCB/Bayonne/Saint-Denis/Réunion refs in source + configs (D-12)
Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis, Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils, et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels.

- profile key chcb_strict → chuxx_strict
- CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield, Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999, préfixe tél 05.59.44 → 0X.XX.XX
- renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -65,7 +65,7 @@ def classify_complexity(stats: dict) -> str:
|
||||
|
||||
|
||||
def main():
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)/")
|
||||
|
||||
if not corpus_dir.exists():
|
||||
print(f"Erreur : {corpus_dir} n'existe pas")
|
||||
|
||||
@@ -66,7 +66,7 @@ def analyze_dates_in_audit(audit_path: Path, text_path: Path):
|
||||
return dates_info
|
||||
|
||||
def main():
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
print("=" * 80)
|
||||
print("ANALYSE DES DATES MASQUÉES")
|
||||
|
||||
@@ -7,7 +7,7 @@ from pathlib import Path
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
# Répertoire des documents anonymisés
|
||||
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
ANON_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
def analyze_leaks(txt_file):
|
||||
"""Détecte les fuites potentielles dans un fichier texte."""
|
||||
@@ -23,7 +23,7 @@ def analyze_leaks(txt_file):
|
||||
"telephone": re.compile(r"\b0[1-9](?:[\s.-]?\d{2}){4}\b"),
|
||||
"email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
|
||||
"adresse": re.compile(r"\b\d+\s+(?:rue|avenue|boulevard|place|chemin|impasse)\s+[A-Z]", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
for pattern_name, pattern in patterns.items():
|
||||
|
||||
@@ -8,8 +8,8 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
import pdfplumber
|
||||
|
||||
# Document original
|
||||
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/102_23056463/CRH 23056364.pdf")
|
||||
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")
|
||||
original_pdf = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/102_23056463/CRH 23056364.pdf")
|
||||
anonymized_txt = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise/CRH 23056364.pseudonymise.txt")
|
||||
|
||||
print("="*80)
|
||||
print("COMPARAISON ORIGINAL vs ANONYMISÉ")
|
||||
|
||||
@@ -46,7 +46,7 @@ def compare_datasets():
|
||||
test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
|
||||
|
||||
# Production (régression)
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("COMPARAISON TEST DATASET vs PRODUCTION")
|
||||
|
||||
@@ -17,11 +17,11 @@ print()
|
||||
|
||||
# Test the pattern
|
||||
test_lines = [
|
||||
"confirmée à 5,7 g ici au CHCB. Appel Dr [NOM], hématologue biologiste",
|
||||
"CHCB :",
|
||||
"CHCB",
|
||||
"au CHCB",
|
||||
"le CHCB est",
|
||||
"confirmée à 5,7 g ici au CHUXX. Appel Dr [NOM], hématologue biologiste",
|
||||
"CHUXX :",
|
||||
"CHUXX",
|
||||
"au CHUXX",
|
||||
"le CHUXX est",
|
||||
]
|
||||
|
||||
for term in cfg.get("blacklist", {}).get("force_mask_terms", []):
|
||||
|
||||
@@ -210,8 +210,8 @@ def main():
|
||||
"""Analyse un échantillon de documents"""
|
||||
|
||||
# Chemins
|
||||
original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
original_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
anonymized_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
# Documents à analyser
|
||||
test_docs = [
|
||||
|
||||
@@ -122,7 +122,7 @@ def analyze_anonymized_text(text_path: Path) -> Dict:
|
||||
def compare_datasets():
|
||||
"""Compare test dataset vs production."""
|
||||
test_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymise")
|
||||
prod_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/anonymise")
|
||||
|
||||
print("=" * 80)
|
||||
print("ANALYSE DES CAUSES RACINES - RÉGRESSION DE QUALITÉ")
|
||||
|
||||
@@ -4,15 +4,15 @@ Simule l'effet d'une règle d'administration sur un texte ou sur le corpus synth
|
||||
|
||||
Usage :
|
||||
# Appliquer une règle à un texte libre
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask \\
|
||||
--text "Consulté au CHCB le 12/06/2024."
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask \\
|
||||
--text "Consulté au CHUXX le 12/06/2024."
|
||||
|
||||
# Appliquer à un fichier texte
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask \\
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask \\
|
||||
--file path/to/document.txt
|
||||
|
||||
# Valider la règle sur ses required_case_ids (--corpus)
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chcb_exact_mask --corpus
|
||||
python tools/simulate_admin_rule.py --rule-id rule_chuxx_exact_mask --corpus
|
||||
|
||||
# Valider TOUTES les règles actives sur leurs corpus
|
||||
python tools/simulate_admin_rule.py --all --corpus
|
||||
|
||||
@@ -16,7 +16,7 @@ def test_all_cro():
|
||||
"""Test la propagation des dates de naissance sur tous les CRO."""
|
||||
|
||||
# Chercher tous les CRO dans les 59 OGC
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
|
||||
# Trouver tous les CRO (compte rendu opératoire)
|
||||
print("Recherche de tous les CRO dans le corpus...")
|
||||
@@ -59,25 +59,25 @@ def test_all_cro():
|
||||
date_context_pattern = re.compile(r'Né(?:e)?\s+le\s+(\d{1,2}[\s/.\-]+\d{1,2}[\s/.\-]+\d{2,4})', re.IGNORECASE)
|
||||
context_leaks = date_context_pattern.findall(anonymized_text)
|
||||
|
||||
# Scanner "CHCB" en clair
|
||||
chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text)
|
||||
# Scanner "CHUXX" en clair
|
||||
chuxx_leaks = re.findall(r'\bCHUXX\b', anonymized_text)
|
||||
|
||||
# Compter les fuites totales
|
||||
total_leaks = len(context_leaks) + len(chcb_leaks)
|
||||
total_leaks = len(context_leaks) + len(chuxx_leaks)
|
||||
|
||||
status = "✅" if total_leaks == 0 else "❌"
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}")
|
||||
|
||||
if context_leaks:
|
||||
print(f" Exemples dates: {context_leaks[:3]}")
|
||||
if chcb_leaks:
|
||||
print(f" Exemples CHCB: {chcb_leaks[:3]}")
|
||||
if chuxx_leaks:
|
||||
print(f" Exemples CHUXX: {chuxx_leaks[:3]}")
|
||||
|
||||
results.append({
|
||||
'file': pdf_path.name,
|
||||
'path': str(pdf_path),
|
||||
'context_leaks': len(context_leaks),
|
||||
'chcb_leaks': len(chcb_leaks),
|
||||
'chuxx_leaks': len(chuxx_leaks),
|
||||
'success': total_leaks == 0
|
||||
})
|
||||
|
||||
@@ -100,13 +100,13 @@ def test_all_cro():
|
||||
success_count = sum(1 for r in results if r.get('success', False))
|
||||
error_count = sum(1 for r in results if 'error' in r)
|
||||
total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
|
||||
total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
|
||||
total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results)
|
||||
|
||||
print(f"Documents testés: {len(results)}")
|
||||
print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
|
||||
print(f"Erreurs: {error_count}")
|
||||
print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
|
||||
print(f"Fuites CHCB totales: {total_chcb_leaks}")
|
||||
print(f"Fuites CHUXX totales: {total_chuxx_leaks}")
|
||||
print(f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)")
|
||||
|
||||
# Liste des documents avec fuites
|
||||
@@ -119,7 +119,7 @@ def test_all_cro():
|
||||
print(f"\n{doc['file']}")
|
||||
print(f" Path: {doc['path']}")
|
||||
print(f" Fuites dates: {doc.get('context_leaks', 0)}")
|
||||
print(f" Fuites CHCB: {doc.get('chcb_leaks', 0)}")
|
||||
print(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}")
|
||||
|
||||
# Liste des erreurs
|
||||
error_docs = [r for r in results if 'error' in r]
|
||||
@@ -148,7 +148,7 @@ def test_all_cro():
|
||||
f.write(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)\n")
|
||||
f.write(f"Erreurs: {error_count}\n")
|
||||
f.write(f"Fuites 'Né(e) le' totales: {total_context_leaks}\n")
|
||||
f.write(f"Fuites CHCB totales: {total_chcb_leaks}\n")
|
||||
f.write(f"Fuites CHUXX totales: {total_chuxx_leaks}\n")
|
||||
f.write(f"Temps total: {elapsed_time:.1f}s ({elapsed_time/len(results):.1f}s/doc)\n\n")
|
||||
|
||||
if failed_docs:
|
||||
@@ -159,7 +159,7 @@ def test_all_cro():
|
||||
f.write(f"{doc['file']}\n")
|
||||
f.write(f" Path: {doc['path']}\n")
|
||||
f.write(f" Fuites dates: {doc.get('context_leaks', 0)}\n")
|
||||
f.write(f" Fuites CHCB: {doc.get('chcb_leaks', 0)}\n\n")
|
||||
f.write(f" Fuites CHUXX: {doc.get('chuxx_leaks', 0)}\n\n")
|
||||
|
||||
if error_docs:
|
||||
f.write("=" * 80 + "\n")
|
||||
|
||||
@@ -16,7 +16,7 @@ def test_date_propagation():
|
||||
"""Test la propagation des dates de naissance sur un CRO."""
|
||||
|
||||
# Chercher un CRO dans les 59 OGC
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
ogc_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
|
||||
# Trouver un CRO (compte rendu opératoire)
|
||||
cro_files = []
|
||||
@@ -68,19 +68,19 @@ def test_date_propagation():
|
||||
lines_with_placeholders = [line for line in anonymized_text.split('\n') if placeholder_pattern.search(line)]
|
||||
standalone_leaks = [d for d in standalone_dates if not any(d in line for line in lines_with_placeholders)]
|
||||
|
||||
# Scanner "CHCB" en clair
|
||||
chcb_leaks = re.findall(r'\bCHCB\b', anonymized_text)
|
||||
# Scanner "CHUXX" en clair
|
||||
chuxx_leaks = re.findall(r'\bCHUXX\b', anonymized_text)
|
||||
|
||||
# Compter les fuites totales
|
||||
total_leaks = len(context_leaks) + len(chcb_leaks)
|
||||
total_leaks = len(context_leaks) + len(chuxx_leaks)
|
||||
|
||||
status = "✅" if total_leaks == 0 else "❌"
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHCB: {len(chcb_leaks)}")
|
||||
print(f" {status} Fuites 'Né(e) le': {len(context_leaks)}, Fuites CHUXX: {len(chuxx_leaks)}")
|
||||
|
||||
if context_leaks:
|
||||
print(f" Exemples dates: {context_leaks[:3]}")
|
||||
if chcb_leaks:
|
||||
print(f" Exemples CHCB: {chcb_leaks[:3]}")
|
||||
if chuxx_leaks:
|
||||
print(f" Exemples CHUXX: {chuxx_leaks[:3]}")
|
||||
|
||||
# Info : dates standalone (pas nécessairement des fuites)
|
||||
if standalone_leaks:
|
||||
@@ -89,7 +89,7 @@ def test_date_propagation():
|
||||
results.append({
|
||||
'file': pdf_path.name,
|
||||
'context_leaks': len(context_leaks),
|
||||
'chcb_leaks': len(chcb_leaks),
|
||||
'chuxx_leaks': len(chuxx_leaks),
|
||||
'standalone_dates': len(standalone_leaks),
|
||||
'success': total_leaks == 0
|
||||
})
|
||||
@@ -109,13 +109,13 @@ def test_date_propagation():
|
||||
|
||||
success_count = sum(1 for r in results if r.get('success', False))
|
||||
total_context_leaks = sum(r.get('context_leaks', 0) for r in results)
|
||||
total_chcb_leaks = sum(r.get('chcb_leaks', 0) for r in results)
|
||||
total_chuxx_leaks = sum(r.get('chuxx_leaks', 0) for r in results)
|
||||
total_standalone = sum(r.get('standalone_dates', 0) for r in results)
|
||||
|
||||
print(f"Documents testés: {len(results)}")
|
||||
print(f"Succès: {success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)")
|
||||
print(f"Fuites 'Né(e) le' totales: {total_context_leaks}")
|
||||
print(f"Fuites CHCB totales: {total_chcb_leaks}")
|
||||
print(f"Fuites CHUXX totales: {total_chuxx_leaks}")
|
||||
print(f"Dates standalone (info): {total_standalone}")
|
||||
|
||||
if success_count == len(results):
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test CHCB force_term detection on the 2 leaked documents."""
|
||||
"""Test force_term detection on the 2 leaked documents."""
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
@@ -10,10 +10,10 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
def test_chcb_detection():
|
||||
"""Test CHCB detection on the 2 documents with leaks."""
|
||||
def test_force_term_detection():
|
||||
"""Test force_term detection on the 2 documents with leaks."""
|
||||
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
|
||||
# Document 1: trackare-BA148337-23091302
|
||||
doc1_path = None
|
||||
@@ -45,7 +45,7 @@ def test_chcb_detection():
|
||||
print("TEST DOCUMENT 1: trackare-BA148337-23091302")
|
||||
print("=" * 80)
|
||||
|
||||
outdir = Path("test_chcb_leak")
|
||||
outdir = Path("test_force_term_leak")
|
||||
outdir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
@@ -64,14 +64,14 @@ def test_chcb_detection():
|
||||
txt_file = Path(outputs["text"])
|
||||
content = txt_file.read_text(encoding="utf-8")
|
||||
|
||||
if "CHCB" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé")
|
||||
if "CHUXX" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHUXX trouvé dans le texte anonymisé")
|
||||
# Trouver le contexte
|
||||
for i, line in enumerate(content.split("\n"), 1):
|
||||
if "CHCB" in line:
|
||||
if "CHUXX" in line:
|
||||
print(f" Ligne {i}: {line.strip()}")
|
||||
else:
|
||||
print("✅ Aucune fuite CHCB")
|
||||
print("✅ Aucune fuite CHUXX")
|
||||
|
||||
# Vérifier l'audit
|
||||
import json
|
||||
@@ -80,10 +80,10 @@ def test_chcb_detection():
|
||||
with open(audit_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
obj = json.loads(line)
|
||||
if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""):
|
||||
if obj.get("kind") == "force_term" and "CHUXX" in obj.get("value", ""):
|
||||
force_term_count += 1
|
||||
|
||||
print(f"📊 Détections force_term CHCB: {force_term_count}")
|
||||
print(f"📊 Détections force_term CHUXX: {force_term_count}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Erreur: {e}")
|
||||
@@ -113,14 +113,14 @@ def test_chcb_detection():
|
||||
txt_file = Path(outputs["text"])
|
||||
content = txt_file.read_text(encoding="utf-8")
|
||||
|
||||
if "CHCB" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé")
|
||||
if "CHUXX" in content:
|
||||
print("🔴 FUITE DÉTECTÉE: CHUXX trouvé dans le texte anonymisé")
|
||||
# Trouver le contexte
|
||||
for i, line in enumerate(content.split("\n"), 1):
|
||||
if "CHCB" in line:
|
||||
if "CHUXX" in line:
|
||||
print(f" Ligne {i}: {line.strip()}")
|
||||
else:
|
||||
print("✅ Aucune fuite CHCB")
|
||||
print("✅ Aucune fuite CHUXX")
|
||||
|
||||
# Vérifier l'audit
|
||||
import json
|
||||
@@ -129,10 +129,10 @@ def test_chcb_detection():
|
||||
with open(audit_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
obj = json.loads(line)
|
||||
if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""):
|
||||
if obj.get("kind") == "force_term" and "CHUXX" in obj.get("value", ""):
|
||||
force_term_count += 1
|
||||
|
||||
print(f"📊 Détections force_term CHCB: {force_term_count}")
|
||||
print(f"📊 Détections force_term CHUXX: {force_term_count}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Erreur: {e}")
|
||||
@@ -140,4 +140,4 @@ def test_chcb_detection():
|
||||
traceback.print_exc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_chcb_detection()
|
||||
test_force_term_detection()
|
||||
@@ -88,7 +88,7 @@ import re
|
||||
leak_count = 0
|
||||
patterns = {
|
||||
"date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
for txt_file in out_dir.glob("*.pseudonymise.txt"):
|
||||
|
||||
@@ -24,9 +24,9 @@ def test_phase1_corrections():
|
||||
|
||||
# Documents de test (5 documents représentatifs)
|
||||
test_docs = [
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/008_23001234/CRH 23001234.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/021_23012345/CRO 23012345.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/008_23001234/CRH 23001234.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/021_23012345/CRO 23012345.pdf",
|
||||
"/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs/033_23023456/trackare-23023456-12345678.pdf",
|
||||
]
|
||||
|
||||
print("=" * 80)
|
||||
|
||||
@@ -23,7 +23,7 @@ def validate_corpus_sample():
|
||||
"""Valide l'anonymisation sur un échantillon du corpus."""
|
||||
|
||||
# Répertoires
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
output_dir = Path("corpus_validation_sample")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
@@ -221,7 +221,7 @@ def leak_check(output_dir: Path):
|
||||
# Patterns à vérifier
|
||||
patterns = {
|
||||
"date_naissance_contexte": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
leaks = defaultdict(list)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Validation sur le corpus complet (59 OGC / 130 PDFs).
|
||||
|
||||
Ce script anonymise tous les documents du corpus et vérifie :
|
||||
- Absence de fuites (dates de naissance, CHCB, etc.)
|
||||
- Absence de fuites (dates de naissance, CHUXX, etc.)
|
||||
- Statistiques de détection par type
|
||||
- Performances (temps de traitement)
|
||||
"""
|
||||
@@ -24,7 +24,7 @@ def validate_full_corpus():
|
||||
"""Valide l'anonymisation sur le corpus complet."""
|
||||
|
||||
# Répertoires
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
|
||||
output_dir = Path("corpus_validation")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
@@ -177,7 +177,7 @@ def leak_check(output_dir: Path):
|
||||
# Patterns à vérifier
|
||||
patterns = {
|
||||
"date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
|
||||
"chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
|
||||
"chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
|
||||
"date_format": re.compile(r"\b\d{2}[/.\-]\d{2}[/.\-]\d{4}\b"),
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
# 5 documents du corpus production (OGC 008)
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs")
|
||||
corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs")
|
||||
test_docs = [
|
||||
corpus_dir / "008_23001234" / "CRH 23001234.pdf",
|
||||
corpus_dir / "008_23001234" / "CRO 23001234.pdf",
|
||||
|
||||
Reference in New Issue
Block a user