feat(phase2): Fine-tuning CamemBERT-bio v2 (F1=0.90) + enrichissement données
- Fine-tuning camembert-bio-base : F1=0.903, Recall=0.930 (vs 0.89/0.85) - Data augmentation : substitution noms INSEE (219K patronymes, x3 copies) - Hard negatives BDPM (5.7K médicaments) + QUAERO (1319 termes médicaux) - Annotations silver enrichies par gazetteers (+612 VILLE, +5 HOPITAL) - Export silver avec support multi-répertoires (--extra-dir) - Gazetteers QUAERO : CHEM, DISO, PROC, ANAT depuis DrBenchmark/QUAERO - Gazetteers INSEE : noms de famille fréquents (96K) et complets (219K) - Batch silver 1194 PDFs (run_batch_silver_export.py) pour dataset v3 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
192
run_batch_silver_export.py
Normal file
192
run_batch_silver_export.py
Normal file
@@ -0,0 +1,192 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Batch anonymisation de PDFs pour enrichir le dataset silver.
|
||||
|
||||
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU
|
||||
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par
|
||||
export_silver_annotations.py.
|
||||
|
||||
Timeout par fichier pour éviter les blocages sur les gros documents.
|
||||
Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
import signal
|
||||
import random
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from eds_pseudo_manager import EdsPseudoManager
|
||||
from gliner_manager import GlinerManager
|
||||
from camembert_ner_manager import CamembertNerManager
|
||||
|
||||
# Input tree: one sub-directory per OGC, each containing the source PDFs.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Output directory where the .pseudonymise.txt files are written.
OUTDIR = SRC / "anonymise_silver_extra"
# YAML dictionaries consumed by the anonymisation pipeline.
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")


# PDFs already processed in the audit-30 batch (excluded from this run).
ALREADY_DONE_AUDIT30 = {
    "CONSULTATION ANESTHESISTE 23060661.pdf",
    "trackare-05000272-23074376_05000272_23074376.pdf",
    "CONSULTATION ANESTHESISTE 23056022.pdf",
    "trackare-BA042686-23090597_BA042686_23090597.pdf",
    "trackare-23000862-23018396_23000862_23018396.pdf",
    "LETTRE DE SORTIE 23087212.pdf",
    "CRO 23159905.pdf",
    "trackare-99246761-23159905_99246761_23159905.pdf",
    "CONSULTATION ANESTHESISTE 23139653.pdf",
    "CRO 23160703.pdf",
    "trackare-BA192486-23127395_BA192486_23127395.pdf",
    "BACTERIO 23232115.pdf",
    "CR consultation anesth-290-23025988.pdf",
    "trackare-05012965-23060770_05012965_23060770.pdf",
    "trackare-BA065989-23102874_BA065989_23102874.pdf",
    "trackare-BA127127-23135726_BA127127_23135726.pdf",
    "trackare-99252128-23177582_99252128_23177582.pdf",
    "trackare-BA171849-23214501_BA171849_23214501.pdf",
    "trackare-17015185-23043950_17015185_23043950.pdf",
    "CRH 60_23106634.pdf",
    "trackare-00260974-23070213_00260974_23070213.pdf",
    "trackare-BA067657-23076655_BA067657_23076655.pdf",
    "trackare-05012679-23098722_05012679_23098722.pdf",
    "trackare-11004431-23124019_11004431_23124019.pdf",
    "trackare-07003136-23135847_07003136_23135847.pdf",
    "trackare-13013848-23165708_13013848_23165708.pdf",
    "trackare-03020576-23175616_03020576_23175616.pdf",
    "trackare-BA093659-23074520_BA093659_23074520.pdf",
    "trackare-14025311-23034958_14025311_23034958.pdf",
    "trackare-BA121804-23016863_BA121804_23016863.pdf",
}

# Maximum number of seconds allowed per PDF before SIGALRM aborts it.
TIMEOUT_PER_FILE = 120
|
||||
|
||||
|
||||
class TimeoutError(Exception):
    """Raised by the SIGALRM handler when a PDF exceeds TIMEOUT_PER_FILE.

    NOTE(review): this shadows the builtin ``TimeoutError``; both the raise
    site (``timeout_handler``) and the except site (``main``) resolve to this
    module-level class, so behavior is consistent either way.
    """
|
||||
|
||||
|
||||
def timeout_handler(signum, frame):
    """SIGALRM handler: abort the PDF in progress by raising TimeoutError."""
    raise TimeoutError("Timeout")
|
||||
|
||||
|
||||
def main():
    """Batch-anonymise all available PDFs (CPU only, no VLM).

    Collects every PDF under SRC (skipping output directories and the
    audit-30 set), resumes by skipping files that already have a
    ``.pseudonymise.txt`` in OUTDIR, and processes each file under a
    per-file SIGALRM timeout, printing a progress report every 50 files.
    """
    # Collect every available PDF (excluding the audit_30 set)
    all_pdfs = []
    for ogc_dir in sorted(SRC.iterdir()):
        # Skip non-directories and our own "anonymise*" output directories
        if not ogc_dir.is_dir() or ogc_dir.name.startswith("anonymise"):
            continue
        for pdf in ogc_dir.glob("*.pdf"):
            if pdf.name not in ALREADY_DONE_AUDIT30:
                all_pdfs.append(pdf)

    # Sort by (OGC directory, file name) for reproducible ordering
    all_pdfs.sort(key=lambda p: (p.parent.name, p.name))

    # Detect already-processed files so an interrupted run can resume
    OUTDIR.mkdir(exist_ok=True)
    already_done = {
        p.name.replace(".pseudonymise.txt", ".pdf")
        for p in OUTDIR.glob("*.pseudonymise.txt")
    }
    pdfs_to_do = [p for p in all_pdfs if p.name not in already_done]

    print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
    print(f"Déjà traités: {len(already_done)}")
    print(f"Restant: {len(pdfs_to_do)}")

    if not pdfs_to_do:
        print("Rien à faire.")
        return

    # Load the NER models (CPU only, no VLM). EDS-Pseudo is mandatory.
    print("\nChargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    assert ner.is_loaded(), "EDS-Pseudo non chargé"
    print("EDS-Pseudo chargé.", flush=True)

    # GLiNER and CamemBERT-bio are optional: continue without them on failure.
    print("Chargement GLiNER...", flush=True)
    gliner = GlinerManager()
    try:
        gliner.load()
        print("GLiNER chargé.", flush=True)
    except Exception as e:
        print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
        gliner = None

    print("Chargement CamemBERT-bio ONNX...", flush=True)
    camembert = CamembertNerManager()
    try:
        camembert.load()
        print("CamemBERT-bio ONNX chargé.", flush=True)
    except Exception as e:
        print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
        camembert = None

    print("\nPas de VLM (CPU only pour silver export).\n", flush=True)

    ok = ko = skip_encrypted = skip_timeout = 0
    t0 = time.time()
    total = len(pdfs_to_do)

    # Install the SIGALRM handler once; the alarm itself is re-armed per file.
    signal.signal(signal.SIGALRM, timeout_handler)

    for i, pdf in enumerate(pdfs_to_do, 1):
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)

        # Per-file timeout to avoid hanging on large documents
        signal.alarm(TIMEOUT_PER_FILE)
        try:
            core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
                vlm_manager=None,
                gliner_manager=gliner,
                camembert_manager=camembert,
            )
            # Count this success BEFORE computing the rate so the throughput
            # figure includes the file just processed (the previous version
            # lagged one file behind and showed "0/h" on the first success).
            ok += 1
            elapsed_file = time.time() - t0
            rate = ok / elapsed_file * 3600 if elapsed_file > 0 else 0
            print(f"OK ({rate:.0f}/h)", flush=True)
        except TimeoutError:
            print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
            skip_timeout += 1
        except Exception as e:
            err = str(e)
            # Encrypted/password-protected PDFs are skipped, not counted as errors
            if "encrypted" in err.lower() or "password" in err.lower():
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1
        finally:
            # Always disarm the pending alarm — even on unexpected exits such
            # as KeyboardInterrupt — so it cannot fire later.
            signal.alarm(0)

        # Intermediate progress report every 50 files
        if i % 50 == 0:
            elapsed = time.time() - t0
            remaining = (elapsed / i) * (total - i)
            print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
                  f"Erreurs: {ko} | Timeout: {skip_timeout} | "
                  f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)

    elapsed = time.time() - t0
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
    print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}")
    print(f"Sortie: {OUTDIR}")
|
||||
|
||||
|
||||
# Script entry point: run the batch only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user