feat(phase2): Fine-tuning CamemBERT-bio v2 (F1=0.90) + enrichissement données
- Fine-tuning camembert-bio-base : F1=0.903, Recall=0.930 (vs 0.89/0.85)
- Data augmentation : substitution noms INSEE (219K patronymes, x3 copies)
- Hard negatives BDPM (5.7K médicaments) + QUAERO (1319 termes médicaux)
- Annotations silver enrichies par gazetteers (+612 VILLE, +5 HOPITAL)
- Export silver avec support multi-répertoires (--extra-dir)
- Gazetteers QUAERO : CHEM, DISO, PROC, ANAT depuis DrBenchmark/QUAERO
- Gazetteers INSEE : noms de famille fréquents (96K) et complets (219K)
- Batch silver 1194 PDFs (run_batch_silver_export.py) pour dataset v3

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,8 +16,9 @@ import sys
|
||||
import re
|
||||
import difflib
|
||||
import argparse
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
@@ -47,6 +48,178 @@ RE_PLACEHOLDER = re.compile(r"^\[([A-Z_]+)\]$")
|
||||
# Source directory holding the original PDFs; AUDIT_DIR contains the
# pseudonymised audit output (*.pseudonymise.txt) paired with them.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
AUDIT_DIR = SRC / "anonymise_audit_30"

# --- Gazetteer paths ---
# All gazetteers live under <repo>/data; each file is one entry per line.
GAZETTEERS_DIR = Path(__file__).parent.parent / "data"
VILLES_FINESS_PATH = GAZETTEERS_DIR / "finess" / "villes_finess.txt"
COMMUNES_INSEE_PATH = GAZETTEERS_DIR / "insee" / "communes_france.txt"
ETABLISSEMENTS_PATH = GAZETTEERS_DIR / "finess" / "etablissements_distinctifs.txt"

# Context words indicating a VILLE token really is a place (not a common noun).
# Entries include accented and accent-stripped variants; truncated stems
# (e.g. "résid", "hospitalis") are matched as prefixes by _has_ville_context.
VILLE_CONTEXT_WORDS = {
    "à", "a", "de", "né", "née", "nee", "ne", "résid", "resid",
    "hospitalis", "transféré", "transfere", "transferée", "transferee",
    "domicilié", "domicilie", "domiciliée", "domiciliee",
    "habite", "habitant", "demeurant", "originaire", "ville",
    "commune", "cedex",
}
|
||||
|
||||
|
||||
def _strip_accents(s: str) -> str:
|
||||
"""Supprime les accents d'une chaîne (é→e, à→a, etc.)."""
|
||||
nfkd = unicodedata.normalize("NFKD", s)
|
||||
return "".join(c for c in nfkd if not unicodedata.combining(c))
|
||||
|
||||
|
||||
def _normalize_gaz(s: str) -> str:
|
||||
"""Normalise pour comparaison gazetteer : minuscule, sans accents, stripped."""
|
||||
return _strip_accents(s.lower().strip())
|
||||
|
||||
|
||||
def load_gazetteers() -> dict:
    """Load gazetteers from disk, degrading gracefully when files are missing.

    Returns a dict with:
      - "villes": set of normalized token tuples (e.g. ("saint", "palais"))
      - "hopitaux": set of normalized token tuples (e.g. ("ch", "argentan"))
    """
    villes: Set[Tuple[str, ...]] = set()
    hopitaux: Set[Tuple[str, ...]] = set()

    def _entries(path: Path):
        # Yield the non-empty, stripped lines of a file; yield nothing
        # if the file is absent (graceful fallback).
        if not path.exists():
            return
        for raw in path.read_text(encoding="utf-8").splitlines():
            stripped = raw.strip()
            if stripped:
                yield stripped

    # --- FINESS cities (UPPERCASE, one per line) ---
    # Drop "CEDEX" tokens: those mark postal addresses, not city names.
    for entry in _entries(VILLES_FINESS_PATH):
        toks = tuple(_normalize_gaz(t) for t in entry.split() if t != "CEDEX")
        if toks and len(toks[0]) >= 2:  # skip entries that start too short
            villes.add(toks)

    # --- INSEE communes (UPPERCASE, one per line) ---
    for entry in _entries(COMMUNES_INSEE_PATH):
        toks = tuple(_normalize_gaz(t) for t in entry.split())
        if toks and len(toks[0]) >= 2:
            villes.add(toks)

    # --- Facilities (format "- normalized name", lowercase) ---
    for entry in _entries(ETABLISSEMENTS_PATH):
        if not entry.startswith("- "):
            continue
        name = entry[2:].strip()
        if not name:
            continue
        toks = tuple(_normalize_gaz(t) for t in name.split())
        # Skip single-token entries of 3 chars or fewer (too ambiguous).
        if toks and (len(toks) > 1 or len(toks[0]) > 3):
            hopitaux.add(toks)

    # Drop single-token city names shorter than 3 chars
    # (high false-positive risk).
    villes = {v for v in villes if len(v) > 1 or len(v[0]) >= 3}

    return {"villes": villes, "hopitaux": hopitaux}
|
||||
|
||||
|
||||
def _has_ville_context(tokens: List[str], labels: List[str], pos: int,
                       window: int = 3) -> bool:
    """Check whether the token at position `pos` has location-indicating context.

    Scans the `window` tokens preceding `pos` for keywords from
    VILLE_CONTEXT_WORDS. Exact matches always count; prefix matches are
    only allowed for stems of 4+ characters (e.g. "résid" → "résidence",
    "hospitalis" → "hospitalisé"). Previously every keyword was tried as
    a prefix, so one-letter entries like "a"/"de"/"ne" matched nearly any
    French word in the window and defeated the context filter entirely.

    `labels` is accepted for call-site compatibility but not consulted.

    Returns True if a context keyword is found, False otherwise.
    """
    start = max(0, pos - window)
    for i in range(start, pos):
        tok_norm = _normalize_gaz(tokens[i].strip(".,;:!?()[]{}\"'"))
        for ctx in VILLE_CONTEXT_WORDS:
            if tok_norm == ctx:
                return True
            # Prefix matching only for stems long enough to be specific:
            # a bare "a" or "de" prefix would match almost any token.
            if len(ctx) >= 4 and tok_norm.startswith(ctx):
                return True
    return False
|
||||
|
||||
|
||||
def enrich_with_gazetteers(
    bio_tokens: List[Tuple[str, str]],
    gazetteers: dict,
) -> Tuple[List[Tuple[str, str]], int, int]:
    """Enrich BIO annotations with gazetteer matches.

    NEVER overwrites an existing (non-"O") label: new labels are only
    applied to spans whose tokens are all currently "O". Hospitals are
    applied first (more specific, no context constraint); cities
    additionally require a nearby context keyword (_has_ville_context).

    Implementation notes (fixes two defects of the previous version):
    - The old code scanned the whole document once per gazetteer entry,
      i.e. O(|gazetteer| * n) with hundreds of thousands of entries.
      We now index entries by their first normalized token and make a
      single left-to-right pass per entity type.
    - The old code iterated the gazetteer `set` directly, so with
      overlapping entries the winner depended on hash-randomized set
      order, making silver annotations non-reproducible across runs.
      Candidates are now tried longest-first with a deterministic
      tie-break.

    Returns: (enriched_bio_tokens, n_villes_added, n_hopitaux_added)
    """
    tokens = [t for t, _ in bio_tokens]
    labels = [l for _, l in bio_tokens]
    n = len(tokens)

    # Pre-normalize every token once (punctuation, accents, case).
    tokens_norm = [
        _normalize_gaz(t.strip(".,;:!?()[]{}\"'"))
        for t in tokens
    ]

    def _index_by_first_token(entries) -> Dict[str, List[Tuple[str, ...]]]:
        # Map first token -> candidate entries, longest first so the most
        # specific match wins, with a lexicographic tie-break for determinism.
        index: Dict[str, List[Tuple[str, ...]]] = {}
        for entry in entries:
            if entry:
                index.setdefault(entry[0], []).append(entry)
        for candidates in index.values():
            candidates.sort(key=lambda e: (-len(e), e))
        return index

    def _apply(index: Dict[str, List[Tuple[str, ...]]], tag: str,
               need_context: bool) -> int:
        # Single pass over the document; at each position label the longest
        # gazetteer entry whose span is entirely "O" (and, for cities, has
        # location context). Returns the number of spans added.
        added = 0
        i = 0
        while i < n:
            matched_len = 0
            for entry in index.get(tokens_norm[i], ()):
                m = len(entry)
                if i + m > n:
                    continue
                if (all(tokens_norm[i + k] == entry[k] for k in range(m))
                        and all(labels[i + k] == "O" for k in range(m))
                        and (not need_context
                             or _has_ville_context(tokens, labels, i))):
                    matched_len = m
                    break
            if matched_len:
                labels[i] = f"B-{tag}"
                for k in range(1, matched_len):
                    labels[i + k] = f"I-{tag}"
                added += 1
                i += matched_len
            else:
                i += 1
        return added

    # Hospitals first: more specific, no context requirement.
    hop_index = _index_by_first_token(gazetteers.get("hopitaux", set()))
    added_hopitaux = _apply(hop_index, "HOPITAL", need_context=False)

    # Cities: require a location-context keyword in the preceding window.
    ville_index = _index_by_first_token(gazetteers.get("villes", set()))
    added_villes = _apply(ville_index, "VILLE", need_context=True)

    return list(zip(tokens, labels)), added_villes, added_hopitaux
|
||||
|
||||
|
||||
def extract_original_text(pdf_path: Path) -> str:
|
||||
"""Extrait le texte brut d'un PDF (même méthode que le pipeline)."""
|
||||
@@ -173,20 +346,37 @@ def align_and_annotate(original_text: str, pseudo_text: str) -> List[Tuple[str,
|
||||
return bio_tokens
|
||||
|
||||
|
||||
def export_document(pdf_path: Path, pseudo_path: Path, out_dir: Path) -> Tuple[int, int]:
|
||||
"""Exporte un document en format BIO. Retourne (nb_tokens, nb_entités)."""
|
||||
def export_document(
|
||||
pdf_path: Path,
|
||||
pseudo_path: Path,
|
||||
out_dir: Path,
|
||||
gazetteers: dict | None = None,
|
||||
) -> Tuple[int, int, int, int]:
|
||||
"""Exporte un document en format BIO.
|
||||
|
||||
Retourne (nb_tokens, nb_entités_diff, nb_villes_gaz, nb_hopitaux_gaz).
|
||||
"""
|
||||
# Extraire le texte original
|
||||
original_text = extract_original_text(pdf_path)
|
||||
if not original_text.strip():
|
||||
return 0, 0
|
||||
return 0, 0, 0, 0
|
||||
|
||||
# Lire le texte pseudonymisé
|
||||
pseudo_text = pseudo_path.read_text(encoding="utf-8")
|
||||
if not pseudo_text.strip():
|
||||
return 0, 0
|
||||
return 0, 0, 0, 0
|
||||
|
||||
# Aligner et annoter
|
||||
# Aligner et annoter (diff-based)
|
||||
bio_tokens = align_and_annotate(original_text, pseudo_text)
|
||||
n_ents_diff = sum(1 for _, l in bio_tokens if l.startswith("B-"))
|
||||
|
||||
# Enrichissement gazetteer (post-processing)
|
||||
added_villes = 0
|
||||
added_hopitaux = 0
|
||||
if gazetteers:
|
||||
bio_tokens, added_villes, added_hopitaux = enrich_with_gazetteers(
|
||||
bio_tokens, gazetteers
|
||||
)
|
||||
|
||||
# Écrire en format CoNLL
|
||||
out_name = pdf_path.stem + ".bio"
|
||||
@@ -202,8 +392,7 @@ def export_document(pdf_path: Path, pseudo_path: Path, out_dir: Path) -> Tuple[i
|
||||
|
||||
out_path.write_text("\n".join(lines), encoding="utf-8")
|
||||
|
||||
n_ents = sum(1 for _, l in bio_tokens if l.startswith("B-"))
|
||||
return len(bio_tokens), n_ents
|
||||
return len(bio_tokens), n_ents_diff, added_villes, added_hopitaux
|
||||
|
||||
|
||||
def main():
|
||||
@@ -212,12 +401,42 @@ def main():
|
||||
default=Path(__file__).parent.parent / "data" / "silver_annotations",
|
||||
help="Répertoire de sortie")
|
||||
parser.add_argument("--limit", type=int, default=0, help="Limiter à N fichiers (0=tous)")
|
||||
parser.add_argument("--no-gazetteers", action="store_true",
|
||||
help="Désactiver l'enrichissement par gazetteers")
|
||||
parser.add_argument("--extra-dir", type=Path, nargs="*", default=[],
|
||||
help="Répertoires supplémentaires contenant des .pseudonymise.txt")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Trouver les paires PDF + pseudo
|
||||
pseudo_files = sorted(AUDIT_DIR.glob("*.pseudonymise.txt"))
|
||||
# Charger les gazetteers
|
||||
gazetteers = None
|
||||
if not args.no_gazetteers:
|
||||
gazetteers = load_gazetteers()
|
||||
n_villes = len(gazetteers["villes"])
|
||||
n_hop = len(gazetteers["hopitaux"])
|
||||
print(f"Gazetteers chargés: {n_villes} villes, {n_hop} établissements")
|
||||
else:
|
||||
print("Gazetteers désactivés")
|
||||
|
||||
# Trouver les paires PDF + pseudo (audit_30 + extra dirs)
|
||||
search_dirs = [AUDIT_DIR] + list(args.extra_dir)
|
||||
pseudo_files = []
|
||||
for sdir in search_dirs:
|
||||
if sdir.exists():
|
||||
pseudo_files.extend(sorted(sdir.glob("*.pseudonymise.txt")))
|
||||
print(f" {sdir.name}: {len(list(sdir.glob('*.pseudonymise.txt')))} fichiers pseudo")
|
||||
|
||||
# Dédupliquer par nom de base
|
||||
seen_bases = set()
|
||||
unique_pseudo = []
|
||||
for pf in pseudo_files:
|
||||
base = pf.name.replace(".pseudonymise.txt", "")
|
||||
if base not in seen_bases:
|
||||
seen_bases.add(base)
|
||||
unique_pseudo.append(pf)
|
||||
pseudo_files = unique_pseudo
|
||||
|
||||
pairs = []
|
||||
for pseudo_path in pseudo_files:
|
||||
# Retrouver le PDF source
|
||||
@@ -233,17 +452,35 @@ def main():
|
||||
print(f"Export silver annotations: {len(pairs)} documents → {args.out_dir}")
|
||||
|
||||
total_tokens = 0
|
||||
total_entities = 0
|
||||
total_ents_diff = 0
|
||||
total_villes_gaz = 0
|
||||
total_hop_gaz = 0
|
||||
for pdf_path, pseudo_path in pairs:
|
||||
try:
|
||||
n_tok, n_ent = export_document(pdf_path, pseudo_path, args.out_dir)
|
||||
n_tok, n_diff, n_vgaz, n_hgaz = export_document(
|
||||
pdf_path, pseudo_path, args.out_dir, gazetteers
|
||||
)
|
||||
total_tokens += n_tok
|
||||
total_entities += n_ent
|
||||
print(f" {pdf_path.name}: {n_tok} tokens, {n_ent} entités")
|
||||
total_ents_diff += n_diff
|
||||
total_villes_gaz += n_vgaz
|
||||
total_hop_gaz += n_hgaz
|
||||
gaz_info = ""
|
||||
if n_vgaz or n_hgaz:
|
||||
gaz_info = f" (+{n_vgaz} villes, +{n_hgaz} hôpitaux gaz.)"
|
||||
print(f" {pdf_path.name}: {n_tok} tokens, {n_diff} entités diff{gaz_info}")
|
||||
except Exception as e:
|
||||
print(f" {pdf_path.name}: ERREUR {e}")
|
||||
|
||||
print(f"\nTotal: {total_tokens} tokens, {total_entities} entités B-")
|
||||
total_gaz = total_villes_gaz + total_hop_gaz
|
||||
total_all = total_ents_diff + total_gaz
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Total tokens: {total_tokens}")
|
||||
print(f"Entités diff-based: {total_ents_diff} B-")
|
||||
print(f"Entités gazetteers: +{total_gaz} ({total_villes_gaz} VILLE, {total_hop_gaz} HOPITAL)")
|
||||
print(f"Total entités: {total_all} B-")
|
||||
if total_ents_diff > 0:
|
||||
pct = 100 * total_gaz / total_ents_diff
|
||||
print(f"Enrichissement: +{pct:.1f}% par gazetteers")
|
||||
print(f"Sortie: {args.out_dir}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user