feat(phase2): Multi-signal NER — BDPM gazetteers, confiance EDS, safe patterns, GLiNER

Chantier 1: Intégration BDPM (5737 médicaments officiels) dans medication whitelist
Chantier 2: Safe patterns contextuels (dosages mg/mL/cpr, formes pharma, même ligne)
Chantier 3: Scores de confiance NER réels (edsnlp 0.20 ner_confidence_score)
Chantier 4: GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé
Chantier 5: Scripts export silver annotations + fine-tuning CamemBERT-bio

0 fuite, 0 régression, -18 FP supplémentaires éliminés.
Sécurité: GLiNER ne peut rejeter que si confiance NER < 0.70.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 12:01:46 +01:00
parent 782551c1c6
commit 26ac02b0cb
16 changed files with 6431 additions and 41 deletions
--- a/scripts/export_silver_annotations.py
+++ b/scripts/export_silver_annotations.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""
+Export silver annotations: generate BIO training data from the existing pipeline.
+==================================================================================
+Turns the output of the current regex+NER+VLM pipeline (the pseudonymised text
+files) into "silver standard" annotations for the 706 OGC documents. These
+annotations are meant as a basis for fine-tuning CamemBERT-bio.
+
+Usage:
+    python scripts/export_silver_annotations.py [--input-dir DIR] [--limit N] [--out-dir DIR]
+
+Output: data/silver_annotations/ with one .bio file per document
+BIO format: TOKEN\tLABEL (one token per line, empty lines between sentences)
+"""
+import sys
+import re
+import json
+import argparse
+from pathlib import Path
+from typing import List, Tuple
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+# Regex matching the bracketed placeholders found in pseudonymised text
+# (e.g. "[NOM]"). The single capturing group holds the placeholder type
+# without the brackets.
+PLACEHOLDER_RE = re.compile(
+    r"\[(NOM|TEL|EMAIL|NIR|IPP|DOSSIER|NDA|EPISODE|RPPS|DATE_NAISSANCE|"
+    r"ADRESSE|CODE_POSTAL|VILLE|MASK|FINESS|OGC|AGE|ETAB|IBAN)\]"
+)
+
+# Mapping from placeholder type to BIO entity label. Several placeholder
+# types share one label (e.g. DOSSIER, NDA, EPISODE and OGC all map to "NDA").
+PH_TO_BIO = {
+    "NOM": "PER",
+    "TEL": "TEL",
+    "EMAIL": "EMAIL",
+    "NIR": "NIR",
+    "IPP": "IPP",
+    "DOSSIER": "NDA",
+    "NDA": "NDA",
+    "EPISODE": "NDA",
+    "RPPS": "RPPS",
+    "DATE_NAISSANCE": "DATE_NAISSANCE",
+    "ADRESSE": "ADRESSE",
+    "CODE_POSTAL": "ZIP",
+    "VILLE": "VILLE",
+    "ETAB": "HOPITAL",
+    "FINESS": "HOPITAL",
+    "IBAN": "IBAN",
+    "AGE": "AGE",
+    "OGC": "NDA",
+    "MASK": "O",  # generic MASK = no specific annotation
+}
+
+
+def text_to_bio(pseudonymised_text: str) -> List[Tuple[str, str]]:
+    """Convert pseudonymised text into a flat list of (token, label) pairs.
+
+    Text between placeholders is split on whitespace and every word is
+    labelled "O". Each placeholder is kept as one "[TYPE]" token labelled
+    "B-<entity>" according to PH_TO_BIO; placeholder types that map to "O"
+    (or are absent from the mapping) are labelled "O". Because a placeholder
+    is always emitted as a single token, no "I-" label is ever produced.
+    """
+    tagged: List[Tuple[str, str]] = []
+
+    # The pattern has one capturing group, so split() returns alternating
+    # pieces: even positions are plain text, odd positions are placeholder types.
+    for position, piece in enumerate(PLACEHOLDER_RE.split(pseudonymised_text)):
+        if position % 2 == 0:
+            # str.split() without arguments never yields empty or padded words.
+            tagged.extend((word, "O") for word in piece.split())
+            continue
+
+        entity = PH_TO_BIO.get(piece, "O")
+        tag = "O" if entity == "O" else f"B-{entity}"
+        tagged.append((f"[{piece}]", tag))
+
+    return tagged
+
+
+def export_document(pseudo_path: Path, out_dir: Path) -> int:
+    """Export one pseudonymised file to a CoNLL-style .bio file.
+
+    The input is read as UTF-8 (undecodable bytes are replaced) and converted
+    line by line with text_to_bio. Each token is written as TOKEN<TAB>LABEL.
+    An empty line (sentence separator) is written after a standalone ".",
+    "!" or "?" token labelled "O", and at the end of every non-empty input
+    line. Processing per line is what makes the line-break heuristic
+    effective: whitespace tokenisation of the whole text would discard the
+    line breaks.
+
+    Args:
+        pseudo_path: path of the "*.pseudonymise.txt" file to convert.
+        out_dir: existing directory receiving the ".bio" file.
+
+    Returns:
+        The number of tokens written. When the document yields no token,
+        nothing is written and 0 is returned.
+    """
+    text = pseudo_path.read_text(encoding="utf-8", errors="replace")
+
+    lines: List[str] = []
+    token_count = 0
+    for raw_line in text.splitlines():
+        for token, label in text_to_bio(raw_line):
+            lines.append(f"{token}\t{label}")
+            token_count += 1
+            if label == "O" and token in (".", "!", "?"):
+                lines.append("")  # sentence separator after final punctuation
+        # A line break also closes the sentence; never stack two separators
+        # and never start the file with one.
+        if lines and lines[-1] != "":
+            lines.append("")
+
+    if token_count == 0:
+        return 0
+
+    # Drop the trailing separator so the file does not end with blank lines.
+    while lines and lines[-1] == "":
+        lines.pop()
+
+    out_path = out_dir / pseudo_path.name.replace(".pseudonymise.txt", ".bio")
+    out_path.write_text("\n".join(lines), encoding="utf-8")
+    return token_count
+
+
+def main():
+    """Command-line entry point: export every pseudonymised file to BIO.
+
+    Parses --input-dir, --out-dir and --limit, creates the output directory,
+    converts each "*.pseudonymise.txt" file found in the input directory (in
+    sorted order, optionally limited to the first N) and prints per-file and
+    total token/entity counts. An entity is any non-empty output line whose
+    label is not "O".
+    """
+    parser = argparse.ArgumentParser(description="Export silver annotations BIO")
+    parser.add_argument("--input-dir", type=Path,
+                        default=Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise_audit_30"),
+                        help="Répertoire contenant les .pseudonymise.txt")
+    parser.add_argument("--out-dir", type=Path,
+                        default=Path(__file__).parent.parent / "data" / "silver_annotations",
+                        help="Répertoire de sortie")
+    parser.add_argument("--limit", type=int, default=0, help="Limiter à N fichiers (0=tous)")
+    args = parser.parse_args()
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    pseudo_files = sorted(args.input_dir.glob("*.pseudonymise.txt"))
+    if args.limit > 0:
+        pseudo_files = pseudo_files[:args.limit]
+
+    print(f"Export silver annotations: {len(pseudo_files)} fichiers → {args.out_dir}")
+
+    total_tokens = 0
+    total_entities = 0
+    for f in pseudo_files:
+        n = export_document(f, args.out_dir)
+        ent_count = 0
+        # export_document writes nothing when it returns 0, so only read the
+        # .bio file back when tokens were actually exported (avoids a
+        # FileNotFoundError, or counting a stale file from a previous run).
+        if n > 0:
+            bio_path = args.out_dir / f.name.replace(".pseudonymise.txt", ".bio")
+            # The file was written as UTF-8; read it back the same way rather
+            # than relying on the platform default encoding.
+            bio_text = bio_path.read_text(encoding="utf-8")
+            ent_count = sum(1 for line in bio_text.splitlines()
+                            if line and not line.endswith("\tO"))
+        total_tokens += n
+        total_entities += ent_count
+        print(f"  {f.name}: {n} tokens, {ent_count} entités")
+
+    print(f"\nTotal: {total_tokens} tokens, {total_entities} entités annotées")
+    print(f"Sortie: {args.out_dir}")
+
+
+if __name__ == "__main__":
+    main()