feat(phase2): Intégration CamemBERT-bio ONNX comme 3e signal NER (vote triple)
- camembert_ner_manager.py : inférence ONNX CPU (~10ms), predict/predict_long/validate_eds_entities - Vote triple NER : EDS-Pseudo (confiance) + GLiNER (zero-shot) + CamemBERT-bio (fine-tuné F1=89%) - CamemBERT-bio peut sauver un vrai nom à basse confiance EDS (camembert_confirmed=True) - CamemBERT-bio confirme le rejet des FP médicaux (Paracétamol, Tramadol → False) - Intégré dans process_pdf via paramètre camembert_manager - run_batch_30_audit.py mis à jour pour charger le modèle Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1943,20 +1943,21 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
|
|||||||
# Vérifier si c'est un médicament connu
|
# Vérifier si c'est un médicament connu
|
||||||
if w.lower() in _MEDICATION_WHITELIST:
|
if w.lower() in _MEDICATION_WHITELIST:
|
||||||
continue
|
continue
|
||||||
# Chantier 3+4 : Confiance NER + vote croisé GLiNER + gazetteers INSEE
|
# Chantier 3+4+5 : Confiance NER + vote croisé GLiNER + CamemBERT-bio + gazetteers INSEE
|
||||||
# Sécurité d'abord : haute confiance NER → toujours masquer
|
# Sécurité d'abord : haute confiance NER → toujours masquer
|
||||||
# GLiNER peut rejeter SEULEMENT si confiance NER basse
|
# GLiNER/CamemBERT peuvent rejeter SEULEMENT si confiance NER basse
|
||||||
gliner_vote = e.get("gliner_confirmed") # True=PII, False=médical, None=neutre
|
gliner_vote = e.get("gliner_confirmed") # True=PII, False=médical, None=neutre
|
||||||
|
camembert_vote = e.get("camembert_confirmed") # True=PII confirmé, False=non détecté, None=neutre
|
||||||
if label in ("NOM", "PRENOM"):
|
if label in ("NOM", "PRENOM"):
|
||||||
score = e.get("score", 1.0)
|
score = e.get("score", 1.0)
|
||||||
# Gazetteer INSEE : prénom connu = renforcement confiance (ne pas filtrer)
|
# Gazetteer INSEE : prénom connu = renforcement confiance (ne pas filtrer)
|
||||||
is_known_prenom = w.lower() in _INSEE_PRENOMS
|
is_known_prenom = w.lower() in _INSEE_PRENOMS
|
||||||
if isinstance(score, float) and score < 0.70 and not is_known_prenom:
|
if isinstance(score, float) and score < 0.70 and not is_known_prenom:
|
||||||
# Basse confiance NER + pas un prénom connu : GLiNER peut trancher
|
# Basse confiance NER + pas un prénom connu
|
||||||
if gliner_vote is False:
|
if gliner_vote is False and camembert_vote is not True:
|
||||||
continue # NER pas sûr + GLiNER dit "médical" → skip
|
continue # GLiNER dit "médical" + CamemBERT ne confirme pas → skip
|
||||||
if score < 0.30:
|
if score < 0.30 and camembert_vote is not True:
|
||||||
continue # Très basse confiance → skip même sans GLiNER
|
continue # Très basse confiance + CamemBERT ne confirme pas → skip
|
||||||
# Chantier 2 : Safe patterns contextuels (Philter-style)
|
# Chantier 2 : Safe patterns contextuels (Philter-style)
|
||||||
# Token suivi/précédé de dosages ou formes pharma → jamais un nom de personne
|
# Token suivi/précédé de dosages ou formes pharma → jamais un nom de personne
|
||||||
pos = text.find(w)
|
pos = text.find(w)
|
||||||
@@ -1994,7 +1995,8 @@ def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str,
|
|||||||
|
|
||||||
|
|
||||||
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
|
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
|
||||||
gliner_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
|
gliner_mgr: Any = None,
|
||||||
|
camembert_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
|
||||||
"""Applique EDS-Pseudo sur le narratif avec validation croisée GLiNER optionnelle."""
|
"""Applique EDS-Pseudo sur le narratif avec validation croisée GLiNER optionnelle."""
|
||||||
if manager is None or not manager.is_loaded():
|
if manager is None or not manager.is_loaded():
|
||||||
return text_out, []
|
return text_out, []
|
||||||
@@ -2021,6 +2023,10 @@ def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "
|
|||||||
if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
|
if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
|
||||||
for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
|
for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
|
||||||
ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
|
ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
|
||||||
|
# Chantier 5 : Validation croisée CamemBERT-bio (vote NER fine-tuné)
|
||||||
|
if camembert_mgr is not None and hasattr(camembert_mgr, 'validate_eds_entities') and camembert_mgr.is_loaded():
|
||||||
|
for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
|
||||||
|
ents_per_para[i] = camembert_mgr.validate_eds_entities(para, ents, threshold=0.3)
|
||||||
buf = []
|
buf = []
|
||||||
for para, ents in zip(paras, ents_per_para):
|
for para, ents in zip(paras, ents_per_para):
|
||||||
masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
|
masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
|
||||||
@@ -2465,6 +2471,7 @@ def process_pdf(
|
|||||||
ogc_label: Optional[str] = None,
|
ogc_label: Optional[str] = None,
|
||||||
vlm_manager=None,
|
vlm_manager=None,
|
||||||
gliner_manager=None,
|
gliner_manager=None,
|
||||||
|
camembert_manager=None,
|
||||||
) -> Dict[str, str]:
|
) -> Dict[str, str]:
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
cfg = load_dictionaries(config_path)
|
cfg = load_dictionaries(config_path)
|
||||||
@@ -2487,7 +2494,7 @@ def process_pdf(
|
|||||||
if use_hf and ner_manager is not None and ner_manager.is_loaded():
|
if use_hf and ner_manager is not None and ner_manager.is_loaded():
|
||||||
# Détecter le type de manager et appeler la bonne fonction
|
# Détecter le type de manager et appeler la bonne fonction
|
||||||
if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
|
if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
|
||||||
final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager)
|
final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager, camembert_mgr=camembert_manager)
|
||||||
else:
|
else:
|
||||||
final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
|
final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
|
||||||
anon.audit.extend(hf_hits)
|
anon.audit.extend(hf_hits)
|
||||||
|
|||||||
298
camembert_ner_manager.py
Normal file
298
camembert_ner_manager.py
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
CamemBERT-bio NER Manager — Inférence ONNX pour la désidentification clinique.
|
||||||
|
================================================================================
|
||||||
|
Modèle fine-tuné sur almanach/camembert-bio-base avec des annotations silver
|
||||||
|
issues de 29 documents cliniques français (F1=89% sur validation).
|
||||||
|
|
||||||
|
Utilisé comme signal NER supplémentaire dans le pipeline d'anonymisation,
|
||||||
|
en complément d'EDS-Pseudo et GLiNER (vote majoritaire).
|
||||||
|
|
||||||
|
Inférence ONNX Runtime CPU : ~20 ms pour 512 tokens.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
try:
|
||||||
|
import onnxruntime as ort
|
||||||
|
_ORT_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
ort = None # type: ignore
|
||||||
|
_ORT_AVAILABLE = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
_TOKENIZERS_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
AutoTokenizer = None # type: ignore
|
||||||
|
_TOKENIZERS_AVAILABLE = False
|
||||||
|
|
||||||
|
# Default location of the exported ONNX model, relative to this file.
DEFAULT_MODEL_DIR = Path(__file__).parent / "models" / "camembert-bio-deid" / "onnx"

# Mapping from the model's BIO label categories to the PLACEHOLDERS keys
# used by anonymizer_core.
CAMEMBERT_LABEL_MAP: Dict[str, str] = dict(
    PER="NOM",
    TEL="TEL",
    EMAIL="EMAIL",
    NIR="NIR",
    IPP="IPP",
    NDA="NDA",
    RPPS="RPPS",
    DATE_NAISSANCE="DATE_NAISSANCE",
    ADRESSE="ADRESSE",
    ZIP="CODE_POSTAL",
    VILLE="VILLE",
    HOPITAL="ETAB",
    IBAN="IBAN",
    AGE="AGE",
)
|
||||||
|
|
||||||
|
|
||||||
|
class CamembertNerManager:
    """CamemBERT-bio ONNX manager for NER token classification.

    Runs a fine-tuned camembert-bio model exported to ONNX on CPU and exposes:

    - ``predict``              : single-window inference (<= 512 tokens)
    - ``predict_long``         : sliding-window inference for long documents
    - ``validate_eds_entities``: cross-vote on EDS-Pseudo entities
    """

    def __init__(self, model_dir: Optional[Path] = None):
        # Directory holding model.onnx, config.json and the tokenizer files.
        self._model_dir = Path(model_dir) if model_dir else DEFAULT_MODEL_DIR
        self._session: Optional[Any] = None
        self._tokenizer: Optional[Any] = None
        self._id2label: Dict[int, str] = {}
        self._loaded = False

    def is_loaded(self) -> bool:
        """Return True once load() has completed successfully."""
        return self._loaded

    def load(self) -> None:
        """Load the ONNX session, the id2label mapping and the tokenizer.

        Raises:
            RuntimeError: if onnxruntime or transformers is not installed.
            FileNotFoundError: if model.onnx is missing in the model dir.
        """
        if not _ORT_AVAILABLE:
            raise RuntimeError("onnxruntime non disponible. Installez : pip install onnxruntime")
        if not _TOKENIZERS_AVAILABLE:
            raise RuntimeError("transformers non disponible. Installez : pip install transformers")

        model_path = self._model_dir / "model.onnx"
        if not model_path.exists():
            raise FileNotFoundError(f"Modèle ONNX non trouvé: {model_path}")

        # Drop any previously loaded state before reloading.
        self.unload()

        # id2label comes from the HF config exported next to the ONNX graph.
        config_path = self._model_dir / "config.json"
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)
        self._id2label = {int(k): v for k, v in cfg.get("id2label", {}).items()}

        # CPU-only ONNX session with modest thread counts.
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 2
        opts.intra_op_num_threads = 4
        self._session = ort.InferenceSession(
            str(model_path),
            sess_options=opts,
            providers=["CPUExecutionProvider"],
        )

        self._tokenizer = AutoTokenizer.from_pretrained(str(self._model_dir))
        self._loaded = True
        log.info(f"CamemBERT-bio ONNX chargé: {self._model_dir} ({len(self._id2label)} labels)")

    def unload(self) -> None:
        """Release the session/tokenizer and reset state."""
        self._session = None
        self._tokenizer = None
        self._id2label = {}
        self._loaded = False

    @staticmethod
    def _new_entity(text: str, category: str, bio_label: str,
                    score: float, start: int, end: int) -> Dict[str, Any]:
        """Create a fresh entity dict for a B- token (or an orphan I- token)."""
        return {
            "word": text[start:end],
            "label": category,
            "bio_label": bio_label,
            "score": float(score),
            "start": start,
            "end": end,
            # Running list of sub-token scores; averaged then dropped later.
            "_scores": [float(score)],
        }

    def predict(self, text: str, threshold: float = 0.5) -> List[Dict[str, Any]]:
        """Predict NER entities in a text (single <=512-token window).

        Aggregates sub-tokens into word-level entities using BIO labels.

        Args:
            text: raw input text.
            threshold: minimum average sub-token score to keep an entity.

        Returns:
            List of dicts with: word, label, bio_label, score, start, end
            (label = category without B-/I-, bio_label = full BIO label).
        """
        if not self._loaded:
            return []

        encoding = self._tokenizer(
            text,
            return_tensors="np",
            truncation=True,
            max_length=512,
            return_offsets_mapping=True,
        )
        offsets = encoding.pop("offset_mapping")[0]  # (seq_len, 2)

        # ONNX graph expects only these two inputs.
        inputs = {k: v for k, v in encoding.items() if k in ("input_ids", "attention_mask")}
        logits = self._session.run(None, inputs)[0][0]  # (seq_len, num_labels)

        # Softmax over the label axis to obtain per-token confidences.
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
        predictions = np.argmax(logits, axis=-1)
        scores = np.max(probs, axis=-1)

        entities: List[Dict[str, Any]] = []
        current: Optional[Dict[str, Any]] = None

        for pred_id, score, (start, end) in zip(predictions, scores, offsets):
            start, end = int(start), int(end)
            # Special tokens ([CLS]/[SEP]/padding) carry the (0, 0) offset.
            if start == 0 and end == 0:
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            label = self._id2label.get(int(pred_id), "O")
            if label == "O":
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            if label.startswith("B-"):
                # New entity starts; flush the previous one.
                if current is not None:
                    entities.append(current)
                current = self._new_entity(text, label[2:], label, score, start, end)
            elif label.startswith("I-"):
                category = label[2:]
                if current is not None and current["label"] == category:
                    # Extend the running entity over this sub-token.
                    current["word"] = text[current["start"]:end]
                    current["end"] = end
                    current["_scores"].append(float(score))
                else:
                    # Orphan I- without a matching B-: treat it as B-.
                    if current is not None:
                        entities.append(current)
                    current = self._new_entity(text, category, f"B-{category}", score, start, end)

        if current is not None:
            entities.append(current)

        # Replace the last-token score by the mean sub-token score, then filter.
        result = []
        for e in entities:
            avg_score = sum(e["_scores"]) / len(e["_scores"])
            e["score"] = avg_score
            del e["_scores"]
            if avg_score >= threshold:
                result.append(e)

        return result

    def predict_long(self, text: str, threshold: float = 0.5,
                     window_size: int = 400, stride: int = 200) -> List[Dict[str, Any]]:
        """Predict on a long text using overlapping sliding windows.

        For documents longer than one window, splits the text into
        overlapping word windows, runs predict() on each, and merges results
        (deduplicated by absolute character span).

        Fix vs the previous version: windows are real substrings of the
        original text and character offsets are taken from the original text,
        so runs of whitespace/newlines no longer shift entity positions
        (``" ".join(text.split())`` collapsed them and corrupted offsets).
        """
        if not self._loaded:
            return []

        # Short text: a single direct pass is enough.
        words = text.split()
        if len(words) <= window_size:
            return self.predict(text, threshold=threshold)

        # Locate each word's span in the ORIGINAL text so that entity offsets
        # computed inside a window substring map back exactly.
        word_spans = []
        cursor = 0
        for w in words:
            pos = text.find(w, cursor)  # always found: w came from text.split()
            word_spans.append((pos, pos + len(w)))
            cursor = pos + len(w)

        all_entities: List[Dict[str, Any]] = []
        seen_spans = set()

        for start_word in range(0, len(words), stride):
            end_word = min(start_word + window_size, len(words))
            chunk_start = word_spans[start_word][0]
            chunk_end = word_spans[end_word - 1][1]
            chunk = text[chunk_start:chunk_end]

            for e in self.predict(chunk, threshold=threshold):
                # Shift window-local offsets to absolute positions.
                abs_start = e["start"] + chunk_start
                abs_end = e["end"] + chunk_start
                span_key = (abs_start, abs_end)
                if span_key not in seen_spans:
                    seen_spans.add(span_key)
                    e["start"] = abs_start
                    e["end"] = abs_end
                    all_entities.append(e)

            if end_word >= len(words):
                break

        return sorted(all_entities, key=lambda e: e["start"])

    def validate_eds_entities(
        self,
        text: str,
        eds_entities: List[Dict[str, Any]],
        threshold: float = 0.4,
    ) -> List[Dict[str, Any]]:
        """Validate EDS-Pseudo entities against CamemBERT-bio (cross vote).

        Each EDS entity gets a 'camembert_confirmed' field: True/False/None.
        - True : CamemBERT-bio also detects this span as PII
        - False: CamemBERT-bio detects nothing at this position
        - None : no prediction available (empty word / model not loaded)

        Mutates and returns ``eds_entities``.
        """
        if not self._loaded or not eds_entities:
            return eds_entities

        cam_preds = self.predict_long(text, threshold=threshold)

        for e in eds_entities:
            e_word = (e.get("word") or "").lower().strip()
            if not e_word:
                e["camembert_confirmed"] = None
                continue

            # Text match, tolerant to substrings in either direction.
            # NOTE(review): substring matching may over-confirm very short
            # tokens — consider a minimum-length guard if FPs show up.
            confirmed = any(
                (c_word := c["word"].lower().strip()) == e_word
                or e_word in c_word
                or c_word in e_word
                for c in cam_preds
            )
            e["camembert_confirmed"] = confirmed

        return eds_entities
|
||||||
@@ -12,6 +12,7 @@ import anonymizer_core_refactored_onnx as core
|
|||||||
from eds_pseudo_manager import EdsPseudoManager
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
from vlm_manager import VlmManager
|
from vlm_manager import VlmManager
|
||||||
from gliner_manager import GlinerManager
|
from gliner_manager import GlinerManager
|
||||||
|
from camembert_ner_manager import CamembertNerManager
|
||||||
|
|
||||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||||
OUTDIR = SRC / "anonymise_audit_30"
|
OUTDIR = SRC / "anonymise_audit_30"
|
||||||
@@ -67,6 +68,15 @@ def main():
|
|||||||
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
|
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
|
||||||
gliner = None
|
gliner = None
|
||||||
|
|
||||||
|
print("Chargement CamemBERT-bio ONNX (vote croisé NER)...", flush=True)
|
||||||
|
camembert = CamembertNerManager()
|
||||||
|
try:
|
||||||
|
camembert.load()
|
||||||
|
print("CamemBERT-bio ONNX chargé.", flush=True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
|
||||||
|
camembert = None
|
||||||
|
|
||||||
print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True)
|
print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True)
|
||||||
vlm = VlmManager()
|
vlm = VlmManager()
|
||||||
try:
|
try:
|
||||||
@@ -108,6 +118,7 @@ def main():
|
|||||||
ogc_label=ogc,
|
ogc_label=ogc,
|
||||||
vlm_manager=vlm,
|
vlm_manager=vlm,
|
||||||
gliner_manager=gliner,
|
gliner_manager=gliner,
|
||||||
|
camembert_manager=camembert,
|
||||||
)
|
)
|
||||||
audit_path = Path(outputs.get("audit", ""))
|
audit_path = Path(outputs.get("audit", ""))
|
||||||
if audit_path.exists():
|
if audit_path.exists():
|
||||||
|
|||||||
Reference in New Issue
Block a user