Initial commit — Pseudonymisation de PDF v5

- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés
  (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 15:03:37 +01:00
commit 8339069c83
18 changed files with 5127 additions and 0 deletions

187
ner_manager_onnx.py Normal file
View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ONNX NER Model Manager (CamemBERT family)
-----------------------------------------
- Chargement paresseux (après lancement de l'appli)
- Support des modèles ONNX publiés (model.onnx / model_quantized.onnx)
- Fallback : export ONNX à la volée si seul un modèle PyTorch est fourni
- Prédiction par paragraphes (token-classification), agrégation 'simple'
Dépendances :
pip install onnxruntime optimum transformers sentencepiece
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Any
import os
from transformers import AutoTokenizer, AutoConfig, pipeline
# Optional runtime dependencies: keep these imports non-fatal so the
# application can start without optimum/onnxruntime installed;
# NerModelManager.load() raises a clear RuntimeError instead.
try:
    from optimum.onnxruntime import ORTModelForTokenClassification
except Exception as e:  # 'e' is unused; kept as-is
    ORTModelForTokenClassification = None  # type: ignore
# ONNX exporter machinery — only needed for the PyTorch -> ONNX fallback path.
try:
    from optimum.exporters.onnx import export
    from optimum.exporters.tasks import TasksManager
except Exception:
    export = None  # type: ignore
    TasksManager = None  # type: ignore
# Built-in catalog of French NER models with published (or exportable) ONNX
# weights, keyed by display name -> HuggingFace repo id.
DEFAULT_MODELS = {
    # Fast & lightweight (quantized variant used when present)
    "DistilCamemBERT-NER (ONNX)": "cmarkea/distilcamembert-base-ner",
    # Robust & widely used
    "CamemBERT-NER (ONNX)": "Jean-Baptiste/camembert-ner",
}
# Entity-group labels accepted per category. Label sets vary across models
# (e.g. "PER" vs "PERSON"), hence sets rather than single strings.
SUPPORTED_PER_TAGS = {"PER", "PERSON"}
SUPPORTED_LOC_TAGS = {"LOC"}
SUPPORTED_ORG_TAGS = {"ORG"}
SUPPORTED_DATE_TAGS = {"DATE"}
@dataclass
class NerThresholds:
    """Per-category minimum confidence scores for keeping NER entities.

    Entities whose pipeline score falls below the matching threshold are
    dropped by NerModelManager.infer_paragraphs().
    """
    per: float = 0.90   # persons
    org: float = 0.90   # organizations
    loc: float = 0.90   # locations
    date: float = 0.85  # dates — lower default than the other categories
class NerModelManager:
    """Lazy loader and runner for ONNX token-classification (NER) models.

    Tries to load a published ONNX artifact (quantized first when
    ``prefer_quantized`` is set); if none is available it can fall back to
    exporting the PyTorch checkpoint to ONNX on the fly via ``optimum``.
    Inference runs per paragraph through a HuggingFace
    ``token-classification`` pipeline with 'simple' aggregation.
    """

    def __init__(self, cache_dir: Optional[Path] = None, prefer_quantized: bool = True, providers: Optional[List[str]] = None):
        """
        Args:
            cache_dir: optional HuggingFace cache directory for downloads.
            prefer_quantized: try ``model_quantized.onnx`` before ``model.onnx``.
            providers: ONNX Runtime execution providers; defaults to CPU only.
        """
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.prefer_quantized = prefer_quantized
        self.providers = providers or ["CPUExecutionProvider"]
        self.model_id: Optional[str] = None
        self._pipe = None       # HF pipeline, built by load()
        self._tokenizer = None  # fast tokenizer matching the model
        self._loaded = False

    # ------------------ public API ------------------
    def is_loaded(self) -> bool:
        """Return True when a model is loaded and the pipeline is usable."""
        return self._loaded and self._pipe is not None

    def load(self, model_id_or_path: str, try_export_if_missing_onnx: bool = True) -> None:
        """Load an ONNX model; if no ONNX file exists and ``try_export_if_missing_onnx``
        is True, export it from the PyTorch checkpoint.

        Accepts either a local directory (containing ``model.onnx``) or a
        HuggingFace repo id.

        Raises:
            RuntimeError: when optimum/onnxruntime are missing, or when every
                load/export attempt failed. The last underlying error is
                chained as ``__cause__`` so callers can see the root cause.
        """
        if ORTModelForTokenClassification is None:
            raise RuntimeError("optimum.onnxruntime introuvable. Installez 'optimum' et 'onnxruntime'.")
        self.unload()
        self.model_id = model_id_or_path
        cache = str(self.cache_dir) if self.cache_dir else None

        loaded = False
        last_err: Optional[Exception] = None

        # The tokenizer does not depend on which ONNX file we pick, so load it
        # once up front instead of once per candidate file.
        tokenizer = None
        try:
            tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
        except Exception as e:
            last_err = e

        # 1) Try published ONNX artifacts: quantized first (when preferred), then plain.
        if tokenizer is not None:
            candidates = (["model_quantized.onnx"] if self.prefer_quantized else []) + ["model.onnx"]
            for fname in candidates:
                try:
                    model = ORTModelForTokenClassification.from_pretrained(
                        self.model_id,
                        file_name=fname,
                        cache_dir=cache,
                        provider=self.providers[0],
                    )
                    self._pipe = pipeline(
                        task="token-classification",
                        model=model,
                        tokenizer=tokenizer,
                        aggregation_strategy="simple",
                    )
                    self._tokenizer = tokenizer
                    loaded = True
                    break
                except Exception as e:
                    last_err = e
                    continue

        # 2) Fallback: export the PyTorch checkpoint to ONNX when requested.
        if not loaded and try_export_if_missing_onnx:
            if export is None or TasksManager is None:
                raise RuntimeError("Impossible d'exporter en ONNX (optimum.exporters manquant).") from last_err
            try:
                tmp_dir = Path(cache or ".") / ".onnx_export"
                tmp_dir.mkdir(parents=True, exist_ok=True)
                # NOTE(review): keyword names below should be confirmed against
                # the installed optimum version's export() signature.
                export(
                    model_name_or_path=self.model_id,
                    output=tmp_dir,
                    task="token-classification",
                    opset=17,
                    optimize="O2",
                    atol=1e-4,
                )
                model = ORTModelForTokenClassification.from_pretrained(str(tmp_dir), file_name="model.onnx", provider=self.providers[0])
                tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
                self._pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
                self._tokenizer = tokenizer
                loaded = True
            except Exception as e:
                last_err = e

        if not loaded:
            raise RuntimeError(f"Échec de chargement/export ONNX pour '{self.model_id}': {last_err}") from last_err
        self._loaded = True

    def unload(self) -> None:
        """Release the pipeline and tokenizer and mark the manager as unloaded."""
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the built-in display-name -> repo-id catalog."""
        return dict(DEFAULT_MODELS)

    # ------------------ inference ------------------
    def infer_paragraphs(self, paragraphs: List[str], thresholds: Optional[NerThresholds] = None, max_length: int = 384, stride: int = 128) -> List[List[Dict[str, Any]]]:
        """Run NER on each paragraph; return aggregated, threshold-filtered entities.

        Each entity dict carries: ``entity_group``, ``score``, ``word``,
        ``start``, ``end``. Returns one (possibly empty) list per input
        paragraph; when no model is loaded, every list is empty.

        NOTE(review): ``max_length`` and ``stride`` are currently unused —
        truncation is hard-wired to the 512-token model limit below. They are
        kept in the signature for backward compatibility; wire them in or
        drop them.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        th = thresholds or NerThresholds()
        out: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            if not para.strip():
                out.append([])  # keep positional alignment with the input
                continue
            input_text = self._truncate_to_model_limit(para)
            ents = self._pipe(input_text, aggregation_strategy="simple")
            out.append(self._filter_by_thresholds(ents or [], th))
        return out

    def _truncate_to_model_limit(self, text: str) -> str:
        """Clip *text* to the 512-token model window (510 + 2 special tokens).

        Done manually for compatibility with recent transformers versions.
        NOTE(review): the encode/decode round-trip may slightly alter the
        text, so entity start/end offsets refer to the truncated, decoded
        string rather than the original paragraph.
        """
        if not self._tokenizer:
            return text
        if len(self._tokenizer.encode(text, add_special_tokens=True)) <= 512:
            return text
        tokens = self._tokenizer.encode(text, add_special_tokens=False)[:510]
        return self._tokenizer.decode(tokens)

    def _filter_by_thresholds(self, ents: List[Dict[str, Any]], th: NerThresholds) -> List[Dict[str, Any]]:
        """Keep only entities that meet their category-specific score threshold."""
        filtered: List[Dict[str, Any]] = []
        for e in ents:
            grp = (e.get("entity_group") or e.get("entity") or "").upper()
            sc = float(e.get("score", 0.0))
            if grp in SUPPORTED_PER_TAGS and sc >= th.per:
                filtered.append(e)
            elif grp in SUPPORTED_ORG_TAGS and sc >= th.org:
                filtered.append(e)
            elif grp in SUPPORTED_LOC_TAGS and sc >= th.loc:
                filtered.append(e)
            elif grp in SUPPORTED_DATE_TAGS and sc >= th.date:
                filtered.append(e)
        return filtered