feat: T2A pipeline - anonymization, CIM-10 extraction and edsnlp integration

Complete processing pipeline for medical PDF documents:
- Text extraction (pdfplumber) and document classification (Trackare/CRH)
- Multi-layer anonymization (regex + CamemBERT NER + sweep)
- Hybrid CIM-10 (ICD-10) medical extraction: edsnlp (AP-HP) enriches diagnoses, medications (ATC codes via Romedi) and negation detection, with a regex fallback for specific patterns
- Fix: sentencepiece pinned to <0.2.0 for CamemBERT compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
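For orientation, a minimal sketch of what the edsnlp layer described above could look like, assuming the standard edsnlp/spaCy API; the pipe configuration and the matcher terms here are illustrative, not taken from this commit:

    import spacy  # with edsnlp installed, the "eds" language and pipes are registered

    nlp = spacy.blank("eds")
    nlp.add_pipe("eds.sentences")  # rule-based sentence segmentation
    nlp.add_pipe("eds.matcher", config={"terms": {"pneumopathie": ["pneumopathie"]}})
    nlp.add_pipe("eds.negation")   # NegEx-style negation detection

    doc = nlp("Pas d'argument en faveur d'une pneumopathie.")
    for ent in doc.ents:
        print(ent.text, "negated:", ent._.negation)

Under that layout, the regex fallback mentioned in the message would only cover patterns such a pipeline misses.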
src/anonymization/ner_anonymizer.py (new file, 95 lines)
@@ -0,0 +1,95 @@
"""NER via CamemBERT to detect names in free text."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from ..config import NER_CONFIDENCE_THRESHOLD, NER_MODEL

if TYPE_CHECKING:
    from transformers import Pipeline

logger = logging.getLogger(__name__)

# Module-level singleton: the transformers pipeline is loaded once, on first use
_pipeline: Pipeline | None = None


def _get_pipeline() -> Pipeline:
    """Load the NER model (lazy loading)."""
    global _pipeline
    if _pipeline is None:
        logger.info("Loading NER model %s...", NER_MODEL)
        # Imported here so that importing this module stays cheap
        from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

        tokenizer = AutoTokenizer.from_pretrained(NER_MODEL)
        model = AutoModelForTokenClassification.from_pretrained(NER_MODEL)
        _pipeline = pipeline(
            "ner",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
        )
        logger.info("NER model loaded.")
    return _pipeline


def extract_person_entities(text: str) -> list[dict]:
    """Extract PER (person) entities from the text.

    Returns a list of dicts with 'word', 'start', 'end', 'score'.
    """
    pipe = _get_pipeline()

    # CamemBERT has a token limit: split the text into chunks
    chunks = _split_text(text, max_chars=500)
    entities: list[dict] = []
    offset = 0

    for chunk in chunks:
        results = pipe(chunk)
        for ent in results:
            if ent["entity_group"] == "PER" and ent["score"] >= NER_CONFIDENCE_THRESHOLD:
                word = ent["word"].strip()
                if len(word) >= 2:
                    entities.append({
                        "word": word,
                        "start": ent["start"] + offset,
                        "end": ent["end"] + offset,
                        "score": float(ent["score"]),
                    })
        offset += len(chunk)

    return _deduplicate(entities)


def _split_text(text: str, max_chars: int = 500) -> list[str]:
    """Split the text into reasonably sized chunks at sentence boundaries."""
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = start + max_chars
        if end < len(text):
            # Look for the closest sentence boundary, strongest separator first
            for sep in ["\n", ". ", ", ", " "]:
                pos = text.rfind(sep, start, end)
                if pos > start:
                    end = pos + len(sep)
                    break
        chunks.append(text[start:end])
        start = end

    return chunks


def _deduplicate(entities: list[dict]) -> list[dict]:
    """Deduplicate entities by word (keeping the highest score)."""
    seen: dict[str, dict] = {}
    for ent in entities:
        key = ent["word"].lower()
        if key not in seen or ent["score"] > seen[key]["score"]:
            seen[key] = ent
    return list(seen.values())
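Usage sketch for the module above; the import path, the sample text and the [NOM] placeholder are illustrative assumptions about the surrounding package. Note that since _deduplicate keeps a single entry per surface form, a plain string replace is what covers repeated mentions of the same name:

    from src.anonymization.ner_anonymizer import extract_person_entities

    text = "Courrier du Dr Martin, relu par le Dr Martin."
    redacted = text
    for ent in extract_person_entities(text):
        # one entry per unique word, so replace every occurrence of it
        redacted = redacted.replace(ent["word"], "[NOM]")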