feat(ner-first): integrate NER-first flow into pipeline (steps 5-6)

Step 5: anonymise_document_regex now accepts optional NER managers,
runs NER on the original (unmasked) text, and cross-validates
regex-extracted names against NER detections + INSEE gazetteers.
NER-only detections (names found by NER but missed by regex) are
also added. Falls back to original behavior when no NER is available.

Step 6: process_pdf passes NER managers into anonymise_document_regex
for NER-first cross-validation. The existing NER safety net pass on
masked text is preserved (double-pass: original + masked text).

Quality score: 100.0/100 (A+), zero regression.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-31 08:38:56 +02:00
parent 23e19e17e4
commit 22fbf1c772

View File

@@ -2446,7 +2446,8 @@ def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
# ----------------- Anonymisation (regex) -----------------
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any],
eds_pseudo_mgr=None, gliner_mgr=None, camembert_mgr=None) -> AnonResult:
audit: List[PiiHit] = []
# Phase 0 : extraction globale des noms depuis les champs structurés
@@ -2464,8 +2465,39 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
extracted_names.update(trackare_names)
audit.extend(trackare_hits)
all_candidates.extend(trackare_candidates)
# Fusionner les force_names des deux sources
all_force_names = doc_force_names | trackare_force_names
# --- NER-first : validation croisée des noms extraits par regex ---
# Exécuter NER sur le texte original (non masqué) si un moteur NER est disponible
ner_detections: List[NerDetection] = []
if eds_pseudo_mgr or gliner_mgr or camembert_mgr:
ner_detections = _run_ner_on_original_text(
pages_text, eds_pseudo_mgr, gliner_mgr, camembert_mgr, cfg
)
# Valider les candidats par croisement NER + INSEE
if ner_detections or _INSEE_NOMS_FAMILLE:
validated_names, validated_force = _cross_validate_name_candidates(
all_candidates, ner_detections, _INSEE_NOMS_FAMILLE, _INSEE_PRENOMS_SET,
_MEDICAL_STOP_WORDS_SET
)
# Utiliser les noms validés
all_names = validated_names
all_force_names = validated_force
# Ajouter les détections NER-only (noms trouvés par NER mais pas par regex)
for det in ner_detections:
if det.label in ("NOM", "PRENOM") and len(det.token) >= 4:
if det.token.lower() not in _MEDICAL_STOP_WORDS_SET:
all_names.add(det.token)
log.info("NER-first cross-validation: %d noms validés (dont %d force), "
"%d NER-only ajoutés (sur %d détections NER)",
len(validated_names), len(validated_force),
len(all_names) - len(validated_names), len(ner_detections))
else:
# Pas de NER, pas d'INSEE → fallback comportement original
all_names = extracted_names
all_force_names = doc_force_names | trackare_force_names
# Phase 0c-url : détection et masquage des URLs (y compris coupées par saut de ligne)
# Ex: "https://courrier\n.avenir-numerique.fr/owa/#path=/mail/inbox"
@@ -2555,8 +2587,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
# Les PII détectés dans les tables sont toujours dans l'audit (Phase 1 regex).
# Phase 2 : application globale des noms extraits (rattrapage)
if extracted_names:
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)
# Utilise all_names (validé par NER-first si disponible, sinon extracted_names original)
if all_names:
text_out = _apply_extracted_names(text_out, all_names, audit, force_names=all_force_names)
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
text_out = _apply_trackare_hits_to_text(text_out, audit)
@@ -4102,8 +4135,16 @@ def process_pdf(
cfg = load_dictionaries(config_path)
pages_text, tables_lines, ocr_used, ocr_word_map = extract_text_with_fallback_ocr(pdf_path)
# 1) Regex rules
anon = anonymise_document_regex(pages_text, tables_lines, cfg)
# 1) Regex rules + NER-first cross-validation
# Passer les NER managers pour que anonymise_document_regex exécute le NER
# sur le texte original (non masqué) et valide les noms extraits par regex
_eds_mgr_for_regex = ner_manager if (EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager)) else None
anon = anonymise_document_regex(
pages_text, tables_lines, cfg,
eds_pseudo_mgr=_eds_mgr_for_regex,
gliner_mgr=gliner_manager,
camembert_mgr=camembert_manager,
)
# 1b) VLM (optionnel) — sur les PDFs scannés uniquement
if ocr_used and vlm_manager is not None and VlmManager is not None: