feat(T-I): validateur paranames + filtre mots-outils FR du gazetteer

Le validateur scripts/validate_paranames.py, exécuté sur le gazetteer réel, révèle 2 défauts, désormais corrigés :

- Mots-outils FR (avec/dans/voir/...) présents dans INSEE/paranames → risque de faux positifs (FP) au contexte 'low'. Ajout de 347 mots-outils spaCy fr (sûrs, filtrés des patronymes INSEE fréquents) à stopwords_manuels.txt. build_paranames_gazetteer.py filtre désormais aussi contre ce fichier ; gazetteer reconstruit (1 379 196 noms, mots-outils ≥ 3 caractères retirés).
- Priorité sécurité respectée : allez/polygone sont de vrais patronymes INSEE rares → laissés MASQUABLES (pas de fuite), hors stopwords.
- OYARCABAL reclassé en warning (couvert par regex F3, absent de Wikidata).

Garde-fous vérifiés : Petit/Boucher/Berger conservés, noms étrangers (EJNAINI/NGUYEN/...) conservés. Validateur 5/5. tests/unit 85 passed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 11:20:21 +02:00
parent 87377a54de
commit c110de4a2e
5 changed files with 528 additions and 7 deletions
--- a/scripts/build_paranames_gazetteer.py
+++ b/scripts/build_paranames_gazetteer.py
@@ -42,6 +42,7 @@ from typing import Iterable, Iterator
 REPO_ROOT = Path(__file__).resolve().parent.parent
 DATA_DIR = REPO_ROOT / "data" / "paranames"
 BDPM_STOPWORDS = REPO_ROOT / "data" / "bdpm" / "medicaments_stopwords.txt"
+MANUAL_STOPWORDS = REPO_ROOT / "data" / "stopwords_manuels.txt"
 INSEE_NOMS = REPO_ROOT / "data" / "insee" / "noms_famille_france.txt"

 OUT_NOMS = DATA_DIR / "noms_famille_world.txt.gz"
@@ -68,12 +69,13 @@ def normalize(token: str) -> str:
    return "".join(c for c in up if "A" <= c <= "Z")


-def load_stopwords() -> set[str]:
-    stop: set[str] = set()
-    if not BDPM_STOPWORDS.exists():
-        print(f"[WARN] {BDPM_STOPWORDS} introuvable — pas de filtrage BDPM.")
-        return stop
-    with BDPM_STOPWORDS.open("r", encoding="utf-8") as f:
+def _load_terms(path: Path, label: str, stop: set[str]) -> None:
+    """Charge un fichier de termes (1 par ligne, # = commentaire) dans stop."""
+    if not path.exists():
+        print(f"[WARN] {path} introuvable — pas de filtrage {label}.")
+        return
+    before = len(stop)
+    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
@@ -81,7 +83,21 @@ def load_stopwords() -> set[str]:
            n = normalize(line)
            if n:
                stop.add(n)
-    print(f"[INFO] BDPM stop-words : {len(stop):,} entrées.")
+    print(f"[INFO] {label} : +{len(stop) - before:,} termes.")
+
+
+def load_stopwords() -> set[str]:
+    """Filtre cumulé : médicaments BDPM + mots-outils/stop-words français curés.
+
+    stopwords_manuels.txt couvre les mots français courants qui apparaissent à
+    tort comme patronymes dans INSEE/paranames (ex. « voir », « avec »). Ces
+    termes sont exclus du gazetteer pour réduire les faux positifs au contexte
+    de détection faible.
+    """
+    stop: set[str] = set()
+    _load_terms(BDPM_STOPWORDS, "BDPM stop-words", stop)
+    _load_terms(MANUAL_STOPWORDS, "stop-words manuels FR", stop)
+    print(f"[INFO] Filtre total : {len(stop):,} termes.")
    return stop