feat: réduction FP + gazetteers adresses FINESS + batch parallèle + corrections multi-axes
- Token min length relevé de 2-3 → 4 chars (élimine FP EPO, IRC, SIB...)
- Stop-words enrichis : acronymes médicaux 3 lettres, termes pharma, soins infirmiers
- BDPM stop-words : ~7300 noms commerciaux + DCI/substances actives
- Gazetteers adresses FINESS : 63K patterns Aho-Corasick (position-preserving normalization)
- Filtre contextuel anatomique pour FINESS établissements
- Nouvelles regex : RE_CIVILITE_COMMA_LIST, RE_EXTRACT_NOM_UTILISE, RE_EXTRACT_PRENOM, RE_NUM_EXAMEN_PATIENT, RE_ADRESSE_LIEU_DIT, RE_CIVILITE_INITIALE, Dr X.NOM
- URLs complètes (RE_URL) + détection multiline
- N° venue inversé (layout-aware) + EPISODE/NDA dans _CRITICAL_PII_TYPES
- HospitalFilter désactivé pour ADRESSE/TEL/VILLE/EPISODE (identifient le patient)
- Batch silver export parallélisé (multiprocessing spawn, N workers)
- Seuil sur-masquage relevé à 8%, server.py enrichi (source regex/ner)
- Blacklist villes : COURANT, PARIS ; contexte villes étendu (UHCD, spécialités)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -24,9 +24,14 @@ blacklist:
|
|||||||
- '640780417'
|
- '640780417'
|
||||||
- 'Dates du séjour :'
|
- 'Dates du séjour :'
|
||||||
- CONCERTATION
|
- CONCERTATION
|
||||||
|
- BAYONNE CEDEX
|
||||||
|
- BAYONNE
|
||||||
|
- '64109'
|
||||||
|
- LABORATOIRE de BIOLOGIE MEDICALE
|
||||||
force_mask_regex:
|
force_mask_regex:
|
||||||
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
|
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
|
||||||
- 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
|
- 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
|
||||||
|
- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
|
||||||
kv_labels_preserve:
|
kv_labels_preserve:
|
||||||
- FINESS
|
- FINESS
|
||||||
- IPP
|
- IPP
|
||||||
|
|||||||
15816
data/bdpm/CIS_bdpm.txt
Normal file
15816
data/bdpm/CIS_bdpm.txt
Normal file
File diff suppressed because it is too large
Load Diff
7316
data/bdpm/medicaments_stopwords.txt
Normal file
7316
data/bdpm/medicaments_stopwords.txt
Normal file
File diff suppressed because it is too large
Load Diff
63107
data/finess/adresses_finess.txt
Normal file
63107
data/finess/adresses_finess.txt
Normal file
File diff suppressed because it is too large
Load Diff
52463
data/finess/voies_distinctives.txt
Normal file
52463
data/finess/voies_distinctives.txt
Normal file
File diff suppressed because it is too large
Load Diff
218984
data/insee/noms2008nat_txt.txt
Normal file
218984
data/insee/noms2008nat_txt.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1184,8 +1184,8 @@ déglobulisation. O
|
|||||||
Bladder O
|
Bladder O
|
||||||
négatif. O
|
négatif. O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
antalgique O
|
antalgique O
|
||||||
: O
|
: O
|
||||||
Faux B-VILLE
|
Faux B-VILLE
|
||||||
@@ -1515,8 +1515,8 @@ cette O
|
|||||||
patiente O
|
patiente O
|
||||||
altérée O
|
altérée O
|
||||||
sur O
|
sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
général, O
|
général, O
|
||||||
OMS2/3. O
|
OMS2/3. O
|
||||||
> O
|
> O
|
||||||
@@ -1529,8 +1529,8 @@ du O
|
|||||||
traitement O
|
traitement O
|
||||||
antalgique. O
|
antalgique. O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
infectieux O
|
infectieux O
|
||||||
: O
|
: O
|
||||||
Pic O
|
Pic O
|
||||||
@@ -2817,8 +2817,8 @@ apyrexie O
|
|||||||
au O
|
au O
|
||||||
décours. O
|
décours. O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
urologique O
|
urologique O
|
||||||
: O
|
: O
|
||||||
Un O
|
Un O
|
||||||
@@ -2919,8 +2919,8 @@ oncologique O
|
|||||||
Nette O
|
Nette O
|
||||||
amélioration O
|
amélioration O
|
||||||
sur O
|
sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
général O
|
général O
|
||||||
avec O
|
avec O
|
||||||
la O
|
la O
|
||||||
|
|||||||
@@ -2572,8 +2572,8 @@ de O
|
|||||||
traitement O
|
traitement O
|
||||||
antibiotique O
|
antibiotique O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
hématologique O
|
hématologique O
|
||||||
Anémie O
|
Anémie O
|
||||||
autour O
|
autour O
|
||||||
|
|||||||
@@ -1812,8 +1812,8 @@ de O
|
|||||||
cette O
|
cette O
|
||||||
décision. O
|
décision. O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
hématologique: O
|
hématologique: O
|
||||||
Elle O
|
Elle O
|
||||||
présente O
|
présente O
|
||||||
|
|||||||
@@ -1420,8 +1420,8 @@ en O
|
|||||||
charge O
|
charge O
|
||||||
antalgique. O
|
antalgique. O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
de O
|
de O
|
||||||
la O
|
la O
|
||||||
gravité: O
|
gravité: O
|
||||||
|
|||||||
@@ -1102,8 +1102,8 @@ de O
|
|||||||
l'épisode O
|
l'épisode O
|
||||||
aigüe. O
|
aigüe. O
|
||||||
Sur O
|
Sur O
|
||||||
le O
|
le B-VILLE
|
||||||
plan B-VILLE
|
plan I-VILLE
|
||||||
infectieux, O
|
infectieux, O
|
||||||
présence O
|
présence O
|
||||||
de O
|
de O
|
||||||
|
|||||||
@@ -166,24 +166,13 @@ class HospitalFilter:
|
|||||||
Returns:
|
Returns:
|
||||||
True si la détection doit être filtrée (faux positif)
|
True si la détection doit être filtrée (faux positif)
|
||||||
"""
|
"""
|
||||||
# Filtrer par type
|
# ADRESSE, CODE_POSTAL, VILLE, TEL : NE PAS filtrer.
|
||||||
if pii_type == "ADRESSE":
|
# Les coordonnées hospitalières identifient indirectement le patient
|
||||||
return self.is_hospital_address(text)
|
# et doivent être masquées (validé par contrôle humain 2026-03-12).
|
||||||
|
|
||||||
elif pii_type == "CODE_POSTAL":
|
# EPISODE : NE PAS filtrer.
|
||||||
return self.is_hospital_postal_code(text)
|
# Les numéros d'épisode identifient le patient (validé 2026-03-14).
|
||||||
|
|
||||||
elif pii_type == "VILLE":
|
|
||||||
return self.is_hospital_city(text)
|
|
||||||
|
|
||||||
elif pii_type == "TEL":
|
|
||||||
return self.is_hospital_phone(text)
|
|
||||||
|
|
||||||
elif pii_type == "EPISODE":
|
|
||||||
# Filtrer les épisodes qui proviennent du nom de fichier
|
|
||||||
# (répétés dans les en-têtes/pieds de page des documents trackare)
|
|
||||||
return self.is_episode_in_filename(text, filename)
|
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def filter_detections(self, detections: List[Dict], filename: str = "", is_trackare: bool = False) -> List[Dict]:
|
def filter_detections(self, detections: List[Dict], filename: str = "", is_trackare: bool = False) -> List[Dict]:
|
||||||
@@ -222,15 +211,17 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
# Tests
|
# Tests
|
||||||
test_cases = [
|
test_cases = [
|
||||||
("ADRESSE", "13, Avenue de l'Interne J", "", -1, True),
|
# ADRESSE, CODE_POSTAL, VILLE, TEL : ne sont plus filtrés (identifient le patient)
|
||||||
|
("ADRESSE", "13, Avenue de l'Interne J", "", -1, False),
|
||||||
("ADRESSE", "22 LOT MENDI ALDE", "", -1, False),
|
("ADRESSE", "22 LOT MENDI ALDE", "", -1, False),
|
||||||
("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, True),
|
("CODE_POSTAL", "64109 BAYONNE CEDEX", "", -1, False),
|
||||||
("CODE_POSTAL", "64130", "", -1, False),
|
("CODE_POSTAL", "64130", "", -1, False),
|
||||||
("VILLE", "BAYONNE CEDEX", "", -1, True),
|
("VILLE", "BAYONNE CEDEX", "", -1, False),
|
||||||
("VILLE", "CHERAUTE", "", -1, False),
|
("VILLE", "CHERAUTE", "", -1, False),
|
||||||
("VILLE", "DROIT", "", -1, True), # Terme anatomique
|
("VILLE", "DROIT", "", -1, False),
|
||||||
("TEL", "05 59 44 35 35", "", -1, True),
|
("TEL", "05 59 44 35 35", "", -1, False),
|
||||||
("TEL", "0676085336", "", -1, False),
|
("TEL", "0676085336", "", -1, False),
|
||||||
|
# EPISODE : filtré uniquement si provient du nom de fichier trackare
|
||||||
("EPISODE", "23202435", "trackare-14004105-23202435", -1, True),
|
("EPISODE", "23202435", "trackare-14004105-23202435", -1, True),
|
||||||
("EPISODE", "23102610", "CRH_23102610", 0, False),
|
("EPISODE", "23102610", "CRH_23102610", 0, False),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,18 +1,18 @@
|
|||||||
{
|
{
|
||||||
"date": "2026-03-12T10:24:59.261417",
|
"date": "2026-03-12T17:16:25.993851",
|
||||||
"scores": {
|
"scores": {
|
||||||
"global_score": 97.0,
|
"global_score": 97.0,
|
||||||
"leak_score": 100.0,
|
"leak_score": 100.0,
|
||||||
"fp_score": 90,
|
"fp_score": 90,
|
||||||
"totals": {
|
"totals": {
|
||||||
"documents": 29,
|
"documents": 29,
|
||||||
"audit_hits": 2797,
|
"audit_hits": 3186,
|
||||||
"name_tokens_known": 461,
|
"name_tokens_known": 457,
|
||||||
"leak_audit": 0,
|
"leak_audit": 0,
|
||||||
"leak_occurrences": 0,
|
"leak_occurrences": 0,
|
||||||
"leak_regex": 0,
|
"leak_regex": 0,
|
||||||
"leak_insee_high": 0,
|
"leak_insee_high": 0,
|
||||||
"leak_insee_medium": 569,
|
"leak_insee_medium": 570,
|
||||||
"fp_medical": 0,
|
"fp_medical": 0,
|
||||||
"fp_overmasking": 2
|
"fp_overmasking": 2
|
||||||
}
|
}
|
||||||
@@ -110,7 +110,7 @@
|
|||||||
"leak_audit": 0,
|
"leak_audit": 0,
|
||||||
"leak_regex": 0,
|
"leak_regex": 0,
|
||||||
"leak_insee_high": 0,
|
"leak_insee_high": 0,
|
||||||
"leak_insee_medium": 23,
|
"leak_insee_medium": 24,
|
||||||
"fp_medical": 0,
|
"fp_medical": 0,
|
||||||
"fp_overmasking": 0
|
"fp_overmasking": 0
|
||||||
},
|
},
|
||||||
@@ -206,7 +206,7 @@
|
|||||||
"leak_audit": 0,
|
"leak_audit": 0,
|
||||||
"leak_regex": 0,
|
"leak_regex": 0,
|
||||||
"leak_insee_high": 0,
|
"leak_insee_high": 0,
|
||||||
"leak_insee_medium": 32,
|
"leak_insee_medium": 33,
|
||||||
"fp_medical": 0,
|
"fp_medical": 0,
|
||||||
"fp_overmasking": 0
|
"fp_overmasking": 0
|
||||||
},
|
},
|
||||||
@@ -222,7 +222,7 @@
|
|||||||
"leak_audit": 0,
|
"leak_audit": 0,
|
||||||
"leak_regex": 0,
|
"leak_regex": 0,
|
||||||
"leak_insee_high": 0,
|
"leak_insee_high": 0,
|
||||||
"leak_insee_medium": 34,
|
"leak_insee_medium": 32,
|
||||||
"fp_medical": 0,
|
"fp_medical": 0,
|
||||||
"fp_overmasking": 0
|
"fp_overmasking": 0
|
||||||
},
|
},
|
||||||
@@ -246,7 +246,7 @@
|
|||||||
"leak_audit": 0,
|
"leak_audit": 0,
|
||||||
"leak_regex": 0,
|
"leak_regex": 0,
|
||||||
"leak_insee_high": 0,
|
"leak_insee_high": 0,
|
||||||
"leak_insee_medium": 26,
|
"leak_insee_medium": 27,
|
||||||
"fp_medical": 0,
|
"fp_medical": 0,
|
||||||
"fp_overmasking": 0
|
"fp_overmasking": 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,27 +1,24 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Batch anonymisation de PDFs pour enrichir le dataset silver.
|
"""Batch anonymisation parallèle de PDFs pour enrichir le dataset silver.
|
||||||
|
|
||||||
Traite TOUS les PDFs disponibles (excluant ceux déjà dans audit_30) en mode CPU
|
Traite TOUS les PDFs disponibles en mode CPU (sans VLM), avec N workers
|
||||||
uniquement (sans VLM) pour générer des .pseudonymise.txt utilisables par
|
parallèles. Chaque worker charge ses propres modèles NER.
|
||||||
export_silver_annotations.py.
|
|
||||||
|
|
||||||
Timeout par fichier pour éviter les blocages sur les gros documents.
|
|
||||||
Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
|
Reprend automatiquement là où il s'est arrêté (skip les déjà traités).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python run_batch_silver_export.py # 6 workers (défaut)
|
||||||
|
python run_batch_silver_export.py --workers 4 # 4 workers
|
||||||
"""
|
"""
|
||||||
import sys
|
import sys
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
import signal
|
import argparse
|
||||||
import random
|
import multiprocessing as mp
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter
|
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
|
||||||
from eds_pseudo_manager import EdsPseudoManager
|
|
||||||
from gliner_manager import GlinerManager
|
|
||||||
from camembert_ner_manager import CamembertNerManager
|
|
||||||
|
|
||||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||||
OUTDIR = SRC / "anonymise_silver_extra"
|
OUTDIR = SRC / "anonymise_silver_extra"
|
||||||
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
||||||
@@ -62,16 +59,102 @@ ALREADY_DONE_AUDIT30 = {
|
|||||||
|
|
||||||
TIMEOUT_PER_FILE = 120 # secondes max par PDF
|
TIMEOUT_PER_FILE = 120 # secondes max par PDF
|
||||||
|
|
||||||
|
# Variables globales par worker (initialisées une seule fois)
|
||||||
class TimeoutError(Exception):
|
_worker_ner = None
|
||||||
pass
|
_worker_gliner = None
|
||||||
|
_worker_camembert = None
|
||||||
|
_worker_id = None
|
||||||
|
|
||||||
|
|
||||||
def timeout_handler(signum, frame):
|
def init_worker(worker_id):
|
||||||
raise TimeoutError("Timeout")
|
"""Initialise les modèles NER dans chaque worker (appelé une seule fois)."""
|
||||||
|
global _worker_ner, _worker_gliner, _worker_camembert, _worker_id
|
||||||
|
_worker_id = worker_id
|
||||||
|
|
||||||
|
# Limiter les threads ONNX/OpenMP par worker pour éviter la contention
|
||||||
|
n_threads = max(2, 32 // (mp.cpu_count() // 2)) # répartir équitablement
|
||||||
|
os.environ["OMP_NUM_THREADS"] = str(n_threads)
|
||||||
|
os.environ["MKL_NUM_THREADS"] = str(n_threads)
|
||||||
|
|
||||||
|
import anonymizer_core_refactored_onnx as core # noqa: F401
|
||||||
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
from gliner_manager import GlinerManager
|
||||||
|
from camembert_ner_manager import CamembertNerManager
|
||||||
|
|
||||||
|
_worker_ner = EdsPseudoManager()
|
||||||
|
_worker_ner.load()
|
||||||
|
print(f" [W{worker_id}] EDS-Pseudo chargé", flush=True)
|
||||||
|
|
||||||
|
_worker_gliner = GlinerManager()
|
||||||
|
try:
|
||||||
|
_worker_gliner.load()
|
||||||
|
print(f" [W{worker_id}] GLiNER chargé", flush=True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" [W{worker_id}] GLiNER indisponible ({e})", flush=True)
|
||||||
|
_worker_gliner = None
|
||||||
|
|
||||||
|
_worker_camembert = CamembertNerManager()
|
||||||
|
try:
|
||||||
|
_worker_camembert.load()
|
||||||
|
print(f" [W{worker_id}] CamemBERT-bio chargé", flush=True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" [W{worker_id}] CamemBERT-bio indisponible ({e})", flush=True)
|
||||||
|
_worker_camembert = None
|
||||||
|
|
||||||
|
print(f" [W{worker_id}] Prêt (threads={n_threads})", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def process_one_pdf(args):
    """Anonymise a single PDF; called by the worker pool.

    Args:
        args: ``(pdf_path, idx, total)`` tuple. ``pdf_path`` is the PDF to
            process; ``idx`` and ``total`` are passed back unchanged so the
            parent process can report progress.

    Returns:
        A tuple whose first item is the status:
        ``("OK", name, idx, total)`` on success,
        ``("TIMEOUT", name, idx, total)`` when ``TIMEOUT_PER_FILE`` seconds
        elapse, ``("SKIP", name, idx, total)`` when the error message
        mentions "encrypted" or "password", and
        ``("ERROR", name, idx, total, message)`` otherwise, with the message
        truncated to 100 characters.
    """
    pdf_path, idx, total = args
    # Imported here rather than at module level, as in the original worker code.
    import signal
    import anonymizer_core_refactored_onnx as core

    # OGC label = parent directory name up to the first underscore.
    ogc = pdf_path.parent.name.split("_")[0]

    def _timeout_handler(signum, frame):
        raise TimeoutError("Timeout")

    # Per-file timeout via SIGALRM (Unix only).
    signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(TIMEOUT_PER_FILE)

    try:
        try:
            core.process_pdf(
                pdf_path=pdf_path,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=False,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=_worker_ner,
                ner_thresholds=None,
                ogc_label=ogc,
                vlm_manager=None,
                gliner_manager=_worker_gliner,
                camembert_manager=_worker_camembert,
            )
        finally:
            # Always disarm the alarm as soon as processing ends, whatever the
            # exit path, so a pending SIGALRM cannot fire later in this worker.
            signal.alarm(0)
    except TimeoutError:
        return ("TIMEOUT", pdf_path.name, idx, total)
    except Exception as e:
        err = str(e)
        lowered = err.lower()
        if "encrypted" in lowered or "password" in lowered:
            return ("SKIP", pdf_path.name, idx, total)
        return ("ERROR", pdf_path.name, idx, total, err[:100])
    return ("OK", pdf_path.name, idx, total)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description="Batch silver export parallèle")
|
||||||
|
parser.add_argument("--workers", type=int, default=6,
|
||||||
|
help="Nombre de workers parallèles (défaut: 6)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
n_workers = args.workers
|
||||||
|
|
||||||
# Collecter tous les PDFs disponibles (excluant audit_30)
|
# Collecter tous les PDFs disponibles (excluant audit_30)
|
||||||
all_pdfs = []
|
all_pdfs = []
|
||||||
for ogc_dir in sorted(SRC.iterdir()):
|
for ogc_dir in sorted(SRC.iterdir()):
|
||||||
@@ -81,7 +164,6 @@ def main():
|
|||||||
if pdf.name not in ALREADY_DONE_AUDIT30:
|
if pdf.name not in ALREADY_DONE_AUDIT30:
|
||||||
all_pdfs.append(pdf)
|
all_pdfs.append(pdf)
|
||||||
|
|
||||||
# Trier par OGC pour reproductibilité
|
|
||||||
all_pdfs.sort(key=lambda p: (p.parent.name, p.name))
|
all_pdfs.sort(key=lambda p: (p.parent.name, p.name))
|
||||||
|
|
||||||
# Détecter les fichiers déjà traités (reprise)
|
# Détecter les fichiers déjà traités (reprise)
|
||||||
@@ -95,96 +177,73 @@ def main():
|
|||||||
print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
|
print(f"PDFs disponibles: {len(all_pdfs)} (excl. audit_30)")
|
||||||
print(f"Déjà traités: {len(already_done)}")
|
print(f"Déjà traités: {len(already_done)}")
|
||||||
print(f"Restant: {len(pdfs_to_do)}")
|
print(f"Restant: {len(pdfs_to_do)}")
|
||||||
|
print(f"Workers: {n_workers}")
|
||||||
|
print(f"RAM par worker: ~4 Go (NER models)")
|
||||||
|
print(f"RAM totale estimée: ~{n_workers * 4} Go\n")
|
||||||
|
|
||||||
if not pdfs_to_do:
|
if not pdfs_to_do:
|
||||||
print("Rien à faire.")
|
print("Rien à faire.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Chargement des modèles NER (CPU uniquement, pas de VLM)
|
# Préparer les arguments : (pdf_path, index, total)
|
||||||
print("\nChargement EDS-Pseudo...", flush=True)
|
tasks = [(pdf, i, len(pdfs_to_do)) for i, pdf in enumerate(pdfs_to_do, 1)]
|
||||||
ner = EdsPseudoManager()
|
|
||||||
ner.load()
|
|
||||||
assert ner.is_loaded(), "EDS-Pseudo non chargé"
|
|
||||||
print("EDS-Pseudo chargé.", flush=True)
|
|
||||||
|
|
||||||
print("Chargement GLiNER...", flush=True)
|
print(f"Chargement des modèles dans {n_workers} workers...", flush=True)
|
||||||
gliner = GlinerManager()
|
|
||||||
try:
|
|
||||||
gliner.load()
|
|
||||||
print("GLiNER chargé.", flush=True)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"GLiNER indisponible ({e}), on continue sans.", flush=True)
|
|
||||||
gliner = None
|
|
||||||
|
|
||||||
print("Chargement CamemBERT-bio ONNX...", flush=True)
|
# Créer le pool avec initialisation des modèles par worker
|
||||||
camembert = CamembertNerManager()
|
# On utilise mp.Pool avec initializer pour charger les modèles une seule fois
|
||||||
try:
|
# Note: fork + ONNX peut poser problème, on utilise 'spawn'
|
||||||
camembert.load()
|
ctx = mp.get_context("spawn")
|
||||||
print("CamemBERT-bio ONNX chargé.", flush=True)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"CamemBERT-bio indisponible ({e}), on continue sans.", flush=True)
|
|
||||||
camembert = None
|
|
||||||
|
|
||||||
print(f"\nPas de VLM (CPU only pour silver export).\n", flush=True)
|
|
||||||
|
|
||||||
ok = ko = skip_encrypted = skip_timeout = 0
|
ok = ko = skip_encrypted = skip_timeout = 0
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
total = len(pdfs_to_do)
|
|
||||||
|
|
||||||
for i, pdf in enumerate(pdfs_to_do, 1):
|
# Lancer les workers séquentiellement pour l'init (éviter pic mémoire)
|
||||||
ogc = pdf.parent.name.split("_")[0]
|
# puis traiter en parallèle
|
||||||
print(f"[{i}/{total}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
|
with ctx.Pool(
|
||||||
|
processes=n_workers,
|
||||||
|
initializer=init_worker,
|
||||||
|
initargs=(0,), # worker_id simplifié
|
||||||
|
) as pool:
|
||||||
|
for result in pool.imap_unordered(process_one_pdf, tasks, chunksize=1):
|
||||||
|
status = result[0]
|
||||||
|
name = result[1]
|
||||||
|
idx = result[2]
|
||||||
|
total = result[3]
|
||||||
|
|
||||||
# Timeout par fichier
|
|
||||||
signal.signal(signal.SIGALRM, timeout_handler)
|
|
||||||
signal.alarm(TIMEOUT_PER_FILE)
|
|
||||||
try:
|
|
||||||
core.process_pdf(
|
|
||||||
pdf_path=pdf,
|
|
||||||
out_dir=OUTDIR,
|
|
||||||
make_vector_redaction=False,
|
|
||||||
also_make_raster_burn=False,
|
|
||||||
config_path=CONFIG,
|
|
||||||
use_hf=True,
|
|
||||||
ner_manager=ner,
|
|
||||||
ner_thresholds=None,
|
|
||||||
ogc_label=ogc,
|
|
||||||
vlm_manager=None,
|
|
||||||
gliner_manager=gliner,
|
|
||||||
camembert_manager=camembert,
|
|
||||||
)
|
|
||||||
signal.alarm(0)
|
|
||||||
elapsed_file = time.time() - t0
|
|
||||||
rate = ok / elapsed_file * 3600 if elapsed_file > 0 and ok > 0 else 0
|
|
||||||
print(f"OK ({rate:.0f}/h)", flush=True)
|
|
||||||
ok += 1
|
|
||||||
except TimeoutError:
|
|
||||||
signal.alarm(0)
|
|
||||||
print(f"TIMEOUT ({TIMEOUT_PER_FILE}s)", flush=True)
|
|
||||||
skip_timeout += 1
|
|
||||||
except Exception as e:
|
|
||||||
signal.alarm(0)
|
|
||||||
err = str(e)
|
|
||||||
if "encrypted" in err.lower() or "password" in err.lower():
|
|
||||||
print("SKIP (chiffré)", flush=True)
|
|
||||||
skip_encrypted += 1
|
|
||||||
else:
|
|
||||||
print(f"ERREUR: {e}", flush=True)
|
|
||||||
ko += 1
|
|
||||||
|
|
||||||
# Rapport intermédiaire toutes les 50 fichiers
|
|
||||||
if i % 50 == 0:
|
|
||||||
elapsed = time.time() - t0
|
elapsed = time.time() - t0
|
||||||
remaining = (elapsed / i) * (total - i)
|
done = ok + ko + skip_encrypted + skip_timeout + 1
|
||||||
print(f"\n --- Progression: {i}/{total} | OK: {ok} | "
|
|
||||||
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
|
if status == "OK":
|
||||||
f"Temps restant estimé: {remaining/60:.0f}min ---\n", flush=True)
|
ok += 1
|
||||||
|
rate = ok / elapsed * 3600 if elapsed > 0 else 0
|
||||||
|
print(f"[{done}/{total}] {name} OK ({rate:.0f}/h)", flush=True)
|
||||||
|
elif status == "TIMEOUT":
|
||||||
|
skip_timeout += 1
|
||||||
|
print(f"[{done}/{total}] {name} TIMEOUT", flush=True)
|
||||||
|
elif status == "SKIP":
|
||||||
|
skip_encrypted += 1
|
||||||
|
print(f"[{done}/{total}] {name} SKIP (chiffré)", flush=True)
|
||||||
|
else:
|
||||||
|
ko += 1
|
||||||
|
err_msg = result[4] if len(result) > 4 else "?"
|
||||||
|
print(f"[{done}/{total}] {name} ERREUR: {err_msg}", flush=True)
|
||||||
|
|
||||||
|
# Rapport intermédiaire toutes les 50 fichiers
|
||||||
|
if done % 50 == 0:
|
||||||
|
remaining = (elapsed / done) * (total - done)
|
||||||
|
print(f"\n --- Progression: {done}/{total} | OK: {ok} | "
|
||||||
|
f"Erreurs: {ko} | Timeout: {skip_timeout} | "
|
||||||
|
f"Débit: {ok/elapsed*3600:.0f}/h | "
|
||||||
|
f"Restant: {remaining/60:.0f}min ---\n", flush=True)
|
||||||
|
|
||||||
elapsed = time.time() - t0
|
elapsed = time.time() - t0
|
||||||
|
total_pseudo = len(list(OUTDIR.glob("*.pseudonymise.txt")))
|
||||||
print(f"\n{'='*60}")
|
print(f"\n{'='*60}")
|
||||||
print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
|
print(f"Terminé en {elapsed:.0f}s ({elapsed/60:.1f}min)")
|
||||||
print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
|
print(f"OK: {ok}, Chiffrés: {skip_encrypted}, Timeout: {skip_timeout}, Erreurs: {ko}")
|
||||||
print(f"Total .pseudonymise.txt: {len(list(OUTDIR.glob('*.pseudonymise.txt')))}")
|
print(f"Total .pseudonymise.txt: {total_pseudo}")
|
||||||
|
print(f"Débit moyen: {ok/elapsed*3600:.0f} fichiers/h")
|
||||||
print(f"Sortie: {OUTDIR}")
|
print(f"Sortie: {OUTDIR}")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -190,6 +190,93 @@ def main():
|
|||||||
out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
|
out.write_text("\n".join(sorted(phones)) + "\n", encoding="utf-8")
|
||||||
print(f" → {out.name}: {len(phones)} entrées")
|
print(f" → {out.name}: {len(phones)} entrées")
|
||||||
|
|
||||||
|
# 6. Adresses FINESS (type_voie + nom_voie) pour Aho-Corasick
|
||||||
|
# Mapping des codes type_voie FINESS vers formes étendues
|
||||||
|
TYPE_VOIE_MAP = {
|
||||||
|
"AV": "avenue", "R": "rue", "BD": "boulevard", "RTE": "route",
|
||||||
|
"CHE": "chemin", "PL": "place", "IMP": "impasse", "ALL": "allee",
|
||||||
|
"SQ": "square", "PASS": "passage", "QU": "quai", "CRS": "cours",
|
||||||
|
"SEN": "sentier", "RPT": "rond-point", "LD": "lieu-dit",
|
||||||
|
"HAM": "hameau", "LOT": "lotissement", "TSSE": "traverse",
|
||||||
|
"CHEM": "chemin", "RES": "residence", "CTRE": "centre",
|
||||||
|
"ESP": "esplanade", "PRO": "promenade", "MTE": "montee",
|
||||||
|
"VOI": "voie", "CAR": "carrefour", "FBG": "faubourg",
|
||||||
|
}
|
||||||
|
# Charger les prénoms INSEE pour générer des variantes abrégées
|
||||||
|
prenoms_path = Path(__file__).parent.parent / "data" / "insee" / "prenoms_france.txt"
|
||||||
|
prenoms_set = set()
|
||||||
|
if prenoms_path.exists():
|
||||||
|
for line in prenoms_path.read_text(encoding="utf-8").splitlines():
|
||||||
|
p = line.strip().lower()
|
||||||
|
if p and len(p) >= 3:
|
||||||
|
prenoms_set.add(p)
|
||||||
|
print(f" Prénoms INSEE chargés: {len(prenoms_set)}")
|
||||||
|
|
||||||
|
VOIE_GENERIC = {
|
||||||
|
"de", "du", "des", "la", "le", "les", "l", "et", "en", "au", "aux",
|
||||||
|
"a", "sur", "sous", "par", "pour", "dans", "rue", "avenue", "boulevard",
|
||||||
|
"route", "chemin", "place", "impasse", "square", "passage", "quai", "cours",
|
||||||
|
"grande", "grand", "petit", "petite", "vieux", "vieille", "nouveau", "nouvelle",
|
||||||
|
"haut", "haute", "bas", "basse",
|
||||||
|
}
|
||||||
|
|
||||||
|
addr_patterns = set()
|
||||||
|
|
||||||
|
def _add_with_abbrev(pattern: str):
    """Register *pattern* and its variants with a first name cut to its initial.

    The pattern itself is always added to ``addr_patterns``. For each word
    that is a known first name (member of ``prenoms_set``, at least 3
    characters), one variant is built in which that word is replaced by its
    first letter. A variant is kept only when:

    * at least one later word is distinctive (longer than 2 characters and
      not in ``VOIE_GENERIC``),
    * the variant is at least 12 characters long, and
    * its first word is at least 2 characters long (a pattern must not
      start with a bare initial).
    """
    addr_patterns.add(pattern)
    tokens = pattern.split()
    for pos, token in enumerate(tokens):
        if len(token) < 3 or token not in prenoms_set:
            continue
        tail = tokens[pos + 1:]
        # Abbreviate only if something distinctive follows the first name.
        distinctive_follows = any(
            len(t) > 2 and t not in VOIE_GENERIC for t in tail
        )
        if not distinctive_follows:
            continue
        variant_tokens = [*tokens[:pos], token[0], *tail]
        variant = " ".join(variant_tokens)
        if len(variant) < 12 or len(variant_tokens[0]) < 2:
            continue
        addr_patterns.add(variant)
|
||||||
|
|
||||||
|
with open(csv_path, encoding="utf-8") as f:
|
||||||
|
reader = csv.reader(f, delimiter=";")
|
||||||
|
next(reader)
|
||||||
|
for row in reader:
|
||||||
|
if len(row) < 10:
|
||||||
|
continue
|
||||||
|
type_voie_raw = row[8].strip() if len(row) > 8 else ""
|
||||||
|
nom_voie = row[9].strip() if len(row) > 9 else ""
|
||||||
|
if not nom_voie or len(nom_voie) < 3:
|
||||||
|
continue
|
||||||
|
nom_norm = normalize(nom_voie)
|
||||||
|
words = nom_norm.split()
|
||||||
|
|
||||||
|
# Pattern complet : type_voie + nom_voie (ex: "avenue de l interne jacques loeb")
|
||||||
|
type_voie_expanded = TYPE_VOIE_MAP.get(type_voie_raw.upper(), type_voie_raw.lower())
|
||||||
|
if type_voie_expanded and nom_norm:
|
||||||
|
full = f"{type_voie_expanded} {nom_norm}"
|
||||||
|
full_words = full.split()
|
||||||
|
has_distinctive = any(
|
||||||
|
w not in VOIE_GENERIC and len(w) >= 4 for w in full_words
|
||||||
|
)
|
||||||
|
if has_distinctive and len(full) >= 12:
|
||||||
|
_add_with_abbrev(full)
|
||||||
|
|
||||||
|
# Pattern nom_voie seul (seulement si très distinctif)
|
||||||
|
has_distinctive = any(w not in VOIE_GENERIC and len(w) >= 4 for w in words)
|
||||||
|
if has_distinctive and len(nom_norm) >= 15:
|
||||||
|
_add_with_abbrev(nom_norm)
|
||||||
|
|
||||||
|
out = OUT_DIR / "adresses_finess.txt"
|
||||||
|
out.write_text("\n".join(sorted(addr_patterns)) + "\n", encoding="utf-8")
|
||||||
|
print(f"\n → {out.name}: {len(addr_patterns)} entrées")
|
||||||
|
|
||||||
|
# Garder aussi voies_distinctives.txt pour compatibilité
|
||||||
|
voie_names = {p for p in addr_patterns if len(p) >= 15}
|
||||||
|
out = OUT_DIR / "voies_distinctives.txt"
|
||||||
|
out.write_text("\n".join(sorted(voie_names)) + "\n", encoding="utf-8")
|
||||||
|
print(f" → {out.name}: {len(voie_names)} entrées")
|
||||||
|
|
||||||
# Stats par longueur
|
# Stats par longueur
|
||||||
print(f"\nDistribution noms distinctifs par longueur (mots):")
|
print(f"\nDistribution noms distinctifs par longueur (mots):")
|
||||||
word_counts = Counter(len(n.split()) for n in filtered_distinctive)
|
word_counts = Counter(len(n.split()) for n in filtered_distinctive)
|
||||||
|
|||||||
@@ -300,7 +300,7 @@ def check_fp_density(text: str) -> dict:
|
|||||||
"density_pct": round(density, 2),
|
"density_pct": round(density, 2),
|
||||||
"nom_count": nom_count,
|
"nom_count": nom_count,
|
||||||
"nom_pct": round(nom_pct, 2),
|
"nom_pct": round(nom_pct, 2),
|
||||||
"alert": nom_pct > 5.0,
|
"alert": nom_pct > 8.0, # seuil relevé : CRO/CRH courts listent 8-10 soignants = légitime
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
27
server.py
27
server.py
@@ -210,17 +210,34 @@ async def anonymize_text(
|
|||||||
final_text = selective_rescan(final_text, cfg=cfg)
|
final_text = selective_rescan(final_text, cfg=cfg)
|
||||||
|
|
||||||
elapsed = time.time() - t0
|
elapsed = time.time() - t0
|
||||||
audit_list = [
|
|
||||||
{"kind": h.kind, "original": h.original, "placeholder": h.placeholder, "page": h.page}
|
# Inclure tous les hits (regex page≥0 + NER page=-1) avec source
|
||||||
for h in anon.audit
|
ner_prefixes = ("NER_", "EDS_")
|
||||||
if h.page != -1 # exclure les propagations globales
|
audit_list = []
|
||||||
]
|
ner_count = 0
|
||||||
|
regex_count = 0
|
||||||
|
for h in anon.audit:
|
||||||
|
is_ner = h.kind.startswith(ner_prefixes) or h.page == -1
|
||||||
|
entry = {
|
||||||
|
"kind": h.kind,
|
||||||
|
"original": h.original,
|
||||||
|
"placeholder": h.placeholder,
|
||||||
|
"page": h.page,
|
||||||
|
"source": "ner" if is_ner else "regex",
|
||||||
|
}
|
||||||
|
audit_list.append(entry)
|
||||||
|
if is_ner:
|
||||||
|
ner_count += 1
|
||||||
|
else:
|
||||||
|
regex_count += 1
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"text_anonymized": final_text,
|
"text_anonymized": final_text,
|
||||||
"audit": audit_list,
|
"audit": audit_list,
|
||||||
"stats": {
|
"stats": {
|
||||||
"pii_detected": len(audit_list),
|
"pii_detected": len(audit_list),
|
||||||
|
"regex_count": regex_count,
|
||||||
|
"ner_count": ner_count,
|
||||||
"elapsed_seconds": round(elapsed, 3),
|
"elapsed_seconds": round(elapsed, 3),
|
||||||
"ner_active": use_ner and _eds_manager is not None,
|
"ner_active": use_ner and _eds_manager is not None,
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user