fix(phase2): Élimination FP cross-line + word boundaries — 0 fuite, 0 FP médical

- Remplace \s+ par [ \t]+ dans 11 regex d'extraction de noms (empêche capture cross-line de médicaments)
- Ajoute \b word boundaries dans RE_PERSON_CONTEXT (empêche "PDR" de matcher "DR")
- Ajoute filtrage _MEDICAL_STOP_WORDS_SET dans selective_rescan._rescan_person
- Ajoute stop words : labos pharma (MYL/VTS/ARW/PAN/MSO), dosages (FAIBLE/FORT), anatomie imagerie (CEREBRAL/ABDOMINO-PELVIEN)
- Filtre stop words dans _add_name_force et _add_tokens_force_first
- Mise à jour baseline regression_tests/ avec 29 fichiers du batch audit 30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 11:24:22 +01:00
parent 58cb209e26
commit 5972a09f9f
3 changed files with 189 additions and 78 deletions

View File

@@ -191,7 +191,8 @@ RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)") RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)") RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b") RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE) RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
# Consultation / intervention number: the label "N° Csult" or "N° Interv"
# (case-insensitive, optional whitespace after "N°"), an optional ":" or "-"
# separator, then at least six digits. Group 1 captures the digits only.
RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE) RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE) RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE) RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
@@ -294,8 +295,12 @@ _MEDICAL_STOP_WORDS_SET = {
"meropenem", "imipenem", "clindamycine", "doxycycline", "meropenem", "imipenem", "clindamycine", "doxycycline",
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim", "azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
"polyionique", "propranolol", "apidra", "solostar", "polyionique", "propranolol", "apidra", "solostar",
# Suffixes laboratoires pharmaceutiques # Noms et suffixes laboratoires pharmaceutiques
"arw", "myl", "myp", "arg", "teva", "bga", "agt", "arw", "myl", "myp", "arg", "teva", "bga", "agt",
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
"evolugen", "alter", "zydus", "medisol", "substipharm",
"sdz", "bgr", "egt", "rnb",
# Formes galéniques / voies d'administration # Formes galéniques / voies d'administration
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen", "cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime", "flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
@@ -374,6 +379,9 @@ _MEDICAL_STOP_WORDS_SET = {
"montelukast", "rosuvastatine", "montelukast", "rosuvastatine",
# Abréviations pharma courtes # Abréviations pharma courtes
"cpr", "sol", "bic", "agt", "poche", "inhal", "cpr", "sol", "bic", "agt", "poche", "inhal",
# Termes chirurgicaux/cliniques FP
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
"gauche", "droit", "droite", "face", "profil",
# Faux positifs EDS supplémentaires # Faux positifs EDS supplémentaires
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta", "psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med", "axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
@@ -444,6 +452,20 @@ _MEDICAL_STOP_WORDS_SET = {
"thermie", "alim", "alimentation", "admin", "thermie", "alim", "alimentation", "admin",
# Médicaments/tests labo capturés par patterns soignants # Médicaments/tests labo capturés par patterns soignants
"biprofenid", "bi-profenid", "phosphatase", "phosphatases", "biprofenid", "bi-profenid", "phosphatase", "phosphatases",
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
"ciprofloxacine", "lavement", "desinfection", "désinfection",
"avaler", "rachis", "lombaire", "thoraco-lombaire",
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
"faible", "fort", "forte",
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
"entree", "entrée", "continu", "continue",
"morphine", "claforan", "skenan", "actiskenan",
# Fragments de noms de médicaments (pdfplumber split)
"sium", "pegic", "fenid", "profenid",
# Termes structurels trackare # Termes structurels trackare
"transmissions", "transmission", "releve", "relevé", "transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation", "objectif", "objectifs", "evaluation", "évaluation",
@@ -507,11 +529,11 @@ _MEDICAL_STOP_WORDS = (
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point) # Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" _PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_PERSON_CONTEXT = re.compile( RE_PERSON_CONTEXT = re.compile(
r"(?:(?:Dr\.?|DR\.?|Docteur|Pr\.?|Professeur|Mme|MME|Madame|M\.|Mr\.?|Monsieur" r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
r"|Nom\s*:\s*" r"|\bNom[ \t]*:[ \t]*"
r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par|Réalisé\s+par" r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
r")\s+)" r")[ \t]+)"
rf"({_PERSON_TOKEN}(?:\s+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots, pas de newline
) )
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO") # Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
@@ -561,8 +583,8 @@ RE_EXTRACT_CONTACT = re.compile(
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?", r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
) )
RE_EXTRACT_REDIGE = re.compile( RE_EXTRACT_REDIGE = re.compile(
r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+" r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)", rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
) )
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc. # Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*" _UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
@@ -573,22 +595,34 @@ RE_EXTRACT_MME_MR = re.compile(
) )
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?" _INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
RE_EXTRACT_DR_DEST = re.compile( RE_EXTRACT_DR_DEST = re.compile(
r"(?:DR\.?|Dr\.?|Docteur)\s+" r"(?:DR\.?|Dr\.?|Docteur)[ \t]+"
+ _INITIAL_OPT + + _INITIAL_OPT +
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)", rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
) )
# Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY" # Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
RE_EXTRACT_STAFF_ROLE = re.compile( RE_EXTRACT_STAFF_ROLE = re.compile(
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre\s+Infirmier" r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre[ \t]+Infirmier"
r"|Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*" r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)[ \t]*:?[ \t]*"
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:\s*-\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?\s+)?" r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[\s\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})", r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
) )
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL" # "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
RE_EXTRACT_PR = re.compile( RE_EXTRACT_PR = re.compile(
r"(?:Pr\.?|Professeur)\s+" r"(?:Pr\.?|Professeur)[ \t]+"
+ _INITIAL_OPT + + _INITIAL_OPT +
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)", rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
# Name of surgical staff introduced by a role label (Opérateur / Anesthésiste /
# Anesthésiste(s) / Chirurgien), an optional ":" and an optional title
# (Docteur / Dr / Pr), then optional initials (_INITIAL_OPT).
# Group 1 captures one to three _UC_COMPOUND tokens (upper-case, possibly
# hyphenated name parts).
# The separators written in this pattern are [ \t] only, so they do not span a
# newline.
# NOTE(review): _INITIAL_OPT, as defined in this file, still uses \s*, so the
# whitespace following an initial can include a newline — confirm whether that
# cross-line match is acceptable here.
RE_EXTRACT_OPERATEUR = re.compile(
    r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*"
    r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?"
    + _INITIAL_OPT +
    rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
)
# Téléphone avec extension slash : 05.59.44.38.32/34
RE_TEL_SLASH = re.compile(
r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?:/\d{1,4})(?!\d)"
) )
CID_PATTERN = re.compile(r"\(cid:\d+\)") CID_PATTERN = re.compile(r"\(cid:\d+\)")
@@ -596,7 +630,7 @@ CID_PATTERN = re.compile(r"\(cid:\d+\)")
# --- Nouvelles regex : dates, adresses, âges, dossiers --- # --- Nouvelles regex : dates, adresses, âges, dossiers ---
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)" _MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
RE_DATE_NAISSANCE = re.compile( RE_DATE_NAISSANCE = re.compile(
r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*" r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})", r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
re.IGNORECASE, re.IGNORECASE,
) )
@@ -947,6 +981,10 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
if m: if m:
val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"])) val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line) return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
m = RE_CSULT.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
m = RE_RPPS.search(line) m = RE_RPPS.search(line)
if m: if m:
val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"])) val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
@@ -975,6 +1013,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
def _repl_tel(m: re.Match) -> str: def _repl_tel(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"])) audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"] return PLACEHOLDERS["TEL"]
line = RE_TEL_SLASH.sub(_repl_tel, line) # slash d'abord (plus spécifique)
line = RE_TEL.sub(_repl_tel, line) line = RE_TEL.sub(_repl_tel, line)
line = RE_TEL_COMPACT.sub(_repl_tel, line) line = RE_TEL_COMPACT.sub(_repl_tel, line)
@@ -1140,6 +1179,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
def _repl_tel(m: re.Match) -> str: def _repl_tel(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"])) audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"] return PLACEHOLDERS["TEL"]
key = RE_TEL_SLASH.sub(_repl_tel, key)
key = RE_TEL.sub(_repl_tel, key) key = RE_TEL.sub(_repl_tel, key)
key = RE_TEL_COMPACT.sub(_repl_tel, key) key = RE_TEL_COMPACT.sub(_repl_tel, key)
def _repl_email(m: re.Match) -> str: def _repl_email(m: re.Match) -> str:
@@ -1200,6 +1240,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
return return
if tok.lower() in _FORCE_EXCLUDE: if tok.lower() in _FORCE_EXCLUDE:
return return
# Filtre supplémentaire : ne pas force-add les mots médicaux connus
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
return
names.add(tok) names.add(tok)
force_names.add(tok) force_names.add(tok)
@@ -1324,10 +1367,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") --- # --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") ---
for m in re.finditer( for m in re.finditer(
r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s+" r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
r"(?:DR\.?\s+)?" r"(?:DR\.?[ \t]+)?"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text full_text
): ):
for g in (m.group(1), m.group(2)): for g in (m.group(1), m.group(2)):
@@ -1337,9 +1380,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
_add_name_force(tok) _add_name_force(tok)
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") --- # --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") ---
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
for m in re.finditer( for m in re.finditer(
r"Signé\s+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text full_text
): ):
for g in (m.group(1), m.group(2)): for g in (m.group(1), m.group(2)):
@@ -1350,9 +1394,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") --- # --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
for m in re.finditer( for m in re.finditer(
r"Signé\s+—\s+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)\s+[-]?\s*" r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?", r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
full_text full_text
): ):
for g in (m.group(1), m.group(2)): for g in (m.group(1), m.group(2)):
@@ -1363,7 +1407,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") --- # --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") ---
for m in re.finditer( for m in re.finditer(
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?\s+" r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})", r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})",
full_text full_text
): ):
@@ -1373,8 +1417,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions --- # --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions ---
for m in re.finditer( for m in re.finditer(
r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})" r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text full_text
): ):
for g in (m.group(1), m.group(2)): for g in (m.group(1), m.group(2)):
@@ -1387,9 +1431,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM" # Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM"
# Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant) # Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant)
for m in re.finditer( for m in re.finditer(
r"\d{1,2}\s*:\s*\d{2}\s+" r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?", r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
full_text full_text
): ):
for g in (m.group(1), m.group(2)): for g in (m.group(1), m.group(2)):
@@ -1415,13 +1459,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
return filtered, hits, force_names return filtered, hits, force_names
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]:
"""Pré-scan du document brut pour extraire les noms de personnes """Pré-scan du document brut pour extraire les noms de personnes
depuis les champs structurés (Patient, Rédigé par, etc.). depuis les champs structurés (Patient, Rédigé par, etc.).
Retourne un ensemble de tokens (mots) à masquer globalement.""" Retourne (names, force_names) : ensemble de tokens à masquer,
et sous-ensemble qui bypass les stop words."""
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or []) wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or []) wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
names: set = set() names: set = set()
force_names: set = set()
def _add_tokens(match_str: str): def _add_tokens(match_str: str):
for token in match_str.split(): for token in match_str.split():
@@ -1434,6 +1480,17 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
continue continue
names.add(token) names.add(token)
def _add_tokens_force_all(match_str: str):
    """Record every usable token of *match_str* as a name and as a forced name.

    Each whitespace-separated token is stripped of surrounding spaces, dots,
    hyphens and apostrophes. Tokens shorter than two characters are dropped,
    as are tokens whose upper-case form is in ``wl_sections`` or that appear
    verbatim in ``wl_phrases``. Every remaining token is added to both
    ``names`` and ``force_names``; no medical stop-word check is done here.
    """
    for raw in match_str.split():
        candidate = raw.strip(" .-'")
        whitelisted = candidate.upper() in wl_sections or candidate in wl_phrases
        if len(candidate) >= 2 and not whitelisted:
            names.add(candidate)
            force_names.add(candidate)
def _add_tokens_force_first(match_str): def _add_tokens_force_first(match_str):
"""Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort).""" """Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
tokens = match_str.split() tokens = match_str.split()
@@ -1441,21 +1498,20 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
token = token.strip(" .-'") token = token.strip(" .-'")
if len(token) < 2: if len(token) < 2:
continue continue
if token.upper() in wl_sections or token in wl_phrases:
continue
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if i == 0: if i == 0:
# Premier token après Dr/Mme : toujours un nom, bypass stop words # Premier token après Dr/Mme : contexte fiable
if token.upper() not in wl_sections: names.add(token)
names.add(token)
else: else:
if len(token) < 3: if len(token) < 3:
continue continue
if token.upper() in wl_sections or token in wl_phrases:
continue
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
names.add(token) names.add(token)
for m in RE_EXTRACT_PATIENT.finditer(full_text): for m in RE_EXTRACT_PATIENT.finditer(full_text):
_add_tokens(m.group(1)) _add_tokens_force_all(m.group(1))
for m in RE_EXTRACT_REDIGE.finditer(full_text): for m in RE_EXTRACT_REDIGE.finditer(full_text):
_add_tokens(m.group(1)) _add_tokens(m.group(1))
for m in RE_EXTRACT_MME_MR.finditer(full_text): for m in RE_EXTRACT_MME_MR.finditer(full_text):
@@ -1482,6 +1538,9 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
# Pr / Professeur + nom(s) # Pr / Professeur + nom(s)
for m in RE_EXTRACT_PR.finditer(full_text): for m in RE_EXTRACT_PR.finditer(full_text):
_add_tokens_force_first(m.group(1)) _add_tokens_force_first(m.group(1))
# Opérateur / Anesthésiste / Chirurgien + nom(s)
for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
_add_tokens_force_first(m.group(1))
# Extraction des noms dans les listes virgulées après Dr/Docteur # Extraction des noms dans les listes virgulées après Dr/Docteur
# ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé" # ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
@@ -1509,7 +1568,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET: if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET:
names.add(part) names.add(part)
return names return names, force_names
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str: def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str:
@@ -1517,6 +1576,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
placeholder = PLACEHOLDERS["NOM"] placeholder = PLACEHOLDERS["NOM"]
_force = force_names or set() _force = force_names or set()
safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)} safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
for token in sorted(safe_names, key=len, reverse=True):
audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
for token in sorted(safe_names, key=len, reverse=True): for token in sorted(safe_names, key=len, reverse=True):
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE) pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
new_text = [] new_text = []
@@ -1577,7 +1640,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
full_raw = "\n".join(pages_text) + "\n" + "\n".join( full_raw = "\n".join(pages_text) + "\n" + "\n".join(
"\n".join(rows) for rows in tables_lines "\n".join(rows) for rows in tables_lines
) )
extracted_names = _extract_document_names(full_raw, cfg) extracted_names, doc_force_names = _extract_document_names(full_raw, cfg)
# Phase 0b : si document Trackare, extraction renforcée des PII structurés # Phase 0b : si document Trackare, extraction renforcée des PII structurés
is_trackare = _is_trackare_document(full_raw) is_trackare = _is_trackare_document(full_raw)
@@ -1586,6 +1649,8 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw) trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
extracted_names.update(trackare_names) extracted_names.update(trackare_names)
audit.extend(trackare_hits) audit.extend(trackare_hits)
# Fusionner les force_names des deux sources
all_force_names = doc_force_names | trackare_force_names
# Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées, # Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées,
# avec possiblement 0-2 lignes intermédiaires masquées ou vides) # avec possiblement 0-2 lignes intermédiaires masquées ou vides)
@@ -1595,6 +1660,32 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_FINESS_MULTILINE.finditer(full_raw): for m in _RE_FINESS_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"])) audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
# Phase 0d : date de naissance multiline (label et date sur lignes séparées)
# Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950"
_RE_DATE_NAISSANCE_MULTILINE = re.compile(
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*"
r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
re.IGNORECASE,
)
for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))
# Phase 0e : IPP multiline (N°Ipp :\n20023294 ou I.P.P. :\nS1032021)
_RE_IPP_MULTILINE = re.compile(
r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
re.IGNORECASE,
)
for m in _RE_IPP_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
_RE_DEMANDE_MULTILINE = re.compile(
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
re.IGNORECASE,
)
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 1 : masquage ligne par ligne (regex classiques) # Phase 1 : masquage ligne par ligne (regex classiques)
out_pages: List[str] = [] out_pages: List[str] = []
for i, page_txt in enumerate(pages_text): for i, page_txt in enumerate(pages_text):
@@ -1620,7 +1711,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
# Phase 2 : application globale des noms extraits (rattrapage) # Phase 2 : application globale des noms extraits (rattrapage)
if extracted_names: if extracted_names:
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=trackare_force_names) text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS) # Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
text_out = _apply_trackare_hits_to_text(text_out, audit) text_out = _apply_trackare_hits_to_text(text_out, audit)
@@ -1806,6 +1897,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
protected, kept = strip_tables(text) protected, kept = strip_tables(text)
# PII critiques (comme avant) # PII critiques (comme avant)
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected) protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected) protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
@@ -1846,6 +1938,10 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
tokens = [t for t in span.split() if t] tokens = [t for t in span.split() if t]
if len(tokens) == 1 and len(tokens[0]) <= 3: if len(tokens) == 1 and len(tokens[0]) <= 3:
return raw return raw
# Filtrer les termes médicaux (stop words)
clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
if not clean:
return raw
return raw.replace(span, PLACEHOLDERS["NOM"]) return raw.replace(span, PLACEHOLDERS["NOM"])
protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected) protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
res = list(protected) res = list(protected)
@@ -1971,7 +2067,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
compact = re.sub(r"\s+", "", token) compact = re.sub(r"\s+", "", token)
if compact != token: if compact != token:
rects = page.search_for(compact) rects = page.search_for(compact)
if not rects and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}: if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
for word in token.split(): for word in token.split():
word = word.strip(" .-'") word = word.strip(" .-'")
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET: if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
@@ -2074,7 +2170,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}: if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
compact = re.sub(r"\s+", "", token) compact = re.sub(r"\s+", "", token)
found = page.search_for(compact) found = page.search_for(compact)
if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM", if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}: "VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
for word in token.split(): for word in token.split():
word = word.strip(" .-'") word = word.strip(" .-'")
@@ -2359,13 +2455,13 @@ def process_pdf(
# 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques # 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques
# Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages # Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages
# pour éviter les fuites sur les documents multi-pages (ex: CRO) # pour éviter les fuites sur les documents multi-pages (ex: CRO)
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS"} _CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER"}
_global_pii: Dict[str, set] = {} _global_pii: Dict[str, set] = {}
for h in anon.audit: for h in anon.audit:
# Collecter TOUS les types pour analyse, mais ne propager que les critiques # Collecter TOUS les types pour analyse, mais ne propager que les critiques
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB", if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER",
"force_term", "force_regex", "FINESS"}: "force_term", "force_regex", "FINESS"}:
# Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations # Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations
if h.kind == "DATE_NAISSANCE": if h.kind == "DATE_NAISSANCE":

View File

@@ -18,11 +18,15 @@ blacklist:
force_mask_terms: force_mask_terms:
- CENTRE HOSPITALIER COTE BASQUE - CENTRE HOSPITALIER COTE BASQUE
- CENTRE HOSPITALIER DE LA COTE BASQUE - CENTRE HOSPITALIER DE LA COTE BASQUE
- POLYCLINIQUE COTE BASQUE SUD
- POLYCLINIQUE CÔTE BASQUE SUD
- CHCB - CHCB
- '640780417'
- 'Dates du séjour :' - 'Dates du séjour :'
- CONCERTATION - CONCERTATION
force_mask_regex: force_mask_regex:
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque' - 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
- 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
kv_labels_preserve: kv_labels_preserve:
- FINESS - FINESS
- IPP - IPP

View File

@@ -10,42 +10,43 @@ sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager from eds_pseudo_manager import EdsPseudoManager
from vlm_manager import VlmManager
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_audit_30" OUTDIR = SRC / "anonymise_audit_30"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
PDFS = [ PDFS = [
SRC / "110_23061319/trackare-07026002-23061319_07026002_23061319.pdf", SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
SRC / "115_23066188/CRH 23066188.pdf", SRC / "124_23074376/trackare-05000272-23074376_05000272_23074376.pdf",
SRC / "161_23098838/CRO 23098838.pdf", SRC / "133_23056022/CONSULTATION ANESTHESISTE 23056022.pdf",
SRC / "179_23126805/trackare-23005591-23126805_23005591_23126805.pdf", SRC / "141_23090597/trackare-BA042686-23090597_BA042686_23090597.pdf",
SRC / "181_23127286/CRH 23127286.pdf", SRC / "148_23018396/trackare-23000862-23018396_23000862_23018396.pdf",
SRC / "192_23132490/CRH 23132490.pdf", SRC / "183_23087212/LETTRE DE SORTIE 23087212.pdf",
SRC / "208_23151988/trackare-23020064-23151988_23020064_23151988.pdf", SRC / "216_23159905/CRO 23159905.pdf",
SRC / "215_23158603/trackare-22028007-23158603_22028007_23158603.pdf", SRC / "216_23159905/trackare-99246761-23159905_99246761_23159905.pdf",
SRC / "227_23173599/CRH 23173599.pdf", SRC / "222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
SRC / "236_23116794/trackare-BA054633-23116794_BA054633_23116794.pdf", SRC / "225_23160703/CRO 23160703.pdf",
SRC / "248_23194278/CRH 23194278.pdf", SRC / "26_23127395/trackare-BA192486-23127395_BA192486_23127395.pdf",
SRC / "263_23203642/CRO 23203642.pdf", SRC / "269_23232115/BACTERIO 23232115.pdf",
SRC / "28_23135549/trackare-15021750-23135549_15021750_23135549.pdf", SRC / "290_23025988/CR consultation anesth-290-23025988.pdf",
SRC / "321_23043929/CRH 321_23066387.pdf", SRC / "315_23060770/trackare-05012965-23060770_05012965_23060770.pdf",
SRC / "379_23098754/trackare-18009635-23098754_18009635_23098754.pdf", SRC / "385_23102874/trackare-BA065989-23102874_BA065989_23102874.pdf",
SRC / "39_23167029/trackare-23022121-23167029_23022121_23167029.pdf", SRC / "433_23135726/trackare-BA127127-23135726_BA127127_23135726.pdf",
SRC / "444_23141032/trackare-BA102259-23141032_BA102259_23141032.pdf",
SRC / "478_23161697/cro 478_23161697.pdf",
SRC / "50_23219173/trackare-07019278-23219173_07019278_23219173.pdf",
SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf", SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
SRC / "556_23220878/trackare-21041742-23220878_21041742_23220878.pdf", SRC / "552_23214501/trackare-BA171849-23214501_BA171849_23214501.pdf",
SRC / "602_23070052/trackare-20028293-23070052_20028293_23070052.pdf", SRC / "590_23043950/trackare-17015185-23043950_17015185_23043950.pdf",
SRC / "604_23070704/trackare-23008170-23070704_23008170_23070704.pdf", SRC / "60_23106634/CRH 60_23106634.pdf",
SRC / "655_23163458/trackare-01296746-23163458_01296746_23163458.pdf", SRC / "603_23070213/trackare-00260974-23070213_00260974_23070213.pdf",
SRC / "684_23207941/CRH 684_23207941.pdf", SRC / "609_23076655/trackare-BA067657-23076655_BA067657_23076655.pdf",
SRC / "79_23187785/79_23187785 Dossier.pdf", SRC / "625_23098722/trackare-05012679-23098722_05012679_23098722.pdf",
SRC / "12_23084754/CRO 23084754.pdf" if (SRC / "12_23084754/CRO 23084754.pdf").exists() else SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf", SRC / "632_23124019/trackare-11004431-23124019_11004431_23124019.pdf",
SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf", SRC / "639_23135847/trackare-07003136-23135847_07003136_23135847.pdf",
SRC / "131_23079402/CRH 23079402.pdf", SRC / "656_23165708/trackare-13013848-23165708_13013848_23165708.pdf",
SRC / "290_23025988/cr anesth 290_23025988.pdf", SRC / "664_23175616/trackare-03020576-23175616_03020576_23175616.pdf",
SRC / "8_23074520/trackare-BA093659-23074520_BA093659_23074520.pdf",
SRC / "88_23034958/trackare-14025311-23034958_14025311_23034958.pdf",
SRC / "89_23016863/trackare-BA121804-23016863_BA121804_23016863.pdf",
] ]
@@ -54,7 +55,16 @@ def main():
ner = EdsPseudoManager() ner = EdsPseudoManager()
ner.load() ner.load()
assert ner.is_loaded(), "EDS-Pseudo non chargé" assert ner.is_loaded(), "EDS-Pseudo non chargé"
print("EDS-Pseudo chargé.\n", flush=True) print("EDS-Pseudo chargé.", flush=True)
print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True)
vlm = VlmManager()
try:
vlm.load()
print(f"VLM chargé.\n", flush=True)
except Exception as e:
print(f"VLM indisponible ({e}), on continue sans.\n", flush=True)
vlm = None
# Vérifier existence des fichiers # Vérifier existence des fichiers
existing = [p for p in PDFS if p.exists()] existing = [p for p in PDFS if p.exists()]
@@ -86,6 +96,7 @@ def main():
ner_manager=ner, ner_manager=ner,
ner_thresholds=None, ner_thresholds=None,
ogc_label=ogc, ogc_label=ogc,
vlm_manager=vlm,
) )
audit_path = Path(outputs.get("audit", "")) audit_path = Path(outputs.get("audit", ""))
if audit_path.exists(): if audit_path.exists():