fix(phase2): Élimination FP cross-line + word boundaries — 0 fuite, 0 FP médical
- Remplace \s+ par [ \t]+ dans 11 regex d'extraction de noms (empêche la capture cross-line de médicaments)
- Ajoute des word boundaries \b dans RE_PERSON_CONTEXT (empêche "PDR" de matcher "DR")
- Ajoute le filtrage _MEDICAL_STOP_WORDS_SET dans selective_rescan._rescan_person
- Ajoute des stop words : labos pharma (MYL/VTS/ARW/PAN/MSO), dosages (FAIBLE/FORT), anatomie imagerie (CEREBRAL/ABDOMINO-PELVIEN)
- Filtre les stop words dans _add_name_force et _add_tokens_force_first
- Met à jour la baseline regression_tests/ avec 29 fichiers du batch audit 30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -191,7 +191,8 @@ RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
|||||||
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
|
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
|
||||||
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
|
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
|
||||||
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
|
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
|
||||||
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
|
RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
|
||||||
|
RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", re.IGNORECASE)
|
||||||
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
|
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
|
||||||
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
|
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
|
||||||
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
|
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
|
||||||
@@ -294,8 +295,12 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"meropenem", "imipenem", "clindamycine", "doxycycline",
|
"meropenem", "imipenem", "clindamycine", "doxycycline",
|
||||||
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
|
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
|
||||||
"polyionique", "propranolol", "apidra", "solostar",
|
"polyionique", "propranolol", "apidra", "solostar",
|
||||||
# Suffixes laboratoires pharmaceutiques
|
# Noms et suffixes laboratoires pharmaceutiques
|
||||||
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
|
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
|
||||||
|
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
|
||||||
|
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
|
||||||
|
"evolugen", "alter", "zydus", "medisol", "substipharm",
|
||||||
|
"sdz", "bgr", "egt", "rnb",
|
||||||
# Formes galéniques / voies d'administration
|
# Formes galéniques / voies d'administration
|
||||||
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
|
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
|
||||||
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
|
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
|
||||||
@@ -374,6 +379,9 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"montelukast", "rosuvastatine",
|
"montelukast", "rosuvastatine",
|
||||||
# Abréviations pharma courtes
|
# Abréviations pharma courtes
|
||||||
"cpr", "sol", "bic", "agt", "poche", "inhal",
|
"cpr", "sol", "bic", "agt", "poche", "inhal",
|
||||||
|
# Termes chirurgicaux/cliniques FP
|
||||||
|
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
|
||||||
|
"gauche", "droit", "droite", "face", "profil",
|
||||||
# Faux positifs EDS supplémentaires
|
# Faux positifs EDS supplémentaires
|
||||||
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
|
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
|
||||||
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
|
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
|
||||||
@@ -444,6 +452,20 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"thermie", "alim", "alimentation", "admin",
|
"thermie", "alim", "alimentation", "admin",
|
||||||
# Médicaments/tests labo capturés par patterns soignants
|
# Médicaments/tests labo capturés par patterns soignants
|
||||||
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
|
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
|
||||||
|
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
|
||||||
|
"ciprofloxacine", "lavement", "desinfection", "désinfection",
|
||||||
|
"avaler", "rachis", "lombaire", "thoraco-lombaire",
|
||||||
|
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
|
||||||
|
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
|
||||||
|
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
|
||||||
|
"faible", "fort", "forte",
|
||||||
|
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
|
||||||
|
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
|
||||||
|
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
|
||||||
|
"entree", "entrée", "continu", "continue",
|
||||||
|
"morphine", "claforan", "skenan", "actiskenan",
|
||||||
|
# Fragments de noms de médicaments (pdfplumber split)
|
||||||
|
"sium", "pegic", "fenid", "profenid",
|
||||||
# Termes structurels trackare
|
# Termes structurels trackare
|
||||||
"transmissions", "transmission", "releve", "relevé",
|
"transmissions", "transmission", "releve", "relevé",
|
||||||
"objectif", "objectifs", "evaluation", "évaluation",
|
"objectif", "objectifs", "evaluation", "évaluation",
|
||||||
@@ -507,11 +529,11 @@ _MEDICAL_STOP_WORDS = (
|
|||||||
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
|
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
|
||||||
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
|
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
|
||||||
RE_PERSON_CONTEXT = re.compile(
|
RE_PERSON_CONTEXT = re.compile(
|
||||||
r"(?:(?:Dr\.?|DR\.?|Docteur|Pr\.?|Professeur|Mme|MME|Madame|M\.|Mr\.?|Monsieur"
|
r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
|
||||||
r"|Nom\s*:\s*"
|
r"|\bNom[ \t]*:[ \t]*"
|
||||||
r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par|Réalisé\s+par"
|
r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
|
||||||
r")\s+)"
|
r")[ \t]+)"
|
||||||
rf"({_PERSON_TOKEN}(?:\s+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots
|
rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots, pas de newline
|
||||||
)
|
)
|
||||||
|
|
||||||
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
|
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
|
||||||
@@ -561,8 +583,8 @@ RE_EXTRACT_CONTACT = re.compile(
|
|||||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
|
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
|
||||||
)
|
)
|
||||||
RE_EXTRACT_REDIGE = re.compile(
|
RE_EXTRACT_REDIGE = re.compile(
|
||||||
r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+"
|
r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
|
||||||
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
|
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
|
||||||
)
|
)
|
||||||
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
|
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
|
||||||
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
|
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
|
||||||
@@ -573,22 +595,34 @@ RE_EXTRACT_MME_MR = re.compile(
|
|||||||
)
|
)
|
||||||
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
|
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
|
||||||
RE_EXTRACT_DR_DEST = re.compile(
|
RE_EXTRACT_DR_DEST = re.compile(
|
||||||
r"(?:DR\.?|Dr\.?|Docteur)\s+"
|
r"(?:DR\.?|Dr\.?|Docteur)[ \t]+"
|
||||||
+ _INITIAL_OPT +
|
+ _INITIAL_OPT +
|
||||||
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
|
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
|
||||||
)
|
)
|
||||||
# Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
|
# Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
|
||||||
RE_EXTRACT_STAFF_ROLE = re.compile(
|
RE_EXTRACT_STAFF_ROLE = re.compile(
|
||||||
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre\s+Infirmier"
|
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre[ \t]+Infirmier"
|
||||||
r"|Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*"
|
r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)[ \t]*:?[ \t]*"
|
||||||
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:\s*-\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?\s+)?"
|
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
|
||||||
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[\s\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
|
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
|
||||||
)
|
)
|
||||||
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
|
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
|
||||||
RE_EXTRACT_PR = re.compile(
|
RE_EXTRACT_PR = re.compile(
|
||||||
r"(?:Pr\.?|Professeur)\s+"
|
r"(?:Pr\.?|Professeur)[ \t]+"
|
||||||
+ _INITIAL_OPT +
|
+ _INITIAL_OPT +
|
||||||
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
|
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
|
||||||
|
)
|
||||||
|
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
|
||||||
|
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
|
||||||
|
RE_EXTRACT_OPERATEUR = re.compile(
|
||||||
|
r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*"
|
||||||
|
r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?"
|
||||||
|
+ _INITIAL_OPT +
|
||||||
|
rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
|
||||||
|
)
|
||||||
|
# Téléphone avec extension slash : 05.59.44.38.32/34
|
||||||
|
RE_TEL_SLASH = re.compile(
|
||||||
|
r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?:/\d{1,4})(?!\d)"
|
||||||
)
|
)
|
||||||
|
|
||||||
CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
||||||
@@ -596,7 +630,7 @@ CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
|||||||
# --- Nouvelles regex : dates, adresses, âges, dossiers ---
|
# --- Nouvelles regex : dates, adresses, âges, dossiers ---
|
||||||
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
|
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
|
||||||
RE_DATE_NAISSANCE = re.compile(
|
RE_DATE_NAISSANCE = re.compile(
|
||||||
r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
|
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
|
||||||
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
|
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
@@ -947,6 +981,10 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
|||||||
if m:
|
if m:
|
||||||
val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
|
val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
|
||||||
return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
|
return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
|
||||||
|
m = RE_CSULT.search(line)
|
||||||
|
if m:
|
||||||
|
val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
|
||||||
|
return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
|
||||||
m = RE_RPPS.search(line)
|
m = RE_RPPS.search(line)
|
||||||
if m:
|
if m:
|
||||||
val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
|
val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
|
||||||
@@ -975,6 +1013,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
|||||||
def _repl_tel(m: re.Match) -> str:
|
def _repl_tel(m: re.Match) -> str:
|
||||||
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
||||||
return PLACEHOLDERS["TEL"]
|
return PLACEHOLDERS["TEL"]
|
||||||
|
line = RE_TEL_SLASH.sub(_repl_tel, line) # slash d'abord (plus spécifique)
|
||||||
line = RE_TEL.sub(_repl_tel, line)
|
line = RE_TEL.sub(_repl_tel, line)
|
||||||
line = RE_TEL_COMPACT.sub(_repl_tel, line)
|
line = RE_TEL_COMPACT.sub(_repl_tel, line)
|
||||||
|
|
||||||
@@ -1140,6 +1179,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
|
|||||||
def _repl_tel(m: re.Match) -> str:
|
def _repl_tel(m: re.Match) -> str:
|
||||||
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
||||||
return PLACEHOLDERS["TEL"]
|
return PLACEHOLDERS["TEL"]
|
||||||
|
key = RE_TEL_SLASH.sub(_repl_tel, key)
|
||||||
key = RE_TEL.sub(_repl_tel, key)
|
key = RE_TEL.sub(_repl_tel, key)
|
||||||
key = RE_TEL_COMPACT.sub(_repl_tel, key)
|
key = RE_TEL_COMPACT.sub(_repl_tel, key)
|
||||||
def _repl_email(m: re.Match) -> str:
|
def _repl_email(m: re.Match) -> str:
|
||||||
@@ -1200,6 +1240,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
return
|
return
|
||||||
if tok.lower() in _FORCE_EXCLUDE:
|
if tok.lower() in _FORCE_EXCLUDE:
|
||||||
return
|
return
|
||||||
|
# Filtre supplémentaire : ne pas force-add les mots médicaux connus
|
||||||
|
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||||
|
return
|
||||||
names.add(tok)
|
names.add(tok)
|
||||||
force_names.add(tok)
|
force_names.add(tok)
|
||||||
|
|
||||||
@@ -1324,10 +1367,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
|
|
||||||
# --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") ---
|
# --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") ---
|
||||||
for m in re.finditer(
|
for m in re.finditer(
|
||||||
r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s+"
|
r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
|
||||||
r"(?:DR\.?\s+)?"
|
r"(?:DR\.?[ \t]+)?"
|
||||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
||||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||||
full_text
|
full_text
|
||||||
):
|
):
|
||||||
for g in (m.group(1), m.group(2)):
|
for g in (m.group(1), m.group(2)):
|
||||||
@@ -1337,9 +1380,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
_add_name_force(tok)
|
_add_name_force(tok)
|
||||||
|
|
||||||
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") ---
|
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") ---
|
||||||
|
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
|
||||||
for m in re.finditer(
|
for m in re.finditer(
|
||||||
r"Signé\s+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
||||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||||
full_text
|
full_text
|
||||||
):
|
):
|
||||||
for g in (m.group(1), m.group(2)):
|
for g in (m.group(1), m.group(2)):
|
||||||
@@ -1350,9 +1394,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
|
|
||||||
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
|
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
|
||||||
for m in re.finditer(
|
for m in re.finditer(
|
||||||
r"Signé\s+—\s+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)\s+[-]?\s*"
|
r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
|
||||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})"
|
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})"
|
||||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
||||||
full_text
|
full_text
|
||||||
):
|
):
|
||||||
for g in (m.group(1), m.group(2)):
|
for g in (m.group(1), m.group(2)):
|
||||||
@@ -1363,7 +1407,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
|
|
||||||
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") ---
|
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") ---
|
||||||
for m in re.finditer(
|
for m in re.finditer(
|
||||||
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?\s+"
|
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
|
||||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})",
|
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})",
|
||||||
full_text
|
full_text
|
||||||
):
|
):
|
||||||
@@ -1373,8 +1417,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
|
|
||||||
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions ---
|
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions ---
|
||||||
for m in re.finditer(
|
for m in re.finditer(
|
||||||
r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
|
r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
|
||||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||||
full_text
|
full_text
|
||||||
):
|
):
|
||||||
for g in (m.group(1), m.group(2)):
|
for g in (m.group(1), m.group(2)):
|
||||||
@@ -1387,9 +1431,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
# Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM"
|
# Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM"
|
||||||
# Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant)
|
# Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant)
|
||||||
for m in re.finditer(
|
for m in re.finditer(
|
||||||
r"\d{1,2}\s*:\s*\d{2}\s+"
|
r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
|
||||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
|
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
|
||||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
||||||
full_text
|
full_text
|
||||||
):
|
):
|
||||||
for g in (m.group(1), m.group(2)):
|
for g in (m.group(1), m.group(2)):
|
||||||
@@ -1415,13 +1459,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
|||||||
return filtered, hits, force_names
|
return filtered, hits, force_names
|
||||||
|
|
||||||
|
|
||||||
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]:
|
||||||
"""Pré-scan du document brut pour extraire les noms de personnes
|
"""Pré-scan du document brut pour extraire les noms de personnes
|
||||||
depuis les champs structurés (Patient, Rédigé par, etc.).
|
depuis les champs structurés (Patient, Rédigé par, etc.).
|
||||||
Retourne un ensemble de tokens (mots) à masquer globalement."""
|
Retourne (names, force_names) : ensemble de tokens à masquer,
|
||||||
|
et sous-ensemble qui bypass les stop words."""
|
||||||
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
|
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
|
||||||
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
|
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
|
||||||
names: set = set()
|
names: set = set()
|
||||||
|
force_names: set = set()
|
||||||
|
|
||||||
def _add_tokens(match_str: str):
|
def _add_tokens(match_str: str):
|
||||||
for token in match_str.split():
|
for token in match_str.split():
|
||||||
@@ -1434,6 +1480,17 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
|||||||
continue
|
continue
|
||||||
names.add(token)
|
names.add(token)
|
||||||
|
|
||||||
|
def _add_tokens_force_all(match_str: str):
|
||||||
|
"""Bypass stop words pour TOUS les tokens (contexte Patient: très fiable)."""
|
||||||
|
for token in match_str.split():
|
||||||
|
token = token.strip(" .-'")
|
||||||
|
if len(token) < 2:
|
||||||
|
continue
|
||||||
|
if token.upper() in wl_sections or token in wl_phrases:
|
||||||
|
continue
|
||||||
|
names.add(token)
|
||||||
|
force_names.add(token)
|
||||||
|
|
||||||
def _add_tokens_force_first(match_str):
|
def _add_tokens_force_first(match_str):
|
||||||
"""Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
|
"""Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
|
||||||
tokens = match_str.split()
|
tokens = match_str.split()
|
||||||
@@ -1441,21 +1498,20 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
|||||||
token = token.strip(" .-'")
|
token = token.strip(" .-'")
|
||||||
if len(token) < 2:
|
if len(token) < 2:
|
||||||
continue
|
continue
|
||||||
|
if token.upper() in wl_sections or token in wl_phrases:
|
||||||
|
continue
|
||||||
|
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||||
|
continue
|
||||||
if i == 0:
|
if i == 0:
|
||||||
# Premier token après Dr/Mme : toujours un nom, bypass stop words
|
# Premier token après Dr/Mme : contexte fiable
|
||||||
if token.upper() not in wl_sections:
|
names.add(token)
|
||||||
names.add(token)
|
|
||||||
else:
|
else:
|
||||||
if len(token) < 3:
|
if len(token) < 3:
|
||||||
continue
|
continue
|
||||||
if token.upper() in wl_sections or token in wl_phrases:
|
|
||||||
continue
|
|
||||||
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
|
||||||
continue
|
|
||||||
names.add(token)
|
names.add(token)
|
||||||
|
|
||||||
for m in RE_EXTRACT_PATIENT.finditer(full_text):
|
for m in RE_EXTRACT_PATIENT.finditer(full_text):
|
||||||
_add_tokens(m.group(1))
|
_add_tokens_force_all(m.group(1))
|
||||||
for m in RE_EXTRACT_REDIGE.finditer(full_text):
|
for m in RE_EXTRACT_REDIGE.finditer(full_text):
|
||||||
_add_tokens(m.group(1))
|
_add_tokens(m.group(1))
|
||||||
for m in RE_EXTRACT_MME_MR.finditer(full_text):
|
for m in RE_EXTRACT_MME_MR.finditer(full_text):
|
||||||
@@ -1482,6 +1538,9 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
|||||||
# Pr / Professeur + nom(s)
|
# Pr / Professeur + nom(s)
|
||||||
for m in RE_EXTRACT_PR.finditer(full_text):
|
for m in RE_EXTRACT_PR.finditer(full_text):
|
||||||
_add_tokens_force_first(m.group(1))
|
_add_tokens_force_first(m.group(1))
|
||||||
|
# Opérateur / Anesthésiste / Chirurgien + nom(s)
|
||||||
|
for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
|
||||||
|
_add_tokens_force_first(m.group(1))
|
||||||
|
|
||||||
# Extraction des noms dans les listes virgulées après Dr/Docteur
|
# Extraction des noms dans les listes virgulées après Dr/Docteur
|
||||||
# ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
|
# ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
|
||||||
@@ -1509,7 +1568,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
|||||||
if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET:
|
if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET:
|
||||||
names.add(part)
|
names.add(part)
|
||||||
|
|
||||||
return names
|
return names, force_names
|
||||||
|
|
||||||
|
|
||||||
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str:
|
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str:
|
||||||
@@ -1517,6 +1576,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
|
|||||||
placeholder = PLACEHOLDERS["NOM"]
|
placeholder = PLACEHOLDERS["NOM"]
|
||||||
_force = force_names or set()
|
_force = force_names or set()
|
||||||
safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
|
safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
|
||||||
|
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
|
||||||
|
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
|
||||||
|
for token in sorted(safe_names, key=len, reverse=True):
|
||||||
|
audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
|
||||||
for token in sorted(safe_names, key=len, reverse=True):
|
for token in sorted(safe_names, key=len, reverse=True):
|
||||||
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
|
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
|
||||||
new_text = []
|
new_text = []
|
||||||
@@ -1577,7 +1640,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
|
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
|
||||||
"\n".join(rows) for rows in tables_lines
|
"\n".join(rows) for rows in tables_lines
|
||||||
)
|
)
|
||||||
extracted_names = _extract_document_names(full_raw, cfg)
|
extracted_names, doc_force_names = _extract_document_names(full_raw, cfg)
|
||||||
|
|
||||||
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
|
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
|
||||||
is_trackare = _is_trackare_document(full_raw)
|
is_trackare = _is_trackare_document(full_raw)
|
||||||
@@ -1586,6 +1649,8 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
|
trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
|
||||||
extracted_names.update(trackare_names)
|
extracted_names.update(trackare_names)
|
||||||
audit.extend(trackare_hits)
|
audit.extend(trackare_hits)
|
||||||
|
# Fusionner les force_names des deux sources
|
||||||
|
all_force_names = doc_force_names | trackare_force_names
|
||||||
|
|
||||||
# Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées,
|
# Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées,
|
||||||
# avec possiblement 0-2 lignes intermédiaires masquées ou vides)
|
# avec possiblement 0-2 lignes intermédiaires masquées ou vides)
|
||||||
@@ -1595,6 +1660,32 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
for m in _RE_FINESS_MULTILINE.finditer(full_raw):
|
for m in _RE_FINESS_MULTILINE.finditer(full_raw):
|
||||||
audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
|
audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
|
||||||
|
|
||||||
|
# Phase 0d : date de naissance multiline (label et date sur lignes séparées)
|
||||||
|
# Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950"
|
||||||
|
_RE_DATE_NAISSANCE_MULTILINE = re.compile(
|
||||||
|
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*"
|
||||||
|
r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
|
||||||
|
audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))
|
||||||
|
|
||||||
|
# Phase 0e : IPP multiline (N°Ipp :\n20023294 ou I.P.P. :\nS1032021)
|
||||||
|
_RE_IPP_MULTILINE = re.compile(
|
||||||
|
r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
for m in _RE_IPP_MULTILINE.finditer(full_raw):
|
||||||
|
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
|
||||||
|
|
||||||
|
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
|
||||||
|
_RE_DEMANDE_MULTILINE = re.compile(
|
||||||
|
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
|
||||||
|
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
|
||||||
|
|
||||||
# Phase 1 : masquage ligne par ligne (regex classiques)
|
# Phase 1 : masquage ligne par ligne (regex classiques)
|
||||||
out_pages: List[str] = []
|
out_pages: List[str] = []
|
||||||
for i, page_txt in enumerate(pages_text):
|
for i, page_txt in enumerate(pages_text):
|
||||||
@@ -1620,7 +1711,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
|
|
||||||
# Phase 2 : application globale des noms extraits (rattrapage)
|
# Phase 2 : application globale des noms extraits (rattrapage)
|
||||||
if extracted_names:
|
if extracted_names:
|
||||||
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=trackare_force_names)
|
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)
|
||||||
|
|
||||||
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
|
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
|
||||||
text_out = _apply_trackare_hits_to_text(text_out, audit)
|
text_out = _apply_trackare_hits_to_text(text_out, audit)
|
||||||
@@ -1806,6 +1897,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
|||||||
protected, kept = strip_tables(text)
|
protected, kept = strip_tables(text)
|
||||||
# PII critiques (comme avant)
|
# PII critiques (comme avant)
|
||||||
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
|
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
|
||||||
|
protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
|
||||||
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
|
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
|
||||||
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
|
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
|
||||||
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
|
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
|
||||||
@@ -1846,6 +1938,10 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
|||||||
tokens = [t for t in span.split() if t]
|
tokens = [t for t in span.split() if t]
|
||||||
if len(tokens) == 1 and len(tokens[0]) <= 3:
|
if len(tokens) == 1 and len(tokens[0]) <= 3:
|
||||||
return raw
|
return raw
|
||||||
|
# Filtrer les termes médicaux (stop words)
|
||||||
|
clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
|
||||||
|
if not clean:
|
||||||
|
return raw
|
||||||
return raw.replace(span, PLACEHOLDERS["NOM"])
|
return raw.replace(span, PLACEHOLDERS["NOM"])
|
||||||
protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
|
protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
|
||||||
res = list(protected)
|
res = list(protected)
|
||||||
@@ -1971,7 +2067,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
|
|||||||
compact = re.sub(r"\s+", "", token)
|
compact = re.sub(r"\s+", "", token)
|
||||||
if compact != token:
|
if compact != token:
|
||||||
rects = page.search_for(compact)
|
rects = page.search_for(compact)
|
||||||
if not rects and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
|
if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
|
||||||
for word in token.split():
|
for word in token.split():
|
||||||
word = word.strip(" .-'")
|
word = word.strip(" .-'")
|
||||||
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||||
@@ -2074,7 +2170,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
|||||||
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
|
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
|
||||||
compact = re.sub(r"\s+", "", token)
|
compact = re.sub(r"\s+", "", token)
|
||||||
found = page.search_for(compact)
|
found = page.search_for(compact)
|
||||||
if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
|
if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
|
||||||
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
|
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
|
||||||
for word in token.split():
|
for word in token.split():
|
||||||
word = word.strip(" .-'")
|
word = word.strip(" .-'")
|
||||||
@@ -2359,13 +2455,13 @@ def process_pdf(
|
|||||||
# 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques
|
# 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques
|
||||||
# Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages
|
# Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages
|
||||||
# pour éviter les fuites sur les documents multi-pages (ex: CRO)
|
# pour éviter les fuites sur les documents multi-pages (ex: CRO)
|
||||||
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS"}
|
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER"}
|
||||||
|
|
||||||
_global_pii: Dict[str, set] = {}
|
_global_pii: Dict[str, set] = {}
|
||||||
for h in anon.audit:
|
for h in anon.audit:
|
||||||
# Collecter TOUS les types pour analyse, mais ne propager que les critiques
|
# Collecter TOUS les types pour analyse, mais ne propager que les critiques
|
||||||
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
||||||
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP",
|
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER",
|
||||||
"force_term", "force_regex", "FINESS"}:
|
"force_term", "force_regex", "FINESS"}:
|
||||||
# Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations
|
# Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations
|
||||||
if h.kind == "DATE_NAISSANCE":
|
if h.kind == "DATE_NAISSANCE":
|
||||||
|
|||||||
@@ -18,11 +18,15 @@ blacklist:
|
|||||||
force_mask_terms:
|
force_mask_terms:
|
||||||
- CENTRE HOSPITALIER COTE BASQUE
|
- CENTRE HOSPITALIER COTE BASQUE
|
||||||
- CENTRE HOSPITALIER DE LA COTE BASQUE
|
- CENTRE HOSPITALIER DE LA COTE BASQUE
|
||||||
|
- POLYCLINIQUE COTE BASQUE SUD
|
||||||
|
- POLYCLINIQUE CÔTE BASQUE SUD
|
||||||
- CHCB
|
- CHCB
|
||||||
|
- '640780417'
|
||||||
- 'Dates du séjour :'
|
- 'Dates du séjour :'
|
||||||
- CONCERTATION
|
- CONCERTATION
|
||||||
force_mask_regex:
|
force_mask_regex:
|
||||||
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
|
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
|
||||||
|
- 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud'
|
||||||
kv_labels_preserve:
|
kv_labels_preserve:
|
||||||
- FINESS
|
- FINESS
|
||||||
- IPP
|
- IPP
|
||||||
|
|||||||
@@ -10,42 +10,43 @@ sys.path.insert(0, str(Path(__file__).parent))
|
|||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
from eds_pseudo_manager import EdsPseudoManager
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
from vlm_manager import VlmManager
|
||||||
|
|
||||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||||
OUTDIR = SRC / "anonymise_audit_30"
|
OUTDIR = SRC / "anonymise_audit_30"
|
||||||
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
||||||
|
|
||||||
PDFS = [
|
PDFS = [
|
||||||
SRC / "110_23061319/trackare-07026002-23061319_07026002_23061319.pdf",
|
SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
|
||||||
SRC / "115_23066188/CRH 23066188.pdf",
|
SRC / "124_23074376/trackare-05000272-23074376_05000272_23074376.pdf",
|
||||||
SRC / "161_23098838/CRO 23098838.pdf",
|
SRC / "133_23056022/CONSULTATION ANESTHESISTE 23056022.pdf",
|
||||||
SRC / "179_23126805/trackare-23005591-23126805_23005591_23126805.pdf",
|
SRC / "141_23090597/trackare-BA042686-23090597_BA042686_23090597.pdf",
|
||||||
SRC / "181_23127286/CRH 23127286.pdf",
|
SRC / "148_23018396/trackare-23000862-23018396_23000862_23018396.pdf",
|
||||||
SRC / "192_23132490/CRH 23132490.pdf",
|
SRC / "183_23087212/LETTRE DE SORTIE 23087212.pdf",
|
||||||
SRC / "208_23151988/trackare-23020064-23151988_23020064_23151988.pdf",
|
SRC / "216_23159905/CRO 23159905.pdf",
|
||||||
SRC / "215_23158603/trackare-22028007-23158603_22028007_23158603.pdf",
|
SRC / "216_23159905/trackare-99246761-23159905_99246761_23159905.pdf",
|
||||||
SRC / "227_23173599/CRH 23173599.pdf",
|
SRC / "222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||||
SRC / "236_23116794/trackare-BA054633-23116794_BA054633_23116794.pdf",
|
SRC / "225_23160703/CRO 23160703.pdf",
|
||||||
SRC / "248_23194278/CRH 23194278.pdf",
|
SRC / "26_23127395/trackare-BA192486-23127395_BA192486_23127395.pdf",
|
||||||
SRC / "263_23203642/CRO 23203642.pdf",
|
SRC / "269_23232115/BACTERIO 23232115.pdf",
|
||||||
SRC / "28_23135549/trackare-15021750-23135549_15021750_23135549.pdf",
|
SRC / "290_23025988/CR consultation anesth-290-23025988.pdf",
|
||||||
SRC / "321_23043929/CRH 321_23066387.pdf",
|
SRC / "315_23060770/trackare-05012965-23060770_05012965_23060770.pdf",
|
||||||
SRC / "379_23098754/trackare-18009635-23098754_18009635_23098754.pdf",
|
SRC / "385_23102874/trackare-BA065989-23102874_BA065989_23102874.pdf",
|
||||||
SRC / "39_23167029/trackare-23022121-23167029_23022121_23167029.pdf",
|
SRC / "433_23135726/trackare-BA127127-23135726_BA127127_23135726.pdf",
|
||||||
SRC / "444_23141032/trackare-BA102259-23141032_BA102259_23141032.pdf",
|
|
||||||
SRC / "478_23161697/cro 478_23161697.pdf",
|
|
||||||
SRC / "50_23219173/trackare-07019278-23219173_07019278_23219173.pdf",
|
|
||||||
SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
|
SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
|
||||||
SRC / "556_23220878/trackare-21041742-23220878_21041742_23220878.pdf",
|
SRC / "552_23214501/trackare-BA171849-23214501_BA171849_23214501.pdf",
|
||||||
SRC / "602_23070052/trackare-20028293-23070052_20028293_23070052.pdf",
|
SRC / "590_23043950/trackare-17015185-23043950_17015185_23043950.pdf",
|
||||||
SRC / "604_23070704/trackare-23008170-23070704_23008170_23070704.pdf",
|
SRC / "60_23106634/CRH 60_23106634.pdf",
|
||||||
SRC / "655_23163458/trackare-01296746-23163458_01296746_23163458.pdf",
|
SRC / "603_23070213/trackare-00260974-23070213_00260974_23070213.pdf",
|
||||||
SRC / "684_23207941/CRH 684_23207941.pdf",
|
SRC / "609_23076655/trackare-BA067657-23076655_BA067657_23076655.pdf",
|
||||||
SRC / "79_23187785/79_23187785 Dossier.pdf",
|
SRC / "625_23098722/trackare-05012679-23098722_05012679_23098722.pdf",
|
||||||
SRC / "12_23084754/CRO 23084754.pdf" if (SRC / "12_23084754/CRO 23084754.pdf").exists() else SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
|
SRC / "632_23124019/trackare-11004431-23124019_11004431_23124019.pdf",
|
||||||
SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
|
SRC / "639_23135847/trackare-07003136-23135847_07003136_23135847.pdf",
|
||||||
SRC / "131_23079402/CRH 23079402.pdf",
|
SRC / "656_23165708/trackare-13013848-23165708_13013848_23165708.pdf",
|
||||||
SRC / "290_23025988/cr anesth 290_23025988.pdf",
|
SRC / "664_23175616/trackare-03020576-23175616_03020576_23175616.pdf",
|
||||||
|
SRC / "8_23074520/trackare-BA093659-23074520_BA093659_23074520.pdf",
|
||||||
|
SRC / "88_23034958/trackare-14025311-23034958_14025311_23034958.pdf",
|
||||||
|
SRC / "89_23016863/trackare-BA121804-23016863_BA121804_23016863.pdf",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -54,7 +55,16 @@ def main():
|
|||||||
ner = EdsPseudoManager()
|
ner = EdsPseudoManager()
|
||||||
ner.load()
|
ner.load()
|
||||||
assert ner.is_loaded(), "EDS-Pseudo non chargé"
|
assert ner.is_loaded(), "EDS-Pseudo non chargé"
|
||||||
print("EDS-Pseudo chargé.\n", flush=True)
|
print("EDS-Pseudo chargé.", flush=True)
|
||||||
|
|
||||||
|
print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True)
|
||||||
|
vlm = VlmManager()
|
||||||
|
try:
|
||||||
|
vlm.load()
|
||||||
|
print(f"VLM chargé.\n", flush=True)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"VLM indisponible ({e}), on continue sans.\n", flush=True)
|
||||||
|
vlm = None
|
||||||
|
|
||||||
# Vérifier existence des fichiers
|
# Vérifier existence des fichiers
|
||||||
existing = [p for p in PDFS if p.exists()]
|
existing = [p for p in PDFS if p.exists()]
|
||||||
@@ -86,6 +96,7 @@ def main():
|
|||||||
ner_manager=ner,
|
ner_manager=ner,
|
||||||
ner_thresholds=None,
|
ner_thresholds=None,
|
||||||
ogc_label=ogc,
|
ogc_label=ogc,
|
||||||
|
vlm_manager=vlm,
|
||||||
)
|
)
|
||||||
audit_path = Path(outputs.get("audit", ""))
|
audit_path = Path(outputs.get("audit", ""))
|
||||||
if audit_path.exists():
|
if audit_path.exists():
|
||||||
|
|||||||
Reference in New Issue
Block a user