diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 41f696c..5a2f154 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -191,7 +191,8 @@ RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}") RE_TEL = re.compile(r"(? str: if m: val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"])) return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line) + m = RE_CSULT.search(line) + if m: + val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"])) + return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line) m = RE_RPPS.search(line) if m: val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"])) @@ -975,6 +1013,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict def _repl_tel(m: re.Match) -> str: audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"])) return PLACEHOLDERS["TEL"] + line = RE_TEL_SLASH.sub(_repl_tel, line) # slash d'abord (plus spécifique) line = RE_TEL.sub(_repl_tel, line) line = RE_TEL_COMPACT.sub(_repl_tel, line) @@ -1140,6 +1179,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str: def _repl_tel(m: re.Match) -> str: audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"])) return PLACEHOLDERS["TEL"] + key = RE_TEL_SLASH.sub(_repl_tel, key) key = RE_TEL.sub(_repl_tel, key) key = RE_TEL_COMPACT.sub(_repl_tel, key) def _repl_email(m: re.Match) -> str: @@ -1200,6 +1240,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: return if tok.lower() in _FORCE_EXCLUDE: return + # Filtre supplémentaire : ne pas force-add les mots médicaux connus + if tok.lower() in _MEDICAL_STOP_WORDS_SET: + return names.add(tok) force_names.add(tok) @@ -1324,10 +1367,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: # --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") --- for m in re.finditer( - r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s+" - r"(?:DR\.?\s+)?" + r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+" + r"(?:DR\.?[ \t]+)?" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1337,9 +1380,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: _add_name_force(tok) # --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") --- + # IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante for m in re.finditer( - r"Signé\s+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", + r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1350,9 +1394,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: # --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") --- for m in re.finditer( - r"Signé\s+—\s+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)\s+[-]?\s*" + r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?", + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1363,7 +1407,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: # --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") --- for m in re.finditer( - r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?\s+" + r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})", full_text ): @@ -1373,8 +1417,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: # --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions --- for m in re.finditer( - r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", + r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})" + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1387,9 +1431,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: # Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM" # Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant) for m in re.finditer( - r"\d{1,2}\s*:\s*\d{2}\s+" + r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?", + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1415,13 +1459,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]: return filtered, hits, force_names -def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: +def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]: """Pré-scan du document brut pour extraire les noms de personnes depuis les champs structurés (Patient, Rédigé par, etc.). - Retourne un ensemble de tokens (mots) à masquer globalement.""" + Retourne (names, force_names) : ensemble de tokens à masquer, + et sous-ensemble qui bypass les stop words.""" wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or []) wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or []) names: set = set() + force_names: set = set() def _add_tokens(match_str: str): for token in match_str.split(): @@ -1434,6 +1480,17 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: continue names.add(token) + def _add_tokens_force_all(match_str: str): + """Bypass stop words pour TOUS les tokens (contexte Patient: très fiable).""" + for token in match_str.split(): + token = token.strip(" .-'") + if len(token) < 2: + continue + if token.upper() in wl_sections or token in wl_phrases: + continue + names.add(token) + force_names.add(token) + def _add_tokens_force_first(match_str): """Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort).""" tokens = match_str.split() @@ -1441,21 +1498,20 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: token = token.strip(" .-'") if len(token) < 2: continue + if token.upper() in wl_sections or token in wl_phrases: + continue + if token.lower() in _MEDICAL_STOP_WORDS_SET: + continue if i == 0: - # Premier token après Dr/Mme : toujours un nom, bypass stop words - if token.upper() not in wl_sections: - names.add(token) + # Premier token après Dr/Mme : contexte fiable + names.add(token) else: if len(token) < 3: continue - if token.upper() in wl_sections or token in wl_phrases: - continue - if token.lower() in _MEDICAL_STOP_WORDS_SET: - continue names.add(token) for m in RE_EXTRACT_PATIENT.finditer(full_text): - _add_tokens(m.group(1)) + _add_tokens_force_all(m.group(1)) for m in RE_EXTRACT_REDIGE.finditer(full_text): _add_tokens(m.group(1)) for m in RE_EXTRACT_MME_MR.finditer(full_text): @@ -1482,6 +1538,9 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: # Pr / Professeur + nom(s) for m in RE_EXTRACT_PR.finditer(full_text): _add_tokens_force_first(m.group(1)) + # Opérateur / Anesthésiste / Chirurgien + nom(s) + for m in RE_EXTRACT_OPERATEUR.finditer(full_text): + _add_tokens_force_first(m.group(1)) # Extraction des noms dans les listes virgulées après Dr/Docteur # ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé" @@ -1509,7 +1568,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET: names.add(part) - return names + return names, force_names def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str: @@ -1517,6 +1576,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam placeholder = PLACEHOLDERS["NOM"] _force = force_names or set() safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)} + # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster + # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page) + for token in sorted(safe_names, key=len, reverse=True): + audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder)) for token in sorted(safe_names, key=len, reverse=True): pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE) new_text = [] @@ -1577,7 +1640,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] full_raw = "\n".join(pages_text) + "\n" + "\n".join( "\n".join(rows) for rows in tables_lines ) - extracted_names = _extract_document_names(full_raw, cfg) + extracted_names, doc_force_names = _extract_document_names(full_raw, cfg) # Phase 0b : si document Trackare, extraction renforcée des PII structurés is_trackare = _is_trackare_document(full_raw) @@ -1586,6 +1649,8 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw) extracted_names.update(trackare_names) audit.extend(trackare_hits) + # Fusionner les force_names des deux sources + all_force_names = doc_force_names | trackare_force_names # Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées, # avec possiblement 0-2 lignes intermédiaires masquées ou vides) @@ -1595,6 +1660,32 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_FINESS_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"])) + # Phase 0d : date de naissance multiline (label et date sur lignes séparées) + # Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950" + _RE_DATE_NAISSANCE_MULTILINE = re.compile( + r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*" + r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})", + re.IGNORECASE, + ) + for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw): + audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"])) + + # Phase 0e : IPP multiline (N°Ipp :\n20023294 ou I.P.P. :\nS1032021) + _RE_IPP_MULTILINE = re.compile( + r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b", + re.IGNORECASE, + ) + for m in _RE_IPP_MULTILINE.finditer(full_raw): + audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"])) + + # Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164) + _RE_DEMANDE_MULTILINE = re.compile( + r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})", + re.IGNORECASE, + ) + for m in _RE_DEMANDE_MULTILINE.finditer(full_raw): + audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) + # Phase 1 : masquage ligne par ligne (regex classiques) out_pages: List[str] = [] for i, page_txt in enumerate(pages_text): @@ -1620,7 +1711,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] # Phase 2 : application globale des noms extraits (rattrapage) if extracted_names: - text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=trackare_force_names) + text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names) # Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS) text_out = _apply_trackare_hits_to_text(text_out, audit) @@ -1806,6 +1897,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: protected, kept = strip_tables(text) # PII critiques (comme avant) protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected) + protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected) protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected) @@ -1846,6 +1938,10 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: tokens = [t for t in span.split() if t] if len(tokens) == 1 and len(tokens[0]) <= 3: return raw + # Filtrer les termes médicaux (stop words) + clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET] + if not clean: + return raw return raw.replace(span, PLACEHOLDERS["NOM"]) protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected) res = list(protected) @@ -1971,7 +2067,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc compact = re.sub(r"\s+", "", token) if compact != token: rects = page.search_for(compact) - if not rects and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}: + if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}: for word in token.split(): word = word.strip(" .-'") if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET: @@ -2074,7 +2170,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}: compact = re.sub(r"\s+", "", token) found = page.search_for(compact) - if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM", + if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM", "VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}: for word in token.split(): word = word.strip(" .-'") @@ -2359,13 +2455,13 @@ def process_pdf( # 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques # Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages # pour éviter les fuites sur les documents multi-pages (ex: CRO) - _CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS"} - + _CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER"} + _global_pii: Dict[str, set] = {} for h in anon.audit: # Collecter TOUS les types pour analyse, mais ne propager que les critiques if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB", - "VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", + "VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER", "force_term", "force_regex", "FINESS"}: # Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations if h.kind == "DATE_NAISSANCE": diff --git a/config/dictionnaires.yml b/config/dictionnaires.yml index c22ce9c..bde8f05 100644 --- a/config/dictionnaires.yml +++ b/config/dictionnaires.yml @@ -18,11 +18,15 @@ blacklist: force_mask_terms: - CENTRE HOSPITALIER COTE BASQUE - CENTRE HOSPITALIER DE LA COTE BASQUE + - POLYCLINIQUE COTE BASQUE SUD + - POLYCLINIQUE CÔTE BASQUE SUD - CHCB + - '640780417' - 'Dates du séjour :' - CONCERTATION force_mask_regex: - 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque' + - 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud' kv_labels_preserve: - FINESS - IPP diff --git a/run_batch_30_audit.py b/run_batch_30_audit.py index 199acfd..ee48a01 100644 --- a/run_batch_30_audit.py +++ b/run_batch_30_audit.py @@ -10,42 +10,43 @@ sys.path.insert(0, str(Path(__file__).parent)) import anonymizer_core_refactored_onnx as core from eds_pseudo_manager import EdsPseudoManager +from vlm_manager import VlmManager SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") OUTDIR = SRC / "anonymise_audit_30" CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") PDFS = [ - SRC / "110_23061319/trackare-07026002-23061319_07026002_23061319.pdf", - SRC / "115_23066188/CRH 23066188.pdf", - SRC / "161_23098838/CRO 23098838.pdf", - SRC / "179_23126805/trackare-23005591-23126805_23005591_23126805.pdf", - SRC / "181_23127286/CRH 23127286.pdf", - SRC / "192_23132490/CRH 23132490.pdf", - SRC / "208_23151988/trackare-23020064-23151988_23020064_23151988.pdf", - SRC / "215_23158603/trackare-22028007-23158603_22028007_23158603.pdf", - SRC / "227_23173599/CRH 23173599.pdf", - SRC / "236_23116794/trackare-BA054633-23116794_BA054633_23116794.pdf", - SRC / "248_23194278/CRH 23194278.pdf", - SRC / "263_23203642/CRO 23203642.pdf", - SRC / "28_23135549/trackare-15021750-23135549_15021750_23135549.pdf", - SRC / "321_23043929/CRH 321_23066387.pdf", - SRC / "379_23098754/trackare-18009635-23098754_18009635_23098754.pdf", - SRC / "39_23167029/trackare-23022121-23167029_23022121_23167029.pdf", - SRC / "444_23141032/trackare-BA102259-23141032_BA102259_23141032.pdf", - SRC / "478_23161697/cro 478_23161697.pdf", - SRC / "50_23219173/trackare-07019278-23219173_07019278_23219173.pdf", + SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf", + SRC / "124_23074376/trackare-05000272-23074376_05000272_23074376.pdf", + SRC / "133_23056022/CONSULTATION ANESTHESISTE 23056022.pdf", + SRC / "141_23090597/trackare-BA042686-23090597_BA042686_23090597.pdf", + SRC / "148_23018396/trackare-23000862-23018396_23000862_23018396.pdf", + SRC / "183_23087212/LETTRE DE SORTIE 23087212.pdf", + SRC / "216_23159905/CRO 23159905.pdf", + SRC / "216_23159905/trackare-99246761-23159905_99246761_23159905.pdf", + SRC / "222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf", + SRC / "225_23160703/CRO 23160703.pdf", + SRC / "26_23127395/trackare-BA192486-23127395_BA192486_23127395.pdf", + SRC / "269_23232115/BACTERIO 23232115.pdf", + SRC / "290_23025988/CR consultation anesth-290-23025988.pdf", + SRC / "315_23060770/trackare-05012965-23060770_05012965_23060770.pdf", + SRC / "385_23102874/trackare-BA065989-23102874_BA065989_23102874.pdf", + SRC / "433_23135726/trackare-BA127127-23135726_BA127127_23135726.pdf", SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf", - SRC / "556_23220878/trackare-21041742-23220878_21041742_23220878.pdf", - SRC / "602_23070052/trackare-20028293-23070052_20028293_23070052.pdf", - SRC / "604_23070704/trackare-23008170-23070704_23008170_23070704.pdf", - SRC / "655_23163458/trackare-01296746-23163458_01296746_23163458.pdf", - SRC / "684_23207941/CRH 684_23207941.pdf", - SRC / "79_23187785/79_23187785 Dossier.pdf", - SRC / "12_23084754/CRO 23084754.pdf" if (SRC / "12_23084754/CRO 23084754.pdf").exists() else SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf", - SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf", - SRC / "131_23079402/CRH 23079402.pdf", - SRC / "290_23025988/cr anesth 290_23025988.pdf", + SRC / "552_23214501/trackare-BA171849-23214501_BA171849_23214501.pdf", + SRC / "590_23043950/trackare-17015185-23043950_17015185_23043950.pdf", + SRC / "60_23106634/CRH 60_23106634.pdf", + SRC / "603_23070213/trackare-00260974-23070213_00260974_23070213.pdf", + SRC / "609_23076655/trackare-BA067657-23076655_BA067657_23076655.pdf", + SRC / "625_23098722/trackare-05012679-23098722_05012679_23098722.pdf", + SRC / "632_23124019/trackare-11004431-23124019_11004431_23124019.pdf", + SRC / "639_23135847/trackare-07003136-23135847_07003136_23135847.pdf", + SRC / "656_23165708/trackare-13013848-23165708_13013848_23165708.pdf", + SRC / "664_23175616/trackare-03020576-23175616_03020576_23175616.pdf", + SRC / "8_23074520/trackare-BA093659-23074520_BA093659_23074520.pdf", + SRC / "88_23034958/trackare-14025311-23034958_14025311_23034958.pdf", + SRC / "89_23016863/trackare-BA121804-23016863_BA121804_23016863.pdf", ] @@ -54,7 +55,16 @@ def main(): ner = EdsPseudoManager() ner.load() assert ner.is_loaded(), "EDS-Pseudo non chargé" - print("EDS-Pseudo chargé.\n", flush=True) + print("EDS-Pseudo chargé.", flush=True) + + print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True) + vlm = VlmManager() + try: + vlm.load() + print(f"VLM chargé.\n", flush=True) + except Exception as e: + print(f"VLM indisponible ({e}), on continue sans.\n", flush=True) + vlm = None # Vérifier existence des fichiers existing = [p for p in PDFS if p.exists()] @@ -86,6 +96,7 @@ def main(): ner_manager=ner, ner_thresholds=None, ogc_label=ogc, + vlm_manager=vlm, ) audit_path = Path(outputs.get("audit", "")) if audit_path.exists():