From a827d860f1511f8e1eae26ba22d403b8d44d031e Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Thu, 12 Mar 2026 10:38:27 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20corrections=20retours=20collaborateurs?= =?UTF-8?q?=20=E2=80=94=20FP=20m=C3=A9dicaments,=20N=C2=B0=20venue,=20tail?= =?UTF-8?q?le=20PDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix critique: whole-word search dans redact_pdf_raster et redact_pdf_vector pour éviter le substring matching (ex: "Luc" dans "FLUCONAZOLE", "TATIN" dans "ATORVASTATINE"). Appliqué à tous les kinds nom/NER. - Ajout regex RE_VENUE_SEJOUR pour N° venue / N° séjour (BACTERIO, Trackare) - DDN multiline élargi: tolère 0-3 lignes entre label DDN et date (tableaux BACTERIO) - N° venue multiline: détection dans tableaux BACTERIO interleaved - Réduction taille PDF raster: 150 DPI + JPEG quality 85 (était 300 DPI PNG) Ratio moyen: 19.5x (était 30-50x) - Score qualité maintenu: 97.0/100 (grade A), 0 régression Co-Authored-By: Claude Opus 4.6 --- anonymizer_core_refactored_onnx.py | 131 ++++++++++++++++++++++------- evaluation/baseline_scores.json | 8 +- 2 files changed, 105 insertions(+), 34 deletions(-) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 277608c..c07a0c4 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -912,6 +912,12 @@ RE_EPISODE = re.compile( r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})", re.IGNORECASE, ) +# N° venue (BACTERIO), N° séjour — identifiants de séjour hospitalier +RE_VENUE_SEJOUR = re.compile( + r"(?:N[°o]?\s*venue|N[°o]?\s*séjour|N[°o]?\s*de\s+séjour" + r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})", + re.IGNORECASE, +) @dataclass class PiiHit: @@ -1302,6 +1308,14 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict return full[:full.find(val)] + PLACEHOLDERS["EPISODE"] line = RE_EPISODE.sub(_repl_episode, line) + # N° venue / N° séjour (BACTERIO, Trackare) + def _repl_venue(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "NDA", m.group(0), PLACEHOLDERS["NDA"])) + full = m.group(0) + val = m.group(1) + return full[:full.find(val)] + PLACEHOLDERS["NDA"] + line = RE_VENUE_SEJOUR.sub(_repl_venue, line) + # Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.) def _repl_etab(m: re.Match) -> str: audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"])) @@ -1988,8 +2002,10 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] # Phase 0d : date de naissance multiline (label et date sur lignes séparées) # Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950" + # Variante large : tolère 0-3 lignes intermédiaires (tableaux BACTERIO) _RE_DATE_NAISSANCE_MULTILINE = re.compile( - r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*" + r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n" + r"(?:[^\n]*\n){0,3}\s*" r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})", re.IGNORECASE, ) @@ -2012,6 +2028,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_DEMANDE_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) + # Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés) + _RE_VENUE_MULTILINE = re.compile( + r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})", + re.IGNORECASE, + ) + for m in _RE_VENUE_MULTILINE.finditer(full_raw): + audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"])) + # Phase 1 : masquage ligne par ligne (regex classiques) out_pages: List[str] = [] for i, page_txt in enumerate(pages_text): @@ -2609,6 +2633,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected) # N° Episode protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected) + # N° venue / séjour + protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected) # N° RPPS protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected) # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS) @@ -2740,14 +2766,28 @@ def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], t def _search_whole_word(page, token: str) -> list: """Cherche un token comme mot entier (pas substring) via get_text('words'). - Évite les faux positifs de page.search_for() qui fait du substring matching.""" + Évite les faux positifs de page.search_for() qui fait du substring matching. + Gère les noms composés (JEAN-PIERRE) qui peuvent être splittés par le PDF.""" rects = [] token_lower = token.lower().strip() - for w in page.get_text("words"): + words = page.get_text("words") + for w in words: # w = (x0, y0, x1, y1, word, block_no, line_no, word_no) word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\") if word_text.lower() == token_lower: rects.append(fitz.Rect(w[0], w[1], w[2], w[3])) + # Fallback pour noms composés avec tiret (JEAN-PIERRE) splittés par le PDF + if not rects and "-" in token: + parts = [p for p in token.split("-") if p] + if len(parts) >= 2: + for p in parts: + p_lower = p.lower().strip() + if len(p_lower) < 2: + continue + for w in words: + wt = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\") + if wt.lower() == p_lower: + rects.append(fitz.Rect(w[0], w[1], w[2], w[3])) return rects def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, ocr_word_map: OcrWordMap = None) -> None: @@ -2761,9 +2801,10 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc # Kinds à ne pas chercher dans le PDF (dates masquées uniquement dans le texte, # pas dans le PDF où elles rendent les tableaux illisibles) _VECTOR_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"} - # Kinds dont les tokens courts (< 5) risquent le substring matching via page.search_for() - _VECTOR_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM", - "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"} + # Kinds sensibles au substring matching : utiliser _search_whole_word + _VECTOR_WHOLEWORD_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM", + "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL", + "NER_PER", "NER_ORG", "NER_LOC"} for pno in range(len(doc)): page = doc[pno] hits = by_page.get(pno, []) + by_page.get(-1, []) @@ -2783,26 +2824,33 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc if dedup_key in seen_tokens: continue seen_tokens.add(dedup_key) - if h.kind in _VECTOR_SHORT_TOKEN_KINDS and len(token) < 5: - if token.lower() not in _MEDICAL_STOP_WORDS_SET: + # --- Kinds de type nom/entité : whole-word search pour éviter le + # substring matching (ex: "TATIN" dans "ATORVASTATINE") --- + if h.kind in _VECTOR_WHOLEWORD_KINDS: + if token.lower() in _MEDICAL_STOP_WORDS_SET: + continue + if " " not in token: rects = _search_whole_word(page, token) if not rects and ocr_word_map and pno in ocr_word_map: rects = _search_ocr_words(ocr_word_map[pno], token, page.rect) all_rects.extend(rects) + else: + rects = page.search_for(token) + if not rects: + for word in token.split(): + word = word.strip(" .-'") + if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET: + continue + rects.extend(_search_whole_word(page, word)) + if not rects and ocr_word_map and pno in ocr_word_map: + rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect)) + all_rects.extend(rects) continue rects = page.search_for(token) if not rects and h.kind in {"NIR", "IBAN", "TEL"}: compact = re.sub(r"\s+", "", token) if compact != token: rects = page.search_for(compact) - if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}: - for word in token.split(): - word = word.strip(" .-'") - if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET: - continue - if not word[0].isupper(): - continue - rects.extend(page.search_for(word)) if not rects and ocr_word_map and pno in ocr_word_map: rects = _search_ocr_words(ocr_word_map[pno], token, page.rect) all_rects.extend(rects) @@ -2819,7 +2867,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc def _rasterize_page(args): """Worker parallèle : rasterise une page + dessine les rectangles noirs.""" - pdf_path_str, pno, rects_tuples, dpi, ogc_label = args + pdf_path_str, pno, rects_tuples, dpi, ogc_label, jpeg_quality = args doc = fitz.open(pdf_path_str) src = doc[pno] rect_w, rect_h = src.rect.width, src.rect.height @@ -2851,19 +2899,24 @@ def _rasterize_page(args): draw.rectangle([x - 4, y - 2, x + tw + 4, y + th + 2], fill=(255, 255, 255)) draw.text((x, y), text, fill=(0, 0, 0), font=font) buf = io.BytesIO() - img.save(buf, format="PNG") + if jpeg_quality and jpeg_quality > 0: + img.save(buf, format="JPEG", quality=jpeg_quality, optimize=True) + else: + img.save(buf, format="PNG") doc.close() return pno, buf.getvalue(), rect_w, rect_h -def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None) -> None: +def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 150, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None, jpeg_quality: int = 85) -> None: if fitz is None: raise RuntimeError("PyMuPDF non disponible – installez pymupdf.") doc = fitz.open(str(original_pdf)) all_rects: Dict[int, List["fitz.Rect"]] = {} _RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"} - _RASTER_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM", - "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"} + # Kinds sensibles au substring matching : utiliser _search_whole_word + _RASTER_WHOLEWORD_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM", + "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL", + "NER_PER", "NER_ORG", "NER_LOC"} _VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA", "VLM_NIR", "VLM_IPP", "VLM_RPPS"} by_page: Dict[int, List[PiiHit]] = {} @@ -2887,12 +2940,30 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp if token in seen_tokens: continue seen_tokens.add(token) - if h.kind in _RASTER_SHORT_TOKEN_KINDS and len(token) < 5: - if token.lower() not in _MEDICAL_STOP_WORDS_SET: - found_short = _search_whole_word(page, token) - if not found_short and ocr_word_map and pno in ocr_word_map: - found_short = _search_ocr_words(ocr_word_map[pno], token, page.rect) - rects.extend(found_short) + # --- Kinds de type nom/entité : whole-word search pour éviter le + # substring matching (ex: "TATIN" dans "ATORVASTATINE") --- + if h.kind in _RASTER_WHOLEWORD_KINDS: + if token.lower() in _MEDICAL_STOP_WORDS_SET: + continue + if " " not in token: + # Token mono-mot : chercher comme mot entier + found_ww = _search_whole_word(page, token) + if not found_ww and ocr_word_map and pno in ocr_word_map: + found_ww = _search_ocr_words(ocr_word_map[pno], token, page.rect) + rects.extend(found_ww) + else: + # Token multi-mots : d'abord chercher la chaîne complète + found_multi = page.search_for(token) + if not found_multi: + # Fallback : chercher chaque mot comme mot entier + for word in token.split(): + word = word.strip(" .-'") + if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET: + continue + found_multi.extend(_search_whole_word(page, word)) + if not found_multi and ocr_word_map and pno in ocr_word_map: + found_multi.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect)) + rects.extend(found_multi) continue found = page.search_for(token) if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}: @@ -2926,7 +2997,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp n_workers = min(n_pages, os.cpu_count() or 4) tasks = [ - (str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label) + (str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label, jpeg_quality) for pno in range(n_pages) ] @@ -2935,9 +3006,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp # Assemblage final (séquentiel, rapide) out = fitz.open() - for pno, png_bytes, w, h in results: + for pno, img_bytes, w, h in results: dst = out.new_page(width=w, height=h) - dst.insert_image(fitz.Rect(0, 0, w, h), stream=png_bytes) + dst.insert_image(fitz.Rect(0, 0, w, h), stream=img_bytes) out.save(str(out_pdf), deflate=True, garbage=4, clean=True) out.close() diff --git a/evaluation/baseline_scores.json b/evaluation/baseline_scores.json index 4d1ae6b..fbf0a79 100644 --- a/evaluation/baseline_scores.json +++ b/evaluation/baseline_scores.json @@ -1,18 +1,18 @@ { - "date": "2026-03-11T12:11:24.286697", + "date": "2026-03-12T10:24:59.261417", "scores": { "global_score": 97.0, "leak_score": 100.0, "fp_score": 90, "totals": { "documents": 29, - "audit_hits": 2804, + "audit_hits": 2797, "name_tokens_known": 461, "leak_audit": 0, "leak_occurrences": 0, "leak_regex": 0, "leak_insee_high": 0, - "leak_insee_medium": 568, + "leak_insee_medium": 569, "fp_medical": 0, "fp_overmasking": 2 } @@ -158,7 +158,7 @@ "leak_audit": 0, "leak_regex": 0, "leak_insee_high": 0, - "leak_insee_medium": 18, + "leak_insee_medium": 19, "fp_medical": 0, "fp_overmasking": 0 },