fix: corrections retours collaborateurs — FP médicaments, N° venue, taille PDF

- Fix critique: whole-word search dans redact_pdf_raster et redact_pdf_vector
  pour éviter le substring matching (ex: "Luc" dans "FLUCONAZOLE",
  "TATIN" dans "ATORVASTATINE"). Appliqué à tous les kinds nom/NER.
- Ajout regex RE_VENUE_SEJOUR pour N° venue / N° séjour (BACTERIO, Trackare)
- DDN multiline élargi: tolère 0-3 lignes entre label DDN et date (tableaux BACTERIO)
- N° venue multiline: détection dans tableaux BACTERIO interleaved
- Réduction taille PDF raster: 150 DPI + JPEG quality 85 (était 300 DPI PNG)
  Ratio moyen: 19.5x (était 30-50x)
- Score qualité maintenu: 97.0/100 (grade A), 0 régression

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-12 10:38:27 +01:00
parent eb14cd219d
commit a827d860f1
2 changed files with 105 additions and 34 deletions

View File

@@ -912,6 +912,12 @@ RE_EPISODE = re.compile(
r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
re.IGNORECASE,
)
# "N° venue" (BACTERIO) / "N° séjour": hospital-stay identifiers.
# Matches the label in several spellings ("N° venue", "No séjour",
# "numéro de venue/séjour", ...) followed by an optional ':' or '-'.
# Capture group 1 is the numeric identifier (at least 4 digits).
RE_VENUE_SEJOUR = re.compile(
    r"(?:N[°o]?\s*venue|N[°o]?\s*séjour|N[°o]?\s*de\s+séjour"
    r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})",
    re.IGNORECASE,
)
@dataclass
class PiiHit:
@@ -1302,6 +1308,14 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
return full[:full.find(val)] + PLACEHOLDERS["EPISODE"]
line = RE_EPISODE.sub(_repl_episode, line)
# "N° venue" / "N° séjour" (BACTERIO, Trackare): mask the stay identifier
# with the NDA placeholder while keeping the label prefix readable.
def _repl_venue(m: re.Match) -> str:
    # Record the full matched text in the audit trail under the NDA kind.
    audit.append(PiiHit(page_idx, "NDA", m.group(0), PLACEHOLDERS["NDA"]))
    full = m.group(0)
    val = m.group(1)
    # Keep everything up to the numeric value (i.e. the label), replace the rest.
    return full[:full.find(val)] + PLACEHOLDERS["NDA"]
line = RE_VENUE_SEJOUR.sub(_repl_venue, line)
# Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
def _repl_etab(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"]))
@@ -1988,8 +2002,10 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
# Phase 0d: multi-line date of birth (label and date on separate lines).
# E.g. "Né(e) le :\n07/04/1943" or "Date de naissance\n01/02/1950".
# Wide variant: tolerates 0-3 intermediate lines between the label and the
# date (interleaved BACTERIO table rows).
# NOTE: the previous revision's line (label pattern ending in `\s*\n\s*`) was
# left next to its replacement; the two adjacent string literals concatenated
# into a pattern requiring the label twice, so it never matched. Only the new
# variant is kept here.
_RE_DATE_NAISSANCE_MULTILINE = re.compile(
    r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n"
    r"(?:[^\n]*\n){0,3}\s*"
    r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
    re.IGNORECASE,
)
@@ -2012,6 +2028,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 0g: multi-line "N° venue" (BACTERIO tables: label and value end up on
# separate lines); tolerates up to 2 interleaved lines between label and value.
_RE_VENUE_MULTILINE = re.compile(
    r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
    re.IGNORECASE,
)
for m in _RE_VENUE_MULTILINE.finditer(full_raw):
    # Page index -1: document-level hit (found in the concatenated raw text,
    # not attributable to one page; merged into every page at redaction time).
    audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
# Phase 1 : masquage ligne par ligne (regex classiques)
out_pages: List[str] = []
for i, page_txt in enumerate(pages_text):
@@ -2609,6 +2633,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
# N° Episode
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
# N° venue / séjour
protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
# N° RPPS
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
@@ -2740,14 +2766,28 @@ def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], t
def _search_whole_word(page, token: str) -> list:
    """Search *token* as a whole word (not a substring) via get_text('words').

    Avoids the false positives of page.search_for(), which does substring
    matching (e.g. "Luc" inside "FLUCONAZOLE", "TATIN" inside
    "ATORVASTATINE"). Also handles hyphenated compound names (JEAN-PIERRE)
    that the PDF layout may split into separate words.

    Returns a list of fitz.Rect bounding boxes (empty when nothing matches).

    NOTE(review): the rendered diff left the pre-change docstring terminator
    and the pre-change loop header interleaved with their replacements; this
    body is the reconstructed post-change version.
    """
    rects = []
    token_lower = token.lower().strip()
    words = page.get_text("words")
    for w in words:
        # w = (x0, y0, x1, y1, word, block_no, line_no, word_no)
        word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
        if word_text.lower() == token_lower:
            rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
    # Fallback for hyphenated compound names (JEAN-PIERRE) split by the PDF:
    # redact each part (>= 2 chars) individually, but only when the full
    # token was not found as-is.
    if not rects and "-" in token:
        parts = [p for p in token.split("-") if p]
        if len(parts) >= 2:
            for p in parts:
                p_lower = p.lower().strip()
                if len(p_lower) < 2:
                    continue
                for w in words:
                    wt = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
                    if wt.lower() == p_lower:
                        rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
    return rects
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, ocr_word_map: OcrWordMap = None) -> None:
@@ -2761,9 +2801,10 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
# Kinds à ne pas chercher dans le PDF (dates masquées uniquement dans le texte,
# pas dans le PDF où elles rendent les tableaux illisibles)
_VECTOR_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
# Kinds dont les tokens courts (< 5) risquent le substring matching via page.search_for()
_VECTOR_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
# Kinds sensibles au substring matching : utiliser _search_whole_word
_VECTOR_WHOLEWORD_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
"NER_PER", "NER_ORG", "NER_LOC"}
for pno in range(len(doc)):
page = doc[pno]
hits = by_page.get(pno, []) + by_page.get(-1, [])
@@ -2783,26 +2824,33 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
if dedup_key in seen_tokens:
continue
seen_tokens.add(dedup_key)
if h.kind in _VECTOR_SHORT_TOKEN_KINDS and len(token) < 5:
if token.lower() not in _MEDICAL_STOP_WORDS_SET:
# --- Kinds de type nom/entité : whole-word search pour éviter le
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
if h.kind in _VECTOR_WHOLEWORD_KINDS:
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if " " not in token:
rects = _search_whole_word(page, token)
if not rects and ocr_word_map and pno in ocr_word_map:
rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
all_rects.extend(rects)
else:
rects = page.search_for(token)
if not rects:
for word in token.split():
word = word.strip(" .-'")
if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
rects.extend(_search_whole_word(page, word))
if not rects and ocr_word_map and pno in ocr_word_map:
rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
all_rects.extend(rects)
continue
rects = page.search_for(token)
if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
compact = re.sub(r"\s+", "", token)
if compact != token:
rects = page.search_for(compact)
if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
for word in token.split():
word = word.strip(" .-'")
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if not word[0].isupper():
continue
rects.extend(page.search_for(word))
if not rects and ocr_word_map and pno in ocr_word_map:
rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
all_rects.extend(rects)
@@ -2819,7 +2867,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
def _rasterize_page(args):
"""Worker parallèle : rasterise une page + dessine les rectangles noirs."""
pdf_path_str, pno, rects_tuples, dpi, ogc_label = args
pdf_path_str, pno, rects_tuples, dpi, ogc_label, jpeg_quality = args
doc = fitz.open(pdf_path_str)
src = doc[pno]
rect_w, rect_h = src.rect.width, src.rect.height
@@ -2851,19 +2899,24 @@ def _rasterize_page(args):
draw.rectangle([x - 4, y - 2, x + tw + 4, y + th + 2], fill=(255, 255, 255))
draw.text((x, y), text, fill=(0, 0, 0), font=font)
buf = io.BytesIO()
img.save(buf, format="PNG")
if jpeg_quality and jpeg_quality > 0:
img.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
else:
img.save(buf, format="PNG")
doc.close()
return pno, buf.getvalue(), rect_w, rect_h
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None) -> None:
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 150, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None, jpeg_quality: int = 85) -> None:
if fitz is None:
raise RuntimeError("PyMuPDF non disponible installez pymupdf.")
doc = fitz.open(str(original_pdf))
all_rects: Dict[int, List["fitz.Rect"]] = {}
_RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
_RASTER_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
# Kinds sensibles au substring matching : utiliser _search_whole_word
_RASTER_WHOLEWORD_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
"NER_PER", "NER_ORG", "NER_LOC"}
_VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA",
"VLM_NIR", "VLM_IPP", "VLM_RPPS"}
by_page: Dict[int, List[PiiHit]] = {}
@@ -2887,12 +2940,30 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
if token in seen_tokens:
continue
seen_tokens.add(token)
if h.kind in _RASTER_SHORT_TOKEN_KINDS and len(token) < 5:
if token.lower() not in _MEDICAL_STOP_WORDS_SET:
found_short = _search_whole_word(page, token)
if not found_short and ocr_word_map and pno in ocr_word_map:
found_short = _search_ocr_words(ocr_word_map[pno], token, page.rect)
rects.extend(found_short)
# --- Kinds de type nom/entité : whole-word search pour éviter le
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
if h.kind in _RASTER_WHOLEWORD_KINDS:
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if " " not in token:
# Token mono-mot : chercher comme mot entier
found_ww = _search_whole_word(page, token)
if not found_ww and ocr_word_map and pno in ocr_word_map:
found_ww = _search_ocr_words(ocr_word_map[pno], token, page.rect)
rects.extend(found_ww)
else:
# Token multi-mots : d'abord chercher la chaîne complète
found_multi = page.search_for(token)
if not found_multi:
# Fallback : chercher chaque mot comme mot entier
for word in token.split():
word = word.strip(" .-'")
if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
found_multi.extend(_search_whole_word(page, word))
if not found_multi and ocr_word_map and pno in ocr_word_map:
found_multi.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
rects.extend(found_multi)
continue
found = page.search_for(token)
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
@@ -2926,7 +2997,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
n_workers = min(n_pages, os.cpu_count() or 4)
tasks = [
(str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label)
(str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label, jpeg_quality)
for pno in range(n_pages)
]
@@ -2935,9 +3006,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
# Assemblage final (séquentiel, rapide)
out = fitz.open()
for pno, png_bytes, w, h in results:
for pno, img_bytes, w, h in results:
dst = out.new_page(width=w, height=h)
dst.insert_image(fitz.Rect(0, 0, w, h), stream=png_bytes)
dst.insert_image(fitz.Rect(0, 0, w, h), stream=img_bytes)
out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
out.close()

View File

@@ -1,18 +1,18 @@
{
"date": "2026-03-11T12:11:24.286697",
"date": "2026-03-12T10:24:59.261417",
"scores": {
"global_score": 97.0,
"leak_score": 100.0,
"fp_score": 90,
"totals": {
"documents": 29,
"audit_hits": 2804,
"audit_hits": 2797,
"name_tokens_known": 461,
"leak_audit": 0,
"leak_occurrences": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 568,
"leak_insee_medium": 569,
"fp_medical": 0,
"fp_overmasking": 2
}
@@ -158,7 +158,7 @@
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 18,
"leak_insee_medium": 19,
"fp_medical": 0,
"fp_overmasking": 0
},