fix: address collaborator feedback - medication FPs, N° venue, PDF size
- Critical fix: whole-word search in redact_pdf_raster and redact_pdf_vector to avoid substring matching (e.g. "Luc" inside "FLUCONAZOLE", "TATIN" inside "ATORVASTATINE"). Applied to all name/NER kinds.
- Added the RE_VENUE_SEJOUR regex for N° venue / N° séjour (BACTERIO, Trackare)
- Widened multiline DDN: tolerates 0-3 intermediate lines between the DDN label and the date (BACTERIO tables)
- Multiline N° venue: detection in interleaved BACTERIO tables
- Smaller raster PDFs: 150 DPI + JPEG quality 85 (was 300 DPI PNG). Average size ratio: 19.5x (was 30-50x)
- Quality score maintained: 97.0/100 (grade A), 0 regressions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
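For reviewers, a standalone sketch of the substring pitfall the whole-word fix addresses (illustrative strings only, not project code):

```python
# A naive substring search flags drug names that merely contain a
# patient's first or last name; a whole-word comparison does not.
token = "Luc"
line = "FLUCONAZOLE 200 mg - 1 cp/j"

# Substring matching (the behaviour of page.search_for): false positive.
print(token.lower() in line.lower())  # True - "Luc" sits inside "FLUCONAZOLE"

# Whole-word comparison (the approach taken by _search_whole_word below).
words = [w.strip(".,;:!?()-/") for w in line.split()]
print(any(w.lower() == token.lower() for w in words))  # False
```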
@@ -912,6 +912,12 @@ RE_EPISODE = re.compile(
     r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
     re.IGNORECASE,
 )
+# N° venue (BACTERIO), N° séjour — identifiants de séjour hospitalier
+RE_VENUE_SEJOUR = re.compile(
+    r"(?:N[°o]?\s*venue|N[°o]?\s*séjour|N[°o]?\s*de\s+séjour"
+    r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})",
+    re.IGNORECASE,
+)
 
 @dataclass
 class PiiHit:
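A quick standalone check of what the new pattern captures (the sample strings are invented):

```python
import re

RE_VENUE_SEJOUR = re.compile(
    r"(?:N[°o]?\s*venue|N[°o]?\s*séjour|N[°o]?\s*de\s+séjour"
    r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})",
    re.IGNORECASE,
)

for sample in ("N° venue : 2024123456", "Numéro de séjour - 4821"):
    m = RE_VENUE_SEJOUR.search(sample)
    print(m.group(1) if m else None)  # "2024123456", then "4821"
```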
@@ -1302,6 +1308,14 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
         return full[:full.find(val)] + PLACEHOLDERS["EPISODE"]
     line = RE_EPISODE.sub(_repl_episode, line)
 
+    # N° venue / N° séjour (BACTERIO, Trackare)
+    def _repl_venue(m: re.Match) -> str:
+        audit.append(PiiHit(page_idx, "NDA", m.group(0), PLACEHOLDERS["NDA"]))
+        full = m.group(0)
+        val = m.group(1)
+        return full[:full.find(val)] + PLACEHOLDERS["NDA"]
+    line = RE_VENUE_SEJOUR.sub(_repl_venue, line)
+
     # Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
     def _repl_etab(m: re.Match) -> str:
         audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"]))
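The callback keeps the label text and replaces only the captured digits. A simplified standalone rendition (the placeholder value is a stand-in and the audit bookkeeping is omitted):

```python
import re

PLACEHOLDER_NDA = "<NDA>"  # stand-in for PLACEHOLDERS["NDA"]
pattern = re.compile(r"(N[°o]?\s*venue)\s*[:\-]?\s*(\d{4,})", re.IGNORECASE)

def repl(m: re.Match) -> str:
    full, val = m.group(0), m.group(2)
    # Keep everything before the captured digits, swap the digits for the tag.
    return full[:full.find(val)] + PLACEHOLDER_NDA

print(pattern.sub(repl, "N° venue : 2024123456"))  # N° venue : <NDA>
```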
@@ -1988,8 +2002,10 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
 
     # Phase 0d : date de naissance multiline (label et date sur lignes séparées)
     # Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950"
+    # Variante large : tolère 0-3 lignes intermédiaires (tableaux BACTERIO)
     _RE_DATE_NAISSANCE_MULTILINE = re.compile(
-        r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*"
+        r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n"
+        r"(?:[^\n]*\n){0,3}\s*"
         r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
         re.IGNORECASE,
     )
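A standalone check of the widened pattern bridging interleaved rows (invented snippet):

```python
import re

_RE_DATE_NAISSANCE_MULTILINE = re.compile(
    r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n"
    r"(?:[^\n]*\n){0,3}\s*"
    r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
    re.IGNORECASE,
)

# Label and date separated by two unrelated table rows, as in BACTERIO output.
snippet = "Né(e) le :\nService de bactériologie\nChambre 12\n07/04/1943"
print(_RE_DATE_NAISSANCE_MULTILINE.search(snippet).group(1))  # 07/04/1943
```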
@@ -2012,6 +2028,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
     for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
         audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
 
+    # Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
+    _RE_VENUE_MULTILINE = re.compile(
+        r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
+        re.IGNORECASE,
+    )
+    for m in _RE_VENUE_MULTILINE.finditer(full_raw):
+        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
+
     # Phase 1 : masquage ligne par ligne (regex classiques)
     out_pages: List[str] = []
     for i, page_txt in enumerate(pages_text):
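And the same idea for the venue number split across table rows (invented snippet):

```python
import re

_RE_VENUE_MULTILINE = re.compile(
    r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
    re.IGNORECASE,
)

# Label on one row, value two rows below, as in interleaved BACTERIO tables.
snippet = "N° venue\nIPP\n8012345678 du 12/03"
print(_RE_VENUE_MULTILINE.search(snippet).group(1))  # 8012345678
```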
@@ -2609,6 +2633,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
     protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
     # N° Episode
     protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
+    # N° venue / séjour
+    protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
     # N° RPPS
     protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
     # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
@@ -2740,14 +2766,28 @@ def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], t
 
 def _search_whole_word(page, token: str) -> list:
     """Cherche un token comme mot entier (pas substring) via get_text('words').
-    Évite les faux positifs de page.search_for() qui fait du substring matching."""
+    Évite les faux positifs de page.search_for() qui fait du substring matching.
+    Gère les noms composés (JEAN-PIERRE) qui peuvent être splittés par le PDF."""
     rects = []
     token_lower = token.lower().strip()
-    for w in page.get_text("words"):
+    words = page.get_text("words")
+    for w in words:
         # w = (x0, y0, x1, y1, word, block_no, line_no, word_no)
         word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
         if word_text.lower() == token_lower:
             rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
+    # Fallback pour noms composés avec tiret (JEAN-PIERRE) splittés par le PDF
+    if not rects and "-" in token:
+        parts = [p for p in token.split("-") if p]
+        if len(parts) >= 2:
+            for p in parts:
+                p_lower = p.lower().strip()
+                if len(p_lower) < 2:
+                    continue
+                for w in words:
+                    wt = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
+                    if wt.lower() == p_lower:
+                        rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
     return rects
 
 
 def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, ocr_word_map: OcrWordMap = None) -> None:
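To reproduce the false positive that motivates _search_whole_word, a throwaway one-page document can be used (assumes pymupdf is installed and the function above is in scope):

```python
import fitz  # pymupdf

doc = fitz.open()          # empty in-memory document
page = doc.new_page()
page.insert_text((72, 72), "FLUCONAZOLE 200 mg - Dr Luc MARTIN")

# search_for is case-insensitive substring matching: it should also hit
# the "LUC" inside "FLUCONAZOLE".
print(page.search_for("Luc"))

# Whole-word search should return only the standalone "Luc".
print(_search_whole_word(page, "Luc"))
```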
@@ -2761,9 +2801,10 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
     # Kinds à ne pas chercher dans le PDF (dates masquées uniquement dans le texte,
     # pas dans le PDF où elles rendent les tableaux illisibles)
     _VECTOR_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
-    # Kinds dont les tokens courts (< 5) risquent le substring matching via page.search_for()
-    _VECTOR_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
-                                 "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
+    # Kinds sensibles au substring matching : utiliser _search_whole_word
+    _VECTOR_WHOLEWORD_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
+                               "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
+                               "NER_PER", "NER_ORG", "NER_LOC"}
     for pno in range(len(doc)):
         page = doc[pno]
         hits = by_page.get(pno, []) + by_page.get(-1, [])
@@ -2783,26 +2824,33 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
             if dedup_key in seen_tokens:
                 continue
             seen_tokens.add(dedup_key)
-            if h.kind in _VECTOR_SHORT_TOKEN_KINDS and len(token) < 5:
-                if token.lower() not in _MEDICAL_STOP_WORDS_SET:
+            # --- Kinds de type nom/entité : whole-word search pour éviter le
+            # substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
+            if h.kind in _VECTOR_WHOLEWORD_KINDS:
+                if token.lower() in _MEDICAL_STOP_WORDS_SET:
+                    continue
+                if " " not in token:
                     rects = _search_whole_word(page, token)
                     if not rects and ocr_word_map and pno in ocr_word_map:
                         rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
                     all_rects.extend(rects)
+                else:
+                    rects = page.search_for(token)
+                    if not rects:
+                        for word in token.split():
+                            word = word.strip(" .-'")
+                            if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
+                                continue
+                            rects.extend(_search_whole_word(page, word))
+                            if not rects and ocr_word_map and pno in ocr_word_map:
+                                rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
+                    all_rects.extend(rects)
                 continue
             rects = page.search_for(token)
             if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                 compact = re.sub(r"\s+", "", token)
                 if compact != token:
                     rects = page.search_for(compact)
-            if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
-                for word in token.split():
-                    word = word.strip(" .-'")
-                    if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
-                        continue
-                    if not word[0].isupper():
-                        continue
-                    rects.extend(page.search_for(word))
             if not rects and ocr_word_map and pno in ocr_word_map:
                 rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
             all_rects.extend(rects)
@@ -2819,7 +2867,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
 
 def _rasterize_page(args):
     """Worker parallèle : rasterise une page + dessine les rectangles noirs."""
-    pdf_path_str, pno, rects_tuples, dpi, ogc_label = args
+    pdf_path_str, pno, rects_tuples, dpi, ogc_label, jpeg_quality = args
     doc = fitz.open(pdf_path_str)
     src = doc[pno]
     rect_w, rect_h = src.rect.width, src.rect.height
@@ -2851,19 +2899,24 @@ def _rasterize_page(args):
         draw.rectangle([x - 4, y - 2, x + tw + 4, y + th + 2], fill=(255, 255, 255))
         draw.text((x, y), text, fill=(0, 0, 0), font=font)
     buf = io.BytesIO()
-    img.save(buf, format="PNG")
+    if jpeg_quality and jpeg_quality > 0:
+        img.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
+    else:
+        img.save(buf, format="PNG")
     doc.close()
     return pno, buf.getvalue(), rect_w, rect_h
 
 
-def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None) -> None:
+def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 150, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None, jpeg_quality: int = 85) -> None:
     if fitz is None:
         raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
     doc = fitz.open(str(original_pdf))
     all_rects: Dict[int, List["fitz.Rect"]] = {}
     _RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
-    _RASTER_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
-                                 "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
+    # Kinds sensibles au substring matching : utiliser _search_whole_word
+    _RASTER_WHOLEWORD_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
+                               "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
+                               "NER_PER", "NER_ORG", "NER_LOC"}
     _VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA",
                           "VLM_NIR", "VLM_IPP", "VLM_RPPS"}
     by_page: Dict[int, List[PiiHit]] = {}
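A rough way to measure the size effect of the new defaults (150 DPI JPEG 85 versus the old 300 DPI PNG), assuming pymupdf and Pillow are installed; "sample.pdf" is a stand-in path:

```python
import io

import fitz  # pymupdf
from PIL import Image

page = fitz.open("sample.pdf")[0]
for dpi, fmt, kwargs in ((300, "PNG", {}),
                         (150, "JPEG", {"quality": 85, "optimize": True})):
    pix = page.get_pixmap(dpi=dpi)  # render the page at the given resolution
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    buf = io.BytesIO()
    img.save(buf, format=fmt, **kwargs)
    print(dpi, fmt, len(buf.getvalue()), "bytes")
```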
@@ -2887,12 +2940,30 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
             if token in seen_tokens:
                 continue
             seen_tokens.add(token)
-            if h.kind in _RASTER_SHORT_TOKEN_KINDS and len(token) < 5:
-                if token.lower() not in _MEDICAL_STOP_WORDS_SET:
-                    found_short = _search_whole_word(page, token)
-                    if not found_short and ocr_word_map and pno in ocr_word_map:
-                        found_short = _search_ocr_words(ocr_word_map[pno], token, page.rect)
-                    rects.extend(found_short)
+            # --- Kinds de type nom/entité : whole-word search pour éviter le
+            # substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
+            if h.kind in _RASTER_WHOLEWORD_KINDS:
+                if token.lower() in _MEDICAL_STOP_WORDS_SET:
+                    continue
+                if " " not in token:
+                    # Token mono-mot : chercher comme mot entier
+                    found_ww = _search_whole_word(page, token)
+                    if not found_ww and ocr_word_map and pno in ocr_word_map:
+                        found_ww = _search_ocr_words(ocr_word_map[pno], token, page.rect)
+                    rects.extend(found_ww)
+                else:
+                    # Token multi-mots : d'abord chercher la chaîne complète
+                    found_multi = page.search_for(token)
+                    if not found_multi:
+                        # Fallback : chercher chaque mot comme mot entier
+                        for word in token.split():
+                            word = word.strip(" .-'")
+                            if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
+                                continue
+                            found_multi.extend(_search_whole_word(page, word))
+                            if not found_multi and ocr_word_map and pno in ocr_word_map:
+                                found_multi.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
+                    rects.extend(found_multi)
                 continue
             found = page.search_for(token)
             if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
@@ -2926,7 +2997,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
 
     n_workers = min(n_pages, os.cpu_count() or 4)
     tasks = [
-        (str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label)
+        (str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label, jpeg_quality)
        for pno in range(n_pages)
    ]
 
@@ -2935,9 +3006,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
 
     # Assemblage final (séquentiel, rapide)
     out = fitz.open()
-    for pno, png_bytes, w, h in results:
+    for pno, img_bytes, w, h in results:
         dst = out.new_page(width=w, height=h)
-        dst.insert_image(fitz.Rect(0, 0, w, h), stream=png_bytes)
+        dst.insert_image(fitz.Rect(0, 0, w, h), stream=img_bytes)
     out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
     out.close()
 
@@ -1,18 +1,18 @@
 {
-  "date": "2026-03-11T12:11:24.286697",
+  "date": "2026-03-12T10:24:59.261417",
   "scores": {
     "global_score": 97.0,
     "leak_score": 100.0,
     "fp_score": 90,
     "totals": {
       "documents": 29,
-      "audit_hits": 2804,
+      "audit_hits": 2797,
       "name_tokens_known": 461,
       "leak_audit": 0,
       "leak_occurrences": 0,
       "leak_regex": 0,
       "leak_insee_high": 0,
-      "leak_insee_medium": 568,
+      "leak_insee_medium": 569,
       "fp_medical": 0,
       "fp_overmasking": 2
     }
@@ -158,7 +158,7 @@
       "leak_audit": 0,
       "leak_regex": 0,
       "leak_insee_high": 0,
-      "leak_insee_medium": 18,
+      "leak_insee_medium": 19,
       "fp_medical": 0,
       "fp_overmasking": 0
     },