feat: qualité anonymisation — sur-anonymisation, fuites PHI, nettoyage bruit

P0-A: stop words français + seuil subparts 5 chars + sweep conditionnel
P0-B: 6 nouveaux patterns PHI (DDN, Par, N Ipp, Adresse, DEMANDE, venue)
P2-C: cohérence pseudonymes (_find_matching_entity) + fix crochets
P1-B: text_cleaner.py — sidebar OCR, footers, dédup vitales, collapse blanks
P1-A: dédup CRH par SequenceMatcher (seuil 85%)
Tests: 34 nouveaux tests (996 pass, 0 fail)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-25 14:00:07 +01:00
parent 63354e75bc
commit f4a23a5f43
7 changed files with 499 additions and 8 deletions

View File

@@ -12,6 +12,7 @@ from __future__ import annotations
import re
import logging
from difflib import SequenceMatcher
logger = logging.getLogger(__name__)
@@ -121,4 +122,35 @@ def _split_crh(text: str) -> list[str]:
end = crh_starts[i + 1] if i + 1 < len(crh_starts) else len(text)
chunks.append(text[start:end].rstrip())
# Déduplication : supprimer les copies quasi-identiques (destinataires multiples)
chunks = _dedup_chunks(chunks)
return chunks
def _dedup_chunks(chunks: list[str], threshold: float = 0.85) -> list[str]:
"""Supprime les chunks quasi-identiques (copies pour destinataires multiples).
Compare les 500 premiers caractères de chaque paire.
Si le ratio de similarité > threshold, le doublon est supprimé.
"""
if len(chunks) <= 1:
return chunks
duplicates: set[int] = set()
for i in range(len(chunks)):
if i in duplicates:
continue
for j in range(i + 1, len(chunks)):
if j in duplicates:
continue
ratio = SequenceMatcher(
None,
chunks[i][:500],
chunks[j][:500],
).ratio()
if ratio > threshold:
duplicates.add(j)
logger.info(" CRH chunk %d doublon de %d (ratio=%.2f), supprimé", j + 1, i + 1, ratio)
return [c for idx, c in enumerate(chunks) if idx not in duplicates]