feat: qualité anonymisation — sur-anonymisation, fuites PHI, nettoyage bruit
P0-A: stop words français + seuil subparts 5 chars + sweep conditionnel. P0-B: 6 nouveaux patterns PHI (DDN, Par, N Ipp, Adresse, DEMANDE, venue). P2-C: cohérence pseudonymes (_find_matching_entity) + fix crochets. P1-B: text_cleaner.py — sidebar OCR, footers, dédup vitales, collapse blanks. P1-A: dédup CRH par SequenceMatcher (seuil 85%). Tests: 34 nouveaux tests (996 pass, 0 fail). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,18 +4,26 @@ import pytest
|
||||
|
||||
from src.anonymization.entity_registry import EntityRegistry
|
||||
from src.anonymization.regex_patterns import (
|
||||
CONSULT_ADRESSE_PATTERN,
|
||||
CRH_FOOTER_IPP_EPISODE,
|
||||
CRH_FOOTER_PATIENT_PATTERN,
|
||||
DATE_NAISSANCE_PATTERN,
|
||||
DDN_PATTERN,
|
||||
DEMANDE_NUM_PATTERN,
|
||||
DR_NAME_PATTERN,
|
||||
EMAIL_PATTERN,
|
||||
EPISODE_PATTERN,
|
||||
FOOTER_PATIENT_PATTERN,
|
||||
IPP_PATTERN,
|
||||
NOTE_AUTHOR_PATTERN,
|
||||
N_IPP_PATTERN,
|
||||
PAR_NOM_PATTERN,
|
||||
PHONE_PATTERN,
|
||||
RPPS_PATTERN,
|
||||
VENUE_PATTERN,
|
||||
)
|
||||
from src.extraction.text_cleaner import clean_extracted_text
|
||||
from src.extraction.document_splitter import _dedup_chunks
|
||||
|
||||
|
||||
class TestRegexPatterns:
|
||||
@@ -195,3 +203,217 @@ class TestAnonymizer:
|
||||
|
||||
assert "faudemar@ch-cotebasque.fr" not in result
|
||||
assert "[EMAIL" in result
|
||||
|
||||
|
||||
# --- P0-A : Sur-anonymisation ---
|
||||
|
||||
class TestStopWordsAndSubparts:
    """Stop words and short name sub-parts must never be registered as entities."""

    def test_stop_word_not_registered_as_subpart(self):
        """The stop word 'jean' must not be registered as a name sub-part."""
        registry = EntityRegistry()
        registry.register("Jean Martin", "medecin")
        registered = registry.get_all_entities()
        assert "jean" not in registered
        # NOTE(review): "martin" is 6 chars, so the 5-char threshold alone does not
        # explain this exclusion — presumably it sits in the French stop-word list.
        assert "martin" not in registered

    def test_long_subpart_registered(self):
        """Sub-parts of >= 5 chars that are not stop words are registered."""
        registry = EntityRegistry()
        registry.register("Jean Audemar", "medecin")
        assert registry.is_subpart("audemar")

    def test_sans_not_anonymized_when_dr_sans(self):
        """The preposition 'sans' must survive when a physician is named 'Dr Sans'."""
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(
            parsed_data={"patient": {}, "medecins": ["Sans Martin"], "contacts": []}
        )
        source_text = "Patient sans signe de gravité. Vu par Dr Sans Martin."
        output = anonymizer.anonymize(source_text)

        # Lowercase "sans" inside medical prose must stay untouched.
        assert "sans signe de gravité" in output

    def test_full_name_still_anonymized(self):
        """Full names are still anonymized despite the new exclusion rules."""
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(
            parsed_data={
                "patient": {"nom_prenom": "DUPONT Jean"},
                "medecins": [],
                "contacts": [],
            }
        )
        output = anonymizer.anonymize("Le patient DUPONT Jean est sorti.")

        assert "DUPONT" not in output
|
||||
|
||||
|
||||
# --- P0-B : Fuites PHI ---
|
||||
|
||||
class TestNewPHIPatterns:
    """Specialised PHI patterns (BACTERIO, CONSULTATION, ANAPATH documents)."""

    def test_ddn_pattern_slash(self):
        # Birth date written with slashes.
        match = DDN_PATTERN.search("DDN : 21/01/1948")
        assert match is not None
        assert match.group(1) == "21/01/1948"

    def test_ddn_pattern_dash(self):
        # Birth date written with dashes.
        match = DDN_PATTERN.search("DDN : 21-01-1948")
        assert match is not None
        assert match.group(1) == "21-01-1948"

    def test_par_nom_pattern(self):
        # "Par :" author line must capture the name.
        match = PAR_NOM_PATTERN.search("Par : GENDRE Juliette")
        assert match is not None
        assert "GENDRE" in match.group(1)

    def test_demande_num_pattern(self):
        # Request number on BACTERIO sheets.
        match = DEMANDE_NUM_PATTERN.search("DEMANDE N° 2300126709")
        assert match is not None
        assert match.group(1) == "2300126709"

    def test_venue_pattern(self):
        # Visit ("venue") identifier.
        match = VENUE_PATTERN.search("N° venue : 23111304")
        assert match is not None
        assert match.group(1) == "23111304"

    def test_n_ipp_pattern(self):
        # Alternate IPP spelling "N Ipp".
        match = N_IPP_PATTERN.search("N Ipp : 19029841")
        assert match is not None
        assert match.group(1) == "19029841"

    def test_consult_adresse_pattern(self):
        # Postal address on consultation sheets.
        match = CONSULT_ADRESSE_PATTERN.search(
            "Adresse : 15 rue des Lilas 64100 BAYONNE"
        )
        assert match is not None
        assert "15 rue des Lilas" in match.group(1)
|
||||
|
||||
|
||||
# --- P2-C : Cohérence pseudonymes ---
|
||||
|
||||
class TestEntityMatching:
    """Pseudonym consistency: name subsets/supersets must share one pseudonym."""

    def test_subset_matching_same_pseudo(self):
        """Registering 'MARTIN' then 'MARTIN Pierre' yields the same pseudonym."""
        registry = EntityRegistry()
        pseudo_short = registry.register("MARTIN", "medecin")
        pseudo_full = registry.register("MARTIN Pierre", "medecin")
        assert pseudo_short == pseudo_full

    def test_superset_matching_same_pseudo(self):
        """Registering 'MARTIN Pierre' then 'MARTIN' yields the same pseudonym."""
        registry = EntityRegistry()
        pseudo_full = registry.register("MARTIN Pierre", "medecin")
        pseudo_short = registry.register("MARTIN", "medecin")
        assert pseudo_full == pseudo_short

    def test_fix_double_brackets(self):
        """Doubled closing brackets are repaired and a space is inserted."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("Mme[PERSONNE_14]]") == "Mme [PERSONNE_14]"

    def test_fix_glued_bracket(self):
        """A tag glued onto a preceding word gets a separating space."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("Docteur[MEDECIN_1]") == "Docteur [MEDECIN_1]"
|
||||
|
||||
|
||||
# --- P1-A : Déduplication CRH ---
|
||||
|
||||
class TestDedupChunks:
    """Deduplication of near-identical CRH chunks."""

    def test_identical_chunks_deduped(self):
        # Three byte-identical chunks collapse to one.
        repeated = (
            "MME DUPONT\nCompte rendu du 01/01/2024 au 05/01/2024\nContenu médical..."
        )
        assert len(_dedup_chunks([repeated] * 3)) == 1

    def test_different_chunks_preserved(self):
        # Distinct documents must both survive.
        chunks = [
            "MME DUPONT\nCompte rendu du 01/01/2024\nPancréatite aiguë",
            "M. MARTIN\nCompte rendu du 15/02/2024\nFracture du fémur",
        ]
        assert len(_dedup_chunks(chunks)) == 2

    def test_single_chunk_passthrough(self):
        # A lone chunk passes through unchanged.
        assert len(_dedup_chunks(["Un seul document"])) == 1
|
||||
|
||||
|
||||
# --- P1-B : Text cleaner ---
|
||||
|
||||
class TestTextCleaner:
    """Cleaning of OCR-extracted text (noise lines, footers, duplicates)."""

    def test_single_char_lines_removed(self):
        # Isolated single-character OCR sidebar lines are dropped.
        cleaned = clean_extracted_text("Contenu normal\nA\nB\nSuite du contenu\n")
        assert "\nA\n" not in cleaned
        assert "\nB\n" not in cleaned
        assert "Contenu normal" in cleaned
        assert "Suite du contenu" in cleaned

    def test_page_footer_v1_removed(self):
        # "V1 - Imprime ..." print footers are stripped.
        raw = (
            "Contenu\n"
            "V1 - Imprime le 25/02/2026 a 14:30 par user Page(s): 1 sur 3\n"
            "Suite"
        )
        cleaned = clean_extracted_text(raw)
        assert "V1 - Imprime" not in cleaned
        assert "Contenu" in cleaned

    def test_info_patient_footer_removed(self):
        # "Information patient" page footers are stripped.
        raw = "Contenu\nInformation patient Page 1 25/02/2026 14:30:00\nSuite"
        assert "Information patient" not in clean_extracted_text(raw)

    def test_collapse_blank_lines(self):
        # Runs of blank lines collapse to a single blank line.
        cleaned = clean_extracted_text("Ligne 1\n\n\n\n\nLigne 2")
        assert "\n\n\n" not in cleaned
        assert cleaned == "Ligne 1\n\nLigne 2"

    def test_preserves_medical_content(self):
        # Genuine medical content (including decimals) must be left intact.
        raw = "Pancréatite aiguë biliaire.\nIMC 34.37.\nCholécystectomie."
        cleaned = clean_extracted_text(raw)
        for fragment in ("Pancréatite", "IMC 34.37", "Cholécystectomie"):
            assert fragment in cleaned

    def test_dedup_vital_signs(self):
        # Duplicated vital-sign sections keep only their first occurrence.
        raw = (
            "Signes vitaux\nFC 80 TA 12/8\n\n"
            "Surv. Isolement et Contention\nFC 80 TA 12/8\n\n"
            "Conclusion"
        )
        cleaned = clean_extracted_text(raw)
        assert "Signes vitaux" in cleaned
        assert "Surv. Isolement" not in cleaned
        assert "Conclusion" in cleaned

    def test_patient_footer_dedup(self):
        # Repeated per-page patient footers: first kept, the rest removed.
        footer = "Patient(e) : DUPONT Jean N° Episode 12345678\n"
        raw = footer + "Contenu page 1\n" + footer + "Contenu page 2\n" + footer
        cleaned = clean_extracted_text(raw)
        assert cleaned.count("Patient(e)") == 1
|
||||
|
||||
Reference in New Issue
Block a user