"""Tests for the anonymization module."""

import pytest

from src.anonymization.entity_registry import EntityRegistry
from src.anonymization.regex_patterns import (
    BACTERIO_NOM_HEADER_PATTERN,
    CONSULT_ADRESSE_PATTERN,
    CRH_FOOTER_IPP_EPISODE,
    CRH_FOOTER_PATIENT_PATTERN,
    DATE_NAISSANCE_PATTERN,
    DDN_PATTERN,
    DEMANDE_NUM_PATTERN,
    DR_NAME_PATTERN,
    EMAIL_PATTERN,
    EPISODE_PATTERN,
    FOOTER_PATIENT_PATTERN,
    IPP_PATTERN,
    NOM_NAISSANCE_TRACKARE_PATTERN,
    NOTE_AUTHOR_PATTERN,
    N_IPP_PATTERN,
    PAR_NOM_PATTERN,
    PERSONNE_PREVENIR_PATTERN,
    PHONE_INTL_PATTERN,
    PHONE_PATTERN,
    PRENOM_NAISSANCE_PATTERN,
    RPPS_PATTERN,
    VENUE_PATTERN,
)
from src.extraction.text_cleaner import clean_extracted_text
from src.extraction.document_splitter import _dedup_chunks


def _must_match(pattern, text):
    """Search *text* with *pattern*, assert that it matched, return the match."""
    found = pattern.search(text)
    assert found is not None
    return found


def _anonymizer_cls():
    """Return the ``Anonymizer`` class, importing it at call time."""
    from src.anonymization.anonymizer import Anonymizer

    return Anonymizer


def _blank_anonymizer():
    """Build an ``Anonymizer`` from parsed data with empty patient/medecins/contacts.

    A fresh dict is created on every call so tests never share parsed data.
    """
    return _anonymizer_cls()(
        parsed_data={"patient": {}, "medecins": [], "contacts": []}
    )


class TestRegexPatterns:
    """Each pattern is searched against a sample string and its capture checked."""

    def test_ipp_with_colon(self):
        assert _must_match(IPP_PATTERN, "IPP: 01306172").group(1) == "01306172"

    def test_ipp_without_colon(self):
        assert _must_match(IPP_PATTERN, "IPP 01306172").group(1) == "01306172"

    def test_ipp_in_parentheses(self):
        # The parenthesised form is captured by the second group.
        assert _must_match(IPP_PATTERN, "(01306172 )").group(2) == "01306172"

    def test_episode_no(self):
        found = _must_match(EPISODE_PATTERN, "Episode No: 23042753")
        assert found.group(1) == "23042753"

    def test_episode_n_degree(self):
        # The "N° Episode" form is captured by the second group.
        found = _must_match(EPISODE_PATTERN, "N° Episode 23042753")
        assert found.group(2) == "23042753"

    def test_phone_dots(self):
        found = _must_match(PHONE_PATTERN, "06.25.39.26.82")
        assert found.group(0) == "06.25.39.26.82"

    def test_phone_spaces(self):
        _must_match(PHONE_PATTERN, "05 59 44 35 35")

    def test_email(self):
        found = _must_match(EMAIL_PATTERN, "faudemar@ch-cotebasque.fr")
        assert found.group(0) == "faudemar@ch-cotebasque.fr"

    def test_rpps(self):
        found = _must_match(RPPS_PATTERN, "RPPS : 10100532760")
        assert found.group(1) == "10100532760"

    def test_date_naissance_nee_le(self):
        found = _must_match(DATE_NAISSANCE_PATTERN, "née le 23/02/1980")
        assert found.group(1) == "23/02/1980"

    def test_date_naissance_ne_e_le(self):
        found = _must_match(DATE_NAISSANCE_PATTERN, "Né(e) le 23/02/1980")
        assert found.group(1) == "23/02/1980"

    def test_date_naissance_field(self):
        found = _must_match(
            DATE_NAISSANCE_PATTERN, "Date de naissance: 23/02/1980"
        )
        assert found.group(1) == "23/02/1980"

    def test_dr_name(self):
        found = _must_match(DR_NAME_PATTERN, "Dr F. AUDEMAR")
        assert "AUDEMAR" in found.group(1)

    def test_dr_name_docteur(self):
        found = _must_match(DR_NAME_PATTERN, "Docteur AUDEMAR Franck")
        assert "AUDEMAR" in found.group(1)

    def test_note_author(self):
        found = _must_match(
            NOTE_AUTHOR_PATTERN, "Note IDE Annie GUIRESSE Non algique"
        )
        assert found.group(1) == "Annie GUIRESSE"

    def test_footer_patient_trackare(self):
        footer = "Patient: CLIER NARBAIS AUDREY - Date de naissance: 23/02/1980"
        found = _must_match(FOOTER_PATIENT_PATTERN, footer)
        assert "CLIER" in found.group(1)

    def test_crh_footer_patient(self):
        footer = "Patient(e) : CLIER AUDREY NARBAIS Né(e) le 23/02/1980"
        found = _must_match(CRH_FOOTER_PATIENT_PATTERN, footer)
        assert "CLIER" in found.group(1)

    def test_crh_footer_ipp_episode(self):
        footer = "IPP 01306172 / N° Episode 23042753 (MEDECINE GASTRO B2 HC)"
        found = _must_match(CRH_FOOTER_IPP_EPISODE, footer)
        assert found.group(1) == "01306172"
        assert found.group(2) == "23042753"


class TestEntityRegistry:
    """Pseudonym assignment and lookup through ``EntityRegistry``."""

    def test_register_returns_pseudo(self):
        registry = EntityRegistry()
        assert registry.register("Jean Dupont", "patient") == "[PATIENT_1]"

    def test_register_same_entity_returns_same(self):
        registry = EntityRegistry()
        first = registry.register("Jean Dupont", "patient")
        second = registry.register("Jean Dupont", "patient")
        assert first == second

    def test_register_case_insensitive(self):
        registry = EntityRegistry()
        upper = registry.register("Jean DUPONT", "patient")
        lower = registry.register("jean dupont", "patient")
        assert upper == lower

    def test_register_different_categories(self):
        registry = EntityRegistry()
        patient_tag = registry.register("Dupont", "patient")
        doctor_tag = registry.register("Martin", "medecin")
        assert patient_tag == "[PATIENT_1]"
        assert doctor_tag == "[MEDECIN_1]"

    def test_get_replacement(self):
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        assert registry.get_replacement("jean dupont") == "[PATIENT_1]"
        assert registry.get_replacement("inconnu") is None


class TestAnonymizer:
    """Checks of ``Anonymizer.anonymize`` on short sample texts."""

    def test_anonymize_basic(self):
        parsed = {
            "patient": {"nom_prenom": "DUPONT Jean", "nom_naissance": "DUPONT"},
            "medecins": ["MARTIN Pierre"],
            "contacts": [],
        }
        anonymizer = _anonymizer_cls()(parsed_data=parsed)
        result = anonymizer.anonymize(
            "Le patient DUPONT Jean a été vu par Dr MARTIN Pierre."
        )
        assert "DUPONT" not in result
        assert "MARTIN" not in result
        assert any(tag in result for tag in ("[PATIENT", "[MEDECIN"))

    def test_preserves_medical_content(self):
        anonymizer = _blank_anonymizer()
        result = anonymizer.anonymize(
            "Pancréatite aiguë biliaire. Cholécystectomie par cœlioscopie. IMC 34.37."
        )
        assert "Pancréatite" in result
        assert "Cholécystectomie" in result
        assert "IMC" in result

    def test_anonymize_phone(self):
        anonymizer = _blank_anonymizer()
        result = anonymizer.anonymize(
            "Appeler le 06.25.39.26.82 pour le rendez-vous."
        )
        assert "06.25.39.26.82" not in result
        assert "[TEL" in result

    def test_anonymize_email(self):
        anonymizer = _blank_anonymizer()
        result = anonymizer.anonymize("Contact: faudemar@ch-cotebasque.fr")
        assert "faudemar@ch-cotebasque.fr" not in result
        assert "[EMAIL" in result


# --- P0-A: Over-anonymization ---


class TestStopWordsAndSubparts:
    """Check that stop words and short sub-parts are not registered."""

    def test_stop_word_not_registered_as_subpart(self):
        """Real stop words (sans, dans, avec) must not become sub-parts."""
        registry = EntityRegistry()
        registry.register("Sans Martin", "medecin")
        assert not registry.is_subpart("sans")

    def test_prenoms_are_subparts(self):
        """First names (jean, paul, marie) are PHI and must be sub-parts."""
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        assert registry.is_subpart("jean")
        assert registry.is_subpart("dupont")

    def test_short_parts_excluded(self):
        """Sub-parts shorter than 4 chars are excluded (too many false positives)."""
        registry = EntityRegistry()
        registry.register("Ré Dupont", "patient")
        assert not registry.is_subpart("ré")
        assert registry.is_subpart("dupont")

    def test_long_subpart_registered(self):
        """Sub-parts of 4+ chars that are not stop words are registered."""
        registry = EntityRegistry()
        registry.register("Jean Audemar", "medecin")
        assert registry.is_subpart("audemar")
        assert registry.is_subpart("jean")

    def test_sans_not_anonymized_when_dr_sans(self):
        """'sans' must not be replaced when a doctor is named 'Dr Sans'."""
        parsed = {
            "patient": {},
            "medecins": ["Sans Martin"],
            "contacts": [],
        }
        anonymizer = _anonymizer_cls()(parsed_data=parsed)
        result = anonymizer.anonymize(
            "Patient sans signe de gravité. Vu par Dr Sans Martin."
        )
        # Lowercase "sans" inside the medical text must be left untouched.
        assert "sans signe de gravité" in result

    def test_full_name_still_anonymized(self):
        """Full names are still anonymized under the new rules."""
        parsed = {
            "patient": {"nom_prenom": "DUPONT Jean"},
            "medecins": [],
            "contacts": [],
        }
        anonymizer = _anonymizer_cls()(parsed_data=parsed)
        result = anonymizer.anonymize("Le patient DUPONT Jean est sorti.")
        assert "DUPONT" not in result


# --- P0-B: PHI leaks ---


class TestNewPHIPatterns:
    """Tests for the specialized PHI patterns (BACTERIO, CONSULTATION, ANAPATH)."""

    def test_ddn_pattern_slash(self):
        found = _must_match(DDN_PATTERN, "DDN : 21/01/1948")
        assert found.group(1) == "21/01/1948"

    def test_ddn_pattern_dash(self):
        found = _must_match(DDN_PATTERN, "DDN : 21-01-1948")
        assert found.group(1) == "21-01-1948"

    def test_par_nom_pattern(self):
        found = _must_match(PAR_NOM_PATTERN, "Par : GENDRE Juliette")
        assert "GENDRE" in found.group(1)

    def test_demande_num_pattern(self):
        found = _must_match(DEMANDE_NUM_PATTERN, "DEMANDE N° 2300126709")
        assert found.group(1) == "2300126709"

    def test_venue_pattern(self):
        found = _must_match(VENUE_PATTERN, "N° venue : 23111304")
        assert found.group(1) == "23111304"

    def test_n_ipp_pattern(self):
        found = _must_match(N_IPP_PATTERN, "N Ipp : 19029841")
        assert found.group(1) == "19029841"

    def test_consult_adresse_pattern(self):
        found = _must_match(
            CONSULT_ADRESSE_PATTERN, "Adresse : 15 rue des Lilas 64100 BAYONNE"
        )
        assert "15 rue des Lilas" in found.group(1)

    def test_date_naissance_with_dashes(self):
        """Date of birth written DD-MM-YYYY (Trackare format)."""
        found = _must_match(
            DATE_NAISSANCE_PATTERN, "Date de naissance : 21-01-1948"
        )
        assert found.group(1) == "21-01-1948"

    def test_phone_intl_pattern(self):
        """International phone number +33(0)XXXXXXXXX."""
        _must_match(PHONE_INTL_PATTERN, "☎ +33(0)156125400")

    def test_phone_intl_with_spaces(self):
        _must_match(PHONE_INTL_PATTERN, "+33 1 56 12 54 00")

    def test_personne_prevenir_pattern(self):
        found = _must_match(
            PERSONNE_PREVENIR_PATTERN, "Personne à prévenir EICHE 06 27 56 38"
        )
        assert "EICHE" in found.group(1)

    def test_nom_naissance_trackare(self):
        found = _must_match(
            NOM_NAISSANCE_TRACKARE_PATTERN, "Nom de naissance : EICHE"
        )
        assert found.group(1).strip() == "EICHE"

    def test_nom_utilise_trackare(self):
        found = _must_match(NOM_NAISSANCE_TRACKARE_PATTERN, "Nom utilisé : DUPONT")
        assert found.group(1).strip() == "DUPONT"

    def test_prenom_naissance_pattern(self):
        found = _must_match(
            PRENOM_NAISSANCE_PATTERN, "Prénom de naissance : MARIE"
        )
        assert found.group(1) == "MARIE"

    def test_prenom_1er_pattern(self):
        found = _must_match(
            PRENOM_NAISSANCE_PATTERN, "1er prénom de naissance: MARIE"
        )
        assert found.group(1) == "MARIE"

    def test_bacterio_nom_header_pattern(self):
        """BACTERIO header: NAME Firstname on the line before 'Nom usuel :'."""
        header = "Compte renduComplet\nURTIZVEREA Marie\nNom usuel : EICHE URGENCES"
        found = _must_match(BACTERIO_NOM_HEADER_PATTERN, header)
        assert found.group(1) == "URTIZVEREA Marie"

    def test_bacterio_nom_header_nom_de_naissance(self):
        """BACTERIO header: NAME Firstname before 'Nom de naissance :'."""
        header = "DUPONT Jean-Pierre\nNom de naissance : MARTIN"
        found = _must_match(BACTERIO_NOM_HEADER_PATTERN, header)
        assert found.group(1) == "DUPONT Jean-Pierre"


# --- P2-C: Pseudonym consistency ---


class TestEntityMatching:
    """Tests for pseudonym consistency (name subsets) and bracket clean-up."""

    def test_subset_matching_same_pseudo(self):
        """Registering 'MARTIN' then 'MARTIN Pierre' yields the same pseudonym."""
        registry = EntityRegistry()
        short_form = registry.register("MARTIN", "medecin")
        long_form = registry.register("MARTIN Pierre", "medecin")
        assert short_form == long_form

    def test_superset_matching_same_pseudo(self):
        """Registering 'MARTIN Pierre' then 'MARTIN' yields the same pseudonym."""
        registry = EntityRegistry()
        long_form = registry.register("MARTIN Pierre", "medecin")
        short_form = registry.register("MARTIN", "medecin")
        assert long_form == short_form

    def test_no_cross_category_match_via_subset(self):
        """A patient and an address do NOT match through a shared subset."""
        registry = EntityRegistry()
        person_tag = registry.register("MARTIN Pierre", "patient")
        address_tag = registry.register("MARTIN Rue", "adresse")
        # "MARTIN" is shared but the categories are incompatible -> no match.
        assert person_tag != address_tag

    def test_person_categories_compatible(self):
        """Person categories (patient/medecin/soignant) are mutually compatible."""
        registry = EntityRegistry()
        as_patient = registry.register("DUPONT", "patient")
        as_doctor = registry.register("DUPONT Pierre", "medecin")
        # Compatible person categories -> same pseudonym.
        assert as_patient == as_doctor

    def test_fix_double_brackets(self):
        """Doubled closing brackets are repaired."""
        fixed = _anonymizer_cls()._fix_brackets("Mme[PERSONNE_14]]")
        assert fixed == "Mme [PERSONNE_14]"

    def test_fix_double_open_brackets(self):
        """Doubled opening brackets are repaired."""
        fixed = _anonymizer_cls()._fix_brackets("Dr [[PERSONNE_7]")
        assert fixed == "Dr [PERSONNE_7]"

    def test_fix_orphan_digits(self):
        """Orphan digits trailing a tag inside brackets are removed."""
        fixed = _anonymizer_cls()._fix_brackets("[PERSONNE_6]10]")
        assert fixed == "[PERSONNE_6]"

    def test_fix_orphan_underscore_digits(self):
        """Orphan '_N]' fragments are removed."""
        fixed = _anonymizer_cls()._fix_brackets("[PERSONNE_4]_2]")
        assert fixed == "[PERSONNE_4]"

    def test_fix_glued_tags(self):
        """Two tags glued together as '][' get a separating space."""
        fixed = _anonymizer_cls()._fix_brackets("[MEDECIN_5][MEDECIN_6]")
        assert fixed == "[MEDECIN_5] [MEDECIN_6]"

    def test_fix_glued_bracket(self):
        """A tag glued to a word gets a separating space."""
        fixed = _anonymizer_cls()._fix_brackets("Docteur[MEDECIN_1]")
        assert fixed == "Docteur [MEDECIN_1]"


# --- P1-A: CRH deduplication ---


class TestDedupChunks:
    """Tests for the deduplication of CRH chunks."""

    def test_identical_chunks_deduped(self):
        report = (
            "MME DUPONT\nCompte rendu du 01/01/2024 au 05/01/2024\nContenu médical..."
        )
        # Three equal chunks are expected to collapse into one.
        assert len(_dedup_chunks([report] * 3)) == 1

    def test_different_chunks_preserved(self):
        chunks = [
            "MME DUPONT\nCompte rendu du 01/01/2024\nPancréatite aiguë",
            "M. MARTIN\nCompte rendu du 15/02/2024\nFracture du fémur",
        ]
        assert len(_dedup_chunks(chunks)) == 2

    def test_single_chunk_passthrough(self):
        assert len(_dedup_chunks(["Un seul document"])) == 1


# --- P1-B: Text cleaner ---


class TestTextCleaner:
    """Tests for the clean-up of OCR text."""

    def test_single_char_lines_removed(self):
        cleaned = clean_extracted_text("Contenu normal\nA\nB\nSuite du contenu\n")
        assert "\nA\n" not in cleaned
        assert "\nB\n" not in cleaned
        assert "Contenu normal" in cleaned
        assert "Suite du contenu" in cleaned

    def test_page_footer_v1_removed(self):
        raw = (
            "Contenu\n"
            "V1 - Imprime le 25/02/2026 a 14:30 par user Page(s): 1 sur 3\n"
            "Suite"
        )
        cleaned = clean_extracted_text(raw)
        assert "V1 - Imprime" not in cleaned
        assert "Contenu" in cleaned

    def test_info_patient_footer_removed(self):
        raw = "Contenu\nInformation patient Page 1 25/02/2026 14:30:00\nSuite"
        assert "Information patient" not in clean_extracted_text(raw)

    def test_collapse_blank_lines(self):
        cleaned = clean_extracted_text("Ligne 1\n\n\n\n\nLigne 2")
        assert "\n\n\n" not in cleaned
        assert cleaned == "Ligne 1\n\nLigne 2"

    def test_preserves_medical_content(self):
        raw = "Pancréatite aiguë biliaire.\nIMC 34.37.\nCholécystectomie."
        cleaned = clean_extracted_text(raw)
        assert "Pancréatite" in cleaned
        assert "IMC 34.37" in cleaned
        assert "Cholécystectomie" in cleaned

    def test_dedup_vital_signs(self):
        raw = (
            "Signes vitaux\nFC 80 TA 12/8\n\n"
            "Surv. Isolement et Contention\nFC 80 TA 12/8\n\n"
            "Conclusion"
        )
        cleaned = clean_extracted_text(raw)
        assert "Signes vitaux" in cleaned
        assert "Surv. Isolement" not in cleaned
        assert "Conclusion" in cleaned

    def test_patient_footer_dedup(self):
        raw = (
            "Patient(e) : DUPONT Jean N° Episode 12345678\n"
            "Contenu page 1\n"
            "Patient(e) : DUPONT Jean N° Episode 12345678\n"
            "Contenu page 2\n"
            "Patient(e) : DUPONT Jean N° Episode 12345678\n"
        )
        cleaned = clean_extracted_text(raw)
        # Only the first occurrence is kept; the later ones are dropped.
        assert cleaned.count("Patient(e)") == 1