Bloc A: fix sous-parties dans _mappings, filtre NER anti-tag, intégration patterns manquants (DESTINATAIRE, PRESCRIPTION_AUTHOR), whitelist médicaments élargie (+60), villes retirées de whitelist. Bloc B: CRH dedup chars 200-1000, CP_VILLE vrais codes postaux FR, DR_NAME capital par mot, BACTERIO header tolère ligne vide. Bloc C: DR_NAME negative lookahead multi-docteurs même ligne, entity_registry split tirets (RITZ-QUILLACQ), fix early return subparts dans _find_matching_entity, PRESCRIPTION_AUTHOR élargi (Révisé/Traité, variable.), NOTE_AUTHOR élargi (Diététicienne, Kiné, Ergo), + 8 nouveaux patterns (CONTACT_RELATION, MOD_PAR, AIDE_NAME, SIGNATURE_LINE, VALIDE_PAR, INTERNE_SIGNATURE, FOIS_NAME, MALADIE_NAME), adresses inline +ALLEE/IMP, text_cleaner préserve abréviations médicales. Validé sur 6 cas (21, 11, 104, 160, 50, 200). 70 tests OK. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
532 lines
19 KiB
Python
"""Tests pour le module d'anonymisation."""
|
|
|
|
import pytest
|
|
|
|
from src.anonymization.entity_registry import EntityRegistry
|
|
from src.anonymization.regex_patterns import (
|
|
BACTERIO_NOM_HEADER_PATTERN,
|
|
CONSULT_ADRESSE_PATTERN,
|
|
CRH_FOOTER_IPP_EPISODE,
|
|
CRH_FOOTER_PATIENT_PATTERN,
|
|
DATE_NAISSANCE_PATTERN,
|
|
DDN_PATTERN,
|
|
DEMANDE_NUM_PATTERN,
|
|
DR_NAME_PATTERN,
|
|
EMAIL_PATTERN,
|
|
EPISODE_PATTERN,
|
|
FOOTER_PATIENT_PATTERN,
|
|
IPP_PATTERN,
|
|
NOM_NAISSANCE_TRACKARE_PATTERN,
|
|
NOTE_AUTHOR_PATTERN,
|
|
N_IPP_PATTERN,
|
|
PAR_NOM_PATTERN,
|
|
PERSONNE_PREVENIR_PATTERN,
|
|
PHONE_INTL_PATTERN,
|
|
PHONE_PATTERN,
|
|
PRENOM_NAISSANCE_PATTERN,
|
|
RPPS_PATTERN,
|
|
VENUE_PATTERN,
|
|
)
|
|
from src.extraction.text_cleaner import clean_extracted_text
|
|
from src.extraction.document_splitter import _dedup_chunks
|
|
|
|
|
|
class TestRegexPatterns:
    """Unit tests for the individual PHI regex patterns."""

    @staticmethod
    def _must_search(pattern, text):
        """Run ``pattern.search(text)`` and fail the test if nothing matched."""
        match = pattern.search(text)
        assert match is not None
        return match

    def test_ipp_with_colon(self):
        match = self._must_search(IPP_PATTERN, "IPP: 01306172")
        assert match.group(1) == "01306172"

    def test_ipp_without_colon(self):
        match = self._must_search(IPP_PATTERN, "IPP 01306172")
        assert match.group(1) == "01306172"

    def test_ipp_in_parentheses(self):
        match = self._must_search(IPP_PATTERN, "(01306172 )")
        assert match.group(2) == "01306172"

    def test_episode_no(self):
        match = self._must_search(EPISODE_PATTERN, "Episode No: 23042753")
        assert match.group(1) == "23042753"

    def test_episode_n_degree(self):
        match = self._must_search(EPISODE_PATTERN, "N° Episode 23042753")
        assert match.group(2) == "23042753"

    def test_phone_dots(self):
        match = self._must_search(PHONE_PATTERN, "06.25.39.26.82")
        assert match.group(0) == "06.25.39.26.82"

    def test_phone_spaces(self):
        self._must_search(PHONE_PATTERN, "05 59 44 35 35")

    def test_email(self):
        match = self._must_search(EMAIL_PATTERN, "faudemar@ch-cotebasque.fr")
        assert match.group(0) == "faudemar@ch-cotebasque.fr"

    def test_rpps(self):
        match = self._must_search(RPPS_PATTERN, "RPPS : 10100532760")
        assert match.group(1) == "10100532760"

    def test_date_naissance_nee_le(self):
        match = self._must_search(DATE_NAISSANCE_PATTERN, "née le 23/02/1980")
        assert match.group(1) == "23/02/1980"

    def test_date_naissance_ne_e_le(self):
        match = self._must_search(DATE_NAISSANCE_PATTERN, "Né(e) le 23/02/1980")
        assert match.group(1) == "23/02/1980"

    def test_date_naissance_field(self):
        match = self._must_search(DATE_NAISSANCE_PATTERN, "Date de naissance: 23/02/1980")
        assert match.group(1) == "23/02/1980"

    def test_dr_name(self):
        match = self._must_search(DR_NAME_PATTERN, "Dr F. AUDEMAR")
        assert "AUDEMAR" in match.group(1)

    def test_dr_name_docteur(self):
        match = self._must_search(DR_NAME_PATTERN, "Docteur AUDEMAR Franck")
        assert "AUDEMAR" in match.group(1)

    def test_note_author(self):
        match = self._must_search(NOTE_AUTHOR_PATTERN, "Note IDE Annie GUIRESSE Non algique")
        assert match.group(1) == "Annie GUIRESSE"

    def test_footer_patient_trackare(self):
        match = self._must_search(
            FOOTER_PATIENT_PATTERN,
            "Patient: CLIER NARBAIS AUDREY - Date de naissance: 23/02/1980",
        )
        assert "CLIER" in match.group(1)

    def test_crh_footer_patient(self):
        match = self._must_search(
            CRH_FOOTER_PATIENT_PATTERN,
            "Patient(e) : CLIER AUDREY NARBAIS Né(e) le 23/02/1980",
        )
        assert "CLIER" in match.group(1)

    def test_crh_footer_ipp_episode(self):
        match = self._must_search(
            CRH_FOOTER_IPP_EPISODE,
            "IPP 01306172 / N° Episode 23042753 (MEDECINE GASTRO B2 HC)",
        )
        assert match.group(1) == "01306172"
        assert match.group(2) == "23042753"
|
|
|
|
|
|
class TestEntityRegistry:
    """Tests for pseudonym registration and lookup in EntityRegistry."""

    def test_register_returns_pseudo(self):
        registry = EntityRegistry()
        assert registry.register("Jean Dupont", "patient") == "[PATIENT_1]"

    def test_register_same_entity_returns_same(self):
        registry = EntityRegistry()
        first = registry.register("Jean Dupont", "patient")
        second = registry.register("Jean Dupont", "patient")
        assert first == second

    def test_register_case_insensitive(self):
        registry = EntityRegistry()
        upper = registry.register("Jean DUPONT", "patient")
        lower = registry.register("jean dupont", "patient")
        assert upper == lower

    def test_register_different_categories(self):
        registry = EntityRegistry()
        assert registry.register("Dupont", "patient") == "[PATIENT_1]"
        assert registry.register("Martin", "medecin") == "[MEDECIN_1]"

    def test_get_replacement(self):
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        assert registry.get_replacement("jean dupont") == "[PATIENT_1]"
        assert registry.get_replacement("inconnu") is None
|
|
|
|
|
|
class TestAnonymizer:
    """End-to-end tests for the Anonymizer on small text samples."""

    @staticmethod
    def _make_anonymizer(patient=None, medecins=None):
        """Build an Anonymizer from minimal parsed data."""
        from src.anonymization.anonymizer import Anonymizer

        return Anonymizer(
            parsed_data={
                "patient": patient if patient is not None else {},
                "medecins": medecins if medecins is not None else [],
                "contacts": [],
            }
        )

    def test_anonymize_basic(self):
        anonymizer = self._make_anonymizer(
            patient={"nom_prenom": "DUPONT Jean", "nom_naissance": "DUPONT"},
            medecins=["MARTIN Pierre"],
        )
        result = anonymizer.anonymize("Le patient DUPONT Jean a été vu par Dr MARTIN Pierre.")

        assert "DUPONT" not in result
        assert "MARTIN" not in result
        assert "[PATIENT" in result or "[MEDECIN" in result

    def test_preserves_medical_content(self):
        anonymizer = self._make_anonymizer()
        result = anonymizer.anonymize(
            "Pancréatite aiguë biliaire. Cholécystectomie par cœlioscopie. IMC 34.37."
        )

        # Clinical vocabulary must survive anonymization untouched.
        for term in ("Pancréatite", "Cholécystectomie", "IMC"):
            assert term in result

    def test_anonymize_phone(self):
        anonymizer = self._make_anonymizer()
        result = anonymizer.anonymize("Appeler le 06.25.39.26.82 pour le rendez-vous.")

        assert "06.25.39.26.82" not in result
        assert "[TEL" in result

    def test_anonymize_email(self):
        anonymizer = self._make_anonymizer()
        result = anonymizer.anonymize("Contact: faudemar@ch-cotebasque.fr")

        assert "faudemar@ch-cotebasque.fr" not in result
        assert "[EMAIL" in result
|
|
|
|
|
|
# --- P0-A : Sur-anonymisation ---
|
|
|
|
class TestStopWordsAndSubparts:
    """Checks that stop words and short name fragments are never registered as subparts."""

    def test_stop_word_not_registered_as_subpart(self):
        """Genuine French stop words (sans, dans, avec) must not become subparts."""
        registry = EntityRegistry()
        registry.register("Sans Martin", "medecin")
        assert not registry.is_subpart("sans")

    def test_prenoms_are_subparts(self):
        """First names (jean, paul, marie) are PHI and must be subparts."""
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        assert registry.is_subpart("jean")
        assert registry.is_subpart("dupont")

    def test_short_parts_excluded(self):
        """Subparts shorter than 4 chars are excluded (too many false positives)."""
        registry = EntityRegistry()
        registry.register("Ré Dupont", "patient")
        assert not registry.is_subpart("ré")
        assert registry.is_subpart("dupont")

    def test_long_subpart_registered(self):
        """Subparts of 4+ chars that are not stop words are registered."""
        registry = EntityRegistry()
        registry.register("Jean Audemar", "medecin")
        assert registry.is_subpart("audemar")
        assert registry.is_subpart("jean")

    def test_sans_not_anonymized_when_dr_sans(self):
        """Lowercase 'sans' must survive even when a physician is named 'Dr Sans'."""
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(
            parsed_data={
                "patient": {},
                "medecins": ["Sans Martin"],
                "contacts": [],
            }
        )
        result = anonymizer.anonymize("Patient sans signe de gravité. Vu par Dr Sans Martin.")

        # lowercase "sans" inside the medical narrative must remain
        assert "sans signe de gravité" in result

    def test_full_name_still_anonymized(self):
        """Full names are still anonymized under the new subpart rules."""
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(
            parsed_data={
                "patient": {"nom_prenom": "DUPONT Jean"},
                "medecins": [],
                "contacts": [],
            }
        )
        assert "DUPONT" not in anonymizer.anonymize("Le patient DUPONT Jean est sorti.")
|
|
|
|
|
|
# --- P0-B : Fuites PHI ---
|
|
|
|
class TestNewPHIPatterns:
    """Tests for the specialised PHI patterns (BACTERIO, CONSULTATION, ANAPATH)."""

    @staticmethod
    def _search(pattern, text):
        """Search ``text`` with ``pattern`` and fail if no match is found."""
        match = pattern.search(text)
        assert match is not None
        return match

    def test_ddn_pattern_slash(self):
        assert self._search(DDN_PATTERN, "DDN : 21/01/1948").group(1) == "21/01/1948"

    def test_ddn_pattern_dash(self):
        assert self._search(DDN_PATTERN, "DDN : 21-01-1948").group(1) == "21-01-1948"

    def test_par_nom_pattern(self):
        match = self._search(PAR_NOM_PATTERN, "Par : GENDRE Juliette")
        assert "GENDRE" in match.group(1)

    def test_demande_num_pattern(self):
        match = self._search(DEMANDE_NUM_PATTERN, "DEMANDE N° 2300126709")
        assert match.group(1) == "2300126709"

    def test_venue_pattern(self):
        assert self._search(VENUE_PATTERN, "N° venue : 23111304").group(1) == "23111304"

    def test_n_ipp_pattern(self):
        assert self._search(N_IPP_PATTERN, "N Ipp : 19029841").group(1) == "19029841"

    def test_consult_adresse_pattern(self):
        match = self._search(CONSULT_ADRESSE_PATTERN, "Adresse : 15 rue des Lilas 64100 BAYONNE")
        assert "15 rue des Lilas" in match.group(1)

    def test_date_naissance_with_dashes(self):
        """Birth date as DD-MM-YYYY (Trackare format)."""
        match = self._search(DATE_NAISSANCE_PATTERN, "Date de naissance : 21-01-1948")
        assert match.group(1) == "21-01-1948"

    def test_phone_intl_pattern(self):
        """International phone number +33(0)XXXXXXXXX."""
        self._search(PHONE_INTL_PATTERN, "☎ +33(0)156125400")

    def test_phone_intl_with_spaces(self):
        self._search(PHONE_INTL_PATTERN, "+33 1 56 12 54 00")

    def test_personne_prevenir_pattern(self):
        match = self._search(PERSONNE_PREVENIR_PATTERN, "Personne à prévenir EICHE 06 27 56 38")
        assert "EICHE" in match.group(1)

    def test_nom_naissance_trackare(self):
        match = self._search(NOM_NAISSANCE_TRACKARE_PATTERN, "Nom de naissance : EICHE")
        assert match.group(1).strip() == "EICHE"

    def test_nom_utilise_trackare(self):
        match = self._search(NOM_NAISSANCE_TRACKARE_PATTERN, "Nom utilisé : DUPONT")
        assert match.group(1).strip() == "DUPONT"

    def test_prenom_naissance_pattern(self):
        match = self._search(PRENOM_NAISSANCE_PATTERN, "Prénom de naissance : MARIE")
        assert match.group(1) == "MARIE"

    def test_prenom_1er_pattern(self):
        match = self._search(PRENOM_NAISSANCE_PATTERN, "1er prénom de naissance: MARIE")
        assert match.group(1) == "MARIE"

    def test_bacterio_nom_header_pattern(self):
        """BACTERIO header: NAME Firstname line just before 'Nom usuel :'."""
        text = "Compte renduComplet\nURTIZVEREA Marie\nNom usuel : EICHE URGENCES"
        assert self._search(BACTERIO_NOM_HEADER_PATTERN, text).group(1) == "URTIZVEREA Marie"

    def test_bacterio_nom_header_nom_de_naissance(self):
        """BACTERIO header: NAME Firstname line just before 'Nom de naissance :'."""
        text = "DUPONT Jean-Pierre\nNom de naissance : MARTIN"
        assert self._search(BACTERIO_NOM_HEADER_PATTERN, text).group(1) == "DUPONT Jean-Pierre"
|
|
|
|
|
|
# --- P2-C : Cohérence pseudonymes ---
|
|
|
|
class TestEntityMatching:
    """Tests for pseudonym consistency across subset/superset name registrations."""

    def test_subset_matching_same_pseudo(self):
        """Registering 'MARTIN' then 'MARTIN Pierre' returns the same pseudonym."""
        registry = EntityRegistry()
        short_form = registry.register("MARTIN", "medecin")
        long_form = registry.register("MARTIN Pierre", "medecin")
        assert short_form == long_form

    def test_superset_matching_same_pseudo(self):
        """Registering 'MARTIN Pierre' then 'MARTIN' returns the same pseudonym."""
        registry = EntityRegistry()
        long_form = registry.register("MARTIN Pierre", "medecin")
        short_form = registry.register("MARTIN", "medecin")
        assert long_form == short_form

    def test_no_cross_category_match_via_subset(self):
        """A patient and an address must NOT match through a shared subpart."""
        registry = EntityRegistry()
        patient_pseudo = registry.register("MARTIN Pierre", "patient")
        adresse_pseudo = registry.register("MARTIN Rue", "adresse")
        # "MARTIN" is shared, but the categories are incompatible -> no match
        assert patient_pseudo != adresse_pseudo

    def test_person_categories_compatible(self):
        """Person categories (patient/medecin/soignant) are mutually compatible."""
        registry = EntityRegistry()
        patient_pseudo = registry.register("DUPONT", "patient")
        medecin_pseudo = registry.register("DUPONT Pierre", "medecin")
        # Compatible person categories -> same pseudonym
        assert patient_pseudo == medecin_pseudo

    def test_fix_double_brackets(self):
        """Doubled closing brackets are repaired."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("Mme[PERSONNE_14]]") == "Mme [PERSONNE_14]"

    def test_fix_double_open_brackets(self):
        """Doubled opening brackets are repaired."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("Dr [[PERSONNE_7]") == "Dr [PERSONNE_7]"

    def test_fix_orphan_digits(self):
        """Orphan digits trailing a tag are removed."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("[PERSONNE_6]10]") == "[PERSONNE_6]"

    def test_fix_orphan_underscore_digits(self):
        """Orphan '_N]' fragments are removed."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("[PERSONNE_4]_2]") == "[PERSONNE_4]"

    def test_fix_glued_tags(self):
        """Two tags glued as '][' get a separating space."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("[MEDECIN_5][MEDECIN_6]") == "[MEDECIN_5] [MEDECIN_6]"

    def test_fix_glued_bracket(self):
        """A tag glued to a word gets a separating space."""
        from src.anonymization.anonymizer import Anonymizer

        assert Anonymizer._fix_brackets("Docteur[MEDECIN_1]") == "Docteur [MEDECIN_1]"
|
|
|
|
|
|
# --- P1-A : Déduplication CRH ---
|
|
|
|
class TestDedupChunks:
    """Tests for CRH chunk deduplication."""

    def test_identical_chunks_deduped(self):
        # Three byte-identical chunks collapse to one.
        chunk = "MME DUPONT\nCompte rendu du 01/01/2024 au 05/01/2024\nContenu médical..."
        assert len(_dedup_chunks([chunk, chunk, chunk])) == 1

    def test_different_chunks_preserved(self):
        chunks = [
            "MME DUPONT\nCompte rendu du 01/01/2024\nPancréatite aiguë",
            "M. MARTIN\nCompte rendu du 15/02/2024\nFracture du fémur",
        ]
        assert len(_dedup_chunks(chunks)) == 2

    def test_single_chunk_passthrough(self):
        assert len(_dedup_chunks(["Un seul document"])) == 1
|
|
|
|
|
|
# --- P1-B : Text cleaner ---
|
|
|
|
class TestTextCleaner:
    """Tests for OCR text cleanup."""

    def test_single_char_lines_removed(self):
        result = clean_extracted_text("Contenu normal\nA\nB\nSuite du contenu\n")
        assert "\nA\n" not in result
        assert "\nB\n" not in result
        assert "Contenu normal" in result
        assert "Suite du contenu" in result

    def test_page_footer_v1_removed(self):
        result = clean_extracted_text(
            "Contenu\nV1 - Imprime le 25/02/2026 a 14:30 par user Page(s): 1 sur 3\nSuite"
        )
        assert "V1 - Imprime" not in result
        assert "Contenu" in result

    def test_info_patient_footer_removed(self):
        result = clean_extracted_text(
            "Contenu\nInformation patient Page 1 25/02/2026 14:30:00\nSuite"
        )
        assert "Information patient" not in result

    def test_collapse_blank_lines(self):
        result = clean_extracted_text("Ligne 1\n\n\n\n\nLigne 2")
        assert "\n\n\n" not in result
        assert result == "Ligne 1\n\nLigne 2"

    def test_preserves_medical_content(self):
        result = clean_extracted_text("Pancréatite aiguë biliaire.\nIMC 34.37.\nCholécystectomie.")
        for term in ("Pancréatite", "IMC 34.37", "Cholécystectomie"):
            assert term in result

    def test_dedup_vital_signs(self):
        raw = (
            "Signes vitaux\nFC 80 TA 12/8\n\n"
            "Surv. Isolement et Contention\nFC 80 TA 12/8\n\n"
            "Conclusion"
        )
        result = clean_extracted_text(raw)
        assert "Signes vitaux" in result
        assert "Surv. Isolement" not in result
        assert "Conclusion" in result

    def test_patient_footer_dedup(self):
        footer = "Patient(e) : DUPONT Jean N° Episode 12345678\n"
        raw = footer + "Contenu page 1\n" + footer + "Contenu page 2\n" + footer
        result = clean_extracted_text(raw)
        # First occurrence is kept, later repetitions are dropped
        assert result.count("Patient(e)") == 1
|