From fe22c0f0f5d717a17a21c6516783c58ed39e268c Mon Sep 17 00:00:00 2001 From: dom Date: Wed, 18 Feb 2026 19:20:50 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20filtre=20bruit=20Trackare=20=E2=80=94=20?= =?UTF-8?q?ant=C3=A9c=C3=A9dents=20parasites=20+=20r=C3=A9p=C3=A9titions?= =?UTF-8?q?=20DAS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - das_filter: regex anti-répétition gère les espaces entre mots concaténés ("VentilationVentilation Ventilation..." désormais rejeté) - cim10_extractor: regex antécédents s'arrête à "Signes Vitaux" (ne capture plus le tableau de surveillance) - Nouveau _is_valid_antecedent() filtre noms de service, mots de surveillance isolés, infos admin (RPPS), répétitions, Mode de vie - 28 nouveaux tests (TestIsValidAntecedent + das_filter repetition) Co-Authored-By: Claude Opus 4.6 --- src/medical/cim10_extractor.py | 65 +++++++++++++++++++------- src/medical/das_filter.py | 4 +- tests/test_das_filter.py | 9 ++++ tests/test_medical.py | 85 ++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 18 deletions(-) diff --git a/src/medical/cim10_extractor.py b/src/medical/cim10_extractor.py index 16894ea..2da00fe 100644 --- a/src/medical/cim10_extractor.py +++ b/src/medical/cim10_extractor.py @@ -528,10 +528,57 @@ def _extract_actes(text: str, dossier: DossierMedical) -> None: acte.code_ccam_suggestion = code +_ANTECEDENT_NOISE = ( + "item de", "surveillance", "température", "signes vitaux", + "pouls", "type de note", "aucune donnée", "renseignée", + "habitudes de vie", "systolique", "diastolique", "saturation", + "texte libre", "mode de vie", "n° rpps", +) + +_SURVEILLANCE_SINGLE_WORDS = frozenset({ + "moyenne", "ventilation", "echelle", "gauche", "droite", + "capillaire", "repos", "diurèse", "glycémie", "ambiant", +}) + + +def _is_valid_antecedent(line: str) -> bool: + """Filtre les lignes parasites du bloc antécédents (bruit Trackare).""" + if not line or len(line) <= 5 or line == "0": + return False + if re.match(r"^\d", line): + return False + low = line.lower() + # Mots-clés de bruit (sous-chaînes) + if any(kw in low for kw in _ANTECEDENT_NOISE): + return False + words = low.split() + # Mots isolés de tableau de surveillance + if len(words) == 1 and low in _SURVEILLANCE_SINGLE_WORDS: + return False + # Noms de service (tout majuscules, court) + if line.isupper() and len(line) < 40: + return False + # Mots concaténés ou répétés avec espaces : "VentilationVentilation Ventilation..." + if re.match(r'^([a-zà-ÿ]{3,})(\s*\1)+\s*$', line, re.IGNORECASE): + return False + # Mots répétés mélangés (≥ 3 occurrences du même mot) + if len(words) >= 3: + from collections import Counter + if Counter(words).most_common(1)[0][1] >= 3: + return False + # Deux mots identiques + if len(words) == 2 and len(set(words)) == 1: + return False + # Identifiants administratifs isolés + if re.match(r'^\[MEDECIN\]\s', line) and len(line) < 30: + return False + return True + + def _extract_antecedents(text: str, dossier: DossierMedical) -> None: """Extrait les antécédents.""" m = re.search( - r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|\n\n))", + r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|Signes\s+[Vv]itaux|Observations?\s+m[eé]dicale|Passage aux|\n\n))", text, re.DOTALL | re.IGNORECASE, ) @@ -539,21 +586,7 @@ def _extract_antecedents(text: str, dossier: DossierMedical) -> None: block = m.group(1).strip() for line in block.split("\n"): line = line.strip().lstrip("- •") - # Filtrer les lignes non pertinentes - if (line and len(line) > 5 and line != "0" - and not re.match(r"^\d", line) - and "Item de" not in line - and "surveillance" not in line.lower() - and "Température" not in line - and "Signes Vitaux" not in line - and "Pouls" not in line - and "Type de note" not in line - and "Aucune donnée" not in line - and "renseignée" not in line - and "habitudes de vie" not in line - and "Systolique" not in line - and "Diastolique" not in line - and "Saturation" not in line): + if _is_valid_antecedent(line): dossier.antecedents.append(line) diff --git a/src/medical/das_filter.py b/src/medical/das_filter.py index 2402817..ad0301f 100644 --- a/src/medical/das_filter.py +++ b/src/medical/das_filter.py @@ -39,8 +39,8 @@ def is_valid_diagnostic_text(text: str) -> bool: if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}([.,]\d+)?$", t): return False - # 4. Mots concaténés : "Ventilationventilation" - if re.match(r"^([a-zà-ÿ]{3,})\1+[a-zà-ÿ]*$", t, re.IGNORECASE): + # 4. Mots concaténés et/ou répétés avec espaces : "VentilationVentilation Ventilation..." + if re.match(r"^([a-zà-ÿ]{3,})(\s*\1)+\s*$", t, re.IGNORECASE): return False # 5. Mots répétés : tous identiques ("Absence absence", "Anticoagulant anticoagulant") diff --git a/tests/test_das_filter.py b/tests/test_das_filter.py index 9d814c9..2e11188 100644 --- a/tests/test_das_filter.py +++ b/tests/test_das_filter.py @@ -68,6 +68,15 @@ class TestIsValidDiagnosticText: def test_reject_concatenated_words_long(self): assert not is_valid_diagnostic_text("ventilationventilationventilation") + def test_reject_concatenated_with_spaces(self): + """VentilationVentilation Ventilation... (mixed concat+spaces) is rejected.""" + assert not is_valid_diagnostic_text( + "VentilationVentilationVentilation VentilationVentilationVentilationVentilationVentilationVentilationVentilationVentilation" + ) + + def test_reject_concatenated_with_trailing_space(self): + assert not is_valid_diagnostic_text("ventilationventilation ") + def test_reject_repeated_words(self): assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée") diff --git a/tests/test_medical.py b/tests/test_medical.py index 201b4b7..dfd32ef 100644 --- a/tests/test_medical.py +++ b/tests/test_medical.py @@ -7,6 +7,7 @@ from src.medical.cim10_extractor import ( extract_medical_info, _lookup_cim10, _is_abnormal, + _is_valid_antecedent, ) from src.medical.cim10_dict import normalize_text, load_dict, lookup, reset_cache from src.extraction.document_classifier import classify, classify_with_confidence @@ -430,6 +431,90 @@ class TestTraitementEdgeCases: assert "7 jours" in ttt[0].posologie +class TestIsValidAntecedent: + """Tests pour le filtre d'antécédents parasites Trackare.""" + + # --- Vrais antécédents (acceptés) --- + def test_accept_syndrome(self): + assert _is_valid_antecedent("Syndrome anxio depressif") + + def test_accept_fracture(self): + assert _is_valid_antecedent("fracture des deux humérus en 2017") + + def test_accept_hta_diabete(self): + assert _is_valid_antecedent("HTA, diabète type 2") + + def test_accept_bilan_neurologique(self): + assert _is_valid_antecedent("Bilan neurologique: IRMc leucopathie vasculaire") + + # --- Bruit surveillance Trackare (rejetés) --- + def test_reject_ventilation_concat(self): + assert not _is_valid_antecedent( + "VentilationVentilationVentilation VentilationVentilationVentilation" + ) + + def test_reject_spontanee_repeated(self): + assert not _is_valid_antecedent( + "spontanée spontanée spontanée spontanée spontanée" + ) + + def test_reject_air_repeated(self): + assert not _is_valid_antecedent("Air Air Air Air Air Air Air") + + def test_reject_ambiant_repeated(self): + assert not _is_valid_antecedent("ambiant ambiant ambiant ambiant") + + def test_reject_en_repeated(self): + assert not _is_valid_antecedent("EN EN EN EN") + + def test_reject_moyenne_single(self): + assert not _is_valid_antecedent("Moyenne") + + def test_reject_ventilation_single(self): + assert not _is_valid_antecedent("Ventilation") + + def test_reject_echelle_single(self): + assert not _is_valid_antecedent("Echelle") + + def test_reject_glycemie_single(self): + assert not _is_valid_antecedent("Glycémie") + + def test_reject_capillaire_single(self): + assert not _is_valid_antecedent("capillaire") + + def test_reject_gauche_single(self): + assert not _is_valid_antecedent("Gauche") + + # --- Bruit administratif (rejetés) --- + def test_reject_service_name_caps(self): + assert not _is_valid_antecedent("MEDECINE INTERNE ET") + + def test_reject_immunologie_caps(self): + assert not _is_valid_antecedent("IMMUNOLOGIE CLINIQUE") + + def test_reject_rpps(self): + assert not _is_valid_antecedent("N° RPPS [RPPS_7]") + + def test_reject_medecin_hospitalier(self): + assert not _is_valid_antecedent("[MEDECIN] Hospitalier") + + def test_reject_mode_de_vie(self): + assert not _is_valid_antecedent("Mode de vie : divorcée, une fille") + + def test_reject_texte_libre(self): + assert not _is_valid_antecedent("(texte libre)") + + # --- Cas limites --- + def test_reject_too_short(self): + assert not _is_valid_antecedent("de Bo") + + def test_reject_starts_with_digit(self): + assert not _is_valid_antecedent("97,00 100,00 98,00") + + def test_reject_empty(self): + assert not _is_valid_antecedent("") + + class TestClassifierConfidence: """Tests pour classify_with_confidence."""