fix: filtre bruit Trackare — antécédents parasites + répétitions DAS

- das_filter: la regex anti-répétition gère les espaces entre mots concaténés ("VentilationVentilation Ventilation..." désormais rejeté)
- cim10_extractor: la regex antécédents s'arrête à "Signes Vitaux" (ne capture plus le tableau de surveillance)
- Nouveau _is_valid_antecedent() : filtre les noms de service, les mots de surveillance isolés, les infos admin (RPPS), les répétitions et "Mode de vie"
- 28 nouveaux tests (TestIsValidAntecedent + das_filter repetition)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 19:20:50 +01:00
parent f7d87f2602
commit fe22c0f0f5
4 changed files with 145 additions and 18 deletions
--- a/src/medical/cim10_extractor.py
+++ b/src/medical/cim10_extractor.py
@@ -528,10 +528,57 @@ def _extract_actes(text: str, dossier: DossierMedical) -> None:
                acte.code_ccam_suggestion = code
 # Lower-case substrings that mark a line as monitoring/administrative noise;
 # they are matched against the lower-cased line.
 _ANTECEDENT_NOISE = (
    "item de", "surveillance", "température", "signes vitaux",
    "pouls", "type de note", "aucune donnée", "renseignée",
    "habitudes de vie", "systolique", "diastolique", "saturation",
    "texte libre", "mode de vie", "n° rpps",
 )
 # Lower-case words rejected only when they make up the whole line.
 _SURVEILLANCE_SINGLE_WORDS = frozenset({
    "moyenne", "ventilation", "echelle", "gauche", "droite",
    "capillaire", "repos", "diurèse", "glycémie", "ambiant",
 })
 # French function words ignored by the repetition heuristic: a legitimate
 # sentence can easily contain "de" or "la" three times.
 _REPETITION_STOPWORDS = frozenset({
    "de", "du", "des", "la", "le", "les", "un", "une", "et", "ou",
    "à", "au", "aux", "en", "par", "pour", "sur", "avec", "sans", "dans",
 })
 def _is_valid_antecedent(line: str) -> bool:
    """Tell whether a line of the antecedents block is worth keeping.

    The checks below reject, in order: empty or very short lines, lines
    starting with a digit, lines containing a noise keyword, single
    monitoring-table words, short all-caps lines, repeated/concatenated
    words, and short lines starting with the ``[MEDECIN]`` placeholder.

    Args:
        line: One candidate line. Surrounding whitespace is ignored.

    Returns:
        True if no rejection rule matched, False otherwise.
    """
    if not line:
        return False
    # Work on the trimmed text so padding cannot defeat the length, digit
    # and single-word checks (a blank line of 6+ spaces used to be accepted).
    line = line.strip()
    if len(line) <= 5:
        return False
    if re.match(r"^\d", line):
        return False
    low = line.lower()
    # Noise keywords (substring match)
    if any(kw in low for kw in _ANTECEDENT_NOISE):
        return False
    words = low.split()
    # Isolated monitoring-table word; compare the token, not the raw line
    if len(words) == 1 and words[0] in _SURVEILLANCE_SINGLE_WORDS:
        return False
    # Short all-caps line (the tests use hospital service names)
    if line.isupper() and len(line) < 40:
        return False
    # One word concatenated and/or repeated with optional spaces:
    # "VentilationVentilation Ventilation..."
    if re.match(r'^([a-zà-ÿ]{3,})(\s*\1)+\s*$', line, re.IGNORECASE):
        return False
    # Every token identical (two or more tokens)
    if len(words) >= 2 and len(set(words)) == 1:
        return False
    # Mixed repetitions: the same word appears at least three times.
    # Function words are left out of the count so that a phrase such as
    # "... de la hanche, de la cheville et de la jambe" is not rejected;
    # if the line has nothing but function words, count them all.
    if len(words) >= 3:
        from collections import Counter
        counted = [w for w in words if w not in _REPETITION_STOPWORDS] or words
        if max(Counter(counted).values()) >= 3:
            return False
    # Short line starting with the [MEDECIN] placeholder
    if re.match(r'^\[MEDECIN\]\s', line) and len(line) < 30:
        return False
    return True
 def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
    """Extrait les antécédents."""
    m = re.search(
-        r"Antécédents?\s*[:：]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[:：]|Allergie|Histoire de la maladie|Examen clinique|\n\n))",
+        r"Antécédents?\s*[:：]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[:：]|Allergie|Histoire de la maladie|Examen clinique|Signes\s+[Vv]itaux|Observations?\s+m[eé]dicale|Passage aux|\n\n))",
        text,
        re.DOTALL | re.IGNORECASE,
    )
@@ -539,21 +586,7 @@ def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
        block = m.group(1).strip()
        for line in block.split("\n"):
            line = line.strip().lstrip("- •")
-            # Filtrer les lignes non pertinentes
+            if _is_valid_antecedent(line):
            if (line and len(line) > 5 and line != "0"
                    and not re.match(r"^\d", line)
                    and "Item de" not in line
                    and "surveillance" not in line.lower()
                    and "Température" not in line
                    and "Signes Vitaux" not in line
                    and "Pouls" not in line
                    and "Type de note" not in line
                    and "Aucune donnée" not in line
                    and "renseignée" not in line
                    and "habitudes de vie" not in line
                    and "Systolique" not in line
                    and "Diastolique" not in line
                    and "Saturation" not in line):
                dossier.antecedents.append(line)
--- a/src/medical/das_filter.py
+++ b/src/medical/das_filter.py
@@ -39,8 +39,8 @@ def is_valid_diagnostic_text(text: str) -> bool:
    if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}([.,]\d+)?$", t):
        return False
-    # 4. Mots concaténés : "Ventilationventilation"
+    # 4. Mots concaténés et/ou répétés avec espaces : "VentilationVentilation Ventilation..."
-    if re.match(r"^([a-zà-ÿ]{3,})\1+[a-zà-ÿ]*$", t, re.IGNORECASE):
+    if re.match(r"^([a-zà-ÿ]{3,})(\s*\1)+\s*$", t, re.IGNORECASE):
        return False
    # 5. Mots répétés : tous identiques ("Absence absence", "Anticoagulant anticoagulant")
--- a/tests/test_das_filter.py
+++ b/tests/test_das_filter.py
@@ -68,6 +68,15 @@ class TestIsValidDiagnosticText:
    def test_reject_concatenated_words_long(self):
        """A word glued to itself three times is not a valid diagnostic text."""
        sample = "ventilationventilationventilation"
        assert not is_valid_diagnostic_text(sample)
    def test_reject_concatenated_with_spaces(self):
        """A mix of glued and space-separated copies of one word is rejected."""
        sample = "VentilationVentilationVentilation VentilationVentilationVentilationVentilationVentilationVentilationVentilationVentilation"
        assert not is_valid_diagnostic_text(sample)
    def test_reject_concatenated_with_trailing_space(self):
        """A trailing space after the glued word does not rescue the text."""
        sample = "ventilationventilation "
        assert not is_valid_diagnostic_text(sample)
    def test_reject_repeated_words(self):
        """The same word repeated with spaces (any casing) is rejected."""
        sample = "Spontanée spontanée spontanée spontanée"
        assert not is_valid_diagnostic_text(sample)
--- a/tests/test_medical.py
+++ b/tests/test_medical.py
@@ -7,6 +7,7 @@ from src.medical.cim10_extractor import (
    extract_medical_info,
    _lookup_cim10,
    _is_abnormal,
    _is_valid_antecedent,
 )
 from src.medical.cim10_dict import normalize_text, load_dict, lookup, reset_cache
 from src.extraction.document_classifier import classify, classify_with_confidence
@@ -430,6 +431,90 @@ class TestTraitementEdgeCases:
        assert "7 jours" in ttt[0].posologie
 class TestIsValidAntecedent:
    """Tests for the filter that drops Trackare noise from antecedent lines."""

    # --- Genuine antecedents (accepted) ---

    def test_accept_syndrome(self):
        candidate = "Syndrome anxio depressif"
        assert _is_valid_antecedent(candidate)

    def test_accept_fracture(self):
        candidate = "fracture des deux humérus en 2017"
        assert _is_valid_antecedent(candidate)

    def test_accept_hta_diabete(self):
        candidate = "HTA, diabète type 2"
        assert _is_valid_antecedent(candidate)

    def test_accept_bilan_neurologique(self):
        candidate = "Bilan neurologique: IRMc leucopathie vasculaire"
        assert _is_valid_antecedent(candidate)

    # --- Trackare monitoring noise (rejected) ---

    def test_reject_ventilation_concat(self):
        candidate = "VentilationVentilationVentilation VentilationVentilationVentilation"
        assert not _is_valid_antecedent(candidate)

    def test_reject_spontanee_repeated(self):
        candidate = "spontanée spontanée spontanée spontanée spontanée"
        assert not _is_valid_antecedent(candidate)

    def test_reject_air_repeated(self):
        candidate = "Air Air Air Air Air Air Air"
        assert not _is_valid_antecedent(candidate)

    def test_reject_ambiant_repeated(self):
        candidate = "ambiant ambiant ambiant ambiant"
        assert not _is_valid_antecedent(candidate)

    def test_reject_en_repeated(self):
        candidate = "EN EN EN EN"
        assert not _is_valid_antecedent(candidate)

    def test_reject_moyenne_single(self):
        candidate = "Moyenne"
        assert not _is_valid_antecedent(candidate)

    def test_reject_ventilation_single(self):
        candidate = "Ventilation"
        assert not _is_valid_antecedent(candidate)

    def test_reject_echelle_single(self):
        candidate = "Echelle"
        assert not _is_valid_antecedent(candidate)

    def test_reject_glycemie_single(self):
        candidate = "Glycémie"
        assert not _is_valid_antecedent(candidate)

    def test_reject_capillaire_single(self):
        candidate = "capillaire"
        assert not _is_valid_antecedent(candidate)

    def test_reject_gauche_single(self):
        candidate = "Gauche"
        assert not _is_valid_antecedent(candidate)

    # --- Administrative noise (rejected) ---

    def test_reject_service_name_caps(self):
        candidate = "MEDECINE INTERNE ET"
        assert not _is_valid_antecedent(candidate)

    def test_reject_immunologie_caps(self):
        candidate = "IMMUNOLOGIE CLINIQUE"
        assert not _is_valid_antecedent(candidate)

    def test_reject_rpps(self):
        candidate = "N° RPPS [RPPS_7]"
        assert not _is_valid_antecedent(candidate)

    def test_reject_medecin_hospitalier(self):
        candidate = "[MEDECIN] Hospitalier"
        assert not _is_valid_antecedent(candidate)

    def test_reject_mode_de_vie(self):
        candidate = "Mode de vie : divorcée, une fille"
        assert not _is_valid_antecedent(candidate)

    def test_reject_texte_libre(self):
        candidate = "(texte libre)"
        assert not _is_valid_antecedent(candidate)

    # --- Edge cases ---

    def test_reject_too_short(self):
        candidate = "de Bo"
        assert not _is_valid_antecedent(candidate)

    def test_reject_starts_with_digit(self):
        candidate = "97,00 100,00 98,00"
        assert not _is_valid_antecedent(candidate)

    def test_reject_empty(self):
        candidate = ""
        assert not _is_valid_antecedent(candidate)
 class TestClassifierConfidence:
    """Tests pour classify_with_confidence."""