From fe22c0f0f5d717a17a21c6516783c58ed39e268c Mon Sep 17 00:00:00 2001
From: dom <dom@local>
Date: Wed, 18 Feb 2026 19:20:50 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20filtre=20bruit=20Trackare=20=E2=80=94=20?=
 =?UTF-8?q?ant=C3=A9c=C3=A9dents=20parasites=20+=20r=C3=A9p=C3=A9titions?=
 =?UTF-8?q?=20DAS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- das_filter: regex anti-répétition gère les espaces entre mots concaténés
  ("VentilationVentilation Ventilation..." désormais rejeté)
- cim10_extractor: regex antécédents s'arrête à "Signes Vitaux" (ne capture
  plus le tableau de surveillance)
- Nouveau _is_valid_antecedent() filtre noms de service, mots de surveillance
  isolés, infos admin (RPPS), répétitions, Mode de vie
- 26 nouveaux tests (24 dans TestIsValidAntecedent + 2 das_filter repetition)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/medical/cim10_extractor.py | 65 +++++++++++++++++++-------
 src/medical/das_filter.py      |  4 +-
 tests/test_das_filter.py       |  9 ++++
 tests/test_medical.py          | 85 ++++++++++++++++++++++++++++++++++
 4 files changed, 145 insertions(+), 18 deletions(-)

diff --git a/src/medical/cim10_extractor.py b/src/medical/cim10_extractor.py
index 16894ea..2da00fe 100644
--- a/src/medical/cim10_extractor.py
+++ b/src/medical/cim10_extractor.py
@@ -528,10 +528,57 @@ def _extract_actes(text: str, dossier: DossierMedical) -> None:
                 acte.code_ccam_suggestion = code
 
 
+_ANTECEDENT_NOISE = (  # lowercase substrings: _is_valid_antecedent rejects any line whose lowercased text contains one
+    "item de", "surveillance", "température", "signes vitaux",
+    "pouls", "type de note", "aucune donnée", "renseignée",
+    "habitudes de vie", "systolique", "diastolique", "saturation",
+    "texte libre", "mode de vie", "n° rpps",
+)
+
+_SURVEILLANCE_SINGLE_WORDS = frozenset({  # rejected only when the lowercased line is exactly one of these words
+    "moyenne", "ventilation", "echelle", "gauche", "droite",
+    "capillaire", "repos", "diurèse", "glycémie", "ambiant",
+})
+
+
+def _is_valid_antecedent(line: str) -> bool:
+    """Return False for noise lines (Trackare artefacts) in the antecedents block, True for lines worth keeping."""
+    if not line or len(line) <= 5 or line == "0":
+        return False
+    if re.match(r"^\d", line):
+        return False
+    low = line.lower()
+    # Noise keywords, matched as substrings of the lowercased line
+    if any(kw in low for kw in _ANTECEDENT_NOISE):
+        return False
+    words = low.split()
+    # Isolated words coming from the monitoring (surveillance) table
+    if len(words) == 1 and low in _SURVEILLANCE_SINGLE_WORDS:
+        return False
+    # Service names (all uppercase, short). NOTE(review): also rejects all-caps abbreviation lines such as "HTA, BPCO" — confirm intended
+    if line.isupper() and len(line) < 40:
+        return False
+    # One word repeated, glued together and/or separated by spaces: "VentilationVentilation Ventilation..."
+    if re.match(r'^([a-zà-ÿ]{3,})(\s*\1)+\s*$', line, re.IGNORECASE):
+        return False
+    # Mixed repeated words (same word >= 3 times). NOTE(review): function words count too, so a line with three "de" is rejected — confirm intended
+    if len(words) >= 3:
+        from collections import Counter
+        if Counter(words).most_common(1)[0][1] >= 3:
+            return False
+    # Exactly two words that are identical once lowercased
+    if len(words) == 2 and len(set(words)) == 1:
+        return False
+    # Isolated administrative identifiers: short lines starting with the "[MEDECIN]" placeholder
+    if re.match(r'^\[MEDECIN\]\s', line) and len(line) < 30:
+        return False
+    return True
+
+
 def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
     """Extrait les antécédents."""
     m = re.search(
-        r"Antécédents?\s*[:：]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[:：]|Allergie|Histoire de la maladie|Examen clinique|\n\n))",
+        r"Antécédents?\s*[:：]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[:：]|Allergie|Histoire de la maladie|Examen clinique|Signes\s+[Vv]itaux|Observations?\s+m[eé]dicale|Passage aux|\n\n))",
         text,
         re.DOTALL | re.IGNORECASE,
     )
@@ -539,21 +586,7 @@ def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
         block = m.group(1).strip()
         for line in block.split("\n"):
             line = line.strip().lstrip("- •")
-            # Filtrer les lignes non pertinentes
-            if (line and len(line) > 5 and line != "0"
-                    and not re.match(r"^\d", line)
-                    and "Item de" not in line
-                    and "surveillance" not in line.lower()
-                    and "Température" not in line
-                    and "Signes Vitaux" not in line
-                    and "Pouls" not in line
-                    and "Type de note" not in line
-                    and "Aucune donnée" not in line
-                    and "renseignée" not in line
-                    and "habitudes de vie" not in line
-                    and "Systolique" not in line
-                    and "Diastolique" not in line
-                    and "Saturation" not in line):
+            if _is_valid_antecedent(line):
                 dossier.antecedents.append(line)
 
 
diff --git a/src/medical/das_filter.py b/src/medical/das_filter.py
index 2402817..ad0301f 100644
--- a/src/medical/das_filter.py
+++ b/src/medical/das_filter.py
@@ -39,8 +39,8 @@ def is_valid_diagnostic_text(text: str) -> bool:
     if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}([.,]\d+)?$", t):
         return False
 
-    # 4. Mots concaténés : "Ventilationventilation"
-    if re.match(r"^([a-zà-ÿ]{3,})\1+[a-zà-ÿ]*$", t, re.IGNORECASE):
+    # 4. Mots concaténés et/ou répétés avec espaces : "VentilationVentilation Ventilation..."
+    if re.match(r"^([a-zà-ÿ]{3,})(\s*\1)+\s*$", t, re.IGNORECASE):
         return False
 
     # 5. Mots répétés : tous identiques ("Absence absence", "Anticoagulant anticoagulant")
diff --git a/tests/test_das_filter.py b/tests/test_das_filter.py
index 9d814c9..2e11188 100644
--- a/tests/test_das_filter.py
+++ b/tests/test_das_filter.py
@@ -68,6 +68,15 @@ class TestIsValidDiagnosticText:
     def test_reject_concatenated_words_long(self):
         assert not is_valid_diagnostic_text("ventilationventilationventilation")
 
+    def test_reject_concatenated_with_spaces(self):
+        """One word repeated both glued together and space-separated ("VentilationVentilation Ventilation...") is rejected."""
+        assert not is_valid_diagnostic_text(
+            "VentilationVentilationVentilation VentilationVentilationVentilationVentilationVentilationVentilationVentilationVentilation"
+        )
+
+    def test_reject_concatenated_with_trailing_space(self):
+        assert not is_valid_diagnostic_text("ventilationventilation ")  # a trailing space must not let the doubled word through
+
     def test_reject_repeated_words(self):
         assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée")
 
diff --git a/tests/test_medical.py b/tests/test_medical.py
index 201b4b7..dfd32ef 100644
--- a/tests/test_medical.py
+++ b/tests/test_medical.py
@@ -7,6 +7,7 @@ from src.medical.cim10_extractor import (
     extract_medical_info,
     _lookup_cim10,
     _is_abnormal,
+    _is_valid_antecedent,
 )
 from src.medical.cim10_dict import normalize_text, load_dict, lookup, reset_cache
 from src.extraction.document_classifier import classify, classify_with_confidence
@@ -430,6 +431,90 @@ class TestTraitementEdgeCases:
         assert "7 jours" in ttt[0].posologie
 
 
+class TestIsValidAntecedent:
+    """Tests for the filter that rejects Trackare noise lines among antecedents."""
+
+    # --- Genuine antecedents (accepted) ---
+    def test_accept_syndrome(self):
+        assert _is_valid_antecedent("Syndrome anxio depressif")
+
+    def test_accept_fracture(self):
+        assert _is_valid_antecedent("fracture des deux humérus en 2017")
+
+    def test_accept_hta_diabete(self):
+        assert _is_valid_antecedent("HTA, diabète type 2")
+
+    def test_accept_bilan_neurologique(self):
+        assert _is_valid_antecedent("Bilan neurologique: IRMc leucopathie vasculaire")
+
+    # --- Trackare monitoring-table noise (rejected) ---
+    def test_reject_ventilation_concat(self):
+        assert not _is_valid_antecedent(
+            "VentilationVentilationVentilation VentilationVentilationVentilation"
+        )
+
+    def test_reject_spontanee_repeated(self):
+        assert not _is_valid_antecedent(
+            "spontanée spontanée spontanée spontanée spontanée"
+        )
+
+    def test_reject_air_repeated(self):
+        assert not _is_valid_antecedent("Air Air Air Air Air Air Air")
+
+    def test_reject_ambiant_repeated(self):
+        assert not _is_valid_antecedent("ambiant ambiant ambiant ambiant")
+
+    def test_reject_en_repeated(self):
+        assert not _is_valid_antecedent("EN EN EN EN")
+
+    def test_reject_moyenne_single(self):
+        assert not _is_valid_antecedent("Moyenne")
+
+    def test_reject_ventilation_single(self):
+        assert not _is_valid_antecedent("Ventilation")
+
+    def test_reject_echelle_single(self):
+        assert not _is_valid_antecedent("Echelle")
+
+    def test_reject_glycemie_single(self):
+        assert not _is_valid_antecedent("Glycémie")
+
+    def test_reject_capillaire_single(self):
+        assert not _is_valid_antecedent("capillaire")
+
+    def test_reject_gauche_single(self):
+        assert not _is_valid_antecedent("Gauche")
+
+    # --- Administrative noise (rejected) ---
+    def test_reject_service_name_caps(self):
+        assert not _is_valid_antecedent("MEDECINE INTERNE ET")
+
+    def test_reject_immunologie_caps(self):
+        assert not _is_valid_antecedent("IMMUNOLOGIE CLINIQUE")
+
+    def test_reject_rpps(self):
+        assert not _is_valid_antecedent("N° RPPS [RPPS_7]")
+
+    def test_reject_medecin_hospitalier(self):
+        assert not _is_valid_antecedent("[MEDECIN] Hospitalier")
+
+    def test_reject_mode_de_vie(self):
+        assert not _is_valid_antecedent("Mode de vie : divorcée, une fille")
+
+    def test_reject_texte_libre(self):
+        assert not _is_valid_antecedent("(texte libre)")
+
+    # --- Edge cases ---
+    def test_reject_too_short(self):
+        assert not _is_valid_antecedent("de Bo")
+
+    def test_reject_starts_with_digit(self):
+        assert not _is_valid_antecedent("97,00 100,00 98,00")
+
+    def test_reject_empty(self):
+        assert not _is_valid_antecedent("")
+
+
 class TestClassifierConfidence:
     """Tests pour classify_with_confidence."""