fix: filtre bruit Trackare — antécédents parasites + répétitions DAS
- das_filter: regex anti-répétition gère les espaces entre mots concaténés
("VentilationVentilation Ventilation..." désormais rejeté)
- cim10_extractor: regex antécédents s'arrête à "Signes Vitaux" (ne capture
plus le tableau de surveillance)
- Nouveau _is_valid_antecedent() filtre noms de service, mots de surveillance
isolés, infos admin (RPPS), répétitions, Mode de vie
- 28 nouveaux tests (TestIsValidAntecedent + das_filter repetition)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -528,10 +528,57 @@ def _extract_actes(text: str, dossier: DossierMedical) -> None:
|
|||||||
acte.code_ccam_suggestion = code
|
acte.code_ccam_suggestion = code
|
||||||
|
|
||||||
|
|
||||||
|
# Substrings (compared against the lower-cased line) that mark a line as
# noise from the surveillance table or from administrative headers.
_ANTECEDENT_NOISE = (
    "item de", "surveillance", "température", "signes vitaux",
    "pouls", "type de note", "aucune donnée", "renseignée",
    "habitudes de vie", "systolique", "diastolique", "saturation",
    "texte libre", "mode de vie", "n° rpps",
)

# Words that are noise when they make up the whole line on their own.
_SURVEILLANCE_SINGLE_WORDS = frozenset({
    "moyenne", "ventilation", "echelle", "gauche", "droite",
    "capillaire", "repos", "diurèse", "glycémie", "ambiant",
})

# French function words. They routinely appear three times or more in a
# genuine sentence ("... de la ... de ... et de ..."), so they only count as
# repetition noise when they dominate the line.
_ANTECEDENT_FUNCTION_WORDS = frozenset({
    "a", "à", "au", "aux", "avec", "d", "de", "des", "du", "en", "et",
    "l", "la", "le", "les", "ou", "par", "pour", "sans", "sous", "sur",
    "un", "une",
})

# One word (3+ letters) repeated back to back, with or without whitespace:
# "VentilationVentilation Ventilation...".
_ANTECEDENT_REPETITION_RE = re.compile(
    r'^([a-zà-ÿ]{3,})(\s*\1)+\s*$', re.IGNORECASE
)

# Short administrative identifier line starting with the "[MEDECIN]" token.
_ANTECEDENT_ADMIN_RE = re.compile(r'^\[MEDECIN\]\s')


def _is_valid_antecedent(line: str) -> bool:
    """Return True when *line* looks like a genuine antecedent entry.

    Rejects the parasitic lines found in the antecedents block:

    * empty, whitespace-only, very short (5 characters or fewer) lines and
      lines starting with a digit;
    * lines containing one of the ``_ANTECEDENT_NOISE`` substrings
      (case-insensitive);
    * a single word listed in ``_SURVEILLANCE_SINGLE_WORDS``;
    * short (under 40 characters) all-uppercase lines, treated as
      service names;
    * one word repeated, concatenated and/or separated by whitespace;
    * a word occurring three times or more. A French function word such as
      "de" only triggers this rule when it accounts for at least half of
      the words, so ordinary sentences are not discarded;
    * short (under 30 characters) lines starting with ``[MEDECIN]``.

    Args:
        line: One candidate line from the antecedents block.

    Returns:
        False for a line identified as noise, True otherwise.
    """
    from collections import Counter

    if not line or len(line) <= 5:
        return False
    if re.match(r"^\d", line):
        return False

    low = line.lower()

    # Noise keywords (substring match).
    if any(kw in low for kw in _ANTECEDENT_NOISE):
        return False

    words = low.split()
    if not words:
        # Whitespace-only line: nothing to keep.
        return False

    # Isolated words from the surveillance table. Testing the token rather
    # than the whole line makes stray surrounding whitespace irrelevant.
    if len(words) == 1 and words[0] in _SURVEILLANCE_SINGLE_WORDS:
        return False

    # Service names (all uppercase, short).
    if line.isupper() and len(line) < 40:
        return False

    # Concatenated and/or space-separated repetition of a single word.
    if _ANTECEDENT_REPETITION_RE.match(line):
        return False

    # Two identical words.
    if len(words) == 2 and words[0] == words[1]:
        return False

    # Mixed repetitions: the same word three times or more.
    for word, count in Counter(words).items():
        if count < 3:
            continue
        if word not in _ANTECEDENT_FUNCTION_WORDS or 2 * count >= len(words):
            return False

    # Isolated administrative identifiers.
    if _ANTECEDENT_ADMIN_RE.match(line) and len(line) < 30:
        return False

    return True
|
||||||
|
|
||||||
|
|
||||||
def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
|
def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
|
||||||
"""Extrait les antécédents."""
|
"""Extrait les antécédents."""
|
||||||
m = re.search(
|
m = re.search(
|
||||||
r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|\n\n))",
|
r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|Signes\s+[Vv]itaux|Observations?\s+m[eé]dicale|Passage aux|\n\n))",
|
||||||
text,
|
text,
|
||||||
re.DOTALL | re.IGNORECASE,
|
re.DOTALL | re.IGNORECASE,
|
||||||
)
|
)
|
||||||
@@ -539,21 +586,7 @@ def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
|
|||||||
block = m.group(1).strip()
|
block = m.group(1).strip()
|
||||||
for line in block.split("\n"):
|
for line in block.split("\n"):
|
||||||
line = line.strip().lstrip("- •")
|
line = line.strip().lstrip("- •")
|
||||||
# Filtrer les lignes non pertinentes
|
if _is_valid_antecedent(line):
|
||||||
if (line and len(line) > 5 and line != "0"
|
|
||||||
and not re.match(r"^\d", line)
|
|
||||||
and "Item de" not in line
|
|
||||||
and "surveillance" not in line.lower()
|
|
||||||
and "Température" not in line
|
|
||||||
and "Signes Vitaux" not in line
|
|
||||||
and "Pouls" not in line
|
|
||||||
and "Type de note" not in line
|
|
||||||
and "Aucune donnée" not in line
|
|
||||||
and "renseignée" not in line
|
|
||||||
and "habitudes de vie" not in line
|
|
||||||
and "Systolique" not in line
|
|
||||||
and "Diastolique" not in line
|
|
||||||
and "Saturation" not in line):
|
|
||||||
dossier.antecedents.append(line)
|
dossier.antecedents.append(line)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -39,8 +39,8 @@ def is_valid_diagnostic_text(text: str) -> bool:
|
|||||||
if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}([.,]\d+)?$", t):
|
if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}([.,]\d+)?$", t):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# 4. Mots concaténés : "Ventilationventilation"
|
# 4. Mots concaténés et/ou répétés avec espaces : "VentilationVentilation Ventilation..."
|
||||||
if re.match(r"^([a-zà-ÿ]{3,})\1+[a-zà-ÿ]*$", t, re.IGNORECASE):
|
if re.match(r"^([a-zà-ÿ]{3,})(\s*\1)+\s*$", t, re.IGNORECASE):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# 5. Mots répétés : tous identiques ("Absence absence", "Anticoagulant anticoagulant")
|
# 5. Mots répétés : tous identiques ("Absence absence", "Anticoagulant anticoagulant")
|
||||||
|
|||||||
@@ -68,6 +68,15 @@ class TestIsValidDiagnosticText:
|
|||||||
def test_reject_concatenated_words_long(self):
|
def test_reject_concatenated_words_long(self):
|
||||||
assert not is_valid_diagnostic_text("ventilationventilationventilation")
|
assert not is_valid_diagnostic_text("ventilationventilationventilation")
|
||||||
|
|
||||||
|
def test_reject_concatenated_with_spaces(self):
|
||||||
|
"""VentilationVentilation Ventilation... (mixed concat+spaces) is rejected."""
|
||||||
|
assert not is_valid_diagnostic_text(
|
||||||
|
"VentilationVentilationVentilation VentilationVentilationVentilationVentilationVentilationVentilationVentilationVentilation"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_reject_concatenated_with_trailing_space(self):
|
||||||
|
assert not is_valid_diagnostic_text("ventilationventilation ")
|
||||||
|
|
||||||
def test_reject_repeated_words(self):
|
def test_reject_repeated_words(self):
|
||||||
assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée")
|
assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée")
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from src.medical.cim10_extractor import (
|
|||||||
extract_medical_info,
|
extract_medical_info,
|
||||||
_lookup_cim10,
|
_lookup_cim10,
|
||||||
_is_abnormal,
|
_is_abnormal,
|
||||||
|
_is_valid_antecedent,
|
||||||
)
|
)
|
||||||
from src.medical.cim10_dict import normalize_text, load_dict, lookup, reset_cache
|
from src.medical.cim10_dict import normalize_text, load_dict, lookup, reset_cache
|
||||||
from src.extraction.document_classifier import classify, classify_with_confidence
|
from src.extraction.document_classifier import classify, classify_with_confidence
|
||||||
@@ -430,6 +431,90 @@ class TestTraitementEdgeCases:
|
|||||||
assert "7 jours" in ttt[0].posologie
|
assert "7 jours" in ttt[0].posologie
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsValidAntecedent:
    """Checks for the filter that discards parasitic antecedent lines."""

    # --- Genuine antecedents (accepted) ---
    def test_accept_syndrome(self):
        """A plain three-word diagnosis is kept."""
        candidate = "Syndrome anxio depressif"
        assert _is_valid_antecedent(candidate)

    def test_accept_fracture(self):
        """A sentence containing a year, but not starting with a digit, is kept."""
        candidate = "fracture des deux humérus en 2017"
        assert _is_valid_antecedent(candidate)

    def test_accept_hta_diabete(self):
        """An uppercase acronym followed by lowercase text is kept."""
        candidate = "HTA, diabète type 2"
        assert _is_valid_antecedent(candidate)

    def test_accept_bilan_neurologique(self):
        """A longer free-text entry is kept."""
        candidate = "Bilan neurologique: IRMc leucopathie vasculaire"
        assert _is_valid_antecedent(candidate)

    # --- Surveillance-table noise (rejected) ---
    def test_reject_ventilation_concat(self):
        """A word repeated both concatenated and space-separated is dropped."""
        candidate = (
            "VentilationVentilationVentilation VentilationVentilationVentilation"
        )
        assert not _is_valid_antecedent(candidate)

    def test_reject_spontanee_repeated(self):
        """An accented word repeated five times is dropped."""
        candidate = "spontanée spontanée spontanée spontanée spontanée"
        assert not _is_valid_antecedent(candidate)

    def test_reject_air_repeated(self):
        """A three-letter word repeated seven times is dropped."""
        candidate = "Air Air Air Air Air Air Air"
        assert not _is_valid_antecedent(candidate)

    def test_reject_ambiant_repeated(self):
        """A lowercase word repeated four times is dropped."""
        candidate = "ambiant ambiant ambiant ambiant"
        assert not _is_valid_antecedent(candidate)

    def test_reject_en_repeated(self):
        """A two-letter uppercase token repeated four times is dropped."""
        candidate = "EN EN EN EN"
        assert not _is_valid_antecedent(candidate)

    def test_reject_moyenne_single(self):
        """The isolated word "Moyenne" is dropped."""
        candidate = "Moyenne"
        assert not _is_valid_antecedent(candidate)

    def test_reject_ventilation_single(self):
        """The isolated word "Ventilation" is dropped."""
        candidate = "Ventilation"
        assert not _is_valid_antecedent(candidate)

    def test_reject_echelle_single(self):
        """The isolated word "Echelle" is dropped."""
        candidate = "Echelle"
        assert not _is_valid_antecedent(candidate)

    def test_reject_glycemie_single(self):
        """The isolated word "Glycémie" is dropped."""
        candidate = "Glycémie"
        assert not _is_valid_antecedent(candidate)

    def test_reject_capillaire_single(self):
        """The isolated word "capillaire" is dropped."""
        candidate = "capillaire"
        assert not _is_valid_antecedent(candidate)

    def test_reject_gauche_single(self):
        """The isolated word "Gauche" is dropped."""
        candidate = "Gauche"
        assert not _is_valid_antecedent(candidate)

    # --- Administrative noise (rejected) ---
    def test_reject_service_name_caps(self):
        """A short all-uppercase line is dropped."""
        candidate = "MEDECINE INTERNE ET"
        assert not _is_valid_antecedent(candidate)

    def test_reject_immunologie_caps(self):
        """Another short all-uppercase line is dropped."""
        candidate = "IMMUNOLOGIE CLINIQUE"
        assert not _is_valid_antecedent(candidate)

    def test_reject_rpps(self):
        """A line carrying an RPPS number label is dropped."""
        candidate = "N° RPPS [RPPS_7]"
        assert not _is_valid_antecedent(candidate)

    def test_reject_medecin_hospitalier(self):
        """A short line starting with the [MEDECIN] token is dropped."""
        candidate = "[MEDECIN] Hospitalier"
        assert not _is_valid_antecedent(candidate)

    def test_reject_mode_de_vie(self):
        """A "Mode de vie" line is dropped."""
        candidate = "Mode de vie : divorcée, une fille"
        assert not _is_valid_antecedent(candidate)

    def test_reject_texte_libre(self):
        """The "(texte libre)" placeholder is dropped."""
        candidate = "(texte libre)"
        assert not _is_valid_antecedent(candidate)

    # --- Edge cases ---
    def test_reject_too_short(self):
        """A five-character line is dropped."""
        candidate = "de Bo"
        assert not _is_valid_antecedent(candidate)

    def test_reject_starts_with_digit(self):
        """A line starting with a digit is dropped."""
        candidate = "97,00 100,00 98,00"
        assert not _is_valid_antecedent(candidate)

    def test_reject_empty(self):
        """The empty string is dropped."""
        candidate = ""
        assert not _is_valid_antecedent(candidate)
|
||||||
|
|
||||||
|
|
||||||
class TestClassifierConfidence:
|
class TestClassifierConfidence:
|
||||||
"""Tests pour classify_with_confidence."""
|
"""Tests pour classify_with_confidence."""
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user