feat: filtrage des DAS parasites (artefacts OCR trackare)
Nouveau module das_filter.py avec 7 règles de rejet (trop court, chiffres, lettre+chiffres OCR, mots concaténés/répétés, fragments non-médicaux) + nettoyage newlines/ponctuation. Filtrage appliqué aux 3 sources de DAS : trackare, regex et edsnlp. 31 tests unitaires. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,8 @@ from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from src.medical.das_filter import clean_diagnostic_text, is_valid_diagnostic_text
|
||||
|
||||
|
||||
def parse_trackare(text: str) -> dict:
|
||||
"""Parse un export Trackare et retourne les sections structurées."""
|
||||
@@ -358,11 +360,14 @@ def _extract_diagnostics(text: str, result: dict) -> None:
|
||||
r"(Principal|Associé|Significatif)\s+(actif|inactif)\s+([A-Z]\d{2}(?:\.\d{1,2})?)\s+(.+?)(?:\s+\[.*?\])?\s+\d{2}/\d{2}/\d{4}",
|
||||
text,
|
||||
):
|
||||
libelle = clean_diagnostic_text(m.group(4).strip())
|
||||
if not is_valid_diagnostic_text(libelle):
|
||||
continue
|
||||
result["diagnostics"].append({
|
||||
"type": m.group(1),
|
||||
"statut": m.group(2),
|
||||
"code_cim10": m.group(3),
|
||||
"libelle": m.group(4).strip(),
|
||||
"libelle": libelle,
|
||||
})
|
||||
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
from .cim10_dict import lookup as dict_lookup, normalize_text
|
||||
from .ccam_dict import lookup as ccam_lookup, validate_code as ccam_validate
|
||||
from .das_filter import clean_diagnostic_text, is_valid_diagnostic_text
|
||||
from ..config import (
|
||||
ActeCCAM,
|
||||
BiologieCle,
|
||||
@@ -204,8 +205,11 @@ def _extract_diagnostics(
|
||||
|
||||
# Diagnostics codés depuis Trackare (prioritaires)
|
||||
for diag in parsed.get("diagnostics", []):
|
||||
texte = clean_diagnostic_text(diag.get("libelle", ""))
|
||||
if not is_valid_diagnostic_text(texte):
|
||||
continue
|
||||
d = Diagnostic(
|
||||
texte=diag.get("libelle", ""),
|
||||
texte=texte,
|
||||
cim10_suggestion=diag.get("code_cim10"),
|
||||
)
|
||||
if diag.get("type", "").lower() == "principal":
|
||||
@@ -245,6 +249,7 @@ def _extract_diagnostics(
|
||||
|
||||
# Diagnostics associés depuis le texte (regex)
|
||||
das = _find_diagnostics_associes(text_lower, conclusion, dossier)
|
||||
das = [d for d in das if is_valid_diagnostic_text(d.texte)]
|
||||
dossier.diagnostics_associes.extend(das)
|
||||
|
||||
# Enrichissement DAS depuis edsnlp
|
||||
@@ -258,9 +263,12 @@ def _extract_diagnostics(
|
||||
for ent in edsnlp_result.cim10_entities:
|
||||
if ent.negation or ent.hypothese:
|
||||
continue
|
||||
texte = clean_diagnostic_text(ent.texte.capitalize())
|
||||
if not is_valid_diagnostic_text(texte):
|
||||
continue
|
||||
if ent.code not in existing_codes:
|
||||
dossier.diagnostics_associes.append(Diagnostic(
|
||||
texte=ent.texte.capitalize(),
|
||||
texte=texte,
|
||||
cim10_suggestion=ent.code,
|
||||
))
|
||||
existing_codes.add(ent.code)
|
||||
|
||||
50
src/medical/das_filter.py
Normal file
50
src/medical/das_filter.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""Filtrage des diagnostics associés parasites (artefacts OCR trackare)."""
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
|
||||
def clean_diagnostic_text(text: str) -> str:
    """Normalize a raw diagnosis label.

    Every run of whitespace (newlines included) is collapsed to a single
    space, surrounding whitespace is removed, and trailing separator
    punctuation (``, . ; : !``) is stripped.

    Args:
        text: Raw label, possibly containing OCR line breaks.

    Returns:
        The cleaned label, with no trailing punctuation or whitespace.
    """
    # `\s+` already matches "\n", so no separate newline replacement is needed.
    collapsed = re.sub(r"\s+", " ", text).strip()
    # The space is part of the stripped set so that "HTA ," and
    # "Thrombopénie, ." do not leave a dangling space or comma behind.
    return collapsed.rstrip(",.;:! ")
|
||||
|
||||
|
||||
# French function words that legitimately recur in long diagnosis labels.
_FUNCTION_WORDS = frozenset({
    "de", "du", "des", "la", "le", "les", "un", "une",
    "et", "ou", "à", "au", "aux", "en", "par", "pour",
    "sur", "avec", "sans",
})


def is_valid_diagnostic_text(text: str) -> bool:
    """Return True if the text looks like a legitimate medical diagnosis.

    The text is rejected when any of the following holds:

    1. it is shorter than 3 characters once stripped;
    2. at least half of its characters are digits;
    3. it is a single capital letter followed by 1-3 digits ("H 51");
    4. it is one word made of a repeated chunk ("Ventilationventilation");
    5. a word occurs 3 times or more ("oui oui oui");
    6. it starts with "De ", "Du ", "Des " or "] ", or is one of a few
       known non-diagnosis markers.

    Args:
        text: Candidate diagnosis label (ideally already cleaned).

    Returns:
        False for the parasitic fragments described above, True otherwise.
    """
    from collections import Counter

    t = text.strip()

    # 1. Too short
    if len(t) < 3:
        return False

    # 2. Mostly digits (>= 50% of the characters)
    digits = sum(c.isdigit() for c in t)
    if digits >= len(t) * 0.5:
        return False

    # 3. OCR "letter + digits" residue: "H 51", "À 08", "H\n10"
    if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}$", t):
        return False

    # 4. Concatenated duplicate words: "Ventilationventilation"
    # NOTE(review): this also rejects genuinely reduplicated single words
    # such as "Béribéri" — confirm whether an allow-list is needed.
    if re.match(r"^([a-zà-ÿ]{3,})\1+[a-zà-ÿ]*$", t, re.IGNORECASE):
        return False

    # 5. Word repeated >= 3 times: "Spontanée spontanée spontanée spontanée"
    words = t.lower().split()
    for word, occurrences in Counter(words).items():
        if occurrences < 3:
            continue
        # A function word ("et", "de", ...) may recur in a long, legitimate
        # label; it only signals an artefact when it makes up at least half
        # of the tokens. Any other repeated word is always rejected.
        if word not in _FUNCTION_WORDS or occurrences * 2 >= len(words):
            return False

    # 6. Non-medical fragments
    if re.match(r"^(De |Du |Des |]\s)", t):
        return False
    if t in {"Isolement", "Pp 500"}:
        return False

    return True
|
||||
106
tests/test_das_filter.py
Normal file
106
tests/test_das_filter.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Tests unitaires pour le filtre de DAS parasites."""
|
||||
|
||||
import pytest
|
||||
|
||||
from src.medical.das_filter import clean_diagnostic_text, is_valid_diagnostic_text
|
||||
|
||||
|
||||
class TestCleanDiagnosticText:
    """Unit tests for ``clean_diagnostic_text``."""

    def test_removes_trailing_punctuation(self):
        assert clean_diagnostic_text("Thrombopénie,") == "Thrombopénie"

    def test_removes_trailing_semicolon(self):
        assert clean_diagnostic_text("HTA;") == "HTA"

    def test_replaces_newlines(self):
        assert clean_diagnostic_text("Insuffisance rénale\naigue") == "Insuffisance rénale aigue"

    def test_strips_whitespace(self):
        assert clean_diagnostic_text(" HTA ") == "HTA"

    def test_collapses_multiple_spaces(self):
        # The input must contain runs of several spaces, otherwise the
        # assertion is an identity check and exercises nothing.
        assert clean_diagnostic_text("Insuffisance   rénale  aigue") == "Insuffisance rénale aigue"

    def test_combined_cleanup(self):
        assert clean_diagnostic_text(" Anticoagulant\nanticoagulant, ") == "Anticoagulant anticoagulant"
|
||||
|
||||
|
||||
class TestIsValidDiagnosticText:
    """Unit tests for ``is_valid_diagnostic_text``."""

    # --- Rejections ---

    def test_reject_empty(self):
        assert not is_valid_diagnostic_text("")

    def test_reject_too_short(self):
        assert not is_valid_diagnostic_text("Ab")

    def test_reject_digits_only(self):
        assert not is_valid_diagnostic_text("666666666666664")

    def test_reject_mostly_digits(self):
        assert not is_valid_diagnostic_text("12345abc")

    def test_reject_letter_space_digits(self):
        assert not is_valid_diagnostic_text("H 51")

    def test_reject_letter_space_digits_a_accent(self):
        assert not is_valid_diagnostic_text("À 08")

    def test_reject_letter_newline_digits(self):
        # Exercise the real OCR input: both the raw "H\n10" and its cleaned
        # form "H 10" must be rejected.
        assert not is_valid_diagnostic_text("H\n10")
        assert not is_valid_diagnostic_text(clean_diagnostic_text("H\n10"))

    def test_reject_concatenated_words(self):
        assert not is_valid_diagnostic_text("Ventilationventilation")

    def test_reject_concatenated_words_long(self):
        assert not is_valid_diagnostic_text("ventilationventilationventilation")

    def test_reject_repeated_words(self):
        assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée")

    def test_reject_repeated_words_three(self):
        assert not is_valid_diagnostic_text("oui oui oui")

    def test_reject_fragment_de(self):
        assert not is_valid_diagnostic_text("De laboratoire")

    def test_reject_fragment_du(self):
        assert not is_valid_diagnostic_text("Du sang")

    def test_reject_fragment_des(self):
        assert not is_valid_diagnostic_text("Des résultats")

    def test_reject_bracket_fragment(self):
        assert not is_valid_diagnostic_text("] de laboratoire")

    def test_reject_isolement(self):
        assert not is_valid_diagnostic_text("Isolement")

    def test_reject_pp_marker(self):
        assert not is_valid_diagnostic_text("Pp 500")

    # --- Acceptances ---

    def test_accept_hta(self):
        assert is_valid_diagnostic_text("HTA")

    def test_accept_cholecystite(self):
        assert is_valid_diagnostic_text("Cholécystite aiguë")

    def test_accept_lithiase(self):
        assert is_valid_diagnostic_text("Lithiase vésiculaire")

    def test_accept_insuffisance_renale(self):
        assert is_valid_diagnostic_text("Insuffisance rénale aigue")

    def test_accept_obesite_with_imc(self):
        assert is_valid_diagnostic_text("Obésité (IMC 35.251)")

    def test_accept_short_valid(self):
        # 3 characters is the exact length threshold and must pass; a
        # different acronym than test_accept_hta avoids a duplicate test.
        assert is_valid_diagnostic_text("AVC")

    def test_accept_diabete(self):
        assert is_valid_diagnostic_text("Diabète de type 2")

    def test_accept_sepsis(self):
        assert is_valid_diagnostic_text("Sepsis sévère")
|
||||
Reference in New Issue
Block a user