From 31c29078a1f527e22afd12e88fc635f7be8a9e57 Mon Sep 17 00:00:00 2001 From: dom Date: Wed, 11 Feb 2026 17:48:25 +0100 Subject: [PATCH] feat: filtrage des DAS parasites (artefacts OCR trackare) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nouveau module das_filter.py avec 7 règles de rejet (trop court, chiffres, lettre+chiffres OCR, mots concaténés/répétés, fragments non-médicaux) + nettoyage newlines/ponctuation. Filtrage appliqué aux 3 sources de DAS : trackare, regex et edsnlp. 31 tests unitaires. Co-Authored-By: Claude Opus 4.6 --- src/extraction/trackare_parser.py | 7 +- src/medical/cim10_extractor.py | 12 +++- src/medical/das_filter.py | 50 ++++++++++++++ tests/test_das_filter.py | 106 ++++++++++++++++++++++++++++++ 4 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 src/medical/das_filter.py create mode 100644 tests/test_das_filter.py diff --git a/src/extraction/trackare_parser.py b/src/extraction/trackare_parser.py index f03af49..c9b0786 100644 --- a/src/extraction/trackare_parser.py +++ b/src/extraction/trackare_parser.py @@ -4,6 +4,8 @@ from __future__ import annotations import re +from src.medical.das_filter import clean_diagnostic_text, is_valid_diagnostic_text + def parse_trackare(text: str) -> dict: """Parse un export Trackare et retourne les sections structurées.""" @@ -358,11 +360,14 @@ def _extract_diagnostics(text: str, result: dict) -> None: r"(Principal|Associé|Significatif)\s+(actif|inactif)\s+([A-Z]\d{2}(?:\.\d{1,2})?)\s+(.+?)(?:\s+\[.*?\])?\s+\d{2}/\d{2}/\d{4}", text, ): + libelle = clean_diagnostic_text(m.group(4).strip()) + if not is_valid_diagnostic_text(libelle): + continue result["diagnostics"].append({ "type": m.group(1), "statut": m.group(2), "code_cim10": m.group(3), - "libelle": m.group(4).strip(), + "libelle": libelle, }) diff --git a/src/medical/cim10_extractor.py b/src/medical/cim10_extractor.py index 0dc6626..f527c46 100644 --- a/src/medical/cim10_extractor.py +++ b/src/medical/cim10_extractor.py @@ -11,6 +11,7 @@ logger = logging.getLogger(__name__) from .cim10_dict import lookup as dict_lookup, normalize_text from .ccam_dict import lookup as ccam_lookup, validate_code as ccam_validate +from .das_filter import clean_diagnostic_text, is_valid_diagnostic_text from ..config import ( ActeCCAM, BiologieCle, @@ -204,8 +205,11 @@ def _extract_diagnostics( # Diagnostics codés depuis Trackare (prioritaires) for diag in parsed.get("diagnostics", []): + texte = clean_diagnostic_text(diag.get("libelle", "")) + if not is_valid_diagnostic_text(texte): + continue d = Diagnostic( - texte=diag.get("libelle", ""), + texte=texte, cim10_suggestion=diag.get("code_cim10"), ) if diag.get("type", "").lower() == "principal": @@ -245,6 +249,7 @@ def _extract_diagnostics( # Diagnostics associés depuis le texte (regex) das = _find_diagnostics_associes(text_lower, conclusion, dossier) + das = [d for d in das if is_valid_diagnostic_text(d.texte)] dossier.diagnostics_associes.extend(das) # Enrichissement DAS depuis edsnlp @@ -258,9 +263,12 @@ def _extract_diagnostics( for ent in edsnlp_result.cim10_entities: if ent.negation or ent.hypothese: continue + texte = clean_diagnostic_text(ent.texte.capitalize()) + if not is_valid_diagnostic_text(texte): + continue if ent.code not in existing_codes: dossier.diagnostics_associes.append(Diagnostic( - texte=ent.texte.capitalize(), + texte=texte, cim10_suggestion=ent.code, )) existing_codes.add(ent.code) diff --git a/src/medical/das_filter.py b/src/medical/das_filter.py new file mode 100644 index 0000000..737dca0 --- /dev/null +++ b/src/medical/das_filter.py @@ -0,0 +1,50 @@ +"""Filtrage des diagnostics associés parasites (artefacts OCR trackare).""" + +import re +import unicodedata + + +def clean_diagnostic_text(text: str) -> str: + """Nettoie un texte de diagnostic (newlines, ponctuation trailing, espaces).""" + text = text.replace("\n", " ") + text = re.sub(r"\s+", " ", text).strip() + text = text.rstrip(",.;:!") + return text + + +def is_valid_diagnostic_text(text: str) -> bool: + """Retourne True si le texte ressemble à un diagnostic médical légitime.""" + t = text.strip() + + # 1. Trop court + if len(t) < 3: + return False + + # 2. Chiffres purs (>= 50% de chiffres) + digits = sum(c.isdigit() for c in t) + if digits >= len(t) * 0.5: + return False + + # 3. Lettre + chiffres OCR : "H 51", "À 08", "H\n10" + if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}$", t): + return False + + # 4. Mots concaténés : "Ventilationventilation" + if re.match(r"^([a-zà-ÿ]{3,})\1+[a-zà-ÿ]*$", t, re.IGNORECASE): + return False + + # 5. Mots répétés ≥ 3 fois : "Spontanée spontanée spontanée spontanée" + words = t.lower().split() + if words: + from collections import Counter + counts = Counter(words) + if counts.most_common(1)[0][1] >= 3: + return False + + # 6. Fragments non-médicaux + if re.match(r"^(De |Du |Des |]\s)", t): + return False + if t in {"Isolement", "Pp 500"}: + return False + + return True diff --git a/tests/test_das_filter.py b/tests/test_das_filter.py new file mode 100644 index 0000000..0967479 --- /dev/null +++ b/tests/test_das_filter.py @@ -0,0 +1,106 @@ +"""Tests unitaires pour le filtre de DAS parasites.""" + +import pytest + +from src.medical.das_filter import clean_diagnostic_text, is_valid_diagnostic_text + + +class TestCleanDiagnosticText: + def test_removes_trailing_punctuation(self): + assert clean_diagnostic_text("Thrombopénie,") == "Thrombopénie" + + def test_removes_trailing_semicolon(self): + assert clean_diagnostic_text("HTA;") == "HTA" + + def test_replaces_newlines(self): + assert clean_diagnostic_text("Insuffisance rénale\naigue") == "Insuffisance rénale aigue" + + def test_strips_whitespace(self): + assert clean_diagnostic_text(" HTA ") == "HTA" + + def test_collapses_multiple_spaces(self): + assert clean_diagnostic_text("Insuffisance rénale aigue") == "Insuffisance rénale aigue" + + def test_combined_cleanup(self): + assert clean_diagnostic_text(" Anticoagulant\nanticoagulant, ") == "Anticoagulant anticoagulant" + + +class TestIsValidDiagnosticText: + # --- Rejets --- + def test_reject_empty(self): + assert not is_valid_diagnostic_text("") + + def test_reject_too_short(self): + assert not is_valid_diagnostic_text("Ab") + + def test_reject_digits_only(self): + assert not is_valid_diagnostic_text("666666666666664") + + def test_reject_mostly_digits(self): + assert not is_valid_diagnostic_text("12345abc") + + def test_reject_letter_space_digits(self): + assert not is_valid_diagnostic_text("H 51") + + def test_reject_letter_space_digits_a_accent(self): + assert not is_valid_diagnostic_text("À 08") + + def test_reject_letter_newline_digits(self): + # Après clean, "H\n10" devient "H 10" + assert not is_valid_diagnostic_text("H 10") + + def test_reject_concatenated_words(self): + assert not is_valid_diagnostic_text("Ventilationventilation") + + def test_reject_concatenated_words_long(self): + assert not is_valid_diagnostic_text("ventilationventilationventilation") + + def test_reject_repeated_words(self): + assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée") + + def test_reject_repeated_words_three(self): + assert not is_valid_diagnostic_text("oui oui oui") + + def test_reject_fragment_de(self): + assert not is_valid_diagnostic_text("De laboratoire") + + def test_reject_fragment_du(self): + assert not is_valid_diagnostic_text("Du sang") + + def test_reject_fragment_des(self): + assert not is_valid_diagnostic_text("Des résultats") + + def test_reject_bracket_fragment(self): + assert not is_valid_diagnostic_text("] de laboratoire") + + def test_reject_isolement(self): + assert not is_valid_diagnostic_text("Isolement") + + def test_reject_pp_marker(self): + assert not is_valid_diagnostic_text("Pp 500") + + # --- Acceptations --- + def test_accept_hta(self): + assert is_valid_diagnostic_text("HTA") + + def test_accept_cholecystite(self): + assert is_valid_diagnostic_text("Cholécystite aiguë") + + def test_accept_lithiase(self): + assert is_valid_diagnostic_text("Lithiase vésiculaire") + + def test_accept_insuffisance_renale(self): + assert is_valid_diagnostic_text("Insuffisance rénale aigue") + + def test_accept_obesite_with_imc(self): + assert is_valid_diagnostic_text("Obésité (IMC 35.251)") + + def test_accept_short_valid(self): + # 3 chars = seuil exact, doit passer + assert is_valid_diagnostic_text("HTA") + + def test_accept_diabete(self): + assert is_valid_diagnostic_text("Diabète de type 2") + + def test_accept_sepsis(self): + assert is_valid_diagnostic_text("Sepsis sévère")