feat: filtrage des DAS parasites (artefacts OCR trackare)
Nouveau module das_filter.py avec 7 règles de rejet (trop court, chiffres, lettre+chiffres OCR, mots concaténés/répétés, fragments non-médicaux) + nettoyage newlines/ponctuation. Filtrage appliqué aux 3 sources de DAS : trackare, regex et edsnlp. 31 tests unitaires. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
106
tests/test_das_filter.py
Normal file
106
tests/test_das_filter.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Tests unitaires pour le filtre de DAS parasites."""
|
||||
|
||||
import pytest
|
||||
|
||||
from src.medical.das_filter import clean_diagnostic_text, is_valid_diagnostic_text
|
||||
|
||||
|
||||
class TestCleanDiagnosticText:
    """Unit tests for ``clean_diagnostic_text``.

    Each test feeds one raw label and compares the returned string with the
    expected cleaned form.
    """

    def test_removes_trailing_punctuation(self):
        """A trailing comma is removed."""
        assert clean_diagnostic_text("Thrombopénie,") == "Thrombopénie"

    def test_removes_trailing_semicolon(self):
        """A trailing semicolon is removed."""
        assert clean_diagnostic_text("HTA;") == "HTA"

    def test_replaces_newlines(self):
        """An embedded newline becomes a single space."""
        assert clean_diagnostic_text("Insuffisance rénale\naigue") == "Insuffisance rénale aigue"

    def test_strips_whitespace(self):
        """Leading and trailing whitespace is stripped."""
        assert clean_diagnostic_text(" HTA ") == "HTA"

    def test_collapses_multiple_spaces(self):
        """Runs of several spaces are reduced to one."""
        # The input must really contain consecutive spaces; with single spaces
        # only, the assertion would hold even if nothing was collapsed.
        assert clean_diagnostic_text("Insuffisance  rénale   aigue") == "Insuffisance rénale aigue"

    def test_combined_cleanup(self):
        """Surrounding spaces, a newline and trailing punctuation together."""
        assert clean_diagnostic_text(" Anticoagulant\nanticoagulant, ") == "Anticoagulant anticoagulant"
|
||||
|
||||
|
||||
class TestIsValidDiagnosticText:
    """Unit tests for ``is_valid_diagnostic_text``.

    The first group asserts that artefact-like strings are rejected (falsy
    result); the second group asserts that genuine diagnostic labels are
    accepted (truthy result).
    """

    # --- Rejections ---

    def test_reject_empty(self):
        assert not is_valid_diagnostic_text("")

    def test_reject_too_short(self):
        assert not is_valid_diagnostic_text("Ab")

    def test_reject_digits_only(self):
        assert not is_valid_diagnostic_text("666666666666664")

    def test_reject_mostly_digits(self):
        assert not is_valid_diagnostic_text("12345abc")

    def test_reject_letter_space_digits(self):
        assert not is_valid_diagnostic_text("H 51")

    def test_reject_letter_space_digits_a_accent(self):
        # Same pattern with an accented leading letter.
        assert not is_valid_diagnostic_text("À 08")

    def test_reject_letter_newline_digits(self):
        # Start from the raw "H\n10" form and clean it first, so the newline
        # case is actually exercised rather than repeating the
        # letter-space-digits test with an already-cleaned string.
        cleaned = clean_diagnostic_text("H\n10")
        assert cleaned == "H 10"
        assert not is_valid_diagnostic_text(cleaned)

    def test_reject_concatenated_words(self):
        assert not is_valid_diagnostic_text("Ventilationventilation")

    def test_reject_concatenated_words_long(self):
        assert not is_valid_diagnostic_text("ventilationventilationventilation")

    def test_reject_repeated_words(self):
        assert not is_valid_diagnostic_text("Spontanée spontanée spontanée spontanée")

    def test_reject_repeated_words_three(self):
        assert not is_valid_diagnostic_text("oui oui oui")

    def test_reject_fragment_de(self):
        assert not is_valid_diagnostic_text("De laboratoire")

    def test_reject_fragment_du(self):
        assert not is_valid_diagnostic_text("Du sang")

    def test_reject_fragment_des(self):
        assert not is_valid_diagnostic_text("Des résultats")

    def test_reject_bracket_fragment(self):
        assert not is_valid_diagnostic_text("] de laboratoire")

    def test_reject_isolement(self):
        assert not is_valid_diagnostic_text("Isolement")

    def test_reject_pp_marker(self):
        assert not is_valid_diagnostic_text("Pp 500")

    # --- Acceptances ---

    def test_accept_hta(self):
        assert is_valid_diagnostic_text("HTA")

    def test_accept_cholecystite(self):
        assert is_valid_diagnostic_text("Cholécystite aiguë")

    def test_accept_lithiase(self):
        assert is_valid_diagnostic_text("Lithiase vésiculaire")

    def test_accept_insuffisance_renale(self):
        assert is_valid_diagnostic_text("Insuffisance rénale aigue")

    def test_accept_obesite_with_imc(self):
        # Digits inside a parenthesised detail must not trigger a rejection.
        assert is_valid_diagnostic_text("Obésité (IMC 35.251)")

    def test_accept_short_valid(self):
        # 3 chars = exact threshold, must pass.
        # NOTE(review): same input as test_accept_hta; kept for its intent.
        assert is_valid_diagnostic_text("HTA")

    def test_accept_diabete(self):
        assert is_valid_diagnostic_text("Diabète de type 2")

    def test_accept_sepsis(self):
        assert is_valid_diagnostic_text("Sepsis sévère")
|
||||
Reference in New Issue
Block a user