feat: filtrage des DAS parasites (artefacts OCR Trackare)

Nouveau module das_filter.py avec 7 règles de rejet (trop court, chiffres,
lettre+chiffres OCR, mots concaténés/répétés, fragments non médicaux) et
nettoyage des retours à la ligne et de la ponctuation. Filtrage appliqué aux
3 sources de DAS : Trackare, regex et edsnlp. 31 tests unitaires.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-11 17:48:25 +01:00
parent 96ccb4850f
commit 31c29078a1
4 changed files with 172 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ logger = logging.getLogger(__name__)
from .cim10_dict import lookup as dict_lookup, normalize_text
from .ccam_dict import lookup as ccam_lookup, validate_code as ccam_validate
from .das_filter import clean_diagnostic_text, is_valid_diagnostic_text
from ..config import (
ActeCCAM,
BiologieCle,
@@ -204,8 +205,11 @@ def _extract_diagnostics(
# Diagnostics codés depuis Trackare (prioritaires)
for diag in parsed.get("diagnostics", []):
texte = clean_diagnostic_text(diag.get("libelle", ""))
if not is_valid_diagnostic_text(texte):
continue
d = Diagnostic(
texte=diag.get("libelle", ""),
texte=texte,
cim10_suggestion=diag.get("code_cim10"),
)
if diag.get("type", "").lower() == "principal":
@@ -245,6 +249,7 @@ def _extract_diagnostics(
# Diagnostics associés depuis le texte (regex)
das = _find_diagnostics_associes(text_lower, conclusion, dossier)
das = [d for d in das if is_valid_diagnostic_text(d.texte)]
dossier.diagnostics_associes.extend(das)
# Enrichissement DAS depuis edsnlp
@@ -258,9 +263,12 @@ def _extract_diagnostics(
for ent in edsnlp_result.cim10_entities:
if ent.negation or ent.hypothese:
continue
texte = clean_diagnostic_text(ent.texte.capitalize())
if not is_valid_diagnostic_text(texte):
continue
if ent.code not in existing_codes:
dossier.diagnostics_associes.append(Diagnostic(
texte=ent.texte.capitalize(),
texte=texte,
cim10_suggestion=ent.code,
))
existing_codes.add(ent.code)