feat: sanitisation déterministe des codes CIM-10 hors périmètre CPAM
Le LLM (deepseek) propose systématiquement des codes alternatifs (D62, T81.0, T80, R39.2) malgré l'interdiction dans le prompt. Ces codes déclenchaient des warnings CRITIQUE → Tier C automatique.

Solution conforme au principe "LLM propose, moteur de règles dispose" :
- _sanitize_unauthorized_codes() supprime les codes hors whitelist du texte de la réponse AVANT toute validation
- Nettoyage propre : "D62 — libellé" → "libellé", "(D62)" → ""
- _build_whitelist_prefixes() factorisé en helper partagé
- Sanitisation appliquée après génération ET après correction
- 9 tests unitaires couvrant tous les cas (parenthèses, tirets, multiple)

Résultat live : 0 warning CRITIQUE "code hors périmètre" sur 3 dossiers (vs 6 warnings CRITIQUE avant). Le seul CRITIQUE restant est le score adversarial bas, qui reflète des limites de raisonnement du modèle.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -110,6 +110,145 @@ def _validate_references(parsed: dict, sources: list[dict]) -> list[str]:
|
||||
# Regex pour capturer les codes CIM-10 (ex: K81.0, E87, Z45.80)
|
||||
_CIM10_CODE_RE = re.compile(r"\b([A-Z]\d{2}\.?\d{0,2})\b")
|
||||
|
||||
# Champs textuels de la réponse LLM à scanner pour les codes CIM-10
|
||||
_TEXT_FIELDS = (
|
||||
"analyse_contestation",
|
||||
"contre_arguments_medicaux",
|
||||
"contre_arguments_asymetrie",
|
||||
"contre_arguments_reglementaires",
|
||||
"conclusion",
|
||||
)
|
||||
|
||||
|
||||
def _build_whitelist_prefixes(
    dossier: DossierMedical,
    controle: ControleCPAM,
) -> set[str]:
    """Build the whitelist of authorised CIM-10 prefixes (3 characters).

    Candidate codes come from two places:

    * the dossier: ``cim10_suggestion`` of the principal diagnosis and of
      every associated diagnosis;
    * the control: the ``dp_ucr``, ``da_ucr`` and ``dr_ucr`` fields, each
      split on commas, semicolons and whitespace.

    Returns:
        The first three characters of every candidate that
        ``normalize_code`` turns into a code of at least three characters.
    """
    candidates: list[str] = []

    # Codes carried by the medical dossier.
    principal = dossier.diagnostic_principal
    if principal and principal.cim10_suggestion:
        candidates.append(principal.cim10_suggestion)
    candidates.extend(
        das.cim10_suggestion
        for das in dossier.diagnostics_associes
        if das.cim10_suggestion
    )

    # Codes carried by the control; one field may list several codes.
    for ucr_value in (controle.dp_ucr, controle.da_ucr, controle.dr_ucr):
        if ucr_value:
            candidates.extend(re.split(r"[,;\s]+", ucr_value.strip()))

    prefixes: set[str] = set()
    for candidate in candidates:
        stripped = candidate.strip()
        if not stripped:
            continue
        normalized = normalize_code(stripped)
        if normalized and len(normalized) >= 3:
            prefixes.add(normalized[:3])
    return prefixes
|
||||
|
||||
|
||||
# Patterns used to remove an out-of-scope code together with the punctuation
# that would otherwise be left dangling. They are applied in this order:
#   "D62 — libellé" → "libellé"
#   "(D62)"         → ""
#   "D62"           → ""
_SANITIZE_PATTERNS = [
    # "CODE — label" or "CODE - label": the code and the following dash.
    re.compile(r"\b[A-Z]\d{2}\.?\d{0,2}\s*[—–\-]\s*"),
    # "(CODE)", with optional spaces inside the parentheses.
    re.compile(r"\(\s*[A-Z]\d{2}\.?\d{0,2}\s*\)"),
    # A bare "CODE".
    _CIM10_CODE_RE,
]
|
||||
|
||||
|
||||
def _sanitize_unauthorized_codes(
    parsed: dict,
    dossier: DossierMedical,
    controle: ControleCPAM,
) -> list[str]:
    """Remove out-of-scope CIM-10 codes from the textual fields of the response.

    ``parsed`` is modified in place. This applies the "LLM proposes, rule
    engine disposes" principle: the surrounding text is kept, but codes whose
    3-character prefix is not in the whitelist built from ``dossier`` and
    ``controle`` are stripped so that they no longer trigger CRITIQUE
    warnings.

    The fields processed are every string value under the ``_TEXT_FIELDS``
    keys, plus the string ``"valeur"`` of each dict in
    ``parsed["preuves_dossier"]``. After substitution, every processed string
    is tidied (empty parentheses removed, runs of spaces collapsed, ends
    stripped) and written back only if it differs from the original.

    Args:
        parsed: Parsed LLM response; mutated in place.
        dossier: Medical dossier providing the authorised diagnosis codes.
        controle: CPAM control providing the authorised UCR codes.

    Returns:
        The removed codes, de-duplicated, in order of first removal (used
        for logging). Empty when the whitelist is empty, in which case
        ``parsed`` is left untouched.
    """
    whitelist = _build_whitelist_prefixes(dossier, controle)
    if not whitelist:
        # With no reference code nothing can be judged out of scope.
        return []

    removed: list[str] = []

    def _is_authorized(code_str: str) -> bool:
        """Tell whether the code's 3-character prefix is whitelisted."""
        norm = normalize_code(code_str)
        return bool(norm and len(norm) >= 3 and norm[:3] in whitelist)

    def _replace_code(match: re.Match) -> str:
        """Substitution callback: keep authorised codes, drop the others."""
        # The match may include decoration (dash, parentheses); isolate the
        # code itself before checking it against the whitelist.
        code_match = _CIM10_CODE_RE.search(match.group(0))
        if code_match is None or _is_authorized(code_match.group(0)):
            return match.group(0)
        code = code_match.group(0)
        if code not in removed:
            removed.append(code)
        return ""

    def _clean(text: str) -> str:
        """Apply every sanitize pattern, then tidy leftover artefacts."""
        for pattern in _SANITIZE_PATTERNS:
            text = pattern.sub(_replace_code, text)
        # Artefact cleanup: empty parentheses, then repeated spaces.
        text = re.sub(r"\(\s*\)", "", text)
        return re.sub(r" +", " ", text).strip()

    # Top-level textual fields.
    for key in _TEXT_FIELDS:
        value = parsed.get(key)
        if value and isinstance(value, str):
            cleaned = _clean(value)
            if cleaned != value:
                parsed[key] = cleaned

    # The "valeur" of each evidence entry gets the same treatment.
    preuves = parsed.get("preuves_dossier")
    if preuves and isinstance(preuves, list):
        for preuve in preuves:
            if not isinstance(preuve, dict):
                continue
            value = preuve.get("valeur", "")
            if value and isinstance(value, str):
                cleaned = _clean(value)
                if cleaned != value:
                    preuve["valeur"] = cleaned

    for code in removed:
        norm = normalize_code(code)
        is_valid, label = validate_code(norm)
        label_str = f" ({label})" if is_valid and label else ""
        logger.info("Sanitize : code %s%s hors périmètre supprimé du texte", code, label_str)

    return removed
|
||||
|
||||
|
||||
def _validate_codes_in_response(
|
||||
parsed: dict,
|
||||
@@ -125,43 +264,13 @@ def _validate_codes_in_response(
|
||||
Returns:
|
||||
Liste de warnings pour les codes hors périmètre.
|
||||
"""
|
||||
# 1. Construire la whitelist (préfixes 3 chars)
|
||||
whitelist_prefixes: set[str] = set()
|
||||
|
||||
def _add_code(raw: str) -> None:
|
||||
raw = raw.strip()
|
||||
if not raw:
|
||||
return
|
||||
norm = normalize_code(raw)
|
||||
if norm and len(norm) >= 3:
|
||||
whitelist_prefixes.add(norm[:3])
|
||||
|
||||
# Codes du dossier
|
||||
if dossier.diagnostic_principal and dossier.diagnostic_principal.cim10_suggestion:
|
||||
_add_code(dossier.diagnostic_principal.cim10_suggestion)
|
||||
for das in dossier.diagnostics_associes:
|
||||
if das.cim10_suggestion:
|
||||
_add_code(das.cim10_suggestion)
|
||||
|
||||
# Codes de l'UCR
|
||||
for field in (controle.dp_ucr, controle.da_ucr, controle.dr_ucr):
|
||||
if not field:
|
||||
continue
|
||||
for raw in re.split(r"[,;\s]+", field.strip()):
|
||||
_add_code(raw)
|
||||
|
||||
whitelist_prefixes = _build_whitelist_prefixes(dossier, controle)
|
||||
if not whitelist_prefixes:
|
||||
return []
|
||||
|
||||
# 2. Extraire les codes CIM-10 de la réponse LLM (hors citations RAG)
|
||||
# 2. Extraire les codes CIM-10 de la réponse LLM
|
||||
text_fields = []
|
||||
for key in (
|
||||
"analyse_contestation",
|
||||
"contre_arguments_medicaux",
|
||||
"contre_arguments_asymetrie",
|
||||
"contre_arguments_reglementaires",
|
||||
"conclusion",
|
||||
):
|
||||
for key in _TEXT_FIELDS:
|
||||
val = parsed.get(key)
|
||||
if val and isinstance(val, str):
|
||||
text_fields.append(val)
|
||||
|
||||
Reference in New Issue
Block a user