feat: architecture multi-modèles LLM + quality engine + benchmark
- Multi-modèles : 4 rôles LLM (coding=gemma3:27b-cloud, cpam=gemma3:27b-cloud, validation=deepseek-v3.2:cloud, qc=gemma3:12b) avec get_model(role) - Prompts externalisés : 7 templates dans src/prompts/templates.py - Cache Ollama : modèle stocké par entrée (migration auto ancien format) - call_ollama() : paramètre role= (priorité: model > role > global) - Quality engine : veto_engine + decision_engine + rules_router (YAML) - Benchmark qualité : scripts/benchmark_quality.py (A/B, métriques CIM-10) - Fix biologie : valeurs qualitatives (troponine négative) non filtrées - Fix CPAM : gemma3:27b-cloud au lieu de deepseek (JSON tronqué par thinking) - CPAM max_tokens 4000→6000, viewer admin multi-modèles - Benchmark 10 dossiers : 100% DAS valides, 10/10 CPAM, 243s/dossier Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
137
batch_50.sh
Executable file
137
batch_50.sh
Executable file
@@ -0,0 +1,137 @@
|
||||
#!/usr/bin/env bash
#
# batch_50.sh — run N dossiers through the T2A pipeline, then compute quality
# stats (verdicts, top vetos, HARD counts, downgrades) for this run only.
#
# Usage: ./batch_50.sh [ROOT_INPUT] [N]
# Env :
#   FORCE=1        reprocess even if the output JSON already exists
#   CLEAN=1        delete previous outputs for a dossier before reprocessing
#   RANDOM_PICK=1  pick N random dossiers instead of the first N (sorted)
#   OLLAMA_MAX_PARALLEL / OLLAMA_CODER_MODEL / OLLAMA_VERIFIER_MODEL
set -euo pipefail

ROOT_INPUT="${1:-input}"   # e.g. input
N="${2:-50}"               # number of dossiers to process

# Options
FORCE="${FORCE:-0}"             # FORCE=1 => reprocess even if output JSON exists
CLEAN="${CLEAN:-0}"             # CLEAN=1 => wipe dossier outputs first (recommended with FORCE)
RANDOM_PICK="${RANDOM_PICK:-0}" # RANDOM_PICK=1 => pick N random dossiers
MAX_PARALLEL="${OLLAMA_MAX_PARALLEL:-1}"

# Models (overridable via environment)
export OLLAMA_CODER_MODEL="${OLLAMA_CODER_MODEL:-gemma3:27b}"
export OLLAMA_VERIFIER_MODEL="${OLLAMA_VERIFIER_MODEL:-deepseek-v3.2:cloud}"
export OLLAMA_MAX_PARALLEL="$MAX_PARALLEL"

# Python from the project venv (works whether or not the venv is activated)
PY="./.venv/bin/python"
if [[ ! -x "$PY" ]]; then
  echo "❌ Venv introuvable: $PY" >&2
  echo "   Active ton venv ou crée-le, puis relance." >&2
  exit 1
fi

RUN_ID="$(date +%Y%m%d_%H%M%S)"
RUN_DIR="output/batch_runs/$RUN_ID"
LOG_DIR="output/batch_logs/$RUN_ID"
mkdir -p "$RUN_DIR" "$LOG_DIR"

IDS_FILE="$RUN_DIR/ids.txt"
FILES_FILE="$RUN_DIR/files.txt"

echo "=== Batch Run: $RUN_ID ===" | tee "$RUN_DIR/summary.txt"
echo "ROOT_INPUT=$ROOT_INPUT N=$N FORCE=$FORCE CLEAN=$CLEAN RANDOM_PICK=$RANDOM_PICK" | tee -a "$RUN_DIR/summary.txt"
echo "CODER=$OLLAMA_CODER_MODEL VERIFIER=$OLLAMA_VERIFIER_MODEL OLLAMA_MAX_PARALLEL=$OLLAMA_MAX_PARALLEL" | tee -a "$RUN_DIR/summary.txt"
echo | tee -a "$RUN_DIR/summary.txt"

# Candidate dossiers = first-level sub-directories containing at least 1 PDF
mapfile -t ALL_DIRS < <(find "$ROOT_INPUT" -mindepth 1 -maxdepth 1 -type d -print | while IFS= read -r d; do
  compgen -G "$d/*.pdf" >/dev/null && echo "$d"
done)

if [[ "${#ALL_DIRS[@]}" -eq 0 ]]; then
  echo "❌ Aucun dossier avec PDF trouvé dans: $ROOT_INPUT" >&2
  exit 1
fi

# Select N dossiers (random, or deterministic alphabetical order)
if [[ "$RANDOM_PICK" == "1" ]]; then
  mapfile -t DOSSIERS < <(printf "%s\n" "${ALL_DIRS[@]}" | shuf | head -n "$N")
else
  mapfile -t DOSSIERS < <(printf "%s\n" "${ALL_DIRS[@]}" | sort | head -n "$N")
fi

echo "→ Dossiers sélectionnés: ${#DOSSIERS[@]}" | tee -a "$RUN_DIR/summary.txt"

# Process each dossier. FIX: under `set -euo pipefail`, a single failing
# pipeline run used to abort the whole batch; report the failure and continue.
for d in "${DOSSIERS[@]}"; do
  id="$(basename "$d")"
  out_json="output/structured/$id/${id}_fusionne_cim10.json"
  log="$LOG_DIR/${id}.log"

  # Record the ID (used by the end-of-run stats)
  echo "$id" >> "$IDS_FILE"

  if [[ -f "$out_json" && "$FORCE" != "1" ]]; then
    echo "⏭️ SKIP $id (déjà traité)" | tee -a "$RUN_DIR/summary.txt"
    continue
  fi

  if [[ "$CLEAN" == "1" ]]; then
    rm -rf "output/structured/$id" "output/reports/$id" "output/anonymized/$id" 2>/dev/null || true
  fi

  echo "▶️ START $id" | tee -a "$RUN_DIR/summary.txt"
  if "$PY" -m src.main "$d" 2>&1 | tee "$log"; then
    echo "✅ DONE $id" | tee -a "$RUN_DIR/summary.txt"
  else
    echo "❌ FAIL $id (voir $log)" | tee -a "$RUN_DIR/summary.txt"
  fi
done

# Build the list of JSON files actually produced for this run.
# FIX: use `if` instead of a trailing `[[ … ]] && …` so a missing file on the
# last iteration does not make the loop exit non-zero under `set -e`.
: > "$FILES_FILE"
while IFS= read -r id; do
  f="output/structured/$id/${id}_fusionne_cim10.json"
  if [[ -f "$f" ]]; then
    echo "$f" >> "$FILES_FILE"
  fi
done < "$IDS_FILE"

COUNT_FILES=$(wc -l < "$FILES_FILE" | tr -d ' ')
echo | tee -a "$RUN_DIR/summary.txt"
echo "→ JSON trouvés pour stats: $COUNT_FILES" | tee -a "$RUN_DIR/summary.txt"

if [[ "$COUNT_FILES" -eq 0 ]]; then
  echo "⚠️ Aucun JSON pour stats. Fin." | tee -a "$RUN_DIR/summary.txt"
  exit 0
fi

echo | tee -a "$RUN_DIR/summary.txt"
echo "=== STATS (sur ce run uniquement) ===" | tee -a "$RUN_DIR/summary.txt"

# 1) Verdicts
echo "--- Verdicts ---" | tee -a "$RUN_DIR/summary.txt"
xargs -a "$FILES_FILE" jq -r '(.veto_report.verdict // "NO_REPORT")' \
  | sort | uniq -c | sort -nr | tee -a "$RUN_DIR/summary.txt"

# 2) Top VETOs
# FIX: `|| true` — `head` may SIGPIPE the upstream sort (exit 141), which
# would abort the script through pipefail.
echo | tee -a "$RUN_DIR/summary.txt"
echo "--- Top VETOs ---" | tee -a "$RUN_DIR/summary.txt"
xargs -a "$FILES_FILE" jq -r '.veto_report.issues[]?.veto' \
  | sort | uniq -c | sort -nr | head -n 20 | tee -a "$RUN_DIR/summary.txt" || true

# 3) HARD count per dossier
echo | tee -a "$RUN_DIR/summary.txt"
echo "--- Dossiers avec HARD ---" | tee -a "$RUN_DIR/summary.txt"
while IFS= read -r f; do
  id="$(basename "$f" _fusionne_cim10.json)"
  hard=$(jq '[.veto_report.issues[]? | select(.severity=="HARD")] | length' "$f")
  if [[ "$hard" -gt 0 ]]; then
    printf "%s\tHARD=%s\n" "$id" "$hard"
  fi
done < "$FILES_FILE" | sort -k2,2nr | tee -a "$RUN_DIR/summary.txt"

# 4) Downgrades (cim10_final != cim10_suggestion)
echo | tee -a "$RUN_DIR/summary.txt"
echo "--- Downgrades (TOP 30) ---" | tee -a "$RUN_DIR/summary.txt"
while IFS= read -r f; do
  id="$(basename "$f" _fusionne_cim10.json)"
  dw=$(jq '
    ([
      (.diagnostic_principal? | select(.cim10_final? and .cim10_suggestion? and .cim10_final != .cim10_suggestion) | 1),
      (.diagnostics_associes[]? | select(.cim10_final? and .cim10_suggestion? and .cim10_final != .cim10_suggestion) | 1)
    ] | add) // 0
  ' "$f")
  if [[ "$dw" -gt 0 ]]; then
    printf "%s\tDOWN=%s\n" "$id" "$dw"
  fi
done < "$FILES_FILE" | sort -k2,2nr | head -n 30 | tee -a "$RUN_DIR/summary.txt" || true

echo | tee -a "$RUN_DIR/summary.txt"
echo "✅ Stats écrites dans: $RUN_DIR/summary.txt"
echo "📁 Logs dossier par dossier: $LOG_DIR/"
||||
34
config/bio_rules.yaml
Normal file
34
config/bio_rules.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
version: 2
|
||||
|
||||
# Règles biologiques (contradiction bio ⇒ ruled_out)
|
||||
# + garde-fou "preuve manquante" (diag d'ionogramme sans valeur extraite ⇒ NEED_INFO)
|
||||
#
|
||||
# Objectif: éviter des FAIL "bêtes" quand la biologie contredit clairement un diagnostic,
|
||||
# et éviter des PASS "trop optimistes" quand on n'a même pas la valeur biologique.
|
||||
#
|
||||
# Hiérarchie des seuils:
|
||||
# - Priorité aux normes du document (ex: [N: 135-145])
|
||||
# - Sinon fallback config/reference_ranges.yaml
|
||||
# - Si âge inconnu/enfant: safe zones conservatrices (reference_ranges.yaml)
|
||||
|
||||
missing_evidence:
|
||||
enabled: true
|
||||
veto: VETO-17
|
||||
severity: LOW
|
||||
score_penalty: 2
|
||||
|
||||
rules:
|
||||
hyponatremia:
|
||||
enabled: true
|
||||
codes: ["E87.1"] # hyponatrémie
|
||||
analyte: sodium
|
||||
|
||||
hyperkalemia:
|
||||
enabled: true
|
||||
codes: ["E87.5"] # hyperkaliémie
|
||||
analyte: potassium
|
||||
|
||||
hypokalemia:
|
||||
enabled: true
|
||||
codes: ["E87.6"] # hypokaliémie
|
||||
analyte: potassium
|
||||
62
config/lab_value_sanity.yaml
Normal file
62
config/lab_value_sanity.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
# Garde-fous de parsing des valeurs biologiques
|
||||
# ------------------------------------------------
|
||||
# Objectif: éviter des faux positifs dus à des artefacts PDF/OCR
|
||||
# (ex: "8" au lieu de "4.8" pour le potassium).
|
||||
#
|
||||
# IMPORTANT:
|
||||
# - Ce fichier ne définit PAS des "normes biologiques" (ça c'est reference_ranges.yaml)
|
||||
# - Ici on définit des bornes *plausibles* très larges + quelques heuristiques "anti-OCR".
|
||||
#
|
||||
# Clés des tests: minuscules, sans accents, ex: potassium, sodium, plaquettes, hemoglobine...
|
||||
version: 1
|
||||
|
||||
policy:
|
||||
drop_out_of_range: true # écarte les valeurs hors bornes plausibles du dossier
|
||||
keep_suspect: true # conserve les valeurs suspectes (audit) mais les règles privilégient les valeurs ok
|
||||
|
||||
tests:
|
||||
potassium:
|
||||
hard_min: 0.5
|
||||
hard_max: 9.0
|
||||
suspect:
|
||||
single_digit_over: 6.0 # "8" seul est souvent une décimale perdue (4,8 -> 8)
|
||||
|
||||
sodium:
|
||||
hard_min: 90
|
||||
hard_max: 200
|
||||
|
||||
plaquettes:
|
||||
hard_min: 5
|
||||
hard_max: 2000
|
||||
|
||||
hemoglobine:
|
||||
hard_min: 3
|
||||
hard_max: 25
|
||||
|
||||
creatinine:
|
||||
hard_min: 1
|
||||
hard_max: 5000
|
||||
|
||||
crp:
|
||||
hard_min: 0
|
||||
hard_max: 1000
|
||||
|
||||
alat:
|
||||
hard_min: 0
|
||||
hard_max: 5000
|
||||
|
||||
asat:
|
||||
hard_min: 0
|
||||
hard_max: 5000
|
||||
|
||||
ggt:
|
||||
hard_min: 0
|
||||
hard_max: 5000
|
||||
|
||||
pal:
|
||||
hard_min: 0
|
||||
hard_max: 5000
|
||||
|
||||
bilirubine totale:
|
||||
hard_min: 0
|
||||
hard_max: 2000
|
||||
30
config/reference_ranges.yaml
Normal file
30
config/reference_ranges.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
# Références biologiques (fallback) pour règles de qualité (VETO-09 / ruled_out)
|
||||
# Ordre de priorité recommandé:
|
||||
# 1) Normes du document (ex: "[N: 135-145]")
|
||||
# 2) Fallback ci-dessous (par bande d'âge)
|
||||
# 3) Safe zones conservatrices si âge inconnu (évite les faux "barrés")
|
||||
|
||||
version: 1
|
||||
|
||||
age_bands:
|
||||
adult_min_years: 18
|
||||
|
||||
fallback_ranges:
|
||||
adult:
|
||||
platelets: { low: 150, high: 450, unit: "G/L" }
|
||||
sodium: { low: 135, high: 145, unit: "mmol/L" }
|
||||
potassium: { low: 3.5, high: 5.0, unit: "mmol/L" }
|
||||
|
||||
# Pédiatrie: à affiner par tranches d'âge si besoin.
|
||||
# NB: pour les décisions "ruled_out" avec âge inconnu, on applique plutôt les safe zones.
|
||||
child:
|
||||
platelets: { low: 150, high: 450, unit: "G/L" }
|
||||
sodium: { low: 135, high: 145, unit: "mmol/L" }
|
||||
potassium: { low: 3.5, high: 5.0, unit: "mmol/L" }
|
||||
|
||||
# Seuils "safe" quand l'âge n'est pas connu (plus conservateurs que les bornes normales)
|
||||
safe_zones_unknown_age:
|
||||
platelets_ruled_out_low: 170 # si PLT >= 170 -> thrombopénie ruled_out
|
||||
sodium_ruled_out_low: 138 # si Na >= 138 -> hyponatrémie ruled_out
|
||||
potassium_ruled_out_high: 4.9 # si K <= 4.9 -> hyperkaliémie ruled_out
|
||||
potassium_ruled_out_low: 3.7 # si K >= 3.7 -> hypokaliémie ruled_out
|
||||
68
config/rules/README.md
Normal file
68
config/rules/README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# Règles (vetos + décisions)
|
||||
|
||||
Ce dossier contient la configuration "métier" pour piloter le moteur qualité.
|
||||
|
||||
## Fichiers
|
||||
|
||||
- `base.yaml` : socle commun (règles activées par défaut).
|
||||
- `enabled.yaml` : choisit les overlays à activer (site/spécialité).
|
||||
- `specialties/*.yaml` : overrides par spécialité.
|
||||
- `sites/*.yaml` : overrides par établissement.
|
||||
|
||||
## Principe
|
||||
|
||||
- Une règle **non listée** est considérée **activée**.
|
||||
- Ça évite de casser le comportement historique lors d'une montée de version.
|
||||
- Une règle listée peut être :
|
||||
- `enabled: false` → désactivée
|
||||
- (VETO) `force_severity: "HARD"|"MEDIUM"|"LOW"` → force la sévérité
|
||||
|
||||
## Exemple d'override
|
||||
|
||||
Créer `config/rules/sites/chu_poitiers.yaml` :
|
||||
|
||||
```yaml
|
||||
version: 1
|
||||
rules:
|
||||
VETO-12:
|
||||
enabled: false
|
||||
VETO-09:
|
||||
force_severity: "HARD"
|
||||
```
|
||||
|
||||
Puis activer dans `enabled.yaml` :
|
||||
|
||||
```yaml
|
||||
active:
|
||||
site: "chu_poitiers"
|
||||
specialty: ""
|
||||
extra: []
|
||||
```
|
||||
|
||||
|
||||
## Routage automatique (router.yaml)
|
||||
|
||||
Le fichier `router.yaml` permet d’activer automatiquement des **packs** de règles en fonction des signaux du dossier (codes, biologie, extraits). Concrètement :
|
||||
|
||||
- Par défaut, seuls les packs listés dans `defaults.enabled_packs` sont actifs.
|
||||
- Quand un trigger match, on ajoute ses `enable_packs`.
|
||||
- Le routage est appliqué **par dossier** (et re-appliqué sur la version fusionnée).
|
||||
|
||||
### Mode strict
|
||||
|
||||
Quand `mode: strict`, une règle *non listée* dans `base.yaml` est considérée **désactivée** dès que le routage runtime est actif.
|
||||
|
||||
Ça force une approche “catalogue explicite” : tout ce qui tourne en prod est visible et gouvernable.
|
||||
|
||||
### Exemple
|
||||
|
||||
Activer les règles ionogramme uniquement si un code `E87.*` est détecté ou si la biologie mentionne Sodium/Potassium :
|
||||
|
||||
```yaml
|
||||
triggers:
|
||||
- id: TRG-ELECTROLYTES
|
||||
enable_packs: [bio_electrolytes]
|
||||
when_any:
|
||||
codes_prefix: ["E87."]
|
||||
lab_tests: ["sodium", "potassium"]
|
||||
```
|
||||
82
config/rules/base.yaml
Normal file
82
config/rules/base.yaml
Normal file
@@ -0,0 +1,82 @@
|
||||
version: 1
|
||||
|
||||
# Catalogue "socle" de règles.
|
||||
#
|
||||
# Objectif : piloter (sans toucher au code) :
|
||||
# - l'activation/désactivation de règles (vetos + décisions)
|
||||
# - éventuellement un forçage de sévérité pour un VETO
|
||||
#
|
||||
# Important : si une règle n'est pas listée ici, elle est considérée activée.
|
||||
# (=> comportement historique conservé)
|
||||
|
||||
packs:
|
||||
vetos_core:
|
||||
enabled: true
|
||||
rules:
|
||||
VETO-02:
|
||||
enabled: true
|
||||
description: "Code sans preuve exploitable"
|
||||
VETO-03:
|
||||
enabled: true
|
||||
description: "Conditionnel / négation / contradictions dans la preuve"
|
||||
VETO-06:
|
||||
enabled: true
|
||||
description: "DP dupliqué dans les DAS"
|
||||
VETO-07:
|
||||
enabled: true
|
||||
description: "Doublons DAS"
|
||||
VETO-09:
|
||||
enabled: true
|
||||
description: "Contradiction biologique (plaquettes/créat)"
|
||||
# force_severity: "HARD" # Optionnel : forcer la sévérité globale
|
||||
VETO-12:
|
||||
enabled: true
|
||||
description: "Sur-confiance (high sans preuve)"
|
||||
VETO-15:
|
||||
enabled: true
|
||||
description: "Preuve issue d'un score/test (risque de sur-codage)"
|
||||
VETO-16:
|
||||
enabled: true
|
||||
description: "Heuristique libellé→code (hors-sujet probable)"
|
||||
VETO-17:
|
||||
enabled: true
|
||||
description: "Preuve biologique manquante => NEED_INFO (non bloquant)"
|
||||
|
||||
decisions_core:
|
||||
enabled: true
|
||||
rules:
|
||||
RULE-D50-NEEDS-IRON:
|
||||
enabled: true
|
||||
description: "D50 sans preuve martiale => downgrade D64.9 + NEED_INFO"
|
||||
RULE-D69.6-PLT-NORMAL:
|
||||
enabled: true
|
||||
description: "D69.6 incompatible avec plaquettes normales => ruled_out (barré)"
|
||||
|
||||
bio_electrolytes:
|
||||
enabled: true
|
||||
rules:
|
||||
RULE-E87.1-NA-NORMAL:
|
||||
enabled: true
|
||||
description: "E87.1 suggérée mais Na normal => ruled_out"
|
||||
RULE-E87.1-MISSING-NA:
|
||||
enabled: true
|
||||
description: "E87.1 suggérée mais Na absent => NEED_INFO"
|
||||
RULE-E87.5-K-NORMAL:
|
||||
enabled: true
|
||||
description: "E87.5 suggérée mais K normal => ruled_out"
|
||||
RULE-E87.5-MISSING-K:
|
||||
enabled: true
|
||||
description: "E87.5 suggérée mais K absent => NEED_INFO"
|
||||
RULE-E87.6-K-NORMAL:
|
||||
enabled: true
|
||||
description: "E87.6 suggérée mais K normal => ruled_out"
|
||||
RULE-E87.6-MISSING-K:
|
||||
enabled: true
|
||||
description: "E87.6 suggérée mais K absent => NEED_INFO"
|
||||
|
||||
placeholders_future:
|
||||
enabled: false
|
||||
rules:
|
||||
RULE-PDF-PROTECTED-NEED_INFO:
|
||||
enabled: false
|
||||
description: "PDF protégé => NEED_INFO (à implémenter si besoin)"
|
||||
12
config/rules/enabled.yaml
Normal file
12
config/rules/enabled.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
version: 1
|
||||
|
||||
# Sélection d'overlays (facile à brancher plus tard sur une UI).
|
||||
#
|
||||
# - specialty : charge config/rules/specialties/<specialty>.yaml
|
||||
# - site : charge config/rules/sites/<site>.yaml
|
||||
# - extra : charge des fichiers YAML additionnels (chemins relatifs à config/rules/)
|
||||
|
||||
active:
|
||||
specialty: ""
|
||||
site: ""
|
||||
extra: []
|
||||
35
config/rules/router.yaml
Normal file
35
config/rules/router.yaml
Normal file
@@ -0,0 +1,35 @@
|
||||
version: 1
|
||||
|
||||
# 'strict' => si un rule_id n'est pas listé dans base.yaml, il est considéré OFF
|
||||
# quand le routage runtime est actif (objectif: pro / pas de surprise).
|
||||
mode: strict
|
||||
|
||||
defaults:
|
||||
# Socle pro: toujours actif (peu coûteux, structure la contestabilité)
|
||||
enabled_packs:
|
||||
- vetos_core
|
||||
- decisions_core
|
||||
|
||||
# (Optionnel) règles toujours ON même si leur pack n'est pas actif
|
||||
always_on_rules: []
|
||||
|
||||
# Triggers : activer des packs additionnels seulement si le dossier a des signaux pertinents
|
||||
triggers:
|
||||
- id: TRG-ELECTROLYTES
|
||||
enable_packs: ["bio_electrolytes"]
|
||||
when_any:
|
||||
# Codes souvent porteurs d'ionogramme (hyponatrémie/hyperkaliémie/hypokaliémie)
|
||||
codes_prefix: ["E87."]
|
||||
# Ou biologie présente
|
||||
lab_tests: ["ionogramme", "sodium", "potassium", "na", "k"]
|
||||
# Ou texte
|
||||
keywords:
|
||||
- "ionogramme"
|
||||
- "hypokali"
|
||||
- "hyperkali"
|
||||
- "hyponatr"
|
||||
- "hypernatr"
|
||||
- "kaliémie"
|
||||
- "natrémie"
|
||||
- "sodium"
|
||||
- "potassium"
|
||||
9
config/rules/sites/_template.yaml
Normal file
9
config/rules/sites/_template.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
version: 1
|
||||
|
||||
# Overlay établissement (ex: chu_poitiers, clinique_x, etc.)
|
||||
# Ce fichier ne contient que des overrides.
|
||||
|
||||
rules:
|
||||
# Exemple : forcer VETO-09 en HARD
|
||||
# VETO-09:
|
||||
# force_severity: "HARD"
|
||||
13
config/rules/specialties/_template.yaml
Normal file
13
config/rules/specialties/_template.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
version: 1
|
||||
|
||||
# Overlay spécialité (ex: digestif, cardio, pneumo, onco...)
|
||||
# Ce fichier ne contient que des overrides.
|
||||
|
||||
rules:
|
||||
# Exemple : être plus strict sur le conditionnel
|
||||
# VETO-03:
|
||||
# force_severity: "MEDIUM"
|
||||
|
||||
# Exemple : désactiver un downgrade jugé trop agressif
|
||||
# RULE-D50-NEEDS-IRON:
|
||||
# enabled: false
|
||||
33
docs/prompts.md
Normal file
33
docs/prompts.md
Normal file
@@ -0,0 +1,33 @@
|
||||
# Prompts LLM — Pipeline T2A v2
|
||||
|
||||
7 prompts externalisés dans `src/prompts/templates.py`, importables via `from src.prompts import ...`.
|
||||
|
||||
| # | Template | Rôle LLM | Modèle par défaut | Temp. | max_tokens | Variables | Appelant |
|
||||
|---|----------|----------|-------------------|-------|------------|-----------|----------|
|
||||
| 1 | `CODING_CIM10` | coding | gemma3:27b-cloud | 0.1 | 2500 | texte, type_diag, ctx_str, sources_text | `rag_search._build_prompt()` |
|
||||
| 2 | `CODING_CCAM` | coding | gemma3:27b-cloud | 0.1 | 2500 | texte, ctx_str, sources_text | `rag_search._build_prompt_ccam()` |
|
||||
| 3 | `DAS_EXTRACTION` | coding | gemma3:27b-cloud | 0.1 | 2000 | dp_texte, existing_str, ctx_str, text_medical | `rag_search._build_prompt_das_extraction()` |
|
||||
| 4 | `QC_VALIDATION` | qc | gemma3:12b | 0.1 | 2500 | ctx_str, codes_section | `cim10_extractor._validate_justifications()` |
|
||||
| 5 | `CPAM_EXTRACTION` | cpam | deepseek-v3.2:cloud | 0.0 | 1500 | dp_str, das_str, tagged_text, titre, arg_ucr, decision_ucr, dp_ucr_line, da_ucr_line | `cpam_response._extraction_pass()` |
|
||||
| 6 | `CPAM_ARGUMENTATION` | cpam | deepseek-v3.2:cloud | 0.1 | 4000 | dossier_str, asymetrie_str, tagged_str, titre, arg_ucr, decision_ucr, codes_str, definitions_str, sources_text, extraction_str | `cpam_response._build_cpam_prompt()` |
|
||||
| 7 | `CPAM_ADVERSARIAL` | validation | deepseek-v3.2:cloud | 0.0 | 800 | response_json, factual_section, normes_section, dp_ucr_line, da_ucr_line | `cpam_response._validate_adversarial()` |
|
||||
|
||||
## Rôles LLM (config.py)
|
||||
|
||||
```python
|
||||
OLLAMA_MODELS = {
|
||||
"coding": "gemma3:27b-cloud", # Codage CIM-10/CCAM, extraction DAS
|
||||
"cpam": "deepseek-v3.2:cloud", # Passe 1 extraction + passe 2 argumentation CPAM
|
||||
"validation": "deepseek-v3.2:cloud", # Validation adversariale (DOIT différer du cpam en prod)
|
||||
"qc": "gemma3:12b", # Validation batch justifications, rapide
|
||||
}
|
||||
```
|
||||
|
||||
Surchargeable par env : `T2A_MODEL_CODING`, `T2A_MODEL_CPAM`, `T2A_MODEL_VALIDATION`, `T2A_MODEL_QC`.
|
||||
|
||||
## Priorité de résolution du modèle
|
||||
|
||||
`call_ollama(model=, role=)` :
|
||||
1. `model` explicite (prioritaire)
|
||||
2. `get_model(role)` si role fourni
|
||||
3. `OLLAMA_MODEL` global (fallback)
|
||||
@@ -14,3 +14,5 @@ flask>=3.0.0
|
||||
python-dotenv>=1.0.0
|
||||
openpyxl>=3.0.0
|
||||
pandas>=2.0.0
|
||||
PyMuPDF>=1.24.0
|
||||
PyYAML>=6.0
|
||||
|
||||
313
scripts/benchmark_models.py
Normal file
313
scripts/benchmark_models.py
Normal file
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark A/B : gemma3:12b (base) vs pmsi-coder-v2 (fine-tuné).
|
||||
|
||||
Compare les codes CIM-10 produits par les deux modèles sur N dossiers.
|
||||
Teste DP + DAS (échantillon) pour chaque dossier.
|
||||
|
||||
Usage: python scripts/benchmark_models.py [--n 50] [--das-max 5]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from src.config import STRUCTURED_DIR, OLLAMA_URL, DossierMedical
|
||||
from src.medical.cim10_dict import load_dict, normalize_code, validate_code
|
||||
|
||||
import requests
|
||||
|
||||
MODEL_BASE = "gemma3:12b"
|
||||
MODEL_FINETUNED = "pmsi-coder-v2"
|
||||
|
||||
PROMPT_TEMPLATE = """Tu es un médecin DIM expert en codage PMSI.
|
||||
Code le diagnostic suivant en CIM-10. Choisis le code le plus spécifique possible.
|
||||
|
||||
DIAGNOSTIC : "{texte}"
|
||||
TYPE : {type_diag}
|
||||
|
||||
{contexte}
|
||||
|
||||
Réponds UNIQUEMENT avec un objet JSON :
|
||||
{{"code": "X99.9", "confidence": "high|medium|low", "justification": "explication courte"}}"""
|
||||
|
||||
|
||||
def call_model(prompt: str, model: str, timeout: int = 120) -> tuple[dict | None, float]:
    """Call one Ollama model and return ``(parsed JSON or None, elapsed seconds)``.

    Any transport error or unparsable response yields ``None`` for the result;
    the elapsed time is always measured and returned.
    """
    started = time.time()
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1, "num_predict": 500},
    }
    try:
        resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=timeout)
        resp.raise_for_status()
        raw = resp.json().get("response", "")
    except Exception:
        # Network/HTTP failure: no result, but still report how long it took.
        return None, time.time() - started
    elapsed = time.time() - started
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return None, elapsed
    return parsed, elapsed
|
||||
|
||||
|
||||
def load_dossiers(n: int) -> list[dict]:
    """Load up to *n* merged dossiers, shuffled with a fixed seed.

    Scans each sub-directory of STRUCTURED_DIR for the first usable
    ``*fusionne*.json`` (skipping model-variant and backup copies), keeps
    only dossiers whose DP carries a CIM-10 suggestion, then shuffles
    deterministically (seed 42) for a reproducible sample.
    """
    selected: list[dict] = []
    for subdir in sorted(STRUCTURED_DIR.iterdir()):
        if not subdir.is_dir():
            continue
        for candidate in subdir.glob("*fusionne*.json"):
            # Skip per-model variants and backup files.
            if ".gemma_" in candidate.name or ".bak" in candidate.name:
                continue
            try:
                payload = json.loads(candidate.read_text(encoding="utf-8"))
                dossier = DossierMedical.model_validate(payload)
                if dossier.diagnostic_principal and dossier.diagnostic_principal.cim10_suggestion:
                    selected.append({
                        "name": subdir.name,
                        "dossier": dossier,
                        "path": str(candidate),
                    })
            except Exception:
                continue
            break  # at most one merged JSON per dossier directory
    random.seed(42)
    random.shuffle(selected)
    return selected[:n]
|
||||
|
||||
|
||||
def build_contexte(d: DossierMedical) -> str:
    """Build a one-line clinical context summary (age/sex, stay, DP, key labs)."""
    sejour = d.sejour
    fragments = []
    if sejour.age is not None:
        fragments.append(f"Patient {sejour.sexe or '?'}, {sejour.age} ans")
    if sejour.duree_sejour is not None:
        fragments.append(f"Durée séjour : {sejour.duree_sejour}j")
    if d.diagnostic_principal:
        fragments.append(f"DP : {d.diagnostic_principal.texte}")
    # At most five lab results, skipping entries without a value.
    labs = [f"{b.test}={b.valeur}" for b in d.biologie_cle[:5] if b.valeur]
    if labs:
        fragments.append(f"Bio : {', '.join(labs)}")
    return "CONTEXTE : " + " | ".join(fragments) if fragments else ""
|
||||
|
||||
|
||||
def code_match_level(code_a: str, code_b: str) -> str:
    """Classify agreement between two CIM-10 codes.

    Returns "exact" for identical codes, "categorie" when only the
    3-character category prefix matches, and "diff" otherwise.
    """
    if code_a == code_b:
        return "exact"
    return "categorie" if code_a[:3] == code_b[:3] else "diff"
|
||||
|
||||
|
||||
def run_benchmark(n: int = 50, das_max: int = 5):
    """Run the A/B benchmark (MODEL_BASE vs MODEL_FINETUNED) on *n* dossiers.

    For each dossier, the DP and up to *das_max* sampled DAS are coded by
    both models; per-diagnosis results are printed, summarised, and written
    to output/benchmark_ab.json.

    Fixes vs previous version:
    - DAS sampling seeded with the dossier name string instead of
      hash(name): str.__hash__ is randomized per interpreter run
      (PYTHONHASHSEED), so the old seed was not reproducible across runs.
    - Guard against ZeroDivisionError in the timing delta when avg_tb == 0.
    """
    print(f"=== Benchmark A/B : {MODEL_BASE} vs {MODEL_FINETUNED} ===")
    print(f"  Dossiers : {n}, DAS max/dossier : {das_max}\n")

    # Check that both models are reachable before spending time on dossiers.
    for model in [MODEL_BASE, MODEL_FINETUNED]:
        try:
            resp = requests.post(
                f"{OLLAMA_URL}/api/generate",
                json={"model": model, "prompt": "test", "stream": False,
                      "options": {"num_predict": 1}},
                timeout=60,
            )
            resp.raise_for_status()
            print(f"  {model} : OK")
        except Exception as e:
            print(f"  {model} : ERREUR — {e}")
            sys.exit(1)

    dossiers = load_dossiers(n)
    print(f"\nDossiers chargés : {len(dossiers)}\n")

    # NOTE(review): the returned dict was never used; the call is kept only in
    # case load_dict() warms a cache used by validate_code() — confirm.
    load_dict()
    t_global_start = time.time()

    dp_results = []
    das_results = []

    for i, item in enumerate(dossiers, 1):
        d = item["dossier"]
        dp = d.diagnostic_principal
        name = item["name"]
        ctx = build_contexte(d)

        # === DP ===
        prompt_dp = PROMPT_TEMPLATE.format(
            texte=dp.texte,
            type_diag="DP (diagnostic principal)",
            contexte=ctx,
        )
        res_base, t_base = call_model(prompt_dp, MODEL_BASE)
        res_ft, t_ft = call_model(prompt_dp, MODEL_FINETUNED)

        code_base = normalize_code(res_base.get("code", "")) if res_base else "ERREUR"
        code_ft = normalize_code(res_ft.get("code", "")) if res_ft else "ERREUR"
        conf_base = res_base.get("confidence", "?") if res_base else "?"
        conf_ft = res_ft.get("confidence", "?") if res_ft else "?"
        valid_base = validate_code(code_base)[0] if code_base != "ERREUR" else False
        valid_ft = validate_code(code_ft)[0] if code_ft != "ERREUR" else False

        pipeline_code = dp.cim10_suggestion
        match_level = code_match_level(code_base, code_ft)

        dp_result = {
            "dossier": name,
            "texte": dp.texte[:80],
            "pipeline": pipeline_code,
            "base": code_base,
            "ft": code_ft,
            "conf_base": conf_base,
            "conf_ft": conf_ft,
            "valid_base": valid_base,
            "valid_ft": valid_ft,
            "match": match_level,
            "t_base": round(t_base, 2),
            "t_ft": round(t_ft, 2),
        }
        dp_results.append(dp_result)

        tag = {"exact": "=", "categorie": "~", "diff": "X"}[match_level]
        print(f"  [{i:2d}/{len(dossiers)}] {name:<20s} DP=\"{dp.texte[:35]:<35s}\" "
              f"base={code_base:<7s} ft={code_ft:<7s} [{tag}] "
              f"({t_base:.1f}s / {t_ft:.1f}s)")

        # === DAS (sample) ===
        das_list = [das for das in d.diagnostics_associes
                    if das.texte and das.cim10_suggestion]
        if len(das_list) > das_max:
            # FIX: seed on the name string (stable) rather than hash(name).
            random.seed(name)
            das_list = random.sample(das_list, das_max)

        for das in das_list:
            prompt_das = PROMPT_TEMPLATE.format(
                texte=das.texte,
                type_diag="DAS (diagnostic associé significatif)",
                contexte=ctx,
            )
            res_b, tb = call_model(prompt_das, MODEL_BASE)
            res_f, tf = call_model(prompt_das, MODEL_FINETUNED)

            cb = normalize_code(res_b.get("code", "")) if res_b else "ERREUR"
            cf = normalize_code(res_f.get("code", "")) if res_f else "ERREUR"
            vb = validate_code(cb)[0] if cb != "ERREUR" else False
            vf = validate_code(cf)[0] if cf != "ERREUR" else False

            das_results.append({
                "dossier": name,
                "texte": das.texte[:80],
                "pipeline": das.cim10_suggestion,
                "base": cb,
                "ft": cf,
                "conf_base": (res_b or {}).get("confidence", "?"),
                "conf_ft": (res_f or {}).get("confidence", "?"),
                "valid_base": vb,
                "valid_ft": vf,
                "match": code_match_level(cb, cf),
                "t_base": round(tb, 2),
                "t_ft": round(tf, 2),
            })

    t_global = time.time() - t_global_start

    # === SUMMARY ===
    print(f"\n{'='*75}")
    print(f"RÉSUMÉ — {len(dp_results)} dossiers, {len(das_results)} DAS testés")
    print(f"Durée totale : {t_global/60:.1f} min\n")

    for label, results in [("DP", dp_results), ("DAS", das_results)]:
        if not results:
            continue
        nt = len(results)
        n_exact = sum(1 for r in results if r["match"] == "exact")
        n_cat = sum(1 for r in results if r["match"] == "categorie")
        n_diff = sum(1 for r in results if r["match"] == "diff")
        n_vb = sum(1 for r in results if r["valid_base"])
        n_vf = sum(1 for r in results if r["valid_ft"])
        avg_tb = sum(r["t_base"] for r in results) / nt
        avg_tf = sum(r["t_ft"] for r in results) / nt

        # Confidence histograms
        conf_b = {}
        conf_f = {}
        for r in results:
            conf_b[r["conf_base"]] = conf_b.get(r["conf_base"], 0) + 1
            conf_f[r["conf_ft"]] = conf_f.get(r["conf_ft"], 0) + 1

        # Concordance with the pipeline (original gemma run)
        n_base_eq_pipe = sum(1 for r in results if r["base"] == r["pipeline"])
        n_ft_eq_pipe = sum(1 for r in results if r["ft"] == r["pipeline"])
        n_base_cat_pipe = sum(1 for r in results
                              if r["base"][:3] == r["pipeline"][:3])
        n_ft_cat_pipe = sum(1 for r in results
                            if r["ft"][:3] == r["pipeline"][:3])

        print(f"  --- {label} ({nt} diagnostics) ---")
        print(f"  Concordance base↔ft :")
        print(f"    Exact     : {n_exact}/{nt} ({100*n_exact/nt:.0f}%)")
        print(f"    Catégorie : {n_exact+n_cat}/{nt} ({100*(n_exact+n_cat)/nt:.0f}%)")
        print(f"    Différent : {n_diff}/{nt} ({100*n_diff/nt:.0f}%)")
        print(f"  Codes valides :")
        print(f"    base : {n_vb}/{nt} ({100*n_vb/nt:.0f}%)")
        print(f"    ft   : {n_vf}/{nt} ({100*n_vf/nt:.0f}%)")
        print(f"  vs pipeline (gemma original) :")
        print(f"    base=pipe : {n_base_eq_pipe}/{nt} exact, {n_base_cat_pipe}/{nt} catégorie")
        print(f"    ft=pipe   : {n_ft_eq_pipe}/{nt} exact, {n_ft_cat_pipe}/{nt} catégorie")
        # FIX: avoid ZeroDivisionError when every base call failed instantly.
        delta_pct = 100 * (avg_tf - avg_tb) / avg_tb if avg_tb else 0.0
        print(f"  Temps moyen : base={avg_tb:.2f}s ft={avg_tf:.2f}s (Δ={delta_pct:+.0f}%)")
        print(f"  Confiance base : {conf_b}")
        print(f"  Confiance ft   : {conf_f}")
        print()

    # List DP-level disagreements
    diffs_dp = [r for r in dp_results if r["match"] == "diff"]
    if diffs_dp:
        print(f"  Différences DP ({len(diffs_dp)}) :")
        for r in diffs_dp:
            vb = "✓" if r["valid_base"] else "✗"
            vf = "✓" if r["valid_ft"] else "✗"
            print(f"    {r['dossier']:<18s} \"{r['texte'][:40]}\"")
            print(f"      base={r['base']:<7s}{vb} ft={r['ft']:<7s}{vf} pipe={r['pipeline']}")

    # Persist full results for later comparison
    out = {
        "meta": {
            "date": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "model_base": MODEL_BASE,
            "model_ft": MODEL_FINETUNED,
            "n_dossiers": len(dp_results),
            "n_das": len(das_results),
            "duration_min": round(t_global / 60, 1),
        },
        "dp": dp_results,
        "das": das_results,
    }
    out_path = Path(__file__).parent.parent / "output" / "benchmark_ab.json"
    out_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\nRésultats détaillés : {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Minimal CLI: dossier count and DAS sample size per dossier.
    cli = argparse.ArgumentParser()
    cli.add_argument("--n", type=int, default=50,
                     help="Nombre de dossiers à tester")
    cli.add_argument("--das-max", type=int, default=5,
                     help="Max DAS testés par dossier")
    opts = cli.parse_args()
    run_benchmark(opts.n, opts.das_max)
|
||||
689
scripts/benchmark_quality.py
Normal file
689
scripts/benchmark_quality.py
Normal file
@@ -0,0 +1,689 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark qualité T2A — validation end-to-end sur vrais dossiers.
|
||||
|
||||
Compare la qualité des codes CIM-10, vetos, downgrades et CPAM
|
||||
entre runs successifs. Chaque run est sauvegardé dans un répertoire
|
||||
isolé pour permettre des comparaisons A/B.
|
||||
|
||||
Usage:
|
||||
python scripts/benchmark_quality.py --n 10
|
||||
python scripts/benchmark_quality.py --n 10 --compare RUN_ID
|
||||
python scripts/benchmark_quality.py --dossiers 116_23065570,45_23183041
|
||||
python scripts/benchmark_quality.py --gold-standard
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from statistics import mean, median
|
||||
|
||||
# Project root (scripts/ lives one level below it).
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))  # make `src` importable when run as a script

BENCHMARKS_DIR = ROOT / "output" / "benchmarks"  # one sub-directory per benchmark run
GOLD_STANDARD_FILE = ROOT / "data" / "gold_standard" / "_selection.json"  # curated dossier list
INPUT_DIR = ROOT / "input"  # dossiers (PDF folders) to process
OUTPUT_DIR = ROOT / "output" / "structured"  # pipeline JSON outputs analyzed below
PY = str(ROOT / ".venv" / "bin" / "python3")  # venv interpreter used to spawn the pipeline
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sélection des dossiers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _gold_standard_ids() -> list[str]:
    """Load the dossier IDs listed in the gold-standard selection file."""
    if not GOLD_STANDARD_FILE.exists():
        print(f"ERREUR: {GOLD_STANDARD_FILE} introuvable")
        sys.exit(1)
    payload = json.loads(GOLD_STANDARD_FILE.read_text("utf-8"))
    # Entries look like "116_23065570/116_23065570_fusionne_cim10";
    # keep only the part before the first slash (the dossier ID).
    return [entry.split("/")[0] for entry in payload["dossiers"]]
|
||||
|
||||
|
||||
def select_dossiers(n: int, gold_standard: bool, specific: list[str] | None, seed: int = 42) -> list[str]:
    """Choose the dossiers to benchmark.

    Priority: an explicit list, then the gold standard, then a seeded
    random sample of N dossiers found under input/.
    """
    if specific:
        # Keep only the explicitly requested dossiers that actually exist.
        kept = []
        for dossier in specific:
            if (INPUT_DIR / dossier).is_dir():
                kept.append(dossier)
            else:
                print(f" WARN: dossier {dossier} introuvable dans input/")
        return kept

    if gold_standard:
        ids = _gold_standard_ids()
        return ids[:n] if n < len(ids) else ids

    # Otherwise: N dossiers from input/ — deterministic sort plus a seeded
    # shuffle so the sample is reproducible across runs.
    candidates = sorted(
        entry.name for entry in INPUT_DIR.iterdir()
        if entry.is_dir() and any(entry.glob("*.pdf"))
    )
    if not candidates:
        print("ERREUR: aucun dossier avec PDF dans input/")
        sys.exit(1)

    import random
    random.Random(seed).shuffle(candidates)
    return candidates[:n]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exécution pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_pipeline(dossier_id: str, clean: bool) -> tuple[float, bool]:
    """Run the full pipeline (`src.main`) on one dossier in a subprocess.

    Args:
        dossier_id: directory name under input/.
        clean: when True, delete this dossier's previous outputs first.

    Returns:
        (duration_seconds, success) — success is False on a non-zero exit
        code, on timeout (10 minutes), or on any unexpected exception.
    """
    input_path = INPUT_DIR / dossier_id

    if clean:
        # Drop stale outputs so this run is not polluted by a previous pass.
        for subdir in ["structured", "reports", "anonymized"]:
            target = ROOT / "output" / subdir / dossier_id
            if target.exists():
                shutil.rmtree(target)

    t0 = time.time()
    try:
        # Run via the venv interpreter so the pipeline sees its own deps.
        result = subprocess.run(
            [PY, "-m", "src.main", str(input_path)],
            capture_output=True,
            text=True,
            cwd=str(ROOT),
            timeout=600,  # 10 min max per dossier
        )
        duration = time.time() - t0
        if result.returncode != 0:
            # Show only the tail of stderr to keep the console readable.
            print(f" STDERR: {result.stderr[-500:]}")
            return duration, False
        return duration, True
    except subprocess.TimeoutExpired:
        return time.time() - t0, False
    except Exception as e:
        # Best-effort: report and keep benchmarking the remaining dossiers.
        print(f" EXCEPTION: {e}")
        return time.time() - t0, False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chargement dictionnaire CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_cim10_dict() -> dict[str, str]:
    """Load the CIM-10 dictionary directly from disk (bypasses the singleton).

    Supplement entries are merged in, but never override the main dictionary
    (setdefault keeps the primary label when a code exists in both files).
    """
    main_path = ROOT / "data" / "cim10_dict.json"
    extra_path = ROOT / "data" / "cim10_supplements.json"

    codes: dict[str, str] = {}
    if main_path.exists():
        codes = json.loads(main_path.read_text("utf-8"))
    if extra_path.exists():
        supplements = json.loads(extra_path.read_text("utf-8"))
        for code, label in supplements.items():
            codes.setdefault(code, label)
    return codes
|
||||
|
||||
|
||||
def normalize_code(code: str) -> str:
    """Normalize a CIM-10 code: K810 -> K81.0, k85.1 -> K85.1."""
    cleaned = code.strip().upper()
    # Codes longer than the 3-char category need a dot before the extension.
    if "." not in cleaned and len(cleaned) > 3:
        cleaned = f"{cleaned[:3]}.{cleaned[3:]}"
    return cleaned
|
||||
|
||||
|
||||
def is_valid_code(code: str, cim10: dict[str, str]) -> bool:
    """True when the code (normalized, or merely upper-cased) is in the dictionary."""
    normalized = normalize_code(code)
    if normalized in cim10:
        return True
    # Fallback: some dictionary entries may be stored without the dot.
    return code.upper().strip() in cim10
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analyse d'un dossier
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def find_merged_json(dossier_id: str) -> Path | None:
    """Locate a dossier's output JSON: the merged file first, else any *_cim10.json."""
    out_dir = OUTPUT_DIR / dossier_id
    if not out_dir.exists():
        return None
    merged = list(out_dir.glob("*fusionne_cim10.json"))
    if merged:
        return merged[0]
    fallback = list(out_dir.glob("*_cim10.json"))
    return fallback[0] if fallback else None
|
||||
|
||||
|
||||
def analyze_dossier(dossier_id: str, cim10: dict[str, str], duration: float) -> dict:
    """Parse a dossier's output JSON and extract quality metrics.

    Args:
        dossier_id: directory name under output/structured/.
        cim10: CIM-10 code -> label dictionary used for validity checks.
        duration: pipeline wall-clock time for this dossier, in seconds.

    Returns:
        A metrics dict; "success" is False when the JSON is missing or
        unreadable, in which case only dossier_id/time/success are present.
    """
    result = {
        "dossier_id": dossier_id,
        "processing_time_s": round(duration, 1),
        "success": False,
    }

    json_path = find_merged_json(dossier_id)
    if not json_path:
        return result

    try:
        data = json.loads(json_path.read_text("utf-8"))
    except (json.JSONDecodeError, OSError):
        # Corrupt/unreadable output is reported as a failed dossier.
        return result

    result["success"] = True

    # --- DP (principal diagnosis) ---
    # "final" code wins over the raw suggestion; a difference between the
    # two is counted as a downgrade.
    dp = data.get("diagnostic_principal", {})
    dp_code = dp.get("cim10_final") or dp.get("cim10_suggestion") or ""
    dp_suggestion = dp.get("cim10_suggestion") or ""
    result["dp"] = {
        "texte": (dp.get("texte") or "")[:80],
        "code_suggestion": dp_suggestion,
        "code_final": dp_code,
        "confidence": dp.get("cim10_confidence", ""),
        "has_code": bool(dp_code),
        "valid_code": is_valid_code(dp_code, cim10) if dp_code else False,
        "downgraded": bool(dp_code and dp_suggestion and dp_code != dp_suggestion),
    }

    # --- DAS (associated diagnoses) ---
    das_list = data.get("diagnostics_associes", [])
    das_codes = []
    das_conf = {"high": 0, "medium": 0, "low": 0}
    das_valid = 0
    das_no_code = 0
    das_downgraded = 0

    for d_item in das_list:
        code = d_item.get("cim10_final") or d_item.get("cim10_suggestion") or ""
        suggestion = d_item.get("cim10_suggestion") or ""
        conf = d_item.get("cim10_confidence", "low")

        if not code:
            das_no_code += 1
            continue

        das_codes.append(code)
        if conf in das_conf:
            das_conf[conf] += 1

        if is_valid_code(code, cim10):
            das_valid += 1

        if code and suggestion and code != suggestion:
            das_downgraded += 1

    n_das_with_code = len(das_codes)
    result["das"] = {
        "total": len(das_list),
        "with_code": n_das_with_code,
        "no_code": das_no_code,
        "valid": das_valid,
        "validity_rate": round(das_valid / n_das_with_code, 3) if n_das_with_code else 0,
        "confidence": das_conf,
        "downgraded": das_downgraded,
        "downgrade_rate": round(das_downgraded / n_das_with_code, 3) if n_das_with_code else 0,
        "codes_uniques": sorted(set(das_codes)),
    }

    # --- Dossier-level metrics reported by the pipeline ---
    metrics = data.get("metrics", {})
    result["metrics"] = {
        "das_active": metrics.get("das_active", 0),
        "das_removed": metrics.get("das_removed", 0),
        "das_ruled_out": metrics.get("das_ruled_out", 0),
    }

    # --- Veto report ---
    veto = data.get("veto_report", {})
    issues = veto.get("issues", [])
    result["veto"] = {
        "verdict": veto.get("verdict", "NO_REPORT"),
        "score": veto.get("score_contestabilite", 0),
        "issues_count": len(issues),
        "hard_count": sum(1 for i in issues if i.get("severity") == "HARD"),
        "top_issues": [i.get("veto", i.get("type", "?")) for i in issues[:5]],
    }

    # --- GHM estimation ---
    ghm = data.get("ghm_estimation")
    result["ghm"] = {
        "estimated": ghm is not None and bool(ghm),
        "cmd": ghm.get("cmd") if ghm else None,
        "severity": ghm.get("severity") if ghm else None,
        "ghm": ghm.get("ghm") if ghm else None,
    }

    # --- CPAM controls ---
    cpam = data.get("controles_cpam", [])
    result["cpam"] = {
        "controls_count": len(cpam),
        "has_response": any(bool(c.get("contre_argumentation")) for c in cpam),
        "sources_count": sum(len(c.get("sources_reponse", [])) for c in cpam),
    }

    # --- Key biology ---
    bio = data.get("biologie_cle", [])
    result["biologie"] = {
        "tests_count": len(bio),
        "anomalies": sum(1 for b in bio if b.get("anomalie")),
    }

    # --- Invalid CIM-10 codes (detail, prefixed by their origin) ---
    invalid_codes = []
    if dp_code and not is_valid_code(dp_code, cim10):
        invalid_codes.append(f"DP:{dp_code}")
    for code in das_codes:
        if not is_valid_code(code, cim10):
            invalid_codes.append(f"DAS:{code}")
    result["invalid_codes"] = invalid_codes

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Agrégation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def compute_aggregate(per_dossier: list[dict]) -> dict:
    """Aggregate per-dossier metrics across the whole run.

    Only dossiers with success=True contribute to rates; failures are
    counted in n_failed. Returns a minimal dict when nothing succeeded.
    """
    successful = [d for d in per_dossier if d.get("success")]
    n = len(successful)
    if n == 0:
        return {"n_total": len(per_dossier), "n_success": 0}

    # DP: code coverage, validity, confidence distribution, downgrades.
    dp_has_code = sum(1 for d in successful if d["dp"]["has_code"])
    dp_valid = sum(1 for d in successful if d["dp"]["valid_code"])
    dp_conf = {"high": 0, "medium": 0, "low": 0}
    for d in successful:
        c = d["dp"]["confidence"]
        if c in dp_conf:
            dp_conf[c] += 1
    dp_downgraded = sum(1 for d in successful if d["dp"]["downgraded"])

    # DAS: totals and confidence counts summed over all dossiers.
    total_das = sum(d["das"]["total"] for d in successful)
    total_das_with_code = sum(d["das"]["with_code"] for d in successful)
    total_das_valid = sum(d["das"]["valid"] for d in successful)
    total_das_downgraded = sum(d["das"]["downgraded"] for d in successful)
    das_conf_agg = {"high": 0, "medium": 0, "low": 0}
    for d in successful:
        for k in das_conf_agg:
            das_conf_agg[k] += d["das"]["confidence"].get(k, 0)

    # Veto: verdict histogram + HARD issue totals.
    verdicts = {}
    total_hard = 0
    for d in successful:
        v = d["veto"]["verdict"]
        verdicts[v] = verdicts.get(v, 0) + 1
        total_hard += d["veto"]["hard_count"]

    # GHM
    ghm_estimated = sum(1 for d in successful if d["ghm"]["estimated"])

    # CPAM
    cpam_total = sum(d["cpam"]["controls_count"] for d in successful)
    cpam_with_response = sum(1 for d in successful if d["cpam"]["has_response"])

    # Timing: mean/median plus an approximate P90 by sorted index.
    times = [d["processing_time_s"] for d in successful]
    times_sorted = sorted(times)
    p90_idx = int(len(times_sorted) * 0.9)

    # Invalid codes, flattened across dossiers.
    all_invalid = []
    for d in successful:
        all_invalid.extend(d.get("invalid_codes", []))

    return {
        "n_total": len(per_dossier),
        "n_success": n,
        "n_failed": len(per_dossier) - n,
        "dp": {
            "has_code_rate": round(dp_has_code / n, 3),
            "valid_code_rate": round(dp_valid / n, 3),
            "confidence": dp_conf,
            "downgraded": dp_downgraded,
        },
        "das": {
            "total": total_das,
            "mean_per_dossier": round(total_das / n, 1),
            "with_code": total_das_with_code,
            "valid": total_das_valid,
            "validity_rate": round(total_das_valid / total_das_with_code, 3) if total_das_with_code else 0,
            "confidence": das_conf_agg,
            "confidence_high_rate": round(das_conf_agg["high"] / total_das_with_code, 3) if total_das_with_code else 0,
            "downgraded": total_das_downgraded,
            "downgrade_rate": round(total_das_downgraded / total_das_with_code, 3) if total_das_with_code else 0,
        },
        "veto": {
            "verdicts": verdicts,
            "hard_total": total_hard,
            "dossiers_with_hard": sum(1 for d in successful if d["veto"]["hard_count"] > 0),
        },
        "ghm": {
            "estimated_rate": round(ghm_estimated / n, 3),
        },
        "cpam": {
            "controls_total": cpam_total,
            "with_response": cpam_with_response,
        },
        "timing": {
            "mean_s": round(mean(times), 1),
            "median_s": round(median(times), 1),
            "p90_s": round(times_sorted[p90_idx], 1) if times_sorted else 0,
            "total_s": round(sum(times), 1),
        },
        "invalid_codes": all_invalid,
        "invalid_codes_count": len(all_invalid),
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rapport texte
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _pct(val: float) -> str:
|
||||
return f"{val * 100:.1f}%"
|
||||
|
||||
|
||||
def _bar(val: float, width: int = 20) -> str:
|
||||
filled = int(val * width)
|
||||
return "█" * filled + "░" * (width - filled)
|
||||
|
||||
|
||||
def generate_report(run_id: str, config: dict, agg: dict, per_dossier: list[dict]) -> str:
    """Build the human-readable text report for one benchmark run.

    Args:
        run_id: identifier of this run (used in the header).
        config: run configuration — expects 'timestamp' and a 'models' dict.
        agg: aggregate metrics from compute_aggregate().
        per_dossier: per-dossier metrics from analyze_dossier().

    Returns:
        The full report as a single newline-joined string.
    """
    lines = []
    w = 66  # report width (used for separators)

    # Header: run identity, models in use, success counts, total duration.
    lines.append("=" * w)
    lines.append(f" BENCHMARK QUALITÉ T2A — {run_id}")
    lines.append("=" * w)
    lines.append(f" Date : {config['timestamp']}")
    lines.append(f" Modèles : coding={config['models'].get('coding','?')} cpam={config['models'].get('cpam','?')}")
    lines.append(f" validation={config['models'].get('validation','?')} qc={config['models'].get('qc','?')}")
    lines.append(f" Dossiers : {agg['n_success']}/{agg['n_total']} traités ({agg.get('n_failed',0)} échecs)")
    lines.append(f" Durée : {agg['timing']['total_s']:.0f}s ({agg['timing']['mean_s']:.1f}s/dossier)")
    lines.append("-" * w)

    # DP section: coverage / validity as bar charts, confidence breakdown.
    dp = agg["dp"]
    lines.append("")
    lines.append(" DIAGNOSTIC PRINCIPAL (DP)")
    lines.append(f" Code obtenu : {_bar(dp['has_code_rate'])} {_pct(dp['has_code_rate'])}")
    lines.append(f" Code CIM-10 valide : {_bar(dp['valid_code_rate'])} {_pct(dp['valid_code_rate'])}")
    lines.append(f" Confiance high : {dp['confidence'].get('high',0)}/{agg['n_success']} "
                 f"medium: {dp['confidence'].get('medium',0)} low: {dp['confidence'].get('low',0)}")
    lines.append(f" Downgrades : {dp['downgraded']}")

    # DAS section.
    das = agg["das"]
    lines.append("")
    lines.append(" DIAGNOSTICS ASSOCIÉS (DAS)")
    lines.append(f" Total : {das['total']} (moy {das['mean_per_dossier']}/dossier)")
    lines.append(f" Avec code : {das['with_code']}/{das['total']}")
    lines.append(f" Codes valides : {_bar(das['validity_rate'])} {_pct(das['validity_rate'])}")
    lines.append(f" Confiance : high={das['confidence']['high']} "
                 f"medium={das['confidence']['medium']} low={das['confidence']['low']}")
    lines.append(f" Confiance high : {_bar(das['confidence_high_rate'])} {_pct(das['confidence_high_rate'])}")
    lines.append(f" Downgrades : {das['downgraded']} ({_pct(das['downgrade_rate'])})")

    # Veto section: verdict histogram sorted by descending count.
    veto = agg["veto"]
    lines.append("")
    lines.append(" VETOS / QUALITÉ")
    for v, count in sorted(veto["verdicts"].items(), key=lambda x: -x[1]):
        lines.append(f" {v:12s} : {count}")
    lines.append(f" Issues HARD : {veto['hard_total']} (dans {veto['dossiers_with_hard']} dossiers)")

    # GHM section.
    lines.append("")
    lines.append(" GHM")
    lines.append(f" Estimé : {_bar(agg['ghm']['estimated_rate'])} {_pct(agg['ghm']['estimated_rate'])}")

    # CPAM section — only shown when at least one control exists.
    if agg["cpam"]["controls_total"] > 0:
        lines.append("")
        lines.append(" CPAM")
        lines.append(f" Contrôles : {agg['cpam']['controls_total']}")
        lines.append(f" Avec réponse : {agg['cpam']['with_response']}")

    # Timing section.
    lines.append("")
    lines.append(" TEMPS DE TRAITEMENT")
    lines.append(f" Moyen : {agg['timing']['mean_s']:.1f}s")
    lines.append(f" Médian : {agg['timing']['median_s']:.1f}s")
    lines.append(f" P90 : {agg['timing']['p90_s']:.1f}s")
    lines.append(f" Total : {agg['timing']['total_s']:.0f}s")

    # Invalid codes (capped at 20 to keep the report readable).
    if agg["invalid_codes"]:
        lines.append("")
        lines.append(f" CODES CIM-10 INVALIDES ({agg['invalid_codes_count']})")
        for code in agg["invalid_codes"][:20]:
            lines.append(f" {code}")
        if agg["invalid_codes_count"] > 20:
            lines.append(f" ... et {agg['invalid_codes_count'] - 20} autres")

    # Per-dossier detail table, sorted by dossier ID.
    lines.append("")
    lines.append("-" * w)
    lines.append(" DÉTAIL PAR DOSSIER")
    lines.append("-" * w)
    lines.append(f" {'Dossier':<25s} {'DP':>6s} {'DAS':>4s} {'Valid%':>7s} {'Veto':>10s} {'Temps':>6s}")
    lines.append(f" {'-'*25:<25s} {'-'*6:>6s} {'-'*4:>4s} {'-'*7:>7s} {'-'*10:>10s} {'-'*6:>6s}")

    for d in sorted(per_dossier, key=lambda x: x["dossier_id"]):
        if not d.get("success"):
            lines.append(f" {d['dossier_id']:<25s} {'ÉCHEC':>6s}")
            continue
        dp_code = d["dp"]["code_final"] or "-"
        dp_mark = "✓" if d["dp"]["valid_code"] else "✗"
        n_das = d["das"]["total"]
        vr = f"{d['das']['validity_rate']*100:.0f}%" if d["das"]["with_code"] else "-"
        verdict = d["veto"]["verdict"]
        t = f"{d['processing_time_s']:.0f}s"
        lines.append(f" {d['dossier_id']:<25s} {dp_code:>5s}{dp_mark} {n_das:>4d} {vr:>7s} {verdict:>10s} {t:>6s}")

    lines.append("")
    lines.append("=" * w)
    return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Comparaison entre runs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def compare_runs(current_agg: dict, baseline_agg: dict, baseline_id: str) -> str:
    """Compare this run's aggregate metrics against a baseline run.

    Args:
        current_agg: aggregate from the current run.
        baseline_agg: aggregate loaded from the baseline run's metrics.json.
        baseline_id: run ID of the baseline (header only).

    Returns:
        A text diff table (Baseline / Actuel / Delta columns) plus the set
        of invalid codes that appeared or were fixed relative to baseline.
    """
    lines = []
    w = 66
    lines.append("")
    lines.append("=" * w)
    lines.append(f" COMPARAISON avec {baseline_id}")
    lines.append("=" * w)

    def _delta(cur: float, base: float, is_pct: bool = True) -> str:
        # Signed delta; percentage points when is_pct, raw units otherwise.
        d = cur - base
        sign = "+" if d >= 0 else ""
        if is_pct:
            return f"{sign}{d*100:.1f}%"
        return f"{sign}{d:.1f}"

    def _row(label: str, cur_val: float, base_val: float, is_pct: bool = True):
        # Append one formatted table row to the enclosing `lines` list.
        if is_pct:
            cur_s = _pct(cur_val)
            base_s = _pct(base_val)
        else:
            cur_s = f"{cur_val:.1f}"
            base_s = f"{base_val:.1f}"
        delta_s = _delta(cur_val, base_val, is_pct)
        lines.append(f" {label:<24s} {base_s:>10s} {cur_s:>10s} {delta_s:>10s}")

    lines.append(f" {'Métrique':<24s} {'Baseline':>10s} {'Actuel':>10s} {'Delta':>10s}")
    lines.append(f" {'-'*24:<24s} {'-'*10:>10s} {'-'*10:>10s} {'-'*10:>10s}")

    _row("DP code valide", current_agg["dp"]["valid_code_rate"], baseline_agg["dp"]["valid_code_rate"])
    _row("DAS validité", current_agg["das"]["validity_rate"], baseline_agg["das"]["validity_rate"])
    _row("DAS confiance high", current_agg["das"]["confidence_high_rate"], baseline_agg["das"]["confidence_high_rate"])
    _row("DAS downgrade", current_agg["das"]["downgrade_rate"], baseline_agg["das"]["downgrade_rate"])
    _row("GHM estimé", current_agg["ghm"]["estimated_rate"], baseline_agg["ghm"]["estimated_rate"])
    _row("DAS moy/dossier", current_agg["das"]["mean_per_dossier"], baseline_agg["das"]["mean_per_dossier"], is_pct=False)
    _row("Temps moyen (s)", current_agg["timing"]["mean_s"], baseline_agg["timing"]["mean_s"], is_pct=False)

    # Set difference of invalid codes: newly introduced vs. fixed since baseline.
    cur_inv = set(current_agg.get("invalid_codes", []))
    base_inv = set(baseline_agg.get("invalid_codes", []))
    new_inv = cur_inv - base_inv
    fixed_inv = base_inv - cur_inv
    if new_inv:
        lines.append(f"\n Nouveaux codes invalides : {', '.join(sorted(new_inv))}")
    if fixed_inv:
        lines.append(f" Codes corrigés : {', '.join(sorted(fixed_inv))}")

    lines.append("=" * w)
    return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_current_config() -> dict:
    """Return the current model configuration.

    Prefers the live values from src.config; falls back to environment
    variables when the project package cannot be imported (e.g. when the
    script runs outside the repo venv).

    Returns:
        dict with keys 'models' (role -> model name), 'ollama_model' and
        'ollama_url'. The schema is identical on both paths — previously
        the fallback branch omitted 'ollama_url', so consumers saw a
        different shape depending on import success.
    """
    try:
        from src.config import OLLAMA_MODELS, OLLAMA_MODEL, OLLAMA_URL
        return {
            "models": dict(OLLAMA_MODELS),
            "ollama_model": OLLAMA_MODEL,
            "ollama_url": OLLAMA_URL,
        }
    except ImportError:
        return {
            "models": {
                "coding": os.environ.get("T2A_MODEL_CODING", "?"),
                "cpam": os.environ.get("T2A_MODEL_CPAM", "?"),
                "validation": os.environ.get("T2A_MODEL_VALIDATION", "?"),
                "qc": os.environ.get("T2A_MODEL_QC", "?"),
            },
            "ollama_model": os.environ.get("OLLAMA_MODEL", "?"),
            # Keep the schema consistent with the import-success branch.
            "ollama_url": os.environ.get("OLLAMA_URL", "?"),
        }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: select dossiers, run the pipeline, aggregate,
    report, optionally compare to a baseline, and persist the run."""
    parser = argparse.ArgumentParser(description="Benchmark qualité T2A")
    parser.add_argument("--n", type=int, default=10, help="Nombre de dossiers")
    parser.add_argument("--dossiers", type=str, help="IDs séparés par des virgules")
    parser.add_argument("--gold-standard", action="store_true", help="Utiliser les 50 dossiers gold standard")
    parser.add_argument("--compare", type=str, help="Run ID à comparer")
    parser.add_argument("--label", type=str, default="", help="Label pour ce run")
    parser.add_argument("--no-reprocess", action="store_true", help="Analyser les outputs existants sans relancer le pipeline")
    parser.add_argument("--clean", action="store_true", help="Supprimer les outputs avant retraitement")
    parser.add_argument("--seed", type=int, default=42, help="Seed pour la sélection aléatoire")
    args = parser.parse_args()

    # Dossier selection (explicit list > gold standard > seeded sample).
    specific = args.dossiers.split(",") if args.dossiers else None
    dossiers = select_dossiers(args.n, args.gold_standard, specific, args.seed)
    print(f"\n Dossiers sélectionnés : {len(dossiers)}")
    for d in dossiers:
        print(f" - {d}")

    # Run configuration: timestamped run ID, optional label suffix,
    # full argument echo for later reproducibility.
    config = get_current_config()
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    if args.label:
        run_id = f"{run_id}_{args.label}"
    config["timestamp"] = datetime.now().isoformat()
    config["run_id"] = run_id
    config["dossiers"] = dossiers
    config["args"] = {
        "n": args.n,
        "gold_standard": args.gold_standard,
        "clean": args.clean,
        "no_reprocess": args.no_reprocess,
        "seed": args.seed,
        "label": args.label,
    }

    print(f"\n Run ID : {run_id}")
    print(f" Modèles : {config['models']}")
    print(f" Reprocess: {'NON' if args.no_reprocess else 'OUI (clean=' + str(args.clean) + ')'}")
    print()

    # Load the CIM-10 dictionary used for code-validity checks.
    cim10 = load_cim10_dict()
    print(f" Dictionnaire CIM-10 : {len(cim10)} codes")
    print()

    # Process each dossier: either re-run the pipeline or analyze the
    # existing outputs (--no-reprocess, duration reported as 0).
    per_dossier = []
    for i, dossier_id in enumerate(dossiers, 1):
        print(f" [{i}/{len(dossiers)}] {dossier_id}", end="", flush=True)

        if args.no_reprocess:
            duration = 0.0
            success = find_merged_json(dossier_id) is not None
            if not success:
                print(" — pas de JSON")
            else:
                print(" — analyse existant")
        else:
            print(" — traitement...", end="", flush=True)
            duration, success = run_pipeline(dossier_id, args.clean)
            print(f" {duration:.1f}s {'✓' if success else '✗'}")

        metrics = analyze_dossier(dossier_id, cim10, duration)
        per_dossier.append(metrics)

    # Aggregate all per-dossier metrics.
    agg = compute_aggregate(per_dossier)

    # Human-readable report on stdout.
    report = generate_report(run_id, config, agg, per_dossier)
    print(report)

    # Optional A/B comparison against a previous run's saved metrics.
    comparison = ""
    if args.compare:
        baseline_path = BENCHMARKS_DIR / args.compare / "metrics.json"
        if baseline_path.exists():
            baseline = json.loads(baseline_path.read_text("utf-8"))
            comparison = compare_runs(agg, baseline["aggregate"], args.compare)
            print(comparison)
        else:
            print(f"\n WARN: run baseline {args.compare} introuvable ({baseline_path})")

    # Persist config, metrics and report under output/benchmarks/<run_id>/.
    run_dir = BENCHMARKS_DIR / run_id
    run_dir.mkdir(parents=True, exist_ok=True)

    (run_dir / "config.json").write_text(
        json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    (run_dir / "metrics.json").write_text(
        json.dumps({"aggregate": agg, "per_dossier": per_dossier}, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    (run_dir / "report.txt").write_text(report + comparison, encoding="utf-8")

    print(f"\n Résultats sauvegardés dans : {run_dir}")
    print(f" Pour comparer un futur run : python scripts/benchmark_quality.py --compare {run_id}")
||||
|
||||
|
||||
if __name__ == "__main__":
    main()  # CLI entry point
|
||||
515
src/config.py
515
src/config.py
@@ -3,8 +3,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import contextvars
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Optional, Any, Dict
|
||||
|
||||
import yaml
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
@@ -20,8 +24,17 @@ OUTPUT_DIR = BASE_DIR / "output"
|
||||
ANONYMIZED_DIR = OUTPUT_DIR / "anonymized"
|
||||
STRUCTURED_DIR = OUTPUT_DIR / "structured"
|
||||
REPORTS_DIR = OUTPUT_DIR / "reports"
|
||||
CONFIG_DIR = BASE_DIR / "config"
|
||||
REFERENCE_RANGES_PATH = CONFIG_DIR / "reference_ranges.yaml"
|
||||
BIO_RULES_PATH = CONFIG_DIR / "bio_rules.yaml"
|
||||
LAB_SANITY_PATH = CONFIG_DIR / "lab_value_sanity.yaml"
|
||||
RULES_DIR = CONFIG_DIR / "rules"
|
||||
RULES_BASE_PATH = RULES_DIR / "base.yaml"
|
||||
RULES_ENABLED_PATH = RULES_DIR / "enabled.yaml"
|
||||
RULES_ROUTER_PATH = RULES_DIR / "router.yaml"
|
||||
|
||||
for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
|
||||
|
||||
for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR, CONFIG_DIR, RULES_DIR):
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@@ -40,6 +53,20 @@ OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120"))
|
||||
OLLAMA_CACHE_PATH = BASE_DIR / "data" / "ollama_cache.json"
|
||||
OLLAMA_MAX_PARALLEL = int(os.environ.get("OLLAMA_MAX_PARALLEL", "2"))
|
||||
|
||||
# --- Modèles par rôle LLM ---
|
||||
|
||||
# Model assigned to each LLM role; every role can be overridden
# independently through its T2A_MODEL_* environment variable.
OLLAMA_MODELS: dict[str, str] = {
    "coding": os.environ.get("T2A_MODEL_CODING", "gemma3:27b-cloud"),
    "cpam": os.environ.get("T2A_MODEL_CPAM", "gemma3:27b-cloud"),
    "validation": os.environ.get("T2A_MODEL_VALIDATION", "deepseek-v3.2:cloud"),
    "qc": os.environ.get("T2A_MODEL_QC", "gemma3:12b"),
}
|
||||
|
||||
|
||||
def get_model(role: str) -> str:
    """Return the model configured for an LLM role, else the global default."""
    try:
        return OLLAMA_MODELS[role]
    except KeyError:
        return OLLAMA_MODEL
|
||||
|
||||
|
||||
# --- Configuration RUM / établissement ---
|
||||
|
||||
@@ -69,6 +96,418 @@ EMBEDDING_MODEL = os.environ.get("T2A_EMBEDDING_MODEL", "dangvantuan/sentence-ca
|
||||
|
||||
RERANKER_MODEL = os.environ.get("T2A_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
|
||||
|
||||
# --- Références biologiques (fallback) ---
|
||||
|
||||
@lru_cache(maxsize=1)
def load_reference_ranges() -> Dict[str, Any]:
    """Load biological reference intervals from config/reference_ranges.yaml.

    Recommended hierarchy of use inside rules:
      1) Norms present in the document itself (e.g. [N: 135-145])
      2) The YAML table (per age band)
      3) Conservative "safe zones" when the patient's age is unknown

    The YAML file is deliberately editable by non-developers (future UI).
    Cached for the process lifetime via lru_cache.
    """
    # Minimal adult defaults used whenever the YAML is absent or unreadable.
    defaults: Dict[str, Any] = {
        "version": 1,
        "age_bands": {"adult_min_years": 18},
        "fallback_ranges": {
            "adult": {
                "platelets": {"low": 150, "high": 450, "unit": "G/L"},
                "sodium": {"low": 135, "high": 145, "unit": "mmol/L"},
                "potassium": {"low": 3.5, "high": 5.0, "unit": "mmol/L"},
            },
            # Pediatric values: to be refined (per age band) if needed.
            # For "ruled_out" rules, prefer safe_zones_unknown_age instead.
            "child": {
                "platelets": {"low": 150, "high": 450, "unit": "G/L"},
                "sodium": {"low": 135, "high": 145, "unit": "mmol/L"},
                "potassium": {"low": 3.5, "high": 5.0, "unit": "mmol/L"},
            },
        },
        "safe_zones_unknown_age": {
            "platelets_ruled_out_low": 170,
            "sodium_ruled_out_low": 138,
            "potassium_ruled_out_high": 4.9,
            "potassium_ruled_out_low": 3.7,
        },
    }

    path = REFERENCE_RANGES_PATH
    if not path.exists():
        return defaults

    try:
        import yaml  # type: ignore
    except Exception:
        # PyYAML missing: keep the built-in defaults.
        return defaults

    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
        if not isinstance(data, dict):
            return defaults
        # Shallow merge: defaults as the base, top-level YAML keys override.
        merged = dict(defaults)
        for k, v in data.items():
            merged[k] = v
        return merged
    except Exception:
        # Any parse/read failure falls back to defaults (best-effort config).
        return defaults
|
||||
|
||||
|
||||
# --- Règles biologiques (pilotées par YAML) ---
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_bio_rules() -> Dict[str, Any]:
    """Load the biology rules from config/bio_rules.yaml.

    Goal: allow enabling/disabling and tuning the
    "bio contradiction => ruled_out" style rules without touching code.

    The file is deliberately kept simple (future UI). Built-in defaults
    are returned whenever the file is absent, PyYAML is not installed,
    or the YAML content is invalid.
    """
    fallback: Dict[str, Any] = {
        "version": 1,
        "rules": {
            "hyponatremia": {"enabled": True, "codes": ["E87.1"], "analyte": "sodium"},
            "hyperkalemia": {"enabled": True, "codes": ["E87.5"], "analyte": "potassium"},
            "hypokalemia": {"enabled": True, "codes": ["E87.6"], "analyte": "potassium"},
        },
    }

    if not BIO_RULES_PATH.exists():
        return fallback

    try:
        import yaml  # type: ignore
    except Exception:
        # PyYAML not installed: keep the built-in defaults.
        return fallback

    try:
        loaded = yaml.safe_load(BIO_RULES_PATH.read_text(encoding="utf-8")) or {}
    except Exception:
        return fallback
    if not isinstance(loaded, dict):
        return fallback
    # Shallow merge: defaults as the base, YAML overrides top-level keys.
    return {**fallback, **loaded}
|
||||
|
||||
|
||||
# --- Garde-fous de parsing des valeurs biologiques (anti-OCR) ---
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_lab_value_sanity() -> Dict[str, Any]:
    """Load parsing guard-rails from config/lab_value_sanity.yaml.

    Purpose:
    - prevent PDF/OCR reading artefacts (e.g. "8" instead of "4.8")
      from triggering false diagnoses (hyperkalemia, etc.)
    - keep an *auditable* trace (suspect / discarded values)

    The file is deliberately user-editable (future UI). Built-in defaults
    are returned whenever the file is absent, PyYAML is not installed,
    or the YAML content is invalid.
    """
    fallback: Dict[str, Any] = {
        "version": 1,
        "policy": {
            # True: values outside the plausible bounds are dropped from the record.
            # False: they are kept with quality="discarded".
            "drop_out_of_range": True,
            # True: suspect values are kept (quality="suspect") for audit,
            # but quality rules prefer "ok" values when they exist.
            "keep_suspect": True,
        },
        # Normalized keys (lowercase, accent-free): potassium, sodium, plaquettes...
        "tests": {
            "potassium": {
                # Very wide bounds (mmol/L): only used to discard the impossible.
                "hard_min": 0.5,
                "hard_max": 9.0,
                # Anti-OCR heuristic: a lone digit >= 6 is often a lost decimal (4,8 -> 8).
                "suspect": {"single_digit_over": 6.0},
            },
            "sodium": {"hard_min": 90.0, "hard_max": 200.0},
            "plaquettes": {"hard_min": 5.0, "hard_max": 2000.0},
            "hemoglobine": {"hard_min": 3.0, "hard_max": 25.0},
            "creatinine": {"hard_min": 1.0, "hard_max": 5000.0},
            "crp": {"hard_min": 0.0, "hard_max": 1000.0},
            "alat": {"hard_min": 0.0, "hard_max": 5000.0},
            "asat": {"hard_min": 0.0, "hard_max": 5000.0},
            "ggt": {"hard_min": 0.0, "hard_max": 5000.0},
            "pal": {"hard_min": 0.0, "hard_max": 5000.0},
            "bilirubine totale": {"hard_min": 0.0, "hard_max": 2000.0},
        },
    }

    if not LAB_SANITY_PATH.exists():
        return fallback

    try:
        import yaml  # type: ignore
    except Exception:
        return fallback

    try:
        loaded = yaml.safe_load(LAB_SANITY_PATH.read_text(encoding="utf-8")) or {}
    except Exception:
        return fallback
    if not isinstance(loaded, dict):
        return fallback
    # Shallow merge: defaults as the base, YAML overrides top-level keys.
    return {**fallback, **loaded}
|
||||
|
||||
|
||||
# --- Catalogue de règles (vetos + décisions), piloté par YAML ---
|
||||
|
||||
|
||||
def _flatten_rules_yaml(data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
|
||||
"""Transforme un YAML de règles en dict {rule_id: cfg}.
|
||||
|
||||
Formats supportés :
|
||||
- {packs: {pack_name: {enabled: bool, rules: {RULE_ID: {...}}}}}
|
||||
- {rules: {RULE_ID: {...}}} (overlay simple)
|
||||
"""
|
||||
|
||||
out: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
# Overlay simple
|
||||
rules_block = data.get("rules")
|
||||
if isinstance(rules_block, dict):
|
||||
for rid, cfg in rules_block.items():
|
||||
if not isinstance(cfg, dict):
|
||||
cfg = {}
|
||||
out[str(rid)] = dict(cfg)
|
||||
|
||||
packs = data.get("packs")
|
||||
if isinstance(packs, dict):
|
||||
for pack_name, pack_cfg in packs.items():
|
||||
if not isinstance(pack_cfg, dict):
|
||||
continue
|
||||
pack_enabled = bool(pack_cfg.get("enabled", True))
|
||||
rules = pack_cfg.get("rules")
|
||||
if not isinstance(rules, dict):
|
||||
continue
|
||||
for rid, cfg in rules.items():
|
||||
if not isinstance(cfg, dict):
|
||||
cfg = {}
|
||||
merged = dict(cfg)
|
||||
merged.setdefault("pack", str(pack_name))
|
||||
# La désactivation du pack désactive ses règles
|
||||
merged["enabled"] = bool(merged.get("enabled", True)) and pack_enabled
|
||||
out[str(rid)] = merged
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _merge_rule_catalog(base: Dict[str, Dict[str, Any]], overlay: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
|
||||
"""Merge overlay → base (par règle)."""
|
||||
merged = {k: dict(v) for k, v in base.items()}
|
||||
for rid, cfg in overlay.items():
|
||||
if rid not in merged:
|
||||
merged[rid] = dict(cfg)
|
||||
else:
|
||||
# override champ par champ
|
||||
for k, v in cfg.items():
|
||||
merged[rid][k] = v
|
||||
return merged
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_rules_catalog() -> Dict[str, Dict[str, Any]]:
    """Load the rules catalog from config/rules/*.yaml.

    - base.yaml : shared base (vetos + decisions)
    - enabled.yaml : selection of overlays (site/specialty)
    - specialties/<name>.yaml and sites/<name>.yaml : targeted overrides

    Policy: a rule that is not listed is considered "enabled"
    (=> does not break the historical behaviour).

    Returns {} when PyYAML is unavailable. Any unreadable/invalid file is
    treated as empty rather than aborting the load.
    """

    try:
        import yaml  # type: ignore
    except Exception:
        return {}

    def _read_rules_file(p: Path) -> Dict[str, Dict[str, Any]]:
        # Read one YAML rules file and flatten it; {} on any error so a
        # broken overlay never breaks the whole catalog.
        try:
            data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
        except Exception:
            return {}
        if not isinstance(data, dict):
            return {}
        return _flatten_rules_yaml(data)

    # 1) shared base
    catalog: Dict[str, Dict[str, Any]] = {}
    if RULES_BASE_PATH.exists():
        catalog = _read_rules_file(RULES_BASE_PATH)

    # 2) active overlays (from enabled.yaml, else env fallback)
    active_site = ""
    active_specialty = ""
    extra_files: list[str] = []
    if RULES_ENABLED_PATH.exists():
        try:
            enabled_data = yaml.safe_load(RULES_ENABLED_PATH.read_text(encoding="utf-8")) or {}
            if isinstance(enabled_data, dict):
                active = enabled_data.get("active") or {}
                if isinstance(active, dict):
                    active_site = str(active.get("site") or "").strip()
                    active_specialty = str(active.get("specialty") or "").strip()
                    extra = active.get("extra")
                    if isinstance(extra, list):
                        extra_files = [str(x) for x in extra if str(x).strip()]
        except Exception:
            pass
    else:
        # Env fallback when enabled.yaml is absent.
        active_site = os.environ.get("T2A_SITE", "").strip()
        active_specialty = os.environ.get("T2A_SPECIALTY", "").strip()

    # 3-5) overlays in priority order: specialty, then site, then extras.
    overlay_paths: list[Path] = []
    if active_specialty:
        overlay_paths.append(RULES_DIR / "specialties" / f"{active_specialty}.yaml")
    if active_site:
        overlay_paths.append(RULES_DIR / "sites" / f"{active_site}.yaml")
    overlay_paths.extend(RULES_DIR / rel for rel in extra_files)

    for p in overlay_paths:
        if p.exists():
            catalog = _merge_rule_catalog(catalog, _read_rules_file(p))

    return catalog
|
||||
|
||||
|
||||
# --- Routage dynamique des règles (packs) ---
|
||||
|
||||
# Contexte runtime, défini *par dossier* (contextvars => safe pour batch / multi-thread)
|
||||
_RULES_RUNTIME_CTX: contextvars.ContextVar[dict | None] = contextvars.ContextVar("t2a_rules_runtime", default=None)
|
||||
|
||||
def set_rules_runtime(ctx: dict) -> contextvars.Token:
|
||||
"""Active un contexte de règles pour le dossier courant."""
|
||||
return _RULES_RUNTIME_CTX.set(ctx)
|
||||
|
||||
def reset_rules_runtime(token: contextvars.Token) -> None:
|
||||
"""Restaure le contexte précédent."""
|
||||
_RULES_RUNTIME_CTX.reset(token)
|
||||
|
||||
def get_rules_runtime() -> dict | None:
|
||||
return _RULES_RUNTIME_CTX.get()
|
||||
|
||||
@lru_cache(maxsize=1)
def load_rules_router() -> Dict[str, Any]:
    """Load the routing config (config/rules/router.yaml).

    - mode: 'strict' => a rule not listed in base.yaml is considered disabled
      when runtime routing is active (goal: avoid surprises).
    - defaults.enabled_packs: packs active by default on every record.
    - triggers: simple conditions that activate additional packs.

    Returns the built-in defaults when the file or PyYAML is missing, or
    when the YAML content is invalid.
    """
    defaults: Dict[str, Any] = {
        "version": 1,
        "mode": "strict",
        "defaults": {"enabled_packs": ["vetos_core", "decisions_core"]},
        "triggers": [],
    }
    path = RULES_ROUTER_PATH
    if not path.exists():
        return defaults
    # Consistent with the other YAML loaders: PyYAML is optional, so guard
    # the import locally instead of relying on a module-level one (a missing
    # PyYAML would otherwise surface as a NameError swallowed by the broad
    # except below).
    try:
        import yaml  # type: ignore
    except Exception:
        return defaults
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
        # Conservative merge: top-level keys only, None values are ignored.
        if isinstance(data, dict):
            defaults.update({k: v for k, v in data.items() if v is not None})
        return defaults
    except Exception:
        return defaults
|
||||
|
||||
|
||||
def rule_enabled(rule_id: str) -> bool:
    """Return True when the rule is active.

    Legacy mode (no runtime routing): an unknown rule => True
    (historical behaviour).

    Routed mode (runtime context active):
    - the catalog 'enabled' flag (base.yaml / overlays) is still honoured
    - rules whose pack is not in enabled_packs are automatically disabled
    - in 'strict' mode an unknown rule => False (avoids surprises in prod)
    """
    cfg = load_rules_catalog().get(rule_id)
    runtime = get_rules_runtime()

    # Legacy behaviour when no per-record routing is active.
    if runtime is None:
        return bool(cfg.get("enabled", True)) if cfg else True

    # Explicit per-record overrides take precedence over everything.
    if rule_id in set(runtime.get("force_disable_rules") or []):
        return False
    if rule_id in set(runtime.get("force_enable_rules") or []):
        return True

    # Unknown rules: strict => off, anything else => on.
    if cfg is None:
        mode = str(runtime.get("mode") or "strict").lower()
        return mode != "strict"

    # The catalog flag still applies (an admin can switch a rule off).
    if not bool(cfg.get("enabled", True)):
        return False

    # A rule tied to a pack is only active when its pack is routed in,
    # unless it is explicitly marked always-on.
    pack = cfg.get("pack")
    if pack:
        enabled_packs = set(runtime.get("enabled_packs") or [])
        always_on = set(runtime.get("always_on_rules") or [])
        if pack not in enabled_packs and rule_id not in always_on:
            return False

    return True
|
||||
|
||||
|
||||
def rule_force_severity(rule_id: str) -> str | None:
    """Optional severity override for a veto (HARD/MEDIUM/LOW), from the catalog."""
    severity = (load_rules_catalog().get(rule_id) or {}).get("force_severity")
    return str(severity) if severity else None
|
||||
|
||||
|
||||
|
||||
# --- Modèles de données CIM-10 ---
|
||||
|
||||
|
||||
@@ -98,10 +537,34 @@ class PreuveClinique(BaseModel):
|
||||
interpretation: str # "syndrome inflammatoire majeur"
|
||||
|
||||
|
||||
class CodeDecision(BaseModel):
|
||||
"""Décision finale sur un code (audit-friendly).
|
||||
|
||||
- action=KEEP: on garde la suggestion
|
||||
- action=DOWNGRADE: on remplace par un code moins spécifique (ex: D50→D64.9)
|
||||
- action=REMOVE: on retire le code (ou on le laisse vide)
|
||||
"""
|
||||
|
||||
action: str = "KEEP" # KEEP | DOWNGRADE | REMOVE
|
||||
final_code: Optional[str] = None
|
||||
downgraded_from: Optional[str] = None
|
||||
reason: Optional[str] = None
|
||||
needs_info: list[str] = Field(default_factory=list)
|
||||
applied_rules: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class Diagnostic(BaseModel):
|
||||
texte: str
|
||||
cim10_suggestion: Optional[str] = None
|
||||
cim10_confidence: Optional[str] = None
|
||||
# Statut clinique / qualité (pour affichage "barré" et exclusion métriques)
|
||||
# - confirmed/probable/uncertain: actifs
|
||||
# - ruled_out: visible mais barré (n'entre pas dans les métriques/GHM)
|
||||
status: Optional[str] = None
|
||||
ruled_out_reason: Optional[str] = None
|
||||
# Sortie finale (post-traitement qualité)
|
||||
cim10_final: Optional[str] = None
|
||||
cim10_decision: Optional[CodeDecision] = None
|
||||
justification: Optional[str] = None
|
||||
raisonnement: Optional[str] = None
|
||||
sources_rag: list[RAGSource] = Field(default_factory=list)
|
||||
@@ -115,6 +578,24 @@ class Diagnostic(BaseModel):
|
||||
source_excerpt: Optional[str] = None # extrait du texte source (~200 chars)
|
||||
|
||||
|
||||
class DossierMetrics(BaseModel):
|
||||
"""Métriques de qualité / reporting (audit-friendly).
|
||||
|
||||
Objectif : distinguer les éléments *actifs* (qui comptent pour le codage / GHM)
|
||||
de ceux écartés par les règles qualité (vetos / décisions).
|
||||
"""
|
||||
|
||||
das_total: int = 0
|
||||
das_active: int = 0
|
||||
das_excluded: int = 0 # total - active
|
||||
das_removed: int = 0 # décision REMOVE (future: ruled_out)
|
||||
das_ruled_out: int = 0 # visible mais barré (action RULED_OUT)
|
||||
das_no_code: int = 0 # pas de code suggestion/final
|
||||
actes_total: int = 0
|
||||
actes_with_code: int = 0
|
||||
dp_has_code: bool = False
|
||||
|
||||
|
||||
class ActeCCAM(BaseModel):
|
||||
texte: str
|
||||
code_ccam_suggestion: Optional[str] = None
|
||||
@@ -140,7 +621,12 @@ class Traitement(BaseModel):
|
||||
class BiologieCle(BaseModel):
|
||||
test: str
|
||||
valeur: Optional[str] = None
|
||||
# Valeur numérique parsée (si possible). Sert aux règles qualité.
|
||||
valeur_num: Optional[float] = None
|
||||
anomalie: Optional[bool] = None
|
||||
# Qualité de parsing: ok | suspect | discarded
|
||||
quality: Optional[str] = None
|
||||
discard_reason: Optional[str] = None
|
||||
source_page: Optional[int] = None
|
||||
source_excerpt: Optional[str] = None
|
||||
|
||||
@@ -175,13 +661,18 @@ class DossierMedical(BaseModel):
|
||||
antecedents: list[Antecedent] = Field(default_factory=list)
|
||||
traitements_sortie: list[Traitement] = Field(default_factory=list)
|
||||
biologie_cle: list[BiologieCle] = Field(default_factory=list)
|
||||
# Valeurs biologiques écartées (artefacts PDF/OCR) pour audit
|
||||
biologie_discarded: list[dict] = Field(default_factory=list)
|
||||
imagerie: list[Imagerie] = Field(default_factory=list)
|
||||
complications: list[Complication] = Field(default_factory=list)
|
||||
alertes_codage: list[str] = Field(default_factory=list)
|
||||
source_files: list[str] = Field(default_factory=list)
|
||||
ghm_estimation: Optional[GHMEstimation] = None
|
||||
controles_cpam: list[ControleCPAM] = Field(default_factory=list)
|
||||
veto_report: Optional["VetoReport"] = None
|
||||
processing_time_s: float | None = None
|
||||
metrics: Optional[DossierMetrics] = None
|
||||
rules_runtime: Optional[dict] = None
|
||||
|
||||
@field_validator("antecedents", mode="before")
|
||||
@classmethod
|
||||
@@ -240,6 +731,26 @@ class ControleCPAM(BaseModel):
|
||||
sources_reponse: list[RAGSource] = Field(default_factory=list)
|
||||
|
||||
|
||||
# --- Qualité / Vetos (contestabilité) ---
|
||||
|
||||
|
||||
class VetoIssue(BaseModel):
|
||||
"""Un problème détecté lors du contrôle de contestabilité."""
|
||||
|
||||
veto: str
|
||||
severity: str # HARD | MEDIUM | LOW
|
||||
where: str
|
||||
message: str
|
||||
|
||||
|
||||
class VetoReport(BaseModel):
|
||||
"""Rapport global de vetos pour un dossier."""
|
||||
|
||||
verdict: str # PASS | NEED_INFO | FAIL
|
||||
score_contestabilite: int = 100 # 0-100
|
||||
issues: list[VetoIssue] = Field(default_factory=list)
|
||||
|
||||
|
||||
class AnonymizationReport(BaseModel):
|
||||
source_file: str
|
||||
total_replacements: int = 0
|
||||
|
||||
@@ -9,6 +9,7 @@ from ..config import ControleCPAM, DossierMedical, RAGSource
|
||||
from ..medical.cim10_dict import normalize_code, validate_code
|
||||
from ..medical.cim10_extractor import BIO_NORMALS
|
||||
from ..medical.ollama_client import call_anthropic, call_ollama
|
||||
from ..prompts import CPAM_EXTRACTION, CPAM_ARGUMENTATION, CPAM_ADVERSARIAL
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -602,88 +603,18 @@ def _build_cpam_prompt(
|
||||
+ "\n".join(ext_lines)
|
||||
)
|
||||
|
||||
prompt = f"""Tu es un médecin DIM (Département d'Information Médicale) expert en contentieux T2A.
|
||||
Tu dois produire une analyse ÉQUILIBRÉE ET CRÉDIBLE de la contestation CPAM, puis contre-argumenter en mobilisant trois axes : médical, asymétrie d'information, et réglementaire.
|
||||
|
||||
IMPORTANT — CRÉDIBILITÉ DE L'ANALYSE :
|
||||
Une contre-argumentation crédible reconnaît TOUJOURS au moins un point valide dans le raisonnement adverse.
|
||||
Répondre "Aucun point d'accord" décrédibilise l'ensemble de l'argumentation. Tu DOIS identifier au moins un élément où la CPAM a un point légitime (même partiel), puis expliquer pourquoi cela ne suffit pas à invalider le codage.
|
||||
|
||||
IMPORTANT — CODES CIM-10 :
|
||||
Ne parle JAMAIS de « codage initial » ou « codage contesté » sans citer explicitement le code CIM-10 et son libellé (ex: Z45.80 — Ajustement et entretien d'un dispositif implantable).
|
||||
Chaque argument doit désigner précisément quel code est défendu ou contesté, avec son libellé complet.
|
||||
|
||||
DOSSIER MÉDICAL DE L'ÉTABLISSEMENT :
|
||||
{dossier_str}
|
||||
{asymetrie_str}
|
||||
{tagged_str}
|
||||
|
||||
OBJET DU DÉSACCORD : {controle.titre}
|
||||
|
||||
ARGUMENTATION DE LA CPAM (UCR) :
|
||||
{controle.arg_ucr}
|
||||
|
||||
DÉCISION UCR : {controle.decision_ucr}
|
||||
|
||||
CODES CONTESTÉS :
|
||||
{codes_str}
|
||||
{definitions_str}
|
||||
|
||||
SOURCES RÉGLEMENTAIRES (Guide méthodologique, CIM-10) :
|
||||
{sources_text}
|
||||
{extraction_str}
|
||||
|
||||
CONSIGNES :
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
- Prends en compte l'ÂGE du patient (pédiatrie < 18 ans, personne âgée >= 80 ans), le MODE D'ENTRÉE (urgence vs programmé), et la DURÉE DE SÉJOUR pour contextualiser ton analyse
|
||||
- En pédiatrie, les normes biologiques et les codages peuvent différer de l'adulte
|
||||
- Une admission en urgence implique un contexte clinique aigu qui influence le choix du DP
|
||||
|
||||
ÉTAPE 1 — ANALYSE HONNÊTE (avant de contre-argumenter) :
|
||||
- Identifie ce que la CPAM a compris correctement dans le dossier
|
||||
- Reconnais les points où leur raisonnement est fondé, même partiellement
|
||||
- Explique ENSUITE pourquoi ces points ne justifient pas leur conclusion
|
||||
|
||||
AXE MÉDICAL :
|
||||
- Analyse le bien-fondé médical du codage de l'établissement
|
||||
- CITE les éléments cliniques EXACTS du dossier en utilisant les tags [XX-N] fournis (ex: [BIO-1] CRP 180 mg/L)
|
||||
- Confronte l'argumentation CPAM aux sources CIM-10 et Guide Méthodologique fournies
|
||||
- Ne mentionne AUCUN élément qui ne figure pas dans les éléments référencés ci-dessus
|
||||
|
||||
AXE ASYMÉTRIE D'INFORMATION :
|
||||
- La CPAM a fondé son analyse uniquement sur le CRH et les codes transmis
|
||||
- Pour CHAQUE élément clinique pertinent, cite les VALEURS EXACTES et explique leur signification clinique
|
||||
- Démontre en quoi ces éléments complémentaires (biologie, imagerie, traitements, actes) justifient le codage contesté
|
||||
- Ne mentionne AUCUN élément qui n'est pas dans le dossier fourni
|
||||
|
||||
MISE EN FORME :
|
||||
- Structure chaque section avec des tirets pour lister les arguments distincts
|
||||
- Un argument par puce, avec la preuve ou la référence associée
|
||||
|
||||
AXE RÉGLEMENTAIRE :
|
||||
- Identifie si l'UCR fait une interprétation restrictive non fondée d'une règle
|
||||
- Confronte le raisonnement CPAM au texte EXACT des sources fournies
|
||||
- Format OBLIGATOIRE pour chaque référence : [Document - page N] suivi d'une CITATION VERBATIM du passage pertinent
|
||||
- INTERDICTION ABSOLUE de citer une référence qui ne figure pas dans les sources fournies ci-dessus
|
||||
- Si aucune source pertinente n'est disponible → écrire explicitement "Pas de source réglementaire disponible"
|
||||
- Relève les contradictions entre l'argumentation CPAM et les règles officielles
|
||||
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant :
|
||||
{{
|
||||
"analyse_contestation": "Résumé de ce que conteste la CPAM et sur quelle base",
|
||||
"points_accord": "Points CONCRETS où la CPAM a raison ou partiellement raison (JAMAIS 'Aucun' — il y a toujours au moins un point légitime à reconnaître)",
|
||||
"contre_arguments_medicaux": "Argumentation médicale en faveur du codage, en expliquant pourquoi les points d'accord ne suffisent pas à invalider le codage",
|
||||
"preuves_dossier": [
|
||||
{{"ref": "BIO-1", "element": "biologie|imagerie|traitement|acte|clinique", "valeur": "valeur exacte du dossier", "signification": "explication clinique"}}
|
||||
],
|
||||
"contre_arguments_asymetrie": "Éléments cliniques que la CPAM n'avait pas et qui justifient le codage",
|
||||
"contre_arguments_reglementaires": "Erreurs d'interprétation réglementaire de la CPAM, avec citations verbatim des sources",
|
||||
"references": [
|
||||
{{"document": "nom du document source", "page": "numéro de page", "citation": "citation verbatim du passage"}}
|
||||
],
|
||||
"conclusion": "Synthèse en citant EXPLICITEMENT les codes CIM-10 défendus (ex: DP Z45.80 — libellé) : points reconnus à la CPAM, puis pourquoi ce codage précis est néanmoins justifié"
|
||||
}}"""
|
||||
prompt = CPAM_ARGUMENTATION.format(
|
||||
dossier_str=dossier_str,
|
||||
asymetrie_str=asymetrie_str,
|
||||
tagged_str=tagged_str,
|
||||
titre=controle.titre,
|
||||
arg_ucr=controle.arg_ucr,
|
||||
decision_ucr=controle.decision_ucr,
|
||||
codes_str=codes_str,
|
||||
definitions_str=definitions_str,
|
||||
sources_text=sources_text,
|
||||
extraction_str=extraction_str,
|
||||
)
|
||||
return prompt, tag_map
|
||||
|
||||
|
||||
@@ -845,35 +776,19 @@ def _validate_adversarial(
|
||||
normes_lines.append(f" {test}: {lo}-{hi}")
|
||||
normes_section = "NORMES BIOLOGIQUES DE RÉFÉRENCE :\n" + "\n".join(normes_lines)
|
||||
|
||||
prompt = f"""Tu es un relecteur critique. Vérifie la cohérence de cette contre-argumentation CPAM.
|
||||
dp_ucr_line = f"DP UCR : {controle.dp_ucr}" if controle.dp_ucr else ""
|
||||
da_ucr_line = f"DA UCR : {controle.da_ucr}" if controle.da_ucr else ""
|
||||
|
||||
RÉPONSE GÉNÉRÉE :
|
||||
{response_json}
|
||||
|
||||
{factual_section}
|
||||
|
||||
{normes_section}
|
||||
|
||||
CODES CONTESTÉS :
|
||||
{f"DP UCR : {controle.dp_ucr}" if controle.dp_ucr else ""}
|
||||
{f"DA UCR : {controle.da_ucr}" if controle.da_ucr else ""}
|
||||
|
||||
Vérifie STRICTEMENT :
|
||||
1. Chaque valeur bio/imagerie/traitement citée dans les preuves existe dans les éléments factuels
|
||||
2. Si une valeur bio est qualifiée de "élevée", "basse" ou "anormale", vérifie qu'elle est RÉELLEMENT hors normes selon les normes ci-dessus (ex: CRP 5 = NORMAL, pas élevé)
|
||||
3. La conclusion est cohérente avec l'argumentation développée
|
||||
4. Les points d'accord ne contredisent pas les contre-arguments
|
||||
5. Les codes CIM-10 mentionnés dans la conclusion sont cohérents avec le reste
|
||||
|
||||
Réponds UNIQUEMENT en JSON :
|
||||
{{
|
||||
"coherent": true ou false,
|
||||
"erreurs": ["description précise de chaque incohérence trouvée"],
|
||||
"score_confiance": 0 à 10
|
||||
}}"""
|
||||
prompt = CPAM_ADVERSARIAL.format(
|
||||
response_json=response_json,
|
||||
factual_section=factual_section,
|
||||
normes_section=normes_section,
|
||||
dp_ucr_line=dp_ucr_line,
|
||||
da_ucr_line=da_ucr_line,
|
||||
)
|
||||
|
||||
logger.debug(" Validation adversariale")
|
||||
result = call_ollama(prompt, temperature=0.0, max_tokens=800)
|
||||
result = call_ollama(prompt, temperature=0.0, max_tokens=800, role="validation")
|
||||
if result is None:
|
||||
result = call_anthropic(prompt, temperature=0.0, max_tokens=800)
|
||||
if result is None:
|
||||
@@ -924,36 +839,22 @@ def _extraction_pass(
|
||||
# Contexte tagué (réutilise la même fonction)
|
||||
tagged_text, _ = _build_tagged_context(dossier)
|
||||
|
||||
prompt = f"""Tu es un médecin DIM expert. Analyse cette contestation CPAM sans argumenter.
|
||||
dp_ucr_line = f"DP proposé UCR : {controle.dp_ucr}" if controle.dp_ucr else ""
|
||||
da_ucr_line = f"DA proposés UCR : {controle.da_ucr}" if controle.da_ucr else ""
|
||||
|
||||
DOSSIER :
|
||||
- DP : {dp_str or "Non extrait"}
|
||||
- DAS : {das_str or "Aucun"}
|
||||
{tagged_text}
|
||||
|
||||
CONTESTATION CPAM :
|
||||
Titre : {controle.titre}
|
||||
Argument : {controle.arg_ucr}
|
||||
Décision : {controle.decision_ucr}
|
||||
{f"DP proposé UCR : {controle.dp_ucr}" if controle.dp_ucr else ""}
|
||||
{f"DA proposés UCR : {controle.da_ucr}" if controle.da_ucr else ""}
|
||||
|
||||
Réponds UNIQUEMENT en JSON :
|
||||
{{
|
||||
"comprehension_contestation": "Résumé factuel : que conteste la CPAM et pourquoi",
|
||||
"elements_cliniques_pertinents": [
|
||||
{{"tag": "BIO-1 ou texte libre", "pertinence": "en quoi cet élément est pertinent pour le codage contesté"}}
|
||||
],
|
||||
"points_accord_potentiels": ["points où la CPAM a partiellement raison"],
|
||||
"codes_en_jeu": {{
|
||||
"dp_etablissement": "code + libellé",
|
||||
"dp_ucr": "code + libellé si proposé",
|
||||
"difference_cle": "explication de la différence entre les deux codages"
|
||||
}}
|
||||
}}"""
|
||||
prompt = CPAM_EXTRACTION.format(
|
||||
dp_str=dp_str or "Non extrait",
|
||||
das_str=das_str or "Aucun",
|
||||
tagged_text=tagged_text,
|
||||
titre=controle.titre,
|
||||
arg_ucr=controle.arg_ucr,
|
||||
decision_ucr=controle.decision_ucr,
|
||||
dp_ucr_line=dp_ucr_line,
|
||||
da_ucr_line=da_ucr_line,
|
||||
)
|
||||
|
||||
logger.debug(" Passe 1 — extraction structurée")
|
||||
result = call_ollama(prompt, temperature=0.0, max_tokens=1500)
|
||||
result = call_ollama(prompt, temperature=0.0, max_tokens=1500, role="cpam")
|
||||
if result is None:
|
||||
result = call_anthropic(prompt, temperature=0.0, max_tokens=1500)
|
||||
if result is not None:
|
||||
@@ -990,13 +891,13 @@ def generate_cpam_response(
|
||||
# 3. Construction du prompt (passe 2 — argumentation)
|
||||
prompt, tag_map = _build_cpam_prompt(dossier, controle, sources, extraction)
|
||||
|
||||
# 4. Appel LLM — Ollama (modèle par défaut) > Haiku fallback
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=4000)
|
||||
# 4. Appel LLM — Ollama (rôle cpam) > Haiku fallback
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=6000, role="cpam")
|
||||
if result is not None:
|
||||
logger.info(" Contre-argumentation via Ollama")
|
||||
else:
|
||||
logger.info(" Ollama indisponible → fallback Anthropic Haiku")
|
||||
result = call_anthropic(prompt, temperature=0.1, max_tokens=4000)
|
||||
result = call_anthropic(prompt, temperature=0.1, max_tokens=6000)
|
||||
if result is not None:
|
||||
logger.info(" Contre-argumentation via Anthropic Haiku")
|
||||
|
||||
|
||||
267
src/main.py
267
src/main.py
@@ -10,7 +10,19 @@ import time
|
||||
from pathlib import Path
|
||||
|
||||
from .anonymization.anonymizer import Anonymizer
|
||||
from .config import ANONYMIZED_DIR, INPUT_DIR, OUTPUT_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical
|
||||
from .config import (
|
||||
ANONYMIZED_DIR,
|
||||
INPUT_DIR,
|
||||
OUTPUT_DIR,
|
||||
REPORTS_DIR,
|
||||
STRUCTURED_DIR,
|
||||
AnonymizationReport,
|
||||
DossierMedical,
|
||||
DossierMetrics,
|
||||
VetoReport,
|
||||
set_rules_runtime,
|
||||
reset_rules_runtime,
|
||||
)
|
||||
from .extraction.document_classifier import classify
|
||||
from .extraction.crh_parser import parse_crh
|
||||
from .extraction.document_splitter import split_documents
|
||||
@@ -18,6 +30,9 @@ from .extraction.pdf_extractor import extract_text, extract_text_with_pages
|
||||
from .extraction.trackare_parser import parse_trackare
|
||||
from .medical.cim10_extractor import extract_medical_info
|
||||
from .medical.ghm import estimate_ghm
|
||||
from .quality.veto_engine import apply_vetos
|
||||
from .quality.decision_engine import apply_decisions, decision_summaries
|
||||
from .quality.rules_router import build_rules_runtime_context
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -25,6 +40,102 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _compute_metrics(dossier: DossierMedical) -> DossierMetrics:
    """Compute the "active vs excluded" quality metrics for reporting.

    Professional rule: metrics (GHM, severity, stats) must only count
    *active* diagnoses — not those removed or ruled out by the quality
    decisions.

    Fix: DossierMetrics declares das_ruled_out ("action RULED_OUT") but it
    was never computed; RULED_OUT decisions are now counted and, like
    REMOVE, excluded from the active count.

    Side effect: the computed metrics are also stored on dossier.metrics.
    """

    def _decision_action(diag) -> str | None:
        # Action of the quality decision attached to the diagnosis, if any.
        dec = getattr(diag, "cim10_decision", None)
        return getattr(dec, "action", None) if dec is not None else None

    def _has_any_code(diag) -> bool:
        # True when the diagnosis carries a final or suggested CIM-10 code.
        return bool(getattr(diag, "cim10_final", None) or getattr(diag, "cim10_suggestion", None))

    def _is_active_diag(diag) -> bool:
        # A diagnosis is active unless a decision removed or ruled it out.
        if _decision_action(diag) in ("REMOVE", "RULED_OUT"):
            return False
        if getattr(diag, "cim10_final", None):
            return True
        # Backward compat: suggestion only, no final and no decision at all.
        if (
            getattr(diag, "cim10_final", None) is None
            and getattr(diag, "cim10_suggestion", None)
            and getattr(diag, "cim10_decision", None) is None
        ):
            return True
        return False

    das_total = len(dossier.diagnostics_associes)
    das_active = 0
    das_removed = 0
    das_ruled_out = 0
    das_no_code = 0
    for d in dossier.diagnostics_associes:
        action = _decision_action(d)
        if action == "REMOVE":
            das_removed += 1
        elif action == "RULED_OUT":
            # Visible but struck through: excluded from the active counts.
            das_ruled_out += 1
        if not _has_any_code(d):
            das_no_code += 1
        if _is_active_diag(d):
            das_active += 1

    actes_total = len(dossier.actes_ccam)
    actes_with_code = sum(1 for a in dossier.actes_ccam if getattr(a, "code_ccam_suggestion", None))

    # The DP counts as coded unless a decision removed it outright.
    dp_has_code = False
    dp = dossier.diagnostic_principal
    if dp is not None and _decision_action(dp) != "REMOVE":
        dp_has_code = _has_any_code(dp)

    metrics = DossierMetrics(
        das_total=das_total,
        das_active=das_active,
        das_excluded=max(0, das_total - das_active),
        das_removed=das_removed,
        das_ruled_out=das_ruled_out,
        das_no_code=das_no_code,
        actes_total=actes_total,
        actes_with_code=actes_with_code,
        dp_has_code=dp_has_code,
    )
    dossier.metrics = metrics
    return metrics
|
||||
|
||||
|
||||
def _inject_veto_alerts(dossier: DossierMedical, veto: VetoReport, scope: str = "FINAL") -> None:
|
||||
"""Injecte les alertes liées aux vetos dans alertes_codage en évitant les doublons.
|
||||
|
||||
On *remplace* la section VETO précédente (qu'elle vienne d'un PDF individuel ou d'une passe de fusion),
|
||||
afin que le JSON fusionné reste lisible.
|
||||
"""
|
||||
cleaned: list[str] = []
|
||||
for line in (dossier.alertes_codage or []):
|
||||
if isinstance(line, str) and (line.startswith("VETOS:") or line.startswith("VETOS[") or line.startswith("VETO-")):
|
||||
continue
|
||||
cleaned.append(line)
|
||||
dossier.alertes_codage = cleaned
|
||||
|
||||
if veto.verdict != "PASS":
|
||||
dossier.alertes_codage.append(f"VETOS[{scope}]: {veto.verdict} (score={veto.score_contestabilite})")
|
||||
for it in veto.issues[:25]:
|
||||
dossier.alertes_codage.append(f"{it.veto} [{it.severity}] {it.where}: {it.message}")
|
||||
|
||||
|
||||
def _inject_decision_alerts(dossier: DossierMedical, scope: str = "FINAL") -> None:
|
||||
"""Injecte les décisions (downgrade/suppression) dans alertes_codage.
|
||||
|
||||
On remplace la section DECISION précédente pour garder un JSON lisible.
|
||||
"""
|
||||
cleaned: list[str] = []
|
||||
for line in (dossier.alertes_codage or []):
|
||||
if isinstance(line, str) and line.startswith("DECISION:"):
|
||||
continue
|
||||
cleaned.append(line)
|
||||
dossier.alertes_codage = cleaned
|
||||
|
||||
lines = decision_summaries(dossier)
|
||||
if lines:
|
||||
dossier.alertes_codage.append(f"DECISIONS[{scope}]: {len(lines)} ligne(s)")
|
||||
dossier.alertes_codage.extend(lines[:30])
|
||||
|
||||
|
||||
# Flags globaux
|
||||
_use_edsnlp = True
|
||||
_use_rag = True
|
||||
@@ -89,17 +200,67 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
|
||||
dossier.source_file = pdf_path.name
|
||||
dossier.document_type = doc_type
|
||||
logger.info(" DP%s : %s", part_label, dossier.diagnostic_principal)
|
||||
logger.info(" DAS : %d, Actes : %d", len(dossier.diagnostics_associes), len(dossier.actes_ccam))
|
||||
|
||||
# 8. Estimation GHM
|
||||
# 8. Vetos (contestabilité) + décisions (post-traitement)
|
||||
# Routage des règles (packs) : par défaut, on garde le socle vetos/decisions,
|
||||
# et on active des packs additionnels selon les signaux du dossier (codes/labs/extraits).
|
||||
rules_token = None
|
||||
try:
|
||||
rules_ctx = build_rules_runtime_context(dossier)
|
||||
dossier.rules_runtime = rules_ctx
|
||||
rules_token = set_rules_runtime(rules_ctx)
|
||||
|
||||
packs = ",".join(rules_ctx.get("enabled_packs", []))
|
||||
if packs:
|
||||
logger.info(" Règles%s : packs=%s", part_label, packs)
|
||||
if rules_ctx.get("triggers_fired"):
|
||||
logger.info(" Règles%s : triggers=%s", part_label, ",".join(rules_ctx["triggers_fired"]))
|
||||
except Exception:
|
||||
logger.warning(" Routage règles : erreur", exc_info=True)
|
||||
|
||||
veto = None
|
||||
try:
|
||||
veto = apply_vetos(dossier)
|
||||
dossier.veto_report = veto
|
||||
except Exception:
|
||||
logger.warning(" Vetos : erreur lors du contrôle", exc_info=True)
|
||||
|
||||
try:
|
||||
apply_decisions(dossier)
|
||||
_inject_decision_alerts(dossier, scope="PDF")
|
||||
if veto is not None:
|
||||
_inject_veto_alerts(dossier, veto, scope="PDF")
|
||||
except Exception:
|
||||
logger.warning(" Décisions : erreur lors du post-traitement", exc_info=True)
|
||||
finally:
|
||||
if rules_token is not None:
|
||||
reset_rules_runtime(rules_token)
|
||||
|
||||
# 9. Estimation GHM (sur codes finaux) + métriques (actifs vs écartés)
|
||||
try:
|
||||
metrics = _compute_metrics(dossier)
|
||||
ghm = estimate_ghm(dossier)
|
||||
dossier.ghm_estimation = ghm
|
||||
logger.info(" GHM : CMD=%s, Type=%s, Sévérité=%d → %s",
|
||||
ghm.cmd or "?", ghm.type_ghm or "?",
|
||||
ghm.severite, ghm.ghm_approx or "?")
|
||||
|
||||
logger.info(
|
||||
" DAS : actifs=%d / total=%d (écartés=%d, removed=%d, no_code=%d) | Actes : %d (avec code=%d)",
|
||||
metrics.das_active,
|
||||
metrics.das_total,
|
||||
metrics.das_excluded,
|
||||
metrics.das_removed,
|
||||
metrics.das_no_code,
|
||||
metrics.actes_total,
|
||||
metrics.actes_with_code,
|
||||
)
|
||||
logger.info(
|
||||
" GHM : CMD=%s, Type=%s, Sévérité=%d → %s",
|
||||
ghm.cmd or "?",
|
||||
ghm.type_ghm or "?",
|
||||
ghm.severite,
|
||||
ghm.ghm_approx or "?",
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(" Erreur estimation GHM", exc_info=True)
|
||||
logger.warning(" Erreur estimation GHM/metrics", exc_info=True)
|
||||
|
||||
dossier.processing_time_s = round(time.time() - t0, 2)
|
||||
results.append((anonymized_text, dossier, report))
|
||||
@@ -223,6 +384,11 @@ def main(input_path: str | None = None) -> None:
|
||||
action="store_true",
|
||||
help="Forcer la reconstruction de l'index FAISS",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rebuild-index-all",
|
||||
action="store_true",
|
||||
help="Reconstruit les index FAISS + ré-indexe tous les référentiels uploadés",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--export-rum",
|
||||
action="store_true",
|
||||
@@ -246,6 +412,24 @@ def main(input_path: str | None = None) -> None:
|
||||
logger.info("Dictionnaire CCAM : %d codes générés", len(result))
|
||||
return
|
||||
|
||||
if args.rebuild_index_all:
|
||||
from .medical.rag_index import build_index
|
||||
build_index(force=True)
|
||||
# Ré-indexer tous les référentiels uploadés (pour appliquer le nouveau chunking/filtrage)
|
||||
try:
|
||||
from .viewer.referentiels import ReferentielManager
|
||||
rm = ReferentielManager()
|
||||
total = 0
|
||||
for ref in rm.list_all():
|
||||
try:
|
||||
total += rm.index_referentiel(ref["id"])
|
||||
except Exception:
|
||||
logger.warning("Ré-indexation référentiel échouée : %s", ref.get("filename"), exc_info=True)
|
||||
logger.info("Ré-indexation référentiels terminée : %d chunks ajoutés", total)
|
||||
except Exception:
|
||||
logger.warning("Impossible de ré-indexer les référentiels uploadés", exc_info=True)
|
||||
return
|
||||
|
||||
if args.rebuild_index:
|
||||
from .medical.rag_index import build_index
|
||||
build_index(force=True)
|
||||
@@ -341,16 +525,6 @@ def main(input_path: str | None = None) -> None:
|
||||
from .medical.fusion import merge_dossiers
|
||||
merged = merge_dossiers(group_dossiers)
|
||||
|
||||
# Re-estimer le GHM sur le dossier fusionné (DP/DAS consolidés)
|
||||
try:
|
||||
ghm = estimate_ghm(merged)
|
||||
merged.ghm_estimation = ghm
|
||||
logger.info(" GHM fusionné : CMD=%s, Type=%s, Sévérité=%d → %s",
|
||||
ghm.cmd or "?", ghm.type_ghm or "?",
|
||||
ghm.severite, ghm.ghm_approx or "?")
|
||||
except Exception:
|
||||
logger.warning(" Erreur estimation GHM fusionné", exc_info=True)
|
||||
|
||||
struct_dir = STRUCTURED_DIR / subdir
|
||||
struct_dir.mkdir(parents=True, exist_ok=True)
|
||||
merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
|
||||
@@ -389,6 +563,65 @@ def main(input_path: str | None = None) -> None:
|
||||
# Écrire le dossier fusionné (après enrichissement CPAM éventuel)
|
||||
if merged is not None and subdir:
|
||||
try:
|
||||
# Vetos sur la version finale (fusion + CPAM) + décisions
|
||||
# Routage des règles (packs) pour la version fusionnée
|
||||
rules_token = None
|
||||
try:
|
||||
rules_ctx = build_rules_runtime_context(merged)
|
||||
merged.rules_runtime = rules_ctx
|
||||
rules_token = set_rules_runtime(rules_ctx)
|
||||
|
||||
packs = ",".join(rules_ctx.get("enabled_packs", []))
|
||||
if packs:
|
||||
logger.info(" Règles fusionné : packs=%s", packs)
|
||||
if rules_ctx.get("triggers_fired"):
|
||||
logger.info(" Règles fusionné : triggers=%s", ",".join(rules_ctx["triggers_fired"]))
|
||||
except Exception:
|
||||
logger.warning(" Routage règles fusionné : erreur", exc_info=True)
|
||||
|
||||
veto = None
|
||||
try:
|
||||
veto = apply_vetos(merged)
|
||||
merged.veto_report = veto
|
||||
except Exception:
|
||||
logger.warning(" Vetos fusionné : erreur lors du contrôle", exc_info=True)
|
||||
|
||||
try:
|
||||
apply_decisions(merged)
|
||||
_inject_decision_alerts(merged, scope="FINAL")
|
||||
if veto is not None:
|
||||
_inject_veto_alerts(merged, veto, scope="FINAL")
|
||||
except Exception:
|
||||
logger.warning(" Décisions fusionné : erreur lors du post-traitement", exc_info=True)
|
||||
finally:
|
||||
if rules_token is not None:
|
||||
reset_rules_runtime(rules_token)
|
||||
|
||||
# Re-estimer le GHM (sur codes finaux) + métriques (actifs vs écartés)
|
||||
try:
|
||||
metrics = _compute_metrics(merged)
|
||||
ghm = estimate_ghm(merged)
|
||||
merged.ghm_estimation = ghm
|
||||
logger.info(
|
||||
" Fusion métriques : DAS actifs=%d / total=%d (écartés=%d, removed=%d, no_code=%d) | Actes=%d (avec code=%d)",
|
||||
metrics.das_active,
|
||||
metrics.das_total,
|
||||
metrics.das_excluded,
|
||||
metrics.das_removed,
|
||||
metrics.das_no_code,
|
||||
metrics.actes_total,
|
||||
metrics.actes_with_code,
|
||||
)
|
||||
logger.info(
|
||||
" GHM final : CMD=%s, Type=%s, Sévérité=%d → %s",
|
||||
ghm.cmd or "?",
|
||||
ghm.type_ghm or "?",
|
||||
ghm.severite,
|
||||
ghm.ghm_approx or "?",
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(" Erreur estimation GHM/metrics final", exc_info=True)
|
||||
|
||||
struct_dir = STRUCTURED_DIR / subdir
|
||||
struct_dir.mkdir(parents=True, exist_ok=True)
|
||||
merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
|
||||
|
||||
@@ -39,7 +39,7 @@ def normalize_text(text: str) -> str:
|
||||
|
||||
|
||||
def build_dict() -> dict[str, str]:
|
||||
"""Construit le dictionnaire CIM-10 depuis metadata.json et l'écrit dans data/cim10_dict.json.
|
||||
"""Construit le dictionnaire CIM-10 depuis les métadonnées RAG.
|
||||
|
||||
Extrait le code et le label (première ligne de l'extrait, sans le préfixe code)
|
||||
depuis chaque entrée CIM-10 du metadata.json existant.
|
||||
@@ -47,9 +47,14 @@ def build_dict() -> dict[str, str]:
|
||||
Returns:
|
||||
Le dictionnaire code → label.
|
||||
"""
|
||||
metadata_path = RAG_INDEX_DIR / "metadata.json"
|
||||
# Nouveau format : metadata_ref.json (fallback legacy : metadata.json)
|
||||
metadata_path = RAG_INDEX_DIR / "metadata_ref.json"
|
||||
if not metadata_path.exists():
|
||||
logger.error("metadata.json non trouvé : %s", metadata_path)
|
||||
legacy = RAG_INDEX_DIR / "metadata.json"
|
||||
if legacy.exists():
|
||||
metadata_path = legacy
|
||||
else:
|
||||
logger.error("Métadonnées RAG non trouvées : %s", metadata_path)
|
||||
return {}
|
||||
|
||||
with open(metadata_path, encoding="utf-8") as f:
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
@@ -19,6 +20,7 @@ from ..config import (
|
||||
Complication,
|
||||
Diagnostic,
|
||||
DossierMedical,
|
||||
load_lab_value_sanity,
|
||||
Imagerie,
|
||||
Sejour,
|
||||
Traitement,
|
||||
@@ -168,13 +170,13 @@ def _extract_das_llm(text: str, dossier: DossierMedical) -> None:
|
||||
try:
|
||||
from .rag_search import extract_das_llm
|
||||
from .ollama_cache import OllamaCache
|
||||
from ..config import OLLAMA_CACHE_PATH, OLLAMA_MODEL
|
||||
from ..config import OLLAMA_CACHE_PATH, get_model
|
||||
except ImportError:
|
||||
logger.warning("Module RAG non disponible pour l'extraction DAS LLM")
|
||||
return
|
||||
|
||||
try:
|
||||
cache = OllamaCache(OLLAMA_CACHE_PATH, OLLAMA_MODEL)
|
||||
cache = OllamaCache(OLLAMA_CACHE_PATH, get_model("coding"))
|
||||
|
||||
# Construire le contexte
|
||||
contexte = {
|
||||
@@ -684,37 +686,181 @@ def _match_drug_atc(med_name: str, drug_atc: dict[str, str]) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _extract_biologie(text: str, dossier: DossierMedical) -> None:
|
||||
"""Extrait les résultats biologiques clés.
|
||||
|
||||
Supporte les aliases (TGO/TGP, Hb), variantes d'unités (UI/L, µmol/L, g/dL),
|
||||
et des tests additionnels (hémoglobine, plaquettes, leucocytes, créatinine).
|
||||
def _norm_key(s: str) -> str:
|
||||
"""Normalise une clé (minuscules, sans accents) pour index YAML."""
|
||||
s = (s or "").strip().lower()
|
||||
s = unicodedata.normalize("NFKD", s)
|
||||
s = "".join(ch for ch in s if not unicodedata.combining(ch))
|
||||
return re.sub(r"\s+", " ", s)
|
||||
|
||||
|
||||
def _parse_float_and_token(raw: str) -> tuple[float | None, str | None]:
|
||||
"""Parse un float et renvoie aussi le token numérique normalisé (avec '.')."""
|
||||
if raw is None:
|
||||
return None, None
|
||||
s = str(raw).strip()
|
||||
m = re.search(r"(-?\d+(?:[\.,]\d+)?)", s)
|
||||
if not m:
|
||||
return None, None
|
||||
token = m.group(1).replace(",", ".")
|
||||
try:
|
||||
return float(token), token
|
||||
except ValueError:
|
||||
return None, None
|
||||
|
||||
|
||||
def _sanitize_bio_value(test_name: str, raw_value: str, sanity_cfg: dict) -> tuple[str, float, str, str | None] | None:
|
||||
"""Applique des garde-fous anti-artefacts (OCR/PDF).
|
||||
|
||||
Retour:
|
||||
(token, value_float, quality, reason) ou None si non parsable.
|
||||
quality: ok | suspect | discarded
|
||||
"""
|
||||
bio_patterns = [
|
||||
(r"[Ll]ipas[ée]mie\s*(?:[àa=:])?\s*(\d+)\s*(?:UI/L|U/L)?", "Lipasémie", None),
|
||||
(r"CRP\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:mg/[Ll])?", "CRP", None),
|
||||
(r"(?:ASAT|TGO)\s*[=:àa]?\s*([\d.,]+)\s*(?:N|U(?:I)?/L)?", "ASAT", None),
|
||||
(r"(?:ALAT|TGP)\s*[=:àa]?\s*([\d.,]+)\s*(?:N|U(?:I)?/L)?", "ALAT", None),
|
||||
(r"GGT\s*[=:àa]?\s*(\d+)\s*(?:U(?:I)?/L)?", "GGT", None),
|
||||
(r"PAL\s*[=:àa]?\s*(\d+)\s*(?:U(?:I)?/L)?", "PAL", None),
|
||||
(r"[Bb]ilirubine\s+(?:totale\s+)?[àa=:]\s*(\d+(?:[.,]\d+)?)\s*(?:µmol/L|mg/dL)?", "Bilirubine totale", None),
|
||||
(r"[Tt]roponine\s+(?:us\s+)?(n[ée]gative|positive|normale)", "Troponine", None),
|
||||
(r"(?:[Hh][ée]moglobine|Hb)\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:g/dL|g/L)?", "Hémoglobine", None),
|
||||
(r"[Pp]laquettes?\s*[=:àa]?\s*(\d+(?:\s*000)?)\s*(?:/mm3|G/L)?", "Plaquettes", None),
|
||||
(r"[Ll]eucocytes?\s*[=:àa]?\s*(\d+(?:\s*000)?)\s*(?:/mm3|G/L)?", "Leucocytes", None),
|
||||
(r"[Cc]r[ée]atinine?\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:µmol/L|mg/dL)?", "Créatinine", None),
|
||||
val, token = _parse_float_and_token(raw_value)
|
||||
if val is None or token is None:
|
||||
return None
|
||||
|
||||
key = _norm_key(test_name)
|
||||
tests_cfg = (sanity_cfg or {}).get("tests") or {}
|
||||
cfg = tests_cfg.get(key) or {}
|
||||
hard_min = cfg.get("hard_min")
|
||||
hard_max = cfg.get("hard_max")
|
||||
|
||||
if hard_min is not None and val < float(hard_min):
|
||||
return token, val, "discarded", f"Valeur hors bornes plausibles (<{hard_min})"
|
||||
if hard_max is not None and val > float(hard_max):
|
||||
return token, val, "discarded", f"Valeur hors bornes plausibles (>{hard_max})"
|
||||
|
||||
quality = "ok"
|
||||
reason: str | None = None
|
||||
|
||||
suspect_cfg = cfg.get("suspect") or {}
|
||||
single_digit_over = suspect_cfg.get("single_digit_over")
|
||||
if single_digit_over is not None:
|
||||
# Ex: potassium '8' au lieu de '4.8' (décimale perdue)
|
||||
if re.fullmatch(r"\d", str(raw_value).strip()) and val >= float(single_digit_over):
|
||||
quality = "suspect"
|
||||
reason = f"Valeur à 1 chiffre (possible décimale perdue) : vérifier dans le CR"
|
||||
|
||||
return token, val, quality, reason
|
||||
|
||||
|
||||
def _extract_biologie(text: str, dossier: DossierMedical) -> None:
|
||||
"""Extrait des résultats biologiques clés.
|
||||
|
||||
Notes:
|
||||
- Supporte des aliases (TGO/TGP, Hb, Na/K…)
|
||||
- Capte plusieurs occurrences (utile pour valider/infirmer des diagnostics)
|
||||
- Reste volontairement *simple* (regex sur texte extrait) : si une valeur est
|
||||
uniquement dans un tableau PDF mal extrait, elle peut manquer.
|
||||
"""
|
||||
# (pattern, test_name)
|
||||
bio_patterns: list[tuple[str, str]] = [
|
||||
(r"[Ll]ipas[ée]mie\s*(?:[àa=:])?\s*(\d+)\s*(?:UI/L|U/L)?", "Lipasémie"),
|
||||
(r"\bCRP\b\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:mg/[Ll])?", "CRP"),
|
||||
(r"(?:\bASAT\b|\bTGO\b)\s*[=:àa]?\s*([\d.,]+)\s*(?:N|U(?:I)?/L)?", "ASAT"),
|
||||
(r"(?:\bALAT\b|\bTGP\b)\s*[=:àa]?\s*([\d.,]+)\s*(?:N|U(?:I)?/L)?", "ALAT"),
|
||||
(r"\bGGT\b\s*[=:àa]?\s*(\d+)\s*(?:U(?:I)?/L)?", "GGT"),
|
||||
(r"\bPAL\b\s*[=:àa]?\s*(\d+)\s*(?:U(?:I)?/L)?", "PAL"),
|
||||
(r"[Bb]ilirubine\s+(?:totale\s+)?[àa=:]\s*(\d+(?:[.,]\d+)?)\s*(?:µmol/L|mg/dL)?", "Bilirubine totale"),
|
||||
|
||||
# Ionogramme / électrolytes
|
||||
(r"(?:[Ss]odium|[Nn]atr[ée]mie|(?<![A-Za-z])Na\+?(?![A-Za-z]))\s*[=:àa]?\s*([0-9]{2,3}(?:[.,][0-9]+)?)\s*(?:mmol/L|mEq/L)?", "Sodium"),
|
||||
(r"(?:[Pp]otassium|[Kk]ali[ée]mie|(?<![A-Za-z])K\+?(?![A-Za-z]))\s*[=:àa]?\s*([0-9](?:[.,][0-9]+)?)\s*(?:mmol/L|mEq/L)?", "Potassium"),
|
||||
|
||||
(r"[Tt]roponine\s+(?:us\s+)?(n[ée]gative|positive|normale)", "Troponine"),
|
||||
(r"(?:[Hh][ée]moglobine|\bHb\b)\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:g/dL|g/L)?", "Hémoglobine"),
|
||||
(r"[Pp]laquettes?\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:/mm3|G/L)?", "Plaquettes"),
|
||||
(r"[Ll]eucocytes?\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:/mm3|G/L)?", "Leucocytes"),
|
||||
(r"[Cc]r[ée]atinine?\s*[=:àa]?\s*(\d+(?:[.,]\d+)?)\s*(?:µmol/L|mg/dL)?", "Créatinine"),
|
||||
]
|
||||
|
||||
for pattern, test_name, _ in bio_patterns:
|
||||
m = re.search(pattern, text)
|
||||
if m:
|
||||
value = m.group(1)
|
||||
anomalie = _is_abnormal(test_name, value)
|
||||
dossier.biologie_cle.append(BiologieCle(
|
||||
|
||||
# Anti-doublons + limite par test (évite d'exploser le JSON)
|
||||
max_per_test = 6
|
||||
counts: dict[str, int] = {}
|
||||
seen: set[tuple[str, str]] = set()
|
||||
|
||||
sanity_cfg = load_lab_value_sanity()
|
||||
policy = (sanity_cfg or {}).get("policy") or {}
|
||||
drop_out_of_range = bool(policy.get("drop_out_of_range", True))
|
||||
keep_suspect = bool(policy.get("keep_suspect", True))
|
||||
|
||||
for pattern, test_name in bio_patterns:
|
||||
for m in re.finditer(pattern, text):
|
||||
raw_value = (m.group(1) or "").strip()
|
||||
if not raw_value:
|
||||
continue
|
||||
|
||||
# Valeurs qualitatives (troponine négative/positive/normale) :
|
||||
# pas de sanitization numérique.
|
||||
if re.fullmatch(r"[a-zA-Zéèêëàâôûùïîç]+", raw_value):
|
||||
key = (test_name, raw_value.lower())
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
counts[test_name] = counts.get(test_name, 0) + 1
|
||||
if counts[test_name] > max_per_test:
|
||||
break
|
||||
anomalie = _is_abnormal(test_name, raw_value)
|
||||
dossier.biologie_cle.append(
|
||||
BiologieCle(
|
||||
test=test_name,
|
||||
valeur=value,
|
||||
valeur=raw_value,
|
||||
valeur_num=None,
|
||||
anomalie=anomalie,
|
||||
))
|
||||
quality="ok",
|
||||
discard_reason=None,
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
sanitized = _sanitize_bio_value(test_name, raw_value, sanity_cfg)
|
||||
if sanitized is None:
|
||||
continue
|
||||
token, val_num, quality, reason = sanitized
|
||||
|
||||
if quality == "suspect" and not keep_suspect:
|
||||
quality = "discarded"
|
||||
reason = reason or "Valeur suspecte (policy keep_suspect=false)"
|
||||
|
||||
# Déduplication sur la valeur normalisée
|
||||
key = (test_name, token)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
counts[test_name] = counts.get(test_name, 0) + 1
|
||||
if counts[test_name] > max_per_test:
|
||||
break
|
||||
|
||||
if quality == "discarded":
|
||||
# On garde la trace pour audit, sans polluer les règles qualité.
|
||||
dossier.biologie_discarded.append(
|
||||
{
|
||||
"test": test_name,
|
||||
"raw": raw_value,
|
||||
"valeur": token,
|
||||
"valeur_num": val_num,
|
||||
"reason": reason,
|
||||
}
|
||||
)
|
||||
if drop_out_of_range:
|
||||
continue
|
||||
|
||||
anomalie = _is_abnormal(test_name, token)
|
||||
dossier.biologie_cle.append(
|
||||
BiologieCle(
|
||||
test=test_name,
|
||||
valeur=token,
|
||||
valeur_num=val_num,
|
||||
anomalie=anomalie,
|
||||
quality=quality,
|
||||
discard_reason=reason,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
||||
def _extract_imagerie(text: str, dossier: DossierMedical) -> None:
|
||||
@@ -1013,6 +1159,9 @@ BIO_NORMALS: dict[str, tuple[float, float]] = {
|
||||
"GGT": (0, 60),
|
||||
"PAL": (0, 150),
|
||||
"Bilirubine totale": (0, 17),
|
||||
# Ionogramme (fallback adulte ; les règles de décision utilisent reference_ranges.yaml)
|
||||
"Sodium": (135, 145),
|
||||
"Potassium": (3.5, 5.0),
|
||||
"Hémoglobine": (12, 17),
|
||||
"Plaquettes": (150, 400),
|
||||
"Leucocytes": (4, 10),
|
||||
@@ -1152,36 +1301,11 @@ def _validate_justifications(dossier: DossierMedical) -> None:
|
||||
ctx = build_enriched_context(dossier)
|
||||
ctx_str = format_enriched_context(ctx)
|
||||
|
||||
prompt = f"""Tu es un médecin DIM contrôleur qualité PMSI.
|
||||
Vérifie la cohérence et la justification de ce codage complet.
|
||||
|
||||
DOSSIER CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
CODAGE À VALIDER :
|
||||
{codes_section}
|
||||
|
||||
Pour CHAQUE code, vérifie :
|
||||
1. Existe-t-il une preuve clinique concrète dans le dossier ?
|
||||
2. Le code est-il le plus spécifique possible ?
|
||||
3. Y a-t-il des conflits ou redondances avec d'autres codes ?
|
||||
|
||||
Réponds avec un JSON :
|
||||
{{
|
||||
"validations": [
|
||||
{{
|
||||
"numero": 1,
|
||||
"code": "X99.9",
|
||||
"verdict": "maintenir|reclasser|supprimer",
|
||||
"confidence_recommandee": "high|medium|low",
|
||||
"commentaire": "explication courte"
|
||||
}}
|
||||
],
|
||||
"alertes_globales": ["..."]
|
||||
}}"""
|
||||
from ..prompts import QC_VALIDATION
|
||||
prompt = QC_VALIDATION.format(ctx_str=ctx_str, codes_section=codes_section)
|
||||
|
||||
try:
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=2500)
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=2500, role="qc")
|
||||
except Exception:
|
||||
logger.warning("Erreur lors de l'appel Ollama pour validation QC", exc_info=True)
|
||||
return
|
||||
|
||||
@@ -152,6 +152,12 @@ def _compute_severity(das_list: list) -> tuple[int, int, int]:
|
||||
max_cma_level = 1
|
||||
|
||||
for das in das_list:
|
||||
# Exclure les diagnostics "barrés" / retirés du calcul de sévérité
|
||||
dec = getattr(das, "cim10_decision", None)
|
||||
if getattr(das, "status", None) == "ruled_out":
|
||||
continue
|
||||
if dec is not None and getattr(dec, "action", None) in ("REMOVE", "RULED_OUT"):
|
||||
continue
|
||||
niveau_cma = getattr(das, "niveau_cma", None)
|
||||
if niveau_cma and niveau_cma > 1:
|
||||
max_cma_level = max(max_cma_level, niveau_cma)
|
||||
|
||||
@@ -14,53 +14,79 @@ class OllamaCache:
|
||||
"""Cache JSON persistant pour éviter les appels Ollama redondants.
|
||||
|
||||
Clé = (texte_diagnostic_normalisé, type).
|
||||
Le modèle Ollama est stocké dans les métadonnées : si le modèle change,
|
||||
le cache est automatiquement invalidé.
|
||||
Le modèle Ollama est stocké PAR ENTRÉE : si le modèle change pour un rôle,
|
||||
seules les entrées de cet ancien modèle sont invalides.
|
||||
|
||||
Migration automatique depuis l'ancien format (model global) au chargement.
|
||||
"""
|
||||
|
||||
def __init__(self, cache_path: Path, model: str):
|
||||
def __init__(self, cache_path: Path, model: str | None = None):
|
||||
self._path = cache_path
|
||||
self._model = model
|
||||
self._default_model = model
|
||||
self._lock = threading.Lock()
|
||||
self._data: dict[str, dict] = {}
|
||||
self._dirty = False
|
||||
self._load()
|
||||
|
||||
def _load(self) -> None:
|
||||
"""Charge le cache depuis le disque."""
|
||||
"""Charge le cache depuis le disque, avec migration automatique."""
|
||||
if not self._path.exists():
|
||||
logger.info("Cache Ollama : nouveau cache (%s)", self._path)
|
||||
return
|
||||
try:
|
||||
raw = json.loads(self._path.read_text(encoding="utf-8"))
|
||||
if raw.get("model") != self._model:
|
||||
logger.info(
|
||||
"Cache Ollama : modèle changé (%s → %s), cache invalidé",
|
||||
raw.get("model"), self._model,
|
||||
)
|
||||
return
|
||||
self._data = raw.get("entries", {})
|
||||
logger.info("Cache Ollama : %d entrées chargées", len(self._data))
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.warning("Cache Ollama : fichier corrompu (%s), réinitialisé", e)
|
||||
self._data = {}
|
||||
return
|
||||
|
||||
entries = raw.get("entries", {})
|
||||
|
||||
# Détection ancien format : {"model": "...", "entries": {k: result_dict_sans_model}}
|
||||
global_model = raw.get("model")
|
||||
if global_model and entries:
|
||||
first_val = next(iter(entries.values()), None)
|
||||
if isinstance(first_val, dict) and "model" not in first_val:
|
||||
# Migration : ancien format → nouveau (modèle par entrée)
|
||||
logger.info(
|
||||
"Cache Ollama : migration ancien format (model=%s) → modèle par entrée",
|
||||
global_model,
|
||||
)
|
||||
migrated: dict[str, dict] = {}
|
||||
for k, v in entries.items():
|
||||
if isinstance(v, dict):
|
||||
migrated[k] = {"model": global_model, "result": v}
|
||||
self._data = migrated
|
||||
self._dirty = True
|
||||
logger.info("Cache Ollama : %d entrées migrées", len(migrated))
|
||||
return
|
||||
|
||||
self._data = entries
|
||||
logger.info("Cache Ollama : %d entrées chargées", len(self._data))
|
||||
|
||||
@staticmethod
|
||||
def _make_key(texte: str, diag_type: str) -> str:
|
||||
"""Construit une clé normalisée."""
|
||||
return f"{diag_type}::{texte.strip().lower()}"
|
||||
|
||||
def get(self, texte: str, diag_type: str) -> dict | None:
|
||||
"""Récupère un résultat caché, ou None si absent."""
|
||||
def get(self, texte: str, diag_type: str, model: str | None = None) -> dict | None:
|
||||
"""Récupère un résultat caché, ou None si absent ou modèle différent."""
|
||||
key = self._make_key(texte, diag_type)
|
||||
use_model = model or self._default_model
|
||||
with self._lock:
|
||||
return self._data.get(key)
|
||||
entry = self._data.get(key)
|
||||
if entry is None:
|
||||
return None
|
||||
if use_model and entry.get("model") != use_model:
|
||||
return None
|
||||
return entry.get("result")
|
||||
|
||||
def put(self, texte: str, diag_type: str, result: dict) -> None:
|
||||
"""Stocke un résultat dans le cache."""
|
||||
def put(self, texte: str, diag_type: str, result: dict, model: str | None = None) -> None:
|
||||
"""Stocke un résultat dans le cache avec le modèle utilisé."""
|
||||
key = self._make_key(texte, diag_type)
|
||||
use_model = model or self._default_model
|
||||
with self._lock:
|
||||
self._data[key] = result
|
||||
self._data[key] = {"model": use_model, "result": result}
|
||||
self._dirty = True
|
||||
|
||||
def save(self) -> None:
|
||||
@@ -69,10 +95,7 @@ class OllamaCache:
|
||||
if not self._dirty:
|
||||
return
|
||||
self._path.parent.mkdir(parents=True, exist_ok=True)
|
||||
payload = {
|
||||
"model": self._model,
|
||||
"entries": self._data,
|
||||
}
|
||||
payload = {"entries": self._data}
|
||||
self._path.write_text(
|
||||
json.dumps(payload, ensure_ascii=False, indent=2),
|
||||
encoding="utf-8",
|
||||
|
||||
@@ -8,7 +8,7 @@ import os
|
||||
|
||||
import requests
|
||||
|
||||
from ..config import OLLAMA_URL, OLLAMA_MODEL, OLLAMA_TIMEOUT
|
||||
from ..config import OLLAMA_URL, OLLAMA_MODEL, OLLAMA_TIMEOUT, get_model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -84,6 +84,7 @@ def call_ollama(
|
||||
max_tokens: int = 2500,
|
||||
model: str | None = None,
|
||||
timeout: int | None = None,
|
||||
role: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Appelle Ollama en mode JSON natif, avec fallback Anthropic si indisponible.
|
||||
|
||||
@@ -91,13 +92,14 @@ def call_ollama(
|
||||
prompt: Le prompt à envoyer.
|
||||
temperature: Température de génération (défaut: 0.1).
|
||||
max_tokens: Nombre max de tokens (défaut: 2500).
|
||||
model: Modèle Ollama à utiliser (défaut: OLLAMA_MODEL global).
|
||||
model: Modèle Ollama à utiliser (prioritaire sur role).
|
||||
timeout: Timeout en secondes (défaut: OLLAMA_TIMEOUT global).
|
||||
role: Rôle LLM (coding, cpam, validation, qc) → résolu via get_model().
|
||||
|
||||
Returns:
|
||||
Le dict JSON parsé, ou None en cas d'erreur.
|
||||
"""
|
||||
use_model = model or OLLAMA_MODEL
|
||||
use_model = model or (get_model(role) if role else OLLAMA_MODEL)
|
||||
use_timeout = timeout or OLLAMA_TIMEOUT
|
||||
for attempt in range(2):
|
||||
try:
|
||||
|
||||
@@ -1,4 +1,13 @@
|
||||
"""Indexation FAISS des documents de référence CIM-10 / Guide métho / CCAM."""
|
||||
"""Indexation FAISS des documents de référence.
|
||||
|
||||
Objectif : éviter que des documents "procédure/méthodo" influencent le codage.
|
||||
|
||||
On maintient donc 2 index FAISS :
|
||||
- ref : référentiels (CIM-10, CCAM, référentiels uploadés en ref:...)
|
||||
- proc : procédures / guide méthodologique (guide_methodo + uploadés en proc:...)
|
||||
|
||||
Backwards compat : si les nouveaux fichiers n'existent pas, on retombe sur faiss.index.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -15,9 +24,8 @@ from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CCAM
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Singleton pour l'index chargé en mémoire
|
||||
_faiss_index = None
|
||||
_metadata: list[dict] = []
|
||||
# Singletons pour les index chargés en mémoire
|
||||
_loaded: dict[str, tuple] = {}
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -28,6 +36,99 @@ class Chunk:
|
||||
code: Optional[str] = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers nettoyage / découpe
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_RE_JUNK_LINE = re.compile(
|
||||
r"^(?:\d{1,4}|page\s*\d{1,4}|\d{1,4}\s*/\s*\d{1,4})$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _clean_lines(text: str) -> list[str]:
|
||||
"""Nettoie des artefacts d'extraction PDF (en-têtes/pieds de page, numéros, etc.)."""
|
||||
out: list[str] = []
|
||||
for raw in (text or "").split("\n"):
|
||||
line = (raw or "").strip().replace("\xa0", " ")
|
||||
if not line:
|
||||
continue
|
||||
# pagination / bruit
|
||||
if _RE_JUNK_LINE.match(line):
|
||||
continue
|
||||
# lignes ultra courtes non informatives
|
||||
if len(line) <= 2:
|
||||
continue
|
||||
out.append(line)
|
||||
return out
|
||||
|
||||
|
||||
def _split_by_words(text: str, max_words: int = 380, overlap: int = 50) -> list[str]:
|
||||
"""Découpe un texte long en fenêtres de mots avec recouvrement."""
|
||||
words = (text or "").split()
|
||||
if len(words) <= max_words:
|
||||
return [text.strip()] if text.strip() else []
|
||||
parts: list[str] = []
|
||||
i = 0
|
||||
step = max(1, max_words - overlap)
|
||||
while i < len(words):
|
||||
chunk = " ".join(words[i : i + max_words]).strip()
|
||||
if chunk:
|
||||
parts.append(chunk)
|
||||
i += step
|
||||
return parts
|
||||
|
||||
|
||||
_PROC_KW = (
|
||||
"procédure", "procedure", "méthodo", "methodo", "méthodologie", "methodologie",
|
||||
"démarche", "demarche", "étape", "etape", "objectif", "recommand", "doit", "il faut",
|
||||
"modalité", "modalite", "annexe", "document", "rappel", "consigne",
|
||||
)
|
||||
|
||||
_CRIT_KW = (
|
||||
"critère", "critere", "seuil", "score", "tableau", "cma", "ghm", "sévérité", "severite",
|
||||
"inclusion", "exclusion", "diagnostic", "code", "comorbid", "majoration",
|
||||
)
|
||||
|
||||
|
||||
def _looks_procedural(text: str) -> bool:
|
||||
"""Heuristique : détecte un chunk majoritairement 'procédural'.
|
||||
|
||||
Objectif : éviter que des passages 'process' (qui ne sont pas des critères ou définitions)
|
||||
polluent l'index référentiel (ex. COCOA).
|
||||
"""
|
||||
t = (text or "").lower()
|
||||
proc_hits = sum(1 for k in _PROC_KW if k in t)
|
||||
crit_hits = sum(1 for k in _CRIT_KW if k in t)
|
||||
# Si beaucoup de mots procéduraux et aucun signal de critères, on jette.
|
||||
return proc_hits >= 5 and crit_hits == 0
|
||||
|
||||
|
||||
def _paths(kind: str) -> tuple[Path, Path]:
    """Return ``(index_path, meta_path)`` for an index type.

    kind:
    - "ref"  : reference documents (default for unknown/empty values)
    - "proc" : procedures
    - "all"  : legacy single index (faiss.index / metadata.json)
    """
    normalized = (kind or "ref").lower()
    # Map each kind to the filename suffix; anything unknown falls back to "ref".
    suffix_by_kind = {"proc": "_proc", "all": ""}
    suffix = suffix_by_kind.get(normalized, "_ref")
    return (
        RAG_INDEX_DIR / f"faiss{suffix}.index",
        RAG_INDEX_DIR / f"metadata{suffix}.json",
    )
|
||||
|
||||
|
||||
def _kind_for_chunk(chunk: Chunk) -> str:
    """Pick the target index kind for a chunk: "proc" or "ref".

    Chunks from the methodology guide, or from uploaded procedure
    documents (document name prefixed with "proc:"), are routed to the
    procedural index; everything else goes to the reference index.
    """
    document = (chunk.document or "").lower()
    is_procedural = document == "guide_methodo" or document.startswith("proc:")
    return "proc" if is_procedural else "ref"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chunking CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -54,8 +155,9 @@ def _chunk_cim10(pdf_path: Path) -> list[Chunk]:
|
||||
if current_subcode and current_subcode_text:
|
||||
chunk_text = "\n".join(current_subcode_text)
|
||||
if len(chunk_text.split()) >= 3:
|
||||
for part in _split_by_words(chunk_text, max_words=260, overlap=40):
|
||||
chunks.append(Chunk(
|
||||
text=chunk_text,
|
||||
text=part,
|
||||
document="cim10",
|
||||
page=current_subcode_page,
|
||||
code=current_subcode,
|
||||
@@ -67,8 +169,9 @@ def _chunk_cim10(pdf_path: Path) -> list[Chunk]:
|
||||
if current_code3 and current_code3_text:
|
||||
chunk_text = "\n".join(current_code3_text)
|
||||
if len(chunk_text.split()) >= 5:
|
||||
for part in _split_by_words(chunk_text, max_words=320, overlap=50):
|
||||
chunks.append(Chunk(
|
||||
text=chunk_text,
|
||||
text=part,
|
||||
document="cim10",
|
||||
page=current_code3_page,
|
||||
code=current_code3,
|
||||
@@ -80,10 +183,7 @@ def _chunk_cim10(pdf_path: Path) -> list[Chunk]:
|
||||
if not text:
|
||||
continue
|
||||
|
||||
for line in text.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
for line in _clean_lines(text):
|
||||
|
||||
m_sub = subcode_pattern.match(line)
|
||||
m3 = code3_pattern.match(line)
|
||||
@@ -146,10 +246,7 @@ def _chunk_guide_methodo(pdf_path: Path) -> list[Chunk]:
|
||||
if not text:
|
||||
continue
|
||||
|
||||
for line in text.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
for line in _clean_lines(text):
|
||||
|
||||
is_title = False
|
||||
for pat in title_patterns:
|
||||
@@ -194,12 +291,13 @@ def _chunk_guide_methodo(pdf_path: Path) -> list[Chunk]:
|
||||
for page_num, page in enumerate(pdf.pages, start=1):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
page_texts.append(text)
|
||||
page_texts.append("\n".join(_clean_lines(text)))
|
||||
if len(page_texts) >= 3:
|
||||
combined = "\n".join(page_texts)
|
||||
if len(combined.split()) >= 20:
|
||||
for part in _split_by_words(combined, max_words=420, overlap=60):
|
||||
chunks.append(Chunk(
|
||||
text=combined,
|
||||
text=part,
|
||||
document="guide_methodo",
|
||||
page=start_page,
|
||||
))
|
||||
@@ -208,8 +306,9 @@ def _chunk_guide_methodo(pdf_path: Path) -> list[Chunk]:
|
||||
if page_texts:
|
||||
combined = "\n".join(page_texts)
|
||||
if len(combined.split()) >= 20:
|
||||
for part in _split_by_words(combined, max_words=420, overlap=60):
|
||||
chunks.append(Chunk(
|
||||
text=combined,
|
||||
text=part,
|
||||
document="guide_methodo",
|
||||
page=start_page,
|
||||
))
|
||||
@@ -238,16 +337,15 @@ def _chunk_ccam(pdf_path: Path) -> list[Chunk]:
|
||||
current_code: str | None = None
|
||||
current_lines: list[str] = []
|
||||
|
||||
for line in text.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
for line in _clean_lines(text):
|
||||
|
||||
m = ccam_pattern.match(line)
|
||||
if m:
|
||||
if current_code and current_lines:
|
||||
joined = "\n".join(current_lines)
|
||||
for part in _split_by_words(joined, max_words=320, overlap=40):
|
||||
chunks.append(Chunk(
|
||||
text="\n".join(current_lines),
|
||||
text=part,
|
||||
document="ccam",
|
||||
page=page_num,
|
||||
code=current_code,
|
||||
@@ -258,8 +356,10 @@ def _chunk_ccam(pdf_path: Path) -> list[Chunk]:
|
||||
current_lines.append(line)
|
||||
|
||||
if current_code and current_lines:
|
||||
joined = "\n".join(current_lines)
|
||||
for part in _split_by_words(joined, max_words=320, overlap=40):
|
||||
chunks.append(Chunk(
|
||||
text="\n".join(current_lines),
|
||||
text=part,
|
||||
document="ccam",
|
||||
page=page_num,
|
||||
code=current_code,
|
||||
@@ -351,10 +451,7 @@ def _chunk_cim10_alpha(pdf_path: Path) -> list[Chunk]:
|
||||
if not in_alpha_section:
|
||||
continue
|
||||
|
||||
for line in text.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
for line in _clean_lines(text):
|
||||
m = entry_pattern.match(line)
|
||||
if m:
|
||||
terme = m.group(1).strip()
|
||||
@@ -376,7 +473,10 @@ def _chunk_cim10_alpha(pdf_path: Path) -> list[Chunk]:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_index(force: bool = False) -> None:
|
||||
"""Construit l'index FAISS à partir des 3 PDFs de référence.
|
||||
"""Construit les index FAISS à partir des PDFs de référence.
|
||||
|
||||
- ref : CIM-10 (+ index alpha) + CCAM
|
||||
- proc : Guide méthodologique
|
||||
|
||||
Args:
|
||||
force: Si True, reconstruit même si l'index existe déjà.
|
||||
@@ -385,43 +485,48 @@ def build_index(force: bool = False) -> None:
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
index_path = RAG_INDEX_DIR / "faiss.index"
|
||||
meta_path = RAG_INDEX_DIR / "metadata.json"
|
||||
ref_index_path, ref_meta_path = _paths("ref")
|
||||
proc_index_path, proc_meta_path = _paths("proc")
|
||||
|
||||
if not force and index_path.exists() and meta_path.exists():
|
||||
logger.info("Index FAISS déjà existant dans %s (use force=True pour reconstruire)", RAG_INDEX_DIR)
|
||||
# Si tout existe déjà et pas de force
|
||||
ref_ok = ref_index_path.exists() and ref_meta_path.exists()
|
||||
proc_ok = proc_index_path.exists() and proc_meta_path.exists()
|
||||
guide_expected = GUIDE_METHODO_PDF.exists()
|
||||
if not force and ref_ok and ((not guide_expected) or proc_ok):
|
||||
logger.info("Index FAISS déjà existants dans %s (use force=True pour reconstruire)", RAG_INDEX_DIR)
|
||||
return
|
||||
|
||||
# Collecter tous les chunks
|
||||
all_chunks: list[Chunk] = []
|
||||
# Collecter les chunks
|
||||
ref_chunks: list[Chunk] = []
|
||||
proc_chunks: list[Chunk] = []
|
||||
|
||||
for pdf_path, chunk_fn in [
|
||||
(CIM10_PDF, _chunk_cim10),
|
||||
(GUIDE_METHODO_PDF, _chunk_guide_methodo),
|
||||
]:
|
||||
if pdf_path.exists():
|
||||
all_chunks.extend(chunk_fn(pdf_path))
|
||||
# CIM-10 (référentiel)
|
||||
if CIM10_PDF.exists():
|
||||
ref_chunks.extend(_chunk_cim10(CIM10_PDF))
|
||||
ref_chunks.extend(_chunk_cim10_alpha(CIM10_PDF))
|
||||
else:
|
||||
logger.warning("PDF non trouvé : %s", pdf_path)
|
||||
logger.warning("PDF non trouvé : %s", CIM10_PDF)
|
||||
|
||||
# CCAM : priorité au dictionnaire JSON sur le PDF
|
||||
# Guide méthodologique (procédures)
|
||||
if GUIDE_METHODO_PDF.exists():
|
||||
proc_chunks.extend(_chunk_guide_methodo(GUIDE_METHODO_PDF))
|
||||
else:
|
||||
logger.warning("PDF non trouvé : %s", GUIDE_METHODO_PDF)
|
||||
|
||||
# CCAM (référentiel)
|
||||
ccam_dict_chunks = _chunk_ccam_from_dict()
|
||||
if ccam_dict_chunks:
|
||||
all_chunks.extend(ccam_dict_chunks)
|
||||
ref_chunks.extend(ccam_dict_chunks)
|
||||
elif CCAM_PDF.exists():
|
||||
all_chunks.extend(_chunk_ccam(CCAM_PDF))
|
||||
ref_chunks.extend(_chunk_ccam(CCAM_PDF))
|
||||
else:
|
||||
logger.warning("Ni dictionnaire CCAM ni PDF CCAM trouvé")
|
||||
|
||||
# CIM-10 index alphabétique (source additionnelle)
|
||||
if CIM10_PDF.exists():
|
||||
all_chunks.extend(_chunk_cim10_alpha(CIM10_PDF))
|
||||
|
||||
if not all_chunks:
|
||||
if not ref_chunks and not proc_chunks:
|
||||
logger.error("Aucun chunk extrait — vérifiez les chemins des PDFs")
|
||||
return
|
||||
|
||||
logger.info("Total : %d chunks à indexer", len(all_chunks))
|
||||
logger.info("Total ref : %d chunks | total proc : %d chunks", len(ref_chunks), len(proc_chunks))
|
||||
|
||||
# Embeddings — GPU si disponible
|
||||
import torch
|
||||
@@ -430,58 +535,72 @@ def build_index(force: bool = False) -> None:
|
||||
model = SentenceTransformer(EMBEDDING_MODEL, device=_device)
|
||||
model.max_seq_length = 512 # CamemBERT max position embeddings
|
||||
|
||||
texts = [c.text[:2000] for c in all_chunks] # Tronquer les chunks trop longs
|
||||
logger.info("Calcul des embeddings pour %d chunks...", len(texts))
|
||||
embeddings = model.encode(
|
||||
texts, show_progress_bar=True, normalize_embeddings=True, batch_size=64,
|
||||
)
|
||||
def _write_index(chunks: list[Chunk], idx_path: Path, meta_path: Path, label: str) -> None:
|
||||
if not chunks:
|
||||
return
|
||||
texts = [c.text[:2000] for c in chunks]
|
||||
logger.info("Calcul des embeddings (%s) pour %d chunks...", label, len(texts))
|
||||
embeddings = model.encode(texts, show_progress_bar=True, normalize_embeddings=True, batch_size=64)
|
||||
embeddings = np.array(embeddings, dtype=np.float32)
|
||||
|
||||
# Index FAISS (IndexFlatIP = cosine similarity avec vecteurs normalisés)
|
||||
dim = embeddings.shape[1]
|
||||
index = faiss.IndexFlatIP(dim)
|
||||
index.add(embeddings)
|
||||
|
||||
# Sauvegarder
|
||||
RAG_INDEX_DIR.mkdir(parents=True, exist_ok=True)
|
||||
faiss.write_index(index, str(index_path))
|
||||
faiss.write_index(index, str(idx_path))
|
||||
|
||||
metadata = [asdict(c) for c in all_chunks]
|
||||
# Ne pas sauvegarder le texte complet dans metadata (trop lourd),
|
||||
# garder un extrait de 800 chars (les sous-codes sont courts, besoin du contexte)
|
||||
metadata = [asdict(c) for c in chunks]
|
||||
for m in metadata:
|
||||
m["extrait"] = m.pop("text")[:800]
|
||||
|
||||
meta_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
logger.info("Index FAISS sauvegardé : %s (%d vecteurs, dim=%d)", index_path, len(all_chunks), dim)
|
||||
logger.info("Index FAISS sauvegardé (%s) : %s (%d vecteurs, dim=%d)", label, idx_path, len(chunks), dim)
|
||||
|
||||
_write_index(ref_chunks, ref_index_path, ref_meta_path, "ref")
|
||||
_write_index(proc_chunks, proc_index_path, proc_meta_path, "proc")
|
||||
|
||||
# Invalider les singletons
|
||||
reset_index()
|
||||
|
||||
|
||||
def get_index() -> tuple | None:
|
||||
"""Charge l'index FAISS et les métadonnées (singleton lazy-loaded).
|
||||
def get_index(kind: str = "ref") -> tuple | None:
|
||||
"""Charge un index FAISS et ses métadonnées (singleton lazy-loaded).
|
||||
|
||||
Args:
|
||||
kind: "ref" | "proc" | "all".
|
||||
|
||||
Returns:
|
||||
Tuple (faiss_index, metadata_list) ou None si l'index n'existe pas.
|
||||
"""
|
||||
global _faiss_index, _metadata
|
||||
kind = (kind or "ref").lower()
|
||||
|
||||
if _faiss_index is not None:
|
||||
return _faiss_index, _metadata
|
||||
if kind in _loaded:
|
||||
return _loaded[kind]
|
||||
|
||||
index_path = RAG_INDEX_DIR / "faiss.index"
|
||||
meta_path = RAG_INDEX_DIR / "metadata.json"
|
||||
index_path, meta_path = _paths(kind)
|
||||
|
||||
# Backwards compat : si ref/proc absent, fallback sur all
|
||||
if kind in ("ref", "proc") and (not index_path.exists() or not meta_path.exists()):
|
||||
legacy_idx, legacy_meta = _paths("all")
|
||||
if legacy_idx.exists() and legacy_meta.exists():
|
||||
logger.warning("Index %s absent — fallback legacy faiss.index", kind)
|
||||
index_path, meta_path = legacy_idx, legacy_meta
|
||||
else:
|
||||
logger.warning("Index FAISS non trouvé dans %s — lancez build_index() d'abord", RAG_INDEX_DIR)
|
||||
return None
|
||||
|
||||
if not index_path.exists() or not meta_path.exists():
|
||||
logger.warning("Index FAISS non trouvé dans %s — lancez build_index() d'abord", RAG_INDEX_DIR)
|
||||
logger.warning("Index FAISS non trouvé (%s) dans %s — lancez build_index() d'abord", kind, RAG_INDEX_DIR)
|
||||
return None
|
||||
|
||||
import faiss
|
||||
|
||||
_faiss_index = faiss.read_index(str(index_path))
|
||||
_metadata = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
faiss_index = faiss.read_index(str(index_path))
|
||||
metadata = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
|
||||
logger.info("Index FAISS chargé : %d vecteurs", _faiss_index.ntotal)
|
||||
return _faiss_index, _metadata
|
||||
logger.info("Index FAISS chargé (%s) : %d vecteurs", kind, faiss_index.ntotal)
|
||||
_loaded[kind] = (faiss_index, metadata)
|
||||
return _loaded[kind]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -516,8 +635,15 @@ def chunk_user_file(file_path: Path, doc_name: str) -> list[Chunk]:
|
||||
|
||||
|
||||
def _chunk_user_pdf(file_path: Path, doc_name: str) -> list[Chunk]:
|
||||
"""Découpe un PDF utilisateur en chunks de 2 pages."""
|
||||
"""Découpe un PDF utilisateur en chunks (par défaut 2 pages).
|
||||
|
||||
Spécial : pour certains référentiels (ex. COCOA), on préfère des chunks plus
|
||||
fins (1 page) et on filtre les passages majoritairement procéduraux.
|
||||
"""
|
||||
chunks: list[Chunk] = []
|
||||
doc_lower = (doc_name or "").lower()
|
||||
is_cocoa = "cocoa" in doc_lower or "coco" in doc_lower
|
||||
pages_per_chunk = 1 if is_cocoa else 2
|
||||
try:
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
page_texts: list[str] = []
|
||||
@@ -525,22 +651,29 @@ def _chunk_user_pdf(file_path: Path, doc_name: str) -> list[Chunk]:
|
||||
for page_num, page in enumerate(pdf.pages, start=1):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
page_texts.append(text)
|
||||
if len(page_texts) >= 2:
|
||||
combined = "\n".join(page_texts)
|
||||
page_texts.append("\n".join(_clean_lines(text)))
|
||||
if len(page_texts) >= pages_per_chunk:
|
||||
combined = "\n".join(page_texts).strip()
|
||||
if is_cocoa and _looks_procedural(combined):
|
||||
# on ignore les chunks "process" sans signal de critères/définitions
|
||||
page_texts = []
|
||||
start_page = page_num + 1
|
||||
continue
|
||||
if len(combined.split()) >= 10:
|
||||
for part in _split_by_words(combined, max_words=420 if is_cocoa else 520, overlap=60):
|
||||
chunks.append(Chunk(
|
||||
text=combined,
|
||||
text=part,
|
||||
document=doc_name,
|
||||
page=start_page,
|
||||
))
|
||||
page_texts = []
|
||||
start_page = page_num + 1
|
||||
if page_texts:
|
||||
combined = "\n".join(page_texts)
|
||||
if len(combined.split()) >= 10:
|
||||
combined = "\n".join(page_texts).strip()
|
||||
if not (is_cocoa and _looks_procedural(combined)) and len(combined.split()) >= 10:
|
||||
for part in _split_by_words(combined, max_words=420 if is_cocoa else 520, overlap=60):
|
||||
chunks.append(Chunk(
|
||||
text=combined,
|
||||
text=part,
|
||||
document=doc_name,
|
||||
page=start_page,
|
||||
))
|
||||
@@ -614,8 +747,16 @@ def add_chunks_to_index(chunks: list[Chunk]) -> int:
|
||||
import numpy as np
|
||||
from .rag_search import _get_embed_model
|
||||
|
||||
index_path = RAG_INDEX_DIR / "faiss.index"
|
||||
meta_path = RAG_INDEX_DIR / "metadata.json"
|
||||
# Dans 99% des cas, on veut éviter de mélanger : on route vers ref/proc selon le préfixe.
|
||||
# Si l'appelant veut forcer, il peut passer des chunks avec document="proc:...".
|
||||
kind = _kind_for_chunk(chunks[0])
|
||||
index_path, meta_path = _paths(kind)
|
||||
|
||||
# Backwards compat : si on n'a que l'ancien index, on l'utilise.
|
||||
if not index_path.exists() or not meta_path.exists():
|
||||
legacy_idx, legacy_meta = _paths("all")
|
||||
if legacy_idx.exists() and legacy_meta.exists():
|
||||
index_path, meta_path = legacy_idx, legacy_meta
|
||||
|
||||
# Charger l'index existant ou en créer un nouveau
|
||||
if index_path.exists() and meta_path.exists():
|
||||
@@ -658,7 +799,5 @@ def add_chunks_to_index(chunks: list[Chunk]) -> int:
|
||||
|
||||
|
||||
def reset_index() -> None:
|
||||
"""Invalide le singleton FAISS pour forcer le rechargement au prochain accès."""
|
||||
global _faiss_index, _metadata
|
||||
_faiss_index = None
|
||||
_metadata = []
|
||||
"""Invalide les singletons FAISS pour forcer le rechargement au prochain accès."""
|
||||
_loaded.clear()
|
||||
|
||||
@@ -8,7 +8,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from ..config import (
|
||||
ActeCCAM, Diagnostic, DossierMedical, PreuveClinique, RAGSource,
|
||||
OLLAMA_CACHE_PATH, OLLAMA_MAX_PARALLEL, OLLAMA_MODEL,
|
||||
OLLAMA_CACHE_PATH, OLLAMA_MAX_PARALLEL, get_model,
|
||||
EMBEDDING_MODEL, RERANKER_MODEL,
|
||||
)
|
||||
from .cim10_dict import normalize_code, validate_code as cim10_validate, fallback_parent_code
|
||||
@@ -17,6 +17,7 @@ from .clinical_context import build_enriched_context, format_enriched_context
|
||||
from .ccam_dict import validate_code as ccam_validate
|
||||
from .ollama_client import call_ollama, parse_json_response
|
||||
from .ollama_cache import OllamaCache
|
||||
from ..prompts import CODING_CIM10, CODING_CCAM, DAS_EXTRACTION
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -138,7 +139,8 @@ def search_similar(query: str, top_k: int = 10) -> list[dict]:
|
||||
from .rag_index import get_index
|
||||
import numpy as np
|
||||
|
||||
result = get_index()
|
||||
# Codage CIM-10 : on interroge l'index "ref" (pas le guide méthodo).
|
||||
result = get_index(kind="ref")
|
||||
if result is None:
|
||||
logger.warning("Index FAISS non disponible")
|
||||
return []
|
||||
@@ -163,17 +165,32 @@ def search_similar(query: str, top_k: int = 10) -> list[dict]:
|
||||
meta["score"] = float(score)
|
||||
raw_results.append(meta)
|
||||
|
||||
# Prioriser les sources CIM-10 (au moins 6 sur top_k)
|
||||
cim10_results = [r for r in raw_results if r["document"] in ("cim10", "cim10_alpha")]
|
||||
other_results = [r for r in raw_results if r["document"] not in ("cim10", "cim10_alpha")]
|
||||
# Codage : on garde uniquement CIM-10 + index alpha + éventuels référentiels uploadés en ref:...
|
||||
cim10_results = [r for r in raw_results if r.get("document") == "cim10"]
|
||||
alpha_results = [r for r in raw_results if r.get("document") == "cim10_alpha"]
|
||||
ref_uploads = [r for r in raw_results if str(r.get("document", "")).startswith("ref:")]
|
||||
|
||||
min_cim10 = min(6, len(cim10_results))
|
||||
final = cim10_results[:min_cim10]
|
||||
remaining_slots = top_k - len(final)
|
||||
# Remplir le reste avec les meilleurs résultats (CIM-10 restants + autres)
|
||||
remaining = cim10_results[min_cim10:] + other_results
|
||||
cim10_results.sort(key=lambda r: r["score"], reverse=True)
|
||||
alpha_results.sort(key=lambda r: r["score"], reverse=True)
|
||||
ref_uploads.sort(key=lambda r: r["score"], reverse=True)
|
||||
|
||||
# Quotas : on veut garder le codage ancré sur CIM-10, tout en gardant un peu d'alpha et de ref.
|
||||
q_cim10 = min(6, top_k)
|
||||
q_alpha = 2 if top_k >= 10 else (1 if top_k >= 8 else 0)
|
||||
q_alpha = min(q_alpha, max(0, top_k - q_cim10))
|
||||
q_ref = max(0, top_k - q_cim10 - q_alpha)
|
||||
q_ref = min(q_ref, 2) # éviter que les uploads 'ref:' prennent tout l'espace contexte
|
||||
|
||||
final: list[dict] = []
|
||||
final.extend(cim10_results[:q_cim10])
|
||||
final.extend(alpha_results[:q_alpha])
|
||||
final.extend(ref_uploads[:q_ref])
|
||||
|
||||
# Compléter si on a moins que top_k (ex: pas assez d'alpha/ref)
|
||||
if len(final) < top_k:
|
||||
remaining = cim10_results[q_cim10:] + alpha_results[q_alpha:] + ref_uploads[q_ref:]
|
||||
remaining.sort(key=lambda r: r["score"], reverse=True)
|
||||
final.extend(remaining[:remaining_slots])
|
||||
final.extend(remaining[: (top_k - len(final))])
|
||||
|
||||
return final
|
||||
|
||||
@@ -186,7 +203,8 @@ def search_similar_ccam(query: str, top_k: int = 8) -> list[dict]:
|
||||
from .rag_index import get_index
|
||||
import numpy as np
|
||||
|
||||
result = get_index()
|
||||
# CCAM : index "ref".
|
||||
result = get_index(kind="ref")
|
||||
if result is None:
|
||||
logger.warning("Index FAISS non disponible")
|
||||
return []
|
||||
@@ -236,22 +254,24 @@ def search_similar_cpam(query: str, top_k: int = 8) -> list[dict]:
|
||||
from .rag_index import get_index
|
||||
import numpy as np
|
||||
|
||||
result = get_index()
|
||||
if result is None:
|
||||
# Contexte CPAM : on veut des procédures (guide) + définitions référentielles (CIM-10).
|
||||
proc = get_index(kind="proc")
|
||||
ref = get_index(kind="ref")
|
||||
if proc is None and ref is None:
|
||||
logger.warning("Index FAISS non disponible")
|
||||
return []
|
||||
|
||||
faiss_index, metadata = result
|
||||
|
||||
model = _get_embed_model()
|
||||
query_vec = model.encode([query], normalize_embeddings=True)
|
||||
query_vec = np.array(query_vec, dtype=np.float32)
|
||||
|
||||
# Fetch élargi pour compenser le filtrage agressif
|
||||
fetch_k = min(top_k * 3, faiss_index.ntotal)
|
||||
def _search_one(result_tuple, fetch_mult: int) -> list[dict]:
|
||||
if result_tuple is None:
|
||||
return []
|
||||
faiss_index, metadata = result_tuple
|
||||
fetch_k = min(top_k * fetch_mult, faiss_index.ntotal)
|
||||
scores, indices = faiss_index.search(query_vec, fetch_k)
|
||||
|
||||
raw_results = []
|
||||
out = []
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx < 0:
|
||||
continue
|
||||
@@ -259,7 +279,19 @@ def search_similar_cpam(query: str, top_k: int = 8) -> list[dict]:
|
||||
continue
|
||||
meta = metadata[idx].copy()
|
||||
meta["score"] = float(score)
|
||||
raw_results.append(meta)
|
||||
out.append(meta)
|
||||
return out
|
||||
|
||||
raw_proc = _search_one(proc, fetch_mult=3)
|
||||
raw_ref = _search_one(ref, fetch_mult=3)
|
||||
|
||||
# Filtrer clairement :
|
||||
# - proc : guide_methodo + uploads proc:
|
||||
raw_proc = [r for r in raw_proc if r.get("document") == "guide_methodo" or str(r.get("document", "")).startswith("proc:")]
|
||||
# - ref : CIM-10 + index alpha + uploads ref:
|
||||
raw_ref = [r for r in raw_ref if r.get("document") in ("cim10", "cim10_alpha") or str(r.get("document", "")).startswith("ref:")]
|
||||
|
||||
raw_results = raw_proc + raw_ref
|
||||
|
||||
# Dédupliquer par code CIM-10 (garder meilleur score par code)
|
||||
seen_codes: dict[str, dict] = {}
|
||||
@@ -281,8 +313,11 @@ def search_similar_cpam(query: str, top_k: int = 8) -> list[dict]:
|
||||
reranked = _rerank(query, deduped, top_k=len(deduped))
|
||||
|
||||
# Prioriser le Guide Méthodologique (min 3 résultats)
|
||||
guide_results = [r for r in reranked if r["document"] == "guide_methodo"]
|
||||
other_results = [r for r in reranked if r["document"] != "guide_methodo"]
|
||||
guide_results = [r for r in reranked if r.get("document") == "guide_methodo" or str(r.get("document", "")).startswith("proc:")]
|
||||
other_results = [
|
||||
r for r in reranked
|
||||
if not (r.get("document") == "guide_methodo" or str(r.get("document", "")).startswith("proc:"))
|
||||
]
|
||||
|
||||
min_guide = min(3, len(guide_results))
|
||||
final = guide_results[:min_guide]
|
||||
@@ -357,107 +392,55 @@ def _format_contexte(contexte: dict) -> str:
|
||||
return "\n".join(lines) if lines else "Non précisé"
|
||||
|
||||
|
||||
def _build_prompt(texte: str, sources: list[dict], contexte: dict, est_dp: bool = True) -> str:
|
||||
"""Construit le prompt expert DIM avec raisonnement structuré."""
|
||||
def _format_sources(sources: list[dict]) -> str:
|
||||
"""Formate les sources RAG pour injection dans un prompt."""
|
||||
sources_text = ""
|
||||
for i, src in enumerate(sources, 1):
|
||||
doc_raw = str(src.get("document", ""))
|
||||
if doc_raw.startswith("ref:"):
|
||||
doc_name = f"Référentiel uploadé : {doc_raw[4:]}"
|
||||
elif doc_raw.startswith("proc:"):
|
||||
doc_name = f"Procédure uploadée : {doc_raw[5:]}"
|
||||
else:
|
||||
doc_name = {
|
||||
"cim10": "CIM-10 FR 2026",
|
||||
"cim10_alpha": "CIM-10 Index Alphabétique 2026",
|
||||
"guide_methodo": "Guide Méthodologique MCO 2026",
|
||||
"ccam": "CCAM PMSI V4 2025",
|
||||
}.get(src["document"], src["document"])
|
||||
}.get(doc_raw, doc_raw)
|
||||
|
||||
code_info = f" (code: {src['code']})" if src.get("code") else ""
|
||||
page_info = f" [page {src['page']}]" if src.get("page") else ""
|
||||
|
||||
sources_text += f"--- Source {i}: {doc_name}{code_info}{page_info} ---\n"
|
||||
sources_text += (src.get("extrait", "")[:800]) + "\n\n"
|
||||
return sources_text
|
||||
|
||||
|
||||
def _build_prompt(texte: str, sources: list[dict], contexte: dict, est_dp: bool = True) -> str:
|
||||
"""Construit le prompt expert DIM avec raisonnement structuré."""
|
||||
type_diag = "DP (diagnostic principal)" if est_dp else "DAS (diagnostic associé significatif)"
|
||||
ctx_str = format_enriched_context(contexte)
|
||||
sources_text = _format_sources(sources)
|
||||
|
||||
return f"""Tu es un médecin DIM (Département d'Information Médicale) expert en codage PMSI.
|
||||
Tu dois coder le diagnostic suivant en respectant STRICTEMENT les règles de l'ATIH.
|
||||
|
||||
RÈGLES IMPÉRATIVES :
|
||||
- Le code doit provenir UNIQUEMENT des sources CIM-10 fournies
|
||||
- Distingue la DESCRIPTION CLINIQUE (ce que le médecin écrit) de la LOGIQUE DE CODAGE (ce que l'ATIH impose)
|
||||
- Privilégie le code le plus SPÉCIFIQUE disponible (4e ou 5e caractère)
|
||||
- Vérifie les notes d'inclusion/exclusion de chaque code candidat
|
||||
- Si le diagnostic est un DP, il doit refléter le motif principal de prise en charge du séjour
|
||||
- Si c'est un DAS, il doit avoir mobilisé des ressources supplémentaires pendant le séjour
|
||||
- EXCLUSION SYMPTÔME : Si le diagnostic est un symptôme (R00-R99) et qu'un diagnostic précis (Chapitres I-XIV, A00-N99) expliquant ce symptôme est présent, le symptôme ne doit PAS être codé comme DAS
|
||||
|
||||
DIAGNOSTIC À CODER : "{texte}"
|
||||
TYPE : {type_diag}
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
SOURCES CIM-10 :
|
||||
{sources_text}
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
|
||||
{{
|
||||
"analyse_clinique": "que signifie ce diagnostic sur le plan médical",
|
||||
"codes_candidats": "quels codes CIM-10 des sources sont compatibles",
|
||||
"discrimination": "pourquoi choisir ce code plutôt qu'un autre (inclusions/exclusions, spécificité)",
|
||||
"regle_pmsi": "conformité aux règles PMSI pour un {type_diag} (guide méthodologique)",
|
||||
"code": "X99.9",
|
||||
"confidence": "high ou medium ou low",
|
||||
"justification": "explication courte en français",
|
||||
"preuves_cliniques": [
|
||||
{{"type": "biologie|imagerie|traitement|acte|clinique", "element": "élément concret du dossier", "interpretation": "signification clinique justifiant le code"}}
|
||||
]
|
||||
}}"""
|
||||
return CODING_CIM10.format(
|
||||
texte=texte,
|
||||
type_diag=type_diag,
|
||||
ctx_str=ctx_str,
|
||||
sources_text=sources_text,
|
||||
)
|
||||
|
||||
|
||||
def _build_prompt_ccam(texte: str, sources: list[dict], contexte: dict) -> str:
|
||||
"""Construit le prompt expert DIM pour le codage CCAM avec raisonnement structuré."""
|
||||
sources_text = ""
|
||||
for i, src in enumerate(sources, 1):
|
||||
doc_name = {
|
||||
"cim10": "CIM-10 FR 2026",
|
||||
"cim10_alpha": "CIM-10 Index Alphabétique 2026",
|
||||
"guide_methodo": "Guide Méthodologique MCO 2026",
|
||||
"ccam": "CCAM PMSI V4 2025",
|
||||
}.get(src["document"], src["document"])
|
||||
|
||||
code_info = f" (code: {src['code']})" if src.get("code") else ""
|
||||
page_info = f" [page {src['page']}]" if src.get("page") else ""
|
||||
|
||||
sources_text += f"--- Source {i}: {doc_name}{code_info}{page_info} ---\n"
|
||||
sources_text += (src.get("extrait", "")[:800]) + "\n\n"
|
||||
|
||||
ctx_str = format_enriched_context(contexte)
|
||||
sources_text = _format_sources(sources)
|
||||
|
||||
return f"""Tu es un médecin DIM (Département d'Information Médicale) expert en codage CCAM PMSI.
|
||||
Tu dois coder l'acte chirurgical/médical suivant en respectant STRICTEMENT la nomenclature CCAM.
|
||||
|
||||
RÈGLES IMPÉRATIVES :
|
||||
- Le code doit provenir UNIQUEMENT des sources CCAM fournies
|
||||
- Un code CCAM est composé de 4 lettres + 3 chiffres (ex: HMFC004)
|
||||
- Vérifie l'activité (1=acte technique, 4=anesthésie) et le regroupement
|
||||
- Tiens compte du tarif secteur 1 pour valider la cohérence
|
||||
- Si plusieurs codes sont possibles, choisis le plus spécifique à l'acte décrit
|
||||
- En cas de doute, indique confidence "low" plutôt que de proposer un code inadapté
|
||||
|
||||
ACTE À CODER : "{texte}"
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
SOURCES CCAM :
|
||||
{sources_text}
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
|
||||
{{
|
||||
"analyse_acte": "que décrit cet acte sur le plan technique/chirurgical",
|
||||
"codes_candidats": "quels codes CCAM des sources sont compatibles",
|
||||
"discrimination": "pourquoi choisir ce code plutôt qu'un autre (activité, regroupement, tarif)",
|
||||
"code": "ABCD123",
|
||||
"confidence": "high ou medium ou low",
|
||||
"justification": "explication courte en français"
|
||||
}}"""
|
||||
return CODING_CCAM.format(
|
||||
texte=texte,
|
||||
ctx_str=ctx_str,
|
||||
sources_text=sources_text,
|
||||
)
|
||||
|
||||
|
||||
def _parse_ollama_response(raw: str) -> dict | None:
|
||||
@@ -481,7 +464,7 @@ def _parse_ollama_response(raw: str) -> dict | None:
|
||||
|
||||
def _call_ollama(prompt: str) -> dict | None:
|
||||
"""Appelle Ollama (mode JSON) et parse la réponse avec reconstitution du raisonnement."""
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=2500)
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=2500, role="coding")
|
||||
if result is None:
|
||||
return None
|
||||
# Reconstituer le raisonnement structuré
|
||||
@@ -669,42 +652,12 @@ def _build_prompt_das_extraction(text: str, contexte: dict, existing_das: list[s
|
||||
ctx_str = format_enriched_context(contexte)
|
||||
existing_str = "\n".join(f"- {d}" for d in existing_das) if existing_das else "Aucun"
|
||||
|
||||
return f"""Tu es un médecin DIM (Département d'Information Médicale) expert en codage PMSI.
|
||||
Analyse le texte médical suivant et identifie les diagnostics associés significatifs (DAS) qui n'ont PAS encore été codés.
|
||||
|
||||
RÈGLES IMPÉRATIVES :
|
||||
- Un DAS doit avoir mobilisé des ressources supplémentaires pendant le séjour
|
||||
- Ne PAS proposer de doublons avec les DAS déjà codés ci-dessous
|
||||
- Ne PAS proposer le diagnostic principal comme DAS
|
||||
- Ne PAS coder les symptômes (R00-R99) si un diagnostic précis les explique
|
||||
- Ne PAS coder les antécédents non pertinents pour le séjour
|
||||
- Privilégie les codes CIM-10 les plus SPÉCIFIQUES (4e ou 5e caractère)
|
||||
- Ne propose que des diagnostics CLAIREMENT mentionnés dans le texte
|
||||
- ATTENTION aux valeurs biologiques : ne code PAS un diagnostic si les valeurs sont dans les normes indiquées entre crochets [N: min-max]. Exemple : Créatinine 76 [N: 50-120] = NORMAL, pas d'insuffisance rénale.
|
||||
|
||||
DIAGNOSTIC PRINCIPAL : {dp_texte or "Non identifié"}
|
||||
|
||||
DAS DÉJÀ CODÉS :
|
||||
{existing_str}
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
TEXTE MÉDICAL :
|
||||
{text[:4000]}
|
||||
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
|
||||
{{
|
||||
"diagnostics_supplementaires": [
|
||||
{{
|
||||
"texte": "description du diagnostic",
|
||||
"code_cim10": "X99.9",
|
||||
"justification": "pourquoi ce DAS est pertinent pour le séjour"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Si aucun DAS supplémentaire n'est pertinent, retourne : {{"diagnostics_supplementaires": []}}"""
|
||||
return DAS_EXTRACTION.format(
|
||||
dp_texte=dp_texte or "Non identifié",
|
||||
existing_str=existing_str,
|
||||
ctx_str=ctx_str,
|
||||
text_medical=text[:4000],
|
||||
)
|
||||
|
||||
|
||||
def extract_das_llm(
|
||||
@@ -741,7 +694,7 @@ def extract_das_llm(
|
||||
|
||||
# Construire le prompt et appeler Ollama
|
||||
prompt = _build_prompt_das_extraction(text, contexte, existing_das, dp_texte)
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=2000)
|
||||
result = call_ollama(prompt, temperature=0.1, max_tokens=2000, role="coding")
|
||||
|
||||
if result is None:
|
||||
logger.warning("Extraction DAS LLM : Ollama non disponible")
|
||||
@@ -766,7 +719,7 @@ def enrich_dossier(dossier: DossierMedical) -> None:
|
||||
Utilise un cache persistant et parallélise les appels Ollama
|
||||
pour les DAS et actes CCAM (max_workers = OLLAMA_MAX_PARALLEL).
|
||||
"""
|
||||
cache = OllamaCache(OLLAMA_CACHE_PATH, OLLAMA_MODEL)
|
||||
cache = OllamaCache(OLLAMA_CACHE_PATH, get_model("coding"))
|
||||
|
||||
contexte = build_enriched_context(dossier)
|
||||
|
||||
|
||||
21
src/prompts/__init__.py
Normal file
21
src/prompts/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Prompts LLM externalisés pour le pipeline T2A."""
|
||||
|
||||
from .templates import (
|
||||
CODING_CIM10,
|
||||
CODING_CCAM,
|
||||
DAS_EXTRACTION,
|
||||
QC_VALIDATION,
|
||||
CPAM_EXTRACTION,
|
||||
CPAM_ARGUMENTATION,
|
||||
CPAM_ADVERSARIAL,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CODING_CIM10",
|
||||
"CODING_CCAM",
|
||||
"DAS_EXTRACTION",
|
||||
"QC_VALIDATION",
|
||||
"CPAM_EXTRACTION",
|
||||
"CPAM_ARGUMENTATION",
|
||||
"CPAM_ADVERSARIAL",
|
||||
]
|
||||
340
src/prompts/templates.py
Normal file
340
src/prompts/templates.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""Templates LLM externalisés pour le pipeline T2A.
|
||||
|
||||
Chaque template utilise str.format() avec des variables nommées.
|
||||
Les accolades JSON sont doublées ({{ }}) pour échapper le format().
|
||||
Les fragments conditionnels (ex: DP UCR) sont résolus AVANT l'appel
|
||||
à template.format() dans les fonctions appelantes.
|
||||
|
||||
Variables par template :
|
||||
CODING_CIM10 : texte, type_diag, ctx_str, sources_text
|
||||
CODING_CCAM : texte, ctx_str, sources_text
|
||||
DAS_EXTRACTION : dp_texte, existing_str, ctx_str, text_medical
|
||||
QC_VALIDATION : ctx_str, codes_section
|
||||
CPAM_EXTRACTION : dp_str, das_str, tagged_text, titre, arg_ucr,
|
||||
decision_ucr, dp_ucr_line, da_ucr_line
|
||||
CPAM_ARGUMENTATION : dossier_str, asymetrie_str, tagged_str, titre,
|
||||
arg_ucr, decision_ucr, codes_str, definitions_str,
|
||||
sources_text, extraction_str
|
||||
CPAM_ADVERSARIAL : response_json, factual_section, normes_section,
|
||||
dp_ucr_line, da_ucr_line
|
||||
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. CODING_CIM10 — Codage CIM-10 (DP ou DAS) via RAG
|
||||
# Source : rag_search.py _build_prompt()
|
||||
# Rôle : coding | Température : 0.1 | max_tokens : 2500
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CODING_CIM10 = """\
|
||||
Tu es un médecin DIM (Département d'Information Médicale) expert en codage PMSI.
|
||||
Tu dois coder le diagnostic suivant en respectant STRICTEMENT les règles de l'ATIH.
|
||||
|
||||
RÈGLES IMPÉRATIVES :
|
||||
- Le code doit provenir UNIQUEMENT des sources CIM-10 fournies
|
||||
- Distingue la DESCRIPTION CLINIQUE (ce que le médecin écrit) de la LOGIQUE DE CODAGE (ce que l'ATIH impose)
|
||||
- Privilégie le code le plus SPÉCIFIQUE disponible (4e ou 5e caractère)
|
||||
- Vérifie les notes d'inclusion/exclusion de chaque code candidat
|
||||
- Si le diagnostic est un DP, il doit refléter le motif principal de prise en charge du séjour
|
||||
- Si c'est un DAS, il doit avoir mobilisé des ressources supplémentaires pendant le séjour
|
||||
- EXCLUSION SYMPTÔME : Si le diagnostic est un symptôme (R00-R99) et qu'un diagnostic précis (Chapitres I-XIV, A00-N99) expliquant ce symptôme est présent, le symptôme ne doit PAS être codé comme DAS
|
||||
|
||||
DIAGNOSTIC À CODER : "{texte}"
|
||||
TYPE : {type_diag}
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
SOURCES DE RÉFÉRENCE :
|
||||
{sources_text}
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
|
||||
{{
|
||||
"analyse_clinique": "que signifie ce diagnostic sur le plan médical",
|
||||
"codes_candidats": "quels codes CIM-10 des sources sont compatibles",
|
||||
"discrimination": "pourquoi choisir ce code plutôt qu'un autre (inclusions/exclusions, spécificité)",
|
||||
"regle_pmsi": "conformité aux règles PMSI pour un {type_diag} (guide méthodologique)",
|
||||
"code": "X99.9",
|
||||
"confidence": "high ou medium ou low",
|
||||
"justification": "explication courte en français",
|
||||
"preuves_cliniques": [
|
||||
{{"type": "biologie|imagerie|traitement|acte|clinique", "element": "élément concret du dossier", "interpretation": "signification clinique justifiant le code"}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. CODING_CCAM — Codage CCAM via RAG
|
||||
# Source : rag_search.py _build_prompt_ccam()
|
||||
# Rôle : coding | Température : 0.1 | max_tokens : 2500
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CODING_CCAM = """\
|
||||
Tu es un médecin DIM (Département d'Information Médicale) expert en codage CCAM PMSI.
|
||||
Tu dois coder l'acte chirurgical/médical suivant en respectant STRICTEMENT la nomenclature CCAM.
|
||||
|
||||
RÈGLES IMPÉRATIVES :
|
||||
- Le code doit provenir UNIQUEMENT des sources CCAM fournies
|
||||
- Un code CCAM est composé de 4 lettres + 3 chiffres (ex: HMFC004)
|
||||
- Vérifie l'activité (1=acte technique, 4=anesthésie) et le regroupement
|
||||
- Tiens compte du tarif secteur 1 pour valider la cohérence
|
||||
- Si plusieurs codes sont possibles, choisis le plus spécifique à l'acte décrit
|
||||
- En cas de doute, indique confidence "low" plutôt que de proposer un code inadapté
|
||||
|
||||
ACTE À CODER : "{texte}"
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
SOURCES CCAM :
|
||||
{sources_text}
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
|
||||
{{
|
||||
"analyse_acte": "que décrit cet acte sur le plan technique/chirurgical",
|
||||
"codes_candidats": "quels codes CCAM des sources sont compatibles",
|
||||
"discrimination": "pourquoi choisir ce code plutôt qu'un autre (activité, regroupement, tarif)",
|
||||
"code": "ABCD123",
|
||||
"confidence": "high ou medium ou low",
|
||||
"justification": "explication courte en français"
|
||||
}}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. DAS_EXTRACTION — Extraction DAS supplémentaires via LLM
|
||||
# Source : rag_search.py _build_prompt_das_extraction()
|
||||
# Rôle : coding | Température : 0.1 | max_tokens : 2000
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DAS_EXTRACTION = """\
|
||||
Tu es un médecin DIM (Département d'Information Médicale) expert en codage PMSI.
|
||||
Analyse le texte médical suivant et identifie les diagnostics associés significatifs (DAS) qui n'ont PAS encore été codés.
|
||||
|
||||
RÈGLES IMPÉRATIVES :
|
||||
- Un DAS doit avoir mobilisé des ressources supplémentaires pendant le séjour
|
||||
- Ne PAS proposer de doublons avec les DAS déjà codés ci-dessous
|
||||
- Ne PAS proposer le diagnostic principal comme DAS
|
||||
- Ne PAS coder les symptômes (R00-R99) si un diagnostic précis les explique
|
||||
- Ne PAS coder les antécédents non pertinents pour le séjour
|
||||
- Privilégie les codes CIM-10 les plus SPÉCIFIQUES (4e ou 5e caractère)
|
||||
- Ne propose que des diagnostics CLAIREMENT mentionnés dans le texte
|
||||
- ATTENTION aux valeurs biologiques : ne code PAS un diagnostic si les valeurs sont dans les normes indiquées entre crochets [N: min-max]. Exemple : Créatinine 76 [N: 50-120] = NORMAL, pas d'insuffisance rénale.
|
||||
|
||||
DIAGNOSTIC PRINCIPAL : {dp_texte}
|
||||
|
||||
DAS DÉJÀ CODÉS :
|
||||
{existing_str}
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
TEXTE MÉDICAL :
|
||||
{text_medical}
|
||||
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
|
||||
{{
|
||||
"diagnostics_supplementaires": [
|
||||
{{
|
||||
"texte": "description du diagnostic",
|
||||
"code_cim10": "X99.9",
|
||||
"justification": "pourquoi ce DAS est pertinent pour le séjour"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
Si aucun DAS supplémentaire n'est pertinent, retourne : {{"diagnostics_supplementaires": []}}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. QC_VALIDATION — Validation croisée batch des justifications
|
||||
# Source : cim10_extractor.py _validate_justifications()
|
||||
# Rôle : qc | Température : 0.1 | max_tokens : 2500
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
QC_VALIDATION = """\
|
||||
Tu es un médecin DIM contrôleur qualité PMSI.
|
||||
Vérifie la cohérence et la justification de ce codage complet.
|
||||
|
||||
DOSSIER CLINIQUE :
|
||||
{ctx_str}
|
||||
|
||||
CODAGE À VALIDER :
|
||||
{codes_section}
|
||||
|
||||
Pour CHAQUE code, vérifie :
|
||||
1. Existe-t-il une preuve clinique concrète dans le dossier ?
|
||||
2. Le code est-il le plus spécifique possible ?
|
||||
3. Y a-t-il des conflits ou redondances avec d'autres codes ?
|
||||
|
||||
Réponds avec un JSON :
|
||||
{{
|
||||
"validations": [
|
||||
{{
|
||||
"numero": 1,
|
||||
"code": "X99.9",
|
||||
"verdict": "maintenir|reclasser|supprimer",
|
||||
"confidence_recommandee": "high|medium|low",
|
||||
"commentaire": "explication courte"
|
||||
}}
|
||||
],
|
||||
"alertes_globales": ["..."]
|
||||
}}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 5. CPAM_EXTRACTION — Passe 1 extraction structurée CPAM
|
||||
# Source : cpam_response.py _extraction_pass()
|
||||
# Rôle : cpam | Température : 0.0 | max_tokens : 1500
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CPAM_EXTRACTION = """\
|
||||
Tu es un médecin DIM expert. Analyse cette contestation CPAM sans argumenter.
|
||||
|
||||
DOSSIER :
|
||||
- DP : {dp_str}
|
||||
- DAS : {das_str}
|
||||
{tagged_text}
|
||||
|
||||
CONTESTATION CPAM :
|
||||
Titre : {titre}
|
||||
Argument : {arg_ucr}
|
||||
Décision : {decision_ucr}
|
||||
{dp_ucr_line}
|
||||
{da_ucr_line}
|
||||
|
||||
Réponds UNIQUEMENT en JSON :
|
||||
{{
|
||||
"comprehension_contestation": "Résumé factuel : que conteste la CPAM et pourquoi",
|
||||
"elements_cliniques_pertinents": [
|
||||
{{"tag": "BIO-1 ou texte libre", "pertinence": "en quoi cet élément est pertinent pour le codage contesté"}}
|
||||
],
|
||||
"points_accord_potentiels": ["points où la CPAM a partiellement raison"],
|
||||
"codes_en_jeu": {{
|
||||
"dp_etablissement": "code + libellé",
|
||||
"dp_ucr": "code + libellé si proposé",
|
||||
"difference_cle": "explication de la différence entre les deux codages"
|
||||
}}
|
||||
}}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 6. CPAM_ARGUMENTATION — Passe 2 contre-argumentation CPAM
|
||||
# Source : cpam_response.py _build_cpam_prompt()
|
||||
# Rôle : cpam | Température : 0.1 | max_tokens : 4000
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CPAM_ARGUMENTATION = """\
|
||||
Tu es un médecin DIM (Département d'Information Médicale) expert en contentieux T2A.
|
||||
Tu dois produire une analyse ÉQUILIBRÉE ET CRÉDIBLE de la contestation CPAM, puis contre-argumenter en mobilisant trois axes : médical, asymétrie d'information, et réglementaire.
|
||||
|
||||
IMPORTANT — CRÉDIBILITÉ DE L'ANALYSE :
|
||||
Une contre-argumentation crédible reconnaît TOUJOURS au moins un point valide dans le raisonnement adverse.
|
||||
Répondre "Aucun point d'accord" décrédibilise l'ensemble de l'argumentation. Tu DOIS identifier au moins un élément où la CPAM a un point légitime (même partiel), puis expliquer pourquoi cela ne suffit pas à invalider le codage.
|
||||
|
||||
IMPORTANT — CODES CIM-10 :
|
||||
Ne parle JAMAIS de « codage initial » ou « codage contesté » sans citer explicitement le code CIM-10 et son libellé (ex: Z45.80 — Ajustement et entretien d'un dispositif implantable).
|
||||
Chaque argument doit désigner précisément quel code est défendu ou contesté, avec son libellé complet.
|
||||
|
||||
DOSSIER MÉDICAL DE L'ÉTABLISSEMENT :
|
||||
{dossier_str}
|
||||
{asymetrie_str}
|
||||
{tagged_str}
|
||||
|
||||
OBJET DU DÉSACCORD : {titre}
|
||||
|
||||
ARGUMENTATION DE LA CPAM (UCR) :
|
||||
{arg_ucr}
|
||||
|
||||
DÉCISION UCR : {decision_ucr}
|
||||
|
||||
CODES CONTESTÉS :
|
||||
{codes_str}
|
||||
{definitions_str}
|
||||
|
||||
SOURCES RÉGLEMENTAIRES (Guide méthodologique, CIM-10) :
|
||||
{sources_text}
|
||||
{extraction_str}
|
||||
|
||||
CONSIGNES :
|
||||
|
||||
CONTEXTE CLINIQUE :
|
||||
- Prends en compte l'ÂGE du patient (pédiatrie < 18 ans, personne âgée >= 80 ans), le MODE D'ENTRÉE (urgence vs programmé), et la DURÉE DE SÉJOUR pour contextualiser ton analyse
|
||||
- En pédiatrie, les normes biologiques et les codages peuvent différer de l'adulte
|
||||
- Une admission en urgence implique un contexte clinique aigu qui influence le choix du DP
|
||||
|
||||
ÉTAPE 1 — ANALYSE HONNÊTE (avant de contre-argumenter) :
|
||||
- Identifie ce que la CPAM a compris correctement dans le dossier
|
||||
- Reconnais les points où leur raisonnement est fondé, même partiellement
|
||||
- Explique ENSUITE pourquoi ces points ne justifient pas leur conclusion
|
||||
|
||||
AXE MÉDICAL :
|
||||
- Analyse le bien-fondé médical du codage de l'établissement
|
||||
- CITE les éléments cliniques EXACTS du dossier en utilisant les tags [XX-N] fournis (ex: [BIO-1] CRP 180 mg/L)
|
||||
- Confronte l'argumentation CPAM aux sources CIM-10 et Guide Méthodologique fournies
|
||||
- Ne mentionne AUCUN élément qui ne figure pas dans les éléments référencés ci-dessus
|
||||
|
||||
AXE ASYMÉTRIE D'INFORMATION :
|
||||
- La CPAM a fondé son analyse uniquement sur le CRH et les codes transmis
|
||||
- Pour CHAQUE élément clinique pertinent, cite les VALEURS EXACTES et explique leur signification clinique
|
||||
- Démontre en quoi ces éléments complémentaires (biologie, imagerie, traitements, actes) justifient le codage contesté
|
||||
- Ne mentionne AUCUN élément qui n'est pas dans le dossier fourni
|
||||
|
||||
MISE EN FORME :
|
||||
- Structure chaque section avec des tirets pour lister les arguments distincts
|
||||
- Un argument par puce, avec la preuve ou la référence associée
|
||||
|
||||
AXE RÉGLEMENTAIRE :
|
||||
- Identifie si l'UCR fait une interprétation restrictive non fondée d'une règle
|
||||
- Confronte le raisonnement CPAM au texte EXACT des sources fournies
|
||||
- Format OBLIGATOIRE pour chaque référence : [Document - page N] suivi d'une CITATION VERBATIM du passage pertinent
|
||||
- INTERDICTION ABSOLUE de citer une référence qui ne figure pas dans les sources fournies ci-dessus
|
||||
- Si aucune source pertinente n'est disponible → écrire explicitement "Pas de source réglementaire disponible"
|
||||
- Relève les contradictions entre l'argumentation CPAM et les règles officielles
|
||||
|
||||
Réponds UNIQUEMENT avec un objet JSON au format suivant :
|
||||
{{
|
||||
"analyse_contestation": "Résumé de ce que conteste la CPAM et sur quelle base",
|
||||
"points_accord": "Points CONCRETS où la CPAM a raison ou partiellement raison (JAMAIS 'Aucun' — il y a toujours au moins un point légitime à reconnaître)",
|
||||
"contre_arguments_medicaux": "Argumentation médicale en faveur du codage, en expliquant pourquoi les points d'accord ne suffisent pas à invalider le codage",
|
||||
"preuves_dossier": [
|
||||
{{"ref": "BIO-1", "element": "biologie|imagerie|traitement|acte|clinique", "valeur": "valeur exacte du dossier", "signification": "explication clinique"}}
|
||||
],
|
||||
"contre_arguments_asymetrie": "Éléments cliniques que la CPAM n'avait pas et qui justifient le codage",
|
||||
"contre_arguments_reglementaires": "Erreurs d'interprétation réglementaire de la CPAM, avec citations verbatim des sources",
|
||||
"references": [
|
||||
{{"document": "nom du document source", "page": "numéro de page", "citation": "citation verbatim du passage"}}
|
||||
],
|
||||
"conclusion": "Synthèse en citant EXPLICITEMENT les codes CIM-10 défendus (ex: DP Z45.80 — libellé) : points reconnus à la CPAM, puis pourquoi ce codage précis est néanmoins justifié"
|
||||
}}"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 7. CPAM_ADVERSARIAL — Validation adversariale de la contre-argumentation
|
||||
# Source : cpam_response.py _validate_adversarial()
|
||||
# Rôle : validation | Température : 0.0 | max_tokens : 800
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CPAM_ADVERSARIAL = """\
|
||||
Tu es un relecteur critique. Vérifie la cohérence de cette contre-argumentation CPAM.
|
||||
|
||||
RÉPONSE GÉNÉRÉE :
|
||||
{response_json}
|
||||
|
||||
{factual_section}
|
||||
|
||||
{normes_section}
|
||||
|
||||
CODES CONTESTÉS :
|
||||
{dp_ucr_line}
|
||||
{da_ucr_line}
|
||||
|
||||
Vérifie STRICTEMENT :
|
||||
1. Chaque valeur bio/imagerie/traitement citée dans les preuves existe dans les éléments factuels
|
||||
2. Si une valeur bio est qualifiée de "élevée", "basse" ou "anormale", vérifie qu'elle est RÉELLEMENT hors normes selon les normes ci-dessus (ex: CRP 5 = NORMAL, pas élevé)
|
||||
3. La conclusion est cohérente avec l'argumentation développée
|
||||
4. Les points d'accord ne contredisent pas les contre-arguments
|
||||
5. Les codes CIM-10 mentionnés dans la conclusion sont cohérents avec le reste
|
||||
|
||||
Réponds UNIQUEMENT en JSON :
|
||||
{{
|
||||
"coherent": true ou false,
|
||||
"erreurs": ["description précise de chaque incohérence trouvée"],
|
||||
"score_confiance": 0 à 10
|
||||
}}"""
|
||||
1
src/quality/__init__.py
Normal file
1
src/quality/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Contrôles qualité (vetos) pour réduire la contestabilité CPAM."""
|
||||
609
src/quality/decision_engine.py
Normal file
609
src/quality/decision_engine.py
Normal file
@@ -0,0 +1,609 @@
|
||||
"""Moteur de décisions (post-traitement qualité).
|
||||
|
||||
But: conserver la proposition du modèle (cim10_suggestion) tout en produisant une
|
||||
*sortie finale* plus défendable (cim10_final + cim10_decision).
|
||||
|
||||
Ce module est déterministe, court, et auditable.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Optional
|
||||
|
||||
from ..config import (
|
||||
CodeDecision,
|
||||
Diagnostic,
|
||||
DossierMedical,
|
||||
VetoIssue,
|
||||
load_reference_ranges,
|
||||
load_bio_rules,
|
||||
rule_enabled,
|
||||
)
|
||||
|
||||
|
||||
# --- Règles "étiologiques" : ne pas affirmer sans preuve spécifique ---
|
||||
|
||||
IRON_MARKERS = (
|
||||
"ferrit", # ferritine
|
||||
"transferr", # transferrine
|
||||
"saturation", # saturation transferrine
|
||||
"cst", # coefficient de saturation
|
||||
"carence mart",
|
||||
"martiale",
|
||||
"ferripr", # ferriprive
|
||||
"fer intraveineux",
|
||||
"fer iv",
|
||||
"traitement martial",
|
||||
)
|
||||
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
s = s.replace("’", "'")
|
||||
s = unicodedata.normalize("NFKD", s)
|
||||
s = "".join(ch for ch in s if not unicodedata.combining(ch))
|
||||
s = s.lower()
|
||||
return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
|
||||
def _first_float(text: str) -> Optional[float]:
|
||||
m = re.search(r"(-?\d+(?:[\.,]\d+)?)", text)
|
||||
if not m:
|
||||
return None
|
||||
return float(m.group(1).replace(",", "."))
|
||||
|
||||
|
||||
def _parse_normal_range(text: str) -> tuple[Optional[float], Optional[float]]:
|
||||
# Ex: "[N: 12-17]" / "[N: 12 - 17]"
|
||||
m = re.search(r"\[\s*N\s*:\s*([0-9]+(?:[\.,][0-9]+)?)\s*-\s*([0-9]+(?:[\.,][0-9]+)?)\s*\]", text)
|
||||
if not m:
|
||||
return None, None
|
||||
lo = float(m.group(1).replace(",", "."))
|
||||
hi = float(m.group(2).replace(",", "."))
|
||||
return lo, hi
|
||||
|
||||
|
||||
def _parse_float(v: str | None) -> float | None:
|
||||
if v is None:
|
||||
return None
|
||||
s = str(v).strip().replace(",", ".")
|
||||
m = re.search(r"(-?\d+(?:\.\d+)?)", s)
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
return float(m.group(1))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _age_band(dossier: DossierMedical, cfg: dict) -> str:
|
||||
age = getattr(dossier.sejour, "age", None)
|
||||
adult_min = (cfg.get("age_bands") or {}).get("adult_min_years", 18)
|
||||
if age is None:
|
||||
return "unknown"
|
||||
return "adult" if age >= adult_min else "child"
|
||||
|
||||
|
||||
def _threshold(cfg: dict, test: str, age_band: str, doc_lo: float | None) -> float:
|
||||
"""Retourne un seuil 'normal' conservateur pour déclencher un RULED_OUT.
|
||||
|
||||
Priorité:
|
||||
- doc_lo si présent (norme du document = vérité du dossier)
|
||||
- safe zone si âge inconnu ou enfant (conservateur)
|
||||
- fallback YAML sinon (adult)
|
||||
"""
|
||||
if doc_lo is not None:
|
||||
return float(doc_lo)
|
||||
|
||||
safe = cfg.get("safe_zones_unknown_age") or {}
|
||||
fallback = cfg.get("fallback_ranges") or {}
|
||||
|
||||
if age_band in ("unknown", "child"):
|
||||
# Seuils safe si dispo, sinon fallback adult
|
||||
key_map = {
|
||||
"platelets": "platelets_ruled_out_low",
|
||||
"sodium": "sodium_ruled_out_low",
|
||||
"potassium_high": "potassium_ruled_out_high",
|
||||
"potassium_low": "potassium_ruled_out_low",
|
||||
}
|
||||
k = key_map.get(test)
|
||||
if k and k in safe:
|
||||
return float(safe[k])
|
||||
|
||||
band = "adult" if age_band == "unknown" else age_band
|
||||
band_cfg = fallback.get(band) or fallback.get("adult") or {}
|
||||
test_cfg = band_cfg.get(test.replace("_high", "").replace("_low", "")) or {}
|
||||
lo = test_cfg.get("low")
|
||||
if lo is None:
|
||||
# dernier recours
|
||||
return 0.0
|
||||
return float(lo)
|
||||
|
||||
|
||||
def _threshold_high(cfg: dict, test: str, age_band: str, doc_hi: float | None) -> float:
|
||||
"""Retourne un seuil 'normal haut' conservateur.
|
||||
|
||||
Utilisé pour écarter des diagnostics de type "hyper-" quand la valeur est
|
||||
clairement ≤ la borne haute normale.
|
||||
|
||||
Priorité:
|
||||
- doc_hi si présent (norme du document)
|
||||
- safe zone si âge inconnu/enfant (conservateur)
|
||||
- fallback YAML sinon (adult)
|
||||
"""
|
||||
|
||||
if doc_hi is not None:
|
||||
return float(doc_hi)
|
||||
|
||||
safe = cfg.get("safe_zones_unknown_age") or {}
|
||||
fallback = cfg.get("fallback_ranges") or {}
|
||||
|
||||
if age_band in ("unknown", "child"):
|
||||
# safe zone dédiée si dispo
|
||||
if test == "potassium" and "potassium_ruled_out_high" in safe:
|
||||
return float(safe["potassium_ruled_out_high"])
|
||||
|
||||
band = "adult" if age_band == "unknown" else age_band
|
||||
band_cfg = fallback.get(band) or fallback.get("adult") or {}
|
||||
test_cfg = band_cfg.get(test) or {}
|
||||
hi = test_cfg.get("high")
|
||||
if hi is None:
|
||||
# dernier recours
|
||||
return 0.0
|
||||
return float(hi)
|
||||
|
||||
|
||||
def _is_sodium_test(test: str) -> bool:
|
||||
t = (test or "").lower().strip()
|
||||
# 'na' est trop générique: on privilégie sodium/natrémie
|
||||
if "sodium" in t or "natr" in t:
|
||||
return True
|
||||
return bool(re.fullmatch(r"na\+?", t))
|
||||
|
||||
|
||||
def _is_potassium_test(test: str) -> bool:
|
||||
t = (test or "").lower().strip()
|
||||
if "potassium" in t or "kali" in t:
|
||||
return True
|
||||
return bool(re.fullmatch(r"k\+?", t))
|
||||
|
||||
|
||||
def _bio_values(
|
||||
dossier: DossierMedical,
|
||||
matcher,
|
||||
) -> tuple[list[float], float | None, float | None]:
|
||||
"""Collecte des valeurs biologiques et une éventuelle norme [N: lo-hi].
|
||||
|
||||
- Les entrées BiologieCle peuvent être marquées quality=ok|suspect|discarded.
|
||||
- Par défaut, on **privilégie** les valeurs 'ok'. Si aucune valeur ok n'existe,
|
||||
on retombe sur les valeurs 'suspect' (audit), afin de ne pas perdre l'info.
|
||||
|
||||
Retour:
|
||||
- liste de valeurs (float)
|
||||
- norme basse (si trouvée)
|
||||
- norme haute (si trouvée)
|
||||
"""
|
||||
ok_values: list[float] = []
|
||||
suspect_values: list[float] = []
|
||||
lo_doc: float | None = None
|
||||
hi_doc: float | None = None
|
||||
|
||||
for b in dossier.biologie_cle or []:
|
||||
if not matcher(getattr(b, "test", "") or ""):
|
||||
continue
|
||||
|
||||
q = getattr(b, "quality", None) or "ok"
|
||||
if q == "discarded":
|
||||
continue
|
||||
|
||||
# Priorité: valeur_num si disponible (plus fiable que reparsing)
|
||||
val = getattr(b, "valeur_num", None)
|
||||
if val is None:
|
||||
raw = str(getattr(b, "valeur", "") or "")
|
||||
val = _parse_float(raw)
|
||||
|
||||
if val is None:
|
||||
continue
|
||||
|
||||
if q == "suspect":
|
||||
suspect_values.append(val)
|
||||
else:
|
||||
ok_values.append(val)
|
||||
|
||||
# Normes éventuelles dans la chaîne
|
||||
if lo_doc is None and hi_doc is None:
|
||||
raw = str(getattr(b, "valeur", "") or "")
|
||||
lo, hi = _parse_normal_range(raw)
|
||||
if lo is not None or hi is not None:
|
||||
lo_doc, hi_doc = lo, hi
|
||||
|
||||
values = ok_values if ok_values else suspect_values
|
||||
return values, lo_doc, hi_doc
|
||||
|
||||
|
||||
def _get_platelets_context(dossier: DossierMedical) -> tuple[float | None, float | None, float | None]:
|
||||
"""Retourne (valeur_plaquettes, norme_basse, norme_haute) si disponible.
|
||||
|
||||
Politique:
|
||||
- privilégie une valeur qualité=ok
|
||||
- sinon retombe sur une valeur qualité=suspect
|
||||
- ignore discarded
|
||||
"""
|
||||
best_val: float | None = None
|
||||
best_q: str | None = None
|
||||
best_raw: str | None = None
|
||||
best_lo: float | None = None
|
||||
best_hi: float | None = None
|
||||
|
||||
for b in dossier.biologie_cle or []:
|
||||
test = (b.test or "").lower()
|
||||
if "plaquette" not in test and "platelet" not in test:
|
||||
continue
|
||||
|
||||
q = getattr(b, "quality", None) or "ok"
|
||||
if q == "discarded":
|
||||
continue
|
||||
|
||||
raw = str(b.valeur or "")
|
||||
val = getattr(b, "valeur_num", None)
|
||||
if val is None:
|
||||
val = _parse_float(raw)
|
||||
|
||||
if val is None:
|
||||
continue
|
||||
|
||||
lo, hi = _parse_normal_range(raw)
|
||||
|
||||
if best_val is None:
|
||||
best_val, best_q, best_raw, best_lo, best_hi = val, q, raw, lo, hi
|
||||
continue
|
||||
|
||||
# Remplacer un suspect par un ok
|
||||
if best_q == "suspect" and q != "suspect":
|
||||
best_val, best_q, best_raw, best_lo, best_hi = val, q, raw, lo, hi
|
||||
|
||||
return best_val, best_lo, best_hi
|
||||
|
||||
|
||||
|
||||
def _anemia_bio(diag: Diagnostic) -> bool:
|
||||
# 1) via preuves_cliniques (souvent déjà interprétées)
|
||||
for p in diag.preuves_cliniques or []:
|
||||
blob = f"{p.element} {p.interpretation}".lower()
|
||||
if "hemoglob" in blob or "hémoglob" in blob or blob.strip().startswith("hb"):
|
||||
val = _first_float(p.element) or _first_float(p.interpretation)
|
||||
lo, _ = _parse_normal_range(p.element)
|
||||
lo = lo if lo is not None else 12.0
|
||||
if val is not None and val < lo:
|
||||
return True
|
||||
if "confirm" in blob and "anemie" in blob:
|
||||
return True
|
||||
# 2) fallback : le texte mentionne une anémie chiffrée
|
||||
ex = _norm(diag.source_excerpt or "")
|
||||
if "hemoglob" in ex or "hémoglob" in ex:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _iron_evidence_blob(dossier: DossierMedical, diag: Diagnostic) -> str:
|
||||
parts: list[str] = []
|
||||
|
||||
# Preuves patient (extraits + éléments structurés)
|
||||
if diag.source_excerpt:
|
||||
parts.append(str(diag.source_excerpt))
|
||||
|
||||
for p in diag.preuves_cliniques or []:
|
||||
parts.append(f"{p.element} {p.interpretation}")
|
||||
|
||||
# Biologie clé globale (si ferritine/fer a été capté ailleurs)
|
||||
for b in dossier.biologie_cle or []:
|
||||
parts.append(f"{b.test} {b.valeur or ''}")
|
||||
|
||||
# Traitements (si supplémentation martiale documentée)
|
||||
for t in dossier.traitements_sortie or []:
|
||||
parts.append(f"{t.medicament} {t.posologie or ''}")
|
||||
|
||||
return _norm("\n".join(parts))
|
||||
|
||||
|
||||
def apply_decisions(dossier: DossierMedical) -> None:
|
||||
"""Applique des décisions finales sur DP/DAS.
|
||||
|
||||
- Ne supprime pas la suggestion du modèle.
|
||||
- Remplit cim10_final systématiquement quand une suggestion existe.
|
||||
- Remplit cim10_decision uniquement si action != KEEP (pour garder le JSON lisible).
|
||||
"""
|
||||
|
||||
def _set_default_final(diag: Diagnostic):
|
||||
if diag.cim10_suggestion and diag.cim10_final is None:
|
||||
diag.cim10_final = diag.cim10_suggestion
|
||||
|
||||
# DP
|
||||
if dossier.diagnostic_principal:
|
||||
_set_default_final(dossier.diagnostic_principal)
|
||||
|
||||
# DAS
|
||||
for das in dossier.diagnostics_associes or []:
|
||||
_set_default_final(das)
|
||||
|
||||
# --- Règle: D50 sans preuve martiale -> downgrade D64.9 + needs_info ---
|
||||
if rule_enabled("RULE-D50-NEEDS-IRON"):
|
||||
for das in dossier.diagnostics_associes or []:
|
||||
if das.cim10_suggestion != "D50":
|
||||
continue
|
||||
|
||||
blob = _iron_evidence_blob(dossier, das)
|
||||
has_iron = any(m in blob for m in IRON_MARKERS)
|
||||
has_anemia = _anemia_bio(das)
|
||||
|
||||
# Si on n'a même pas d'anémie biologique, on n'automatise pas.
|
||||
if not has_anemia:
|
||||
continue
|
||||
|
||||
if not has_iron:
|
||||
das.cim10_final = "D64.9"
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="DOWNGRADE",
|
||||
final_code="D64.9",
|
||||
downgraded_from="D50",
|
||||
reason="Anémie biologique sans preuve d'étiologie ferriprive (bilan martial absent/insuffisant).",
|
||||
needs_info=[
|
||||
"Bilan martial disponible ? (ferritine, fer, CST/transferrine)",
|
||||
"Mention explicite 'anémie ferriprive' ou carence martiale ?",
|
||||
"Traitement martial (fer per os/IV) documenté ?",
|
||||
],
|
||||
applied_rules=["RULE-D50-NEEDS-IRON"],
|
||||
)
|
||||
|
||||
# --- Règle: thrombopénie (D69.6) incompatible avec plaquettes normales -> ruled_out (visible mais barré)
|
||||
# Objectif: éviter un FAIL "dur" sur incohérence biologique quand la biologie contredit clairement.
|
||||
if rule_enabled("RULE-D69.6-PLT-NORMAL"):
|
||||
cfg_ranges = load_reference_ranges()
|
||||
plaquettes, plt_lo_doc, _plt_hi_doc = _get_platelets_context(dossier)
|
||||
age_band = _age_band(dossier, cfg_ranges)
|
||||
plt_threshold = _threshold(cfg_ranges, "platelets", age_band, plt_lo_doc)
|
||||
if plaquettes is not None and plaquettes >= plt_threshold:
|
||||
for das in dossier.diagnostics_associes or []:
|
||||
if das.cim10_suggestion != "D69.6":
|
||||
continue
|
||||
# Visible mais barré : on conserve la suggestion, mais on retire le code final
|
||||
das.status = "ruled_out"
|
||||
das.ruled_out_reason = f"Contradiction biologique: plaquettes={plaquettes} (≥{plt_threshold}, valeur normale)" \
|
||||
" — thrombopénie non retenue sans preuve explicite."
|
||||
das.cim10_final = None
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="RULED_OUT",
|
||||
final_code=None,
|
||||
downgraded_from="D69.6",
|
||||
reason=das.ruled_out_reason,
|
||||
needs_info=[
|
||||
"Mention explicite de thrombopénie confirmée dans le CR (malgré plaquettes normales) ?",
|
||||
"Valeurs de plaquettes sur d'autres dates (trend) ?",
|
||||
"Cause/iatrogénie documentée (héparine, hémopathie, etc.) ?",
|
||||
],
|
||||
applied_rules=["RULE-D69.6-PLT-NORMAL"],
|
||||
)
|
||||
|
||||
# --- Pack "bio": contradictions simples Na/K -> ruled_out (piloté par config/bio_rules.yaml)
|
||||
# Objectif: réduire VETO-09 en écartant les diagnostics "hyper/hypo" quand la valeur est clairement normale.
|
||||
bio_cfg = load_bio_rules() or {}
|
||||
rules = (bio_cfg.get("rules") or {}) if isinstance(bio_cfg, dict) else {}
|
||||
|
||||
missing_cfg = (bio_cfg.get("missing_evidence") or {}) if isinstance(bio_cfg, dict) else {}
|
||||
def _push_need_info_veto(where: str, message: str) -> None:
|
||||
"""Ajoute un VETO non-bloquant quand la preuve biologique est manquante."""
|
||||
if dossier.veto_report is None:
|
||||
return
|
||||
vr = dossier.veto_report
|
||||
veto = str(missing_cfg.get("veto") or "VETO-17")
|
||||
# Désactivation globale par YAML (config/rules)
|
||||
if not rule_enabled(veto):
|
||||
return
|
||||
severity = str(missing_cfg.get("severity") or "LOW")
|
||||
penalty = int(missing_cfg.get("score_penalty") or 0)
|
||||
|
||||
# Anti-doublon
|
||||
if any((it.veto == veto and it.where == where and (it.message or "") == message) for it in (vr.issues or [])):
|
||||
return
|
||||
|
||||
vr.issues.append(VetoIssue(veto=veto, severity=severity, where=where, message=message))
|
||||
if (vr.verdict or "") == "PASS":
|
||||
vr.verdict = "NEED_INFO"
|
||||
if penalty:
|
||||
vr.score_contestabilite = max(0, int(vr.score_contestabilite or 0) - penalty)
|
||||
|
||||
|
||||
# Sodium (hyponatrémie)
|
||||
r = rules.get("hyponatremia") or {}
|
||||
if r.get("enabled", True):
|
||||
codes = set(r.get("codes") or ["E87.1"])
|
||||
na_values, na_lo_doc, _na_hi_doc = _bio_values(dossier, _is_sodium_test)
|
||||
if (not na_values) and bool(missing_cfg.get("enabled", False)) and rule_enabled("RULE-E87.1-MISSING-NA"):
|
||||
for i, das in enumerate(dossier.diagnostics_associes or []):
|
||||
if (das.cim10_suggestion or "") not in codes:
|
||||
continue
|
||||
if das.cim10_decision and (das.cim10_decision.action or "") in ("RULED_OUT", "REMOVE"):
|
||||
continue
|
||||
|
||||
reason = "Preuve manquante: natrémie (sodium) non extraite — impossible de valider E87.1 de façon défendable."
|
||||
where = f"diagnostics_associes[{i}]"
|
||||
das.status = "needs_info"
|
||||
das.cim10_final = None
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="NEED_INFO",
|
||||
final_code=None,
|
||||
downgraded_from=das.cim10_suggestion,
|
||||
reason=reason,
|
||||
needs_info=[
|
||||
"Valeur(s) de sodium (natrémie) + date(s) ?",
|
||||
"Normes du laboratoire si disponibles ?",
|
||||
],
|
||||
applied_rules=["RULE-E87.1-MISSING-NA"],
|
||||
)
|
||||
_push_need_info_veto(where, "E87.1 suggérée mais aucune natrémie (Na) n'a été extraite des résultats biologiques.")
|
||||
|
||||
if na_values and rule_enabled("RULE-E87.1-NA-NORMAL"):
|
||||
na_threshold = _threshold(cfg_ranges, "sodium", age_band, na_lo_doc)
|
||||
# Ne ruled_out que si AUCUNE valeur n'est sous la borne basse normale.
|
||||
if min(na_values) >= na_threshold:
|
||||
na_val = min(na_values)
|
||||
for das in dossier.diagnostics_associes or []:
|
||||
if (das.cim10_suggestion or "") not in codes:
|
||||
continue
|
||||
das.status = "ruled_out"
|
||||
das.ruled_out_reason = (
|
||||
f"Contradiction biologique: sodium={na_val} (≥{na_threshold}, valeur normale) "
|
||||
"— hyponatrémie non retenue sans preuve explicite."
|
||||
)
|
||||
das.cim10_final = None
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="RULED_OUT",
|
||||
final_code=None,
|
||||
downgraded_from=das.cim10_suggestion,
|
||||
reason=das.ruled_out_reason,
|
||||
needs_info=[
|
||||
"Valeurs de natrémie sur d'autres dates (trend) ?",
|
||||
"Mention explicite d'hyponatrémie confirmée malgré valeurs normales ?",
|
||||
"Contexte (perfusions, diurétiques, SIADH, etc.) documenté ?",
|
||||
],
|
||||
applied_rules=["RULE-E87.1-NA-NORMAL"],
|
||||
)
|
||||
|
||||
# Potassium (hyper/hypo)
|
||||
k_values, k_lo_doc, k_hi_doc = _bio_values(dossier, _is_potassium_test)
|
||||
if (not k_values) and bool(missing_cfg.get("enabled", False)):
|
||||
# Valeur de kaliémie manquante : on refuse de valider E87.5/E87.6 sans preuve.
|
||||
codes_hyper = set((rules.get("hyperkalemia") or {}).get("codes") or ["E87.5"])
|
||||
codes_hypo = set((rules.get("hypokalemia") or {}).get("codes") or ["E87.6"])
|
||||
codes = codes_hyper.union(codes_hypo)
|
||||
|
||||
for i, das in enumerate(dossier.diagnostics_associes or []):
|
||||
if (das.cim10_suggestion or "") not in codes:
|
||||
continue
|
||||
if das.cim10_decision and (das.cim10_decision.action or "") in ("RULED_OUT", "REMOVE"):
|
||||
continue
|
||||
|
||||
code = das.cim10_suggestion or ""
|
||||
rule_id = f"RULE-{code}-MISSING-K"
|
||||
if not rule_enabled(rule_id):
|
||||
continue
|
||||
reason = f"Preuve manquante: kaliémie (potassium) non extraite — impossible de valider {code} de façon défendable."
|
||||
where = f"diagnostics_associes[{i}]"
|
||||
das.status = "needs_info"
|
||||
das.cim10_final = None
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="NEED_INFO",
|
||||
final_code=None,
|
||||
downgraded_from=code,
|
||||
reason=reason,
|
||||
needs_info=[
|
||||
"Valeur(s) de potassium (kaliémie) + date(s) ?",
|
||||
"Normes du laboratoire si disponibles ?",
|
||||
],
|
||||
applied_rules=[f"RULE-{code}-MISSING-K"],
|
||||
)
|
||||
_push_need_info_veto(where, f"{code} suggéré mais aucune kaliémie (K) n'a été extraite des résultats biologiques.")
|
||||
|
||||
if k_values:
|
||||
# Hyperkaliémie
|
||||
r = rules.get("hyperkalemia") or {}
|
||||
if r.get("enabled", True) and rule_enabled("RULE-E87.5-K-NORMAL"):
|
||||
codes = set(r.get("codes") or ["E87.5"])
|
||||
k_high = _threshold_high(cfg_ranges, "potassium", age_band, k_hi_doc)
|
||||
# Ruled_out si AUCUNE valeur ne dépasse la borne haute normale.
|
||||
if max(k_values) <= k_high:
|
||||
k_val = max(k_values)
|
||||
for das in dossier.diagnostics_associes or []:
|
||||
if (das.cim10_suggestion or "") not in codes:
|
||||
continue
|
||||
das.status = "ruled_out"
|
||||
das.ruled_out_reason = (
|
||||
f"Contradiction biologique: potassium={k_val} (≤{k_high}, valeur normale) "
|
||||
"— hyperkaliémie non retenue sans preuve explicite."
|
||||
)
|
||||
das.cim10_final = None
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="RULED_OUT",
|
||||
final_code=None,
|
||||
downgraded_from=das.cim10_suggestion,
|
||||
reason=das.ruled_out_reason,
|
||||
needs_info=[
|
||||
"Valeurs de kaliémie sur d'autres dates (trend) ?",
|
||||
"Mention explicite d'hyperkaliémie confirmée malgré valeurs normales ?",
|
||||
"Contexte (IRA, IEC/ARA2, spironolactone, hémolyse) documenté ?",
|
||||
],
|
||||
applied_rules=["RULE-E87.5-K-NORMAL"],
|
||||
)
|
||||
|
||||
# Hypokaliémie
|
||||
r = rules.get("hypokalemia") or {}
|
||||
if r.get("enabled", True) and rule_enabled("RULE-E87.6-K-NORMAL"):
|
||||
codes = set(r.get("codes") or ["E87.6"])
|
||||
k_low = _threshold(cfg_ranges, "potassium_low", age_band, k_lo_doc)
|
||||
# Ruled_out si AUCUNE valeur n'est sous la borne basse normale.
|
||||
if min(k_values) >= k_low:
|
||||
k_val = min(k_values)
|
||||
for das in dossier.diagnostics_associes or []:
|
||||
if (das.cim10_suggestion or "") not in codes:
|
||||
continue
|
||||
das.status = "ruled_out"
|
||||
das.ruled_out_reason = (
|
||||
f"Contradiction biologique: potassium={k_val} (≥{k_low}, valeur normale) "
|
||||
"— hypokaliémie non retenue sans preuve explicite."
|
||||
)
|
||||
das.cim10_final = None
|
||||
das.cim10_decision = CodeDecision(
|
||||
action="RULED_OUT",
|
||||
final_code=None,
|
||||
downgraded_from=das.cim10_suggestion,
|
||||
reason=das.ruled_out_reason,
|
||||
needs_info=[
|
||||
"Valeurs de kaliémie sur d'autres dates (trend) ?",
|
||||
"Mention explicite d'hypokaliémie confirmée malgré valeurs normales ?",
|
||||
"Contexte (diurétiques, diarrhées, pertes rénales) documenté ?",
|
||||
],
|
||||
applied_rules=["RULE-E87.6-K-NORMAL"],
|
||||
)
|
||||
|
||||
|
||||
|
||||
def decision_summaries(dossier: DossierMedical) -> list[str]:
    """Build human-readable "DECISION: ..." lines for injection into alertes_codage.

    Walks the DP and every DAS; each diagnostic carrying a non-KEEP
    CodeDecision contributes one summary line, plus optional reason /
    needs_info follow-up lines. Diagnostics without a decision, or with
    action == "KEEP", produce nothing.
    """
    out: list[str] = []

    def describe(where: str, diag: Diagnostic) -> None:
        # One summary per decided diagnostic; wording depends on the action.
        decision = diag.cim10_decision
        if not decision or decision.action == "KEEP":
            return
        rules = ', '.join(decision.applied_rules)
        action = decision.action
        if action == "DOWNGRADE":
            out.append(f"DECISION: {where} {decision.downgraded_from}→{decision.final_code} ({rules})")
            # Cap follow-up questions at three to keep the alert list short.
            for question in decision.needs_info[:3]:
                out.append(f"DECISION: besoin_info: {question}")
        elif action == "REMOVE":
            out.append(f"DECISION: {where} {diag.cim10_suggestion} supprimé ({rules})")
        elif action == "RULED_OUT":
            out.append(f"DECISION: {where} {diag.cim10_suggestion} écarté (ruled_out) ({rules})")
            if decision.reason:
                out.append(f"DECISION: raison: {decision.reason}")
        elif action == "NEED_INFO":
            out.append(f"DECISION: {where} {diag.cim10_suggestion} non retenu (NEED_INFO) ({rules})")
            if decision.reason:
                out.append(f"DECISION: raison: {decision.reason}")
            if decision.needs_info:
                for question in decision.needs_info:
                    out.append(f"DECISION: besoin_info: {question}")

    if dossier.diagnostic_principal:
        describe("diagnostic_principal", dossier.diagnostic_principal)

    for idx, das in enumerate(dossier.diagnostics_associes or []):
        describe(f"diagnostics_associes[{idx}]", das)

    return out
|
||||
205
src/quality/rules_router.py
Normal file
205
src/quality/rules_router.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""src/quality/rules_router.py
|
||||
|
||||
Routage dynamique des règles (packs) par dossier.
|
||||
|
||||
Objectif:
|
||||
- éviter de tout exécuter tout le temps (surtout dans un batch de centaines de dossiers)
|
||||
- garder un socle 'pro' (vetos_core + decisions_core)
|
||||
- activer des packs additionnels uniquement quand le dossier contient des signaux pertinents
|
||||
(codes, biologie, extraits, etc.)
|
||||
|
||||
Le routage est piloté par config/rules/router.yaml (éditable et future UI-friendly).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, Iterable, List, Set
|
||||
|
||||
from ..config import DossierMedical, load_rules_router
|
||||
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
s = (s or "").lower()
|
||||
s = re.sub(r"\s+", " ", s).strip()
|
||||
return s
|
||||
|
||||
|
||||
def _iter_codes(dossier: DossierMedical) -> Iterable[str]:
|
||||
# DP
|
||||
if dossier.diagnostic_principal:
|
||||
for c in (dossier.diagnostic_principal.cim10_final, dossier.diagnostic_principal.cim10_suggestion):
|
||||
if c:
|
||||
yield str(c).upper()
|
||||
# DAS
|
||||
for d in (dossier.diagnostics_associes or []):
|
||||
for c in (getattr(d, "cim10_final", None), getattr(d, "cim10_suggestion", None)):
|
||||
if c:
|
||||
yield str(c).upper()
|
||||
|
||||
|
||||
def _collect_text_corpus(dossier: DossierMedical, max_chars: int = 60000) -> str:
    """Concatenate the dossier's textual signals into one normalized corpus.

    Gathers DP/DAS excerpts, RAG extracts, lab-test names, imaging
    conclusions, medical history and complications; joins them with
    newlines, keeps only the last *max_chars* characters, and returns the
    `_norm()`-normalized result.
    """
    chunks: List[str] = []
    add = chunks.append

    # DP / DAS evidence excerpts.
    dp = dossier.diagnostic_principal
    if dp and dp.source_excerpt:
        add(str(dp.source_excerpt))
    for diag in (dossier.diagnostics_associes or []):
        excerpt = getattr(diag, "source_excerpt", None)
        if excerpt:
            add(str(excerpt))
        # RAG extracts (usually short snippets).
        for src in (getattr(diag, "sources_rag", None) or []):
            rag_excerpt = getattr(src, "extrait", None)
            if rag_excerpt:
                add(str(rag_excerpt))

    # Lab-test names.
    for bio in (dossier.biologie_cle or []):
        if bio.test:
            add(str(bio.test))

    # Imaging / history / complications.
    for img in (dossier.imagerie or []):
        if img.conclusion:
            add(str(img.conclusion))
    for atcd in (dossier.antecedents or []):
        if atcd.texte:
            add(str(atcd.texte))
    for compl in (dossier.complications or []):
        if compl.texte:
            add(str(compl.texte))

    corpus = "\n".join(chunks)
    # Truncation keeps the TAIL of the corpus — presumably the later
    # sections matter most here; confirm before changing. (NOTE(review))
    if len(corpus) > max_chars:
        corpus = corpus[-max_chars:]
    return _norm(corpus)
|
||||
|
||||
|
||||
def _collect_lab_tests(dossier: DossierMedical) -> Set[str]:
    """Return the normalized name of every lab test present in the dossier."""
    return {_norm(b.test) for b in (dossier.biologie_cle or []) if b.test}
|
||||
|
||||
|
||||
def _match_codes_prefix(codes: Set[str], prefixes: List[str]) -> bool:
|
||||
pref = [p.upper() for p in prefixes if p]
|
||||
for c in codes:
|
||||
for p in pref:
|
||||
if c.startswith(p):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _match_codes_any(codes: Set[str], values: List[str]) -> bool:
|
||||
want = {v.upper() for v in values if v}
|
||||
return bool(codes & want)
|
||||
|
||||
|
||||
def _match_keywords(corpus: str, keywords: List[str]) -> bool:
    """True when any non-empty keyword, normalized, occurs as a substring of the (already normalized) corpus."""
    return any(_norm(kw) in corpus for kw in keywords if kw)
|
||||
|
||||
|
||||
def _match_lab_tests(lab_tests: Set[str], values: List[str]) -> bool:
    """True when a wanted test name (normalized) is an exact or substring match of any observed test name.

    Substring matching covers cases like 'ionogramme' vs 'ionogramme (na, k, cl...)'.
    """
    wanted = {_norm(v) for v in values if v}
    return any(w in observed for observed in lab_tests for w in wanted)
|
||||
|
||||
|
||||
def _eval_condition_block(block: Dict[str, Any], codes: Set[str], corpus: str, lab_tests: Set[str], doc_type: str) -> bool:
    """A condition block is true when EVERY recognized key matches (AND semantics).

    Recognized keys: doc_types, codes_prefix, codes, keywords, lab_tests.
    An empty/missing block matches everything.
    """
    if not block:
        return True

    # Document type: normalized membership test.
    doc_types = block.get("doc_types")
    if doc_types and _norm(doc_type) not in {_norm(x) for x in doc_types if x}:
        return False

    # Remaining keys share the same shape: list of wanted values + matcher.
    checks = (
        ("codes_prefix", _match_codes_prefix, codes),
        ("codes", _match_codes_any, codes),
        ("keywords", _match_keywords, corpus),
        ("lab_tests", _match_lab_tests, lab_tests),
    )
    for key, matcher, haystack in checks:
        wanted = block.get(key)
        if wanted and not matcher(haystack, list(wanted)):
            return False

    return True
|
||||
|
||||
|
||||
def build_rules_runtime_context(dossier: DossierMedical) -> Dict[str, Any]:
    """Build the runtime routing context (enabled packs, mode, fired triggers).

    Loads config/rules/router.yaml via load_rules_router(), evaluates each
    trigger against the dossier's signals (codes, text corpus, lab tests,
    document type) and unions the packs of every firing trigger onto the
    defaults.
    """
    router = load_rules_router()
    mode = str(router.get("mode") or "strict").lower()

    defaults = router.get("defaults") or {}
    enabled_packs: Set[str] = set(defaults.get("enabled_packs") or [])
    always_on_rules: Set[str] = set(defaults.get("always_on_rules") or [])
    triggers_fired: List[str] = []

    # Extract the matching signals once per dossier.
    codes = set(_iter_codes(dossier))
    corpus = _collect_text_corpus(dossier)
    lab_tests = _collect_lab_tests(dossier)
    doc_type = dossier.document_type or ""

    def _cond(block: Dict[str, Any]) -> bool:
        return _eval_condition_block(block, codes, corpus, lab_tests, doc_type)

    for trigger in (router.get("triggers") or []):
        if not isinstance(trigger, dict):
            continue
        trigger_id = str(trigger.get("id") or trigger.get("name") or "TRIGGER")
        packs_to_enable = trigger.get("enable_packs") or []
        enable_rules = trigger.get("enable_rules") or []
        disable_rules = trigger.get("disable_rules") or []

        when_all = trigger.get("when_all") or {}
        when_any = trigger.get("when_any") or {}

        all_ok = _cond(when_all)
        # when_any accepts two shapes:
        #   - a single dict -> one block (internal AND)
        #   - a list of dicts -> OR between blocks
        # anything else never matches.
        if not when_any:
            any_ok = True
        elif isinstance(when_any, list):
            any_ok = any(_cond(b or {}) for b in when_any)
        elif isinstance(when_any, dict):
            any_ok = _cond(when_any)
        else:
            any_ok = False

        if all_ok and any_ok:
            enabled_packs.update(str(p) for p in packs_to_enable if p)
            triggers_fired.append(trigger_id)

        # Rule-level overrides are parsed but not applied yet (future hook
        # for config.rule_enabled).
        if enable_rules or disable_rules:
            pass

    return {
        "router_version": router.get("version", 1),
        "mode": mode,
        "enabled_packs": sorted(enabled_packs),
        "always_on_rules": sorted(always_on_rules),
        "triggers_fired": triggers_fired,
    }
|
||||
411
src/quality/veto_engine.py
Normal file
411
src/quality/veto_engine.py
Normal file
@@ -0,0 +1,411 @@
|
||||
"""Moteur de vetos (contrôle de contestabilité).
|
||||
|
||||
Objectif : bloquer automatiquement les propositions CIM-10/CCAM contestables
|
||||
(absence de preuve, négation/conditionnel, doublons incohérents, etc.).
|
||||
|
||||
Ce module est volontairement simple et déterministe : il doit être stable,
|
||||
audit-able, et indépendant des modèles.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Iterable
|
||||
|
||||
from ..config import (
|
||||
ActeCCAM,
|
||||
BiologieCle,
|
||||
Diagnostic,
|
||||
DossierMedical,
|
||||
VetoIssue,
|
||||
VetoReport,
|
||||
rule_enabled,
|
||||
rule_force_severity,
|
||||
)
|
||||
|
||||
|
||||
# NOTE: Vetos = déterministes et auditables.
|
||||
# On évite d'interpréter le « raisonnement » du modèle comme une preuve.
|
||||
|
||||
# Cue lists consumed by _analyze_neg_cond(). Matching happens on text that
# has gone through _norm() (lowercased, accents stripped, punctuation
# collapsed to spaces).
# NOTE(review): the accented entries ("a éliminer", "négatif", "éventuel",
# "hypothèse") and the "?" cue can never match _norm()-normalized text —
# confirm they are intentional fallbacks before removing.
_NEGATION_CUES = (
    "pas de",
    "pas d",
    "absence de",
    "non retenu",
    "exclu",
    "a eliminer",
    "a éliminer",
    "negatif",
    "négatif",
)

# Conditional / uncertainty markers.
# NOTE(review): cues are substring-matched against the whole sentence, so a
# short cue like "si" also matches inside longer words (e.g. "signe") —
# verify this over-triggering is acceptable.
_CONDITIONAL_CUES = (
    "si",
    "s il",  # frequent OCR rendering of "s'il"
    "eventuel",
    "éventuel",
    "suspect",
    "probable",
    "hypothese",
    "hypothèse",
    "?",
)

# Evidence that stems from a score / screening tool rather than an explicit
# diagnosis (used by VETO-15 to flag likely over-coding).
_EVIDENCE_TEMPLATE_CUES = (
    "score",
    "fib4",
    "fibrosis-4",
    "test de depistage",
    "test de dépistage",
    "outil de depistage",
    "outil de dépistage",
)
|
||||
|
||||
|
||||
def _norm(s: str) -> str:
|
||||
"""Normalisation légère (lower + sans accents) pour matcher OCR."""
|
||||
s = s.replace("’", "'")
|
||||
s = unicodedata.normalize("NFKD", s)
|
||||
s = "".join(ch for ch in s if not unicodedata.combining(ch))
|
||||
s = s.lower()
|
||||
# simplifier ponctuation en espaces
|
||||
s = re.sub(r"[^a-z0-9]+", " ", s)
|
||||
return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
|
||||
def _split_sentences(text: str) -> list[str]:
|
||||
# volontairement simple : robuste sur OCR
|
||||
text = text.replace("\r", "\n")
|
||||
parts = re.split(r"[\n\.\;\:]+", text)
|
||||
return [p.strip() for p in parts if p.strip()]
|
||||
|
||||
|
||||
def _concept_keywords(label: str) -> list[str]:
    """Extract up to five discriminating keywords (>= 4 chars, not stop words) from a diagnosis label."""
    stop = {
        "de", "du", "des", "la", "le", "les", "un", "une", "et", "a", "au", "aux",
        "gauche", "droite", "bilaterale", "bilat", "chronique", "aigue", "aigu",
        "sans", "avec",
    }
    candidates = (tok for tok in _norm(label).split() if len(tok) >= 4 and tok not in stop)
    # dict.fromkeys deduplicates while preserving first-seen order.
    return list(dict.fromkeys(candidates))[:5]
|
||||
|
||||
|
||||
def _analyze_neg_cond(excerpts: Iterable[str], label: str) -> tuple[bool, bool, bool, bool]:
    """Return (negated, conditional, contradictory, positive).

    *negated*: some sentence mentions the concept with a negation cue shortly before it.
    *conditional*: some sentence mentions the concept together with a conditional marker.
    *positive*: some sentence mentions the concept without a nearby negation.
    *contradictory*: both negated and positive were observed across the excerpts.
    """
    kws = _concept_keywords(label)
    if not kws:
        # No discriminating keyword extracted from the label -> nothing to analyze.
        return False, False, False, False

    negated = False
    conditional = False
    positive = False

    for ex in excerpts:
        if not ex or not str(ex).strip():
            continue
        for sent in _split_sentences(str(ex)):
            ns = _norm(sent)
            if not ns:
                continue
            # Is the concept mentioned in this sentence? Take the first keyword hit.
            hit_pos = None
            for kw in kws:
                pos = ns.find(kw)
                if pos != -1:
                    hit_pos = pos
                    break
            if hit_pos is None:
                continue

            # A negation cue only counts if it appears in the 40 normalized
            # characters immediately before the concept mention.
            pre = ns[max(0, hit_pos - 40):hit_pos]
            has_neg = any(cue in pre for cue in _NEGATION_CUES)
            # Conditional cues are checked anywhere in the sentence.
            has_cond = any(cue in ns for cue in _CONDITIONAL_CUES)

            if has_neg:
                negated = True
            else:
                positive = True

            if has_cond:
                conditional = True

    contradictory = negated and positive
    return negated, conditional, contradictory, positive
|
||||
|
||||
|
||||
def _evidence_excerpts(d: Diagnostic | ActeCCAM) -> list[str]:
|
||||
"""Ne retourne que des preuves (extraits), pas le raisonnement du modèle."""
|
||||
texts: list[str] = []
|
||||
if getattr(d, "source_excerpt", None):
|
||||
texts.append(str(getattr(d, "source_excerpt")))
|
||||
# Sources RAG (extraits)
|
||||
for s in getattr(d, "sources_rag", []) or []:
|
||||
if getattr(s, "extrait", None):
|
||||
texts.append(str(s.extrait))
|
||||
return [t for t in texts if t.strip()]
|
||||
|
||||
|
||||
def _has_evidence(d: Diagnostic | ActeCCAM) -> bool:
|
||||
if getattr(d, "source_excerpt", None):
|
||||
return True
|
||||
if getattr(d, "sources_rag", None):
|
||||
# un extrait RAG suffit
|
||||
for s in d.sources_rag:
|
||||
if s.extrait and str(s.extrait).strip():
|
||||
return True
|
||||
if isinstance(d, Diagnostic) and getattr(d, "preuves_cliniques", None):
|
||||
return len(d.preuves_cliniques) > 0
|
||||
return False
|
||||
|
||||
|
||||
def _has_template_evidence(excerpts: Iterable[str]) -> bool:
    """True when any excerpt mentions a score/screening-tool cue (comparison on normalized text)."""
    haystack = _norm("\n".join(str(x) for x in excerpts if x))
    return any(_norm(cue) in haystack for cue in _EVIDENCE_TEMPLATE_CUES)
|
||||
|
||||
|
||||
def _parse_float(v: str | None) -> float | None:
|
||||
if v is None:
|
||||
return None
|
||||
s = str(v).strip().replace(",", ".")
|
||||
# extraire le premier nombre
|
||||
m = re.search(r"(-?\d+(?:\.\d+)?)", s)
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
return float(m.group(1))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _get_bio_value(bios: list[BiologieCle], keywords: tuple[str, ...]) -> float | None:
    """Return the parsed value of the FIRST lab test whose (lowercased) name contains a keyword.

    Stops at the first name match even when its value does not parse
    (returns None in that case); later matching tests are not tried.
    """
    for bio in bios:
        name = (bio.test or "").lower()
        if any(kw in name for kw in keywords):
            return _parse_float(bio.valeur)
    return None
|
||||
|
||||
|
||||
def _is_ruled_out(d: Diagnostic) -> bool:
|
||||
"""Retourne True si le diagnostic est marqué "visible mais barré"."""
|
||||
if getattr(d, "status", None) == "ruled_out":
|
||||
return True
|
||||
dec = getattr(d, "cim10_decision", None)
|
||||
return bool(dec is not None and getattr(dec, "action", None) == "RULED_OUT")
|
||||
|
||||
|
||||
def apply_vetos(dossier: DossierMedical) -> VetoReport:
    """Apply deterministic vetoes and return a report.

    Verdicts:
    - FAIL: at least one HARD veto.
    - NEED_INFO: no HARD, at least one MEDIUM.
    - PASS: no HARD/MEDIUM.
    """

    issues: list[VetoIssue] = []
    seen_issue_keys: set[tuple[str, str, str]] = set()  # (veto, where, message)

    def add(veto: str, severity: str, where: str, message: str):
        # Global kill-switch per veto via YAML (config/rules).
        if not rule_enabled(veto):
            return
        # Optional: force the severity via YAML (useful during calibration).
        forced = rule_force_severity(veto)
        if forced:
            severity = forced
        # De-duplicate identical issues.
        key = (veto, where, message)
        if key in seen_issue_keys:
            return
        seen_issue_keys.add(key)
        issues.append(VetoIssue(veto=veto, severity=severity, where=where, message=message))

    # -----------------------------
    # VETO-02: code without evidence
    # -----------------------------
    dp = dossier.diagnostic_principal
    if dp and dp.cim10_suggestion:
        if not _has_evidence(dp):
            add("VETO-02", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} sans preuve exploitable")

    for i, das in enumerate(dossier.diagnostics_associes):
        if _is_ruled_out(das):
            continue
        if das.cim10_suggestion and not _has_evidence(das):
            add("VETO-02", "MEDIUM", f"diagnostics_associes[{i}]", f"DAS {das.cim10_suggestion} sans preuve exploitable")

    for i, acte in enumerate(dossier.actes_ccam):
        if acte.code_ccam_suggestion and not _has_evidence(acte):
            add("VETO-02", "HARD", f"actes_ccam[{i}]", f"Acte {acte.code_ccam_suggestion} sans preuve exploitable")

    # -------------------------------------------------
    # VETO-03: negation / conditional IN THE EVIDENCE
    # (never in the model's reasoning text)
    # -------------------------------------------------
    if dp and dp.cim10_suggestion:
        excerpts = _evidence_excerpts(dp)
        neg, cond, contra, pos = _analyze_neg_cond(excerpts, dp.texte or dp.cim10_suggestion)
        if neg and not pos:
            add("VETO-03", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} contredit par la preuve (négation)")
        elif contra:
            add("VETO-03", "MEDIUM", "diagnostic_principal", f"DP {dp.cim10_suggestion} preuves contradictoires (positif vs négatif)")
        elif cond and dp.cim10_confidence == "high":
            add("VETO-03", "MEDIUM", "diagnostic_principal", f"DP {dp.cim10_suggestion} basé sur du conditionnel")

    for i, das in enumerate(dossier.diagnostics_associes):
        if _is_ruled_out(das):
            continue
        if not das.cim10_suggestion:
            continue
        excerpts = _evidence_excerpts(das)
        neg, cond, contra, pos = _analyze_neg_cond(excerpts, das.texte or das.cim10_suggestion)
        where = f"diagnostics_associes[{i}]"
        if neg and not pos:
            # For CPAM audits an explicit negation is blocking, especially
            # when the model reported "high" confidence.
            severity = "HARD" if das.cim10_confidence == "high" else "MEDIUM"
            add("VETO-03", severity, where, f"DAS {das.cim10_suggestion} contredit par la preuve (négation)")
        elif contra:
            add("VETO-03", "MEDIUM", where, f"DAS {das.cim10_suggestion} preuves contradictoires")
        elif cond and das.cim10_confidence == "high":
            add("VETO-03", "LOW", where, f"DAS {das.cim10_suggestion} potentiellement conditionnel")

    # -------------------------------------------------
    # VETO-15: evidence is a "score/test" (high risk of over-coding)
    # -------------------------------------------------
    for i, das in enumerate(dossier.diagnostics_associes):
        if _is_ruled_out(das):
            continue
        if not das.cim10_suggestion:
            continue
        excerpts = _evidence_excerpts(das)
        if _has_template_evidence(excerpts) and ("fibrose" in _norm(das.texte or "") or str(das.cim10_suggestion).startswith("K74")):
            add("VETO-15", "MEDIUM", f"diagnostics_associes[{i}]", f"{das.cim10_suggestion}: preuve issue d'un score/test (à confirmer par diagnostic explicite)")

    # -------------------------------------------------
    # VETO-16: label→code mismatch (heuristic)
    # -------------------------------------------------
    for i, das in enumerate(dossier.diagnostics_associes):
        if _is_ruled_out(das):
            continue
        if not das.cim10_suggestion:
            continue
        label_n = _norm(das.texte or "")
        if "sacroili" in label_n and str(das.cim10_suggestion) == "M53.3":
            add("VETO-16", "MEDIUM", f"diagnostics_associes[{i}]", "Sacro-iliite : M53.3 semble hors-sujet (à revalider via candidats, ex. M46.1)")

    # -------------------------------------------------
    # VETO-06: DP duplicated as a DAS (incoherent)
    # -------------------------------------------------
    if dp and dp.cim10_suggestion:
        dp_code = dp.cim10_suggestion
        for i, das in enumerate(dossier.diagnostics_associes):
            if _is_ruled_out(das):
                continue
            if das.cim10_suggestion == dp_code:
                add("VETO-06", "HARD", "diagnostics_associes", f"Code DP {dp_code} dupliqué dans les DAS (index {i})")
                break

    # -------------------------------------------------
    # VETO-07: duplicate DAS codes (should be merged)
    # -------------------------------------------------
    seen: dict[str, int] = {}
    for i, das in enumerate(dossier.diagnostics_associes):
        if _is_ruled_out(das):
            continue
        c = das.cim10_suggestion
        if not c:
            continue
        if c in seen:
            add("VETO-07", "MEDIUM", "diagnostics_associes", f"Doublon DAS {c} (index {seen[c]} et {i})")
        else:
            seen[c] = i

    # -------------------------------------------------
    # VETO-09: simple biology contradiction (platelets / creatinine)
    # -------------------------------------------------
    # Platelets: a thrombocytopenia code (D69*) with a normal value.
    plaquettes = _get_bio_value(dossier.biologie_cle, ("plaquette", "platelet"))
    if plaquettes is not None:
        # Deliberately generous threshold to avoid false positives.
        if dp and dp.cim10_suggestion and dp.cim10_suggestion.startswith("D69") and plaquettes >= 150:
            add("VETO-09", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} incompatible avec plaquettes={plaquettes} (sans preuve explicite)")
        for i, das in enumerate(dossier.diagnostics_associes):
            if _is_ruled_out(das):
                continue
            if das.cim10_suggestion and das.cim10_suggestion.startswith("D69") and plaquettes >= 150:
                # Escalate to HARD when the evidence explicitly negates the
                # diagnosis or the model is highly confident (max CPAM risk).
                excerpts = _evidence_excerpts(das)
                neg, _, _, _ = _analyze_neg_cond(excerpts, das.texte or das.cim10_suggestion)
                severity = "HARD" if (das.cim10_confidence == "high" or neg) else "MEDIUM"
                add("VETO-09", severity, f"diagnostics_associes[{i}]", f"DAS {das.cim10_suggestion} incompatible avec plaquettes={plaquettes}")

    creat = _get_bio_value(dossier.biologie_cle, ("créat", "creat", "creatin"))
    if creat is not None:
        # Ultra-cautious: never hard-fail on renal failure, just warn.
        for i, das in enumerate(dossier.diagnostics_associes):
            if das.cim10_suggestion and das.cim10_suggestion.startswith(("N17", "N18", "N19")) and creat < 110 and das.cim10_confidence == "high":
                add("VETO-09", "LOW", f"diagnostics_associes[{i}]", f"IR {das.cim10_suggestion} à confirmer (créat={creat})")

    # -------------------------------------------------
    # VETO-12: over-confidence (high confidence, no evidence)
    # -------------------------------------------------
    def _overconf(d: Diagnostic | ActeCCAM) -> bool:
        conf = getattr(d, "cim10_confidence", None) or getattr(d, "ccam_confidence", None)
        return conf == "high" and not _has_evidence(d)

    if dp and dp.cim10_suggestion and _overconf(dp):
        add("VETO-12", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} en high sans preuve")

    # -------------------------------------------------
    # Post-processing: when a HARD veto already exists for a given 'where',
    # drop weaker redundant vetoes on the same location to reduce noise.
    # Example: thrombocytopenia (VETO-09 HARD) makes VETO-03 secondary.
    # -------------------------------------------------
    hard_where = {it.where for it in issues if it.severity == "HARD"}
    if hard_where:
        issues = [
            it for it in issues
            if not (it.where in hard_where and it.severity in ("LOW", "MEDIUM") and it.veto in ("VETO-03", "VETO-15"))
        ]

    # -----------------------------
    # Verdict + contestability score
    # -----------------------------
    hard = any(i.severity == "HARD" for i in issues)
    medium = any(i.severity == "MEDIUM" for i in issues)

    if hard:
        verdict = "FAIL"
    elif medium:
        verdict = "NEED_INFO"
    else:
        verdict = "PASS"

    # Start from 100 and subtract per issue: HARD -30, MEDIUM -10, other -3.
    score = 100
    for it in issues:
        if it.severity == "HARD":
            score -= 30
        elif it.severity == "MEDIUM":
            score -= 10
        else:
            score -= 3
    score = max(0, min(100, score))

    return VetoReport(verdict=verdict, score_contestabilite=score, issues=issues)
|
||||
@@ -8,7 +8,7 @@ import re
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from flask import Flask, abort, render_template, request, jsonify
|
||||
from flask import Flask, Response, abort, render_template, request, jsonify
|
||||
from markupsafe import Markup
|
||||
|
||||
from werkzeug.utils import secure_filename
|
||||
@@ -16,7 +16,8 @@ from werkzeug.utils import secure_filename
|
||||
from collections import Counter
|
||||
|
||||
from ..config import (
|
||||
ANONYMIZED_DIR, STRUCTURED_DIR, OLLAMA_URL, CCAM_DICT_PATH, DossierMedical,
|
||||
ANONYMIZED_DIR, STRUCTURED_DIR, INPUT_DIR, REPORTS_DIR,
|
||||
OLLAMA_URL, CCAM_DICT_PATH, DossierMedical,
|
||||
ALLOWED_EXTENSIONS, UPLOAD_MAX_SIZE_MB,
|
||||
CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CIM10_DICT_PATH, CIM10_SUPPLEMENTS_PATH,
|
||||
)
|
||||
@@ -463,7 +464,11 @@ def create_app() -> Flask:
|
||||
@app.route("/admin/models", methods=["GET"])
|
||||
def list_models():
|
||||
models = fetch_ollama_models()
|
||||
return jsonify({"models": models, "current": cfg.OLLAMA_MODEL})
|
||||
return jsonify({
|
||||
"models": models,
|
||||
"current": cfg.OLLAMA_MODEL,
|
||||
"roles": dict(cfg.OLLAMA_MODELS),
|
||||
})
|
||||
|
||||
@app.route("/admin/models", methods=["POST"])
|
||||
def set_model():
|
||||
@@ -471,8 +476,15 @@ def create_app() -> Flask:
|
||||
new_model = data.get("model", "").strip()
|
||||
if not new_model:
|
||||
return jsonify({"error": "Champ 'model' requis"}), 400
|
||||
role = data.get("role", "").strip()
|
||||
if role:
|
||||
if role not in cfg.OLLAMA_MODELS:
|
||||
return jsonify({"error": f"Rôle inconnu : {role}"}), 400
|
||||
cfg.OLLAMA_MODELS[role] = new_model
|
||||
logger.info("Modèle Ollama pour rôle '%s' changé : %s", role, new_model)
|
||||
return jsonify({"ok": True, "role": role, "model": new_model, "roles": dict(cfg.OLLAMA_MODELS)})
|
||||
cfg.OLLAMA_MODEL = new_model
|
||||
logger.info("Modèle Ollama changé : %s", new_model)
|
||||
logger.info("Modèle Ollama global changé : %s", new_model)
|
||||
return jsonify({"ok": True, "model": cfg.OLLAMA_MODEL})
|
||||
|
||||
@app.route("/reprocess/<path:filepath>", methods=["POST"])
|
||||
@@ -615,6 +627,44 @@ def create_app() -> Flask:
|
||||
logger.warning("Impossible de lire %s", txt_path)
|
||||
return jsonify(result)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# API PDF caviardé
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@app.route("/api/pdf/<path:dossier_id>/<filename>")
def serve_redacted_pdf(dossier_id: str, filename: str):
    """Serve a PDF with personal data blacked out (redaction rectangles).

    Optional query params:
      - highlight: text to highlight in yellow
      - page: 1-indexed page number to scope the highlighting

    Returns 403 on a path-traversal attempt, 404 when the target is
    missing or not a PDF.
    """
    from .pdf_redactor import load_entities_from_report, redact_pdf, highlight_text

    # Path-traversal guard, step 1: the dossier directory must stay
    # inside INPUT_DIR.
    safe_dir = (INPUT_DIR / dossier_id).resolve()
    if not safe_dir.is_relative_to(INPUT_DIR.resolve()):
        abort(403)

    # Path-traversal guard, step 2: the resolved PDF path must stay inside
    # the dossier directory (protects against e.g. filename == "..").
    pdf_path = (safe_dir / filename).resolve()
    if not pdf_path.is_relative_to(safe_dir):
        abort(403)
    if not pdf_path.exists() or pdf_path.suffix.lower() != ".pdf":
        abort(404)

    # Load the entities to redact from the anonymization report
    # (empty set when no report exists → PDF served un-redacted).
    stem = Path(filename).stem.replace(" ", "_")
    report_path = REPORTS_DIR / dossier_id / f"{stem}_report.json"
    entities = load_entities_from_report(report_path) if report_path.exists() else set()

    pdf_bytes = redact_pdf(pdf_path, entities)

    # Optional highlighting, applied on the already-redacted bytes.
    highlight = request.args.get("highlight", "")
    page_num = request.args.get("page", type=int)
    if highlight:
        pdf_bytes = highlight_text(pdf_bytes, highlight, page_num)

    return Response(pdf_bytes, mimetype="application/pdf")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Routes admin référentiels
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
154
src/viewer/pdf_redactor.py
Normal file
154
src/viewer/pdf_redactor.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Caviardage PDF à la volée — remplace les entités NER par des rectangles noirs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Simple in-process cache for redacted PDFs.
# Key: (pdf_path as str, frozenset of entity strings to redact)
# Value: (redacted PDF bytes, insertion timestamp from time.time())
_pdf_cache: dict[tuple[str, frozenset[str]], tuple[bytes, float]] = {}
# Entries older than this are considered stale (pruned in redact_pdf).
_CACHE_TTL_S = 300  # 5 minutes
|
||||
|
||||
|
||||
def load_entities_from_report(report_path: Path) -> set[str]:
    """Collect the unique entity strings to redact from an anonymization report.

    Pseudonyms (strings starting with "[") and strings shorter than two
    characters are skipped so placeholders and noise are never redacted.
    """
    report = json.loads(report_path.read_text(encoding="utf-8"))
    originals = (item.get("original", "") for item in report.get("entities_found", []))
    return {
        original
        for original in originals
        if len(original) >= 2 and not original.startswith("[")
    }
|
||||
|
||||
|
||||
def redact_pdf(pdf_path: Path, entities: set[str]) -> bytes:
    """Open a PDF, black out every occurrence of ``entities``, return the bytes.

    Results are memoized in ``_pdf_cache`` for ``_CACHE_TTL_S`` seconds,
    keyed by (path, entity set).
    """
    key = (str(pdf_path), frozenset(entities))

    # Serve a fresh cached copy when available.
    cached = _pdf_cache.get(key)
    if cached is not None:
        payload, stored_at = cached
        if time.time() - stored_at < _CACHE_TTL_S:
            return payload

    document = fitz.open(str(pdf_path))
    try:
        for page in document:
            for needle in entities:
                for rect in page.search_for(needle):
                    page.add_redact_annot(rect, fill=(0, 0, 0))
            page.apply_redactions()
        redacted = document.tobytes()
    finally:
        document.close()

    # Store the fresh result, then drop expired entries so the cache
    # cannot grow without bound.
    now = time.time()
    _pdf_cache[key] = (redacted, now)
    stale = [k for k, (_, ts) in _pdf_cache.items() if now - ts >= _CACHE_TTL_S]
    for k in stale:
        _pdf_cache.pop(k, None)

    return redacted
|
||||
|
||||
|
||||
def _strip_accents(s: str) -> str:
|
||||
"""Retire les accents d'une chaîne (é→e, è→e, etc.)."""
|
||||
nfkd = unicodedata.normalize("NFD", s)
|
||||
return "".join(c for c in nfkd if unicodedata.category(c) != "Mn")
|
||||
|
||||
|
||||
def _add_highlight(page, rects) -> None:
    """Attach a yellow highlight annotation to each rectangle on ``page``."""
    yellow = (1, 0.95, 0)
    for area in rects:
        mark = page.add_highlight_annot(area)
        mark.set_colors(stroke=yellow)
        mark.update()
|
||||
|
||||
|
||||
def highlight_text(pdf_bytes: bytes, text: str, page_num: int | None = None) -> bytes:
    """Add yellow highlight annotations over occurrences of ``text`` in the PDF.

    Applied after redaction (operates on the already-redacted bytes).
    If ``page_num`` is given (1-indexed), only that page is searched.

    ``text`` is typically the short, single-line diagnosis/item label —
    not the raw multi-line excerpt, which is noisy.

    Three matching strategies are tried in order:
      1. exact match of the whitespace-normalized text;
      2. accent-insensitive match (the label may lack accents, e.g.
         "renale", while the PDF has "rénale");
      3. line-by-line match when the cleaned text is multi-line.

    Returns the (possibly annotated) PDF bytes; the input is returned
    unchanged when the text is too short to search for.
    """
    if not text or len(text) < 3:
        return pdf_bytes

    # Strip the "..." ellipses added by extract_excerpt().
    clean = text.strip()
    if clean.startswith("..."):
        clean = clean[3:]
    if clean.endswith("..."):
        clean = clean[:-3]
    clean = clean.strip()
    if len(clean) < 3:
        return pdf_bytes

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        pages = [doc[page_num - 1]] if page_num and 0 < page_num <= len(doc) else list(doc)

        single_line = " ".join(clean.split())
        found = False

        # Strategy 1: exact text.
        for page in pages:
            rects = page.search_for(single_line)
            if rects:
                _add_highlight(page, rects)
                found = True
                break

        # Strategy 2: accent-insensitive fallback.
        if not found:
            # Hoisted out of the page loop: the search needle is invariant.
            search_stripped = _strip_accents(single_line)
            for page in pages:
                page_text = page.get_text()
                page_text_stripped = _strip_accents(page_text)
                idx = page_text_stripped.lower().find(search_stripped.lower())
                if idx >= 0:
                    # Recover the accented original at the matched position.
                    # NOTE(review): this assumes accent-stripping preserves
                    # character offsets (true for NFC text, where each accented
                    # char decomposes to base + combining marks) — TODO confirm
                    # for PDFs whose extracted text is already decomposed (NFD).
                    original_match = page_text[idx:idx + len(search_stripped)]
                    rects = page.search_for(original_match)
                    if rects:
                        _add_highlight(page, rects)
                        found = True
                        break

        # Strategy 3: multi-line text — try each sufficiently long line.
        if not found and "\n" in clean:
            for line in clean.split("\n"):
                line = line.strip()
                if len(line) >= 10:
                    for page in pages:
                        rects = page.search_for(line)
                        if rects:
                            _add_highlight(page, rects)
                            found = True
                            break
                    if found:
                        break

        return doc.tobytes()
    finally:
        doc.close()
|
||||
@@ -137,7 +137,12 @@ class ReferentielManager:
|
||||
|
||||
from ..medical.rag_index import chunk_user_file, add_chunks_to_index
|
||||
|
||||
doc_name = f"ref:{ref['filename']}"
|
||||
# Heuristique simple : si le fichier ressemble à une procédure/méthodo,
|
||||
# on l'isole pour éviter qu'il n'influence le codage.
|
||||
fname = (ref.get("filename") or "").lower()
|
||||
is_proc = any(k in fname for k in ("guide", "methodo", "méthodo", "procedure", "procédure", "pmsi", "atlh", "atih", "cpam"))
|
||||
prefix = "proc" if is_proc else "ref"
|
||||
doc_name = f"{prefix}:{ref['filename']}"
|
||||
chunks = chunk_user_file(file_path, doc_name)
|
||||
|
||||
if not chunks:
|
||||
|
||||
@@ -263,13 +263,21 @@
|
||||
#source-modal-inner {
|
||||
background: #fff;
|
||||
border-radius: 12px;
|
||||
max-width: 900px;
|
||||
max-width: 95vw;
|
||||
width: 95vw;
|
||||
margin: 0 auto;
|
||||
max-height: 90vh;
|
||||
max-height: 95vh;
|
||||
height: 95vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
box-shadow: 0 8px 30px rgba(0,0,0,0.2);
|
||||
}
|
||||
#source-modal-inner.source-modal-text {
|
||||
max-width: 900px;
|
||||
width: auto;
|
||||
max-height: 90vh;
|
||||
height: auto;
|
||||
}
|
||||
#source-header {
|
||||
padding: 1rem 1.25rem;
|
||||
border-bottom: 1px solid #e2e8f0;
|
||||
@@ -290,6 +298,11 @@
|
||||
word-break: break-word;
|
||||
color: #334155;
|
||||
}
|
||||
#source-content.source-content-pdf {
|
||||
padding: 0;
|
||||
white-space: normal;
|
||||
overflow: hidden;
|
||||
}
|
||||
#source-content mark {
|
||||
background: #fef08a;
|
||||
padding: 2px 0;
|
||||
@@ -306,6 +319,22 @@
|
||||
font-weight: 600;
|
||||
}
|
||||
#source-close-btn:hover { background: #475569; }
|
||||
|
||||
/* PDF file picker buttons */
|
||||
.src-file-btn {
|
||||
display: inline-block;
|
||||
padding: 0.35rem 0.75rem;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #cbd5e1;
|
||||
background: #f8fafc;
|
||||
color: #1e293b;
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: all 0.15s;
|
||||
}
|
||||
.src-file-btn:hover { background: #e2e8f0; border-color: #3b82f6; }
|
||||
.src-file-btn.active { background: #3b82f6; color: #fff; border-color: #3b82f6; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
@@ -289,7 +289,7 @@
|
||||
<h3>Diagnostic principal</h3>
|
||||
<div style="font-size:0.95rem;margin-bottom:0.5rem;">
|
||||
{{ dp.texte }}
|
||||
{% if dp.source_page %}<button class="src-btn" data-excerpt="{{ dp.source_excerpt|default('',true)|e }}" data-page="{{ dp.source_page }}">p.{{ dp.source_page }}</button>{% endif %}
|
||||
{% if dp.source_page %}<button class="src-btn" data-texte="{{ dp.texte|e }}" data-excerpt="{{ dp.source_excerpt|default('',true)|e }}" data-page="{{ dp.source_page }}">p.{{ dp.source_page }}</button>{% endif %}
|
||||
</div>
|
||||
{% if dp.cim10_suggestion %}
|
||||
<span class="badge" style="background:#dbeafe;color:#1d4ed8;font-size:0.85rem;">{{ dp.cim10_suggestion }}</span>
|
||||
@@ -358,7 +358,7 @@
|
||||
<span class="badge" style="background:#e0e7ff;color:#3730a3;font-size:0.7rem;">{{ das.source }}</span>
|
||||
{% endif %}
|
||||
{% if das.source_page %}
|
||||
<button class="src-btn" data-excerpt="{{ das.source_excerpt|default('',true)|e }}" data-page="{{ das.source_page }}">p.{{ das.source_page }}</button>
|
||||
<button class="src-btn" data-texte="{{ das.texte|e }}" data-excerpt="{{ das.source_excerpt|default('',true)|e }}" data-page="{{ das.source_page }}">p.{{ das.source_page }}</button>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td style="font-size:0.8rem;color:#475569;">
|
||||
@@ -430,7 +430,7 @@
|
||||
<div style="font-size:0.7rem;color:#dc2626;">{{ alerte }}</div>
|
||||
{% endfor %}
|
||||
</td>
|
||||
<td>{% if a.source_page %}<button class="src-btn" data-excerpt="{{ a.source_excerpt|default('',true)|e }}" data-page="{{ a.source_page }}">p.{{ a.source_page }}</button>{% endif %}</td>
|
||||
<td>{% if a.source_page %}<button class="src-btn" data-texte="{{ a.texte|e }}" data-excerpt="{{ a.source_excerpt|default('',true)|e }}" data-page="{{ a.source_page }}">p.{{ a.source_page }}</button>{% endif %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
@@ -450,7 +450,7 @@
|
||||
<td>{{ b.test }}</td>
|
||||
<td>{{ b.valeur or '' }}</td>
|
||||
<td>{% if b.anomalie %}<span class="badge" style="background:#fee2e2;color:#dc2626;">Oui</span>{% else %}—{% endif %}</td>
|
||||
<td>{% if b.source_page %}<button class="src-btn" data-excerpt="{{ b.source_excerpt|default('',true)|e }}" data-page="{{ b.source_page }}">p.{{ b.source_page }}</button>{% endif %}</td>
|
||||
<td>{% if b.source_page %}<button class="src-btn" data-texte="{{ b.test|e }}" data-excerpt="{{ b.source_excerpt|default('',true)|e }}" data-page="{{ b.source_page }}">p.{{ b.source_page }}</button>{% endif %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
@@ -466,7 +466,7 @@
|
||||
<div style="margin-bottom:0.5rem;">
|
||||
<strong>{{ img.type }}</strong>
|
||||
{% if img.score %} — Score : {{ img.score }}{% endif %}
|
||||
{% if img.source_page %}<button class="src-btn" data-excerpt="{{ img.source_excerpt|default('',true)|e }}" data-page="{{ img.source_page }}">p.{{ img.source_page }}</button>{% endif %}
|
||||
{% if img.source_page %}<button class="src-btn" data-texte="{{ img.type|e }}" data-excerpt="{{ img.source_excerpt|default('',true)|e }}" data-page="{{ img.source_page }}">p.{{ img.source_page }}</button>{% endif %}
|
||||
{% if img.conclusion %}
|
||||
<div style="font-size:0.85rem;color:#475569;">{{ img.conclusion }}</div>
|
||||
{% endif %}
|
||||
@@ -487,7 +487,7 @@
|
||||
<td>{{ t.medicament }}</td>
|
||||
<td>{{ t.posologie or '' }}</td>
|
||||
<td>{% if t.code_atc %}<span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ t.code_atc }}</span>{% endif %}</td>
|
||||
<td>{% if t.source_page %}<button class="src-btn" data-excerpt="{{ t.source_excerpt|default('',true)|e }}" data-page="{{ t.source_page }}">p.{{ t.source_page }}</button>{% endif %}</td>
|
||||
<td>{% if t.source_page %}<button class="src-btn" data-texte="{{ t.medicament|e }}" data-excerpt="{{ t.source_excerpt|default('',true)|e }}" data-page="{{ t.source_page }}">p.{{ t.source_page }}</button>{% endif %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
@@ -501,7 +501,7 @@
|
||||
<h3>Antécédents ({{ dossier.antecedents|length }})</h3>
|
||||
<ul class="bullet">
|
||||
{% for a in dossier.antecedents %}
|
||||
<li>{{ a.texte }}{% if a.source_page %} <button class="src-btn" data-excerpt="{{ a.source_excerpt|default('',true)|e }}" data-page="{{ a.source_page }}">p.{{ a.source_page }}</button>{% endif %}</li>
|
||||
<li>{{ a.texte }}{% if a.source_page %} <button class="src-btn" data-texte="{{ a.texte|e }}" data-excerpt="{{ a.source_excerpt|default('',true)|e }}" data-page="{{ a.source_page }}">p.{{ a.source_page }}</button>{% endif %}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
@@ -513,7 +513,7 @@
|
||||
<h3>Complications ({{ dossier.complications|length }})</h3>
|
||||
<ul class="bullet">
|
||||
{% for c in dossier.complications %}
|
||||
<li>{{ c.texte }}{% if c.source_page %} <button class="src-btn" data-excerpt="{{ c.source_excerpt|default('',true)|e }}" data-page="{{ c.source_page }}">p.{{ c.source_page }}</button>{% endif %}</li>
|
||||
<li>{{ c.texte }}{% if c.source_page %} <button class="src-btn" data-texte="{{ c.texte|e }}" data-excerpt="{{ c.source_excerpt|default('',true)|e }}" data-page="{{ c.source_page }}">p.{{ c.source_page }}</button>{% endif %}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
@@ -536,36 +536,109 @@
|
||||
<script>
|
||||
/* --- Source modal --- */
|
||||
let _sourceCache = null;
|
||||
|
||||
function getDossierId() {
|
||||
// filepath = "103_23056749/103_23056749_fusionne_cim10.json"
|
||||
// dossier_id = "103_23056749"
|
||||
const _dossierId = (function() {
|
||||
const fp = {{ filepath|tojson }};
|
||||
const parts = fp.split('/');
|
||||
return parts.length > 1 ? parts.slice(0, -1).join('/') : '';
|
||||
}
|
||||
})();
|
||||
const _sourceFiles = {{ dossier.source_files|tojson }};
|
||||
|
||||
function getDossierId() { return _dossierId; }
|
||||
|
||||
async function loadSourceTexts() {
|
||||
if (_sourceCache !== null) return _sourceCache;
|
||||
const dossierId = getDossierId();
|
||||
if (!dossierId) { _sourceCache = {}; return _sourceCache; }
|
||||
if (!_dossierId) { _sourceCache = {}; return _sourceCache; }
|
||||
try {
|
||||
const resp = await fetch('/api/source-text/' + dossierId);
|
||||
const resp = await fetch('/api/source-text/' + _dossierId);
|
||||
if (resp.ok) { _sourceCache = await resp.json(); }
|
||||
else { _sourceCache = {}; }
|
||||
} catch (e) { _sourceCache = {}; }
|
||||
return _sourceCache;
|
||||
}
|
||||
|
||||
async function showSource(excerpt, page) {
|
||||
/* Teste si le PDF caviardé est disponible (HEAD request) */
|
||||
async function pdfAvailable(dossierId, filename) {
|
||||
try {
|
||||
const resp = await fetch('/api/pdf/' + dossierId + '/' + encodeURIComponent(filename), {method: 'HEAD'});
|
||||
return resp.ok;
|
||||
} catch (e) { return false; }
|
||||
}
|
||||
|
||||
/* Construit l'URL du PDF avec highlight + page */
|
||||
function buildPdfUrl(dossierId, filename, page, excerpt) {
|
||||
let url = '/api/pdf/' + dossierId + '/' + encodeURIComponent(filename);
|
||||
const params = [];
|
||||
if (excerpt) params.push('highlight=' + encodeURIComponent(excerpt));
|
||||
if (page) params.push('page=' + page);
|
||||
if (params.length) url += '?' + params.join('&');
|
||||
url += '#page=' + (page || 1);
|
||||
return url;
|
||||
}
|
||||
|
||||
/* Affiche un PDF dans l'iframe */
|
||||
function loadPdf(dossierId, filename, page, excerpt) {
|
||||
const content = document.getElementById('source-content');
|
||||
const url = buildPdfUrl(dossierId, filename, page, excerpt);
|
||||
content.className = 'source-content-pdf';
|
||||
content.innerHTML = '<iframe src="' + url + '" style="width:100%;height:100%;border:none;"></iframe>';
|
||||
// Marquer le bouton actif
|
||||
document.querySelectorAll('.src-file-btn').forEach(b => b.classList.remove('active'));
|
||||
document.querySelectorAll('.src-file-btn').forEach(b => {
|
||||
if (b.textContent === filename) b.classList.add('active');
|
||||
});
|
||||
}
|
||||
|
||||
/* Affiche le modal source — PDF caviardé si disponible, sinon fallback texte */
|
||||
async function showSource(excerpt, page, texte) {
|
||||
// Pour le surlignage PDF, on utilise le texte du diagnostic (pas l'excerpt brut)
|
||||
const highlightText = texte || excerpt;
|
||||
const modal = document.getElementById('source-modal');
|
||||
const modalInner = document.getElementById('source-modal-inner');
|
||||
const content = document.getElementById('source-content');
|
||||
const title = document.getElementById('source-title');
|
||||
|
||||
title.textContent = 'Document source — Page ' + page;
|
||||
content.innerHTML = '<em style="color:#94a3b8;">Chargement...</em>';
|
||||
content.className = '';
|
||||
modalInner.className = '';
|
||||
modal.style.display = 'block';
|
||||
|
||||
// Essayer le mode PDF
|
||||
if (_sourceFiles && _sourceFiles.length > 0 && _dossierId) {
|
||||
const firstFile = _sourceFiles[0];
|
||||
const available = await pdfAvailable(_dossierId, firstFile);
|
||||
if (available) {
|
||||
modalInner.className = '';
|
||||
if (_sourceFiles.length === 1) {
|
||||
loadPdf(_dossierId, firstFile, page, highlightText);
|
||||
} else {
|
||||
// Multi-PDF : boutons de sélection + iframe
|
||||
const safeHighlight = (highlightText || '').replace(/\\/g, '\\\\').replace(/'/g, "\\'");
|
||||
let html = '<div style="padding:0.5rem 0.75rem;border-bottom:1px solid #e2e8f0;display:flex;gap:0.5rem;flex-wrap:wrap;">';
|
||||
_sourceFiles.forEach(function(f) {
|
||||
const safeF = f.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
|
||||
html += '<button class="src-file-btn" onclick="loadPdf(\'' + _dossierId + '\', \'' + safeF + '\', ' + page + ', \'' + safeHighlight + '\')">' + f + '</button>';
|
||||
});
|
||||
html += '</div>';
|
||||
html += '<iframe id="pdf-frame" style="width:100%;flex:1;border:none;"></iframe>';
|
||||
content.className = 'source-content-pdf';
|
||||
content.style.display = 'flex';
|
||||
content.style.flexDirection = 'column';
|
||||
content.innerHTML = html;
|
||||
// Charger le premier PDF
|
||||
const iframe = content.querySelector('iframe');
|
||||
iframe.src = buildPdfUrl(_dossierId, firstFile, page, highlightText);
|
||||
content.querySelector('.src-file-btn').classList.add('active');
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback : mode texte (ancien comportement)
|
||||
modalInner.className = 'source-modal-text';
|
||||
content.className = '';
|
||||
content.style.display = '';
|
||||
|
||||
const texts = await loadSourceTexts();
|
||||
const allText = Object.values(texts).join('\n\n--- ---\n\n');
|
||||
|
||||
@@ -583,7 +656,6 @@ async function showSource(excerpt, page) {
|
||||
// Chercher l'extrait dans le texte et le surligner
|
||||
if (searchText.length > 10) {
|
||||
let idx = allText.indexOf(searchText);
|
||||
// Fallback : chercher un morceau central (résiste mieux à l'anonymisation)
|
||||
if (idx < 0 && searchText.length > 60) {
|
||||
const mid = Math.floor(searchText.length / 2);
|
||||
searchText = searchText.substring(mid - 30, mid + 30);
|
||||
@@ -600,7 +672,6 @@ async function showSource(excerpt, page) {
|
||||
mark.id = 'source-highlight';
|
||||
content.appendChild(mark);
|
||||
content.appendChild(document.createTextNode(after));
|
||||
// Scroll vers le surlignage
|
||||
setTimeout(() => {
|
||||
const el = document.getElementById('source-highlight');
|
||||
if (el) el.scrollIntoView({ behavior: 'smooth', block: 'center' });
|
||||
@@ -609,11 +680,15 @@ async function showSource(excerpt, page) {
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback : afficher le texte brut sans surlignage
|
||||
content.textContent = allText;
|
||||
}
|
||||
|
||||
function closeSource() {
|
||||
const content = document.getElementById('source-content');
|
||||
// Détruire l'iframe pour stopper le chargement PDF
|
||||
content.innerHTML = '';
|
||||
content.style.display = '';
|
||||
content.className = '';
|
||||
document.getElementById('source-modal').style.display = 'none';
|
||||
}
|
||||
|
||||
@@ -631,7 +706,7 @@ document.addEventListener('keydown', function(e) {
|
||||
document.addEventListener('click', function(e) {
|
||||
const btn = e.target.closest('.src-btn');
|
||||
if (btn && btn.dataset.page) {
|
||||
showSource(btn.dataset.excerpt || '', parseInt(btn.dataset.page));
|
||||
showSource(btn.dataset.excerpt || '', parseInt(btn.dataset.page), btn.dataset.texte || '');
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -407,7 +407,7 @@ class TestGenerateResponse:
|
||||
]
|
||||
call_count = {"n": 0}
|
||||
|
||||
def ollama_side_effect(prompt, temperature=0.1, max_tokens=4000):
|
||||
def ollama_side_effect(prompt, temperature=0.1, max_tokens=4000, **kwargs):
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
return {"comprehension_contestation": "Extraction...", "elements_cliniques_pertinents": [], "points_accord_potentiels": [], "codes_en_jeu": {}}
|
||||
@@ -448,7 +448,7 @@ class TestGenerateResponse:
|
||||
mock_ollama.return_value = None
|
||||
call_count = {"n": 0}
|
||||
|
||||
def anthropic_side_effect(prompt, temperature=0.1, max_tokens=4000):
|
||||
def anthropic_side_effect(prompt, temperature=0.1, max_tokens=4000, **kwargs):
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
return {"comprehension_contestation": "Extraction Haiku...", "elements_cliniques_pertinents": [], "points_accord_potentiels": [], "codes_en_jeu": {}}
|
||||
@@ -1155,7 +1155,7 @@ class TestExtractionPass:
|
||||
"""L'orchestrateur appelle extraction + argumentation + validation."""
|
||||
call_count = {"n": 0}
|
||||
|
||||
def ollama_side_effect(prompt, temperature=0.1, max_tokens=4000):
|
||||
def ollama_side_effect(prompt, temperature=0.1, max_tokens=4000, **kwargs):
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
return {
|
||||
@@ -1249,7 +1249,7 @@ class TestValidateAdversarial:
|
||||
"""Incohérences détectées → avertissements dans le texte formaté."""
|
||||
call_count = {"n": 0}
|
||||
|
||||
def ollama_side_effect(prompt, temperature=0.1, max_tokens=4000):
|
||||
def ollama_side_effect(prompt, temperature=0.1, max_tokens=4000, **kwargs):
|
||||
call_count["n"] += 1
|
||||
if call_count["n"] == 1:
|
||||
return {"comprehension_contestation": "Extraction", "elements_cliniques_pertinents": [], "points_accord_potentiels": [], "codes_en_jeu": {}}
|
||||
|
||||
@@ -49,15 +49,14 @@ class TestOllamaCache:
|
||||
cache.save()
|
||||
assert not path.exists()
|
||||
|
||||
def test_model_change_invalidates(self, tmp_path):
|
||||
path = tmp_path / "cache.json"
|
||||
cache = OllamaCache(path, "gemma3:12b")
|
||||
def test_model_change_returns_none(self, tmp_path):
|
||||
"""Entrées d'un autre modèle retournent None (pas d'invalidation globale)."""
|
||||
cache = OllamaCache(tmp_path / "cache.json", "gemma3:12b")
|
||||
cache.put("HTA", "das", {"code": "I10"})
|
||||
cache.save()
|
||||
|
||||
cache2 = OllamaCache(path, "llama3:8b")
|
||||
assert cache2.get("HTA", "das") is None
|
||||
assert len(cache2) == 0
|
||||
# Même cache, modèle différent → miss
|
||||
assert cache.get("HTA", "das", model="llama3:8b") is None
|
||||
# Modèle original → hit
|
||||
assert cache.get("HTA", "das") == {"code": "I10"}
|
||||
|
||||
def test_corrupted_file(self, tmp_path):
|
||||
path = tmp_path / "cache.json"
|
||||
@@ -95,14 +94,75 @@ class TestOllamaCache:
|
||||
assert not errors
|
||||
assert len(cache) == 20
|
||||
|
||||
def test_json_format(self, tmp_path):
|
||||
"""Le fichier JSON contient le modèle et les entrées."""
|
||||
def test_json_format_new(self, tmp_path):
|
||||
"""Le nouveau format stocke le modèle PAR ENTRÉE (pas global)."""
|
||||
path = tmp_path / "cache.json"
|
||||
cache = OllamaCache(path, "gemma3:12b")
|
||||
cache.put("HTA", "das", {"code": "I10"})
|
||||
cache.save()
|
||||
|
||||
raw = json.loads(path.read_text(encoding="utf-8"))
|
||||
assert raw["model"] == "gemma3:12b"
|
||||
assert "entries" in raw
|
||||
assert len(raw["entries"]) == 1
|
||||
assert "model" not in raw # plus de model global
|
||||
# Chaque entrée contient model + result
|
||||
entry = list(raw["entries"].values())[0]
|
||||
assert entry["model"] == "gemma3:12b"
|
||||
assert entry["result"] == {"code": "I10"}
|
||||
|
||||
def test_migration_old_format(self, tmp_path):
|
||||
"""Ancien format (model global) migré automatiquement."""
|
||||
path = tmp_path / "cache.json"
|
||||
# Écrire un cache ancien format
|
||||
old_data = {
|
||||
"model": "gemma3:12b",
|
||||
"entries": {
|
||||
"das::hta": {"code": "I10", "confidence": "high"},
|
||||
},
|
||||
}
|
||||
path.write_text(json.dumps(old_data), encoding="utf-8")
|
||||
|
||||
cache = OllamaCache(path, "gemma3:12b")
|
||||
# L'entrée doit être accessible
|
||||
assert cache.get("HTA", "das") == {"code": "I10", "confidence": "high"}
|
||||
assert len(cache) == 1
|
||||
|
||||
# Sauvegarder et vérifier le nouveau format
|
||||
cache.save()
|
||||
raw = json.loads(path.read_text(encoding="utf-8"))
|
||||
assert "model" not in raw
|
||||
entry = raw["entries"]["das::hta"]
|
||||
assert entry["model"] == "gemma3:12b"
|
||||
assert entry["result"]["code"] == "I10"
|
||||
|
||||
def test_migration_old_format_different_model(self, tmp_path):
|
||||
"""Migration ancien format : les entrées sont bien taggées avec l'ancien modèle."""
|
||||
path = tmp_path / "cache.json"
|
||||
old_data = {
|
||||
"model": "old-model",
|
||||
"entries": {
|
||||
"das::hta": {"code": "I10"},
|
||||
},
|
||||
}
|
||||
path.write_text(json.dumps(old_data), encoding="utf-8")
|
||||
|
||||
# Charger avec un modèle différent
|
||||
cache = OllamaCache(path, "new-model")
|
||||
# L'entrée est taggée "old-model" → miss avec "new-model"
|
||||
assert cache.get("HTA", "das") is None
|
||||
# Mais accessible avec l'ancien modèle
|
||||
assert cache.get("HTA", "das", model="old-model") == {"code": "I10"}
|
||||
|
||||
def test_put_with_explicit_model(self, tmp_path):
    """put(model=...) tags the entry with that model instead of the default."""
    cache = OllamaCache(tmp_path / "cache.json", "default-model")
    cache.put("HTA", "das", {"code": "I10"}, model="explicit-model")
    # Lookup without model= falls back to the default model → miss.
    assert cache.get("HTA", "das") is None
    # Lookup with the matching explicit model → hit.
    assert cache.get("HTA", "das", model="explicit-model") == {"code": "I10"}
|
||||
|
||||
def test_get_returns_none_if_model_mismatch(self, tmp_path):
    """get() yields None when the stored model differs from the requested one."""
    cache = OllamaCache(tmp_path / "cache.json", "gemma3:12b")
    cache.put("HTA", "das", {"code": "I10"})

    # Requesting under another model name must not surface the entry.
    assert cache.get("HTA", "das", model="llama3:8b") is None
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
"""Tests pour le viewer Flask."""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from src.viewer.app import create_app, compute_group_stats, severity_badge, format_duration, format_cpam_text
|
||||
from src.viewer.pdf_redactor import load_entities_from_report, redact_pdf, highlight_text
|
||||
from src.config import DossierMedical, Diagnostic, ActeCCAM
|
||||
|
||||
|
||||
@@ -155,3 +159,141 @@ class TestSourceTextEndpoint:
|
||||
"""Path traversal bloqué."""
|
||||
response = client.get("/api/source-text/../../etc")
|
||||
assert response.status_code in (403, 404)
|
||||
|
||||
|
||||
class TestPdfRedactorUnit:
    def test_load_entities_from_report(self, tmp_path):
        """Entities are loaded from a JSON report; short and pseudonym entries are skipped."""
        report_payload = {
            "source_file": "test.pdf",
            "entities_found": [
                {"original": "Jean Dupont", "replacement": "[NOM_1]", "source": "ner", "category": "person"},
                {"original": "12345678901", "replacement": "[RPPS_1]", "source": "regex", "category": "rpps"},
                # Too short to be a real entity.
                {"original": "A", "replacement": "[X]", "source": "ner", "category": "person"},
                # Already a pseudonym placeholder.
                {"original": "[NOM_1]", "replacement": "[NOM_1]", "source": "ner", "category": "person"},
            ],
        }
        json_path = tmp_path / "test_report.json"
        json_path.write_text(json.dumps(report_payload), encoding="utf-8")

        loaded = load_entities_from_report(json_path)

        assert "Jean Dupont" in loaded
        assert "12345678901" in loaded
        assert "A" not in loaded  # filtered out: too short
        assert "[NOM_1]" not in loaded  # filtered out: pseudonym

    def test_redact_pdf_produces_bytes(self, tmp_path):
        """redact_pdf returns valid PDF bytes with the target text removed."""
        import fitz

        # Build a one-page source PDF containing the name to redact.
        source = fitz.open()
        source.new_page().insert_text((72, 72), "Jean Dupont est le patient.", fontsize=12)
        pdf_file = tmp_path / "test.pdf"
        source.save(str(pdf_file))
        source.close()

        redacted = redact_pdf(pdf_file, {"Jean Dupont"})

        assert isinstance(redacted, bytes)
        assert len(redacted) > 0
        # The result starts with the PDF magic header.
        assert redacted[:5] == b"%PDF-"

        # The redacted name must no longer appear in the extracted text.
        check = fitz.open(stream=redacted, filetype="pdf")
        extracted = check[0].get_text()
        check.close()
        assert "Jean Dupont" not in extracted

    def test_highlight_text_adds_annotation(self, tmp_path):
        """highlight_text attaches a highlight annotation to the page."""
        import fitz

        src = fitz.open()
        src.new_page().insert_text((72, 72), "CRP elevee a 180 mg/L", fontsize=12)
        original_bytes = src.tobytes()
        src.close()

        highlighted = highlight_text(original_bytes, "CRP elevee", page_num=1)

        assert isinstance(highlighted, bytes)
        # Highlighting must change the document bytes.
        assert highlighted != original_bytes

        # At least one annotation must now exist on the first page.
        check = fitz.open(stream=highlighted, filetype="pdf")
        total_annots = sum(1 for _ in check[0].annots())
        check.close()
        assert total_annots >= 1

    def test_highlight_text_empty_excerpt(self, tmp_path):
        """An empty excerpt leaves the PDF bytes untouched."""
        import fitz

        src = fitz.open()
        src.new_page()
        original_bytes = src.tobytes()
        src.close()

        assert highlight_text(original_bytes, "") == original_bytes

    def test_highlight_text_ellipsis_cleaned(self, tmp_path):
        """Leading/trailing '...' in the excerpt are cleaned before matching."""
        import fitz

        src = fitz.open()
        src.new_page().insert_text((72, 72), "Patient present une infection urinaire", fontsize=12)
        original_bytes = src.tobytes()
        src.close()

        highlighted = highlight_text(original_bytes, "...infection urinaire...", page_num=1)

        check = fitz.open(stream=highlighted, filetype="pdf")
        annotations = list(check[0].annots())
        check.close()
        assert len(annotations) >= 1

    def test_highlight_text_multiline_excerpt(self, tmp_path):
        """highlight_text works with a multi-line excerpt (real-world case)."""
        import fitz

        src = fitz.open()
        page = src.new_page()
        # Lay out several lines of text, mimicking a real report page.
        for y, line in (
            (72, "Motif d'hospitalisation: Lombofessalgie"),
            (92, "chez patiente suivie pour spondylarthrite"),
            (112, "Praticien hospitalier"),
            (132, "Antecedents medicaux importants"),
        ):
            page.insert_text((72, y), line, fontsize=12)
        original_bytes = src.tobytes()
        src.close()

        # Typical multi-line excerpt, as produced for real dossiers.
        excerpt = (
            "...Motif d'hospitalisation: Lombofessalgie\n"
            "chez patiente suivie pour spondylarthrite\n"
            "Praticien hospitalier\n"
            "Antecedents medicaux importants..."
        )
        highlighted = highlight_text(original_bytes, excerpt, page_num=1)
        assert highlighted != original_bytes

        check = fitz.open(stream=highlighted, filetype="pdf")
        annotation_total = sum(1 for _ in check[0].annots())
        check.close()
        assert annotation_total >= 1
|
||||
|
||||
|
||||
class TestPdfEndpoint:
    def test_pdf_404_nonexistent(self, client):
        """A missing PDF yields a 404."""
        resp = client.get("/api/pdf/nonexistent_dossier/nonexistent.pdf")
        assert resp.status_code == 404

    def test_pdf_security_path_traversal(self, client):
        """Path-traversal attempts are rejected."""
        resp = client.get("/api/pdf/../../etc/passwd.pdf")
        assert resp.status_code in (403, 404)

    def test_pdf_non_pdf_extension(self, client):
        """A file without a .pdf extension yields a 404."""
        resp = client.get("/api/pdf/some_dossier/file.txt")
        assert resp.status_code == 404
|
||||
|
||||
Reference in New Issue
Block a user