chore: add .gitignore
t2a_install_rag_cleanup/.env.example (Normal file, 22 lines)
@@ -0,0 +1,22 @@
# === PDF reference documents (absolute paths to the ATIH PDFs) ===
# T2A_CIM10_PDF=/path/to/cim-10-fr.pdf
# T2A_GUIDE_METHODO_PDF=/path/to/guide_methodo_mco.pdf
# T2A_CCAM_PDF=/path/to/ccam_descriptive.pdf

# === Ollama ===
# OLLAMA_URL=http://localhost:11434
# OLLAMA_MODEL=gemma3:12b
# OLLAMA_TIMEOUT=120
# OLLAMA_MAX_PARALLEL=2

# === AI models ===
# T2A_EMBEDDING_MODEL=dangvantuan/sentence-camembert-large
# T2A_NER_MODEL=Jean-Baptiste/camembert-ner
# T2A_NER_THRESHOLD=0.80

# === Establishment ===
# T2A_FINESS=000000000
# T2A_NUM_UM=0000

# === Anonymization ===
# T2A_KEEP_ESTABLISHMENT=True
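
These settings are read at import time by src/config.py (added later in this commit) through python-dotenv. A minimal sketch of the mechanism, using the same defaults that config.py declares:

import os
from dotenv import load_dotenv

load_dotenv()  # picks up a local .env file when present

# Unset variables fall back to the defaults hard-coded in src/config.py
ner_model = os.environ.get("T2A_NER_MODEL", "Jean-Baptiste/camembert-ner")
ner_threshold = float(os.environ.get("T2A_NER_THRESHOLD", "0.80"))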
t2a_install_rag_cleanup/.gitignore (vendored, Normal file, 23 lines)
@@ -0,0 +1,23 @@
# Python
.venv/
__pycache__/
*.pyc
*.egg-info/
.pytest_cache/
.hypothesis/

# Generated data
output/
input/
data/

# Reference documents (large, not versioned)
*.pdf
*.xls
*.xlsx

# Local configuration
.env

# IDE / tools
.claude/
t2a_install_rag_cleanup/requirements.txt (Normal file, 17 lines)
@@ -0,0 +1,17 @@
pdfplumber>=0.10.0
transformers>=4.35.0,<5.0.0
torch>=2.1.0
protobuf>=3.20.0,<4.0.0
regex>=2023.0
pydantic>=2.5.0
pytest>=7.4.0
sentencepiece>=0.1.99,<0.2.0
edsnlp[ml]>=0.17.0
faiss-cpu>=1.7.0
sentence-transformers>=2.2.0
requests>=2.28.0
flask>=3.0.0
python-dotenv>=1.0.0
openpyxl>=3.0.0
pandas>=2.0.0
PyMuPDF>=1.24.0
t2a_install_rag_cleanup/run.sh (Normal file, 45 lines)
@@ -0,0 +1,45 @@
#!/bin/bash

set -e

cd "$(dirname "$0")"

echo "🚀 Démarrage de l'application T2A..."

# Create the virtual environment if it does not exist yet
if [ ! -d ".venv" ]; then
    echo "📦 Création de l'environnement virtuel..."
    python3 -m venv .venv
fi

# Activate the virtual environment
echo "🔧 Activation de l'environnement virtuel..."
source .venv/bin/activate

# Install / update the dependencies when requirements.txt is newer
if [ ! -f ".venv/.deps_installed" ] || [ "requirements.txt" -nt ".venv/.deps_installed" ]; then
    echo "📥 Installation des dépendances..."
    pip install -q --upgrade pip
    pip install -q -r requirements.txt
    touch .venv/.deps_installed
else
    echo "✅ Dépendances déjà installées"
fi

# Create the required directories
mkdir -p input output/anonymized output/structured output/reports data/rag_index data/referentiels

echo ""
echo "✨ Application prête !"
echo ""
echo "📂 Répertoires :"
echo "   - input/  : Placez vos PDFs ici"
echo "   - output/ : Résultats du traitement"
echo ""
echo "🌐 Lancement du viewer sur http://localhost:5000"
echo ""
echo "   Appuyez sur Ctrl+C pour arrêter"
echo ""

# Launch the viewer
python3 -m src.viewer
t2a_install_rag_cleanup/scripts/benchmark_models.py (Normal file, 313 lines)
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""A/B benchmark: gemma3:12b (base) vs pmsi-coder-v2 (fine-tuned).

Compares the CIM-10 codes produced by the two models on N records.
Tests the DP plus a sample of DAS for each record.

Usage: python scripts/benchmark_models.py [--n 50] [--das-max 5]
"""

from __future__ import annotations

import json
import random
import sys
import time
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.config import STRUCTURED_DIR, OLLAMA_URL, DossierMedical
from src.medical.cim10_dict import load_dict, normalize_code, validate_code

import requests

MODEL_BASE = "gemma3:12b"
MODEL_FINETUNED = "pmsi-coder-v2"

PROMPT_TEMPLATE = """Tu es un médecin DIM expert en codage PMSI.
Code le diagnostic suivant en CIM-10. Choisis le code le plus spécifique possible.

DIAGNOSTIC : "{texte}"
TYPE : {type_diag}

{contexte}

Réponds UNIQUEMENT avec un objet JSON :
{{"code": "X99.9", "confidence": "high|medium|low", "justification": "explication courte"}}"""


def call_model(prompt: str, model: str, timeout: int = 120) -> tuple[dict | None, float]:
    """Calls an Ollama model and returns (result, duration_s)."""
    t0 = time.time()
    try:
        resp = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "format": "json",
                "options": {"temperature": 0.1, "num_predict": 500},
            },
            timeout=timeout,
        )
        resp.raise_for_status()
        raw = resp.json().get("response", "")
        duration = time.time() - t0
        try:
            return json.loads(raw), duration
        except json.JSONDecodeError:
            return None, duration
    except Exception:
        return None, time.time() - t0


def load_dossiers(n: int) -> list[dict]:
    """Loads N diverse merged records."""
    dossiers = []
    for subdir in sorted(STRUCTURED_DIR.iterdir()):
        if not subdir.is_dir():
            continue
        for f in subdir.glob("*fusionne*.json"):
            if ".gemma_" in f.name or ".bak" in f.name:
                continue
            try:
                data = json.loads(f.read_text(encoding="utf-8"))
                d = DossierMedical.model_validate(data)
                if d.diagnostic_principal and d.diagnostic_principal.cim10_suggestion:
                    dossiers.append({
                        "name": subdir.name,
                        "dossier": d,
                        "path": str(f),
                    })
            except Exception:
                continue
            # keep only the first valid merged file per directory
            break
    random.seed(42)
    random.shuffle(dossiers)
    return dossiers[:n]


def build_contexte(d: DossierMedical) -> str:
    """Builds a summarized clinical context."""
    parts = []
    s = d.sejour
    if s.age is not None:
        parts.append(f"Patient {s.sexe or '?'}, {s.age} ans")
    if s.duree_sejour is not None:
        parts.append(f"Durée séjour : {s.duree_sejour}j")
    if d.diagnostic_principal:
        parts.append(f"DP : {d.diagnostic_principal.texte}")
    bio = [f"{b.test}={b.valeur}" for b in d.biologie_cle[:5] if b.valeur]
    if bio:
        parts.append(f"Bio : {', '.join(bio)}")
    return "CONTEXTE : " + " | ".join(parts) if parts else ""


def code_match_level(code_a: str, code_b: str) -> str:
    """Returns the agreement level between two codes."""
    if code_a == code_b:
        return "exact"
    if code_a[:3] == code_b[:3]:
        return "categorie"
    return "diff"


def run_benchmark(n: int = 50, das_max: int = 5):
    print(f"=== Benchmark A/B : {MODEL_BASE} vs {MODEL_FINETUNED} ===")
    print(f"    Dossiers : {n}, DAS max/dossier : {das_max}\n")

    # Check that both models are available
    for model in [MODEL_BASE, MODEL_FINETUNED]:
        try:
            resp = requests.post(
                f"{OLLAMA_URL}/api/generate",
                json={"model": model, "prompt": "test", "stream": False,
                      "options": {"num_predict": 1}},
                timeout=60,
            )
            resp.raise_for_status()
            print(f"  {model} : OK")
        except Exception as e:
            print(f"  {model} : ERREUR — {e}")
            sys.exit(1)

    dossiers = load_dossiers(n)
    print(f"\nDossiers chargés : {len(dossiers)}\n")

    cim10 = load_dict()
    t_global_start = time.time()

    dp_results = []
    das_results = []

    for i, item in enumerate(dossiers, 1):
        d = item["dossier"]
        dp = d.diagnostic_principal
        name = item["name"]
        ctx = build_contexte(d)

        # === DP ===
        prompt_dp = PROMPT_TEMPLATE.format(
            texte=dp.texte,
            type_diag="DP (diagnostic principal)",
            contexte=ctx,
        )
        res_base, t_base = call_model(prompt_dp, MODEL_BASE)
        res_ft, t_ft = call_model(prompt_dp, MODEL_FINETUNED)

        code_base = normalize_code(res_base.get("code", "")) if res_base else "ERREUR"
        code_ft = normalize_code(res_ft.get("code", "")) if res_ft else "ERREUR"
        conf_base = res_base.get("confidence", "?") if res_base else "?"
        conf_ft = res_ft.get("confidence", "?") if res_ft else "?"
        valid_base = validate_code(code_base)[0] if code_base != "ERREUR" else False
        valid_ft = validate_code(code_ft)[0] if code_ft != "ERREUR" else False

        pipeline_code = dp.cim10_suggestion
        match_level = code_match_level(code_base, code_ft)

        dp_result = {
            "dossier": name,
            "texte": dp.texte[:80],
            "pipeline": pipeline_code,
            "base": code_base,
            "ft": code_ft,
            "conf_base": conf_base,
            "conf_ft": conf_ft,
            "valid_base": valid_base,
            "valid_ft": valid_ft,
            "match": match_level,
            "t_base": round(t_base, 2),
            "t_ft": round(t_ft, 2),
        }
        dp_results.append(dp_result)

        tag = {"exact": "=", "categorie": "~", "diff": "X"}[match_level]
        print(f"  [{i:2d}/{len(dossiers)}] {name:<20s} DP=\"{dp.texte[:35]:<35s}\" "
              f"base={code_base:<7s} ft={code_ft:<7s} [{tag}] "
              f"({t_base:.1f}s / {t_ft:.1f}s)")

        # === DAS (sample) ===
        das_list = [das for das in d.diagnostics_associes
                    if das.texte and das.cim10_suggestion]
        if len(das_list) > das_max:
            random.seed(hash(name))
            das_list = random.sample(das_list, das_max)

        for das in das_list:
            prompt_das = PROMPT_TEMPLATE.format(
                texte=das.texte,
                type_diag="DAS (diagnostic associé significatif)",
                contexte=ctx,
            )
            res_b, tb = call_model(prompt_das, MODEL_BASE)
            res_f, tf = call_model(prompt_das, MODEL_FINETUNED)

            cb = normalize_code(res_b.get("code", "")) if res_b else "ERREUR"
            cf = normalize_code(res_f.get("code", "")) if res_f else "ERREUR"
            vb = validate_code(cb)[0] if cb != "ERREUR" else False
            vf = validate_code(cf)[0] if cf != "ERREUR" else False

            das_results.append({
                "dossier": name,
                "texte": das.texte[:80],
                "pipeline": das.cim10_suggestion,
                "base": cb,
                "ft": cf,
                "conf_base": (res_b or {}).get("confidence", "?"),
                "conf_ft": (res_f or {}).get("confidence", "?"),
                "valid_base": vb,
                "valid_ft": vf,
                "match": code_match_level(cb, cf),
                "t_base": round(tb, 2),
                "t_ft": round(tf, 2),
            })

    t_global = time.time() - t_global_start

    # === SUMMARY ===
    print(f"\n{'='*75}")
    print(f"RÉSUMÉ — {len(dp_results)} dossiers, {len(das_results)} DAS testés")
    print(f"Durée totale : {t_global/60:.1f} min\n")

    for label, results in [("DP", dp_results), ("DAS", das_results)]:
        if not results:
            continue
        nt = len(results)
        n_exact = sum(1 for r in results if r["match"] == "exact")
        n_cat = sum(1 for r in results if r["match"] == "categorie")
        n_diff = sum(1 for r in results if r["match"] == "diff")
        n_vb = sum(1 for r in results if r["valid_base"])
        n_vf = sum(1 for r in results if r["valid_ft"])
        avg_tb = sum(r["t_base"] for r in results) / nt
        avg_tf = sum(r["t_ft"] for r in results) / nt

        # Confidence distribution
        conf_b = {}
        conf_f = {}
        for r in results:
            conf_b[r["conf_base"]] = conf_b.get(r["conf_base"], 0) + 1
            conf_f[r["conf_ft"]] = conf_f.get(r["conf_ft"], 0) + 1

        # Agreement with the pipeline codes (original gemma run)
        n_base_eq_pipe = sum(1 for r in results if r["base"] == r["pipeline"])
        n_ft_eq_pipe = sum(1 for r in results if r["ft"] == r["pipeline"])
        n_base_cat_pipe = sum(1 for r in results
                              if r["base"][:3] == r["pipeline"][:3])
        n_ft_cat_pipe = sum(1 for r in results
                            if r["ft"][:3] == r["pipeline"][:3])

        print(f"  --- {label} ({nt} diagnostics) ---")
        print(f"  Concordance base↔ft :")
        print(f"    Exact     : {n_exact}/{nt} ({100*n_exact/nt:.0f}%)")
        print(f"    Catégorie : {n_exact+n_cat}/{nt} ({100*(n_exact+n_cat)/nt:.0f}%)")
        print(f"    Différent : {n_diff}/{nt} ({100*n_diff/nt:.0f}%)")
        print(f"  Codes valides :")
        print(f"    base : {n_vb}/{nt} ({100*n_vb/nt:.0f}%)")
        print(f"    ft   : {n_vf}/{nt} ({100*n_vf/nt:.0f}%)")
        print(f"  vs pipeline (gemma original) :")
        print(f"    base=pipe : {n_base_eq_pipe}/{nt} exact, {n_base_cat_pipe}/{nt} catégorie")
        print(f"    ft=pipe   : {n_ft_eq_pipe}/{nt} exact, {n_ft_cat_pipe}/{nt} catégorie")
        print(f"  Temps moyen : base={avg_tb:.2f}s ft={avg_tf:.2f}s (Δ={100*(avg_tf-avg_tb)/avg_tb:+.0f}%)")
        print(f"  Confiance base : {conf_b}")
        print(f"  Confiance ft   : {conf_f}")
        print()

    # List the DP disagreements
    diffs_dp = [r for r in dp_results if r["match"] == "diff"]
    if diffs_dp:
        print(f"  Différences DP ({len(diffs_dp)}) :")
        for r in diffs_dp:
            vb = "✓" if r["valid_base"] else "✗"
            vf = "✓" if r["valid_ft"] else "✗"
            print(f"    {r['dossier']:<18s} \"{r['texte'][:40]}\"")
            print(f"      base={r['base']:<7s}{vb} ft={r['ft']:<7s}{vf} pipe={r['pipeline']}")

    # Save the detailed results
    out = {
        "meta": {
            "date": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "model_base": MODEL_BASE,
            "model_ft": MODEL_FINETUNED,
            "n_dossiers": len(dp_results),
            "n_das": len(das_results),
            "duration_min": round(t_global / 60, 1),
        },
        "dp": dp_results,
        "das": das_results,
    }
    out_path = Path(__file__).parent.parent / "output" / "benchmark_ab.json"
    out_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\nRésultats détaillés : {out_path}")


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--n", type=int, default=50,
                        help="Nombre de dossiers à tester")
    parser.add_argument("--das-max", type=int, default=5,
                        help="Max DAS testés par dossier")
    args = parser.parse_args()
    run_benchmark(args.n, args.das_max)
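
A minimal sketch of consuming the benchmark output written above (the keys mirror the `out` dict built in run_benchmark; the aggregation simply repeats the printed summary):

import json
from pathlib import Path

data = json.loads(Path("output/benchmark_ab.json").read_text(encoding="utf-8"))
for label in ("dp", "das"):
    results = data[label]
    if not results:
        continue
    n_exact = sum(1 for r in results if r["match"] == "exact")
    print(f"{label}: {n_exact}/{len(results)} exact base/ft agreement")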
t2a_install_rag_cleanup/scripts/select_validation_dossiers.py (Normal file, 231 lines)
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""Selects 50 records for the DIM validation gold standard.

- 25 CPAM records (complex cases, already audited)
- 25 non-CPAM records stratified by CMD, DP confidence, and number of DAS

Creates data/gold_standard/_selection.json and initializes empty annotations.
"""

from __future__ import annotations

import json
import random
import sys
from pathlib import Path

# Add the project root to sys.path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.config import STRUCTURED_DIR, BASE_DIR, DossierMedical

GOLD_DIR = BASE_DIR / "data" / "gold_standard"
TARGET_TOTAL = 50
TARGET_CPAM = 25


def load_all_dossiers() -> list[dict]:
    """Loads all merged records from output/structured/."""
    dossiers = []
    for subdir in sorted(STRUCTURED_DIR.iterdir()):
        if not subdir.is_dir():
            continue
        # Look for the merged file
        fusionne = None
        for f in subdir.glob("*fusionne*.json"):
            fusionne = f
            break
        if not fusionne:
            # Otherwise take the first JSON in the directory
            jsons = sorted(subdir.glob("*.json"))
            if jsons:
                fusionne = jsons[0]
        if not fusionne:
            continue

        try:
            data = json.loads(fusionne.read_text(encoding="utf-8"))
            dossier = DossierMedical.model_validate(data)
            rel_path = str(fusionne.relative_to(STRUCTURED_DIR))
            group_name = subdir.name
            dossiers.append({
                "dossier_id": f"{group_name}/{fusionne.stem}",
                "group_name": group_name,
                "path_rel": rel_path,
                "dossier": dossier,
            })
        except Exception as e:
            print(f"  Erreur chargement {fusionne.name}: {e}")
    return dossiers


def select_dossiers(all_dossiers: list[dict]) -> list[dict]:
    """Selects the 50 records according to the defined strategy."""
    # Split CPAM / non-CPAM
    cpam = [d for d in all_dossiers if d["dossier"].controles_cpam]
    non_cpam = [d for d in all_dossiers if not d["dossier"].controles_cpam]

    print(f"Dossiers CPAM disponibles : {len(cpam)}")
    print(f"Dossiers non-CPAM disponibles : {len(non_cpam)}")

    # Take all CPAM records (capped at TARGET_CPAM)
    selected_cpam = cpam[:TARGET_CPAM]
    remaining_target = TARGET_TOTAL - len(selected_cpam)

    # Stratify the non-CPAM records
    selected_non_cpam = stratified_sample(non_cpam, remaining_target)

    selected = selected_cpam + selected_non_cpam
    print(f"\nSélection finale : {len(selected)} dossiers")
    print(f"  - CPAM : {len(selected_cpam)}")
    print(f"  - Non-CPAM : {len(selected_non_cpam)}")

    return selected


def stratified_sample(dossiers: list[dict], n: int) -> list[dict]:
    """Stratified sampling by CMD, DP confidence, and number of DAS."""
    if len(dossiers) <= n:
        return dossiers

    # Group by CMD
    by_cmd: dict[str, list[dict]] = {}
    for d in dossiers:
        ghm = d["dossier"].ghm_estimation
        cmd = ghm.cmd if ghm else "inconnu"
        by_cmd.setdefault(cmd or "inconnu", []).append(d)

    selected = []
    seen_ids = set()

    # Phase 1: one record per CMD (maximum diversity)
    cmds = sorted(by_cmd.keys())
    random.seed(42)  # reproducible
    for cmd in cmds:
        if len(selected) >= n:
            break
        candidates = by_cmd[cmd]
        # Prefer a mix of confidence levels
        random.shuffle(candidates)
        d = candidates[0]
        selected.append(d)
        seen_ids.add(d["dossier_id"])

    # Phase 2: top up, favoring DP-confidence diversity
    if len(selected) < n:
        remaining = [d for d in dossiers if d["dossier_id"] not in seen_ids]
        # Sort by DP confidence (low > medium > high, to over-represent hard cases)
        conf_order = {"low": 0, "medium": 1, "high": 2, None: 3}
        remaining.sort(key=lambda d: (
            conf_order.get(
                d["dossier"].diagnostic_principal.cim10_confidence
                if d["dossier"].diagnostic_principal else None,
                3
            ),
            -len(d["dossier"].diagnostics_associes),  # records with many DAS first
        ))
        for d in remaining:
            if len(selected) >= n:
                break
            selected.append(d)

    return selected[:n]


def create_empty_annotation(dossier_id: str, dossier: DossierMedical) -> dict:
    """Creates an empty annotation for a record."""
    dp = dossier.diagnostic_principal
    das_list = []
    for i, das in enumerate(dossier.diagnostics_associes):
        das_list.append({
            "index": i,
            "texte_original": das.texte,
            "code_pipeline": das.cim10_suggestion or "",
            "confidence": das.cim10_confidence or "",
            "source": das.source or "",
            "statut": "correct",
            "code_corrige": None,
            "commentaire": "",
        })

    return {
        "dossier_id": dossier_id,
        "validateur": "",
        "date_validation": "",
        "statut": "non_commence",
        "dp": {
            "texte_original": dp.texte if dp else "",
            "code_pipeline": dp.cim10_suggestion if dp else "",
            "confidence": dp.cim10_confidence if dp else "",
            "statut": "correct",
            "code_corrige": None,
            "commentaire": "",
        },
        "das": das_list,
        "das_ajoutes": [],
        "commentaire_general": "",
    }


def main():
    print("=== Sélection des dossiers pour validation DIM ===\n")

    all_dossiers = load_all_dossiers()
    print(f"Total dossiers chargés : {len(all_dossiers)}\n")

    if not all_dossiers:
        print("Aucun dossier trouvé dans output/structured/")
        sys.exit(1)

    selected = select_dossiers(all_dossiers)

    # Create the gold standard directory
    GOLD_DIR.mkdir(parents=True, exist_ok=True)

    # Save the selection
    selection = {
        "date_selection": __import__("datetime").datetime.now().isoformat(timespec="seconds"),
        "total": len(selected),
        "cpam": sum(1 for d in selected if d["dossier"].controles_cpam),
        "non_cpam": sum(1 for d in selected if not d["dossier"].controles_cpam),
        "dossiers": [d["dossier_id"] for d in selected],
    }
    selection_path = GOLD_DIR / "_selection.json"
    selection_path.write_text(
        json.dumps(selection, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\nSélection sauvegardée : {selection_path}")

    # Initialize the empty annotations
    created = 0
    for d in selected:
        dossier_id = d["dossier_id"]
        safe_name = dossier_id.replace("/", "__") + ".json"
        annot_path = GOLD_DIR / safe_name
        if not annot_path.exists():
            annotation = create_empty_annotation(dossier_id, d["dossier"])
            annot_path.write_text(
                json.dumps(annotation, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            created += 1

    print(f"Annotations vides créées : {created}")
    print(f"Annotations existantes préservées : {len(selected) - created}")

    # Summary
    print(f"\n--- Résumé ---")
    for i, d in enumerate(selected, 1):
        dos = d["dossier"]
        dp_code = dos.diagnostic_principal.cim10_suggestion if dos.diagnostic_principal else "?"
        dp_conf = (dos.diagnostic_principal.cim10_confidence or "?") if dos.diagnostic_principal else "?"
        n_das = len(dos.diagnostics_associes)
        cpam_flag = " [CPAM]" if dos.controles_cpam else ""
        ghm = dos.ghm_estimation
        cmd = ghm.cmd if ghm else "?"
        print(f"  {i:2d}. {d['group_name']:<20s} DP={dp_code:<6s} conf={dp_conf:<7s} DAS={n_das:2d} CMD={cmd}{cpam_flag}")


if __name__ == "__main__":
    main()
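
A sketch of how a validator might fill one of the generated annotation files (field names match create_empty_annotation above; the reviewer name, file name, and the "termine" status value are invented for illustration, since this script only defines "non_commence" and "correct"):

import json
from pathlib import Path

# hypothetical annotation file produced by this script
annot_path = Path("data/gold_standard/dossier_x__sejour_fusionne.json")
annot = json.loads(annot_path.read_text(encoding="utf-8"))
annot["validateur"] = "Dr Exemple"  # invented reviewer name
annot["statut"] = "termine"         # assumed status value, not defined in this script
annot["dp"]["statut"] = "correct"   # DP confirmed as coded by the pipeline
annot_path.write_text(json.dumps(annot, ensure_ascii=False, indent=2), encoding="utf-8")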
t2a_install_rag_cleanup/src/__init__.py (Normal file, 0 lines)
t2a_install_rag_cleanup/src/anonymization/anonymizer.py (Normal file, 529 lines)
@@ -0,0 +1,529 @@
"""Three-phase anonymization pipeline: regex → NER → final sweep."""

from __future__ import annotations

import logging
import re
from typing import Any

import regex as regex_mod

from ..config import KEEP_ESTABLISHMENT_NAME, AnonymizationReport
from . import regex_patterns as patterns
from .entity_registry import EntityRegistry
from .ner_anonymizer import extract_person_entities

logger = logging.getLogger(__name__)

# Medical terms that must not be anonymized even when they look like names
MEDICAL_TERMS_WHITELIST = {
    "balthazar", "sris", "ras", "atg", "pca", "bcy", "bcr",
    "nac", "nacl", "asat", "alat", "ggt", "pal", "crp", "imc",
    "en", "pa", "fc", "vvp", "ide", "iao", "mco", "urg", "bh",
    "kt", "vbp", "iv", "ap", "am", "ok", "apres", "sous",
    "normal", "normaux", "stable", "absent", "absente",
    "date", "heure", "type", "note", "etat", "code",
    "orale", "intraveineuse", "signé", "arrêté", "réalisé",
    # Frequent medical terms that must never be anonymized
    "cholécystectomie", "cholecystectomie", "cholangiographie",
    "pancréatite", "pancreatite", "lithiase", "lithiases",
    "cœlioscopie", "coelioscopie", "cholédoque", "choledoque",
    "angiocholite", "cholécystite", "cholecystite",
    "morphine", "paracétamol", "paracetamol", "cétirizine", "cetirizine",
    "tramadol", "contramal", "acupan", "nefopam",
    "service", "médecin", "medecin", "docteur", "chirurgie",
    "gastro", "entérologie", "enterologie", "oncologie",
    "hépato", "hepato", "digestif", "digestive",
    "proctologue", "nutritive", "pôle", "pole",
    "fonct", "fonctionnelle", "fonctionnelles",
    "praticiens", "hospitaliers", "interne", "clinique",
    "desc", "chef",
    "secrétariat", "infirmier", "infirmière",
    "unité", "hospitalisation", "urgences",
    "coordonnateur", "fédération", "federation",
    "navarre", "institut", "cancérologie",
    "bordeaux", "strasbourg", "reims", "limoges", "clermont", "ferrand",
    "palais",
}

# Establishment names to preserve when configured
ESTABLISHMENT_NAMES = {
    "centre hospitalier cote basque",
    "centre hospitalier côte basque",
    "ch-cotebasque",
    "icance",
}


class Anonymizer:
    """Three-phase anonymizer for medical documents."""

    def __init__(self, parsed_data: dict | None = None):
        self.registry = EntityRegistry(whitelist=MEDICAL_TERMS_WHITELIST)
        self.report = AnonymizationReport(source_file="")
        self._parsed = parsed_data or {}

        # Pre-register the entities already known from parsing
        self._register_parsed_entities()

    def anonymize(self, text: str) -> str:
        """Runs the three anonymization phases."""
        text = self._phase1_regex(text)
        text = self._phase2_ner(text)
        text = self._phase3_sweep(text)

        self.report.total_replacements = (
            self.report.regex_replacements
            + self.report.ner_replacements
            + self.report.sweep_replacements
        )
        return text

    # --- Phase 1: regex ---

    def _phase1_regex(self, text: str) -> str:
        """Anonymization driven by regex patterns."""
        count = 0

        # Combined CRH footer (IPP + episode on the same line)
        text, n = self._replace_crh_footer_ipp_episode(text)
        count += n

        # Identifiers
        text, n = self._replace_pattern(
            text, patterns.IPP_PATTERN, "ipp",
            group_handler=self._handle_multi_group,
        )
        count += n

        text, n = self._replace_pattern(
            text, patterns.EPISODE_PATTERN, "episode",
            group_handler=self._handle_multi_group,
        )
        count += n

        text, n = self._replace_pattern(text, patterns.NIR_PATTERN, "nir")
        count += n

        text, n = self._replace_pattern(text, patterns.FINESS_PATTERN, "finess")
        count += n

        text, n = self._replace_pattern(text, patterns.RPPS_PATTERN, "rpps")
        count += n

        text, n = self._replace_pattern(text, patterns.BARCODE_PATTERN, "code_barre")
        count += n

        text, n = self._replace_pattern(text, patterns.BARCODE_REPEAT_PATTERN, "code_barre")
        count += n

        # Contact details
        text, n = self._replace_phone(text)
        count += n

        text, n = self._replace_pattern(
            text, patterns.EMAIL_PATTERN, "email",
            skip_establishment_check=True,
        )
        count += n

        text, n = self._replace_fax(text)
        count += n

        # Addresses
        text, n = self._replace_addresses(text)
        count += n

        # Scan inline address patterns (MAISON xxx, QUARTIER xxx...)
        text, n = self._replace_inline_addresses(text)
        count += n

        # Birth dates
        text, n = self._replace_date_naissance(text)
        count += n

        # Birth place
        text, n = self._replace_pattern(
            text, patterns.LIEU_NAISSANCE_PATTERN, "lieu_naissance",
        )
        count += n

        # Structured names
        text, n = self._replace_structured_names(text)
        count += n

        # Footers (Trackare and CRH)
        text, n = self._replace_footer(text)
        count += n

        self.report.regex_replacements = count
        return text

    # --- Phase 2: NER ---

    def _phase2_ner(self, text: str) -> str:
        """Anonymization via CamemBERT NER."""
        try:
            ner_entities = extract_person_entities(text)
        except Exception as e:
            logger.warning("NER indisponible (%s), phase 2 ignorée.", e)
            return text

        count = 0
        # Sort by decreasing position so replacements run from the end backwards
        ner_entities.sort(key=lambda e: e["start"], reverse=True)

        for ent in ner_entities:
            word = ent["word"]
            if self._is_whitelisted(word):
                continue
            if self._is_establishment(word):
                continue

            # Skip if already anonymized (contains brackets)
            if "[" in word and "]" in word:
                continue

            pseudo = self.registry.get_replacement(word)
            if pseudo is None:
                pseudo = self.registry.register(word, "personne")

            text = text[:ent["start"]] + pseudo + text[ent["end"]:]
            count += 1

            self.report.entities_found.append({
                "original": word,
                "replacement": pseudo,
                "source": "ner",
                "score": ent["score"],
            })

        self.report.ner_replacements = count
        return text

    # --- Phase 3: final sweep ---

    def _phase3_sweep(self, text: str) -> str:
        """Brute-force sweep over the remaining known entities."""
        count = 0
        all_entities = self.registry.get_all_entities()

        for original, replacement in sorted(
            all_entities.items(), key=lambda x: len(x[0]), reverse=True
        ):
            if len(original) < 3:
                continue
            if self._is_whitelisted(original):
                continue

            # Case-insensitive search with word boundaries
            escaped = re.escape(original)
            pattern = re.compile(r"\b" + escaped + r"\b", re.IGNORECASE)
            matches = pattern.findall(text)
            if matches:
                text = pattern.sub(replacement, text)
                count += len(matches)

        self.report.sweep_replacements = count
        return text

    # --- Helpers ---

    def _register_parsed_entities(self) -> None:
        """Pre-registers the entities extracted by the parsers."""
        patient = self._parsed.get("patient", {})

        # Patient names
        for key in ("nom_prenom", "nom_naissance", "nom_complet"):
            if patient.get(key):
                self.registry.register(patient[key], "patient")

        # Patient address: register the full address and each significant word
        if patient.get("adresse"):
            self._register_address(patient["adresse"])
        if patient.get("ville"):
            self.registry.register(patient["ville"], "adresse")
        if patient.get("code_postal"):
            cp = patient["code_postal"]
            if patient.get("ville"):
                self.registry.register(f"{cp} {patient['ville']}", "adresse")
        if patient.get("lieu_naissance"):
            self.registry.register(patient["lieu_naissance"], "lieu_naissance")

        # Physicians
        for med in self._parsed.get("medecins", []):
            self.registry.register(med, "medecin")

        # The raw text is not available here; address lines missed by the
        # parser are picked up by the phase 1 patterns instead.

        # Contacts
        for contact in self._parsed.get("contacts", []):
            # Extract names from the contact strings
            names = re.findall(
                r"([A-ZÉÈÊËÀÂa-zéèêëàâ]{2,}(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ]{2,})+)",
                contact,
            )
            for name in names:
                if not self._is_whitelisted(name):
                    self.registry.register(name, "contact")

    def _replace_pattern(
        self,
        text: str,
        pattern: regex_mod.Pattern,
        category: str,
        group_handler: Any = None,
        skip_establishment_check: bool = False,
    ) -> tuple[str, int]:
        """Replaces the matches of a pattern."""
        count = 0
        for m in reversed(list(pattern.finditer(text))):
            if group_handler:
                matched_text = group_handler(m)
            else:
                matched_text = m.group(1) if m.lastindex else m.group(0)

            if not matched_text:
                continue

            if not skip_establishment_check and self._is_establishment(matched_text):
                continue

            pseudo = self.registry.register(matched_text, category)

            # Find the right span to replace
            if group_handler:
                # For multi-group patterns, find which group matched
                for i in range(1, (m.lastindex or 0) + 1):
                    if m.group(i) == matched_text:
                        start, end = m.span(i)
                        break
                else:
                    start, end = m.span()
            elif m.lastindex:
                start, end = m.span(1)
            else:
                start, end = m.span()

            text = text[:start] + pseudo + text[end:]
            count += 1

            self.report.entities_found.append({
                "original": matched_text,
                "replacement": pseudo,
                "source": "regex",
                "category": category,
            })

        return text, count

    def _handle_multi_group(self, m: regex_mod.Match) -> str | None:
        """Handles patterns with several alternative groups."""
        for i in range(1, (m.lastindex or 0) + 1):
            if m.group(i):
                return m.group(i)
        return None

    def _replace_crh_footer_ipp_episode(self, text: str) -> tuple[str, int]:
        """Replaces IPP/episode numbers in CRH footers (combined format)."""
        count = 0
        for m in reversed(list(patterns.CRH_FOOTER_IPP_EPISODE.finditer(text))):
            ipp = m.group(1)
            episode = m.group(2)
            pseudo_ipp = self.registry.register(ipp, "ipp")
            pseudo_ep = self.registry.register(episode, "episode")
            replacement = f"IPP {pseudo_ipp} / N° Episode {pseudo_ep}"
            text = text[:m.start()] + replacement + text[m.end():]
            count += 2
        return text, count

    def _replace_phone(self, text: str) -> tuple[str, int]:
        """Replaces phone numbers."""
        count = 0
        for m in reversed(list(patterns.PHONE_PATTERN.finditer(text))):
            phone = m.group(0)
            # Keep the hospital switchboard number when configured
            normalized = phone.replace(".", " ").replace("-", " ")
            if KEEP_ESTABLISHMENT_NAME and "05 59 44 35 35" in normalized:
                continue
            pseudo = self.registry.register(phone, "telephone")
            text = text[:m.start()] + pseudo + text[m.end():]
            count += 1
        return text, count

    def _replace_fax(self, text: str) -> tuple[str, int]:
        """Replaces fax numbers."""
        count = 0
        for m in reversed(list(patterns.FAX_PATTERN.finditer(text))):
            fax_num = m.group(1)
            pseudo = self.registry.register(fax_num, "telephone")
            text = text[:m.start(1)] + pseudo + text[m.end(1):]
            count += 1
        return text, count

    def _replace_addresses(self, text: str) -> tuple[str, int]:
        """Replaces addresses."""
        count = 0

        # Address lines
        for m in reversed(list(patterns.ADDRESS_LINE_PATTERN.finditer(text))):
            addr = m.group(1).strip()
            if len(addr) > 5 and not self._is_establishment(addr):
                pseudo = self.registry.register(addr, "adresse")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Postal code + city (except the hospital / Bayonne)
        for m in reversed(list(patterns.CP_VILLE_PATTERN.finditer(text))):
            ville = m.group(2).strip()
            cp = m.group(1)
            full = f"{cp} {ville}"
            if self._is_establishment(full) or "BAYONNE" in ville.upper():
                if not KEEP_ESTABLISHMENT_NAME:
                    pseudo = self.registry.register(full, "adresse")
                    text = text[:m.start()] + pseudo + text[m.end():]
                    count += 1
            else:
                pseudo = self.registry.register(full, "adresse")
                text = text[:m.start()] + pseudo + text[m.end():]
                count += 1

        return text, count

    def _replace_inline_addresses(self, text: str) -> tuple[str, int]:
        """Catches inline addresses (MAISON xxx, QUARTIER xxx, LOTISSEMENT xxx)."""
        count = 0
        # Pattern: MAISON/QUARTIER/LOTISSEMENT followed by words (proper place names)
        inline_addr = re.compile(
            r"((?:MAISON|QUARTIER|LOTISSEMENT|RESIDENCE|HAMEAU)\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s]+?)(?=\n|$|Dr|\d{5}|Chef|médical|coordonnateur)",
            re.IGNORECASE,
        )
        for m in reversed(list(inline_addr.finditer(text))):
            addr = m.group(1).strip()
            if len(addr) > 5:
                self._register_address(addr)
                pseudo = self.registry.register(addr, "adresse")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1
        return text, count

    def _replace_date_naissance(self, text: str) -> tuple[str, int]:
        """Replaces birth dates."""
        count = 0
        for m in reversed(list(patterns.DATE_NAISSANCE_PATTERN.finditer(text))):
            date_str = m.group(1)
            pseudo = self.registry.register(date_str, "date_naissance")
            text = text[:m.start(1)] + pseudo + text[m.end(1):]
            count += 1
        return text, count

    def _replace_structured_names(self, text: str) -> tuple[str, int]:
        """Replaces names detected by structural patterns."""
        count = 0

        # CRH patient footer: "Patient(e) : NOM PRENOM Né(e)"
        for m in reversed(list(patterns.CRH_FOOTER_PATIENT_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "patient")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Patient names
        for pat in [patterns.PATIENT_NAME_PATTERN, patterns.CIVILITE_NAME_PATTERN]:
            for m in reversed(list(pat.finditer(text))):
                name = m.group(1).strip()
                if len(name) >= 3 and not self._is_whitelisted(name):
                    pseudo = self.registry.register(name, "patient")
                    text = text[:m.start(1)] + pseudo + text[m.end(1):]
                    count += 1

        # Doctor names (all patterns)
        for pat in [patterns.DR_NAME_PATTERN, patterns.MEDECIN_COURANT_PATTERN,
                    patterns.MEDECIN_TRAITANT_PATTERN, patterns.MEDECIN_PEC_PATTERN]:
            for m in reversed(list(pat.finditer(text))):
                name = m.group(1).strip()
                if len(name) >= 3 and not self._is_whitelisted(name):
                    pseudo = self.registry.register(name, "medecin")
                    text = text[:m.start(1)] + pseudo + text[m.end(1):]
                    count += 1

        # Note authors (with date suffix)
        for m in reversed(list(patterns.NOTE_AUTHOR_DATE_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Note authors ("Prénom NOM" pattern, without a date)
        for m in reversed(list(patterns.NOTE_AUTHOR_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # IAO (triage nurse)
        for m in reversed(list(patterns.IAO_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # "Rédigé par" (written by)
        for m in reversed(list(patterns.REDIGE_PAR_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Staff names from the header
        for m in reversed(list(patterns.STAFF_NAME_PATTERN.finditer(text))):
            name = m.group(1).strip() if m.group(1) else ""
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        self.report.regex_replacements += count
        return text, count

    def _replace_footer(self, text: str) -> tuple[str, int]:
        """Replaces patient details in footers (Trackare and CRH)."""
        count = 0
        for m in reversed(list(patterns.FOOTER_PATIENT_PATTERN.finditer(text))):
            name = m.group(1).strip()
            pseudo = self.registry.register(name, "patient")
            text = text[:m.start(1)] + pseudo + text[m.end(1):]
            count += 1
        return text, count

    def _register_address(self, addr: str) -> None:
        """Registers an address and its significant words."""
        self.registry.register(addr, "adresse")
        skip_words = {
            "maison", "quartier", "lotissement", "rue", "avenue",
            "boulevard", "chemin", "place", "route", "résidence",
            "hameau", "lieu", "dit", "impasse", "allée", "batiment",
            "bp", "cedex",
        }
        for word in addr.split():
            word_clean = word.strip(",.")
            if len(word_clean) >= 4 and word_clean.lower() not in skip_words:
                self.registry.register(word_clean, "adresse")

    def _is_whitelisted(self, text: str) -> bool:
        """Checks whether a term is in the medical whitelist."""
        return text.lower().strip() in MEDICAL_TERMS_WHITELIST

    def _is_establishment(self, text: str) -> bool:
        """Checks whether the text refers to the establishment."""
        if not KEEP_ESTABLISHMENT_NAME:
            return False
        text_lower = text.lower().strip()
        return any(est in text_lower for est in ESTABLISHMENT_NAMES)
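
A minimal usage sketch of the three-phase pipeline (the parsed_data keys match _register_parsed_entities above; names and sample text are invented):

from src.anonymization.anonymizer import Anonymizer

parsed = {
    "patient": {"nom_prenom": "Jeanne DUPONT"},  # invented sample
    "medecins": ["MARTIN"],
    "contacts": [],
}
anon = Anonymizer(parsed_data=parsed)
clean = anon.anonymize("Patiente Jeanne DUPONT vue par le Dr MARTIN.")
print(clean)  # known names come back as [PATIENT_1] / [MEDECIN_1] pseudonyms
print(anon.report.total_replacements)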
t2a_install_rag_cleanup/src/anonymization/entity_registry.py (Normal file, 86 lines)
@@ -0,0 +1,86 @@
"""Entity registry ensuring consistent replacements."""

from __future__ import annotations

import re
from collections import defaultdict


class EntityRegistry:
    """Maintains a consistent mapping between real entities and pseudonyms."""

    def __init__(self, whitelist: set[str] | None = None):
        self._counters: dict[str, int] = defaultdict(int)
        self._mappings: dict[str, str] = {}
        self._category_map: dict[str, str] = {}
        self._whitelist: set[str] = whitelist or set()

    def register(self, entity: str, category: str) -> str:
        """Registers an entity and returns its pseudonym.

        If the entity is already known, the same pseudonym is returned.
        """
        key = self._normalize(entity)
        if not key:
            return entity

        if key in self._mappings:
            return self._mappings[key]

        self._counters[category] += 1
        count = self._counters[category]

        pseudo = self._generate_pseudo(category, count)
        self._mappings[key] = pseudo
        self._category_map[key] = category

        # Also register the sub-parts of the name (except medical terms)
        parts = key.split()
        if len(parts) > 1:
            for part in parts:
                if len(part) >= 3 and part not in self._whitelist:
                    if part not in self._mappings:
                        self._mappings[part] = f"[{category.upper()}]"

        return pseudo

    def get_replacement(self, entity: str) -> str | None:
        """Returns the pseudonym of a known entity, or None."""
        key = self._normalize(entity)
        return self._mappings.get(key)

    def get_all_entities(self) -> dict[str, str]:
        """Returns every entity → pseudonym mapping."""
        return dict(self._mappings)

    def get_all_original_names(self) -> list[str]:
        """Returns all registered original entities (as normalized keys)."""
        return list(self._mappings.keys())

    def _normalize(self, text: str) -> str:
        """Normalizes a name for lookup: lowercase, collapsed whitespace."""
        text = text.strip()
        text = re.sub(r"\s+", " ", text)
        return text.lower()

    def _generate_pseudo(self, category: str, count: int) -> str:
        """Generates a pseudonym according to the category."""
        labels = {
            "patient": f"[PATIENT_{count}]",
            "medecin": f"[MEDECIN_{count}]",
            "soignant": f"[SOIGNANT_{count}]",
            "contact": f"[CONTACT_{count}]",
            "personne": f"[PERSONNE_{count}]",
            "ipp": f"[IPP_{count}]",
            "episode": f"[EPISODE_{count}]",
            "nir": f"[NIR_{count}]",
            "telephone": f"[TEL_{count}]",
            "email": f"[EMAIL_{count}]",
            "adresse": f"[ADRESSE_{count}]",
            "date_naissance": f"[DATE_NAISS_{count}]",
            "lieu_naissance": f"[LIEU_NAISS_{count}]",
            "finess": "[FINESS]",
            "code_barre": f"[CODE_BARRE_{count}]",
        }
        return labels.get(category, f"[{category.upper()}_{count}]")
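
A minimal sketch of the consistency guarantee the registry provides: the same entity, however it is spelled, always maps to the same pseudonym:

from src.anonymization.entity_registry import EntityRegistry

reg = EntityRegistry()
p1 = reg.register("Jeanne Dupont", "patient")   # -> "[PATIENT_1]"
p2 = reg.register("JEANNE  DUPONT", "patient")  # same key after normalization
assert p1 == p2 == "[PATIENT_1]"
assert reg.get_replacement("jeanne dupont") == "[PATIENT_1]"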
t2a_install_rag_cleanup/src/anonymization/ner_anonymizer.py (Normal file, 95 lines)
@@ -0,0 +1,95 @@
"""NER via CamemBERT to detect names in free text."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from ..config import NER_CONFIDENCE_THRESHOLD, NER_MODEL

if TYPE_CHECKING:
    from transformers import Pipeline

logger = logging.getLogger(__name__)

_pipeline: Pipeline | None = None


def _get_pipeline() -> Pipeline:
    """Loads the NER model (lazy loading)."""
    global _pipeline
    if _pipeline is None:
        logger.info("Chargement du modèle NER %s...", NER_MODEL)
        from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

        tokenizer = AutoTokenizer.from_pretrained(NER_MODEL, local_files_only=True)
        model = AutoModelForTokenClassification.from_pretrained(NER_MODEL, local_files_only=True)
        _pipeline = pipeline(
            "ner",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
        )
        logger.info("Modèle NER chargé.")
    return _pipeline


def extract_person_entities(text: str) -> list[dict]:
    """Extracts PER (person) entities from the text.

    Returns a list of dicts with 'word', 'start', 'end', 'score'.
    """
    pipe = _get_pipeline()

    # CamemBERT has a token limit, so split the text into chunks
    chunks = _split_text(text, max_chars=500)
    entities: list[dict] = []
    offset = 0

    for chunk in chunks:
        results = pipe(chunk)
        for ent in results:
            if ent["entity_group"] == "PER" and ent["score"] >= NER_CONFIDENCE_THRESHOLD:
                word = ent["word"].strip()
                if len(word) >= 2:
                    entities.append({
                        "word": word,
                        "start": ent["start"] + offset,
                        "end": ent["end"] + offset,
                        "score": float(ent["score"]),
                    })
        offset += len(chunk)

    return _deduplicate(entities)


def _split_text(text: str, max_chars: int = 500) -> list[str]:
    """Splits the text into reasonably sized chunks at sentence boundaries."""
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = start + max_chars
        if end < len(text):
            # Look for the closest sentence boundary
            for sep in ["\n", ". ", ", ", " "]:
                pos = text.rfind(sep, start, end)
                if pos > start:
                    end = pos + len(sep)
                    break
        chunks.append(text[start:end])
        start = end

    return chunks


def _deduplicate(entities: list[dict]) -> list[dict]:
    """Deduplicates entities by word (keeps the highest score)."""
    seen: dict[str, dict] = {}
    for ent in entities:
        key = ent["word"].lower()
        if key not in seen or ent["score"] > seen[key]["score"]:
            seen[key] = ent
    return list(seen.values())
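
A minimal sketch of the NER entry point (it assumes the CamemBERT model is already in the local Hugging Face cache, since _get_pipeline passes local_files_only=True; the sample text is invented):

from src.anonymization.ner_anonymizer import extract_person_entities

text = "Compte rendu rédigé par le Dr Jean Martin pour la patiente."
for ent in extract_person_entities(text):
    # each entity carries its span in the original text plus the model score
    print(ent["word"], ent["start"], ent["end"], round(ent["score"], 2))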
t2a_install_rag_cleanup/src/anonymization/regex_patterns.py (Normal file, 194 lines)
@@ -0,0 +1,194 @@
|
||||
"""Patterns regex pour la détection de données personnelles dans les documents médicaux FR."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import regex
|
||||
|
||||
# --- Identifiants ---
|
||||
|
||||
# IPP : séquence de 6-10 chiffres après "IPP" (avec ou sans :)
|
||||
IPP_PATTERN = regex.compile(
|
||||
r"(?:IPP\s*[:=]?\s*)(\d{6,10})"
|
||||
r"|"
|
||||
r"\((\d{8})\s*\)", # Footer "(01306172 )"
|
||||
)
|
||||
|
||||
# Numéro d'épisode (toutes les variantes)
|
||||
EPISODE_PATTERN = regex.compile(
|
||||
r"(?:Episode\s*(?:No|N°|N\.?)\s*[:=]?\s*)(\d{6,10})"
|
||||
r"|"
|
||||
r"(?:N°\s*Episode\s+)(\d{6,10})",
|
||||
)
|
||||
|
||||
# NIR / Numéro de sécurité sociale (15 chiffres)
|
||||
NIR_PATTERN = regex.compile(r"\b([12]\d{2}(?:0[1-9]|1[0-2])\d{2,3}\d{6}\s?\d{2})\b")
|
||||
|
||||
# FINESS (9 chiffres, souvent précédé de "Finess")
|
||||
FINESS_PATTERN = regex.compile(r"(?:Finess|FINESS)\s*[:\s]*\*?(\d{9})\*?")
|
||||
|
||||
# RPPS (11 chiffres)
|
||||
RPPS_PATTERN = regex.compile(r"RPPS\s*[:=]?\s*(\d{11})")
|
||||
|
||||
# Code-barres (nombre entre astérisques)
|
||||
BARCODE_PATTERN = regex.compile(r"\*(\d{9,15})\*")
|
||||
|
||||
# Numéro isolé après code-barres (même numéro répété sans astérisques)
|
||||
BARCODE_REPEAT_PATTERN = regex.compile(r"\*\d{9,15}\*\s*\n(\d{9,15})")
|
||||
|
||||
# --- Contact ---
|
||||
|
||||
# Téléphones FR : 10 chiffres avec séparateurs variés
|
||||
PHONE_PATTERN = regex.compile(
|
||||
r"\b(0[1-9])[\s.\-]?(\d{2})[\s.\-]?(\d{2})[\s.\-]?(\d{2})[\s.\-]?(\d{2})\b"
|
||||
)
|
||||
|
||||
# Emails (y compris @ch-cotebasque.fr qui contiennent des initiales de soignants)
|
||||
EMAIL_PATTERN = regex.compile(
|
||||
r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"
|
||||
)
|
||||
|
||||
# Fax
|
||||
FAX_PATTERN = regex.compile(
|
||||
r"Fax\s*:\s*(0[1-9][\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2})"
|
||||
)
|
||||
|
||||
# --- Adresses ---
|
||||
|
||||
# Code postal + ville (uniquement les ALL_CAPS après 5 digits)
|
||||
CP_VILLE_PATTERN = regex.compile(
|
||||
r"\b(\d{5})\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\s\-]{2,})\b"
|
||||
)
|
||||
|
||||
# Lignes d'adresse avec mots-clés (y compris noms propres basques/locaux)
|
||||
ADDRESS_LINE_PATTERN = regex.compile(
|
||||
r"^((?:(?:\d+\s*,?\s*)?(?:MAISON|LOTISSEMENT|QUARTIER|RUE|AVENUE|BOULEVARD|IMPASSE|CHEMIN|PLACE|ALLEE|ALLÉE|ROUTE|LIEU[\s-]DIT|RESIDENCE|RÉSIDENCE|BATIMENT|BÂTIMENT|HAMEAU)[\s\w\-''ÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ]+))$",
|
||||
regex.MULTILINE | regex.IGNORECASE,
|
||||
)
|
||||
|
||||
# Adresse complète multi-ligne (après nom patient dans CRH/Trackare)
|
||||
ADDRESS_BLOCK_PATTERN = regex.compile(
|
||||
r"(?:Adresse\s*:\s*)(.+?)(?:\s+Ville|\n)",
|
||||
)
|
||||
|
||||
# --- Dates de naissance ---
|
||||
|
||||
# Toutes les variantes : "né(e) le", "née le", "né le", "Né(e) le", "Date de naissance:"
|
||||
DATE_NAISSANCE_PATTERN = regex.compile(
|
||||
r"(?:[Nn][ée]+(?:\(e\))?\s+le\s+|Date de naissance\s*[:=]?\s*)(\d{2}/\d{2}/\d{4})"
|
||||
)
|
||||
|
||||
# --- Noms structurés ---
|
||||
|
||||
# Footer CRH : "Patient(e) : NOM PRENOM Né(e) le"
|
||||
CRH_FOOTER_PATIENT_PATTERN = regex.compile(
|
||||
    r"Patient(?:\(e\))?\s*:\s*([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+?)\s+(?:Né|né)"
)

# CRH footer: "IPP NNNNNNNN / N° Episode NNNNNNNN"
CRH_FOOTER_IPP_EPISODE = regex.compile(
    r"IPP\s+(\d{6,10})\s*/\s*N°\s*Episode\s+(\d{6,10})"
)

# After "Nom de naissance:", "Nom et Prénom:", "Patient(e):"
PATIENT_NAME_PATTERN = regex.compile(
    r"(?:Patient(?:\(e\))?\s*:\s*|Nom de naissance\s*:\s*|Nom et Prénom\s*:\s*)"
    r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+)",
)

# "MME/Mme/M./MR/Madame/Monsieur" followed by the name
CIVILITE_NAME_PATTERN = regex.compile(
    r"(?:MME|Mme|Madame|M\.|Mr|MR|Monsieur)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\.\-]+?)(?:\s+[Nn]é|\s+Date|\n|,)"
)

# "DR." / "Dr" / "Docteur" followed by the physician's name
DR_NAME_PATTERN = regex.compile(
    r"(?:DR\.?|Dr\.?|Docteur)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+){0,2})"
)

# "Rédigé par" in the CRH footer
REDIGE_PAR_PATTERN = regex.compile(
    r"Rédigé par\s*:?\s*(.+?)(?:\n|$)"
)

# "Liste des destinataires:" followed by names
DESTINATAIRE_PATTERN = regex.compile(
    r"(?:Madame|Monsieur|DR\.?|Dr\.?)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\.\-]+?)(?:\n|$)"
)

# Author names in Trackare: "Note d'évolution Prénom NOM DD/MM/YYYY"
NOTE_AUTHOR_DATE_PATTERN = regex.compile(
    r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
    r"(?:DR\.?\s+)?"
    r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)+)"
    r"\s+\d{2}/\d{2}/\d{4}",
)

# Trackare author names without an immediate date: "Note IDE Prénom NOM texte..."
# The name is always a capitalized first name followed by an ALL-CAPS last name
NOTE_AUTHOR_PATTERN = regex.compile(
    r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
    r"(?:DR\.?\s+)?"
    r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][a-zéèêëàâäùûüôöîïç]+\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\-]{2,})"
)

# Trackare footer: "Patient: NOM PRENOM - Date de naissance: ..."
FOOTER_PATIENT_PATTERN = regex.compile(
    r"Patient\s*:\s*([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+?)\s*-\s*Date de naissance"
)

# "Médecin traitant" block
MEDECIN_TRAITANT_PATTERN = regex.compile(
    r"Médecin traitant\s*\n\s*(?:Nom\s+Adresse\s+.*\n)?\s*(?:DR\.?\s+)?(.+?)(?:\s+(?:Lotissement|Rue|Avenue|\d{5}))",
    regex.IGNORECASE,
)

# "Médecin courant:"
MEDECIN_COURANT_PATTERN = regex.compile(
    r"Médecin courant\s*:\s*(?:DR\.?\s+)?([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
)

# "Médecin de la prise en charge médicale NOM"
MEDECIN_PEC_PATTERN = regex.compile(
    r"(?:Médecin de (?:la )?(?:prise en charge|décision)\s+médicale)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
)

# IAO
IAO_PATTERN = regex.compile(
    r"IAO\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
)

# Manager / staff named in the CRH header
STAFF_NAME_PATTERN = regex.compile(
    r"(?:Mme|M\.)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\.\-]+\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\.\-]+)"
)

# Place of birth
LIEU_NAISSANCE_PATTERN = regex.compile(
    r"Lieu de naissance\s*:\s*(.+?)(?:\n|$)"
)

# Prescription authors in Trackare
PRESCRIPTION_AUTHOR_PATTERN = regex.compile(
    r"(?:Presc\.\s*de\s*Sortie|Normal|Signé|Arrêté|Réalisé)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][a-zéèêëàâäùûüôöîïç]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\-]+)+)"
)


def get_all_name_patterns():
    """Return the list of patterns that capture person names."""
    return [
        PATIENT_NAME_PATTERN,
        CIVILITE_NAME_PATTERN,
        DR_NAME_PATTERN,
        REDIGE_PAR_PATTERN,
        NOTE_AUTHOR_DATE_PATTERN,
        NOTE_AUTHOR_PATTERN,
        FOOTER_PATIENT_PATTERN,
        CRH_FOOTER_PATIENT_PATTERN,
        MEDECIN_TRAITANT_PATTERN,
        MEDECIN_COURANT_PATTERN,
        MEDECIN_PEC_PATTERN,
        IAO_PATTERN,
        STAFF_NAME_PATTERN,
        DESTINATAIRE_PATTERN,
        PRESCRIPTION_AUTHOR_PATTERN,
    ]
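Taken together, these patterns feed the anonymization sweep. A minimal sketch of a caller that collects candidate hits for review (the sample usage and the helper name are illustrative; only get_all_name_patterns comes from this file, and finditer/pattern are the standard attributes of compiled regex objects):

def collect_name_hits(text: str) -> list[tuple[str, str]]:
    """Collect (pattern, captured name) pairs before masking decisions."""
    hits: list[tuple[str, str]] = []
    for pattern in get_all_name_patterns():
        for m in pattern.finditer(text):
            hits.append((pattern.pattern[:40], m.group(1).strip()))
    return hits

# e.g. collect_name_hits("Rédigé par : Dr MARTIN") fires via both
# REDIGE_PAR_PATTERN and DR_NAME_PATTERN; downstream code deduplicates.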
289
t2a_install_rag_cleanup/src/config.py
Normal file
@@ -0,0 +1,289 @@
"""Global configuration and data models for the T2A pipeline."""

from __future__ import annotations

import os
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from pydantic import BaseModel, Field, field_validator

load_dotenv()


# --- Paths ---

BASE_DIR = Path(__file__).resolve().parent.parent
INPUT_DIR = BASE_DIR / "input"
OUTPUT_DIR = BASE_DIR / "output"
ANONYMIZED_DIR = OUTPUT_DIR / "anonymized"
STRUCTURED_DIR = OUTPUT_DIR / "structured"
REPORTS_DIR = OUTPUT_DIR / "reports"

for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)


# --- Anonymization settings ---

KEEP_ESTABLISHMENT_NAME = os.environ.get("T2A_KEEP_ESTABLISHMENT", "True").lower() in ("true", "1", "yes")
NER_MODEL = os.environ.get("T2A_NER_MODEL", "Jean-Baptiste/camembert-ner")
NER_CONFIDENCE_THRESHOLD = float(os.environ.get("T2A_NER_THRESHOLD", "0.80"))


# --- Ollama settings ---

OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:27b-cloud")
OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120"))
OLLAMA_CACHE_PATH = BASE_DIR / "data" / "ollama_cache.json"
OLLAMA_MAX_PARALLEL = int(os.environ.get("OLLAMA_MAX_PARALLEL", "2"))


# --- RUM / establishment settings ---

FINESS = os.environ.get("T2A_FINESS", "000000000")
NUM_UM = os.environ.get("T2A_NUM_UM", "0000")


# --- RAG settings ---

RAG_INDEX_DIR = BASE_DIR / "data" / "rag_index"
REFERENTIELS_DIR = BASE_DIR / "data" / "referentiels"
UPLOAD_MAX_SIZE_MB = 50
ALLOWED_EXTENSIONS = {".pdf", ".csv", ".xlsx", ".xls", ".txt"}
CIM10_DICT_PATH = BASE_DIR / "data" / "cim10_dict.json"
CIM10_SUPPLEMENTS_PATH = BASE_DIR / "data" / "cim10_supplements.json"
CMA_LEVELS_PATH = BASE_DIR / "data" / "cma_levels.json"
CCAM_DICT_PATH = BASE_DIR / "data" / "ccam_dict.json"
CIM10_PDF = Path(os.environ.get("T2A_CIM10_PDF", "/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf"))
GUIDE_METHODO_PDF = Path(os.environ.get("T2A_GUIDE_METHODO_PDF", "/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf"))
CCAM_PDF = Path(os.environ.get("T2A_CCAM_PDF", "/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf"))

# --- Embedding model ---

EMBEDDING_MODEL = os.environ.get("T2A_EMBEDDING_MODEL", "dangvantuan/sentence-camembert-large")

# --- Re-ranking model (cross-encoder, CPU only) ---

RERANKER_MODEL = os.environ.get("T2A_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")

# --- CIM-10 data models ---


class RAGSource(BaseModel):
    document: str
    page: Optional[int] = None
    code: Optional[str] = None
    extrait: Optional[str] = None


class Sejour(BaseModel):
    sexe: Optional[str] = None
    age: Optional[int] = None
    date_entree: Optional[str] = None
    date_sortie: Optional[str] = None
    duree_sejour: Optional[int] = None
    mode_entree: Optional[str] = None
    mode_sortie: Optional[str] = None
    imc: Optional[float] = None
    poids: Optional[float] = None
    taille: Optional[float] = None


class PreuveClinique(BaseModel):
    type: str  # "biologie" | "imagerie" | "traitement" | "acte" | "clinique"
    element: str  # e.g. "CRP 180 mg/L"
    interpretation: str  # e.g. "syndrome inflammatoire majeur"


class CodeDecision(BaseModel):
    """Final decision on a code (audit-friendly).

    - action=KEEP: keep the suggestion
    - action=DOWNGRADE: replace with a less specific code (e.g. D50→D64.9)
    - action=REMOVE: drop the code (or leave it empty)
    """

    action: str = "KEEP"  # KEEP | DOWNGRADE | REMOVE
    final_code: Optional[str] = None
    downgraded_from: Optional[str] = None
    reason: Optional[str] = None
    needs_info: list[str] = Field(default_factory=list)
    applied_rules: list[str] = Field(default_factory=list)


class Diagnostic(BaseModel):
    texte: str
    cim10_suggestion: Optional[str] = None
    cim10_confidence: Optional[str] = None
    # Final output (after quality post-processing)
    cim10_final: Optional[str] = None
    cim10_decision: Optional[CodeDecision] = None
    justification: Optional[str] = None
    raisonnement: Optional[str] = None
    sources_rag: list[RAGSource] = Field(default_factory=list)
    preuves_cliniques: list[PreuveClinique] = Field(default_factory=list)
    est_cma: Optional[bool] = None
    est_cms: Optional[bool] = None
    niveau_severite: Optional[str] = None  # "leger" | "modere" | "severe" | "non_evalue"
    niveau_cma: Optional[int] = None  # 1 (not a CMA) | 2 | 3 | 4 (official ATIH level)
    source: Optional[str] = None  # "trackare" | "edsnlp" | "regex" | "llm_das"
    source_page: Optional[int] = None  # page number (1-indexed) in the source PDF
    source_excerpt: Optional[str] = None  # excerpt of the source text (~200 chars)


class ActeCCAM(BaseModel):
    texte: str
    code_ccam_suggestion: Optional[str] = None
    ccam_confidence: Optional[str] = None
    justification: Optional[str] = None
    raisonnement: Optional[str] = None
    sources_rag: list[RAGSource] = Field(default_factory=list)
    date: Optional[str] = None
    validite: Optional[str] = None  # "valide" | "obsolete" | "non_verifie"
    alertes: list[str] = Field(default_factory=list)
    source_page: Optional[int] = None
    source_excerpt: Optional[str] = None


class Traitement(BaseModel):
    medicament: str
    posologie: Optional[str] = None
    code_atc: Optional[str] = None
    source_page: Optional[int] = None
    source_excerpt: Optional[str] = None


class BiologieCle(BaseModel):
    test: str
    valeur: Optional[str] = None
    anomalie: Optional[bool] = None
    source_page: Optional[int] = None
    source_excerpt: Optional[str] = None


class Imagerie(BaseModel):
    type: str
    conclusion: Optional[str] = None
    score: Optional[str] = None
    source_page: Optional[int] = None
    source_excerpt: Optional[str] = None


class Antecedent(BaseModel):
    texte: str
    source_page: Optional[int] = None
    source_excerpt: Optional[str] = None


class Complication(BaseModel):
    texte: str
    source_page: Optional[int] = None
    source_excerpt: Optional[str] = None


class DossierMedical(BaseModel):
    source_file: str = ""
    document_type: str = ""
    sejour: Sejour = Field(default_factory=Sejour)
    diagnostic_principal: Optional[Diagnostic] = None
    diagnostics_associes: list[Diagnostic] = Field(default_factory=list)
    actes_ccam: list[ActeCCAM] = Field(default_factory=list)
    antecedents: list[Antecedent] = Field(default_factory=list)
    traitements_sortie: list[Traitement] = Field(default_factory=list)
    biologie_cle: list[BiologieCle] = Field(default_factory=list)
    imagerie: list[Imagerie] = Field(default_factory=list)
    complications: list[Complication] = Field(default_factory=list)
    alertes_codage: list[str] = Field(default_factory=list)
    source_files: list[str] = Field(default_factory=list)
    ghm_estimation: Optional[GHMEstimation] = None
    controles_cpam: list[ControleCPAM] = Field(default_factory=list)
    veto_report: Optional["VetoReport"] = None
    processing_time_s: float | None = None

    @field_validator("antecedents", mode="before")
    @classmethod
    def _coerce_antecedents(cls, v):
        """Backward compat: converts legacy list[str] into list[Antecedent]."""
        if not isinstance(v, list):
            return v
        result = []
        for item in v:
            if isinstance(item, str):
                result.append({"texte": item})
            else:
                result.append(item)
        return result

    @field_validator("complications", mode="before")
    @classmethod
    def _coerce_complications(cls, v):
        """Backward compat: converts legacy list[str] into list[Complication]."""
        if not isinstance(v, list):
            return v
        result = []
        for item in v:
            if isinstance(item, str):
                result.append({"texte": item})
            else:
                result.append(item)
        return result


# --- GHM estimation / CPAM controls ---


class GHMEstimation(BaseModel):
    cmd: Optional[str] = None
    cmd_libelle: Optional[str] = None
    type_ghm: Optional[str] = None  # "C" / "M" / "K"
    severite: int = 1  # 1-4
    ghm_approx: Optional[str] = None  # e.g. "07C??2"
    cma_count: int = 0
    cms_count: int = 0
    alertes: list[str] = Field(default_factory=list)


class ControleCPAM(BaseModel):
    numero_ogc: int
    titre: str = ""
    arg_ucr: str = ""
    decision_ucr: str = ""
    dp_ucr: Optional[str] = None
    da_ucr: Optional[str] = None
    dr_ucr: Optional[str] = None
    actes_ucr: Optional[str] = None
    contre_argumentation: Optional[str] = None
    response_data: Optional[dict] = None
    sources_reponse: list[RAGSource] = Field(default_factory=list)


# --- Quality / vetoes (contestability) ---


class VetoIssue(BaseModel):
    """A problem detected during the contestability check."""

    veto: str
    severity: str  # HARD | MEDIUM | LOW
    where: str
    message: str


class VetoReport(BaseModel):
    """Aggregate veto report for a case."""

    verdict: str  # PASS | NEED_INFO | FAIL
    score_contestabilite: int = 100  # 0-100
    issues: list[VetoIssue] = Field(default_factory=list)


# --- Anonymization report ---


class AnonymizationReport(BaseModel):
    source_file: str
    total_replacements: int = 0
    regex_replacements: int = 0
    ner_replacements: int = 0
    sweep_replacements: int = 0
    entities_found: list[dict] = Field(default_factory=list)
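One consequence of the mode="before" validators above, shown as a small sketch (the field values are made up): JSON written by older versions, where antecedents/complications were plain strings, still validates.

legacy = {"source_file": "crh_ancien.json", "antecedents": ["HTA", "diabète de type 2"]}
dossier = DossierMedical(**legacy)
assert dossier.antecedents[0].texte == "HTA"  # str coerced into an Antecedent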
0
t2a_install_rag_cleanup/src/control/__init__.py
Normal file
115
t2a_install_rag_cleanup/src/control/cpam_parser.py
Normal file
@@ -0,0 +1,115 @@
"""Parsing of the CPAM (UCR) control Excel file and OGC matching."""

from __future__ import annotations

import logging
import re
from pathlib import Path

import openpyxl

from ..config import ControleCPAM

logger = logging.getLogger(__name__)

# Columns expected in the Excel file
_EXPECTED_COLUMNS = ("N° OGC", "Titre", "Arg_UCR", "Décision_UCR", "DP_UCR", "DA_UCR", "DR_UCR", "Actes_UCR")


def parse_cpam_excel(path: str | Path) -> dict[int, list[ControleCPAM]]:
    """Reads the CPAM control Excel file and returns a dict OGC -> list of controls.

    Args:
        path: Path to the CPAM .xlsx file.

    Returns:
        Dict keyed by OGC number, with the list of associated controls.
    """
    path = Path(path)
    if not path.exists():
        logger.error("Fichier CPAM introuvable : %s", path)
        return {}

    wb = openpyxl.load_workbook(path, read_only=True)
    ws = wb[wb.sheetnames[0]]

    # Read the header row
    rows = ws.iter_rows(values_only=True)
    header = next(rows, None)
    if header is None:
        logger.error("Fichier CPAM vide : %s", path)
        return {}

    # Build the column -> index mapping
    col_map = {}
    for i, col_name in enumerate(header):
        if col_name:
            col_map[col_name.strip()] = i

    # Check the required columns
    missing = [c for c in _EXPECTED_COLUMNS[:4] if c not in col_map]
    if missing:
        logger.error("Colonnes manquantes dans le fichier CPAM : %s", missing)
        return {}

    result: dict[int, list[ControleCPAM]] = {}
    count = 0

    for row in rows:
        ogc_val = row[col_map["N° OGC"]]
        if ogc_val is None:
            continue

        try:
            numero_ogc = int(ogc_val)
        except (ValueError, TypeError):
            logger.warning("N° OGC invalide ignoré : %s", ogc_val)
            continue

        controle = ControleCPAM(
            numero_ogc=numero_ogc,
            titre=str(row[col_map.get("Titre", 1)] or "").strip(),
            arg_ucr=str(row[col_map.get("Arg_UCR", 2)] or "").strip(),
            decision_ucr=str(row[col_map.get("Décision_UCR", 3)] or "").strip(),
            dp_ucr=_clean_optional(row, col_map.get("DP_UCR")),
            da_ucr=_clean_optional(row, col_map.get("DA_UCR")),
            dr_ucr=_clean_optional(row, col_map.get("DR_UCR")),
            actes_ucr=_clean_optional(row, col_map.get("Actes_UCR")),
        )

        result.setdefault(numero_ogc, []).append(controle)
        count += 1

    logger.info("CPAM : %d contrôles chargés pour %d OGC distincts", count, len(result))
    return result


def _clean_optional(row: tuple, idx: int | None) -> str | None:
    """Extracts an optional value from an Excel row."""
    if idx is None or idx >= len(row):
        return None
    val = row[idx]
    if val is None:
        return None
    val = str(val).strip()
    return val if val else None


def match_dossier_ogc(source_name: str, cpam_data: dict[int, list[ControleCPAM]]) -> list[ControleCPAM]:
    """Looks up the CPAM controls matching a case folder by OGC prefix.

    The folder name follows the format "17_23100690", where 17 is the OGC number.

    Args:
        source_name: Name of the sub-folder (e.g. "17_23100690").
        cpam_data: Dict OGC -> controls returned by parse_cpam_excel().

    Returns:
        List of CPAM controls for this OGC, or an empty list.
    """
    match = re.match(r"^(\d+)_", source_name)
    if not match:
        return []

    ogc = int(match.group(1))
    return cpam_data.get(ogc, [])
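A minimal usage sketch chaining the two helpers (the file and folder names are hypothetical):

controles = parse_cpam_excel("controles_ucr.xlsx")
for c in match_dossier_ogc("17_23100690", controles):
    print(c.numero_ogc, c.titre, c.decision_ucr)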
1046
t2a_install_rag_cleanup/src/control/cpam_response.py
Normal file
File diff suppressed because it is too large
0
t2a_install_rag_cleanup/src/export/__init__.py
Normal file
190
t2a_install_rag_cleanup/src/export/rum_export.py
Normal file
@@ -0,0 +1,190 @@
"""Export to the RUM (Résumé d'Unité Médicale) V016 format for the ATIH grouper.

Produces a fixed 165-character line followed by variable zones
(DAS in 8 chars, CCAM acts in 29 chars each).
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path

from ..config import FINESS, NUM_UM, DossierMedical


@dataclass
class RUMConfig:
    finess: str = FINESS
    num_um: str = NUM_UM


def _format_cim10(code: str | None) -> str:
    """Formats a CIM-10 code on 8 characters (no dot, padded)."""
    if not code:
        return " " * 8
    clean = code.upper().replace(".", "").strip()
    return clean.ljust(8)[:8]


def _format_date(date_str: str | None) -> str:
    """Converts a DD/MM/YYYY or YYYY-MM-DD date to DDMMYYYY (8 chars)."""
    if not date_str:
        return " " * 8
    date_str = date_str.strip()
    # DD/MM/YYYY format
    m = re.match(r"(\d{2})/(\d{2})/(\d{4})", date_str)
    if m:
        return f"{m.group(1)}{m.group(2)}{m.group(3)}"
    # YYYY-MM-DD format
    m = re.match(r"(\d{4})-(\d{2})-(\d{2})", date_str)
    if m:
        return f"{m.group(3)}{m.group(2)}{m.group(1)}"
    return " " * 8


def _format_sex(sexe: str | None) -> str:
    """Converts the sex to its RUM code (1=M, 2=F)."""
    if not sexe:
        return " "
    s = sexe.strip().upper()
    if s in ("M", "MASCULIN", "HOMME", "H"):
        return "1"
    if s in ("F", "FEMININ", "FÉMININ", "FEMME"):
        return "2"
    return " "


def _map_mode_entree(text: str | None) -> str:
    """Converts the textual admission mode to its RUM code (1 char)."""
    if not text:
        return " "
    t = text.strip().lower()
    mapping = {
        "domicile": "8",
        "mutation": "6",
        "transfert": "7",
        "urgences": "8",
        "urgence": "8",
    }
    for key, code in mapping.items():
        if key in t:
            return code
    return " "


def _map_mode_sortie(text: str | None) -> str:
    """Converts the textual discharge mode to its RUM code (1 char)."""
    if not text:
        return " "
    t = text.strip().lower()
    mapping = {
        "domicile": "8",
        "mutation": "6",
        "transfert": "7",
        "deces": "9",
        "décès": "9",
        "décédé": "9",
        "decede": "9",
    }
    for key, code in mapping.items():
        if key in t:
            return code
    return " "


def _format_ccam_act(acte) -> str:
    """Formats a CCAM act on 29 characters.

    Layout: code(7) + phase(1) + activity(1) + date(8) + doc/extension(12)
    """
    code = (acte.code_ccam_suggestion or "").upper().replace(" ", "")
    code = code.ljust(7)[:7]
    phase = "1"
    activite = "1"
    date = _format_date(acte.date)
    extension = " " * 12
    return f"{code}{phase}{activite}{date}{extension}"


def export_rum(dossier: DossierMedical, config: RUMConfig | None = None) -> str:
    """Builds the full RUM text for a medical case.

    Returns:
        Text string in RUM V016 format (165 fixed chars + variable zones).
    """
    if config is None:
        config = RUMConfig()

    sejour = dossier.sejour
    dp = dossier.diagnostic_principal

    # Counters
    das_list = dossier.diagnostics_associes
    actes_list = dossier.actes_ccam
    nb_das = len(das_list)
    nb_actes = len(actes_list)

    # Generated identifiers
    source = dossier.source_file or "UNKNOWN"
    num_rss = source.replace(".pdf", "").replace(" ", "_").ljust(20)[:20]
    num_admin = num_rss
    num_rum = source[:10].ljust(10)[:10]

    # Build the fixed zone (165 characters)
    parts = [
        " " * 2,  # 1-2   : Classification version (blank)
        " " * 6,  # 3-8   : GHM (blank, filled in by the grouper)
        " ",  # 9 : Filler
        "016",  # 10-12 : Format version
        " " * 3,  # 13-15 : Return code
        config.finess.ljust(9)[:9],  # 16-24 : FINESS
        "016",  # 25-27 : RUM version
        num_rss,  # 28-47 : RSS number
        num_admin,  # 48-67 : Admin number
        num_rum,  # 68-77 : RUM number
        _format_date(None),  # 78-85 : Birth date (not available)
        _format_sex(sejour.sexe),  # 86 : Sex
        config.num_um.ljust(4)[:4],  # 87-90 : UM number
        " " * 2,  # 91-92 : Authorization type
        _format_date(sejour.date_entree),  # 93-100 : UM admission date
        _map_mode_entree(sejour.mode_entree),  # 101 : Admission mode
        " ",  # 102 : Origin
        _format_date(sejour.date_sortie),  # 103-110: UM discharge date
        _map_mode_sortie(sejour.mode_sortie),  # 111 : Discharge mode
        " ",  # 112 : Destination
        " " * 5,  # 113-117: Residence postcode
        " " * 4,  # 118-121: Newborn weight
        " " * 2,  # 122-123: Gestational age
        "00",  # 124-125: Number of sessions
        str(nb_das).zfill(2)[-2:],  # 126-127: Number of DAS
        "00",  # 128-129: Number of DAD
        str(nb_actes).zfill(2)[-2:],  # 130-131: Number of acts
        _format_cim10(dp.cim10_suggestion if dp else None),  # 132-139: DP
        " " * 8,  # 140-147: DR
        " " * 3,  # 148-150: IGS2
        " " * 15,  # 151-165: Reserved
    ]

    fixed = "".join(parts)
    assert len(fixed) == 165, f"Zone fixe RUM: attendu 165, obtenu {len(fixed)}"

    # Variable zones
    variable_parts: list[str] = []

    # DAS (8 chars each)
    for das in das_list:
        variable_parts.append(_format_cim10(das.cim10_suggestion))

    # CCAM acts (29 chars each)
    for acte in actes_list:
        variable_parts.append(_format_ccam_act(acte))

    return fixed + "".join(variable_parts)


def save_rum(dossier: DossierMedical, path: Path, config: RUMConfig | None = None) -> None:
    """Exports a case to a RUM-format file."""
    rum_text = export_rum(dossier, config)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(rum_text, encoding="utf-8")
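The position comments in export_rum sum to 165 (2+6+1+3+3+9+3+20+20+10+8+1+4+2+8+1+1+8+1+1+5+4+2+2+2+2+2+8+8+3+15), which is exactly what the assert enforces. A hedged usage sketch (the dossier content is made up; the imports assume the package is importable as src):

from src.config import Diagnostic, DossierMedical, Sejour
from src.export.rum_export import export_rum

dossier = DossierMedical(
    source_file="sejour_test.pdf",
    sejour=Sejour(sexe="F", date_entree="01/02/2023", date_sortie="05/02/2023"),
    diagnostic_principal=Diagnostic(texte="cholécystite aiguë", cim10_suggestion="K81.0"),
)
line = export_rum(dossier)
assert len(line) == 165  # no DAS and no acts, so only the fixed zone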
0
t2a_install_rag_cleanup/src/extraction/__init__.py
Normal file
129
t2a_install_rag_cleanup/src/extraction/crh_parser.py
Normal file
@@ -0,0 +1,129 @@
"""Parsing of hospital discharge summaries (Comptes Rendus d'Hospitalisation, CRH)."""

from __future__ import annotations

import re


def parse_crh(text: str) -> dict:
    """Parses a CRH and returns its structured sections."""
    result: dict = {
        "type": "crh",
        "patient": {},
        "sejour": {},
        "medecins": [],
        "contenu_medical": "",
        "sections": {},
    }

    _extract_patient_info(text, result)
    _extract_sejour_info(text, result)
    _extract_medecins(text, result)
    _extract_medical_content(text, result)

    return result


def _extract_patient_info(text: str, result: dict) -> None:
    """Extracts patient information from the CRH."""
    # "MME NARBAIS AUDREY" or "M. NOM PRENOM"
    m = re.search(
        r"(?:MME|M\.|MR)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\- ]+)",
        text[:2000],
    )
    if m:
        result["patient"]["nom_complet"] = m.group(1).strip()

    # Address under the patient name — capture the lines between the name and the postcode+city
    addr_match = re.search(
        r"(?:MME|M\.|MR|Madame|Monsieur)\s+[A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\s\-]+\n((?:.*\n){1,4}?\d{5}\s+[A-Z][A-Z\s\-]+)",
        text[:3000],
    )
    if addr_match:
        result["patient"]["adresse"] = addr_match.group(1).strip()

    # "née le DD/MM/YYYY" or "né le DD/MM/YYYY"
    m = re.search(r"n[ée]+\s+le\s+(\d{2}/\d{2}/\d{4})", text)
    if m:
        result["patient"]["date_naissance"] = m.group(1)

    # Sex from the title
    if re.search(r"\bMME\b", text[:2000]):
        result["patient"]["sexe"] = "F"
    elif re.search(r"\b(?:M\.|MR)\b", text[:2000]):
        result["patient"]["sexe"] = "M"

    # "Votre patiente" / "Votre patient"
    if "patiente" in text[:3000].lower():
        result["patient"]["sexe"] = "F"
    elif "patient" in text[:3000].lower():
        result["patient"].setdefault("sexe", "M")


def _extract_sejour_info(text: str, result: dict) -> None:
    """Extracts the stay dates and reason for admission."""
    # "du DD/MM/YYYY au DD/MM/YYYY"
    m = re.search(
        r"du\s+(\d{2}/\d{2}/\d{4})\s+au\s+(\d{2}/\d{2}/\d{4})", text
    )
    if m:
        result["sejour"]["date_entree"] = m.group(1)
        result["sejour"]["date_sortie"] = m.group(2)

    # "pour le motif suivant:" or "pour le motif suivant :\n..."
    m = re.search(
        r"pour\s+le\s+motif\s+suivant\s*[:\s]*\n?(.*?)(?:\n\n|\.\s+[A-Z])",
        text,
        re.DOTALL,
    )
    if m:
        result["sejour"]["motif"] = m.group(1).strip()


def _extract_medecins(text: str, result: dict) -> None:
    """Extracts the physician names mentioned in the text."""
    # "Dr NOM" or "DR NOM" or "Dr. NOM" or "Docteur NOM" or "Dr F. NOM"
    for m in re.finditer(
        r"(?:Dr\.?|DR\.?|Docteur)\s+(?:[A-Z]\.\s+)?([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+(?:\s+[A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+)?)",
        text,
    ):
        name = m.group(1).strip()
        if name not in result["medecins"] and len(name) > 2:
            result["medecins"].append(name)


def _extract_medical_content(text: str, result: dict) -> None:
    """Extracts the main medical content."""
    # Look after "Mon cher confrère," and the hospitalization details
    m = re.search(
        r"(?:motif\s+suivant\s*[:\s]*\n?)(.*?)(?:Rédigé par|Cordialement|Confraternellement|Dr\s+\w+\s*$)",
        text,
        re.DOTALL,
    )
    if m:
        result["contenu_medical"] = m.group(1).strip()
    else:
        # Fallback: take everything after "Mon cher confrère"
        m = re.search(
            r"Mon cher confrère,?\s*\n(.*?)(?:Rédigé par|$)",
            text,
            re.DOTALL,
        )
        if m:
            result["contenu_medical"] = m.group(1).strip()

    # Specific sections
    section_patterns = [
        ("motif_hospitalisation", r"(?:motif\s+(?:d'hospitalisation|suivant))\s*[:\s]*\n?(.*?)(?=\n\s*(?:Antécédents|Histoire|Examen|Au total|Devenir|TTT)|$)"),
        ("antecedents", r"(?:Antécédents?)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Histoire|Examen|Traitement|Au total|Devenir)|$)"),
        ("histoire_maladie", r"(?:Histoire de la maladie)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Examen|Biologie|Au total|Devenir)|$)"),
        ("examen_clinique", r"(?:Examen clinique)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Biologie|Imagerie|Au total|Devenir)|$)"),
        ("conclusion", r"(?:Au total|Conclusion)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Devenir|TTT|Traitement)|$)"),
        ("traitement_sortie", r"(?:TTT de sortie|Traitement de sortie)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Devenir|Rédigé|Cordialement)|$)"),
        ("devenir", r"(?:Devenir)\s*[:\s]*\n?(.*?)(?=\n\s*(?:TTT|Traitement|Rédigé|Cordialement)|$)"),
    ]

    for key, pattern in section_patterns:
        m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if m:
            result["sections"][key] = m.group(1).strip()
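A small sketch of the parser on a synthetic CRH fragment (the fragment is made up; the fields named below follow from the regexes above):

sample = (
    "MME DURAND MARIE\n"
    "née le 01/01/1950\n"
    "hospitalisée du 02/03/2023 au 08/03/2023 pour le motif suivant :\n"
    "douleurs abdominales aiguës.\n\n"
    "Au total : cholécystite aiguë lithiasique.\n"
)
info = parse_crh(sample)
# info["patient"]["nom_complet"] == "DURAND MARIE"
# info["sejour"] == {"date_entree": "02/03/2023", "date_sortie": "08/03/2023",
#                    "motif": "douleurs abdominales aiguës."}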
94
t2a_install_rag_cleanup/src/extraction/document_classifier.py
Normal file
@@ -0,0 +1,94 @@
"""Document type detection: CRH vs Trackare."""

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class ClassificationResult:
    """Classification result with a confidence score."""

    doc_type: str
    confidence: float
    scores: dict[str, float]


# Weighted markers: (text, weight)
_TRACKARE_MARKERS: list[tuple[str, int]] = [
    ("ipp:", 3),
    ("episode no:", 3),
    ("dossier patient", 2),
    ("détails des patients", 2),
    ("détails épisode", 2),
    ("liste des contacts", 1),
    ("notes paramédicales", 1),
    ("signes vitaux", 1),
    ("traitements médicamenteux", 1),
    ("observations médicales", 1),
    ("constantes", 1),
    ("prescriptions", 1),
    ("presc. de sortie", 2),
    ("type de note", 1),
]

_CRH_MARKERS: list[tuple[str, int]] = [
    ("mon cher confrère", 3),
    ("cher confrère", 3),
    ("chère consœur", 3),
    ("compte rendu d'hospitalisation", 3),
    ("compte-rendu", 2),
    ("service de gastro", 2),
    ("service de chirurgie", 2),
    ("service de médecine", 2),
    ("pôle spécialités", 1),
    ("votre patient", 2),
    ("votre patiente", 2),
    ("au total", 1),
    ("ttt de sortie", 1),
    ("devenir", 1),
    ("cordialement", 1),
]

_SCAN_LENGTH = 5000


def classify_with_confidence(text: str) -> ClassificationResult:
    """Classifies a document with a confidence score.

    Returns a ClassificationResult with the type, the confidence (0.0-1.0),
    and the detailed scores.
    """
    text_lower = text[:_SCAN_LENGTH].lower()

    trackare_score = sum(weight for marker, weight in _TRACKARE_MARKERS if marker in text_lower)
    crh_score = sum(weight for marker, weight in _CRH_MARKERS if marker in text_lower)

    total = trackare_score + crh_score
    if total == 0:
        return ClassificationResult(doc_type="crh", confidence=0.5, scores={"trackare": 0, "crh": 0})

    if trackare_score > crh_score:
        confidence = trackare_score / total
        doc_type = "trackare"
    elif crh_score > trackare_score:
        confidence = crh_score / total
        doc_type = "crh"
    else:
        # Tie — default to CRH
        confidence = 0.5
        doc_type = "crh"

    return ClassificationResult(
        doc_type=doc_type,
        confidence=round(confidence, 2),
        scores={"trackare": trackare_score, "crh": crh_score},
    )


def classify(text: str) -> str:
    """Classifies an extracted document as CRH or Trackare.

    Returns "crh" or "trackare".
    Signature unchanged for backward compatibility.
    """
    return classify_with_confidence(text).doc_type
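A worked example of the scoring; note that markers are plain substring checks, so "cher confrère" also fires inside "mon cher confrère" and both weights count:

res = classify_with_confidence("Mon cher confrère, ... au total ... prescriptions ...")
# crh = 3 ("mon cher confrère") + 3 ("cher confrère", substring) + 1 ("au total") = 7
# trackare = 1 ("prescriptions"); confidence = round(7 / 8, 2) = 0.88, doc_type = "crh"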
124
t2a_install_rag_cleanup/src/extraction/document_splitter.py
Normal file
@@ -0,0 +1,124 @@
"""Splitting of multi-case PDFs into independent chunks.

Some PDFs contain several stays/episodes:
- Trackare: several Episode No in a single export
- CRH: several discharge letters concatenated together

This module inserts a splitting step between text extraction and parsing.
Each chunk is then processed independently by the existing pipeline.
"""

from __future__ import annotations

import logging
import re

logger = logging.getLogger(__name__)


def split_documents(text: str, doc_type: str) -> list[str]:
    """Entry point: splits a text into chunks according to the document type.

    Always returns at least [text] (no split if there is a single case).
    """
    if doc_type == "trackare":
        return _split_trackare(text)
    elif doc_type == "crh":
        return _split_crh(text)
    return [text]


def _split_trackare(text: str) -> list[str]:
    """Splits a multi-episode Trackare export.

    Strategy:
    1. Count the occurrences of "Episode No:"
    2. If there is only one → no split
    3. If several → cut on "Détails épisode" (or the second "Episode No:")
    4. Prefix the patient block to each chunk
    """
    episodes = list(re.finditer(r"Episode No:\s*\d+", text))
    if len(episodes) <= 1:
        return [text]

    logger.info(" Trackare multi-épisodes détecté : %d épisodes", len(episodes))

    # Identify the patient block (before the first episode/episode details);
    # it runs from the start of the text to "Détails épisode" or the first "Episode No:"
    details_markers = list(re.finditer(r"Détails épisode", text))

    if len(details_markers) >= 2:
        # Cut on "Détails épisode"
        split_points = [m.start() for m in details_markers]
        # Patient block = everything before the first "Détails épisode"
        patient_block = text[:split_points[0]].rstrip()
    else:
        # Fallback: cut on "Episode No:"
        split_points = [m.start() for m in episodes]
        # Patient block = everything before the first "Episode No:";
        # walk back to include "Détails épisode" if it appears earlier
        if details_markers:
            patient_block = text[:details_markers[0].start()].rstrip()
        else:
            patient_block = text[:split_points[0]].rstrip()

    chunks: list[str] = []
    for i, start in enumerate(split_points):
        end = split_points[i + 1] if i + 1 < len(split_points) else len(text)
        episode_text = text[start:end].rstrip()
        # Prefix the patient block so the parser has the full information
        chunk = patient_block + "\n\n" + episode_text
        chunks.append(chunk)

    return chunks


def _split_crh(text: str) -> list[str]:
    """Splits a PDF containing several concatenated CRHs.

    Strategy:
    1. Detect boundaries via patient headers (MME|M\\.|MR) followed by
       CRH patterns (stay dates, "Mon cher confrère")
    2. If there is a single occurrence → no split
    3. If several → cut on each patient header
    """
    # Look for the patient headers typical of a CRH opening;
    # we look for the full pattern: title + name in capitals
    headers = list(re.finditer(
        r"(?:^|\n)(?=\s*(?:MME|M\.|MR)\s+[A-ZÉÈÊËÀÂ]{2,})",
        text,
    ))

    if len(headers) <= 1:
        return [text]

    # Filter: keep only the headers that really open a CRH
    # (followed within 2000 chars by a typical CRH pattern)
    crh_starts: list[int] = []
    for h in headers:
        pos = h.start()
        # Skip the leading \n if present
        if text[pos:pos + 1] == "\n":
            pos += 1
        lookahead = text[pos:pos + 2000].lower()
        if (re.search(r"du\s+\d{2}/\d{2}/\d{4}\s+au\s+\d{2}/\d{2}/\d{4}", lookahead)
                or "mon cher confrère" in lookahead
                or "cher confrère" in lookahead
                or "chère consœur" in lookahead
                or "compte rendu" in lookahead):
            crh_starts.append(pos)

    if len(crh_starts) <= 1:
        return [text]

    logger.info(" CRH multi-documents détecté : %d CRH", len(crh_starts))

    chunks: list[str] = []
    for i, start in enumerate(crh_starts):
        end = crh_starts[i + 1] if i + 1 < len(crh_starts) else len(text)
        chunks.append(text[start:end].rstrip())

    return chunks
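A minimal sketch of the Trackare branch (the content is made up; only the marker lines matter to the splitter):

two_episodes = (
    "Détails des patients\nNom et Prénom: X Y\n"
    "Détails épisode\nEpisode No: 111\n...\n"
    "Détails épisode\nEpisode No: 222\n...\n"
)
chunks = split_documents(two_episodes, "trackare")
assert len(chunks) == 2
assert chunks[1].startswith("Détails des patients")  # patient block re-prefixed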
91
t2a_install_rag_cleanup/src/extraction/page_tracker.py
Normal file
@@ -0,0 +1,91 @@
"""Tracking of source pages for diagnosis traceability.

Makes it possible to recover the page of origin and the matching text
excerpt for a diagnosis extracted from the PDF.
"""

from __future__ import annotations

from typing import Optional


class PageTracker:
    """Maps each character position to its source page number.

    Args:
        page_offsets: List of (start, end) tuples for each page (0-indexed in the list).
    """

    def __init__(self, page_offsets: list[tuple[int, int]]):
        self._offsets = page_offsets

    def char_to_page(self, char_pos: int) -> int:
        """Returns the page number (1-indexed) for a character position."""
        for i, (start, end) in enumerate(self._offsets):
            if start <= char_pos < end:
                return i + 1
        # Beyond the last page, return the last one
        if self._offsets:
            return len(self._offsets)
        return 1

    def find_page_for_text(self, text: str, full_text: str) -> Optional[int]:
        """Searches for text in full_text and returns the page (1-indexed).

        Falls back to a case-insensitive search if the exact search fails.
        """
        if not text or not full_text:
            return None

        # Exact search
        pos = full_text.find(text)
        if pos >= 0:
            return self.char_to_page(pos)

        # Case-insensitive search
        pos = full_text.lower().find(text.lower())
        if pos >= 0:
            return self.char_to_page(pos)

        # Partial search (first 50 chars)
        short = text[:50].strip()
        if len(short) >= 10:
            pos = full_text.lower().find(short.lower())
            if pos >= 0:
                return self.char_to_page(pos)

        return None

    def extract_excerpt(
        self, text: str, full_text: str, context_chars: int = 100,
    ) -> Optional[str]:
        """Extracts the context around the matched text (~200 chars).

        Returns:
            Excerpt with context, or None if the text is not found.
        """
        if not text or not full_text:
            return None

        # Search (exact, then case-insensitive)
        pos = full_text.find(text)
        if pos < 0:
            pos = full_text.lower().find(text.lower())
        if pos < 0:
            short = text[:50].strip()
            if len(short) >= 10:
                pos = full_text.lower().find(short.lower())
        if pos < 0:
            return None

        start = max(0, pos - context_chars)
        end = min(len(full_text), pos + len(text) + context_chars)

        excerpt = full_text[start:end].strip()
        # Add ellipses
        if start > 0:
            excerpt = "..." + excerpt
        if end < len(full_text):
            excerpt = excerpt + "..."

        return excerpt
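A worked example with two pages, "abc" and "defgh", joined by "\n\n" (these offsets match what extract_text_with_pages, later in this commit, would produce):

tracker = PageTracker([(0, 3), (5, 10)])
assert tracker.char_to_page(1) == 1
assert tracker.char_to_page(7) == 2
assert tracker.char_to_page(4) == 2  # separator chars miss both ranges and fall back to the last page
assert tracker.find_page_for_text("def", "abc\n\ndefgh") == 2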
66
t2a_install_rag_cleanup/src/extraction/pdf_extractor.py
Normal file
@@ -0,0 +1,66 @@
"""Text and table extraction from PDFs via pdfplumber."""

from __future__ import annotations

from pathlib import Path

import pdfplumber

from .page_tracker import PageTracker


def extract_text(pdf_path: str | Path) -> str:
    """Extracts the text of every page of a PDF."""
    pages_text: list[str] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            pages_text.append(text)
    return "\n\n".join(pages_text)


def extract_text_with_pages(pdf_path: str | Path) -> tuple[str, PageTracker]:
    """Extracts the text along with a page tracker for traceability.

    Returns:
        (full_text, page_tracker), where page_tracker can recover
        the source page of each character position.
    """
    pages_text: list[str] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            pages_text.append(text)

    # Build the full text with "\n\n" as separator (identical to extract_text)
    separator = "\n\n"
    page_offsets: list[tuple[int, int]] = []
    offset = 0
    for page_text in pages_text:
        start = offset
        end = offset + len(page_text)
        page_offsets.append((start, end))
        offset = end + len(separator)

    full_text = separator.join(pages_text)
    return full_text, PageTracker(page_offsets)


def extract_pages(pdf_path: str | Path) -> list[str]:
    """Extracts the text page by page."""
    pages: list[str] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            pages.append(page.extract_text() or "")
    return pages


def extract_tables(pdf_path: str | Path) -> list[list[list[str | None]]]:
    """Extracts every table detected in the PDF."""
    all_tables: list[list[list[str | None]]] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables() or []
            all_tables.extend(tables)
    return all_tables
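The offset arithmetic can be checked without a PDF; a sketch mirroring the loop above:

pages_text = ["abc", "defgh"]
separator = "\n\n"
page_offsets, offset = [], 0
for page_text in pages_text:
    page_offsets.append((offset, offset + len(page_text)))
    offset += len(page_text) + len(separator)
# page_offsets == [(0, 3), (5, 10)] and separator.join(pages_text) == "abc\n\ndefgh"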
424
t2a_install_rag_cleanup/src/extraction/trackare_parser.py
Normal file
@@ -0,0 +1,424 @@
"""Parsing of Trackare exports (full patient record)."""

from __future__ import annotations

import re

from src.medical.das_filter import clean_diagnostic_text, is_valid_diagnostic_text


def parse_trackare(text: str) -> dict:
    """Parses a Trackare export and returns its structured sections."""
    result: dict = {
        "type": "trackare",
        "patient": {},
        "sejour": {},
        "contacts": [],
        "medecins": [],
        "urgences": {},
        "observations_medicales": [],
        "notes_paramedicales": [],
        "signes_vitaux": {},
        "diagnostics": [],
        "traitements": [],
        "contenu_medical": "",
    }

    _extract_patient_info(text, result)
    _extract_sejour_info(text, result)
    _extract_contacts(text, result)
    _extract_medecins(text, result)
    _extract_urgences(text, result)
    _extract_observations(text, result)
    _extract_notes_param(text, result)
    _extract_diagnostics(text, result)
    _extract_traitements(text, result)
    _extract_vitals(text, result)
    _build_medical_content(result)

    return result


def _extract_patient_info(text: str, result: dict) -> None:
    """Extracts the information from the 'Détails des patients' block."""
    # Birth name
    m = re.search(r"Nom de naissance:\s*(\S+)", text)
    if m:
        result["patient"]["nom_naissance"] = m.group(1).strip()

    # Full name
    m = re.search(r"Nom et Prénom:\s*(.+?)(?:\s+Date de naissance|\n)", text)
    if m:
        result["patient"]["nom_prenom"] = m.group(1).strip()

    # IPP
    m = re.search(r"IPP:\s*(\d+)", text)
    if m:
        result["patient"]["ipp"] = m.group(1)

    # Date of birth
    m = re.search(r"Date de naissance:\s*(\d{2}/\d{2}/\d{4})", text)
    if m:
        result["patient"]["date_naissance"] = m.group(1)

    # Sex
    m = re.search(r"Sexe:\s*(\S+)", text)
    if m:
        sexe_raw = m.group(1).strip().lower()
        result["patient"]["sexe"] = "F" if "fém" in sexe_raw else "M"

    # Place of birth
    m = re.search(r"Lieu de naissance:\s*(.+?)(?:\n|$)", text)
    if m:
        result["patient"]["lieu_naissance"] = m.group(1).strip()

    # Address
    m = re.search(r"Adresse:\s*(.+?)(?:\s+Ville de résidence|\n)", text)
    if m:
        result["patient"]["adresse"] = m.group(1).strip()

    # Postcode and city
    m = re.search(r"Code Postal:\s*(\d{5})", text)
    if m:
        result["patient"]["code_postal"] = m.group(1)
    m = re.search(r"Ville de résidence:\s*(.+?)(?:\n|$)", text)
    if m:
        result["patient"]["ville"] = m.group(1).strip()

    # Height, weight, BMI (footer)
    m = re.search(r"Taille:\s*(\d+)\s*cm", text)
    if m:
        result["patient"]["taille_cm"] = int(m.group(1))
    m = re.search(r"Poids:\s*([\d.]+)\s*kg", text)
    if m:
        result["patient"]["poids_kg"] = float(m.group(1))
    m = re.search(r"IMC:\s*([\d.]+)", text)
    if m:
        result["patient"]["imc"] = float(m.group(1))


def _extract_sejour_info(text: str, result: dict) -> None:
    """Extracts the episode details."""
    m = re.search(r"Episode No:\s*(\d+)", text)
    if m:
        result["sejour"]["episode"] = m.group(1)

    m = re.search(r"Date d'admission:\s*(\d{2}/\d{2}/\d{4})", text)
    if m:
        result["sejour"]["date_entree"] = m.group(1)

    m = re.search(r"Heure d'admission:\s*(\d{2}:\d{2})", text)
    if m:
        result["sejour"]["heure_entree"] = m.group(1)

    m = re.search(r"Date de sortie:\s*(\d{2}/\d{2}/\d{4})", text)
    if m:
        result["sejour"]["date_sortie"] = m.group(1)

    m = re.search(r"Heure de sortie:\s*(\d{2}:\d{2})", text)
    if m:
        result["sejour"]["heure_sortie"] = m.group(1)

    m = re.search(r"Localisation:\s*(.+?)(?:\s+Médecin courant|\n)", text)
    if m:
        result["sejour"]["service"] = m.group(1).strip()

    m = re.search(r"Médecin courant:\s*(.+?)(?:\n|$)", text)
    if m:
        result["sejour"]["medecin_courant"] = m.group(1).strip()


def _extract_contacts(text: str, result: dict) -> None:
    """Extracts the contact list."""
    # "Liste des contacts" block
    contact_block = re.search(
        r"Liste des contacts\n(.*?)(?=Passage aux Urgences|Signes Vitaux|Observations médicales)",
        text,
        re.DOTALL,
    )
    if not contact_block:
        return

    block = contact_block.group(1)
    # Each contact line contains relationship, last name, first name, phone
    for line in block.split("\n"):
        line = line.strip()
        if not line or line.startswith("Type de contact") or line.startswith("Tél"):
            continue
        # Look for names and phone numbers
        tel_match = re.search(r"(\d{2}[.\-\s]\d{2}[.\-\s]\d{2}[.\-\s]\d{2}[.\-\s]\d{2})", line)
        if tel_match or re.search(r"(?:Epoux|Époux|Épouse|Conjoint|Père|Mère|Fils|Fille|Frère|Soeur)", line, re.IGNORECASE):
            result["contacts"].append(line)


def _extract_medecins(text: str, result: dict) -> None:
    """Extracts the names of physicians/caregivers."""
    seen: set[str] = set()

    def _add(name: str) -> None:
        name = _clean_person_name(name)
        if name and len(name) > 2 and name.lower() not in seen:
            seen.add(name.lower())
            result["medecins"].append(name)

    # "DR. Prénom NOM" or "Dr NOM" or "Docteur NOM Prénom"
    for m in re.finditer(
        r"(?:DR\.?|Dr\.?|Docteur)\s+([A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+){0,2})",
        text,
    ):
        _add(m.group(1))

    # Observation authors: "Note d'évolution NOM Prénom DD/MM/YYYY"
    # or multi-line "Note IDE Prénom\nNOM DD/MM/YYYY"
    for m in re.finditer(
        r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
        r"(?:DR\.?\s+)?"
        r"([A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:[\s\n]+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+)*?)"
        r"\s+\d{2}/\d{2}/\d{4}",
        text,
    ):
        _add(m.group(1))

    # Physician in charge / medical decision
    for m in re.finditer(
        r"(?:Médecin de (?:la )?(?:prise en charge|décision)\s+médicale)\s+"
        r"([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+){0,2})",
        text,
    ):
        _add(m.group(1))

    # IAO NOM Prénom
    for m in re.finditer(
        r"IAO\s+([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+){0,2})",
        text,
    ):
        _add(m.group(1))

    # First name alone on the line before "DD/MM/YYYY...Note IDE...\nNOM HH:MM"
    # E.g. "Argitxu 02/03/2023\nNote IDE ...\nHIRIGOYEN 14:05"
    # or "Stephanie 27/02/2023 TDM fait et à voir\nNote IDE\nCONSTANTIN 08:54"
    for m in re.finditer(
        r"([A-ZÉÈÊËÀÂ][a-zéèêëàâäùûüôöîïç]+)\s+\d{2}/\d{2}/\d{4}[^\n]*\n\s*Note IDE[^\n]*\n\s*([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+)\s+\d{2}:\d{2}",
        text,
    ):
        prenom = m.group(1)
        nom = m.group(2)
        _add(f"{prenom} {nom}")


# Words that are not person names
_NOT_NAMES = {
    "non", "pas", "une", "des", "les", "par", "sur", "pour", "dans",
    "avec", "sans", "qui", "que", "est", "sont", "date", "heure",
    "cholecystectomie", "cholécystectomie", "cholangiographie",
    "complication", "vasculaire", "nécessaire", "donc", "note",
    "douleurs", "absence", "douleur", "lotissement", "priorité",
    "prescriptions", "technique", "alimentaire", "signé", "réalisé",
    "selles", "covid", "devenir", "algique", "normal", "regime",
    "reprise", "biprofenid", "orale", "gelule", "comprime",
    "glyc", "inj", "lipase", "protéines", "ionogramme",
    "créatinine", "glucose", "num", "crp", "ta", "bilirubine",
    "tp", "tca", "bh", "bs", "sortie", "transfert",
}


def _clean_person_name(raw: str) -> str:
    """Cleans an extracted name by stripping stray text."""
    name = re.sub(r"\n+", " ", raw).strip()
    parts = name.split()
    clean: list[str] = []
    for part in parts:
        p = part.strip(".-")
        if not p:
            continue
        if p.lower() in _NOT_NAMES:
            break
        # A name word starts with a capital letter
        if re.match(r"^[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ]", p):
            clean.append(p)
        else:
            break
    result = " ".join(clean).strip()
    # Reject a single 1-2 letter word (an initial)
    if len(result) <= 2:
        return ""
    return result


def _extract_urgences(text: str, result: dict) -> None:
    """Extracts the emergency department visit data."""
    urg_block = re.search(
        r"Passage aux Urgences\n(.*?)(?=Signes Vitaux|Observations médicales|Antécédents)",
        text,
        re.DOTALL,
    )
    if not urg_block:
        return

    block = urg_block.group(1)

    m = re.search(r"Mode de transport.*?:\s*(.+)", block)
    if m:
        result["urgences"]["mode_transport"] = m.group(1).strip()

    m = re.search(r"Mode d'entrée\s+(.+)", block)
    if m:
        result["urgences"]["mode_entree"] = m.group(1).strip()

    m = re.search(r"Priorité\s+(Priorité \d)", block)
    if m:
        result["urgences"]["priorite"] = m.group(1)

    # Reasons for care
    motifs = re.findall(
        r"Motif de prise en charge\s+(.+?)(?=\n(?:Observ\.|Médecin|Date|IAO))",
        block,
        re.DOTALL,
    )
    if motifs:
        result["urgences"]["motifs"] = [
            line.strip()
            for motif in motifs
            for line in motif.split("\n")
            if line.strip()
        ]


def _extract_observations(text: str, result: dict) -> None:
    """Extracts the medical observations."""
    obs_block = re.search(
        r"Observations médicales\n(.*?)(?=Notes paramédicales|Surveillance Psychiatrie|Traitements médicamenteux|$)",
        text,
        re.DOTALL,
    )
    if not obs_block:
        return

    block = obs_block.group(1)

    # Split by observation type
    entries = re.split(
        r"(Note d'évolution|Conclusion Obs\.\s*médicales|Histoire de la maladie)",
        block,
    )

    i = 1
    while i < len(entries) - 1:
        obs_type = entries[i].strip()
        content = entries[i + 1].strip()

        # Extract the author and date
        m = re.match(
            r"(?:DR\.?\s+)?([A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+)*)\s+(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s*(.*)",
            content,
            re.DOTALL,
        )
        if m:
            result["observations_medicales"].append({
                "type": obs_type,
                "auteur": m.group(1).strip(),
                "date": m.group(2),
                "heure": m.group(3),
                "contenu": m.group(4).strip(),
            })
        else:
            result["observations_medicales"].append({
                "type": obs_type,
                "contenu": content,
            })
        i += 2


def _extract_notes_param(text: str, result: dict) -> None:
    """Extracts the paramedical notes."""
    notes_block = re.search(
        r"Notes paramédicales\n(.*?)(?=Traitements médicamenteux|Surveillance|$)",
        text,
        re.DOTALL,
    )
    if not notes_block:
        return

    block = notes_block.group(1)
    for m in re.finditer(
        r"Note IDE\s+([A-Za-zéèêëàâäùûüôöîïçÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\.\-\s]+?)\s+(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s+(.*?)(?=Note IDE|$)",
        block,
        re.DOTALL,
    ):
        result["notes_paramedicales"].append({
            "auteur": m.group(1).strip(),
            "date": m.group(2),
            "heure": m.group(3),
            "contenu": m.group(4).strip(),
        })


def _extract_diagnostics(text: str, result: dict) -> None:
    """Extracts the coded diagnoses."""
    # "Principal actif CODE DESCRIPTION"
    for m in re.finditer(
        r"(Principal|Associé|Significatif)\s+(actif|inactif)\s+([A-Z]\d{2}(?:\.\d{1,2})?)\s+(.+?)(?:\s+\[.*?\])?\s+\d{2}/\d{2}/\d{4}",
        text,
    ):
        libelle = clean_diagnostic_text(m.group(4).strip())
        if not is_valid_diagnostic_text(libelle):
            continue
        result["diagnostics"].append({
            "type": m.group(1),
            "statut": m.group(2),
            "code_cim10": m.group(3),
            "libelle": libelle,
        })


def _extract_traitements(text: str, result: dict) -> None:
    """Extracts the drug treatments."""
    ttt_block = re.search(
        r"Traitements médicamenteux\n(.*?)$",
        text,
        re.DOTALL,
    )
    if not ttt_block:
        return

    block = ttt_block.group(1)
    # Look for drug names (in capitals)
    for m in re.finditer(
        r"([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂ0-9\s\-/%.,'`]+(?:MG|ML|SOL|INJ|CPR|GEL|AMP|POCHE)[A-ZÉÈÊËÀÂ0-9\s\-/%.,'`\(\)\[\]]*)\s+([\d\s]+\s*(?:mg|G|GEL|CPR|AMP|ML)?)\s*[-–]\s*(.+?)(?=\n[A-Z]|\Z)",
        block,
        re.DOTALL,
    ):
        result["traitements"].append({
            "medicament": m.group(1).strip(),
            "dose": m.group(2).strip(),
            "frequence": m.group(3).strip().split("\n")[0],
        })


def _extract_vitals(text: str, result: dict) -> None:
    """Extracts the key anthropometric data."""
    m = re.search(r"Taille \[cm\]\s+([\d.]+)", text)
    if m:
        result["signes_vitaux"]["taille_cm"] = float(m.group(1))
    m = re.search(r"Poids \[kg\]\s+([\d.]+)", text)
    if m:
        result["signes_vitaux"]["poids_kg"] = float(m.group(1))
    m = re.search(r"Indice\s*\n?\s*de masse\s+([\d.]+)", text)
    if m:
        result["signes_vitaux"]["imc"] = float(m.group(1))


def _build_medical_content(result: dict) -> None:
    """Builds the full medical text from the observations."""
    parts: list[str] = []

    if result["urgences"].get("motifs"):
        parts.append("Motifs: " + ", ".join(result["urgences"]["motifs"]))

    for obs in result["observations_medicales"]:
        parts.append(obs.get("contenu", ""))

    for note in result["notes_paramedicales"]:
        parts.append(note.get("contenu", ""))

    result["contenu_medical"] = "\n\n".join(parts)
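A usage sketch on a synthetic fragment (the content is made up; running it assumes the src package and its das_filter dependency are importable):

sample = (
    "Détails des patients\n"
    "Nom et Prénom: DURAND Paul Date de naissance: 01/01/1960\n"
    "IPP: 12345678\n"
    "Sexe: Masculin\n"
    "Détails épisode\n"
    "Episode No: 23100690\n"
    "Date d'admission: 02/03/2023\n"
)
parsed = parse_trackare(sample)
# parsed["patient"] includes nom_prenom="DURAND Paul", ipp="12345678", sexe="M"
# parsed["sejour"] includes episode="23100690", date_entree="02/03/2023"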
469
t2a_install_rag_cleanup/src/main.py
Normal file
@@ -0,0 +1,469 @@
"""CLI + orchestrateur du pipeline d'anonymisation et extraction CIM-10."""

from __future__ import annotations

import argparse
import logging
import sys
import time
from pathlib import Path

from .anonymization.anonymizer import Anonymizer
from .config import ANONYMIZED_DIR, INPUT_DIR, OUTPUT_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical, VetoReport
from .extraction.document_classifier import classify
from .extraction.crh_parser import parse_crh
from .extraction.document_splitter import split_documents
from .extraction.pdf_extractor import extract_text_with_pages
from .extraction.trackare_parser import parse_trackare
from .medical.cim10_extractor import extract_medical_info
from .medical.ghm import estimate_ghm
from .quality.veto_engine import apply_vetos
from .quality.decision_engine import apply_decisions, decision_summaries

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)


def _inject_veto_alerts(dossier: DossierMedical, veto: VetoReport, scope: str = "FINAL") -> None:
    """Injecte les alertes liées aux vetos dans alertes_codage en évitant les doublons.

    On *remplace* la section VETO précédente (qu'elle vienne d'un PDF individuel ou d'une passe de fusion),
    afin que le JSON fusionné reste lisible.
    """
    cleaned: list[str] = []
    for line in (dossier.alertes_codage or []):
        if isinstance(line, str) and (line.startswith("VETOS:") or line.startswith("VETOS[") or line.startswith("VETO-")):
            continue
        cleaned.append(line)
    dossier.alertes_codage = cleaned

    if veto.verdict != "PASS":
        dossier.alertes_codage.append(f"VETOS[{scope}]: {veto.verdict} (score={veto.score_contestabilite})")
        for it in veto.issues[:25]:
            dossier.alertes_codage.append(f"{it.veto} [{it.severity}] {it.where}: {it.message}")


def _inject_decision_alerts(dossier: DossierMedical, scope: str = "FINAL") -> None:
    """Injecte les décisions (downgrade/suppression) dans alertes_codage.

    On remplace la section DECISION précédente pour garder un JSON lisible.
    """
    cleaned: list[str] = []
    for line in (dossier.alertes_codage or []):
        if isinstance(line, str) and (line.startswith("DECISION:") or line.startswith("DECISIONS[")):
            continue
        cleaned.append(line)
    dossier.alertes_codage = cleaned

    lines = decision_summaries(dossier)
    if lines:
        dossier.alertes_codage.append(f"DECISIONS[{scope}]: {len(lines)} ligne(s)")
        dossier.alertes_codage.extend(lines[:30])


# Flags globaux
_use_edsnlp = True
_use_rag = True


def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, AnonymizationReport]]:
    """Traite un PDF : extraction → splitting → parsing → anonymisation → extraction CIM-10.

    Retourne une liste de (texte_anonymisé, dossier, rapport) — un par dossier détecté.
    """
    t0 = time.time()
    logger.info("Traitement de %s", pdf_path.name)

    # 1. Extraction texte avec pages
    raw_text, page_tracker = extract_text_with_pages(pdf_path)
    logger.info(" Texte extrait : %d caractères", len(raw_text))

    # 2. Classification
    doc_type = classify(raw_text)
    logger.info(" Type de document : %s", doc_type)

    # 3. Splitting multi-dossiers
    chunks = split_documents(raw_text, doc_type)
    if len(chunks) > 1:
        logger.info(" Découpage : %d dossiers détectés dans %s", len(chunks), pdf_path.name)

    results: list[tuple[str, DossierMedical, AnonymizationReport]] = []
    for i, chunk_text in enumerate(chunks):
        part_label = f" [part {i+1}/{len(chunks)}]" if len(chunks) > 1 else ""
        logger.info(" Traitement%s...", part_label)

        # 4. Parsing
        if doc_type == "trackare":
            parsed = parse_trackare(chunk_text)
        else:
            parsed = parse_crh(chunk_text)

        # 5. Anonymisation
        anonymizer = Anonymizer(parsed_data=parsed)
        anonymized_text = anonymizer.anonymize(chunk_text)
        report = anonymizer.report
        report.source_file = pdf_path.name
        logger.info(
            " Anonymisation%s : %d remplacements (regex=%d, ner=%d, sweep=%d)",
            part_label,
            report.total_replacements,
            report.regex_replacements,
            report.ner_replacements,
            report.sweep_replacements,
        )

        # 6. Analyse edsnlp (optionnelle)
        edsnlp_result = None
        if _use_edsnlp:
            edsnlp_result = _run_edsnlp(anonymized_text)

        # 7. Extraction médicale CIM-10
        dossier = extract_medical_info(
            parsed, anonymized_text, edsnlp_result, use_rag=_use_rag,
            page_tracker=page_tracker, raw_text=raw_text,
        )
        dossier.source_file = pdf_path.name
        dossier.document_type = doc_type
        logger.info(" DP%s : %s", part_label, dossier.diagnostic_principal)
        logger.info(" DAS : %d, Actes : %d", len(dossier.diagnostics_associes), len(dossier.actes_ccam))

        # 8. Estimation GHM
        try:
            ghm = estimate_ghm(dossier)
            dossier.ghm_estimation = ghm
            logger.info(" GHM : CMD=%s, Type=%s, Sévérité=%d → %s",
                        ghm.cmd or "?", ghm.type_ghm or "?",
                        ghm.severite, ghm.ghm_approx or "?")
        except Exception:
            logger.warning(" Erreur estimation GHM", exc_info=True)

        # 9. Vetos (contestabilité)
        try:
            veto = apply_vetos(dossier)
            dossier.veto_report = veto
            apply_decisions(dossier)
            _inject_decision_alerts(dossier, scope="PDF")
            _inject_veto_alerts(dossier, veto, scope="PDF")
        except Exception:
            logger.warning(" Vetos : erreur lors du contrôle", exc_info=True)

        dossier.processing_time_s = round(time.time() - t0, 2)
        results.append((anonymized_text, dossier, report))

    logger.info(" Temps total : %.2fs", time.time() - t0)
    return results
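

# Exemple d'usage indicatif (hors source) :
# >>> from pathlib import Path
# >>> for anonymized_text, dossier, report in process_pdf(Path("input/sejour.pdf")):
# ...     print(dossier.diagnostic_principal, report.total_replacements)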


def _run_edsnlp(text: str):
    """Exécute l'analyse edsnlp avec fallback gracieux."""
    try:
        from .medical.edsnlp_pipeline import analyze, is_available
        if not is_available():
            logger.info(" edsnlp non disponible, utilisation du mode regex seul")
            return None
        result = analyze(text)
        logger.info(
            " edsnlp : %d CIM-10, %d médicaments, %d dates",
            len(result.cim10_entities),
            len(result.drug_entities),
            len(result.date_entities),
        )
        return result
    except Exception:
        logger.warning(" edsnlp : erreur lors de l'analyse, fallback regex", exc_info=True)
        return None


def write_outputs(
    stem: str,
    anonymized_text: str,
    dossier: DossierMedical,
    report: AnonymizationReport,
    subdir: str | None = None,
    export_rum_flag: bool = False,
) -> None:
    """Écrit les fichiers de sortie."""
    anon_dir = ANONYMIZED_DIR / subdir if subdir else ANONYMIZED_DIR
    struct_dir = STRUCTURED_DIR / subdir if subdir else STRUCTURED_DIR
    rep_dir = REPORTS_DIR / subdir if subdir else REPORTS_DIR

    anon_dir.mkdir(parents=True, exist_ok=True)
    struct_dir.mkdir(parents=True, exist_ok=True)
    rep_dir.mkdir(parents=True, exist_ok=True)

    # Texte anonymisé
    anon_path = anon_dir / f"{stem}_anonymized.txt"
    anon_path.write_text(anonymized_text, encoding="utf-8")
    logger.info(" → %s", anon_path)

    # JSON structuré
    json_path = struct_dir / f"{stem}_cim10.json"
    json_path.write_text(
        dossier.model_dump_json(indent=2, exclude_none=True),
        encoding="utf-8",
    )
    logger.info(" → %s", json_path)

    # Rapport d'anonymisation
    report_path = rep_dir / f"{stem}_report.json"
    report_path.write_text(
        report.model_dump_json(indent=2),
        encoding="utf-8",
    )
    logger.info(" → %s", report_path)

    # Export RUM
    if export_rum_flag:
        from .export.rum_export import save_rum
        rum_dir = OUTPUT_DIR / "rum"
        if subdir:
            rum_dir = rum_dir / subdir
        rum_dir.mkdir(parents=True, exist_ok=True)
        rum_path = rum_dir / f"{stem}_rum.txt"
        save_rum(dossier, rum_path)
        logger.info(" → %s", rum_path)


def main(input_path: str | None = None) -> None:
    """Point d'entrée principal."""
    global _use_edsnlp, _use_rag

    parser = argparse.ArgumentParser(
        description="Anonymisation de documents médicaux PDF et extraction CIM-10",
    )
    parser.add_argument(
        "input",
        nargs="*",
        default=[input_path or "input/"],
        help="Chemin(s) vers des PDFs, dossiers patients, ou le dossier racine (défaut: input/)",
    )
    parser.add_argument(
        "--no-ner",
        action="store_true",
        help="Désactiver la phase NER (plus rapide, moins précis)",
    )
    parser.add_argument(
        "--no-edsnlp",
        action="store_true",
        help="Désactiver l'analyse edsnlp (mode regex seul)",
    )
    parser.add_argument(
        "--no-rag",
        action="store_true",
        help="Désactiver l'enrichissement RAG (FAISS + Ollama)",
    )
    parser.add_argument(
        "--build-dict",
        action="store_true",
        help="Générer le dictionnaire CIM-10 depuis metadata.json et quitter",
    )
    parser.add_argument(
        "--build-ccam-dict",
        nargs="?",
        const="CCAM_V81.xls",
        metavar="PATH",
        help="Générer le dictionnaire CCAM depuis un fichier XLS (défaut: CCAM_V81.xls)",
    )
    parser.add_argument(
        "--rebuild-index",
        action="store_true",
        help="Forcer la reconstruction de l'index FAISS",
    )
    parser.add_argument(
        "--export-rum",
        action="store_true",
        help="Exporter les dossiers au format RUM V016 (pour groupeur ATIH)",
    )
    parser.add_argument(
        "--control-cpam",
        metavar="PATH",
        help="Fichier Excel de contrôle CPAM (enrichit les dossiers avec contre-argumentation)",
    )
    args = parser.parse_args()

    if args.build_dict:
        from .medical.cim10_dict import build_dict
        build_dict()
        return

    if args.build_ccam_dict:
        from .medical.ccam_dict import build_dict as build_ccam
        result = build_ccam(args.build_ccam_dict)
        logger.info("Dictionnaire CCAM : %d codes générés", len(result))
        return

    if args.rebuild_index:
        from .medical.rag_index import build_index
        build_index(force=True)
        return

    if args.no_ner:
        # Monkey-patch pour désactiver NER
        from .anonymization import ner_anonymizer
        ner_anonymizer.extract_person_entities = lambda text: []

    if args.no_edsnlp:
        _use_edsnlp = False

    if args.no_rag:
        _use_rag = False

    export_rum_flag = args.export_rum

    # Chargement contrôle CPAM (auto-détection ou flag explicite)
    cpam_data = None
    cpam_path = args.control_cpam
    if not cpam_path:
        # Auto-détection : chercher un .xlsx dans input/Control_cpam/
        cpam_dir = INPUT_DIR / "Control_cpam"
        if cpam_dir.is_dir():
            xlsx_files = sorted(cpam_dir.glob("*.xlsx"))
            if xlsx_files:
                cpam_path = str(xlsx_files[0])
                logger.info("CPAM : fichier détecté automatiquement → %s", cpam_path)
    if cpam_path:
        from .control.cpam_parser import parse_cpam_excel
        cpam_data = parse_cpam_excel(cpam_path)
        if not cpam_data:
            logger.warning("Aucun contrôle CPAM chargé depuis %s", cpam_path)

    input_paths = args.input

    # Collecte des groupes (pdfs, subdir) à traiter
    groups: list[tuple[list[Path], str | None]] = []

    for p in input_paths:
        input_p = Path(p)
        if input_p.is_file():
            # Fichier unique → subdir = nom du dossier parent (si ce n'est pas input/)
            subdir = input_p.parent.name if input_p.parent.name != "input" else None
            groups.append(([input_p], subdir))
        elif input_p.is_dir():
            # Vérifier s'il y a des PDFs directement dans ce dossier
            root_pdfs = sorted(input_p.glob("*.pdf"))
            # Vérifier s'il y a des sous-dossiers avec PDFs
            sub_dirs = [c for c in sorted(input_p.iterdir()) if c.is_dir() and list(c.glob("*.pdf"))]

            if sub_dirs:
                # C'est un dossier racine (comme input/) → traiter chaque sous-dossier
                for child in sub_dirs:
                    sub_pdfs = sorted(child.glob("*.pdf"))
                    groups.append((sub_pdfs, child.name))
            elif root_pdfs:
                # C'est un dossier patient directement → utiliser son nom comme subdir
                groups.append((root_pdfs, input_p.name))
        else:
            logger.error("Chemin introuvable : %s", input_p)
            sys.exit(1)

    total = sum(len(pdfs) for pdfs, _ in groups)
    if total == 0:
        logger.warning("Aucun PDF trouvé dans %s", ", ".join(str(p) for p in input_paths))
        sys.exit(0)

    logger.info("Traitement de %d PDF(s)...", total)

    for pdfs, subdir in groups:
        if subdir:
            logger.info("--- Dossier %s (%d PDFs) ---", subdir, len(pdfs))

        group_dossiers: list[DossierMedical] = []
        for pdf_path in pdfs:
            try:
                pdf_results = process_pdf(pdf_path)
                stem = pdf_path.stem.replace(" ", "_")
                multi = len(pdf_results) > 1
                for part_idx, (anonymized_text, dossier, report) in enumerate(pdf_results):
                    part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
                    write_outputs(part_stem, anonymized_text, dossier, report, subdir=subdir, export_rum_flag=export_rum_flag)
                    group_dossiers.append(dossier)
            except Exception:
                logger.exception("Erreur lors du traitement de %s", pdf_path.name)

        # Fusion multi-PDFs si plusieurs documents dans le même groupe
        merged = None
        if len(group_dossiers) > 1 and subdir:
            try:
                from .medical.fusion import merge_dossiers
                merged = merge_dossiers(group_dossiers)

                # Re-estimer le GHM sur le dossier fusionné (DP/DAS consolidés)
                try:
                    ghm = estimate_ghm(merged)
                    merged.ghm_estimation = ghm
                    logger.info(" GHM fusionné : CMD=%s, Type=%s, Sévérité=%d → %s",
                                ghm.cmd or "?", ghm.type_ghm or "?",
                                ghm.severite, ghm.ghm_approx or "?")
                except Exception:
                    logger.warning(" Erreur estimation GHM fusionné", exc_info=True)

                # Export RUM du dossier fusionné
                if export_rum_flag:
                    from .export.rum_export import save_rum
                    rum_dir = OUTPUT_DIR / "rum" / subdir
                    rum_dir.mkdir(parents=True, exist_ok=True)
                    rum_path = rum_dir / f"{subdir}_fusionne_rum.txt"
                    save_rum(merged, rum_path)
                    logger.info(" → RUM fusionné : %s", rum_path)
            except Exception:
                logger.exception("Erreur lors de la fusion du groupe %s", subdir)
                merged = None

        # Contrôle CPAM : enrichir le dossier principal (fusionné ou dernier)
        if cpam_data and subdir:
            try:
                from .control.cpam_parser import match_dossier_ogc
                controles = match_dossier_ogc(subdir, cpam_data)
                if controles:
                    from .control.cpam_response import generate_cpam_response
                    target = merged if merged else (group_dossiers[-1] if group_dossiers else None)
                    if target:
                        logger.info(" CPAM : %d contrôle(s) pour %s", len(controles), subdir)
                        for ctrl in controles:
                            text, response_data, sources = generate_cpam_response(target, ctrl)
                            ctrl.contre_argumentation = text
                            ctrl.response_data = response_data
                            ctrl.sources_reponse = sources
                        target.controles_cpam = controles
            except Exception:
                logger.exception("Erreur CPAM pour %s", subdir)

        # Écrire le dossier fusionné (après enrichissement CPAM éventuel)
        if merged is not None and subdir:
            try:
                # Vetos sur la version finale (fusion + CPAM)
                try:
                    veto = apply_vetos(merged)
                    merged.veto_report = veto
                    apply_decisions(merged)
                    _inject_decision_alerts(merged, scope="FINAL")
                    _inject_veto_alerts(merged, veto, scope="FINAL")
                except Exception:
                    logger.warning(" Vetos fusionné : erreur lors du contrôle", exc_info=True)

                struct_dir = STRUCTURED_DIR / subdir
                struct_dir.mkdir(parents=True, exist_ok=True)
                merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
                merged_path.write_text(
                    merged.model_dump_json(indent=2, exclude_none=True),
                    encoding="utf-8",
                )
                logger.info(" → Dossier fusionné : %s", merged_path)
            except Exception:
                logger.exception("Erreur écriture dossier fusionné %s", subdir)

    logger.info("Terminé.")


if __name__ == "__main__":
    main()
0
t2a_install_rag_cleanup/src/medical/__init__.py
Normal file
191
t2a_install_rag_cleanup/src/medical/ccam_dict.py
Normal file
@@ -0,0 +1,191 @@
"""Dictionnaire CCAM complet extrait depuis le fichier XLS officiel (CNAM).

Fournit un lookup intelligent avec normalisation Unicode pour la recherche
de codes CCAM à partir de textes d'actes médicaux en français.
"""

from __future__ import annotations

import json
import logging
import re
import unicodedata
from pathlib import Path

from ..config import CCAM_DICT_PATH

logger = logging.getLogger(__name__)

# Singleton : dictionnaire chargé une seule fois
_dict_cache: dict[str, dict] | None = None
# Cache des labels normalisés pour le substring matching
_normalized_cache: list[tuple[str, str, str]] | None = None

_CCAM_CODE_RE = re.compile(r"^[A-Z]{4}\d{3}$")


def normalize_text(text: str) -> str:
    """Normalise un texte : accent folding, lowercase, collapse whitespace."""
    text = text.replace("\u2019", "'").replace("\u2018", "'").replace("\u02BC", "'")
    nfkd = unicodedata.normalize("NFKD", text)
    stripped = "".join(c for c in nfkd if unicodedata.category(c) != "Mn")
    return re.sub(r"\s+", " ", stripped.lower()).strip()
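

# Exemple indicatif (hors source) :
# >>> normalize_text("Pancréatite   Aiguë")
# 'pancreatite aigue'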


def build_dict(source_path: str | Path) -> dict[str, dict]:
    """Construit le dictionnaire CCAM depuis un fichier XLS et l'écrit en JSON.

    Format JSON : {code: {description, activite, tarif_s1, regroupement}}

    Args:
        source_path: Chemin vers le fichier XLS CCAM (ex: CCAM_V81.xls).

    Returns:
        Le dictionnaire code → infos.
    """
    import xlrd

    source_path = Path(source_path)
    if not source_path.exists():
        logger.error("Fichier XLS non trouvé : %s", source_path)
        return {}

    wb = xlrd.open_workbook(str(source_path))
    sheet = wb.sheet_by_index(0)

    result: dict[str, dict] = {}

    for r in range(sheet.nrows):
        code = str(sheet.cell_value(r, 0)).strip()
        if not _CCAM_CODE_RE.match(code):
            continue

        description = str(sheet.cell_value(r, 2)).strip()
        activite_raw = sheet.cell_value(r, 3)
        activite = int(activite_raw) if isinstance(activite_raw, float) else None

        tarif_raw = sheet.cell_value(r, 5)
        tarif_s1 = round(tarif_raw, 2) if isinstance(tarif_raw, (int, float)) else None

        regroupement = str(sheet.cell_value(r, 10)).strip() or None

        result[code] = {
            "description": description,
            "activite": activite,
            "tarif_s1": tarif_s1,
            "regroupement": regroupement,
        }

    # Écrire le fichier JSON
    CCAM_DICT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(CCAM_DICT_PATH, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    logger.info("Dictionnaire CCAM généré : %d codes → %s", len(result), CCAM_DICT_PATH)
    return result


def load_dict() -> dict[str, dict]:
    """Charge le dictionnaire CCAM (singleton lazy-loaded).

    Si le fichier JSON n'existe pas, retourne un dict vide avec un warning.
    """
    global _dict_cache
    if _dict_cache is not None:
        return _dict_cache

    if CCAM_DICT_PATH.exists():
        with open(CCAM_DICT_PATH, encoding="utf-8") as f:
            _dict_cache = json.load(f)
    else:
        logger.warning("Dictionnaire CCAM absent : %s — lancez --build-ccam-dict", CCAM_DICT_PATH)
        _dict_cache = {}

    return _dict_cache


def _get_normalized_entries() -> list[tuple[str, str, str]]:
    """Retourne une liste de (code, description, description_normalisée) triée par longueur."""
    global _normalized_cache
    if _normalized_cache is not None:
        return _normalized_cache

    d = load_dict()
    entries = []
    for code, info in d.items():
        desc = info.get("description", "") if isinstance(info, dict) else str(info)
        norm = normalize_text(desc)
        entries.append((code, desc, norm))

    # Trier par longueur de description décroissante (plus spécifique d'abord)
    entries.sort(key=lambda e: -len(e[2]))
    _normalized_cache = entries
    return _normalized_cache


def lookup(
    text: str,
    domain_overrides: dict[str, str] | None = None,
) -> str | None:
    """Recherche un code CCAM pour un texte donné.

    Stratégie en 3 niveaux :
    1. Match substring dans domain_overrides (prioritaire, ex: CCAM_MAP existant)
    2. Match exact normalisé dans le dictionnaire complet
    3. Match substring normalisé avec scoring par spécificité

    Args:
        text: Le texte de l'acte médical à rechercher.
        domain_overrides: Dictionnaire terme→code prioritaire.

    Returns:
        Le code CCAM trouvé ou None.
    """
    if not text:
        return None

    text_norm = normalize_text(text)

    # Niveau 1 : domain overrides (substring match)
    if domain_overrides:
        for terme, code in domain_overrides.items():
            if normalize_text(terme) in text_norm:
                return code

    entries = _get_normalized_entries()

    # Niveau 2 : match exact normalisé
    for code, _desc, norm_desc in entries:
        if norm_desc == text_norm:
            return code

    # Niveau 3 : substring match normalisé (plus spécifique d'abord)
    for code, _desc, norm_desc in entries:
        if not norm_desc or len(norm_desc) < 4:
            continue
        if norm_desc in text_norm or text_norm in norm_desc:
            return code

    return None
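

# Exemple d'usage indicatif (hors source) — avec un override de domaine
# hypothétique, le niveau 1 court-circuite le dictionnaire complet :
# >>> lookup("appendicectomie par coelioscopie", {"appendicectomie": "HHFA016"})
# 'HHFA016'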


def validate_code(code: str) -> tuple[bool, str]:
    """Vérifie si un code CCAM existe dans le dictionnaire.

    Returns:
        (is_valid, description) — description vide si invalide.
    """
    d = load_dict()
    if code in d:
        info = d[code]
        desc = info.get("description", "") if isinstance(info, dict) else str(info)
        return True, desc
    return False, ""


def reset_cache() -> None:
    """Réinitialise les caches (utile pour les tests)."""
    global _dict_cache, _normalized_cache
    _dict_cache = None
    _normalized_cache = None
122
t2a_install_rag_cleanup/src/medical/ccam_noncumul.py
Normal file
@@ -0,0 +1,122 @@
"""Détection des incompatibilités de non-cumul entre actes CCAM.

Implémente 3 règles heuristiques basées sur les principes T2A :
1. Même code de base (7 caractères) avec activités différentes
2. Même regroupement chirurgical le même jour
3. Paires de regroupements incompatibles connues
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ..config import ActeCCAM

logger = logging.getLogger(__name__)

# Regroupements chirurgicaux soumis à cumul restreint (un seul par jour)
REGROUPEMENT_UNIQUE_PAR_JOUR: set[str] = {
    "ADC",  # Actes de chirurgie
    "ACO",  # Actes de chirurgie orthopédique
    "ADO",  # Actes de chirurgie ORL
    "ADA",  # Actes de chirurgie abdominale/digestive
    "ADE",  # Actes de chirurgie endoscopique
}

# Paires de regroupements incompatibles
NONCUMUL_REGROUPEMENT_PAIRS: set[frozenset[str]] = {
    frozenset({"ADC", "ADE"}),
    frozenset({"ADC", "ADO"}),
    frozenset({"ACO", "ADE"}),
}


def _get_regroupement(acte: ActeCCAM) -> str | None:
    """Récupère le regroupement d'un acte depuis le dictionnaire CCAM."""
    if not acte.code_ccam_suggestion:
        return None
    try:
        from .ccam_dict import load_dict
        d = load_dict()
        info = d.get(acte.code_ccam_suggestion)
        if info and isinstance(info, dict):
            return info.get("regroupement")
    except Exception:
        pass
    return None


def check_noncumul(actes: list[ActeCCAM]) -> list[str]:
    """Vérifie les règles de non-cumul entre actes CCAM.

    Args:
        actes: Liste d'actes CCAM d'un dossier médical.

    Returns:
        Liste d'alertes de non-cumul détectées.
    """
    if len(actes) < 2:
        return []

    alertes: list[str] = []

    # Enrichir les actes avec leur regroupement
    actes_info: list[tuple[ActeCCAM, str | None]] = [
        (acte, _get_regroupement(acte)) for acte in actes
    ]

    # Règle 1 : même code de base (7 premiers caractères), activités différentes
    codes_base: dict[str, list[ActeCCAM]] = {}
    for acte in actes:
        code = acte.code_ccam_suggestion
        if code and len(code) >= 7:
            base = code[:7]
            codes_base.setdefault(base, []).append(acte)

    for base, group in codes_base.items():
        if len(group) > 1:
            codes_full = [a.code_ccam_suggestion for a in group]
            alertes.append(
                f"NON-CUMUL: codes de même base {base} avec variantes "
                f"({', '.join(codes_full)}) — vérifier la facturation"
            )

    # Règle 2 : même regroupement chirurgical le même jour
    regroup_par_jour: dict[tuple[str, str | None], list[ActeCCAM]] = {}
    for acte, regroup in actes_info:
        if regroup and regroup in REGROUPEMENT_UNIQUE_PAR_JOUR:
            key = (regroup, acte.date)
            regroup_par_jour.setdefault(key, []).append(acte)

    for (regroup, date), group in regroup_par_jour.items():
        if len(group) > 1:
            codes = [a.code_ccam_suggestion or "?" for a in group]
            jour = f" le {date}" if date else ""
            alertes.append(
                f"NON-CUMUL: {len(group)} actes du regroupement {regroup}{jour} "
                f"({', '.join(codes)}) — cumul restreint"
            )

    # Règle 3 : paires de regroupements incompatibles (chaque paire i < j
    # n'est visitée qu'une fois, sans set de suivi redondant)
    regroups_seen: list[tuple[str, ActeCCAM]] = [
        (r, a) for a, r in actes_info if r
    ]
    for i, (r1, a1) in enumerate(regroups_seen):
        for r2, a2 in regroups_seen[i + 1:]:
            pair = frozenset({r1, r2})
            if pair in NONCUMUL_REGROUPEMENT_PAIRS:
                alertes.append(
                    f"NON-CUMUL: regroupements incompatibles {r1}/{r2} "
                    f"({a1.code_ccam_suggestion or '?'} + {a2.code_ccam_suggestion or '?'})"
                )

    return alertes
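

# Exemple indicatif (hors source), avec des objets factices minimaux portant
# les deux attributs utilisés (code_ccam_suggestion, date) ; la règle 1 se
# déclenche sur une base identique de 7 caractères :
# >>> from types import SimpleNamespace
# >>> a1 = SimpleNamespace(code_ccam_suggestion="NFKA007", date="01/02/2024")
# >>> a2 = SimpleNamespace(code_ccam_suggestion="NFKA007", date="01/02/2024")
# >>> check_noncumul([a1, a2])[0][:9]
# 'NON-CUMUL'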
243
t2a_install_rag_cleanup/src/medical/cim10_dict.py
Normal file
@@ -0,0 +1,243 @@
"""Dictionnaire CIM-10 complet extrait depuis les métadonnées FAISS.

Fournit un lookup intelligent avec normalisation Unicode pour la recherche
de codes CIM-10 à partir de textes médicaux en français.
"""

from __future__ import annotations

import json
import logging
import re
import unicodedata

from ..config import CIM10_DICT_PATH, CIM10_SUPPLEMENTS_PATH, RAG_INDEX_DIR

logger = logging.getLogger(__name__)

# Singleton : dictionnaire chargé une seule fois
_dict_cache: dict[str, str] | None = None
# Cache des labels normalisés pour le substring matching
_normalized_cache: list[tuple[str, str, str]] | None = None


def normalize_text(text: str) -> str:
    """Normalise un texte : accent folding, lowercase, collapse whitespace.

    Utilise unicodedata pour supprimer les accents (NFKD → suppression des
    combining marks), puis met en minuscules et collapse les espaces multiples.
    """
    # Normaliser les apostrophes Unicode → ASCII
    text = text.replace("\u2019", "'").replace("\u2018", "'").replace("\u02BC", "'")
    # Décomposition NFKD puis suppression des combining marks (accents)
    nfkd = unicodedata.normalize("NFKD", text)
    stripped = "".join(c for c in nfkd if unicodedata.category(c) != "Mn")
    # Lowercase + collapse whitespace
    return re.sub(r"\s+", " ", stripped.lower()).strip()


def build_dict() -> dict[str, str]:
    """Construit le dictionnaire CIM-10 depuis les métadonnées RAG.

    Extrait le code et le label (première ligne de l'extrait, sans le préfixe code)
    depuis chaque entrée CIM-10 du metadata.json existant.

    Returns:
        Le dictionnaire code → label.
    """
    # Nouveau format : metadata_ref.json (fallback legacy : metadata.json)
    metadata_path = RAG_INDEX_DIR / "metadata_ref.json"
    if not metadata_path.exists():
        legacy = RAG_INDEX_DIR / "metadata.json"
        if legacy.exists():
            metadata_path = legacy
        else:
            logger.error("Métadonnées RAG non trouvées : %s", metadata_path)
            return {}

    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    result: dict[str, str] = {}
    for entry in metadata:
        if entry.get("document") != "cim10":
            continue
        code = entry.get("code")
        extrait = entry.get("extrait", "")
        if not code or not extrait:
            continue

        # Extraire le label : première ligne, sans le préfixe "CODE "
        first_line = extrait.split("\n")[0].strip()
        # Retirer le préfixe code (ex: "K85.1 Pancréatite aigüe...")
        prefix = f"{code} "
        if first_line.startswith(prefix):
            label = first_line[len(prefix):]
        else:
            label = first_line

        # Ne pas écraser un label déjà renseigné (on ne remplace que si
        # l'entrée existante est vide)
        if code not in result or not result[code]:
            result[code] = label

    # Écrire le fichier JSON
    CIM10_DICT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(CIM10_DICT_PATH, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    logger.info("Dictionnaire CIM-10 généré : %d codes → %s", len(result), CIM10_DICT_PATH)
    return result


def load_dict() -> dict[str, str]:
    """Charge le dictionnaire CIM-10 (singleton lazy-loaded).

    Si le fichier JSON n'existe pas, tente de le construire depuis metadata.json.
    Fusionne ensuite les suppléments (sous-codes manquants) sans écraser le dict principal.
    """
    global _dict_cache
    if _dict_cache is not None:
        return _dict_cache

    if CIM10_DICT_PATH.exists():
        with open(CIM10_DICT_PATH, encoding="utf-8") as f:
            _dict_cache = json.load(f)
    else:
        logger.info("Dictionnaire CIM-10 absent, construction depuis metadata.json...")
        _dict_cache = build_dict()

    # Fusionner les suppléments (ne remplace pas les entrées existantes)
    if CIM10_SUPPLEMENTS_PATH.exists():
        with open(CIM10_SUPPLEMENTS_PATH, encoding="utf-8") as f:
            supplements = json.load(f)
        added = 0
        for code, label in supplements.items():
            if code not in _dict_cache:
                _dict_cache[code] = label
                added += 1
        if added:
            logger.info("Suppléments CIM-10 : %d codes ajoutés depuis %s", added, CIM10_SUPPLEMENTS_PATH.name)

    return _dict_cache


def _get_normalized_entries() -> list[tuple[str, str, str]]:
    """Retourne une liste de (code, label_original, label_normalisé) triée par spécificité.

    Les codes avec point (sous-codes, plus spécifiques) sont en premier.
    """
    global _normalized_cache
    if _normalized_cache is not None:
        return _normalized_cache

    d = load_dict()
    entries = []
    for code, label in d.items():
        norm = normalize_text(label)
        entries.append((code, label, norm))

    # Trier : sous-codes (avec point) d'abord, puis par longueur de label décroissante
    # pour préférer les matchs les plus spécifiques
    entries.sort(key=lambda e: (0 if "." in e[0] else 1, -len(e[2])))
    _normalized_cache = entries
    return _normalized_cache


def lookup(
    text: str,
    domain_overrides: dict[str, str] | None = None,
) -> str | None:
    """Recherche un code CIM-10 pour un texte donné.

    Stratégie en 3 niveaux :
    1. Match substring dans domain_overrides (prioritaire, ex: CIM10_MAP existant)
    2. Match exact normalisé dans le dictionnaire complet
    3. Match substring normalisé avec scoring par spécificité (préfère sous-codes)

    Args:
        text: Le texte médical à rechercher.
        domain_overrides: Dictionnaire terme→code prioritaire (ex: CIM10_MAP).

    Returns:
        Le code CIM-10 trouvé ou None.
    """
    if not text:
        return None

    text_norm = normalize_text(text)

    # Niveau 1 : domain overrides (substring match)
    if domain_overrides:
        for terme, code in domain_overrides.items():
            if normalize_text(terme) in text_norm:
                return code

    entries = _get_normalized_entries()

    # Niveau 2 : match exact normalisé (via le cache de labels normalisés)
    for code, _label, norm_label in entries:
        if norm_label == text_norm:
            return code

    # Niveau 3 : substring match normalisé (plus spécifique d'abord)
    for code, _label, norm_label in entries:
        if not norm_label or len(norm_label) < 4:
            continue
        if norm_label in text_norm:
            return code

    return None


def normalize_code(code: str) -> str:
    """Normalise un code CIM-10 : K810 → K81.0, k85.1 → K85.1."""
    code = code.strip().upper()
    # Insérer le point si absent : K810 → K81.0
    if len(code) > 3 and "." not in code:
        code = code[:3] + "." + code[3:]
    return code
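

# Exemples indicatifs (hors source) :
# >>> normalize_code("K810")
# 'K81.0'
# >>> normalize_code("k85.1")
# 'K85.1'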


def validate_code(code: str) -> tuple[bool, str]:
    """Vérifie si un code CIM-10 existe dans le dictionnaire.

    Returns:
        (is_valid, label) — label vide si invalide.
    """
    d = load_dict()
    normalized = normalize_code(code)
    if normalized in d:
        return True, d[normalized]
    # Tenter aussi le code brut (3 caractères sans point)
    raw = code.upper().strip()
    if raw in d:
        return True, d[raw]
    return False, ""


def fallback_parent_code(code: str) -> str | None:
    """Tente de corriger un code invalide en remontant au code parent.

    Le LLM hallucine souvent des sous-codes (.8, .9) sur des codes
    standalone à 3 caractères (ex: D71.9 → D71, R69.8 → R69).

    Returns:
        Le code parent valide, ou None si aucun fallback trouvé.
    """
    normalized = normalize_code(code)
    # Extraire le code parent (3 caractères avant le point)
    if "." in normalized:
        parent = normalized.split(".")[0]
        is_valid, _ = validate_code(parent)
        if is_valid:
            return parent
    return None
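

# Exemple indicatif (hors source) : si "D71" figure au dictionnaire mais pas
# "D71.9", le sous-code halluciné retombe sur le parent :
# >>> fallback_parent_code("D71.9")
# 'D71'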


def reset_cache() -> None:
    """Réinitialise les caches (utile pour les tests)."""
    global _dict_cache, _normalized_cache
    _dict_cache = None
    _normalized_cache = None
1226
t2a_install_rag_cleanup/src/medical/cim10_extractor.py
Normal file
File diff suppressed because it is too large
315
t2a_install_rag_cleanup/src/medical/clinical_context.py
Normal file
@@ -0,0 +1,315 @@
"""Enrichissement du contexte clinique pour les prompts LLM.

Interprète les données brutes (biologie, traitements, séjour) en informations
cliniques structurées pour améliorer la qualité du codage CIM-10.
"""

from __future__ import annotations

from ..config import DossierMedical
from .cim10_extractor import BIO_NORMALS

# Seuils d'interprétation biologique (test → liste de (seuil, direction, interprétation))
# Ordre décroissant : le premier seuil franchi donne l'interprétation
BIO_INTERPRETATIONS: dict[str, list[tuple[float, str, str]]] = {
    "CRP": [
        (100, "high", "syndrome inflammatoire majeur"),
        (20, "high", "syndrome inflammatoire modéré"),
        (5, "high", "syndrome inflammatoire mineur"),
    ],
    "Lipasémie": [
        (180, "high", "pancréatite biologique (>3N)"),
        (60, "high", "élévation modérée de la lipase"),
    ],
    "ASAT": [
        (200, "high", "cytolyse hépatique majeure (>5N)"),
        (80, "high", "cytolyse hépatique modérée (>2N)"),
    ],
    "ALAT": [
        (200, "high", "cytolyse hépatique majeure (>5N)"),
        (80, "high", "cytolyse hépatique modérée (>2N)"),
    ],
    "Bilirubine totale": [
        (50, "high", "ictère franc"),
        (17, "high", "hyperbilirubinémie modérée"),
    ],
    "Hémoglobine": [
        (7, "low", "anémie sévère (transfusion probable)"),
        (10, "low", "anémie modérée"),
    ],
    "Créatinine": [
        (300, "high", "insuffisance rénale sévère"),
        (150, "high", "insuffisance rénale modérée"),
    ],
    "Plaquettes": [
        (50, "low", "thrombopénie sévère"),
        (100, "low", "thrombopénie modérée"),
    ],
    "Leucocytes": [
        (20, "high", "hyperleucocytose majeure (infection, inflammation)"),
        (2, "low", "leucopénie sévère (aplasie, immunodépression)"),
    ],
}

# Médicaments → condition implicite (clé en lowercase)
TREATMENT_INDICATORS: dict[str, str] = {
    "insuline": "diabète insulino-traité",
    "metformine": "diabète type 2",
    "héparine": "anticoagulation (risque thromboembolique)",
    "enoxaparine": "anticoagulation (HBPM)",
    "lovenox": "anticoagulation (HBPM)",
    "warfarine": "anticoagulation au long cours (AVK)",
    "fluindione": "anticoagulation au long cours (AVK)",
    "amoxicilline": "antibiothérapie",
    "ceftriaxone": "antibiothérapie IV",
    "tazocilline": "antibiothérapie large spectre IV",
    "morphine": "analgésie palier 3 (douleur sévère)",
    "oxycodone": "analgésie palier 3 (douleur sévère)",
    "oxygène": "oxygénothérapie (insuffisance respiratoire)",
    "furosémide": "insuffisance cardiaque / rétention hydrique",
    "lasilix": "insuffisance cardiaque / rétention hydrique",
}


def interpret_bio_value(test: str, value_str: str, is_abnormal: bool | None) -> str | None:
    """Retourne l'interprétation clinique d'une valeur bio, ou None si normale."""
    if test not in BIO_INTERPRETATIONS:
        return None

    try:
        val = float(value_str.replace(",", ".").replace(" ", ""))
    except (ValueError, AttributeError):
        return None

    # Si la valeur est normale (pas anormale), pas d'interprétation
    if is_abnormal is False:
        return None

    thresholds = BIO_INTERPRETATIONS[test]
    for seuil, direction, interpretation in thresholds:
        if direction == "high" and val >= seuil:
            return interpretation
        if direction == "low" and val <= seuil:
            return interpretation

    return None
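

# Exemple indicatif (hors source) :
# >>> interpret_bio_value("CRP", "152", True)
# 'syndrome inflammatoire majeur'
# >>> interpret_bio_value("CRP", "3", False)   # valeur signalée normale → None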


def detect_treatment_indicators(traitements: list) -> list[dict]:
    """Retourne les conditions implicites détectées via les traitements.

    Args:
        traitements: Liste d'objets Traitement ou de dicts avec clé 'medicament'.

    Returns:
        Liste de dicts {medicament, condition}.
    """
    results = []
    seen_conditions: set[str] = set()

    for t in traitements:
        med = t.medicament if hasattr(t, "medicament") else t.get("medicament", "")
        med_lower = med.lower().strip()

        for keyword, condition in TREATMENT_INDICATORS.items():
            if keyword in med_lower and condition not in seen_conditions:
                results.append({"medicament": med, "condition": condition})
                seen_conditions.add(condition)
                break

    return results
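

# Exemple indicatif (hors source), avec des dicts simples :
# >>> detect_treatment_indicators([{"medicament": "LOVENOX 4000 UI"}])
# [{'medicament': 'LOVENOX 4000 UI', 'condition': 'anticoagulation (HBPM)'}]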


def detect_severity_markers(dossier: DossierMedical) -> list[str]:
    """Détecte les marqueurs de sévérité globaux."""
    markers = []

    duree = dossier.sejour.duree_sejour
    if duree is not None:
        if duree > 14:
            markers.append(f"séjour prolongé ({duree} jours)")
        elif duree > 7:
            markers.append(f"séjour >7 jours ({duree} jours)")

    age = dossier.sejour.age
    if age is not None:
        if age >= 80:
            markers.append(f"patient très âgé ({age} ans)")
        elif age >= 70:
            markers.append(f"patient âgé ({age} ans)")

    imc = dossier.sejour.imc
    if imc is not None:
        if imc >= 40:
            markers.append(f"obésité morbide (IMC {imc})")
        elif imc >= 30:
            markers.append(f"obésité (IMC {imc})")

    if dossier.complications:
        markers.append(f"{len(dossier.complications)} complication(s)")

    return markers


def build_enriched_context(dossier: DossierMedical) -> dict:
    """Construit le contexte clinique enrichi (appel unique par dossier).

    Returns:
        Dict avec les clés : patient, duree_sejour, antecedents,
        biologie (avec interprétations), imagerie, complications,
        dp_texte, das_codes_existants, interpretations_bio,
        conditions_traitements, marqueurs_severite.
    """
    # Données de base (compatibles avec l'ancien format)
    ctx: dict = {
        "sexe": dossier.sejour.sexe,
        "age": dossier.sejour.age,
        "duree_sejour": dossier.sejour.duree_sejour,
        "imc": dossier.sejour.imc,
        "antecedents": [a.texte for a in dossier.antecedents[:5]],
        "biologie_cle": [(b.test, b.valeur, b.anomalie) for b in dossier.biologie_cle],
        "imagerie": [(i.type, (i.conclusion or "")[:200]) for i in dossier.imagerie],
        "complications": [c.texte for c in dossier.complications],
    }

    # Interprétations biologiques
    interpretations = []
    for b in dossier.biologie_cle:
        interp = interpret_bio_value(b.test, b.valeur or "", b.anomalie)
        if interp:
            # Ajouter l'unité si connue
            unit = ""
            if b.test in ("CRP",):
                unit = " mg/L"
            elif b.test in ("Lipasémie", "ASAT", "ALAT", "GGT", "PAL"):
                unit = " UI/L"
            elif b.test in ("Bilirubine totale", "Créatinine"):
                unit = " µmol/L"
            elif b.test in ("Hémoglobine",):
                unit = " g/dL"
            elif b.test in ("Plaquettes", "Leucocytes"):
                unit = " G/L"
            interpretations.append({
                "test": b.test,
                "valeur": f"{b.valeur}{unit}",
                "interpretation": interp,
            })
    ctx["interpretations_bio"] = interpretations

    # Conditions implicites via traitements
    ctx["conditions_traitements"] = detect_treatment_indicators(dossier.traitements_sortie)

    # Marqueurs de sévérité
    ctx["marqueurs_severite"] = detect_severity_markers(dossier)

    return ctx


def format_enriched_context(context: dict) -> str:
    """Formate le contexte enrichi en texte structuré pour le prompt.

    Inclut les mêmes sections que l'ancien _format_contexte() PLUS :
    interprétations bio, conditions implicites traitements, marqueurs sévérité.
    """
    lines = []

    # Patient
    sexe = context.get("sexe")
    age = context.get("age")
    imc = context.get("imc")
    patient_parts = []
    if sexe:
        patient_parts.append(sexe)
    if age:
        patient_parts.append(f"{age} ans")
    if imc:
        patient_parts.append(f"IMC {imc}")
    if patient_parts:
        lines.append(f"- Patient : {', '.join(str(p) for p in patient_parts)}")

    # Durée de séjour
    duree = context.get("duree_sejour")
    if duree:
        lines.append(f"- Durée séjour : {duree} jours")

    # Antécédents
    antecedents = context.get("antecedents")
    if antecedents:
        lines.append(f"- Antécédents : {', '.join(antecedents[:5])}")

    # Biologie (avec normes)
    biologie = context.get("biologie_cle")
    if biologie:
        bio_parts = []
        for b in biologie:
            test, valeur, anomalie = (
                b if isinstance(b, (list, tuple))
                else (b.get("test"), b.get("valeur"), b.get("anomalie"))
            )
            norme_str = ""
            if test in BIO_NORMALS:
                lo, hi = BIO_NORMALS[test]
                lo_s = int(lo) if lo == int(lo) else lo
                hi_s = int(hi) if hi == int(hi) else hi
                norme_str = f" [N: {lo_s}-{hi_s}]"
            marker = " (\u2191)" if anomalie else ""
            bio_parts.append(f"{test} {valeur}{norme_str}{marker}")
        lines.append(f"- Biologie : {', '.join(bio_parts)}")

    # Imagerie
    imagerie = context.get("imagerie")
    if imagerie:
        for img in imagerie:
            img_type, conclusion = (
                img if isinstance(img, (list, tuple))
                else (img.get("type"), img.get("conclusion"))
            )
            if conclusion:
                lines.append(f"- Imagerie : {img_type} — {conclusion[:200]}")

    # Complications
    complications = context.get("complications")
    if complications:
        lines.append(f"- Complications : {', '.join(complications)}")

    # DP du séjour
    dp_texte = context.get("dp_texte")
    if dp_texte:
        lines.append(f"- DP du séjour : {dp_texte}")

    # DAS déjà codés
    das_codes = context.get("das_codes_existants")
    if das_codes:
        lines.append(f"- DAS déjà codés : {', '.join(das_codes)}")

    # --- Sections enrichies ---

    # Interprétations biologiques
    interpretations = context.get("interpretations_bio", [])
    if interpretations:
        interp_parts = [
            f"{i['test']} {i['valeur']} \u2192 {i['interpretation']}"
            for i in interpretations
        ]
        lines.append("\nINTERPRÉTATION CLINIQUE :")
        lines.append(f"- Biologie : {' ; '.join(interp_parts)}")

    # Conditions implicites via traitements
    conditions = context.get("conditions_traitements", [])
    if conditions:
        cond_parts = [
            f"{c['medicament']} \u2192 {c['condition']}"
            for c in conditions
        ]
        if not interpretations:
            lines.append("\nINTERPRÉTATION CLINIQUE :")
        lines.append(f"- Traitements indicatifs : {' ; '.join(cond_parts)}")

    # Marqueurs de sévérité
    marqueurs = context.get("marqueurs_severite", [])
    if marqueurs:
        if not interpretations and not conditions:
            lines.append("\nINTERPRÉTATION CLINIQUE :")
        lines.append(f"- Marqueurs de sévérité : {', '.join(marqueurs)}")

    return "\n".join(lines) if lines else "Non précisé"
152
t2a_install_rag_cleanup/src/medical/das_filter.py
Normal file
@@ -0,0 +1,152 @@
"""Filtrage des diagnostics associés parasites (artefacts OCR trackare)."""

import re
from collections import Counter

# Corrections de codes CIM-10 systématiquement mal attribués par le LLM
# D55.9 (anémie enzymatique) est proposé pour "Anémie" non qualifiée → D64.9
CODE_CORRECTIONS: dict[str, dict] = {
    "D55.9": {
        "correct_code": "D64.9",
        "condition_texte": r"^an[ée]mie$",  # uniquement si texte = "Anémie" seul
        "reason": "Anémie non qualifiée → D64.9 (sans précision), pas D55.9 (enzymatique)",
    },
}


def clean_diagnostic_text(text: str) -> str:
    """Nettoie un texte de diagnostic (newlines, ponctuation trailing, espaces)."""
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    text = text.rstrip(",.;:!")
    return text


def is_valid_diagnostic_text(text: str) -> bool:
    """Retourne True si le texte ressemble à un diagnostic médical légitime."""
    t = text.strip()

    # 1. Trop court
    if len(t) < 3:
        return False

    # 2. Chiffres purs (>= 50% de chiffres)
    digits = sum(c.isdigit() for c in t)
    if digits >= len(t) * 0.5:
        return False

    # 3. Lettre + chiffres OCR : "H 51", "À 08", "H\n10", "K 3.6", "B 12,5"
    if re.match(r"^[A-ZÀ-Ú]\s*\d{1,3}([.,]\d+)?$", t):
        return False

    # 4. Mots concaténés et/ou répétés avec espaces : "VentilationVentilation Ventilation..."
    if re.match(r"^([a-zà-ÿ]{3,})(\s*\1)+\s*$", t, re.IGNORECASE):
        return False

    # 5. Mots répétés : tous identiques ("Absence absence", "Anticoagulant anticoagulant")
    # ou ≥ 3 occurrences du même mot
    words = t.lower().split()
    if len(words) >= 2:
        if len(set(words)) == 1:
            return False
        counts = Counter(words)
        if counts.most_common(1)[0][1] >= 3:
            return False

    # 6. Fragments non-médicaux
    if re.match(r"^(De |Du |Des |]\s)", t):
        return False
    if t in {"Isolement", "Pp 500"}:
        return False

    # 7. Ponctuation initiale (artefacts OCR) : ", sans précision"
    if re.match(r'^[,.\-;:!)\]]\s', t):
        return False

    # 8. Pattern "À X.X" / "A X.X" (valeurs numériques OCR)
    if re.match(r'^[ÀA]\s+\d+([.,]\d+)?$', t):
        return False

    # 9. Crochets (artefacts OCR) : "Episode [episode"
    if '[' in t or ']' in t:
        return False

    # 10. Termes de laboratoire isolés (un seul mot ≠ diagnostic)
    _LAB_TERMS = {"hémoglobine", "créatinine", "plaquettes", "leucocytes", "glycémie",
                  "natrémie", "kaliémie", "calcémie", "bilirubine", "albumine",
                  "fibrinogène", "hématocrite", "cétonurie", "glycosurie"}
    if t.lower() in _LAB_TERMS:
        return False

    # 11. Fragments anatomiques courts sans pathologie : "Dans la vessie", "Le rein"
    if re.match(r'^(Dans |La |Le |Les |Au |Aux )', t) and len(t) < 30:
        return False

    # 12. En-têtes de systèmes anatomiques (catégories sans pathologie)
    _ANATOMICAL_HEADERS = {
        "musculaire", "squelettique", "cardiovasculaire", "pulmonaire",
        "neurologique", "digestif", "digestive", "hépatique", "rénal",
        "rénale", "urinaire", "cutané", "cutanée", "articulaire",
        "osseux", "osseuse", "gastrique", "intestinal", "intestinale",
        "cérébral", "thoracique", "abdominal", "abdominale",
    }
    if len(words) == 1 and t.lower() in _ANATOMICAL_HEADERS:
        return False
    # Catégorie + description vague : "Musculaire - masse musculaire"
    if re.match(r'^[A-ZÀ-Ú][a-zà-ÿ]+ - (masse|zone|région|état|bilan)', t, re.IGNORECASE):
        return False

    return True
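

# Exemples indicatifs (hors source) :
# >>> is_valid_diagnostic_text("Pancréatite aiguë biliaire")
# True
# >>> is_valid_diagnostic_text("H 51")          # artefact OCR lettre + chiffres
# False
# >>> is_valid_diagnostic_text("hémoglobine")   # terme de laboratoire isolé
# False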


# Paires de redondance sémantique CIM-10 en PMSI
# Format: (dominated_prefix, dominant_prefixes)
# Si un code commençant par dominated_prefix ET un code commençant par un dominant_prefix
# sont tous deux en DAS, le dominated est supprimé.
SEMANTIC_REDUNDANCIES: list[tuple[str, list[str]]] = [
    # I10 (HTA essentielle) redondant si I11/I12/I13 présent (cardio/néphropathie hypertensive)
    ("I10", ["I11", "I12", "I13"]),
    # N30 (cystite) redondant si N39.0 présent (infection urinaire)
    ("N30", ["N39"]),
    # J18 (pneumonie SAI) redondant si J15/J16 présent (pneumonie spécifique)
    ("J18", ["J15", "J16"]),
]


def apply_semantic_dedup(das_list: list) -> list:
    """Retire les DAS rendus redondants par la présence d'un code plus spécifique.

    Utilise SEMANTIC_REDUNDANCIES pour déterminer les paires dominé/dominant.
    Accepte une liste de Diagnostic (avec attribut cim10_suggestion).
    """
    codes_present = {d.cim10_suggestion for d in das_list if d.cim10_suggestion}
    to_remove: set[str] = set()

    for dominated_prefix, dominant_prefixes in SEMANTIC_REDUNDANCIES:
        dominated_codes = [c for c in codes_present if c.startswith(dominated_prefix)]
        if not dominated_codes:
            continue
        has_dominant = any(
            c.startswith(dp) for c in codes_present for dp in dominant_prefixes
        )
        if has_dominant:
            to_remove.update(dominated_codes)

    if not to_remove:
        return das_list
    return [d for d in das_list if d.cim10_suggestion not in to_remove]


def correct_known_miscodes(code: str, texte: str) -> str | None:
    """Corrige les codes CIM-10 systématiquement mal attribués par le LLM.

    Returns:
        Le code corrigé, ou None si pas de correction nécessaire.
    """
    correction = CODE_CORRECTIONS.get(code)
    if not correction:
        return None
    if re.match(correction["condition_texte"], texte.strip(), re.IGNORECASE):
        return correction["correct_code"]
    return None
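

# Exemple indicatif (hors source) :
# >>> correct_known_miscodes("D55.9", "Anémie")
# 'D64.9'
# >>> correct_known_miscodes("D55.9", "Anémie ferriprive")   # texte qualifié → None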
140
t2a_install_rag_cleanup/src/medical/edsnlp_pipeline.py
Normal file
@@ -0,0 +1,140 @@
"""Pipeline edsnlp pour l'extraction médicale (CIM-10, médicaments, négation)."""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Optional

logger = logging.getLogger(__name__)

_nlp = None
_available = None


@dataclass
class CIM10Entity:
    texte: str
    code: str
    negation: bool = False
    hypothese: bool = False


@dataclass
class DrugEntity:
    texte: str
    code_atc: Optional[str] = None
    negation: bool = False


@dataclass
class DateEntity:
    texte: str
    value: Optional[str] = None


@dataclass
class EdsnlpResult:
    cim10_entities: list[CIM10Entity] = field(default_factory=list)
    drug_entities: list[DrugEntity] = field(default_factory=list)
    date_entities: list[DateEntity] = field(default_factory=list)


def is_available() -> bool:
    """Vérifie si edsnlp est installé et utilisable."""
    global _available
    if _available is not None:
        return _available
    try:
        import edsnlp  # noqa: F401
        _available = True
    except ImportError:
        _available = False
    return _available
def get_pipeline():
|
||||
"""Retourne le pipeline edsnlp (singleton lazy-loaded)."""
|
||||
global _nlp
|
||||
if _nlp is not None:
|
||||
return _nlp
|
||||
|
||||
if not is_available():
|
||||
raise RuntimeError("edsnlp n'est pas installé")
|
||||
|
||||
import edsnlp
|
||||
|
||||
logger.info("Initialisation du pipeline edsnlp...")
|
||||
nlp = edsnlp.blank("eds")
|
||||
|
||||
nlp.add_pipe("eds.normalizer")
|
||||
nlp.add_pipe("eds.sentences")
|
||||
nlp.add_pipe("eds.cim10", config=dict(attr="NORM", term_matcher="simstring"))
|
||||
nlp.add_pipe("eds.drugs", config=dict(attr="NORM", term_matcher="exact"))
|
||||
nlp.add_pipe("eds.negation")
|
||||
nlp.add_pipe("eds.hypothesis")
|
||||
nlp.add_pipe("eds.dates")
|
||||
|
||||
_nlp = nlp
|
||||
logger.info("Pipeline edsnlp initialisé avec succès")
|
||||
return _nlp
|
||||
|
||||
|
||||
def analyze(text: str) -> EdsnlpResult:
|
||||
"""Analyse un texte médical avec edsnlp.
|
||||
|
||||
Retourne les entités CIM-10, médicaments et dates détectées.
|
||||
"""
|
||||
result = EdsnlpResult()
|
||||
|
||||
if not is_available():
|
||||
return result
|
||||
|
||||
try:
|
||||
nlp = get_pipeline()
|
||||
doc = nlp(text)
|
||||
except Exception:
|
||||
logger.exception("Erreur lors de l'analyse edsnlp")
|
||||
return result
|
||||
|
||||
for ent in doc.ents:
|
||||
negation = getattr(ent._, "negation", False) or False
|
||||
hypothese = getattr(ent._, "hypothesis", False) or False
|
||||
|
||||
if ent.label_ == "cim10":
|
||||
code = ent.kb_id_ or ""
|
||||
if code:
|
||||
result.cim10_entities.append(CIM10Entity(
|
||||
texte=ent.text,
|
||||
code=code,
|
||||
negation=negation,
|
||||
hypothese=hypothese,
|
||||
))
|
||||
elif ent.label_ == "drug":
|
||||
code_atc = ent.kb_id_ or None
|
||||
result.drug_entities.append(DrugEntity(
|
||||
texte=ent.text,
|
||||
code_atc=code_atc,
|
||||
negation=negation,
|
||||
))
|
||||
|
||||
# Dates
|
||||
for span in doc.spans.get("dates", []):
|
||||
date_value = None
|
||||
if hasattr(span._, "date"):
|
||||
date_obj = span._.date
|
||||
if date_obj is not None:
|
||||
date_value = str(date_obj)
|
||||
result.date_entities.append(DateEntity(
|
||||
texte=span.text,
|
||||
value=date_value,
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def reset():
|
||||
"""Réinitialise le pipeline (utile pour les tests)."""
|
||||
global _nlp, _available
|
||||
_nlp = None
|
||||
_available = None
|
||||
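A sketch of the intended call pattern, assuming edsnlp and its bundled terminologies are installed (entity labels follow the pipe names added above; which mentions actually match depends on the shipped CIM-10/drug lexicons):

result = analyze("Patient admis pour pneumopathie. Pas de diabète connu.")
for e in result.cim10_entities:
    # Negated mentions ("pas de diabète") come back with negation=True,
    # so callers can drop them before building the DAS list.
    if not e.negation and not e.hypothese:
        print(e.code, e.texte)
for d in result.date_entities:
    print(d.texte, d.value)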
169
t2a_install_rag_cleanup/src/medical/exclusion_rules.py
Normal file
@@ -0,0 +1,169 @@
"""Diagnostic exclusion rules: symptom (Chapter XVIII) vs precise diagnosis.

When a symptom (R00-R99) and a precise diagnosis (Chapters I-XIV, A00-N99)
coexist and the symptom is explained by the precise diagnosis, the symptom
must NOT be coded as a DAS (ATIH non-redundancy rule).
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field


def is_symptom_code(code: str) -> bool:
    """Check whether a CIM-10 code belongs to Chapter XVIII (R00-R99 = symptoms)."""
    if not code:
        return False
    return bool(re.match(r"^R\d{2}", code, re.IGNORECASE))


def is_precise_diagnosis(code: str) -> bool:
    """Check whether a CIM-10 code belongs to Chapters I-XIV (A00-N99)."""
    if not code:
        return False
    return bool(re.match(r"^[A-N]\d{2}", code, re.IGNORECASE))


# Mapping R-code → set of precise codes that exclude the symptom.
# Each R-code is excluded if one of the precise codes (or a code starting with
# one of these roots) is present among the stay's diagnoses.
EXCLUSION_MAP: dict[str, set[str]] = {
    # R10 — Abdominal pain → excluded by precise digestive pathologies
    "R10": {"K35", "K80", "K81", "K83", "K85", "K86", "K56", "K57", "K25", "K26", "K29"},
    "R10.1": {"K80", "K81", "K83"},  # Right hypochondrium pain
    "R10.3": {"K35", "K36", "K37"},  # Hypogastric pain
    "R10.4": {"K35", "K80", "K85", "K56", "K57"},  # Other/unspecified abdominal pain

    # R11 — Nausea and vomiting
    "R11": {"K29", "K80", "K81", "K85", "K56", "K91"},

    # R17 — Jaundice → excluded by hepatobiliary pathologies
    "R17": {"K80", "K83", "K70", "K71", "K72", "K73", "K74", "B15", "B16", "B17", "B18", "B19", "C22"},

    # R50 — Fever → excluded by precise infections
    "R50": {"A41", "J18", "J15", "J13", "J14", "J06", "N10", "N39", "K81", "K83",
            "L03", "T81", "A09", "A04"},
    "R50.9": {"A41", "J18", "J15", "J13", "J14", "N10", "N39", "K81"},

    # R07 — Chest pain → excluded by cardiac/pulmonary pathologies
    "R07": {"I20", "I21", "I22", "I23", "I24", "I25", "I26", "J18", "J93"},
    "R07.4": {"I20", "I21", "I24", "I25"},

    # R06 — Dyspnea → excluded by respiratory/cardiac pathologies
    "R06": {"J18", "J44", "J45", "J96", "I50", "I26"},
    "R06.0": {"J18", "J44", "J45", "J96", "I50", "I26"},

    # R31 — Hematuria → excluded by urological/renal pathologies
    "R31": {"N20", "N13", "C64", "C67", "N02", "N00", "N01"},

    # R04 — Hemoptysis → excluded by pulmonary pathologies
    "R04": {"J18", "C34", "I26", "A16"},

    # R63.4 — Weight loss → excluded by tumours, chronic infections
    "R63.4": {"C15", "C16", "C18", "C19", "C20", "C22", "C25", "C34", "C50",
              "A15", "A16", "B20", "B21", "B22", "B23", "B24", "E46"},

    # R00 — Abnormalities of heart beat → excluded by precise rhythm disorders
    "R00": {"I47", "I48", "I49"},
    "R00.0": {"I47", "I48"},  # Tachycardia
    "R00.1": {"I49.5", "I49.8"},  # Bradycardia
}


def _code_matches(code: str, roots: set[str]) -> bool:
    """Check whether a CIM-10 code starts with one of the given roots."""
    if not code:
        return False
    code_upper = code.upper()
    for root in roots:
        if code_upper.startswith(root.upper()):
            return True
    return False


@dataclass
class ExclusionResult:
    """Result of applying the exclusion rules."""
    cleaned_das: list  # DAS diagnoses kept
    excluded: list  # DAS diagnoses excluded
    warnings: list[str] = field(default_factory=list)


def check_exclusions(dp, das_list: list) -> ExclusionResult:
    """Apply the symptom vs precise diagnosis exclusion rules.

    Args:
        dp: Principal diagnosis (object with a cim10_suggestion attribute).
        das_list: List of associated diagnoses (same type).

    Returns:
        ExclusionResult with the cleaned DAS, the excluded ones, and the warnings.
    """
    # Collect every code of the stay (DP + DAS)
    all_codes: list[str] = []
    if dp and dp.cim10_suggestion:
        all_codes.append(dp.cim10_suggestion)
    for das in das_list:
        if das.cim10_suggestion:
            all_codes.append(das.cim10_suggestion)

    # Identify the precise codes present (Chapters I-XIV)
    precise_codes = [c for c in all_codes if is_precise_diagnosis(c)]

    cleaned = []
    excluded = []
    warnings = []

    for das in das_list:
        code = das.cim10_suggestion
        if not code or not is_symptom_code(code):
            # Non-symptom: always kept
            cleaned.append(das)
            continue

        # Check whether this symptom is excluded by a precise diagnosis
        should_exclude = False
        excluding_code = None

        # Look up EXCLUSION_MAP: exact code first, then the 3-char root
        exclusion_roots = EXCLUSION_MAP.get(code.upper())
        if exclusion_roots is None:
            # Try the 3-character root (e.g. R10.4 → R10)
            root3 = code.upper()[:3]
            exclusion_roots = EXCLUSION_MAP.get(root3)

        if exclusion_roots:
            for precise in precise_codes:
                if _code_matches(precise, exclusion_roots):
                    should_exclude = True
                    excluding_code = precise
                    break

        if should_exclude:
            excluded.append(das)
            warnings.append(
                f"DAS '{das.texte}' ({code}) exclu : symptôme redondant avec "
                f"le diagnostic précis {excluding_code}"
            )
        else:
            cleaned.append(das)

    # Also flag a DP that is a symptom while a precise diagnosis sits in the DAS
    if dp and dp.cim10_suggestion and is_symptom_code(dp.cim10_suggestion):
        dp_code = dp.cim10_suggestion
        exclusion_roots = EXCLUSION_MAP.get(dp_code.upper())
        if exclusion_roots is None:
            exclusion_roots = EXCLUSION_MAP.get(dp_code.upper()[:3])

        if exclusion_roots:
            for precise in precise_codes:
                if _code_matches(precise, exclusion_roots):
                    warnings.append(
                        f"ALERTE DP : le DP '{dp.texte}' ({dp_code}) est un symptôme "
                        f"alors qu'un diagnostic précis {precise} est présent — "
                        f"vérifier si le DP devrait être changé"
                    )
                    break

    return ExclusionResult(cleaned_das=cleaned, excluded=excluded, warnings=warnings)
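A minimal sketch of the intended use, again with a stand-in for the Diagnostic model from src/config.py:

from dataclasses import dataclass

@dataclass
class _Diag:  # stand-in for config.Diagnostic, only for illustration
    texte: str
    cim10_suggestion: str | None = None

dp = _Diag("cholécystite aiguë", "K81.0")
das = [_Diag("fièvre", "R50.9"), _Diag("diabète de type 2", "E11.9")]
res = check_exclusions(dp, das)
# R50.9 is excluded because K81 is among its exclusion roots; E11.9 is kept.
assert [d.cim10_suggestion for d in res.cleaned_das] == ["E11.9"]
assert len(res.excluded) == 1 and res.warnings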
294
t2a_install_rag_cleanup/src/medical/fusion.py
Normal file
@@ -0,0 +1,294 @@
"""Merging of multi-PDF medical records for a single patient.

Combines the information from several documents (Trackare, CRH, CRO) into a
single record, with priority and deduplication rules.
"""

from __future__ import annotations

import logging

from ..config import (
    ActeCCAM,
    Antecedent,
    BiologieCle,
    Complication,
    Diagnostic,
    DossierMedical,
    Imagerie,
    Sejour,
    Traitement,
)
from ..medical.das_filter import is_valid_diagnostic_text, apply_semantic_dedup
from ..medical.cim10_extractor import _is_dp_family_redundant

logger = logging.getLogger(__name__)

# Priority of document types for stay data
_DOC_PRIORITY = {"trackare": 0, "crh": 1, "cro": 2}


def _cim10_specificity(code: str | None) -> int:
    """Specificity score of a CIM-10 code: its length without the dot."""
    if not code:
        return 0
    return len(code.replace(".", ""))


def _prefer_most_specific_dp(dossiers: list[DossierMedical]) -> Diagnostic | None:
    """Select the most specific DP across all records."""
    candidates: list[tuple[Diagnostic, int]] = []
    for d in dossiers:
        if d.diagnostic_principal:
            spec = _cim10_specificity(d.diagnostic_principal.cim10_suggestion)
            candidates.append((d.diagnostic_principal, spec))

    if not candidates:
        return None

    # Sort: decreasing specificity, then confidence (high > medium > low)
    conf_order = {"high": 0, "medium": 1, "low": 2}
    candidates.sort(
        key=lambda x: (-x[1], conf_order.get(x[0].cim10_confidence or "", 3))
    )
    return candidates[0][0]


def _merge_sejour(dossiers: list[DossierMedical]) -> Sejour:
    """Merge stay information with priority Trackare > CRH > CRO."""
    # Sort by document-type priority
    sorted_dossiers = sorted(
        dossiers,
        key=lambda d: _DOC_PRIORITY.get(d.document_type, 99),
    )

    merged = Sejour()
    for d in sorted_dossiers:
        s = d.sejour
        if s.sexe and not merged.sexe:
            merged.sexe = s.sexe
        if s.age is not None and merged.age is None:
            merged.age = s.age
        if s.date_entree and not merged.date_entree:
            merged.date_entree = s.date_entree
        if s.date_sortie and not merged.date_sortie:
            merged.date_sortie = s.date_sortie
        if s.duree_sejour is not None and merged.duree_sejour is None:
            merged.duree_sejour = s.duree_sejour
        if s.mode_entree and not merged.mode_entree:
            merged.mode_entree = s.mode_entree
        if s.mode_sortie and not merged.mode_sortie:
            merged.mode_sortie = s.mode_sortie
        if s.imc is not None and merged.imc is None:
            merged.imc = s.imc
        if s.poids is not None and merged.poids is None:
            merged.poids = s.poids
        if s.taille is not None and merged.taille is None:
            merged.taille = s.taille

    return merged


def _is_enriched(d: Diagnostic) -> bool:
    """Return True if the diagnosis carries a RAG justification."""
    return bool(d.justification or d.sources_rag)


def _dedup_diagnostics(all_das: list[Diagnostic]) -> list[Diagnostic]:
    """Deduplicate associated diagnoses by CIM-10 code, keeping the best confidence."""
    conf_order = {"high": 0, "medium": 1, "low": 2}
    seen: dict[str | None, Diagnostic] = {}

    for d in all_das:
        key = d.cim10_suggestion
        if key is None:
            # Without a code, dedup by normalized text
            key = f"__text__{d.texte.lower().strip()}"

        if key not in seen:
            seen[key] = d
        else:
            existing = seen[key]
            new_conf = conf_order.get(d.cim10_confidence or "", 3)
            old_conf = conf_order.get(existing.cim10_confidence or "", 3)
            # Keep the one with the best confidence; on a tie, prefer the enriched one
            if new_conf < old_conf or (new_conf == old_conf and _is_enriched(d) and not _is_enriched(existing)):
                seen[key] = d

    # Remove parent codes when a more specific code exists,
    # e.g. K85 is dropped if K85.9 is present (K85 is a strict prefix of K859)
    codes = {k for k in seen if k and not k.startswith("__text__")}
    normalized = {c: c.replace(".", "") for c in codes}
    parents_to_remove: set[str] = set()
    for code_a in codes:
        norm_a = normalized[code_a]
        for code_b in codes:
            if code_a == code_b:
                continue
            norm_b = normalized[code_b]
            if norm_b.startswith(norm_a) and len(norm_b) > len(norm_a):
                parents_to_remove.add(code_a)
                break

    for parent in parents_to_remove:
        del seen[parent]

    return list(seen.values())
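In isolation, the parent/child pruning above behaves like this minimal sketch (bare code strings instead of Diagnostic objects):

codes = {"K85", "K85.9", "I10"}
normalized = {c: c.replace(".", "") for c in codes}
parents = {
    a for a in codes
    if any(
        normalized[b].startswith(normalized[a]) and len(normalized[b]) > len(normalized[a])
        for b in codes if b != a
    )
}
# "K859" starts with "K85" and is strictly longer, so the parent "K85" is
# flagged; "I10" has no more specific sibling and survives.
assert parents == {"K85"}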
def _dedup_actes(all_actes: list[ActeCCAM]) -> list[ActeCCAM]:
    """Deduplicate CCAM acts by code."""
    seen: dict[str | None, ActeCCAM] = {}
    for a in all_actes:
        key = a.code_ccam_suggestion
        if key is None:
            key = f"__text__{a.texte.lower().strip()}"

        if key not in seen:
            seen[key] = a
        else:
            existing = seen[key]
            # Prefer the one carrying a date, when available
            if a.date and not existing.date:
                seen[key] = a

    return list(seen.values())


def merge_dossiers(dossiers: list[DossierMedical]) -> DossierMedical:
    """Merge several medical records of a single patient.

    Args:
        dossiers: List of DossierMedical extracted from different PDFs.

    Returns:
        A merged DossierMedical.
    """
    if len(dossiers) == 1:
        result = dossiers[0].model_copy(deep=True)
        result.source_files = [result.source_file]
        # Apply the DP-family + semantic dedup even for a single record
        dp_code = result.diagnostic_principal.cim10_suggestion if result.diagnostic_principal else None
        if dp_code:
            result.diagnostics_associes = [
                d for d in result.diagnostics_associes
                if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code)
            ]
        result.diagnostics_associes = apply_semantic_dedup(result.diagnostics_associes)
        return result

    merged = DossierMedical()

    # Source files
    merged.source_files = [d.source_file for d in dossiers if d.source_file]

    # Stay
    merged.sejour = _merge_sejour(dossiers)

    # Principal diagnosis: the most specific one
    merged.diagnostic_principal = _prefer_most_specific_dp(dossiers)

    # Collect all DAS + the DPs not retained, demoted to DAS
    all_das: list[Diagnostic] = []
    for d in dossiers:
        all_das.extend(d.diagnostics_associes)
        # If this record's DP differs from the merged DP, add it as a DAS,
        # but only if its text is a valid diagnosis (filters OCR artefacts)
        if (
            d.diagnostic_principal
            and merged.diagnostic_principal
            and d.diagnostic_principal.cim10_suggestion
            != merged.diagnostic_principal.cim10_suggestion
            and is_valid_diagnostic_text(d.diagnostic_principal.texte)
        ):
            all_das.append(d.diagnostic_principal)

    merged.diagnostics_associes = _dedup_diagnostics(all_das)

    # Drop DAS redundant with the DP (same code, family, parent/child)
    dp_code = merged.diagnostic_principal.cim10_suggestion if merged.diagnostic_principal else None
    if dp_code:
        merged.diagnostics_associes = [
            d for d in merged.diagnostics_associes
            if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code)
        ]

    # Semantic redundancies between DAS
    merged.diagnostics_associes = apply_semantic_dedup(merged.diagnostics_associes)

    # CCAM acts
    all_actes: list[ActeCCAM] = []
    for d in dossiers:
        all_actes.extend(d.actes_ccam)
    merged.actes_ccam = _dedup_actes(all_actes)

    # Biology: union, dedup by (test, value)
    bio_seen: set[tuple[str, str | None]] = set()
    for d in dossiers:
        for b in d.biologie_cle:
            key = (b.test, b.valeur)
            if key not in bio_seen:
                merged.biologie_cle.append(b)
                bio_seen.add(key)

    # Imaging: union, dedup by (type, conclusion)
    img_seen: set[tuple[str, str | None]] = set()
    for d in dossiers:
        for i in d.imagerie:
            key = (i.type, i.conclusion)
            if key not in img_seen:
                merged.imagerie.append(i)
                img_seen.add(key)

    # Treatments: union, dedup by normalized drug name
    med_seen: set[str] = set()
    for d in dossiers:
        for t in d.traitements_sortie:
            key = t.medicament.lower().strip()
            if key not in med_seen:
                merged.traitements_sortie.append(t)
                med_seen.add(key)

    # Medical history: union, dedup by normalized text
    ant_seen: set[str] = set()
    for d in dossiers:
        for a in d.antecedents:
            key = a.texte.lower().strip()
            if key not in ant_seen:
                merged.antecedents.append(a)
                ant_seen.add(key)

    # Complications: union, dedup by normalized text
    comp_seen: set[str] = set()
    for d in dossiers:
        for c in d.complications:
            key = c.texte.lower().strip()
            if key not in comp_seen:
                merged.complications.append(c)
                comp_seen.add(key)

    # Alerts: merge alert first, then the union
    merged.alertes_codage = [f"FUSION: {len(dossiers)} documents fusionnés"]
    alert_seen: set[str] = set()
    for d in dossiers:
        for a in d.alertes_codage:
            if a not in alert_seen:
                merged.alertes_codage.append(a)
                alert_seen.add(a)

    # Document type: the highest-priority one
    sorted_by_prio = sorted(
        dossiers,
        key=lambda d: _DOC_PRIORITY.get(d.document_type, 99),
    )
    merged.document_type = sorted_by_prio[0].document_type

    logger.info(
        "Fusion de %d dossiers : DP=%s, %d DAS, %d actes",
        len(dossiers),
        merged.diagnostic_principal.cim10_suggestion if merged.diagnostic_principal else "aucun",
        len(merged.diagnostics_associes),
        len(merged.actes_ccam),
    )

    return merged
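A call sketch, assuming the DossierMedical pydantic model accepts its fields as keyword arguments (which the bare DossierMedical() construction above implies); in the real pipeline these records come from the PDF extraction stage, not built by hand:

crh = DossierMedical(document_type="crh", source_file="patient42_crh.pdf")  # illustrative
cro = DossierMedical(document_type="cro", source_file="patient42_cro.pdf")  # illustrative
merged = merge_dossiers([crh, cro])
assert merged.document_type == "crh"  # CRH outranks CRO in _DOC_PRIORITY
assert merged.alertes_codage[0] == "FUSION: 2 documents fusionnés"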
225
t2a_install_rag_cleanup/src/medical/ghm.py
Normal file
@@ -0,0 +1,225 @@
"""Heuristic estimation of the GHM (Groupe Homogène de Malades).

The official algorithm (ATIH FG-MCO) is proprietary. This module provides an
approximate estimate, useful as pre-coding / an aid for the DIM:
1. CMD from the DP (table of CIM-10 ranges)
2. Care type from the CCAM acts
3. Severity from the CMA/CMS
4. Construction of the approximate GHM code
"""

from __future__ import annotations

import bisect
from typing import Optional

from ..config import DossierMedical, GHMEstimation


# ---------------------------------------------------------------------------
# Table CIM-10 → CMD (Catégorie Majeure de Diagnostic)
# Sorted by lower bound for bisect lookup.
# Format: (start, end, cmd, label)
# ---------------------------------------------------------------------------

_CMD_RANGES: list[tuple[str, str, str, str]] = [
    ("A00", "A99", "18", "Maladies infectieuses et parasitaires"),
    ("B00", "B19", "18", "Maladies infectieuses et parasitaires"),
    ("B20", "B24", "25", "Maladies dues au VIH"),
    ("B25", "B99", "18", "Maladies infectieuses et parasitaires"),
    ("C00", "C97", "17", "Tumeurs malignes"),
    ("D00", "D09", "17", "Tumeurs malignes"),
    ("D10", "D48", "16", "Tumeurs bénignes, hémopathies"),
    ("D50", "D89", "16", "Tumeurs bénignes, hémopathies"),
    ("E00", "E07", "10", "Maladies endocriniennes"),
    ("E10", "E14", "10", "Maladies endocriniennes"),
    ("E15", "E46", "10", "Maladies endocriniennes"),
    ("E47", "E90", "10", "Maladies endocriniennes"),
    ("F00", "F09", "19", "Maladies mentales"),
    ("F10", "F19", "20", "Troubles mentaux liés à l'alcool et aux toxiques"),
    ("F20", "F99", "19", "Maladies mentales"),
    ("G00", "G99", "01", "Affections du système nerveux"),
    ("H00", "H59", "02", "Affections de l'oeil"),
    ("H60", "H95", "03", "Affections ORL"),
    ("I00", "I99", "05", "Affections de l'appareil circulatoire"),
    ("J00", "J99", "04", "Affections de l'appareil respiratoire"),
    ("K00", "K67", "06", "Affections du tube digestif"),
    ("K70", "K87", "07", "Affections hépatobiliaires et pancréatiques"),
    ("K90", "K93", "06", "Affections du tube digestif"),
    ("L00", "L99", "09", "Affections de la peau"),
    ("M00", "M99", "08", "Affections du système ostéo-articulaire"),
    ("N00", "N39", "11", "Affections du rein et des voies urinaires"),
    ("N40", "N51", "12", "Affections de l'appareil génital masculin"),
    ("N60", "N98", "13", "Affections de l'appareil génital féminin"),
    ("N99", "N99", "11", "Affections du rein et des voies urinaires"),
    ("O00", "O99", "14", "Grossesses, accouchements, post-partum"),
    ("P00", "P96", "15", "Nouveau-nés, période périnatale"),
    ("Q00", "Q99", "15", "Nouveau-nés, période périnatale"),
    ("R00", "R99", "23", "Facteurs influençant l'état de santé (symptômes)"),
    ("S00", "S99", "21", "Traumatismes"),
    ("T00", "T19", "21", "Traumatismes"),
    ("T20", "T32", "22", "Brûlures"),
    ("T33", "T98", "21", "Traumatismes"),
    ("U00", "U99", "26", "Catégories spéciales"),
    ("V00", "Y98", "24", "Causes externes"),
    ("Z00", "Z99", "23", "Facteurs influençant l'état de santé"),
]

# Precomputed: sorted list of lower bounds for bisect
_CMD_STARTS = [r[0] for r in _CMD_RANGES]


def find_cmd(code_cim10: str) -> tuple[Optional[str], Optional[str]]:
    """Find the CMD matching a CIM-10 code.

    Returns:
        (cmd, label), or (None, None) if not found.
    """
    if not code_cim10:
        return None, None

    # Normalize: uppercase, drop the dot
    code = code_cim10.upper().replace(".", "").strip()
    if len(code) < 3:
        return None, None

    # Use the first 3 characters for the lookup
    code3 = code[:3]

    # bisect to find the candidate range
    idx = bisect.bisect_right(_CMD_STARTS, code3) - 1
    if idx < 0:
        return None, None

    debut, fin, cmd, libelle = _CMD_RANGES[idx]
    if debut <= code3 <= fin:
        return cmd, libelle

    return None, None


# ---------------------------------------------------------------------------
# Classifying (surgical) CCAM prefixes
# CCAM codes starting with these letters map to organ systems and are treated
# as surgical when they describe an operative act.
# ---------------------------------------------------------------------------

_CCAM_CHIRURGICAL_PREFIXES = {"H", "J", "K", "L", "N", "P", "Q"}

# Interventional prefixes (imaging, endoscopy)
_CCAM_INTERVENTIONNEL_PREFIXES = {"Z", "Y"}


def _detect_type_ghm(actes_ccam: list) -> str:
    """Determine the care type from the CCAM acts.

    Returns:
        "C" (surgical), "K" (interventional) or "M" (medical).
    """
    has_chirurgical = False
    has_interventionnel = False

    for acte in actes_ccam:
        code = acte.code_ccam_suggestion
        if not code or len(code) < 4:
            continue

        prefix = code[0].upper()
        if prefix in _CCAM_CHIRURGICAL_PREFIXES:
            has_chirurgical = True
            break
        if prefix in _CCAM_INTERVENTIONNEL_PREFIXES:
            has_interventionnel = True

    if has_chirurgical:
        return "C"
    if has_interventionnel:
        return "K"
    return "M"


def _compute_severity(das_list: list) -> tuple[int, int, int]:
    """Compute the severity level from the DAS.

    Uses the max of the official ATIH niveau_cma values when available,
    with a fallback on CMA/CMS counting.

    Returns:
        (niveau, cma_count, cms_count)
    """
    cma_count = 0
    cms_count = 0
    max_cma_level = 1

    for das in das_list:
        niveau_cma = getattr(das, "niveau_cma", None)
        if niveau_cma and niveau_cma > 1:
            max_cma_level = max(max_cma_level, niveau_cma)
        if getattr(das, "est_cma", False):
            cma_count += 1
        if getattr(das, "est_cms", False):
            cms_count += 1

    # The official ATIH CMA level takes precedence
    if max_cma_level > 1:
        niveau = max_cma_level
    elif cms_count >= 2:
        niveau = 4
    elif cms_count >= 1 or cma_count >= 3:
        niveau = 3
    elif cma_count >= 2:
        niveau = 2
    else:
        niveau = 1

    return niveau, cma_count, cms_count


def estimate_ghm(dossier: DossierMedical) -> GHMEstimation:
    """Estimate the GHM of a medical record.

    Four-step heuristic:
    1. CMD from the DP
    2. Care type from the CCAM acts
    3. Severity from the CMA/CMS
    4. Construction of the approximate code
    """
    estimation = GHMEstimation()

    # 1. CMD from the DP
    dp = dossier.diagnostic_principal
    dp_code = dp.cim10_suggestion if dp else None

    if not dp:
        estimation.alertes.append("DP absent — CMD non déterminable")
    elif not dp_code:
        estimation.alertes.append("DP sans code CIM-10 — CMD non déterminable")
    else:
        cmd, libelle = find_cmd(dp_code)
        if cmd:
            estimation.cmd = cmd
            estimation.cmd_libelle = libelle
        else:
            estimation.alertes.append(f"CMD inconnue pour le code {dp_code}")

        # Symptomatic DP alert
        code_letter = dp_code.upper().replace(".", "").strip()[:1]
        if code_letter in ("R", "Z"):
            estimation.alertes.append(
                f"DP symptomatique ({dp_code}) — risque de CMD 23, impact tarif"
            )

    # 2. Care type
    estimation.type_ghm = _detect_type_ghm(dossier.actes_ccam)

    # 3. Severity
    niveau, cma_count, cms_count = _compute_severity(dossier.diagnostics_associes)
    estimation.severite = niveau
    estimation.cma_count = cma_count
    estimation.cms_count = cms_count

    # 4. Approximate code
    if estimation.cmd and estimation.type_ghm:
        estimation.ghm_approx = f"{estimation.cmd}{estimation.type_ghm}??{estimation.severite}"

    return estimation
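To make the lookup concrete, a few checks against the table above (each code is normalized and truncated to its 3-character root before the bisect):

# "K80.2" → "K80", which falls in the K70–K87 range.
assert find_cmd("K80.2") == ("07", "Affections hépatobiliaires et pancréatiques")
# "I21" falls in I00–I99.
assert find_cmd("I21") == ("05", "Affections de l'appareil circulatoire")
# "D49" lands in the gap between D10–D48 and D50–D89, so nothing matches.
assert find_cmd("D49") == (None, None)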
85
t2a_install_rag_cleanup/src/medical/ollama_cache.py
Normal file
@@ -0,0 +1,85 @@
"""Thread-safe persistent cache for Ollama results."""

from __future__ import annotations

import json
import logging
import threading
from pathlib import Path

logger = logging.getLogger(__name__)


class OllamaCache:
    """Persistent JSON cache to avoid redundant Ollama calls.

    Key = (normalized diagnosis text, type).
    The Ollama model is stored in the metadata: if the model changes,
    the cache is automatically invalidated.
    """

    def __init__(self, cache_path: Path, model: str):
        self._path = cache_path
        self._model = model
        self._lock = threading.Lock()
        self._data: dict[str, dict] = {}
        self._dirty = False
        self._load()

    def _load(self) -> None:
        """Load the cache from disk."""
        if not self._path.exists():
            logger.info("Cache Ollama : nouveau cache (%s)", self._path)
            return
        try:
            raw = json.loads(self._path.read_text(encoding="utf-8"))
            if raw.get("model") != self._model:
                logger.info(
                    "Cache Ollama : modèle changé (%s → %s), cache invalidé",
                    raw.get("model"), self._model,
                )
                return
            self._data = raw.get("entries", {})
            logger.info("Cache Ollama : %d entrées chargées", len(self._data))
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning("Cache Ollama : fichier corrompu (%s), réinitialisé", e)
            self._data = {}

    @staticmethod
    def _make_key(texte: str, diag_type: str) -> str:
        """Build a normalized key."""
        return f"{diag_type}::{texte.strip().lower()}"

    def get(self, texte: str, diag_type: str) -> dict | None:
        """Return a cached result, or None if absent."""
        key = self._make_key(texte, diag_type)
        with self._lock:
            return self._data.get(key)

    def put(self, texte: str, diag_type: str, result: dict) -> None:
        """Store a result in the cache."""
        key = self._make_key(texte, diag_type)
        with self._lock:
            self._data[key] = result
            self._dirty = True

    def save(self) -> None:
        """Persist the cache to disk if modified."""
        with self._lock:
            if not self._dirty:
                return
            self._path.parent.mkdir(parents=True, exist_ok=True)
            payload = {
                "model": self._model,
                "entries": self._data,
            }
            self._path.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            self._dirty = False
            logger.info("Cache Ollama : %d entrées sauvegardées", len(self._data))

    def __len__(self) -> int:
        with self._lock:
            return len(self._data)
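A usage sketch (the cache path and cached payload are illustrative; callers pass whatever location and result dict suit them):

from pathlib import Path

cache = OllamaCache(Path("data/ollama_cache.json"), model="gemma3:12b")  # hypothetical path
cached = cache.get("cholécystite aiguë", "dp")
if cached is None:
    cached = {"code": "K81.0", "confidence": "high"}  # would normally come from the LLM
    cache.put("cholécystite aiguë", "dp", cached)
cache.save()  # writes only if something changed; no-op otherwise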
135
t2a_install_rag_cleanup/src/medical/ollama_client.py
Normal file
@@ -0,0 +1,135 @@
"""Shared LLM client — Ollama (local) with Anthropic Haiku fallback."""

from __future__ import annotations

import json
import logging
import os

import requests

from ..config import OLLAMA_URL, OLLAMA_MODEL, OLLAMA_TIMEOUT

logger = logging.getLogger(__name__)

# --- Anthropic fallback ---
_ANTHROPIC_MODEL = os.environ.get("ANTHROPIC_FALLBACK_MODEL", "claude-haiku-4-5-20251001")
_anthropic_client = None


def _get_anthropic_client():
    """Lazy-init of the Anthropic client (only if an API key is present)."""
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        return None
    try:
        import anthropic
        _anthropic_client = anthropic.Anthropic(api_key=api_key)
        return _anthropic_client
    except Exception as e:
        logger.warning("Anthropic SDK non disponible : %s", e)
        return None


def call_anthropic(
    prompt: str,
    temperature: float = 0.1,
    max_tokens: int = 2500,
) -> dict | None:
    """Call the Anthropic API (Haiku)."""
    client = _get_anthropic_client()
    if client is None:
        return None
    try:
        response = client.messages.create(
            model=_ANTHROPIC_MODEL,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = response.content[0].text
        result = parse_json_response(raw)
        if result is not None:
            logger.debug("Anthropic fallback OK (%s)", _ANTHROPIC_MODEL)
        return result
    except Exception as e:
        logger.warning("Anthropic fallback erreur : %s", e)
        return None


def parse_json_response(raw: str) -> dict | None:
    """Parse a JSON response, handling markdown code fences."""
    text = raw.strip()
    if text.startswith("```"):
        first_nl = text.find("\n")
        if first_nl != -1:
            text = text[first_nl + 1:]
        if text.rstrip().endswith("```"):
            text = text.rstrip()[:-3]
        text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        logger.warning("LLM : JSON invalide : %s", raw[:200])
        return None


def call_ollama(
    prompt: str,
    temperature: float = 0.1,
    max_tokens: int = 2500,
    model: str | None = None,
    timeout: int | None = None,
) -> dict | None:
    """Call Ollama in native JSON mode, falling back to Anthropic if unavailable.

    Args:
        prompt: The prompt to send.
        temperature: Generation temperature (default: 0.1).
        max_tokens: Maximum number of tokens (default: 2500).
        model: Ollama model to use (default: the global OLLAMA_MODEL).
        timeout: Timeout in seconds (default: the global OLLAMA_TIMEOUT).

    Returns:
        The parsed JSON dict, or None on error.
    """
    use_model = model or OLLAMA_MODEL
    use_timeout = timeout or OLLAMA_TIMEOUT
    for attempt in range(2):
        try:
            response = requests.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": use_model,
                    "prompt": prompt,
                    "stream": False,
                    "format": "json",
                    "options": {
                        "temperature": temperature,
                        "num_predict": max_tokens,
                    },
                },
                timeout=use_timeout,
            )
            response.raise_for_status()
            raw = response.json().get("response", "")
            result = parse_json_response(raw)
            if result is not None:
                return result
            if attempt == 0:
                logger.info("Ollama (%s) : retry après échec de parsing", use_model)
        except requests.ConnectionError:
            logger.info("Ollama indisponible → fallback Anthropic (%s)", _ANTHROPIC_MODEL)
            return call_anthropic(prompt, temperature, max_tokens)
        except requests.Timeout:
            logger.warning("Ollama (%s) timeout après %ds → fallback Anthropic",
                           use_model, use_timeout)
            return call_anthropic(prompt, temperature, max_tokens)
        except (requests.RequestException, json.JSONDecodeError) as e:
            logger.warning("Ollama erreur : %s", e)
            return None
    return None
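A call sketch showing the contract (JSON prompt in, dict or None out); the prompt text and expected key are illustrative:

result = call_ollama(
    "Donne le code CIM-10 de 'appendicite aiguë'. Réponds en JSON: {\"code\": \"...\"}",
    temperature=0.0,
)
if result is None:
    # Ollama was unreachable with no ANTHROPIC_API_KEY set, or the model
    # produced unparseable JSON on both attempts.
    pass
else:
    code = result.get("code")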
725
t2a_install_rag_cleanup/src/medical/rag_index.py
Normal file
@@ -0,0 +1,725 @@
"""FAISS indexing of the reference documents.

Goal: prevent "procedure/methodology" documents from influencing the coding.

Two FAISS indexes are therefore maintained:
- ref: reference terminologies (CIM-10, CCAM, user uploads prefixed ref:...)
- proc: procedures / methodology guide (guide_methodo + uploads prefixed proc:...)

Backwards compat: if the new files do not exist, fall back to faiss.index.
"""

from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional

import pdfplumber

from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CCAM_DICT_PATH, REFERENTIELS_DIR, EMBEDDING_MODEL

logger = logging.getLogger(__name__)

# Singletons for the indexes loaded in memory
_loaded: dict[str, tuple] = {}


@dataclass
class Chunk:
    text: str
    document: str  # "cim10", "guide_methodo", "ccam"
    page: Optional[int] = None
    code: Optional[str] = None


def _paths(kind: str) -> tuple[Path, Path]:
    """Return (index_path, meta_path) for an index kind.

    kind:
        - "ref": reference terminologies
        - "proc": procedures
        - "all": legacy (faiss.index)
    """
    kind = (kind or "ref").lower()
    if kind == "proc":
        return (RAG_INDEX_DIR / "faiss_proc.index", RAG_INDEX_DIR / "metadata_proc.json")
    if kind == "all":
        return (RAG_INDEX_DIR / "faiss.index", RAG_INDEX_DIR / "metadata.json")
    # ref (default)
    return (RAG_INDEX_DIR / "faiss_ref.index", RAG_INDEX_DIR / "metadata_ref.json")


def _kind_for_chunk(chunk: Chunk) -> str:
    """Determine the target index kind for a chunk."""
    doc = (chunk.document or "").lower()
    if doc == "guide_methodo" or doc.startswith("proc:"):
        return "proc"
    return "ref"


# ---------------------------------------------------------------------------
# CIM-10 chunking
# ---------------------------------------------------------------------------

def _chunk_cim10(pdf_path: Path) -> list[Chunk]:
    """Split the CIM-10 PDF with double chunking: individual subcodes + 3-char parents."""
    chunks: list[Chunk] = []
    current_code3: str | None = None
    current_code3_text: list[str] = []
    current_code3_page: int | None = None

    # Subcode currently being accumulated
    current_subcode: str | None = None
    current_subcode_text: list[str] = []
    current_subcode_page: int | None = None

    code3_pattern = re.compile(r"^([A-Z]\d{2})\s+(.+)")
    subcode_pattern = re.compile(r"^([A-Z]\d{2}\.\d+)\s+(.+)")

    logger.info("Extraction des chunks CIM-10 (double chunking) depuis %s", pdf_path.name)

    def _flush_subcode():
        """Save the subcode chunk being built."""
        if current_subcode and current_subcode_text:
            chunk_text = "\n".join(current_subcode_text)
            if len(chunk_text.split()) >= 3:
                chunks.append(Chunk(
                    text=chunk_text,
                    document="cim10",
                    page=current_subcode_page,
                    code=current_subcode,
                ))

    def _flush_code3():
        """Save the 3-char parent chunk being built."""
        _flush_subcode()
        if current_code3 and current_code3_text:
            chunk_text = "\n".join(current_code3_text)
            if len(chunk_text.split()) >= 5:
                chunks.append(Chunk(
                    text=chunk_text,
                    document="cim10",
                    page=current_code3_page,
                    code=current_code3,
                ))

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                m_sub = subcode_pattern.match(line)
                m3 = code3_pattern.match(line)

                if m_sub:
                    # New subcode → flush the previous one
                    _flush_subcode()
                    current_subcode = m_sub.group(1)
                    current_subcode_text = [line]
                    current_subcode_page = page_num
                    # Also feed the parent chunk
                    if current_code3:
                        current_code3_text.append(line)
                elif m3 and not m_sub:
                    # New 3-char code → flush the whole previous block
                    _flush_code3()
                    current_code3 = m3.group(1)
                    current_code3_text = [line]
                    current_code3_page = page_num
                    current_subcode = None
                    current_subcode_text = []
                    current_subcode_page = None
                else:
                    # Continuation line
                    if current_subcode:
                        current_subcode_text.append(line)
                    if current_code3:
                        current_code3_text.append(line)

    # Final flush
    _flush_code3()

    logger.info("CIM-10 : %d chunks extraits (double chunking sous-codes + parents)", len(chunks))
    return chunks


# ---------------------------------------------------------------------------
# Chunking of the MCO methodology guide
# ---------------------------------------------------------------------------

def _chunk_guide_methodo(pdf_path: Path) -> list[Chunk]:
    """Split the MCO methodology guide by sections/headings."""
    chunks: list[Chunk] = []
    current_title: str | None = None
    current_text: list[str] = []
    current_page: int | None = None

    # Section heading patterns (chapters, sub-chapters)
    title_patterns = [
        re.compile(r"^((?:CHAPITRE|TITRE|PARTIE)\s+[IVXLCDM0-9]+.*)$", re.IGNORECASE),
        re.compile(r"^(\d+\.\d*\s+[A-ZÉÈÊÀÂÔÙÛÜ].{5,})$"),
        re.compile(r"^([A-ZÉÈÊÀÂÔÙÛÜ][A-ZÉÈÊÀÂÔÙÛÜ\s]{10,})$"),
    ]

    logger.info("Extraction des chunks Guide Métho depuis %s", pdf_path.name)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                is_title = False
                for pat in title_patterns:
                    if pat.match(line):
                        is_title = True
                        break

                if is_title and len(line) > 8:
                    # Save the previous chunk
                    if current_title and current_text:
                        chunk_text = current_title + "\n" + "\n".join(current_text)
                        if len(chunk_text.split()) >= 20:
                            chunks.append(Chunk(
                                text=chunk_text,
                                document="guide_methodo",
                                page=current_page,
                            ))
                    current_title = line
                    current_text = []
                    current_page = page_num
                else:
                    current_text.append(line)

    # Last chunk
    if current_title and current_text:
        chunk_text = current_title + "\n" + "\n".join(current_text)
        if len(chunk_text.split()) >= 20:
            chunks.append(Chunk(
                text=chunk_text,
                document="guide_methodo",
                page=current_page,
            ))

    # If too few chunks (the PDF does not follow the heading patterns),
    # fall back to splitting by groups of 3 pages
    if len(chunks) < 10:
        logger.info("Guide Métho : fallback découpe par pages (peu de titres détectés)")
        chunks = []
        with pdfplumber.open(pdf_path) as pdf:
            page_texts: list[str] = []
            start_page = 1
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    page_texts.append(text)
                if len(page_texts) >= 3:
                    combined = "\n".join(page_texts)
                    if len(combined.split()) >= 20:
                        chunks.append(Chunk(
                            text=combined,
                            document="guide_methodo",
                            page=start_page,
                        ))
                    page_texts = []
                    start_page = page_num + 1
            if page_texts:
                combined = "\n".join(page_texts)
                if len(combined.split()) >= 20:
                    chunks.append(Chunk(
                        text=combined,
                        document="guide_methodo",
                        page=start_page,
                    ))

    logger.info("Guide Métho : %d chunks extraits", len(chunks))
    return chunks


# ---------------------------------------------------------------------------
# CCAM chunking
# ---------------------------------------------------------------------------

def _chunk_ccam(pdf_path: Path) -> list[Chunk]:
    """Split the CCAM PDF into one chunk per act code."""
    chunks: list[Chunk] = []
    ccam_pattern = re.compile(r"([A-Z]{4}\d{3})\s+(.*)")

    logger.info("Extraction des chunks CCAM depuis %s", pdf_path.name)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            current_code: str | None = None
            current_lines: list[str] = []

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                m = ccam_pattern.match(line)
                if m:
                    if current_code and current_lines:
                        chunks.append(Chunk(
                            text="\n".join(current_lines),
                            document="ccam",
                            page=page_num,
                            code=current_code,
                        ))
                    current_code = m.group(1)
                    current_lines = [line]
                elif current_code:
                    current_lines.append(line)

            if current_code and current_lines:
                chunks.append(Chunk(
                    text="\n".join(current_lines),
                    document="ccam",
                    page=page_num,
                    code=current_code,
                ))

    # Fallback: if no CCAM code was detected, index page by page
    if not chunks:
        logger.info("CCAM : aucun code détecté, fallback par page")
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text and len(text.split()) >= 10:
                    chunks.append(Chunk(
                        text=text,
                        document="ccam",
                        page=page_num,
                    ))

    logger.info("CCAM : %d chunks extraits", len(chunks))
    return chunks


# ---------------------------------------------------------------------------
# CCAM chunking from the JSON dictionary
# ---------------------------------------------------------------------------

def _chunk_ccam_from_dict() -> list[Chunk]:
    """Generate CCAM chunks from ccam_dict.json (one chunk per code+description).

    Takes precedence over the PDF chunks when the dictionary exists.
    """
    if not CCAM_DICT_PATH.exists():
        return []

    with open(CCAM_DICT_PATH, encoding="utf-8") as f:
        ccam_dict = json.load(f)

    chunks: list[Chunk] = []
    for code, info in ccam_dict.items():
        desc = info.get("description", "") if isinstance(info, dict) else str(info)
        if not desc:
            continue
        regroupement = info.get("regroupement", "") if isinstance(info, dict) else ""
        tarif = info.get("tarif_s1") if isinstance(info, dict) else None
        text_parts = [f"{code} {desc}"]
        if regroupement:
            text_parts.append(f"Regroupement: {regroupement}")
        if tarif is not None:
            text_parts.append(f"Tarif S1: {tarif}€")
        chunks.append(Chunk(
            text="\n".join(text_parts),
            document="ccam",
            code=code,
        ))

    logger.info("CCAM dict : %d chunks générés depuis %s", len(chunks), CCAM_DICT_PATH)
    return chunks
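For reference, the shape ccam_dict.json is expected to have, inferred from the .get() calls above (the entry and tariff below are illustrative, not taken from the real file):

ccam_dict = {
    "HHFA016": {  # hypothetical entry
        "description": "Appendicectomie, par coelioscopie",
        "regroupement": "ADC",
        "tarif_s1": 271.70,
    },
}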
# ---------------------------------------------------------------------------
# Chunking of the CIM-10 alphabetical index
# ---------------------------------------------------------------------------

def _chunk_cim10_alpha(pdf_path: Path) -> list[Chunk]:
    """Parse the INDEX ALPHABÉTIQUE section of the CIM-10 PDF.

    Detects "term → code" entries and generates chunks with
    document="cim10_alpha".
    """
    chunks: list[Chunk] = []
    # Pattern: a line with a term followed by a CIM-10 code at the end
    entry_pattern = re.compile(r"^(.+?)\s+([A-Z]\d{2}(?:\.\d+)?)\s*$")

    logger.info("Extraction de l'index alphabétique CIM-10 depuis %s", pdf_path.name)

    in_alpha_section = False
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            # Detect the start of the alphabetical index section
            text_upper = text.upper()
            if "INDEX ALPHAB" in text_upper:
                in_alpha_section = True
            # Pages before the index: do not parse them
            if not in_alpha_section:
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                m = entry_pattern.match(line)
                if m:
                    terme = m.group(1).strip()
                    code = m.group(2)
                    if len(terme) >= 3:
                        chunks.append(Chunk(
                            text=f"{terme} → {code}",
                            document="cim10_alpha",
                            page=page_num,
                            code=code,
                        ))

    logger.info("CIM-10 index alphabétique : %d entrées extraites", len(chunks))
    return chunks


# ---------------------------------------------------------------------------
# FAISS index construction
# ---------------------------------------------------------------------------

def build_index(force: bool = False) -> None:
    """Build the FAISS indexes from the reference PDFs.

    - ref: CIM-10 (+ alphabetical index) + CCAM
    - proc: methodology guide

    Args:
        force: If True, rebuild even if the index already exists.
    """
    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    ref_index_path, ref_meta_path = _paths("ref")
    proc_index_path, proc_meta_path = _paths("proc")

    # If everything already exists and force is not set
    ref_ok = ref_index_path.exists() and ref_meta_path.exists()
    proc_ok = proc_index_path.exists() and proc_meta_path.exists()
    guide_expected = GUIDE_METHODO_PDF.exists()
    if not force and ref_ok and ((not guide_expected) or proc_ok):
        logger.info("Index FAISS déjà existants dans %s (use force=True pour reconstruire)", RAG_INDEX_DIR)
        return

    # Collect the chunks
    ref_chunks: list[Chunk] = []
    proc_chunks: list[Chunk] = []

    # CIM-10 (reference)
    if CIM10_PDF.exists():
        ref_chunks.extend(_chunk_cim10(CIM10_PDF))
        ref_chunks.extend(_chunk_cim10_alpha(CIM10_PDF))
    else:
        logger.warning("PDF non trouvé : %s", CIM10_PDF)

    # Methodology guide (procedures)
    if GUIDE_METHODO_PDF.exists():
        proc_chunks.extend(_chunk_guide_methodo(GUIDE_METHODO_PDF))
    else:
        logger.warning("PDF non trouvé : %s", GUIDE_METHODO_PDF)

    # CCAM (reference)
    ccam_dict_chunks = _chunk_ccam_from_dict()
    if ccam_dict_chunks:
        ref_chunks.extend(ccam_dict_chunks)
    elif CCAM_PDF.exists():
        ref_chunks.extend(_chunk_ccam(CCAM_PDF))
    else:
        logger.warning("Ni dictionnaire CCAM ni PDF CCAM trouvé")

    if not ref_chunks and not proc_chunks:
        logger.error("Aucun chunk extrait — vérifiez les chemins des PDFs")
        return

    logger.info("Total ref : %d chunks | total proc : %d chunks", len(ref_chunks), len(proc_chunks))

    # Embeddings — GPU when available
    import torch
    _device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info("Chargement du modèle d'embedding %s (%s)...", EMBEDDING_MODEL, _device)
    model = SentenceTransformer(EMBEDDING_MODEL, device=_device)
    model.max_seq_length = 512  # CamemBERT max position embeddings

    def _write_index(chunks: list[Chunk], idx_path: Path, meta_path: Path, label: str) -> None:
        if not chunks:
            return
        texts = [c.text[:2000] for c in chunks]
        logger.info("Calcul des embeddings (%s) pour %d chunks...", label, len(texts))
        embeddings = model.encode(texts, show_progress_bar=True, normalize_embeddings=True, batch_size=64)
        embeddings = np.array(embeddings, dtype=np.float32)
        dim = embeddings.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(embeddings)

        RAG_INDEX_DIR.mkdir(parents=True, exist_ok=True)
        faiss.write_index(index, str(idx_path))

        metadata = [asdict(c) for c in chunks]
        for m in metadata:
            m["extrait"] = m.pop("text")[:800]
        meta_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")

        logger.info("Index FAISS sauvegardé (%s) : %s (%d vecteurs, dim=%d)", label, idx_path, len(chunks), dim)

    _write_index(ref_chunks, ref_index_path, ref_meta_path, "ref")
    _write_index(proc_chunks, proc_index_path, proc_meta_path, "proc")

    # Invalidate the singletons
    reset_index()


def get_index(kind: str = "ref") -> tuple | None:
    """Load a FAISS index and its metadata (lazy-loaded singleton).

    Args:
        kind: "ref" | "proc" | "all".

    Returns:
        Tuple (faiss_index, metadata_list), or None if the index does not exist.
    """
    kind = (kind or "ref").lower()

    if kind in _loaded:
        return _loaded[kind]

    index_path, meta_path = _paths(kind)

    # Backwards compat: if ref/proc is missing, fall back to the legacy index
    if kind in ("ref", "proc") and (not index_path.exists() or not meta_path.exists()):
        legacy_idx, legacy_meta = _paths("all")
        if legacy_idx.exists() and legacy_meta.exists():
            logger.warning("Index %s absent — fallback legacy faiss.index", kind)
            index_path, meta_path = legacy_idx, legacy_meta
        else:
            logger.warning("Index FAISS non trouvé dans %s — lancez build_index() d'abord", RAG_INDEX_DIR)
            return None

    if not index_path.exists() or not meta_path.exists():
        logger.warning("Index FAISS non trouvé (%s) dans %s — lancez build_index() d'abord", kind, RAG_INDEX_DIR)
        return None

    import faiss

    faiss_index = faiss.read_index(str(index_path))
    metadata = json.loads(meta_path.read_text(encoding="utf-8"))

    logger.info("Index FAISS chargé (%s) : %d vecteurs", kind, faiss_index.ntotal)
    _loaded[kind] = (faiss_index, metadata)
    return _loaded[kind]


# ---------------------------------------------------------------------------
# Generic chunking for user files (reference uploads)
# ---------------------------------------------------------------------------

def chunk_user_file(file_path: Path, doc_name: str) -> list[Chunk]:
    """Split a user file into chunks for FAISS indexing.

    Dispatch by extension:
    - PDF: pages grouped by 2
    - CSV/Excel: one row = one chunk
    - TXT: paragraphs (blocks separated by blank lines)

    Args:
        file_path: Path of the file.
        doc_name: Document name (used as identifier in the metadata).

    Returns:
        List of Chunk ready for indexing.
    """
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        return _chunk_user_pdf(file_path, doc_name)
    elif suffix in (".csv", ".xlsx", ".xls"):
        return _chunk_user_tabular(file_path, doc_name)
    elif suffix == ".txt":
        return _chunk_user_txt(file_path, doc_name)
    else:
        logger.warning("Extension non supportée pour chunking : %s", suffix)
        return []


def _chunk_user_pdf(file_path: Path, doc_name: str) -> list[Chunk]:
    """Split a user PDF into 2-page chunks."""
    chunks: list[Chunk] = []
    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts: list[str] = []
            start_page = 1
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    page_texts.append(text)
                if len(page_texts) >= 2:
                    combined = "\n".join(page_texts)
                    if len(combined.split()) >= 10:
                        chunks.append(Chunk(
                            text=combined,
                            document=doc_name,
                            page=start_page,
                        ))
                    page_texts = []
                    start_page = page_num + 1
            if page_texts:
                combined = "\n".join(page_texts)
                if len(combined.split()) >= 10:
                    chunks.append(Chunk(
                        text=combined,
                        document=doc_name,
                        page=start_page,
                    ))
    except Exception:
        logger.warning("Erreur lors du chunking PDF %s", file_path, exc_info=True)
    logger.info("Référentiel PDF %s : %d chunks", doc_name, len(chunks))
    return chunks


def _chunk_user_tabular(file_path: Path, doc_name: str) -> list[Chunk]:
    """Split a CSV/Excel file: one row = one chunk."""
    chunks: list[Chunk] = []
    try:
        import pandas as pd
        suffix = file_path.suffix.lower()
        if suffix == ".csv":
            df = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip")
        else:
            df = pd.read_excel(file_path)

        for idx, row in df.iterrows():
            text = " | ".join(str(v) for v in row.values if pd.notna(v))
            if len(text.split()) >= 3:
                chunks.append(Chunk(
                    text=text,
                    document=doc_name,
                    page=int(idx) + 1,
                ))
    except Exception:
        logger.warning("Erreur lors du chunking tabular %s", file_path, exc_info=True)
    logger.info("Référentiel tabular %s : %d chunks", doc_name, len(chunks))
    return chunks


def _chunk_user_txt(file_path: Path, doc_name: str) -> list[Chunk]:
    """Split a TXT file into paragraphs (blocks separated by blank lines)."""
    chunks: list[Chunk] = []
    try:
        text = file_path.read_text(encoding="utf-8")
        paragraphs = re.split(r"\n\s*\n", text)
        for i, para in enumerate(paragraphs):
            para = para.strip()
            if len(para.split()) >= 5:
                chunks.append(Chunk(
                    text=para,
                    document=doc_name,
                    page=i + 1,
                ))
    except Exception:
        logger.warning("Erreur lors du chunking TXT %s", file_path, exc_info=True)
    logger.info("Référentiel TXT %s : %d chunks", doc_name, len(chunks))
    return chunks


def add_chunks_to_index(chunks: list[Chunk]) -> int:
    """Add chunks to the existing FAISS index (incremental).

    Loads the index if needed, encodes the chunks, adds the vectors,
    and saves everything.

    Args:
        chunks: List of Chunk to add.

    Returns:
        Number of chunks actually added.
    """
    if not chunks:
        return 0

    import faiss
    import numpy as np
    from .rag_search import _get_embed_model

    # In 99% of cases we want to avoid mixing: route to ref/proc by prefix.
    # If the caller wants to force it, they can pass chunks with document="proc:...".
|
||||
kind = _kind_for_chunk(chunks[0])
|
||||
index_path, meta_path = _paths(kind)
|
||||
|
||||
# Backwards compat : si on n'a que l'ancien index, on l'utilise.
|
||||
if not index_path.exists() or not meta_path.exists():
|
||||
legacy_idx, legacy_meta = _paths("all")
|
||||
if legacy_idx.exists() and legacy_meta.exists():
|
||||
index_path, meta_path = legacy_idx, legacy_meta
|
||||
|
||||
# Charger l'index existant ou en créer un nouveau
|
||||
if index_path.exists() and meta_path.exists():
|
||||
faiss_idx = faiss.read_index(str(index_path))
|
||||
metadata = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
else:
|
||||
model = _get_embed_model()
|
||||
# Obtenir la dimension via un encodage test
|
||||
test_vec = model.encode(["test"], normalize_embeddings=True)
|
||||
dim = test_vec.shape[1]
|
||||
faiss_idx = faiss.IndexFlatIP(dim)
|
||||
metadata = []
|
||||
|
||||
# Encoder les nouveaux chunks
|
||||
model = _get_embed_model()
|
||||
texts = [c.text[:2000] for c in chunks]
|
||||
embeddings = model.encode(texts, normalize_embeddings=True, batch_size=64)
|
||||
embeddings = np.array(embeddings, dtype=np.float32)
|
||||
|
||||
# Ajouter à l'index
|
||||
faiss_idx.add(embeddings)
|
||||
|
||||
# Ajouter les métadonnées
|
||||
from dataclasses import asdict
|
||||
for chunk in chunks:
|
||||
meta = asdict(chunk)
|
||||
meta["extrait"] = meta.pop("text")[:800]
|
||||
metadata.append(meta)
|
||||
|
||||
# Sauvegarder
|
||||
RAG_INDEX_DIR.mkdir(parents=True, exist_ok=True)
|
||||
faiss.write_index(faiss_idx, str(index_path))
|
||||
meta_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
# Invalider le singleton pour forcer le rechargement
|
||||
reset_index()
|
||||
|
||||
logger.info("Index FAISS : %d chunks ajoutés (total : %d)", len(chunks), faiss_idx.ntotal)
|
||||
return len(chunks)
|
||||
|
||||
|
||||
def reset_index() -> None:
|
||||
"""Invalide les singletons FAISS pour forcer le rechargement au prochain accès."""
|
||||
_loaded.clear()
|
||||
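

# --- Hedged usage sketch (illustrative; the _demo_* name is invented and the
# --- pipeline never calls it) -------------------------------------------------
# Ties the helpers above together: chunk a local referential file, append it
# to the "ref" index, then reload lazily. The "ref:" document prefix follows
# the routing convention used by _kind_for_chunk() above.
def _demo_incremental_indexing(file_path: Path) -> None:
    chunks = chunk_user_file(file_path, doc_name="ref:mon_referentiel")
    added = add_chunks_to_index(chunks)  # encodes, appends, saves, resets singletons
    loaded = get_index(kind="ref")       # lazily reloads from disk
    if loaded is not None:
        faiss_index, _metadata = loaded
        logger.info("Démo : %d chunks ajoutés (index total %d)", added, faiss_index.ntotal)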
837
t2a_install_rag_cleanup/src/medical/rag_search.py
Normal file
@@ -0,0 +1,837 @@
"""RAG search (FAISS) + generation via Ollama for CIM-10 coding."""

from __future__ import annotations

import logging
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from ..config import (
    ActeCCAM, Diagnostic, DossierMedical, PreuveClinique, RAGSource,
    OLLAMA_CACHE_PATH, OLLAMA_MAX_PARALLEL, OLLAMA_MODEL,
    EMBEDDING_MODEL, RERANKER_MODEL,
)
from .cim10_dict import normalize_code, validate_code as cim10_validate, fallback_parent_code
from .cim10_extractor import BIO_NORMALS
from .clinical_context import build_enriched_context, format_enriched_context
from .ccam_dict import validate_code as ccam_validate
from .ollama_client import call_ollama, parse_json_response
from .ollama_cache import OllamaCache

logger = logging.getLogger(__name__)

# Singleton for the embedding model (loaded only once)
_embed_model = None
_embed_lock = threading.Lock()
_embed_failed = False  # sentinel to avoid endless retries

# Singleton for the re-ranking cross-encoder (CPU only)
_reranker_model = None

# Minimum FAISS similarity score for a result to be kept
_MIN_SCORE = 0.3
# Raised threshold for the CPAM context (more aggressive noise filtering)
_MIN_SCORE_CPAM = 0.40


def _get_embed_model():
    """Load the embedding model (thread-safe singleton).

    Tries CUDA first, falls back to CPU on OOM (Ollama may be holding the VRAM).
    low_cpu_mem_usage=False avoids meta tensors (accelerate + sentence-transformers 5.x).
    A Lock prevents concurrent loads from the ThreadPool.
    """
    global _embed_model, _embed_failed
    if _embed_model is not None:
        return _embed_model
    if _embed_failed:
        raise RuntimeError("Modèle d'embedding indisponible (échec précédent)")
    with _embed_lock:
        # Double-check after acquiring the lock
        if _embed_model is not None:
            return _embed_model
        if _embed_failed:
            raise RuntimeError("Modèle d'embedding indisponible (échec précédent)")
        from sentence_transformers import SentenceTransformer
        import torch
        _device = "cuda" if torch.cuda.is_available() else "cpu"
        _model_kwargs = {"low_cpu_mem_usage": False}
        try:
            logger.info("Chargement du modèle d'embedding (%s)...", _device)
            _embed_model = SentenceTransformer(
                EMBEDDING_MODEL, device=_device, model_kwargs=_model_kwargs,
            )
        except (torch.OutOfMemoryError, torch.cuda.CudaError, torch.AcceleratorError,
                RuntimeError, NotImplementedError) as exc:
            exc_msg = str(exc).lower()
            if _device == "cuda" and ("memory" in exc_msg or "meta tensor" in exc_msg):
                logger.warning("CUDA erreur pour l'embedding — fallback CPU : %s", exc)
                torch.cuda.empty_cache()
                try:
                    _embed_model = SentenceTransformer(
                        EMBEDDING_MODEL, device="cpu", model_kwargs=_model_kwargs,
                    )
                except Exception as exc2:
                    logger.error("Fallback CPU aussi en échec : %s", exc2)
                    _embed_failed = True
                    raise
            else:
                _embed_failed = True
                raise
        _embed_model.max_seq_length = 512
        return _embed_model
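

# --- Hedged sketch (illustrative only; never called by the pipeline) ----------
# Exercises the singleton contract documented above: a second call returns the
# same object, and normalize_embeddings=True yields unit-norm vectors — which
# is what lets IndexFlatIP act as cosine similarity downstream.
def _demo_embed_model_contract() -> None:
    import numpy as np
    model = _get_embed_model()
    assert model is _get_embed_model()  # cached instance, no reload
    vec = model.encode(["anémie ferriprive"], normalize_embeddings=True)
    assert abs(float(np.linalg.norm(vec[0])) - 1.0) < 1e-3  # unit norm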


def _get_reranker():
    """Load the re-ranking cross-encoder (singleton, CPU only).

    Forced onto CPU so it does not interfere with Ollama on the GPU.
    """
    global _reranker_model
    if _reranker_model is None:
        from sentence_transformers import CrossEncoder
        logger.info("Chargement du cross-encoder de re-ranking (cpu)...")
        _reranker_model = CrossEncoder(RERANKER_MODEL, device="cpu")
    return _reranker_model


def _rerank(query: str, results: list[dict], top_k: int) -> list[dict]:
    """Re-rank FAISS results with a cross-encoder.

    Args:
        query: Original query text.
        results: FAISS results carrying an 'extrait' key.
        top_k: Number of results to return.

    Returns:
        Results re-ranked by cross-encoder score, truncated to top_k.
    """
    if not results:
        return results

    reranker = _get_reranker()

    # Build the (query, passage) pairs for the cross-encoder
    pairs = [(query, r.get("extrait", "")) for r in results]
    ce_scores = reranker.predict(pairs)

    # Inject the cross-encoder score and sort
    for r, ce_score in zip(results, ce_scores):
        r["score_faiss"] = r["score"]
        r["score"] = float(ce_score)

    results.sort(key=lambda r: r["score"], reverse=True)
    return results[:top_k]
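

# --- Hedged sketch (illustrative only; the extracts are invented) --------------
# Shows the dict shape _rerank() expects and how it rewrites scores: the FAISS
# score is kept under "score_faiss" while "score" becomes the cross-encoder
# score, so downstream sorting code stays uniform.
def _demo_rerank_shape() -> None:
    results = [
        {"document": "cim10", "extrait": "Anémie par carence en fer", "score": 0.61},
        {"document": "cim10", "extrait": "Anémie, sans précision", "score": 0.58},
    ]
    top = _rerank("anémie ferriprive", results, top_k=1)
    assert len(top) == 1 and "score_faiss" in top[0]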


def search_similar(query: str, top_k: int = 10) -> list[dict]:
    """Find the most similar passages in the FAISS index.

    Args:
        query: Diagnosis text to look up.
        top_k: Number of results to return.

    Returns:
        List of dicts (metadata + similarity score), filtered by a minimum
        score and prioritising CIM-10 sources.
    """
    from .rag_index import get_index
    import numpy as np

    # CIM-10 coding: query the "ref" index (not the methodology guide).
    result = get_index(kind="ref")
    if result is None:
        logger.warning("Index FAISS non disponible")
        return []

    faiss_index, metadata = result

    model = _get_embed_model()
    query_vec = model.encode([query], normalize_embeddings=True)
    query_vec = np.array(query_vec, dtype=np.float32)

    # Fetch more results than top_k so we can filter afterwards
    fetch_k = min(top_k * 2, faiss_index.ntotal)
    scores, indices = faiss_index.search(query_vec, fetch_k)

    raw_results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:
            continue
        if float(score) < _MIN_SCORE:
            continue
        meta = metadata[idx].copy()
        meta["score"] = float(score)
        raw_results.append(meta)

    # Coding: keep only CIM-10 + alphabetical index + any referentials uploaded as ref:...
    cim10_results = [r for r in raw_results if r["document"] in ("cim10", "cim10_alpha")]
    ref_uploads = [r for r in raw_results if str(r.get("document", "")).startswith("ref:")]

    # Do not let procedures/methodology contaminate the selection.
    other_results = ref_uploads

    min_cim10 = min(6, len(cim10_results))
    final = cim10_results[:min_cim10]
    remaining_slots = top_k - len(final)
    # Fill the rest with the best results (remaining CIM-10 + others)
    remaining = cim10_results[min_cim10:] + other_results
    remaining.sort(key=lambda r: r["score"], reverse=True)
    final.extend(remaining[:remaining_slots])

    return final
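

# --- Hedged sketch (illustrative only; scores and documents are invented) ------
# The quota-then-fill pattern used above, in miniature: with top_k=3 and a
# quota of 2, the two CIM-10 hits are pinned first and the remaining slot goes
# to the best leftover, whatever its origin.
def _demo_quota_fill() -> None:
    cim10 = [{"document": "cim10", "score": 0.9}, {"document": "cim10", "score": 0.5}]
    others = [{"document": "ref:perso", "score": 0.7}]
    quota = min(2, len(cim10))
    final = cim10[:quota]
    remaining = sorted(cim10[quota:] + others, key=lambda r: r["score"], reverse=True)
    final.extend(remaining[:3 - len(final)])
    assert [r["score"] for r in final] == [0.9, 0.5, 0.7]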


def search_similar_ccam(query: str, top_k: int = 8) -> list[dict]:
    """Find the most similar CCAM passages in the FAISS index.

    Same logic as search_similar() but prioritises CCAM sources.
    """
    from .rag_index import get_index
    import numpy as np

    # CCAM: "ref" index.
    result = get_index(kind="ref")
    if result is None:
        logger.warning("Index FAISS non disponible")
        return []

    faiss_index, metadata = result

    model = _get_embed_model()
    query_vec = model.encode([query], normalize_embeddings=True)
    query_vec = np.array(query_vec, dtype=np.float32)

    fetch_k = min(top_k * 2, faiss_index.ntotal)
    scores, indices = faiss_index.search(query_vec, fetch_k)

    raw_results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:
            continue
        if float(score) < _MIN_SCORE:
            continue
        meta = metadata[idx].copy()
        meta["score"] = float(score)
        raw_results.append(meta)

    # Prioritise CCAM sources (at least 5 of top_k)
    ccam_results = [r for r in raw_results if r["document"] == "ccam"]
    other_results = [r for r in raw_results if r["document"] != "ccam"]

    min_ccam = min(5, len(ccam_results))
    final = ccam_results[:min_ccam]
    remaining_slots = top_k - len(final)
    remaining = ccam_results[min_ccam:] + other_results
    remaining.sort(key=lambda r: r["score"], reverse=True)
    final.extend(remaining[:remaining_slots])

    return final


def search_similar_cpam(query: str, top_k: int = 8) -> list[dict]:
    """RAG search specific to the CPAM context (counter-argumentation).

    Differences from search_similar():
    - Prioritises the Methodology Guide (min 3 results) rather than CIM-10
    - Raised score threshold (0.40 vs 0.30) to cut noise
    - Wider fetch (top_k * 3) since filtering is more aggressive
    - Deduplication by CIM-10 code (keeps the best score per code)
    """
    from .rag_index import get_index
    import numpy as np

    # CPAM context: we want procedures (guide) + referential definitions (CIM-10).
    proc = get_index(kind="proc")
    ref = get_index(kind="ref")
    if proc is None and ref is None:
        logger.warning("Index FAISS non disponible")
        return []

    model = _get_embed_model()
    query_vec = model.encode([query], normalize_embeddings=True)
    query_vec = np.array(query_vec, dtype=np.float32)

    def _search_one(result_tuple, fetch_mult: int) -> list[dict]:
        if result_tuple is None:
            return []
        faiss_index, metadata = result_tuple
        fetch_k = min(top_k * fetch_mult, faiss_index.ntotal)
        scores, indices = faiss_index.search(query_vec, fetch_k)
        out = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:
                continue
            if float(score) < _MIN_SCORE_CPAM:
                continue
            meta = metadata[idx].copy()
            meta["score"] = float(score)
            out.append(meta)
        return out

    raw_proc = _search_one(proc, fetch_mult=3)
    raw_ref = _search_one(ref, fetch_mult=3)

    # Filter strictly:
    # - proc: guide_methodo + "proc:" uploads
    raw_proc = [r for r in raw_proc if r.get("document") == "guide_methodo" or str(r.get("document", "")).startswith("proc:")]
    # - ref: CIM-10 + alphabetical index + "ref:" uploads
    raw_ref = [r for r in raw_ref if r.get("document") in ("cim10", "cim10_alpha") or str(r.get("document", "")).startswith("ref:")]

    raw_results = raw_proc + raw_ref

    # Deduplicate by CIM-10 code (keep the best score per code)
    seen_codes: dict[str, dict] = {}
    deduped = []
    for r in raw_results:
        code = r.get("code")
        if code:
            if code in seen_codes:
                if r["score"] > seen_codes[code]["score"]:
                    seen_codes[code] = r
            else:
                seen_codes[code] = r
        else:
            deduped.append(r)  # no code → keep (guide_methodo, etc.)
    deduped.extend(seen_codes.values())
    deduped.sort(key=lambda r: r["score"], reverse=True)

    # Cross-encoder re-ranking (CPU) to refine the ordering
    reranked = _rerank(query, deduped, top_k=len(deduped))

    # Prioritise the Methodology Guide (min 3 results)
    guide_results = [r for r in reranked if r.get("document") == "guide_methodo" or str(r.get("document", "")).startswith("proc:")]
    other_results = [
        r for r in reranked
        if not (r.get("document") == "guide_methodo" or str(r.get("document", "")).startswith("proc:"))
    ]

    min_guide = min(3, len(guide_results))
    final = guide_results[:min_guide]
    remaining_slots = top_k - len(final)
    remaining = guide_results[min_guide:] + other_results
    remaining.sort(key=lambda r: r["score"], reverse=True)
    final.extend(remaining[:remaining_slots])

    return final


def _format_contexte(contexte: dict) -> str:
    """Format the patient context as structured lines for the prompt."""
    lines = []

    sexe = contexte.get("sexe")
    age = contexte.get("age")
    imc = contexte.get("imc")
    patient_parts = []
    if sexe:
        patient_parts.append(sexe)
    if age:
        patient_parts.append(f"{age} ans")
    if imc:
        patient_parts.append(f"IMC {imc}")
    if patient_parts:
        lines.append(f"- Patient : {', '.join(patient_parts)}")

    duree = contexte.get("duree_sejour")
    if duree:
        lines.append(f"- Durée séjour : {duree} jours")

    antecedents = contexte.get("antecedents")
    if antecedents:
        lines.append(f"- Antécédents : {', '.join(antecedents[:5])}")

    biologie = contexte.get("biologie_cle")
    if biologie:
        bio_parts = []
        for b in biologie:
            test, valeur, anomalie = b if isinstance(b, (list, tuple)) else (b.get("test"), b.get("valeur"), b.get("anomalie"))
            # Add the reference range if known
            norme_str = ""
            if test in BIO_NORMALS:
                lo, hi = BIO_NORMALS[test]
                lo_s = int(lo) if lo == int(lo) else lo
                hi_s = int(hi) if hi == int(hi) else hi
                norme_str = f" [N: {lo_s}-{hi_s}]"
            marker = " (\u2191)" if anomalie else ""
            bio_parts.append(f"{test} {valeur}{norme_str}{marker}")
        lines.append(f"- Biologie : {', '.join(bio_parts)}")

    imagerie = contexte.get("imagerie")
    if imagerie:
        for img in imagerie:
            img_type, conclusion = img if isinstance(img, (list, tuple)) else (img.get("type"), img.get("conclusion"))
            if conclusion:
                lines.append(f"- Imagerie : {img_type} — {conclusion[:200]}")

    complications = contexte.get("complications")
    if complications:
        lines.append(f"- Complications : {', '.join(complications)}")

    dp_texte = contexte.get("dp_texte")
    if dp_texte:
        lines.append(f"- DP du séjour : {dp_texte}")

    das_codes = contexte.get("das_codes_existants")
    if das_codes:
        lines.append(f"- DAS déjà codés : {', '.join(das_codes)}")

    return "\n".join(lines) if lines else "Non précisé"
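

# --- Hedged sketch (illustrative only; patient values are invented) ------------
# The contexte dict consumed above, with the key names it expects. With this
# input the formatter renders the "- Patient : ..." bullet lines injected into
# the prompts.
def _demo_format_contexte() -> None:
    contexte = {
        "sexe": "F",
        "age": 78,
        "duree_sejour": 6,
        "antecedents": ["HTA", "diabète de type 2"],
        "das_codes_existants": ["I48 (fibrillation auriculaire)"],
    }
    rendu = _format_contexte(contexte)
    assert rendu.startswith("- Patient : F, 78 ans")
    assert "- Durée séjour : 6 jours" in rendu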


def _render_sources(sources: list[dict]) -> str:
    """Render the RAG sources as numbered blocks for the prompts.

    Shared by _build_prompt() and _build_prompt_ccam(), which previously
    duplicated this formatting.
    """
    sources_text = ""
    for i, src in enumerate(sources, 1):
        doc_raw = str(src.get("document", ""))
        if doc_raw.startswith("ref:"):
            doc_name = f"Référentiel uploadé : {doc_raw[4:]}"
        elif doc_raw.startswith("proc:"):
            doc_name = f"Procédure uploadée : {doc_raw[5:]}"
        else:
            doc_name = {
                "cim10": "CIM-10 FR 2026",
                "cim10_alpha": "CIM-10 Index Alphabétique 2026",
                "guide_methodo": "Guide Méthodologique MCO 2026",
                "ccam": "CCAM PMSI V4 2025",
            }.get(doc_raw, doc_raw)

        code_info = f" (code: {src['code']})" if src.get("code") else ""
        page_info = f" [page {src['page']}]" if src.get("page") else ""

        sources_text += f"--- Source {i}: {doc_name}{code_info}{page_info} ---\n"
        sources_text += (src.get("extrait", "")[:800]) + "\n\n"
    return sources_text


def _build_prompt(texte: str, sources: list[dict], contexte: dict, est_dp: bool = True) -> str:
    """Build the expert DIM prompt with structured reasoning."""
    sources_text = _render_sources(sources)
    type_diag = "DP (diagnostic principal)" if est_dp else "DAS (diagnostic associé significatif)"
    ctx_str = format_enriched_context(contexte)

    return f"""Tu es un médecin DIM (Département d'Information Médicale) expert en codage PMSI.
Tu dois coder le diagnostic suivant en respectant STRICTEMENT les règles de l'ATIH.

RÈGLES IMPÉRATIVES :
- Le code doit provenir UNIQUEMENT des sources CIM-10 fournies
- Distingue la DESCRIPTION CLINIQUE (ce que le médecin écrit) de la LOGIQUE DE CODAGE (ce que l'ATIH impose)
- Privilégie le code le plus SPÉCIFIQUE disponible (4e ou 5e caractère)
- Vérifie les notes d'inclusion/exclusion de chaque code candidat
- Si le diagnostic est un DP, il doit refléter le motif principal de prise en charge du séjour
- Si c'est un DAS, il doit avoir mobilisé des ressources supplémentaires pendant le séjour
- EXCLUSION SYMPTÔME : Si le diagnostic est un symptôme (R00-R99) et qu'un diagnostic précis (Chapitres I-XIV, A00-N99) expliquant ce symptôme est présent, le symptôme ne doit PAS être codé comme DAS

DIAGNOSTIC À CODER : "{texte}"
TYPE : {type_diag}

CONTEXTE CLINIQUE :
{ctx_str}

SOURCES DE RÉFÉRENCE :
{sources_text}
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
{{
  "analyse_clinique": "que signifie ce diagnostic sur le plan médical",
  "codes_candidats": "quels codes CIM-10 des sources sont compatibles",
  "discrimination": "pourquoi choisir ce code plutôt qu'un autre (inclusions/exclusions, spécificité)",
  "regle_pmsi": "conformité aux règles PMSI pour un {type_diag} (guide méthodologique)",
  "code": "X99.9",
  "confidence": "high ou medium ou low",
  "justification": "explication courte en français",
  "preuves_cliniques": [
    {{"type": "biologie|imagerie|traitement|acte|clinique", "element": "élément concret du dossier", "interpretation": "signification clinique justifiant le code"}}
  ]
}}"""


def _build_prompt_ccam(texte: str, sources: list[dict], contexte: dict) -> str:
    """Build the expert DIM prompt for CCAM coding with structured reasoning."""
    sources_text = _render_sources(sources)
    ctx_str = format_enriched_context(contexte)

    return f"""Tu es un médecin DIM (Département d'Information Médicale) expert en codage CCAM PMSI.
Tu dois coder l'acte chirurgical/médical suivant en respectant STRICTEMENT la nomenclature CCAM.

RÈGLES IMPÉRATIVES :
- Le code doit provenir UNIQUEMENT des sources CCAM fournies
- Un code CCAM est composé de 4 lettres + 3 chiffres (ex: HMFC004)
- Vérifie l'activité (1=acte technique, 4=anesthésie) et le regroupement
- Tiens compte du tarif secteur 1 pour valider la cohérence
- Si plusieurs codes sont possibles, choisis le plus spécifique à l'acte décrit
- En cas de doute, indique confidence "low" plutôt que de proposer un code inadapté

ACTE À CODER : "{texte}"

CONTEXTE CLINIQUE :
{ctx_str}

SOURCES CCAM :
{sources_text}
Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
{{
  "analyse_acte": "que décrit cet acte sur le plan technique/chirurgical",
  "codes_candidats": "quels codes CCAM des sources sont compatibles",
  "discrimination": "pourquoi choisir ce code plutôt qu'un autre (activité, regroupement, tarif)",
  "code": "ABCD123",
  "confidence": "high ou medium ou low",
  "justification": "explication courte en français"
}}"""


def _fold_reasoning(parsed: dict) -> dict:
    """Fold the structured analysis fields into a single "raisonnement" key.

    Shared by _parse_ollama_response() and _call_ollama(), which previously
    duplicated this loop.
    """
    reasoning_parts = []
    for key in ("analyse_clinique", "analyse_acte", "codes_candidats", "discrimination", "regle_pmsi"):
        val = parsed.pop(key, None)
        if val:
            titre = key.replace("_", " ").upper()
            reasoning_parts.append(f"{titre} :\n{val}")
    if reasoning_parts:
        parsed["raisonnement"] = "\n\n".join(reasoning_parts)
    return parsed


def _parse_ollama_response(raw: str) -> dict | None:
    """Parse Ollama's JSON reply and rebuild the structured reasoning."""
    parsed = parse_json_response(raw)
    if parsed is None:
        return None
    return _fold_reasoning(parsed)


def _call_ollama(prompt: str) -> dict | None:
    """Call Ollama (JSON mode) and parse the reply, rebuilding the reasoning."""
    result = call_ollama(prompt, temperature=0.1, max_tokens=2500)
    if result is None:
        return None
    return _fold_reasoning(result)
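

# --- Hedged sketch (illustrative only; the JSON reply is invented, and it
# --- assumes parse_json_response() returns the parsed dict for clean JSON) -----
# What _fold_reasoning() does to a model reply: the structured analysis fields
# are folded into one "raisonnement" block while code/confidence/justification
# pass through untouched.
def _demo_reasoning_reconstitution() -> None:
    raw = (
        '{"analyse_clinique": "anémie microcytaire", '
        '"codes_candidats": "D50.9, D64.9", '
        '"code": "D50.9", "confidence": "medium", "justification": "carence martiale"}'
    )
    parsed = _parse_ollama_response(raw)
    assert parsed is not None and parsed["code"] == "D50.9"
    assert "ANALYSE CLINIQUE" in parsed["raisonnement"]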


def _apply_llm_result_diagnostic(diagnostic: Diagnostic, llm_result: dict) -> None:
    """Apply an LLM result (fresh or cached) to a Diagnostic."""
    code = llm_result.get("code")
    confidence = llm_result.get("confidence")
    justification = llm_result.get("justification")
    raisonnement = llm_result.get("raisonnement")

    if code:
        code = normalize_code(code)
        is_valid, _ = cim10_validate(code)
        if is_valid:
            diagnostic.cim10_suggestion = code
        else:
            # Try falling back to the parent code (D71.9 → D71)
            parent = fallback_parent_code(code)
            if parent:
                logger.info(
                    "RAG : code Ollama %s invalide → fallback parent %s pour « %s »",
                    code, parent, diagnostic.texte,
                )
                diagnostic.cim10_suggestion = parent
            else:
                logger.warning(
                    "RAG : code Ollama %s invalide pour « %s », code ignoré",
                    code, diagnostic.texte,
                )
    if confidence in ("high", "medium", "low"):
        diagnostic.cim10_confidence = confidence
    if justification:
        diagnostic.justification = justification
    if raisonnement:
        diagnostic.raisonnement = raisonnement

    # Store the clinical evidence
    preuves = llm_result.get("preuves_cliniques", [])
    if preuves and isinstance(preuves, list):
        for p in preuves:
            if isinstance(p, dict) and p.get("element"):
                try:
                    diagnostic.preuves_cliniques.append(PreuveClinique(
                        type=p.get("type", "clinique"),
                        element=p["element"],
                        interpretation=p.get("interpretation", ""),
                    ))
                except Exception:
                    pass  # ignore malformed evidence entries


def enrich_diagnostic(
    diagnostic: Diagnostic,
    contexte: dict,
    est_dp: bool = True,
    cache: OllamaCache | None = None,
) -> None:
    """Enrich a Diagnostic via RAG (FAISS + Ollama).

    Modifies the diagnostic in place. Degrades gracefully if FAISS or Ollama fail.
    """
    diag_type = "dp" if est_dp else "das"

    # 1. Check the cache
    cached = cache.get(diagnostic.texte, diag_type) if cache else None

    # 2. FAISS search (always, so sources_rag stays fresh)
    sources = search_similar(diagnostic.texte, top_k=10)

    if not sources:
        logger.debug("Aucune source RAG trouvée pour : %s", diagnostic.texte)
        return

    # 3. Store the RAG sources
    diagnostic.sources_rag = [
        RAGSource(
            document=s["document"],
            page=s.get("page"),
            code=s.get("code"),
            extrait=s.get("extrait", "")[:200],
        )
        for s in sources
    ]

    # 4. On a cache hit, apply it and skip Ollama
    if cached is not None:
        logger.info("Cache hit pour %s : « %s »", diag_type.upper(), diagnostic.texte)
        _apply_llm_result_diagnostic(diagnostic, cached)
        return

    # 5. Ollama call for a justification with structured reasoning
    prompt = _build_prompt(diagnostic.texte, sources, contexte, est_dp=est_dp)
    llm_result = _call_ollama(prompt)

    if llm_result:
        _apply_llm_result_diagnostic(diagnostic, llm_result)
        if cache:
            cache.put(diagnostic.texte, diag_type, llm_result)
    else:
        logger.info("Ollama non disponible — sources FAISS conservées sans justification LLM")


def _apply_llm_result_acte(acte: ActeCCAM, llm_result: dict) -> None:
    """Apply an LLM result (fresh or cached) to an ActeCCAM."""
    code = llm_result.get("code")
    confidence = llm_result.get("confidence")
    justification = llm_result.get("justification")
    raisonnement = llm_result.get("raisonnement")

    if code:
        code = code.strip().upper()
        is_valid, _ = ccam_validate(code)
        if is_valid:
            acte.code_ccam_suggestion = code
        else:
            logger.warning(
                "RAG : code CCAM Ollama %s invalide pour « %s », code ignoré",
                code, acte.texte,
            )
    if confidence in ("high", "medium", "low"):
        acte.ccam_confidence = confidence
    if justification:
        acte.justification = justification
    if raisonnement:
        acte.raisonnement = raisonnement


def enrich_acte(acte: ActeCCAM, contexte: dict, cache: OllamaCache | None = None) -> None:
    """Enrich an ActeCCAM via RAG (FAISS + Ollama).

    Modifies the act in place. Degrades gracefully if FAISS or Ollama fail.
    """
    # 1. Check the cache
    cached = cache.get(acte.texte, "ccam") if cache else None

    # 2. FAISS search (CCAM sources prioritised)
    sources = search_similar_ccam(acte.texte, top_k=8)

    if not sources:
        logger.debug("Aucune source RAG CCAM trouvée pour : %s", acte.texte)
        return

    # 3. Store the RAG sources
    acte.sources_rag = [
        RAGSource(
            document=s["document"],
            page=s.get("page"),
            code=s.get("code"),
            extrait=s.get("extrait", "")[:200],
        )
        for s in sources
    ]

    # 4. On a cache hit, apply it and skip Ollama
    if cached is not None:
        logger.info("Cache hit pour CCAM : « %s »", acte.texte)
        _apply_llm_result_acte(acte, cached)
        return

    # 5. Ollama call for a justification with structured reasoning
    prompt = _build_prompt_ccam(acte.texte, sources, contexte)
    llm_result = _call_ollama(prompt)

    if llm_result:
        _apply_llm_result_acte(acte, llm_result)
        if cache:
            cache.put(acte.texte, "ccam", llm_result)
    else:
        logger.info("Ollama non disponible — sources FAISS CCAM conservées sans justification LLM")


def _build_prompt_das_extraction(text: str, contexte: dict, existing_das: list[str], dp_texte: str) -> str:
    """Build the prompt for LLM extraction of additional DAS."""
    ctx_str = format_enriched_context(contexte)
    existing_str = "\n".join(f"- {d}" for d in existing_das) if existing_das else "Aucun"

    return f"""Tu es un médecin DIM (Département d'Information Médicale) expert en codage PMSI.
Analyse le texte médical suivant et identifie les diagnostics associés significatifs (DAS) qui n'ont PAS encore été codés.

RÈGLES IMPÉRATIVES :
- Un DAS doit avoir mobilisé des ressources supplémentaires pendant le séjour
- Ne PAS proposer de doublons avec les DAS déjà codés ci-dessous
- Ne PAS proposer le diagnostic principal comme DAS
- Ne PAS coder les symptômes (R00-R99) si un diagnostic précis les explique
- Ne PAS coder les antécédents non pertinents pour le séjour
- Privilégie les codes CIM-10 les plus SPÉCIFIQUES (4e ou 5e caractère)
- Ne propose que des diagnostics CLAIREMENT mentionnés dans le texte
- ATTENTION aux valeurs biologiques : ne code PAS un diagnostic si les valeurs sont dans les normes indiquées entre crochets [N: min-max]. Exemple : Créatinine 76 [N: 50-120] = NORMAL, pas d'insuffisance rénale.

DIAGNOSTIC PRINCIPAL : {dp_texte or "Non identifié"}

DAS DÉJÀ CODÉS :
{existing_str}

CONTEXTE CLINIQUE :
{ctx_str}

TEXTE MÉDICAL :
{text[:4000]}

Réponds UNIQUEMENT avec un objet JSON au format suivant, sans aucun texte avant ou après :
{{
  "diagnostics_supplementaires": [
    {{
      "texte": "description du diagnostic",
      "code_cim10": "X99.9",
      "justification": "pourquoi ce DAS est pertinent pour le séjour"
    }}
  ]
}}

Si aucun DAS supplémentaire n'est pertinent, retourne : {{"diagnostics_supplementaires": []}}"""


def extract_das_llm(
    text: str,
    contexte: dict,
    existing_das: list[str],
    dp_texte: str,
    cache: OllamaCache | None = None,
) -> list[dict]:
    """Extract additional DAS via an LLM pass.

    Args:
        text: Full medical text.
        contexte: Patient context (sex, age, etc.).
        existing_das: DAS already coded (text + code).
        dp_texte: Text of the principal diagnosis.
        cache: Optional Ollama cache.

    Returns:
        List of {texte, code_cim10, justification} dicts for the detected DAS.
    """
    import hashlib

    # Cache key based on a hash of the text
    text_hash = hashlib.md5(text[:4000].encode()).hexdigest()[:16]
    cache_key_text = f"das_extract::{text_hash}"

    # Check the cache
    if cache is not None:
        cached = cache.get(cache_key_text, "das_llm")
        if cached is not None:
            logger.info("Cache hit pour extraction DAS LLM")
            return cached.get("diagnostics_supplementaires", [])

    # Build the prompt and call Ollama
    prompt = _build_prompt_das_extraction(text, contexte, existing_das, dp_texte)
    result = call_ollama(prompt, temperature=0.1, max_tokens=2000)

    if result is None:
        logger.warning("Extraction DAS LLM : Ollama non disponible")
        return []

    das_list = result.get("diagnostics_supplementaires", [])
    if not isinstance(das_list, list):
        logger.warning("Extraction DAS LLM : format inattendu")
        return []

    # Store in the cache
    if cache is not None:
        cache.put(cache_key_text, "das_llm", result)

    logger.info("Extraction DAS LLM : %d diagnostics supplémentaires détectés", len(das_list))
    return das_list


def enrich_dossier(dossier: DossierMedical) -> None:
    """Enrich the DP and all DAS of a dossier via RAG.

    Uses a persistent cache and parallelises the Ollama calls for the DAS and
    CCAM acts (max_workers = OLLAMA_MAX_PARALLEL).
    """
    cache = OllamaCache(OLLAMA_CACHE_PATH, OLLAMA_MODEL)

    contexte = build_enriched_context(dossier)

    # Phase 1: DP alone (the DAS context depends on it), then expose the DP
    # and the already-coded DAS to the context used by the DAS passes
    if dossier.diagnostic_principal:
        logger.info("RAG enrichissement DP : %s", dossier.diagnostic_principal.texte)
        enrich_diagnostic(dossier.diagnostic_principal, contexte, est_dp=True, cache=cache)
        contexte["dp_texte"] = dossier.diagnostic_principal.texte
        contexte["das_codes_existants"] = [
            f"{d.cim10_suggestion} ({d.texte})"
            for d in dossier.diagnostics_associes
            if d.cim10_suggestion
        ]

    # Phase 2: DAS + acts in parallel
    das_list = dossier.diagnostics_associes
    actes_list = dossier.actes_ccam

    if das_list or actes_list:
        with ThreadPoolExecutor(max_workers=OLLAMA_MAX_PARALLEL) as executor:
            futures = []
            for das in das_list:
                logger.info("RAG enrichissement DAS : %s", das.texte)
                futures.append(executor.submit(enrich_diagnostic, das, contexte, False, cache))
            for acte in actes_list:
                logger.info("RAG enrichissement CCAM : %s", acte.texte)
                futures.append(executor.submit(enrich_acte, acte, contexte, cache))
            for f in as_completed(futures):
                f.result()  # propagate exceptions

    cache.save()
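

# --- Hedged usage sketch (illustrative; not part of the pipeline) --------------
# The intended call order around enrich_dossier(): the caller builds a
# DossierMedical upstream (extraction/anonymisation), then this single call
# handles caching, RAG retrieval and Ollama justification for the DP, the DAS
# and the CCAM acts. Dossier construction is elided here.
def _demo_enrich(dossier: DossierMedical) -> None:
    enrich_dossier(dossier)
    dp = dossier.diagnostic_principal
    if dp is not None and dp.cim10_suggestion:
        logger.info("DP codé %s (confiance %s)", dp.cim10_suggestion, dp.cim10_confidence)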
242
t2a_install_rag_cleanup/src/medical/severity.py
Normal file
@@ -0,0 +1,242 @@
"""Heuristic severity and CMA/CMS detection for GHM coding.

Phase 1: heuristic based on textual markers and CIM-10 code roots.
Phase 2 (future): official ATIH CMA/CMS tables.
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field

from .cim10_dict import load_dict, normalize_text

logger = logging.getLogger(__name__)


# --- Severity markers in the text ---
# (accent-free forms only: normalize_text() strips accents before matching)

_SEVERE_MARKERS = {
    "aigu", "aigue", "severe", "grave", "maligne", "malin",
    "foudroyant", "foudroyante", "necrosant", "necrosante",
    "septique", "decompense", "decompensee", "choc",
    "defaillance", "hemorragique",
    "fulminant", "fulminante", "massif", "massive", "critique",
}

_MODERATE_MARKERS = {
    "modere", "moderee", "moderes", "moderees",
    "subaigu", "subaigue",
    "persistant", "persistante", "recidivant", "recidivante",
}

_MILD_MARKERS = {
    "chronique", "leger", "legere",
    "benin", "benigne", "mineur", "mineure",
    "superficiel", "superficielle", "stable",
}


# --- CIM-10 roots frequently classed as CMA (Phase 1 heuristic) ---
# These roots are known to be frequently classed as CMA in the ATIH tables.

_HEURISTIC_CMA_ROOTS: set[str] = {
    # Infectious
    "A41",  # Sepsis
    "A40",  # Streptococcal septicaemia
    # Haematology / nutrition
    "D64",  # Anaemia
    "D65",  # DIC
    "E46",  # Malnutrition
    "E87",  # Fluid/electrolyte disorders
    "E86",  # Dehydration
    # Metabolic
    "E11",  # Type 2 diabetes (with complications)
    "E10",  # Type 1 diabetes (with complications)
    # Cardiovascular
    "I48",  # Atrial fibrillation
    "I50",  # Heart failure
    "I26",  # Pulmonary embolism
    "I80",  # Venous thrombosis
    # Respiratory
    "J18",  # Pneumonia
    "J96",  # Respiratory failure
    "J69",  # Aspiration pneumonia
    # Renal
    "N17",  # Acute kidney injury
    "N18",  # Chronic kidney disease
    "N39",  # Urinary tract infection
    # Hepatic
    "K72",  # Hepatic failure
    # Nosocomial infectious
    "T81",  # Procedure complications (post-op infection)
    "T80",  # Post-infusion complications
}


_cma_levels: dict[str, int] | None = None


def _load_cma_levels() -> dict[str, int]:
    """Load the official CMA levels from data/cma_levels.json (lazily)."""
    global _cma_levels
    if _cma_levels is not None:
        return _cma_levels
    from ..config import CMA_LEVELS_PATH
    try:
        data = json.loads(CMA_LEVELS_PATH.read_text(encoding="utf-8"))
        _cma_levels = {k: int(v) for k, v in data.items()}
        logger.debug("CMA levels chargés : %d codes", len(_cma_levels))
    except FileNotFoundError:
        logger.warning("Fichier CMA levels non trouvé : %s", CMA_LEVELS_PATH)
        _cma_levels = {}
    except Exception:
        logger.warning("Erreur chargement CMA levels", exc_info=True)
        _cma_levels = {}
    return _cma_levels


@dataclass
class SeverityInfo:
    """Result of a diagnosis severity evaluation."""
    est_cma_probable: bool = False
    niveau_severite: str = "non_evalue"  # "leger" | "modere" | "severe" | "non_evalue"
    niveau_cma: int = 1  # 1 (not CMA), 2, 3 or 4 (official ATIH levels)
    marqueurs_trouves: list[str] = field(default_factory=list)


def _detect_severity_markers(text: str) -> tuple[str, list[str]]:
    """Detect severity markers in a normalised text.

    Returns:
        (niveau, marqueurs_trouves) where niveau is "severe", "modere", "leger" or "non_evalue".
    """
    text_norm = normalize_text(text)
    words = set(text_norm.split())

    found_severe = words & _SEVERE_MARKERS
    found_moderate = words & _MODERATE_MARKERS
    found_mild = words & _MILD_MARKERS

    all_found = list(found_severe | found_moderate | found_mild)

    if found_severe:
        return "severe", all_found
    if found_moderate:
        return "modere", all_found
    if found_mild:
        return "leger", all_found
    return "non_evalue", []
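

# --- Hedged sketch (illustrative only; assumes normalize_text() lowercases and
# --- strips accents, as the accent-free marker sets above imply) ---------------
# Worked example for the marker scan: "choc septique décompensé" normalises to
# three tokens that all sit in _SEVERE_MARKERS, so the most severe level wins.
def _demo_severity_markers() -> None:
    niveau, marqueurs = _detect_severity_markers("choc septique décompensé")
    assert niveau == "severe"
    assert {"choc", "septique", "decompense"} <= set(marqueurs)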


def _is_heuristic_cma(code: str) -> bool:
    """Check whether a CIM-10 code is probably CMA according to the heuristic roots."""
    if not code:
        return False
    code_upper = code.upper()
    for root in _HEURISTIC_CMA_ROOTS:
        if code_upper.startswith(root):
            return True
    return False


def evaluate_severity(diagnostic) -> SeverityInfo:
    """Evaluate the severity of a diagnosis (text + CIM-10 code).

    Uses the official ATIH CMA levels (2/3/4) first, with a fallback on the
    CIM-10 root heuristic.

    Args:
        diagnostic: Object with texte and cim10_suggestion attributes.

    Returns:
        SeverityInfo with est_cma_probable, niveau_cma, niveau_severite, marqueurs_trouves.
    """
    info = SeverityInfo()

    # 1. Textual markers from the diagnosis text
    texte = diagnostic.texte or ""
    niveau, marqueurs = _detect_severity_markers(texte)

    # 2. Also look in the CIM-10 dictionary label
    code = diagnostic.cim10_suggestion
    if code:
        cim10_dict = load_dict()
        label = cim10_dict.get(code, "")
        if label:
            niveau_label, marqueurs_label = _detect_severity_markers(label)
            # Keep the most severe level
            severity_order = {"severe": 3, "modere": 2, "leger": 1, "non_evalue": 0}
            if severity_order.get(niveau_label, 0) > severity_order.get(niveau, 0):
                niveau = niveau_label
                marqueurs = list(set(marqueurs + marqueurs_label))

    info.niveau_severite = niveau
    info.marqueurs_trouves = marqueurs

    # 3. Official ATIH CMA lookup (takes priority)
    if code:
        cma_levels = _load_cma_levels()
        official_level = cma_levels.get(code)
        if official_level:
            info.niveau_cma = official_level
            info.est_cma_probable = True
        elif _is_heuristic_cma(code):
            # Heuristic fallback → level 2
            info.niveau_cma = 2
            info.est_cma_probable = True

    return info


def enrich_dossier_severity(dp, das_list: list) -> tuple[list[str], int, int]:
    """Enrich a dossier's diagnoses with severity information.

    Modifies the diagnoses in place (est_cma, est_cms, niveau_severite attributes).

    Args:
        dp: Principal diagnosis.
        das_list: List of associated diagnoses.

    Returns:
        (alertes, cma_count, cms_count).
    """
    alertes = []

    # Evaluate the DP
    if dp and dp.cim10_suggestion:
        info = evaluate_severity(dp)
        dp.niveau_severite = info.niveau_severite
        dp.niveau_cma = info.niveau_cma
        if info.est_cma_probable:
            dp.est_cma = True

    # Evaluate each DAS
    cma_count = 0
    cms_count = 0
    for das in das_list:
        if not das.cim10_suggestion:
            continue
        info = evaluate_severity(das)
        das.niveau_severite = info.niveau_severite
        das.niveau_cma = info.niveau_cma
        if info.est_cma_probable:
            das.est_cma = True
            cma_count += 1
            # CMS = CMA level 4 or severe CMA
            if info.niveau_cma >= 4 or info.niveau_severite == "severe":
                das.est_cms = True
                cms_count += 1
            alertes.append(
                f"CMA niveau {info.niveau_cma} : '{das.texte}' ({das.cim10_suggestion}) — "
                f"sévérité {info.niveau_severite}"
                + (f", marqueurs : {', '.join(info.marqueurs_trouves)}" if info.marqueurs_trouves else "")
            )

    if cma_count >= 2:
        alertes.insert(0, f"{cma_count} CMA probables détectées — impact potentiel sur le niveau de sévérité GHM")

    return alertes, cma_count, cms_count
1
t2a_install_rag_cleanup/src/quality/__init__.py
Normal file
@@ -0,0 +1 @@
"""Quality controls (vetoes) to reduce CPAM contestability."""
170
t2a_install_rag_cleanup/src/quality/decision_engine.py
Normal file
@@ -0,0 +1,170 @@
"""Decision engine (quality post-processing).

Goal: keep the model's proposal (cim10_suggestion) while producing a more
defensible *final output* (cim10_final + cim10_decision).

This module is deterministic, short, and auditable.
"""

from __future__ import annotations

import re
import unicodedata
from typing import Optional

from ..config import CodeDecision, Diagnostic, DossierMedical


# --- "Aetiological" rules: do not assert an aetiology without specific evidence ---

IRON_MARKERS = (
    "ferrit",  # ferritine
    "transferr",  # transferrine
    "saturation",  # transferrin saturation
    "cst",  # saturation coefficient
    "carence mart",
    "martiale",
    "ferripr",  # ferriprive
    "fer intraveineux",
    "fer iv",
    "traitement martial",
)


def _norm(s: str) -> str:
    s = s.replace("’", "'")
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    return re.sub(r"\s+", " ", s).strip()


def _first_float(text: str) -> Optional[float]:
    m = re.search(r"(-?\d+(?:[\.,]\d+)?)", text)
    if not m:
        return None
    return float(m.group(1).replace(",", "."))


def _parse_normal_range(text: str) -> tuple[Optional[float], Optional[float]]:
    # E.g. "[N: 12-17]" / "[N: 12 - 17]"
    m = re.search(r"\[\s*N\s*:\s*([0-9]+(?:[\.,][0-9]+)?)\s*-\s*([0-9]+(?:[\.,][0-9]+)?)\s*\]", text)
    if not m:
        return None, None
    lo = float(m.group(1).replace(",", "."))
    hi = float(m.group(2).replace(",", "."))
    return lo, hi
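

# --- Hedged sketch (illustrative only; the biology line is invented) -----------
# The two small parsers above on a realistic line: _first_float grabs the
# measured value (comma decimals accepted) and _parse_normal_range reads the
# bracketed reference interval appended by the extraction pipeline.
def _demo_bio_parsing() -> None:
    line = "Hémoglobine 9,8 g/dL [N: 12-16]"
    assert _first_float(line) == 9.8
    assert _parse_normal_range(line) == (12.0, 16.0)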


def _anemia_bio(diag: Diagnostic) -> bool:
    # 1) via preuves_cliniques (often already interpreted); _norm so that the
    # accented forms ("hémoglobine", "anémie") match as well
    for p in diag.preuves_cliniques or []:
        blob = _norm(f"{p.element} {p.interpretation}")
        if "hemoglob" in blob or blob.startswith("hb"):
            val = _first_float(p.element) or _first_float(p.interpretation)
            lo, _ = _parse_normal_range(p.element)
            lo = lo if lo is not None else 12.0
            if val is not None and val < lo:
                return True
        if "confirm" in blob and "anemie" in blob:
            return True
    # 2) fallback: the text mentions haemoglobin (accents already stripped by _norm)
    ex = _norm(diag.source_excerpt or "")
    if "hemoglob" in ex:
        return True
    return False


def _iron_evidence_blob(dossier: DossierMedical, diag: Diagnostic) -> str:
    parts: list[str] = []

    # Patient evidence (excerpts + structured elements)
    if diag.source_excerpt:
        parts.append(str(diag.source_excerpt))

    for p in diag.preuves_cliniques or []:
        parts.append(f"{p.element} {p.interpretation}")

    # Global key biology (if ferritin/iron was captured elsewhere)
    for b in dossier.biologie_cle or []:
        parts.append(f"{b.test} {b.valeur or ''}")

    # Treatments (if iron supplementation is documented)
    for t in dossier.traitements_sortie or []:
        parts.append(f"{t.medicament} {t.posologie or ''}")

    return _norm("\n".join(parts))


def apply_decisions(dossier: DossierMedical) -> None:
    """Apply final decisions on DP/DAS.

    - Does not erase the model's suggestion.
    - Always fills cim10_final when a suggestion exists.
    - Fills cim10_decision only when action != KEEP (keeps the JSON readable).
    """

    def _set_default_final(diag: Diagnostic):
        if diag.cim10_suggestion and diag.cim10_final is None:
            diag.cim10_final = diag.cim10_suggestion

    # DP
    if dossier.diagnostic_principal:
        _set_default_final(dossier.diagnostic_principal)

    # DAS
    for das in dossier.diagnostics_associes or []:
        _set_default_final(das)

    # --- Rule: D50 without an iron panel -> downgrade to D64.9 + needs_info ---
    for das in dossier.diagnostics_associes or []:
        if das.cim10_suggestion != "D50":
            continue

        blob = _iron_evidence_blob(dossier, das)
        has_iron = any(m in blob for m in IRON_MARKERS)
        has_anemia = _anemia_bio(das)

        # Without even a biological anaemia, do not automate anything.
        if not has_anemia:
            continue

        if not has_iron:
            das.cim10_final = "D64.9"
            das.cim10_decision = CodeDecision(
                action="DOWNGRADE",
                final_code="D64.9",
                downgraded_from="D50",
                reason="Anémie biologique sans preuve d'étiologie ferriprive (bilan martial absent/insuffisant).",
                needs_info=[
                    "Bilan martial disponible ? (ferritine, fer, CST/transferrine)",
                    "Mention explicite 'anémie ferriprive' ou carence martiale ?",
                    "Traitement martial (fer per os/IV) documenté ?",
                ],
                applied_rules=["RULE-D50-NEEDS-IRON"],
            )


def decision_summaries(dossier: DossierMedical) -> list[str]:
    """Return human-readable lines to inject into alertes_codage."""
    lines: list[str] = []

    def _summ(where: str, d: Diagnostic):
        dec = d.cim10_decision
        if not dec or dec.action == "KEEP":
            return
        if dec.action == "DOWNGRADE":
            lines.append(f"DECISION: {where} {dec.downgraded_from}→{dec.final_code} ({', '.join(dec.applied_rules)})")
            for ni in dec.needs_info[:3]:
                lines.append(f"DECISION: besoin_info: {ni}")
        elif dec.action == "REMOVE":
            lines.append(f"DECISION: {where} {d.cim10_suggestion} supprimé ({', '.join(dec.applied_rules)})")

    if dossier.diagnostic_principal:
        _summ("diagnostic_principal", dossier.diagnostic_principal)

    for i, das in enumerate(dossier.diagnostics_associes or []):
        _summ(f"diagnostics_associes[{i}]", das)

    return lines
380
t2a_install_rag_cleanup/src/quality/veto_engine.py
Normal file
@@ -0,0 +1,380 @@
"""Veto engine (contestability control).

Goal: automatically block contestable CIM-10/CCAM proposals (missing
evidence, negated/conditional phrasing, inconsistent duplicates, etc.).

This module is deliberately simple and deterministic: it must be stable,
auditable, and independent of the models.
"""

from __future__ import annotations

import re
import unicodedata
from typing import Iterable

from ..config import (
    ActeCCAM,
    BiologieCle,
    Diagnostic,
    DossierMedical,
    VetoIssue,
    VetoReport,
)


# NOTE: vetoes are deterministic and auditable.
# We avoid treating the model's "reasoning" as evidence.
# Cues are kept accent-free because _norm() strips accents before matching.

_NEGATION_CUES = (
    "pas de",
    "pas d",
    "absence de",
    "non retenu",
    "exclu",
    "a eliminer",
    "negatif",
)

_CONDITIONAL_CUES = (
    "si",
    "s il",  # frequent OCR form of "s'il"
    "eventuel",
    "suspect",
    "probable",
    "hypothese",
    "?",  # punctuation is stripped by _norm, so "?" is checked on the raw sentence
)

_EVIDENCE_TEMPLATE_CUES = (
    "score",
    "fib4",
    "fibrosis-4",
    "test de depistage",
    "outil de depistage",
)


def _norm(s: str) -> str:
    """Light normalisation (lowercase + accent-free) to cope with OCR output."""
    s = s.replace("’", "'")
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    # collapse punctuation into spaces
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()


def _split_sentences(text: str) -> list[str]:
    # deliberately simple: robust on OCR output
    text = text.replace("\r", "\n")
    parts = re.split(r"[\n\.\;\:]+", text)
    return [p.strip() for p in parts if p.strip()]


def _concept_keywords(label: str) -> list[str]:
    """Extract discriminating keywords from the Diagnostic label."""
    stop = {
        "de", "du", "des", "la", "le", "les", "un", "une", "et", "a", "au", "aux",
        "gauche", "droite", "bilaterale", "bilat", "chronique", "aigue", "aigu",
        "sans", "avec",
    }
    tokens = [t for t in _norm(label).split() if len(t) >= 4 and t not in stop]
    # keep order, avoid duplicates
    seen: set[str] = set()
    out: list[str] = []
    for t in tokens:
        if t not in seen:
            seen.add(t)
            out.append(t)
    return out[:5]


def _analyze_neg_cond(excerpts: Iterable[str], label: str) -> tuple[bool, bool, bool, bool]:
    """Return (negated, conditional, contradictory, positive).

    *negated*: a sentence mentions the concept with a negation close by.
    *conditional*: a sentence mentions the concept with a conditional cue.
    *positive*: a sentence mentions the concept without a nearby negation.
    *contradictory*: both negated and positive.
    """
    kws = _concept_keywords(label)
    if not kws:
        return False, False, False, False

    negated = False
    conditional = False
    positive = False

    for ex in excerpts:
        if not ex or not str(ex).strip():
            continue
        for sent in _split_sentences(str(ex)):
            ns = _norm(sent)
            if not ns:
                continue
            # is the concept mentioned?
            hit_pos = None
            for kw in kws:
                pos = ns.find(kw)
                if pos != -1:
                    hit_pos = pos
                    break
            if hit_pos is None:
                continue

            pre = ns[max(0, hit_pos - 40):hit_pos]
            has_neg = any(cue in pre for cue in _NEGATION_CUES)
            # "?" survives only in the raw sentence (_norm strips punctuation);
            # word cues are matched on word boundaries so that "si" does not
            # fire inside words like "transfusion".
            has_cond = "?" in sent or any(
                re.search(rf"\b{re.escape(cue)}\b", ns)
                for cue in _CONDITIONAL_CUES
                if cue != "?"
            )

            if has_neg:
                negated = True
            else:
                positive = True

            if has_cond:
                conditional = True

    contradictory = negated and positive
    return negated, conditional, contradictory, positive
|
||||
|
||||
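
# Illustrative walk-through of _analyze_neg_cond: with the label "thrombopénie"
# and the excerpt "Bilan : pas de thrombopénie ce jour", the keyword
# "thrombopenie" is found with "pas de" in the 40-character window before it,
# so the expected result is (True, False, False, False).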


def _evidence_excerpts(d: Diagnostic | ActeCCAM) -> list[str]:
    """Return only evidence (excerpts), never the model's reasoning."""
    texts: list[str] = []
    if getattr(d, "source_excerpt", None):
        texts.append(str(getattr(d, "source_excerpt")))
    # RAG sources (excerpts)
    for s in getattr(d, "sources_rag", []) or []:
        if getattr(s, "extrait", None):
            texts.append(str(s.extrait))
    return [t for t in texts if t.strip()]


def _has_evidence(d: Diagnostic | ActeCCAM) -> bool:
    if getattr(d, "source_excerpt", None):
        return True
    if getattr(d, "sources_rag", None):
        # a single RAG excerpt is enough
        for s in d.sources_rag:
            if s.extrait and str(s.extrait).strip():
                return True
    if isinstance(d, Diagnostic) and getattr(d, "preuves_cliniques", None):
        return len(d.preuves_cliniques) > 0
    return False


def _has_template_evidence(excerpts: Iterable[str]) -> bool:
    joined = _norm("\n".join([str(x) for x in excerpts if x]))
    cues = [_norm(c) for c in _EVIDENCE_TEMPLATE_CUES]
    return any(cue in joined for cue in cues)


def _parse_float(v: str | None) -> float | None:
    if v is None:
        return None
    s = str(v).strip().replace(",", ".")
    # extract the first number
    m = re.search(r"(-?\d+(?:\.\d+)?)", s)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None
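
# Illustrative examples of _parse_float:
#     _parse_float("  1,2 g/L ")  -> 1.2
#     _parse_float("N/A")         -> None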


def _get_bio_value(bios: list[BiologieCle], keywords: tuple[str, ...]) -> float | None:
    for b in bios:
        t = (b.test or "").lower()
        if any(k in t for k in keywords):
            return _parse_float(b.valeur)
    return None


def apply_vetos(dossier: DossierMedical) -> VetoReport:
    """Apply deterministic vetoes and return a report.

    Verdicts:
    - FAIL: at least one HARD veto.
    - NEED_INFO: no HARD veto, at least one MEDIUM.
    - PASS: no HARD/MEDIUM veto.
    """

    issues: list[VetoIssue] = []
    seen_issue_keys: set[tuple[str, str, str]] = set()  # (veto, where, message)

    def add(veto: str, severity: str, where: str, message: str):
        key = (veto, where, message)
        if key in seen_issue_keys:
            return
        seen_issue_keys.add(key)
        issues.append(VetoIssue(veto=veto, severity=severity, where=where, message=message))

    # -----------------------------
    # VETO-02: code without evidence
    # -----------------------------
    dp = dossier.diagnostic_principal
    if dp and dp.cim10_suggestion:
        if not _has_evidence(dp):
            add("VETO-02", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} sans preuve exploitable")

    for i, das in enumerate(dossier.diagnostics_associes):
        if das.cim10_suggestion and not _has_evidence(das):
            add("VETO-02", "MEDIUM", f"diagnostics_associes[{i}]", f"DAS {das.cim10_suggestion} sans preuve exploitable")

    for i, acte in enumerate(dossier.actes_ccam):
        if acte.code_ccam_suggestion and not _has_evidence(acte):
            add("VETO-02", "HARD", f"actes_ccam[{i}]", f"Acte {acte.code_ccam_suggestion} sans preuve exploitable")

    # -------------------------------------------------
    # VETO-03: negation / conditional wording IN THE EVIDENCE
    # (not in the model's reasoning)
    # -------------------------------------------------
    if dp and dp.cim10_suggestion:
        excerpts = _evidence_excerpts(dp)
        neg, cond, contra, pos = _analyze_neg_cond(excerpts, dp.texte or dp.cim10_suggestion)
        if neg and not pos:
            add("VETO-03", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} contredit par la preuve (négation)")
        elif contra:
            add("VETO-03", "MEDIUM", "diagnostic_principal", f"DP {dp.cim10_suggestion} preuves contradictoires (positif vs négatif)")
        elif cond and dp.cim10_confidence == "high":
            add("VETO-03", "MEDIUM", "diagnostic_principal", f"DP {dp.cim10_suggestion} basé sur du conditionnel")

    for i, das in enumerate(dossier.diagnostics_associes):
        if not das.cim10_suggestion:
            continue
        excerpts = _evidence_excerpts(das)
        neg, cond, contra, pos = _analyze_neg_cond(excerpts, das.texte or das.cim10_suggestion)
        where = f"diagnostics_associes[{i}]"
        if neg and not pos:
            # Under a CPAM audit an explicit negation is blocking, especially
            # when the model claims "high" confidence.
            severity = "HARD" if das.cim10_confidence == "high" else "MEDIUM"
            add("VETO-03", severity, where, f"DAS {das.cim10_suggestion} contredit par la preuve (négation)")
        elif contra:
            add("VETO-03", "MEDIUM", where, f"DAS {das.cim10_suggestion} preuves contradictoires")
        elif cond and das.cim10_confidence == "high":
            add("VETO-03", "LOW", where, f"DAS {das.cim10_suggestion} potentiellement conditionnel")

    # -------------------------------------------------
    # VETO-15: "score/test" style evidence (high risk of over-coding)
    # -------------------------------------------------
    for i, das in enumerate(dossier.diagnostics_associes):
        if not das.cim10_suggestion:
            continue
        excerpts = _evidence_excerpts(das)
        if _has_template_evidence(excerpts) and ("fibrose" in _norm(das.texte or "") or str(das.cim10_suggestion).startswith("K74")):
            add("VETO-15", "MEDIUM", f"diagnostics_associes[{i}]", f"{das.cim10_suggestion}: preuve issue d'un score/test (à confirmer par diagnostic explicite)")

    # -------------------------------------------------
    # VETO-16: label→code mismatch (heuristic)
    # -------------------------------------------------
    for i, das in enumerate(dossier.diagnostics_associes):
        if not das.cim10_suggestion:
            continue
        label_n = _norm(das.texte or "")
        if "sacroili" in label_n and str(das.cim10_suggestion) == "M53.3":
            add("VETO-16", "MEDIUM", f"diagnostics_associes[{i}]", "Sacro-iliite : M53.3 semble hors-sujet (à revalider via candidats, ex. M46.1)")

    # -------------------------------------------------
    # VETO-06: DP duplicated among the DAS (inconsistent)
    # -------------------------------------------------
    if dp and dp.cim10_suggestion:
        dp_code = dp.cim10_suggestion
        for i, das in enumerate(dossier.diagnostics_associes):
            if das.cim10_suggestion == dp_code:
                add("VETO-06", "HARD", "diagnostics_associes", f"Code DP {dp_code} dupliqué dans les DAS (index {i})")
                break

    # -------------------------------------------------
    # VETO-07: duplicate DAS (should be merged)
    # -------------------------------------------------
    seen: dict[str, int] = {}
    for i, das in enumerate(dossier.diagnostics_associes):
        c = das.cim10_suggestion
        if not c:
            continue
        if c in seen:
            add("VETO-07", "MEDIUM", "diagnostics_associes", f"Doublon DAS {c} (index {seen[c]} et {i})")
        else:
            seen[c] = i

    # -------------------------------------------------
    # VETO-09: simple biology contradiction (platelets / creatinine)
    # -------------------------------------------------
    # Platelets: the code suggests thrombocytopenia (D69*) but the value is normal
    plaquettes = _get_bio_value(dossier.biologie_cle, ("plaquette", "platelet"))
    if plaquettes is not None:
        # deliberately wide threshold to avoid false positives
        if dp and dp.cim10_suggestion and dp.cim10_suggestion.startswith("D69") and plaquettes >= 150:
            add("VETO-09", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} incompatible avec plaquettes={plaquettes} (sans preuve explicite)")
        for i, das in enumerate(dossier.diagnostics_associes):
            if das.cim10_suggestion and das.cim10_suggestion.startswith("D69") and plaquettes >= 150:
                # If the evidence explicitly says "no thrombocytopenia", or the
                # model is very confident, escalate to HARD (maximum CPAM risk).
                excerpts = _evidence_excerpts(das)
                neg, _, _, _ = _analyze_neg_cond(excerpts, das.texte or das.cim10_suggestion)
                severity = "HARD" if (das.cim10_confidence == "high" or neg) else "MEDIUM"
                add("VETO-09", severity, f"diagnostics_associes[{i}]", f"DAS {das.cim10_suggestion} incompatible avec plaquettes={plaquettes}")

    creat = _get_bio_value(dossier.biologie_cle, ("créat", "creat", "creatin"))
    if creat is not None:
        # extra caution: renal failure never hard-fails here, it only raises an alert
        for i, das in enumerate(dossier.diagnostics_associes):
            if das.cim10_suggestion and das.cim10_suggestion.startswith(("N17", "N18", "N19")) and creat < 110 and das.cim10_confidence == "high":
                add("VETO-09", "LOW", f"diagnostics_associes[{i}]", f"IR {das.cim10_suggestion} à confirmer (créat={creat})")

    # -------------------------------------------------
    # VETO-12: over-confidence
    # -------------------------------------------------
    def _overconf(d: Diagnostic | ActeCCAM) -> bool:
        conf = getattr(d, "cim10_confidence", None) or getattr(d, "ccam_confidence", None)
        return conf == "high" and not _has_evidence(d)

    if dp and dp.cim10_suggestion and _overconf(dp):
        add("VETO-12", "HARD", "diagnostic_principal", f"DP {dp.cim10_suggestion} en high sans preuve")

    # -------------------------------------------------
    # Post-processing: when a HARD veto exists for a given 'where',
    # drop the weaker, redundant vetoes for the same location.
    # Example: thrombocytopenia (VETO-09 HARD) -> VETO-03 becomes secondary.
    # -------------------------------------------------
    hard_where = {it.where for it in issues if it.severity == "HARD"}
    if hard_where:
        issues = [
            it for it in issues
            if not (it.where in hard_where and it.severity in ("LOW", "MEDIUM") and it.veto in ("VETO-03", "VETO-15"))
        ]

    # -----------------------------
    # Verdict + score
    # -----------------------------
    hard = any(i.severity == "HARD" for i in issues)
    medium = any(i.severity == "MEDIUM" for i in issues)

    if hard:
        verdict = "FAIL"
    elif medium:
        verdict = "NEED_INFO"
    else:
        verdict = "PASS"

    score = 100
    for it in issues:
        if it.severity == "HARD":
            score -= 30
        elif it.severity == "MEDIUM":
            score -= 10
        else:
            score -= 3
    score = max(0, min(100, score))

    return VetoReport(verdict=verdict, score_contestabilite=score, issues=issues)
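
# Illustrative usage of apply_vetos (assumes a populated DossierMedical `dossier`):
#     report = apply_vetos(dossier)
#     if report.verdict == "FAIL":
#         ...  # at least one HARD veto: surface report.issues to the DIM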
0
t2a_install_rag_cleanup/src/viewer/__init__.py
Normal file
20
t2a_install_rag_cleanup/src/viewer/__main__.py
Normal file
@@ -0,0 +1,20 @@
"""Entry point: python -m src.viewer [--host 127.0.0.1] [--port 5000] [--debug]."""

import argparse

from .app import create_app


def main():
    parser = argparse.ArgumentParser(description="Viewer CIM-10 T2A")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=5000)
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    app = create_app()
    app.run(host=args.host, port=args.port, debug=args.debug)


if __name__ == "__main__":
    main()
872
t2a_install_rag_cleanup/src/viewer/app.py
Normal file
@@ -0,0 +1,872 @@
"""Flask app: the CIM-10 T2A viewer."""

from __future__ import annotations

import json
import logging
import re
from collections import Counter
from pathlib import Path

import requests
from flask import Flask, Response, abort, jsonify, render_template, request
from markupsafe import Markup
from werkzeug.utils import secure_filename

from ..config import (
    ANONYMIZED_DIR, STRUCTURED_DIR, INPUT_DIR, REPORTS_DIR,
    OLLAMA_URL, CCAM_DICT_PATH, DossierMedical,
    ALLOWED_EXTENSIONS, UPLOAD_MAX_SIZE_MB,
    CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CIM10_DICT_PATH, CIM10_SUPPLEMENTS_PATH,
)
from .. import config as cfg
from .referentiels import ReferentielManager
from .validation import ValidationManager

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def compute_group_stats(items: list[dict]) -> dict:
    """Compute aggregate statistics for a group of dossiers.

    Returns:
        {das_count, alertes_count, actes_count, cma_count}
    """
    das_count = 0
    alertes_count = 0
    actes_count = 0
    cma_count = 0

    for item in items:
        d = item["dossier"]
        das_count += len(d.diagnostics_associes)
        alertes_count += len(d.alertes_codage)
        actes_count += len(d.actes_ccam)
        for diag in d.diagnostics_associes:
            if diag.est_cma:
                cma_count += 1
        if d.diagnostic_principal and d.diagnostic_principal.est_cma:
            cma_count += 1

    return {
        "das_count": das_count,
        "alertes_count": alertes_count,
        "actes_count": actes_count,
        "cma_count": cma_count,
    }


def compute_dashboard_stats(groups: dict[str, list[dict]]) -> dict:
    """Compute the pipeline-wide statistics for the dashboard."""
    total_dossiers = len(groups)
    total_fichiers = 0
    total_das = 0
    total_actes = 0
    total_alertes = 0
    total_cma = 0
    total_cpam = 0
    dp_confidence: Counter = Counter()
    dp_validity: Counter = Counter()
    code_counter: Counter = Counter()
    ghm_types: Counter = Counter()
    severity_dist: Counter = Counter()
    processing_times: list[float] = []

    for items in groups.values():
        total_fichiers += len(items)
        for item in items:
            d = item["dossier"]
            total_das += len(d.diagnostics_associes)
            total_actes += len(d.actes_ccam)
            total_alertes += len(d.alertes_codage)
            total_cpam += len(d.controles_cpam)

            if d.processing_time_s is not None:
                processing_times.append(d.processing_time_s)

            # DP confidence & validity
            dp = d.diagnostic_principal
            if dp:
                conf = dp.cim10_confidence or "none"
                dp_confidence[conf] += 1
                if dp.cim10_suggestion:
                    dp_validity["valide"] += 1
                    code_counter[dp.cim10_suggestion] += 1
                else:
                    dp_validity["absent"] += 1
            else:
                dp_confidence["none"] += 1
                dp_validity["absent"] += 1

            # DAS codes + CMA
            for das in d.diagnostics_associes:
                if das.cim10_suggestion:
                    code_counter[das.cim10_suggestion] += 1
                if das.est_cma:
                    total_cma += 1
            if dp and dp.est_cma:
                total_cma += 1

            # GHM
            ghm = d.ghm_estimation
            if ghm:
                if ghm.type_ghm:
                    ghm_types[ghm.type_ghm] += 1
                severity_dist[ghm.severite] += 1

    top_codes = code_counter.most_common(15)
    top_max = top_codes[0][1] if top_codes else 1

    return {
        "total_dossiers": total_dossiers,
        "total_fichiers": total_fichiers,
        "total_das": total_das,
        "total_actes": total_actes,
        "total_alertes": total_alertes,
        "total_cma": total_cma,
        "total_cpam": total_cpam,
        "dp_confidence": dict(dp_confidence),
        "dp_validity": dict(dp_validity),
        "top_codes": top_codes,
        "top_max": top_max,
        "ghm_types": dict(ghm_types),
        "severity_dist": dict(severity_dist),
        "processing_time_total": sum(processing_times),
        "processing_time_avg": sum(processing_times) / len(processing_times) if processing_times else 0,
    }


def collect_cpam_controls(groups: dict[str, list[dict]]) -> list[dict]:
    """Collect the CPAM controls from every dossier."""
    controls = []
    for group_name, items in groups.items():
        for item in items:
            d = item["dossier"]
            dp_code = d.diagnostic_principal.cim10_suggestion if d.diagnostic_principal else None
            for ctrl in d.controles_cpam:
                controls.append({
                    "group_name": group_name,
                    "filepath": item["path_rel"],
                    "ctrl": ctrl,
                    "dp_code": dp_code,
                })
    controls.sort(key=lambda c: c["ctrl"].numero_ogc)
    return controls


def get_builtin_referentiels() -> list[dict]:
    """Return info about the built-in referentials (PDFs + dictionaries)."""
    rag_index_meta = Path(STRUCTURED_DIR).parent / "data" / "rag_index" / "metadata.json"
    chunks_by_doc: dict[str, int] = {}
    if rag_index_meta.exists():
        try:
            meta = json.loads(rag_index_meta.read_text(encoding="utf-8"))
            for m in meta:
                doc = m.get("document", "")
                chunks_by_doc[doc] = chunks_by_doc.get(doc, 0) + 1
        except Exception:
            pass

    refs = []
    builtin_sources = [
        ("CIM-10 FR 2026", CIM10_PDF, ".pdf", ["cim10", "cim10_alpha"]),
        ("Guide Méthodologique MCO 2026", GUIDE_METHODO_PDF, ".pdf", ["guide_methodo"]),
        ("CCAM 2025", CCAM_PDF, ".pdf", ["ccam"]),
        ("Dictionnaire CIM-10", CIM10_DICT_PATH, ".json", []),
        ("Suppléments CIM-10", CIM10_SUPPLEMENTS_PATH, ".json", []),
        ("Dictionnaire CCAM", CCAM_DICT_PATH, ".json", []),
    ]
    for name, path, ext, doc_keys in builtin_sources:
        size_mb = path.stat().st_size / (1024 * 1024) if path.exists() else 0
        chunks = sum(chunks_by_doc.get(k, 0) for k in doc_keys)
        refs.append({
            "name": name,
            "filename": path.name,
            "extension": ext,
            "size_mb": size_mb,
            "chunks": chunks,
            "exists": path.exists(),
        })
    return refs


def load_ccam_dict() -> dict[str, dict]:
    """Load the CCAM dictionary for the groupings."""
    if CCAM_DICT_PATH.exists():
        try:
            data = json.loads(CCAM_DICT_PATH.read_text(encoding="utf-8"))
            return data
        except Exception:
            logger.warning("Impossible de charger le dictionnaire CCAM")
    return {}


def scan_dossiers() -> dict[str, list[dict]]:
    """Scan output/structured/ and return the JSON files grouped by sub-directory.

    Returns:
        {"racine": [{name, path_rel, dossier}, ...], "sous-dossier": [...]}
    """
    groups: dict[str, list[dict]] = {}

    for json_path in sorted(STRUCTURED_DIR.rglob("*.json")):
        rel = json_path.relative_to(STRUCTURED_DIR)
        parts = rel.parts

        if len(parts) == 1:
            group_name = "racine"
        else:
            group_name = str(Path(*parts[:-1]))

        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
            dossier = DossierMedical.model_validate(data)
        except Exception:
            logger.warning("Impossible de charger %s", json_path)
            continue

        groups.setdefault(group_name, []).append({
            "name": json_path.stem,
            "path_rel": str(rel),
            "dossier": dossier,
        })

    return groups


def load_dossier(path_rel: str) -> DossierMedical:
    """Load a JSON file and deserialise it. Guards against path traversal."""
    safe_path = (STRUCTURED_DIR / path_rel).resolve()
    if not safe_path.is_relative_to(STRUCTURED_DIR.resolve()):
        abort(403)
    if not safe_path.exists():
        abort(404)

    data = json.loads(safe_path.read_text(encoding="utf-8"))
    return DossierMedical.model_validate(data)
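
# Illustrative note on load_dossier: a request such as "../secret.json"
# resolves outside STRUCTURED_DIR, fails the is_relative_to() check above
# and is answered with 403.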


def fetch_ollama_models() -> list[str]:
    """Call GET {OLLAMA_URL}/api/tags to list the available models."""
    try:
        resp = requests.get(f"{cfg.OLLAMA_URL}/api/tags", timeout=5)
        resp.raise_for_status()
        models = resp.json().get("models", [])
        return [m["name"] for m in models]
    except Exception:
        logger.warning("Impossible de contacter Ollama pour lister les modèles")
        return []
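
# Illustrative note on fetch_ollama_models: Ollama's GET /api/tags endpoint
# typically answers with a JSON payload shaped like
#     {"models": [{"name": "gemma3:12b", ...}, ...]}
# which is why the comprehension above keeps only each model's "name".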


# ---------------------------------------------------------------------------
# Jinja2 filters
# ---------------------------------------------------------------------------

_CONFIDENCE_COLORS = {
    "high": ("#16a34a", "#dcfce7"),
    "medium": ("#ca8a04", "#fef9c3"),
    "low": ("#dc2626", "#fee2e2"),
}

_CONFIDENCE_LABELS = {
    "high": "Haute",
    "medium": "Moyenne",
    "low": "Basse",
}


def confidence_badge(value: str | None) -> Markup:
    if not value:
        return Markup("")
    fg, bg = _CONFIDENCE_COLORS.get(value, ("#6b7280", "#f3f4f6"))
    label = _CONFIDENCE_LABELS.get(value, value)
    return Markup(
        f'<span style="display:inline-block;padding:2px 8px;border-radius:9999px;'
        f'font-size:0.75rem;font-weight:600;color:{fg};background:{bg}">'
        f'{label}</span>'
    )


def confidence_label(value: str | None) -> str:
    if not value:
        return ""
    return _CONFIDENCE_LABELS.get(value, value)


_SEVERITY_STYLES = {
    "severe": ("Sévère", "#dc2626", "#fee2e2"),
    "modere": ("Modéré", "#92400e", "#fef3c7"),
    "leger": ("Léger", "#065f46", "#d1fae5"),
}

_CMA_LEVEL_STYLES = {
    1: ("1", "#6b7280", "#f3f4f6"),  # grey - not a CMA
    2: ("2", "#065f46", "#d1fae5"),  # green
    3: ("3", "#92400e", "#fef3c7"),  # yellow/orange
    4: ("4", "#dc2626", "#fee2e2"),  # red
}


def format_duration(seconds: float | None) -> str:
    """Format a duration in seconds into a readable string (e.g. 2min 30s)."""
    if seconds is None:
        return ""
    if seconds < 60:
        return f"{seconds:.1f}s"
    minutes = int(seconds // 60)
    secs = int(seconds % 60)
    if secs == 0:
        return f"{minutes}min"
    return f"{minutes}min {secs:02d}s"
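
# Illustrative examples of format_duration:
#     format_duration(45.3)  -> "45.3s"
#     format_duration(120)   -> "2min"
#     format_duration(150)   -> "2min 30s"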


def severity_badge(value: str | None) -> Markup:
    if not value or value not in _SEVERITY_STYLES:
        return Markup("")
    label, fg, bg = _SEVERITY_STYLES[value]
    return Markup(
        f'<span style="display:inline-block;padding:2px 8px;border-radius:9999px;'
        f'font-size:0.75rem;font-weight:600;color:{fg};background:{bg}">'
        f'{label}</span>'
    )


def cma_level_badge(value: int | None) -> Markup:
    """CMA level 1-4 badge with graded colours."""
    if value is None or value < 1:
        return Markup("")
    level = min(value, 4)
    label, fg, bg = _CMA_LEVEL_STYLES.get(level, _CMA_LEVEL_STYLES[1])
    title = {1: "Pas CMA", 2: "CMA niveau 2", 3: "CMA niveau 3", 4: "CMA niveau 4"}.get(level, "")
    return Markup(
        f'<span title="{title}" style="display:inline-block;padding:2px 8px;border-radius:9999px;'
        f'font-size:0.75rem;font-weight:600;color:{fg};background:{bg}">'
        f'CMA {label}</span>'
    )


def format_dossier_name(name: str) -> str:
    """Return the full dossier name (e.g. 1_23096332)."""
    if name == "racine":
        return "Non classés"
    return name


def format_doc_name(name: str) -> str:
    """Turn a JSON file name into a readable label."""
    n = name.lower()
    if "fusionne" in n:
        return "Fusionné"
    if n.startswith("cro") or n.startswith("crh"):
        return name.split("_")[0].upper()
    if "trackare" in n:
        return "Trackare"
    if "anapath" in n:
        return "Anapath"
    return name


def format_cpam_text(text: str | None) -> Markup:
    """Convert a CPAM section text into HTML with bullet lists and paragraphs."""
    if not text:
        return Markup("")
    from markupsafe import escape
    lines = str(text).split("\n")
    html_parts: list[str] = []
    in_list = False
    for line in lines:
        stripped = line.strip()
        if not stripped:
            if in_list:
                html_parts.append("</ul>")
                in_list = False
            html_parts.append("<br>")
            continue
        if stripped.startswith("- "):
            if not in_list:
                html_parts.append("<ul style='margin:0.3rem 0;padding-left:1.2rem;'>")
                in_list = True
            html_parts.append(f"<li>{escape(stripped[2:])}</li>")
        else:
            if in_list:
                html_parts.append("</ul>")
                in_list = False
            html_parts.append(f"<p style='margin:0.2rem 0;'>{escape(stripped)}</p>")
    if in_list:
        html_parts.append("</ul>")
    return Markup("\n".join(html_parts))
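
# Illustrative example of format_cpam_text: the input
# "Constat :\n- DP contesté\n- DAS manquant" renders as one <p> paragraph
# followed by a two-item <ul>, each line escaped via markupsafe.escape.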


# ---------------------------------------------------------------------------
# App factory
# ---------------------------------------------------------------------------

def create_app() -> Flask:
    app = Flask(__name__)

    app.jinja_env.filters["confidence_badge"] = confidence_badge
    app.jinja_env.filters["confidence_label"] = confidence_label
    app.jinja_env.filters["severity_badge"] = severity_badge
    app.jinja_env.filters["cma_level_badge"] = cma_level_badge
    app.jinja_env.filters["format_duration"] = format_duration
    app.jinja_env.filters["format_dossier_name"] = format_dossier_name
    app.jinja_env.filters["format_doc_name"] = format_doc_name
    app.jinja_env.filters["format_cpam_text"] = format_cpam_text

    ccam_dict = load_ccam_dict()

    @app.route("/")
    def index():
        groups = scan_dossiers()
        group_stats = {name: compute_group_stats(items) for name, items in groups.items()}
        return render_template("index.html", groups=groups, group_stats=group_stats)

    @app.route("/dossier/<path:filepath>")
    def detail(filepath: str):
        dossier = load_dossier(filepath)
        # Find the files of the same group for navigation
        groups = scan_dossiers()
        siblings = []
        current_group = None
        rel_parts = Path(filepath).parts
        if len(rel_parts) > 1:
            current_group = str(Path(*rel_parts[:-1]))
            siblings = groups.get(current_group, [])
        return render_template(
            "detail.html",
            dossier=dossier,
            filepath=filepath,
            ccam_dict=ccam_dict,
            siblings=siblings,
            current_group=current_group,
        )

    @app.route("/dashboard")
    def dashboard():
        groups = scan_dossiers()
        stats = compute_dashboard_stats(groups)
        return render_template("dashboard.html", stats=stats, groups=groups)

    @app.route("/cpam")
    def cpam_list():
        groups = scan_dossiers()
        controls = collect_cpam_controls(groups)
        return render_template("cpam.html", controls=controls, total=len(controls), groups=groups)

    @app.route("/admin/models", methods=["GET"])
    def list_models():
        models = fetch_ollama_models()
        return jsonify({"models": models, "current": cfg.OLLAMA_MODEL})

    @app.route("/admin/models", methods=["POST"])
    def set_model():
        data = request.get_json(silent=True) or {}
        new_model = data.get("model", "").strip()
        if not new_model:
            return jsonify({"error": "Champ 'model' requis"}), 400
        cfg.OLLAMA_MODEL = new_model
        logger.info("Modèle Ollama changé : %s", new_model)
        return jsonify({"ok": True, "model": cfg.OLLAMA_MODEL})

    @app.route("/reprocess/<path:filepath>", methods=["POST"])
    def reprocess(filepath: str):
        """Re-run the full pipeline: process PDFs + merge + GHM + CPAM."""
        from ..main import process_pdf, write_outputs
        from ..medical.ghm import estimate_ghm

        dossier = load_dossier(filepath)
        input_dir = Path(__file__).parent.parent.parent / "input"

        # Collect the source PDFs (merged -> source_files, single -> source_file)
        source_names = []
        if dossier.source_files:
            source_names = list(dossier.source_files)
        elif dossier.source_file:
            source_names = [dossier.source_file]

        if not source_names:
            return jsonify({"error": "Fichier source introuvable"}), 400

        # Resolve the PDF paths under input/
        pdf_paths = []
        missing = []
        for name in source_names:
            found = None
            for p in input_dir.rglob(name):
                if p.is_file():
                    found = p
                    break
            if found:
                pdf_paths.append(found)
            else:
                missing.append(name)

        if not pdf_paths:
            return jsonify({"error": f"PDF sources introuvables : {', '.join(missing)}"}), 404

        try:
            # Derive the subdir from the first PDF found
            subdir = None
            if pdf_paths[0].parent != input_dir:
                subdir = pdf_paths[0].parent.name

            # 1. Process each PDF
            group_dossiers = []
            for pdf_path in pdf_paths:
                pdf_results = process_pdf(pdf_path)
                stem = pdf_path.stem.replace(" ", "_")
                multi = len(pdf_results) > 1
                for part_idx, (anonymized_text, new_dossier, report) in enumerate(pdf_results):
                    part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
                    write_outputs(part_stem, anonymized_text, new_dossier, report, subdir=subdir)
                    group_dossiers.append(new_dossier)

            # 2. Multi-PDF merge
            merged = None
            if len(group_dossiers) > 1 and subdir:
                try:
                    from ..medical.fusion import merge_dossiers
                    merged = merge_dossiers(group_dossiers)
                    try:
                        ghm = estimate_ghm(merged)
                        merged.ghm_estimation = ghm
                    except Exception:
                        logger.warning("Erreur estimation GHM fusionné", exc_info=True)
                except Exception:
                    logger.exception("Erreur fusion groupe %s", subdir)

            # 3. CPAM control (Excel auto-detection)
            target = merged if merged else (group_dossiers[-1] if group_dossiers else None)
            if target and subdir:
                cpam_dir = input_dir / "Control_cpam"
                cpam_path = None
                if cpam_dir.is_dir():
                    xlsx_files = sorted(cpam_dir.glob("*.xlsx"))
                    if xlsx_files:
                        cpam_path = xlsx_files[0]
                if cpam_path:
                    try:
                        from ..control.cpam_parser import parse_cpam_excel, match_dossier_ogc
                        from ..control.cpam_response import generate_cpam_response
                        cpam_data = parse_cpam_excel(str(cpam_path))
                        if cpam_data:
                            controles = match_dossier_ogc(subdir, cpam_data)
                            if controles:
                                logger.info("CPAM reprocess : %d contrôle(s) pour %s",
                                            len(controles), subdir)
                                for ctrl in controles:
                                    text, response_data, sources = generate_cpam_response(target, ctrl)
                                    ctrl.contre_argumentation = text
                                    ctrl.response_data = response_data
                                    ctrl.sources_reponse = sources
                                target.controles_cpam = controles
                    except Exception:
                        logger.exception("Erreur CPAM reprocess pour %s", subdir)

            # 4. Write the merged dossier (after CPAM)
            if merged is not None and subdir:
                struct_dir = STRUCTURED_DIR / subdir
                struct_dir.mkdir(parents=True, exist_ok=True)
                merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
                merged_path.write_text(
                    merged.model_dump_json(indent=2, exclude_none=True),
                    encoding="utf-8",
                )
                logger.info("Dossier fusionné réécrit : %s", merged_path)

            msg = f"Traitement terminé ({len(group_dossiers)} dossier(s)"
            if merged:
                msg += ", fusionné"
            if target and getattr(target, "controles_cpam", None):
                msg += f", {len(target.controles_cpam)} contrôle(s) CPAM"
            if missing:
                msg += f", {len(missing)} PDF(s) manquant(s)"
            msg += ")"
            return jsonify({"ok": True, "message": msg})
        except Exception as e:
            logger.exception("Erreur lors du retraitement")
            return jsonify({"error": str(e)}), 500

    # ------------------------------------------------------------------
    # Anonymised source-text API
    # ------------------------------------------------------------------

    @app.route("/api/source-text/<path:dossier_id>")
    def source_text(dossier_id: str):
        """Return the anonymised text content of every file in a dossier."""
        safe_dir = (ANONYMIZED_DIR / dossier_id).resolve()
        if not safe_dir.is_relative_to(ANONYMIZED_DIR.resolve()):
            abort(403)
        if not safe_dir.is_dir():
            abort(404)

        result = {}
        for txt_path in sorted(safe_dir.glob("*_anonymized.txt")):
            try:
                result[txt_path.name] = txt_path.read_text(encoding="utf-8")
            except Exception:
                logger.warning("Impossible de lire %s", txt_path)
        return jsonify(result)

    # ------------------------------------------------------------------
    # Redacted-PDF API
    # ------------------------------------------------------------------

    @app.route("/api/pdf/<path:dossier_id>/<filename>")
    def serve_redacted_pdf(dossier_id: str, filename: str):
        """Serve a PDF with personal data redacted (black rectangles).

        Optional query params:
        - highlight: text to highlight in yellow
        - page: page number (1-indexed) to target the highlighting
        """
        from .pdf_redactor import load_entities_from_report, redact_pdf, highlight_text

        # Path-traversal guard
        safe_dir = (INPUT_DIR / dossier_id).resolve()
        if not safe_dir.is_relative_to(INPUT_DIR.resolve()):
            abort(403)

        pdf_path = safe_dir / filename
        if not pdf_path.exists() or pdf_path.suffix.lower() != ".pdf":
            abort(404)

        # Load the entities from the anonymisation report
        stem = Path(filename).stem.replace(" ", "_")
        report_path = REPORTS_DIR / dossier_id / f"{stem}_report.json"
        entities = load_entities_from_report(report_path) if report_path.exists() else set()

        pdf_bytes = redact_pdf(pdf_path, entities)

        # Optional highlighting
        highlight = request.args.get("highlight", "")
        page_num = request.args.get("page", type=int)
        if highlight:
            pdf_bytes = highlight_text(pdf_bytes, highlight, page_num)

        return Response(pdf_bytes, mimetype="application/pdf")

    # ------------------------------------------------------------------
    # Admin routes: referentials
    # ------------------------------------------------------------------

    ref_manager = ReferentielManager()

    @app.route("/admin/referentiels")
    def admin_referentiels():
        refs = ref_manager.list_all()
        builtin = get_builtin_referentiels()
        return render_template("admin_referentiels.html", referentiels=refs, builtin_refs=builtin, max_size=UPLOAD_MAX_SIZE_MB)

    @app.route("/admin/referentiels/upload", methods=["POST"])
    def upload_referentiel():
        if "file" not in request.files:
            return jsonify({"error": "Aucun fichier envoyé"}), 400
        f = request.files["file"]
        if not f.filename:
            return jsonify({"error": "Nom de fichier vide"}), 400

        filename = secure_filename(f.filename)
        try:
            file_data = f.read()
            ref = ref_manager.add_file(filename, file_data)
            return jsonify({"ok": True, "referentiel": ref})
        except ValueError as e:
            return jsonify({"error": str(e)}), 400

    @app.route("/admin/referentiels/<ref_id>/index", methods=["POST"])
    def index_referentiel(ref_id: str):
        try:
            count = ref_manager.index_referentiel(ref_id)
            return jsonify({"ok": True, "chunks": count})
        except ValueError as e:
            return jsonify({"error": str(e)}), 404
        except Exception as e:
            logger.exception("Erreur lors de l'indexation du référentiel %s", ref_id)
            return jsonify({"error": str(e)}), 500

    @app.route("/admin/referentiels/<ref_id>", methods=["DELETE"])
    def delete_referentiel(ref_id: str):
        if ref_manager.remove(ref_id):
            return jsonify({"ok": True})
        return jsonify({"error": "Référentiel introuvable"}), 404

    @app.route("/admin/referentiels/rebuild-index", methods=["POST"])
    def rebuild_index():
        try:
            from ..medical.rag_index import build_index
            build_index(force=True)
            # Re-index every active user referential
            reindexed = 0
            for ref in ref_manager.list_all():
                if ref["status"] == "indexed":
                    ref_manager.index_referentiel(ref["id"])
                    reindexed += 1
            return jsonify({"ok": True, "reindexed": reindexed})
        except Exception as e:
            logger.exception("Erreur lors du rebuild de l'index")
            return jsonify({"error": str(e)}), 500

    # ------------------------------------------------------------------
    # DIM validation routes
    # ------------------------------------------------------------------

    val_manager = ValidationManager()

    @app.route("/validation")
    def validation_list():
        groups = scan_dossiers()
        selection = val_manager.load_selection()
        annotations = {a["dossier_id"]: a for a in val_manager.list_annotations()}

        # Build the enriched list
        items = []
        for dossier_id in selection:
            annot = annotations.get(dossier_id, {})
            # Find the pipeline data
            parts = dossier_id.split("/")
            group_name = parts[0] if parts else ""
            group_items = groups.get(group_name, [])
            pipeline = None
            for gi in group_items:
                if "fusionne" in gi["name"]:
                    pipeline = gi
                    break
            if not pipeline and group_items:
                pipeline = group_items[0]

            d = pipeline["dossier"] if pipeline else None
            items.append({
                "dossier_id": dossier_id,
                "group_name": group_name,
                "dp_code": d.diagnostic_principal.cim10_suggestion if d and d.diagnostic_principal else "",
                "dp_texte": d.diagnostic_principal.texte if d and d.diagnostic_principal else "",
                "dp_confidence": d.diagnostic_principal.cim10_confidence if d and d.diagnostic_principal else "",
                "nb_das": len(d.diagnostics_associes) if d else 0,
                "has_cpam": bool(d and d.controles_cpam),
                "statut": annot.get("statut", "non_commence"),
                "validateur": annot.get("validateur", ""),
                "date_validation": annot.get("date_validation", ""),
            })

        total = len(items)
        valides = sum(1 for i in items if i["statut"] == "valide")
        en_cours = sum(1 for i in items if i["statut"] == "en_cours")

        return render_template(
            "validation_list.html",
            items=items,
            total=total,
            valides=valides,
            en_cours=en_cours,
            groups=groups,
        )

    @app.route("/validation/<path:dossier_id>")
    def validation_detail(dossier_id: str):
        groups = scan_dossiers()
        # Load the annotation
        annotation = val_manager.load_annotation(dossier_id)
        if not annotation:
            abort(404)

        # Load the pipeline data
        parts = dossier_id.split("/")
        group_name = parts[0] if parts else ""
        group_items = groups.get(group_name, [])
        pipeline = None
        for gi in group_items:
            if "fusionne" in gi["name"]:
                pipeline = gi
                break
        if not pipeline and group_items:
            pipeline = group_items[0]

        dossier = pipeline["dossier"] if pipeline else None

        # Navigation: previous / next dossier
        selection = val_manager.load_selection()
        current_idx = selection.index(dossier_id) if dossier_id in selection else -1
        prev_id = selection[current_idx - 1] if current_idx > 0 else None
        next_id = selection[current_idx + 1] if 0 <= current_idx < len(selection) - 1 else None

        return render_template(
            "validation_detail.html",
            annotation=annotation,
            dossier=dossier,
            dossier_id=dossier_id,
            group_name=group_name,
            prev_id=prev_id,
            next_id=next_id,
            groups=groups,
        )

    @app.route("/api/validation/save", methods=["POST"])
    def api_validation_save():
        data = request.get_json(silent=True)
        if not data or "dossier_id" not in data:
            return jsonify({"error": "dossier_id requis"}), 400
        dossier_id = data["dossier_id"]
        # Check that the dossier is part of the validation selection
        selection = val_manager.load_selection()
        if selection and dossier_id not in selection:
            return jsonify({"error": "Dossier non sélectionné pour validation"}), 403
        try:
            val_manager.save_annotation(dossier_id, data)
            return jsonify({"ok": True})
        except Exception as e:
            logger.exception("Erreur sauvegarde annotation %s", dossier_id)
            return jsonify({"error": str(e)}), 500

    @app.route("/api/cim10/search")
    def api_cim10_search():
        from ..medical.cim10_dict import load_dict, normalize_text
        q = request.args.get("q", "").strip()
        if len(q) < 2:
            return jsonify({"results": []})

        cim10 = load_dict()
        q_norm = normalize_text(q)
        q_upper = q.upper().strip()

        results = []
        # Exact code-prefix search first
        for code, label in cim10.items():
            if code.upper().startswith(q_upper):
                results.append({"code": code, "label": label})
                if len(results) >= 20:
                    break

        # Then normalised full-text search
        if len(results) < 20:
            for code, label in cim10.items():
                if any(r["code"] == code for r in results):
                    continue
                if q_norm in normalize_text(label):
                    results.append({"code": code, "label": label})
                    if len(results) >= 20:
                        break

        return jsonify({"results": results})

    @app.route("/validation/metrics")
    def validation_metrics():
        groups = scan_dossiers()
        metrics = val_manager.compute_metrics(groups)
        selection = val_manager.load_selection()
        return render_template(
            "validation_metrics.html",
            metrics=metrics,
            total_selection=len(selection),
            groups=groups,
        )

    return app
154
t2a_install_rag_cleanup/src/viewer/pdf_redactor.py
Normal file
@@ -0,0 +1,154 @@
"""On-the-fly PDF redaction: replaces NER entities with black rectangles."""

from __future__ import annotations

import json
import logging
import time
import unicodedata
from pathlib import Path

import fitz  # PyMuPDF

logger = logging.getLogger(__name__)

# Simple cache: (pdf_path, frozenset(entities)) -> (bytes, timestamp)
_pdf_cache: dict[tuple[str, frozenset[str]], tuple[bytes, float]] = {}
_CACHE_TTL_S = 300  # 5 minutes


def load_entities_from_report(report_path: Path) -> set[str]:
    """Extract the unique entities to redact from the anonymisation report."""
    data = json.loads(report_path.read_text(encoding="utf-8"))
    entities: set[str] = set()
    for e in data.get("entities_found", []):
        orig = e.get("original", "")
        # skip pseudonyms and overly short strings
        if not orig.startswith("[") and len(orig) >= 2:
            entities.add(orig)
    return entities


def redact_pdf(pdf_path: Path, entities: set[str]) -> bytes:
    """Open a PDF, redact every occurrence of the entities, return the bytes."""
    cache_key = (str(pdf_path), frozenset(entities))

    # cache lookup
    if cache_key in _pdf_cache:
        cached_bytes, cached_time = _pdf_cache[cache_key]
        if time.time() - cached_time < _CACHE_TTL_S:
            return cached_bytes

    doc = fitz.open(str(pdf_path))
    try:
        for page in doc:
            for entity in entities:
                rects = page.search_for(entity)
                for rect in rects:
                    page.add_redact_annot(rect, fill=(0, 0, 0))
            page.apply_redactions()
        pdf_bytes = doc.tobytes()
    finally:
        doc.close()

    # store in the cache
    _pdf_cache[cache_key] = (pdf_bytes, time.time())

    # evict expired entries
    now = time.time()
    expired = [k for k, (_, t) in _pdf_cache.items() if now - t >= _CACHE_TTL_S]
    for k in expired:
        _pdf_cache.pop(k, None)

    return pdf_bytes
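
# Illustrative usage of redact_pdf (the paths below are hypothetical):
#     entities = load_entities_from_report(Path("output/reports/demo/cro_report.json"))
#     pdf_bytes = redact_pdf(Path("input/demo/cro.pdf"), entities)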


def _strip_accents(s: str) -> str:
    """Strip the accents from a string (é→e, è→e, etc.)."""
    nfkd = unicodedata.normalize("NFD", s)
    return "".join(c for c in nfkd if unicodedata.category(c) != "Mn")
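
# Illustrative example of _strip_accents:
#     _strip_accents("insuffisance rénale aiguë")  ->  "insuffisance renale aigue"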


def _add_highlight(page, rects) -> None:
    """Add yellow highlight annotations on a list of rectangles."""
    for rect in rects:
        annot = page.add_highlight_annot(rect)
        annot.set_colors(stroke=(1, 0.95, 0))  # yellow
        annot.update()


def highlight_text(pdf_bytes: bytes, text: str, page_num: int | None = None) -> bytes:
    """Add a yellow highlight on the occurrences of a text in the PDF.

    Applied after redaction (on the already redacted bytes).
    If page_num is given (1-indexed), only that page is searched.

    The text received is typically the diagnosis/medical item name (short,
    single line), not the raw excerpt, which is multi-line and noisy.
    """
    if not text or len(text) < 3:
        return pdf_bytes

    # Clean the text: strip the "..." added by extract_excerpt()
    clean = text.strip()
    if clean.startswith("..."):
        clean = clean[3:]
    if clean.endswith("..."):
        clean = clean[:-3]
    clean = clean.strip()
    if len(clean) < 3:
        return pdf_bytes

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        pages = [doc[page_num - 1]] if page_num and 0 < page_num <= len(doc) else list(doc)

        single_line = " ".join(clean.split())
        found = False

        # Pass 1: exact text
        for page in pages:
            rects = page.search_for(single_line)
            if rects:
                _add_highlight(page, rects)
                found = True
                break

        # Pass 2: accent fallback - the diagnosis text may lack accents
        # ("renale") while the PDF has them ("rénale")
        if not found:
            for page in pages:
                page_text = page.get_text()
                # Search the accent-stripped version of the PDF text
                page_text_stripped = _strip_accents(page_text)
                search_stripped = _strip_accents(single_line)
                idx = page_text_stripped.lower().find(search_stripped.lower())
                if idx >= 0:
                    # Extract the original (accented) text at that position
                    original_match = page_text[idx:idx + len(search_stripped)]
                    # Search for that exact text in the PDF
                    rects = page.search_for(original_match)
                    if rects:
                        _add_highlight(page, rects)
                        found = True
                        break

        # Pass 3: multi-line input - search line by line
        if not found and "\n" in clean:
            for line in clean.split("\n"):
                line = line.strip()
                if len(line) >= 10:
                    for page in pages:
                        rects = page.search_for(line)
                        if rects:
                            _add_highlight(page, rects)
                            found = True
                            break
                if found:
                    break

        return doc.tobytes()
    finally:
        doc.close()
160
t2a_install_rag_cleanup/src/viewer/referentiels.py
Normal file
@@ -0,0 +1,160 @@
"""User referential manager for the RAG."""

from __future__ import annotations

import json
import logging
import uuid
from datetime import datetime
from pathlib import Path

from ..config import REFERENTIELS_DIR, ALLOWED_EXTENSIONS, UPLOAD_MAX_SIZE_MB

logger = logging.getLogger(__name__)


class ReferentielManager:
    """CRUD for user-supplied referential files.

    Stores the files in REFERENTIELS_DIR, with an index.json
    holding the metadata.
    """

    def __init__(self, referentiels_dir: Path | None = None):
        self._dir = referentiels_dir or REFERENTIELS_DIR
        self._dir.mkdir(parents=True, exist_ok=True)
        self._index_path = self._dir / "index.json"
        self._index: list[dict] = self._load_index()

    def _load_index(self) -> list[dict]:
        if self._index_path.exists():
            try:
                return json.loads(self._index_path.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, KeyError):
                logger.warning("Index référentiels corrompu, réinitialisé")
        return []

    def _save_index(self) -> None:
        self._index_path.write_text(
            json.dumps(self._index, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def list_all(self) -> list[dict]:
        """Return the list of all referentials."""
        return list(self._index)

    def get(self, ref_id: str) -> dict | None:
        """Return a referential by its ID."""
        for ref in self._index:
            if ref["id"] == ref_id:
                return ref
        return None

    def add_file(self, filename: str, file_data: bytes) -> dict:
        """Add a referential file.

        Args:
            filename: Original file name.
            file_data: Binary content of the file.

        Returns:
            Metadata of the created referential.

        Raises:
            ValueError: Disallowed extension or size limit exceeded.
        """
        ext = Path(filename).suffix.lower()
        if ext not in ALLOWED_EXTENSIONS:
            raise ValueError(f"Extension '{ext}' non autorisée. Extensions valides : {ALLOWED_EXTENSIONS}")

        size_mb = len(file_data) / (1024 * 1024)
        if size_mb > UPLOAD_MAX_SIZE_MB:
            raise ValueError(f"Fichier trop volumineux ({size_mb:.1f} Mo > {UPLOAD_MAX_SIZE_MB} Mo)")

        ref_id = uuid.uuid4().hex[:12]
        safe_name = f"{ref_id}_{Path(filename).stem}{ext}"
        file_path = self._dir / safe_name

        file_path.write_bytes(file_data)

        ref = {
            "id": ref_id,
            "filename": filename,
            "stored_name": safe_name,
            "extension": ext,
            "size_bytes": len(file_data),
            "date_added": datetime.now().isoformat(),
            "status": "uploaded",
            "chunks_count": 0,
        }
        self._index.append(ref)
        self._save_index()

        logger.info("Référentiel ajouté : %s (%s)", filename, ref_id)
        return ref

    def remove(self, ref_id: str) -> bool:
        """Remove a referential (file + metadata).

        Returns:
            True if found and removed, False otherwise.
        """
        ref = self.get(ref_id)
        if not ref:
            return False

        file_path = self._dir / ref["stored_name"]
        if file_path.exists():
            file_path.unlink()

        self._index = [r for r in self._index if r["id"] != ref_id]
        self._save_index()

        logger.info("Référentiel supprimé : %s (%s)", ref["filename"], ref_id)
        return True

    def index_referentiel(self, ref_id: str) -> int:
        """Index a referential into FAISS.

        Args:
            ref_id: ID of the referential to index.

        Returns:
            Number of chunks indexed.

        Raises:
            ValueError: Referential not found.
        """
        ref = self.get(ref_id)
        if not ref:
            raise ValueError(f"Référentiel {ref_id} introuvable")

        file_path = self._dir / ref["stored_name"]
        if not file_path.exists():
            raise ValueError(f"Fichier {ref['stored_name']} introuvable")

        from ..medical.rag_index import chunk_user_file, add_chunks_to_index

        # Simple heuristic: if the file looks like a procedure/methodology
        # document, isolate it so it does not influence the coding itself.
        fname = (ref.get("filename") or "").lower()
        is_proc = any(k in fname for k in ("guide", "methodo", "méthodo", "procedure", "procédure", "pmsi", "atlh", "atih", "cpam"))
        prefix = "proc" if is_proc else "ref"
        doc_name = f"{prefix}:{ref['filename']}"
        chunks = chunk_user_file(file_path, doc_name)

        if not chunks:
            ref["status"] = "empty"
            ref["chunks_count"] = 0
            self._save_index()
            return 0

        count = add_chunks_to_index(chunks)
        ref["status"] = "indexed"
        ref["chunks_count"] = count
        self._save_index()

        logger.info("Référentiel indexé : %s → %d chunks", ref["filename"], count)
        return count
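
# Illustrative usage of ReferentielManager (the file name is hypothetical):
#     mgr = ReferentielManager()
#     ref = mgr.add_file("guide_pmsi.pdf", pdf_bytes)
#     chunks = mgr.index_referentiel(ref["id"])  # indexed as "proc:guide_pmsi.pdf"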
@@ -0,0 +1,266 @@
{% extends "base.html" %}

{% block title %}Référentiels RAG{% endblock %}

{% block sidebar %}
<div class="group-title">Admin</div>
<a href="/admin/referentiels" style="color:#60a5fa;font-weight:600;border-left-color:#3b82f6;">Référentiels RAG</a>
<a href="/">Retour aux dossiers</a>
{% endblock %}

{% block content %}
<h2>Référentiels RAG</h2>
<p style="font-size:0.85rem;color:#64748b;margin-bottom:1.5rem;">
  Ajoutez des documents de référence (PDF, CSV, Excel, TXT) pour enrichir la base de connaissances du RAG.
</p>

<!-- Upload zone -->
<div class="card" style="margin-bottom:1.5rem;">
  <h3>Ajouter un référentiel</h3>
  <form id="upload-form" style="display:flex;gap:0.75rem;align-items:end;flex-wrap:wrap;margin-top:0.75rem;">
    <div>
      <label style="display:block;font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.05em;font-weight:600;margin-bottom:0.25rem;">Fichier</label>
      <input type="file" id="file-input" name="file" accept=".pdf,.csv,.xlsx,.xls,.txt"
             style="font-size:0.85rem;padding:0.35rem;">
    </div>
    <button type="submit" id="upload-btn"
            style="padding:0.5rem 1.25rem;border-radius:6px;border:none;background:#3b82f6;color:#fff;font-size:0.85rem;font-weight:600;cursor:pointer;">
      Uploader
    </button>
    <span id="upload-status" style="font-size:0.8rem;"></span>
  </form>
  <p style="font-size:0.7rem;color:#94a3b8;margin-top:0.5rem;">
    Extensions : .pdf, .csv, .xlsx, .xls, .txt — Max {{ max_size }} Mo
  </p>
</div>
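{# max_size is injected by the view; presumably the same UPLOAD_MAX_SIZE_MB limit enforced server-side on upload. #}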

<!-- Built-in référentiels -->
<div class="card" style="margin-bottom:1.5rem;">
  <h3>Référentiels intégrés</h3>
  <p style="font-size:0.8rem;color:#64748b;margin-bottom:0.75rem;">
    Sources intégrées automatiquement dans l'index FAISS au build.
  </p>
  <table>
    <thead>
      <tr>
        <th>Nom</th>
        <th>Fichier</th>
        <th>Type</th>
        <th>Taille</th>
        <th>Chunks</th>
        <th>Statut</th>
      </tr>
    </thead>
    <tbody>
      {% for ref in builtin_refs %}
      <tr>
        <td style="font-weight:600;">{{ ref.name }}</td>
        <td style="font-size:0.8rem;color:#64748b;"><code>{{ ref.filename }}</code></td>
        <td><span class="badge" style="background:#f1f5f9;color:#334155;">{{ ref.extension }}</span></td>
        <td>{{ "%.1f"|format(ref.size_mb) }} Mo</td>
        <td>
          {% if ref.chunks %}
          <strong>{{ ref.chunks }}</strong>
          {% else %}
          <span style="color:#94a3b8;">—</span>
          {% endif %}
        </td>
        <td>
          {% if not ref.exists %}
          <span class="badge" style="background:#fee2e2;color:#dc2626;">Fichier absent</span>
          {% elif ref.chunks %}
          <span class="badge" style="background:#dcfce7;color:#16a34a;">Indexé</span>
          {% else %}
          <span class="badge" style="background:#f1f5f9;color:#64748b;">Dictionnaire</span>
          {% endif %}
        </td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>

<!-- User référentiels table -->
<div class="card">
  <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:0.75rem;">
    <h3>Référentiels utilisateur</h3>
    <button id="rebuild-btn"
            style="padding:0.35rem 0.75rem;border-radius:6px;border:1px solid #e2e8f0;background:#fff;font-size:0.75rem;cursor:pointer;">
      Rebuild complet
    </button>
  </div>

  <table>
    <thead>
      <tr>
        <th>Nom</th>
        <th>Type</th>
        <th>Taille</th>
        <th>Date</th>
        <th>Chunks</th>
        <th>Statut</th>
        <th>Actions</th>
      </tr>
    </thead>
    <tbody id="ref-table">
      {% for ref in referentiels %}
      <tr id="row-{{ ref.id }}">
        <td>{{ ref.filename }}</td>
        <td><span class="badge" style="background:#f1f5f9;color:#334155;">{{ ref.extension }}</span></td>
        <td>{{ "%.1f"|format(ref.size_bytes / 1024 / 1024) }} Mo</td>
        <td style="font-size:0.8rem;">{{ ref.date_added[:10] }}</td>
        <td>{{ ref.chunks_count }}</td>
        <td>
          {% if ref.status == 'indexed' %}
          <span class="badge" style="background:#dcfce7;color:#16a34a;">Indexé</span>
          {% elif ref.status == 'empty' %}
          <span class="badge" style="background:#fef9c3;color:#ca8a04;">Vide</span>
          {% else %}
          <span class="badge" style="background:#f1f5f9;color:#64748b;">Uploadé</span>
          {% endif %}
        </td>
        <td>
          <button onclick="indexRef(event, '{{ ref.id }}')" class="action-btn"
                  style="padding:2px 8px;border-radius:4px;border:1px solid #3b82f6;background:#eff6ff;color:#2563eb;font-size:0.75rem;cursor:pointer;margin-right:4px;">
            Indexer
          </button>
          <button onclick="deleteRef('{{ ref.id }}')" class="action-btn"
                  style="padding:2px 8px;border-radius:4px;border:1px solid #fca5a5;background:#fef2f2;color:#dc2626;font-size:0.75rem;cursor:pointer;">
            Supprimer
          </button>
        </td>
      </tr>
      {% endfor %}
      {% if not referentiels %}
      <tr id="empty-row">
        <td colspan="7" style="text-align:center;color:#94a3b8;padding:2rem;">Aucun référentiel</td>
      </tr>
      {% endif %}
    </tbody>
  </table>
</div>

<div id="global-status" style="margin-top:1rem;font-size:0.8rem;"></div>
{% endblock %}

{% block scripts %}
<script>
(function() {
  const uploadForm = document.getElementById('upload-form');
  const fileInput = document.getElementById('file-input');
  const uploadBtn = document.getElementById('upload-btn');
  const uploadStatus = document.getElementById('upload-status');
  const globalStatus = document.getElementById('global-status');
  const rebuildBtn = document.getElementById('rebuild-btn');

  uploadForm.addEventListener('submit', function(e) {
    e.preventDefault();
    const file = fileInput.files[0];
    if (!file) { uploadStatus.textContent = 'Sélectionnez un fichier'; return; }

    const fd = new FormData();
    fd.append('file', file);

    uploadBtn.disabled = true;
    uploadBtn.innerHTML = '<span class="spinner"></span>';
    uploadStatus.textContent = '';

    fetch('/admin/referentiels/upload', { method: 'POST', body: fd })
      .then(r => r.json())
      .then(d => {
        uploadBtn.disabled = false;
        uploadBtn.textContent = 'Uploader';
        if (d.ok) {
          uploadStatus.style.color = '#16a34a';
          uploadStatus.textContent = 'Uploadé';
          setTimeout(() => location.reload(), 800);
        } else {
          uploadStatus.style.color = '#dc2626';
          uploadStatus.textContent = d.error || 'Erreur';
        }
      })
      .catch(() => {
        uploadBtn.disabled = false;
        uploadBtn.textContent = 'Uploader';
        uploadStatus.style.color = '#dc2626';
        uploadStatus.textContent = 'Erreur réseau';
      });
  });

  // The click event is now passed in explicitly by the inline handler
  // instead of being read from the deprecated implicit global `event`.
  window.indexRef = function(evt, id) {
    const btn = evt.target;
    btn.disabled = true;
    btn.innerHTML = '<span class="spinner" style="border-color:rgba(37,99,235,0.3);border-top-color:#2563eb;width:10px;height:10px;"></span>';

    fetch('/admin/referentiels/' + id + '/index', { method: 'POST' })
      .then(r => r.json())
      .then(d => {
        if (d.ok) {
          globalStatus.style.color = '#16a34a';
          globalStatus.textContent = d.chunks + ' chunks indexés';
          setTimeout(() => location.reload(), 800);
        } else {
          btn.disabled = false;
          btn.textContent = 'Indexer';
          globalStatus.style.color = '#dc2626';
          globalStatus.textContent = d.error || 'Erreur';
        }
      })
      .catch(() => {
        btn.disabled = false;
        btn.textContent = 'Indexer';
        globalStatus.style.color = '#dc2626';
        globalStatus.textContent = 'Erreur réseau';
      });
  };

  window.deleteRef = function(id) {
    if (!confirm('Supprimer ce référentiel ?')) return;

    fetch('/admin/referentiels/' + id, { method: 'DELETE' })
      .then(r => r.json())
      .then(d => {
        if (d.ok) {
          const row = document.getElementById('row-' + id);
          if (row) row.remove();
          globalStatus.style.color = '#16a34a';
          globalStatus.textContent = 'Supprimé';
        } else {
          globalStatus.style.color = '#dc2626';
          globalStatus.textContent = d.error || 'Erreur';
        }
      })
      .catch(() => {
        globalStatus.style.color = '#dc2626';
        globalStatus.textContent = 'Erreur réseau';
      });
  };

  rebuildBtn.addEventListener('click', function() {
    if (!confirm('Reconstruire l\'index FAISS complet ? Cela peut prendre plusieurs minutes.')) return;
    rebuildBtn.disabled = true;
    rebuildBtn.innerHTML = '<span class="spinner" style="border-color:rgba(0,0,0,0.2);border-top-color:#333;width:10px;height:10px;"></span> Rebuild…';

    fetch('/admin/referentiels/rebuild-index', { method: 'POST' })
      .then(r => r.json())
      .then(d => {
        rebuildBtn.disabled = false;
        rebuildBtn.textContent = 'Rebuild complet';
        if (d.ok) {
          globalStatus.style.color = '#16a34a';
          globalStatus.textContent = 'Index reconstruit (' + d.reindexed + ' référentiels réindexés)';
        } else {
          globalStatus.style.color = '#dc2626';
          globalStatus.textContent = d.error || 'Erreur';
        }
      })
      .catch(() => {
        rebuildBtn.disabled = false;
        rebuildBtn.textContent = 'Rebuild complet';
        globalStatus.style.color = '#dc2626';
        globalStatus.textContent = 'Erreur réseau';
      });
  });
})();
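// Endpoints this page relies on (as exercised above):
//   POST   /admin/referentiels/upload          multipart upload -> {ok, ...}
//   POST   /admin/referentiels/<id>/index      -> {ok, chunks}
//   DELETE /admin/referentiels/<id>            -> {ok}
//   POST   /admin/referentiels/rebuild-index   -> {ok, reindexed}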
</script>
{% endblock %}
481
t2a_install_rag_cleanup/src/viewer/templates/base.html
Normal file
@@ -0,0 +1,481 @@
<!DOCTYPE html>
<html lang="fr">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{% block title %}Viewer CIM-10{% endblock %} — T2A</title>
<style>
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
body {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
    background: #f1f5f9;
    color: #1e293b;
    display: flex;
    min-height: 100vh;
}

/* Sidebar */
.sidebar {
    width: 280px;
    min-width: 280px;
    background: #1e293b;
    color: #e2e8f0;
    display: flex;
    flex-direction: column;
    position: fixed;
    top: 0;
    left: 0;
    bottom: 0;
    overflow-y: auto;
}
.sidebar-header {
    padding: 1.25rem 1rem;
    border-bottom: 1px solid #334155;
}
.sidebar-header h1 {
    font-size: 1.1rem;
    color: #f1f5f9;
    font-weight: 700;
}
.sidebar-header p {
    font-size: 0.75rem;
    color: #94a3b8;
    margin-top: 0.25rem;
}
.sidebar-nav {
    flex: 1;
    padding: 0.75rem 0;
    overflow-y: auto;
}
.sidebar-nav .group-title {
    padding: 0.5rem 1rem 0.25rem;
    font-size: 0.65rem;
    text-transform: uppercase;
    letter-spacing: 0.08em;
    color: #94a3b8;
    font-weight: 700;
}
.sidebar-nav a {
    display: block;
    padding: 0.4rem 1rem;
    color: #cbd5e1;
    text-decoration: none;
    font-size: 0.8rem;
    font-weight: 600;
    border-left: 3px solid transparent;
    transition: all 0.15s;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis;
}
.sidebar-nav a:hover {
    color: #f8fafc;
    background: #334155;
    border-left-color: #3b82f6;
}
.sidebar-nav a.sidebar-fusionne {
    color: #60a5fa;
    font-weight: 700;
}

/* Search */
.sidebar-search {
    padding: 0.75rem 1rem 0.5rem;
    border-bottom: 1px solid #334155;
}
.sidebar-search input {
    width: 100%;
    padding: 0.45rem 0.6rem;
    border-radius: 6px;
    border: 1px solid #475569;
    background: #0f172a;
    color: #e2e8f0;
    font-size: 0.8rem;
    outline: none;
    transition: border-color 0.15s;
}
.sidebar-search input::placeholder { color: #64748b; }
.sidebar-search input:focus { border-color: #3b82f6; }

/* Admin section */
.sidebar-admin {
    padding: 1rem;
    border-top: 1px solid #334155;
    font-size: 0.8rem;
}
.sidebar-admin label {
    display: block;
    margin-bottom: 0.35rem;
    font-weight: 600;
    color: #cbd5e1;
    font-size: 0.7rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}
.sidebar-admin select {
    width: 100%;
    padding: 0.4rem;
    border-radius: 6px;
    border: 1px solid #475569;
    background: #0f172a;
    color: #e2e8f0;
    font-size: 0.8rem;
    margin-bottom: 0.5rem;
}
.sidebar-admin button {
    width: 100%;
    padding: 0.45rem;
    border-radius: 6px;
    border: none;
    background: #3b82f6;
    color: #fff;
    font-size: 0.8rem;
    font-weight: 600;
    cursor: pointer;
    transition: background 0.15s;
}
.sidebar-admin button:hover { background: #2563eb; }
.sidebar-admin .status-msg {
    margin-top: 0.35rem;
    font-size: 0.7rem;
    min-height: 1rem;
}

/* Main content */
.main {
    margin-left: 280px;
    flex: 1;
    padding: 2rem;
}

/* Utilities */
.card {
    background: #fff;
    border-radius: 10px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.08);
    padding: 1.25rem;
    margin-bottom: 1rem;
}
.badge {
    display: inline-block;
    padding: 2px 8px;
    border-radius: 9999px;
    font-size: 0.7rem;
    font-weight: 600;
}
table { width: 100%; border-collapse: collapse; font-size: 0.85rem; }
th, td { text-align: left; padding: 0.5rem 0.75rem; border-bottom: 1px solid #e2e8f0; }
th { font-weight: 600; color: #475569; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; }
tr.anomalie { background: #fef2f2; }
details { margin-top: 0.35rem; }
details summary {
    cursor: pointer;
    font-size: 0.75rem;
    color: #3b82f6;
}
details pre {
    font-size: 0.75rem;
    background: #f8fafc;
    padding: 0.5rem;
    border-radius: 6px;
    margin-top: 0.25rem;
    white-space: pre-wrap;
    word-break: break-word;
}
h2 { font-size: 1.1rem; margin-bottom: 0.75rem; color: #0f172a; }
h3 { font-size: 0.95rem; margin-bottom: 0.5rem; color: #334155; }
.info-grid {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
    gap: 0.75rem;
}
.info-item label { display: block; font-size: 0.7rem; color: #64748b; text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600; }
.info-item span { font-size: 0.9rem; }
.section { margin-bottom: 1.5rem; }
ul.bullet { list-style: disc; padding-left: 1.5rem; font-size: 0.85rem; }
ul.bullet li { margin-bottom: 0.25rem; }
a.back { font-size: 0.85rem; color: #3b82f6; text-decoration: none; }
a.back:hover { text-decoration: underline; }

/* Counter badges */
.badge-count {
    display: inline-flex;
    align-items: center;
    gap: 0.2rem;
    padding: 2px 8px;
    border-radius: 9999px;
    font-size: 0.7rem;
    font-weight: 600;
}
.badge-das { background: #dbeafe; color: #1d4ed8; }
.badge-actes { background: #e0e7ff; color: #3730a3; }
.badge-alertes { background: #ffedd5; color: #c2410c; }
.badge-cma { background: #fee2e2; color: #dc2626; }
.badge-regroup { background: #f0fdf4; color: #166534; font-size: 0.65rem; }
.badge-fusion { background: #ede9fe; color: #5b21b6; }

/* Non-cumul alerts (red) vs standard (orange) */
.alerte-noncumul { color: #dc2626; font-weight: 600; }
.alerte-standard { color: #9a3412; }

/* Source files */
.source-files { font-size: 0.8rem; color: #64748b; margin-top: 0.5rem; }
.source-files code { background: #f1f5f9; padding: 1px 4px; border-radius: 3px; }

/* Spinner animation */
@keyframes spin { to { transform: rotate(360deg); } }
.spinner {
    display: inline-block;
    width: 14px;
    height: 14px;
    border: 2px solid rgba(255,255,255,0.3);
    border-top-color: #fff;
    border-radius: 50%;
    animation: spin 0.8s linear infinite;
}
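/* Used by the page scripts, which swap a button's label for
   '<span class="spinner"></span>' while a request is in flight. */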

/* Source tracking badges */
.src-btn {
    display: inline-block;
    padding: 1px 6px;
    border-radius: 9999px;
    font-size: 0.65rem;
    font-weight: 600;
    background: #e0f2fe;
    color: #0369a1;
    border: 1px solid #bae6fd;
    cursor: pointer;
    margin-left: 0.3rem;
    vertical-align: middle;
    transition: background 0.15s;
}
.src-btn:hover { background: #bae6fd; }

/* Source modal */
#source-modal {
    display: none;
    position: fixed;
    inset: 0;
    z-index: 9999;
    background: rgba(0,0,0,0.5);
    padding: 2rem;
}
#source-modal-inner {
    background: #fff;
    border-radius: 12px;
    max-width: 95vw;
    width: 95vw;
    margin: 0 auto;
    max-height: 95vh;
    height: 95vh;
    display: flex;
    flex-direction: column;
    box-shadow: 0 8px 30px rgba(0,0,0,0.2);
}
#source-modal-inner.source-modal-text {
    max-width: 900px;
    width: auto;
    max-height: 90vh;
    height: auto;
}
#source-header {
    padding: 1rem 1.25rem;
    border-bottom: 1px solid #e2e8f0;
    font-weight: 700;
    font-size: 0.9rem;
    color: #0f172a;
    display: flex;
    justify-content: space-between;
    align-items: center;
}
#source-content {
    flex: 1;
    overflow-y: auto;
    padding: 1.25rem;
    font-size: 0.85rem;
    line-height: 1.6;
    white-space: pre-wrap;
    word-break: break-word;
    color: #334155;
}
#source-content.source-content-pdf {
    padding: 0;
    white-space: normal;
    overflow: hidden;
}
#source-content mark {
    background: #fef08a;
    padding: 2px 0;
    border-radius: 2px;
}
#source-close-btn {
    padding: 0.4rem 1rem;
    background: #64748b;
    color: #fff;
    border: none;
    border-radius: 6px;
    cursor: pointer;
    font-size: 0.8rem;
    font-weight: 600;
}
#source-close-btn:hover { background: #475569; }

/* PDF file picker buttons */
.src-file-btn {
    display: inline-block;
    padding: 0.35rem 0.75rem;
    border-radius: 6px;
    border: 1px solid #cbd5e1;
    background: #f8fafc;
    color: #1e293b;
    font-size: 0.8rem;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.15s;
}
.src-file-btn:hover { background: #e2e8f0; border-color: #3b82f6; }
.src-file-btn.active { background: #3b82f6; color: #fff; border-color: #3b82f6; }
</style>
</head>
<body>

<!-- Sidebar -->
<aside class="sidebar">
  <div class="sidebar-header">
    <h1>T2A Viewer</h1>
    <p>Visualisation CIM-10</p>
  </div>
  <div class="sidebar-search">
    <input type="text" id="sidebar-search" placeholder="Rechercher un dossier…" autocomplete="off">
  </div>
  <nav class="sidebar-nav" id="sidebar-nav">
    {% block sidebar %}{% endblock %}
  </nav>
  <div class="sidebar-admin" style="border-top:1px solid #334155;padding:0.5rem 1rem;">
    <a href="/dashboard" style="display:block;color:#cbd5e1;text-decoration:none;font-size:0.8rem;font-weight:600;padding:0.35rem 0;transition:color 0.15s;"
       onmouseover="this.style.color='#f8fafc'" onmouseout="this.style.color='#cbd5e1'">
      Dashboard
    </a>
    <a href="/cpam" style="display:block;color:#cbd5e1;text-decoration:none;font-size:0.8rem;font-weight:600;padding:0.35rem 0;transition:color 0.15s;"
       onmouseover="this.style.color='#f8fafc'" onmouseout="this.style.color='#cbd5e1'">
      Contrôles CPAM
    </a>
    <a href="/admin/referentiels" style="display:block;color:#cbd5e1;text-decoration:none;font-size:0.8rem;font-weight:600;padding:0.35rem 0;transition:color 0.15s;"
       onmouseover="this.style.color='#f8fafc'" onmouseout="this.style.color='#cbd5e1'">
      Référentiels RAG
    </a>
    <a href="/validation" style="display:block;color:#fbbf24;text-decoration:none;font-size:0.8rem;font-weight:600;padding:0.35rem 0;transition:color 0.15s;"
       onmouseover="this.style.color='#fde68a'" onmouseout="this.style.color='#fbbf24'">
      Validation DIM
    </a>
  </div>
  <div class="sidebar-admin">
    <label for="model-select">Modèle Ollama</label>
    <select id="model-select"><option>Chargement…</option></select>
    <button id="model-apply">Appliquer</button>
    <div class="status-msg" id="model-status"></div>
  </div>
</aside>

<!-- Main -->
<div class="main">
  {% block content %}{% endblock %}
</div>

<script>
(function() {
  const sel = document.getElementById('model-select');
  const btn = document.getElementById('model-apply');
  const status = document.getElementById('model-status');

  function loadModels() {
    fetch('/admin/models')
      .then(r => r.json())
      .then(d => {
        sel.innerHTML = '';
        if (d.models && d.models.length) {
          d.models.forEach(m => {
            const opt = document.createElement('option');
            opt.value = m;
            opt.textContent = m;
            if (m === d.current) opt.selected = true;
            sel.appendChild(opt);
          });
        } else {
          sel.innerHTML = '<option>Aucun modèle</option>';
        }
      })
      .catch(() => { sel.innerHTML = '<option>Erreur</option>'; });
  }

  btn.addEventListener('click', function() {
    const model = sel.value;
    if (!model || model === 'Aucun modèle' || model === 'Erreur') return;
    status.textContent = '…';
    status.style.color = '#94a3b8';
    fetch('/admin/models', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({model: model})
    })
      .then(r => r.json())
      .then(d => {
        if (d.ok) {
          status.textContent = 'Modèle appliqué';
          status.style.color = '#16a34a';
        } else {
          status.textContent = d.error || 'Erreur';
          status.style.color = '#dc2626';
        }
      })
      .catch(() => {
        status.textContent = 'Erreur réseau';
        status.style.color = '#dc2626';
      });
  });

  loadModels();
})();
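// Contract assumed above: GET /admin/models -> {models: [...], current: "..."};
// POST /admin/models {"model": "..."} -> {ok: true} or {error: "..."}.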

// Sidebar search filter
(function() {
  const input = document.getElementById('sidebar-search');
  const nav = document.getElementById('sidebar-nav');
  if (!input || !nav) return;

  input.addEventListener('input', function() {
    const q = this.value.toLowerCase().trim();
    const groups = nav.querySelectorAll('.group-title');

    groups.forEach(function(groupEl) {
      // Collect all sibling links until next group-title
      const links = [];
      let next = groupEl.nextElementSibling;
      while (next && !next.classList.contains('group-title')) {
        if (next.tagName === 'A') links.push(next);
        next = next.nextElementSibling;
      }

      if (!q) {
        groupEl.style.display = '';
        links.forEach(function(a) { a.style.display = ''; });
        return;
      }

      const groupMatch = groupEl.textContent.toLowerCase().includes(q);
      let anyLinkMatch = false;

      links.forEach(function(a) {
        const match = groupMatch || a.textContent.toLowerCase().includes(q);
        a.style.display = match ? '' : 'none';
        if (match) anyLinkMatch = true;
      });

      groupEl.style.display = (groupMatch || anyLinkMatch) ? '' : 'none';
    });
  });
})();
</script>
{% block scripts %}{% endblock %}
</body>
</html>
88
t2a_install_rag_cleanup/src/viewer/templates/cpam.html
Normal file
@@ -0,0 +1,88 @@
{% extends "base.html" %}
{% block title %}Contrôles CPAM{% endblock %}

{% block sidebar %}
{% for group_name, items in groups.items() %}
<div class="group-title">{{ group_name | format_dossier_name }}</div>
{% for item in items %}
{% if 'fusionne' in item.name %}
<a href="/dossier/{{ item.path_rel }}" class="sidebar-fusionne">★ Fusionné</a>
{% else %}
<a href="/dossier/{{ item.path_rel }}">{{ item.name | format_doc_name }}</a>
{% endif %}
{% endfor %}
{% endfor %}
{% endblock %}

{% block content %}
<a class="back" href="/">← Retour à la liste</a>

<div style="display:flex;align-items:center;gap:0.75rem;margin-top:1rem;margin-bottom:1rem;">
  <h2 style="margin:0;">Contrôles CPAM</h2>
  <span class="badge" style="background:#fef3c7;color:#b45309;font-size:0.85rem;padding:4px 12px;">{{ total }}</span>
</div>

{% if not controls %}
<div class="card">
  <p>Aucun contrôle CPAM trouvé dans les dossiers.</p>
</div>
{% else %}
<div class="card" style="overflow-x:auto;">
  <table>
    <thead>
      <tr>
        <th>Dossier</th>
        <th>OGC</th>
        <th>Titre</th>
        <th>Décision</th>
        <th>Codes contestés</th>
        <th>Contre-argumentation</th>
      </tr>
    </thead>
    <tbody>
      {% for c in controls %}
      <tr>
        <td>
          <a href="/dossier/{{ c.filepath }}" style="color:#3b82f6;text-decoration:none;font-weight:600;">
            {{ c.group_name | format_dossier_name }}
          </a>
          {% if c.dp_code %}
          <div style="font-size:0.7rem;color:#64748b;margin-top:2px;">DP: {{ c.dp_code }}</div>
          {% endif %}
        </td>
        <td style="font-weight:600;">{{ c.ctrl.numero_ogc }}</td>
        <td style="max-width:200px;">{{ c.ctrl.titre }}</td>
        <td>
          {% if 'retient' in c.ctrl.decision_ucr|lower %}
          <span class="badge" style="background:#d1fae5;color:#065f46;">{{ c.ctrl.decision_ucr }}</span>
          {% elif 'confirme' in c.ctrl.decision_ucr|lower %}
          <span class="badge" style="background:#fee2e2;color:#dc2626;">{{ c.ctrl.decision_ucr }}</span>
          {% else %}
          <span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ c.ctrl.decision_ucr }}</span>
          {% endif %}
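          {# Badge colour keys off the UCR decision wording: "retient" renders green, "confirme" red, anything else neutral; detail.html uses the same convention. #}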
        </td>
        <td>
          <div style="display:flex;gap:0.3rem;flex-wrap:wrap;">
            {% if c.ctrl.dp_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;font-size:0.65rem;">DP: {{ c.ctrl.dp_ucr }}</span>{% endif %}
            {% if c.ctrl.da_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;font-size:0.65rem;">DA: {{ c.ctrl.da_ucr }}</span>{% endif %}
            {% if c.ctrl.dr_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;font-size:0.65rem;">DR: {{ c.ctrl.dr_ucr }}</span>{% endif %}
            {% if c.ctrl.actes_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;font-size:0.65rem;">Actes: {{ c.ctrl.actes_ucr }}</span>{% endif %}
          </div>
        </td>
        <td style="max-width:300px;">
          {% if c.ctrl.contre_argumentation %}
          <details>
            <summary>{{ c.ctrl.contre_argumentation[:80] }}{% if c.ctrl.contre_argumentation|length > 80 %}…{% endif %}</summary>
            <pre>{{ c.ctrl.contre_argumentation }}</pre>
          </details>
          {% else %}
          <span style="color:#94a3b8;font-size:0.8rem;">—</span>
          {% endif %}
        </td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{% endif %}
{% endblock %}
145
t2a_install_rag_cleanup/src/viewer/templates/dashboard.html
Normal file
@@ -0,0 +1,145 @@
{% extends "base.html" %}
{% block title %}Dashboard{% endblock %}

{% block sidebar %}
{% for group_name, items in groups.items() %}
<div class="group-title">{{ group_name | format_dossier_name }}</div>
{% for item in items %}
{% if 'fusionne' in item.name %}
<a href="/dossier/{{ item.path_rel }}" class="sidebar-fusionne">★ Fusionné</a>
{% else %}
<a href="/dossier/{{ item.path_rel }}">{{ item.name | format_doc_name }}</a>
{% endif %}
{% endfor %}
{% endfor %}
{% endblock %}

{% block content %}
<a class="back" href="/">← Retour à la liste</a>
<h2 style="margin-top:1rem;">Dashboard</h2>

{# ---- Metric cards ---- #}
<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:0.75rem;margin-bottom:1.5rem;">
  {% set cards = [
    ("Dossiers", stats.total_dossiers, "#3b82f6", "#dbeafe"),
    ("Fichiers", stats.total_fichiers, "#6366f1", "#e0e7ff"),
    ("DAS total", stats.total_das, "#1d4ed8", "#dbeafe"),
    ("Actes total", stats.total_actes, "#3730a3", "#e0e7ff"),
    ("Alertes", stats.total_alertes, "#c2410c", "#ffedd5"),
    ("CMA", stats.total_cma, "#dc2626", "#fee2e2"),
    ("Contrôles CPAM", stats.total_cpam, "#b45309", "#fef3c7"),
    ("Temps total", stats.processing_time_total | format_duration, "#065f46", "#d1fae5"),
  ] %}
  {% for label, value, fg, bg in cards %}
  <div class="card" style="text-align:center;padding:1rem;">
    <div style="font-size:0.7rem;text-transform:uppercase;letter-spacing:0.05em;color:#64748b;font-weight:600;">{{ label }}</div>
    <div style="font-size:1.5rem;font-weight:700;color:{{ fg }};margin-top:0.25rem;">{{ value }}</div>
  </div>
  {% endfor %}
</div>

{# ---- Average processing time ---- #}
{% if stats.processing_time_avg %}
<div class="card" style="margin-bottom:1rem;">
  <div style="font-size:0.8rem;color:#64748b;">Temps moyen par fichier : <strong style="color:#0f172a;">{{ stats.processing_time_avg | format_duration }}</strong></div>
</div>
{% endif %}

{# ---- DP confidence distribution ---- #}
{% set conf = stats.dp_confidence %}
{% set conf_total = (conf.get('high', 0) + conf.get('medium', 0) + conf.get('low', 0) + conf.get('none', 0)) or 1 %}
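{# The "or 1" above guards the percentage maths below against division by zero when no dossier has a DP confidence yet. #}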
<div class="card section">
  <h3>Confiance DP</h3>
  <div style="display:flex;height:28px;border-radius:6px;overflow:hidden;margin-bottom:0.5rem;">
    {% if conf.get('high', 0) %}
    <div style="width:{{ (conf.get('high', 0) / conf_total * 100)|round(1) }}%;background:#16a34a;" title="Haute : {{ conf.get('high', 0) }}"></div>
    {% endif %}
    {% if conf.get('medium', 0) %}
    <div style="width:{{ (conf.get('medium', 0) / conf_total * 100)|round(1) }}%;background:#ca8a04;" title="Moyenne : {{ conf.get('medium', 0) }}"></div>
    {% endif %}
    {% if conf.get('low', 0) %}
    <div style="width:{{ (conf.get('low', 0) / conf_total * 100)|round(1) }}%;background:#dc2626;" title="Basse : {{ conf.get('low', 0) }}"></div>
    {% endif %}
    {% if conf.get('none', 0) %}
    <div style="width:{{ (conf.get('none', 0) / conf_total * 100)|round(1) }}%;background:#94a3b8;" title="Aucune : {{ conf.get('none', 0) }}"></div>
    {% endif %}
  </div>
  <div style="display:flex;gap:1.5rem;font-size:0.8rem;">
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#16a34a;margin-right:4px;"></span>Haute : {{ conf.get('high', 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#ca8a04;margin-right:4px;"></span>Moyenne : {{ conf.get('medium', 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#dc2626;margin-right:4px;"></span>Basse : {{ conf.get('low', 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#94a3b8;margin-right:4px;"></span>Aucune : {{ conf.get('none', 0) }}</span>
  </div>
</div>

{# ---- Top 15 CIM-10 codes ---- #}
{% if stats.top_codes %}
<div class="card section">
  <h3>Top 15 codes CIM-10</h3>
  {% for code, count in stats.top_codes %}
  <div style="display:flex;align-items:center;gap:0.5rem;margin-bottom:0.4rem;">
    <code style="min-width:60px;font-size:0.8rem;font-weight:600;">{{ code }}</code>
    <div style="flex:1;height:20px;background:#f1f5f9;border-radius:4px;overflow:hidden;">
      <div style="width:{{ (count / stats.top_max * 100)|round(1) }}%;height:100%;background:#3b82f6;border-radius:4px;"></div>
    </div>
    <span style="min-width:30px;text-align:right;font-size:0.8rem;color:#64748b;">{{ count }}</span>
  </div>
  {% endfor %}
</div>
{% endif %}

{# ---- GHM type distribution ---- #}
{% set ghm = stats.ghm_types %}
{% set ghm_total = (ghm.get('C', 0) + ghm.get('M', 0) + ghm.get('K', 0)) or 1 %}
{% if ghm.get('C', 0) or ghm.get('M', 0) or ghm.get('K', 0) %}
<div class="card section">
  <h3>Types GHM</h3>
  <div style="display:flex;height:28px;border-radius:6px;overflow:hidden;margin-bottom:0.5rem;">
    {% if ghm.get('C', 0) %}
    <div style="width:{{ (ghm.get('C', 0) / ghm_total * 100)|round(1) }}%;background:#dc2626;" title="Chirurgical : {{ ghm.get('C', 0) }}"></div>
    {% endif %}
    {% if ghm.get('M', 0) %}
    <div style="width:{{ (ghm.get('M', 0) / ghm_total * 100)|round(1) }}%;background:#3b82f6;" title="Médical : {{ ghm.get('M', 0) }}"></div>
    {% endif %}
    {% if ghm.get('K', 0) %}
    <div style="width:{{ (ghm.get('K', 0) / ghm_total * 100)|round(1) }}%;background:#f59e0b;" title="Interventionnel : {{ ghm.get('K', 0) }}"></div>
    {% endif %}
  </div>
  <div style="display:flex;gap:1.5rem;font-size:0.8rem;">
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#dc2626;margin-right:4px;"></span>C — Chirurgical : {{ ghm.get('C', 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#3b82f6;margin-right:4px;"></span>M — Médical : {{ ghm.get('M', 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#f59e0b;margin-right:4px;"></span>K — Interventionnel : {{ ghm.get('K', 0) }}</span>
  </div>
</div>
{% endif %}

{# ---- Severity distribution ---- #}
{% set sev = stats.severity_dist %}
{% set sev_total = (sev.get(1, 0) + sev.get(2, 0) + sev.get(3, 0) + sev.get(4, 0)) or 1 %}
{% if sev.get(1, 0) or sev.get(2, 0) or sev.get(3, 0) or sev.get(4, 0) %}
<div class="card section">
  <h3>Sévérité GHM</h3>
  <div style="display:flex;height:28px;border-radius:6px;overflow:hidden;margin-bottom:0.5rem;">
    {% if sev.get(1, 0) %}
    <div style="width:{{ (sev.get(1, 0) / sev_total * 100)|round(1) }}%;background:#16a34a;" title="Niveau 1 : {{ sev.get(1, 0) }}"></div>
    {% endif %}
    {% if sev.get(2, 0) %}
    <div style="width:{{ (sev.get(2, 0) / sev_total * 100)|round(1) }}%;background:#ca8a04;" title="Niveau 2 : {{ sev.get(2, 0) }}"></div>
    {% endif %}
    {% if sev.get(3, 0) %}
    <div style="width:{{ (sev.get(3, 0) / sev_total * 100)|round(1) }}%;background:#f97316;" title="Niveau 3 : {{ sev.get(3, 0) }}"></div>
    {% endif %}
    {% if sev.get(4, 0) %}
    <div style="width:{{ (sev.get(4, 0) / sev_total * 100)|round(1) }}%;background:#dc2626;" title="Niveau 4 : {{ sev.get(4, 0) }}"></div>
    {% endif %}
  </div>
  <div style="display:flex;gap:1.5rem;font-size:0.8rem;">
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#16a34a;margin-right:4px;"></span>Niveau 1 : {{ sev.get(1, 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#ca8a04;margin-right:4px;"></span>Niveau 2 : {{ sev.get(2, 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#f97316;margin-right:4px;"></span>Niveau 3 : {{ sev.get(3, 0) }}</span>
    <span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:#dc2626;margin-right:4px;"></span>Niveau 4 : {{ sev.get(4, 0) }}</span>
  </div>
</div>
{% endif %}

{% endblock %}
757
t2a_install_rag_cleanup/src/viewer/templates/detail.html
Normal file
@@ -0,0 +1,757 @@
{% extends "base.html" %}
{% block title %}{{ dossier.source_file or filepath }}{% endblock %}

{% block sidebar %}
<div class="group-title">Navigation</div>
<a href="/">Retour à la liste</a>
{% if siblings %}
<div class="group-title" style="margin-top:1rem;">{{ current_group }}</div>
{% for sib in siblings %}
<a href="/dossier/{{ sib.path_rel }}" {% if sib.path_rel == filepath %}style="color:#f8fafc;border-left-color:#3b82f6;background:#334155;"{% endif %}>
  {{ sib.name }}
</a>
{% endfor %}
{% endif %}
<div class="group-title" style="margin-top:1.5rem;">Actions</div>
<button id="reprocess-btn" style="width:100%;padding:0.6rem;background:#3b82f6;color:white;border:none;border-radius:0.375rem;cursor:pointer;font-size:0.875rem;font-weight:600;margin-bottom:0.5rem;">Relancer l'étude</button>
<div id="reprocess-status" style="font-size:0.75rem;padding:0.25rem;min-height:1.5rem;"></div>
{% endblock %}

{% block content %}
<a class="back" href="/">← Retour à la liste</a>

{# ---- Header ---- #}
<div class="card" style="margin-top:1rem;">
  <h2>{{ dossier.source_file or filepath }}</h2>
  <div class="info-grid">
    {% if dossier.document_type %}
    <div class="info-item">
      <label>Type de document</label>
      <span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ dossier.document_type }}</span>
    </div>
    {% endif %}
    {% if dossier.processing_time_s is not none %}
    <div class="info-item">
      <label>Temps de traitement</label>
      <span>{{ dossier.processing_time_s|format_duration }}</span>
    </div>
    {% endif %}
  </div>
  {% if dossier.source_files %}
  <div class="source-files" style="margin-top:0.75rem;">
    <label style="font-size:0.7rem;color:#64748b;text-transform:uppercase;letter-spacing:0.05em;font-weight:600;">Documents sources</label>
    <div style="margin-top:0.25rem;">
      {% for sf in dossier.source_files %}
      <code>{{ sf }}</code>{% if not loop.last %}, {% endif %}
      {% endfor %}
    </div>
  </div>
  {% endif %}
</div>

{# ---- Stay ---- #}
{% set s = dossier.sejour %}
{% if s.sexe or s.age or s.date_entree or s.date_sortie or s.duree_sejour is not none or s.imc or s.poids or s.taille %}
<div class="card section">
  <h3>Séjour</h3>
  <div class="info-grid">
    {% if s.sexe %}<div class="info-item"><label>Sexe</label><span>{{ s.sexe }}</span></div>{% endif %}
    {% if s.age is not none %}<div class="info-item"><label>Âge</label><span>{{ s.age }} ans</span></div>{% endif %}
    {% if s.date_entree %}<div class="info-item"><label>Entrée</label><span>{{ s.date_entree }}</span></div>{% endif %}
    {% if s.date_sortie %}<div class="info-item"><label>Sortie</label><span>{{ s.date_sortie }}</span></div>{% endif %}
    {% if s.duree_sejour is not none %}<div class="info-item"><label>Durée</label><span>{{ s.duree_sejour }} jour(s)</span></div>{% endif %}
    {% if s.mode_entree %}<div class="info-item"><label>Mode entrée</label><span>{{ s.mode_entree }}</span></div>{% endif %}
    {% if s.mode_sortie %}<div class="info-item"><label>Mode sortie</label><span>{{ s.mode_sortie }}</span></div>{% endif %}
    {% if s.poids %}<div class="info-item"><label>Poids</label><span>{{ s.poids }} kg</span></div>{% endif %}
    {% if s.taille %}<div class="info-item"><label>Taille</label><span>{{ s.taille }} cm</span></div>{% endif %}
    {% if s.imc %}<div class="info-item"><label>IMC</label><span>{{ s.imc }}</span></div>{% endif %}
  </div>
</div>
{% endif %}

{# ---- GHM estimation ---- #}
{% if dossier.ghm_estimation %}
{% set ghm = dossier.ghm_estimation %}
<div class="card section" style="border-left:4px solid #8b5cf6;">
  <h3 style="color:#6d28d9;">Estimation GHM</h3>
  <div class="info-grid">
    {% if ghm.cmd %}
    <div class="info-item">
      <label>CMD</label>
      <span><strong>{{ ghm.cmd }}</strong>{% if ghm.cmd_libelle %} — {{ ghm.cmd_libelle }}{% endif %}</span>
    </div>
    {% endif %}
    <div class="info-item">
      <label>Type</label>
      {% if ghm.type_ghm == 'C' %}
      <span class="badge" style="background:#fee2e2;color:#dc2626;">C — Chirurgical</span>
      {% elif ghm.type_ghm == 'K' %}
      <span class="badge" style="background:#fef3c7;color:#92400e;">K — Interventionnel</span>
      {% elif ghm.type_ghm == 'M' %}
      <span class="badge" style="background:#dbeafe;color:#1d4ed8;">M — Médical</span>
      {% endif %}
    </div>
    <div class="info-item">
      <label>Sévérité</label>
      {% if ghm.severite <= 1 %}
      <span class="badge" style="background:#d1fae5;color:#065f46;">Niveau {{ ghm.severite }}</span>
      {% elif ghm.severite == 2 %}
      <span class="badge" style="background:#fef3c7;color:#92400e;">Niveau {{ ghm.severite }}</span>
      {% elif ghm.severite == 3 %}
      <span class="badge" style="background:#fed7aa;color:#9a3412;">Niveau {{ ghm.severite }}</span>
      {% else %}
      <span class="badge" style="background:#fee2e2;color:#dc2626;">Niveau {{ ghm.severite }}</span>
      {% endif %}
    </div>
    {% if ghm.ghm_approx %}
    <div class="info-item">
      <label>Code GHM approx.</label>
      <code style="font-size:1.1rem;font-weight:700;letter-spacing:0.05em;">{{ ghm.ghm_approx }}</code>
    </div>
    {% endif %}
    <div class="info-item">
      <label>CMA / CMS</label>
      <span>{{ ghm.cma_count }} CMA, {{ ghm.cms_count }} CMS</span>
    </div>
  </div>
  {% if ghm.alertes %}
  <div style="margin-top:0.75rem;">
    {% for alerte in ghm.alertes %}
    <div style="font-size:0.8rem;color:#c2410c;margin-bottom:0.2rem;">{{ alerte }}</div>
    {% endfor %}
  </div>
  {% endif %}
  <div style="margin-top:0.75rem;font-size:0.7rem;color:#94a3b8;font-style:italic;">
    Estimation heuristique — le GHM définitif nécessite le groupeur officiel ATIH
  </div>
</div>
{% endif %}

{# ---- CPAM controls ---- #}
{% if dossier.controles_cpam %}
<div class="card section" style="border-left:4px solid #f59e0b;">
  <h3 style="color:#b45309;">Contrôle CPAM ({{ dossier.controles_cpam|length }})</h3>
  {% for ctrl in dossier.controles_cpam %}
  <div style="margin-bottom:1.5rem;{% if not loop.last %}border-bottom:1px solid #e2e8f0;padding-bottom:1rem;{% endif %}">
    <div style="display:flex;align-items:center;gap:0.5rem;margin-bottom:0.5rem;">
      <strong>OGC {{ ctrl.numero_ogc }} — {{ ctrl.titre }}</strong>
      {% if 'retient' in ctrl.decision_ucr|lower %}
      <span class="badge" style="background:#d1fae5;color:#065f46;">{{ ctrl.decision_ucr }}</span>
      {% elif 'confirme' in ctrl.decision_ucr|lower %}
      <span class="badge" style="background:#fee2e2;color:#dc2626;">{{ ctrl.decision_ucr }}</span>
      {% else %}
      <span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ ctrl.decision_ucr }}</span>
      {% endif %}
    </div>

    {# CPAM argument #}
    {% if ctrl.arg_ucr %}
    <div style="border-left:3px solid #f59e0b;padding:0.5rem 0.75rem;background:#fffbeb;margin-bottom:0.75rem;font-size:0.85rem;color:#78350f;">
      <div style="font-size:0.7rem;color:#92400e;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Argument CPAM</div>
      {{ ctrl.arg_ucr }}
    </div>
    {% endif %}

    {# Contested codes #}
    {% if ctrl.dp_ucr or ctrl.da_ucr or ctrl.dr_ucr or ctrl.actes_ucr %}
    <div style="margin-bottom:0.75rem;">
      <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Codes contestés</div>
      <div style="display:flex;gap:0.5rem;flex-wrap:wrap;">
        {% if ctrl.dp_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;">DP: {{ ctrl.dp_ucr }}</span>{% endif %}
        {% if ctrl.da_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;">DA: {{ ctrl.da_ucr }}</span>{% endif %}
        {% if ctrl.dr_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;">DR: {{ ctrl.dr_ucr }}</span>{% endif %}
        {% if ctrl.actes_ucr %}<span class="badge" style="background:#fef3c7;color:#92400e;">Actes: {{ ctrl.actes_ucr }}</span>{% endif %}
      </div>
    </div>
    {% endif %}

    {# Structured counter-argument, with a raw-text fallback #}
    {% if ctrl.response_data %}
    <div style="margin-bottom:0.75rem;">
      <div style="font-size:0.7rem;color:#1d4ed8;text-transform:uppercase;font-weight:600;margin-bottom:0.5rem;">Contre-argumentation</div>

      {% if ctrl.response_data.analyse_contestation %}
      <div style="border-left:3px solid #94a3b8;padding:0.5rem 0.75rem;background:#f8fafc;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#64748b;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Analyse de la contestation</div>
        {{ ctrl.response_data.analyse_contestation | format_cpam_text }}
      </div>
      {% endif %}

      {% if ctrl.response_data.points_accord and ctrl.response_data.points_accord|lower not in ['aucun', 'non applicable', 'n/a', ''] %}
      <div style="border-left:3px solid #22c55e;padding:0.5rem 0.75rem;background:#f0fdf4;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#16a34a;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Points d'accord</div>
        {{ ctrl.response_data.points_accord | format_cpam_text }}
      </div>
      {% endif %}

      {% if ctrl.response_data.contre_arguments_medicaux %}
      <div style="border-left:3px solid #3b82f6;padding:0.5rem 0.75rem;background:#eff6ff;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#1d4ed8;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Contre-arguments médicaux</div>
        {{ ctrl.response_data.contre_arguments_medicaux | format_cpam_text }}
      </div>
      {% endif %}

      {% if ctrl.response_data.preuves_dossier %}
      <div style="border-left:3px solid #0ea5e9;padding:0.5rem 0.75rem;background:#f0f9ff;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#0369a1;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Preuves du dossier</div>
        <ul style="margin:0.3rem 0;padding-left:1.2rem;">
          {% for p in ctrl.response_data.preuves_dossier %}
          {% if p is mapping %}
          <li style="margin-bottom:0.3rem;">
            <span style="display:inline-block;padding:1px 6px;border-radius:9999px;font-size:0.7rem;font-weight:600;background:#e0f2fe;color:#0369a1;">{{ p.element or p.get('type', '') }}</span>
            {{ p.valeur or '' }} <span style="color:#64748b;">→ {{ p.signification or '' }}</span>
          </li>
          {% endif %}
          {% endfor %}
        </ul>
      </div>
      {% endif %}

      {% if ctrl.response_data.contre_arguments_asymetrie %}
      <div style="border-left:3px solid #8b5cf6;padding:0.5rem 0.75rem;background:#f5f3ff;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#7c3aed;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Asymétrie d'information</div>
        {{ ctrl.response_data.contre_arguments_asymetrie | format_cpam_text }}
      </div>
      {% endif %}

      {% if ctrl.response_data.contre_arguments_reglementaires %}
      <div style="border-left:3px solid #6366f1;padding:0.5rem 0.75rem;background:#eef2ff;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#4f46e5;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Contre-arguments réglementaires</div>
        {{ ctrl.response_data.contre_arguments_reglementaires | format_cpam_text }}
      </div>
      {% endif %}

      {% if ctrl.response_data.references %}
      <div style="border-left:3px solid #64748b;padding:0.5rem 0.75rem;background:#f8fafc;margin-bottom:0.5rem;font-size:0.85rem;">
        <div style="font-size:0.7rem;color:#475569;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Références</div>
        {% for ref in ctrl.response_data.references %}
        {% if ref is mapping %}
        <blockquote style="margin:0.3rem 0;padding:0.3rem 0.5rem;border-left:2px solid #cbd5e1;background:#f1f5f9;font-size:0.8rem;color:#334155;">
          <strong>[{{ ref.document or '' }}{% if ref.page %}, p.{{ ref.page }}{% endif %}]</strong>
          {{ ref.citation or '' }}
        </blockquote>
        {% elif ref is string %}
        <p style="margin:0.2rem 0;font-size:0.8rem;color:#334155;">{{ ref }}</p>
        {% endif %}
        {% endfor %}
      </div>
      {% endif %}

      {% if ctrl.response_data.conclusion %}
      <div style="padding:0.5rem 0.75rem;background:#fffbeb;margin-bottom:0.5rem;font-size:0.85rem;border:1px solid #fde68a;border-left:3px solid #f59e0b;border-radius:0.25rem;">
        <div style="font-size:0.7rem;color:#b45309;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Conclusion</div>
        {{ ctrl.response_data.conclusion | format_cpam_text }}
      </div>
      {% endif %}
    </div>
    {% elif ctrl.contre_argumentation %}
    <div style="border-left:3px solid #3b82f6;padding:0.5rem 0.75rem;background:#eff6ff;margin-bottom:0.75rem;font-size:0.85rem;color:#1e3a5f;">
      <div style="font-size:0.7rem;color:#1d4ed8;text-transform:uppercase;font-weight:600;margin-bottom:0.25rem;">Contre-argumentation</div>
      <pre style="white-space:pre-wrap;font-family:inherit;margin:0;">{{ ctrl.contre_argumentation }}</pre>
    </div>
    {% endif %}

    {# RAG sources #}
    {% if ctrl.sources_reponse %}
    <details>
      <summary style="font-size:0.8rem;color:#64748b;">Sources RAG ({{ ctrl.sources_reponse|length }})</summary>
      {% for src in ctrl.sources_reponse %}
      <pre style="font-size:0.75rem;">{{ src.document }}{% if src.code %} — {{ src.code }}{% endif %}{% if src.page %} [p.{{ src.page }}]{% endif %}
{{ src.extrait or '' }}</pre>
      {% endfor %}
    </details>
    {% endif %}
  </div>
  {% endfor %}
</div>
{% endif %}

{# ---- Coding alerts ---- #}
{% if dossier.alertes_codage %}
<div class="card section" style="border-left:4px solid #f97316;background:#fff7ed;">
  <h3 style="color:#c2410c;">Alertes de codage ({{ dossier.alertes_codage|length }})</h3>
  <ul style="margin:0;padding-left:1.2rem;">
    {% for alerte in dossier.alertes_codage %}
    {% if alerte.startswith('NON-CUMUL') %}
    <li class="alerte-noncumul" style="font-size:0.85rem;margin-bottom:0.25rem;">{{ alerte }}</li>
    {% else %}
    <li class="alerte-standard" style="font-size:0.85rem;margin-bottom:0.25rem;">{{ alerte }}</li>
    {% endif %}
    {% endfor %}
  </ul>
</div>
{% endif %}

{# ---- Principal diagnosis ---- #}
{% if dossier.diagnostic_principal %}
{% set dp = dossier.diagnostic_principal %}
<div class="card section">
  <h3>Diagnostic principal</h3>
  <div style="font-size:0.95rem;margin-bottom:0.5rem;">
    {{ dp.texte }}
    {% if dp.source_page %}<button class="src-btn" data-texte="{{ dp.texte|e }}" data-excerpt="{{ dp.source_excerpt|default('',true)|e }}" data-page="{{ dp.source_page }}">p.{{ dp.source_page }}</button>{% endif %}
  </div>
  {% if dp.cim10_suggestion %}
  <span class="badge" style="background:#dbeafe;color:#1d4ed8;font-size:0.85rem;">{{ dp.cim10_suggestion }}</span>
  {{ dp.cim10_confidence | confidence_badge }}
  {% if dp.niveau_cma and dp.niveau_cma > 1 %}
  {{ dp.niveau_cma | cma_level_badge }}
  {% elif dp.est_cma %}
  <span class="badge" style="background:#fee2e2;color:#dc2626;font-size:0.75rem;">CMA</span>
  {% endif %}
  {{ dp.niveau_severite | severity_badge }}
  {% endif %}
  {% if dp.justification %}
  <div style="margin-top:0.5rem;font-size:0.8rem;color:#475569;">{{ dp.justification }}</div>
  {% endif %}
  {% if dp.preuves_cliniques %}
  <details style="margin-top:0.5rem;">
    <summary style="font-size:0.8rem;color:#0369a1;cursor:pointer;font-weight:600;">Preuves cliniques ({{ dp.preuves_cliniques|length }})</summary>
    <ul style="margin:0.25rem 0 0 0;padding-left:1.2rem;font-size:0.8rem;">
      {% for p in dp.preuves_cliniques %}
      <li style="margin-bottom:0.15rem;"><span class="badge" style="background:#e0f2fe;color:#0369a1;font-size:0.7rem;">{{ p.type }}</span> {{ p.element }} <span style="color:#64748b;">→ {{ p.interpretation }}</span></li>
      {% endfor %}
    </ul>
  </details>
  {% endif %}
  {% if dp.raisonnement %}
  <details style="margin-top:0.5rem;">
    <summary>Raisonnement LLM</summary>
    <pre>{{ dp.raisonnement }}</pre>
  </details>
  {% endif %}
  {% if dp.sources_rag %}
  <details>
    <summary>Sources RAG ({{ dp.sources_rag|length }})</summary>
    {% for src in dp.sources_rag %}
    <pre>{{ src.document }}{% if src.code %} — {{ src.code }}{% endif %}{% if src.page %} [p.{{ src.page }}]{% endif %}
{{ src.extrait or '' }}</pre>
    {% endfor %}
  </details>
  {% endif %}
</div>
{% endif %}
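{# The src-btn data-texte / data-excerpt / data-page attributes feed the #source-modal styled in base.html; the click handler that opens it is presumably defined elsewhere, outside this diff. #}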

{# ---- Associated diagnoses ---- #}
{% if dossier.diagnostics_associes %}
<div class="card section">
  <h3>Diagnostics associés ({{ dossier.diagnostics_associes|length }})</h3>
  <table>
    <thead><tr><th>Texte</th><th>CIM-10</th><th>Confiance</th><th>CMA</th><th>Source</th><th>Justification</th></tr></thead>
    <tbody>
      {% for das in dossier.diagnostics_associes %}
      <tr>
        <td>{{ das.texte }}</td>
        <td>{% if das.cim10_suggestion %}<span class="badge" style="background:#dbeafe;color:#1d4ed8;">{{ das.cim10_suggestion }}</span>{% endif %}</td>
        <td>{{ das.cim10_confidence | confidence_badge }}</td>
        <td>
          {% if das.niveau_cma and das.niveau_cma > 1 %}
          {{ das.niveau_cma | cma_level_badge }}
          {% elif das.est_cma %}
          <span class="badge" style="background:#fee2e2;color:#dc2626;font-size:0.7rem;">CMA</span>
          {% else %}
          —
          {% endif %}
        </td>
        <td>
          {% if das.source %}
          <span class="badge" style="background:#e0e7ff;color:#3730a3;font-size:0.7rem;">{{ das.source }}</span>
          {% endif %}
          {% if das.source_page %}
          <button class="src-btn" data-texte="{{ das.texte|e }}" data-excerpt="{{ das.source_excerpt|default('',true)|e }}" data-page="{{ das.source_page }}">p.{{ das.source_page }}</button>
          {% endif %}
        </td>
        <td style="font-size:0.8rem;color:#475569;">
          {{ das.justification or '' }}
          {% if das.preuves_cliniques %}
          <details style="margin-top:0.3rem;"><summary style="font-size:0.7rem;color:#0369a1;cursor:pointer;">preuves ({{ das.preuves_cliniques|length }})</summary>
            <ul style="margin:0.15rem 0 0 0;padding-left:1rem;font-size:0.75rem;">
              {% for p in das.preuves_cliniques %}
              <li><span style="font-weight:600;color:#0369a1;">[{{ p.type }}]</span> {{ p.element }} <span style="color:#64748b;">→ {{ p.interpretation }}</span></li>
              {% endfor %}
            </ul>
          </details>
          {% endif %}
        </td>
      </tr>
      {% if das.raisonnement %}
      <tr>
        <td colspan="6" style="padding:0 0.75rem 0.5rem;">
          <details>
            <summary>Raisonnement LLM</summary>
            <pre>{{ das.raisonnement }}</pre>
          </details>
        </td>
      </tr>
      {% endif %}
      {% if das.sources_rag %}
      <tr>
        <td colspan="6" style="padding:0 0.75rem 0.5rem;">
          <details>
            <summary>Sources RAG ({{ das.sources_rag|length }})</summary>
            {% for src in das.sources_rag %}
            <pre>{{ src.document }}{% if src.code %} — {{ src.code }}{% endif %}{% if src.page %} [p.{{ src.page }}]{% endif %}
{{ src.extrait or '' }}</pre>
            {% endfor %}
          </details>
        </td>
      </tr>
      {% endif %}
      {% endfor %}
    </tbody>
  </table>
</div>
{% endif %}

{# ---- CCAM procedures ---- #}
{% if dossier.actes_ccam %}
<div class="card section">
  <h3>Actes CCAM ({{ dossier.actes_ccam|length }})</h3>
  <table>
    <thead><tr><th>Texte</th><th>Code CCAM</th><th>Regroupement</th><th>Date</th><th>Validité</th><th>Source</th></tr></thead>
    <tbody>
      {% for a in dossier.actes_ccam %}
      <tr>
        <td>{{ a.texte }}</td>
        <td>{% if a.code_ccam_suggestion %}<span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ a.code_ccam_suggestion }}</span>{% endif %}</td>
        <td>
          {% if a.code_ccam_suggestion and ccam_dict.get(a.code_ccam_suggestion, {}).get('regroupement') %}
          <span class="badge badge-regroup">{{ ccam_dict[a.code_ccam_suggestion]['regroupement'] }}</span>
          {% else %}
          —
          {% endif %}
        </td>
        <td>{{ a.date or '' }}</td>
        <td>
          {% if a.validite == 'valide' %}<span class="badge" style="background:#d1fae5;color:#065f46;">Valide</span>
          {% elif a.validite == 'obsolete' %}<span class="badge" style="background:#fee2e2;color:#dc2626;">Obsolète</span>
          {% else %}—{% endif %}
          {% for alerte in a.alertes %}
          <div style="font-size:0.7rem;color:#dc2626;">{{ alerte }}</div>
          {% endfor %}
        </td>
        <td>{% if a.source_page %}<button class="src-btn" data-texte="{{ a.texte|e }}" data-excerpt="{{ a.source_excerpt|default('',true)|e }}" data-page="{{ a.source_page }}">p.{{ a.source_page }}</button>{% endif %}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{% endif %}

{# ---- Key lab values ---- #}
{% if dossier.biologie_cle %}
<div class="card section">
  <h3>Biologie clé ({{ dossier.biologie_cle|length }})</h3>
  <table>
    <thead><tr><th>Test</th><th>Valeur</th><th>Anomalie</th><th>Source</th></tr></thead>
    <tbody>
      {% for b in dossier.biologie_cle %}
      <tr{% if b.anomalie %} class="anomalie"{% endif %}>
        <td>{{ b.test }}</td>
        <td>{{ b.valeur or '' }}</td>
        <td>{% if b.anomalie %}<span class="badge" style="background:#fee2e2;color:#dc2626;">Oui</span>{% else %}—{% endif %}</td>
        <td>{% if b.source_page %}<button class="src-btn" data-texte="{{ b.test|e }}" data-excerpt="{{ b.source_excerpt|default('',true)|e }}" data-page="{{ b.source_page }}">p.{{ b.source_page }}</button>{% endif %}</td>
      </tr>
      {% endfor %}
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{# ---- Imagerie ---- #}
|
||||
{% if dossier.imagerie %}
|
||||
<div class="card section">
|
||||
<h3>Imagerie ({{ dossier.imagerie|length }})</h3>
|
||||
{% for img in dossier.imagerie %}
|
||||
<div style="margin-bottom:0.5rem;">
|
||||
<strong>{{ img.type }}</strong>
|
||||
{% if img.score %} — Score : {{ img.score }}{% endif %}
|
||||
{% if img.source_page %}<button class="src-btn" data-texte="{{ img.type|e }}" data-excerpt="{{ img.source_excerpt|default('',true)|e }}" data-page="{{ img.source_page }}">p.{{ img.source_page }}</button>{% endif %}
|
||||
{% if img.conclusion %}
|
||||
<div style="font-size:0.85rem;color:#475569;">{{ img.conclusion }}</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{# ---- Traitements de sortie ---- #}
|
||||
{% if dossier.traitements_sortie %}
|
||||
<div class="card section">
|
||||
<h3>Traitements de sortie ({{ dossier.traitements_sortie|length }})</h3>
|
||||
<table>
|
||||
<thead><tr><th>Médicament</th><th>Posologie</th><th>Code ATC</th><th>Source</th></tr></thead>
|
||||
<tbody>
|
||||
{% for t in dossier.traitements_sortie %}
|
||||
<tr>
|
||||
<td>{{ t.medicament }}</td>
|
||||
<td>{{ t.posologie or '' }}</td>
|
||||
<td>{% if t.code_atc %}<span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ t.code_atc }}</span>{% endif %}</td>
|
||||
<td>{% if t.source_page %}<button class="src-btn" data-texte="{{ t.medicament|e }}" data-excerpt="{{ t.source_excerpt|default('',true)|e }}" data-page="{{ t.source_page }}">p.{{ t.source_page }}</button>{% endif %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{# ---- Antécédents ---- #}
|
||||
{% if dossier.antecedents %}
|
||||
<div class="card section">
|
||||
<h3>Antécédents ({{ dossier.antecedents|length }})</h3>
|
||||
<ul class="bullet">
|
||||
{% for a in dossier.antecedents %}
|
||||
<li>{{ a.texte }}{% if a.source_page %} <button class="src-btn" data-texte="{{ a.texte|e }}" data-excerpt="{{ a.source_excerpt|default('',true)|e }}" data-page="{{ a.source_page }}">p.{{ a.source_page }}</button>{% endif %}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{# ---- Complications ---- #}
|
||||
{% if dossier.complications %}
|
||||
<div class="card section">
|
||||
<h3>Complications ({{ dossier.complications|length }})</h3>
|
||||
<ul class="bullet">
|
||||
{% for c in dossier.complications %}
|
||||
<li>{{ c.texte }}{% if c.source_page %} <button class="src-btn" data-texte="{{ c.texte|e }}" data-excerpt="{{ c.source_excerpt|default('',true)|e }}" data-page="{{ c.source_page }}">p.{{ c.source_page }}</button>{% endif %}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{# ---- Modal source ---- #}
|
||||
<div id="source-modal">
|
||||
<div id="source-modal-inner">
|
||||
<div id="source-header">
|
||||
<span id="source-title">Document source</span>
|
||||
<button id="source-close-btn" onclick="closeSource()">Fermer</button>
|
||||
</div>
|
||||
<div id="source-content"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
|
||||
{% block scripts %}
|
||||
<script>
|
||||
/* --- Source modal --- */
|
||||
let _sourceCache = null;
|
||||
const _dossierId = (function() {
|
||||
const fp = {{ filepath|tojson }};
|
||||
const parts = fp.split('/');
|
||||
return parts.length > 1 ? parts.slice(0, -1).join('/') : '';
|
||||
})();
|
||||
const _sourceFiles = {{ dossier.source_files|tojson }};
|
||||
|
||||
function getDossierId() { return _dossierId; }
|
||||
|
||||
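/* Fetches the dossier's source texts from the API and caches them client-side, so only one network call is made. */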
async function loadSourceTexts() {
  if (_sourceCache !== null) return _sourceCache;
  if (!_dossierId) { _sourceCache = {}; return _sourceCache; }
  try {
    const resp = await fetch('/api/source-text/' + _dossierId);
    if (resp.ok) { _sourceCache = await resp.json(); }
    else { _sourceCache = {}; }
  } catch (e) { _sourceCache = {}; }
  return _sourceCache;
}

/* Checks whether the redacted PDF is available (HEAD request) */
async function pdfAvailable(dossierId, filename) {
  try {
    const resp = await fetch('/api/pdf/' + dossierId + '/' + encodeURIComponent(filename), {method: 'HEAD'});
    return resp.ok;
  } catch (e) { return false; }
}

/* Builds the PDF URL with highlight + page */
function buildPdfUrl(dossierId, filename, page, excerpt) {
  let url = '/api/pdf/' + dossierId + '/' + encodeURIComponent(filename);
  const params = [];
  if (excerpt) params.push('highlight=' + encodeURIComponent(excerpt));
  if (page) params.push('page=' + page);
  if (params.length) url += '?' + params.join('&');
  url += '#page=' + (page || 1);
  return url;
}

/* Displays a PDF in the iframe */
function loadPdf(dossierId, filename, page, excerpt) {
  const content = document.getElementById('source-content');
  const url = buildPdfUrl(dossierId, filename, page, excerpt);
  content.className = 'source-content-pdf';
  content.innerHTML = '<iframe src="' + url + '" style="width:100%;height:100%;border:none;"></iframe>';
  // Mark the button of the file being shown as active
  document.querySelectorAll('.src-file-btn').forEach(b => {
    b.classList.toggle('active', b.textContent === filename);
  });
}

/* Shows the source modal — redacted PDF when available, text fallback otherwise */
async function showSource(excerpt, page, texte) {
  // For PDF highlighting, use the diagnosis text (not the raw excerpt)
  const highlightText = texte || excerpt;
  const modal = document.getElementById('source-modal');
  const modalInner = document.getElementById('source-modal-inner');
  const content = document.getElementById('source-content');
  const title = document.getElementById('source-title');

  title.textContent = 'Document source — Page ' + page;
  content.innerHTML = '<em style="color:#94a3b8;">Chargement...</em>';
  content.className = '';
  modalInner.className = '';
  modal.style.display = 'block';

  // Try PDF mode first
  if (_sourceFiles && _sourceFiles.length > 0 && _dossierId) {
    const firstFile = _sourceFiles[0];
    const available = await pdfAvailable(_dossierId, firstFile);
    if (available) {
      modalInner.className = '';
      if (_sourceFiles.length === 1) {
        loadPdf(_dossierId, firstFile, page, highlightText);
      } else {
        // Multi-PDF: selection buttons + iframe
        const safeHighlight = (highlightText || '').replace(/\\/g, '\\\\').replace(/'/g, "\\'");
        let html = '<div style="padding:0.5rem 0.75rem;border-bottom:1px solid #e2e8f0;display:flex;gap:0.5rem;flex-wrap:wrap;">';
        _sourceFiles.forEach(function(f) {
          const safeF = f.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
          html += '<button class="src-file-btn" onclick="loadPdf(\'' + _dossierId + '\', \'' + safeF + '\', ' + page + ', \'' + safeHighlight + '\')">' + f + '</button>';
        });
        html += '</div>';
        html += '<iframe id="pdf-frame" style="width:100%;flex:1;border:none;"></iframe>';
        content.className = 'source-content-pdf';
        content.style.display = 'flex';
        content.style.flexDirection = 'column';
        content.innerHTML = html;
        // Load the first PDF
        const iframe = content.querySelector('iframe');
        iframe.src = buildPdfUrl(_dossierId, firstFile, page, highlightText);
        content.querySelector('.src-file-btn').classList.add('active');
      }
      return;
    }
  }

  // Fallback: text mode (legacy behavior)
  modalInner.className = 'source-modal-text';
  content.className = '';
  content.style.display = '';

  const texts = await loadSourceTexts();
  const allText = Object.values(texts).join('\n\n--- ---\n\n');

  if (!allText) {
    content.innerHTML = '<em style="color:#94a3b8;">Texte source non disponible</em>';
    return;
  }

  // Clean the excerpt: strip the "..." added by extract_excerpt()
  let searchText = (excerpt || '').trim();
  if (searchText.startsWith('...')) searchText = searchText.substring(3);
  if (searchText.endsWith('...')) searchText = searchText.slice(0, -3);
  searchText = searchText.trim();

  // Look for the excerpt in the text and highlight it
  if (searchText.length > 10) {
    let idx = allText.indexOf(searchText);
    if (idx < 0 && searchText.length > 60) {
      const mid = Math.floor(searchText.length / 2);
      searchText = searchText.substring(mid - 30, mid + 30);
      idx = allText.indexOf(searchText);
    }
    if (idx >= 0) {
      const before = allText.substring(0, idx);
      const match = allText.substring(idx, idx + searchText.length);
      const after = allText.substring(idx + searchText.length);
      content.innerHTML = '';
      content.appendChild(document.createTextNode(before));
      const mark = document.createElement('mark');
      mark.textContent = match;
      mark.id = 'source-highlight';
      content.appendChild(mark);
      content.appendChild(document.createTextNode(after));
      setTimeout(() => {
        const el = document.getElementById('source-highlight');
        if (el) el.scrollIntoView({ behavior: 'smooth', block: 'center' });
      }, 100);
      return;
    }
  }

  content.textContent = allText;
}

function closeSource() {
  const content = document.getElementById('source-content');
  // Destroy the iframe to stop any pending PDF load
  content.innerHTML = '';
  content.style.display = '';
  content.className = '';
  document.getElementById('source-modal').style.display = 'none';
}

// Close the modal when clicking the backdrop
document.getElementById('source-modal').addEventListener('click', function(e) {
  if (e.target === this) closeSource();
});

// Close with Escape
document.addEventListener('keydown', function(e) {
  if (e.key === 'Escape') closeSource();
});

// Event delegation for every .src-btn button
document.addEventListener('click', function(e) {
  const btn = e.target.closest('.src-btn');
  if (btn && btn.dataset.page) {
    showSource(btn.dataset.excerpt || '', parseInt(btn.dataset.page), btn.dataset.texte || '');
  }
});

/* --- Reprocess --- */
document.getElementById('reprocess-btn').addEventListener('click', async () => {
  const btn = document.getElementById('reprocess-btn');
  const status = document.getElementById('reprocess-status');

  btn.disabled = true;
  btn.style.background = '#64748b';
  btn.innerHTML = '<span style="display:inline-flex;align-items:center;gap:0.4rem;"><span class="spinner"></span> Traitement en cours...</span>';
  status.innerHTML = '<span style="color:#3b82f6;">Demande envoyée, traitement lancé. Veuillez patienter...</span>';

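  // Display timer: refresh the elapsed time shown to the user every second while reprocessing runs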
  const startTime = Date.now();
  const timer = setInterval(() => {
    const elapsed = Math.floor((Date.now() - startTime) / 1000);
    const min = Math.floor(elapsed / 60);
    const sec = elapsed % 60;
    const timeStr = min > 0 ? min + 'min ' + String(sec).padStart(2, '0') + 's' : sec + 's';
    status.innerHTML = '<span style="color:#3b82f6;">Traitement en cours... ' + timeStr + '</span>';
  }, 1000);

  try {
    const response = await fetch('/reprocess/{{ filepath }}', { method: 'POST' });
    clearInterval(timer);
    const data = await response.json();

    if (data.ok) {
      status.innerHTML = '<span style="color:#16a34a;font-weight:600;">Traitement terminé. Rechargement...</span>';
      btn.style.background = '#16a34a';
      btn.innerHTML = 'Terminé';
      setTimeout(() => location.reload(), 1000);
    } else {
      status.innerHTML = '<span style="color:#dc2626;">' + (data.error || 'Erreur') + '</span>';
      btn.disabled = false;
      btn.style.background = '#3b82f6';
      btn.innerHTML = 'Relancer l\'étude';
    }
  } catch (err) {
    clearInterval(timer);
    status.innerHTML = '<span style="color:#dc2626;">Erreur réseau</span>';
    btn.disabled = false;
    btn.style.background = '#3b82f6';
    btn.innerHTML = 'Relancer l\'étude';
  }
});
</script>
{% endblock %}
100
t2a_install_rag_cleanup/src/viewer/templates/index.html
Normal file
100
t2a_install_rag_cleanup/src/viewer/templates/index.html
Normal file
@@ -0,0 +1,100 @@
{% extends "base.html" %}
{% block title %}Accueil{% endblock %}

{% block sidebar %}
{% for group_name, items in groups.items() %}
  <div class="group-title">{{ group_name | format_dossier_name }}</div>
  {% for item in items %}
    {% if 'fusionne' in item.name %}
      <a href="/dossier/{{ item.path_rel }}" class="sidebar-fusionne">★ Fusionné</a>
    {% else %}
      <a href="/dossier/{{ item.path_rel }}">{{ item.name | format_doc_name }}</a>
    {% endif %}
  {% endfor %}
{% endfor %}
{% endblock %}

{% block content %}
<h2>Dossiers médicaux traités</h2>

{% if not groups %}
<div class="card">
  <p>Aucun dossier trouvé dans <code>output/structured/</code>.</p>
  <p style="margin-top:0.5rem;font-size:0.85rem;color:#64748b">
    Lancez le pipeline avec <code>python -m src.main</code> pour générer des fichiers.
  </p>
</div>
{% endif %}

{% for group_name, items in groups.items() %}
<div class="section">
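  {# Accumulate the group's total processing time, shown in the header below #}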
  {% set ns = namespace(total=0.0, count=0) %}
  {% for item in items %}
    {% if item.dossier.processing_time_s is not none %}
      {% set ns.total = ns.total + item.dossier.processing_time_s %}
      {% set ns.count = ns.count + 1 %}
    {% endif %}
  {% endfor %}
  {% set stats = group_stats.get(group_name, {}) %}
  <h3 style="display:flex;align-items:baseline;gap:0.75rem;flex-wrap:wrap;">
    {{ group_name | format_dossier_name }}
    <span style="font-size:0.75rem;font-weight:400;color:#64748b;">
      {{ items|length }} fichier(s){% if ns.count %} — total : {{ ns.total|format_duration }}{% endif %}
    </span>
    {% if stats %}
      <span class="badge-count badge-das">{{ stats.das_count }} DAS</span>
      <span class="badge-count badge-actes">{{ stats.actes_count }} actes</span>
      {% if stats.alertes_count %}<span class="badge-count badge-alertes">{{ stats.alertes_count }} alertes</span>{% endif %}
      {% if stats.cma_count %}<span class="badge-count badge-cma">{{ stats.cma_count }} CMA</span>{% endif %}
    {% endif %}
  </h3>
  {% if items|length > 1 %}
    {% for item in items if 'fusionne' in item.name %}
      {% if loop.first %}
        <div style="margin-bottom:0.75rem;">
          <a href="/dossier/{{ item.path_rel }}" class="badge-count badge-fusion" style="text-decoration:none;font-size:0.8rem;padding:4px 12px;">
            Vue patient fusionnée
          </a>
        </div>
      {% endif %}
    {% endfor %}
  {% endif %}
  <div style="display:grid;grid-template-columns:repeat(auto-fill,minmax(300px,1fr));gap:1rem;">
    {% for item in items %}
      <a href="/dossier/{{ item.path_rel }}" style="text-decoration:none;color:inherit;">
        <div class="card" style="cursor:pointer;transition:box-shadow 0.15s;">
          <div style="font-weight:600;font-size:0.9rem;margin-bottom:0.4rem;color:#0f172a;">
            {{ item.name | format_doc_name }}
          </div>
          <div style="display:flex;flex-wrap:wrap;gap:0.3rem;margin-bottom:0.4rem;">
            {% if item.dossier.document_type %}
              <span class="badge" style="background:#e0e7ff;color:#3730a3;">{{ item.dossier.document_type }}</span>
            {% endif %}
            {% if item.dossier.source_files %}<span class="badge badge-fusion">fusionné</span>{% endif %}
            {% if item.dossier.diagnostics_associes %}<span class="badge-count badge-das">{{ item.dossier.diagnostics_associes|length }} DAS</span>{% endif %}
            {% if item.dossier.actes_ccam %}<span class="badge-count badge-actes">{{ item.dossier.actes_ccam|length }} actes</span>{% endif %}
            {% if item.dossier.alertes_codage %}<span class="badge-count badge-alertes">{{ item.dossier.alertes_codage|length }} alertes</span>{% endif %}
          </div>
          {% if item.dossier.diagnostic_principal %}
            <div style="margin-top:0.5rem;font-size:0.8rem;color:#334155;">
              <strong>DP :</strong> {{ item.dossier.diagnostic_principal.texte[:80] }}{% if item.dossier.diagnostic_principal.texte|length > 80 %}…{% endif %}
            </div>
            {% if item.dossier.diagnostic_principal.cim10_suggestion %}
              <div style="margin-top:0.25rem;">
                <span class="badge" style="background:#dbeafe;color:#1d4ed8;">{{ item.dossier.diagnostic_principal.cim10_suggestion }}</span>
                {{ item.dossier.diagnostic_principal.cim10_confidence | confidence_badge }}
              </div>
            {% endif %}
          {% endif %}
          {% if item.dossier.processing_time_s is not none %}
            <div style="margin-top:0.5rem;font-size:0.75rem;color:#64748b;">
              Traitement : {{ item.dossier.processing_time_s|format_duration }}
            </div>
          {% endif %}
        </div>
      </a>
    {% endfor %}
  </div>
</div>
{% endfor %}
{% endblock %}
@@ -0,0 +1,404 @@
{% extends "base.html" %}
{% block title %}Validation — {{ group_name }}{% endblock %}

{% block sidebar %}
{% for gn, items in groups.items() %}
  <div class="group-title">{{ gn | format_dossier_name }}</div>
  {% for item in items %}
    {% if 'fusionne' in item.name %}
      <a href="/dossier/{{ item.path_rel }}" class="sidebar-fusionne">★ Fusionné</a>
    {% else %}
      <a href="/dossier/{{ item.path_rel }}">{{ item.name | format_doc_name }}</a>
    {% endif %}
  {% endfor %}
{% endfor %}
{% endblock %}

{% block content %}
<style>
.seg-btn { display:inline-block;padding:4px 12px;border:1px solid #cbd5e1;font-size:0.8rem;font-weight:600;cursor:pointer;background:#fff;color:#475569;transition:all 0.15s; }
.seg-btn:first-child { border-radius:6px 0 0 6px; }
.seg-btn:last-child { border-radius:0 6px 6px 0; }
.seg-btn:not(:first-child) { border-left:none; }
.seg-btn.active-correct { background:#dcfce7;color:#16a34a;border-color:#16a34a; }
.seg-btn.active-modifie { background:#fef9c3;color:#ca8a04;border-color:#ca8a04; }
.seg-btn.active-supprime { background:#fee2e2;color:#dc2626;border-color:#dc2626; }
.code-input { padding:4px 8px;border:1px solid #cbd5e1;border-radius:6px;font-size:0.85rem;font-family:monospace;width:120px; }
.comment-input { padding:4px 8px;border:1px solid #e2e8f0;border-radius:6px;font-size:0.8rem;width:100%;max-width:300px; }
.autocomplete-wrapper { position:relative;display:inline-block; }
.autocomplete-dropdown { position:absolute;top:100%;left:0;z-index:100;background:#fff;border:1px solid #cbd5e1;border-radius:6px;box-shadow:0 4px 12px rgba(0,0,0,0.1);max-height:250px;overflow-y:auto;width:400px;display:none; }
.autocomplete-dropdown .ac-item { padding:6px 10px;cursor:pointer;font-size:0.8rem;border-bottom:1px solid #f1f5f9; }
.autocomplete-dropdown .ac-item:hover { background:#f1f5f9; }
.autocomplete-dropdown .ac-code { font-family:monospace;font-weight:700;color:#1e293b;margin-right:8px; }
.autocomplete-dropdown .ac-label { color:#64748b; }
.save-bar { position:sticky;bottom:0;background:#fff;border-top:2px solid #e2e8f0;padding:1rem;display:flex;align-items:center;gap:1rem;z-index:50; }
.btn-save { padding:8px 20px;border-radius:8px;border:none;font-size:0.85rem;font-weight:600;cursor:pointer;transition:all 0.15s; }
.btn-brouillon { background:#f1f5f9;color:#475569; }
.btn-brouillon:hover { background:#e2e8f0; }
.btn-valider { background:#16a34a;color:#fff; }
.btn-valider:hover { background:#15803d; }
.nav-link { font-size:0.85rem;color:#3b82f6;text-decoration:none;font-weight:600; }
.nav-link:hover { text-decoration:underline; }
.das-row-added { background:#f0fdf4; }
</style>

<!-- Navigation -->
<div style="display:flex;align-items:center;gap:1rem;margin-bottom:1rem;">
  <a href="/validation" class="back">Retour à la liste</a>
  <span style="color:#cbd5e1;">|</span>
  {% if prev_id %}
    <a href="/validation/{{ prev_id }}" class="nav-link">Précédent</a>
  {% endif %}
  {% if next_id %}
    <a href="/validation/{{ next_id }}" class="nav-link">Suivant</a>
  {% endif %}
  <span style="flex:1;"></span>
  <a href="/dossier/{{ group_name }}/{{ group_name }}_fusionne_cim10.json" class="nav-link" target="_blank">Voir le dossier complet</a>
</div>

<h2>Validation : {{ group_name }}</h2>

{% if dossier %}
<!-- Stay info (read-only) -->
<div class="card" style="margin-bottom:1rem;">
  <h3>Séjour</h3>
  <div class="info-grid">
    {% if dossier.sejour.sexe %}<div class="info-item"><label>Sexe</label><span>{{ dossier.sejour.sexe }}</span></div>{% endif %}
    {% if dossier.sejour.age is not none %}<div class="info-item"><label>Âge</label><span>{{ dossier.sejour.age }} ans</span></div>{% endif %}
    {% if dossier.sejour.date_entree %}<div class="info-item"><label>Entrée</label><span>{{ dossier.sejour.date_entree }}</span></div>{% endif %}
    {% if dossier.sejour.date_sortie %}<div class="info-item"><label>Sortie</label><span>{{ dossier.sejour.date_sortie }}</span></div>{% endif %}
    {% if dossier.sejour.duree_sejour is not none %}<div class="info-item"><label>Durée</label><span>{{ dossier.sejour.duree_sejour }}j</span></div>{% endif %}
  </div>
</div>

<!-- DP -->
<div class="card" style="margin-bottom:1rem;">
  <h3>Diagnostic Principal</h3>
  {% set dp = annotation.dp %}
  <div style="margin-bottom:0.75rem;">
    <span style="font-size:0.9rem;">{{ dp.texte_original }}</span>
  </div>
  <div style="display:flex;align-items:center;gap:1rem;flex-wrap:wrap;">
    <span style="font-family:monospace;font-size:1rem;font-weight:700;background:#dbeafe;color:#1d4ed8;padding:2px 10px;border-radius:6px;">{{ dp.code_pipeline }}</span>
    {{ dp.confidence | confidence_badge }}

    <!-- Segmented buttons (restore the saved status, mirroring the DAS rows) -->
    <div class="seg-group" data-target="dp">
      <span class="seg-btn {% if dp.statut not in ('modifie', 'supprime') %}active-correct{% endif %}" data-value="correct" onclick="setStatut(this)">Correct</span>
      <span class="seg-btn {% if dp.statut == 'modifie' %}active-modifie{% endif %}" data-value="modifie" onclick="setStatut(this)">Modifier</span>
      <span class="seg-btn {% if dp.statut == 'supprime' %}active-supprime{% endif %}" data-value="supprime" onclick="setStatut(this)">Supprimer</span>
    </div>

    <!-- Alternative code field -->
    <div class="autocomplete-wrapper dp-code-field" style="{% if dp.statut != 'modifie' %}display:none;{% endif %}">
      <input type="text" class="code-input" placeholder="Code CIM-10" data-ac="dp"
             value="{{ dp.code_corrige or '' }}" autocomplete="off">
      <div class="autocomplete-dropdown"></div>
    </div>

    <input type="text" class="comment-input dp-comment" placeholder="Commentaire (optionnel)" value="{{ dp.commentaire or '' }}">
  </div>
</div>

<!-- DAS -->
<div class="card" style="margin-bottom:1rem;">
  <h3>Diagnostics Associés ({{ annotation.das|length }})</h3>
  <table id="das-table">
    <thead>
      <tr>
        <th style="width:30px;">#</th>
        <th>Texte</th>
        <th>Code pipeline</th>
        <th>Conf.</th>
        <th>Source</th>
        <th>Validation</th>
        <th>Code corrigé</th>
        <th>Commentaire</th>
      </tr>
    </thead>
    <tbody>
    {% for das in annotation.das %}
      <tr class="das-row" data-index="{{ das.index }}">
        <td style="color:#94a3b8;font-size:0.8rem;">{{ loop.index }}</td>
        <td style="font-size:0.85rem;max-width:250px;">{{ das.texte_original }}</td>
        <td><span style="font-family:monospace;font-weight:600;">{{ das.code_pipeline }}</span></td>
        <td>{{ das.confidence | confidence_badge }}</td>
        <td>
          {% if das.source %}
            <span style="font-size:0.7rem;padding:2px 6px;border-radius:4px;background:#f1f5f9;color:#475569;">{{ das.source }}</span>
          {% endif %}
        </td>
        <td>
          <div class="seg-group" data-target="das-{{ das.index }}">
            <span class="seg-btn {% if das.statut == 'correct' %}active-correct{% endif %}" data-value="correct" onclick="setStatut(this)">OK</span>
            <span class="seg-btn {% if das.statut == 'modifie' %}active-modifie{% endif %}" data-value="modifie" onclick="setStatut(this)">Mod</span>
            <span class="seg-btn {% if das.statut == 'supprime' %}active-supprime{% endif %}" data-value="supprime" onclick="setStatut(this)">Sup</span>
          </div>
        </td>
        <td>
          <div class="autocomplete-wrapper das-code-field-{{ das.index }}" style="{% if das.statut != 'modifie' %}display:none;{% endif %}">
            <input type="text" class="code-input" placeholder="CIM-10" data-ac="das-{{ das.index }}"
                   value="{{ das.code_corrige or '' }}" autocomplete="off">
            <div class="autocomplete-dropdown"></div>
          </div>
        </td>
        <td>
          <input type="text" class="comment-input das-comment" data-index="{{ das.index }}" placeholder="" value="{{ das.commentaire or '' }}">
        </td>
      </tr>
    {% endfor %}
    </tbody>
  </table>

  <!-- Added DAS -->
  <div id="das-ajoutes" style="margin-top:1rem;">
    <h3 style="font-size:0.85rem;color:#16a34a;">DAS manquants (ajoutés par le DIM)</h3>
    <div id="das-ajoutes-list">
      {% for aj in annotation.das_ajoutes %}
        <div class="das-added-row das-row-added" style="display:flex;align-items:center;gap:0.5rem;padding:0.5rem 0;border-bottom:1px solid #e2e8f0;" data-aj-index="{{ loop.index0 }}">
          <input type="text" class="comment-input aj-texte" placeholder="Texte du diagnostic" value="{{ aj.texte or '' }}" style="flex:1;max-width:300px;">
          <div class="autocomplete-wrapper">
            <input type="text" class="code-input aj-code" placeholder="CIM-10" value="{{ aj.code or '' }}" autocomplete="off" data-ac="aj-{{ loop.index0 }}">
            <div class="autocomplete-dropdown"></div>
          </div>
          <input type="text" class="comment-input aj-comment" placeholder="Commentaire" value="{{ aj.commentaire or '' }}" style="max-width:200px;">
          <button onclick="removeAjoute(this)" style="background:none;border:none;color:#dc2626;cursor:pointer;font-size:1.1rem;padding:4px 8px;">×</button>
        </div>
      {% endfor %}
    </div>
    <button id="btn-add-das" onclick="addDasManquant()" style="margin-top:0.5rem;padding:6px 14px;border-radius:6px;border:1px dashed #16a34a;background:#f0fdf4;color:#16a34a;font-size:0.8rem;font-weight:600;cursor:pointer;">
      + Ajouter un DAS manquant
    </button>
  </div>
</div>

<!-- General comment -->
<div class="card" style="margin-bottom:5rem;">
  <h3>Commentaire général</h3>
  <textarea id="commentaire-general" rows="3"
            style="width:100%;padding:8px;border:1px solid #cbd5e1;border-radius:6px;font-size:0.85rem;resize:vertical;">{{ annotation.commentaire_general or '' }}</textarea>
</div>
{% endif %}

<!-- Sticky save bar -->
<div class="save-bar">
  <button class="btn-save btn-brouillon" onclick="saveAnnotation('en_cours')">Enregistrer (brouillon)</button>
  <button class="btn-save btn-valider" onclick="saveAnnotation('valide')">Marquer comme validé</button>
  <span id="save-status" style="font-size:0.8rem;color:#64748b;"></span>
  <span style="flex:1;"></span>
  <span style="font-size:0.8rem;color:#94a3b8;">Dossier : {{ dossier_id }}</span>
</div>

{% endblock %}

{% block scripts %}
<script>
const DOSSIER_ID = {{ dossier_id | tojson }};
let ajouteCounter = {{ annotation.das_ajoutes|length }};

// --- Segmented buttons ---
function setStatut(btn) {
  const group = btn.parentElement;
  const target = group.dataset.target;
  const value = btn.dataset.value;

  // Reset every button in the group
  group.querySelectorAll('.seg-btn').forEach(function(b) {
    b.className = 'seg-btn';
  });
  btn.classList.add('active-' + value);

  // Show/hide the code field
  let codeField;
  if (target === 'dp') {
    codeField = document.querySelector('.dp-code-field');
  } else {
    const idx = target.replace('das-', '');
    codeField = document.querySelector('.das-code-field-' + idx);
  }
  if (codeField) {
    codeField.style.display = (value === 'modifie') ? '' : 'none';
  }
}

// --- CIM-10 autocomplete ---
let acDebounceTimer = null;

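// Debounced lookup (300 ms) so each keystroke does not trigger a call to /api/cim10/search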
document.addEventListener('input', function(e) {
  if (!e.target.matches('[data-ac]') && !e.target.matches('.aj-code')) return;

  const input = e.target;
  const dropdown = input.parentElement.querySelector('.autocomplete-dropdown');
  const q = input.value.trim();

  if (q.length < 2) {
    dropdown.style.display = 'none';
    return;
  }

  clearTimeout(acDebounceTimer);
  acDebounceTimer = setTimeout(function() {
    fetch('/api/cim10/search?q=' + encodeURIComponent(q))
      .then(function(r) { return r.json(); })
      .then(function(data) {
        if (!data.results || !data.results.length) {
          dropdown.style.display = 'none';
          return;
        }
        dropdown.innerHTML = '';
        data.results.forEach(function(item) {
          const div = document.createElement('div');
          div.className = 'ac-item';
          div.innerHTML = '<span class="ac-code">' + item.code + '</span><span class="ac-label">' + item.label.substring(0, 80) + '</span>';
          div.addEventListener('click', function() {
            input.value = item.code;
            dropdown.style.display = 'none';
          });
          dropdown.appendChild(div);
        });
        dropdown.style.display = 'block';
      })
      .catch(function() { dropdown.style.display = 'none'; });
  }, 300);
});

// Close the dropdowns when clicking elsewhere
document.addEventListener('click', function(e) {
  if (!e.target.matches('[data-ac]') && !e.target.matches('.aj-code') && !e.target.closest('.autocomplete-dropdown')) {
    document.querySelectorAll('.autocomplete-dropdown').forEach(function(d) { d.style.display = 'none'; });
  }
});

// --- Add a missing DAS ---
function addDasManquant() {
  const list = document.getElementById('das-ajoutes-list');
  const idx = ajouteCounter++;
  const row = document.createElement('div');
  row.className = 'das-added-row das-row-added';
  row.style.cssText = 'display:flex;align-items:center;gap:0.5rem;padding:0.5rem 0;border-bottom:1px solid #e2e8f0;';
  row.dataset.ajIndex = idx;
  row.innerHTML = '<input type="text" class="comment-input aj-texte" placeholder="Texte du diagnostic" style="flex:1;max-width:300px;">' +
    '<div class="autocomplete-wrapper">' +
    '<input type="text" class="code-input aj-code" placeholder="CIM-10" autocomplete="off" data-ac="aj-' + idx + '">' +
    '<div class="autocomplete-dropdown"></div>' +
    '</div>' +
    '<input type="text" class="comment-input aj-comment" placeholder="Commentaire" style="max-width:200px;">' +
    '<button onclick="removeAjoute(this)" style="background:none;border:none;color:#dc2626;cursor:pointer;font-size:1.1rem;padding:4px 8px;">×</button>';
  list.appendChild(row);
}

function removeAjoute(btn) {
  btn.closest('.das-added-row').remove();
}

// --- Collect and save ---
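/* Gathers the form state into the payload sent to /api/validation/save:
   { dossier_id, validateur, statut, dp, das, das_ajoutes, commentaire_general } */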
function collectAnnotation(statut) {
  // Server-rendered DAS metadata, hoisted once instead of re-embedding the array per row
  const dasData = {{ annotation.das | tojson }};
  const validateur = document.getElementById('validateur-name');
  const valName = validateur ? validateur.value : (localStorage.getItem('t2a_validateur') || '');

  // DP
  const dpGroup = document.querySelector('.seg-group[data-target="dp"]');
  const dpActive = dpGroup ? dpGroup.querySelector('.seg-btn[class*="active-"]') : null;
  let dpStatut = 'correct';
  if (dpActive) {
    if (dpActive.classList.contains('active-modifie')) dpStatut = 'modifie';
    else if (dpActive.classList.contains('active-supprime')) dpStatut = 'supprime';
  }
  const dpCodeField = document.querySelector('.dp-code-field input');
  const dpComment = document.querySelector('.dp-comment');

  const dp = {
    texte_original: {{ (annotation.dp.texte_original or '') | tojson }},
    code_pipeline: {{ (annotation.dp.code_pipeline or '') | tojson }},
    confidence: {{ (annotation.dp.confidence or '') | tojson }},
    statut: dpStatut,
    code_corrige: (dpStatut === 'modifie' && dpCodeField) ? dpCodeField.value : null,
    commentaire: dpComment ? dpComment.value : ''
  };

  // DAS
  const dasRows = document.querySelectorAll('.das-row');
  const das = [];
  dasRows.forEach(function(row) {
    const idx = parseInt(row.dataset.index);
    const group = row.querySelector('.seg-group');
    const active = group ? group.querySelector('.seg-btn[class*="active-"]') : null;
    let dasStatut = 'correct';
    if (active) {
      if (active.classList.contains('active-modifie')) dasStatut = 'modifie';
      else if (active.classList.contains('active-supprime')) dasStatut = 'supprime';
    }
    const codeInput = row.querySelector('.code-input');
    const commentInput = row.querySelector('.das-comment');
    const meta = dasData.find(function(d) { return d.index === idx; }) || {};

    das.push({
      index: idx,
      texte_original: row.querySelector('td:nth-child(2)').textContent.trim(),
      code_pipeline: row.querySelector('td:nth-child(3)').textContent.trim(),
      confidence: meta.confidence || '',
      source: meta.source || '',
      statut: dasStatut,
      code_corrige: (dasStatut === 'modifie' && codeInput) ? codeInput.value : null,
      commentaire: commentInput ? commentInput.value : ''
    });
  });

  // Added DAS
  const ajRows = document.querySelectorAll('.das-added-row');
  const das_ajoutes = [];
  ajRows.forEach(function(row) {
    const texte = row.querySelector('.aj-texte').value.trim();
    const code = row.querySelector('.aj-code').value.trim();
    const comment = row.querySelector('.aj-comment').value.trim();
    if (texte || code) {
      das_ajoutes.push({ texte: texte, code: code, commentaire: comment });
    }
  });

  return {
    dossier_id: DOSSIER_ID,
    validateur: valName,
    statut: statut,
    dp: dp,
    das: das,
    das_ajoutes: das_ajoutes,
    commentaire_general: document.getElementById('commentaire-general').value
  };
}

function saveAnnotation(statut) {
  const data = collectAnnotation(statut);
  const statusEl = document.getElementById('save-status');
  statusEl.textContent = 'Sauvegarde...';
  statusEl.style.color = '#64748b';

  fetch('/api/validation/save', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(data)
  })
    .then(function(r) { return r.json(); })
    .then(function(d) {
      if (d.ok) {
        statusEl.textContent = statut === 'valide' ? 'Validé !' : 'Brouillon enregistré';
        statusEl.style.color = '#16a34a';
      } else {
        statusEl.textContent = d.error || 'Erreur';
        statusEl.style.color = '#dc2626';
      }
    })
    .catch(function() {
      statusEl.textContent = 'Erreur réseau';
      statusEl.style.color = '#dc2626';
    });
}

// Keyboard shortcut: Ctrl+S saves a draft
document.addEventListener('keydown', function(e) {
  if ((e.ctrlKey || e.metaKey) && e.key === 's') {
    e.preventDefault();
    saveAnnotation('en_cours');
  }
});
</script>
{% endblock %}
@@ -0,0 +1,179 @@
{% extends "base.html" %}
{% block title %}Validation DIM{% endblock %}

{% block sidebar %}
{% for group_name, items in groups.items() %}
  <div class="group-title">{{ group_name | format_dossier_name }}</div>
  {% for item in items %}
    {% if 'fusionne' in item.name %}
      <a href="/dossier/{{ item.path_rel }}" class="sidebar-fusionne">★ Fusionné</a>
    {% else %}
      <a href="/dossier/{{ item.path_rel }}">{{ item.name | format_doc_name }}</a>
    {% endif %}
  {% endfor %}
{% endfor %}
{% endblock %}

{% block content %}
<div style="display:flex;align-items:center;gap:1rem;margin-bottom:1.5rem;">
  <h2 style="margin:0;">Validation DIM</h2>
  <a href="/validation/metrics" style="font-size:0.85rem;color:#3b82f6;text-decoration:none;font-weight:600;">Voir les métriques</a>
</div>

<!-- Progress bar -->
<div class="card" style="margin-bottom:1.5rem;">
  <div style="display:flex;align-items:center;gap:1rem;margin-bottom:0.75rem;">
    <span style="font-weight:600;font-size:0.9rem;">Progression</span>
    <span style="font-size:0.85rem;color:#64748b;">{{ valides }} / {{ total }} validés</span>
    {% if en_cours > 0 %}
      <span style="font-size:0.85rem;color:#ca8a04;">{{ en_cours }} en cours</span>
    {% endif %}
  </div>
  <div style="background:#e2e8f0;border-radius:9999px;height:12px;overflow:hidden;">
    {% set pct_valide = (valides / total * 100) if total > 0 else 0 %}
    {% set pct_encours = (en_cours / total * 100) if total > 0 else 0 %}
    <div style="display:flex;height:100%;">
      <div style="width:{{ pct_valide }}%;background:#16a34a;transition:width 0.3s;"></div>
      <div style="width:{{ pct_encours }}%;background:#eab308;transition:width 0.3s;"></div>
    </div>
  </div>
</div>

<!-- Validator name -->
<div class="card" style="margin-bottom:1rem;padding:0.75rem 1.25rem;">
  <div style="display:flex;align-items:center;gap:0.75rem;">
    <label for="validateur-name" style="font-size:0.8rem;font-weight:600;color:#475569;white-space:nowrap;">Nom du validateur :</label>
    <input type="text" id="validateur-name" placeholder="Dr. X"
           style="flex:1;padding:0.35rem 0.6rem;border:1px solid #cbd5e1;border-radius:6px;font-size:0.85rem;max-width:300px;">
  </div>
</div>

<!-- Filters -->
<div style="display:flex;gap:0.5rem;margin-bottom:1rem;">
  <button class="filter-btn active" data-filter="all"
          style="padding:0.35rem 0.75rem;border-radius:6px;border:1px solid #cbd5e1;background:#fff;font-size:0.8rem;font-weight:600;cursor:pointer;">
    Tous ({{ total }})
  </button>
  <button class="filter-btn" data-filter="non_commence"
          style="padding:0.35rem 0.75rem;border-radius:6px;border:1px solid #cbd5e1;background:#fff;font-size:0.8rem;font-weight:600;cursor:pointer;">
    Non commencés ({{ total - valides - en_cours }})
  </button>
  <button class="filter-btn" data-filter="en_cours"
          style="padding:0.35rem 0.75rem;border-radius:6px;border:1px solid #cbd5e1;background:#fff;font-size:0.8rem;font-weight:600;cursor:pointer;">
    En cours ({{ en_cours }})
  </button>
  <button class="filter-btn" data-filter="valide"
          style="padding:0.35rem 0.75rem;border-radius:6px;border:1px solid #cbd5e1;background:#fff;font-size:0.8rem;font-weight:600;cursor:pointer;">
    Validés ({{ valides }})
  </button>
</div>

<!-- Table -->
<div class="card" style="padding:0;">
  <table>
    <thead>
      <tr>
        <th>#</th>
        <th>Dossier</th>
        <th>DP</th>
        <th>Conf. DP</th>
        <th>DAS</th>
        <th>CPAM</th>
        <th>Statut</th>
        <th>Validateur</th>
      </tr>
    </thead>
    <tbody>
    {% for item in items %}
      <tr class="validation-row" data-statut="{{ item.statut }}">
        <td style="color:#94a3b8;font-size:0.8rem;">{{ loop.index }}</td>
        <td>
          <a href="/validation/{{ item.dossier_id }}" style="color:#1e293b;text-decoration:none;font-weight:600;font-size:0.85rem;">
            {{ item.group_name }}
          </a>
        </td>
        <td>
          <span style="font-family:monospace;font-size:0.85rem;font-weight:600;">{{ item.dp_code or '—' }}</span>
          {% if item.dp_texte %}
            <br><span style="font-size:0.75rem;color:#64748b;">{{ item.dp_texte[:50] }}{% if item.dp_texte|length > 50 %}…{% endif %}</span>
          {% endif %}
        </td>
        <td>{{ item.dp_confidence | confidence_badge }}</td>
        <td style="text-align:center;font-weight:600;">{{ item.nb_das }}</td>
        <td style="text-align:center;">
          {% if item.has_cpam %}
            <span style="display:inline-block;padding:2px 8px;border-radius:9999px;font-size:0.7rem;font-weight:600;color:#7c3aed;background:#ede9fe;">CPAM</span>
          {% endif %}
        </td>
        <td>
          {% if item.statut == 'valide' %}
            <span style="display:inline-block;padding:2px 8px;border-radius:9999px;font-size:0.7rem;font-weight:600;color:#16a34a;background:#dcfce7;">Validé</span>
          {% elif item.statut == 'en_cours' %}
            <span style="display:inline-block;padding:2px 8px;border-radius:9999px;font-size:0.7rem;font-weight:600;color:#ca8a04;background:#fef9c3;">En cours</span>
          {% else %}
            <span style="display:inline-block;padding:2px 8px;border-radius:9999px;font-size:0.7rem;font-weight:600;color:#6b7280;background:#f3f4f6;">Non commencé</span>
          {% endif %}
        </td>
        <td style="font-size:0.8rem;color:#64748b;">{{ item.validateur }}</td>
      </tr>
    {% endfor %}
    </tbody>
  </table>
</div>

{% if not items %}
<div class="card" style="text-align:center;padding:2rem;">
  <p style="color:#64748b;font-size:0.9rem;">Aucun dossier sélectionné pour validation.</p>
  <p style="margin-top:0.5rem;font-size:0.85rem;color:#94a3b8;">
    Lancez <code>python scripts/select_validation_dossiers.py</code> pour sélectionner les dossiers.
  </p>
</div>
{% endif %}

{% endblock %}

{% block scripts %}
<script>
(function() {
  // Persist the validator name in localStorage
  const input = document.getElementById('validateur-name');
  if (input) {
    const saved = localStorage.getItem('t2a_validateur');
    if (saved) input.value = saved;
    input.addEventListener('input', function() {
      localStorage.setItem('t2a_validateur', this.value);
    });
  }

  // Filters
  document.querySelectorAll('.filter-btn').forEach(function(btn) {
    btn.addEventListener('click', function() {
      document.querySelectorAll('.filter-btn').forEach(function(b) {
        b.classList.remove('active');
        b.style.background = '#fff';
        b.style.color = '#1e293b';
      });
      this.classList.add('active');
      this.style.background = '#1e293b';
      this.style.color = '#fff';

      const filter = this.dataset.filter;
      document.querySelectorAll('.validation-row').forEach(function(row) {
        if (filter === 'all' || row.dataset.statut === filter) {
          row.style.display = '';
        } else {
          row.style.display = 'none';
        }
      });
    });
  });

  // Style the initially active button
  const firstBtn = document.querySelector('.filter-btn.active');
  if (firstBtn) {
    firstBtn.style.background = '#1e293b';
    firstBtn.style.color = '#fff';
  }
})();
</script>
{% endblock %}
@@ -0,0 +1,243 @@
|
||||
{% extends "base.html" %}
|
||||
{% block title %}Métriques Validation DIM{% endblock %}
|
||||
|
||||
{% block sidebar %}
|
||||
{% for group_name, items in groups.items() %}
|
||||
<div class="group-title">{{ group_name | format_dossier_name }}</div>
|
||||
{% for item in items %}
|
||||
{% if 'fusionne' in item.name %}
|
||||
<a href="/dossier/{{ item.path_rel }}" class="sidebar-fusionne">★ Fusionné</a>
|
||||
{% else %}
|
||||
<a href="/dossier/{{ item.path_rel }}">{{ item.name | format_doc_name }}</a>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div style="display:flex;align-items:center;gap:1rem;margin-bottom:1.5rem;">
|
||||
<a href="/validation" class="back">Retour à la liste</a>
|
||||
<h2 style="margin:0;">Métriques de qualité</h2>
|
||||
</div>
|
||||
|
||||
{% if metrics.total_valides == 0 %}
|
||||
<div class="card" style="text-align:center;padding:2rem;">
|
||||
<p style="color:#64748b;font-size:0.9rem;">Aucun dossier validé pour le moment.</p>
|
||||
<p style="margin-top:0.5rem;font-size:0.85rem;color:#94a3b8;">
|
||||
Validez des dossiers depuis la <a href="/validation">liste de validation</a> pour voir les métriques.
|
||||
</p>
|
||||
</div>
|
||||
{% else %}
|
||||
|
||||
<!-- Progression -->
|
||||
<div class="card" style="margin-bottom:1.5rem;">
|
||||
<div style="display:flex;align-items:center;gap:1rem;margin-bottom:0.75rem;">
|
||||
<span style="font-weight:600;font-size:0.9rem;">Progression</span>
|
||||
<span style="font-size:0.85rem;color:#64748b;">{{ metrics.total_valides }} / {{ total_selection }} dossiers validés</span>
|
||||
</div>
|
||||
<div style="background:#e2e8f0;border-radius:9999px;height:12px;overflow:hidden;">
|
||||
{% set pct = (metrics.total_valides / total_selection * 100) if total_selection > 0 else 0 %}
|
||||
<div style="width:{{ pct }}%;background:#16a34a;transition:width 0.3s;height:100%;"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Métriques DP + DAS côte à côte -->
|
||||
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;margin-bottom:1.5rem;">
|
||||
|
||||
<!-- DP -->
|
||||
<div class="card">
|
||||
<h3>Diagnostic Principal</h3>
|
||||
<div class="info-grid" style="margin-top:0.75rem;">
|
||||
<div class="info-item">
|
||||
<label>Accuracy</label>
|
||||
<span style="font-size:1.3rem;font-weight:700;color:#16a34a;">{{ "%.1f" | format(metrics.dp.accuracy * 100) }}%</span>
|
||||
</div>
|
||||
<div class="info-item">
|
||||
<label>Correct</label>
|
||||
<span>{{ metrics.dp.correct }} / {{ metrics.dp.total }}</span>
|
||||
</div>
|
||||
<div class="info-item">
|
||||
<label>Modifié</label>
|
||||
<span style="color:#ca8a04;">{{ metrics.dp.modifie }}</span>
|
||||
</div>
|
||||
<div class="info-item">
|
||||
<label>Supprimé</label>
|
||||
<span style="color:#dc2626;">{{ metrics.dp.supprime }}</span>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Barre visuelle -->
|
||||
<div style="margin-top:1rem;display:flex;height:20px;border-radius:6px;overflow:hidden;">
|
||||
{% set dp_t = metrics.dp.total or 1 %}
|
||||
<div style="width:{{ metrics.dp.correct / dp_t * 100 }}%;background:#16a34a;" title="Correct"></div>
|
||||
<div style="width:{{ metrics.dp.modifie / dp_t * 100 }}%;background:#eab308;" title="Modifié"></div>
|
||||
<div style="width:{{ metrics.dp.supprime / dp_t * 100 }}%;background:#dc2626;" title="Supprimé"></div>
|
||||
</div>
|
||||
<div style="display:flex;gap:1rem;margin-top:0.35rem;font-size:0.7rem;color:#64748b;">
|
||||
<span><span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:#16a34a;margin-right:3px;"></span>Correct</span>
|
||||
<span><span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:#eab308;margin-right:3px;"></span>Modifié</span>
|
||||
<span><span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:#dc2626;margin-right:3px;"></span>Supprimé</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- DAS -->
|
||||
<div class="card">
|
||||
<h3>Diagnostics Associés</h3>
|
||||
<div class="info-grid" style="margin-top:0.75rem;">
|
||||
<div class="info-item">
|
||||
<label>Precision</label>
|
||||
<span style="font-size:1.3rem;font-weight:700;color:#1d4ed8;">{{ "%.1f" | format(metrics.das.precision * 100) }}%</span>
|
||||
</div>
|
||||
<div class="info-item">
|
||||
<label>Recall</label>
|
||||
<span style="font-size:1.3rem;font-weight:700;color:#7c3aed;">{{ "%.1f" | format(metrics.das.recall * 100) }}%</span>
|
||||
</div>
|
||||
<div class="info-item">
|
||||
<label>F1-score</label>
|
||||
<span style="font-size:1.3rem;font-weight:700;color:#0f172a;">{{ "%.1f" | format(metrics.das.f1 * 100) }}%</span>
|
||||
</div>
|
||||
<div class="info-item">
|
||||
<label>Hallucination</label>
|
||||
<span style="font-size:1.3rem;font-weight:700;color:#dc2626;">{{ "%.1f" | format(metrics.das.hallucination_rate * 100) }}%</span>
|
||||
</div>
|
||||
</div>
|
||||
<div style="margin-top:0.75rem;font-size:0.8rem;color:#64748b;">
|
||||
<div style="display:flex;gap:1.5rem;">
|
||||
<span>Pipeline : {{ metrics.das.total_pipeline }} DAS</span>
|
||||
<span>Référence DIM : {{ metrics.das.reference }}</span>
|
||||
<span style="color:#16a34a;">Correct : {{ metrics.das.correct }}</span>
|
||||
<span style="color:#ca8a04;">Modifié : {{ metrics.das.modifie }}</span>
|
||||
<span style="color:#dc2626;">Supprimé : {{ metrics.das.supprime }}</span>
|
||||
<span style="color:#7c3aed;">Ajouté : {{ metrics.das.ajoutes }}</span>
|
||||
</div>
|
||||
</div>
|
||||
<!-- Taux manqués -->
|
||||
<div style="margin-top:0.5rem;font-size:0.8rem;">
|
||||
<span style="color:#64748b;">Taux DAS manqués : </span>
|
||||
<span style="font-weight:600;color:#7c3aed;">{{ "%.1f" | format(metrics.das.miss_rate * 100) }}%</span>
|
||||
<span style="color:#94a3b8;font-size:0.75rem;"> ({{ metrics.das.ajoutes }} ajoutés / {{ metrics.das.reference }} référence)</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Ventilation par confiance -->
|
||||
{% if metrics.by_confidence %}
|
||||
<div class="card" style="margin-bottom:1rem;">
|
||||
<h3>Par niveau de confiance</h3>
|
||||
<table style="margin-top:0.75rem;">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Confiance</th>
|
||||
<th>Total DAS</th>
|
||||
<th>Correct</th>
|
||||
<th>Modifié</th>
|
||||
<th>Supprimé</th>
|
||||
<th>Precision</th>
|
||||
<th>Hallucination</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for conf, bucket in metrics.by_confidence.items() %}
|
||||
<tr>
|
||||
<td>{{ conf | confidence_badge }}</td>
|
||||
<td>{{ bucket.total }}</td>
|
||||
<td style="color:#16a34a;">{{ bucket.correct }}</td>
|
||||
<td style="color:#ca8a04;">{{ bucket.modifie }}</td>
|
||||
<td style="color:#dc2626;">{{ bucket.supprime }}</td>
|
||||
<td style="font-weight:600;">{{ "%.1f" | format(bucket.precision * 100) }}%</td>
|
||||
<td style="font-weight:600;color:#dc2626;">{{ "%.1f" | format(bucket.hallucination * 100) }}%</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- Ventilation par source -->
|
||||
{% if metrics.by_source %}
|
||||
<div class="card" style="margin-bottom:1rem;">
|
||||
<h3>Par source d'extraction</h3>
|
||||
<table style="margin-top:0.75rem;">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Source</th>
|
||||
<th>Total DAS</th>
|
||||
<th>Correct</th>
|
||||
<th>Modifié</th>
|
||||
<th>Supprimé</th>
|
||||
<th>Precision</th>
|
||||
<th>Hallucination</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for source, bucket in metrics.by_source.items() %}
|
||||
<tr>
|
||||
<td><span style="font-size:0.8rem;padding:2px 8px;border-radius:4px;background:#f1f5f9;font-weight:600;">{{ source }}</span></td>
|
||||
<td>{{ bucket.total }}</td>
|
||||
<td style="color:#16a34a;">{{ bucket.correct }}</td>
|
||||
<td style="color:#ca8a04;">{{ bucket.modifie }}</td>
|
||||
<td style="color:#dc2626;">{{ bucket.supprime }}</td>
|
||||
<td style="font-weight:600;">{{ "%.1f" | format(bucket.precision * 100) }}%</td>
|
||||
<td style="font-weight:600;color:#dc2626;">{{ "%.1f" | format(bucket.hallucination * 100) }}%</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- Top corrections DAS -->
|
||||
{% if metrics.top_corrections %}
|
||||
<div class="card" style="margin-bottom:1rem;">
|
||||
<h3>Top corrections DAS (code pipeline → code DIM)</h3>
|
||||
<table style="margin-top:0.75rem;">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Code pipeline</th>
|
||||
<th></th>
|
||||
<th>Code corrigé</th>
|
||||
<th>Occurrences</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for (code_from, code_to), count in metrics.top_corrections %}
|
||||
<tr>
|
||||
<td><span style="font-family:monospace;font-weight:600;color:#dc2626;">{{ code_from }}</span></td>
|
||||
<td style="color:#94a3b8;">→</td>
|
||||
<td><span style="font-family:monospace;font-weight:600;color:#16a34a;">{{ code_to }}</span></td>
|
||||
<td>{{ count }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- Top corrections DP -->
|
||||
{% if metrics.dp_corrections %}
|
||||
<div class="card" style="margin-bottom:1rem;">
|
||||
<h3>Top corrections DP (code pipeline → code DIM)</h3>
|
||||
<table style="margin-top:0.75rem;">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Code pipeline</th>
|
||||
<th></th>
|
||||
<th>Code corrigé</th>
|
||||
<th>Occurrences</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for (code_from, code_to), count in metrics.dp_corrections %}
|
||||
<tr>
|
||||
<td><span style="font-family:monospace;font-weight:600;color:#dc2626;">{{ code_from }}</span></td>
|
||||
<td style="color:#94a3b8;">→</td>
|
||||
<td><span style="font-family:monospace;font-weight:600;color:#16a34a;">{{ code_to }}</span></td>
|
||||
<td>{{ count }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% endif %}
|
||||
{% endblock %}
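
Note: the confidence column above relies on a custom "confidence_badge" Jinja filter that this commit does not define. A minimal sketch of how the viewer could register such a filter, assuming string levels like "haute"/"moyenne"/"basse" (the level names and colors are guesses, not the project's actual values):

# Hypothetical sketch only: the template's "conf | confidence_badge" filter
# must be registered on the Flask app; level names and colors are assumed.
from markupsafe import Markup, escape

BADGE_COLORS = {
    "haute": "#16a34a",    # assumed level names, not confirmed by this diff
    "moyenne": "#ca8a04",
    "basse": "#dc2626",
}

def confidence_badge(conf: str) -> Markup:
    """Render a confidence level as a small colored badge."""
    color = BADGE_COLORS.get(conf, "#64748b")  # neutral fallback color
    return Markup(
        f'<span style="padding:2px 8px;border-radius:4px;color:#fff;'
        f'background:{color};font-size:0.8rem;">{escape(conf)}</span>'
    )

# Registration (e.g. in the viewer's app factory):
# app.jinja_env.filters["confidence_badge"] = confidence_badge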
272
t2a_install_rag_cleanup/src/viewer/validation.py
Normal file
@@ -0,0 +1,272 @@
"""Gestionnaire de données pour la validation DIM (gold standard)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from ..config import BASE_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
GOLD_DIR = BASE_DIR / "data" / "gold_standard"
|
||||
|
||||
|
||||
class ValidationManager:
|
||||
"""Gère les annotations de validation DIM (fichiers JSON par dossier)."""
|
||||
|
||||
def __init__(self, gold_dir: Path | None = None):
|
||||
self.gold_dir = gold_dir or GOLD_DIR
|
||||
self.gold_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _annotation_path(self, dossier_id: str) -> Path:
|
||||
"""Chemin du fichier annotation pour un dossier donné."""
|
||||
safe_name = dossier_id.replace("/", "__") + ".json"
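        # e.g. "45_23183041/fusionne" -> <gold_dir>/45_23183041__fusionne.json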
        return self.gold_dir / safe_name

    # ------------------------------------------------------------------
    # CRUD
    # ------------------------------------------------------------------

    def load_annotation(self, dossier_id: str) -> dict | None:
        """Load the existing annotation for a case, if any."""
        path = self._annotation_path(dossier_id)
        if not path.exists():
            return None
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            logger.warning("Could not load annotation %s", path)
            return None

    def save_annotation(self, dossier_id: str, data: dict) -> None:
        """Atomically save an annotation (write to a temp file, then rename)."""
        path = self._annotation_path(dossier_id)
        data["dossier_id"] = dossier_id
        data.setdefault("date_validation", datetime.now().isoformat(timespec="seconds"))
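        # os.replace() is atomic on POSIX when the temp file and the target
        # live on the same filesystem; both are inside gold_dir here, so
        # concurrent readers never observe a half-written annotation.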

        fd, tmp_path = tempfile.mkstemp(
            dir=str(self.gold_dir), suffix=".tmp", prefix=".annot_"
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, path)
        except Exception:
            # Clean up the temp file on error
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

    def list_annotations(self) -> list[dict]:
        """List all annotations with their metadata (status, validator, date)."""
        results = []
        for path in sorted(self.gold_dir.glob("*.json")):
            if path.name.startswith("_"):
                continue
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                results.append({
                    "dossier_id": data.get("dossier_id", path.stem.replace("__", "/")),
                    "statut": data.get("statut", "non_commence"),
                    "validateur": data.get("validateur", ""),
                    "date_validation": data.get("date_validation", ""),
                })
            except Exception:
                logger.warning("Unreadable annotation: %s", path)
        return results

    def load_selection(self) -> list[str]:
        """Load the list of selected cases from _selection.json."""
        selection_path = self.gold_dir / "_selection.json"
        if not selection_path.exists():
            return []
        try:
            data = json.loads(selection_path.read_text(encoding="utf-8"))
            return data.get("dossiers", [])
        except Exception:
            return []

    # ------------------------------------------------------------------
    # Metrics
    # ------------------------------------------------------------------

    def compute_metrics(self, groups: dict) -> dict:
        """Compute precision, recall, F1 and hallucination rate from the annotations.

        Args:
            groups: result of scan_dossiers(), used to access pipeline data.

        Returns:
            Dictionary of global and broken-down metrics.
        """
        annotations = []
        for path in sorted(self.gold_dir.glob("*.json")):
            if path.name.startswith("_"):
                continue
            try:
                data = json.loads(path.read_text(encoding="utf-8"))
                if data.get("statut") == "valide":
                    annotations.append(data)
            except Exception:
                continue

        total = len(annotations)
        if total == 0:
            return {"total_valides": 0}

        # --- DP metrics ---
        dp_correct = 0
        dp_modifie = 0
        dp_supprime = 0
        dp_corrections: list[tuple[str, str]] = []  # (pipeline code, corrected code)

        # --- DAS metrics ---
        das_correct = 0
        das_modifie = 0
        das_supprime = 0
        das_ajoutes = 0
        das_total_pipeline = 0
        das_corrections: list[tuple[str, str]] = []

        # --- Breakdown by confidence level ---
        by_confidence: dict[str, dict] = {}
        # --- Breakdown by extraction source ---
        by_source: dict[str, dict] = {}

        for annot in annotations:
            dossier_id = annot.get("dossier_id", "")

            # Find the matching pipeline data
            pipeline_data = self._find_pipeline_data(dossier_id, groups)

            # DP
            dp = annot.get("dp", {})
            dp_statut = dp.get("statut", "correct")
            if dp_statut == "correct":
                dp_correct += 1
            elif dp_statut == "modifie":
                dp_modifie += 1
                code_orig = dp.get("code_pipeline", "")
                code_corr = dp.get("code_corrige", "")
                if code_orig and code_corr:
                    dp_corrections.append((code_orig, code_corr))
            elif dp_statut == "supprime":
                dp_supprime += 1

            # DAS
            das_list = annot.get("das", [])
            das_aj = annot.get("das_ajoutes", [])
            das_total_pipeline += len(das_list)
            das_ajoutes += len(das_aj)

            for das in das_list:
                das_statut = das.get("statut", "correct")
                conf = das.get("confidence", "")
                source = das.get("source", "")

                if das_statut == "correct":
                    das_correct += 1
                elif das_statut == "modifie":
                    das_modifie += 1
                    code_orig = das.get("code_pipeline", "")
                    code_corr = das.get("code_corrige", "")
                    if code_orig and code_corr:
                        das_corrections.append((code_orig, code_corr))
                elif das_statut == "supprime":
                    das_supprime += 1

                # Breakdown by confidence
                if conf:
                    bucket = by_confidence.setdefault(conf, {
                        "correct": 0, "modifie": 0, "supprime": 0, "total": 0
                    })
                    bucket["total"] += 1
                    bucket[das_statut] = bucket.get(das_statut, 0) + 1

                # Breakdown by source
                if source:
                    bucket = by_source.setdefault(source, {
                        "correct": 0, "modifie": 0, "supprime": 0, "total": 0
                    })
                    bucket["total"] += 1
                    bucket[das_statut] = bucket.get(das_statut, 0) + 1

        # --- Derived metrics ---
        # DAS reference = correct + modified + added (the true DAS per the DIM)
        das_reference = das_correct + das_modifie + das_ajoutes
        # Valid pipeline DAS = correct + modified (i.e. not deleted)
        das_pipeline_valides = das_correct + das_modifie

        precision = das_pipeline_valides / das_total_pipeline if das_total_pipeline > 0 else 0
        recall = das_pipeline_valides / das_reference if das_reference > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        hallucination_rate = das_supprime / das_total_pipeline if das_total_pipeline > 0 else 0
        miss_rate = das_ajoutes / das_reference if das_reference > 0 else 0
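        # Worked example (hypothetical counts): 80 correct, 10 modified,
        # 10 deleted, 5 added -> pipeline total = 100, valid = 90,
        # reference = 95; precision = 0.900, recall ~ 0.947, F1 ~ 0.923,
        # hallucination = 0.100, miss rate ~ 0.053.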

        # Top DAS corrections
        correction_counter = Counter(das_corrections)
        top_corrections = correction_counter.most_common(10)

        # Per-bucket precision and hallucination for each confidence level
        for bucket in by_confidence.values():
            t = bucket["total"]
            valides = bucket.get("correct", 0) + bucket.get("modifie", 0)
            bucket["precision"] = valides / t if t > 0 else 0
            bucket["hallucination"] = bucket.get("supprime", 0) / t if t > 0 else 0

        for bucket in by_source.values():
            t = bucket["total"]
            valides = bucket.get("correct", 0) + bucket.get("modifie", 0)
            bucket["precision"] = valides / t if t > 0 else 0
            bucket["hallucination"] = bucket.get("supprime", 0) / t if t > 0 else 0

        return {
            "total_valides": total,
            "dp": {
                "total": total,
                "correct": dp_correct,
                "modifie": dp_modifie,
                "supprime": dp_supprime,
                "accuracy": dp_correct / total if total > 0 else 0,
            },
            "das": {
                "total_pipeline": das_total_pipeline,
                "correct": das_correct,
                "modifie": das_modifie,
                "supprime": das_supprime,
                "ajoutes": das_ajoutes,
                "reference": das_reference,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "hallucination_rate": hallucination_rate,
                "miss_rate": miss_rate,
            },
            "by_confidence": by_confidence,
            "by_source": by_source,
            "top_corrections": top_corrections,
            "dp_corrections": Counter(dp_corrections).most_common(10),
        }

    def _find_pipeline_data(self, dossier_id: str, groups: dict) -> dict | None:
        """Find the pipeline data for a given dossier_id."""
        # dossier_id has the form "45_23183041/fusionne"
        parts = dossier_id.split("/")
        group_name = parts[0] if parts else ""
        items = groups.get(group_name, [])
        for item in items:
            if "fusionne" in item["name"]:
                return item
        return items[0] if items else None
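
For completeness, a minimal round-trip sketch of the manager (not part of the commit). The annotation payload only illustrates the keys compute_metrics() reads; the codes, confidence levels and sources are made-up values, and the empty groups dict stands in for a real scan_dossiers() result:

# Usage sketch only; payload values are illustrative, not real data.
from pathlib import Path
from src.viewer.validation import ValidationManager

mgr = ValidationManager(gold_dir=Path("/tmp/gold_standard"))
mgr.save_annotation("45_23183041/fusionne", {
    "statut": "valide",
    "validateur": "dim",
    "dp": {"statut": "correct"},
    "das": [
        {"statut": "correct", "confidence": "haute", "source": "ner"},
        {"statut": "modifie", "code_pipeline": "E11.9", "code_corrige": "E11.65",
         "confidence": "moyenne", "source": "llm"},
    ],
    "das_ajoutes": ["I10"],
})

print(mgr.load_annotation("45_23183041/fusionne")["statut"])  # "valide"
metrics = mgr.compute_metrics(groups={})  # empty stand-in for scan_dossiers()
print(metrics["das"]["precision"], metrics["das"]["recall"])   # 1.0, ~0.667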