refactor: réorganisation référentiels, nouveaux modules extraction, nettoyage code obsolète

- Réorganisation data/referentiels/ : pdfs/, dicts/, user/ (structure unifiée)
- Fix badges "Source absente" sur page admin référentiels
- Ré-indexation COCOA 2025 (555 → 1451 chunks, couverture 94%)
- Fix VRAM OOM : embeddings forcés CPU via T2A_EMBED_CPU
- Nouveaux modules : document_router, docx_extractor, image_extractor, ocr_engine
- Module complétude (quality/completude.py + config YAML)
- Template DIM (synthèse dimensionnelle)
- Gunicorn config + systemd service t2a-viewer
- Suppression t2a_install_rag_cleanup/ (copie obsolète)
- Suppression scripts/ et scripts_t2a_v2/ (anciens benchmarks)
- Suppression 81 fichiers _doc.txt de test
- Cache Ollama : TTL configurable, corrections loader YAML
- Dashboard : améliorations templates (base, index, detail, cpam, validation)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-03-07 16:48:10 +01:00
parent 2578afb6ff
commit 4e2b4bd946
210 changed files with 6939 additions and 22104 deletions

View File

@@ -8,9 +8,13 @@ Orchestrateur principal — délègue aux sous-modules :
from __future__ import annotations
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from ..config import ControleCPAM, DossierMedical, RAGSource, rule_enabled
from ..config import ControleCPAM, DossierMedical, RAGSource, STRUCTURED_DIR, rule_enabled
from ..medical.ollama_client import call_anthropic, call_ollama
from ..prompts import CPAM_EXTRACTION
@@ -50,6 +54,70 @@ from .cpam_validation import _CIM10_CODE_RE, _validate_adversarial as _validate_
logger = logging.getLogger(__name__)
def _save_version(
    dossier: DossierMedical,
    controle: ControleCPAM,
) -> None:
    """Snapshot the current CPAM argumentation before it is regenerated.

    The snapshot is written as JSON to
    ``STRUCTURED_DIR/<dossier dir>/_cpam_versions/<ogc>_<timestamp>_v<n>.json``
    where ``<n>`` is the number of snapshots already present for this OGC
    plus one.

    The dossier directory is looked up by comparing the stem of each source
    file (spaces replaced by underscores) with the names of the direct
    sub-directories of ``STRUCTURED_DIR``. For each source file, an exact
    name match is preferred; otherwise the first directory (in name order)
    whose name contains the stem is used.

    The function returns without writing anything when:
    - ``controle`` has neither ``contre_argumentation`` nor ``response_data``;
    - ``STRUCTURED_DIR`` is not a directory;
    - no sub-directory matches any source file.

    Args:
        dossier: Dossier whose ``source_files`` / ``source_file`` are used to
            locate the structured output directory.
        controle: Control whose current argumentation fields are saved.
    """
    if not controle.contre_argumentation and not controle.response_data:
        return  # nothing to version

    if not STRUCTURED_DIR.is_dir():
        logger.debug("Versioning : STRUCTURED_DIR inexistant, skip")
        return

    # iterdir() yields entries in arbitrary order; sort by name so the
    # directory picked below does not depend on the filesystem.
    structured_dirs = sorted(
        (d for d in STRUCTURED_DIR.iterdir() if d.is_dir()),
        key=lambda d: d.name,
    )

    # Candidate source files: the list first, then the single source_file
    # if it is not already part of the list.
    candidates = list(dossier.source_files or [])
    if dossier.source_file and dossier.source_file not in candidates:
        candidates.append(dossier.source_file)

    dossier_dir = None
    for src in candidates:
        if not src:
            continue
        src_stem = Path(src).stem.replace(" ", "_")
        if not src_stem:
            # An empty stem is a substring of every name and would match
            # an unrelated directory.
            continue
        # Exact match first, substring match as a fallback.
        dossier_dir = next(
            (d for d in structured_dirs if d.name == src_stem), None
        )
        if dossier_dir is None:
            dossier_dir = next(
                (d for d in structured_dirs if src_stem in d.name), None
            )
        if dossier_dir is not None:
            break

    if dossier_dir is None:
        logger.debug("Versioning : pas de dossier structuré trouvé, skip")
        return

    versions_dir = dossier_dir / "_cpam_versions"
    versions_dir.mkdir(exist_ok=True)

    # The version number is derived from how many snapshots already exist
    # for this OGC.
    existing = sorted(versions_dir.glob(f"{controle.numero_ogc}_*.json"))
    version_num = len(existing) + 1
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{controle.numero_ogc}_{timestamp}_v{version_num}.json"

    version_data = {
        "numero_ogc": controle.numero_ogc,
        "version": version_num,
        "timestamp": timestamp,
        "contre_argumentation": controle.contre_argumentation,
        "response_data": controle.response_data,
        "quality_tier": controle.quality_tier,
        "validation_dim": controle.validation_dim,
    }
    (versions_dir / filename).write_text(
        json.dumps(version_data, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    logger.info(" Version %d sauvegardée : %s", version_num, filename)
def _extraction_pass(
dossier: DossierMedical,
controle: ControleCPAM,
@@ -121,6 +189,9 @@ def generate_cpam_response(
logger.info("CPAM : génération contre-argumentation pour OGC %d%s",
controle.numero_ogc, controle.titre)
# 0. Versioning — sauvegarder la version précédente avant d'écraser
_save_version(dossier, controle)
# 1. Passe 1 — Extraction structurée (compréhension avant argumentation)
extraction = _extraction_pass(dossier, controle)
degraded_pass1 = extraction is None
@@ -137,12 +208,12 @@ def generate_cpam_response(
prompt, tag_map = _build_cpam_prompt(dossier, controle, sources, extraction)
# 4. Appel LLM — Ollama (rôle cpam) > Haiku fallback
result = call_ollama(prompt, temperature=0.1, max_tokens=16000, role="cpam")
result = call_ollama(prompt, temperature=0.1, max_tokens=8000, role="cpam")
if result is not None:
logger.info(" Contre-argumentation via Ollama")
else:
logger.info(" Ollama indisponible → fallback Anthropic Haiku")
result = call_anthropic(prompt, temperature=0.1, max_tokens=16000)
result = call_anthropic(prompt, temperature=0.1, max_tokens=8000)
if result is not None:
logger.info(" Contre-argumentation via Anthropic Haiku")
@@ -213,8 +284,8 @@ def generate_cpam_response(
if adversarial_warnings:
adversarial_warnings.append(f"Score de confiance : {score}/10")
# 8b. Boucle de correction (max 2 retries)
max_corrections = 2
# 8b. Boucle de correction (configurable via T2A_CPAM_MAX_CORRECTIONS, défaut 2)
max_corrections = int(os.environ.get("T2A_CPAM_MAX_CORRECTIONS", "2"))
for attempt in range(max_corrections):
if not (validation
and not validation.get("coherent", True)