164 lines
5.3 KiB
Python
164 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Régénération ciblée des contrôles CPAM classés Tier C ou sans response_data.
|
|
|
|
Usage :
|
|
cd /home/dom/ai/t2a_v2
|
|
.venv/bin/python3 scripts/regenerate_tier_c.py [--dry-run]
|
|
|
|
Le script :
|
|
1. Scanne output/structured/ pour trouver les contrôles Tier C + ceux sans response_data
|
|
2. Pour chaque contrôle, relance generate_cpam_response() avec le pipeline corrigé
|
|
3. Sauvegarde le JSON mis à jour (backup automatique .bak)
|
|
|
|
Options :
|
|
--dry-run Affiche les contrôles ciblés sans régénérer
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import shutil
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Put the project root (two directory levels above this file) at the front of
# sys.path. This must happen before the ``src`` imports below so that they
# can be resolved when the file is launched as a script.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
|
|
|
|
from src.config import DossierMedical
|
|
from src.control.cpam_response import generate_cpam_response
|
|
|
|
# Configure the root logger: INFO level, each record prefixed with an
# HH:MM:SS timestamp and a level name left-aligned to 7 characters.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# Directory whose sub-directories are scanned for ``*_fusionne_cim10.json``.
STRUCTURED_DIR = ROOT / "output" / "structured"
|
|
|
|
|
|
def find_targets() -> list[tuple[Path, int]]:
    """Locate the CPAM controls that need to be regenerated.

    Every sub-directory of ``STRUCTURED_DIR`` is scanned for files matching
    ``*_fusionne_cim10.json``. Within each file, an entry of the
    ``controles_cpam`` list is selected when its ``quality_tier`` is ``"C"``
    or when its ``response_data`` is absent or null.

    The scan is best-effort: a missing ``STRUCTURED_DIR`` yields an empty
    list, and a file that cannot be read or parsed is logged and skipped so
    that one bad dossier does not abort the whole run. Directories and files
    are visited in sorted order, which makes the result deterministic.

    Returns:
        List of ``(json_path, index_in_controles_cpam)`` pairs.
    """
    targets: list[tuple[Path, int]] = []

    if not STRUCTURED_DIR.is_dir():
        logger.warning("Répertoire introuvable : %s", STRUCTURED_DIR)
        return targets

    for sub in sorted(STRUCTURED_DIR.iterdir()):
        if not sub.is_dir():
            continue
        for jf in sorted(sub.glob("*_fusionne_cim10.json")):
            try:
                data = json.loads(jf.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both invalid JSON and invalid UTF-8.
                logger.exception("Fichier illisible, ignoré : %s", jf)
                continue

            if not isinstance(data, dict):
                logger.warning("Structure JSON inattendue, ignoré : %s", jf)
                continue

            # ``or []`` also covers an explicit ``"controles_cpam": null``.
            controles = data.get("controles_cpam") or []
            for i, ctrl in enumerate(controles):
                is_tier_c = ctrl.get("quality_tier") == "C"
                lacks_response = ctrl.get("response_data") is None
                if is_tier_c or lacks_response:
                    targets.append((jf, i))

    return targets
|
|
|
|
|
|
def regenerate(targets: list[tuple[Path, int]]) -> dict[str, int]:
    """Regenerate the targeted CPAM controls and save the updated dossiers.

    Targets are grouped by JSON file so that each dossier is loaded and
    written at most once. For every targeted control,
    ``generate_cpam_response(dossier, ctrl)`` is called and its three results
    are stored on the control (``contre_argumentation``, ``response_data``,
    ``sources_reponse``). A dossier with at least one successful regeneration
    is copied to ``<name>.json.bak`` and then rewritten.

    Failures are isolated and counted rather than propagated:

    * a dossier that cannot be read, parsed or validated counts one error per
      targeted control and is skipped;
    * a control whose regeneration raises counts one error, and the remaining
      controls of the dossier are still processed.

    Args:
        targets: ``(json_path, index_in_controles_cpam)`` pairs.

    Returns:
        Counters keyed by the tier obtained after regeneration (``"A"``,
        ``"B"``, ``"C"``, or ``"?"`` when the control has no tier) plus
        ``"errors"``.
    """
    stats = {"A": 0, "B": 0, "C": 0, "errors": 0}

    # Group by file so each dossier is loaded and saved only once.
    by_file: dict[Path, list[int]] = {}
    for path, idx in targets:
        by_file.setdefault(path, []).append(idx)

    total = len(targets)
    done = 0

    for json_path, indices in by_file.items():
        dossier_id = json_path.parent.name
        logger.info("=== Dossier %s (%d contrôle(s) à régénérer) ===", dossier_id, len(indices))

        # Load the dossier. A failure here must not abort the whole batch, so
        # it is handled like a per-control failure: logged and counted.
        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
            dossier = DossierMedical.model_validate(data)
        except Exception:
            logger.exception(" ERREUR au chargement de %s", json_path)
            stats["errors"] += len(indices)
            done += len(indices)  # keep the [done/total] progress accurate
            continue

        modified = False

        for idx in indices:
            ctrl = dossier.controles_cpam[idx]
            done += 1
            old_tier = ctrl.quality_tier or "?"
            logger.info("[%d/%d] OGC %d — %s (ancien tier: %s)",
                        done, total, ctrl.numero_ogc, ctrl.titre[:60], old_tier)

            # perf_counter is monotonic, so the duration cannot be skewed by
            # system clock adjustments during a long generation.
            t0 = time.perf_counter()
            try:
                text, response_data, sources = generate_cpam_response(dossier, ctrl)
                elapsed = time.perf_counter() - t0

                ctrl.contre_argumentation = text
                ctrl.response_data = response_data
                ctrl.sources_reponse = sources

                # quality_tier is read again after the assignments above.
                # NOTE(review): presumably it is derived from response_data
                # by the model - confirm in src.config.
                new_tier = ctrl.quality_tier or "?"
                stats[new_tier] = stats.get(new_tier, 0) + 1
                modified = True

                logger.info(" Résultat : tier %s → %s (%d chars, %.1fs)",
                            old_tier, new_tier, len(text), elapsed)
            except Exception:
                logger.exception(" ERREUR sur OGC %d", ctrl.numero_ogc)
                stats["errors"] += 1

        if modified:
            # Backup first, then write to a sibling temp file and rename it
            # over the original, so an interruption mid-write cannot leave a
            # truncated dossier in place.
            backup_path = json_path.with_suffix(".json.bak")
            shutil.copy2(json_path, backup_path)
            tmp_path = json_path.with_suffix(".json.tmp")
            tmp_path.write_text(
                dossier.model_dump_json(indent=2, exclude_none=True),
                encoding="utf-8",
            )
            tmp_path.replace(json_path)
            logger.info(" Sauvegardé : %s (backup: %s)", json_path.name, backup_path.name)

    return stats
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: list the targeted controls, then regenerate.

    The targets returned by ``find_targets()`` are listed one per line. With
    ``--dry-run`` on the command line the function stops after that listing;
    otherwise it calls ``regenerate()`` and logs the total duration and the
    resulting tier distribution.
    """
    dry_run = "--dry-run" in sys.argv

    logger.info("Recherche des contrôles Tier C et sans response_data...")
    targets = find_targets()

    if not targets:
        logger.info("Aucun contrôle à régénérer.")
        return

    logger.info("Trouvé %d contrôle(s) à régénérer :", len(targets))
    # Parse each file once, even when several of its controls are targeted.
    loaded: dict[Path, dict] = {}
    for path, idx in targets:
        data = loaded.get(path)
        if data is None:
            data = json.loads(path.read_text(encoding="utf-8"))
            loaded[path] = data
        ctrl = data["controles_cpam"][idx]
        # Same conventions as the selection and regeneration steps: a null
        # tier is shown as "?", and response_data counts as present unless it
        # is absent or null.
        tier = ctrl.get("quality_tier") or "?"
        has_resp = "oui" if ctrl.get("response_data") is not None else "NON"
        # %s with .get(): this display-only line must not fail on a control
        # that has no numero_ogc.
        logger.info(" %s OGC %s — tier %s, response_data: %s",
                    path.parent.name, ctrl.get("numero_ogc", "?"), tier, has_resp)

    if dry_run:
        logger.info("Mode dry-run — aucune régénération effectuée.")
        return

    t0 = time.perf_counter()
    stats = regenerate(targets)
    elapsed = time.perf_counter() - t0

    logger.info("=== TERMINÉ en %.1f min ===", elapsed / 60)
    logger.info("Distribution : A=%d, B=%d, C=%d, erreurs=%d",
                stats.get("A", 0), stats.get("B", 0), stats.get("C", 0), stats["errors"])
|
|
|
|
|
|
# Run the regeneration only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|