t2a_v2/scripts_t2a_v2/regenerate_tier_c.py

#!/usr/bin/env python3
"""Régénération ciblée des contrôles CPAM classés Tier C ou sans response_data.

Usage :
    cd /home/dom/ai/t2a_v2
    .venv/bin/python3 scripts/regenerate_tier_c.py [--dry-run]

Le script :
1. Scanne output/structured/ pour trouver les contrôles Tier C + ceux sans response_data
2. Pour chaque contrôle, relance generate_cpam_response() avec le pipeline corrigé
3. Sauvegarde le JSON mis à jour (backup automatique .bak)

Options :
    --dry-run   Affiche les contrôles ciblés sans régénérer
"""

from __future__ import annotations

import argparse
import json
import logging
import shutil
import sys
import time
from pathlib import Path

# Put the project root (the parent of this script's directory) at the front of
# sys.path. This must run before the `src` imports below, which presumably
# resolve against that directory — verify against the repository layout.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from src.config import DossierMedical
from src.control.cpam_response import generate_cpam_response

# Configure the root logger at import time: INFO level, one line per record
# with a HH:MM:SS timestamp and a left-aligned, 7-character level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# Directory scanned by find_targets(); each sub-directory is treated as one
# dossier (its name is used as the dossier identifier in the logs).
STRUCTURED_DIR = ROOT / "output" / "structured"


def find_targets(structured_dir: Path | None = None) -> list[tuple[Path, int]]:
    """Locate the CPAM controls that need to be regenerated.

    Every immediate sub-directory of the structured-output directory is
    searched for ``*_fusionne_cim10.json`` files. Within each file, a control
    from the ``controles_cpam`` list is selected when its ``quality_tier`` is
    ``"C"`` or when its ``response_data`` is missing or null.

    Directories and files are visited in sorted order so the result is
    deterministic. A file that cannot be read or parsed is logged and
    skipped instead of aborting the whole scan.

    Args:
        structured_dir: Directory to scan. Defaults to the module-level
            ``STRUCTURED_DIR``.

    Returns:
        List of ``(json_path, index_of_the_control_in_controles_cpam)``.
        Empty when the directory does not exist.
    """
    base_dir = STRUCTURED_DIR if structured_dir is None else structured_dir
    if not base_dir.is_dir():
        logger.warning("Répertoire introuvable : %s", base_dir)
        return []

    targets: list[tuple[Path, int]] = []

    for sub in sorted(base_dir.iterdir()):
        if not sub.is_dir():
            continue
        # glob() yields entries in filesystem order; sort for reproducible runs.
        for jf in sorted(sub.glob("*_fusionne_cim10.json")):
            try:
                data = json.loads(jf.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both JSONDecodeError and UnicodeDecodeError.
                logger.exception("Fichier illisible, ignoré : %s", jf)
                continue

            # Tolerate a missing key, an explicit null, or a non-object root.
            controles = data.get("controles_cpam") if isinstance(data, dict) else None
            for i, ctrl in enumerate(controles or []):
                is_tier_c = ctrl.get("quality_tier") == "C"
                lacks_response = ctrl.get("response_data") is None
                if is_tier_c or lacks_response:
                    targets.append((jf, i))

    return targets


def regenerate(targets: list[tuple[Path, int]]) -> dict[str, int]:
    """Regenerate the CPAM response of every targeted control.

    Targets are grouped by JSON file so each dossier is loaded and saved at
    most once. For each control, ``generate_cpam_response(dossier, ctrl)`` is
    called and its three results are stored on the control
    (``contre_argumentation``, ``response_data``, ``sources_reponse``).
    A file is rewritten only if at least one of its controls was regenerated;
    the previous content is first copied to ``<name>.json.bak`` (overwriting
    any earlier backup).

    Failures are isolated so one bad dossier or control does not abort the
    batch: a file that cannot be read, parsed or validated, an index that no
    longer exists, and any exception raised while generating are logged and
    counted under ``"errors"``.

    Args:
        targets: ``(json_path, control_index)`` pairs, as produced by
            ``find_targets()``.

    Returns:
        Counters keyed by the resulting tier (``"A"``, ``"B"``, ``"C"``, or
        any other value read back from the control, ``"?"`` when empty) plus
        ``"errors"``.
    """
    stats = {"A": 0, "B": 0, "C": 0, "errors": 0}
    # Group by file so each dossier is loaded/saved only once.
    by_file: dict[Path, list[int]] = {}
    for path, idx in targets:
        by_file.setdefault(path, []).append(idx)

    total = len(targets)
    done = 0

    for json_path, indices in by_file.items():
        dossier_id = json_path.parent.name
        logger.info("=== Dossier %s (%d contrôle(s) à régénérer) ===", dossier_id, len(indices))

        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
            dossier = DossierMedical.model_validate(data)
        except (OSError, ValueError):
            # ValueError covers JSON/Unicode decode errors; presumably also the
            # model's validation error (pydantic's ValidationError subclasses
            # ValueError) — confirm DossierMedical is a pydantic model.
            logger.exception("  ERREUR de chargement de %s — %d contrôle(s) ignoré(s)",
                             json_path, len(indices))
            stats["errors"] += len(indices)
            done += len(indices)
            continue

        controles = dossier.controles_cpam or []
        modified = False

        for idx in indices:
            done += 1
            # The file may have changed since the scan that produced `targets`.
            if not 0 <= idx < len(controles):
                logger.error("  ERREUR : index %d hors limites dans %s", idx, json_path.name)
                stats["errors"] += 1
                continue

            ctrl = controles[idx]
            old_tier = ctrl.quality_tier or "?"
            logger.info("[%d/%d] OGC %d — %s (ancien tier: %s)",
                        done, total, ctrl.numero_ogc, ctrl.titre[:60], old_tier)

            started = time.perf_counter()  # monotonic: safe for durations
            try:
                text, response_data, sources = generate_cpam_response(dossier, ctrl)
                elapsed = time.perf_counter() - started

                ctrl.contre_argumentation = text
                ctrl.response_data = response_data
                ctrl.sources_reponse = sources

                # NOTE(review): the new tier is read back from the control;
                # this assumes quality_tier is refreshed by the call or the
                # assignments above — confirm against the model definition.
                new_tier = ctrl.quality_tier or "?"
            except Exception:
                logger.exception("  ERREUR sur OGC %d", ctrl.numero_ogc)
                stats["errors"] += 1
            else:
                # Counted only once the control was fully updated, so a
                # control is never tallied as both a tier and an error.
                stats[new_tier] = stats.get(new_tier, 0) + 1
                modified = True
                logger.info("  Résultat : tier %s → %s (%d chars, %.1fs)",
                            old_tier, new_tier, len(text or ""), elapsed)

        if modified:
            # Serialize first: if it fails, neither the file nor its backup
            # is touched.
            payload = dossier.model_dump_json(indent=2, exclude_none=True)
            backup_path = json_path.with_suffix(".json.bak")
            shutil.copy2(json_path, backup_path)
            json_path.write_text(payload, encoding="utf-8")
            logger.info("  Sauvegardé : %s (backup: %s)", json_path.name, backup_path.name)

    return stats


def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: list the targeted controls, then regenerate.

    The targets returned by ``find_targets()`` are always listed. Unless
    ``--dry-run`` is given, they are then passed to ``regenerate()`` and the
    resulting tier distribution is logged.

    Options are parsed with argparse so that an unknown or misspelled option
    (e.g. ``--dryrun``) aborts with a usage error instead of being ignored
    and silently starting a real regeneration.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Régénération ciblée des contrôles CPAM classés Tier C ou sans response_data.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Affiche les contrôles ciblés sans régénérer",
    )
    args = parser.parse_args(argv)

    logger.info("Recherche des contrôles Tier C et sans response_data...")
    targets = find_targets()

    if not targets:
        logger.info("Aucun contrôle à régénérer.")
        return

    logger.info("Trouvé %d contrôle(s) à régénérer :", len(targets))
    # Parse each file once, even when several of its controls are targeted.
    parsed: dict[Path, dict] = {}
    for path, idx in targets:
        if path not in parsed:
            parsed[path] = json.loads(path.read_text(encoding="utf-8"))
        ctrl = parsed[path]["controles_cpam"][idx]
        tier = ctrl.get("quality_tier") or "?"
        # Same criterion as find_targets(): only a missing/null value counts
        # as "no response".
        has_resp = "oui" if ctrl.get("response_data") is not None else "NON"
        logger.info("  %s OGC %s — tier %s, response_data: %s",
                    path.parent.name, ctrl.get("numero_ogc", "?"), tier, has_resp)

    if args.dry_run:
        logger.info("Mode dry-run — aucune régénération effectuée.")
        return

    started = time.perf_counter()  # monotonic: safe for durations
    stats = regenerate(targets)
    elapsed = time.perf_counter() - started

    logger.info("=== TERMINÉ en %.1f min ===", elapsed / 60)
    logger.info("Distribution : A=%d, B=%d, C=%d, erreurs=%d",
                stats.get("A", 0), stats.get("B", 0), stats.get("C", 0), stats["errors"])


# Run the command-line entry point only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()