# Aivanov_scan_ogc/bench_v11_vs_legacy.py

"""Comparaison systématique V1.1 vs legacy sur les 18 dossiers."""
import json
import os
from pathlib import Path

# Fields compared between the two extractions, as (V1.1 path, legacy path)
# pairs. Paths are dotted keys into the "recueil" dict. Both pipelines
# currently use the same path for every field, so each pair is built from a
# single name; spell a pair out explicitly if the two schemas ever diverge.
FIELDS_RECUEIL = [
    (path, path)
    for path in (
        "etablissement",
        "finess",
        "date_debut_controle",
        "n_ogc",
        "n_champ",
        "dates_sejour",
        "codage_etab.dp",
        "codage_etab.dp_libelle",
        "codage_etab.dr",
        "codage_reco.dp",
        "ghm_etab",
        "ghs_etab",
        "ghm_reco",
        "ghs_reco",
        "recodage_impactant",
        "ghs_injustifie",
        "accord_desaccord",
        "praticien_conseil",
    )
]


def get(d, path):
    """Return the value at a dotted *path* in nested dicts as a stripped string.

    Args:
        d: The root object, normally a dict loaded from JSON.
        path: Dot-separated keys, e.g. ``"codage_etab.dp"``.

    Returns:
        ``str(value).strip()`` for the value found at *path*, or ``""`` when
        a key is missing, an intermediate node is not a dict, or the value
        is ``None``.
    """
    for k in path.split("."):
        if not isinstance(d, dict):
            return ""
        d = d.get(k)
        # A JSON null must count as "nothing extracted"; without this check
        # str(None) would yield the truthy string "None".
        if d is None:
            return ""
    return str(d).strip()


def count_das(d, path):
    """Return the length of the list found at a dotted *path* in nested dicts.

    Yields 0 when a key is missing, when an intermediate node is not a
    dict, or when the value reached is not a list.
    """
    node = d
    for key in path.split("."):
        if not isinstance(node, dict):
            # Nothing further can be resolved below a non-dict node.
            return 0
        node = node.get(key, [])
    if isinstance(node, list):
        return len(node)
    return 0


def load_pairs(legacy_dir="output", v11_dir=None):
    """Load the matching legacy / V1.1 result files and return their recueil dicts.

    Every ``OGC *.json`` file in *legacy_dir* is paired with the file of the
    same name in *v11_dir*; legacy files without a counterpart are skipped.

    Args:
        legacy_dir: Directory holding the legacy JSON results.
        v11_dir: Directory holding the V1.1 JSON results. Defaults to the
            ``v2`` sub-directory of *legacy_dir*.

    Returns:
        A list of ``(stem, legacy_recueil, v11_recueil)`` tuples sorted by
        file path, where the legacy dict is read from ``recueil.parsed`` and
        the V1.1 dict from ``extraction.recueil``; a missing or empty section
        yields ``{}``.
    """
    legacy_dir = Path(legacy_dir)
    v11_dir = legacy_dir / "v2" if v11_dir is None else Path(v11_dir)

    pairs = []
    for legacy_path in sorted(legacy_dir.glob("OGC *.json")):
        v11_path = v11_dir / legacy_path.name
        if not v11_path.exists():
            continue
        # Read explicitly as UTF-8 (the encoding JSON interchange uses) so
        # accented text does not depend on the platform's locale encoding.
        with open(legacy_path, encoding="utf-8") as fh:
            legacy = json.load(fh)
        with open(v11_path, encoding="utf-8") as fh:
            v11 = json.load(fh)
        rec_legacy = (legacy.get("recueil") or {}).get("parsed") or {}
        rec_v11 = (v11.get("extraction") or {}).get("recueil") or {}
        pairs.append((legacy_path.stem, rec_legacy, rec_v11))
    return pairs


def _new_tally():
    """Return a zeroed set of counters for one compared field."""
    return {"both": 0, "v11_only": 0, "leg_only": 0, "match": 0, "diff": 0}


def _record(tally, legacy_present, v11_present, same):
    """Update *tally* for one dossier; return True when both sides extracted but differ."""
    if not legacy_present and not v11_present:
        return False
    if not v11_present:
        tally["leg_only"] += 1
        return False
    if not legacy_present:
        tally["v11_only"] += 1
        return False
    tally["both"] += 1
    if same:
        tally["match"] += 1
        return False
    tally["diff"] += 1
    return True


def _print_diff(name, field, legacy_value, v11_value):
    """Print one detail row, aligned with the per-dossier header columns."""
    print(f"{name:10s}  {field:28s}  {legacy_value:30s}  {v11_value:30s}")


def bench():
    """Compare legacy and V1.1 extractions field by field and print a report.

    For every pair returned by ``load_pairs()`` and every field of
    ``FIELDS_RECUEIL`` (plus the number of ``codage_etab.das`` entries), the
    function counts whether only one side extracted a value, or both did and
    the values match or differ. ``codage_etab.dp_libelle`` matches when one
    string contains the other; every other field requires equality.

    The report has three parts: one detail row per differing field (so the
    disagreements can be reviewed), a per-field summary table, and a legend.
    Nothing is returned.
    """
    pairs = load_pairs()
    print(f"\n{'Dossier':10s}  {'Champ':28s}  {'legacy':30s}  {'v1.1':30s}")
    print("="*110)

    totals = {fk: _new_tally() for fk, _ in FIELDS_RECUEIL}
    totals["das_etab"] = _new_tally()

    for name, leg, v11 in pairs:
        # Simple fields: compared as stripped strings.
        for fk, lk in FIELDS_RECUEIL:
            vl = get(leg, lk)
            vv = get(v11, fk)
            if fk == "codage_etab.dp_libelle":
                # Labels may be truncated on one side: containment is enough.
                same = vl in vv or vv in vl
            else:
                same = vl == vv
            if _record(totals[fk], bool(vl), bool(vv), same):
                _print_diff(name, fk, vl, vv)

        # DAS: only the number of entries is compared, not their content.
        cle = count_das(leg, "codage_etab.das")
        cv1 = count_das(v11, "codage_etab.das")
        if _record(totals["das_etab"], bool(cle), bool(cv1), cle == cv1):
            _print_diff(name, "das_etab", str(cle), str(cv1))

    print(f"\n{'Champ':28s}  {'match':>6s}  {'diff':>5s}  {'v11+':>5s}  {'leg+':>5s}  {'both':>5s}")
    print("-"*70)
    order = [f for f, _ in FIELDS_RECUEIL] + ["das_etab"]
    for fk in order:
        t = totals[fk]
        print(f"  {fk:28s}  {t['match']:>6d}  {t['diff']:>5d}  "
              f"{t['v11_only']:>5d}  {t['leg_only']:>5d}  {t['both']:>5d}")

    print("\nLégende :")
    print("  match  = les deux extraient la même valeur")
    print("  diff   = les deux extraient mais des valeurs différentes (à arbitrer)")
    print("  v11+   = V1.1 extrait, legacy vide")
    print("  leg+   = legacy extrait, V1.1 vide")
    print("  both   = nb dossiers où les deux ont extrait qqch (match+diff)")


# Script entry point: run the comparison only when the file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    bench()