chore: add .gitignore

This commit is contained in:
dom
2026-03-05 00:37:41 +01:00
parent 542797a124
commit 2578afb6ff
1716 changed files with 1905609 additions and 18 deletions

337
tools/gold_crh_annotator.py Normal file
View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""Annotateur gold CRH — outil CLI pour créer et valider le gold standard.
3 modes :
--bootstrap : Génère un CSV template depuis les JSON pipeline existants
--import-csv : Convertit un CSV rempli en JSONL gold + index JSON
--check : Valide un fichier JSONL gold (doublons, formats, stats)
Usage:
python tools/gold_crh_annotator.py --bootstrap --out data/gold_crh/gold_template.csv
python tools/gold_crh_annotator.py --import-csv data/gold_crh/gold_template.csv
python tools/gold_crh_annotator.py --check data/gold_crh/gold_crh.jsonl
"""
from __future__ import annotations
import argparse
import csv
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
OUTPUT_DIR = ROOT / "output" / "structured"
GOLD_DIR = ROOT / "data" / "gold_crh"
DEFAULT_JSONL = GOLD_DIR / "gold_crh.jsonl"
DEFAULT_INDEX = GOLD_DIR / "gold_crh_index.json"
# Column order of the annotation CSV: produced by --bootstrap and consumed by
# --import-csv. The two "dp_acceptable_*" columns hold "|"-separated lists;
# at most two evidence (section, excerpt) pairs are supported per case.
CSV_COLUMNS = [
"case_id",
"dp_expected_code",
"dp_expected_label",
"dp_acceptable_codes",
"dp_acceptable_family3",
"allow_symptom_dp",
"confidence",
"evidence_1_section",
"evidence_1_excerpt",
"evidence_2_section",
"evidence_2_excerpt",
"notes",
]
# ---------------------------------------------------------------------------
# Bootstrap : JSON pipeline → CSV template
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
    """Locate the pipeline CIM-10 JSON for a dossier.

    Prefers the fused file (``*fusionne_cim10.json``) and falls back to a
    plain ``*_cim10.json``. Returns None when the dossier directory does not
    exist or holds neither file.
    """
    dossier_dir = OUTPUT_DIR / dossier_id
    if not dossier_dir.exists():
        return None
    # First matching pattern wins; within a pattern, take the first glob hit
    # (same pick as the original list()[0]).
    for pattern in ("*fusionne_cim10.json", "*_cim10.json"):
        hit = next(dossier_dir.glob(pattern), None)
        if hit is not None:
            return hit
    return None
def bootstrap_template(out_path: Path, case_ids: list[str] | None = None) -> int:
    """Generate a pre-filled CSV template from existing pipeline JSON files.

    Args:
        out_path: Destination CSV path; parent directories are created.
        case_ids: Optional explicit dossier ids. When omitted, every dossier
            under OUTPUT_DIR with a merged CIM-10 JSON is included.

    Returns:
        Number of data rows written (header excluded).
    """
    if case_ids:
        dossier_ids = case_ids
    else:
        # Discover all dossiers that have a usable merged CIM-10 JSON.
        dossier_ids = sorted(
            d.name for d in OUTPUT_DIR.iterdir()
            if d.is_dir() and find_merged_json(d.name)
        )
    rows: list[dict] = []
    for did in dossier_ids:
        path = find_merged_json(did)
        if not path:
            continue
        try:
            data = json.loads(path.read_text("utf-8"))
        except (json.JSONDecodeError, OSError):
            continue  # unreadable/corrupt file: skip it, keep the batch going
        # BUG FIX: "diagnostic_principal" may be present but null in the JSON,
        # in which case .get(..., {}) returns None and dp.get() would raise
        # AttributeError. `or {}` covers both the missing and the null case.
        dp = data.get("diagnostic_principal") or {}
        dp_code = dp.get("cim10_final") or dp.get("cim10_suggestion") or ""
        dp_label = dp.get("texte", "")
        # Pre-fill with the pipeline DP as a suggestion for the annotator.
        rows.append({
            "case_id": did,
            "dp_expected_code": dp_code,
            "dp_expected_label": dp_label,
            "dp_acceptable_codes": "",
            "dp_acceptable_family3": dp_code[:3] if dp_code else "",
            "allow_symptom_dp": "false",
            "confidence": "probable",
            "evidence_1_section": "",
            "evidence_1_excerpt": "",
            "evidence_2_section": "",
            "evidence_2_excerpt": "",
            "notes": f"pipeline: {dp_code} ({dp.get('source', '?')})",
        })
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
        writer.writeheader()
        writer.writerows(rows)
    return len(rows)
# ---------------------------------------------------------------------------
# Import CSV → JSONL + index
# ---------------------------------------------------------------------------
def import_csv_to_jsonl(csv_path: Path, jsonl_path: Path | None = None,
                        index_path: Path | None = None) -> tuple[int, list[str]]:
    """Convert a filled-in annotation CSV into the gold JSONL plus a JSON index.

    Args:
        csv_path: Annotated CSV (columns per CSV_COLUMNS).
        jsonl_path: Output JSONL path (defaults to DEFAULT_JSONL).
        index_path: Output index path (defaults to DEFAULT_INDEX).

    Returns:
        (number_of_cases_written, messages). Fatal row problems (missing id,
        missing/invalid DP code) skip the row; soft problems (bad confidence,
        over-long excerpt/notes) are auto-corrected but still reported.
    """
    from src.eval.gold_models import is_valid_cim10_format
    jsonl_path = jsonl_path or DEFAULT_JSONL
    index_path = index_path or DEFAULT_INDEX
    if not csv_path.exists():
        return 0, [f"Fichier CSV introuvable: {csv_path}"]
    rows: list[dict] = []
    errors: list[str] = []
    # BUG FIX: csv.DictReader fills missing trailing fields with restval=None,
    # so `row.get(key, default)` returns None (not the default) for short rows
    # and `.strip()` would raise AttributeError. `(row.get(key) or default)`
    # handles both the missing-column and the short-row case.
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader, 2):  # data starts at line 2 (header = 1)
            case_id = (row.get("case_id") or "").strip()
            if not case_id:
                errors.append(f"ligne {i}: case_id vide")
                continue
            dp_code = (row.get("dp_expected_code") or "").strip().upper()
            dp_label = (row.get("dp_expected_label") or "").strip()
            if not dp_code:
                errors.append(f"ligne {i} ({case_id}): dp_expected_code obligatoire")
                continue
            if not is_valid_cim10_format(dp_code):
                errors.append(f"ligne {i} ({case_id}): code CIM-10 invalide: {dp_code}")
                continue
            # Parse "|"-separated lists.
            acceptable_codes = [
                c.strip().upper()
                for c in (row.get("dp_acceptable_codes") or "").split("|")
                if c.strip()
            ]
            for ac in acceptable_codes:
                if not is_valid_cim10_format(ac):
                    # Warning only: the invalid code is reported but kept.
                    errors.append(f"ligne {i} ({case_id}): code acceptable invalide: {ac}")
            acceptable_family3 = [
                fam.strip().upper()
                for fam in (row.get("dp_acceptable_family3") or "").split("|")
                if fam.strip()
            ]
            allow_symptom = (row.get("allow_symptom_dp") or "false").strip().lower() in (
                "true", "1", "yes", "oui"
            )
            confidence = (row.get("confidence") or "probable").strip().lower()
            if confidence not in ("certain", "probable", "ambiguous"):
                errors.append(f"ligne {i} ({case_id}): confidence invalide: {confidence}")
                confidence = "probable"  # soft fallback; the row is kept
            # Evidence: up to two (section, excerpt) pairs, both parts required;
            # excerpts are capped at 240 chars (reported, then truncated).
            evidence = []
            for idx in (1, 2):
                section = (row.get(f"evidence_{idx}_section") or "").strip()
                excerpt = (row.get(f"evidence_{idx}_excerpt") or "").strip()
                if section and excerpt:
                    if len(excerpt) > 240:
                        errors.append(
                            f"ligne {i} ({case_id}): evidence_{idx}_excerpt "
                            f"trop long ({len(excerpt)} > 240)"
                        )
                        excerpt = excerpt[:240]
                    evidence.append({"section": section, "excerpt": excerpt})
            notes = (row.get("notes") or "").strip()
            if len(notes) > 400:
                notes = notes[:400]  # silent cap, mirrors the annotation schema
            rows.append({
                "case_id": case_id,
                "document_type": "crh",
                "dp_expected": {"code": dp_code, "label": dp_label},
                "dp_acceptable_codes": acceptable_codes,
                "dp_acceptable_family3": acceptable_family3,
                "allow_symptom_dp": allow_symptom,
                "confidence": confidence,
                "evidence": evidence,
                "notes": notes,
            })
    if not rows:
        return 0, errors + ["Aucun cas valide trouvé dans le CSV"]
    # Write the JSONL (one gold case per line).
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for gold_case in rows:
            f.write(json.dumps(gold_case, ensure_ascii=False) + "\n")
    # Write the index (case_id -> 0-indexed line number in the JSONL).
    index = {case["case_id"]: pos for pos, case in enumerate(rows)}
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)
    return len(rows), errors
# ---------------------------------------------------------------------------
# Check : validation JSONL
# ---------------------------------------------------------------------------
def check_gold(jsonl_path: Path) -> tuple[dict, list[str]]:
    """Validate a gold JSONL file; return (stats, error messages)."""
    from src.eval.gold_models import load_gold_jsonl
    try:
        cases = load_gold_jsonl(jsonl_path)
    except (FileNotFoundError, ValueError) as exc:
        # File missing or malformed: no stats, just the loader's message.
        return {}, [str(exc)]
    errors: list[str] = []
    # Duplicate case_ids: each re-occurrence is reported against the
    # previously seen position.
    last_pos: dict[str, int] = {}
    for pos, case in enumerate(cases):
        if case.case_id in last_pos:
            errors.append(
                f"case_id dupliqué: {case.case_id} "
                f"(lignes {last_pos[case.case_id] + 1} et {pos + 1})"
            )
        last_pos[case.case_id] = pos

    def tally(predicate) -> int:
        # Count cases for which `predicate` is truthy.
        return sum(1 for case in cases if predicate(case))

    stats = {
        "total": len(cases),
        "certain": tally(lambda c: c.confidence == "certain"),
        "probable": tally(lambda c: c.confidence == "probable"),
        "ambiguous": tally(lambda c: c.confidence == "ambiguous"),
        "allow_symptom_dp": tally(lambda c: c.allow_symptom_dp),
        "with_evidence": tally(lambda c: c.evidence),
        "with_acceptable_codes": tally(lambda c: c.dp_acceptable_codes),
        "with_family3": tally(lambda c: c.dp_acceptable_family3),
    }
    # Missing / suspicious fields.
    for case in cases:
        if not case.dp_expected.label:
            errors.append(f"{case.case_id}: dp_expected.label vide")
        if not case.evidence:
            errors.append(f"{case.case_id}: pas d'evidence (recommandé)")
    return stats, errors
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: dispatch --bootstrap / --import-csv / --check.

    Exits 1 when no mode is selected, when --check loads no cases, or when
    --check finds validation errors.
    """
    parser = argparse.ArgumentParser(
        description="Annotateur gold CRH — bootstrap / import / check"
    )
    parser.add_argument("--bootstrap", action="store_true",
                        help="Générer un CSV template depuis les JSON pipeline")
    parser.add_argument("--import-csv", type=str, metavar="CSV",
                        help="Importer un CSV rempli → JSONL gold")
    parser.add_argument("--check", type=str, metavar="JSONL",
                        help="Vérifier un fichier JSONL gold")
    parser.add_argument("--out", type=str, default=str(GOLD_DIR / "gold_template.csv"),
                        help="Chemin de sortie (bootstrap)")
    parser.add_argument("--cases", type=str, default="",
                        help="IDs de cas séparés par virgule (bootstrap)")
    args = parser.parse_args()
    # No mode selected: show usage and fail.
    if not any([args.bootstrap, args.import_csv, args.check]):
        parser.print_help()
        sys.exit(1)
    if args.bootstrap:
        case_ids = [c.strip() for c in args.cases.split(",") if c.strip()] if args.cases else None
        out = Path(args.out)
        n = bootstrap_template(out, case_ids)
        print(f"Template généré : {out} ({n} cas)")
        print(f" Colonnes : {', '.join(CSV_COLUMNS)}")
        print(f" Remplir le CSV puis : python tools/gold_crh_annotator.py --import-csv {out}")
    if args.import_csv:
        csv_path = Path(args.import_csv)
        n, errors = import_csv_to_jsonl(csv_path)
        if errors:
            # Import problems are warnings: valid rows are still written.
            print(f"ATTENTION — {len(errors)} avertissement(s) :")
            for e in errors[:20]:
                print(f" {e}")
        print(f"\nImporté : {n} cas → {DEFAULT_JSONL}")
        print(f"Index : {DEFAULT_INDEX}")
    if args.check:
        jsonl_path = Path(args.check)
        stats, errors = check_gold(jsonl_path)
        if errors:
            print(f"ERREURS ({len(errors)}) :")
            for e in errors[:20]:
                print(f" {e}")
            print()
        if stats:
            print(f"Gold CRH — {stats['total']} cas")
            print(f" Confiance : certain={stats['certain']}, "
                  f"probable={stats['probable']}, ambiguous={stats['ambiguous']}")
            print(f" allow_symptom_dp : {stats['allow_symptom_dp']}")
            print(f" Avec evidence : {stats['with_evidence']}")
            print(f" Avec codes alt : {stats['with_acceptable_codes']}")
            print(f" Avec family3 : {stats['with_family3']}")
        else:
            print("Aucun cas chargé.")
            sys.exit(1)
        if not errors:
            print("\nOK — aucune erreur détectée")
        else:
            # BUG FIX: --check used to exit 0 even when it printed errors
            # (as long as cases loaded), so CI treated an invalid gold file
            # as valid. Fail explicitly when validation errors were found.
            sys.exit(1)


if __name__ == "__main__":
    main()