chore: add .gitignore

This commit is contained in:
dom
2026-03-05 00:37:41 +01:00
parent 542797a124
commit 2578afb6ff
1716 changed files with 1905609 additions and 18 deletions

337
tools/gold_crh_annotator.py Normal file
View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""Annotateur gold CRH — outil CLI pour créer et valider le gold standard.
3 modes :
--bootstrap : Génère un CSV template depuis les JSON pipeline existants
--import-csv : Convertit un CSV rempli en JSONL gold + index JSON
--check : Valide un fichier JSONL gold (doublons, formats, stats)
Usage:
python tools/gold_crh_annotator.py --bootstrap --out data/gold_crh/gold_template.csv
python tools/gold_crh_annotator.py --import-csv data/gold_crh/gold_template.csv
python tools/gold_crh_annotator.py --check data/gold_crh/gold_crh.jsonl
"""
from __future__ import annotations
import argparse
import csv
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
OUTPUT_DIR = ROOT / "output" / "structured"
GOLD_DIR = ROOT / "data" / "gold_crh"
DEFAULT_JSONL = GOLD_DIR / "gold_crh.jsonl"
DEFAULT_INDEX = GOLD_DIR / "gold_crh_index.json"
# Column order of the annotation CSV: produced by --bootstrap and consumed by
# --import-csv. The two "dp_acceptable_*" columns hold "|"-separated lists;
# at most two evidence (section, excerpt) pairs are supported per case.
CSV_COLUMNS = [
"case_id",
"dp_expected_code",
"dp_expected_label",
"dp_acceptable_codes",
"dp_acceptable_family3",
"allow_symptom_dp",
"confidence",
"evidence_1_section",
"evidence_1_excerpt",
"evidence_2_section",
"evidence_2_excerpt",
"notes",
]
# ---------------------------------------------------------------------------
# Bootstrap : JSON pipeline → CSV template
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
    """Locate the pipeline CIM-10 JSON for a dossier.

    Prefers the fused file (``*fusionne_cim10.json``) and falls back to a
    plain ``*_cim10.json``. Returns None when the dossier directory does not
    exist or holds neither file.
    """
    dossier_dir = OUTPUT_DIR / dossier_id
    if not dossier_dir.exists():
        return None
    # First matching pattern wins; within a pattern, take the first glob hit
    # (same pick as the original list()[0]).
    for pattern in ("*fusionne_cim10.json", "*_cim10.json"):
        hit = next(dossier_dir.glob(pattern), None)
        if hit is not None:
            return hit
    return None
def bootstrap_template(out_path: Path, case_ids: list[str] | None = None) -> int:
    """Generate a pre-filled CSV template from existing pipeline JSON files.

    Args:
        out_path: Destination CSV path; parent directories are created.
        case_ids: Optional explicit dossier ids. When omitted, every dossier
            under OUTPUT_DIR with a merged CIM-10 JSON is included.

    Returns:
        Number of data rows written (header excluded).
    """
    if case_ids:
        dossier_ids = case_ids
    else:
        # Discover all dossiers that have a usable merged CIM-10 JSON.
        dossier_ids = sorted(
            d.name for d in OUTPUT_DIR.iterdir()
            if d.is_dir() and find_merged_json(d.name)
        )
    rows: list[dict] = []
    for did in dossier_ids:
        path = find_merged_json(did)
        if not path:
            continue
        try:
            data = json.loads(path.read_text("utf-8"))
        except (json.JSONDecodeError, OSError):
            continue  # unreadable/corrupt file: skip it, keep the batch going
        # BUG FIX: "diagnostic_principal" may be present but null in the JSON,
        # in which case .get(..., {}) returns None and dp.get() would raise
        # AttributeError. `or {}` covers both the missing and the null case.
        dp = data.get("diagnostic_principal") or {}
        dp_code = dp.get("cim10_final") or dp.get("cim10_suggestion") or ""
        dp_label = dp.get("texte", "")
        # Pre-fill with the pipeline DP as a suggestion for the annotator.
        rows.append({
            "case_id": did,
            "dp_expected_code": dp_code,
            "dp_expected_label": dp_label,
            "dp_acceptable_codes": "",
            "dp_acceptable_family3": dp_code[:3] if dp_code else "",
            "allow_symptom_dp": "false",
            "confidence": "probable",
            "evidence_1_section": "",
            "evidence_1_excerpt": "",
            "evidence_2_section": "",
            "evidence_2_excerpt": "",
            "notes": f"pipeline: {dp_code} ({dp.get('source', '?')})",
        })
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
        writer.writeheader()
        writer.writerows(rows)
    return len(rows)
# ---------------------------------------------------------------------------
# Import CSV → JSONL + index
# ---------------------------------------------------------------------------
def import_csv_to_jsonl(csv_path: Path, jsonl_path: Path | None = None,
                        index_path: Path | None = None) -> tuple[int, list[str]]:
    """Convert a filled-in annotation CSV into the gold JSONL plus a JSON index.

    Args:
        csv_path: Annotated CSV (columns per CSV_COLUMNS).
        jsonl_path: Output JSONL path (defaults to DEFAULT_JSONL).
        index_path: Output index path (defaults to DEFAULT_INDEX).

    Returns:
        (number_of_cases_written, messages). Fatal row problems (missing id,
        missing/invalid DP code) skip the row; soft problems (bad confidence,
        over-long excerpt/notes) are auto-corrected but still reported.
    """
    from src.eval.gold_models import is_valid_cim10_format
    jsonl_path = jsonl_path or DEFAULT_JSONL
    index_path = index_path or DEFAULT_INDEX
    if not csv_path.exists():
        return 0, [f"Fichier CSV introuvable: {csv_path}"]
    rows: list[dict] = []
    errors: list[str] = []
    # BUG FIX: csv.DictReader fills missing trailing fields with restval=None,
    # so `row.get(key, default)` returns None (not the default) for short rows
    # and `.strip()` would raise AttributeError. `(row.get(key) or default)`
    # handles both the missing-column and the short-row case.
    with open(csv_path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader, 2):  # data starts at line 2 (header = 1)
            case_id = (row.get("case_id") or "").strip()
            if not case_id:
                errors.append(f"ligne {i}: case_id vide")
                continue
            dp_code = (row.get("dp_expected_code") or "").strip().upper()
            dp_label = (row.get("dp_expected_label") or "").strip()
            if not dp_code:
                errors.append(f"ligne {i} ({case_id}): dp_expected_code obligatoire")
                continue
            if not is_valid_cim10_format(dp_code):
                errors.append(f"ligne {i} ({case_id}): code CIM-10 invalide: {dp_code}")
                continue
            # Parse "|"-separated lists.
            acceptable_codes = [
                c.strip().upper()
                for c in (row.get("dp_acceptable_codes") or "").split("|")
                if c.strip()
            ]
            for ac in acceptable_codes:
                if not is_valid_cim10_format(ac):
                    # Warning only: the invalid code is reported but kept.
                    errors.append(f"ligne {i} ({case_id}): code acceptable invalide: {ac}")
            acceptable_family3 = [
                fam.strip().upper()
                for fam in (row.get("dp_acceptable_family3") or "").split("|")
                if fam.strip()
            ]
            allow_symptom = (row.get("allow_symptom_dp") or "false").strip().lower() in (
                "true", "1", "yes", "oui"
            )
            confidence = (row.get("confidence") or "probable").strip().lower()
            if confidence not in ("certain", "probable", "ambiguous"):
                errors.append(f"ligne {i} ({case_id}): confidence invalide: {confidence}")
                confidence = "probable"  # soft fallback; the row is kept
            # Evidence: up to two (section, excerpt) pairs, both parts required;
            # excerpts are capped at 240 chars (reported, then truncated).
            evidence = []
            for idx in (1, 2):
                section = (row.get(f"evidence_{idx}_section") or "").strip()
                excerpt = (row.get(f"evidence_{idx}_excerpt") or "").strip()
                if section and excerpt:
                    if len(excerpt) > 240:
                        errors.append(
                            f"ligne {i} ({case_id}): evidence_{idx}_excerpt "
                            f"trop long ({len(excerpt)} > 240)"
                        )
                        excerpt = excerpt[:240]
                    evidence.append({"section": section, "excerpt": excerpt})
            notes = (row.get("notes") or "").strip()
            if len(notes) > 400:
                notes = notes[:400]  # silent cap, mirrors the annotation schema
            rows.append({
                "case_id": case_id,
                "document_type": "crh",
                "dp_expected": {"code": dp_code, "label": dp_label},
                "dp_acceptable_codes": acceptable_codes,
                "dp_acceptable_family3": acceptable_family3,
                "allow_symptom_dp": allow_symptom,
                "confidence": confidence,
                "evidence": evidence,
                "notes": notes,
            })
    if not rows:
        return 0, errors + ["Aucun cas valide trouvé dans le CSV"]
    # Write the JSONL (one gold case per line).
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for gold_case in rows:
            f.write(json.dumps(gold_case, ensure_ascii=False) + "\n")
    # Write the index (case_id -> 0-indexed line number in the JSONL).
    index = {case["case_id"]: pos for pos, case in enumerate(rows)}
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)
    return len(rows), errors
# ---------------------------------------------------------------------------
# Check : validation JSONL
# ---------------------------------------------------------------------------
def check_gold(jsonl_path: Path) -> tuple[dict, list[str]]:
    """Validate a gold JSONL file; return (stats, error messages)."""
    from src.eval.gold_models import load_gold_jsonl
    try:
        cases = load_gold_jsonl(jsonl_path)
    except (FileNotFoundError, ValueError) as exc:
        # File missing or malformed: no stats, just the loader's message.
        return {}, [str(exc)]
    errors: list[str] = []
    # Duplicate case_ids: each re-occurrence is reported against the
    # previously seen position.
    last_pos: dict[str, int] = {}
    for pos, case in enumerate(cases):
        if case.case_id in last_pos:
            errors.append(
                f"case_id dupliqué: {case.case_id} "
                f"(lignes {last_pos[case.case_id] + 1} et {pos + 1})"
            )
        last_pos[case.case_id] = pos

    def tally(predicate) -> int:
        # Count cases for which `predicate` is truthy.
        return sum(1 for case in cases if predicate(case))

    stats = {
        "total": len(cases),
        "certain": tally(lambda c: c.confidence == "certain"),
        "probable": tally(lambda c: c.confidence == "probable"),
        "ambiguous": tally(lambda c: c.confidence == "ambiguous"),
        "allow_symptom_dp": tally(lambda c: c.allow_symptom_dp),
        "with_evidence": tally(lambda c: c.evidence),
        "with_acceptable_codes": tally(lambda c: c.dp_acceptable_codes),
        "with_family3": tally(lambda c: c.dp_acceptable_family3),
    }
    # Missing / suspicious fields.
    for case in cases:
        if not case.dp_expected.label:
            errors.append(f"{case.case_id}: dp_expected.label vide")
        if not case.evidence:
            errors.append(f"{case.case_id}: pas d'evidence (recommandé)")
    return stats, errors
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: dispatch --bootstrap / --import-csv / --check.

    Exits 1 when no mode is selected, when --check loads no cases, or when
    --check finds validation errors.
    """
    parser = argparse.ArgumentParser(
        description="Annotateur gold CRH — bootstrap / import / check"
    )
    parser.add_argument("--bootstrap", action="store_true",
                        help="Générer un CSV template depuis les JSON pipeline")
    parser.add_argument("--import-csv", type=str, metavar="CSV",
                        help="Importer un CSV rempli → JSONL gold")
    parser.add_argument("--check", type=str, metavar="JSONL",
                        help="Vérifier un fichier JSONL gold")
    parser.add_argument("--out", type=str, default=str(GOLD_DIR / "gold_template.csv"),
                        help="Chemin de sortie (bootstrap)")
    parser.add_argument("--cases", type=str, default="",
                        help="IDs de cas séparés par virgule (bootstrap)")
    args = parser.parse_args()
    # No mode selected: show usage and fail.
    if not any([args.bootstrap, args.import_csv, args.check]):
        parser.print_help()
        sys.exit(1)
    if args.bootstrap:
        case_ids = [c.strip() for c in args.cases.split(",") if c.strip()] if args.cases else None
        out = Path(args.out)
        n = bootstrap_template(out, case_ids)
        print(f"Template généré : {out} ({n} cas)")
        print(f" Colonnes : {', '.join(CSV_COLUMNS)}")
        print(f" Remplir le CSV puis : python tools/gold_crh_annotator.py --import-csv {out}")
    if args.import_csv:
        csv_path = Path(args.import_csv)
        n, errors = import_csv_to_jsonl(csv_path)
        if errors:
            # Import problems are warnings: valid rows are still written.
            print(f"ATTENTION — {len(errors)} avertissement(s) :")
            for e in errors[:20]:
                print(f" {e}")
        print(f"\nImporté : {n} cas → {DEFAULT_JSONL}")
        print(f"Index : {DEFAULT_INDEX}")
    if args.check:
        jsonl_path = Path(args.check)
        stats, errors = check_gold(jsonl_path)
        if errors:
            print(f"ERREURS ({len(errors)}) :")
            for e in errors[:20]:
                print(f" {e}")
            print()
        if stats:
            print(f"Gold CRH — {stats['total']} cas")
            print(f" Confiance : certain={stats['certain']}, "
                  f"probable={stats['probable']}, ambiguous={stats['ambiguous']}")
            print(f" allow_symptom_dp : {stats['allow_symptom_dp']}")
            print(f" Avec evidence : {stats['with_evidence']}")
            print(f" Avec codes alt : {stats['with_acceptable_codes']}")
            print(f" Avec family3 : {stats['with_family3']}")
        else:
            print("Aucun cas chargé.")
            sys.exit(1)
        if not errors:
            print("\nOK — aucune erreur détectée")
        else:
            # BUG FIX: --check used to exit 0 even when it printed errors
            # (as long as cases loaded), so CI treated an invalid gold file
            # as valid. Fail explicitly when validation errors were found.
            sys.exit(1)


if __name__ == "__main__":
    main()