"""Unit tests for pipeline.schema (JSON cleaning)."""

from __future__ import annotations

import copy

from pipeline.schema import (
    CLEAN_FIELDS_RECUEIL,
    DEBUG_FIELDS,
    SCHEMA_VERSION,
    clean_dossier,
)


def _sample_raw() -> dict:
    """Return a typical raw pipeline JSON document, rich in debug fields.

    A fresh dict is built on every call, so each test can mutate its own
    copy without affecting the others.
    """
    return {
        "fichier": "OGC 7",
        "pdf_hash": "abc123",
        "pages": [{"page": 1, "type": "recueil"}],
        "extraction": {
            "recueil": {
                "etablissement": "CLINIQUE X",
                "finess": "330780206",
                "ghm_etab": "11M122",
                "ghs_etab": "4323",
                "codage_etab": {"dp": "K650"},
                "accord_desaccord": "accord",
                "_checkbox_debug": {
                    "ratio_accord": 0.38,
                    "ratio_desaccord": 0.19,
                },
                "_parse_error": "whatever",
                "_truncated_loop": True,
                "_crop_recodage": {"dp": "K650", "_source": "crop"},
                "_validation": {
                    "summary": {
                        "valid": 3,
                        "invalid": 0,
                        "empty": 2,
                        "total_codes": 3,
                    },
                    "cross_checks": {
                        "etab": {"checked": True, "coherent": True},
                        "reco": {"checked": False, "reason": "ghm manquant"},
                    },
                    "codage_etab": {
                        "dp": {
                            "code": "K650",
                            "valid": True,
                            "libelle_ref": "Péritonite",
                        },
                        "dr": {"code": "", "valid": None},
                        "das": [],
                    },
                    "codage_reco": {"dp": {}, "dr": {}, "das": []},
                    "ghm_etab": {
                        "code": "11M122",
                        "valid": True,
                        "ghs_possibles": ["4323"],
                    },
                    "ghs_etab": {"code": "4323", "valid": True},
                    "ghm_reco": {"code": "", "valid": None},
                    "ghs_reco": {"code": "", "valid": None},
                },
            },
            "concertation_2": {
                "ghs_initial": "4323",
                "ghs_final": "4323",
                "decision": "retour_groupage_dim",
                "date_concertation": "13/03/2018",
            },
        },
        "_meta": {
            "pipeline_version": "v2",
            "ocr_model": "Qwen/Qwen2.5-VL-3B-Instruct",
        },
    }


class TestCleanDossier:
    """Checks on the cleaned document returned by ``clean_dossier``."""

    def test_retourne_schema_version(self):
        """The cleaned output carries the current schema version."""
        out = clean_dossier(_sample_raw())
        assert out["schema_version"] == SCHEMA_VERSION

    def test_retire_tous_les_champs_debug(self):
        """No field listed in DEBUG_FIELDS may remain in the clean output."""
        out = clean_dossier(_sample_raw())
        rec = out["extraction"]["recueil"]
        for debug_field in DEBUG_FIELDS:
            assert debug_field not in rec, (
                f"{debug_field} devrait être retiré"
            )

    def test_garde_les_champs_metier(self):
        """Business fields of the recueil survive the cleaning."""
        out = clean_dossier(_sample_raw())
        rec = out["extraction"]["recueil"]
        for f in [
            "etablissement",
            "finess",
            "ghm_etab",
            "ghs_etab",
            "codage_etab",
            "accord_desaccord",
        ]:
            assert f in rec, f"{f} doit être présent dans clean"

    def test_validation_compactee(self):
        """Validation data is kept, but in a compact format."""
        out = clean_dossier(_sample_raw())
        v = out["extraction"]["recueil"]["_validation"]
        # The summary is kept as is.
        assert v["summary"]["valid"] == 3
        # cross_checks are compacted: only the `coherent` boolean (or None).
        assert v["cross_checks"] == {
            "etab_ghm_ghs_coherent": True,
            "reco_ghm_ghs_coherent": None,
        }
        # Validated codes keep libelle_ref when it is available.
        assert v["codage_etab"]["dp"]["valid"] is True
        assert v["codage_etab"]["dp"].get("libelle_ref") == "Péritonite"

    def test_concertation_2_conservee(self):
        """The concertation_2 section is carried over to the clean output."""
        out = clean_dossier(_sample_raw())
        c2 = out["extraction"]["concertation_2"]
        assert c2["ghs_initial"] == "4323"
        assert c2["decision"] == "retour_groupage_dim"

    def test_champs_inconnus_ignorés(self):
        """A field that is not in CLEAN_FIELDS_RECUEIL is dropped."""
        raw = _sample_raw()
        raw["extraction"]["recueil"]["champ_inventé"] = "poubelle"
        out = clean_dossier(raw)
        assert "champ_inventé" not in out["extraction"]["recueil"]

    def test_meta_preservee(self):
        """The _meta section is preserved in the clean output."""
        out = clean_dossier(_sample_raw())
        assert out["_meta"]["pipeline_version"] == "v2"
        assert "Qwen" in out["_meta"]["ocr_model"]

    def test_pas_de_modification_input(self):
        """clean_dossier must not mutate its input, at any nesting depth."""
        raw = _sample_raw()
        # A deep snapshot is required: a shallow copy would share the nested
        # dicts/lists with `raw`, so an in-place mutation of e.g. `_validation`
        # would alter both sides of the comparison and go unnoticed.
        before = copy.deepcopy(raw)
        clean_dossier(raw)
        assert raw == before