test: couvrir les modules purs du pipeline (96 nouveaux tests)
Suite de tests unitaires pour tous les modules pipeline qui ne dépendent pas du VLM — utiles pour garantir la non-régression après refactor et servir de spec vivante de chaque fonction. Fichiers : - tests/test_json_utils.py (20 tests) : parse_json_output + toutes les stratégies de récupération (fences, virgules manquantes, boucles vides, fermeture JSON, fallback _raw/_parse_error) - tests/test_deskew.py (11 tests) : détection Hough + correction, image synthétique + fixtures cache réel - tests/test_checkboxes.py (17 tests) : parse_ghs_injustifie, dark_ratio, inner_frac, et ground truth visuel sur 17 dossiers (mapping hash→OGC résolu au runtime pour éviter les constantes fragiles) - tests/test_validation.py (18 tests) : _check_cim10/ccam/ghm/ghs, cross-checks GHM↔GHS, annotate sur JSON vide et complet, préservation de l'input (copie défensive) - tests/test_schema.py (8 tests) : clean_dossier retire les champs debug, préserve les champs métier, compacte la validation, ne modifie pas l'input - tests/test_zones_config.py (8 tests) : load/save round-trip, merge avec defaults, résilience JSON corrompu, get_zone Total : 107 tests, 5.1 s d'exécution, tous passent. Aucune dépendance GPU, s'exécutent en CI. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
118
tests/test_schema.py
Normal file
118
tests/test_schema.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Tests unitaires pour pipeline.schema (nettoyage JSON)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pipeline.schema import (
|
||||
CLEAN_FIELDS_RECUEIL,
|
||||
DEBUG_FIELDS,
|
||||
SCHEMA_VERSION,
|
||||
clean_dossier,
|
||||
)
|
||||
|
||||
|
||||
def _sample_raw():
|
||||
"""Un JSON pipeline type, riche en champs debug."""
|
||||
return {
|
||||
"fichier": "OGC 7",
|
||||
"pdf_hash": "abc123",
|
||||
"pages": [{"page": 1, "type": "recueil"}],
|
||||
"extraction": {
|
||||
"recueil": {
|
||||
"etablissement": "CLINIQUE X",
|
||||
"finess": "330780206",
|
||||
"ghm_etab": "11M122",
|
||||
"ghs_etab": "4323",
|
||||
"codage_etab": {"dp": "K650"},
|
||||
"accord_desaccord": "accord",
|
||||
"_checkbox_debug": {"ratio_accord": 0.38, "ratio_desaccord": 0.19},
|
||||
"_parse_error": "whatever",
|
||||
"_truncated_loop": True,
|
||||
"_crop_recodage": {"dp": "K650", "_source": "crop"},
|
||||
"_validation": {
|
||||
"summary": {"valid": 3, "invalid": 0, "empty": 2, "total_codes": 3},
|
||||
"cross_checks": {
|
||||
"etab": {"checked": True, "coherent": True},
|
||||
"reco": {"checked": False, "reason": "ghm manquant"},
|
||||
},
|
||||
"codage_etab": {
|
||||
"dp": {"code": "K650", "valid": True, "libelle_ref": "Péritonite"},
|
||||
"dr": {"code": "", "valid": None},
|
||||
"das": [],
|
||||
},
|
||||
"codage_reco": {"dp": {}, "dr": {}, "das": []},
|
||||
"ghm_etab": {"code": "11M122", "valid": True,
|
||||
"ghs_possibles": ["4323"]},
|
||||
"ghs_etab": {"code": "4323", "valid": True},
|
||||
"ghm_reco": {"code": "", "valid": None},
|
||||
"ghs_reco": {"code": "", "valid": None},
|
||||
},
|
||||
},
|
||||
"concertation_2": {
|
||||
"ghs_initial": "4323",
|
||||
"ghs_final": "4323",
|
||||
"decision": "retour_groupage_dim",
|
||||
"date_concertation": "13/03/2018",
|
||||
},
|
||||
},
|
||||
"_meta": {"pipeline_version": "v2", "ocr_model": "Qwen/Qwen2.5-VL-3B-Instruct"},
|
||||
}
|
||||
|
||||
|
||||
class TestCleanDossier:
    """Unit tests for ``clean_dossier``, run against the ``_sample_raw`` fixture."""

    def test_retourne_schema_version(self):
        """The cleaned output carries the current ``SCHEMA_VERSION``."""
        out = clean_dossier(_sample_raw())
        assert out["schema_version"] == SCHEMA_VERSION

    def test_retire_tous_les_champs_debug(self):
        """No field listed in ``DEBUG_FIELDS`` may remain in the clean output."""
        out = clean_dossier(_sample_raw())
        rec = out["extraction"]["recueil"]
        for debug_field in DEBUG_FIELDS:
            assert debug_field not in rec, \
                f"{debug_field} devrait être retiré"

    def test_garde_les_champs_metier(self):
        """Business fields of the recueil section are kept in the clean output."""
        out = clean_dossier(_sample_raw())
        rec = out["extraction"]["recueil"]
        for f in ["etablissement", "finess", "ghm_etab", "ghs_etab",
                  "codage_etab", "accord_desaccord"]:
            assert f in rec, f"{f} doit être présent dans clean"

    def test_validation_compactee(self):
        """The validation report is kept, but in a compact format."""
        out = clean_dossier(_sample_raw())
        v = out["extraction"]["recueil"]["_validation"]
        # The summary is kept as is.
        assert v["summary"]["valid"] == 3
        # Cross-checks are compacted down to the "coherent" boolean (or None).
        assert v["cross_checks"] == {
            "etab_ghm_ghs_coherent": True,
            "reco_ghm_ghs_coherent": None,
        }
        # Validated codes keep their libelle_ref when one is available.
        assert v["codage_etab"]["dp"]["valid"] is True
        assert v["codage_etab"]["dp"].get("libelle_ref") == "Péritonite"

    def test_concertation_2_conservee(self):
        """The ``concertation_2`` section is carried over to the clean output."""
        out = clean_dossier(_sample_raw())
        c2 = out["extraction"]["concertation_2"]
        assert c2["ghs_initial"] == "4323"
        assert c2["decision"] == "retour_groupage_dim"

    def test_champs_inconnus_ignorés(self):
        """A recueil field unknown to the clean schema is dropped."""
        raw = _sample_raw()
        raw["extraction"]["recueil"]["champ_inventé"] = "poubelle"
        out = clean_dossier(raw)
        assert "champ_inventé" not in out["extraction"]["recueil"]

    def test_meta_preservee(self):
        """The ``_meta`` section is preserved in the clean output."""
        out = clean_dossier(_sample_raw())
        assert out["_meta"]["pipeline_version"] == "v2"
        assert "Qwen" in out["_meta"]["ocr_model"]

    def test_pas_de_modification_input(self):
        """The function must not modify its input, at any nesting level."""
        import copy

        raw = _sample_raw()
        # A deep snapshot of the *whole* input is required: a shallow
        # ``dict.copy()`` shares nested dicts/lists with ``raw``, so an
        # in-place mutation of e.g. ``_validation`` would alter both sides
        # of the comparison and go undetected.
        before = copy.deepcopy(raw)
        clean_dossier(raw)
        assert raw == before
|
||||
Reference in New Issue
Block a user