feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)

Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 20:44:32 +01:00
commit f70d138db3
13 changed files with 1699 additions and 0 deletions
--- a/extractor/validator.py
+++ b/extractor/validator.py
@@ -0,0 +1,148 @@
+"""
+Validation of extracted data.
+Checks formats and consistency, and reports anomalies.
+Applies a safety-net auto-fix through the normalizer before validating.
+"""
+import re
+import logging
+from dataclasses import dataclass, field
+
+from config import DECISION_VALUES, TYPE_DESACCORD_VALUES
+from extractor.normalizer import normalize_extraction
+
+logger = logging.getLogger(__name__)
+
+# Medical code patterns, anchored at both ends of the candidate string
+CIM10_PATTERN = re.compile(r'^[A-Z]\d{2}(?:\.\d{1,2})?$')  # 1 uppercase letter, 2 digits, optional "." + 1-2 digits
+CCAM_PATTERN = re.compile(r'^[A-Z]{4}\d{3}(?:-\d)?$')  # 4 uppercase letters, 3 digits, optional "-" + 1 digit
+
+
+@dataclass
+class ValidationResult:
+    """Result of validating a single extraction."""
+    is_valid: bool
+    warnings: list[str]  # warning messages collected during validation
+    errors: list[str]  # error messages collected during validation
+    fixes: list[str] = field(default_factory=list)  # auto-corrections applied; fresh list per instance
+
+
+def _validate_codes(codes_str: str | None, field_name: str) -> list[str]:
+    """Check every comma-separated code of a field against the CIM-10 and CCAM patterns.
+
+    Args:
+        codes_str: Comma-separated codes; None or an empty string yields no warning.
+        field_name: Field label used as the prefix of each warning message.
+
+    Returns:
+        One warning per non-blank code that matches neither pattern, in input order.
+    """
+    tokens = (part.strip() for part in (codes_str or "").split(','))
+    return [
+        f"{field_name} : code '{token}' ne correspond ni à CIM-10 ni à CCAM"
+        for token in tokens
+        if token and not (CIM10_PATTERN.match(token) or CCAM_PATTERN.match(token))
+    ]
+
+
+def validate_extraction(extraction) -> ValidationResult:
+    """Validate one OGC extraction after a safety-net pass through the normalizer.
+
+    A failed extraction (`extraction_success` falsy) is reported at once as
+    invalid with a single error; the normalizer is not called for it.
+    Every other finding is a warning, so the result is then always valid.
+
+    Args:
+        extraction: Object exposing `extraction_success`, `error_message`,
+            `decision_ucr`, `type_desaccord`, `texte_decision` and the `codes_*` fields.
+
+    Returns:
+        ValidationResult holding the warnings, errors and normalizer fixes.
+    """
+    if not extraction.extraction_success:
+        failure = f"Extraction échouée : {extraction.error_message}"
+        return ValidationResult(is_valid=False, warnings=[], errors=[failure])
+
+    # Safety net: let the normalizer auto-fix the extraction before it is checked.
+    applied_fixes = normalize_extraction(extraction)
+    issues: list[str] = []
+
+    decision = extraction.decision_ucr
+    if decision and decision not in DECISION_VALUES:
+        issues.append(f"Décision non standard : '{decision}'")
+
+    disagreement = extraction.type_desaccord
+    if disagreement and disagreement not in TYPE_DESACCORD_VALUES:
+        issues.append(f"Type désaccord non standard : '{disagreement}'")
+
+    for attr in ("codes_etablissement", "codes_controleurs", "codes_retenus"):
+        issues.extend(_validate_codes(getattr(extraction, attr), attr))
+
+    # An unfavourable decision with controller codes is expected to retain codes.
+    if decision == "Défavorable" and not extraction.codes_retenus and extraction.codes_controleurs:
+        issues.append("Décision défavorable mais codes_retenus vide — les codes contrôleurs devraient être retenus")
+
+    decision_text = extraction.texte_decision
+    if not decision_text or len(decision_text.strip()) < 20:
+        issues.append("Texte de décision absent ou très court")
+
+    return ValidationResult(is_valid=True, warnings=issues, errors=[], fixes=applied_fixes)
+
+
+def validate_all(extractions: list) -> dict:
+    """Validate every extraction, log a summary and return it as a report dict.
+
+    Fixes, warnings and errors from `validate_extraction` are prefixed with the
+    item's OGC number and field. At most 10 warnings are logged; every error is.
+
+    Args:
+        extractions: Sized iterable of extractions to validate.
+
+    Returns:
+        Dict with counters "total", "valid" (includes "with_warnings"),
+        "with_warnings", "failed", "total_fixes" and message lists "warnings",
+        "errors", "fixes".
+    """
+    total = len(extractions)
+    ok_count = warned_count = failed_count = 0
+    warning_msgs, error_msgs, fix_msgs = [], [], []
+
+    for ext in extractions:
+        outcome = validate_extraction(ext)
+        # Fixes are collected and logged for every item, valid or not.
+        for fix in outcome.fixes or ():
+            fix_msg = f"OGC {ext.num_ogc} (Champ {ext.champ}) : {fix}"
+            fix_msgs.append(fix_msg)
+            logger.info(f"  🔧 {fix_msg}")
+
+        if not outcome.is_valid:
+            failed_count += 1
+            error_msgs.extend(f"OGC {ext.num_ogc} (Champ {ext.champ}) : {err}" for err in outcome.errors)
+            continue
+
+        ok_count += 1
+        if outcome.warnings:
+            warned_count += 1
+            warning_msgs.extend(f"OGC {ext.num_ogc} (Champ {ext.champ}) : {warn}" for warn in outcome.warnings)
+
+    report = {
+        "total": total,
+        "valid": ok_count,
+        "with_warnings": warned_count,
+        "failed": failed_count,
+        "warnings": warning_msgs,
+        "errors": error_msgs,
+        "fixes": fix_msgs,
+        "total_fixes": len(fix_msgs),
+    }
+
+    logger.info(f"Validation : {ok_count}/{total} OK, {warned_count} avec warnings, {failed_count} échoués")
+    if fix_msgs:
+        logger.info(f"  {len(fix_msgs)} auto-corrections appliquées par le safety-net")
+    for warn in warning_msgs[:10]:  # cap the number of warnings shown
+        logger.warning(f"  ⚠ {warn}")
+    if len(warning_msgs) > 10:
+        logger.warning(f"  ... et {len(warning_msgs) - 10} autres warnings")
+    for err in error_msgs:
+        logger.error(f"  ✗ {err}")
+
+    return report