""" Schema d'extraction de donnees - Definition des champs et navigation Permet de definir un schema YAML decrivant les champs a extraire depuis des captures d'ecran (DPI, formulaires, listes...). """ import re from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional import yaml @dataclass class ExtractionField: """Definition d'un champ a extraire depuis un screenshot.""" name: str # Ex: "nom_patient", "date_naissance" description: str # Description pour le VLM field_type: str = "text" # "text", "date", "number", "boolean" required: bool = True validation_regex: Optional[str] = None # Regex de validation optionnelle def validate_value(self, value: Optional[str]) -> bool: """ Valider une valeur extraite pour ce champ. Returns: True si la valeur est valide """ # Champ requis mais absent if self.required and (value is None or str(value).strip() == ""): return False # Pas de valeur et pas requis => OK if value is None or str(value).strip() == "": return True value_str = str(value).strip() # Validation par type if self.field_type == "number": try: float(value_str.replace(",", ".").replace(" ", "")) except ValueError: return False elif self.field_type == "boolean": if value_str.lower() not in ( "true", "false", "oui", "non", "1", "0", "vrai", "faux" ): return False elif self.field_type == "date": # Accepter les formats courants FR date_patterns = [ r"\d{2}/\d{2}/\d{4}", # JJ/MM/AAAA r"\d{2}-\d{2}-\d{4}", # JJ-MM-AAAA r"\d{4}-\d{2}-\d{2}", # AAAA-MM-JJ (ISO) r"\d{2}\.\d{2}\.\d{4}", # JJ.MM.AAAA ] if not any(re.fullmatch(p, value_str) for p in date_patterns): return False # Validation regex custom if self.validation_regex: if not re.fullmatch(self.validation_regex, value_str): return False return True @dataclass class ExtractionSchema: """ Schema complet d'extraction : liste de champs + regles de navigation. Peut etre charge/sauvegarde en YAML pour reutilisation. """ name: str # Ex: "dossier_patient_DPI" description: str fields: List[ExtractionField] = field(default_factory=list) navigation: Dict[str, Any] = field(default_factory=dict) # --- Serialisation YAML --- @classmethod def from_yaml(cls, path: str) -> "ExtractionSchema": """ Charger un schema depuis un fichier YAML. Args: path: Chemin vers le fichier YAML Returns: Instance ExtractionSchema """ yaml_path = Path(path) if not yaml_path.exists(): raise FileNotFoundError(f"Schema YAML non trouve : {path}") with open(yaml_path, "r", encoding="utf-8") as f: data = yaml.safe_load(f) if not isinstance(data, dict): raise ValueError(f"Le fichier YAML doit contenir un dictionnaire, pas {type(data).__name__}") return cls._from_dict(data) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema": """Construire un schema depuis un dictionnaire Python.""" return cls._from_dict(data) @classmethod def _from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema": """Construction interne depuis un dict.""" fields_raw = data.get("fields", []) fields = [] for fd in fields_raw: fields.append(ExtractionField( name=fd["name"], description=fd.get("description", ""), field_type=fd.get("type", fd.get("field_type", "text")), required=fd.get("required", True), validation_regex=fd.get("validation", fd.get("validation_regex")), )) return cls( name=data.get("name", "unnamed"), description=data.get("description", ""), fields=fields, navigation=data.get("navigation", {}), ) def to_yaml(self, path: str) -> None: """ Sauvegarder le schema en fichier YAML. Args: path: Chemin de sortie """ yaml_path = Path(path) yaml_path.parent.mkdir(parents=True, exist_ok=True) data = self.to_dict() with open(yaml_path, "w", encoding="utf-8") as f: yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) def to_dict(self) -> Dict[str, Any]: """Convertir en dictionnaire serialisable.""" return { "name": self.name, "description": self.description, "fields": [ { "name": f.name, "description": f.description, "type": f.field_type, "required": f.required, **({"validation": f.validation_regex} if f.validation_regex else {}), } for f in self.fields ], "navigation": self.navigation, } # --- Utilitaires --- @property def required_fields(self) -> List[ExtractionField]: """Retourne la liste des champs obligatoires.""" return [f for f in self.fields if f.required] @property def field_names(self) -> List[str]: """Retourne la liste des noms de champs.""" return [f.name for f in self.fields] def get_field(self, name: str) -> Optional[ExtractionField]: """Recuperer un champ par son nom.""" for f in self.fields: if f.name == name: return f return None def validate_record(self, record: Dict[str, Any]) -> Dict[str, Any]: """ Valider un enregistrement complet contre le schema. Returns: Dict avec 'valid' (bool), 'errors' (list), 'completeness' (float) """ errors = [] valid_count = 0 for fld in self.fields: value = record.get(fld.name) if fld.validate_value(value): if value is not None and str(value).strip(): valid_count += 1 else: errors.append(f"Champ '{fld.name}' invalide: {value!r}") total = len(self.fields) if self.fields else 1 completeness = valid_count / total return { "valid": len(errors) == 0, "errors": errors, "completeness": completeness, }