feat(p1): persist workflows and semantic learning artifacts
This commit is contained in:
518
core/competences/persist.py
Normal file
518
core/competences/persist.py
Normal file
@@ -0,0 +1,518 @@
|
||||
"""Helpers de persistance pour les competences candidates (POC Lea-first).
|
||||
|
||||
Couvre :
|
||||
- slugification stricte (ASCII, regex ^[a-z][a-z0-9_]{2,79}$)
|
||||
- detection PII (regex MVP, paramétrable)
|
||||
- atomic write + rename POSIX
|
||||
- append-only audit JSONL avec verrou fcntl
|
||||
- detection de collision cross-states (candidate / supervised / stable)
|
||||
|
||||
Le module est volontairement minimal : il n'importe pas FastAPI ni le pipeline
|
||||
VWB, il ne fait pas de logique reseau. Il est consomme depuis
|
||||
``agent_v0/server_v1/api_stream.py`` endpoint ``/persist``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
try: # pragma: no cover - dependance externe deja presente dans le projet
|
||||
import yaml
|
||||
except ImportError as exc: # pragma: no cover
|
||||
raise RuntimeError("PyYAML est requis pour core.competences.persist") from exc
|
||||
|
||||
try:
|
||||
import fcntl # POSIX uniquement
|
||||
_HAS_FCNTL = True
|
||||
except ImportError: # pragma: no cover - Windows
|
||||
fcntl = None # type: ignore[assignment]
|
||||
_HAS_FCNTL = False
|
||||
|
||||
|
||||
# Filesystem layout. REPO_ROOT is ``parents[2]`` of this file's resolved path;
# every other location below is derived from it.
REPO_ROOT = Path(__file__).resolve().parents[2]
COMPETENCES_ROOT = REPO_ROOT / "data" / "competences"
CANDIDATE_DIR = COMPETENCES_ROOT / "candidate"
SUPERVISED_DIR = COMPETENCES_ROOT / "supervised"
STABLE_DIR = COMPETENCES_ROOT / "stable"
AUDIT_PATH = COMPETENCES_ROOT / "persist_audit.jsonl"
INCOMPLETE_PATH = COMPETENCES_ROOT / "incomplete_learnings.jsonl"

# Final pattern a competence slug must match: one lowercase letter followed by
# 2 to 79 characters taken from [a-z0-9_] (3 to 80 characters in total).
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")

# MVP PII detection. The regex list can be overridden through the
# RPA_PII_PATTERNS environment variable (entries separated by "|").
# Defaults cover simple patterns only (IPP, NIR, email, French phone number).
_DEFAULT_PII_PATTERNS = [
    r"\b\d{13}\b",  # French NIR (13 digits)
    r"\b\d{15}\b",  # French NIR + key
    r"\bIPP[\s:_-]*\d{6,}\b",  # hospital IPP
    r"[\w\.-]+@[\w\.-]+\.\w{2,}",  # email
    r"\b0[1-9](?:[ .-]?\d{2}){4}\b",  # French phone number
]
|
||||
|
||||
|
||||
def _compile_pii_patterns() -> list[re.Pattern[str]]:
    """Compile the active PII regexes, case-insensitively.

    Pattern sources come from the ``RPA_PII_PATTERNS`` environment variable
    (split on ``|``) when it is set and non-empty, otherwise from
    ``_DEFAULT_PII_PATTERNS``. Blank entries and entries that are not valid
    regular expressions are skipped without raising.

    Returns:
        The compiled patterns, in source order.
    """
    env_value = os.environ.get("RPA_PII_PATTERNS")
    if env_value:
        sources = env_value.split("|")
    else:
        sources = _DEFAULT_PII_PATTERNS

    result: list[re.Pattern[str]] = []
    for source in (candidate.strip() for candidate in sources):
        if source:
            try:
                regex = re.compile(source, re.IGNORECASE)
            except re.error:
                # An invalid expression is dropped; the others still apply.
                continue
            result.append(regex)
    return result
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Slugification
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def slugify(name: str) -> str:
    """Turn a free-form name into a strict ASCII slug.

    Steps, in order:
      1. NFKD-normalise and drop every non-ASCII code point (accents vanish);
      2. lowercase, then turn each run of whitespace, dashes, dots, slashes
         and backslashes into a single ``_``;
      3. remove anything outside ``[a-z0-9_]``;
      4. squeeze repeated underscores and trim them from both ends;
      5. prefix ``c_`` when the result starts with a digit;
      6. cut to 80 characters, trimming underscores left at the end by the cut.

    Args:
        name: Free-form display name.

    Returns:
        The slug, guaranteed to match ``SLUG_PATTERN``.

    Raises:
        ValueError: if ``name`` is not a string, is blank, or if the final
            slug does not match ``SLUG_PATTERN``.
    """
    if not isinstance(name, str):
        raise ValueError("name doit etre une chaine non vide")
    stripped = name.strip()
    if stripped == "":
        raise ValueError("name est vide")

    # Decompose accented characters, then keep only the ASCII part.
    decomposed = unicodedata.normalize("NFKD", stripped)
    slug = decomposed.encode("ascii", "ignore").decode("ascii").lower()

    # Substitutions are applied in this exact order: separators first, then
    # forbidden characters, then underscore squeezing.
    for pattern, replacement in (
        (r"[\s\-./\\]+", "_"),
        (r"[^a-z0-9_]+", ""),
        (r"_+", "_"),
    ):
        slug = re.sub(pattern, replacement, slug)
    slug = slug.strip("_")

    # A slug must start with a letter: prefix the digit-leading ones.
    if slug[:1].isdigit():
        slug = "c_" + slug

    if len(slug) > 80:
        slug = slug[:80].rstrip("_")

    if SLUG_PATTERN.match(slug) is None:
        raise ValueError(
            f"slug invalide '{slug}' (regle : {SLUG_PATTERN.pattern})"
        )
    return slug
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Collisions cross-states
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def detect_cross_state_collision(
    slug: str,
    *,
    competences_root: Path = COMPETENCES_ROOT,
) -> Optional[str]:
    """Tell in which state directory ``<slug>.yaml`` already exists.

    The ``candidate``, ``supervised`` and ``stable`` sub-directories of
    ``competences_root`` are probed in that order.

    Args:
        slug: Competence slug (file stem).
        competences_root: Directory holding the three state sub-directories.

    Returns:
        The name of the first sub-directory containing the file, or ``None``
        when none of them does.
    """
    states = ("candidate", "supervised", "stable")
    occupied = (
        state
        for state in states
        if (competences_root / state / f"{slug}.yaml").exists()
    )
    return next(occupied, None)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Detection PII
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def detect_pii(payload: Any) -> list[str]:
    """Walk a payload recursively and report which PII patterns matched.

    Strings are searched with every compiled pattern; dict values and
    list/tuple items are visited recursively. Dict keys and any other type
    are not inspected.

    Args:
        payload: Arbitrary nested structure (dict / list / tuple / str).

    Returns:
        The source text of each matching pattern, without duplicates, in
        order of first match. An empty list means no PII was detected; the
        caller decides what to do with a non-empty one.
    """
    regexes = _compile_pii_patterns()
    if not regexes:
        return []

    def _hits(node: Any) -> Iterable[str]:
        # Depth-first traversal yielding pattern sources as they match.
        if isinstance(node, str):
            yield from (rx.pattern for rx in regexes if rx.search(node))
        elif isinstance(node, dict):
            for child in node.values():
                yield from _hits(child)
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _hits(child)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(_hits(payload)))
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Atomic write
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def atomic_write_yaml(
    target_path: Path,
    data: dict[str, Any],
    *,
    persist_id: str,
) -> Path:
    """Write a dict as YAML through a temporary file followed by a rename.

    1. Dump ``data`` into ``<target_dir>/.<basename>.tmp.<persist_id>``.
    2. ``os.rename`` the temporary file onto ``target_path``.
    3. On any exception, remove the temporary file if possible, then re-raise.

    Args:
        target_path: Final location of the YAML file; its parent directory is
            created when missing.
        data: Mapping serialised with ``yaml.safe_dump`` (key order kept).
        persist_id: Suffix making the temporary file name unique per request.

    Returns:
        The final path (``target_path``).
    """
    target_path = Path(target_path)
    target_dir = target_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    tmp_name = f".{target_path.name}.tmp.{persist_id}"
    tmp_path = target_dir / tmp_name

    try:
        with tmp_path.open("w", encoding="utf-8") as handle:
            yaml.safe_dump(
                data,
                handle,
                allow_unicode=True,
                sort_keys=False,
                default_flow_style=False,
            )
            handle.flush()
            try:
                # Best effort: an fsync failure does not abort the write.
                os.fsync(handle.fileno())
            except OSError:
                pass
        # Atomic rename (POSIX). Fails on Windows if the target already exists,
        # whereas Linux (POSIX) silently overwrites. The collision check is
        # expected to have been done by the caller before this call.
        os.rename(tmp_path, target_path)
    except Exception:
        # Do not leave a stray temporary file behind, then propagate.
        if tmp_path.exists():
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise

    return target_path
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Audit append (JSONL + verrou)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def audit_append(
    entry: dict[str, Any],
    *,
    audit_path: Path = AUDIT_PATH,
) -> int:
    """Append one JSON line to the audit file and return its audit_entry_id.

    The audit_entry_id is the number of lines present before the append, plus
    one. Concurrent writers are serialised with ``fcntl.flock`` (POSIX); on
    systems without ``fcntl`` (Windows) the write is best effort.

    Note that ``entry`` is modified in place: a ``timestamp`` is added when
    absent, and ``audit_entry_id`` is always set.

    Args:
        entry: Audit record to serialise.
        audit_path: JSONL file to append to; its parent directory is created
            when missing.

    Returns:
        The audit_entry_id written into the record.
    """
    audit_path = Path(audit_path)
    audit_path.parent.mkdir(parents=True, exist_ok=True)

    if "timestamp" not in entry:
        # Aware timestamp converted to the local timezone, second precision.
        entry["timestamp"] = (
            datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
        )

    # Open in append + read mode so existing lines can be counted
    # (audit_entry_id) before the new one is written.
    flags = "a+"
    with open(audit_path, flags, encoding="utf-8") as handle:
        if _HAS_FCNTL:
            try:
                fcntl.flock(handle.fileno(), fcntl.LOCK_EX)  # type: ignore[union-attr]
            except OSError:
                # Locking failed: carry on without the lock (best effort).
                pass
        try:
            handle.seek(0)
            line_count = sum(1 for _ in handle)
            audit_entry_id = line_count + 1
            entry["audit_entry_id"] = audit_entry_id
            handle.write(json.dumps(entry, ensure_ascii=False) + "\n")
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                pass
        finally:
            # Always release the lock, even if counting or writing failed.
            if _HAS_FCNTL:
                try:
                    fcntl.flock(handle.fileno(), fcntl.LOCK_UN)  # type: ignore[union-attr]
                except OSError:
                    pass
    return audit_entry_id
|
||||
|
||||
|
||||
def find_existing_audit_entry(
    persist_id: str,
    *,
    audit_path: Path = AUDIT_PATH,
) -> Optional[dict[str, Any]]:
    """Look up an audit record by ``persist_id`` (used for idempotence).

    The audit file is scanned line by line. Blank lines, lines that are not
    valid JSON and lines whose JSON value is not an object are ignored.

    Args:
        persist_id: Identifier to search for; a falsy value short-circuits.
        audit_path: JSONL audit file to scan.

    Returns:
        The first record whose ``persist_id`` equals the requested one, or
        ``None`` when there is no such record, the file does not exist, or it
        cannot be read.
    """
    if not persist_id:
        return None
    audit_path = Path(audit_path)
    if not audit_path.exists():
        return None
    try:
        with audit_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A syntactically valid line may hold a list, string or
                # number; those have no ``.get`` and cannot be a record.
                if not isinstance(record, dict):
                    continue
                if record.get("persist_id") == persist_id:
                    return record
    except OSError:
        return None
    return None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# YAML body construction
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Top-level keys that validate_yaml_schema() expects in a competence YAML body.
REQUIRED_YAML_FIELDS = (
    "schema_version",
    "id",
    "name",
    "version",
    "learning_state",
    "intent",
    "parameters",
    "preconditions",
    "methods",
    "success_marker",
    "failure_message_template",
    "promotion",
    "generalisation",
    "failure_log",
    "created_at",
    "last_updated_at",
    "methods_execution",
)
|
||||
|
||||
|
||||
def build_competence_yaml(
    *,
    slug: str,
    name: str,
    workflow_ir: dict[str, Any],
    parameters: Optional[list[dict[str, Any]]],
    intent_fr: str,
    learning_state: str,
    session_id: Optional[str],
    machine_id: Optional[str],
    external_agent_id: Optional[str] = None,
) -> dict[str, Any]:
    """Assemble the competence dict that gets serialised to YAML.

    The layout follows the reference file
    ``data/competences/candidate/key_win_r_wait_explorer_exe.yaml``.

    Args:
        slug: Competence identifier, stored under ``id``.
        name: Display name; also the fallback intent when ``intent_fr`` is
            empty.
        workflow_ir: Source workflow. Its ``steps``, ``preconditions``,
            ``success_marker`` and ``failure_message_template`` keys are read;
            missing or falsy ones get defaults.
        parameters: Optional parameter specs; only dict entries with a truthy
            ``name`` are kept.
        intent_fr: French intent text.
        learning_state: State written to ``learning_state`` and to the first
            promotion history entry.
        session_id: Source session, stored in ``chain_refs``.
        machine_id: Source machine, stored in ``chain_refs``.
        external_agent_id: Optional external agent, stored in ``chain_refs``.

    Returns:
        The YAML body. ``chain_refs`` is present only when at least one of
        ``session_id``, ``machine_id`` or ``external_agent_id`` is truthy.
    """
    timestamp = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
    raw_steps = list(workflow_ir.get("steps") or [])
    required_state = list(workflow_ir.get("preconditions") or [])
    marker = workflow_ir.get("success_marker")
    if not marker:
        marker = {"mode": "all_of", "timeout_ms": 5000, "markers": []}
    intent_text = intent_fr or name

    def _as_method(position: int, step: dict[str, Any]) -> dict[str, Any]:
        # Copy the step and fill in id / primitive_ref / observed when absent,
        # in that order (key order is kept in the YAML output).
        entry = dict(step)
        if "id" not in entry:
            entry["id"] = f"step_{position}_{step.get('kind') or 'action'}"
        if "primitive_ref" not in entry and entry.get("kind"):
            entry["primitive_ref"] = entry["kind"]
        if "observed" not in entry:
            entry["observed"] = False
        return entry

    # Positions count every step, including the non-dict ones that are dropped.
    method_list: list[dict[str, Any]] = [
        _as_method(position, step)
        for position, step in enumerate(raw_steps, start=1)
        if isinstance(step, dict)
    ]

    declared_params: dict[str, Any] = {
        str(spec["name"]): {
            "type": spec.get("type", "string"),
            "required": bool(spec.get("required", False)),
            "description": spec.get("description", ""),
        }
        for spec in (parameters or [])
        if isinstance(spec, dict) and spec.get("name")
    }

    failure_template = workflow_ir.get("failure_message_template")
    if not failure_template:
        failure_template = {
            "intention": intent_text,
            "attendu": "",
            "vu": "{observed_human_state}",
            "demande": "indiquer la correction attendue",
        }

    first_promotion = {
        "at": timestamp,
        "from": "observed",
        "to": learning_state,
        "by": "lea_persist_endpoint",
        "reason": "persisted via /api/v1/lea/competences/candidate/persist",
    }
    promotion_block = {
        "history": [first_promotion],
        "candidate_requires": [
            "method_trace_present",
            "success_marker_defined",
            "failure_message_template_valid",
        ],
        "supervised_requires": ["replay_verified_once", "human_validation"],
        "stable_requires": {
            "min_successes": 3,
            "distinct_contexts": 3,
            "max_unexplained_failures": 0,
        },
        "t2_known_gaps": [],
    }

    document: dict[str, Any] = {
        "schema_version": 1,
        "id": slug,
        "name": name,
        "version": 1,
        "learning_state": learning_state,
        "intent": {"fr": intent_text},
        "parameters": declared_params,
        "preconditions": required_state,
        "methods": method_list,
        "success_marker": marker,
        "failure_message_template": failure_template,
        "promotion": promotion_block,
        "generalisation": {
            "seen_contexts": [],
            "method_success_rate": {},
            "variance_log": [],
        },
        "failure_log": [],
        "created_at": timestamp,
        "last_updated_at": timestamp,
        "methods_execution": "sequence",
    }

    if any((session_id, machine_id, external_agent_id)):
        document["chain_refs"] = {
            "source_session": session_id,
            "machine_id": machine_id,
            "external_agent_id": external_agent_id,
        }
    return document
|
||||
|
||||
|
||||
def validate_yaml_schema(data: dict[str, Any]) -> list[str]:
    """Return the required top-level fields that are absent from ``data``.

    Only key presence is checked, in ``REQUIRED_YAML_FIELDS`` order; an empty
    list means every required field is there.
    """
    missing: list[str] = []
    for required in REQUIRED_YAML_FIELDS:
        if required not in data:
            missing.append(required)
    return missing
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Rate limit token-bucket simple (en memoire, par machine_id)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class PersistRateLimiter:
    """Minimal in-memory rate limiter for /persist.

    Default: 10 requests / minute / machine_id (cf. specs §6). For each
    machine_id the timestamps of accepted requests are kept while they are
    inside the window; a request is refused once ``max_per_minute`` of them
    are still in the window.

    A single shared instance is expected. All access to the internal state is
    guarded by a lock, so the instance can be used from several threads.
    """

    def __init__(self, *, max_per_minute: int = 10, window_seconds: int = 60) -> None:
        """Create a limiter.

        Args:
            max_per_minute: Number of requests accepted per window.
            window_seconds: Window length, in seconds.
        """
        # Local import: keeps this class independent from the module's
        # top-level import list.
        import threading

        self.max_per_minute = max_per_minute
        self.window_seconds = window_seconds
        self._timestamps: dict[str, list[float]] = {}
        self._lock = threading.Lock()

    def allow(self, machine_id: str) -> tuple[bool, int]:
        """Record a request and tell whether it is accepted.

        Args:
            machine_id: Caller identity; a falsy value is never limited.

        Returns:
            ``(allowed, retry_after_seconds)``; ``retry_after_seconds`` is 0
            when the request is allowed and at least 1 otherwise.
        """
        if not machine_id:
            return True, 0
        with self._lock:
            # Read the clock under the lock so appended timestamps stay in
            # non-decreasing order and bucket[0] really is the oldest one.
            now = time.time()
            bucket = self._timestamps.setdefault(machine_id, [])
            # Drop the entries that left the window.
            bucket[:] = [ts for ts in bucket if now - ts < self.window_seconds]
            if len(bucket) >= self.max_per_minute:
                # The bucket is empty when the limit itself is <= 0: fall back
                # to a full window instead of indexing an empty list.
                oldest = bucket[0] if bucket else now
                retry_after = max(1, int(self.window_seconds - (now - oldest)))
                return False, retry_after
            bucket.append(now)
            return True, 0

    def reset(self, machine_id: Optional[str] = None) -> None:
        """Forget recorded requests for one machine_id, or for all when None."""
        with self._lock:
            if machine_id is None:
                self._timestamps.clear()
            else:
                self._timestamps.pop(machine_id, None)
|
||||
|
||||
|
||||
# Shared instance, importable from api_stream.
persist_rate_limiter = PersistRateLimiter()

# Names exposed by ``from core.competences.persist import *``.
__all__ = [
    "SLUG_PATTERN",
    "COMPETENCES_ROOT",
    "CANDIDATE_DIR",
    "AUDIT_PATH",
    "INCOMPLETE_PATH",
    "REQUIRED_YAML_FIELDS",
    "slugify",
    "detect_cross_state_collision",
    "detect_pii",
    "atomic_write_yaml",
    "audit_append",
    "find_existing_audit_entry",
    "build_competence_yaml",
    "validate_yaml_schema",
    "PersistRateLimiter",
    "persist_rate_limiter",
]
|
||||
@@ -16,6 +16,48 @@ import io
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_first_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Extract the JSON object that starts at the first ``{`` of ``text``.

    Anything before that brace and anything after the end of the object is
    ignored, which copes with VLM answers that append an explanation after
    the JSON.

    Args:
        text: Raw model output.

    Returns:
        The decoded object, or ``None`` when ``text`` is empty, contains no
        ``{``, or what starts at the first ``{`` is not valid JSON.
    """
    if not text:
        return None
    start = text.find("{")
    if start < 0:
        return None
    try:
        # raw_decode parses exactly one JSON document at the beginning of the
        # string and reports where it ended, so trailing text is harmless.
        # This replaces a hand-written brace/string scanner.
        parsed, _end = json.JSONDecoder().raw_decode(text[start:])
    except json.JSONDecodeError:
        return None
    return parsed
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
"""
|
||||
Client Ollama pour VLM
|
||||
@@ -219,7 +261,93 @@ class OllamaClient:
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
|
||||
def generate_grounding(
    self,
    prompt: str,
    image_path: Optional[str] = None,
    image: Optional[Image.Image] = None,
    extra_images_b64: Optional[List[str]] = None,
    profile: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """D5-v2 (2026-05-25): centralised, prefill-aware VLM grounding call.

    Uses the dedicated profile from ``vlm_config.get_grounding_profile()``
    (pinned num_ctx, JSON prefill, temperature, short num_predict) so that
    grounding does not fall back to other paths running qwen2.5vl with an
    8192 context. The profile can be overridden explicitly (useful in tests).

    ``self.model`` is switched to the profile's model for the duration of the
    ``generate()`` call and always restored afterwards.

    Args:
        prompt: Text prompt (typically "Find element X").
        image_path: See ``generate()``.
        image: See ``generate()``.
        extra_images_b64: See ``generate()``.
        profile: Grounding profile override; when None,
            ``get_grounding_profile(endpoint=self.endpoint)`` is used.

    Returns:
        The dict returned by ``generate()``, with two extra keys:
        ``parsed_json`` (the first JSON object found in ``response``, or None
        when there is none) and ``profile_used`` (a copy of the profile).

    Notes:
        - No automatic retry on ``fallback_model`` here; the caller decides
          whether to retry with another model.
        - Only ``model``, ``temperature``, ``num_predict``, ``prefill`` and
          ``num_ctx`` are taken from the profile by this method; its
          ``think`` and ``keep_alive`` entries are not forwarded to
          ``generate()``.
        - Per the original note, the response returned by ``generate()`` is
          expected to already include the prefill text — confirm in
          ``generate()``.
    """
    if profile is None:
        from core.detection.vlm_config import get_grounding_profile
        profile = get_grounding_profile(endpoint=self.endpoint)

    # Keep the current model and switch temporarily.
    original_model = self.model
    self.model = profile["model"]
    try:
        result = self.generate(
            prompt=prompt,
            image_path=image_path,
            image=image,
            extra_images_b64=extra_images_b64,
            temperature=profile["temperature"],
            max_tokens=profile["num_predict"],
            assistant_prefill=profile["prefill"],
            num_ctx=profile["num_ctx"],
            force_json=False,  # prefill is enough, format=json slows qwen3.5 down
        )
    finally:
        self.model = original_model

    # Quiet logging: one line per grounding call. Timing is left to the
    # caller (time.perf_counter) if needed.
    logger.info(
        "[PERF] vlm.grounding model=%s ctx=%d prefill=%s success=%s",
        profile["model"], profile["num_ctx"],
        "yes" if profile["prefill"] else "no",
        result.get("success", False),
    )

    # The JSON may be followed by stray text (qwen sometimes ends with an
    # explanation), so only the first root-level object is extracted.
    parsed = None
    content = (result.get("response") or "").strip()
    if content:
        try:
            parsed = _extract_first_json_object(content)
        except Exception as e:
            logger.debug("[PERF] vlm.grounding parse failed: %s — content=%r", e, content[:160])

    result["parsed_json"] = parsed
    result["profile_used"] = dict(profile)
    return result
|
||||
|
||||
def detect_ui_elements(self, image_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Détecter les éléments UI dans une image
|
||||
|
||||
@@ -134,13 +134,13 @@ def reset_vlm_model_cache():
|
||||
|
||||
|
||||
def is_thinking_model(model_name: str) -> bool:
|
||||
"""Détermine si un modèle est un modèle 'thinking' (qwen3).
|
||||
"""Détermine si un modèle est un modèle 'thinking' (qwen3, qwen3.5).
|
||||
|
||||
Les modèles thinking nécessitent un assistant prefill pour éviter
|
||||
le mode réflexion interne qui peut durer >180s avec des images.
|
||||
|
||||
Args:
|
||||
model_name: Nom du modèle (ex: "qwen3-vl:8b", "gemma4:e4b")
|
||||
model_name: Nom du modèle (ex: "qwen3-vl:8b", "qwen3.5:9b", "gemma4:e4b")
|
||||
|
||||
Returns:
|
||||
True si le modèle est de type thinking (nécessite prefill workaround)
|
||||
@@ -148,6 +148,92 @@ def is_thinking_model(model_name: str) -> bool:
|
||||
return "qwen3" in model_name.lower()
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────────────
|
||||
# D5-v2 (2026-05-25) : profil grounding dédié, centralisé, env-overridable
|
||||
# ────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Default grounding profile — qwen3.5:9b with a 4096 context and JSON prefill.
# Follows the Codex decision taken after the Gemini review: avoid re-warming
# qwen2.5vl with an 8192 context and keep the grounding path reproducible.
DEFAULT_GROUNDING_MODEL = "qwen3.5:9b"
DEFAULT_GROUNDING_CTX = 4096
DEFAULT_GROUNDING_PREFILL = '{"x_pct":'
DEFAULT_GROUNDING_TEMPERATURE = 0.0
DEFAULT_GROUNDING_NUM_PREDICT = 96  # ~80 tokens are enough for `{x_pct,y_pct,confidence}`
DEFAULT_GROUNDING_KEEP_ALIVE = "30m"  # avoid a cold reload between actions

# Grounding fallback: qwen2.5vl kept for existing compatibility (rpa-tag).
DEFAULT_GROUNDING_FALLBACK = "qwen2.5vl:7b-rpa"
|
||||
|
||||
|
||||
def get_grounding_profile(endpoint: str = DEFAULT_OLLAMA_ENDPOINT) -> dict:
    """Return the VLM profile for **JSON-format** grounding calls
    (answer shaped like `{"x_pct": ..., "y_pct": ..., "confidence": ...}`).

    WARNING, scope D5-v3a (2026-05-25):
    This profile targets calls that consume the output through a JSON prefill
    (typically qwen3.5:9b with prefill `{"x_pct":`). It is NOT suited to the
    native **bbox_2d** grounding calls of qwen2.5vl (used in
    `agent_v0/server_v1/resolve_engine.py:959-1013, 3008-3045`, parsed with
    `core.grounding.bbox_parser.parse_bbox_to_norm`).

    Known env var conflict: `resolve_engine.py:959` also reads
    `RPA_GROUNDING_MODEL` but expects a bbox_2d model (qwen2.5vl). Setting
    `RPA_GROUNDING_MODEL=qwen3.5:9b` is fine for this profile, but the legacy
    bbox site in resolve_engine then receives an incompatible model.
    Deferred to D5-v3b: rename to `RPA_BBOX_GROUNDING_MODEL` on the legacy
    side and introduce `OllamaClient.generate_bbox_grounding()`.

    Centralises the policy that keeps VLM paths from falling back to qwen2.5vl
    with num_ctx=8192 (Modelfile). The result is consumed by
    OllamaClient.generate_grounding().

    Supported env vars:
        - RPA_GROUNDING_MODEL: main model (default qwen3.5:9b); a blank value
          is treated as unset.
        - RPA_GROUNDING_CTX: context window (default 4096); a non-integer or
          non-positive value falls back to the default.
        - RPA_GROUNDING_FALLBACK: fallback model (default qwen2.5vl:7b-rpa).
        - RPA_VLM_PREFILL: "0", "false", "no" or "off" disables the JSON
          prefill (rare, debug).

    Args:
        endpoint: Ollama endpoint. Not used by the body of this function.

    Returns:
        dict with keys:
            - model: str
            - num_ctx: int
            - prefill: str or None
            - temperature: float
            - num_predict: int
            - think: bool (False when is_thinking_model() or
              needs_think_false() is true for the model)
            - keep_alive: str
            - fallback_model: str
    """
    # A blank model name can only produce failing requests: use the default.
    model = os.environ.get("RPA_GROUNDING_MODEL", "").strip() or DEFAULT_GROUNDING_MODEL
    try:
        num_ctx = int(os.environ.get("RPA_GROUNDING_CTX", str(DEFAULT_GROUNDING_CTX)))
    except (TypeError, ValueError):
        num_ctx = DEFAULT_GROUNDING_CTX
    if num_ctx <= 0:
        # A zero or negative context window is meaningless.
        num_ctx = DEFAULT_GROUNDING_CTX
    fallback = os.environ.get(
        "RPA_GROUNDING_FALLBACK", DEFAULT_GROUNDING_FALLBACK
    ).strip()
    prefill_enabled = os.environ.get("RPA_VLM_PREFILL", "true").strip().lower() not in (
        "0", "false", "no", "off"
    )
    prefill = DEFAULT_GROUNDING_PREFILL if prefill_enabled else None

    # think=False is mandatory for qwen3/qwen3.5 (prefill is the main
    # mechanism) and for gemma4 (otherwise empty tokens with Ollama >=0.20).
    think_false = is_thinking_model(model) or needs_think_false(model)

    return {
        "model": model,
        "num_ctx": num_ctx,
        "prefill": prefill,
        "temperature": DEFAULT_GROUNDING_TEMPERATURE,
        "num_predict": DEFAULT_GROUNDING_NUM_PREDICT,
        "think": not think_false,  # Ollama API: think=False -> False is sent
        "keep_alive": DEFAULT_GROUNDING_KEEP_ALIVE,
        "fallback_model": fallback,
    }
|
||||
|
||||
|
||||
def needs_think_false(model_name: str) -> bool:
|
||||
"""Détermine si un modèle nécessite think=false dans le payload.
|
||||
|
||||
|
||||
@@ -59,8 +59,13 @@ class CLIPEmbedder(EmbedderBase):
|
||||
)
|
||||
|
||||
if device is None:
|
||||
# NOTE: utiliser le `torch` du scope module (l. 8). Un import local
|
||||
# ici rendait `torch` LOCAL à __init__ pour tout le scope, faisant
|
||||
# planter `with torch.no_grad():` plus bas en UnboundLocalError
|
||||
# quand l'appelant passait device="cpu" (l'import local n'était
|
||||
# alors pas exécuté). Voir inbox_codex/2026-05-25_1235_..._enquete-
|
||||
# feedbackbus-5004.md.
|
||||
try:
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
free_vram = torch.cuda.mem_get_info()[0] / 1024**3
|
||||
if free_vram > 1.5:
|
||||
|
||||
@@ -6,7 +6,11 @@ from .t2a_decision import (
|
||||
analyze_dpi,
|
||||
build_dpi_enriched,
|
||||
)
|
||||
from .ocr_extractor import extract_table_from_image, extract_text_from_image
|
||||
from .ocr_extractor import (
|
||||
extract_digits_tesseract_from_image,
|
||||
extract_table_from_image,
|
||||
extract_text_from_image,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"PROMPT_TEMPLATE",
|
||||
@@ -15,4 +19,5 @@ __all__ = [
|
||||
"build_dpi_enriched",
|
||||
"extract_text_from_image",
|
||||
"extract_table_from_image",
|
||||
"extract_digits_tesseract_from_image",
|
||||
]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Extracteur OCR — texte depuis une image (screenshot d'écran).
|
||||
|
||||
Utilise EasyOCR fr+en. Singleton (chargement modèle ~3s au premier appel).
|
||||
Ajoute un chemin Tesseract spécialisé pour les chiffres/IPP d'écrans propres.
|
||||
|
||||
Conçu pour le pipeline streaming serveur (actions `extract_text` /
|
||||
`extract_table`) : récupère un screenshot fresh (dernier heartbeat ou
|
||||
@@ -11,6 +12,7 @@ pour analyse downstream (ex: t2a_decision, boucle sur N patients).
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
@@ -20,6 +22,19 @@ logger = logging.getLogger(__name__)
|
||||
_easyocr_reader = None
|
||||
|
||||
|
||||
def easyocr_gpu_enabled(default: bool = False) -> bool:
    """Return whether EasyOCR may allocate GPU memory.

    The replay server shares the GPU with Ollama. Defaulting EasyOCR to CPU
    keeps VRAM available for the VLM; set RPA_EASYOCR_GPU=1 only for a measured
    OCR benchmark or a runtime that has spare VRAM.

    Args:
        default: Value returned when RPA_EASYOCR_GPU is unset, empty or made
            of whitespace only.

    Returns:
        True when the variable is "1", "true", "yes" or "on" (case and
        surrounding whitespace ignored), ``default`` when it is unset or
        blank, False for any other value.
    """
    # Normalise before the emptiness test so a whitespace-only value is
    # treated as unset instead of silently meaning "disabled".
    raw = os.getenv("RPA_EASYOCR_GPU", "").strip().lower()
    if not raw:
        return default
    return raw in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def _get_reader():
|
||||
"""Initialise EasyOCR fr+en au premier appel (singleton, CPU forcé).
|
||||
|
||||
@@ -29,8 +44,9 @@ def _get_reader():
|
||||
global _easyocr_reader
|
||||
if _easyocr_reader is None:
|
||||
import easyocr
|
||||
_easyocr_reader = easyocr.Reader(['fr', 'en'], gpu=False, verbose=False)
|
||||
logger.info("EasyOCR initialisé (fr+en, CPU)")
|
||||
gpu = easyocr_gpu_enabled(default=False)
|
||||
_easyocr_reader = easyocr.Reader(['fr', 'en'], gpu=gpu, verbose=False)
|
||||
logger.info("EasyOCR initialisé (fr+en, %s)", "GPU" if gpu else "CPU")
|
||||
return _easyocr_reader
|
||||
|
||||
|
||||
@@ -73,17 +89,86 @@ def extract_text_from_image(
|
||||
return ""
|
||||
|
||||
|
||||
def extract_digits_tesseract_from_image(
    image_path: str,
    region: Optional[Tuple[int, int, int, int]] = None,
    pattern: Optional[str] = None,
    limit: Optional[int] = None,
    psm: int = 6,
    lang: str = "eng",
    whitelist: str = "0123456789",
) -> List[str]:
    """Extract digit sequences from an image with Tesseract.

    Main use case: IPP / numeric fields in on-screen tables. This path is kept
    explicit so the general EasyOCR behaviour used by `extract_text` is not
    affected.

    Args:
        image_path: Path of the PNG/JPG on disk.
        region: (x, y, w, h) crop applied before OCR. None = whole image.
        pattern: Python regex matched (from the start) against each digit
            sequence. IPP example: r"^25\\d{6}$".
        limit: Maximum number of values returned; a falsy value (None or 0)
            means no limit.
        psm: Tesseract page segmentation mode. 6 = uniform block of text.
        lang: Tesseract language.
        whitelist: Allowed characters. Digits only by default; a falsy value
            sends no whitelist option.

    Returns:
        Digit sequences in Tesseract reading order. When the file is missing
        or anything fails, a warning is logged and an empty list is returned.
    """
    source = Path(image_path)
    if not source.exists():
        logger.warning("extract_digits_tesseract: fichier introuvable %s", image_path)
        return []

    try:
        from PIL import Image
        import pytesseract

        with Image.open(source) as img:
            if region:
                left, top, width, height = region
                img = img.crop((left, top, left + width, top + height))
            if img.mode not in {"L", "RGB"}:
                img = img.convert("RGB")

            # Build the Tesseract command-line options as a single string.
            tess_config = f"--psm {psm}"
            if whitelist:
                tess_config += f" -c tessedit_char_whitelist={whitelist}"
            text = pytesseract.image_to_string(img, lang=lang, config=tess_config)

        digit_runs = re.findall(r"\d+", text)
        if pattern:
            matches = re.compile(pattern).match
            digit_runs = [run for run in digit_runs if matches(run)]
        return digit_runs[:limit] if limit else digit_runs
    except Exception as e:
        logger.warning("extract_digits_tesseract échoué sur %s : %s", image_path, e)
        return []
|
||||
|
||||
|
||||
def extract_table_from_image(
|
||||
image_path: str,
|
||||
region: Optional[Tuple[int, int, int, int]] = None,
|
||||
pattern: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
engine: str = "easyocr",
|
||||
) -> List[str]:
|
||||
"""Extrait une liste de valeurs d'un tableau via OCR.
|
||||
|
||||
Cas d'usage principal : lire la liste des IPP d'un tableau de patients
|
||||
pour boucler dessus. EasyOCR retourne tous les tokens avec leur bbox,
|
||||
on filtre par regex puis on trie par position (y croissant).
|
||||
pour boucler dessus. Par défaut, EasyOCR retourne tous les tokens avec
|
||||
leur bbox, on filtre par regex puis on trie par position (y croissant).
|
||||
Pour des champs chiffres/IPP, `engine="tesseract"` active le chemin
|
||||
spécialisé Tesseract validé sur captures Easily.
|
||||
|
||||
Args:
|
||||
image_path: chemin du PNG sur disque.
|
||||
@@ -92,6 +177,7 @@ def extract_table_from_image(
|
||||
Si None : tous les tokens non vides sont retournés.
|
||||
Exemple IPP : r"^\\d{8,10}$" ou r"^25\\d{6}$"
|
||||
limit: nombre maximal d'entrées à retourner (None = sans limite).
|
||||
engine: "easyocr" (defaut) ou "tesseract" / "digits" / "ipp".
|
||||
|
||||
Returns:
|
||||
Liste de strings dans l'ordre top → bottom (par y de bbox).
|
||||
@@ -102,6 +188,15 @@ def extract_table_from_image(
|
||||
logger.warning("extract_table: fichier introuvable %s", image_path)
|
||||
return []
|
||||
|
||||
engine_name = (engine or "easyocr").strip().lower()
|
||||
if engine_name in {"tesseract", "digits", "ipp"}:
|
||||
return extract_digits_tesseract_from_image(
|
||||
image_path,
|
||||
region=region,
|
||||
pattern=pattern,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
@@ -99,10 +99,17 @@ class WorkflowPipeline:
|
||||
logger.info("✓ Fusion Engine initialized")
|
||||
|
||||
# 3. State Embedding Builder
|
||||
clip_embedders = {
|
||||
"image": self.clip_embedder,
|
||||
"text": self.clip_embedder,
|
||||
"title": self.clip_embedder,
|
||||
"ui": self.clip_embedder,
|
||||
}
|
||||
self.embedding_builder = StateEmbeddingBuilder(
|
||||
fusion_engine=self.fusion_engine,
|
||||
embedders=clip_embedders,
|
||||
output_dir=self.embeddings_dir,
|
||||
use_clip=True
|
||||
use_clip=False
|
||||
)
|
||||
logger.info("✓ State Embedding Builder initialized")
|
||||
|
||||
|
||||
38
core/semantic/__init__.py
Normal file
38
core/semantic/__init__.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Phase 2.5 — Analyse sémantique post-apprentissage.
|
||||
|
||||
Module dédié à l'analyse sémantique des écrans capturés en phase Shadow,
|
||||
**après** ``/api/v1/shadow/stop`` et **avant** restitution Option C.
|
||||
|
||||
Specs : ``docs/POC/SPECS_PHASE_25_SEMANTIQUE_2026-06-01.md``
|
||||
|
||||
Principes (arbitrage Plato 2026-06-01) :
|
||||
- Post-apprentissage uniquement, **jamais en hot path replay**.
|
||||
- OmniParser encapsulé derrière garde-fou anti-fragilité.
|
||||
- Fallback OCR-seul (docTR) systématique en cas d'exception.
|
||||
- Stockage ``.semantic.yaml`` séparé du YAML compétence principal.
|
||||
- Opt-in par compétence (rétrocompat totale).
|
||||
"""
|
||||
|
||||
from .phase25_analyzer import (
|
||||
Phase25Analyzer,
|
||||
Phase25Result,
|
||||
ScreenAnalysis,
|
||||
SemanticStructure,
|
||||
SEMANTIC_DIR,
|
||||
OMNIPARSER_CACHE_DIR,
|
||||
OMNIPARSER_ERROR_LOG,
|
||||
PHASH_HAMMING_THRESHOLD,
|
||||
MAX_SCREENS_PER_SESSION,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Phase25Analyzer",
|
||||
"Phase25Result",
|
||||
"ScreenAnalysis",
|
||||
"SemanticStructure",
|
||||
"SEMANTIC_DIR",
|
||||
"OMNIPARSER_CACHE_DIR",
|
||||
"OMNIPARSER_ERROR_LOG",
|
||||
"PHASH_HAMMING_THRESHOLD",
|
||||
"MAX_SCREENS_PER_SESSION",
|
||||
]
|
||||
920
core/semantic/phase25_analyzer.py
Normal file
920
core/semantic/phase25_analyzer.py
Normal file
@@ -0,0 +1,920 @@
|
||||
"""Phase 2.5 — Analyseur sémantique post-apprentissage.
|
||||
|
||||
Module isolé qui prend en entrée un ensemble de screenshots capturés
|
||||
pendant la phase Shadow et produit un payload structuré
|
||||
``{tables, forms, buttons, text_blocks}`` par écran distinct,
|
||||
stocké dans un fichier ``.semantic.yaml`` séparé.
|
||||
|
||||
Specs : ``docs/POC/SPECS_PHASE_25_SEMANTIQUE_2026-06-01.md``
|
||||
|
||||
Garde-fous :
|
||||
- Wrapper try/except global autour de chaque appel OmniParser.
|
||||
- Fallback OCR-seul (docTR) si OmniParser indisponible ou KO.
|
||||
- Healthcheck OmniParser au démarrage : KO ⇒ bascule auto en dégradé.
|
||||
- Cache disque ``data/cache/omniparser/<session>/<index>.json``.
|
||||
- Cap 10 écrans distincts par session.
|
||||
- Aucun import de FastAPI, aucun appel réseau direct.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
try: # pragma: no cover - dépendance externe déjà présente dans le projet
|
||||
import yaml
|
||||
except ImportError as exc: # pragma: no cover
|
||||
raise RuntimeError("PyYAML est requis pour core.semantic.phase25_analyzer") from exc
|
||||
|
||||
try: # PIL toujours présent côté Linux dev / DGX
|
||||
from PIL import Image
|
||||
_HAS_PIL = True
|
||||
except ImportError: # pragma: no cover
|
||||
Image = None # type: ignore[assignment]
|
||||
_HAS_PIL = False
|
||||
|
||||
try:
|
||||
import imagehash # type: ignore
|
||||
_HAS_IMAGEHASH = True
|
||||
except ImportError: # pragma: no cover - fallback MD5 thumbnail
|
||||
imagehash = None # type: ignore[assignment]
|
||||
_HAS_IMAGEHASH = False
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Constantes et chemins
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[2]
DATA_ROOT = REPO_ROOT / "data"
SEMANTIC_DIR = DATA_ROOT / "competences" / "candidate"
OMNIPARSER_CACHE_ROOT = DATA_ROOT / "cache" / "omniparser"
OMNIPARSER_CACHE_DIR = OMNIPARSER_CACHE_ROOT  # public alias
LOGS_DIR = REPO_ROOT / "logs"
OMNIPARSER_ERROR_LOG = LOGS_DIR / "omniparser_errors.log"

# Perceptual grouping heuristic (see specs §3).
PHASH_HAMMING_THRESHOLD = 8
MAX_SCREENS_PER_SESSION = 10
THUMBNAIL_SIZE = (256, 256)  # thumbnail size used by the MD5 fallback hash

# Timeout per screenshot (see specs §2).
OMNIPARSER_TIMEOUT_SEC = 30.0

# Allowed slug (same pattern as the persist module: a-z0-9_).
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")
# Allowed session_id: harmless characters only.
SESSION_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_\-]{0,127}$")
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Dataclasses
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class SemanticStructure:
    """Semantic structure of one screen (see specs §2).

    Each attribute is a list of plain dicts describing one category of
    detected element; all four default to an empty list.
    """

    tables: List[dict] = field(default_factory=list)
    forms: List[dict] = field(default_factory=list)
    buttons: List[dict] = field(default_factory=list)
    text_blocks: List[dict] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the four categories as a dict of shallow list copies."""
        return {
            "tables": list(self.tables),
            "forms": list(self.forms),
            "buttons": list(self.buttons),
            "text_blocks": list(self.text_blocks),
        }
|
||||
|
||||
|
||||
@dataclass
class ScreenAnalysis:
    """Analysis of one representative screen (see specs §3)."""

    index: int
    phash: str
    screen_id: str
    screenshot_path: Optional[str]
    structure: "SemanticStructure"
    degraded: bool = False
    degraded_reason: Optional[str] = None
    elapsed_sec: float = 0.0
    window_title: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialise the analysis to a plain dict.

        Besides the nested ``structure``, the result carries a flattened
        ``elements`` list (the "Codex contract" snapshot intended for the
        agent-chat / dashboard), computed on the fly from the structure.
        Note that ``phash`` is exported under the ``"hash"`` key and that
        ``elapsed_sec`` is rounded to 3 decimals.
        """
        elements = _structure_to_elements(self.structure)
        return {
            "index": self.index,
            "hash": self.phash,
            "screen_id": self.screen_id,
            "window_title": self.window_title,
            "screenshot_path": self.screenshot_path,
            "structure": self.structure.to_dict(),
            "elements": elements,
            "degraded": self.degraded,
            "degraded_reason": self.degraded_reason,
            "elapsed_sec": round(self.elapsed_sec, 3),
        }
|
||||
|
||||
|
||||
@dataclass
class Phase25Result:
    """Overall result of one Phase 2.5 analysis run."""

    session_id: str
    generated_at: str
    omniparser_available: bool
    degraded: bool
    too_complex: bool
    screens: List["ScreenAnalysis"] = field(default_factory=list)
    healthcheck_passed: bool = True
    healthcheck_reason: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialise the result, delegating each screen to its ``to_dict()``."""
        return {
            "session_id": self.session_id,
            "generated_at": self.generated_at,
            "omniparser_available": self.omniparser_available,
            "degraded": self.degraded,
            "too_complex": self.too_complex,
            "healthcheck_passed": self.healthcheck_passed,
            "healthcheck_reason": self.healthcheck_reason,
            "screens": [s.to_dict() for s in self.screens],
        }
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Helpers : validation et FS
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _validate_session_id(session_id: Any) -> str:
    """Return the stripped ``session_id`` or raise ``ValueError``.

    The value must be a non-empty string matching ``SESSION_ID_PATTERN``.

    Raises:
        ValueError: if the value is not a string, is blank, does not match
            the pattern, or contains a path-traversal fragment.
    """
    if not isinstance(session_id, str) or not session_id.strip():
        raise ValueError("session_id doit etre une chaine non vide")
    sid = session_id.strip()
    if not SESSION_ID_PATTERN.match(sid):
        raise ValueError(
            "session_id invalide (autorise : [A-Za-z0-9][A-Za-z0-9_-]{0,127})"
        )
    # Belt-and-braces path-traversal guard: explicitly refuse "..", "/" and
    # "\" even though the regex above should already reject them.
    if ".." in sid or "/" in sid or "\\" in sid:
        raise ValueError("session_id invalide (path-traversal interdit)")
    return sid
|
||||
|
||||
|
||||
def _validate_slug(slug: Any) -> str:
    """Return the stripped ``slug`` or raise ``ValueError``.

    Raises:
        ValueError: if ``slug`` is not a string or does not match
            ``SLUG_PATTERN`` once stripped.
    """
    if not isinstance(slug, str):
        raise ValueError("slug doit etre une chaine")
    s = slug.strip()
    if not SLUG_PATTERN.match(s):
        raise ValueError(
            f"slug invalide '{s}' (regle : {SLUG_PATTERN.pattern})"
        )
    return s
|
||||
|
||||
|
||||
def _ensure_dir(path: Path) -> None:
    """Create ``path`` and any missing parents; do nothing if it exists."""
    path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _log_omniparser_error(session_id: str, frame_index: int, exc: BaseException) -> None:
    """Append one JSON line to ``logs/omniparser_errors.log`` (see specs §7).

    The entry records a UTC timestamp, the session id, the frame index and
    the exception type, message and ``format_exception_only`` output.
    Logging is best-effort: an ``OSError`` while writing is reported through
    the module logger and not propagated.
    """
    try:
        _ensure_dir(LOGS_DIR)
        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "session_id": session_id,
            "frame_index": frame_index,
            "error_type": type(exc).__name__,
            "error_message": str(exc),
            "traceback": traceback.format_exception_only(type(exc), exc),
        }
        with OMNIPARSER_ERROR_LOG.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(entry, ensure_ascii=False) + "\n")
    except OSError as log_exc:  # pragma: no cover - best-effort logging
        logger.warning("[PHASE25] echec ecriture omniparser_errors.log : %s", log_exc)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Hash perceptuel (avec fallback MD5)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_phash(image: "Image.Image") -> str:
    """Return a perceptual hash of ``image``, or an MD5 thumbnail hash.

    When ``imagehash`` is available its ``phash`` is returned as a string.
    Otherwise (or if ``imagehash`` raises) the image is copied, reduced to
    ``THUMBNAIL_SIZE``, encoded as RGB PNG and hashed with MD5; that result
    is prefixed with ``"md5:"`` so callers can tell the two kinds apart.
    """
    if _HAS_IMAGEHASH and imagehash is not None:
        try:
            return str(imagehash.phash(image))
        except Exception as exc:  # pragma: no cover
            logger.warning("[PHASE25] phash imagehash KO, fallback MD5 : %s", exc)
    # MD5 fallback computed on a thumbnail copy (the input is not modified).
    thumb = image.copy()
    thumb.thumbnail(THUMBNAIL_SIZE)
    buf = io.BytesIO()
    thumb.convert("RGB").save(buf, format="PNG")
    return "md5:" + hashlib.md5(buf.getvalue()).hexdigest()
|
||||
|
||||
|
||||
def _hamming_distance(h1: str, h2: str) -> int:
    """Return the Hamming distance between two hashes from ``compute_phash``.

    - MD5 hashes (``md5:`` prefix on either side): 0 when the two strings
      are equal, otherwise ``PHASH_HAMMING_THRESHOLD + 1`` so they are never
      treated as similar (conservative heuristic).
    - ``imagehash`` available: both strings are converted back with
      ``imagehash.hex_to_hash`` and subtracted.
    - ``imagehash`` missing but hex strings given: bit count of the XOR.

    Any conversion failure yields ``PHASH_HAMMING_THRESHOLD + 1``.
    """
    if h1.startswith("md5:") or h2.startswith("md5:"):
        return 0 if h1 == h2 else PHASH_HAMMING_THRESHOLD + 1
    if not _HAS_IMAGEHASH or imagehash is None:
        # No imagehash but hex hashes present (rare): raw XOR bit count.
        try:
            return bin(int(h1, 16) ^ int(h2, 16)).count("1")
        except ValueError:
            return PHASH_HAMMING_THRESHOLD + 1
    try:
        # int() normalises whatever numeric type the subtraction yields.
        return int(abs(imagehash.hex_to_hash(h1) - imagehash.hex_to_hash(h2)))
    except Exception:
        return PHASH_HAMMING_THRESHOLD + 1
|
||||
|
||||
|
||||
def identify_distinct_screens(
    frames: Sequence[Tuple[int, "Image.Image"]],
    threshold: int = PHASH_HAMMING_THRESHOLD,
) -> List[Tuple[int, "Image.Image", str]]:
    """Group frames by hash similarity and return one representative per group.

    Args:
        frames: sequence of ``(frame_index, PIL.Image)``.
        threshold: maximum Hamming distance for two frames to be considered
            the same screen.

    Returns:
        List of ``(frame_index, image, phash)``, one per group, in order of
        first appearance (the first frame seen represents its group).
    """
    representatives: List[Tuple[int, "Image.Image", str]] = []
    for idx, img in frames:
        h = compute_phash(img)
        for ridx, _rimg, rhash in representatives:
            if _hamming_distance(h, rhash) <= threshold:
                logger.debug(
                    "[PHASE25] frame %d regroupee avec representant %d (phash=%s)",
                    idx, ridx, h,
                )
                break
        else:
            # No existing representative is close enough: start a new group.
            representatives.append((idx, img, h))
    return representatives
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Conversion structure ⇄ "elements" (contrat Codex)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _structure_to_elements(struct: "SemanticStructure") -> List[dict]:
    """Flatten a structure into ``{kind, label, bbox, confidence}`` dicts.

    Elements are emitted in the order tables, forms (kind ``"field"``),
    buttons, text blocks. A missing label defaults to the kind name, except
    for text blocks, which fall back to their ``"text"`` value. A missing or
    ``None`` confidence defaults to 0.5; a missing bbox to ``[]``.
    """

    def _confidence(item: dict) -> float:
        # ``None`` (e.g. from a hand-edited cache) must not crash float().
        raw = item.get("confidence", 0.5)
        return 0.5 if raw is None else float(raw)

    elements: List[dict] = []
    groups = (
        ("table", struct.tables),
        ("field", struct.forms),
        ("button", struct.buttons),
    )
    for kind, items in groups:
        for item in items:
            elements.append({
                "kind": kind,
                "label": item.get("label", kind),
                "bbox": item.get("bbox", []),
                "confidence": _confidence(item),
            })
    for tb in struct.text_blocks:
        elements.append({
            "kind": "text_block",
            "label": tb.get("label", tb.get("text", "")),
            "bbox": tb.get("bbox", []),
            "confidence": _confidence(tb),
        })
    return elements
|
||||
|
||||
|
||||
def _classify_element(label: str, kind_hint: Optional[str] = None) -> str:
    """Heuristically classify an OmniParser element.

    Returns one of the semantic categories
    ``table | field | button | text_block``.

    Args:
        label: text label of the element (``None``/empty is accepted).
        kind_hint: optional element type reported by the detector. It is
            checked first; any non-string value is converted with ``str()``.

    The label keywords are matched as substrings, except ``"ok"``, which
    must be a standalone token: as a substring it also matched ordinary
    words such as "Facebook", "token" or "Notebook".
    """
    lab = (label or "").lower()
    if kind_hint:
        kh = str(kind_hint).lower()
        if "table" in kh:
            return "table"
        if "input" in kh or "field" in kh or "edit" in kh:
            return "field"
        if "button" in kh or "btn" in kh:
            return "button"
    # Tokens are runs of ASCII letters/digits, so "btn_ok" and "Ok!" both
    # contain the token "ok" while "token" does not.
    tokens = set(re.split(r"[^a-z0-9]+", lab))
    if "ok" in tokens or any(
        kw in lab for kw in ("button", "btn", "submit", "valider", "annuler", "close")
    ):
        return "button"
    if any(kw in lab for kw in ("input", "field", "saisie", "textbox", "champ")):
        return "field"
    if "table" in lab or "grille" in lab:
        return "table"
    return "text_block"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Adapter wrappers : OmniParser et docTR (fallback)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _OmniParserSafeWrapper:
    """Guarded wrapper around the fragile ``OmniParserAdapter``.

    - The adapter is imported lazily so that importing this module does not
      fail when OmniParser is not installed.
    - ``available`` is ``False`` when the import, the construction or the
      adapter's own ``available`` flag fails; callers then switch to the
      OCR-only fallback.
    - Each ``detect`` call is bounded by a timeout, implemented with a
      ``ThreadPoolExecutor`` and ``future.result(timeout=...)``.
    """

    # Shared, lazily created executor so that a pool is not built per call.
    _TIMEOUT_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None

    @classmethod
    def _get_executor(cls) -> concurrent.futures.ThreadPoolExecutor:
        """Return the shared executor, creating it on first use."""
        if cls._TIMEOUT_EXECUTOR is None:
            cls._TIMEOUT_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
                max_workers=2, thread_name_prefix="phase25-omniparser-timeout",
            )
        return cls._TIMEOUT_EXECUTOR

    def __init__(self) -> None:
        self._adapter: Any = None
        self._available: bool = False
        self._import_error: Optional[str] = None
        self._try_import()

    def _try_import(self) -> None:
        """Import and instantiate the adapter, recording any failure."""
        try:
            from core.detection.omniparser_adapter import OmniParserAdapter  # type: ignore
            self._adapter = OmniParserAdapter()
            self._available = bool(getattr(self._adapter, "available", False))
            if not self._available:
                # The adapter exists but its own availability check failed.
                self._import_error = "OmniParser adapter installé mais modèles non disponibles"
        except Exception as exc:
            self._adapter = None
            self._available = False
            self._import_error = f"{type(exc).__name__}: {exc}"

    @property
    def available(self) -> bool:
        """Whether the adapter was loaded and reports itself available."""
        return self._available

    @property
    def import_error(self) -> Optional[str]:
        """Reason the adapter is unavailable, or ``None``."""
        return self._import_error

    def detect(
        self,
        image: "Image.Image",
        *,
        timeout: Optional[float] = None,
    ) -> List[Any]:
        """Run ``adapter.detect(image)`` under a hard timeout.

        Args:
            image: PIL image to analyse.
            timeout: timeout in seconds (default ``OMNIPARSER_TIMEOUT_SEC``).

        Returns:
            The adapter's result as a list, or ``[]`` when the adapter is
            unavailable.

        Raises:
            concurrent.futures.TimeoutError: the call exceeded the timeout.
            Exception: anything raised by the adapter is logged and
                re-raised so the caller can log it and fall back.
        """
        if not self._available or self._adapter is None:
            return []
        effective_timeout = (
            timeout if timeout is not None else OMNIPARSER_TIMEOUT_SEC
        )
        future = self._get_executor().submit(self._adapter.detect, image)
        try:
            return list(future.result(timeout=effective_timeout))
        except concurrent.futures.TimeoutError:
            # cancel() only succeeds while the call is still queued (e.g.
            # behind hung workers); it then never runs. A call that already
            # started keeps running in the background and its result is
            # ignored.
            future.cancel()
            logger.warning(
                "[PHASE25] OmniParser.detect timeout (%.1fs) -> fallback",
                effective_timeout,
            )
            raise
        except Exception as exc:
            logger.warning("[PHASE25] OmniParser.detect KO : %s", exc)
            raise  # propagated to the caller for logging + fallback
|
||||
|
||||
|
||||
def _detect_via_omniparser(
    wrapper: "_OmniParserSafeWrapper",
    image: "Image.Image",
    *,
    timeout: Optional[float] = None,
) -> List[Any]:
    """Delegate to ``wrapper.detect(image, timeout=timeout)``."""
    return wrapper.detect(image, timeout=timeout)
|
||||
|
||||
|
||||
def _detect_via_doctr(image: "Image.Image", screenshot_path: Optional[str]) -> List[dict]:
    """OCR-only fallback (docTR): return raw text blocks for one image.

    No VLM and no fine classification: every non-empty OCR line becomes one
    ``{"label", "text", "bbox", "confidence"}`` dict.

    Args:
        image: PIL image; its size is used to scale docTR's normalised
            geometry to pixels, and its pixels are used when no readable
            ``screenshot_path`` is given.
        screenshot_path: file path fed to docTR in preference to ``image``
            when it exists.

    Returns:
        List of text-block dicts. ``[]`` when PIL or docTR is missing, when
        ``image`` is ``None``, or when docTR initialisation/prediction fails.
    """
    if not _HAS_PIL or image is None:
        return []
    try:
        from doctr.io import DocumentFile  # type: ignore
        from doctr.models import ocr_predictor  # type: ignore
    except ImportError:
        logger.info("[PHASE25] docTR non disponible pour fallback OCR")
        return []

    # Module-level predictor cache so the model is loaded only once.
    # ``globals().get`` covers the first call, when the name does not exist.
    global _DOCTR_PREDICTOR
    predictor = globals().get("_DOCTR_PREDICTOR")
    if predictor is None:
        try:
            predictor = ocr_predictor(
                det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True,
            )
        except Exception as exc:  # pragma: no cover
            logger.warning("[PHASE25] docTR init KO : %s", exc)
            return []
        _DOCTR_PREDICTOR = predictor

    # docTR takes a file or raw image bytes; prefer the path when provided.
    blocks: List[dict] = []
    try:
        if screenshot_path and Path(screenshot_path).exists():
            doc = DocumentFile.from_images([screenshot_path])
        else:
            buf = io.BytesIO()
            image.convert("RGB").save(buf, format="PNG")
            doc = DocumentFile.from_images([buf.getvalue()])
        result = predictor(doc)
        # NOTE(review): scaling uses image.size even when the document was
        # loaded from screenshot_path — assumes both are the same picture.
        W, H = image.size
        for page in result.pages:
            for block in page.blocks:
                for line_obj in block.lines:
                    text = " ".join(w.value for w in line_obj.words).strip()
                    if not text:
                        continue
                    geom = line_obj.geometry  # ((x1, y1), (x2, y2)) normalised 0-1
                    x1 = int(geom[0][0] * W)
                    y1 = int(geom[0][1] * H)
                    x2 = int(geom[1][0] * W)
                    y2 = int(geom[1][1] * H)
                    blocks.append({
                        "label": text,
                        "text": text,
                        "bbox": [x1, y1, x2, y2],
                        # Fixed value: no line-level score is read from docTR here.
                        "confidence": 0.6,
                    })
    except Exception as exc:  # pragma: no cover
        logger.warning("[PHASE25] docTR predict KO : %s", exc)
        return []

    return blocks
|
||||
|
||||
|
||||
def _elements_to_structure(elements: Iterable[Any]) -> "SemanticStructure":
    """Convert detector output into a ``SemanticStructure``.

    Each element may be an object exposing ``label`` / ``bbox`` /
    ``confidence`` / ``element_type`` attributes (e.g. OmniParser's
    ``DetectedElement``) or a dict with ``label``|``text``, ``bbox``,
    ``confidence``|``score`` and ``element_type``|``type`` keys. Anything
    else is skipped. Elements are dispatched by ``_classify_element``; text
    blocks additionally get a ``"text"`` key equal to the label.

    A missing or ``None`` confidence defaults to 0.5. A real 0.0 is kept:
    the previous ``value or 0.5`` idiom silently turned it into 0.5.
    """

    def _as_confidence(raw: Any) -> float:
        return 0.5 if raw is None else float(raw)

    struct = SemanticStructure()
    for el in elements:
        # Works with DetectedElement-like objects and with plain dicts.
        if hasattr(el, "label"):
            label = getattr(el, "label", "") or ""
            bbox = list(getattr(el, "bbox", ()) or ())
            conf = _as_confidence(getattr(el, "confidence", 0.5))
            kind_hint = getattr(el, "element_type", None)
        elif isinstance(el, dict):
            label = str(el.get("label") or el.get("text") or "")
            bbox = list(el.get("bbox") or [])
            conf = _as_confidence(el.get("confidence", el.get("score", 0.5)))
            kind_hint = el.get("element_type") or el.get("type")
        else:
            continue

        kind = _classify_element(label, kind_hint)
        entry = {"label": label, "bbox": bbox, "confidence": conf}
        if kind == "table":
            struct.tables.append(entry)
        elif kind == "field":
            struct.forms.append(entry)
        elif kind == "button":
            struct.buttons.append(entry)
        else:
            struct.text_blocks.append({**entry, "text": label})
    return struct
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Cache disque
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _cache_path(session_id: str, frame_index: int) -> Path:
    """Return ``<cache root>/<session_id>/<frame_index>.json``.

    Raises:
        ValueError: if ``session_id`` is rejected by ``_validate_session_id``
            or ``frame_index`` cannot be converted with ``int()``.
    """
    sid = _validate_session_id(session_id)
    return OMNIPARSER_CACHE_ROOT / sid / f"{int(frame_index)}.json"
|
||||
|
||||
|
||||
def _cache_read(session_id: str, frame_index: int) -> Optional[dict]:
    """Return the cached payload for a frame, or ``None``.

    ``None`` is returned when the cache file does not exist, cannot be read
    or decoded, or does not contain a JSON object. Unreadable files are
    reported with a warning and never raise.
    """
    path = _cache_path(session_id, frame_index)
    if not path.exists():
        return None
    try:
        with path.open("r", encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("[PHASE25] cache illisible %s : %s", path, exc)
        return None
    if not isinstance(data, dict):
        # Callers use ``.get`` on the payload; any other JSON root is corrupt.
        logger.warning(
            "[PHASE25] cache illisible %s : %s", path, "racine JSON non-objet",
        )
        return None
    return data
|
||||
|
||||
|
||||
def _cache_write(session_id: str, frame_index: int, payload: dict) -> None:
    """Best-effort write of ``payload`` to the frame's cache file.

    The payload is dumped to a sibling ``.json.tmp`` file which then
    replaces the final path. I/O errors and payloads that ``json.dump``
    cannot serialise are logged and swallowed; the temporary file is removed
    on failure so no partial file is left behind.
    """
    path = _cache_path(session_id, frame_index)
    tmp = path.with_suffix(".json.tmp")
    try:
        _ensure_dir(path.parent)
        with tmp.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
        tmp.replace(path)
    except (OSError, TypeError, ValueError) as exc:  # pragma: no cover
        # TypeError/ValueError: non-serialisable payload; the cache is
        # best-effort and must not abort the analysis.
        logger.warning("[PHASE25] cache ecriture KO %s : %s", path, exc)
        try:
            tmp.unlink()
        except OSError:
            pass  # nothing to clean up, or cleanup itself failed
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Analyseur principal
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Phase25Analyzer:
|
||||
"""Analyseur sémantique post-apprentissage.
|
||||
|
||||
Usage minimal :
|
||||
|
||||
analyzer = Phase25Analyzer(session_id="abc123")
|
||||
result = analyzer.analyze_frames(frames=[(0, img0), (12, img12), ...])
|
||||
path = analyzer.write_semantic_yaml(result, slug="ma_competence")
|
||||
|
||||
``frames`` est une séquence ``(frame_index, PIL.Image[, screenshot_path])``.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    session_id: str,
    *,
    omniparser: Optional["_OmniParserSafeWrapper"] = None,
    max_screens: int = MAX_SCREENS_PER_SESSION,
    timeout_sec: float = OMNIPARSER_TIMEOUT_SEC,
) -> None:
    """Create an analyzer bound to one session.

    Args:
        session_id: session identifier, validated by
            ``_validate_session_id``.
        omniparser: wrapper to use; a new ``_OmniParserSafeWrapper`` is
            built when ``None``.
        max_screens: cap on the number of distinct screens.
        timeout_sec: timeout in seconds passed to each OmniParser call.

    Raises:
        ValueError: if ``session_id`` is invalid.
    """
    self.session_id = _validate_session_id(session_id)
    self.omniparser = omniparser if omniparser is not None else _OmniParserSafeWrapper()
    self.max_screens = max_screens
    self.timeout_sec = timeout_sec
    # The healthcheck is considered passed until healthcheck() says otherwise.
    self._healthcheck_passed = True
    self._healthcheck_reason: Optional[str] = None
|
||||
|
||||
# -- healthcheck -------------------------------------------------------
|
||||
|
||||
def healthcheck(self) -> bool:
    """Check that OmniParser answers on a dummy image (see specs §7).

    The outcome is stored in ``_healthcheck_passed`` /
    ``_healthcheck_reason`` and also returned.

    - PIL missing: fails with reason ``"PIL indisponible"``.
    - Wrapper not ``available``: fails with the wrapper's import error
      (analysis can still proceed in degraded OCR-only mode).
    - ``detect`` raises on a 64x64 white image: fails, and the exception is
      written to the dedicated error log with frame index ``-1``.
    """
    if not _HAS_PIL:
        self._healthcheck_passed = False
        self._healthcheck_reason = "PIL indisponible"
        return False
    if not self.omniparser.available:
        self._healthcheck_passed = False
        self._healthcheck_reason = (
            self.omniparser.import_error or "OmniParser indisponible"
        )
        return False
    try:
        dummy = Image.new("RGB", (64, 64), color=(255, 255, 255))
        _ = self.omniparser.detect(dummy, timeout=self.timeout_sec)
        self._healthcheck_passed = True
        self._healthcheck_reason = None
        return True
    except Exception as exc:
        _log_omniparser_error(self.session_id, -1, exc)
        self._healthcheck_passed = False
        self._healthcheck_reason = f"{type(exc).__name__}: {exc}"
        return False
|
||||
|
||||
# -- analyse écran ----------------------------------------------------
|
||||
|
||||
def analyze_screen(
    self,
    frame_index: int,
    image: "Image.Image",
    phash: str,
    *,
    screenshot_path: Optional[str] = None,
    window_title: Optional[str] = None,
    force_fallback: bool = False,
) -> "ScreenAnalysis":
    """Analyse one representative screen.

    Strategy:
    1. Disk cache keyed by session id + frame index; a hit is returned
       without running any detection.
    2. OmniParser through the safe wrapper, unless it is unavailable or
       ``force_fallback`` is set, in which case docTR OCR-only is used and
       the result is flagged ``degraded``.
    3. If OmniParser raises: dedicated error log, ``degraded=True`` and a
       docTR-only structure.

    If OmniParser succeeds but yields no element at all, docTR text blocks
    are added to the (non-degraded) structure. The fresh analysis is written
    to the cache (best-effort) before being returned.

    Args:
        frame_index: index of the frame; also used to build ``screen_id``.
        image: PIL image of the screen.
        phash: hash of the image as computed by the caller.
        screenshot_path: optional path of the screenshot on disk.
        window_title: optional window title recorded in the result.
        force_fallback: skip OmniParser and go straight to docTR.
    """
    # 1. Cache
    cached = _cache_read(self.session_id, frame_index)
    if cached is not None:
        # ``or`` guards: tolerate null values in a hand-edited/corrupt entry.
        cached_struct = cached.get("structure") or {}
        struct = SemanticStructure(
            tables=cached_struct.get("tables") or [],
            forms=cached_struct.get("forms") or [],
            buttons=cached_struct.get("buttons") or [],
            text_blocks=cached_struct.get("text_blocks") or [],
        )
        # The cache stores ScreenAnalysis.to_dict(), where phash is under
        # "hash"; with no "phash" key this falls back to the caller's phash.
        return ScreenAnalysis(
            index=frame_index,
            phash=cached.get("phash", phash),
            screen_id=cached.get("screen_id", f"screen_{frame_index:03d}"),
            screenshot_path=cached.get("screenshot_path", screenshot_path),
            structure=struct,
            degraded=bool(cached.get("degraded", False)),
            degraded_reason=cached.get("degraded_reason"),
            elapsed_sec=float(cached.get("elapsed_sec", 0.0)),
            window_title=cached.get("window_title", window_title),
        )

    t0 = time.monotonic()
    degraded = False
    degraded_reason: Optional[str] = None

    use_omniparser = self.omniparser.available and not force_fallback
    if use_omniparser:
        try:
            elements = _detect_via_omniparser(
                self.omniparser, image, timeout=self.timeout_sec,
            )
            structure = _elements_to_structure(elements)
            if not (structure.tables or structure.forms or structure.buttons or structure.text_blocks):
                # OmniParser produced nothing: complement with docTR text blocks.
                blocks = _detect_via_doctr(image, screenshot_path)
                structure.text_blocks.extend(blocks)
        except Exception as exc:
            _log_omniparser_error(self.session_id, frame_index, exc)
            degraded = True
            degraded_reason = f"omniparser_exception: {type(exc).__name__}"
            blocks = _detect_via_doctr(image, screenshot_path)
            structure = SemanticStructure(text_blocks=blocks)
    else:
        degraded = True
        if not self.omniparser.available:
            degraded_reason = "omniparser_unavailable: " + (
                self.omniparser.import_error or "n/a"
            )
        else:
            degraded_reason = "forced_fallback"
        blocks = _detect_via_doctr(image, screenshot_path)
        structure = SemanticStructure(text_blocks=blocks)

    elapsed = time.monotonic() - t0
    analysis = ScreenAnalysis(
        index=frame_index,
        phash=phash,
        screen_id=f"screen_{frame_index:03d}",
        screenshot_path=screenshot_path,
        structure=structure,
        degraded=degraded,
        degraded_reason=degraded_reason,
        elapsed_sec=elapsed,
        window_title=window_title,
    )

    # Cache write (best-effort).
    _cache_write(self.session_id, frame_index, analysis.to_dict())
    return analysis
|
||||
|
||||
# -- pipeline complet -------------------------------------------------
|
||||
|
||||
def analyze_frames(
    self,
    frames: Sequence[Tuple[int, "Image.Image"]],
    *,
    screenshot_paths: Optional[dict[int, str]] = None,
    window_titles: Optional[dict[int, str]] = None,
    run_healthcheck: bool = True,
) -> Phase25Result:
    """Run the full pipeline: healthcheck, screen grouping, cap, analysis.

    The frames are reduced to one representative per distinct screen by
    ``identify_distinct_screens``; at most ``self.max_screens``
    representatives are then passed, one by one, to ``analyze_screen``.

    Args:
        frames: sequence of ``(frame_index, PIL.Image)`` pairs.
        screenshot_paths: optional mapping ``frame_index -> path``.
        window_titles: optional mapping ``frame_index -> window_title``.
        run_healthcheck: when true, ``self.healthcheck()`` is called before
            the analysis and a warning is logged if it did not pass.

    Returns:
        A ``Phase25Result``. ``too_complex`` is true when more distinct
        screens than ``self.max_screens`` were found; only the first
        ``self.max_screens`` representatives are analysed in that case.

    Raises:
        RuntimeError: PIL is not available.
    """
    if not _HAS_PIL:
        raise RuntimeError("PIL est requis pour Phase25Analyzer.analyze_frames")

    if run_healthcheck:
        self.healthcheck()
    if run_healthcheck and not self._healthcheck_passed:
        logger.warning(
            "[PHASE25] healthcheck OmniParser KO (%s) -> mode degrade docTR",
            self._healthcheck_reason,
        )

    # A failed healthcheck forces every screen onto the fallback path.
    fallback_only = not self._healthcheck_passed

    # Step 1: one representative (index, image, phash) per distinct screen.
    representatives = identify_distinct_screens(frames)

    # Step 2: enforce the per-session cap on distinct screens.
    too_complex = len(representatives) > self.max_screens
    if too_complex:
        logger.warning(
            "[PHASE25] session %s : %d ecrans distincts > cap %d -> too_complex",
            self.session_id, len(representatives), self.max_screens,
        )
        representatives = representatives[: self.max_screens]

    # Step 3: analyse each kept representative, in order.
    paths_by_frame = screenshot_paths or {}
    titles_by_frame = window_titles or {}
    analysed: List[ScreenAnalysis] = []
    degraded_seen = False
    for frame_idx, frame_img, frame_hash in representatives:
        screen = self.analyze_screen(
            frame_idx,
            frame_img,
            frame_hash,
            screenshot_path=paths_by_frame.get(frame_idx),
            window_title=titles_by_frame.get(frame_idx),
            force_fallback=fallback_only,
        )
        analysed.append(screen)
        # Same semantics as ``degraded_seen = degraded_seen or screen.degraded``.
        if not degraded_seen:
            degraded_seen = screen.degraded

    return Phase25Result(
        session_id=self.session_id,
        generated_at=datetime.now(timezone.utc).isoformat(),
        omniparser_available=self.omniparser.available and self._healthcheck_passed,
        degraded=degraded_seen or not self._healthcheck_passed,
        too_complex=too_complex,
        screens=analysed,
        healthcheck_passed=self._healthcheck_passed,
        healthcheck_reason=self._healthcheck_reason,
    )
|
||||
|
||||
# -- écriture YAML -----------------------------------------------------
|
||||
|
||||
def write_semantic_yaml(
    self,
    result: Phase25Result,
    slug: str,
    *,
    target_dir: Optional[Path] = None,
) -> Path:
    """Write the ``<slug>.semantic.yaml`` file for an analysis result.

    The payload is first dumped to a sibling ``.tmp`` file and then renamed
    onto the final name, so a partially written file never replaces an
    existing one.

    Args:
        result: Phase 2.5 analysis result to serialize.
        slug: competence slug; it is passed through ``_validate_slug`` and
            the validated value is used for the file name and
            ``competence_id``.
        target_dir: output directory (defaults to ``SEMANTIC_DIR``).

    Returns:
        Path of the written file (``target_dir / "<slug>.semantic.yaml"``).

    Raises:
        ValueError: the directory is named ``supervised`` or ``stable``
            (errors raised by ``_validate_slug`` also propagate —
            presumably ``ValueError`` for an invalid slug; confirm there).
        OSError: the file could not be written.
    """
    s = _validate_slug(slug)
    out_dir = Path(target_dir if target_dir is not None else SEMANTIC_DIR)

    # Refuse to write into supervised/stable. The check runs BEFORE any
    # directory is created so a rejected call leaves no trace on disk, and
    # it also looks at the resolved path so that ``stable/x/..`` or a
    # symlink cannot slip past the name comparison.
    forbidden = {"supervised", "stable"}
    for dir_name in (out_dir.name, out_dir.resolve().name):
        if dir_name in forbidden:
            raise ValueError(
                f"target_dir interdit '{dir_name}' (autorise : candidate uniquement)"
            )
    _ensure_dir(out_dir)

    payload = {
        "competence_id": s,
        "semantic_version": 1,
        "generated_at": result.generated_at,
        "session_id": result.session_id,
        "omniparser_available": result.omniparser_available,
        "degraded": result.degraded,
        "too_complex": result.too_complex,
        "healthcheck_passed": result.healthcheck_passed,
        "healthcheck_reason": result.healthcheck_reason,
        "screens": [
            {
                "screen_id": sc.screen_id,
                "phash": sc.phash,
                "representative_frame_index": sc.index,
                "screenshot_path": sc.screenshot_path,
                "window_title": sc.window_title,
                "degraded": sc.degraded,
                "degraded_reason": sc.degraded_reason,
                "elapsed_sec": round(sc.elapsed_sec, 3),
                "structure": sc.structure.to_dict(),
                "annotations": [],  # placeholder for later human annotation
            }
            for sc in result.screens
        ],
    }

    target = out_dir / f"{s}.semantic.yaml"
    # ``with_suffix`` swaps the trailing ".yaml" -> "<slug>.semantic.yaml.tmp".
    tmp = target.with_suffix(".yaml.tmp")
    try:
        with tmp.open("w", encoding="utf-8") as fh:
            yaml.safe_dump(payload, fh, allow_unicode=True, sort_keys=False)
        tmp.replace(target)
    except Exception:
        # Do not leave a stale temp file behind; cleanup is best-effort and
        # must never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

    logger.info(
        "[PHASE25] semantic yaml ecrit : %s (screens=%d, degraded=%s)",
        target, len(result.screens), result.degraded,
    )
    return target
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Helpers utilitaires (chargement frames)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_frames_from_paths(paths_by_index: dict[int, str]) -> List[Tuple[int, "Image.Image"]]:
    """Load PIL images from a ``frame_index -> path`` mapping.

    Keys are processed in ascending order. A path that cannot be opened or
    decoded (``OSError``, which includes ``FileNotFoundError``) is skipped
    after logging a warning; it does not abort the whole load.

    Args:
        paths_by_index: mapping ``frame_index -> image path``.

    Returns:
        List of ``(frame_index, image)`` pairs for the readable frames, with
        the pixel data already loaded.

    Raises:
        RuntimeError: PIL is not available.
    """
    if not _HAS_PIL:
        raise RuntimeError("PIL est requis pour load_frames_from_paths")

    frames: List[Tuple[int, "Image.Image"]] = []
    for idx in sorted(paths_by_index):
        p = paths_by_index[idx]
        img = None
        try:
            img = Image.open(p)
            # Force decoding now so a truncated/corrupt file fails here.
            img.load()
        except OSError as exc:
            # ``Image.open`` succeeded but decoding failed: release the file
            # handle instead of leaking it until garbage collection.
            if img is not None:
                img.close()
            # %s (not %d): keys may not be real ints, see ``int(idx)`` below.
            logger.warning("[PHASE25] frame %s illisible (%s) : %s", idx, p, exc)
            continue
        frames.append((int(idx), img))
    return frames
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = [
    # analyzer and result types
    "Phase25Analyzer",
    "Phase25Result",
    "ScreenAnalysis",
    "SemanticStructure",
    # paths and tuning constants
    "SEMANTIC_DIR",
    "OMNIPARSER_CACHE_DIR",
    "OMNIPARSER_CACHE_ROOT",
    "OMNIPARSER_ERROR_LOG",
    "PHASH_HAMMING_THRESHOLD",
    "MAX_SCREENS_PER_SESSION",
    # helper functions
    "compute_phash",
    "identify_distinct_screens",
    "load_frames_from_paths",
]
|
||||
Reference in New Issue
Block a user