feat(p1): persist workflows and semantic learning artifacts

This commit is contained in:
Dom
2026-06-02 16:20:38 +02:00
parent 7a1a5cb6fd
commit 86b3c8f7e7
21 changed files with 3816 additions and 31 deletions

518
core/competences/persist.py Normal file
View File

@@ -0,0 +1,518 @@
"""Helpers de persistance pour les competences candidates (POC Lea-first).
Couvre :
- slugification stricte (ASCII, regex ^[a-z][a-z0-9_]{2,79}$)
- detection PII (regex MVP, paramétrable)
- atomic write + rename POSIX
- append-only audit JSONL avec verrou fcntl
- detection de collision cross-states (candidate / supervised / stable)
Le module est volontairement minimal : il n'importe pas FastAPI ni le pipeline
VWB, il ne fait pas de logique reseau. Il est consomme depuis
``agent_v0/server_v1/api_stream.py`` endpoint ``/persist``.
"""
from __future__ import annotations
import json
import os
import re
import time
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Optional
try: # pragma: no cover - dependance externe deja presente dans le projet
import yaml
except ImportError as exc: # pragma: no cover
raise RuntimeError("PyYAML est requis pour core.competences.persist") from exc
try:
import fcntl # POSIX uniquement
_HAS_FCNTL = True
except ImportError: # pragma: no cover - Windows
fcntl = None # type: ignore[assignment]
_HAS_FCNTL = False
# Filesystem layout. REPO_ROOT is the third ancestor of this file's resolved
# path; every competence artifact lives under <repo>/data/competences.
REPO_ROOT = Path(__file__).resolve().parents[2]
COMPETENCES_ROOT = REPO_ROOT / "data" / "competences"
# One sub-directory per learning state.
CANDIDATE_DIR = COMPETENCES_ROOT / "candidate"
SUPERVISED_DIR = COMPETENCES_ROOT / "supervised"
STABLE_DIR = COMPETENCES_ROOT / "stable"
# Append-only JSONL files stored next to the state directories.
AUDIT_PATH = COMPETENCES_ROOT / "persist_audit.jsonl"
INCOMPLETE_PATH = COMPETENCES_ROOT / "incomplete_learnings.jsonl"
# Final pattern a competence slug must match: a lowercase letter followed by
# 2 to 79 characters taken from [a-z0-9_] (3 to 80 characters in total).
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")
# MVP PII detection. The regex list can be overridden through the
# RPA_PII_PATTERNS environment variable (patterns separated by "|").
# Defaults cover simple shapes: IPP, French NIR, e-mail, French phone number.
_DEFAULT_PII_PATTERNS = [
r"\b\d{13}\b", # French NIR (13 digits)
r"\b\d{15}\b", # French NIR + key (15 digits)
r"\bIPP[\s:_-]*\d{6,}\b", # hospital IPP: "IPP" then at least 6 digits
r"[\w\.-]+@[\w\.-]+\.\w{2,}", # e-mail address
r"\b0[1-9](?:[ .-]?\d{2}){4}\b", # French phone number
]
def _compile_pii_patterns() -> list[re.Pattern[str]]:
    """Compile the active PII regexes (environment override or defaults).

    The ``RPA_PII_PATTERNS`` environment variable, when set and non-empty, is
    split on ``"|"`` and each non-blank piece is compiled case-insensitively.
    Pieces that are not valid regexes are skipped.

    If the override does not yield a single valid pattern, the built-in
    defaults are used instead, so a misconfigured environment cannot silently
    turn PII detection off.

    NOTE(review): because ``"|"`` is the separator, an override pattern cannot
    itself contain a regex alternation.

    Returns:
        The compiled patterns, in declaration order.
    """

    def _compile_all(sources: Iterable[str]) -> list[re.Pattern[str]]:
        # Compile every non-blank source, ignoring the ones that do not parse.
        compiled: list[re.Pattern[str]] = []
        for source in sources:
            source = source.strip()
            if not source:
                continue
            try:
                compiled.append(re.compile(source, re.IGNORECASE))
            except re.error:
                continue
        return compiled

    raw = os.environ.get("RPA_PII_PATTERNS")
    if raw:
        overridden = _compile_all(raw.split("|"))
        if overridden:
            return overridden
        # Fail closed: an unusable override must not disable detection.
    return _compile_all(_DEFAULT_PII_PATTERNS)
# ----------------------------------------------------------------------------
# Slugification
# ----------------------------------------------------------------------------
def slugify(name: str) -> str:
    """Turn a free-form name into a strict ASCII slug.

    Steps, in order:
      1. NFKD normalisation, then every non-ASCII code point is dropped
         (this removes accents);
      2. lowercase; runs of whitespace, dashes, dots, slashes and
         backslashes become a single ``_``;
      3. any remaining character outside ``[a-z0-9_]`` is removed;
      4. repeated underscores are collapsed, leading/trailing ones stripped;
      5. a slug starting with a digit is prefixed with ``c_``;
      6. the slug is cut to 80 characters (trailing ``_`` stripped again).

    Raises:
        ValueError: if ``name`` is not a string, is blank, or if the final
            slug does not match ``SLUG_PATTERN``.
    """
    if not isinstance(name, str):
        raise ValueError("name doit etre une chaine non vide")
    stripped = name.strip()
    if not stripped:
        raise ValueError("name est vide")

    # Decompose accented characters, then keep only the ASCII part.
    decomposed = unicodedata.normalize("NFKD", stripped)
    folded = decomposed.encode("ascii", "ignore").decode("ascii").lower()

    slug = re.sub(r"[\s\-./\\]+", "_", folded)
    slug = re.sub(r"[^a-z0-9_]+", "", slug)
    slug = re.sub(r"_+", "_", slug).strip("_")

    # The pattern requires a leading letter; rescue digit-leading slugs.
    if slug[:1].isdigit():
        slug = f"c_{slug}"
    if len(slug) > 80:
        slug = slug[:80].rstrip("_")

    if SLUG_PATTERN.match(slug) is None:
        raise ValueError(
            f"slug invalide '{slug}' (regle : {SLUG_PATTERN.pattern})"
        )
    return slug
# ----------------------------------------------------------------------------
# Collisions cross-states
# ----------------------------------------------------------------------------
def detect_cross_state_collision(
    slug: str,
    *,
    competences_root: Path = COMPETENCES_ROOT,
) -> Optional[str]:
    """Return the state directory that already holds ``<slug>.yaml``.

    The ``candidate``, ``supervised`` and ``stable`` sub-directories of
    ``competences_root`` are probed in that order; the name of the first one
    containing the file is returned, or ``None`` when none of them does.
    """
    filename = f"{slug}.yaml"
    return next(
        (
            state
            for state in ("candidate", "supervised", "stable")
            if (competences_root / state / filename).exists()
        ),
        None,
    )
# ----------------------------------------------------------------------------
# Detection PII
# ----------------------------------------------------------------------------
def detect_pii(payload: Any) -> list[str]:
    """Return the PII regex sources that match somewhere in ``payload``.

    The payload is walked recursively: strings are tested, dict *values* and
    list/tuple items are descended into, every other type is ignored. Each
    matching pattern is reported once, in order of first match. An empty list
    means nothing was detected; what to do with a hit is left to the caller.
    """
    regexes = _compile_pii_patterns()
    if not regexes:
        return []

    def _iter_strings(node: Any):
        # Depth-first traversal yielding every string leaf in document order.
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for child in node.values():
                yield from _iter_strings(child)
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _iter_strings(child)

    hits = [
        regex.pattern
        for text in _iter_strings(payload)
        for regex in regexes
        if regex.search(text)
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(hits))
# ----------------------------------------------------------------------------
# Atomic write
# ----------------------------------------------------------------------------
def atomic_write_yaml(
    target_path: Path,
    data: dict[str, Any],
    *,
    persist_id: str,
) -> Path:
    """Write ``data`` as YAML to ``target_path`` atomically.

    1. The YAML is dumped to ``<target_dir>/.<basename>.tmp.<persist_id>``,
       flushed and fsync'ed (the fsync is best effort).
    2. The temporary file is moved over ``target_path`` with ``os.replace``,
       which overwrites an existing target on every platform (``os.rename``
       raises on Windows in that case). Collision detection is expected to
       have been done by the caller beforehand.
    3. On any failure the temporary file is removed if possible and the
       original exception is re-raised.

    Args:
        target_path: final location of the YAML file; parent directories are
            created when missing.
        data: mapping serialised with ``yaml.safe_dump`` (insertion order
            kept, block style, unicode allowed).
        persist_id: suffix making the temporary file name unique per request.

    Returns:
        ``target_path`` as a ``Path``.
    """
    target_path = Path(target_path)
    target_dir = target_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    tmp_path = target_dir / f".{target_path.name}.tmp.{persist_id}"
    try:
        with tmp_path.open("w", encoding="utf-8") as handle:
            yaml.safe_dump(
                data,
                handle,
                allow_unicode=True,
                sort_keys=False,
                default_flow_style=False,
            )
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                # Durability is best effort; some filesystems reject fsync.
                pass
        os.replace(tmp_path, target_path)
    except Exception:
        # Never leave a stray temp file behind; a missing file is fine too.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
    return target_path
# ----------------------------------------------------------------------------
# Audit append (JSONL + verrou)
# ----------------------------------------------------------------------------
def audit_append(
    entry: dict[str, Any],
    *,
    audit_path: Path = AUDIT_PATH,
) -> int:
    """Append ``entry`` as one JSON line to the audit file and return its id.

    The id is the number of lines already present in the file plus one. When
    ``fcntl`` is available, an exclusive ``flock`` is requested around the
    count-and-write sequence; without it the write is best effort.

    Side effects on ``entry``: a ``timestamp`` key (local time, ISO format,
    second precision) is added when absent, and ``audit_entry_id`` is set.
    """
    log_file = Path(audit_path)
    log_file.parent.mkdir(parents=True, exist_ok=True)
    entry.setdefault(
        "timestamp",
        datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds"),
    )
    # "a+" lets us read the existing lines while every write lands at the end.
    with open(log_file, "a+", encoding="utf-8") as stream:
        if _HAS_FCNTL:
            try:
                fcntl.flock(stream.fileno(), fcntl.LOCK_EX)  # type: ignore[union-attr]
            except OSError:
                pass
        try:
            stream.seek(0)
            existing_lines = 0
            for _ in stream:
                existing_lines += 1
            audit_entry_id = existing_lines + 1
            entry["audit_entry_id"] = audit_entry_id
            stream.write(json.dumps(entry, ensure_ascii=False) + "\n")
            stream.flush()
            try:
                os.fsync(stream.fileno())
            except OSError:
                pass
        finally:
            if _HAS_FCNTL:
                try:
                    fcntl.flock(stream.fileno(), fcntl.LOCK_UN)  # type: ignore[union-attr]
                except OSError:
                    pass
    return audit_entry_id
def find_existing_audit_entry(
    persist_id: str,
    *,
    audit_path: Path = AUDIT_PATH,
) -> Optional[dict[str, Any]]:
    """Look up an audit record by ``persist_id`` (used for idempotence).

    The audit file is scanned line by line. Blank lines, lines that are not
    valid JSON and JSON values that are not objects are skipped.

    Returns:
        The first record whose ``persist_id`` equals the argument, or ``None``
        when ``persist_id`` is empty, the file is missing or unreadable, or no
        record matches.
    """
    if not persist_id:
        return None
    audit_path = Path(audit_path)
    if not audit_path.exists():
        return None
    try:
        with audit_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A stray scalar or array line has no .get(); ignore it
                # instead of crashing the lookup.
                if not isinstance(record, dict):
                    continue
                if record.get("persist_id") == persist_id:
                    return record
    except OSError:
        return None
    return None
# ----------------------------------------------------------------------------
# YAML body construction
# ----------------------------------------------------------------------------
# Top-level keys that validate_yaml_schema() requires in a competence YAML
# document. build_competence_yaml() emits all of them.
REQUIRED_YAML_FIELDS = (
"schema_version",
"id",
"name",
"version",
"learning_state",
"intent",
"parameters",
"preconditions",
"methods",
"success_marker",
"failure_message_template",
"promotion",
"generalisation",
"failure_log",
"created_at",
"last_updated_at",
"methods_execution",
)
def build_competence_yaml(
    *,
    slug: str,
    name: str,
    workflow_ir: dict[str, Any],
    parameters: Optional[list[dict[str, Any]]],
    intent_fr: str,
    learning_state: str,
    session_id: Optional[str],
    machine_id: Optional[str],
    external_agent_id: Optional[str] = None,
) -> dict[str, Any]:
    """Assemble the competence document that will be dumped to YAML.

    Args:
        slug: competence identifier, stored under ``id``.
        name: human-readable name; also the fallback intent text.
        workflow_ir: source of ``steps``, ``preconditions``,
            ``success_marker`` and ``failure_message_template``; missing or
            falsy entries are replaced by defaults.
        parameters: parameter descriptors; only dicts with a truthy ``name``
            are kept, reduced to ``type`` / ``required`` / ``description``.
        intent_fr: French intent sentence (falls back to ``name``).
        learning_state: state recorded in the document and in the first
            promotion-history entry.
        session_id, machine_id, external_agent_id: provenance; a
            ``chain_refs`` section is added when at least one is truthy.

    Returns:
        A dict whose key order is the order the keys are written to YAML.
    """
    timestamp = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
    intent_text = intent_fr or name

    # Each dict step becomes a method; anything else is skipped, but it still
    # consumes its position in the generated "step_<n>_<kind>" ids.
    methods: list[dict[str, Any]] = []
    for position, step in enumerate(list(workflow_ir.get("steps") or []), start=1):
        if not isinstance(step, dict):
            continue
        method = dict(step)
        method.setdefault("id", f"step_{position}_{step.get('kind') or 'action'}")
        if "primitive_ref" not in method and method.get("kind"):
            method["primitive_ref"] = method["kind"]
        method.setdefault("observed", False)
        methods.append(method)

    declared_parameters: dict[str, Any] = {
        str(spec["name"]): {
            "type": spec.get("type", "string"),
            "required": bool(spec.get("required", False)),
            "description": spec.get("description", ""),
        }
        for spec in (parameters or [])
        if isinstance(spec, dict) and spec.get("name")
    }

    success_marker = workflow_ir.get("success_marker") or {
        "mode": "all_of",
        "timeout_ms": 5000,
        "markers": [],
    }
    failure_template = workflow_ir.get("failure_message_template") or {
        "intention": intent_text,
        "attendu": "",
        "vu": "{observed_human_state}",
        "demande": "indiquer la correction attendue",
    }
    promotion = {
        "history": [
            {
                "at": timestamp,
                "from": "observed",
                "to": learning_state,
                "by": "lea_persist_endpoint",
                "reason": "persisted via /api/v1/lea/competences/candidate/persist",
            }
        ],
        "candidate_requires": [
            "method_trace_present",
            "success_marker_defined",
            "failure_message_template_valid",
        ],
        "supervised_requires": ["replay_verified_once", "human_validation"],
        "stable_requires": {
            "min_successes": 3,
            "distinct_contexts": 3,
            "max_unexplained_failures": 0,
        },
        "t2_known_gaps": [],
    }

    document: dict[str, Any] = {
        "schema_version": 1,
        "id": slug,
        "name": name,
        "version": 1,
        "learning_state": learning_state,
        "intent": {"fr": intent_text},
        "parameters": declared_parameters,
        "preconditions": list(workflow_ir.get("preconditions") or []),
        "methods": methods,
        "success_marker": success_marker,
        "failure_message_template": failure_template,
        "promotion": promotion,
        "generalisation": {
            "seen_contexts": [],
            "method_success_rate": {},
            "variance_log": [],
        },
        "failure_log": [],
        "created_at": timestamp,
        "last_updated_at": timestamp,
        "methods_execution": "sequence",
    }
    if session_id or machine_id or external_agent_id:
        document["chain_refs"] = {
            "source_session": session_id,
            "machine_id": machine_id,
            "external_agent_id": external_agent_id,
        }
    return document
def validate_yaml_schema(data: dict[str, Any]) -> list[str]:
    """Return the required top-level keys missing from ``data``.

    Only key presence is checked, in the order of ``REQUIRED_YAML_FIELDS``;
    an empty list means every required key is present.
    """
    return [
        required_key
        for required_key in REQUIRED_YAML_FIELDS
        if required_key not in data
    ]
# ----------------------------------------------------------------------------
# Rate limit token-bucket simple (en memoire, par machine_id)
# ----------------------------------------------------------------------------
class PersistRateLimiter:
    """Minimal in-memory rate limiter for /persist, keyed by machine_id.

    Default budget: 10 requests per 60-second sliding window per machine_id.
    A single shared instance is expected. All access to the per-machine
    timestamp lists is serialised with a ``threading.Lock`` so that
    concurrent callers cannot both pass the limit check before either
    records its request.

    Buckets of machines that stop calling are only dropped by ``reset``.
    """

    def __init__(self, *, max_per_minute: int = 10, window_seconds: int = 60) -> None:
        # Local import: the lock is the only use of threading in this class.
        import threading

        self.max_per_minute = max_per_minute
        self.window_seconds = window_seconds
        self._timestamps: dict[str, list[float]] = {}
        self._lock = threading.Lock()

    def allow(self, machine_id: str) -> tuple[bool, int]:
        """Return ``(allowed, retry_after_seconds)`` for one request.

        ``retry_after_seconds`` is 0 when the request is allowed, otherwise
        the time until the oldest recorded request leaves the window (at
        least 1). A falsy ``machine_id`` is always allowed and not recorded.
        """
        if not machine_id:
            return True, 0
        now = time.time()
        with self._lock:
            bucket = self._timestamps.setdefault(machine_id, [])
            # Drop the timestamps that fell out of the sliding window.
            bucket[:] = [ts for ts in bucket if now - ts < self.window_seconds]
            if len(bucket) >= self.max_per_minute:
                oldest = bucket[0]
                retry_after = max(1, int(self.window_seconds - (now - oldest)))
                return False, retry_after
            bucket.append(now)
            return True, 0

    def reset(self, machine_id: Optional[str] = None) -> None:
        """Forget the history of one machine, or of all machines if None."""
        with self._lock:
            if machine_id is None:
                self._timestamps.clear()
            else:
                self._timestamps.pop(machine_id, None)
# Instance partagee importable depuis api_stream
persist_rate_limiter = PersistRateLimiter()
__all__ = [
"SLUG_PATTERN",
"COMPETENCES_ROOT",
"CANDIDATE_DIR",
"AUDIT_PATH",
"INCOMPLETE_PATH",
"REQUIRED_YAML_FIELDS",
"slugify",
"detect_cross_state_collision",
"detect_pii",
"atomic_write_yaml",
"audit_append",
"find_existing_audit_entry",
"build_competence_yaml",
"validate_yaml_schema",
"PersistRateLimiter",
"persist_rate_limiter",
]

View File

@@ -16,6 +16,48 @@ import io
logger = logging.getLogger(__name__)
def _extract_first_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Parse the JSON object that starts at the first ``{`` of ``text``.

    Anything located after the end of that object is ignored, which covers
    VLM answers that append an explanation after the JSON.

    Returns:
        The decoded object, or ``None`` when ``text`` is empty, contains no
        ``{``, or when the text starting at the first ``{`` is not a complete,
        valid JSON object. Later ``{`` occurrences are not tried.
    """
    if not text:
        return None
    start = text.find("{")
    if start < 0:
        return None
    try:
        # raw_decode stops at the end of the first JSON value and tolerates
        # trailing content, so no manual brace matching is needed.
        decoded, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return decoded
class OllamaClient:
"""
Client Ollama pour VLM
@@ -219,7 +261,93 @@ class OllamaClient:
"success": False,
"error": str(e)
}
def generate_grounding(
    self,
    prompt: str,
    image_path: Optional[str] = None,
    image: Optional[Image.Image] = None,
    extra_images_b64: Optional[List[str]] = None,
    profile: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """D5-v2 (2026-05-25): centralised, prefill-aware VLM grounding call.

    Uses the dedicated profile from ``vlm_config.get_grounding_profile()``
    unless ``profile`` is given (useful in tests). The profile supplies the
    model, ``num_ctx``, the JSON prefill, the temperature and ``num_predict``
    that are forwarded to ``generate()``. ``self.model`` is switched to the
    profile model for the duration of the call and always restored.

    Args:
        prompt: textual prompt (typically "Find element X").
        image_path: forwarded to ``generate()``.
        image: forwarded to ``generate()``.
        extra_images_b64: forwarded to ``generate()``.
        profile: grounding profile override; must provide the keys ``model``,
            ``temperature``, ``num_predict``, ``prefill`` and ``num_ctx``.

    Returns:
        The dict returned by ``generate()`` with two extra keys:
        ``parsed_json`` (first JSON object found in ``response``, or None)
        and ``profile_used`` (a copy of the profile).

    Notes:
        - No automatic retry on ``fallback_model``; the caller decides.
        - The profile's ``keep_alive`` and ``think`` values are not forwarded
          to ``generate()`` by this method.
        - NOTE(review): temporarily swapping ``self.model`` is not safe if the
          same client is used from several threads; confirm usage.
    """
    if profile is None:
        from core.detection.vlm_config import get_grounding_profile
        profile = get_grounding_profile(endpoint=self.endpoint)

    # Keep the current model and switch only for this call.
    original_model = self.model
    self.model = profile["model"]
    try:
        result = self.generate(
            prompt=prompt,
            image_path=image_path,
            image=image,
            extra_images_b64=extra_images_b64,
            temperature=profile["temperature"],
            max_tokens=profile["num_predict"],
            assistant_prefill=profile["prefill"],
            num_ctx=profile["num_ctx"],
            force_json=False,  # prefill is enough; format=json slows qwen3.5
        )
    finally:
        self.model = original_model

    # Quiet logging: one line per grounding call.
    logger.info(
        "[PERF] vlm.grounding model=%s ctx=%d prefill=%s success=%s",
        profile["model"], profile["num_ctx"],
        "yes" if profile["prefill"] else "no",
        result.get("success", False),
    )

    # The response may be followed by stray text (the model sometimes ends
    # with an explanation), so only the first root-level object is parsed.
    parsed = None
    content = (result.get("response") or "").strip()
    if content:
        try:
            parsed = _extract_first_json_object(content)
        except Exception as e:
            logger.debug("[PERF] vlm.grounding parse failed: %s — content=%r", e, content[:160])
        else:
            # The helper signals failure by returning None, not by raising;
            # without this branch unparseable answers were never logged.
            if parsed is None:
                logger.debug(
                    "[PERF] vlm.grounding parse failed: %s — content=%r",
                    "no JSON object found", content[:160],
                )

    result["parsed_json"] = parsed
    result["profile_used"] = dict(profile)
    return result
def detect_ui_elements(self, image_path: str) -> Dict[str, Any]:
"""
Détecter les éléments UI dans une image

View File

@@ -134,13 +134,13 @@ def reset_vlm_model_cache():
def is_thinking_model(model_name: str) -> bool:
"""Détermine si un modèle est un modèle 'thinking' (qwen3).
"""Détermine si un modèle est un modèle 'thinking' (qwen3, qwen3.5).
Les modèles thinking nécessitent un assistant prefill pour éviter
le mode réflexion interne qui peut durer >180s avec des images.
Args:
model_name: Nom du modèle (ex: "qwen3-vl:8b", "gemma4:e4b")
model_name: Nom du modèle (ex: "qwen3-vl:8b", "qwen3.5:9b", "gemma4:e4b")
Returns:
True si le modèle est de type thinking (nécessite prefill workaround)
@@ -148,6 +148,92 @@ def is_thinking_model(model_name: str) -> bool:
return "qwen3" in model_name.lower()
# ────────────────────────────────────────────────────────────────────────────
# D5-v2 (2026-05-25) : profil grounding dédié, centralisé, env-overridable
# ────────────────────────────────────────────────────────────────────────────
# Profil grounding par défaut — qwen3.5:9b avec ctx 4096 et prefill JSON.
# Cohérent avec décision Codex après revue Gemini : empêcher rechauffe
# qwen2.5vl en ctx 8192 et garantir un chemin grounding reproductible.
DEFAULT_GROUNDING_MODEL = "qwen3.5:9b"
DEFAULT_GROUNDING_CTX = 4096
DEFAULT_GROUNDING_PREFILL = '{"x_pct":'
DEFAULT_GROUNDING_TEMPERATURE = 0.0
DEFAULT_GROUNDING_NUM_PREDICT = 96 # ~80 tokens suffisent pour `{x_pct,y_pct,confidence}`
DEFAULT_GROUNDING_KEEP_ALIVE = "30m" # éviter cold reload entre actions
# Fallback grounding : qwen2.5vl conservé pour compat existante (rpa-tag).
DEFAULT_GROUNDING_FALLBACK = "qwen2.5vl:7b-rpa"
def get_grounding_profile(endpoint: str = DEFAULT_OLLAMA_ENDPOINT) -> dict:
    """Return the VLM profile for **JSON-format** grounding calls
    (answer shaped like ``{"x_pct": ..., "y_pct": ..., "confidence": ...}``).

    Scope warning, D5-v3a (2026-05-25):
    This profile targets calls that consume the output through a JSON prefill
    (typically qwen3.5:9b with prefill ``{"x_pct":``). It is NOT suited to the
    native **bbox_2d** grounding calls of qwen2.5vl used in
    ``agent_v0/server_v1/resolve_engine.py`` (parsed with
    ``core.grounding.bbox_parser.parse_bbox_to_norm``).

    Known env-var conflict: ``resolve_engine.py`` also reads
    ``RPA_GROUNDING_MODEL`` but expects a bbox_2d model (qwen2.5vl). Setting
    ``RPA_GROUNDING_MODEL=qwen3.5:9b`` is fine for this profile, but the
    legacy bbox call site would then receive an incompatible model.
    Deferred to D5-v3b: rename to ``RPA_BBOX_GROUNDING_MODEL`` on the legacy
    side and introduce ``OllamaClient.generate_bbox_grounding()``.

    The policy is centralised here to keep VLM paths from falling back to
    qwen2.5vl with num_ctx=8192 (Modelfile). The result is consumed by
    ``OllamaClient.generate_grounding()``.

    Supported environment variables:
        - RPA_GROUNDING_MODEL: main model (default qwen3.5:9b)
        - RPA_GROUNDING_CTX: context window (default 4096)
        - RPA_GROUNDING_FALLBACK: fallback model (default qwen2.5vl:7b-rpa)
        - RPA_VLM_PREFILL=false: disables the JSON prefill (rare, debug)

    Blank model names and non-numeric or non-positive context sizes are
    ignored in favour of the defaults.

    Args:
        endpoint: currently unused by this function; kept for the callers
            that pass it.

    Returns:
        dict with keys:
            - model: str
            - num_ctx: int
            - prefill: str or None
            - temperature: float
            - num_predict: int
            - think: bool (False when the model needs thinking disabled)
            - keep_alive: str
            - fallback_model: str
    """
    # An env var set to an empty/blank string must not yield an empty model.
    model = os.environ.get("RPA_GROUNDING_MODEL", "").strip() or DEFAULT_GROUNDING_MODEL
    fallback = (
        os.environ.get("RPA_GROUNDING_FALLBACK", "").strip()
        or DEFAULT_GROUNDING_FALLBACK
    )

    try:
        num_ctx = int(os.environ.get("RPA_GROUNDING_CTX", str(DEFAULT_GROUNDING_CTX)))
    except (TypeError, ValueError):
        num_ctx = DEFAULT_GROUNDING_CTX
    if num_ctx <= 0:
        # A zero or negative context window is never a usable setting.
        num_ctx = DEFAULT_GROUNDING_CTX

    prefill_enabled = os.environ.get("RPA_VLM_PREFILL", "true").strip().lower() not in (
        "0", "false", "no", "off"
    )
    prefill = DEFAULT_GROUNDING_PREFILL if prefill_enabled else None

    # think=False is required for qwen3/qwen3.5 (prefill is the main
    # mechanism) and for gemma4 (otherwise empty tokens with Ollama >= 0.20).
    think_false = is_thinking_model(model) or needs_think_false(model)

    return {
        "model": model,
        "num_ctx": num_ctx,
        "prefill": prefill,
        "temperature": DEFAULT_GROUNDING_TEMPERATURE,
        "num_predict": DEFAULT_GROUNDING_NUM_PREDICT,
        "think": not think_false,  # False means "send think=False to Ollama"
        "keep_alive": DEFAULT_GROUNDING_KEEP_ALIVE,
        "fallback_model": fallback,
    }
def needs_think_false(model_name: str) -> bool:
"""Détermine si un modèle nécessite think=false dans le payload.

View File

@@ -59,8 +59,13 @@ class CLIPEmbedder(EmbedderBase):
)
if device is None:
# NOTE: utiliser le `torch` du scope module (l. 8). Un import local
# ici rendait `torch` LOCAL à __init__ pour tout le scope, faisant
# planter `with torch.no_grad():` plus bas en UnboundLocalError
# quand l'appelant passait device="cpu" (l'import local n'était
# alors pas exécuté). Voir inbox_codex/2026-05-25_1235_..._enquete-
# feedbackbus-5004.md.
try:
import torch
if torch.cuda.is_available():
free_vram = torch.cuda.mem_get_info()[0] / 1024**3
if free_vram > 1.5:

View File

@@ -6,7 +6,11 @@ from .t2a_decision import (
analyze_dpi,
build_dpi_enriched,
)
from .ocr_extractor import extract_table_from_image, extract_text_from_image
from .ocr_extractor import (
extract_digits_tesseract_from_image,
extract_table_from_image,
extract_text_from_image,
)
__all__ = [
"PROMPT_TEMPLATE",
@@ -15,4 +19,5 @@ __all__ = [
"build_dpi_enriched",
"extract_text_from_image",
"extract_table_from_image",
"extract_digits_tesseract_from_image",
]

View File

@@ -1,6 +1,7 @@
"""Extracteur OCR — texte depuis une image (screenshot d'écran).
Utilise EasyOCR fr+en. Singleton (chargement modèle ~3s au premier appel).
Ajoute un chemin Tesseract spécialisé pour les chiffres/IPP d'écrans propres.
Conçu pour le pipeline streaming serveur (actions `extract_text` /
`extract_table`) : récupère un screenshot fresh (dernier heartbeat ou
@@ -11,6 +12,7 @@ pour analyse downstream (ex: t2a_decision, boucle sur N patients).
from __future__ import annotations
import logging
import os
import re
from pathlib import Path
from typing import List, Optional, Tuple
@@ -20,6 +22,19 @@ logger = logging.getLogger(__name__)
_easyocr_reader = None
def easyocr_gpu_enabled(default: bool = False) -> bool:
    """Return whether EasyOCR may allocate GPU memory.

    The replay server shares the GPU with Ollama. Defaulting EasyOCR to CPU
    keeps VRAM available for the VLM; set RPA_EASYOCR_GPU=1 only for a measured
    OCR benchmark or a runtime that has spare VRAM.

    Args:
        default: value returned when RPA_EASYOCR_GPU is unset, empty or
            contains only whitespace.

    Returns:
        True when the variable is one of "1", "true", "yes", "on"
        (case-insensitive, surrounding whitespace ignored); False for any
        other non-blank value.
    """
    # Normalise before the emptiness test so a blank value counts as unset.
    raw = os.getenv("RPA_EASYOCR_GPU", "").strip().lower()
    if not raw:
        return default
    return raw in {"1", "true", "yes", "on"}
def _get_reader():
"""Initialise EasyOCR fr+en au premier appel (singleton, CPU forcé).
@@ -29,8 +44,9 @@ def _get_reader():
global _easyocr_reader
if _easyocr_reader is None:
import easyocr
_easyocr_reader = easyocr.Reader(['fr', 'en'], gpu=False, verbose=False)
logger.info("EasyOCR initialisé (fr+en, CPU)")
gpu = easyocr_gpu_enabled(default=False)
_easyocr_reader = easyocr.Reader(['fr', 'en'], gpu=gpu, verbose=False)
logger.info("EasyOCR initialisé (fr+en, %s)", "GPU" if gpu else "CPU")
return _easyocr_reader
@@ -73,17 +89,86 @@ def extract_text_from_image(
return ""
def extract_digits_tesseract_from_image(
    image_path: str,
    region: Optional[Tuple[int, int, int, int]] = None,
    pattern: Optional[str] = None,
    limit: Optional[int] = None,
    psm: int = 6,
    lang: str = "eng",
    whitelist: str = "0123456789",
) -> List[str]:
    """Extract digit sequences from an image with Tesseract.

    Main use case: IPP / numeric fields in on-screen tables. This path is
    deliberately explicit so the general EasyOCR behaviour stays unchanged.

    Args:
        image_path: path of the image file on disk.
        region: ``(x, y, w, h)`` crop applied before OCR; falsy = whole image.
        pattern: Python regex applied with ``match`` to each digit sequence.
            IPP example: r"^25\\d{6}$".
        limit: maximum number of values returned; falsy = no limit.
        psm: Tesseract page segmentation mode, passed as ``--psm``.
        lang: Tesseract language.
        whitelist: allowed characters, passed as ``tessedit_char_whitelist``
            when non-empty.

    Returns:
        The digit sequences in the order they appear in the OCR text. Any
        error (missing file included) is logged as a warning and yields an
        empty list.
    """
    source = Path(image_path)
    if not source.exists():
        logger.warning("extract_digits_tesseract: fichier introuvable %s", image_path)
        return []

    try:
        from PIL import Image
        import pytesseract

        tess_options = ["--psm", str(psm)]
        if whitelist:
            tess_options += ["-c", f"tessedit_char_whitelist={whitelist}"]

        with Image.open(source) as img:
            if region:
                left, top, width, height = region
                img = img.crop((left, top, left + width, top + height))
            if img.mode not in {"L", "RGB"}:
                img = img.convert("RGB")
            ocr_text = pytesseract.image_to_string(
                img,
                lang=lang,
                config=" ".join(tess_options),
            )

        digits = re.findall(r"\d+", ocr_text)
        if pattern:
            matcher = re.compile(pattern)
            digits = [candidate for candidate in digits if matcher.match(candidate)]
        return digits[:limit] if limit else digits
    except Exception as e:
        logger.warning("extract_digits_tesseract échoué sur %s : %s", image_path, e)
        return []
def extract_table_from_image(
image_path: str,
region: Optional[Tuple[int, int, int, int]] = None,
pattern: Optional[str] = None,
limit: Optional[int] = None,
engine: str = "easyocr",
) -> List[str]:
"""Extrait une liste de valeurs d'un tableau via OCR.
Cas d'usage principal : lire la liste des IPP d'un tableau de patients
pour boucler dessus. EasyOCR retourne tous les tokens avec leur bbox,
on filtre par regex puis on trie par position (y croissant).
pour boucler dessus. Par défaut, EasyOCR retourne tous les tokens avec
leur bbox, on filtre par regex puis on trie par position (y croissant).
Pour des champs chiffres/IPP, `engine="tesseract"` active le chemin
spécialisé Tesseract validé sur captures Easily.
Args:
image_path: chemin du PNG sur disque.
@@ -92,6 +177,7 @@ def extract_table_from_image(
Si None : tous les tokens non vides sont retournés.
Exemple IPP : r"^\\d{8,10}$" ou r"^25\\d{6}$"
limit: nombre maximal d'entrées à retourner (None = sans limite).
engine: "easyocr" (defaut) ou "tesseract" / "digits" / "ipp".
Returns:
Liste de strings dans l'ordre top → bottom (par y de bbox).
@@ -102,6 +188,15 @@ def extract_table_from_image(
logger.warning("extract_table: fichier introuvable %s", image_path)
return []
engine_name = (engine or "easyocr").strip().lower()
if engine_name in {"tesseract", "digits", "ipp"}:
return extract_digits_tesseract_from_image(
image_path,
region=region,
pattern=pattern,
limit=limit,
)
try:
from PIL import Image
import numpy as np

View File

@@ -99,10 +99,17 @@ class WorkflowPipeline:
logger.info("✓ Fusion Engine initialized")
# 3. State Embedding Builder
clip_embedders = {
"image": self.clip_embedder,
"text": self.clip_embedder,
"title": self.clip_embedder,
"ui": self.clip_embedder,
}
self.embedding_builder = StateEmbeddingBuilder(
fusion_engine=self.fusion_engine,
embedders=clip_embedders,
output_dir=self.embeddings_dir,
use_clip=True
use_clip=False
)
logger.info("✓ State Embedding Builder initialized")

38
core/semantic/__init__.py Normal file
View File

@@ -0,0 +1,38 @@
"""Phase 2.5 — Analyse sémantique post-apprentissage.
Module dédié à l'analyse sémantique des écrans capturés en phase Shadow,
**après** ``/api/v1/shadow/stop`` et **avant** restitution Option C.
Specs : ``docs/POC/SPECS_PHASE_25_SEMANTIQUE_2026-06-01.md``
Principes (arbitrage Plato 2026-06-01) :
- Post-apprentissage uniquement, **jamais en hot path replay**.
- OmniParser encapsulé derrière garde-fou anti-fragilité.
- Fallback OCR-seul (docTR) systématique en cas d'exception.
- Stockage ``.semantic.yaml`` séparé du YAML compétence principal.
- Opt-in par compétence (rétrocompat totale).
"""
from .phase25_analyzer import (
Phase25Analyzer,
Phase25Result,
ScreenAnalysis,
SemanticStructure,
SEMANTIC_DIR,
OMNIPARSER_CACHE_DIR,
OMNIPARSER_ERROR_LOG,
PHASH_HAMMING_THRESHOLD,
MAX_SCREENS_PER_SESSION,
)
__all__ = [
"Phase25Analyzer",
"Phase25Result",
"ScreenAnalysis",
"SemanticStructure",
"SEMANTIC_DIR",
"OMNIPARSER_CACHE_DIR",
"OMNIPARSER_ERROR_LOG",
"PHASH_HAMMING_THRESHOLD",
"MAX_SCREENS_PER_SESSION",
]

View File

@@ -0,0 +1,920 @@
"""Phase 2.5 — Analyseur sémantique post-apprentissage.
Module isolé qui prend en entrée un ensemble de screenshots capturés
pendant la phase Shadow et produit un payload structuré
``{tables, forms, buttons, text_blocks}`` par écran distinct,
stocké dans un fichier ``.semantic.yaml`` séparé.
Specs : ``docs/POC/SPECS_PHASE_25_SEMANTIQUE_2026-06-01.md``
Garde-fous :
- Wrapper try/except global autour de chaque appel OmniParser.
- Fallback OCR-seul (docTR) si OmniParser indisponible ou KO.
- Healthcheck OmniParser au démarrage : KO ⇒ bascule auto en dégradé.
- Cache disque ``data/cache/omniparser/<session>/<index>.json``.
- Cap 10 écrans distincts par session.
- Aucun import de FastAPI, aucun appel réseau direct.
"""
from __future__ import annotations
import concurrent.futures
import hashlib
import io
import json
import logging
import re
import time
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, List, Optional, Sequence, Tuple
try: # pragma: no cover - dépendance externe déjà présente dans le projet
import yaml
except ImportError as exc: # pragma: no cover
raise RuntimeError("PyYAML est requis pour core.semantic.phase25_analyzer") from exc
try: # PIL toujours présent côté Linux dev / DGX
from PIL import Image
_HAS_PIL = True
except ImportError: # pragma: no cover
Image = None # type: ignore[assignment]
_HAS_PIL = False
try:
import imagehash # type: ignore
_HAS_IMAGEHASH = True
except ImportError: # pragma: no cover - fallback MD5 thumbnail
imagehash = None # type: ignore[assignment]
_HAS_IMAGEHASH = False
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# ----------------------------------------------------------------------------
# Constants and paths
# ----------------------------------------------------------------------------
# Repository root: two directory levels above the directory holding this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
DATA_ROOT = REPO_ROOT / "data"
# Default output directory for the ``.semantic.yaml`` files.
SEMANTIC_DIR = DATA_ROOT / "competences" / "candidate"
# Root of the on-disk cache (one sub-directory per session).
OMNIPARSER_CACHE_ROOT = DATA_ROOT / "cache" / "omniparser"
OMNIPARSER_CACHE_DIR = OMNIPARSER_CACHE_ROOT  # public alias
LOGS_DIR = REPO_ROOT / "logs"
# Append-only JSON-lines log of OmniParser failures.
OMNIPARSER_ERROR_LOG = LOGS_DIR / "omniparser_errors.log"
# Perceptual grouping heuristic (see specs §3).
PHASH_HAMMING_THRESHOLD = 8
MAX_SCREENS_PER_SESSION = 10
THUMBNAIL_SIZE = (256, 256)  # thumbnail size used by the MD5 fallback hash
# Per-screenshot timeout, in seconds (see specs §2).
OMNIPARSER_TIMEOUT_SEC = 30.0
# Allowed slug (same pattern as the persist module: a-z0-9_).
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")
# Allowed session_id: harmless characters only.
SESSION_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_\-]{0,127}$")
# ----------------------------------------------------------------------------
# Dataclasses
# ----------------------------------------------------------------------------
@dataclass
class SemanticStructure:
    """Semantic structure of one screen (see specs §2).

    Every attribute is a list of plain dicts describing the elements of one
    family found on the screen.
    """

    tables: List[dict] = field(default_factory=list)
    forms: List[dict] = field(default_factory=list)
    buttons: List[dict] = field(default_factory=list)
    text_blocks: List[dict] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the four families as a dict of shallow list copies."""
        families = ("tables", "forms", "buttons", "text_blocks")
        return {family: list(getattr(self, family)) for family in families}
@dataclass
class ScreenAnalysis:
    """Analysis of one representative screen (see specs §3)."""

    index: int
    phash: str
    screen_id: str
    screenshot_path: Optional[str]
    structure: SemanticStructure
    degraded: bool = False
    degraded_reason: Optional[str] = None
    elapsed_sec: float = 0.0
    window_title: Optional[str] = None

    # "Codex contract" snapshot: a flattened representation intended for the
    # agent-chat / dashboard, computed on the fly by to_dict().
    def to_dict(self) -> dict:
        """Serialise the analysis, exposing both nested and flattened views.

        The perceptual hash is published under the ``"hash"`` key and the
        elapsed time is rounded to three decimals.
        """
        flattened = _structure_to_elements(self.structure)
        nested = self.structure.to_dict()
        rounded_elapsed = round(self.elapsed_sec, 3)
        return {
            "index": self.index,
            "hash": self.phash,
            "screen_id": self.screen_id,
            "window_title": self.window_title,
            "screenshot_path": self.screenshot_path,
            "structure": nested,
            "elements": flattened,
            "degraded": self.degraded,
            "degraded_reason": self.degraded_reason,
            "elapsed_sec": rounded_elapsed,
        }
@dataclass
class Phase25Result:
    """Overall result of one Phase 2.5 analysis run."""

    session_id: str
    generated_at: str
    omniparser_available: bool
    degraded: bool
    too_complex: bool
    screens: List[ScreenAnalysis] = field(default_factory=list)
    healthcheck_passed: bool = True
    healthcheck_reason: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialise the result; each screen is serialised via its ``to_dict``."""
        scalar_fields = (
            "session_id",
            "generated_at",
            "omniparser_available",
            "degraded",
            "too_complex",
            "healthcheck_passed",
            "healthcheck_reason",
        )
        summary = {name: getattr(self, name) for name in scalar_fields}
        summary["screens"] = [screen.to_dict() for screen in self.screens]
        return summary
# ----------------------------------------------------------------------------
# Helpers : validation et FS
# ----------------------------------------------------------------------------
def _validate_session_id(session_id: Any) -> str:
    """Return the stripped ``session_id`` or raise ``ValueError``.

    The value must be a non-blank string matching ``SESSION_ID_PATTERN`` and
    must not contain ``..``, ``/`` or a backslash.
    """
    if not isinstance(session_id, str):
        raise ValueError("session_id doit etre une chaine non vide")
    sid = session_id.strip()
    if not sid:
        raise ValueError("session_id doit etre une chaine non vide")
    if SESSION_ID_PATTERN.match(sid) is None:
        raise ValueError(
            "session_id invalide (autorise : [A-Za-z0-9][A-Za-z0-9_-]{0,127})"
        )
    # Belt-and-braces path-traversal guard: explicitly refuse these tokens
    # even though the pattern above should already exclude them.
    if any(token in sid for token in ("..", "/", "\\")):
        raise ValueError("session_id invalide (path-traversal interdit)")
    return sid
def _validate_slug(slug: Any) -> str:
    """Return the stripped ``slug`` or raise ``ValueError``.

    The value must be a string that matches ``SLUG_PATTERN`` once stripped.
    """
    if not isinstance(slug, str):
        raise ValueError("slug doit etre une chaine")
    candidate = slug.strip()
    if SLUG_PATTERN.match(candidate) is None:
        raise ValueError(
            f"slug invalide '{candidate}' (regle : {SLUG_PATTERN.pattern})"
        )
    return candidate
def _ensure_dir(path: Path) -> None:
    """Create ``path`` with any missing parents; an existing directory is fine."""
    path.mkdir(exist_ok=True, parents=True)
def _log_omniparser_error(session_id: str, frame_index: int, exc: BaseException) -> None:
    """Append one JSON line describing ``exc`` to ``logs/omniparser_errors.log``.

    Logging is best effort (see specs §7): an ``OSError`` while creating the
    directory or writing the file is reported through ``logger`` and swallowed.
    """
    try:
        _ensure_dir(LOGS_DIR)
        exc_type = type(exc)
        record = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "session_id": session_id,
            "frame_index": frame_index,
            "error_type": exc_type.__name__,
            "error_message": str(exc),
            # Only the final "Type: message" lines, not the stack frames.
            "traceback": traceback.format_exception_only(exc_type, exc),
        }
        line = json.dumps(record, ensure_ascii=False)
        with OMNIPARSER_ERROR_LOG.open("a", encoding="utf-8") as log_file:
            log_file.write(f"{line}\n")
    except OSError as log_exc:  # pragma: no cover - log best-effort
        logger.warning("[PHASE25] echec ecriture omniparser_errors.log : %s", log_exc)
# ----------------------------------------------------------------------------
# Hash perceptuel (avec fallback MD5)
# ----------------------------------------------------------------------------
def compute_phash(image: "Image.Image") -> str:
    """Return a perceptual hash of ``image``, or an MD5 thumbnail hash.

    ``imagehash.phash`` is used when the library is available and succeeds.
    Otherwise the image is copied, shrunk to ``THUMBNAIL_SIZE``, encoded as an
    RGB PNG and hashed with MD5; that result carries an ``md5:`` prefix.
    """
    if _HAS_IMAGEHASH and imagehash is not None:
        try:
            return str(imagehash.phash(image))
        except Exception as exc:  # pragma: no cover
            logger.warning("[PHASE25] phash imagehash KO, fallback MD5 : %s", exc)
    # MD5 fallback, computed on a thumbnail copy so the input is left untouched.
    reduced = image.copy()
    reduced.thumbnail(THUMBNAIL_SIZE)
    png_bytes = io.BytesIO()
    reduced.convert("RGB").save(png_bytes, format="PNG")
    digest = hashlib.md5(png_bytes.getvalue()).hexdigest()
    return "md5:" + digest
def _hamming_distance(h1: str, h2: str) -> int:
    """Return the Hamming distance between two hashes from ``compute_phash``.

    - ``md5:`` hashes: 0 when both strings are equal, otherwise a value just
      above ``PHASH_HAMMING_THRESHOLD`` so they are never grouped together
      (conservative heuristic).
    - ``imagehash`` available: both hex strings are parsed with
      ``imagehash.hex_to_hash`` and subtracted.
    - ``imagehash`` missing: raw XOR bit count of the two hex integers.

    Any parsing failure also yields the "just above threshold" value.
    """
    never_similar = PHASH_HAMMING_THRESHOLD + 1
    if h1.startswith("md5:") or h2.startswith("md5:"):
        return 0 if h1 == h2 else never_similar
    if _HAS_IMAGEHASH and imagehash is not None:
        try:
            return abs(imagehash.hex_to_hash(h1) - imagehash.hex_to_hash(h2))
        except Exception:
            return never_similar
    # No imagehash but hex hashes present (rare): plain XOR bit count.
    try:
        differing_bits = int(h1, 16) ^ int(h2, 16)
    except ValueError:
        return never_similar
    return bin(differing_bits).count("1")
def identify_distinct_screens(
    frames: Sequence[Tuple[int, "Image.Image"]],
    threshold: int = PHASH_HAMMING_THRESHOLD,
) -> List[Tuple[int, "Image.Image", str]]:
    """Group frames by hash similarity and keep one representative per group.

    Args:
        frames: sequence of ``(frame_index, PIL.Image)`` pairs.
        threshold: maximum Hamming distance for two frames to count as the
            same screen.

    Returns:
        A list of ``(frame_index, image, phash)`` tuples, one per group, in
        order of first appearance (the first frame seen is the representative).
    """
    kept: List[Tuple[int, Image.Image, str]] = []
    for frame_index, frame in frames:
        frame_hash = compute_phash(frame)
        for rep_index, _rep_image, rep_hash in kept:
            if _hamming_distance(frame_hash, rep_hash) <= threshold:
                logger.debug(
                    "[PHASE25] frame %d regroupee avec representant %d (phash=%s)",
                    frame_index, rep_index, frame_hash,
                )
                break
        else:
            # No existing representative is close enough: start a new group.
            kept.append((frame_index, frame, frame_hash))
    return kept
# ----------------------------------------------------------------------------
# Conversion structure ⇄ "elements" (contrat Codex)
# ----------------------------------------------------------------------------
def _structure_to_elements(struct: SemanticStructure) -> List[dict]:
    """Flatten a structure into ``{kind, label, bbox, confidence}`` dicts.

    Elements are emitted in the order tables, forms, buttons, text blocks.
    Missing keys default to the family name for ``label`` (the ``text`` value,
    then ``""``, for text blocks), ``[]`` for ``bbox`` and ``0.5`` for
    ``confidence``.
    """
    # (kind, source list, default label) for the three homogeneous families.
    families = (
        ("table", struct.tables, "table"),
        ("field", struct.forms, "field"),
        ("button", struct.buttons, "button"),
    )
    flat: List[dict] = [
        {
            "kind": kind,
            "label": item.get("label", fallback_label),
            "bbox": item.get("bbox", []),
            "confidence": float(item.get("confidence", 0.5)),
        }
        for kind, items, fallback_label in families
        for item in items
    ]
    # Text blocks differ only in how the default label is derived.
    flat.extend(
        {
            "kind": "text_block",
            "label": block.get("label", block.get("text", "")),
            "bbox": block.get("bbox", []),
            "confidence": float(block.get("confidence", 0.5)),
        }
        for block in struct.text_blocks
    )
    return flat
def _classify_element(label: str, kind_hint: str | None = None) -> str:
    """Classify an OmniParser element into one of our semantic categories.

    Args:
        label: element label; ``None`` or empty is treated as ``""``.
        kind_hint: optional type hint from the detector. Non-string values
            (for instance an enum member) are converted with ``str()`` before
            matching instead of raising ``AttributeError``.

    Returns:
        One of ``"table"``, ``"field"``, ``"button"`` or ``"text_block"``.
        The hint is checked first; when it is absent or inconclusive the label
        keywords decide, and ``"text_block"`` is the default.
    """
    lab = str(label or "").lower()
    if kind_hint:
        # Callers forward whatever the detector exposes as ``element_type``,
        # which is not guaranteed to be a plain string.
        kh = str(kind_hint).lower()
        if "table" in kh:
            return "table"
        if "input" in kh or "field" in kh or "edit" in kh:
            return "field"
        if "button" in kh or "btn" in kh:
            return "button"
    # NOTE(review): these are substring matches, so "ok" also matches labels
    # such as "book" or "token". Kept as is to preserve existing results.
    if any(kw in lab for kw in ("button", "btn", "submit", "valider", "annuler", "ok", "close")):
        return "button"
    if any(kw in lab for kw in ("input", "field", "saisie", "textbox", "champ")):
        return "field"
    if "table" in lab or "grille" in lab:
        return "table"
    return "text_block"
# ----------------------------------------------------------------------------
# Adapter wrappers : OmniParser et docTR (fallback)
# ----------------------------------------------------------------------------
class _OmniParserSafeWrapper:
    """Guarded wrapper around the fragile ``OmniParserAdapter``.

    - The adapter is imported lazily so that importing this module never
      fails when OmniParser is not installed.
    - ``available`` is ``False`` when the import or construction failed, or
      when the adapter reports itself unavailable; the caller then switches
      to the OCR-only fallback.
    - Every ``detect`` call runs in a worker thread and is bounded by
      ``future.result(timeout=...)``.
    """

    # Class-level executor, created on first use and shared by all instances
    # so that no pool is created per call.
    _TIMEOUT_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None

    @classmethod
    def _get_executor(cls) -> concurrent.futures.ThreadPoolExecutor:
        """Return the shared executor, creating it on first use."""
        if cls._TIMEOUT_EXECUTOR is None:
            cls._TIMEOUT_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
                max_workers=2, thread_name_prefix="phase25-omniparser-timeout",
            )
        return cls._TIMEOUT_EXECUTOR

    def __init__(self) -> None:
        self._adapter: Any = None
        self._available: bool = False
        self._import_error: Optional[str] = None
        self._try_import()

    def _try_import(self) -> None:
        """Import and build the adapter, recording why it is unusable if so."""
        try:
            from core.detection.omniparser_adapter import OmniParserAdapter  # type: ignore
            self._adapter = OmniParserAdapter()
            self._available = bool(getattr(self._adapter, "available", False))
            if not self._available:
                # The adapter exists but its own availability check failed.
                self._import_error = "OmniParser adapter installé mais modèles non disponibles"
        except Exception as exc:
            self._adapter = None
            self._available = False
            self._import_error = f"{type(exc).__name__}: {exc}"

    @property
    def available(self) -> bool:
        """Whether the adapter was built and reports itself available."""
        return self._available

    @property
    def import_error(self) -> Optional[str]:
        """Reason the adapter is unavailable, or ``None``."""
        return self._import_error

    def detect(
        self,
        image: "Image.Image",
        *,
        timeout: Optional[float] = None,
    ) -> List[Any]:
        """Run ``adapter.detect(image)`` under a hard timeout.

        Args:
            image: PIL image to analyse.
            timeout: timeout in seconds (default: ``OMNIPARSER_TIMEOUT_SEC``).

        Returns:
            The detections as a list; ``[]`` when the adapter is unavailable.

        Raises:
            concurrent.futures.TimeoutError: the call exceeded ``timeout``.
            Exception: anything raised by the adapter is logged and re-raised
                so the caller can log it and fall back.
        """
        if not self._available or self._adapter is None:
            return []
        effective_timeout = (
            timeout if timeout is not None else OMNIPARSER_TIMEOUT_SEC
        )
        executor = self._get_executor()
        future = executor.submit(self._adapter.detect, image)
        try:
            return list(future.result(timeout=effective_timeout))
        except concurrent.futures.TimeoutError:
            # A task that is already running cannot be interrupted and keeps
            # going in the background with its result ignored. A task still
            # waiting in the queue is cancelled so it never occupies a worker
            # of the shared pool for a result nobody will read.
            future.cancel()
            logger.warning(
                "[PHASE25] OmniParser.detect timeout (%.1fs) -> fallback",
                effective_timeout,
            )
            raise
        except Exception as exc:
            logger.warning("[PHASE25] OmniParser.detect KO : %s", exc)
            raise  # propagated to the caller for logging + fallback
def _detect_via_omniparser(
    wrapper: _OmniParserSafeWrapper,
    image: "Image.Image",
    *,
    timeout: Optional[float] = None,
) -> List[Any]:
    """Delegate to ``wrapper.detect`` and return its result unchanged."""
    detections = wrapper.detect(image, timeout=timeout)
    return detections
# docTR predictor, built on first use and then reused so the model is not
# reloaded on every call.
_DOCTR_PREDICTOR: Any = None


def _detect_via_doctr(image: "Image.Image", screenshot_path: Optional[str]) -> List[dict]:
    """OCR-only fallback (docTR): return raw text blocks for ``image``.

    No VLM and no fine-grained classification, just OCR lines.

    Args:
        image: PIL image; its size converts docTR's normalised geometry to
            pixel coordinates.
        screenshot_path: when given and existing on disk, docTR reads this
            file instead of a PNG re-encoding of ``image``.

    Returns:
        A list of ``{"label", "text", "bbox", "confidence"}`` dicts, one per
        non-empty OCR line. The list is empty when PIL or docTR is missing or
        when any stage fails: this function is the last-resort fallback and
        never raises.
    """
    global _DOCTR_PREDICTOR
    if not _HAS_PIL or image is None:
        return []
    try:
        from doctr.io import DocumentFile  # type: ignore
        from doctr.models import ocr_predictor  # type: ignore
    except ImportError:
        logger.info("[PHASE25] docTR non disponible pour fallback OCR")
        return []
    except Exception as exc:  # pragma: no cover
        # An installed docTR can still fail at import time with something
        # other than ImportError. The two later stages already catch
        # Exception, so the import is guarded the same way.
        logger.warning("[PHASE25] docTR import KO : %s", exc)
        return []
    try:
        if _DOCTR_PREDICTOR is None:
            _DOCTR_PREDICTOR = ocr_predictor(
                det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True,
            )
    except Exception as exc:  # pragma: no cover
        logger.warning("[PHASE25] docTR init KO : %s", exc)
        return []
    # docTR takes a file or raw image bytes; the path is preferred if given.
    blocks: List[dict] = []
    try:
        if screenshot_path and Path(screenshot_path).exists():
            doc = DocumentFile.from_images([screenshot_path])
        else:
            buf = io.BytesIO()
            image.convert("RGB").save(buf, format="PNG")
            doc = DocumentFile.from_images([buf.getvalue()])
        result = _DOCTR_PREDICTOR(doc)
        width, height = image.size
        for page in result.pages:
            for block in page.blocks:
                for line_obj in block.lines:
                    text = " ".join(w.value for w in line_obj.words).strip()
                    if not text:
                        continue
                    # ((x1, y1), (x2, y2)), normalised to 0-1.
                    (nx1, ny1), (nx2, ny2) = line_obj.geometry[0], line_obj.geometry[1]
                    blocks.append({
                        "label": text,
                        "text": text,
                        "bbox": [
                            int(nx1 * width),
                            int(ny1 * height),
                            int(nx2 * width),
                            int(ny2 * height),
                        ],
                        # Fixed score: no line-level confidence is read here.
                        "confidence": 0.6,
                    })
    except Exception as exc:  # pragma: no cover
        logger.warning("[PHASE25] docTR predict KO : %s", exc)
        return []
    return blocks
def _coerce_confidence(raw: Any, default: float = 0.5) -> float:
    """Convert a raw confidence to ``float``, keeping a genuine numeric zero.

    Plain ``int``/``float`` values (``bool`` excluded) are converted as is, so
    a real score of ``0`` stays ``0.0``. Any other value keeps the historical
    rule: falsy means "missing" and becomes ``default``.
    """
    if isinstance(raw, (int, float)) and not isinstance(raw, bool):
        return float(raw)
    return float(raw or default)


def _elements_to_structure(elements: Iterable[Any]) -> SemanticStructure:
    """Convert OmniParser detections into a ``SemanticStructure``.

    Each element is either an object exposing ``label`` (with optional
    ``bbox``, ``confidence`` and ``element_type``) or a dict (keys ``label``
    or ``text``, ``bbox``, ``confidence`` or ``score``, ``element_type`` or
    ``type``). Anything else is skipped. Elements are dispatched to a family
    by ``_classify_element``; text blocks also carry a ``text`` key equal to
    the label.
    """
    struct = SemanticStructure()
    for el in elements:
        # Works with both DetectedElement-like objects and plain dicts.
        if hasattr(el, "label"):
            label = getattr(el, "label", "") or ""
            bbox = list(getattr(el, "bbox", ()) or ())
            conf = _coerce_confidence(getattr(el, "confidence", 0.5))
            kind_hint = getattr(el, "element_type", None)
        elif isinstance(el, dict):
            label = str(el.get("label") or el.get("text") or "")
            bbox = list(el.get("bbox") or [])
            conf = _coerce_confidence(el.get("confidence", el.get("score", 0.5)))
            kind_hint = el.get("element_type") or el.get("type")
        else:
            continue
        kind = _classify_element(label, kind_hint)
        entry = {"label": label, "bbox": bbox, "confidence": conf}
        if kind == "table":
            struct.tables.append(entry)
        elif kind == "field":
            struct.forms.append(entry)
        elif kind == "button":
            struct.buttons.append(entry)
        else:
            struct.text_blocks.append({**entry, "text": label})
    return struct
# ----------------------------------------------------------------------------
# Cache disque
# ----------------------------------------------------------------------------
def _cache_path(session_id: str, frame_index: int) -> Path:
    """Return ``<cache root>/<session_id>/<frame_index>.json``.

    The session id is validated first, so an invalid one raises ``ValueError``.
    """
    safe_session = _validate_session_id(session_id)
    file_name = f"{int(frame_index)}.json"
    return OMNIPARSER_CACHE_ROOT / safe_session / file_name
def _cache_read(session_id: str, frame_index: int) -> Optional[dict]:
    """Load the cached analysis for a frame, or ``None`` when unusable.

    Returns:
        The decoded JSON object, or ``None`` when the file does not exist,
        cannot be read or decoded, or does not hold a JSON object.

    Raises:
        ValueError: ``session_id`` is invalid (raised by ``_cache_path``).
    """
    path = _cache_path(session_id, frame_index)
    if not path.exists():
        return None
    try:
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # which a truncated or corrupted file can raise.
        logger.warning("[PHASE25] cache illisible %s : %s", path, exc)
        return None
    if not isinstance(payload, dict):
        # Valid JSON of the wrong shape would crash callers that use .get().
        logger.warning(
            "[PHASE25] cache illisible %s : %s",
            path, f"objet JSON attendu, recu {type(payload).__name__}",
        )
        return None
    return payload
def _cache_write(session_id: str, frame_index: int, payload: dict) -> None:
    """Best-effort write of ``payload`` to the frame's cache file.

    The payload is serialised in memory first, written to a sibling
    ``.json.tmp`` file and then moved over the final path. I/O and
    serialisation failures are logged and swallowed, since a cache miss must
    never abort an analysis. A leftover temporary file is removed.

    Raises:
        ValueError: ``session_id`` is invalid (raised by ``_cache_path``).
    """
    path = _cache_path(session_id, frame_index)
    tmp = path.with_suffix(".json.tmp")
    try:
        # Serialise before touching the disk so a non-serialisable payload
        # (TypeError / ValueError) cannot leave a truncated file behind.
        text = json.dumps(payload, ensure_ascii=False, indent=2)
        _ensure_dir(path.parent)
        with tmp.open("w", encoding="utf-8") as fh:
            fh.write(text)
        tmp.replace(path)
    except (OSError, TypeError, ValueError) as exc:  # pragma: no cover
        logger.warning("[PHASE25] cache ecriture KO %s : %s", path, exc)
        try:
            tmp.unlink()
        except OSError:
            # Nothing to clean up, or cleanup impossible: stay best-effort.
            pass
# ----------------------------------------------------------------------------
# Analyseur principal
# ----------------------------------------------------------------------------
class Phase25Analyzer:
    """Post-learning semantic analyser (Phase 2.5).

    Minimal usage::

        analyzer = Phase25Analyzer(session_id="abc123")
        result = analyzer.analyze_frames(frames=[(0, img0), (12, img12)])
        path = analyzer.write_semantic_yaml(result, slug="ma_competence")

    ``frames`` is a sequence of ``(frame_index, PIL.Image)`` pairs. Screenshot
    paths and window titles are passed separately, through the
    ``screenshot_paths`` and ``window_titles`` mappings of ``analyze_frames``.
    """

    def __init__(
        self,
        session_id: str,
        *,
        omniparser: Optional[_OmniParserSafeWrapper] = None,
        max_screens: int = MAX_SCREENS_PER_SESSION,
        timeout_sec: float = OMNIPARSER_TIMEOUT_SEC,
    ) -> None:
        """Validate ``session_id`` and set up the OmniParser wrapper.

        Args:
            session_id: session identifier; ``ValueError`` if invalid.
            omniparser: wrapper to use (a new one is built when ``None``).
            max_screens: cap on the number of distinct screens analysed.
            timeout_sec: per-call OmniParser timeout, in seconds.
        """
        self.session_id = _validate_session_id(session_id)
        self.omniparser = omniparser if omniparser is not None else _OmniParserSafeWrapper()
        self.max_screens = max_screens
        self.timeout_sec = timeout_sec
        self._healthcheck_passed = True
        self._healthcheck_reason: Optional[str] = None

    # -- healthcheck -------------------------------------------------------
    def healthcheck(self) -> bool:
        """Check that OmniParser answers on a dummy image (see specs §7).

        - PIL missing or adapter ``available=False``: healthcheck fails (the
          analysis still proceeds in degraded OCR-only mode).
        - The adapter raises: healthcheck fails and the error is logged to
          the dedicated log.

        Returns:
            ``True`` when the dummy detection succeeded. The outcome and its
            reason are also stored on the instance.
        """
        if not _HAS_PIL:
            self._healthcheck_passed = False
            self._healthcheck_reason = "PIL indisponible"
            return False
        if not self.omniparser.available:
            self._healthcheck_passed = False
            self._healthcheck_reason = (
                self.omniparser.import_error or "OmniParser indisponible"
            )
            return False
        try:
            dummy = Image.new("RGB", (64, 64), color=(255, 255, 255))
            self.omniparser.detect(dummy, timeout=self.timeout_sec)
        except Exception as exc:
            # frame_index -1 marks a healthcheck failure in the error log.
            _log_omniparser_error(self.session_id, -1, exc)
            self._healthcheck_passed = False
            self._healthcheck_reason = f"{type(exc).__name__}: {exc}"
            return False
        self._healthcheck_passed = True
        self._healthcheck_reason = None
        return True

    # -- single screen analysis -------------------------------------------
    def analyze_screen(
        self,
        frame_index: int,
        image: "Image.Image",
        phash: str,
        *,
        screenshot_path: Optional[str] = None,
        window_title: Optional[str] = None,
        force_fallback: bool = False,
    ) -> ScreenAnalysis:
        """Analyse one representative screen.

        Strategy:
            1. Disk cache (keyed by session_id + frame_index).
            2. OmniParser through the safe wrapper, else OCR-only docTR.
            3. On exception: dedicated log, ``degraded=True``, docTR result.

        Args:
            frame_index: index of the frame inside the session.
            image: PIL image of the screen.
            phash: hash of ``image`` as computed by the caller.
            screenshot_path: optional path of the screenshot on disk.
            window_title: optional window title recorded with the frame.
            force_fallback: skip OmniParser and use docTR directly.

        Returns:
            The ``ScreenAnalysis``, from the cache when present. A cache hit
            is returned as stored, regardless of ``force_fallback``.
        """
        # 1. Cache
        cached = _cache_read(self.session_id, frame_index)
        if cached is not None:
            cached_structure = cached.get("structure")
            if not isinstance(cached_structure, dict):
                # A null or malformed "structure" entry is treated as empty.
                cached_structure = {}
            struct = SemanticStructure(
                tables=cached_structure.get("tables") or [],
                forms=cached_structure.get("forms") or [],
                buttons=cached_structure.get("buttons") or [],
                text_blocks=cached_structure.get("text_blocks") or [],
            )
            return ScreenAnalysis(
                index=frame_index,
                # ScreenAnalysis.to_dict(), which produces the cache payload,
                # stores the hash under "hash"; "phash" is kept as a fallback
                # for payloads written with that key.
                phash=cached.get("hash", cached.get("phash", phash)),
                screen_id=cached.get("screen_id", f"screen_{frame_index:03d}"),
                screenshot_path=cached.get("screenshot_path", screenshot_path),
                structure=struct,
                degraded=bool(cached.get("degraded", False)),
                degraded_reason=cached.get("degraded_reason"),
                elapsed_sec=float(cached.get("elapsed_sec", 0.0)),
                window_title=cached.get("window_title", window_title),
            )
        t0 = time.monotonic()
        degraded = False
        degraded_reason: Optional[str] = None
        structure: SemanticStructure
        use_omniparser = self.omniparser.available and not force_fallback
        if use_omniparser:
            try:
                elements = _detect_via_omniparser(
                    self.omniparser, image, timeout=self.timeout_sec,
                )
                structure = _elements_to_structure(elements)
                if not (structure.tables or structure.forms or structure.buttons or structure.text_blocks):
                    # OmniParser produced nothing: complement with docTR
                    # text blocks (not flagged as degraded).
                    blocks = _detect_via_doctr(image, screenshot_path)
                    structure.text_blocks.extend(blocks)
            except Exception as exc:
                _log_omniparser_error(self.session_id, frame_index, exc)
                degraded = True
                degraded_reason = f"omniparser_exception: {type(exc).__name__}"
                blocks = _detect_via_doctr(image, screenshot_path)
                structure = SemanticStructure(text_blocks=blocks)
        else:
            degraded = True
            if not self.omniparser.available:
                degraded_reason = "omniparser_unavailable: " + (
                    self.omniparser.import_error or "n/a"
                )
            else:
                degraded_reason = "forced_fallback"
            blocks = _detect_via_doctr(image, screenshot_path)
            structure = SemanticStructure(text_blocks=blocks)
        elapsed = time.monotonic() - t0
        analysis = ScreenAnalysis(
            index=frame_index,
            phash=phash,
            screen_id=f"screen_{frame_index:03d}",
            screenshot_path=screenshot_path,
            structure=structure,
            degraded=degraded,
            degraded_reason=degraded_reason,
            elapsed_sec=elapsed,
            window_title=window_title,
        )
        # Cache write (best effort).
        _cache_write(self.session_id, frame_index, analysis.to_dict())
        return analysis

    # -- full pipeline -----------------------------------------------------
    def analyze_frames(
        self,
        frames: Sequence[Tuple[int, "Image.Image"]],
        *,
        screenshot_paths: Optional[dict[int, str]] = None,
        window_titles: Optional[dict[int, str]] = None,
        run_healthcheck: bool = True,
    ) -> Phase25Result:
        """Full pipeline: hash grouping, cap, per-screen analysis, result.

        Args:
            frames: sequence of ``(frame_index, PIL.Image)`` pairs.
            screenshot_paths: optional ``frame_index -> path`` mapping.
            window_titles: optional ``frame_index -> window_title`` mapping.
            run_healthcheck: run the OmniParser healthcheck before analysing.

        Returns:
            A ``Phase25Result`` with ``too_complex=True`` when more than
            ``max_screens`` distinct screens were found; only the first
            ``max_screens`` are analysed in that case.

        Raises:
            RuntimeError: PIL is not installed.
        """
        if not _HAS_PIL:
            raise RuntimeError("PIL est requis pour Phase25Analyzer.analyze_frames")
        if run_healthcheck:
            self.healthcheck()
            if not self._healthcheck_passed:
                logger.warning(
                    "[PHASE25] healthcheck OmniParser KO (%s) -> mode degrade docTR",
                    self._healthcheck_reason,
                )
        force_fallback = not self._healthcheck_passed
        # 1. Group frames by perceptual similarity.
        reps = identify_distinct_screens(frames)
        # 2. Apply the cap on distinct screens.
        too_complex = len(reps) > self.max_screens
        if too_complex:
            logger.warning(
                "[PHASE25] session %s : %d ecrans distincts > cap %d -> too_complex",
                self.session_id, len(reps), self.max_screens,
            )
            reps = reps[: self.max_screens]
        # 3. Analyse each representative.
        sp = screenshot_paths or {}
        wt = window_titles or {}
        screens: List[ScreenAnalysis] = []
        any_degraded = False
        for idx, img, phash in reps:
            analysis = self.analyze_screen(
                idx,
                img,
                phash,
                screenshot_path=sp.get(idx),
                window_title=wt.get(idx),
                force_fallback=force_fallback,
            )
            screens.append(analysis)
            any_degraded = any_degraded or analysis.degraded
        return Phase25Result(
            session_id=self.session_id,
            generated_at=datetime.now(timezone.utc).isoformat(),
            omniparser_available=self.omniparser.available and self._healthcheck_passed,
            degraded=any_degraded or not self._healthcheck_passed,
            too_complex=too_complex,
            screens=screens,
            healthcheck_passed=self._healthcheck_passed,
            healthcheck_reason=self._healthcheck_reason,
        )

    # -- YAML output -------------------------------------------------------
    def write_semantic_yaml(
        self,
        result: Phase25Result,
        slug: str,
        *,
        target_dir: Optional[Path] = None,
    ) -> Path:
        """Write ``<slug>.semantic.yaml`` next to the candidate competence YAML.

        Args:
            result: Phase 2.5 analysis result.
            slug: competence slug (validated against ``SLUG_PATTERN``).
            target_dir: output directory (default: ``SEMANTIC_DIR``).

        Returns:
            Path of the written file (``target_dir / "<slug>.semantic.yaml"``).

        Raises:
            ValueError: invalid slug, or ``target_dir`` is named
                ``supervised`` or ``stable``.
            OSError: the file could not be written.
        """
        s = _validate_slug(slug)
        out_dir = Path(target_dir if target_dir is not None else SEMANTIC_DIR)
        # Never overwrite supervised/stable artefacts. The check runs before
        # any filesystem access so a refused call creates no directory.
        forbidden = {"supervised", "stable"}
        if out_dir.name in forbidden:
            raise ValueError(
                f"target_dir interdit '{out_dir.name}' (autorise : candidate uniquement)"
            )
        _ensure_dir(out_dir)
        payload = {
            "competence_id": s,
            "semantic_version": 1,
            "generated_at": result.generated_at,
            "session_id": result.session_id,
            "omniparser_available": result.omniparser_available,
            "degraded": result.degraded,
            "too_complex": result.too_complex,
            "healthcheck_passed": result.healthcheck_passed,
            "healthcheck_reason": result.healthcheck_reason,
            "screens": [
                {
                    "screen_id": sc.screen_id,
                    "phash": sc.phash,
                    "representative_frame_index": sc.index,
                    "screenshot_path": sc.screenshot_path,
                    "window_title": sc.window_title,
                    "degraded": sc.degraded,
                    "degraded_reason": sc.degraded_reason,
                    "elapsed_sec": round(sc.elapsed_sec, 3),
                    "structure": sc.structure.to_dict(),
                    "annotations": [],  # placeholder for later human annotation
                }
                for sc in result.screens
            ],
        }
        target = out_dir / f"{s}.semantic.yaml"
        # Write to a sibling temporary file, then move it over the target.
        tmp = target.with_suffix(".yaml.tmp")
        with tmp.open("w", encoding="utf-8") as fh:
            yaml.safe_dump(payload, fh, allow_unicode=True, sort_keys=False)
        tmp.replace(target)
        logger.info(
            "[PHASE25] semantic yaml ecrit : %s (screens=%d, degraded=%s)",
            target, len(result.screens), result.degraded,
        )
        return target
# ----------------------------------------------------------------------------
# Helpers utilitaires (chargement frames)
# ----------------------------------------------------------------------------
def load_frames_from_paths(paths_by_index: dict[int, str]) -> List[Tuple[int, "Image.Image"]]:
    """Load PIL images from a ``frame_index -> path`` mapping.

    Frames are returned in ascending index order and fully loaded in memory.
    Paths that cannot be opened are skipped with a warning.

    Raises:
        RuntimeError: PIL is not installed.
    """
    if not _HAS_PIL:
        raise RuntimeError("PIL est requis pour load_frames_from_paths")
    loaded: List[Tuple[int, Image.Image]] = []
    for frame_index in sorted(paths_by_index):
        source = paths_by_index[frame_index]
        try:
            picture = Image.open(source)
            picture.load()
            loaded.append((int(frame_index), picture))
        except OSError as exc:  # FileNotFoundError is an OSError subclass
            logger.warning("[PHASE25] frame %d illisible (%s) : %s", frame_index, source, exc)
    return loaded
# Public API of the module: the analyser, its result dataclasses, the path and
# threshold constants, and the hashing / frame-loading helpers.
__all__ = [
"Phase25Analyzer",
"Phase25Result",
"ScreenAnalysis",
"SemanticStructure",
"SEMANTIC_DIR",
"OMNIPARSER_CACHE_DIR",
"OMNIPARSER_CACHE_ROOT",
"OMNIPARSER_ERROR_LOG",
"PHASH_HAMMING_THRESHOLD",
"MAX_SCREENS_PER_SESSION",
"compute_phash",
"identify_distinct_screens",
"load_frames_from_paths",
]