feat(p1): persist workflows and semantic learning artifacts
This commit is contained in:
38
core/semantic/__init__.py
Normal file
38
core/semantic/__init__.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""Phase 2.5 — Analyse sémantique post-apprentissage.
|
||||
|
||||
Module dédié à l'analyse sémantique des écrans capturés en phase Shadow,
|
||||
**après** ``/api/v1/shadow/stop`` et **avant** restitution Option C.
|
||||
|
||||
Specs : ``docs/POC/SPECS_PHASE_25_SEMANTIQUE_2026-06-01.md``
|
||||
|
||||
Principes (arbitrage Plato 2026-06-01) :
|
||||
- Post-apprentissage uniquement, **jamais en hot path replay**.
|
||||
- OmniParser encapsulé derrière garde-fou anti-fragilité.
|
||||
- Fallback OCR-seul (docTR) systématique en cas d'exception.
|
||||
- Stockage ``.semantic.yaml`` séparé du YAML compétence principal.
|
||||
- Opt-in par compétence (rétrocompat totale).
|
||||
"""
|
||||
|
||||
from .phase25_analyzer import (
|
||||
Phase25Analyzer,
|
||||
Phase25Result,
|
||||
ScreenAnalysis,
|
||||
SemanticStructure,
|
||||
SEMANTIC_DIR,
|
||||
OMNIPARSER_CACHE_DIR,
|
||||
OMNIPARSER_ERROR_LOG,
|
||||
PHASH_HAMMING_THRESHOLD,
|
||||
MAX_SCREENS_PER_SESSION,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Phase25Analyzer",
|
||||
"Phase25Result",
|
||||
"ScreenAnalysis",
|
||||
"SemanticStructure",
|
||||
"SEMANTIC_DIR",
|
||||
"OMNIPARSER_CACHE_DIR",
|
||||
"OMNIPARSER_ERROR_LOG",
|
||||
"PHASH_HAMMING_THRESHOLD",
|
||||
"MAX_SCREENS_PER_SESSION",
|
||||
]
|
||||
920
core/semantic/phase25_analyzer.py
Normal file
920
core/semantic/phase25_analyzer.py
Normal file
@@ -0,0 +1,920 @@
|
||||
"""Phase 2.5 — Analyseur sémantique post-apprentissage.
|
||||
|
||||
Module isolé qui prend en entrée un ensemble de screenshots capturés
|
||||
pendant la phase Shadow et produit un payload structuré
|
||||
``{tables, forms, buttons, text_blocks}`` par écran distinct,
|
||||
stocké dans un fichier ``.semantic.yaml`` séparé.
|
||||
|
||||
Specs : ``docs/POC/SPECS_PHASE_25_SEMANTIQUE_2026-06-01.md``
|
||||
|
||||
Garde-fous :
|
||||
- Wrapper try/except global autour de chaque appel OmniParser.
|
||||
- Fallback OCR-seul (docTR) si OmniParser indisponible ou KO.
|
||||
- Healthcheck OmniParser au démarrage : KO ⇒ bascule auto en dégradé.
|
||||
- Cache disque ``data/cache/omniparser/<session>/<index>.json``.
|
||||
- Cap 10 écrans distincts par session.
|
||||
- Aucun import de FastAPI, aucun appel réseau direct.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import concurrent.futures
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
try: # pragma: no cover - dépendance externe déjà présente dans le projet
|
||||
import yaml
|
||||
except ImportError as exc: # pragma: no cover
|
||||
raise RuntimeError("PyYAML est requis pour core.semantic.phase25_analyzer") from exc
|
||||
|
||||
try: # PIL toujours présent côté Linux dev / DGX
|
||||
from PIL import Image
|
||||
_HAS_PIL = True
|
||||
except ImportError: # pragma: no cover
|
||||
Image = None # type: ignore[assignment]
|
||||
_HAS_PIL = False
|
||||
|
||||
try:
|
||||
import imagehash # type: ignore
|
||||
_HAS_IMAGEHASH = True
|
||||
except ImportError: # pragma: no cover - fallback MD5 thumbnail
|
||||
imagehash = None # type: ignore[assignment]
|
||||
_HAS_IMAGEHASH = False
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Constantes et chemins
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# Filesystem layout, anchored on the third ancestor directory of this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
DATA_ROOT = REPO_ROOT.joinpath("data")
SEMANTIC_DIR = DATA_ROOT.joinpath("competences", "candidate")
OMNIPARSER_CACHE_ROOT = DATA_ROOT.joinpath("cache", "omniparser")
OMNIPARSER_CACHE_DIR = OMNIPARSER_CACHE_ROOT  # public alias
LOGS_DIR = REPO_ROOT.joinpath("logs")
OMNIPARSER_ERROR_LOG = LOGS_DIR.joinpath("omniparser_errors.log")

# Perceptual grouping heuristic (see specs §3).
PHASH_HAMMING_THRESHOLD = 8
MAX_SCREENS_PER_SESSION = 10
THUMBNAIL_SIZE = (256, 256)  # thumbnail size used by the MD5 fallback hash

# Per-screenshot timeout, in seconds (see specs §2).
OMNIPARSER_TIMEOUT_SEC = 30.0

# Allowed slug: lowercase letter first, then a-z / 0-9 / underscore
# (3 to 80 characters in total).
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")
# Allowed session_id: harmless characters only (alphanumerics, "_" and "-").
SESSION_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_\-]{0,127}$")
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Dataclasses
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class SemanticStructure:
    """Semantic content of one screen, split by element category (specs §2).

    Each attribute is a list of plain ``dict`` entries; all four default to an
    empty list.
    """

    tables: List[dict] = field(default_factory=list)
    forms: List[dict] = field(default_factory=list)
    buttons: List[dict] = field(default_factory=list)
    text_blocks: List[dict] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the four categories as a dict of shallow-copied lists."""
        category_names = ("tables", "forms", "buttons", "text_blocks")
        return {name: list(getattr(self, name)) for name in category_names}
|
||||
|
||||
|
||||
@dataclass
class ScreenAnalysis:
    """Analysis of one representative screen (see specs §3)."""

    index: int
    phash: str
    screen_id: str
    screenshot_path: Optional[str]
    structure: SemanticStructure
    degraded: bool = False
    degraded_reason: Optional[str] = None
    elapsed_sec: float = 0.0
    window_title: Optional[str] = None
    # The "Codex contract" snapshot (flattened view intended for the
    # agent-chat / dashboard) is not stored: to_dict() computes it on the fly.

    def to_dict(self) -> dict:
        """Serialise the analysis to a plain dict.

        The perceptual hash is exported under the ``"hash"`` key, and the
        flattened ``"elements"`` list is derived from ``structure`` through
        ``_structure_to_elements``. ``elapsed_sec`` is rounded to 3 decimals.
        """
        flattened = _structure_to_elements(self.structure)
        payload = {
            "index": self.index,
            "hash": self.phash,
            "screen_id": self.screen_id,
            "window_title": self.window_title,
            "screenshot_path": self.screenshot_path,
        }
        payload["structure"] = self.structure.to_dict()
        payload["elements"] = flattened
        payload["degraded"] = self.degraded
        payload["degraded_reason"] = self.degraded_reason
        payload["elapsed_sec"] = round(self.elapsed_sec, 3)
        return payload
|
||||
|
||||
|
||||
@dataclass
class Phase25Result:
    """Overall outcome of one Phase 2.5 analysis run."""

    session_id: str
    generated_at: str
    omniparser_available: bool
    degraded: bool
    too_complex: bool
    screens: List[ScreenAnalysis] = field(default_factory=list)
    healthcheck_passed: bool = True
    healthcheck_reason: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialise the result; each screen is converted via its ``to_dict``."""
        scalar_fields = (
            "session_id",
            "generated_at",
            "omniparser_available",
            "degraded",
            "too_complex",
            "healthcheck_passed",
            "healthcheck_reason",
        )
        payload = {name: getattr(self, name) for name in scalar_fields}
        payload["screens"] = [screen.to_dict() for screen in self.screens]
        return payload
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Helpers : validation et FS
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _validate_session_id(session_id: Any) -> str:
    """Return the stripped ``session_id`` after validating it.

    Raises:
        ValueError: if the value is not a non-blank string, does not match
            ``SESSION_ID_PATTERN``, or contains a path-traversal marker.
    """
    is_non_blank_text = isinstance(session_id, str) and bool(session_id.strip())
    if not is_non_blank_text:
        raise ValueError("session_id doit etre une chaine non vide")

    candidate = session_id.strip()
    if SESSION_ID_PATTERN.match(candidate) is None:
        raise ValueError(
            "session_id invalide (autorise : [A-Za-z0-9][A-Za-z0-9_-]{0,127})"
        )

    # Belt-and-braces anti path-traversal check: explicitly refuse "..", "/"
    # and "\" even though the regex above should already reject them.
    if any(marker in candidate for marker in ("..", "/", "\\")):
        raise ValueError("session_id invalide (path-traversal interdit)")
    return candidate
|
||||
|
||||
|
||||
def _validate_slug(slug: Any) -> str:
    """Return the stripped ``slug`` after checking it against ``SLUG_PATTERN``.

    Raises:
        ValueError: if ``slug`` is not a string or does not match the pattern.
    """
    if not isinstance(slug, str):
        raise ValueError("slug doit etre une chaine")

    cleaned = slug.strip()
    if SLUG_PATTERN.match(cleaned) is None:
        raise ValueError(
            f"slug invalide '{cleaned}' (regle : {SLUG_PATTERN.pattern})"
        )
    return cleaned
|
||||
|
||||
|
||||
def _ensure_dir(path: Path) -> None:
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _log_omniparser_error(session_id: str, frame_index: int, exc: BaseException) -> None:
    """Append one JSON line describing ``exc`` to ``logs/omniparser_errors.log``.

    Best-effort (see specs §7): an ``OSError`` raised while creating the log
    directory or writing the line is reported through the module logger and
    is not re-raised.

    Args:
        session_id: session the failing frame belongs to.
        frame_index: index of the failing frame.
        exc: exception to record.
    """
    exc_type = type(exc)
    try:
        _ensure_dir(LOGS_DIR)
        record = dict(
            timestamp=datetime.now(timezone.utc).isoformat(),
            session_id=session_id,
            frame_index=frame_index,
            error_type=exc_type.__name__,
            error_message=str(exc),
            traceback=traceback.format_exception_only(exc_type, exc),
        )
        with OMNIPARSER_ERROR_LOG.open("a", encoding="utf-8") as log_file:
            serialized = json.dumps(record, ensure_ascii=False)
            log_file.write(serialized + "\n")
    except OSError as log_exc:  # pragma: no cover - best-effort logging
        logger.warning("[PHASE25] echec ecriture omniparser_errors.log : %s", log_exc)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Hash perceptuel (avec fallback MD5)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_phash(image: "Image.Image") -> str:
    """Return a perceptual hash of ``image``, or an MD5 thumbnail hash.

    When ``imagehash`` is usable, the result is ``str(imagehash.phash(image))``.
    If it is missing, or if it raises, the image is copied, reduced with
    ``thumbnail(THUMBNAIL_SIZE)``, encoded as RGB PNG, and the MD5 hex digest
    of those bytes is returned with an ``"md5:"`` prefix.
    """
    imagehash_usable = _HAS_IMAGEHASH and imagehash is not None
    if imagehash_usable:
        try:
            return str(imagehash.phash(image))
        except Exception as exc:  # pragma: no cover
            logger.warning("[PHASE25] phash imagehash KO, fallback MD5 : %s", exc)

    # MD5 fallback computed on a thumbnail of the image.
    reduced = image.copy()
    reduced.thumbnail(THUMBNAIL_SIZE)
    png_bytes = io.BytesIO()
    reduced.convert("RGB").save(png_bytes, format="PNG")
    digest = hashlib.md5(png_bytes.getvalue()).hexdigest()
    return "md5:" + digest
|
||||
|
||||
|
||||
def _hamming_distance(h1: str, h2: str) -> int:
|
||||
"""Distance de Hamming entre deux phash imagehash, ou fallback MD5.
|
||||
|
||||
- Cas imagehash : on reconvertit via ``imagehash.hex_to_hash``.
|
||||
- Cas MD5 (préfixe ``md5:``) : 0 si égal, sinon distance "haute" pour ne
|
||||
jamais les considérer comme similaires (heuristique conservative).
|
||||
"""
|
||||
if h1.startswith("md5:") or h2.startswith("md5:"):
|
||||
return 0 if h1 == h2 else PHASH_HAMMING_THRESHOLD + 1
|
||||
if not _HAS_IMAGEHASH or imagehash is None:
|
||||
# Pas d'imagehash mais les hashes hex présents (rare) : XOR brut.
|
||||
try:
|
||||
i1 = int(h1, 16)
|
||||
i2 = int(h2, 16)
|
||||
return bin(i1 ^ i2).count("1")
|
||||
except ValueError:
|
||||
return PHASH_HAMMING_THRESHOLD + 1
|
||||
try:
|
||||
return abs(imagehash.hex_to_hash(h1) - imagehash.hex_to_hash(h2))
|
||||
except Exception:
|
||||
return PHASH_HAMMING_THRESHOLD + 1
|
||||
|
||||
|
||||
def identify_distinct_screens(
    frames: Sequence[Tuple[int, "Image.Image"]],
    threshold: int = PHASH_HAMMING_THRESHOLD,
) -> List[Tuple[int, "Image.Image", str]]:
    """Group frames by hash similarity and keep one representative per group.

    Args:
        frames: sequence of ``(frame_index, PIL.Image)`` pairs.
        threshold: maximum Hamming distance for two frames to count as the
            same screen.

    Returns:
        A list of ``(frame_index, image, phash)`` tuples, one per group, in
        order of first appearance (the first frame seen is the representative).
    """
    kept: List[Tuple[int, Image.Image, str]] = []
    for frame_index, picture in frames:
        digest = compute_phash(picture)
        for rep_index, _rep_picture, rep_digest in kept:
            if _hamming_distance(digest, rep_digest) <= threshold:
                logger.debug(
                    "[PHASE25] frame %d regroupee avec representant %d (phash=%s)",
                    frame_index, rep_index, digest,
                )
                break
        else:
            # No existing representative is close enough: new distinct screen.
            kept.append((frame_index, picture, digest))
    return kept
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Conversion structure ⇄ "elements" (contrat Codex)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _structure_to_elements(struct: SemanticStructure) -> List[dict]:
|
||||
"""Aplatissement structure -> liste d'éléments {kind, label, bbox, confidence}."""
|
||||
elements: List[dict] = []
|
||||
for tbl in struct.tables:
|
||||
elements.append({
|
||||
"kind": "table",
|
||||
"label": tbl.get("label", "table"),
|
||||
"bbox": tbl.get("bbox", []),
|
||||
"confidence": float(tbl.get("confidence", 0.5)),
|
||||
})
|
||||
for frm in struct.forms:
|
||||
elements.append({
|
||||
"kind": "field",
|
||||
"label": frm.get("label", "field"),
|
||||
"bbox": frm.get("bbox", []),
|
||||
"confidence": float(frm.get("confidence", 0.5)),
|
||||
})
|
||||
for btn in struct.buttons:
|
||||
elements.append({
|
||||
"kind": "button",
|
||||
"label": btn.get("label", "button"),
|
||||
"bbox": btn.get("bbox", []),
|
||||
"confidence": float(btn.get("confidence", 0.5)),
|
||||
})
|
||||
for tb in struct.text_blocks:
|
||||
elements.append({
|
||||
"kind": "text_block",
|
||||
"label": tb.get("label", tb.get("text", "")),
|
||||
"bbox": tb.get("bbox", []),
|
||||
"confidence": float(tb.get("confidence", 0.5)),
|
||||
})
|
||||
return elements
|
||||
|
||||
|
||||
def _classify_element(label: str, kind_hint: str | None = None) -> str:
|
||||
"""Heuristique de classification d'un élément OmniParser.
|
||||
|
||||
Cohérente avec ``OmniParserAdapter._classify_element``, mais retourne
|
||||
nos catégories sémantiques : ``table | field | button | text_block``.
|
||||
"""
|
||||
lab = (label or "").lower()
|
||||
if kind_hint:
|
||||
kh = kind_hint.lower()
|
||||
if "table" in kh:
|
||||
return "table"
|
||||
if "input" in kh or "field" in kh or "edit" in kh:
|
||||
return "field"
|
||||
if "button" in kh or "btn" in kh:
|
||||
return "button"
|
||||
if any(kw in lab for kw in ("button", "btn", "submit", "valider", "annuler", "ok", "close")):
|
||||
return "button"
|
||||
if any(kw in lab for kw in ("input", "field", "saisie", "textbox", "champ")):
|
||||
return "field"
|
||||
if "table" in lab or "grille" in lab:
|
||||
return "table"
|
||||
return "text_block"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Adapter wrappers : OmniParser et docTR (fallback)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _OmniParserSafeWrapper:
|
||||
"""Wrap fragile OmniParserAdapter avec garde-fou anti-exception.
|
||||
|
||||
- Import paresseux (lazy) pour ne pas casser l'import du module si
|
||||
OmniParser n'est pas installé.
|
||||
- ``available=False`` ⇒ caller bascule en fallback OCR-seul.
|
||||
- Timeout effectif appliqué autour de chaque appel ``detect`` via
|
||||
``ThreadPoolExecutor`` + ``future.result(timeout=...)``.
|
||||
"""
|
||||
|
||||
# Executor module-level pour ne pas créer un pool par appel.
|
||||
_TIMEOUT_EXECUTOR: Optional[concurrent.futures.ThreadPoolExecutor] = None
|
||||
|
||||
@classmethod
|
||||
def _get_executor(cls) -> concurrent.futures.ThreadPoolExecutor:
|
||||
if cls._TIMEOUT_EXECUTOR is None:
|
||||
cls._TIMEOUT_EXECUTOR = concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=2, thread_name_prefix="phase25-omniparser-timeout",
|
||||
)
|
||||
return cls._TIMEOUT_EXECUTOR
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._adapter: Any = None
|
||||
self._available: bool = False
|
||||
self._import_error: Optional[str] = None
|
||||
self._try_import()
|
||||
|
||||
def _try_import(self) -> None:
|
||||
try:
|
||||
from core.detection.omniparser_adapter import OmniParserAdapter # type: ignore
|
||||
self._adapter = OmniParserAdapter()
|
||||
self._available = bool(getattr(self._adapter, "available", False))
|
||||
if not self._available:
|
||||
# L'adapter existe mais le check de disponibilité a échoué.
|
||||
self._import_error = "OmniParser adapter installé mais modèles non disponibles"
|
||||
except Exception as exc:
|
||||
self._adapter = None
|
||||
self._available = False
|
||||
self._import_error = f"{type(exc).__name__}: {exc}"
|
||||
|
||||
@property
|
||||
def available(self) -> bool:
|
||||
return self._available
|
||||
|
||||
@property
|
||||
def import_error(self) -> Optional[str]:
|
||||
return self._import_error
|
||||
|
||||
def detect(
|
||||
self,
|
||||
image: "Image.Image",
|
||||
*,
|
||||
timeout: Optional[float] = None,
|
||||
) -> List[Any]:
|
||||
"""Appel sécurisé : enrobé d'un timeout dur, lève en cas d'exception.
|
||||
|
||||
Args:
|
||||
image: image PIL à analyser.
|
||||
timeout: timeout en secondes (défaut : ``OMNIPARSER_TIMEOUT_SEC``).
|
||||
Si dépassé ⇒ ``concurrent.futures.TimeoutError`` propagée au
|
||||
caller, qui bascule en fallback docTR + ``degraded=True``.
|
||||
"""
|
||||
if not self._available or self._adapter is None:
|
||||
return []
|
||||
effective_timeout = (
|
||||
timeout if timeout is not None else OMNIPARSER_TIMEOUT_SEC
|
||||
)
|
||||
executor = self._get_executor()
|
||||
future = executor.submit(self._adapter.detect, image)
|
||||
try:
|
||||
return list(future.result(timeout=effective_timeout))
|
||||
except concurrent.futures.TimeoutError as exc:
|
||||
# Le thread OmniParser continue son travail en arrière-plan mais
|
||||
# le résultat est ignoré ; le caller bascule en fallback docTR.
|
||||
logger.warning(
|
||||
"[PHASE25] OmniParser.detect timeout (%.1fs) -> fallback",
|
||||
effective_timeout,
|
||||
)
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.warning("[PHASE25] OmniParser.detect KO : %s", exc)
|
||||
raise # remonté au caller pour log + fallback
|
||||
|
||||
|
||||
def _detect_via_omniparser(
|
||||
wrapper: _OmniParserSafeWrapper,
|
||||
image: "Image.Image",
|
||||
*,
|
||||
timeout: Optional[float] = None,
|
||||
) -> List[Any]:
|
||||
return wrapper.detect(image, timeout=timeout)
|
||||
|
||||
|
||||
def _detect_via_doctr(image: "Image.Image", screenshot_path: Optional[str]) -> List[dict]:
    """OCR-only fallback (docTR): return raw text blocks, one per text line.

    No VLM and no fine-grained classification: every non-empty OCR line
    becomes a dict with ``label``, ``text``, a pixel ``bbox`` and a fixed
    confidence of 0.6.

    Returns ``[]`` when PIL or docTR is unavailable, when ``image`` is
    ``None``, or when predictor initialisation or prediction raises.
    """
    if image is None or not _HAS_PIL:
        return []

    try:
        from doctr.io import DocumentFile  # type: ignore
        from doctr.models import ocr_predictor  # type: ignore
    except ImportError:
        logger.info("[PHASE25] docTR non disponible pour fallback OCR")
        return []

    # The predictor is cached in a module-level global to avoid reloading it
    # on every call; the global is created here on first use.
    global _DOCTR_PREDICTOR
    if "_DOCTR_PREDICTOR" not in globals():
        _DOCTR_PREDICTOR = None  # type: ignore[assignment]

    try:
        if _DOCTR_PREDICTOR is None:  # type: ignore[has-type]
            _DOCTR_PREDICTOR = ocr_predictor(  # type: ignore[assignment]
                det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True,
            )
    except Exception as exc:  # pragma: no cover
        logger.warning("[PHASE25] docTR init KO : %s", exc)
        return []

    # docTR takes a file or raw image bytes; the on-disk path is preferred
    # when one is given and exists.
    collected: List[dict] = []
    try:
        if screenshot_path and Path(screenshot_path).exists():
            document = DocumentFile.from_images([screenshot_path])
        else:
            png_buffer = io.BytesIO()
            image.convert("RGB").save(png_buffer, format="PNG")
            png_buffer.seek(0)
            document = DocumentFile.from_images([png_buffer.getvalue()])

        prediction = _DOCTR_PREDICTOR(document)  # type: ignore[misc]
        width, height = image.size

        ocr_lines = (
            ocr_line
            for page in prediction.pages
            for block in page.blocks
            for ocr_line in block.lines
        )
        for ocr_line in ocr_lines:
            text = " ".join(word.value for word in ocr_line.words).strip()
            if not text:
                continue
            # Geometry is ((x1, y1), (x2, y2)) normalised to 0-1; scale it to
            # pixel coordinates of ``image``.
            geometry = ocr_line.geometry
            first_point, second_point = geometry[0], geometry[1]
            collected.append({
                "label": text,
                "text": text,
                "bbox": [
                    int(first_point[0] * width),
                    int(first_point[1] * height),
                    int(second_point[0] * width),
                    int(second_point[1] * height),
                ],
                "confidence": 0.6,  # fixed value: no line-level score is read from docTR here
            })
    except Exception as exc:  # pragma: no cover
        logger.warning("[PHASE25] docTR predict KO : %s", exc)
        return []

    return collected
|
||||
|
||||
|
||||
def _coerce_confidence(raw: Any, default: float = 0.5) -> float:
    """Convert ``raw`` to float; use ``default`` only when it is missing or invalid."""
    if raw is None:
        return default
    try:
        return float(raw)
    except (TypeError, ValueError):
        return default


def _elements_to_structure(elements: Iterable[Any]) -> SemanticStructure:
    """Convert OmniParser detections into a ``SemanticStructure``.

    Each element is either an object exposing a ``label`` attribute (such as a
    ``DetectedElement`` dataclass) or a dict; anything else is skipped.
    Elements are classified with ``_classify_element`` and stored as
    ``{"label", "bbox", "confidence"}`` entries; text blocks additionally get a
    ``"text"`` key equal to the label.

    A confidence of exactly 0.0 is preserved: the 0.5 default applies only
    when no usable value is present. For dicts, ``"score"`` is used when
    ``"confidence"`` is absent or null.
    """
    struct = SemanticStructure()
    for el in elements:
        if hasattr(el, "label"):
            label = str(getattr(el, "label", "") or "")
            bbox = list(getattr(el, "bbox", ()) or ())
            conf = _coerce_confidence(getattr(el, "confidence", None))
            kind_hint = getattr(el, "element_type", None)
        elif isinstance(el, dict):
            label = str(el.get("label") or el.get("text") or "")
            bbox = list(el.get("bbox") or [])
            raw_conf = el.get("confidence")
            if raw_conf is None:
                raw_conf = el.get("score")
            conf = _coerce_confidence(raw_conf)
            kind_hint = el.get("element_type") or el.get("type")
        else:
            continue

        kind = _classify_element(label, kind_hint)
        entry = {"label": label, "bbox": bbox, "confidence": conf}
        if kind == "table":
            struct.tables.append(entry)
        elif kind == "field":
            struct.forms.append(entry)
        elif kind == "button":
            struct.buttons.append(entry)
        else:
            struct.text_blocks.append({**entry, "text": label})
    return struct
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Cache disque
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _cache_path(session_id: str, frame_index: int) -> Path:
    """Return ``<cache root>/<session_id>/<frame_index>.json``.

    The session id is validated first; ``frame_index`` is coerced with ``int``.
    """
    safe_session = _validate_session_id(session_id)
    file_name = f"{int(frame_index)}.json"
    return OMNIPARSER_CACHE_ROOT.joinpath(safe_session, file_name)
|
||||
|
||||
|
||||
def _cache_read(session_id: str, frame_index: int) -> Optional[dict]:
    """Load the cached analysis for ``(session_id, frame_index)``.

    Returns:
        The cached payload, or ``None`` when there is no cache file, when it
        cannot be read or decoded, or when its top-level JSON value is not an
        object. Callers use ``.get`` on the result, so anything other than a
        dict is treated as an unusable cache entry.
    """
    path = _cache_path(session_id, frame_index)
    try:
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except FileNotFoundError:
        # Plain cache miss: nothing to report.
        return None
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("[PHASE25] cache illisible %s : %s", path, exc)
        return None

    if not isinstance(payload, dict):
        logger.warning(
            "[PHASE25] cache illisible %s : %s",
            path, "type JSON inattendu " + type(payload).__name__,
        )
        return None
    return payload
|
||||
|
||||
|
||||
def _cache_write(session_id: str, frame_index: int, payload: dict) -> None:
    """Write ``payload`` to the cache file, best-effort.

    The JSON is written to a sibling ``.json.tmp`` file and then moved over
    the final path with ``Path.replace``. Any I/O or serialisation failure is
    logged and swallowed, so the cache can never break an analysis; the
    temporary file is removed on failure.
    """
    path = _cache_path(session_id, frame_index)
    tmp = path.with_suffix(".json.tmp")
    try:
        _ensure_dir(path.parent)
        with tmp.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
        tmp.replace(path)
    except (OSError, TypeError, ValueError) as exc:  # pragma: no cover
        # TypeError / ValueError: payload that json cannot serialise.
        logger.warning("[PHASE25] cache ecriture KO %s : %s", path, exc)
        try:
            tmp.unlink()
        except OSError:
            # Nothing to clean up, or cleanup itself failed: stay best-effort.
            pass
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Analyseur principal
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Phase25Analyzer:
|
||||
"""Analyseur sémantique post-apprentissage.
|
||||
|
||||
Usage minimal :
|
||||
|
||||
analyzer = Phase25Analyzer(session_id="abc123")
|
||||
result = analyzer.analyze_frames(frames=[(0, img0), (12, img12), ...])
|
||||
path = analyzer.write_semantic_yaml(result, slug="ma_competence")
|
||||
|
||||
``frames`` est une séquence ``(frame_index, PIL.Image[, screenshot_path])``.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    session_id: str,
    *,
    omniparser: Optional[_OmniParserSafeWrapper] = None,
    max_screens: int = MAX_SCREENS_PER_SESSION,
    timeout_sec: float = OMNIPARSER_TIMEOUT_SEC,
) -> None:
    """Set up an analyzer for one session.

    Args:
        session_id: session identifier, validated by ``_validate_session_id``.
        omniparser: wrapper to use; a new ``_OmniParserSafeWrapper`` is
            created when omitted.
        max_screens: cap on the number of distinct screens.
        timeout_sec: timeout passed to each OmniParser call.
    """
    self.session_id = _validate_session_id(session_id)
    if omniparser is None:
        omniparser = _OmniParserSafeWrapper()
    self.omniparser = omniparser
    self.max_screens = max_screens
    self.timeout_sec = timeout_sec
    # The healthcheck is considered passed until healthcheck() says otherwise.
    self._healthcheck_passed = True
    self._healthcheck_reason: Optional[str] = None
|
||||
|
||||
# -- healthcheck -------------------------------------------------------
|
||||
|
||||
def healthcheck(self) -> bool:
    """Check that OmniParser answers on a dummy image (see specs §7).

    The outcome is stored in ``_healthcheck_passed`` / ``_healthcheck_reason``
    and also returned.

    - PIL missing, or wrapper not ``available``: the check fails (analysis can
      still proceed in OCR-only degraded mode).
    - The wrapper raises on a 64x64 white image: the check fails and the
      error is written to the dedicated log with frame index -1.
    """
    if not _HAS_PIL:
        self._healthcheck_passed = False
        self._healthcheck_reason = "PIL indisponible"
        return False

    if not self.omniparser.available:
        self._healthcheck_passed = False
        fallback_reason = "OmniParser indisponible"
        self._healthcheck_reason = self.omniparser.import_error or fallback_reason
        return False

    try:
        probe = Image.new("RGB", (64, 64), color=(255, 255, 255))
        self.omniparser.detect(probe, timeout=self.timeout_sec)
    except Exception as exc:
        _log_omniparser_error(self.session_id, -1, exc)
        self._healthcheck_passed = False
        self._healthcheck_reason = f"{type(exc).__name__}: {exc}"
        return False
    else:
        self._healthcheck_passed = True
        self._healthcheck_reason = None
        return True
|
||||
|
||||
# -- analyse écran ----------------------------------------------------
|
||||
|
||||
def analyze_screen(
    self,
    frame_index: int,
    image: "Image.Image",
    phash: str,
    *,
    screenshot_path: Optional[str] = None,
    window_title: Optional[str] = None,
    force_fallback: bool = False,
) -> ScreenAnalysis:
    """Analyse one representative screen.

    Strategy:
      1. Disk cache, keyed by session_id + frame_index.
      2. OmniParser through the safe wrapper, otherwise OCR-only docTR.
      3. On exception: dedicated log entry, ``degraded=True`` and a
         docTR-only structure.

    Args:
        frame_index: index of the representative frame.
        image: PIL image of the screen.
        phash: hash of ``image`` as computed by the caller.
        screenshot_path: optional on-disk path of the screenshot.
        window_title: optional window title.
        force_fallback: skip OmniParser even if it is available.

    Returns:
        The analysis, either rebuilt from the cache or freshly computed (in
        which case it is also written to the cache, best-effort).
    """
    # 1. Cache. Entries are written from ScreenAnalysis.to_dict(), which
    # stores the hash under "hash"; "phash" is still read as an alternate key.
    # Malformed entries (non-dict payload or structure, non-list categories,
    # null values) degrade to defaults instead of raising.
    cached = _cache_read(self.session_id, frame_index)
    if isinstance(cached, dict):
        raw_structure = cached.get("structure")
        if not isinstance(raw_structure, dict):
            raw_structure = {}
        categories = {}
        for key in ("tables", "forms", "buttons", "text_blocks"):
            value = raw_structure.get(key)
            categories[key] = value if isinstance(value, list) else []

        try:
            cached_elapsed = float(cached.get("elapsed_sec") or 0.0)
        except (TypeError, ValueError):
            cached_elapsed = 0.0

        cached_path = cached.get("screenshot_path")
        cached_title = cached.get("window_title")
        return ScreenAnalysis(
            index=frame_index,
            phash=cached.get("hash") or cached.get("phash") or phash,
            screen_id=cached.get("screen_id") or f"screen_{frame_index:03d}",
            # A null cached value must not hide what the caller provides.
            screenshot_path=cached_path if cached_path is not None else screenshot_path,
            structure=SemanticStructure(**categories),
            degraded=bool(cached.get("degraded", False)),
            degraded_reason=cached.get("degraded_reason"),
            elapsed_sec=cached_elapsed,
            window_title=cached_title if cached_title is not None else window_title,
        )

    t0 = time.monotonic()
    degraded = False
    degraded_reason: Optional[str] = None
    structure: SemanticStructure

    use_omniparser = self.omniparser.available and not force_fallback
    if use_omniparser:
        try:
            elements = _detect_via_omniparser(
                self.omniparser, image, timeout=self.timeout_sec,
            )
            structure = _elements_to_structure(elements)
            if not (structure.tables or structure.forms or structure.buttons or structure.text_blocks):
                # OmniParser produced nothing: complement with docTR text blocks.
                blocks = _detect_via_doctr(image, screenshot_path)
                structure.text_blocks.extend(blocks)
        except Exception as exc:
            _log_omniparser_error(self.session_id, frame_index, exc)
            degraded = True
            degraded_reason = f"omniparser_exception: {type(exc).__name__}"
            blocks = _detect_via_doctr(image, screenshot_path)
            structure = SemanticStructure(text_blocks=blocks)
    else:
        degraded = True
        degraded_reason = (
            "omniparser_unavailable: " + (self.omniparser.import_error or "n/a")
            if not self.omniparser.available
            else "forced_fallback"
        )
        blocks = _detect_via_doctr(image, screenshot_path)
        structure = SemanticStructure(text_blocks=blocks)

    elapsed = time.monotonic() - t0
    analysis = ScreenAnalysis(
        index=frame_index,
        phash=phash,
        screen_id=f"screen_{frame_index:03d}",
        screenshot_path=screenshot_path,
        structure=structure,
        degraded=degraded,
        degraded_reason=degraded_reason,
        elapsed_sec=elapsed,
        window_title=window_title,
    )

    # Cache write (best-effort).
    _cache_write(self.session_id, frame_index, analysis.to_dict())
    return analysis
|
||||
|
||||
# -- pipeline complet -------------------------------------------------
|
||||
|
||||
def analyze_frames(
    self,
    frames: Sequence[Tuple[int, "Image.Image"]],
    *,
    screenshot_paths: Optional[dict[int, str]] = None,
    window_titles: Optional[dict[int, str]] = None,
    run_healthcheck: bool = True,
) -> Phase25Result:
    """Full pipeline: hash grouping, screen cap, per-screen analysis, result.

    Args:
        frames: sequence of ``(frame_index, PIL.Image)`` tuples. A third
            element, when present and not ``None``, is taken as the
            screenshot path of that frame, as documented on the class.
        screenshot_paths: optional ``frame_index -> path`` mapping; it takes
            precedence over a path embedded in a frame tuple.
        window_titles: optional ``frame_index -> window_title`` mapping.
        run_healthcheck: run the OmniParser healthcheck before analysing.

    Returns:
        A ``Phase25Result``; ``too_complex`` is ``True`` when more than
        ``max_screens`` distinct screens were found, in which case only the
        first ``max_screens`` representatives are analysed.

    Raises:
        RuntimeError: PIL is not available.
    """
    if not _HAS_PIL:
        raise RuntimeError("PIL est requis pour Phase25Analyzer.analyze_frames")

    if run_healthcheck:
        self.healthcheck()
        if not self._healthcheck_passed:
            logger.warning(
                "[PHASE25] healthcheck OmniParser KO (%s) -> mode degrade docTR",
                self._healthcheck_reason,
            )

    force_fallback = not self._healthcheck_passed

    # Normalise frames to (index, image) pairs and collect embedded paths.
    # The caller's mapping is copied so that it is never mutated.
    sp = dict(screenshot_paths or {})
    wt = window_titles or {}
    pairs: List[Tuple[int, Any]] = []
    for frame in frames:
        idx, img = frame[0], frame[1]
        if len(frame) > 2 and frame[2] is not None:
            sp.setdefault(idx, str(frame[2]))
        pairs.append((idx, img))

    # 1. Group by perceptual similarity.
    reps = identify_distinct_screens(pairs)

    # 2. Apply the per-session cap on distinct screens.
    too_complex = len(reps) > self.max_screens
    if too_complex:
        logger.warning(
            "[PHASE25] session %s : %d ecrans distincts > cap %d -> too_complex",
            self.session_id, len(reps), self.max_screens,
        )
        reps = reps[: self.max_screens]

    # 3. Analyse each representative.
    screens: List[ScreenAnalysis] = []
    any_degraded = False
    for idx, img, phash in reps:
        analysis = self.analyze_screen(
            idx,
            img,
            phash,
            screenshot_path=sp.get(idx),
            window_title=wt.get(idx),
            force_fallback=force_fallback,
        )
        screens.append(analysis)
        any_degraded = any_degraded or analysis.degraded

    return Phase25Result(
        session_id=self.session_id,
        generated_at=datetime.now(timezone.utc).isoformat(),
        omniparser_available=self.omniparser.available and self._healthcheck_passed,
        degraded=any_degraded or not self._healthcheck_passed,
        too_complex=too_complex,
        screens=screens,
        healthcheck_passed=self._healthcheck_passed,
        healthcheck_reason=self._healthcheck_reason,
    )
|
||||
|
||||
# -- écriture YAML -----------------------------------------------------
|
||||
|
||||
def write_semantic_yaml(
    self,
    result: Phase25Result,
    slug: str,
    *,
    target_dir: Optional[Path] = None,
) -> Path:
    """Write the ``.semantic.yaml`` file for a competence.

    The payload is first dumped to a temporary sibling file and then moved
    onto its final name with ``Path.replace``, so readers never observe a
    half-written target. If the dump or the move fails, the temporary file
    is removed before the error propagates.

    Args:
        result: Phase 2.5 analysis result to serialise.
        slug: Competence slug; passed through ``_validate_slug`` and the
            returned value is used for the file name and ``competence_id``.
        target_dir: Output directory. Defaults to ``SEMANTIC_DIR``.

    Returns:
        Path of the written file, ``<target_dir>/<slug>.semantic.yaml``.

    Raises:
        ValueError: The slug is rejected by ``_validate_slug``, or
            ``target_dir`` is a directory named ``supervised`` or ``stable``.
        OSError: The file could not be written or moved into place.
    """
    s = _validate_slug(slug)
    out_dir = Path(target_dir if target_dir is not None else SEMANTIC_DIR)

    # Guard against overwriting supervised/stable artefacts. This check runs
    # BEFORE any filesystem side effect so that a rejected call does not
    # create the forbidden directory as a by-product.
    forbidden = {"supervised", "stable"}
    if out_dir.name in forbidden:
        raise ValueError(
            f"target_dir interdit '{out_dir.name}' (autorise : candidate uniquement)"
        )
    _ensure_dir(out_dir)

    payload = {
        "competence_id": s,
        "semantic_version": 1,
        "generated_at": result.generated_at,
        "session_id": result.session_id,
        "omniparser_available": result.omniparser_available,
        "degraded": result.degraded,
        "too_complex": result.too_complex,
        "healthcheck_passed": result.healthcheck_passed,
        "healthcheck_reason": result.healthcheck_reason,
        "screens": [
            {
                "screen_id": sc.screen_id,
                "phash": sc.phash,
                "representative_frame_index": sc.index,
                "screenshot_path": sc.screenshot_path,
                "window_title": sc.window_title,
                "degraded": sc.degraded,
                "degraded_reason": sc.degraded_reason,
                "elapsed_sec": round(sc.elapsed_sec, 3),
                "structure": sc.structure.to_dict(),
                "annotations": [],  # placeholder for later human annotation
            }
            for sc in result.screens
        ],
    }

    target = out_dir / f"{s}.semantic.yaml"
    # "<slug>.semantic.yaml" -> "<slug>.semantic.yaml.tmp" (same directory,
    # so the final replace stays on one filesystem).
    tmp = target.with_suffix(".yaml.tmp")
    try:
        with tmp.open("w", encoding="utf-8") as fh:
            yaml.safe_dump(payload, fh, allow_unicode=True, sort_keys=False)
        tmp.replace(target)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see, so a failed unlink is ignored.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

    logger.info(
        "[PHASE25] semantic yaml ecrit : %s (screens=%d, degraded=%s)",
        target, len(result.screens), result.degraded,
    )
    return target
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Helpers utilitaires (chargement frames)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_frames_from_paths(paths_by_index: dict[int, str]) -> List[Tuple[int, "Image.Image"]]:
    """Load PIL images from a ``frame_index -> path`` mapping.

    Entries are visited in ascending key order. A path that cannot be opened
    or loaded is skipped after a warning is logged; it does not abort the
    whole load.

    Args:
        paths_by_index: Mapping from frame index to image file path.

    Returns:
        List of ``(frame_index, image)`` pairs for the frames that loaded.

    Raises:
        RuntimeError: PIL is not available (``_HAS_PIL`` is false).
    """
    if not _HAS_PIL:
        raise RuntimeError("PIL est requis pour load_frames_from_paths")

    loaded: List[Tuple[int, "Image.Image"]] = []
    for frame_idx in sorted(paths_by_index):
        path = paths_by_index[frame_idx]
        try:
            picture = Image.open(path)
            # Force the pixel data to be read now so that an unreadable file
            # fails here rather than later in the caller.
            picture.load()
        except OSError as exc:  # FileNotFoundError is a subclass of OSError
            logger.warning("[PHASE25] frame %d illisible (%s) : %s", frame_idx, path, exc)
        else:
            loaded.append((int(frame_idx), picture))
    return loaded
|
||||
|
||||
|
||||
# Names exported by ``from <module> import *``; order is kept stable.
__all__ = [
    # Classes
    "Phase25Analyzer", "Phase25Result", "ScreenAnalysis", "SemanticStructure",
    # Constants
    "SEMANTIC_DIR", "OMNIPARSER_CACHE_DIR", "OMNIPARSER_CACHE_ROOT", "OMNIPARSER_ERROR_LOG",
    "PHASH_HAMMING_THRESHOLD", "MAX_SCREENS_PER_SESSION",
    # Functions
    "compute_phash", "identify_distinct_screens", "load_frames_from_paths",
]
|
||||
Reference in New Issue
Block a user