feat(p1): persist workflows and semantic learning artifacts

2026-06-02 16:20:38 +02:00
parent 7a1a5cb6fd
commit 86b3c8f7e7
21 changed files with 3816 additions and 31 deletions
--- a/core/llm/__init__.py
+++ b/core/llm/__init__.py
@@ -6,7 +6,11 @@ from .t2a_decision import (
    analyze_dpi,
    build_dpi_enriched,
 )
-from .ocr_extractor import extract_table_from_image, extract_text_from_image
+from .ocr_extractor import (
+    extract_digits_tesseract_from_image,
+    extract_table_from_image,
+    extract_text_from_image,
+)

 __all__ = [
    "PROMPT_TEMPLATE",
@@ -15,4 +19,5 @@ __all__ = [
    "build_dpi_enriched",
    "extract_text_from_image",
    "extract_table_from_image",
+    "extract_digits_tesseract_from_image",
 ]
--- a/core/llm/ocr_extractor.py
+++ b/core/llm/ocr_extractor.py
@@ -1,6 +1,7 @@
 """Extracteur OCR — texte depuis une image (screenshot d'écran).

 Utilise EasyOCR fr+en. Singleton (chargement modèle ~3s au premier appel).
+Ajoute un chemin Tesseract spécialisé pour les chiffres/IPP d'écrans propres.

 Conçu pour le pipeline streaming serveur (actions `extract_text` /
 `extract_table`) : récupère un screenshot fresh (dernier heartbeat ou
@@ -11,6 +12,7 @@ pour analyse downstream (ex: t2a_decision, boucle sur N patients).
 from __future__ import annotations

 import logging
+import os
 import re
 from pathlib import Path
 from typing import List, Optional, Tuple
@@ -20,6 +22,19 @@ logger = logging.getLogger(__name__)
 _easyocr_reader = None


+def easyocr_gpu_enabled(default: bool = False) -> bool:
+    """Return whether EasyOCR may allocate GPU memory.
+
+    The replay server shares the GPU with Ollama. Defaulting EasyOCR to CPU
+    keeps VRAM available for the VLM; set RPA_EASYOCR_GPU=1 only for a measured
+    OCR benchmark or a runtime that has spare VRAM.
+
+    Args:
+        default: value returned when RPA_EASYOCR_GPU is unset, empty, or
+            contains only whitespace.
+
+    Returns:
+        True when the variable is one of "1", "true", "yes" or "on"
+        (case-insensitive, surrounding whitespace ignored); False for any
+        other non-blank value; ``default`` otherwise.
+    """
+    # Normalise before the emptiness check so that a blank value such as
+    # " " falls back to ``default`` instead of being read as "disabled".
+    raw = os.getenv("RPA_EASYOCR_GPU", "").strip().lower()
+    if not raw:
+        return default
+    return raw in {"1", "true", "yes", "on"}
+
+
 def _get_reader():
    """Initialise EasyOCR fr+en au premier appel (singleton, CPU forcé).

@@ -29,8 +44,9 @@ def _get_reader():
    global _easyocr_reader
    if _easyocr_reader is None:
        import easyocr
-        _easyocr_reader = easyocr.Reader(['fr', 'en'], gpu=False, verbose=False)
-        logger.info("EasyOCR initialisé (fr+en, CPU)")
+        gpu = easyocr_gpu_enabled(default=False)
+        _easyocr_reader = easyocr.Reader(['fr', 'en'], gpu=gpu, verbose=False)
+        logger.info("EasyOCR initialisé (fr+en, %s)", "GPU" if gpu else "CPU")
    return _easyocr_reader


@@ -73,17 +89,86 @@ def extract_text_from_image(
        return ""


+def extract_digits_tesseract_from_image(
+    image_path: str,
+    region: Optional[Tuple[int, int, int, int]] = None,
+    pattern: Optional[str] = None,
+    limit: Optional[int] = None,
+    psm: int = 6,
+    lang: str = "eng",
+    whitelist: str = "0123456789",
+) -> List[str]:
+    """Extract digit sequences from an image using Tesseract.
+
+    This is an explicit, separate path so that the general EasyOCR behaviour
+    used by `extract_text` is left untouched.
+
+    Args:
+        image_path: path of the image file on disk.
+        region: (x, y, w, h) crop applied before OCR. None = whole image.
+        pattern: Python regex tested with ``match`` against each digit
+                 sequence; non-matching sequences are dropped.
+                 IPP example: r"^25\\d{6}$".
+        limit: maximum number of values returned; a falsy value (None or 0)
+               disables the cap.
+        psm: Tesseract page segmentation mode, passed as ``--psm``.
+        lang: Tesseract language code.
+        whitelist: characters passed as ``tessedit_char_whitelist``; an empty
+                   string omits the option.
+
+    Returns:
+        Digit sequences in the order they appear in the Tesseract output.
+        If the file is missing or any step raises, a warning is logged and
+        an empty list is returned.
+    """
+    source = Path(image_path)
+    if not source.exists():
+        logger.warning("extract_digits_tesseract: fichier introuvable %s", image_path)
+        return []
+
+    try:
+        # Imported lazily: a missing dependency is reported through the same
+        # warning/empty-list path as any other OCR failure.
+        from PIL import Image
+        import pytesseract
+
+        with Image.open(source) as opened:
+            frame = opened
+            if region:
+                left, top, width, height = region
+                frame = frame.crop((left, top, left + width, top + height))
+            # Only greyscale and RGB images are passed through unconverted.
+            if frame.mode not in {"L", "RGB"}:
+                frame = frame.convert("RGB")
+
+            tess_args = ["--psm", str(psm)]
+            if whitelist:
+                tess_args += ["-c", f"tessedit_char_whitelist={whitelist}"]
+            raw_text = pytesseract.image_to_string(
+                frame,
+                lang=lang,
+                config=" ".join(tess_args),
+            )
+
+        digits = re.findall(r"\d+", raw_text)
+        if pattern:
+            matches = re.compile(pattern).match
+            digits = [candidate for candidate in digits if matches(candidate)]
+        return digits[:limit] if limit else digits
+    except Exception as exc:
+        logger.warning("extract_digits_tesseract échoué sur %s : %s", image_path, exc)
+        return []
+
+
 def extract_table_from_image(
    image_path: str,
    region: Optional[Tuple[int, int, int, int]] = None,
    pattern: Optional[str] = None,
    limit: Optional[int] = None,
+    engine: str = "easyocr",
 ) -> List[str]:
    """Extrait une liste de valeurs d'un tableau via OCR.

    Cas d'usage principal : lire la liste des IPP d'un tableau de patients
-    pour boucler dessus. EasyOCR retourne tous les tokens avec leur bbox,
-    on filtre par regex puis on trie par position (y croissant).
+    pour boucler dessus. Par défaut, EasyOCR retourne tous les tokens avec
+    leur bbox, on filtre par regex puis on trie par position (y croissant).
+    Pour des champs chiffres/IPP, `engine="tesseract"` active le chemin
+    spécialisé Tesseract validé sur captures Easily.

    Args:
        image_path: chemin du PNG sur disque.
@@ -92,6 +177,7 @@ def extract_table_from_image(
                 Si None : tous les tokens non vides sont retournés.
                 Exemple IPP : r"^\\d{8,10}$" ou r"^25\\d{6}$"
        limit: nombre maximal d'entrées à retourner (None = sans limite).
+        engine: "easyocr" (defaut) ou "tesseract" / "digits" / "ipp".

    Returns:
        Liste de strings dans l'ordre top → bottom (par y de bbox).
@@ -102,6 +188,15 @@ def extract_table_from_image(
        logger.warning("extract_table: fichier introuvable %s", image_path)
        return []

+    engine_name = (engine or "easyocr").strip().lower()
+    if engine_name in {"tesseract", "digits", "ipp"}:
+        return extract_digits_tesseract_from_image(
+            image_path,
+            region=region,
+            pattern=pattern,
+            limit=limit,
+        )
+
    try:
        from PIL import Image
        import numpy as np