feat(p1): persist workflows and semantic learning artifacts

This commit is contained in:
Dom
2026-06-02 16:20:38 +02:00
parent 7a1a5cb6fd
commit 86b3c8f7e7
21 changed files with 3816 additions and 31 deletions

View File

@@ -988,7 +988,9 @@ def _resolve_by_grounding(
{"role": "user", "content": prompt, "images": [shot_b64]},
],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 100},
# D5-v3a (2026-05-25) num_ctx=4096 explicite : éviter fuite 8192
# via Modelfile qwen2.5vl:7b-rpa (PARAMETER num_ctx 8192).
"options": {"temperature": 0.1, "num_predict": 100, "num_ctx": 4096},
}, timeout=60)
content = resp.json().get("message", {}).get("content", "")
except Exception as e:
@@ -1016,7 +1018,9 @@ def _resolve_by_grounding(
{"role": "user", "content": prompt_mi, "images": [shot_b64, anchor_b64]},
],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 50},
# D5-v3a (2026-05-25) num_ctx=4096 explicite : éviter fuite
# 8192 via Modelfile qwen2.5vl:7b-rpa.
"options": {"temperature": 0.1, "num_predict": 50, "num_ctx": 4096},
}, timeout=60)
content2 = resp2.json().get("message", {}).get("content", "")
elapsed = time.time() - t0
@@ -2482,10 +2486,15 @@ def _get_validation_ocr_reader():
if _VALIDATION_OCR_READER is None and not _VALIDATION_OCR_FAILED:
try:
import easyocr # type: ignore
from core.llm.ocr_extractor import easyocr_gpu_enabled
gpu = easyocr_gpu_enabled(default=False)
_VALIDATION_OCR_READER = easyocr.Reader(
['fr', 'en'], gpu=True, verbose=False
['fr', 'en'], gpu=gpu, verbose=False
)
logger.info(
"[REPLAY] EasyOCR validator chargé (fr+en, %s)",
"GPU" if gpu else "CPU",
)
logger.info("[REPLAY] EasyOCR validator chargé (fr+en, GPU)")
except Exception as e:
logger.warning("[REPLAY] EasyOCR validator indisponible (%s) — pré-check désactivé", e)
_VALIDATION_OCR_FAILED = True
@@ -2507,8 +2516,15 @@ def _normalize_for_match(s: str) -> str:
def _text_match_fuzzy(expected: str, observed: str, min_token_ratio: float = 0.60) -> bool:
"""Match tolérant aux imperfections OCR.
1. Substring exacte → match.
2. Sinon : split en tokens ≥3 caractères, retourne True si au moins
1. Substring exacte (expected ⊂ observed) → match.
2. C-P1 (2026-05-25) : tolérance préfixe — observed est un préfixe
d'expected avec longueur ≥ 4 chars ET ≥ 50% de la longueur expected.
Couvre le cas OCR partiel "Enregi" / "Enregistrer" (6 chars sur 11
= 54%, préfixe strict) où l'OCR coupe une ligne longue. Garde-fous :
- len ≥ 4 évite "Sa" / "Save" (faux positif probable)
- 50% évite "Bo" / "Bouton" et "Enregi" / "Enregistrer sous" (qui
serait 37%, rejet correct).
3. Sinon : split en tokens ≥3 caractères, retourne True si au moins
`min_token_ratio` des tokens attendus apparaissent dans observed.
Ex : "Coller ou saisir le dossier patient" → tokens
['coller', 'saisir', 'dossier', 'patient'] ; si OCR voit "u saisir
@@ -2523,6 +2539,13 @@ def _text_match_fuzzy(expected: str, observed: str, min_token_ratio: float = 0.6
return True
if nexp in nobs:
return True
# C-P1 : tolérance préfixe sur OCR partiel
if (
len(nobs) >= 4
and len(nobs) * 2 >= len(nexp)
and nexp.startswith(nobs)
):
return True
tokens = [t for t in nexp.split() if len(t) >= 3]
if not tokens:
return False
@@ -3010,7 +3033,9 @@ def _locate_popup_button(
"model": "qwen2.5vl:7b",
"messages": [{"role": "user", "content": prompt, "images": [screenshot_b64]}],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 50},
# D5-v3a (2026-05-25) num_ctx=4096 explicite : éviter fuite 8192
# via Modelfile qwen2.5vl:7b/-rpa (PARAMETER num_ctx 8192).
"options": {"temperature": 0.1, "num_predict": 50, "num_ctx": 4096},
},
timeout=15,
)