feat: chat unifié, GestureCatalog, Copilot, Léa UI, extraction données, vérification replay
Refonte majeure du système Agent Chat et ajout de nombreux modules : - Chat unifié : suppression du dual Workflows/Agent Libre, tout passe par /api/chat avec résolution en 3 niveaux (workflow → geste → "montre-moi") - GestureCatalog : 38 raccourcis clavier universels Windows avec matching sémantique, substitution automatique dans les replays, et endpoint /api/gestures - Mode Copilot : exécution pas-à-pas des workflows avec validation humaine via WebSocket (approve/skip/abort) avant chaque action - Léa UI (agent_v0/lea_ui/) : interface PyQt5 pour Windows avec overlay transparent pour feedback visuel pendant le replay - Data Extraction (core/extraction/) : moteur d'extraction visuelle de données (OCR + VLM → SQLite), avec schémas YAML et export CSV/Excel - ReplayVerifier (agent_v0/server_v1/) : vérification post-action par comparaison de screenshots, avec logique de retry (max 3) - IntentParser durci : meilleur fallback regex, type GREETING, patterns améliorés - Dashboard : nouvelles pages gestures, streaming, extractions - Tests : 63 tests GestureCatalog, 47 tests extraction, corrections tests existants - Dépréciation : /api/agent/plan et /api/agent/execute retournent HTTP 410, suppression du code hardcodé _plan_to_replay_actions Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
29
core/extraction/__init__.py
Normal file
29
core/extraction/__init__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""
|
||||
Module d'extraction de donnees structurees depuis des captures d'ecran.
|
||||
|
||||
Ce module orchestre le cycle complet :
|
||||
schema YAML -> navigation -> screenshot -> VLM/OCR -> validation -> SQLite -> CSV/Excel
|
||||
|
||||
Classes principales :
|
||||
- ExtractionSchema : definition des champs et regles de navigation
|
||||
- ExtractionField : definition d'un champ individuel
|
||||
- FieldExtractor : extraction via VLM (Ollama) ou OCR (docTR)
|
||||
- DataStore : stockage SQLite + export CSV/Excel
|
||||
- IterationController : controle de la boucle de navigation
|
||||
- ExtractionEngine : orchestrateur principal
|
||||
"""
|
||||
|
||||
from .schema import ExtractionField, ExtractionSchema
|
||||
from .field_extractor import FieldExtractor
|
||||
from .data_store import DataStore
|
||||
from .iteration_controller import IterationController
|
||||
from .extraction_engine import ExtractionEngine
|
||||
|
||||
__all__ = [
|
||||
"ExtractionField",
|
||||
"ExtractionSchema",
|
||||
"FieldExtractor",
|
||||
"DataStore",
|
||||
"IterationController",
|
||||
"ExtractionEngine",
|
||||
]
|
||||
420
core/extraction/data_store.py
Normal file
420
core/extraction/data_store.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""
|
||||
DataStore - Stockage SQLite des donnees extraites + export CSV/Excel
|
||||
|
||||
Chaque session d'extraction (ExtractionSchema applique a un ecran) cree
|
||||
une entree dans la table `extractions`. Les enregistrements individuels
|
||||
sont stockes dans la table `records` avec leurs donnees JSON, le chemin
|
||||
du screenshot source et un score de confiance.
|
||||
"""
|
||||
|
||||
import csv
import json
import logging
import sqlite3
import uuid
from contextlib import closing
from datetime import datetime, timezone
from io import StringIO
from pathlib import Path
from typing import Any, Dict, List, Optional

from .schema import ExtractionSchema
|
||||
|
||||
logger = logging.getLogger(__name__)


class DataStore:
    """SQLite-backed storage for extracted data, with CSV/Excel export.

    Each extraction session (an ExtractionSchema applied to a screen) creates
    one row in the `extractions` table. Individual records are stored in the
    `records` table with their JSON payload, the path of the source screenshot
    and a confidence score.

    Fixes over the initial version:
    - Connections are now closed explicitly. sqlite3's connection context
      manager (`with conn:`) only wraps the *transaction*; it never closes
      the connection, so every call used to leak a file handle.
    - `datetime.utcnow()` (deprecated since Python 3.12) is replaced by an
      equivalent timezone-aware call that produces the same string format.
    """

    def __init__(self, db_path: str = "data/extractions/store.db"):
        """
        Args:
            db_path: Location of the SQLite database file; parent
                directories are created on demand.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    # ------------------------------------------------------------------
    # Initialisation
    # ------------------------------------------------------------------

    def _init_db(self) -> None:
        """Create the tables and index if they do not exist yet."""
        with closing(self._connect()) as conn, conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS extractions (
                    id TEXT PRIMARY KEY,
                    schema_name TEXT NOT NULL,
                    schema_json TEXT NOT NULL,
                    created_at TEXT NOT NULL,
                    updated_at TEXT NOT NULL,
                    status TEXT NOT NULL DEFAULT 'in_progress',
                    record_count INTEGER NOT NULL DEFAULT 0
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS records (
                    id TEXT PRIMARY KEY,
                    extraction_id TEXT NOT NULL,
                    data_json TEXT NOT NULL,
                    screenshot_path TEXT,
                    confidence REAL NOT NULL DEFAULT 0.0,
                    errors_json TEXT,
                    created_at TEXT NOT NULL,
                    FOREIGN KEY (extraction_id) REFERENCES extractions(id)
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_records_extraction
                ON records(extraction_id)
            """)

    def _connect(self) -> sqlite3.Connection:
        """Open a new SQLite connection. The caller must close it."""
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row
        # WAL allows concurrent readers while a writer is active.
        conn.execute("PRAGMA journal_mode=WAL")
        return conn

    @staticmethod
    def _now() -> str:
        """Naive-UTC ISO-8601 timestamp.

        Produces the exact same string format as the deprecated
        `datetime.utcnow().isoformat()` (no "+00:00" suffix), so new rows
        stay comparable/sortable with previously stored timestamps.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # ------------------------------------------------------------------
    # Extractions (sessions)
    # ------------------------------------------------------------------

    def create_extraction(self, schema: "ExtractionSchema") -> str:
        """
        Create a new extraction session.

        Args:
            schema: Extraction schema (must expose `name` and `to_dict()`)

        Returns:
            extraction_id (UUID)
        """
        extraction_id = str(uuid.uuid4())
        now = self._now()

        # `closing(...)` guarantees the connection is released; the inner
        # `conn` context commits the transaction on success.
        with closing(self._connect()) as conn, conn:
            conn.execute(
                """
                INSERT INTO extractions (id, schema_name, schema_json, created_at, updated_at, status)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (
                    extraction_id,
                    schema.name,
                    json.dumps(schema.to_dict(), ensure_ascii=False),
                    now,
                    now,
                    "in_progress",
                ),
            )

        logger.info(
            "Extraction creee : %s (schema=%s)", extraction_id[:8], schema.name
        )
        return extraction_id

    def finish_extraction(self, extraction_id: str, status: str = "completed") -> None:
        """Mark an extraction as finished with the given status."""
        now = self._now()
        with closing(self._connect()) as conn, conn:
            conn.execute(
                "UPDATE extractions SET status = ?, updated_at = ? WHERE id = ?",
                (status, now, extraction_id),
            )

    def get_extraction(self, extraction_id: str) -> Optional[Dict[str, Any]]:
        """Fetch the metadata of one extraction, or None if unknown."""
        with closing(self._connect()) as conn:
            row = conn.execute(
                "SELECT * FROM extractions WHERE id = ?", (extraction_id,)
            ).fetchone()
            if row:
                return dict(row)
            return None

    def list_extractions(self, limit: int = 50) -> List[Dict[str, Any]]:
        """List the most recent extractions (newest first)."""
        with closing(self._connect()) as conn:
            rows = conn.execute(
                "SELECT * FROM extractions ORDER BY created_at DESC LIMIT ?",
                (limit,),
            ).fetchall()
            return [dict(r) for r in rows]

    # ------------------------------------------------------------------
    # Records
    # ------------------------------------------------------------------

    def add_record(
        self,
        extraction_id: str,
        data: Dict[str, Any],
        screenshot_path: Optional[str] = None,
        confidence: float = 0.0,
        errors: Optional[List[str]] = None,
    ) -> str:
        """
        Add one extracted record.

        Args:
            extraction_id: ID of the extraction session
            data: Extracted data (dict)
            screenshot_path: Path of the source screenshot
            confidence: Confidence score in [0, 1]
            errors: List of validation errors

        Returns:
            record_id (UUID)
        """
        record_id = str(uuid.uuid4())
        now = self._now()

        with closing(self._connect()) as conn, conn:
            conn.execute(
                """
                INSERT INTO records (id, extraction_id, data_json, screenshot_path,
                                     confidence, errors_json, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    record_id,
                    extraction_id,
                    json.dumps(data, ensure_ascii=False),
                    screenshot_path,
                    confidence,
                    json.dumps(errors or [], ensure_ascii=False),
                    now,
                ),
            )
            # Keep the denormalised counter on the parent session in sync.
            conn.execute(
                """
                UPDATE extractions
                SET record_count = record_count + 1, updated_at = ?
                WHERE id = ?
                """,
                (now, extraction_id),
            )

        logger.debug(
            "Record ajoute : %s (extraction=%s, confiance=%.2f)",
            record_id[:8],
            extraction_id[:8],
            confidence,
        )
        return record_id

    def get_records(self, extraction_id: str) -> List[Dict[str, Any]]:
        """
        Fetch all records of an extraction, oldest first.

        Returns:
            List of dicts with keys: id, data, screenshot_path,
            confidence, errors, created_at
        """
        with closing(self._connect()) as conn:
            rows = conn.execute(
                """
                SELECT id, data_json, screenshot_path, confidence,
                       errors_json, created_at
                FROM records
                WHERE extraction_id = ?
                ORDER BY created_at ASC
                """,
                (extraction_id,),
            ).fetchall()

        results = []
        for row in rows:
            results.append({
                "id": row["id"],
                "data": json.loads(row["data_json"]),
                "screenshot_path": row["screenshot_path"],
                "confidence": row["confidence"],
                "errors": json.loads(row["errors_json"]) if row["errors_json"] else [],
                "created_at": row["created_at"],
            })
        return results

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------

    def export_csv(self, extraction_id: str, output_path: str) -> str:
        """
        Export the records of a session to a CSV file (UTF-8 with BOM).

        Args:
            extraction_id: Session ID
            output_path: Output CSV file path

        Returns:
            Path of the created file

        Raises:
            ValueError: If the session has no records
        """
        records = self.get_records(extraction_id)
        if not records:
            raise ValueError(f"Aucun enregistrement pour l'extraction {extraction_id}")

        out = Path(output_path)
        out.parent.mkdir(parents=True, exist_ok=True)

        # Columns = union of all keys seen, in first-seen order.
        all_keys = self._collect_all_keys(records)

        # utf-8-sig so Excel detects the encoding; missing keys are written
        # empty, extra keys are ignored.
        with open(out, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
            writer.writeheader()
            for rec in records:
                writer.writerow(rec["data"])

        logger.info("Export CSV : %s (%d lignes)", output_path, len(records))
        return str(out)

    def export_excel(self, extraction_id: str, output_path: str) -> str:
        """
        Export the records of a session to an Excel file (openpyxl).

        Args:
            extraction_id: Session ID
            output_path: Output Excel file path

        Returns:
            Path of the created file

        Raises:
            ImportError: If openpyxl is not installed
            ValueError: If the session has no records
        """
        try:
            import openpyxl
        except ImportError:
            raise ImportError(
                "openpyxl est requis pour l'export Excel. "
                "Installez-le : pip install openpyxl"
            )

        records = self.get_records(extraction_id)
        if not records:
            raise ValueError(f"Aucun enregistrement pour l'extraction {extraction_id}")

        out = Path(output_path)
        out.parent.mkdir(parents=True, exist_ok=True)

        all_keys = self._collect_all_keys(records)

        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = "Extraction"

        # Header row in bold
        for col_idx, key in enumerate(all_keys, start=1):
            cell = ws.cell(row=1, column=col_idx, value=key)
            cell.font = openpyxl.styles.Font(bold=True)

        # Data rows
        for row_idx, rec in enumerate(records, start=2):
            for col_idx, key in enumerate(all_keys, start=1):
                ws.cell(row=row_idx, column=col_idx, value=rec["data"].get(key, ""))

        # Auto-fit column widths (capped at 50 characters)
        for col_idx, key in enumerate(all_keys, start=1):
            max_len = max(
                len(str(key)),
                *(len(str(rec["data"].get(key, ""))) for rec in records),
            )
            ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = min(max_len + 2, 50)

        wb.save(str(out))
        logger.info("Export Excel : %s (%d lignes)", output_path, len(records))
        return str(out)

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------

    def get_stats(self, extraction_id: str) -> Dict[str, Any]:
        """
        Compute statistics for one extraction.

        Returns:
            Dict with: record_count, avg_confidence, completeness,
            field_coverage, total_errors, status, created_at, updated_at —
            or {"error": ...} when the extraction does not exist.
        """
        extraction = self.get_extraction(extraction_id)
        if not extraction:
            return {"error": f"Extraction {extraction_id} introuvable"}

        records = self.get_records(extraction_id)

        if not records:
            return {
                "extraction_id": extraction_id,
                "schema_name": extraction["schema_name"],
                "status": extraction["status"],
                "record_count": 0,
                "avg_confidence": 0.0,
                "completeness": 0.0,
                "field_coverage": {},
            }

        # Mean confidence over all records
        confidences = [r["confidence"] for r in records]
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

        # Per-field coverage: share of records with a non-null, non-blank value
        schema_data = json.loads(extraction["schema_json"])
        field_names = [f["name"] for f in schema_data.get("fields", [])]

        field_coverage = {}
        for fname in field_names:
            filled = sum(
                1 for r in records
                if r["data"].get(fname) is not None
                and str(r["data"][fname]).strip() != ""
            )
            field_coverage[fname] = filled / len(records) if records else 0.0

        # Global completeness = mean of the per-field coverages
        completeness = (
            sum(field_coverage.values()) / len(field_coverage)
            if field_coverage else 0.0
        )

        # Total number of validation errors across records
        total_errors = sum(len(r.get("errors", [])) for r in records)

        return {
            "extraction_id": extraction_id,
            "schema_name": extraction["schema_name"],
            "status": extraction["status"],
            "record_count": len(records),
            "avg_confidence": round(avg_confidence, 3),
            "completeness": round(completeness, 3),
            "field_coverage": {k: round(v, 3) for k, v in field_coverage.items()},
            "total_errors": total_errors,
            "created_at": extraction["created_at"],
            "updated_at": extraction["updated_at"],
        }

    # ------------------------------------------------------------------
    # Cleanup
    # ------------------------------------------------------------------

    def delete_extraction(self, extraction_id: str) -> bool:
        """Delete an extraction and all of its records.

        Returns:
            True if the extraction row existed and was deleted.
        """
        with closing(self._connect()) as conn, conn:
            conn.execute("DELETE FROM records WHERE extraction_id = ?", (extraction_id,))
            result = conn.execute("DELETE FROM extractions WHERE id = ?", (extraction_id,))
            return result.rowcount > 0

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _collect_all_keys(records: List[Dict[str, Any]]) -> List[str]:
        """Collect all unique record keys, preserving first-seen order."""
        seen = set()
        keys = []
        for rec in records:
            for k in rec["data"].keys():
                if k not in seen:
                    seen.add(k)
                    keys.append(k)
        return keys
|
||||
312
core/extraction/extraction_engine.py
Normal file
312
core/extraction/extraction_engine.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
ExtractionEngine - Orchestrateur principal du moteur d'extraction de donnees
|
||||
|
||||
Orchestre le cycle complet :
|
||||
naviguer -> screenshot -> extraire -> valider -> stocker -> suivant
|
||||
|
||||
S'appuie sur FieldExtractor (VLM/OCR), DataStore (SQLite), et
|
||||
IterationController (navigation) pour realiser l'extraction automatisee
|
||||
de donnees depuis des interfaces utilisateur.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from .data_store import DataStore
|
||||
from .field_extractor import FieldExtractor
|
||||
from .iteration_controller import IterationController
|
||||
from .schema import ExtractionSchema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExtractionEngine:
    """
    Main extraction engine.

    Orchestrates the cycle: navigate -> screenshot -> extract -> store -> next.

    Usage modes:
        1. Automatic: start_extraction() — full loop with navigation
        2. Manual: extract_current_screen() — one-off extraction of a screenshot
    """

    def __init__(
        self,
        schema: ExtractionSchema,
        store: Optional[DataStore] = None,
        field_extractor: Optional[FieldExtractor] = None,
        streaming_server_url: str = "http://localhost:5005",
        screenshot_dir: str = "data/extractions/screenshots",
    ):
        """
        Args:
            schema: Extraction schema describing the fields and navigation
            store: DataStore used for persistence (a default one is created if omitted)
            field_extractor: Field extractor (a default one is created if omitted)
            streaming_server_url: URL of the Agent V1 streaming server
            screenshot_dir: Directory where screenshots are saved
        """
        self.schema = schema
        self.store = store or DataStore()
        self.field_extractor = field_extractor or FieldExtractor()
        self.controller = IterationController(schema, streaming_server_url)
        # Strip any trailing slash so endpoint URLs can be built with f-strings.
        self.streaming_server_url = streaming_server_url.rstrip("/")
        self.screenshot_dir = Path(screenshot_dir)
        self.screenshot_dir.mkdir(parents=True, exist_ok=True)

        # Internal state for the currently running extraction
        self._current_extraction_id: Optional[str] = None
        self._is_running = False
        self._should_stop = False
        self._progress_callback: Optional[Callable] = None

    # ------------------------------------------------------------------
    # Public API - automatic extraction
    # ------------------------------------------------------------------

    def start_extraction(
        self,
        session_id: str,
        on_progress: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> str:
        """
        Start an automatic extraction session.

        Loop:
            1. Create the extraction in the store
            2. For each record:
                a. Take a screenshot
                b. Extract the fields
                c. Validate
                d. Store
                e. Navigate to the next record
            3. Finalise and return the extraction_id

        Args:
            session_id: Streaming-session ID (used for navigation)
            on_progress: Callback invoked after each record (optional)

        Returns:
            extraction_id

        Raises:
            Exception: Re-raises any error from the loop after marking the
                extraction as "error" in the store.
        """
        self._is_running = True
        self._should_stop = False
        self._progress_callback = on_progress

        # Create the extraction session row first so partial results are kept.
        extraction_id = self.store.create_extraction(self.schema)
        self._current_extraction_id = extraction_id

        logger.info(
            "Demarrage extraction %s (schema=%s, max=%d)",
            extraction_id[:8],
            self.schema.name,
            self.controller.max_records,
        )

        try:
            # `_should_stop` is set by stop_extraction() for a cooperative stop.
            while self.controller.has_next() and not self._should_stop:
                idx = self.controller.current_index

                # 1. Screenshot
                screenshot_path = self._take_screenshot(session_id, idx)
                if screenshot_path is None:
                    logger.warning("Screenshot echoue a l'index %d, on continue", idx)
                    # Navigate anyway so the loop does not get stuck on this record.
                    self.controller.navigate_to_next(session_id)
                    continue

                # 2. Field extraction (VLM/OCR + schema validation)
                result = self.extract_current_screen(screenshot_path)

                # 3. Persist the record
                self.store.add_record(
                    extraction_id=extraction_id,
                    data=result["data"],
                    screenshot_path=screenshot_path,
                    confidence=result["confidence"],
                    errors=result.get("errors"),
                )

                # 4. Progress callback
                if self._progress_callback:
                    progress = self.get_progress()
                    progress["last_record"] = result["data"]
                    progress["last_confidence"] = result["confidence"]
                    self._progress_callback(progress)

                logger.info(
                    "Record %d/%d extrait (confiance=%.2f)",
                    idx + 1,
                    self.controller.max_records,
                    result["confidence"],
                )

                # 5. Navigate to the next record; a falsy return means the
                # controller reached the end of the data set.
                if not self.controller.navigate_to_next(session_id):
                    logger.info("Fin de navigation a l'index %d", idx)
                    break

            # Finalise the session with the outcome status.
            status = "stopped" if self._should_stop else "completed"
            self.store.finish_extraction(extraction_id, status=status)

            logger.info(
                "Extraction %s terminee : %s (%d records)",
                extraction_id[:8],
                status,
                self.controller.current_index,
            )

        except Exception as e:
            logger.error("Erreur pendant l'extraction : %s", e)
            self.store.finish_extraction(extraction_id, status="error")
            raise

        finally:
            self._is_running = False
            self._current_extraction_id = None

        return extraction_id

    def stop_extraction(self) -> None:
        """Request a cooperative stop of the extraction currently running."""
        if self._is_running:
            logger.info("Arret demande pour l'extraction en cours")
            self._should_stop = True

    # ------------------------------------------------------------------
    # Public API - one-off extraction
    # ------------------------------------------------------------------

    def extract_current_screen(self, screenshot_path: str) -> Dict[str, Any]:
        """
        Extract the schema fields from one screenshot, without navigating.

        Args:
            screenshot_path: Path to the screenshot

        Returns:
            Dict with 'data', 'confidence', 'errors', 'validation'
        """
        # Raw extraction through the VLM/OCR pipeline
        result = self.field_extractor.extract_fields(screenshot_path, self.schema)

        # Validate the extracted record against the schema
        validation = self.schema.validate_record(result["data"])
        result["validation"] = validation

        return result

    # ------------------------------------------------------------------
    # Public API - progress
    # ------------------------------------------------------------------

    def get_progress(self) -> Dict[str, Any]:
        """Return the current progress of the extraction (navigation + stats)."""
        nav_progress = self.controller.progress
        stats = {}

        if self._current_extraction_id:
            stats = self.store.get_stats(self._current_extraction_id)

        return {
            "extraction_id": self._current_extraction_id,
            "is_running": self._is_running,
            "navigation": nav_progress,
            "stats": stats,
            "schema_name": self.schema.name,
        }

    # ------------------------------------------------------------------
    # Screenshot
    # ------------------------------------------------------------------

    def _take_screenshot(self, session_id: str, index: int) -> Optional[str]:
        """
        Take a screenshot through the streaming server.

        Calls the streaming-server API to obtain the current screenshot.
        On any failure, returns None.

        Args:
            session_id: Streaming-session ID
            index: Index of the current record (used in the file name)

        Returns:
            Path of the saved screenshot, or None
        """
        try:
            response = requests.get(
                f"{self.streaming_server_url}/api/screenshot",
                params={"session_id": session_id},
                timeout=10,
            )

            if response.status_code == 200:
                # Save the raw PNG body to a uniquely named file.
                timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
                filename = f"record_{index:04d}_{timestamp}.png"
                filepath = self.screenshot_dir / filename

                with open(filepath, "wb") as f:
                    f.write(response.content)

                return str(filepath)
            else:
                logger.warning(
                    "Screenshot echoue : HTTP %d", response.status_code
                )
                return None

        except requests.exceptions.ConnectionError:
            logger.warning(
                "Streaming server non accessible pour screenshot"
            )
            return None

        except Exception as e:
            logger.error("Erreur screenshot : %s", e)
            return None

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------

    def extract_from_file(self, screenshot_path: str) -> Dict[str, Any]:
        """
        Shortcut: extract from an existing screenshot file and store the
        result immediately.

        Useful for offline reprocessing of screenshots.

        Args:
            screenshot_path: Path to an existing screenshot

        Returns:
            Dict with the extracted data plus 'record_id' and 'extraction_id'
        """
        # Reuse the running session if there is one, otherwise open a new one.
        if self._current_extraction_id is None:
            extraction_id = self.store.create_extraction(self.schema)
        else:
            extraction_id = self._current_extraction_id

        result = self.extract_current_screen(screenshot_path)

        record_id = self.store.add_record(
            extraction_id=extraction_id,
            data=result["data"],
            screenshot_path=screenshot_path,
            confidence=result["confidence"],
            errors=result.get("errors"),
        )

        result["record_id"] = record_id
        result["extraction_id"] = extraction_id
        return result
|
||||
327
core/extraction/field_extractor.py
Normal file
327
core/extraction/field_extractor.py
Normal file
@@ -0,0 +1,327 @@
|
||||
"""
|
||||
FieldExtractor - Extraction de champs structures depuis des screenshots
|
||||
|
||||
Utilise un VLM (Ollama) pour comprendre le contenu visuel et en extraire
|
||||
des donnees structurees selon un schema predefini.
|
||||
Fallback OCR via docTR si le VLM echoue.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from .schema import ExtractionField, ExtractionSchema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration Ollama (coherente avec le reste du projet)
|
||||
OLLAMA_DEFAULT_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||
OLLAMA_DEFAULT_MODEL = os.environ.get("VLM_MODEL", "qwen3-vl:8b")
|
||||
|
||||
|
||||
class FieldExtractor:
|
||||
"""
|
||||
Extrait des champs structures depuis un screenshot.
|
||||
|
||||
Pipeline :
|
||||
1. VLM : envoyer screenshot + schema au VLM pour extraction structuree
|
||||
2. Validation : verifier les regex, types, champs requis
|
||||
3. (Optionnel) OCR fallback si VLM indisponible
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ollama_url: str = OLLAMA_DEFAULT_URL,
|
||||
ollama_model: str = OLLAMA_DEFAULT_MODEL,
|
||||
timeout: int = 60,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
ollama_url: URL du serveur Ollama
|
||||
ollama_model: Modele VLM a utiliser
|
||||
timeout: Timeout en secondes pour les appels VLM
|
||||
"""
|
||||
self.ollama_url = ollama_url.rstrip("/")
|
||||
self.ollama_model = ollama_model
|
||||
self.timeout = timeout
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# API publique
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    def extract_fields(
        self,
        screenshot_path: str,
        schema: ExtractionSchema,
    ) -> Dict[str, Any]:
        """
        Extract the fields defined by the schema from a screenshot.

        Args:
            screenshot_path: Path to the image (PNG/JPEG)
            schema: Extraction schema

        Returns:
            Dict with the extracted fields + metadata
            {
                "data": {"nom": "DUPONT", "prenom": "Jean", ...},
                "confidence": 0.85,
                "errors": [],
                "raw_response": "..."
            }
        """
        path = Path(screenshot_path)
        if not path.exists():
            # Missing file: report the error instead of raising.
            return {
                "data": {},
                "confidence": 0.0,
                "errors": [f"Fichier introuvable : {screenshot_path}"],
                "raw_response": None,
            }

        # Encode the image as base64 for the VLM payload
        # (helper defined elsewhere in this class — not visible here).
        image_b64 = self._encode_image(path)

        # Primary path: structured extraction through the VLM
        raw_data, raw_response = self._extract_via_vlm(image_b64, schema.fields)

        if raw_data is None:
            # VLM unavailable or unparsable answer: fall back to OCR.
            logger.warning("VLM extraction echouee, tentative OCR fallback")
            raw_data = self._extract_via_ocr_fallback(path, schema.fields)
            raw_response = "(ocr fallback)"

        # Validation and cleanup of each field value
        validated = {}
        errors: List[str] = []
        valid_count = 0

        for fld in schema.fields:
            value = raw_data.get(fld.name) if raw_data else None
            # Normalise: strip whitespace and map textual null markers to None.
            if value is not None:
                value = str(value).strip()
                if value == "" or value.lower() in ("null", "none", "n/a"):
                    value = None

            validated[fld.name] = value

            if not fld.validate_value(value):
                errors.append(
                    f"Champ '{fld.name}' invalide ou manquant : {value!r}"
                )
            else:
                # Only non-empty valid values count toward confidence.
                if value is not None and str(value).strip():
                    valid_count += 1

        # Confidence = fraction of schema fields with a valid non-empty value.
        total = len(schema.fields) if schema.fields else 1
        confidence = valid_count / total

        return {
            "data": validated,
            "confidence": confidence,
            "errors": errors,
            "raw_response": raw_response,
        }
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Extraction VLM
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _extract_via_vlm(
    self, image_b64: str, fields: List[ExtractionField]
) -> tuple:
    """Call the Ollama VLM to perform structured field extraction.

    Args:
        image_b64: Screenshot encoded as a base64 string.
        fields: Field definitions used to build the prompt.

    Returns:
        Tuple ``(data_dict | None, raw_response_text | None)``; both are
        ``None`` on HTTP error, timeout, connection failure, or any
        unexpected exception (errors are logged, never raised).
    """
    base_prompt = self._build_extraction_prompt(fields)

    try:
        # Qwen3 models have a "thinking" mode; disable it so the model
        # answers directly with JSON instead of reasoning text.
        final_prompt = (
            f"/nothink {base_prompt}"
            if "qwen" in self.ollama_model.lower()
            else base_prompt
        )

        request_body = {
            "model": self.ollama_model,
            "prompt": final_prompt,
            "images": [image_b64],
            "stream": False,
            # Ask Ollama to constrain the output to valid JSON.
            "format": "json",
            "options": {
                "temperature": 0.1,
                "num_predict": 2000,
            },
        }

        resp = requests.post(
            f"{self.ollama_url}/api/generate",
            json=request_body,
            timeout=self.timeout,
        )

        if resp.status_code != 200:
            logger.error(
                "Erreur Ollama %d : %s",
                resp.status_code,
                resp.text[:300],
            )
            return None, None

        reply_text = resp.json().get("response", "").strip()
        logger.debug("Reponse VLM brute : %s", reply_text[:500])

        return self._parse_vlm_response(reply_text), reply_text

    except requests.exceptions.Timeout:
        logger.error("Timeout VLM apres %ds", self.timeout)
    except requests.exceptions.ConnectionError:
        logger.error("Ollama non accessible a %s", self.ollama_url)
    except Exception as e:
        logger.error("Erreur VLM inattendue : %s", e)
    return None, None
|
||||
|
||||
def _build_extraction_prompt(self, fields: List[ExtractionField]) -> str:
|
||||
"""Construire le prompt d'extraction structure pour le VLM."""
|
||||
field_descriptions = []
|
||||
for f in fields:
|
||||
desc = f"- {f.name} ({f.field_type}): {f.description}"
|
||||
if f.required:
|
||||
desc += " [OBLIGATOIRE]"
|
||||
if f.validation_regex:
|
||||
desc += f" (format: {f.validation_regex})"
|
||||
field_descriptions.append(desc)
|
||||
|
||||
fields_text = "\n".join(field_descriptions)
|
||||
|
||||
return f"""Regarde cette capture d'ecran et extrais les informations suivantes.
|
||||
|
||||
CHAMPS A EXTRAIRE :
|
||||
{fields_text}
|
||||
|
||||
INSTRUCTIONS :
|
||||
1. Extrais chaque champ tel qu'il apparait a l'ecran
|
||||
2. Si un champ n'est pas visible, mets null
|
||||
3. Pour les dates, conserve le format tel qu'affiche
|
||||
4. Pour les nombres, conserve le format avec virgule si present
|
||||
5. Reponds UNIQUEMENT en JSON valide
|
||||
|
||||
FORMAT DE REPONSE :
|
||||
Un objet JSON avec les cles correspondant aux noms de champs ci-dessus.
|
||||
Exemple : {{"nom": "DUPONT", "prenom": "Jean", "date_naissance": "15/03/1965"}}
|
||||
|
||||
Extrais maintenant les donnees :"""
|
||||
|
||||
def _parse_vlm_response(self, text: str) -> Optional[Dict[str, Any]]:
|
||||
"""Parser la reponse JSON du VLM."""
|
||||
if not text:
|
||||
return None
|
||||
|
||||
# Essayer le parse direct
|
||||
try:
|
||||
return json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Chercher un objet JSON dans la reponse
|
||||
match = re.search(r"\{[\s\S]*\}", text)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group())
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Chercher entre balises ```json ... ```
|
||||
match = re.search(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```", text)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
logger.warning("Impossible de parser la reponse VLM en JSON")
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# OCR Fallback
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _extract_via_ocr_fallback(
    self, image_path: Path, fields: List[ExtractionField]
) -> Optional[Dict[str, Any]]:
    """
    Fallback: extract raw text via OCR (docTR), without field mapping.

    This fallback is intentionally basic: it returns the whole OCR text
    under the reserved key ``"_ocr_text"`` and does not attempt to map
    lines to the requested fields. The VLM remains the preferred path.

    Args:
        image_path: Path to the screenshot to OCR.
        fields: Field definitions (currently unused here; kept for
            interface symmetry with the VLM extractor).

    Returns:
        ``{"_ocr_text": full_text}`` on success, ``None`` when docTR is
        unavailable or any OCR error occurs (errors are logged).
    """
    # FIX: the previous version opened the image with PIL
    # (``PILImage.open``) without ever using or closing it — a file-handle
    # leak and a hard Pillow dependency for a docTR-only code path.
    try:
        try:
            from doctr.io import DocumentFile
            from doctr.models import ocr_predictor
        except ImportError:
            logger.warning("docTR non disponible pour le fallback OCR")
            return None

        # Mobilenet architectures keep the fallback reasonably fast on CPU.
        predictor = ocr_predictor(
            det_arch="db_mobilenet_v3_large",
            reco_arch="crnn_mobilenet_v3_large",
            pretrained=True,
        )
        doc = DocumentFile.from_images([str(image_path)])
        result = predictor(doc)

        # Flatten the page/block/line hierarchy into plain text lines.
        all_text = []
        for page in result.pages:
            for block in page.blocks:
                for line in block.lines:
                    all_text.append(" ".join(w.value for w in line.words))

        full_text = "\n".join(all_text)
        logger.info("OCR fallback : %d lignes extraites", len(all_text))

        return {"_ocr_text": full_text}

    except Exception as e:
        # Best-effort fallback: never raise, just report failure.
        logger.error("Erreur OCR fallback : %s", e)
        return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Utilitaires
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _encode_image(path: Path) -> str:
|
||||
"""Encoder une image en base64."""
|
||||
with open(path, "rb") as f:
|
||||
return base64.b64encode(f.read()).decode("utf-8")
|
||||
|
||||
def check_vlm_available(self) -> bool:
    """Return True when the Ollama endpoint answers on ``/api/tags``."""
    try:
        probe = requests.get(
            f"{self.ollama_url}/api/tags", timeout=5
        )
    except (requests.RequestException, ConnectionError, TimeoutError):
        # Any network-level failure simply means "not available".
        return False
    return probe.status_code == 200
|
||||
258
core/extraction/iteration_controller.py
Normal file
258
core/extraction/iteration_controller.py
Normal file
@@ -0,0 +1,258 @@
|
||||
"""
|
||||
IterationController - Controle de navigation entre enregistrements
|
||||
|
||||
Gere la boucle de navigation : passage au record suivant, pagination,
|
||||
scroll, etc. Communique avec le streaming server (Agent V1) pour
|
||||
envoyer les actions de navigation sur la machine cible.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from .schema import ExtractionSchema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IterationController:
    """
    Controls navigation between the records to extract.

    Supported navigation types (read from the schema's ``navigation`` dict):
        - list_detail : click each item of a list in turn
        - pagination  : click a "next" button / next page
        - scroll      : vertical scrolling
        - manual      : the user navigates by hand; we only wait
    """

    def __init__(
        self,
        schema: ExtractionSchema,
        streaming_server_url: str = "http://localhost:5005",
    ):
        """
        Args:
            schema: Extraction schema (holds the navigation rules)
            streaming_server_url: URL of the Agent V1 streaming server
        """
        self.schema = schema
        self.server_url = streaming_server_url.rstrip("/")
        self.current_index = 0
        # Navigation settings, all read from schema.navigation with
        # conservative defaults.
        self.max_records = schema.navigation.get("max_records", 100)
        self.nav_type = schema.navigation.get("type", "manual")
        self.nav_action = schema.navigation.get("next_record", "click_next_in_list")
        self.nav_delay = schema.navigation.get("delay_ms", 1000)

        # Internal state
        self._started = False   # NOTE(review): set here and in reset() but never read — confirm before removing
        self._finished = False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def has_next(self) -> bool:
        """Return True while there are records left to process."""
        if self._finished:
            return False
        return self.current_index < self.max_records

    def navigate_to_next(self, session_id: str) -> bool:
        """
        Navigate to the next record.

        Sends the navigation actions to the streaming server according
        to the navigation type defined in the schema; on success the
        internal index is advanced by one.

        Args:
            session_id: ID of the streaming session

        Returns:
            True if navigation succeeded
        """
        if not self.has_next():
            logger.info("Plus d'enregistrements a traiter (index=%d)", self.current_index)
            return False

        success = False

        if self.nav_type == "manual":
            # Manual mode: the user drives; we only wait for the delay.
            logger.info(
                "Navigation manuelle : attente de %dms (index=%d)",
                self.nav_delay,
                self.current_index,
            )
            # nav_delay is in milliseconds; time.sleep takes seconds.
            time.sleep(self.nav_delay / 1000)
            success = True

        elif self.nav_type == "pagination":
            success = self._navigate_pagination(session_id)

        elif self.nav_type == "list_detail":
            success = self._navigate_list_detail(session_id)

        elif self.nav_type == "scroll":
            success = self._navigate_scroll(session_id)

        else:
            logger.warning("Type de navigation inconnu : %s", self.nav_type)
            success = False

        if success:
            self.current_index += 1
            logger.debug(
                "Navigation reussie -> index=%d/%d",
                self.current_index,
                self.max_records,
            )

        return success

    def navigate_to_record(self, session_id: str, index: int) -> bool:
        """
        Navigate to a specific record.

        Only forward navigation is supported: the controller steps
        through navigate_to_next() until the target index is reached.

        Args:
            session_id: ID of the streaming session
            index: Index of the target record

        Returns:
            True if navigation succeeded
        """
        if index < 0 or index >= self.max_records:
            logger.error("Index hors limites : %d (max=%d)", index, self.max_records)
            return False

        # Step forward one record at a time up to the target index.
        steps = index - self.current_index
        if steps < 0:
            logger.warning(
                "Navigation arriere non supportee (current=%d, target=%d)",
                self.current_index,
                index,
            )
            return False

        for _ in range(steps):
            if not self.navigate_to_next(session_id):
                return False

        return True

    def reset(self) -> None:
        """Reset the controller to its initial state."""
        self.current_index = 0
        self._started = False
        self._finished = False

    def mark_finished(self) -> None:
        """Mark the iteration as finished (e.g. end of list detected)."""
        self._finished = True
        logger.info("Iteration marquee comme terminee a l'index %d", self.current_index)

    @property
    def progress(self) -> Dict[str, Any]:
        """Return the current progress as a serialisable dict."""
        return {
            "current_index": self.current_index,
            "max_records": self.max_records,
            # Percentage rounded to one decimal; 0 when max_records is 0.
            "progress_pct": round(
                (self.current_index / self.max_records * 100)
                if self.max_records > 0 else 0,
                1,
            ),
            "nav_type": self.nav_type,
            "finished": self._finished,
        }

    # ------------------------------------------------------------------
    # Type-specific navigation
    # ------------------------------------------------------------------

    def _navigate_pagination(self, session_id: str) -> bool:
        """Pagination navigation (click the "next" button)."""
        action = {
            "type": "click",
            "target": self.nav_action,
            "description": "Cliquer sur le bouton suivant / page suivante",
        }
        return self._send_action(session_id, action)

    def _navigate_list_detail(self, session_id: str) -> bool:
        """List navigation (click the next element of the list)."""
        action = {
            "type": "click",
            "target": self.nav_action,
            "index": self.current_index,
            "description": f"Cliquer sur l'element {self.current_index + 1} de la liste",
        }
        return self._send_action(session_id, action)

    def _navigate_scroll(self, session_id: str) -> bool:
        """Scroll navigation (scroll down by a configurable amount)."""
        action = {
            "type": "scroll",
            "direction": "down",
            "amount": self.schema.navigation.get("scroll_amount", 300),
            "description": "Defiler vers le bas",
        }
        return self._send_action(session_id, action)

    # ------------------------------------------------------------------
    # Streaming-server communication
    # ------------------------------------------------------------------

    def _send_action(self, session_id: str, action: Dict[str, Any]) -> bool:
        """
        Send a navigation action to the streaming server.

        The action is posted to the streaming server API (port 5005).
        When the server is unreachable the delay is simulated instead
        and the action is reported as successful (degraded mode).

        Args:
            session_id: ID of the streaming session
            action: Description of the action to execute

        Returns:
            True if the action was executed or simulated
        """
        try:
            payload = {
                "session_id": session_id,
                "action": action,
            }

            response = requests.post(
                f"{self.server_url}/api/action",
                json=payload,
                timeout=10,
            )

            if response.status_code == 200:
                # Give the target UI time to settle after the action.
                if self.nav_delay > 0:
                    time.sleep(self.nav_delay / 1000)
                return True
            else:
                logger.warning(
                    "Action de navigation echouee : HTTP %d", response.status_code
                )
                return False

        except requests.exceptions.ConnectionError:
            # Deliberate best-effort: unreachable server is treated as
            # success after simulating the navigation delay.
            logger.warning(
                "Streaming server non accessible a %s — simulation du delai",
                self.server_url,
            )
            # Simulate the navigation wait (degraded mode)
            if self.nav_delay > 0:
                time.sleep(self.nav_delay / 1000)
            return True

        except Exception as e:
            # NOTE(review): a requests Timeout lands here and returns False,
            # unlike ConnectionError above — confirm this asymmetry is intended.
            logger.error("Erreur envoi action de navigation : %s", e)
            return False
|
||||
217
core/extraction/schema.py
Normal file
217
core/extraction/schema.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""
|
||||
Schema d'extraction de donnees - Definition des champs et navigation
|
||||
|
||||
Permet de definir un schema YAML decrivant les champs a extraire
|
||||
depuis des captures d'ecran (DPI, formulaires, listes...).
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
class ExtractionField:
    """A single field to pull out of a screenshot."""

    name: str  # e.g. "nom_patient", "date_naissance"
    description: str  # human-readable description, forwarded to the VLM
    field_type: str = "text"  # "text", "date", "number", "boolean"
    required: bool = True
    validation_regex: Optional[str] = None  # optional extra validation

    def validate_value(self, value: Optional[str]) -> bool:
        """
        Validate an extracted value for this field.

        Returns:
            True when the value is acceptable: blanks pass only for
            optional fields; otherwise the value must match the field
            type and, when set, the custom validation regex.
        """
        blank = value is None or str(value).strip() == ""
        if blank:
            # A blank value is only a problem for required fields.
            return not self.required

        text = str(value).strip()

        if self.field_type == "number":
            # Tolerate French formatting: decimal comma, thousand spaces.
            try:
                float(text.replace(",", ".").replace(" ", ""))
            except ValueError:
                return False

        elif self.field_type == "boolean":
            accepted = (
                "true", "false", "oui", "non", "1", "0", "vrai", "faux"
            )
            if text.lower() not in accepted:
                return False

        elif self.field_type == "date":
            # Common French/ISO date layouts.
            known_formats = (
                r"\d{2}/\d{2}/\d{4}",   # DD/MM/YYYY
                r"\d{2}-\d{2}-\d{4}",   # DD-MM-YYYY
                r"\d{4}-\d{2}-\d{2}",   # YYYY-MM-DD (ISO)
                r"\d{2}\.\d{2}\.\d{4}", # DD.MM.YYYY
            )
            if not any(re.fullmatch(p, text) for p in known_formats):
                return False

        # Custom regex, applied on top of the type check.
        if self.validation_regex and not re.fullmatch(self.validation_regex, text):
            return False

        return True
|
||||
|
||||
|
||||
@dataclass
class ExtractionSchema:
    """
    Complete extraction schema: a list of fields plus navigation rules.

    Round-trips to/from YAML so a schema can be saved and reused.
    """

    name: str  # e.g. "dossier_patient_DPI"
    description: str
    fields: List[ExtractionField] = field(default_factory=list)
    navigation: Dict[str, Any] = field(default_factory=dict)

    # --- YAML serialisation ---

    @classmethod
    def from_yaml(cls, path: str) -> "ExtractionSchema":
        """
        Load a schema from a YAML file.

        Args:
            path: Path to the YAML file

        Returns:
            An ExtractionSchema instance

        Raises:
            FileNotFoundError: when the file does not exist
            ValueError: when the YAML root is not a mapping
        """
        source = Path(path)
        if not source.exists():
            raise FileNotFoundError(f"Schema YAML non trouve : {path}")

        with open(source, "r", encoding="utf-8") as fh:
            loaded = yaml.safe_load(fh)

        if not isinstance(loaded, dict):
            raise ValueError(f"Le fichier YAML doit contenir un dictionnaire, pas {type(loaded).__name__}")

        return cls._from_dict(loaded)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema":
        """Build a schema from a plain Python dict."""
        return cls._from_dict(data)

    @classmethod
    def _from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema":
        """Internal constructor shared by from_yaml / from_dict."""
        parsed_fields = [
            ExtractionField(
                name=spec["name"],
                description=spec.get("description", ""),
                # Accept both "type" and "field_type" spellings.
                field_type=spec.get("type", spec.get("field_type", "text")),
                required=spec.get("required", True),
                # Accept both "validation" and "validation_regex".
                validation_regex=spec.get("validation", spec.get("validation_regex")),
            )
            for spec in data.get("fields", [])
        ]

        return cls(
            name=data.get("name", "unnamed"),
            description=data.get("description", ""),
            fields=parsed_fields,
            navigation=data.get("navigation", {}),
        )

    def to_yaml(self, path: str) -> None:
        """
        Write the schema to a YAML file (parent dirs are created).

        Args:
            path: Output path
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)

        with open(target, "w", encoding="utf-8") as fh:
            yaml.dump(self.to_dict(), fh, default_flow_style=False, allow_unicode=True, sort_keys=False)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a serialisable dict (inverse of from_dict)."""
        serialised = []
        for f in self.fields:
            entry = {
                "name": f.name,
                "description": f.description,
                "type": f.field_type,
                "required": f.required,
            }
            # The "validation" key is only emitted when a regex is set.
            if f.validation_regex:
                entry["validation"] = f.validation_regex
            serialised.append(entry)

        return {
            "name": self.name,
            "description": self.description,
            "fields": serialised,
            "navigation": self.navigation,
        }

    # --- Helpers ---

    @property
    def required_fields(self) -> List[ExtractionField]:
        """Fields flagged as mandatory."""
        return [f for f in self.fields if f.required]

    @property
    def field_names(self) -> List[str]:
        """Names of all fields, in declaration order."""
        return [f.name for f in self.fields]

    def get_field(self, name: str) -> Optional[ExtractionField]:
        """Look up a field by name; None when absent."""
        return next((f for f in self.fields if f.name == name), None)

    def validate_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate a complete record against the schema.

        Returns:
            Dict with 'valid' (bool), 'errors' (list), 'completeness'
            (float, share of fields that validated with a non-empty value)
        """
        problems: List[str] = []
        filled = 0

        for fld in self.fields:
            value = record.get(fld.name)
            if not fld.validate_value(value):
                problems.append(f"Champ '{fld.name}' invalide: {value!r}")
            elif value is not None and str(value).strip():
                filled += 1

        # Guard against division by zero for an empty field list.
        denominator = len(self.fields) if self.fields else 1

        return {
            "valid": not problems,
            "errors": problems,
            "completeness": filled / denominator,
        }
|
||||
Reference in New Issue
Block a user