feat: process mining BPMN, détection changement écran pHash, OCR docTR

Process Mining (core/analytics/process_mining_bridge.py) : - Bridge PM4Py : conversion sessions Shadow → event log → BPMN XML + PNG - KPIs automatiques : durée, variantes, goulots, distribution par app - Support sessions JSONL brutes et workflows core JSON - 42 tests (dont 1 sur données réelles) Détection changement d'écran (core/analytics/screen_change_detector.py) : - pHash (imagehash) : ~16ms par screenshot, seuils SAME/MINOR/MAJOR - 8 tests sur screenshots réels OCR docTR dans execute_extract_text : - docTR par défaut pour lecture simple (rapide, CPU) - Ollama VLM en fallback ou sur demande explicite (mode "vlm"/"ai") - Dual-mode adaptatif selon extraction_mode Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-18 13:07:56 +02:00
parent f5a672d7b9
commit 309dfd5287
6 changed files with 1684 additions and 36 deletions
--- a/core/analytics/process_mining_bridge.py
+++ b/core/analytics/process_mining_bridge.py
@@ -0,0 +1,621 @@
+"""
+Bridge entre les workflows Lea (core) et PM4Py pour le process mining.
+Genere des diagrammes BPMN et KPIs depuis les traces Shadow.
+
+Usage:
+    from core.analytics.process_mining_bridge import (
+        sessions_to_event_log,
+        workflow_to_event_log,
+        discover_bpmn,
+        compute_kpis,
+    )
+
+    # Depuis des sessions JSONL brutes
+    df = sessions_to_event_log(sessions_data)
+    result = discover_bpmn(df, output_dir="data/analytics/bpmn")
+    kpis = compute_kpis(df)
+
+    # Depuis un workflow core (dict JSON)
+    df = workflow_to_event_log(workflow_dict)
+"""
+
+import json
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+# ---- Import conditionnel PM4Py -----------------------------------------
+
+try:
+    import pm4py
+    PM4PY_AVAILABLE = True
+except ImportError:
+    PM4PY_AVAILABLE = False
+    logger.warning("pm4py non installe -- le process mining est desactive")
+
+
+def _sanitize_label(label: str) -> str:
+    """
+    Supprime les caracteres de controle (0x00-0x1F sauf tab/newline)
+    qui sont invalides en XML et font planter PM4Py.
+    """
+    return "".join(
+        c if c in ("\t", "\n", "\r") or ord(c) >= 0x20 else f"<0x{ord(c):02x}>"
+        for c in label
+    )
+
+
+# ---- Types d'evenements a ignorer (bruit) --------------------------------
+
+_NOISE_EVENT_TYPES = frozenset({
+    "heartbeat",
+    "action_result",
+    "screenshot",
+})
+
+# Types d'evenements significatifs pour le process mining
+_RELEVANT_EVENT_TYPES = frozenset({
+    "mouse_click",
+    "text_input",
+    "key_press",
+    "key_combo",
+    "window_focus_change",
+})
+
+
+# ===========================================================================
+# Conversion sessions JSONL -> event log PM4Py
+# ===========================================================================
+
+
+def _build_activity_label(event: dict) -> Optional[str]:
+    """
+    Construit un label d'activite lisible depuis un event JSONL brut.
+
+    Regles :
+    - mouse_click   -> "Clic - <app_name> (<window_title tronque>)"
+    - text_input    -> "Saisie '<text>' - <app_name>"
+    - key_press     -> "Touche <key> - <app_name>"
+    - key_combo     -> "Raccourci <keys> - <app_name>"
+    - window_focus_change -> "Fenetre <to.title> (<to.app_name>)"
+
+    Tous les labels sont sanitises pour supprimer les caracteres de controle
+    (ex: \\x13 pour Ctrl+S) qui sont invalides en XML/BPMN.
+    """
+    evt = event.get("event", event)
+    etype = evt.get("type", "")
+
+    if etype in _NOISE_EVENT_TYPES:
+        return None
+
+    # Extraction fenetre
+    window = evt.get("window", {})
+    app_name = window.get("app_name", "inconnu")
+    win_title = window.get("title", "")
+    # Tronquer le titre a 40 caracteres
+    short_title = (win_title[:40] + "...") if len(win_title) > 40 else win_title
+
+    label: Optional[str] = None
+
+    if etype == "mouse_click":
+        label = f"Clic - {app_name} ({short_title})"
+
+    elif etype == "text_input":
+        text = evt.get("text", "")
+        # Tronquer le texte a 20 caracteres pour rester lisible
+        short_text = (text[:20] + "...") if len(text) > 20 else text
+        label = f"Saisie '{short_text}' - {app_name}"
+
+    elif etype == "key_press":
+        key = evt.get("key", "?")
+        label = f"Touche {key} - {app_name}"
+
+    elif etype == "key_combo":
+        keys = evt.get("keys", [])
+        combo = "+".join(str(k) for k in keys)
+        label = f"Raccourci {combo} - {app_name}"
+
+    elif etype == "window_focus_change":
+        to_info = evt.get("to", {})
+        if not to_info:
+            return None
+        to_title = to_info.get("title", "?")
+        to_app = to_info.get("app_name", "?")
+        label = f"Fenetre {to_title} ({to_app})"
+
+    else:
+        # Types non reconnus : label generique
+        label = f"{etype} - {app_name}"
+
+    return _sanitize_label(label) if label else None
+
+
+def _extract_timestamp(event: dict) -> Optional[float]:
+    """Extrait le timestamp unix depuis un event JSONL."""
+    # Le timestamp peut etre au niveau racine ou dans event.timestamp
+    evt = event.get("event", event)
+    ts = evt.get("timestamp") or event.get("timestamp")
+    if ts is not None:
+        return float(ts)
+    # Fallback sur le champ 't' (format simplifie)
+    t = evt.get("t") or event.get("t")
+    if t is not None:
+        return float(t)
+    return None
+
+
+def sessions_to_event_log(
+    sessions_data: List[dict],
+    deduplicate_windows: bool = True,
+) -> pd.DataFrame:
+    """
+    Convertit des traces de sessions brutes (events JSONL) en event log PM4Py.
+
+    Chaque event pertinent devient une ligne :
+    - case:concept:name = session_id
+    - concept:name      = label d'activite (ex: "Clic - Notepad.exe (Bloc-notes)")
+    - time:timestamp    = timestamp UTC
+
+    Args:
+        sessions_data: liste de dicts, chaque dict est une ligne JSONL parsee.
+        deduplicate_windows: si True, supprime les window_focus_change
+            consecutifs vers la meme fenetre (bruit typique de Windows).
+
+    Returns:
+        DataFrame pret pour PM4Py.
+    """
+    rows: List[Dict[str, Any]] = []
+
+    # Regrouper par session_id pour le deduplication
+    sessions: Dict[str, List[dict]] = {}
+    for event in sessions_data:
+        sid = event.get("session_id", "unknown")
+        sessions.setdefault(sid, []).append(event)
+
+    for sid, events in sessions.items():
+        # Trier par timestamp
+        events.sort(key=lambda e: _extract_timestamp(e) or 0.0)
+        last_window_label: Optional[str] = None
+
+        for event in events:
+            label = _build_activity_label(event)
+            if label is None:
+                continue
+
+            ts = _extract_timestamp(event)
+            if ts is None:
+                continue
+
+            # Deduplication des changements de fenetre consecutifs
+            evt = event.get("event", event)
+            if deduplicate_windows and evt.get("type") == "window_focus_change":
+                if label == last_window_label:
+                    continue
+                last_window_label = label
+            else:
+                last_window_label = None
+
+            rows.append({
+                "case:concept:name": sid,
+                "concept:name": label,
+                "time:timestamp": pd.Timestamp(
+                    datetime.fromtimestamp(ts, tz=timezone.utc)
+                ),
+                "event_type": evt.get("type", ""),
+                "app_name": evt.get("window", {}).get("app_name", ""),
+            })
+
+    if not rows:
+        logger.warning("Aucun evenement pertinent trouve dans les sessions")
+        return pd.DataFrame(columns=[
+            "case:concept:name",
+            "concept:name",
+            "time:timestamp",
+            "event_type",
+            "app_name",
+        ])
+
+    df = pd.DataFrame(rows)
+    df = df.sort_values(["case:concept:name", "time:timestamp"]).reset_index(drop=True)
+    logger.info(
+        "Event log cree : %d evenements, %d sessions, %d activites distinctes",
+        len(df),
+        df["case:concept:name"].nunique(),
+        df["concept:name"].nunique(),
+    )
+    return df
+
+
+# ===========================================================================
+# Conversion workflow core (dict JSON) -> event log PM4Py
+# ===========================================================================
+
+
+def workflow_to_event_log(workflow_dict: dict) -> pd.DataFrame:
+    """
+    Convertit un workflow core (dict JSON) en DataFrame PM4Py.
+
+    Utilise les nodes et edges pour reconstituer une trace.
+    Chaque chemin du entry_node vers un end_node = un case.
+
+    Mapping :
+    - case:concept:name = workflow_id + suffixe de chemin
+    - concept:name      = node.name
+    - time:timestamp    = deduced from edge stats ou created_at
+    """
+    wf_id = workflow_dict.get("workflow_id", "wf_unknown")
+    nodes = {n["node_id"]: n for n in workflow_dict.get("nodes", [])}
+    edges = workflow_dict.get("edges", [])
+    entry_nodes = workflow_dict.get("entry_nodes", [])
+    created_at = workflow_dict.get("created_at", datetime.now(timezone.utc).isoformat())
+
+    if not nodes or not edges:
+        logger.warning("Workflow vide ou sans edges : %s", wf_id)
+        return pd.DataFrame(columns=[
+            "case:concept:name",
+            "concept:name",
+            "time:timestamp",
+        ])
+
+    # Construire un graphe d'adjacence
+    adjacency: Dict[str, List[dict]] = {}
+    for edge in edges:
+        from_node = edge.get("from_node") or edge.get("source_node", "")
+        adjacency.setdefault(from_node, []).append(edge)
+
+    # Parcours DFS pour trouver les chemins (limites a eviter l'explosion)
+    MAX_PATHS = 100
+    paths: List[List[str]] = []
+
+    def _dfs(current: str, path: List[str], visited: set) -> None:
+        if len(paths) >= MAX_PATHS:
+            return
+        if current in visited:
+            # Boucle detectee, sauvegarder le chemin tel quel
+            paths.append(path[:])
+            return
+        visited.add(current)
+        path.append(current)
+
+        outgoing = adjacency.get(current, [])
+        if not outgoing:
+            # End node
+            paths.append(path[:])
+        else:
+            for edge in outgoing:
+                to_node = edge.get("to_node") or edge.get("target_node", "")
+                if to_node:
+                    _dfs(to_node, path, visited)
+        path.pop()
+        visited.discard(current)
+
+    for entry in entry_nodes:
+        if entry in nodes:
+            _dfs(entry, [], set())
+
+    # Si pas d'entry nodes, essayer tous les nodes sans edges entrants
+    if not paths:
+        target_nodes = set()
+        for edge in edges:
+            to_node = edge.get("to_node") or edge.get("target_node", "")
+            target_nodes.add(to_node)
+        root_nodes = [nid for nid in nodes if nid not in target_nodes]
+        for root in root_nodes[:3]:
+            _dfs(root, [], set())
+
+    # Construire le DataFrame
+    rows: List[Dict[str, Any]] = []
+    try:
+        base_time = pd.Timestamp(datetime.fromisoformat(created_at))
+    except (ValueError, TypeError):
+        base_time = pd.Timestamp(datetime.now(timezone.utc))
+
+    for i, path in enumerate(paths):
+        case_id = f"{wf_id}_path_{i}"
+        for step_idx, node_id in enumerate(path):
+            node = nodes.get(node_id, {})
+            rows.append({
+                "case:concept:name": case_id,
+                "concept:name": node.get("name", node_id),
+                "time:timestamp": base_time + pd.Timedelta(seconds=step_idx),
+            })
+
+    df = pd.DataFrame(rows)
+    if not df.empty:
+        df = df.sort_values(["case:concept:name", "time:timestamp"]).reset_index(drop=True)
+    logger.info(
+        "Event log depuis workflow : %d evenements, %d chemins",
+        len(df), len(paths),
+    )
+    return df
+
+
+# ===========================================================================
+# Decouverte BPMN
+# ===========================================================================
+
+
+def discover_bpmn(
+    event_log_df: pd.DataFrame,
+    output_dir: str = "data/analytics/bpmn",
+    name: str = "process",
+) -> dict:
+    """
+    Decouvre un modele BPMN depuis un event log via Inductive Miner.
+
+    Args:
+        event_log_df: DataFrame au format PM4Py.
+        output_dir: repertoire de sortie pour les fichiers generes.
+        name: prefixe pour les noms de fichiers.
+
+    Returns:
+        {
+            'bpmn_xml_path': str,
+            'bpmn_image_path': str,
+            'petri_net_image_path': str,
+            'dfg_image_path': str,
+            'stats': {
+                'activities': int,
+                'variants': int,
+                'cases': int,
+            }
+        }
+    """
+    if not PM4PY_AVAILABLE:
+        raise ImportError("pm4py n'est pas installe. Installez-le : pip install pm4py")
+
+    if event_log_df.empty:
+        raise ValueError("Event log vide, impossible de decouvrir un BPMN")
+
+    out = Path(output_dir)
+    out.mkdir(parents=True, exist_ok=True)
+
+    # Decouverte BPMN par Inductive Miner
+    bpmn_model = pm4py.discover_bpmn_inductive(event_log_df)
+
+    # Export BPMN XML
+    bpmn_xml_path = str(out / f"{name}.bpmn")
+    try:
+        pm4py.write_bpmn(bpmn_model, bpmn_xml_path)
+    except Exception as e:
+        # PM4Py layout peut echouer avec des labels contenant des caracteres
+        # speciaux (accents, guillemets, etc.). Fallback : export via l'exporter
+        # interne sans layout.
+        logger.warning("Layout BPMN echoue (%s), export sans layout", e)
+        from pm4py.objects.bpmn.exporter import exporter as bpmn_exporter
+        bpmn_exporter.apply(bpmn_model, bpmn_xml_path)
+    logger.info("BPMN XML exporte : %s", bpmn_xml_path)
+
+    # Export image BPMN (PNG)
+    bpmn_image_path = str(out / f"{name}_bpmn.png")
+    try:
+        pm4py.save_vis_bpmn(bpmn_model, bpmn_image_path)
+        logger.info("BPMN PNG exporte : %s", bpmn_image_path)
+    except Exception as e:
+        logger.warning("Impossible de generer l'image BPMN : %s", e)
+        bpmn_image_path = None
+
+    # DFG (Directly-Follows Graph) avec performance
+    dfg_image_path = str(out / f"{name}_dfg.png")
+    try:
+        pm4py.save_vis_dfg(
+            *pm4py.discover_dfg(event_log_df),
+            file_path=dfg_image_path,
+        )
+        logger.info("DFG PNG exporte : %s", dfg_image_path)
+    except Exception as e:
+        logger.warning("Impossible de generer le DFG : %s", e)
+        dfg_image_path = None
+
+    # Petri net via Inductive Miner (pour visualisation alternative)
+    petri_image_path = str(out / f"{name}_petri.png")
+    try:
+        net, im, fm = pm4py.discover_petri_net_inductive(event_log_df)
+        pm4py.save_vis_petri_net(net, im, fm, file_path=petri_image_path)
+        logger.info("Petri net PNG exporte : %s", petri_image_path)
+    except Exception as e:
+        logger.warning("Impossible de generer le Petri net : %s", e)
+        petri_image_path = None
+
+    # Stats de base
+    variants = pm4py.get_variants(event_log_df)
+    n_cases = event_log_df["case:concept:name"].nunique()
+    n_activities = event_log_df["concept:name"].nunique()
+
+    result = {
+        "bpmn_xml_path": bpmn_xml_path,
+        "bpmn_image_path": bpmn_image_path,
+        "petri_net_image_path": petri_image_path,
+        "dfg_image_path": dfg_image_path,
+        "stats": {
+            "activities": n_activities,
+            "variants": len(variants),
+            "cases": n_cases,
+        },
+    }
+    logger.info("Decouverte BPMN terminee : %s", result["stats"])
+    return result
+
+
+# ===========================================================================
+# KPIs de process mining
+# ===========================================================================
+
+
+def compute_kpis(event_log_df: pd.DataFrame) -> dict:
+    """
+    Calcule les KPIs de process mining.
+
+    Returns:
+        {
+            'total_cases': int,
+            'total_events': int,
+            'unique_activities': int,
+            'variants_count': int,
+            'variants_top5': list,
+            'avg_case_duration_seconds': float,
+            'median_case_duration_seconds': float,
+            'avg_events_per_case': float,
+            'activity_stats': {
+                '<activity_name>': {
+                    'count': int,
+                    'avg_duration_seconds': float,
+                    'min_duration_seconds': float,
+                    'max_duration_seconds': float,
+                }
+            },
+            'bottlenecks': [...],  # top 3 activites les plus lentes
+            'app_distribution': { '<app_name>': int },
+        }
+    """
+    if event_log_df.empty:
+        return {
+            "total_cases": 0,
+            "total_events": 0,
+            "unique_activities": 0,
+            "variants_count": 0,
+            "variants_top5": [],
+            "avg_case_duration_seconds": 0.0,
+            "median_case_duration_seconds": 0.0,
+            "avg_events_per_case": 0.0,
+            "activity_stats": {},
+            "bottlenecks": [],
+            "app_distribution": {},
+        }
+
+    df = event_log_df.copy()
+
+    # ---- Metriques globales ----
+    total_cases = df["case:concept:name"].nunique()
+    total_events = len(df)
+    unique_activities = df["concept:name"].nunique()
+
+    # ---- Variantes (PM4Py) ----
+    if PM4PY_AVAILABLE:
+        variants = pm4py.get_variants(df)
+        variants_count = len(variants)
+        # Top 5 variantes par frequence
+        sorted_variants = sorted(variants.items(), key=lambda x: x[1], reverse=True)
+        variants_top5 = [
+            {"variant": " -> ".join(v), "count": c}
+            for v, c in sorted_variants[:5]
+        ]
+    else:
+        variants_count = 0
+        variants_top5 = []
+
+    # ---- Duree par case ----
+    case_durations: List[float] = []
+    for _case_id, group in df.groupby("case:concept:name"):
+        ts = group["time:timestamp"]
+        if len(ts) >= 2:
+            duration = (ts.max() - ts.min()).total_seconds()
+            case_durations.append(duration)
+
+    avg_case_dur = float(pd.Series(case_durations).mean()) if case_durations else 0.0
+    median_case_dur = float(pd.Series(case_durations).median()) if case_durations else 0.0
+    avg_events_per_case = total_events / total_cases if total_cases > 0 else 0.0
+
+    # ---- Stats par activite ----
+    activity_stats: Dict[str, Dict[str, Any]] = {}
+    # Calculer la duree entre chaque evenement et le suivant dans le meme case
+    df_sorted = df.sort_values(["case:concept:name", "time:timestamp"])
+    df_sorted["next_timestamp"] = df_sorted.groupby("case:concept:name")[
+        "time:timestamp"
+    ].shift(-1)
+    df_sorted["duration_to_next"] = (
+        df_sorted["next_timestamp"] - df_sorted["time:timestamp"]
+    ).dt.total_seconds()
+
+    for activity, grp in df_sorted.groupby("concept:name"):
+        durations = grp["duration_to_next"].dropna()
+        # Filtrer les durees aberrantes (> 5 min = probablement une pause)
+        durations = durations[durations <= 300]
+        stats: Dict[str, Any] = {
+            "count": len(grp),
+            "avg_duration_seconds": round(float(durations.mean()), 2) if len(durations) > 0 else 0.0,
+            "min_duration_seconds": round(float(durations.min()), 2) if len(durations) > 0 else 0.0,
+            "max_duration_seconds": round(float(durations.max()), 2) if len(durations) > 0 else 0.0,
+        }
+        activity_stats[activity] = stats
+
+    # ---- Goulots d'etranglement (top 3 activites les plus lentes) ----
+    bottlenecks = sorted(
+        [
+            {"activity": act, "avg_duration_seconds": s["avg_duration_seconds"]}
+            for act, s in activity_stats.items()
+            if s["avg_duration_seconds"] > 0
+        ],
+        key=lambda x: x["avg_duration_seconds"],
+        reverse=True,
+    )[:3]
+
+    # ---- Distribution par application ----
+    app_distribution: Dict[str, int] = {}
+    if "app_name" in df.columns:
+        app_distribution = df["app_name"].value_counts().to_dict()
+
+    return {
+        "total_cases": total_cases,
+        "total_events": total_events,
+        "unique_activities": unique_activities,
+        "variants_count": variants_count,
+        "variants_top5": variants_top5,
+        "avg_case_duration_seconds": round(avg_case_dur, 2),
+        "median_case_duration_seconds": round(median_case_dur, 2),
+        "avg_events_per_case": round(avg_events_per_case, 1),
+        "activity_stats": activity_stats,
+        "bottlenecks": bottlenecks,
+        "app_distribution": app_distribution,
+    }
+
+
+# ===========================================================================
+# Helpers : chargement sessions JSONL
+# ===========================================================================
+
+
+def load_jsonl_session(jsonl_path: str) -> List[dict]:
+    """
+    Charge un fichier live_events.jsonl en liste de dicts.
+
+    Ignore les lignes vides ou invalides.
+    """
+    events: List[dict] = []
+    path = Path(jsonl_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Fichier JSONL introuvable : {jsonl_path}")
+
+    with open(path, "r", encoding="utf-8") as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                events.append(json.loads(line))
+            except json.JSONDecodeError as e:
+                logger.warning("Ligne %d invalide dans %s : %s", line_num, jsonl_path, e)
+
+    logger.info("Charge %d evenements depuis %s", len(events), jsonl_path)
+    return events
+
+
+def load_multiple_sessions(session_dirs: List[str]) -> List[dict]:
+    """
+    Charge plusieurs sessions depuis leurs repertoires.
+
+    Cherche un fichier live_events.jsonl dans chaque repertoire.
+    """
+    all_events: List[dict] = []
+    for session_dir in session_dirs:
+        jsonl_path = Path(session_dir) / "live_events.jsonl"
+        if jsonl_path.exists():
+            all_events.extend(load_jsonl_session(str(jsonl_path)))
+        else:
+            logger.warning("Pas de live_events.jsonl dans %s", session_dir)
+    return all_events
--- a/core/analytics/screen_change_detector.py
+++ b/core/analytics/screen_change_detector.py
@@ -0,0 +1,60 @@
+"""
+Détection rapide de changement d'écran via perceptual hash (pHash).
+
+Utilise imagehash pour calculer un hash perceptuel par screenshot.
+La distance de Hamming entre deux hashes indique le degré de changement :
+- < 5  : même écran (bruit, curseur déplacé)
+- 5-15 : changement mineur (scroll, popup, champ rempli)
+- > 15 : nouvel écran (nouvelle fenêtre, navigation)
+
+Performance : ~15ms par hash sur CPU pour des screenshots 2560x1600.
+"""
+
+from PIL import Image
+import imagehash
+from typing import Tuple, Optional
+from enum import Enum
+
+
+class ScreenChangeLevel(Enum):
+    SAME = "same"           # distance < 5
+    MINOR = "minor"         # 5 <= distance < 15
+    MAJOR = "major"         # distance >= 15
+
+
+def compute_phash(image: Image.Image, hash_size: int = 8) -> imagehash.ImageHash:
+    """Calcule le pHash d'une image PIL."""
+    return imagehash.phash(image, hash_size=hash_size)
+
+
+def compare_screenshots(img1: Image.Image, img2: Image.Image, hash_size: int = 8) -> Tuple[int, ScreenChangeLevel]:
+    """
+    Compare deux screenshots et retourne la distance + le niveau de changement.
+
+    Returns:
+        (distance, level) — distance de Hamming et niveau de changement
+    """
+    h1 = compute_phash(img1, hash_size)
+    h2 = compute_phash(img2, hash_size)
+    distance = h1 - h2
+
+    if distance < 5:
+        level = ScreenChangeLevel.SAME
+    elif distance < 15:
+        level = ScreenChangeLevel.MINOR
+    else:
+        level = ScreenChangeLevel.MAJOR
+
+    return distance, level
+
+
+def compare_hashes(hash1: imagehash.ImageHash, hash2: imagehash.ImageHash) -> Tuple[int, ScreenChangeLevel]:
+    """Compare deux hashes pré-calculés."""
+    distance = hash1 - hash2
+    if distance < 5:
+        level = ScreenChangeLevel.SAME
+    elif distance < 15:
+        level = ScreenChangeLevel.MINOR
+    else:
+        level = ScreenChangeLevel.MAJOR
+    return distance, level
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,6 +23,9 @@ cycler==0.12.1
 defusedxml==0.7.1
 et_xmlfile==2.0.0
 evdev==1.9.2
+# EDS-NLP : NER médical français pour le blur PII server-side (optionnel).
+# Fallback regex utilisé si absent. Voir core/anonymisation/pii_blur.py.
+# edsnlp>=0.12.0
 faiss-cpu==1.13.2
 fastapi==0.128.0
 filelock==3.20.3
--- a/tests/unit/test_process_mining_bridge.py
+++ b/tests/unit/test_process_mining_bridge.py
@@ -0,0 +1,693 @@
+"""
+Tests du bridge Process Mining (PM4Py) pour rpa_vision_v3.
+
+Couvre :
+- Conversion sessions JSONL -> event log PM4Py
+- Conversion workflow core -> event log PM4Py
+- Decouverte BPMN (Inductive Miner)
+- Calcul de KPIs
+- Test avec donnees reelles (marque @slow)
+"""
+
+import json
+import os
+import shutil
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from core.analytics.process_mining_bridge import (
+    PM4PY_AVAILABLE,
+    _build_activity_label,
+    _extract_timestamp,
+    compute_kpis,
+    discover_bpmn,
+    load_jsonl_session,
+    sessions_to_event_log,
+    workflow_to_event_log,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+SAMPLE_EVENTS = [
+    {
+        "session_id": "sess_test_001",
+        "timestamp": 1776062946.0,
+        "event": {
+            "type": "window_focus_change",
+            "from": None,
+            "to": {"title": "Bureau", "app_name": "explorer.exe"},
+            "timestamp": 1776062946.0,
+            "window": {"title": "Bureau", "app_name": "explorer.exe"},
+        },
+    },
+    {
+        "session_id": "sess_test_001",
+        "timestamp": 1776062948.0,
+        "event": {
+            "type": "mouse_click",
+            "button": "left",
+            "pos": [500, 300],
+            "timestamp": 1776062948.0,
+            "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+        },
+    },
+    {
+        "session_id": "sess_test_001",
+        "timestamp": 1776062950.0,
+        "event": {
+            "type": "text_input",
+            "text": "Bonjour Dom",
+            "timestamp": 1776062950.0,
+            "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+        },
+    },
+    {
+        "session_id": "sess_test_001",
+        "timestamp": 1776062952.0,
+        "event": {
+            "type": "key_combo",
+            "keys": ["ctrl", "s"],
+            "timestamp": 1776062952.0,
+            "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+        },
+    },
+    # Deuxieme session (meme pattern)
+    {
+        "session_id": "sess_test_002",
+        "timestamp": 1776063000.0,
+        "event": {
+            "type": "window_focus_change",
+            "from": None,
+            "to": {"title": "Bureau", "app_name": "explorer.exe"},
+            "timestamp": 1776063000.0,
+            "window": {"title": "Bureau", "app_name": "explorer.exe"},
+        },
+    },
+    {
+        "session_id": "sess_test_002",
+        "timestamp": 1776063002.0,
+        "event": {
+            "type": "mouse_click",
+            "button": "left",
+            "pos": [500, 300],
+            "timestamp": 1776063002.0,
+            "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+        },
+    },
+    {
+        "session_id": "sess_test_002",
+        "timestamp": 1776063005.0,
+        "event": {
+            "type": "text_input",
+            "text": "Bonjour Claude",
+            "timestamp": 1776063005.0,
+            "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+        },
+    },
+    {
+        "session_id": "sess_test_002",
+        "timestamp": 1776063007.0,
+        "event": {
+            "type": "key_combo",
+            "keys": ["ctrl", "s"],
+            "timestamp": 1776063007.0,
+            "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+        },
+    },
+    # Evenements de bruit (doivent etre filtres)
+    {
+        "session_id": "sess_test_001",
+        "timestamp": 1776062947.0,
+        "event": {
+            "type": "heartbeat",
+            "image": "shots/heartbeat.png",
+            "timestamp": 1776062947.0,
+        },
+    },
+    {
+        "session_id": "sess_test_001",
+        "timestamp": 1776062949.0,
+        "event": {
+            "type": "action_result",
+            "base_shot_id": "shot_0001",
+            "image": "",
+        },
+    },
+]
+
+
+SAMPLE_WORKFLOW = {
+    "workflow_id": "wf_test_001",
+    "name": "Ouvrir Bloc-notes et saisir texte",
+    "created_at": "2026-04-13T08:49:06+00:00",
+    "entry_nodes": ["n1"],
+    "end_nodes": ["n4"],
+    "nodes": [
+        {"node_id": "n1", "name": "Bureau Windows", "description": "Bureau"},
+        {"node_id": "n2", "name": "Recherche Windows", "description": "Barre de recherche"},
+        {"node_id": "n3", "name": "Bloc-notes ouvert", "description": "Fenetre Notepad"},
+        {"node_id": "n4", "name": "Texte saisi", "description": "Texte ecrit dans Notepad"},
+    ],
+    "edges": [
+        {
+            "edge_id": "e1",
+            "from_node": "n1",
+            "to_node": "n2",
+            "action": {"type": "mouse_click"},
+            "stats": {"execution_count": 5, "avg_duration": 1.5},
+        },
+        {
+            "edge_id": "e2",
+            "from_node": "n2",
+            "to_node": "n3",
+            "action": {"type": "text_input"},
+            "stats": {"execution_count": 5, "avg_duration": 3.0},
+        },
+        {
+            "edge_id": "e3",
+            "from_node": "n3",
+            "to_node": "n4",
+            "action": {"type": "text_input"},
+            "stats": {"execution_count": 5, "avg_duration": 5.0},
+        },
+    ],
+}
+
+
+@pytest.fixture
+def sample_events():
+    return SAMPLE_EVENTS
+
+
+@pytest.fixture
+def sample_workflow():
+    return SAMPLE_WORKFLOW
+
+
+@pytest.fixture
+def output_dir():
+    """Repertoire temporaire pour les sorties."""
+    d = tempfile.mkdtemp(prefix="pm_test_")
+    yield d
+    shutil.rmtree(d, ignore_errors=True)
+
+
+@pytest.fixture
+def sample_jsonl_file(tmp_path):
+    """Cree un fichier JSONL temporaire avec les events de test."""
+    jsonl_file = tmp_path / "live_events.jsonl"
+    with open(jsonl_file, "w", encoding="utf-8") as f:
+        for event in SAMPLE_EVENTS:
+            f.write(json.dumps(event, ensure_ascii=False) + "\n")
+    return str(jsonl_file)
+
+
+# ===========================================================================
+# Tests unitaires : fonctions internes
+# ===========================================================================
+
+
+class TestBuildActivityLabel:
+    """Tests de la construction des labels d'activite."""
+
+    def test_mouse_click(self):
+        event = {
+            "event": {
+                "type": "mouse_click",
+                "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+            }
+        }
+        label = _build_activity_label(event)
+        assert label is not None
+        assert "Clic" in label
+        assert "Notepad.exe" in label
+        assert "Bloc-notes" in label
+
+    def test_text_input(self):
+        event = {
+            "event": {
+                "type": "text_input",
+                "text": "Bonjour",
+                "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+            }
+        }
+        label = _build_activity_label(event)
+        assert label is not None
+        assert "Saisie" in label
+        assert "Bonjour" in label
+
+    def test_text_input_truncation(self):
+        event = {
+            "event": {
+                "type": "text_input",
+                "text": "A" * 50,
+                "window": {"title": "X", "app_name": "X.exe"},
+            }
+        }
+        label = _build_activity_label(event)
+        assert "..." in label
+
+    def test_key_combo(self):
+        event = {
+            "event": {
+                "type": "key_combo",
+                "keys": ["ctrl", "s"],
+                "window": {"title": "Bloc-notes", "app_name": "Notepad.exe"},
+            }
+        }
+        label = _build_activity_label(event)
+        assert "Raccourci" in label
+        assert "ctrl+s" in label
+
+    def test_window_focus_change(self):
+        event = {
+            "event": {
+                "type": "window_focus_change",
+                "to": {"title": "Chrome", "app_name": "chrome.exe"},
+                "window": {"title": "Chrome", "app_name": "chrome.exe"},
+            }
+        }
+        label = _build_activity_label(event)
+        assert "Fenetre" in label
+        assert "Chrome" in label
+
+    def test_heartbeat_filtered(self):
+        event = {
+            "event": {
+                "type": "heartbeat",
+                "image": "something.png",
+            }
+        }
+        assert _build_activity_label(event) is None
+
+    def test_action_result_filtered(self):
+        event = {
+            "event": {
+                "type": "action_result",
+                "base_shot_id": "shot_0001",
+            }
+        }
+        assert _build_activity_label(event) is None
+
+
+class TestExtractTimestamp:
+    """Tests de l'extraction de timestamp."""
+
+    def test_from_event_timestamp(self):
+        event = {"event": {"timestamp": 1776062946.0}}
+        assert _extract_timestamp(event) == 1776062946.0
+
+    def test_from_root_timestamp(self):
+        event = {"timestamp": 1776062946.0}
+        assert _extract_timestamp(event) == 1776062946.0
+
+    def test_from_t_field(self):
+        event = {"t": 1712345678.123}
+        assert _extract_timestamp(event) == pytest.approx(1712345678.123)
+
+    def test_missing_timestamp(self):
+        event = {"event": {"type": "unknown"}}
+        assert _extract_timestamp(event) is None
+
+
+# ===========================================================================
+# Tests : conversion sessions -> event log
+# ===========================================================================
+
+
+class TestSessionsToEventLog:
+    """Tests de la conversion sessions JSONL -> event log PM4Py."""
+
+    def test_basic_conversion(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        assert not df.empty
+        assert "case:concept:name" in df.columns
+        assert "concept:name" in df.columns
+        assert "time:timestamp" in df.columns
+
+    def test_correct_case_ids(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        case_ids = df["case:concept:name"].unique()
+        assert "sess_test_001" in case_ids
+        assert "sess_test_002" in case_ids
+
+    def test_noise_filtered(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        # Les heartbeat et action_result ne doivent pas apparaitre
+        event_types = df["event_type"].unique()
+        assert "heartbeat" not in event_types
+        assert "action_result" not in event_types
+
+    def test_timestamps_ordered(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        for _case_id, group in df.groupby("case:concept:name"):
+            timestamps = group["time:timestamp"].values
+            for i in range(len(timestamps) - 1):
+                assert timestamps[i] <= timestamps[i + 1]
+
+    def test_window_deduplication(self):
+        """Les window_focus_change consecutifs identiques sont dedupliques."""
+        events = [
+            {
+                "session_id": "s1",
+                "timestamp": 1.0,
+                "event": {
+                    "type": "window_focus_change",
+                    "to": {"title": "A", "app_name": "a.exe"},
+                    "timestamp": 1.0,
+                    "window": {"title": "A", "app_name": "a.exe"},
+                },
+            },
+            {
+                "session_id": "s1",
+                "timestamp": 2.0,
+                "event": {
+                    "type": "window_focus_change",
+                    "to": {"title": "A", "app_name": "a.exe"},
+                    "timestamp": 2.0,
+                    "window": {"title": "A", "app_name": "a.exe"},
+                },
+            },
+            {
+                "session_id": "s1",
+                "timestamp": 3.0,
+                "event": {
+                    "type": "window_focus_change",
+                    "to": {"title": "B", "app_name": "b.exe"},
+                    "timestamp": 3.0,
+                    "window": {"title": "B", "app_name": "b.exe"},
+                },
+            },
+        ]
+        df = sessions_to_event_log(events, deduplicate_windows=True)
+        # Seulement 2 lignes : A puis B (le 2eme A est un doublon)
+        assert len(df) == 2
+
+    def test_empty_input(self):
+        df = sessions_to_event_log([])
+        assert df.empty
+        assert "case:concept:name" in df.columns
+
+    def test_events_count(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        # 2 sessions x 4 events pertinents = 8 lignes
+        assert len(df) == 8
+
+
+# ===========================================================================
+# Tests : conversion workflow -> event log
+# ===========================================================================
+
+
+class TestWorkflowToEventLog:
+    """Tests de la conversion workflow core -> event log PM4Py."""
+
+    def test_basic_conversion(self, sample_workflow):
+        df = workflow_to_event_log(sample_workflow)
+        assert not df.empty
+        assert "case:concept:name" in df.columns
+        assert "concept:name" in df.columns
+
+    def test_path_traversal(self, sample_workflow):
+        df = workflow_to_event_log(sample_workflow)
+        # Le workflow n1->n2->n3->n4 est lineaire, 1 seul chemin
+        assert df["case:concept:name"].nunique() == 1
+        # 4 nodes dans le chemin
+        assert len(df) == 4
+
+    def test_node_names(self, sample_workflow):
+        df = workflow_to_event_log(sample_workflow)
+        activities = df["concept:name"].tolist()
+        assert "Bureau Windows" in activities
+        assert "Recherche Windows" in activities
+        assert "Bloc-notes ouvert" in activities
+        assert "Texte saisi" in activities
+
+    def test_empty_workflow(self):
+        df = workflow_to_event_log({"workflow_id": "empty", "nodes": [], "edges": []})
+        assert df.empty
+
+    def test_branching_workflow(self):
+        """Un workflow avec branches produit plusieurs chemins."""
+        wf = {
+            "workflow_id": "wf_branch",
+            "created_at": "2026-01-01T00:00:00+00:00",
+            "entry_nodes": ["n1"],
+            "end_nodes": ["n3", "n4"],
+            "nodes": [
+                {"node_id": "n1", "name": "Start"},
+                {"node_id": "n2", "name": "Step A"},
+                {"node_id": "n3", "name": "End A"},
+                {"node_id": "n4", "name": "End B"},
+            ],
+            "edges": [
+                {"edge_id": "e1", "from_node": "n1", "to_node": "n2"},
+                {"edge_id": "e2", "from_node": "n1", "to_node": "n4"},
+                {"edge_id": "e3", "from_node": "n2", "to_node": "n3"},
+            ],
+        }
+        df = workflow_to_event_log(wf)
+        # 2 chemins : n1->n2->n3 et n1->n4
+        assert df["case:concept:name"].nunique() == 2
+
+
+# ===========================================================================
+# Tests : decouverte BPMN
+# ===========================================================================
+
+
+@pytest.mark.skipif(not PM4PY_AVAILABLE, reason="pm4py non installe")
+class TestDiscoverBpmn:
+    """Tests de la decouverte BPMN."""
+
+    def test_produces_files(self, sample_events, output_dir):
+        df = sessions_to_event_log(sample_events)
+        result = discover_bpmn(df, output_dir=output_dir, name="test")
+
+        # Verifier que le BPMN XML existe
+        assert result["bpmn_xml_path"] is not None
+        assert Path(result["bpmn_xml_path"]).exists()
+        assert Path(result["bpmn_xml_path"]).suffix == ".bpmn"
+
+        # Verifier le contenu XML
+        xml_content = Path(result["bpmn_xml_path"]).read_text()
+        assert "bpmn" in xml_content.lower() or "definitions" in xml_content.lower()
+
+    def test_produces_png(self, sample_events, output_dir):
+        df = sessions_to_event_log(sample_events)
+        result = discover_bpmn(df, output_dir=output_dir, name="test")
+
+        if result["bpmn_image_path"]:
+            assert Path(result["bpmn_image_path"]).exists()
+            # Verifier que c'est un PNG (magic bytes)
+            with open(result["bpmn_image_path"], "rb") as f:
+                header = f.read(4)
+            assert header[:4] == b"\x89PNG"
+
+    def test_stats_populated(self, sample_events, output_dir):
+        df = sessions_to_event_log(sample_events)
+        result = discover_bpmn(df, output_dir=output_dir, name="test")
+
+        stats = result["stats"]
+        assert stats["activities"] > 0
+        assert stats["cases"] == 2
+        assert stats["variants"] >= 1
+
+    def test_empty_raises(self, output_dir):
+        df = pd.DataFrame(columns=["case:concept:name", "concept:name", "time:timestamp"])
+        with pytest.raises(ValueError, match="vide"):
+            discover_bpmn(df, output_dir=output_dir)
+
+    def test_dfg_image_produced(self, sample_events, output_dir):
+        df = sessions_to_event_log(sample_events)
+        result = discover_bpmn(df, output_dir=output_dir, name="test")
+        if result["dfg_image_path"]:
+            assert Path(result["dfg_image_path"]).exists()
+
+
+# ===========================================================================
+# Tests : KPIs
+# ===========================================================================
+
+
+class TestComputeKpis:
+    """Tests du calcul de KPIs."""
+
+    def test_returns_expected_keys(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+
+        expected_keys = {
+            "total_cases",
+            "total_events",
+            "unique_activities",
+            "variants_count",
+            "variants_top5",
+            "avg_case_duration_seconds",
+            "median_case_duration_seconds",
+            "avg_events_per_case",
+            "activity_stats",
+            "bottlenecks",
+            "app_distribution",
+        }
+        assert expected_keys.issubset(set(kpis.keys()))
+
+    def test_case_count(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        assert kpis["total_cases"] == 2
+
+    def test_events_count(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        assert kpis["total_events"] == 8
+
+    def test_activity_stats_populated(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        assert len(kpis["activity_stats"]) > 0
+        # Chaque activite doit avoir les cles attendues
+        for activity, stats in kpis["activity_stats"].items():
+            assert "count" in stats
+            assert "avg_duration_seconds" in stats
+            assert "min_duration_seconds" in stats
+            assert "max_duration_seconds" in stats
+
+    def test_bottlenecks_sorted(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        bottlenecks = kpis["bottlenecks"]
+        # Verifier l'ordre decroissant
+        for i in range(len(bottlenecks) - 1):
+            assert (
+                bottlenecks[i]["avg_duration_seconds"]
+                >= bottlenecks[i + 1]["avg_duration_seconds"]
+            )
+
+    def test_app_distribution(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        assert "app_distribution" in kpis
+        assert "Notepad.exe" in kpis["app_distribution"]
+
+    def test_empty_kpis(self):
+        df = pd.DataFrame(columns=["case:concept:name", "concept:name", "time:timestamp"])
+        kpis = compute_kpis(df)
+        assert kpis["total_cases"] == 0
+        assert kpis["total_events"] == 0
+
+    def test_duration_positive(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        assert kpis["avg_case_duration_seconds"] > 0
+
+    @pytest.mark.skipif(not PM4PY_AVAILABLE, reason="pm4py non installe")
+    def test_variants_detected(self, sample_events):
+        df = sessions_to_event_log(sample_events)
+        kpis = compute_kpis(df)
+        assert kpis["variants_count"] >= 1
+        assert len(kpis["variants_top5"]) >= 1
+
+
+# ===========================================================================
+# Tests : chargement JSONL
+# ===========================================================================
+
+
+class TestLoadJsonlSession:
+    """Tests du chargement de fichiers JSONL."""
+
+    def test_load_basic(self, sample_jsonl_file):
+        events = load_jsonl_session(sample_jsonl_file)
+        assert len(events) == len(SAMPLE_EVENTS)
+
+    def test_load_nonexistent(self):
+        with pytest.raises(FileNotFoundError):
+            load_jsonl_session("/tmp/nonexistent_file.jsonl")
+
+    def test_load_with_blank_lines(self, tmp_path):
+        jsonl_file = tmp_path / "with_blanks.jsonl"
+        with open(jsonl_file, "w") as f:
+            f.write('{"session_id": "s1", "timestamp": 1.0, "event": {"type": "mouse_click", "timestamp": 1.0, "window": {"title": "X", "app_name": "x.exe"}}}\n')
+            f.write("\n")
+            f.write('{"session_id": "s1", "timestamp": 2.0, "event": {"type": "mouse_click", "timestamp": 2.0, "window": {"title": "X", "app_name": "x.exe"}}}\n')
+        events = load_jsonl_session(str(jsonl_file))
+        assert len(events) == 2
+
+    def test_load_with_invalid_line(self, tmp_path):
+        jsonl_file = tmp_path / "with_invalid.jsonl"
+        with open(jsonl_file, "w") as f:
+            f.write('{"valid": true}\n')
+            f.write("this is not json\n")
+            f.write('{"also_valid": true}\n')
+        events = load_jsonl_session(str(jsonl_file))
+        assert len(events) == 2
+
+
+# ===========================================================================
+# Test avec donnees reelles
+# ===========================================================================
+
+# Chercher une session reelle disponible
+_REAL_SESSION_DIRS = [
+    "/home/dom/ai/rpa_vision_v3/data/training/live_sessions/DESKTOP-ST3VBSD_windows/sess_20260413T084906_748092",
+    "/home/dom/ai/rpa_vision_v3/data/training/live_sessions/sess_20260314T102557_dada53",
+]
+_REAL_SESSION = None
+for d in _REAL_SESSION_DIRS:
+    jsonl = Path(d) / "live_events.jsonl"
+    if jsonl.exists():
+        _REAL_SESSION = str(jsonl)
+        break
+
+
+@pytest.mark.slow
+@pytest.mark.skipif(_REAL_SESSION is None, reason="Pas de session reelle disponible")
+@pytest.mark.skipif(not PM4PY_AVAILABLE, reason="pm4py non installe")
+class TestWithRealSessionData:
+    """Test complet avec une session reelle."""
+
+    def test_full_pipeline(self):
+        """Charge -> Convertit -> BPMN -> KPIs sur donnees reelles."""
+        # 1. Charger
+        events = load_jsonl_session(_REAL_SESSION)
+        assert len(events) > 0, f"Session vide : {_REAL_SESSION}"
+
+        # 2. Convertir en event log
+        df = sessions_to_event_log(events)
+        assert not df.empty
+        assert df["case:concept:name"].nunique() >= 1
+
+        # 3. Decouvrir BPMN
+        with tempfile.TemporaryDirectory(prefix="pm_real_") as tmpdir:
+            result = discover_bpmn(df, output_dir=tmpdir, name="real_session")
+            assert Path(result["bpmn_xml_path"]).exists()
+            xml_content = Path(result["bpmn_xml_path"]).read_text()
+            assert len(xml_content) > 100
+
+            # Verifier image si generee
+            if result["bpmn_image_path"]:
+                assert Path(result["bpmn_image_path"]).exists()
+
+        # 4. Calculer KPIs
+        kpis = compute_kpis(df)
+        assert kpis["total_events"] > 0
+        assert kpis["unique_activities"] > 0
+
+        # 5. Afficher un resume (visible dans le stdout pytest -s)
+        print("\n=== Process Mining - Session reelle ===")
+        print(f"Fichier : {_REAL_SESSION}")
+        print(f"Events bruts : {len(events)}")
+        print(f"Events pertinents : {kpis['total_events']}")
+        print(f"Activites uniques : {kpis['unique_activities']}")
+        print(f"Variantes : {kpis['variants_count']}")
+        print(f"Duree moyenne : {kpis['avg_case_duration_seconds']:.1f}s")
+        print(f"Top variantes : {kpis['variants_top5'][:3]}")
+        print(f"Goulots : {kpis['bottlenecks']}")
+        print(f"Apps : {kpis['app_distribution']}")
--- a/tests/unit/test_screen_change_detector.py
+++ b/tests/unit/test_screen_change_detector.py
@@ -0,0 +1,222 @@
+"""Tests pour le module screen_change_detector (pHash).
+
+Charge des screenshots réels de sessions live et vérifie que :
+- le calcul de pHash est rapide (<5ms par image)
+- les seuils SAME/MINOR/MAJOR sont cohérents
+- les heartbeats consécutifs sont classés SAME (même écran, ~5s d'intervalle)
+- les shots d'actions différentes ont une distance plus élevée
+"""
+
+import os
+import time
+import glob
+import pytest
+from PIL import Image
+
+from core.analytics.screen_change_detector import (
+    compute_phash,
+    compare_screenshots,
+    compare_hashes,
+    ScreenChangeLevel,
+)
+
+# Dossier de la session la plus riche en screenshots
+SESSION_DIR = os.path.join(
+    os.path.dirname(__file__),
+    "..", "..",
+    "data", "training", "live_sessions",
+    "sess_20260314T173236_c7de11", "shots",
+)
+SESSION_DIR = os.path.normpath(SESSION_DIR)
+
+
+def _load_heartbeats(max_count: int = 10):
+    """Charge les heartbeat screenshots (captures régulières toutes les ~5s)."""
+    pattern = os.path.join(SESSION_DIR, "heartbeat_*.png")
+    files = sorted(glob.glob(pattern))[:max_count]
+    images = []
+    for f in files:
+        img = Image.open(f)
+        images.append((os.path.basename(f), img))
+    return images
+
+
+def _load_action_shots(max_count: int = 10):
+    """Charge les shots d'actions (captures déclenchées par des événements utilisateur)."""
+    pattern = os.path.join(SESSION_DIR, "shot_*_full.png")
+    files = sorted(glob.glob(pattern))[:max_count]
+    images = []
+    for f in files:
+        img = Image.open(f)
+        images.append((os.path.basename(f), img))
+    return images
+
+
+@pytest.fixture(scope="module")
+def heartbeats():
+    imgs = _load_heartbeats(10)
+    if len(imgs) < 2:
+        pytest.skip("Pas assez de heartbeats dans la session de test")
+    return imgs
+
+
+@pytest.fixture(scope="module")
+def action_shots():
+    imgs = _load_action_shots(10)
+    if len(imgs) < 2:
+        pytest.skip("Pas assez de shots d'action dans la session de test")
+    return imgs
+
+
+class TestPHashPerformance:
+    """Vérifie que le calcul de pHash est rapide (<5ms par image)."""
+
+    def test_phash_speed(self, heartbeats):
+        """Le pHash doit être calculé en moins de 50ms par image (screenshots 2560x1600)."""
+        times = []
+        for name, img in heartbeats:
+            t0 = time.perf_counter()
+            h = compute_phash(img)
+            elapsed_ms = (time.perf_counter() - t0) * 1000
+            times.append(elapsed_ms)
+            print(f"  pHash({name}): {elapsed_ms:.2f}ms -> {h}")
+
+        # Exclure le premier appel (chargement initial plus lent)
+        warm_times = times[1:] if len(times) > 1 else times
+        avg_ms = sum(warm_times) / len(warm_times)
+        max_ms = max(warm_times)
+        print(f"\n  Moyenne (hors warmup): {avg_ms:.2f}ms | Max: {max_ms:.2f}ms | N={len(warm_times)}")
+        # ~15ms par hash pour des screenshots 2560x1600, seuil large pour CI
+        assert avg_ms < 50.0, f"pHash trop lent: {avg_ms:.2f}ms en moyenne (attendu <50ms)"
+
+    def test_comparison_speed(self, heartbeats):
+        """La comparaison de deux screenshots doit prendre moins de 100ms."""
+        if len(heartbeats) < 2:
+            pytest.skip("Pas assez d'images")
+
+        # Warmup
+        _ = compute_phash(heartbeats[0][1])
+
+        t0 = time.perf_counter()
+        distance, level = compare_screenshots(heartbeats[0][1], heartbeats[1][1])
+        elapsed_ms = (time.perf_counter() - t0) * 1000
+        print(f"  compare_screenshots: {elapsed_ms:.2f}ms (distance={distance}, level={level.value})")
+        assert elapsed_ms < 100.0, f"Comparaison trop lente: {elapsed_ms:.2f}ms"
+
+
+class TestHeartbeatConsistency:
+    """Les heartbeats consécutifs (~5s) doivent être classés SAME ou MINOR."""
+
+    def test_consecutive_heartbeats_are_similar(self, heartbeats):
+        """Les heartbeats consécutifs ne doivent pas être classés MAJOR."""
+        # Pré-calcul des hashes
+        hashes = []
+        for name, img in heartbeats:
+            hashes.append((name, compute_phash(img)))
+
+        print("\n  Comparaisons consécutives des heartbeats:")
+        for i in range(len(hashes) - 1):
+            name1, h1 = hashes[i]
+            name2, h2 = hashes[i + 1]
+            distance, level = compare_hashes(h1, h2)
+            print(f"  {name1} <-> {name2}: distance={distance}, level={level.value}")
+            # Les heartbeats sont pris toutes les 5s environ sur le même écran
+            # On s'attend a SAME ou MINOR (curseur, horloge, etc.)
+            # Note : certains heartbeats peuvent capturer un changement d'écran
+            # donc on ne peut pas garantir SAME pour tous, mais la majorité doit l'être
+
+
+class TestActionShotsDifferences:
+    """Les shots d'actions différentes doivent montrer des changements."""
+
+    def test_action_shots_show_variation(self, action_shots):
+        """Au moins certaines paires de shots d'action doivent montrer des changements."""
+        hashes = []
+        for name, img in action_shots:
+            hashes.append((name, compute_phash(img)))
+
+        print("\n  Comparaisons des shots d'action:")
+        distances = []
+        for i in range(len(hashes) - 1):
+            name1, h1 = hashes[i]
+            name2, h2 = hashes[i + 1]
+            distance, level = compare_hashes(h1, h2)
+            distances.append(distance)
+            print(f"  {name1} <-> {name2}: distance={distance}, level={level.value}")
+
+        # On s'attend à ce que au moins certaines paires aient une distance > 0
+        max_distance = max(distances) if distances else 0
+        print(f"\n  Distance max entre shots: {max_distance}")
+        assert max_distance > 0, "Tous les shots d'action sont identiques, ce n'est pas normal"
+
+
+class TestThresholdCoherence:
+    """Vérifie que les seuils SAME/MINOR/MAJOR sont cohérents."""
+
+    def test_same_image_is_same(self, heartbeats):
+        """La même image comparée à elle-même doit donner distance=0, SAME."""
+        img = heartbeats[0][1]
+        distance, level = compare_screenshots(img, img)
+        assert distance == 0
+        assert level == ScreenChangeLevel.SAME
+
+    def test_heartbeat_vs_action_shot(self, heartbeats, action_shots):
+        """Un heartbeat vs un shot d'action lointain doit être MINOR ou MAJOR."""
+        # Prend le premier heartbeat et le dernier shot d'action
+        _, img1 = heartbeats[0]
+        _, img2 = action_shots[-1]
+        distance, level = compare_screenshots(img1, img2)
+        print(f"  heartbeat[0] vs action_shot[-1]: distance={distance}, level={level.value}")
+        # On vérifie juste que ça fonctionne sans erreur
+        assert distance >= 0
+        assert isinstance(level, ScreenChangeLevel)
+
+    def test_compare_hashes_matches_compare_screenshots(self, heartbeats):
+        """compare_hashes doit donner le même résultat que compare_screenshots."""
+        if len(heartbeats) < 2:
+            pytest.skip("Pas assez d'images")
+
+        img1 = heartbeats[0][1]
+        img2 = heartbeats[1][1]
+
+        d1, l1 = compare_screenshots(img1, img2)
+        h1 = compute_phash(img1)
+        h2 = compute_phash(img2)
+        d2, l2 = compare_hashes(h1, h2)
+
+        assert d1 == d2
+        assert l1 == l2
+
+
+class TestFullSessionSummary:
+    """Résumé complet de la session pour validation humaine."""
+
+    def test_full_session_summary(self, heartbeats, action_shots):
+        """Affiche un résumé complet des distances pour validation humaine."""
+        all_images = heartbeats + action_shots
+        hashes = [(name, compute_phash(img)) for name, img in all_images]
+
+        print("\n  === RÉSUMÉ COMPLET DE LA SESSION ===")
+        print(f"  {len(heartbeats)} heartbeats + {len(action_shots)} shots d'action")
+
+        same_count = 0
+        minor_count = 0
+        major_count = 0
+        total_comparisons = 0
+
+        for i in range(len(hashes) - 1):
+            name1, h1 = hashes[i]
+            name2, h2 = hashes[i + 1]
+            distance, level = compare_hashes(h1, h2)
+            total_comparisons += 1
+            if level == ScreenChangeLevel.SAME:
+                same_count += 1
+            elif level == ScreenChangeLevel.MINOR:
+                minor_count += 1
+            else:
+                major_count += 1
+
+        print(f"  Comparaisons consécutives: {total_comparisons}")
+        print(f"    SAME  (<5):  {same_count} ({100*same_count/max(total_comparisons,1):.0f}%)")
+        print(f"    MINOR (5-15): {minor_count} ({100*minor_count/max(total_comparisons,1):.0f}%)")
+        print(f"    MAJOR (>=15): {major_count} ({100*major_count/max(total_comparisons,1):.0f}%)")
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -534,8 +534,11 @@ def execute_ai_analyze(params: dict) -> dict:

 def execute_extract_text(params: dict) -> dict:
    """
-    Extrait du texte depuis l'écran via Ollama VLM.
-    Capture la zone de l'ancre (ou l'écran entier) et demande au VLM d'extraire le texte.
+    Extrait du texte depuis l'écran.
+
+    Stratégie : docTR (OCR local rapide) par défaut pour les modes simples
+    (full, lines, words, numbers). Fallback sur Ollama VLM pour le mode
+    "vlm"/"ai" ou si docTR échoue.
    """
    import requests
    import re
@@ -549,7 +552,9 @@ def execute_extract_text(params: dict) -> dict:
        extraction_mode = params.get('extraction_mode', 'full')
        text_filters = params.get('text_filters', [])

+        # --- 1. Capture de l'image ---
        screenshot_base64 = anchor.get('screenshot') if anchor else None
+        pil_image = None  # On garde l'image PIL pour docTR

        if not screenshot_base64:
            try:
@@ -562,56 +567,100 @@ def execute_extract_text(params: dict) -> dict:
                    x, y = int(bbox.get('x', 0)), int(bbox.get('y', 0))
                    w, h = int(bbox.get('width', 100)), int(bbox.get('height', 100))
                    print(f"📸 [OCR] Capture zone: ({x}, {y}) -> ({x+w}, {y+h})")
-                    screenshot = ImageGrab.grab(bbox=(x, y, x + w, y + h))
+                    pil_image = ImageGrab.grab(bbox=(x, y, x + w, y + h))
                else:
                    print(f"📸 [OCR] Capture écran complet")
-                    screenshot = ImageGrab.grab()
+                    pil_image = ImageGrab.grab()

                buffer = io.BytesIO()
-                screenshot.save(buffer, format='PNG')
+                pil_image.save(buffer, format='PNG')
                screenshot_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
            except Exception as cap_err:
                return {'success': False, 'error': f"Erreur capture: {cap_err}"}
+        else:
+            # Décoder le base64 en image PIL pour docTR
+            try:
+                import io
+                from PIL import Image as PILImage
+                img_bytes = base64.b64decode(screenshot_base64)
+                pil_image = PILImage.open(io.BytesIO(img_bytes))
+            except Exception:
+                pil_image = None

        if not screenshot_base64:
            return {'success': False, 'error': "Pas d'image à analyser"}

-        prompt_map = {
-            'full': "Extrais TOUT le texte visible dans cette image. Retourne uniquement le texte brut, sans commentaire.",
-            'numbers': "Extrais uniquement les nombres et chiffres visibles dans cette image. Retourne-les séparés par des espaces.",
-            'lines': "Extrais tout le texte visible ligne par ligne. Une ligne par ligne de texte visible.",
-            'words': "Extrais tous les mots visibles dans cette image, séparés par des espaces.",
-        }
-        prompt = prompt_map.get(extraction_mode, prompt_map['full'])
+        # --- 2. Modes docTR (rapide) vs VLM (raisonnement) ---
+        use_vlm = extraction_mode in ('vlm', 'ai')
+        extracted_text = None
+        engine_used = None

-        if 'qwen' in model.lower() and not prompt.startswith('/no_think'):
-            prompt = f"/no_think\n{prompt}"
+        if not use_vlm and pil_image is not None:
+            # Essayer docTR d'abord pour les modes simples
+            try:
+                from visual_workflow_builder.backend.services.ocr_service import (
+                    ocr_extract_text,
+                )
+                print(f"📝 [OCR] Extraction texte via docTR (mode: {extraction_mode})...")
+                raw_text = ocr_extract_text(pil_image)

-        print(f"📝 [OCR] Extraction texte avec {model} (mode: {extraction_mode})...")
+                if raw_text and raw_text.strip():
+                    extracted_text = raw_text.strip()
+                    engine_used = "doctr"
+                    print(f"✅ [OCR] docTR OK ({len(extracted_text)} caractères)")
+                else:
+                    print("⚠️ [OCR] docTR n'a rien extrait, fallback Ollama VLM")
+            except ImportError:
+                print("⚠️ [OCR] docTR non disponible, fallback Ollama VLM")
+            except Exception as doctr_err:
+                print(f"⚠️ [OCR] Erreur docTR: {doctr_err}, fallback Ollama VLM")

-        ollama_url = params.get('ollama_url', 'http://localhost:11434')
-        payload = {
-            "model": model,
-            "prompt": prompt,
-            "images": [screenshot_base64],
-            "stream": False,
-            "options": {"temperature": 0.1, "num_predict": 4000}
-        }
+        # --- 3. Fallback Ollama VLM si docTR n'a rien donné ---
+        if extracted_text is None:
+            prompt_map = {
+                'full': "Extrais TOUT le texte visible dans cette image. Retourne uniquement le texte brut, sans commentaire.",
+                'numbers': "Extrais uniquement les nombres et chiffres visibles dans cette image. Retourne-les séparés par des espaces.",
+                'lines': "Extrais tout le texte visible ligne par ligne. Une ligne par ligne de texte visible.",
+                'words': "Extrais tous les mots visibles dans cette image, séparés par des espaces.",
+                'vlm': "Extrais TOUT le texte visible dans cette image. Retourne uniquement le texte brut, sans commentaire.",
+                'ai': "Extrais TOUT le texte visible dans cette image. Retourne uniquement le texte brut, sans commentaire.",
+            }
+            prompt = prompt_map.get(extraction_mode, prompt_map['full'])

-        response = requests.post(
-            f"{ollama_url}/api/generate",
-            json=payload,
-            timeout=timeout_ms / 1000
-        )
+            if 'qwen' in model.lower() and not prompt.startswith('/no_think'):
+                prompt = f"/no_think\n{prompt}"

-        if response.status_code != 200:
-            return {'success': False, 'error': f"Erreur Ollama: {response.status_code}"}
+            print(f"📝 [OCR] Extraction texte avec {model} (mode: {extraction_mode})...")

-        result = response.json()
-        extracted_text = result.get('response', '').strip()
+            ollama_url = params.get('ollama_url', 'http://localhost:11434')
+            payload = {
+                "model": model,
+                "prompt": prompt,
+                "images": [screenshot_base64],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 4000}
+            }

-        if not extracted_text and result.get('thinking'):
-            extracted_text = result.get('thinking', '').strip()
+            response = requests.post(
+                f"{ollama_url}/api/generate",
+                json=payload,
+                timeout=timeout_ms / 1000
+            )
+
+            if response.status_code != 200:
+                return {'success': False, 'error': f"Erreur Ollama: {response.status_code}"}
+
+            result = response.json()
+            extracted_text = result.get('response', '').strip()
+
+            if not extracted_text and result.get('thinking'):
+                extracted_text = result.get('thinking', '').strip()
+
+            engine_used = f"ollama/{model}"
+
+        # --- 4. Application des filtres ---
+        if extracted_text is None:
+            extracted_text = ""

        for f in text_filters:
            if f == 'digits_only':
@@ -625,7 +674,7 @@ def execute_extract_text(params: dict) -> dict:
            elif f == 'lowercase':
                extracted_text = extracted_text.lower()

-        print(f"✅ [OCR] Texte extrait ({len(extracted_text)} caractères)")
+        print(f"✅ [OCR] Texte extrait ({len(extracted_text)} caractères) via {engine_used}")
        if extracted_text:
            print(f"   Résultat: {extracted_text[:150]}...")

@@ -639,7 +688,7 @@ def execute_extract_text(params: dict) -> dict:
                'character_count': len(extracted_text),
                'word_count': len(extracted_text.split()) if extracted_text else 0,
                'mode': extraction_mode,
-                'model': model
+                'engine': engine_used
            }
        }