# t2a_v2/src/medical/ollama_client.py

"""Client LLM partagé — Ollama (local) avec fallback Anthropic Haiku."""

from __future__ import annotations

import json
import logging
import os
import time

import requests

from ..config import OLLAMA_URL, OLLAMA_MODEL, OLLAMA_TIMEOUT, get_model

logger = logging.getLogger(__name__)

# --- Anthropic fallback ---
# Model name used for the fallback; can be overridden through the
# ANTHROPIC_FALLBACK_MODEL environment variable.
_ANTHROPIC_MODEL = os.environ.get("ANTHROPIC_FALLBACK_MODEL", "claude-haiku-4-5-20251001")
# Module-level cache for the Anthropic client; filled on first successful
# call to _get_anthropic_client().
_anthropic_client = None


def _get_anthropic_client():
    """Return the shared Anthropic client, creating it on first use.

    The client is cached in the module-level ``_anthropic_client`` so the SDK
    is imported and instantiated at most once per successful initialisation.

    Returns:
        The cached client, or None when ANTHROPIC_API_KEY is unset/empty or
        when importing/instantiating the SDK fails (logged as a warning).
    """
    global _anthropic_client
    if _anthropic_client is None:
        key = os.environ.get("ANTHROPIC_API_KEY")
        if not key:
            return None
        try:
            # Imported lazily so the SDK is only required when the fallback
            # is actually used.
            import anthropic
            _anthropic_client = anthropic.Anthropic(api_key=key)
        except Exception as exc:
            logger.warning("Anthropic SDK non disponible : %s", exc)
            return None
    return _anthropic_client


def call_anthropic(
    prompt: str,
    temperature: float = 0.1,
    max_tokens: int = 2500,
) -> dict | None:
    """Send *prompt* to the Anthropic API and parse the answer as JSON.

    Args:
        prompt: Text sent as a single user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on generated tokens forwarded to the API.

    Returns:
        The value produced by ``parse_json_response`` on the first content
        block of the reply, or None when no client is available, the call
        raises, or the reply cannot be parsed.
    """
    anthropic_client = _get_anthropic_client()
    if anthropic_client is None:
        return None

    request_kwargs = {
        "model": _ANTHROPIC_MODEL,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        reply = anthropic_client.messages.create(**request_kwargs)
        parsed = parse_json_response(reply.content[0].text)
        if parsed is None:
            return None
        logger.debug("Anthropic fallback OK (%s)", _ANTHROPIC_MODEL)
        return parsed
    except Exception as exc:
        # Best-effort fallback: any failure is reported and turned into None.
        logger.warning("Anthropic fallback erreur : %s", exc)
        return None


def _repair_truncated_json(text: str) -> dict | None:
    """Tente de réparer un JSON tronqué (réponse LLM coupée par max_tokens).

    Stratégie : fermer les chaînes, tableaux et objets ouverts puis réessayer.
    """
    # Étape 1 : détecter si on est dans une chaîne non fermée
    in_string = False
    escaped = False
    for ch in text:
        if escaped:
            escaped = False
            continue
        if ch == "\\":
            escaped = True
            continue
        if ch == '"':
            in_string = not in_string
    if in_string:
        text += '"'

    # Étape 2 : compter les ouvreurs/fermeurs non appariés
    in_str = False
    esc = False
    stack: list[str] = []
    for ch in text:
        if esc:
            esc = False
            continue
        if ch == "\\":
            esc = True
            continue
        if ch == '"':
            in_str = not in_str
            continue
        if in_str:
            continue
        if ch in ("{", "["):
            stack.append(ch)
        elif ch == "}" and stack and stack[-1] == "{":
            stack.pop()
        elif ch == "]" and stack and stack[-1] == "[":
            stack.pop()

    # Fermer en ordre inverse
    for opener in reversed(stack):
        text += "}" if opener == "{" else "]"

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return None


def parse_json_response(raw: str) -> dict | None:
    """Parse an LLM answer as JSON, tolerating markdown fences and truncation.

    Handling, in order:

    * surrounding whitespace is stripped;
    * if the text starts with a markdown code fence, the fence (and its
      language tag) is removed and everything from the closing fence onwards
      is discarded — this also covers single-line answers such as
      ```` ```json {"a": 1}``` ````;
    * the result is parsed with ``json.loads``;
    * on failure, ``_repair_truncated_json`` is tried.

    Args:
        raw: Raw model output. None is treated as an empty string.

    Returns:
        The parsed value (normally a dict), or None if the text cannot be
        parsed or repaired; that case is logged as a warning.
    """
    raw = raw or ""
    text = raw.strip()
    if text.startswith("```"):
        first_nl = text.find("\n")
        if first_nl != -1:
            # Multi-line block: the first line holds only the fence and an
            # optional language tag.
            text = text[first_nl + 1:]
        else:
            # Single-line block: drop the opening fence explicitly, otherwise
            # the search below would match it as the closing fence and leave
            # an empty string.
            text = text[3:].lstrip()
            starts = [p for p in (text.find("{"), text.find("[")) if p != -1]
            if starts:
                first = min(starts)
                # Skip a purely alphabetic language tag such as "json".
                if text[:first].strip().isalpha():
                    text = text[first:]
        # Find the closing fence (the model may add superfluous text after it).
        closing_idx = text.find("```")
        if closing_idx != -1:
            text = text[:closing_idx]
        text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Repair attempt (JSON truncated by max_tokens).
    repaired = _repair_truncated_json(text)
    if repaired is not None:
        logger.info("LLM : JSON tronqué réparé (%d chars)", len(text))
        return repaired

    logger.warning("LLM : JSON invalide : %s", raw[:200])
    return None


def call_ollama(
    prompt: str,
    temperature: float = 0.1,
    max_tokens: int = 2500,
    model: str | None = None,
    timeout: int | None = None,
    role: str | None = None,
) -> dict | None:
    """Call Ollama in native JSON mode, with an Anthropic fallback.

    Up to three attempts are made. An attempt is retried when Ollama answers
    HTTP 429 (after an exponential backoff) or when the answer cannot be
    parsed as JSON. A connection error or a timeout hands the prompt over to
    ``call_anthropic`` immediately; any other request error returns None.

    Args:
        prompt: The prompt to send.
        temperature: Generation temperature (default: 0.1).
        max_tokens: Maximum number of tokens to generate (default: 2500).
        model: Ollama model to use (takes precedence over ``role``).
        timeout: Timeout in seconds (default: the global OLLAMA_TIMEOUT).
        role: LLM role (coding, cpam, validation, qc), resolved through
            ``get_model()`` when ``model`` is not given.

    Returns:
        The parsed JSON value, or None on error or when every attempt fails.
    """
    use_model = model or (get_model(role) if role else OLLAMA_MODEL)
    use_timeout = timeout or OLLAMA_TIMEOUT
    max_attempts = 3
    # The request body does not change between attempts.
    payload = {
        "model": use_model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "format": "json",
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens,
        },
    }

    for attempt in range(max_attempts):
        is_last = attempt == max_attempts - 1
        try:
            response = requests.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=use_timeout,
            )
            # 429 rate limit -> retry with exponential backoff (1s, 2s).
            if response.status_code == 429:
                if is_last:
                    # No attempt left: sleeping here would only delay the
                    # failure.
                    logger.warning("Ollama 429 (rate limit) — abandon après %d tentatives",
                                   max_attempts)
                    break
                delay = 2 ** attempt
                logger.warning("Ollama 429 (rate limit) — retry dans %ds (tentative %d/%d)",
                               delay, attempt + 1, max_attempts)
                time.sleep(delay)
                continue
            response.raise_for_status()
            resp_data = response.json()
            # Be tolerant of an unexpected body shape (non-object body, null
            # "message" or "content"): treat it as an empty answer so it goes
            # through the normal parse-failure/retry path.
            if not isinstance(resp_data, dict):
                resp_data = {}
            message = resp_data.get("message")
            raw = (message.get("content") if isinstance(message, dict) else "") or ""
            done_reason = resp_data.get("done_reason", "")
            eval_count = resp_data.get("eval_count", 0) or 0
            if done_reason == "length":
                logger.warning("Ollama : réponse tronquée (done_reason=length, %d tokens, %d chars)",
                               eval_count, len(raw))
            else:
                logger.debug("Ollama : réponse complète (%d tokens, %d chars)", eval_count, len(raw))
            result = parse_json_response(raw)
            if result is not None:
                return result
            if not is_last:
                logger.info("Ollama (%s) : retry après échec de parsing", use_model)
        except requests.ConnectionError:
            logger.info("Ollama indisponible → fallback Anthropic (%s)", _ANTHROPIC_MODEL)
            return call_anthropic(prompt, temperature, max_tokens)
        except requests.Timeout:
            logger.warning("Ollama (%s) timeout après %ds → fallback Anthropic",
                           use_model, use_timeout)
            return call_anthropic(prompt, temperature, max_tokens)
        except (requests.RequestException, json.JSONDecodeError) as e:
            logger.warning("Ollama erreur : %s", e)
            return None
    return None