"""Détection du type de page.

Deux stratégies :
1. `route_by_index` (rapide, défaut) : exploite le fait que les fiches OGC
   respectent un ordre standardisé de 6 pages. Pas d'OCR, 0 coût.
2. `detect_page_type` (OCR de l'en-tête) : fallback quand l'ordre standard
   n'est pas respecté ou quand on veut vérifier explicitement.
"""
import re
import unicodedata
from pathlib import Path

from PIL import Image

from .ocr_qwen import QwenVLOCR
from .prompts import PAGE_TYPES


# Canonical order of the 6 pages in a standardized OGC file; position i holds
# the page type expected at page index i.
# NOTE(review): "concertation_2" is listed before "concertation_1" — confirm
# this matches the real page order of the forms.
STANDARD_ORDER: list[str] = [
    "recueil", "concertation_med", "hospitalisation",
    "preuves", "concertation_2", "concertation_1",
]


def route_by_index(num_pages: int) -> list[str]:
    """Return the expected page type of each page, based on position only.

    Pages are matched one-to-one against STANDARD_ORDER: a file shorter than
    the standard layout gets the corresponding prefix, and every page past
    the end of the standard layout is labelled "inconnu".
    """
    known = len(STANDARD_ORDER)
    return [
        STANDARD_ORDER[idx] if idx < known else "inconnu"
        for idx in range(num_pages)
    ]


def crop_header(image_path: Path, out_path: Path | None = None) -> Path:
    """Crop the header band (top 12% of the page) for quick classification.

    Important: the produced file must NOT match the 'page_*.png' glob that
    pdf_to_images uses to list pages, otherwise it would be read back as a
    page on the next run (with a broken aspect ratio). The default
    destination is therefore a dedicated ``_headers`` sub-directory.

    Args:
        image_path: Path of the full-page image to crop.
        out_path: Destination of the PNG crop. When ``None``, defaults to
            ``<image directory>/_headers/<image stem>.png``; the ``_headers``
            directory is created if it does not exist yet.

    Returns:
        The path the header crop was written to.
    """
    # Open through a context manager so the underlying file handle is
    # released deterministically instead of lingering until garbage
    # collection. The crop holds its own pixel data, so it stays usable
    # after the source image is closed.
    with Image.open(image_path) as img:
        width, height = img.size
        # Keep at least one pixel row: for very short images 12% truncates
        # to 0, which would give an empty crop.
        header_height = max(1, int(height * 0.12))
        header = img.crop((0, 0, width, header_height))
    if out_path is None:
        # Dedicated sub-directory to keep temporary crops apart from pages.
        headers_dir = image_path.parent / "_headers"
        headers_dir.mkdir(exist_ok=True)
        out_path = headers_dir / f"{image_path.stem}.png"
    header.save(out_path, "PNG")
    return out_path


def _fold_for_matching(text: str) -> str:
    """Return *text* upper-cased with every diacritic removed."""
    # NFD splits each accented letter into its base letter followed by
    # combining marks; dropping the marks leaves the bare letter. This covers
    # all accents (É, Ô, Î, Ç, Ù, ...) as well as OCR output that is already
    # in decomposed form.
    decomposed = unicodedata.normalize("NFD", text.upper())
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def detect_page_type(image_path: Path, ocr: QwenVLOCR | None = None) -> tuple[str, str]:
    """Classify a page from the OCR text of its header band.

    The header is cropped with ``crop_header``, OCR'd, and the recognized
    text is compared against the keywords of each entry of ``PAGE_TYPES``.
    Matching is case- and accent-insensitive; the first page type (in
    ``PAGE_TYPES`` iteration order) with a keyword contained in the header
    text wins.

    Args:
        image_path: Path of the full-page image to classify.
        ocr: OCR engine to use. When ``None`` a new ``QwenVLOCR`` is created
            for this call. NOTE(review): construction is presumably costly
            (model load) — callers classifying many pages should pass a
            shared instance; confirm against ``QwenVLOCR``.

    Returns:
        A ``(page_type, header_text)`` tuple, where ``header_text`` is the
        raw OCR output and ``page_type`` is ``"inconnu"`` when no keyword
        matches.
    """
    # Explicit None check: a caller-supplied engine must never be discarded
    # just because the object happens to evaluate as falsy.
    if ocr is None:
        ocr = QwenVLOCR()
    header_path = crop_header(image_path)
    res = ocr.run(header_path, "Text Recognition:", max_new_tokens=200)
    raw_text = res["text"]
    folded_text = _fold_for_matching(raw_text)
    for ptype, conf in PAGE_TYPES.items():
        # Keywords go through the same folding as the OCR text so both sides
        # are compared on equal terms, whatever accents or case they carry.
        if any(_fold_for_matching(kw) in folded_text for kw in conf["keywords"]):
            return ptype, raw_text
    return "inconnu", raw_text