feat: ajout RAG CIM-10 avec FAISS + Ollama
Implémente un système RAG (Retrieval Augmented Generation) qui indexe les documents de référence ATIH (CIM-10 FR 2026, Guide Métho MCO, CCAM PMSI) et utilise Ollama (mistral-small3.2:24b) pour justifier et valider le codage CIM-10 des diagnostics.

- Nouveaux modèles Pydantic : RAGSource, Diagnostic étendu (confidence, justification, sources_rag) — rétrocompatible
- Module rag_index.py : chunking des 3 PDFs, embedding sentence-camembert-large, index FAISS IndexFlatIP (3630 vecteurs)
- Module rag_search.py : recherche FAISS + appel Ollama avec fallback double
- Flag CLI --no-rag pour désactiver l'enrichissement RAG
- 18 nouveaux tests (88/88 passent)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
352
src/medical/rag_index.py
Normal file
352
src/medical/rag_index.py
Normal file
@@ -0,0 +1,352 @@
|
||||
"""Indexation FAISS des documents de référence CIM-10 / Guide métho / CCAM."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pdfplumber
|
||||
|
||||
from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Singleton pour l'index chargé en mémoire
|
||||
_faiss_index = None
|
||||
_metadata: list[dict] = []
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """One indexable passage extracted from a reference PDF."""

    # Raw passage text; embedded at build time (truncated to 2000 chars).
    text: str
    # Source document identifier: "cim10", "guide_methodo" or "ccam".
    document: str
    # 1-based page number where the passage starts, when known.
    page: Optional[int] = None
    # CIM-10 or CCAM code attached to the passage; None for chunkers that
    # split by section titles (guide_methodo).
    code: Optional[str] = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chunking CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _chunk_cim10(pdf_path: Path) -> list[Chunk]:
    """Split the CIM-10 PDF into chunks keyed by 3-character codes (e.g. K80, K85).

    A chunk starts on a line opening with a 3-character code and gathers every
    following line (including sub-code lines such as K80.0) until the next
    3-character code. Chunks shorter than 5 words are dropped as noise.

    Args:
        pdf_path: Path to the CIM-10 reference PDF.

    Returns:
        List of Chunk objects with document="cim10" and the code attached.
    """
    chunks: list[Chunk] = []
    current_code: str | None = None
    current_text: list[str] = []
    current_page: int | None = None

    # A 3-character CIM-10 code at the start of a line (e.g. "K80 ...").
    code3_pattern = re.compile(r"^([A-Z]\d{2})\s+(.+)")
    # Sub-codes (e.g. "K80.0 ...") — defensively excluded from opening a chunk.
    subcode_pattern = re.compile(r"^([A-Z]\d{2}\.\d+)\s+(.+)")

    def _flush() -> None:
        # Append the accumulated chunk when it carries enough content
        # (single shared path for both the in-loop and trailing flush).
        if current_code and current_text:
            chunk_text = "\n".join(current_text)
            if len(chunk_text.split()) >= 5:
                chunks.append(Chunk(
                    text=chunk_text,
                    document="cim10",
                    page=current_page,
                    code=current_code,
                ))

    logger.info("Extraction des chunks CIM-10 depuis %s", pdf_path.name)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                m = code3_pattern.match(line)
                if m and not subcode_pattern.match(line):
                    # New 3-character code → save the previous chunk.
                    _flush()
                    current_code = m.group(1)
                    current_text = [line]
                    current_page = page_num
                elif current_code:
                    # Continuation line: only kept once a code has been seen.
                    current_text.append(line)

    # Trailing chunk.
    _flush()

    logger.info("CIM-10 : %d chunks extraits", len(chunks))
    return chunks
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chunking Guide Méthodologique MCO
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _chunk_guide_methodo(pdf_path: Path) -> list[Chunk]:
    """Split the MCO methodological guide into chunks by section titles.

    Lines matching one of the title patterns (chapter headings, numbered
    sections, long all-caps lines) open a new chunk; everything until the next
    title is attached to it. If fewer than 10 chunks are found (the PDF does
    not follow the expected heading layout), falls back to grouping pages
    three by three. Chunks shorter than 20 words are dropped.

    Args:
        pdf_path: Path to the MCO methodological guide PDF.

    Returns:
        List of Chunk objects with document="guide_methodo" (no code).
    """
    chunks: list[Chunk] = []
    current_title: str | None = None
    current_text: list[str] = []
    current_page: int | None = None

    # Section-title heuristics: chapters/parts, numbered headings, all-caps.
    title_patterns = [
        re.compile(r"^((?:CHAPITRE|TITRE|PARTIE)\s+[IVXLCDM0-9]+.*)$", re.IGNORECASE),
        re.compile(r"^(\d+\.\d*\s+[A-ZÉÈÊÀÂÔÙÛÜ].{5,})$"),
        re.compile(r"^([A-ZÉÈÊÀÂÔÙÛÜ][A-ZÉÈÊÀÂÔÙÛÜ\s]{10,})$"),
    ]

    def _flush() -> None:
        # Append the accumulated section when it carries enough content
        # (single shared path for both the in-loop and trailing flush).
        if current_title and current_text:
            chunk_text = current_title + "\n" + "\n".join(current_text)
            if len(chunk_text.split()) >= 20:
                chunks.append(Chunk(
                    text=chunk_text,
                    document="guide_methodo",
                    page=current_page,
                ))

    logger.info("Extraction des chunks Guide Métho depuis %s", pdf_path.name)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                is_title = any(pat.match(line) for pat in title_patterns)

                if is_title and len(line) > 8:
                    # New section heading → save the previous section.
                    _flush()
                    current_title = line
                    current_text = []
                    current_page = page_num
                else:
                    current_text.append(line)

    # Trailing section.
    _flush()

    # Fallback: the PDF does not follow the title patterns — too few chunks
    # were detected, so re-split by groups of 3 pages instead.
    if len(chunks) < 10:
        logger.info("Guide Métho : fallback découpe par pages (peu de titres détectés)")
        chunks = []
        with pdfplumber.open(pdf_path) as pdf:
            page_texts: list[str] = []
            start_page = 1
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    page_texts.append(text)
                if len(page_texts) >= 3:
                    combined = "\n".join(page_texts)
                    if len(combined.split()) >= 20:
                        chunks.append(Chunk(
                            text=combined,
                            document="guide_methodo",
                            page=start_page,
                        ))
                    page_texts = []
                    start_page = page_num + 1
            # Remaining pages that did not fill a full group of three.
            if page_texts:
                combined = "\n".join(page_texts)
                if len(combined.split()) >= 20:
                    chunks.append(Chunk(
                        text=combined,
                        document="guide_methodo",
                        page=start_page,
                    ))

    logger.info("Guide Métho : %d chunks extraits", len(chunks))
    return chunks
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chunking CCAM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _chunk_ccam(pdf_path: Path) -> list[Chunk]:
    """Split the CCAM PDF into one chunk per procedure code.

    A chunk opens on a line starting with a 7-character act code (4 letters +
    3 digits, e.g. HMFC004) and collects the following lines until the next
    code. Codes are not carried across page boundaries. If no code at all is
    detected, falls back to one chunk per page (pages with ≥ 10 words).

    Args:
        pdf_path: Path to the CCAM reference PDF.

    Returns:
        List of Chunk objects with document="ccam".
    """
    results: list[Chunk] = []
    act_code_re = re.compile(r"([A-Z]{4}\d{3})\s+(.*)")

    logger.info("Extraction des chunks CCAM depuis %s", pdf_path.name)

    with pdfplumber.open(pdf_path) as pdf:
        for num, pg in enumerate(pdf.pages, start=1):
            raw = pg.extract_text()
            if not raw:
                continue

            # State is reset per page: acts never span a page boundary here.
            code: str | None = None
            buffer: list[str] = []

            cleaned = [ln.strip() for ln in raw.split("\n") if ln.strip()]
            for ln in cleaned:
                hit = act_code_re.match(ln)
                if hit is None:
                    # Continuation line: only kept once a code has been seen.
                    if code:
                        buffer.append(ln)
                    continue
                # New act code → save the previous chunk.
                if code and buffer:
                    results.append(Chunk(
                        text="\n".join(buffer),
                        document="ccam",
                        page=num,
                        code=code,
                    ))
                code = hit.group(1)
                buffer = [ln]

            # Trailing chunk of the page.
            if code and buffer:
                results.append(Chunk(
                    text="\n".join(buffer),
                    document="ccam",
                    page=num,
                    code=code,
                ))

    # Fallback: no act code detected anywhere — index page by page.
    if not results:
        logger.info("CCAM : aucun code détecté, fallback par page")
        with pdfplumber.open(pdf_path) as pdf:
            for num, pg in enumerate(pdf.pages, start=1):
                raw = pg.extract_text()
                if raw and len(raw.split()) >= 10:
                    results.append(Chunk(
                        text=raw,
                        document="ccam",
                        page=num,
                    ))

    logger.info("CCAM : %d chunks extraits", len(results))
    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Construction de l'index FAISS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_index(force: bool = False) -> None:
    """Build the FAISS index from the three reference PDFs.

    Chunks each available PDF, embeds the chunks with
    dangvantuan/sentence-camembert-large (CPU), stores the vectors in an
    IndexFlatIP index, and writes the index plus a metadata JSON (500-char
    excerpts instead of full texts) under RAG_INDEX_DIR.

    Args:
        force: When True, rebuild even if the index files already exist.
    """
    # Heavy dependencies are imported lazily: only needed at build time.
    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    index_path = RAG_INDEX_DIR / "faiss.index"
    meta_path = RAG_INDEX_DIR / "metadata.json"

    already_built = index_path.exists() and meta_path.exists()
    if already_built and not force:
        logger.info("Index FAISS déjà existant dans %s (use force=True pour reconstruire)", RAG_INDEX_DIR)
        return

    # Gather chunks from every reference document that is present on disk.
    sources = (
        (CIM10_PDF, _chunk_cim10),
        (GUIDE_METHODO_PDF, _chunk_guide_methodo),
        (CCAM_PDF, _chunk_ccam),
    )
    all_chunks: list[Chunk] = []
    for pdf_path, chunk_fn in sources:
        if not pdf_path.exists():
            logger.warning("PDF non trouvé : %s", pdf_path)
            continue
        all_chunks.extend(chunk_fn(pdf_path))

    if not all_chunks:
        logger.error("Aucun chunk extrait — vérifiez les chemins des PDFs")
        return

    logger.info("Total : %d chunks à indexer", len(all_chunks))

    # Embedding model — pinned to CPU to avoid CUDA issues with this model.
    logger.info("Chargement du modèle d'embedding dangvantuan/sentence-camembert-large (CPU)...")
    encoder = SentenceTransformer("dangvantuan/sentence-camembert-large", device="cpu")
    encoder.max_seq_length = 512  # CamemBERT position-embedding limit.

    # Over-long chunks are truncated before encoding.
    corpus = [chunk.text[:2000] for chunk in all_chunks]
    logger.info("Calcul des embeddings pour %d chunks...", len(corpus))
    vectors = np.array(
        encoder.encode(
            corpus, show_progress_bar=True, normalize_embeddings=True, batch_size=64,
        ),
        dtype=np.float32,
    )

    # IndexFlatIP over normalized vectors == cosine similarity.
    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    # Persist the index and a lightweight metadata file: the full chunk text
    # would make the JSON too heavy, so only a 500-char excerpt is kept.
    RAG_INDEX_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(index_path))

    metadata = [asdict(chunk) for chunk in all_chunks]
    for entry in metadata:
        entry["extrait"] = entry.pop("text")[:500]

    meta_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")

    logger.info("Index FAISS sauvegardé : %s (%d vecteurs, dim=%d)", index_path, len(all_chunks), dim)
|
||||
|
||||
|
||||
def get_index() -> tuple | None:
    """Load the FAISS index and its metadata (lazily cached module singleton).

    Returns:
        A (faiss_index, metadata_list) tuple, or None when the index files do
        not exist yet (build_index() has never been run).
    """
    global _faiss_index, _metadata

    # Already loaded once in this process: serve the in-memory copy.
    if _faiss_index is not None:
        return _faiss_index, _metadata

    index_path = RAG_INDEX_DIR / "faiss.index"
    meta_path = RAG_INDEX_DIR / "metadata.json"

    if not (index_path.exists() and meta_path.exists()):
        logger.warning("Index FAISS non trouvé dans %s — lancez build_index() d'abord", RAG_INDEX_DIR)
        return None

    # Imported lazily so the module stays usable without faiss installed,
    # as long as the index is never actually loaded.
    import faiss

    _faiss_index = faiss.read_index(str(index_path))
    _metadata = json.loads(meta_path.read_text(encoding="utf-8"))

    logger.info("Index FAISS chargé : %d vecteurs", _faiss_index.ntotal)
    return _faiss_index, _metadata
|
||||
Reference in New Issue
Block a user