From aa397d5360be39c10e7cda5d251656418defd9c5 Mon Sep 17 00:00:00 2001
From: dom
Date: Fri, 13 Feb 2026 19:46:33 +0100
Subject: [PATCH] feat: externalized configuration via .env + requirements audit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Externalize 13 config variables via python-dotenv (PDF paths, Ollama/embedding/NER models, FINESS, thresholds), keeping identical defaults
- Centralize EMBEDDING_MODEL in config.py (was hardcoded in 3 places)
- Add a documented .env.example and add .env to .gitignore
- Add missing openpyxl and pandas to requirements.txt
- Add data/referentiels to the mkdir in run.sh

Co-Authored-By: Claude Opus 4.6
---
 .env.example              | 22 ++++++++++++++++++++++
 .gitignore                |  3 +++
 requirements.txt          |  3 +++
 run.sh                    |  2 +-
 src/config.py             | 32 ++++++++++++++++++++------------
 src/medical/rag_index.py  |  6 +++---
 src/medical/rag_search.py |  5 +++--
 7 files changed, 55 insertions(+), 18 deletions(-)
 create mode 100644 .env.example

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..bb8101d
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,22 @@
+# === Référentiels PDF (chemins absolus vers les PDFs ATIH) ===
+# T2A_CIM10_PDF=/chemin/vers/cim-10-fr.pdf
+# T2A_GUIDE_METHODO_PDF=/chemin/vers/guide_methodo_mco.pdf
+# T2A_CCAM_PDF=/chemin/vers/ccam_descriptive.pdf
+
+# === Ollama ===
+# OLLAMA_URL=http://localhost:11434
+# OLLAMA_MODEL=gemma3:12b
+# OLLAMA_TIMEOUT=120
+# OLLAMA_MAX_PARALLEL=2
+
+# === Modèles IA ===
+# T2A_EMBEDDING_MODEL=dangvantuan/sentence-camembert-large
+# T2A_NER_MODEL=Jean-Baptiste/camembert-ner
+# T2A_NER_THRESHOLD=0.80
+
+# === Établissement ===
+# T2A_FINESS=000000000
+# T2A_NUM_UM=0000
+
+# === Anonymisation ===
+# T2A_KEEP_ESTABLISHMENT=True
diff --git a/.gitignore b/.gitignore
index 8770dd6..be505c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,5 +16,8 @@ data/
 *.xls
 *.xlsx
 
+# Configuration locale
+.env
+
 # IDE / outils
 .claude/
diff --git a/requirements.txt b/requirements.txt
index cf6fd69..db44d54 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,6 @@ faiss-cpu>=1.7.0
 sentence-transformers>=2.2.0
 requests>=2.28.0
 flask>=3.0.0
+python-dotenv>=1.0.0
+openpyxl>=3.0.0
+pandas>=2.0.0
diff --git a/run.sh b/run.sh
index 56b481c..f2f420f 100755
--- a/run.sh
+++ b/run.sh
@@ -27,7 +27,7 @@ else
 fi
 
 # Créer les répertoires nécessaires
-mkdir -p input output/anonymized output/structured output/reports data/rag_index
+mkdir -p input output/anonymized output/structured output/reports data/rag_index data/referentiels
 
 echo ""
 echo "✨ Application prête !"
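
Review note, not part of the patch: python-dotenv's load_dotenv() does not
overwrite variables already present in the process environment (its default is
override=False), so a value exported in the shell wins over the same key in
.env, and the hardcoded defaults in config.py only apply when neither source
defines the key. A minimal sketch of that precedence, assuming a .env file
next to the script; the variable name is taken from .env.example and the
alternate model name is purely illustrative:

    # precedence_demo.py, illustrative only, not part of the patch
    import os
    from dotenv import load_dotenv

    # Default is override=False: an OLLAMA_MODEL already exported in the
    # shell is kept; the .env value only fills in when the key is absent.
    load_dotenv()

    print("OLLAMA_MODEL =", os.environ.get("OLLAMA_MODEL", "gemma3:12b"))

Running it once plainly and once as "OLLAMA_MODEL=mistral python
precedence_demo.py" shows the shell value taking priority over .env.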
diff --git a/src/config.py b/src/config.py
index 615e238..2f7b98d 100644
--- a/src/config.py
+++ b/src/config.py
@@ -2,11 +2,15 @@
 
 from __future__ import annotations
 
+import os
 from pathlib import Path
 from typing import Optional
 
+from dotenv import load_dotenv
 from pydantic import BaseModel, Field
 
+load_dotenv()
+
 
 # --- Chemins ---
 
@@ -23,24 +27,24 @@ for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
 
 # --- Configuration anonymisation ---
 
-KEEP_ESTABLISHMENT_NAME = True
-NER_MODEL = "Jean-Baptiste/camembert-ner"
-NER_CONFIDENCE_THRESHOLD = 0.80
+KEEP_ESTABLISHMENT_NAME = os.environ.get("T2A_KEEP_ESTABLISHMENT", "True").lower() in ("true", "1", "yes")
+NER_MODEL = os.environ.get("T2A_NER_MODEL", "Jean-Baptiste/camembert-ner")
+NER_CONFIDENCE_THRESHOLD = float(os.environ.get("T2A_NER_THRESHOLD", "0.80"))
 
 
 # --- Configuration Ollama ---
 
-OLLAMA_URL = "http://localhost:11434"
-OLLAMA_MODEL = "gemma3:12b"
-OLLAMA_TIMEOUT = 120
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:12b")
+OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120"))
 OLLAMA_CACHE_PATH = BASE_DIR / "data" / "ollama_cache.json"
-OLLAMA_MAX_PARALLEL = 2
+OLLAMA_MAX_PARALLEL = int(os.environ.get("OLLAMA_MAX_PARALLEL", "2"))
 
 
 # --- Configuration RUM / établissement ---
 
-FINESS = "000000000"
-NUM_UM = "0000"
+FINESS = os.environ.get("T2A_FINESS", "000000000")
+NUM_UM = os.environ.get("T2A_NUM_UM", "0000")
 
 
 # --- Configuration RAG ---
@@ -52,9 +56,13 @@ ALLOWED_EXTENSIONS = {".pdf", ".csv", ".xlsx", ".xls", ".txt"}
 CIM10_DICT_PATH = BASE_DIR / "data" / "cim10_dict.json"
 CIM10_SUPPLEMENTS_PATH = BASE_DIR / "data" / "cim10_supplements.json"
 CCAM_DICT_PATH = BASE_DIR / "data" / "ccam_dict.json"
-CIM10_PDF = Path("/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf")
-GUIDE_METHODO_PDF = Path("/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf")
-CCAM_PDF = Path("/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf")
+CIM10_PDF = Path(os.environ.get("T2A_CIM10_PDF", "/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf"))
+GUIDE_METHODO_PDF = Path(os.environ.get("T2A_GUIDE_METHODO_PDF", "/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf"))
+CCAM_PDF = Path(os.environ.get("T2A_CCAM_PDF", "/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf"))
+
+# --- Modèle d'embedding ---
+
+EMBEDDING_MODEL = os.environ.get("T2A_EMBEDDING_MODEL", "dangvantuan/sentence-camembert-large")
 
 
 # --- Modèles de données CIM-10 ---
diff --git a/src/medical/rag_index.py b/src/medical/rag_index.py
index 183ab0a..0818f85 100644
--- a/src/medical/rag_index.py
+++ b/src/medical/rag_index.py
@@ -11,7 +11,7 @@ from typing import Optional
 
 import pdfplumber
 
-from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CCAM_DICT_PATH, REFERENTIELS_DIR
+from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CCAM_DICT_PATH, REFERENTIELS_DIR, EMBEDDING_MODEL
 
 logger = logging.getLogger(__name__)
 
@@ -426,8 +426,8 @@ def build_index(force: bool = False) -> None:
     # Embeddings — GPU si disponible
     import torch
     _device = "cuda" if torch.cuda.is_available() else "cpu"
-    logger.info("Chargement du modèle d'embedding dangvantuan/sentence-camembert-large (%s)...", _device)
-    model = SentenceTransformer("dangvantuan/sentence-camembert-large", device=_device)
+    logger.info("Chargement du modèle d'embedding %s (%s)...", EMBEDDING_MODEL, _device)
+    model = SentenceTransformer(EMBEDDING_MODEL, device=_device)
     model.max_seq_length = 512  # CamemBERT max position embeddings
     texts = [c.text[:2000] for c in all_chunks]  # Tronquer les chunks trop longs
 
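
Review note, not part of the patch: the coercion pattern in config.py above,
.lower() in ("true", "1", "yes") for booleans and int()/float() around
os.environ.get() for numbers, is repeated inline for each variable. A hedged
sketch of small helpers that keep the same semantics if the set grows beyond
these 13 variables; env_bool and env_int are hypothetical names, not in the
patch:

    import os

    def env_bool(name: str, default: bool) -> bool:
        # Same truthy set as config.py: "true", "1", "yes" (case-insensitive).
        raw = os.environ.get(name)
        if raw is None:
            return default
        return raw.strip().lower() in ("true", "1", "yes")

    def env_int(name: str, default: int) -> int:
        # Mirrors int(os.environ.get(NAME, "120")) but keeps a typed default.
        raw = os.environ.get(name)
        return int(raw) if raw is not None else default

    KEEP_ESTABLISHMENT_NAME = env_bool("T2A_KEEP_ESTABLISHMENT", True)
    OLLAMA_TIMEOUT = env_int("OLLAMA_TIMEOUT", 120)

Either way, a malformed value such as OLLAMA_TIMEOUT=abc raises ValueError at
import time, which is reasonable failure behavior for configuration.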
diff --git a/src/medical/rag_search.py b/src/medical/rag_search.py
index 9d8bea5..a8a8621 100644
--- a/src/medical/rag_search.py
+++ b/src/medical/rag_search.py
@@ -8,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from ..config import (
     ActeCCAM, Diagnostic, DossierMedical, RAGSource,
     OLLAMA_CACHE_PATH, OLLAMA_MAX_PARALLEL, OLLAMA_MODEL,
+    EMBEDDING_MODEL,
 )
 from .cim10_dict import normalize_code, validate_code as cim10_validate
 from .cim10_extractor import BIO_NORMALS
@@ -36,12 +37,12 @@ def _get_embed_model():
     _device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         logger.info("Chargement du modèle d'embedding (%s)...", _device)
-        _embed_model = SentenceTransformer("dangvantuan/sentence-camembert-large", device=_device)
+        _embed_model = SentenceTransformer(EMBEDDING_MODEL, device=_device)
     except torch.OutOfMemoryError:
         if _device == "cuda":
             logger.warning("CUDA OOM pour l'embedding — fallback CPU")
             torch.cuda.empty_cache()
-            _embed_model = SentenceTransformer("dangvantuan/sentence-camembert-large", device="cpu")
+            _embed_model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
         else:
             raise
     _embed_model.max_seq_length = 512
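
Review note, not part of the patch: config.py reads the environment at import
time, so any override must be in place before the first import. A rough smoke
test for the EMBEDDING_MODEL centralization; it assumes the repo root is on
sys.path, that src is importable as a package, and it borrows a public model
name purely as a placeholder:

    # smoke_test_embedding.py, illustrative sketch only
    import os

    # Must run before importing config: the module calls load_dotenv()
    # and reads os.environ once, at import time, and load_dotenv's
    # default override=False leaves this pre-set value intact.
    os.environ["T2A_EMBEDDING_MODEL"] = "sentence-transformers/all-MiniLM-L6-v2"

    from src import config

    assert config.EMBEDDING_MODEL == "sentence-transformers/all-MiniLM-L6-v2"
    print("EMBEDDING_MODEL =", config.EMBEDDING_MODEL)

The same ordering applies to rag_index.build_index() and to rag_search, since
both now import EMBEDDING_MODEL from config rather than re-reading the
environment themselves.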