feat: externalize configuration via .env + requirements audit
- Externalize 13 config variables via python-dotenv (PDF paths, Ollama/embedding/NER models, FINESS, thresholds), keeping the previous hard-coded values as defaults
- Centralize EMBEDDING_MODEL in config.py (it was hard-coded in 3 places)
- Add a documented .env.example and add .env to .gitignore
- Add the missing openpyxl and pandas to requirements.txt
- Add data/referentiels to the mkdir call in run.sh

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
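
The whole change follows one pattern, sketched below with values taken from the diff (nothing here is a new API: load_dotenv comes from python-dotenv, and the defaults are the old hard-coded values): load .env once at import time, then read each variable with os.environ.get and coerce it to the expected type.

    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads a local .env file, if present

    # String, int, float and bool variables all keep their former values as defaults
    OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:12b")
    OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120"))
    NER_CONFIDENCE_THRESHOLD = float(os.environ.get("T2A_NER_THRESHOLD", "0.80"))
    KEEP_ESTABLISHMENT_NAME = os.environ.get("T2A_KEEP_ESTABLISHMENT", "True").lower() in ("true", "1", "yes")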
.env.example (new file, 22 lines)
@@ -0,0 +1,22 @@
+# === Référentiels PDF (chemins absolus vers les PDFs ATIH) ===
+# T2A_CIM10_PDF=/chemin/vers/cim-10-fr.pdf
+# T2A_GUIDE_METHODO_PDF=/chemin/vers/guide_methodo_mco.pdf
+# T2A_CCAM_PDF=/chemin/vers/ccam_descriptive.pdf
+
+# === Ollama ===
+# OLLAMA_URL=http://localhost:11434
+# OLLAMA_MODEL=gemma3:12b
+# OLLAMA_TIMEOUT=120
+# OLLAMA_MAX_PARALLEL=2
+
+# === Modèles IA ===
+# T2A_EMBEDDING_MODEL=dangvantuan/sentence-camembert-large
+# T2A_NER_MODEL=Jean-Baptiste/camembert-ner
+# T2A_NER_THRESHOLD=0.80
+
+# === Établissement ===
+# T2A_FINESS=000000000
+# T2A_NUM_UM=0000
+
+# === Anonymisation ===
+# T2A_KEEP_ESTABLISHMENT=True

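Worth noting when working from .env.example (a property of python-dotenv's default behaviour, not something this commit changes): load_dotenv() does not override variables already set in the process environment, so an exported shell variable wins over the value in .env. A minimal illustration, with a hypothetical override:

    import os
    from dotenv import load_dotenv

    os.environ["OLLAMA_MODEL"] = "gemma3:27b"  # hypothetical value exported by the shell
    load_dotenv()                              # default override=False, so .env does not replace it
    print(os.environ["OLLAMA_MODEL"])          # -> gemma3:27b
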
.gitignore (3 lines added)
@@ -16,5 +16,8 @@ data/
 *.xls
 *.xlsx
 
+# Configuration locale
+.env
+
 # IDE / outils
 .claude/

requirements.txt (3 lines added)
@@ -11,3 +11,6 @@ faiss-cpu>=1.7.0
 sentence-transformers>=2.2.0
 requests>=2.28.0
 flask>=3.0.0
+python-dotenv>=1.0.0
+openpyxl>=3.0.0
+pandas>=2.0.0

run.sh (1 line changed)
@@ -27,7 +27,7 @@ else
 fi
 
 # Créer les répertoires nécessaires
-mkdir -p input output/anonymized output/structured output/reports data/rag_index
+mkdir -p input output/anonymized output/structured output/reports data/rag_index data/referentiels
 
 echo ""
 echo "✨ Application prête !"

config.py
@@ -2,11 +2,15 @@
 
 from __future__ import annotations
 
+import os
 from pathlib import Path
 from typing import Optional
 
+from dotenv import load_dotenv
 from pydantic import BaseModel, Field
 
+load_dotenv()
+
 
 # --- Chemins ---
 
@@ -23,24 +27,24 @@ for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
 
 # --- Configuration anonymisation ---
 
-KEEP_ESTABLISHMENT_NAME = True
+KEEP_ESTABLISHMENT_NAME = os.environ.get("T2A_KEEP_ESTABLISHMENT", "True").lower() in ("true", "1", "yes")
-NER_MODEL = "Jean-Baptiste/camembert-ner"
+NER_MODEL = os.environ.get("T2A_NER_MODEL", "Jean-Baptiste/camembert-ner")
-NER_CONFIDENCE_THRESHOLD = 0.80
+NER_CONFIDENCE_THRESHOLD = float(os.environ.get("T2A_NER_THRESHOLD", "0.80"))
 
 
 # --- Configuration Ollama ---
 
-OLLAMA_URL = "http://localhost:11434"
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
-OLLAMA_MODEL = "gemma3:12b"
+OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:12b")
-OLLAMA_TIMEOUT = 120
+OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120"))
 OLLAMA_CACHE_PATH = BASE_DIR / "data" / "ollama_cache.json"
-OLLAMA_MAX_PARALLEL = 2
+OLLAMA_MAX_PARALLEL = int(os.environ.get("OLLAMA_MAX_PARALLEL", "2"))
 
 
 # --- Configuration RUM / établissement ---
 
-FINESS = "000000000"
+FINESS = os.environ.get("T2A_FINESS", "000000000")
-NUM_UM = "0000"
+NUM_UM = os.environ.get("T2A_NUM_UM", "0000")
 
 
 # --- Configuration RAG ---
@@ -52,9 +56,13 @@ ALLOWED_EXTENSIONS = {".pdf", ".csv", ".xlsx", ".xls", ".txt"}
 CIM10_DICT_PATH = BASE_DIR / "data" / "cim10_dict.json"
 CIM10_SUPPLEMENTS_PATH = BASE_DIR / "data" / "cim10_supplements.json"
 CCAM_DICT_PATH = BASE_DIR / "data" / "ccam_dict.json"
-CIM10_PDF = Path("/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf")
+CIM10_PDF = Path(os.environ.get("T2A_CIM10_PDF", "/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf"))
-GUIDE_METHODO_PDF = Path("/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf")
+GUIDE_METHODO_PDF = Path(os.environ.get("T2A_GUIDE_METHODO_PDF", "/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf"))
-CCAM_PDF = Path("/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf")
+CCAM_PDF = Path(os.environ.get("T2A_CCAM_PDF", "/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf"))
 
+# --- Modèle d'embedding ---
+
+EMBEDDING_MODEL = os.environ.get("T2A_EMBEDDING_MODEL", "dangvantuan/sentence-camembert-large")
+
 
 # --- Modèles de données CIM-10 ---

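A quick sanity check of the "identical defaults" claim (hypothetical snippet, not part of the commit; it assumes config.py is importable as a top-level config module and that no .env file is present):

    import os

    # Make sure none of the sampled variables are set in the environment
    for var in ("OLLAMA_MODEL", "OLLAMA_TIMEOUT", "T2A_FINESS", "T2A_NER_THRESHOLD"):
        os.environ.pop(var, None)

    import config  # assumed import path for this sketch

    assert config.OLLAMA_MODEL == "gemma3:12b"
    assert config.OLLAMA_TIMEOUT == 120
    assert config.FINESS == "000000000"
    assert config.NER_CONFIDENCE_THRESHOLD == 0.80
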
RAG index builder module (file path not shown)
@@ -11,7 +11,7 @@ from typing import Optional
 
 import pdfplumber
 
-from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CCAM_DICT_PATH, REFERENTIELS_DIR
+from ..config import RAG_INDEX_DIR, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CCAM_DICT_PATH, REFERENTIELS_DIR, EMBEDDING_MODEL
 
 logger = logging.getLogger(__name__)
 
@@ -426,8 +426,8 @@ def build_index(force: bool = False) -> None:
     # Embeddings — GPU si disponible
     import torch
     _device = "cuda" if torch.cuda.is_available() else "cpu"
-    logger.info("Chargement du modèle d'embedding dangvantuan/sentence-camembert-large (%s)...", _device)
+    logger.info("Chargement du modèle d'embedding %s (%s)...", EMBEDDING_MODEL, _device)
-    model = SentenceTransformer("dangvantuan/sentence-camembert-large", device=_device)
+    model = SentenceTransformer(EMBEDDING_MODEL, device=_device)
     model.max_seq_length = 512  # CamemBERT max position embeddings
 
     texts = [c.text[:2000] for c in all_chunks]  # Tronquer les chunks trop longs

Second module using the embedding model (file path not shown)
@@ -8,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from ..config import (
     ActeCCAM, Diagnostic, DossierMedical, RAGSource,
     OLLAMA_CACHE_PATH, OLLAMA_MAX_PARALLEL, OLLAMA_MODEL,
+    EMBEDDING_MODEL,
 )
 from .cim10_dict import normalize_code, validate_code as cim10_validate
 from .cim10_extractor import BIO_NORMALS
@@ -36,12 +37,12 @@ def _get_embed_model():
     _device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         logger.info("Chargement du modèle d'embedding (%s)...", _device)
-        _embed_model = SentenceTransformer("dangvantuan/sentence-camembert-large", device=_device)
+        _embed_model = SentenceTransformer(EMBEDDING_MODEL, device=_device)
     except torch.OutOfMemoryError:
         if _device == "cuda":
             logger.warning("CUDA OOM pour l'embedding — fallback CPU")
             torch.cuda.empty_cache()
-            _embed_model = SentenceTransformer("dangvantuan/sentence-camembert-large", device="cpu")
+            _embed_model = SentenceTransformer(EMBEDDING_MODEL, device="cpu")
         else:
             raise
     _embed_model.max_seq_length = 512