feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
78
setup.sh
Executable file
78
setup.sh
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/bin/bash
# =============================================================
# T2A Extractor — full installation
# Ubuntu 24.04 — Python 3.12+
#
# Recreates the virtualenv next to this script, installs
# requirements.txt into it, reports whether Ollama is available
# and creates the output/ directory.
# =============================================================
# errexit: stop at the first failing command.
# pipefail: a pipeline fails when any stage fails, not only the last
# one, so a failure on the left of a `| tail` is not hidden.
set -eo pipefail

# Absolute directory holding this script, so the paths below do not
# depend on the caller's working directory. `--` protects against a
# path starting with '-'; cd's stdout is discarded because cd prints
# the target directory when it resolves through CDPATH.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
VENV_DIR="$SCRIPT_DIR/.venv"

echo "============================================"
echo " T2A Extractor — Installation"
echo "============================================"

# --- 1. Check Python ---
# Makes sure python3 exists and can create virtualenvs, installing the
# missing distribution packages through apt-get (needs sudo) otherwise.
echo ""
echo "[1/5] Vérification de Python..."
if ! command -v python3 &>/dev/null; then
    echo " ✗ Python3 non trouvé. Installation..."
    sudo apt-get update && sudo apt-get install -y python3 python3-venv python3-pip
elif ! python3 -c 'import venv, ensurepip' &>/dev/null; then
    # python3 is present but cannot build a venv with pip in it; the
    # step that runs `python3 -m venv` would fail, so install the
    # missing packages now.
    echo " ✗ Module venv manquant. Installation..."
    sudo apt-get update && sudo apt-get install -y python3-venv python3-pip
fi
PYTHON_VERSION=$(python3 --version 2>&1)
echo " ✓ $PYTHON_VERSION"

# --- 2. Create the virtualenv ---
# Always starts from a fresh venv: an existing one is deleted first.
echo ""
echo "[2/5] Création de l'environnement virtuel..."
if [ -d "$VENV_DIR" ]; then
    echo " → Suppression de l'ancien venv..."
    # ${VENV_DIR:?} aborts instead of expanding to an empty path;
    # `--` stops option parsing for paths starting with '-'.
    rm -rf -- "${VENV_DIR:?}"
fi
python3 -m venv "$VENV_DIR"
# Activate in this shell so later python/pip calls use the venv.
# shellcheck source=/dev/null
source "$VENV_DIR/bin/activate"
# Upgrade pip through the interpreter, the form pip documents for
# upgrading itself.
python -m pip install --upgrade pip setuptools wheel -q
echo " ✓ Venv créé : $VENV_DIR"

# --- 3. Install dependencies ---
echo ""
echo "[3/5] Installation des dépendances Python..."
# pip's output is captured instead of piped straight into tail: in a
# plain pipeline the exit status is tail's, so a failed install would
# still be reported as a success. tail only prints once its input
# ends, so capturing first does not delay what the user sees.
if pip_log=$(pip install -r "$SCRIPT_DIR/requirements.txt" 2>&1); then
    printf '%s\n' "$pip_log" | tail -n 5
else
    # Show more context on failure, on stderr, then stop the install.
    printf '%s\n' "$pip_log" | tail -n 20 >&2
    echo " ✗ Échec de l'installation des dépendances" >&2
    exit 1
fi
echo " ✓ Dépendances installées"

# --- 4. Check Ollama ---
# Informational only: prints what to do next, never installs Ollama
# or pulls a model.
echo ""
echo "[4/5] Vérification d'Ollama..."
# Model tag shown in the hints below, kept in one place.
OLLAMA_MODEL_TAG="gemma3:27b-it-qat"
if command -v ollama &>/dev/null; then
    # Fall back to a fixed label when `ollama --version` itself fails.
    echo " ✓ Ollama installé : $(ollama --version 2>&1 || echo 'version inconnue')"
    echo " → Assurez-vous que le modèle est chargé : ollama pull $OLLAMA_MODEL_TAG"
else
    echo " ⚠ Ollama non installé."
    echo " → Installer avec : curl -fsSL https://ollama.com/install.sh | sh"
    echo " → Puis : ollama pull $OLLAMA_MODEL_TAG"
fi

# --- 5. Create the output directory ---
echo ""
echo "[5/5] Structure du projet..."
# -p: no error when the directory already exists.
mkdir -p -- "$SCRIPT_DIR/output"
echo " ✓ Dossier output créé"

# --- Summary ---
# Final usage notes. The heredoc delimiter is unquoted so $VENV_DIR is
# expanded; every other character is printed literally.
cat <<EOF

============================================
 Installation terminée !
============================================

 Activation du venv :
 source $VENV_DIR/bin/activate

 Usage :
 python main.py <fichier.pdf>
 python main.py <fichier.pdf> --csv --verbose

 Avant la première utilisation :
 1. Démarrer Ollama : ollama serve
 2. Charger le modèle : ollama pull gemma3:27b-it-qat
 3. Adapter config.py si nécessaire (OLLAMA_MODEL)

EOF
Reference in New Issue
Block a user