Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
79 lines
2.5 KiB
Bash
Executable File
79 lines
2.5 KiB
Bash
Executable File
#!/bin/bash
# =============================================================
# T2A Extractor — Full installation
# Ubuntu 24.04 — Python 3.12+
#
# Steps performed by this script: check Python, (re)create the
# virtualenv, install requirements.txt, check for Ollama, create
# the output folder, then print usage hints.
# =============================================================

# Abort on the first failing command. pipefail makes a pipeline fail when
# any of its stages fails, so an error piped into a filter is not masked
# by the filter's own successful exit status.
set -eo pipefail

# Absolute path of the directory containing this script; every other path
# is derived from it so the script can be launched from any directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$SCRIPT_DIR/.venv"

echo "============================================"
echo " T2A Extractor — Installation"
echo "============================================"

# --- 1. Check Python ---
# Installs the interpreter when it is missing. When python3 exists but the
# venv/ensurepip modules cannot be imported, installs python3-venv so that
# the later `python3 -m venv` call can succeed.
# NOTE(review): assumes a Debian/Ubuntu host (apt), as the original did.
echo ""
echo "[1/5] Vérification de Python..."
if ! command -v python3 &>/dev/null; then
  echo " ✗ Python3 non trouvé. Installation..."
  sudo apt update && sudo apt install -y python3 python3-venv python3-pip
elif ! python3 -c 'import venv, ensurepip' &>/dev/null; then
  echo " ✗ Module venv/ensurepip manquant. Installation..."
  sudo apt update && sudo apt install -y python3-venv
fi
# Fails (and aborts under `set -e`) if python3 is still unavailable here.
PYTHON_VERSION=$(python3 --version 2>&1)
echo " ✓ $PYTHON_VERSION"

# --- 2. Create the venv ---
# Always starts from a clean environment: an existing venv directory is
# deleted before a new one is created.
printf '\n'
printf '%s\n' "[2/5] Création de l'environnement virtuel..."
if [[ -d "$VENV_DIR" ]]; then
  printf '%s\n' " → Suppression de l'ancien venv..."
  rm -rf "$VENV_DIR"
fi
python3 -m venv "$VENV_DIR"
# Activate inside this shell so the pip calls below use the venv's pip.
. "$VENV_DIR/bin/activate"
pip install --quiet --upgrade pip setuptools wheel
printf '%s\n' " ✓ Venv créé : $VENV_DIR"

# --- 3. Install dependencies ---
echo ""
echo "[3/5] Installation des dépendances Python..."
# pip's output is captured rather than piped straight into `tail`: without
# pipefail a pipeline's exit status is that of its last stage, so a failed
# install would be reported as a success. Capturing lets us test pip's own
# status and stop with a visible error.
if ! pip_output=$(pip install -r "$SCRIPT_DIR/requirements.txt" 2>&1); then
  # Show a longer tail on failure so the actual error is visible.
  printf '%s\n' "$pip_output" | tail -n 20 >&2
  echo " ✗ Échec de l'installation des dépendances" >&2
  exit 1
fi
printf '%s\n' "$pip_output" | tail -n 5
echo " ✓ Dépendances installées"

# --- 4. Check Ollama ---
# Informational only: nothing is installed and no model is pulled here;
# the script just tells the user which commands to run.
echo ""
echo "[4/5] Vérification d'Ollama..."
if command -v ollama >/dev/null 2>&1; then
  # Fall back to a placeholder when `ollama --version` itself fails.
  ollama_version="$(ollama --version 2>&1 || echo 'version inconnue')"
  echo " ✓ Ollama installé : $ollama_version"
  echo " → Assurez-vous que le modèle est chargé : ollama pull gemma3:27b-it-qat"
else
  echo " ⚠ Ollama non installé."
  echo " → Installer avec : curl -fsSL https://ollama.com/install.sh | sh"
  echo " → Puis : ollama pull gemma3:27b-it-qat"
fi

# --- 5. Create the output folder ---
# `mkdir -p` succeeds whether or not the folder already exists.
printf '\n'
printf '%s\n' "[5/5] Structure du projet..."
output_dir="$SCRIPT_DIR/output"
mkdir -p "$output_dir"
printf '%s\n' " ✓ Dossier output créé"

# --- Summary ---
# Final recap printed in one go: how to activate the venv, how to run the
# tool, and what to do before the first run. The here-document delimiter is
# unquoted so that $VENV_DIR is expanded.
cat <<EOF

============================================
 Installation terminée !
============================================

 Activation du venv :
 source $VENV_DIR/bin/activate

 Usage :
 python main.py <fichier.pdf>
 python main.py <fichier.pdf> --csv --verbose

 Avant la première utilisation :
 1. Démarrer Ollama : ollama serve
 2. Charger le modèle : ollama pull gemma3:27b-it-qat
 3. Adapter config.py si nécessaire (OLLAMA_MODEL)

EOF