fix: align model name to gemma3:27b-cloud + add architecture diagram & deployment zip
setup.sh and README.md referenced gemma3:27b-it-qat while config.py uses gemma3:27b-cloud. Added architecture.html (Mermaid pipeline diagram) and t2a-extractor.zip for collaborator deployment.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -51,7 +51,7 @@ python main.py rapport_ucr.pdf -o /chemin/sortie --csv -v
|
|||||||
## Prérequis
|
## Prérequis
|
||||||
|
|
||||||
- Python 3.12+
|
- Python 3.12+
|
||||||
- Ollama avec un VLM (gemma3:27b-it-qat par défaut)
|
- Ollama avec un VLM (gemma3:27b-cloud par défaut)
|
||||||
- GPU recommandé pour docTR (fonctionne aussi en CPU)
|
- GPU recommandé pour docTR (fonctionne aussi en CPU)
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|||||||
115
architecture.html
Normal file
115
architecture.html
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="fr">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>T2A Extractor — Architecture</title>
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||||
|
background: #f5f5f5;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
align-items: center;
|
||||||
|
padding: 2rem;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
h1 { color: #333; margin-bottom: 0.5rem; }
|
||||||
|
p.subtitle { color: #666; margin-top: 0; }
|
||||||
|
.mermaid {
|
||||||
|
background: white;
|
||||||
|
border-radius: 12px;
|
||||||
|
padding: 2rem;
|
||||||
|
box-shadow: 0 2px 12px rgba(0,0,0,0.1);
|
||||||
|
max-width: 95vw;
|
||||||
|
overflow-x: auto;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>T2A Extractor</h1>
|
||||||
|
<p class="subtitle">Pipeline d'extraction structurée de rapports UCR</p>
|
||||||
|
<div class="mermaid">
|
||||||
|
flowchart TD
|
||||||
|
subgraph INPUT["📄 Entrée"]
|
||||||
|
PDF["PDF UCR<br/>natif + scanné"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph ETAPE1["📖 Étape 1 — Extraction texte"]
|
||||||
|
DETECT{"Page native<br/>ou scannée ?"}
|
||||||
|
PYMUPDF["<b>PyMuPDF</b><br/>texte natif"]
|
||||||
|
DOCTR["<b>docTR + Torch</b><br/>OCR"]
|
||||||
|
MERGE(["Texte brut complet"])
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph ETAPE2["✂️ Étape 2 — Segmentation"]
|
||||||
|
REGEX["Regex<br/>par Champ + OGC"]
|
||||||
|
OGC_BLOCKS["Blocs OGC<br/>individuels / groupés"]
|
||||||
|
CHAMP_BLOCKS["Blocs Champ<br/>décisions globales"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph ETAPE3["🤖 Étape 3 — Extraction structurée"]
|
||||||
|
OLLAMA["<b>Ollama</b><br/>gemma3:27b-cloud"]
|
||||||
|
JSON["JSON structuré<br/>11 champs par OGC"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph ETAPE35["🔧 Étape 3.5 — Normalisation"]
|
||||||
|
CIM10["Correction codes CIM-10<br/>&bull; OCR chiffre ↔ lettre<br/>&bull; Point manquant / mal placé<br/>&bull; Décimales excédentaires"]
|
||||||
|
RETENUS["Auto-remplissage<br/>codes_retenus"]
|
||||||
|
TEXTE["Fallback regex<br/>texte_decision"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph ETAPE4["✅ Étape 4 — Validation"]
|
||||||
|
VALID["Vérification formats<br/>CIM-10 / CCAM"]
|
||||||
|
SAFETY["Safety-net<br/>2e passe normalizer"]
|
||||||
|
COHERENCE["Cohérence<br/>décision ↔ codes"]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph ETAPE5["📊 Étape 5 — Export"]
|
||||||
|
EXCEL["<b>Excel</b> .xlsx<br/>coloration décisions"]
|
||||||
|
CSV["<b>CSV</b><br/>optionnel"]
|
||||||
|
end
|
||||||
|
|
||||||
|
PDF --> DETECT
|
||||||
|
DETECT -->|"≥ 50 chars"| PYMUPDF
|
||||||
|
DETECT -->|"< 50 chars"| DOCTR
|
||||||
|
PYMUPDF --> MERGE
|
||||||
|
DOCTR --> MERGE
|
||||||
|
|
||||||
|
MERGE --> REGEX
|
||||||
|
REGEX --> OGC_BLOCKS
|
||||||
|
REGEX --> CHAMP_BLOCKS
|
||||||
|
|
||||||
|
OGC_BLOCKS --> OLLAMA
|
||||||
|
CHAMP_BLOCKS --> OLLAMA
|
||||||
|
OLLAMA --> JSON
|
||||||
|
|
||||||
|
JSON --> CIM10
|
||||||
|
CIM10 --> RETENUS
|
||||||
|
RETENUS --> TEXTE
|
||||||
|
|
||||||
|
TEXTE --> VALID
|
||||||
|
VALID --> SAFETY
|
||||||
|
SAFETY --> COHERENCE
|
||||||
|
|
||||||
|
COHERENCE --> EXCEL
|
||||||
|
COHERENCE --> CSV
|
||||||
|
|
||||||
|
style INPUT fill:#e8f4fd,stroke:#2196F3,stroke-width:2px,color:#000
|
||||||
|
style ETAPE1 fill:#fff3e0,stroke:#FF9800,stroke-width:2px,color:#000
|
||||||
|
style ETAPE2 fill:#f3e5f5,stroke:#9C27B0,stroke-width:2px,color:#000
|
||||||
|
style ETAPE3 fill:#e8f5e9,stroke:#4CAF50,stroke-width:2px,color:#000
|
||||||
|
style ETAPE35 fill:#fce4ec,stroke:#E91E63,stroke-width:2px,color:#000
|
||||||
|
style ETAPE4 fill:#fff8e1,stroke:#FFC107,stroke-width:2px,color:#000
|
||||||
|
style ETAPE5 fill:#e0f2f1,stroke:#009688,stroke-width:2px,color:#000
|
||||||
|
</div>
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"></script>
|
||||||
|
<script>
|
||||||
|
mermaid.initialize({
|
||||||
|
startOnLoad: true,
|
||||||
|
theme: 'default',
|
||||||
|
flowchart: { useMaxWidth: true, htmlLabels: true, curve: 'basis' },
|
||||||
|
securityLevel: 'loose'
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
6
setup.sh
6
setup.sh
@@ -46,10 +46,10 @@ echo "[4/5] Vérification d'Ollama..."
|
|||||||
if ! command -v ollama &>/dev/null; then
|
if ! command -v ollama &>/dev/null; then
|
||||||
echo " ⚠ Ollama non installé."
|
echo " ⚠ Ollama non installé."
|
||||||
echo " → Installer avec : curl -fsSL https://ollama.com/install.sh | sh"
|
echo " → Installer avec : curl -fsSL https://ollama.com/install.sh | sh"
|
||||||
echo " → Puis : ollama pull gemma3:27b-it-qat"
|
echo " → Puis : ollama pull gemma3:27b-cloud"
|
||||||
else
|
else
|
||||||
echo " ✓ Ollama installé : $(ollama --version 2>&1 || echo 'version inconnue')"
|
echo " ✓ Ollama installé : $(ollama --version 2>&1 || echo 'version inconnue')"
|
||||||
echo " → Assurez-vous que le modèle est chargé : ollama pull gemma3:27b-it-qat"
|
echo " → Assurez-vous que le modèle est chargé : ollama pull gemma3:27b-cloud"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- 5. Créer le dossier output ---
|
# --- 5. Créer le dossier output ---
|
||||||
@@ -73,6 +73,6 @@ echo " python main.py <fichier.pdf> --csv --verbose"
|
|||||||
echo ""
|
echo ""
|
||||||
echo " Avant la première utilisation :"
|
echo " Avant la première utilisation :"
|
||||||
echo " 1. Démarrer Ollama : ollama serve"
|
echo " 1. Démarrer Ollama : ollama serve"
|
||||||
echo " 2. Charger le modèle : ollama pull gemma3:27b-it-qat"
|
echo " 2. Charger le modèle : ollama pull gemma3:27b-cloud"
|
||||||
echo " 3. Adapter config.py si nécessaire (OLLAMA_MODEL)"
|
echo " 3. Adapter config.py si nécessaire (OLLAMA_MODEL)"
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
BIN
t2a-extractor.zip
Normal file
BIN
t2a-extractor.zip
Normal file
Binary file not shown.
Reference in New Issue
Block a user