setup.sh and README.md referenced gemma3:27b-it-qat while config.py uses gemma3:27b-cloud. Added architecture.html (Mermaid pipeline diagram) and t2a-extractor.zip for collaborator deployment. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
116 lines
4.0 KiB
HTML
116 lines
4.0 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="fr">
|
|
<head>
|
|
<meta charset="UTF-8">
<!-- Responsive viewport: the stylesheet below sizes the diagram card in
     viewport units (95vw), which only behaves as intended on mobile when
     the layout viewport matches the device width. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>T2A Extractor — Architecture</title>
|
|
<style>
|
|
/* Page shell: a centered, single-column flex layout on a light grey canvas. */
body {
    /* Box model */
    margin: 0;
    padding: 2rem;

    /* Stack children vertically and center them horizontally */
    display: flex;
    flex-direction: column;
    align-items: center;

    /* Look and feel */
    background: #f5f5f5;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}
|
|
/* Main page heading; tight bottom margin so the subtitle sits close to it. */
h1 {
    margin-bottom: 0.5rem;
    color: #333;
}

/* Subtitle paragraph directly under the heading, in a lighter grey. */
p.subtitle {
    margin-top: 0;
    color: #666;
}
|
|
/* Card that hosts the rendered Mermaid diagram. */
.mermaid {
    /* Never exceed the viewport; scroll horizontally if the diagram is wider */
    max-width: 95vw;
    overflow-x: auto;

    /* White rounded card with a soft drop shadow */
    padding: 2rem;
    background: white;
    border-radius: 12px;
    box-shadow: 0 2px 12px rgba(0,0,0,0.1);
}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<h1>T2A Extractor</h1>
|
|
<p class="subtitle">Pipeline d'extraction structurée de rapports UCR</p>
|
|
<div class="mermaid">
|
|
flowchart TD
    %% Pipeline overview, drawn top-down. Each subgraph is one pipeline stage.
    %% Node declarations come first, then the edges, then the stage colors.

    %% Stage 0: input document
    subgraph INPUT["📄 Entrée"]
        PDF["PDF UCR<br/>natif + scanné"]
    end

    %% Stage 1: per-page text extraction, native text or OCR
    subgraph ETAPE1["📖 Étape 1 — Extraction texte"]
        DETECT{"Page native<br/>ou scannée ?"}
        PYMUPDF["<b>PyMuPDF</b><br/>texte natif"]
        DOCTR["<b>docTR + Torch</b><br/>OCR"]
        MERGE(["Texte brut complet"])
    end

    %% Stage 2: regex segmentation into OGC blocks and Champ blocks
    subgraph ETAPE2["✂️ Étape 2 — Segmentation"]
        REGEX["Regex<br/>par Champ + OGC"]
        OGC_BLOCKS["Blocs OGC<br/>individuels / groupés"]
        CHAMP_BLOCKS["Blocs Champ<br/>décisions globales"]
    end

    %% Stage 3: structured extraction through the Ollama model
    subgraph ETAPE3["🤖 Étape 3 — Extraction structurée"]
        OLLAMA["<b>Ollama</b><br/>gemma3:27b-cloud"]
        JSON["JSON structuré<br/>11 champs par OGC"]
    end

    %% Stage 3.5: normalization of the extracted fields
    subgraph ETAPE35["🔧 Étape 3.5 — Normalisation"]
        CIM10["Correction codes CIM-10<br/>&bull; OCR chiffre ↔ lettre<br/>&bull; Point manquant / mal placé<br/>&bull; Décimales excédentaires"]
        RETENUS["Auto-remplissage<br/>codes_retenus"]
        TEXTE["Fallback regex<br/>texte_decision"]
    end

    %% Stage 4: validation checks
    subgraph ETAPE4["✅ Étape 4 — Validation"]
        VALID["Vérification formats<br/>CIM-10 / CCAM"]
        SAFETY["Safety-net<br/>2e passe normalizer"]
        COHERENCE["Cohérence<br/>décision ↔ codes"]
    end

    %% Stage 5: export targets
    subgraph ETAPE5["📊 Étape 5 — Export"]
        EXCEL["<b>Excel</b> .xlsx<br/>coloration décisions"]
        CSV["<b>CSV</b><br/>optionnel"]
    end

    %% Edges, stage 1. The two branches are labelled with the 50 character threshold.
    %% The less-than sign is written as an HTML entity because this text sits
    %% inside an HTML element and a bare one is an HTML parse error.
    PDF --> DETECT
    DETECT -->|"≥ 50 chars"| PYMUPDF
    DETECT -->|"&lt; 50 chars"| DOCTR
    PYMUPDF --> MERGE
    DOCTR --> MERGE

    %% Edges, stage 2
    MERGE --> REGEX
    REGEX --> OGC_BLOCKS
    REGEX --> CHAMP_BLOCKS

    %% Edges, stage 3
    OGC_BLOCKS --> OLLAMA
    CHAMP_BLOCKS --> OLLAMA
    OLLAMA --> JSON

    %% Edges, stage 3.5
    JSON --> CIM10
    CIM10 --> RETENUS
    RETENUS --> TEXTE

    %% Edges, stage 4
    TEXTE --> VALID
    VALID --> SAFETY
    SAFETY --> COHERENCE

    %% Edges, stage 5
    COHERENCE --> EXCEL
    COHERENCE --> CSV

    %% One fill and stroke color per stage
    style INPUT fill:#e8f4fd,stroke:#2196F3,stroke-width:2px,color:#000
    style ETAPE1 fill:#fff3e0,stroke:#FF9800,stroke-width:2px,color:#000
    style ETAPE2 fill:#f3e5f5,stroke:#9C27B0,stroke-width:2px,color:#000
    style ETAPE3 fill:#e8f5e9,stroke:#4CAF50,stroke-width:2px,color:#000
    style ETAPE35 fill:#fce4ec,stroke:#E91E63,stroke-width:2px,color:#000
    style ETAPE4 fill:#fff8e1,stroke:#FFC107,stroke-width:2px,color:#000
    style ETAPE5 fill:#e0f2f1,stroke:#009688,stroke-width:2px,color:#000
|
|
</div>
|
|
<script src="https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"></script>
|
|
<script>
|
|
// Configure the Mermaid library (loaded by the preceding script tag) for the
// diagram container on this page.
mermaid.initialize({
    theme: 'default',

    // Ask Mermaid to render on page load instead of calling it manually.
    startOnLoad: true,

    // NOTE(review): 'loose' is Mermaid's most permissive security level.
    // The node labels in this page rely on inline HTML (bold tags, line
    // breaks); confirm whether a stricter level would still render them
    // before tightening this.
    securityLevel: 'loose',

    // Flowchart-specific rendering options.
    flowchart: {
        curve: 'basis',
        htmlLabels: true,
        useMaxWidth: true
    }
});
|
|
</script>
|
|
</body>
|
|
</html>
|