refactor: reorganize referentials, new extraction modules, clean up obsolete code

- Reorganize data/referentiels/: pdfs/, dicts/, user/ (unified structure)
- Fix "Source absente" badges on the referentials admin page
- Re-index COCOA 2025 (555 → 1451 chunks, 94% coverage)
- Fix VRAM OOM: embeddings forced onto CPU via T2A_EMBED_CPU
- New modules: document_router, docx_extractor, image_extractor, ocr_engine
- Completeness module (quality/completude.py + YAML config)
- DIM template (dimensional summary)
- Gunicorn config + t2a-viewer systemd service
- Remove t2a_install_rag_cleanup/ (obsolete copy)
- Remove scripts/ and scripts_t2a_v2/ (old benchmarks)
- Remove 81 test _doc.txt files
- Ollama cache: configurable TTL, YAML loader fixes
- Dashboard: template improvements (base, index, detail, cpam, validation)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
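The VRAM OOM fix pins the embedding model to the CPU through the T2A_EMBED_CPU environment variable. Below is a minimal sketch of how such a switch can be wired; the variable name comes from the commit message, while the SentenceTransformer loader is an assumption, not the project's actual code:

import os

from sentence_transformers import SentenceTransformer

def load_embedder(model_name: str) -> SentenceTransformer:
    # T2A_EMBED_CPU=1 forces CPU inference, leaving VRAM to the LLM (assumed wiring).
    device = "cpu" if os.getenv("T2A_EMBED_CPU", "0") == "1" else None
    # device=None lets the library pick CUDA when available.
    return SentenceTransformer(model_name, device=device)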
src/main.py
@@ -26,12 +26,14 @@ from .config import (
 from .extraction.document_classifier import classify
 from .extraction.crh_parser import parse_crh
 from .extraction.document_splitter import split_documents
+from .extraction.document_router import SUPPORTED_EXTENSIONS, extract_document_with_pages
 from .extraction.pdf_extractor import extract_text, extract_text_with_pages
 from .extraction.trackare_parser import parse_trackare
 from .medical.cim10_extractor import extract_medical_info
 from .medical.ghm import estimate_ghm
 from .quality.veto_engine import apply_vetos
 from .quality.decision_engine import apply_decisions, decision_summaries
+from .quality.completude import build_completude_checklist
 from .quality.rules_router import build_rules_runtime_context
 
 logging.basicConfig(
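The document_router import above is the new single entry point for multi-format extraction. A hypothetical sketch of the dispatch it implies; the extension set and extractor names are inferred from the modules listed in the commit message (docx_extractor, image_extractor, ocr_engine), not read from the actual router:

from pathlib import Path

SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".jpg", ".jpeg", ".png", ".tif", ".tiff"}

def route(file_path: Path) -> str:
    """Return the name of the extractor to use (sketch, names assumed)."""
    ext = file_path.suffix.lower()
    if ext == ".pdf":
        return "pdf_extractor"
    if ext == ".docx":
        return "docx_extractor"
    if ext in SUPPORTED_EXTENSIONS:
        return "image_extractor"  # image formats go through ocr_engine
    raise ValueError(f"Format non supporté : {ext}")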
@@ -141,17 +143,19 @@ _use_edsnlp = True
 _use_rag = True
 
 
-def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, AnonymizationReport]]:
-    """Traite un PDF : extraction → splitting → parsing → anonymisation → extraction CIM-10.
+def process_document(file_path: Path) -> list[tuple[str, DossierMedical, AnonymizationReport]]:
+    """Traite un document : extraction → splitting → parsing → anonymisation → extraction CIM-10.
+
+    Supporte PDF, images (JPEG/PNG/TIFF) et DOCX via le router d'extraction.
 
     Retourne une liste de (texte_anonymisé, dossier, rapport) — un par dossier détecté.
     """
     t0 = time.time()
-    logger.info("Traitement de %s", pdf_path.name)
+    logger.info("Traitement de %s", file_path.name)
 
-    # 1. Extraction texte avec pages
-    raw_text, page_tracker = extract_text_with_pages(pdf_path)
-    logger.info(" Texte extrait : %d caractères", len(raw_text))
+    # 1. Extraction texte avec pages (multi-format)
+    raw_text, page_tracker, extraction_stats = extract_document_with_pages(file_path)
+    logger.info(" Texte extrait : %d caractères (%d pages, format=%s)", len(raw_text), extraction_stats.total_pages, extraction_stats.source_format)
 
     # 2. Classification
     doc_type = classify(raw_text)
@@ -160,7 +164,7 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
     # 3. Splitting multi-dossiers
     chunks = split_documents(raw_text, doc_type)
     if len(chunks) > 1:
-        logger.info(" Découpage : %d dossiers détectés dans %s", len(chunks), pdf_path.name)
+        logger.info(" Découpage : %d dossiers détectés dans %s", len(chunks), file_path.name)
 
     results: list[tuple[str, DossierMedical, AnonymizationReport]] = []
     for i, chunk_text in enumerate(chunks):
@@ -177,7 +181,7 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
         anonymizer = Anonymizer(parsed_data=parsed)
         anonymized_text = anonymizer.anonymize(chunk_text)
         report = anonymizer.report
-        report.source_file = pdf_path.name
+        report.source_file = file_path.name
         logger.info(
             " Anonymisation%s : %d remplacements (regex=%d, ner=%d, sweep=%d)",
             part_label,
@@ -197,10 +201,18 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
             parsed, anonymized_text, edsnlp_result, use_rag=_use_rag,
             page_tracker=page_tracker, raw_text=raw_text,
         )
-        dossier.source_file = pdf_path.name
+        dossier.source_file = file_path.name
         dossier.document_type = doc_type
         logger.info(" DP%s : %s", part_label, dossier.diagnostic_principal)
 
+        # Injection des stats d'extraction dans quality_flags
+        extraction_flags = extraction_stats.to_flags()
+        if extraction_flags:
+            dossier.quality_flags.update(extraction_flags)
+        extraction_alert = extraction_stats.to_alert()
+        if extraction_alert:
+            dossier.alertes_codage.append(extraction_alert)
+
         # 8. Vetos (contestabilité) + décisions (post-traitement)
         # Routage des règles (packs) : par défaut, on garde le socle vetos/decisions,
         # et on active des packs additionnels selon les signaux du dossier (codes/labs/extraits).
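The call sites above only need extraction_stats to expose total_pages, source_format, to_flags() and to_alert(). A guess at that minimal contract, purely illustrative; any field beyond those four is invented:

from dataclasses import dataclass

@dataclass
class ExtractionStats:
    total_pages: int
    source_format: str   # e.g. "pdf", "docx", "image"
    ocr_pages: int = 0   # hypothetical field

    def to_flags(self) -> dict[str, str]:
        # Assumed policy: only flag when OCR was involved.
        if self.ocr_pages:
            return {"extraction_ocr": f"{self.ocr_pages}/{self.total_pages} pages"}
        return {}

    def to_alert(self) -> str | None:
        if self.total_pages and self.ocr_pages == self.total_pages:
            return "QUALITE DEGRADEE : document entièrement océrisé"
        return None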
@@ -216,14 +228,17 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
             if rules_ctx.get("triggers_fired"):
                 logger.info(" Règles%s : triggers=%s", part_label, ",".join(rules_ctx["triggers_fired"]))
         except Exception:
-            logger.warning(" Routage règles : erreur", exc_info=True)
+            logger.error(" Routage règles : erreur", exc_info=True)
+            dossier.quality_flags["rules_routing"] = "error"
 
         veto = None
         try:
             veto = apply_vetos(dossier)
             dossier.veto_report = veto
         except Exception:
-            logger.warning(" Vetos : erreur lors du contrôle", exc_info=True)
+            logger.error(" Vetos : erreur lors du contrôle", exc_info=True)
+            dossier.quality_flags["veto_engine"] = "error"
+            dossier.alertes_codage.append("QUALITE DEGRADEE : moteur de vetos en erreur")
 
         try:
             apply_decisions(dossier)
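Every engine failure now logs at error level and stamps a quality flag, sometimes with a coding alert on top. The same pattern repeats across several except blocks; one way it could be factored (hypothetical helper, not part of the diff):

import logging

logger = logging.getLogger(__name__)

def mark_degraded(dossier, flag: str, alert: str | None = None) -> None:
    """Record an engine failure on the dossier (sketch; dossier API assumed)."""
    logger.error(" %s : erreur", flag, exc_info=True)
    dossier.quality_flags[flag] = "error"
    if alert is not None:
        dossier.alertes_codage.append(alert)

Each call site would then shrink to a single line, e.g. mark_degraded(dossier, "veto_engine", "QUALITE DEGRADEE : moteur de vetos en erreur").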
@@ -231,11 +246,18 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
             if veto is not None:
                 _inject_veto_alerts(dossier, veto, scope="PDF")
         except Exception:
-            logger.warning(" Décisions : erreur lors du post-traitement", exc_info=True)
+            logger.error(" Décisions : erreur lors du post-traitement", exc_info=True)
+            dossier.quality_flags["decision_engine"] = "error"
         finally:
             if rules_token is not None:
                 reset_rules_runtime(rules_token)
 
+        try:
+            dossier.completude = build_completude_checklist(dossier)
+        except Exception:
+            logger.error(" Complétude : erreur lors du contrôle", exc_info=True)
+            dossier.quality_flags["completude"] = "error"
+
         # 9. Estimation GHM (sur codes finaux) + métriques (actifs vs écartés)
         try:
             metrics = _compute_metrics(dossier)
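build_completude_checklist is the entry point of the new quality/completude.py module, driven by a YAML config per the commit message. A plausible shape for the checklist it returns; the field names and rules here are invented for illustration:

from dataclasses import dataclass

@dataclass
class CompletudeItem:
    champ: str          # e.g. "diagnostic_principal"
    present: bool
    obligatoire: bool

def build_completude_checklist(dossier) -> list[CompletudeItem]:
    # Illustrative only: the real required-field list lives in the YAML config.
    champs_requis = {"diagnostic_principal": True, "document_type": False}
    return [
        CompletudeItem(champ, getattr(dossier, champ, None) is not None, oblig)
        for champ, oblig in champs_requis.items()
    ]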
@@ -260,14 +282,17 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
                 ghm.ghm_approx or "?",
             )
         except Exception:
-            logger.warning(" Erreur estimation GHM/metrics", exc_info=True)
+            logger.error(" Erreur estimation GHM/metrics", exc_info=True)
+            dossier.quality_flags["ghm_estimation"] = "error"
+            dossier.alertes_codage.append("QUALITE DEGRADEE : estimation GHM en erreur")
 
         # 10. Finalizer DP (arbitrage Trackare vs CRH, traçabilité)
         try:
             from .medical.dp_finalizer import finalize_dp
             finalize_dp(dossier)
         except Exception:
-            logger.warning(" Finalizer DP : erreur", exc_info=True)
+            logger.error(" Finalizer DP : erreur", exc_info=True)
+            dossier.quality_flags["dp_finalizer"] = "error"
 
         dossier.processing_time_s = round(time.time() - t0, 2)
         results.append((anonymized_text, dossier, report))
@@ -276,6 +301,10 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
     return results
 
 
+# Alias backward-compatible
+process_pdf = process_document
+
+
 def _run_edsnlp(text: str):
     """Exécute l'analyse edsnlp avec fallback gracieux."""
     try:
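The module-level alias keeps existing callers of process_pdf working at zero cost. If a louder migration signal were ever wanted, a thin wrapper could emit a DeprecationWarning instead (sketch, assuming process_document is in scope):

import warnings

def process_pdf(pdf_path):
    warnings.warn(
        "process_pdf est renommé process_document",
        DeprecationWarning,
        stacklevel=2,
    )
    return process_document(pdf_path)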
@@ -351,13 +380,13 @@ def main(input_path: str | None = None) -> None:
     global _use_edsnlp, _use_rag
 
     parser = argparse.ArgumentParser(
-        description="Anonymisation de documents médicaux PDF et extraction CIM-10",
+        description="Anonymisation de documents médicaux et extraction CIM-10 (PDF, images, DOCX)",
     )
     parser.add_argument(
         "input",
         nargs="*",
         default=[input_path or "input/"],
-        help="Chemin(s) vers des PDFs, dossiers patients, ou le dossier racine (défaut: input/)",
+        help="Chemin(s) vers des documents, dossiers patients, ou le dossier racine (défaut: input/)",
     )
     parser.add_argument(
         "--no-ner",
@@ -459,6 +488,24 @@ def main(input_path: str | None = None) -> None:
     if args.no_rag:
         _use_rag = False
 
+    # Vérification FAISS obligatoire si RAG actif
+    if _use_rag:
+        from .medical.rag_index import check_faiss_ready
+        faiss_status = check_faiss_ready()
+        if faiss_status["ok"]:
+            total_vecs = faiss_status["ref"] + faiss_status["proc"] + faiss_status["bio"] + faiss_status["legacy"]
+            logger.info("FAISS OK : %d vecteurs (ref=%d, proc=%d, bio=%d)",
+                        total_vecs, faiss_status["ref"], faiss_status["proc"], faiss_status["bio"])
+        else:
+            for err in faiss_status["errors"]:
+                logger.error("FAISS : %s", err)
+            logger.error("FAISS non fonctionnel — le codage CIM-10 sera dégradé. "
+                         "Lancez : python3 -m src.main --rebuild-index")
+            print("\n*** ATTENTION : Index FAISS absent ou invalide ***")
+            print("*** Le RAG est désactivé — qualité de codage dégradée ***")
+            print("*** Corrigez avec : python3 -m src.main --rebuild-index ***\n")
+            _use_rag = False
+
     export_rum_flag = args.export_rum
 
     # Chargement contrôle CPAM (auto-détection ou flag explicite)
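check_faiss_ready returns a dict with an ok flag, per-index vector counts (ref, proc, bio, legacy) and an errors list. A sketch of what such a probe can look like using the real faiss API; the index directory and file naming are assumptions:

from pathlib import Path

import faiss  # faiss-cpu or faiss-gpu

def check_faiss_ready(index_dir: Path = Path("data/index")) -> dict:
    status: dict = {"ok": False, "errors": [], "ref": 0, "proc": 0, "bio": 0, "legacy": 0}
    for name in ("ref", "proc", "bio", "legacy"):
        path = index_dir / f"{name}.faiss"  # assumed layout
        if not path.exists():
            if name != "legacy":  # legacy index treated as optional (assumption)
                status["errors"].append(f"index manquant : {path}")
            continue
        try:
            status[name] = faiss.read_index(str(path)).ntotal  # vector count
        except Exception as exc:
            status["errors"].append(f"index illisible : {path} ({exc})")
    status["ok"] = not status["errors"] and status["ref"] > 0
    return status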
@@ -480,7 +527,14 @@ def main(input_path: str | None = None) -> None:
     input_paths = args.input
 
-    # Collecte des groupes (pdfs, subdir) à traiter
+    def _glob_supported(directory: Path) -> list[Path]:
+        """Collecte tous les fichiers supportés dans un dossier."""
+        files: list[Path] = []
+        for ext in sorted(SUPPORTED_EXTENSIONS):
+            files.extend(directory.glob(f"*{ext}"))
+        return sorted(set(files))
+
+    # Collecte des groupes (documents, subdir) à traiter
     groups: list[tuple[list[Path], str | None]] = []
 
     for p in input_paths:
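One caveat with _glob_supported: Path.glob is case-sensitive on Linux, so a scanned SCAN.PDF would be silently skipped. A tolerant variant that matches on the lowered suffix instead (sketch):

from pathlib import Path

def _glob_supported_ci(directory: Path, extensions: set[str]) -> list[Path]:
    """Case-insensitive variant of _glob_supported."""
    return sorted(
        p for p in directory.iterdir()
        if p.is_file() and p.suffix.lower() in extensions
    )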
@@ -490,47 +544,47 @@ def main(input_path: str | None = None) -> None:
             subdir = input_p.parent.name if input_p.parent.name != "input" else None
             groups.append(([input_p], subdir))
         elif input_p.is_dir():
-            # Vérifier s'il y a des PDFs directement dans ce dossier
-            root_pdfs = sorted(input_p.glob("*.pdf"))
-            # Vérifier s'il y a des sous-dossiers avec PDFs
-            sub_dirs = [c for c in sorted(input_p.iterdir()) if c.is_dir() and list(c.glob("*.pdf"))]
+            # Vérifier s'il y a des documents directement dans ce dossier
+            root_docs = _glob_supported(input_p)
+            # Vérifier s'il y a des sous-dossiers avec des documents
+            sub_dirs = [c for c in sorted(input_p.iterdir()) if c.is_dir() and _glob_supported(c)]
 
             if sub_dirs:
                 # C'est un dossier racine (comme input/) → traiter chaque sous-dossier
                 for child in sub_dirs:
-                    sub_pdfs = sorted(child.glob("*.pdf"))
-                    groups.append((sub_pdfs, child.name))
-            elif root_pdfs:
+                    sub_docs = _glob_supported(child)
+                    groups.append((sub_docs, child.name))
+            elif root_docs:
                 # C'est un dossier patient directement → utiliser son nom comme subdir
-                groups.append((root_pdfs, input_p.name))
+                groups.append((root_docs, input_p.name))
         else:
             logger.error("Chemin introuvable : %s", input_p)
             sys.exit(1)
 
-    total = sum(len(pdfs) for pdfs, _ in groups)
+    total = sum(len(docs) for docs, _ in groups)
     if total == 0:
-        logger.warning("Aucun PDF trouvé dans %s", input_p)
+        logger.warning("Aucun document supporté trouvé dans %s", input_p)
         sys.exit(0)
 
-    logger.info("Traitement de %d PDF(s)...", total)
+    logger.info("Traitement de %d document(s)...", total)
 
-    def _process_group(pdfs: list[Path], subdir: str | None) -> None:
-        """Traite un groupe de PDFs (un dossier patient)."""
+    def _process_group(docs: list[Path], subdir: str | None) -> None:
+        """Traite un groupe de documents (un dossier patient)."""
         if subdir:
-            logger.info("--- Dossier %s (%d PDFs) ---", subdir, len(pdfs))
+            logger.info("--- Dossier %s (%d documents) ---", subdir, len(docs))
 
         group_dossiers: list[DossierMedical] = []
-        for pdf_path in pdfs:
+        for doc_path in docs:
             try:
-                pdf_results = process_pdf(pdf_path)
-                stem = pdf_path.stem.replace(" ", "_")
-                multi = len(pdf_results) > 1
-                for part_idx, (anonymized_text, dossier, report) in enumerate(pdf_results):
+                doc_results = process_document(doc_path)
+                stem = doc_path.stem.replace(" ", "_")
+                multi = len(doc_results) > 1
+                for part_idx, (anonymized_text, dossier, report) in enumerate(doc_results):
                     part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
                     write_outputs(part_stem, anonymized_text, dossier, report, subdir=subdir, export_rum_flag=export_rum_flag)
                     group_dossiers.append(dossier)
             except Exception:
-                logger.exception("Erreur lors du traitement de %s", pdf_path.name)
+                logger.exception("Erreur lors du traitement de %s", doc_path.name)
 
         # Fusion multi-PDFs si plusieurs documents dans le même groupe
         merged = None
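For a root input/ directory, the collection logic above yields one group per patient sub-folder. An illustrative tree and the resulting groups value (paths and names are made up):

# input/
# ├── dupont/   contains crh.pdf and bio.docx
# └── martin/   contains sejour.pdf
#
# groups == [
#     ([Path("input/dupont/bio.docx"), Path("input/dupont/crh.pdf")], "dupont"),
#     ([Path("input/martin/sejour.pdf")], "martin"),
# ]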
@@ -611,6 +665,11 @@ def main(input_path: str | None = None) -> None:
         if rules_token is not None:
             reset_rules_runtime(rules_token)
 
+        try:
+            merged.completude = build_completude_checklist(merged)
+        except Exception:
+            logger.warning(" Complétude fusionné : erreur lors du contrôle", exc_info=True)
+
         # Re-estimer le GHM (sur codes finaux) + métriques (actifs vs écartés)
         try:
             metrics = _compute_metrics(merged)
@@ -660,8 +719,8 @@ def main(input_path: str | None = None) -> None:
         logger.info("Mode parallèle : %d workers", args.workers)
         with ThreadPoolExecutor(max_workers=args.workers) as executor:
             futures = {
-                executor.submit(_process_group, pdfs, subdir): subdir
-                for pdfs, subdir in groups
+                executor.submit(_process_group, docs, subdir): subdir
+                for docs, subdir in groups
             }
             for future in as_completed(futures):
                 try:
@@ -669,8 +728,8 @@ def main(input_path: str | None = None) -> None:
                 except Exception:
                     logger.exception("Erreur groupe %s", futures[future])
     else:
-        for pdfs, subdir in groups:
-            _process_group(pdfs, subdir)
+        for docs, subdir in groups:
+            _process_group(docs, subdir)
 
     logger.info("Terminé.")