feat: découpage PDFs multi-dossiers (Trackare multi-épisodes, CRH concaténés)
Ajoute une étape de découpage (splitting) entre l'extraction du texte et le parsing.
Chaque chunk est traité indépendamment par le pipeline existant, avec un suffixe _partN en sortie lorsque plusieurs dossiers sont détectés.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
90
src/main.py
90
src/main.py
@@ -13,6 +13,7 @@ from .anonymization.anonymizer import Anonymizer
|
||||
from .config import ANONYMIZED_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical
|
||||
from .extraction.document_classifier import classify
|
||||
from .extraction.crh_parser import parse_crh
|
||||
from .extraction.document_splitter import split_documents
|
||||
from .extraction.pdf_extractor import extract_text
|
||||
from .extraction.trackare_parser import parse_trackare
|
||||
from .medical.cim10_extractor import extract_medical_info
|
||||
@@ -28,8 +29,11 @@ _use_edsnlp = True
|
||||
_use_rag = True
|
||||
|
||||
|
||||
def process_pdf(pdf_path: Path) -> tuple[str, DossierMedical, AnonymizationReport]:
|
||||
"""Traite un PDF : extraction → parsing → anonymisation → extraction CIM-10."""
|
||||
def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, AnonymizationReport]]:
|
||||
"""Traite un PDF : extraction → splitting → parsing → anonymisation → extraction CIM-10.
|
||||
|
||||
Retourne une liste de (texte_anonymisé, dossier, rapport) — un par dossier détecté.
|
||||
"""
|
||||
t0 = time.time()
|
||||
logger.info("Traitement de %s", pdf_path.name)
|
||||
|
||||
@@ -41,40 +45,53 @@ def process_pdf(pdf_path: Path) -> tuple[str, DossierMedical, AnonymizationRepor
|
||||
doc_type = classify(raw_text)
|
||||
logger.info(" Type de document : %s", doc_type)
|
||||
|
||||
# 3. Parsing
|
||||
if doc_type == "trackare":
|
||||
parsed = parse_trackare(raw_text)
|
||||
else:
|
||||
parsed = parse_crh(raw_text)
|
||||
# 3. Splitting multi-dossiers
|
||||
chunks = split_documents(raw_text, doc_type)
|
||||
if len(chunks) > 1:
|
||||
logger.info(" Découpage : %d dossiers détectés dans %s", len(chunks), pdf_path.name)
|
||||
|
||||
# 4. Anonymisation
|
||||
anonymizer = Anonymizer(parsed_data=parsed)
|
||||
anonymized_text = anonymizer.anonymize(raw_text)
|
||||
report = anonymizer.report
|
||||
report.source_file = pdf_path.name
|
||||
logger.info(
|
||||
" Anonymisation : %d remplacements (regex=%d, ner=%d, sweep=%d)",
|
||||
report.total_replacements,
|
||||
report.regex_replacements,
|
||||
report.ner_replacements,
|
||||
report.sweep_replacements,
|
||||
)
|
||||
results: list[tuple[str, DossierMedical, AnonymizationReport]] = []
|
||||
for i, chunk_text in enumerate(chunks):
|
||||
part_label = f" [part {i+1}/{len(chunks)}]" if len(chunks) > 1 else ""
|
||||
logger.info(" Traitement%s...", part_label)
|
||||
|
||||
# 5. Analyse edsnlp (optionnelle)
|
||||
edsnlp_result = None
|
||||
if _use_edsnlp:
|
||||
edsnlp_result = _run_edsnlp(anonymized_text)
|
||||
# 4. Parsing
|
||||
if doc_type == "trackare":
|
||||
parsed = parse_trackare(chunk_text)
|
||||
else:
|
||||
parsed = parse_crh(chunk_text)
|
||||
|
||||
# 6. Extraction médicale CIM-10
|
||||
dossier = extract_medical_info(parsed, anonymized_text, edsnlp_result, use_rag=_use_rag)
|
||||
dossier.source_file = pdf_path.name
|
||||
dossier.document_type = doc_type
|
||||
dossier.processing_time_s = round(time.time() - t0, 2)
|
||||
logger.info(" DP : %s", dossier.diagnostic_principal)
|
||||
logger.info(" DAS : %d, Actes : %d", len(dossier.diagnostics_associes), len(dossier.actes_ccam))
|
||||
logger.info(" Temps de traitement : %.2fs", dossier.processing_time_s)
|
||||
# 5. Anonymisation
|
||||
anonymizer = Anonymizer(parsed_data=parsed)
|
||||
anonymized_text = anonymizer.anonymize(chunk_text)
|
||||
report = anonymizer.report
|
||||
report.source_file = pdf_path.name
|
||||
logger.info(
|
||||
" Anonymisation%s : %d remplacements (regex=%d, ner=%d, sweep=%d)",
|
||||
part_label,
|
||||
report.total_replacements,
|
||||
report.regex_replacements,
|
||||
report.ner_replacements,
|
||||
report.sweep_replacements,
|
||||
)
|
||||
|
||||
return anonymized_text, dossier, report
|
||||
# 6. Analyse edsnlp (optionnelle)
|
||||
edsnlp_result = None
|
||||
if _use_edsnlp:
|
||||
edsnlp_result = _run_edsnlp(anonymized_text)
|
||||
|
||||
# 7. Extraction médicale CIM-10
|
||||
dossier = extract_medical_info(parsed, anonymized_text, edsnlp_result, use_rag=_use_rag)
|
||||
dossier.source_file = pdf_path.name
|
||||
dossier.document_type = doc_type
|
||||
dossier.processing_time_s = round(time.time() - t0, 2)
|
||||
logger.info(" DP%s : %s", part_label, dossier.diagnostic_principal)
|
||||
logger.info(" DAS : %d, Actes : %d", len(dossier.diagnostics_associes), len(dossier.actes_ccam))
|
||||
|
||||
results.append((anonymized_text, dossier, report))
|
||||
|
||||
logger.info(" Temps total : %.2fs", time.time() - t0)
|
||||
return results
|
||||
|
||||
|
||||
def _run_edsnlp(text: str):
|
||||
@@ -252,10 +269,13 @@ def main(input_path: str | None = None) -> None:
|
||||
group_dossiers: list[DossierMedical] = []
|
||||
for pdf_path in pdfs:
|
||||
try:
|
||||
anonymized_text, dossier, report = process_pdf(pdf_path)
|
||||
pdf_results = process_pdf(pdf_path)
|
||||
stem = pdf_path.stem.replace(" ", "_")
|
||||
write_outputs(stem, anonymized_text, dossier, report, subdir=subdir)
|
||||
group_dossiers.append(dossier)
|
||||
multi = len(pdf_results) > 1
|
||||
for part_idx, (anonymized_text, dossier, report) in enumerate(pdf_results):
|
||||
part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
|
||||
write_outputs(part_stem, anonymized_text, dossier, report, subdir=subdir)
|
||||
group_dossiers.append(dossier)
|
||||
except Exception:
|
||||
logger.exception("Erreur lors du traitement de %s", pdf_path.name)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user