From fc68fc6f6b18479baedb6ed5f3f8d6fa5c8176c2 Mon Sep 17 00:00:00 2001 From: dom Date: Tue, 10 Feb 2026 18:45:09 +0100 Subject: [PATCH] feat: traitement des sous-dossiers patients avec sorties miroir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Permet d'organiser les PDFs en sous-répertoires (un niveau) dans le dossier d'entrée. Les sorties reflètent cette structure dans output/. Les PDFs à la racine continuent de fonctionner comme avant. Co-Authored-By: Claude Opus 4.6 --- src/main.py | 56 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/src/main.py b/src/main.py index 65ac81d..284eb42 100644 --- a/src/main.py +++ b/src/main.py @@ -98,15 +98,24 @@ def write_outputs( anonymized_text: str, dossier: DossierMedical, report: AnonymizationReport, + subdir: str | None = None, ) -> None: """Écrit les fichiers de sortie.""" + anon_dir = ANONYMIZED_DIR / subdir if subdir else ANONYMIZED_DIR + struct_dir = STRUCTURED_DIR / subdir if subdir else STRUCTURED_DIR + rep_dir = REPORTS_DIR / subdir if subdir else REPORTS_DIR + + anon_dir.mkdir(parents=True, exist_ok=True) + struct_dir.mkdir(parents=True, exist_ok=True) + rep_dir.mkdir(parents=True, exist_ok=True) + # Texte anonymisé - anon_path = ANONYMIZED_DIR / f"{stem}_anonymized.txt" + anon_path = anon_dir / f"{stem}_anonymized.txt" anon_path.write_text(anonymized_text, encoding="utf-8") logger.info(" → %s", anon_path) # JSON structuré - json_path = STRUCTURED_DIR / f"{stem}_cim10.json" + json_path = struct_dir / f"{stem}_cim10.json" json_path.write_text( dossier.model_dump_json(indent=2, exclude_none=True), encoding="utf-8", @@ -114,7 +123,7 @@ def write_outputs( logger.info(" → %s", json_path) # Rapport d'anonymisation - report_path = REPORTS_DIR / f"{stem}_report.json" + report_path = rep_dir / f"{stem}_report.json" report_path.write_text( report.model_dump_json(indent=2), encoding="utf-8", @@ -164,27 +173,46 @@ def main(input_path: str | None = None) -> None: _use_rag = False input_p = Path(args.input) + + # Collecte des groupes (pdfs, subdir) à traiter + groups: list[tuple[list[Path], str | None]] = [] + if input_p.is_file(): - pdfs = [input_p] + groups.append(([input_p], None)) elif input_p.is_dir(): - pdfs = sorted(input_p.glob("*.pdf")) + # PDFs à la racine + root_pdfs = sorted(input_p.glob("*.pdf")) + if root_pdfs: + groups.append((root_pdfs, None)) + + # Sous-dossiers directs (un seul niveau) + for child in sorted(input_p.iterdir()): + if child.is_dir(): + sub_pdfs = sorted(child.glob("*.pdf")) + if sub_pdfs: + groups.append((sub_pdfs, child.name)) else: logger.error("Chemin introuvable : %s", input_p) sys.exit(1) - if not pdfs: + total = sum(len(pdfs) for pdfs, _ in groups) + if total == 0: logger.warning("Aucun PDF trouvé dans %s", input_p) sys.exit(0) - logger.info("Traitement de %d PDF(s)...", len(pdfs)) + logger.info("Traitement de %d PDF(s)...", total) - for pdf_path in pdfs: - try: - anonymized_text, dossier, report = process_pdf(pdf_path) - stem = pdf_path.stem.replace(" ", "_") - write_outputs(stem, anonymized_text, dossier, report) - except Exception: - logger.exception("Erreur lors du traitement de %s", pdf_path.name) + for pdfs, subdir in groups: + if subdir: + logger.info("--- Dossier %s (%d PDFs) ---", subdir, len(pdfs)) + + for pdf_path in pdfs: + try: + anonymized_text, dossier, report = process_pdf(pdf_path) + stem = pdf_path.stem.replace(" ", "_") + write_outputs(stem, anonymized_text, dossier, report, subdir=subdir) + except Exception: + logger.exception("Erreur lors du traitement de %s", pdf_path.name) logger.info("Terminé.")