feat: traitement des sous-dossiers patients avec sorties miroir

Permet d'organiser les PDFs en sous-répertoires (un seul niveau de
profondeur) dans le dossier d'entrée. Les sorties reflètent cette structure
dans output/. Les PDFs à la racine continuent d'être traités comme avant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-10 18:45:09 +01:00
parent 4d6fbef2b9
commit fc68fc6f6b

View File

@@ -98,15 +98,24 @@ def write_outputs(
anonymized_text: str, anonymized_text: str,
dossier: DossierMedical, dossier: DossierMedical,
report: AnonymizationReport, report: AnonymizationReport,
subdir: str | None = None,
) -> None: ) -> None:
"""Écrit les fichiers de sortie.""" """Écrit les fichiers de sortie."""
anon_dir = ANONYMIZED_DIR / subdir if subdir else ANONYMIZED_DIR
struct_dir = STRUCTURED_DIR / subdir if subdir else STRUCTURED_DIR
rep_dir = REPORTS_DIR / subdir if subdir else REPORTS_DIR
anon_dir.mkdir(parents=True, exist_ok=True)
struct_dir.mkdir(parents=True, exist_ok=True)
rep_dir.mkdir(parents=True, exist_ok=True)
# Texte anonymisé # Texte anonymisé
anon_path = ANONYMIZED_DIR / f"{stem}_anonymized.txt" anon_path = anon_dir / f"{stem}_anonymized.txt"
anon_path.write_text(anonymized_text, encoding="utf-8") anon_path.write_text(anonymized_text, encoding="utf-8")
logger.info("%s", anon_path) logger.info("%s", anon_path)
# JSON structuré # JSON structuré
json_path = STRUCTURED_DIR / f"{stem}_cim10.json" json_path = struct_dir / f"{stem}_cim10.json"
json_path.write_text( json_path.write_text(
dossier.model_dump_json(indent=2, exclude_none=True), dossier.model_dump_json(indent=2, exclude_none=True),
encoding="utf-8", encoding="utf-8",
@@ -114,7 +123,7 @@ def write_outputs(
logger.info("%s", json_path) logger.info("%s", json_path)
# Rapport d'anonymisation # Rapport d'anonymisation
report_path = REPORTS_DIR / f"{stem}_report.json" report_path = rep_dir / f"{stem}_report.json"
report_path.write_text( report_path.write_text(
report.model_dump_json(indent=2), report.model_dump_json(indent=2),
encoding="utf-8", encoding="utf-8",
@@ -164,27 +173,46 @@ def main(input_path: str | None = None) -> None:
_use_rag = False _use_rag = False
input_p = Path(args.input) input_p = Path(args.input)
# Collecte des groupes (pdfs, subdir) à traiter
groups: list[tuple[list[Path], str | None]] = []
if input_p.is_file(): if input_p.is_file():
pdfs = [input_p] groups.append(([input_p], None))
elif input_p.is_dir(): elif input_p.is_dir():
pdfs = sorted(input_p.glob("*.pdf")) # PDFs à la racine
root_pdfs = sorted(input_p.glob("*.pdf"))
if root_pdfs:
groups.append((root_pdfs, None))
# Sous-dossiers directs (un seul niveau)
for child in sorted(input_p.iterdir()):
if child.is_dir():
sub_pdfs = sorted(child.glob("*.pdf"))
if sub_pdfs:
groups.append((sub_pdfs, child.name))
else: else:
logger.error("Chemin introuvable : %s", input_p) logger.error("Chemin introuvable : %s", input_p)
sys.exit(1) sys.exit(1)
if not pdfs: total = sum(len(pdfs) for pdfs, _ in groups)
if total == 0:
logger.warning("Aucun PDF trouvé dans %s", input_p) logger.warning("Aucun PDF trouvé dans %s", input_p)
sys.exit(0) sys.exit(0)
logger.info("Traitement de %d PDF(s)...", len(pdfs)) logger.info("Traitement de %d PDF(s)...", total)
for pdf_path in pdfs: for pdfs, subdir in groups:
try: if subdir:
anonymized_text, dossier, report = process_pdf(pdf_path) logger.info("--- Dossier %s (%d PDFs) ---", subdir, len(pdfs))
stem = pdf_path.stem.replace(" ", "_")
write_outputs(stem, anonymized_text, dossier, report) for pdf_path in pdfs:
except Exception: try:
logger.exception("Erreur lors du traitement de %s", pdf_path.name) anonymized_text, dossier, report = process_pdf(pdf_path)
stem = pdf_path.stem.replace(" ", "_")
write_outputs(stem, anonymized_text, dossier, report, subdir=subdir)
except Exception:
logger.exception("Erreur lors du traitement de %s", pdf_path.name)
logger.info("Terminé.") logger.info("Terminé.")