#!/usr/bin/env bash
#
# Batch runner: processes up to N sub-directories of ROOT_INPUT that contain
# at least one PDF by invoking `python -m src.main <dir>` on each, then prints
# jq-based statistics over the JSON results belonging to this run only.
#
# Usage:
#   <script> [ROOT_INPUT] [N]
#     ROOT_INPUT  directory whose immediate sub-directories are scanned
#                 (default: input)
#     N           maximum number of directories to process (default: 50)
#
# Environment:
#   FORCE=1                reprocess even if the output JSON already exists
#   CLEAN=1                delete the directory's previous outputs before
#                          reprocessing (recommended together with FORCE)
#   RANDOM_PICK=1          pick N directories at random instead of sorted order
#   OLLAMA_MAX_PARALLEL    exported as-is (default: 1)
#   OLLAMA_CODER_MODEL     exported (default: gemma3:27b)
#   OLLAMA_VERIFIER_MODEL  exported (default: deepseek-v3.2:cloud)
#
# Outputs:
#   output/batch_runs/<run_id>/{summary.txt,ids.txt,files.txt}
#   output/batch_logs/<run_id>/<id>.log
#
# Exit status: 0 on success; 1 on setup errors or when at least one directory
# failed to process (statistics are still produced in that case).
set -euo pipefail

ROOT_INPUT="${1:-input}"
N="${2:-50}"

# Options
FORCE="${FORCE:-0}"
CLEAN="${CLEAN:-0}"
RANDOM_PICK="${RANDOM_PICK:-0}"
MAX_PARALLEL="${OLLAMA_MAX_PARALLEL:-1}"

# Models
export OLLAMA_CODER_MODEL="${OLLAMA_CODER_MODEL:-gemma3:27b}"
export OLLAMA_VERIFIER_MODEL="${OLLAMA_VERIFIER_MODEL:-deepseek-v3.2:cloud}"
export OLLAMA_MAX_PARALLEL="$MAX_PARALLEL"

# N is handed to `head -n`; reject anything that is not a plain integer up
# front instead of failing obscurely later.
if [[ ! "$N" =~ ^[0-9]+$ ]]; then
  echo "❌ N doit être un entier >= 0: $N" >&2
  exit 1
fi

# Always use the venv interpreter rather than whatever `python` is on PATH.
PY="./.venv/bin/python"
if [[ ! -x "$PY" ]]; then
  echo "❌ Venv introuvable: $PY" >&2
  echo " Active ton venv ou crée-le, puis relance." >&2
  exit 1
fi

RUN_ID="$(date +%Y%m%d_%H%M%S)"
RUN_DIR="output/batch_runs/$RUN_ID"
LOG_DIR="output/batch_logs/$RUN_ID"
mkdir -p "$RUN_DIR" "$LOG_DIR"

SUMMARY_FILE="$RUN_DIR/summary.txt"
IDS_FILE="$RUN_DIR/ids.txt"
FILES_FILE="$RUN_DIR/files.txt"

# Start from empty files so later reads never hit a missing file
# (e.g. when N=0 selects no directory at all).
: > "$SUMMARY_FILE"
: > "$IDS_FILE"
: > "$FILES_FILE"

#######################################
# Prints one line to stdout and appends it to the run summary.
# Called without arguments it emits an empty line.
# Globals:   SUMMARY_FILE (read)
# Arguments: the text to print
#######################################
summary() {
  printf '%s\n' "$*" | tee -a "$SUMMARY_FILE"
}

#######################################
# Reads "count<TAB>id" lines on stdin, orders them by count (descending) and
# prints "id<TAB>LABEL=count". The count is kept in its own leading column
# while sorting so that the numeric key is really numeric.
# Arguments: $1 - label placed before the count (e.g. HARD)
#            $2 - optional maximum number of lines to print (0 = no limit)
#######################################
rank_by_count() {
  local label="$1"
  local limit="${2:-0}"
  # The limit is applied inside awk (which consumes all of its input) rather
  # than with `head`, so no upstream stage can die of SIGPIPE under pipefail.
  sort -t $'\t' -k1,1nr \
    | awk -F '\t' -v label="$label" -v limit="$limit" \
        'limit == 0 || NR <= limit { printf "%s\t%s=%s\n", $2, label, $1 }'
}

summary "=== Batch Run: $RUN_ID ==="
summary "ROOT_INPUT=$ROOT_INPUT N=$N FORCE=$FORCE CLEAN=$CLEAN RANDOM_PICK=$RANDOM_PICK"
summary "CODER=$OLLAMA_CODER_MODEL VERIFIER=$OLLAMA_VERIFIER_MODEL OLLAMA_MAX_PARALLEL=$OLLAMA_MAX_PARALLEL"
summary

# Candidate directories = immediate sub-directories holding at least one *.pdf
mapfile -t ALL_DIRS < <(
  find "$ROOT_INPUT" -mindepth 1 -maxdepth 1 -type d -print \
    | while IFS= read -r d; do
        if compgen -G "$d/*.pdf" > /dev/null; then
          printf '%s\n' "$d"
        fi
      done
)

if [[ "${#ALL_DIRS[@]}" -eq 0 ]]; then
  echo "❌ Aucun dossier avec PDF trouvé dans: $ROOT_INPUT" >&2
  exit 1
fi

# Select N directories, either at random or in sorted order
if [[ "$RANDOM_PICK" == "1" ]]; then
  mapfile -t DOSSIERS < <(printf '%s\n' "${ALL_DIRS[@]}" | shuf | head -n "$N")
else
  mapfile -t DOSSIERS < <(printf '%s\n' "${ALL_DIRS[@]}" | sort | head -n "$N")
fi

summary "→ Dossiers sélectionnés: ${#DOSSIERS[@]}"

# Processing
FAILED=0
# The ${arr[@]+...} form keeps `set -u` quiet on an empty array (bash < 4.4).
for d in ${DOSSIERS[@]+"${DOSSIERS[@]}"}; do
  id="$(basename -- "$d")"
  # Expected result file of src.main; its presence is what marks a directory
  # as already processed.
  out_json="output/structured/$id/${id}_fusionne_cim10.json"
  log="$LOG_DIR/${id}.log"

  # Record the id so the final statistics cover this run only
  printf '%s\n' "$id" >> "$IDS_FILE"

  if [[ -f "$out_json" && "$FORCE" != "1" ]]; then
    summary "⏭️ SKIP $id (déjà traité)"
    continue
  fi

  if [[ "$CLEAN" == "1" ]]; then
    # Best effort: the directories may not exist yet. `${id:?}` guards against
    # an empty id turning this into a removal of the parent directories.
    rm -rf -- "output/structured/${id:?}" "output/reports/${id:?}" \
      "output/anonymized/${id:?}" 2> /dev/null || true
  fi

  summary "▶️ START $id"
  # Capture the status instead of letting `set -e` + pipefail abort the whole
  # batch: one failing directory must not prevent the others (and the final
  # statistics) from running.
  rc=0
  "$PY" -m src.main "$d" 2>&1 | tee "$log" || rc=$?
  if [[ "$rc" -eq 0 ]]; then
    summary "✅ DONE $id"
  else
    FAILED=$((FAILED + 1))
    summary "❌ FAIL $id (exit=$rc)"
  fi
done

EXIT_CODE=0
if [[ "$FAILED" -gt 0 ]]; then
  EXIT_CODE=1
  summary "→ Dossiers en échec: $FAILED"
fi

# Build the list of JSON files that actually exist for this run
while IFS= read -r id; do
  f="output/structured/$id/${id}_fusionne_cim10.json"
  if [[ -f "$f" ]]; then
    printf '%s\n' "$f" >> "$FILES_FILE"
  fi
done < "$IDS_FILE"

mapfile -t JSON_FILES < "$FILES_FILE"
COUNT_FILES="${#JSON_FILES[@]}"

summary
summary "→ JSON trouvés pour stats: $COUNT_FILES"

if [[ "$COUNT_FILES" -eq 0 ]]; then
  summary "⚠️ Aucun JSON pour stats. Fin."
  exit "$EXIT_CODE"
fi

summary
summary "=== STATS (sur ce run uniquement) ==="

# 1) Verdicts
# File names are passed NUL-delimited so that paths containing spaces or
# quotes reach jq intact.
summary "--- Verdicts ---"
printf '%s\0' "${JSON_FILES[@]}" \
  | xargs -0 jq -r '(.veto_report.verdict // "NO_REPORT")' \
  | sort | uniq -c | sort -nr \
  | tee -a "$SUMMARY_FILE"

# 2) Top VETOs (20 most frequent)
summary
summary "--- Top VETOs ---"
printf '%s\0' "${JSON_FILES[@]}" \
  | xargs -0 jq -r '.veto_report.issues[]?.veto' \
  | sort | uniq -c | sort -nr \
  | awk 'NR <= 20' \
  | tee -a "$SUMMARY_FILE"

# 3) HARD count per directory
summary
summary "--- Dossiers avec HARD ---"
while IFS= read -r f; do
  id="$(basename -- "$f" _fusionne_cim10.json)"
  hard="$(jq '[.veto_report.issues[]? | select(.severity=="HARD")] | length' "$f")"
  # An `if` (not `[[ … ]] && …`) keeps the loop's status at 0 when the last
  # file has no HARD issue; otherwise pipefail + errexit would end the script.
  if [[ "$hard" -gt 0 ]]; then
    printf '%s\t%s\n' "$hard" "$id"
  fi
done < "$FILES_FILE" \
  | rank_by_count HARD \
  | tee -a "$SUMMARY_FILE"

# 4) Downgrades (cim10_final != cim10_suggestion)
summary
summary "--- Downgrades (TOP 30) ---"
while IFS= read -r f; do
  id="$(basename -- "$f" _fusionne_cim10.json)"
  dw="$(jq '
    ([
      (.diagnostic_principal? | select(.cim10_final? and .cim10_suggestion? and .cim10_final != .cim10_suggestion) | 1),
      (.diagnostics_associes[]? | select(.cim10_final? and .cim10_suggestion? and .cim10_final != .cim10_suggestion) | 1)
    ] | add) // 0
  ' "$f")"
  if [[ "$dw" -gt 0 ]]; then
    printf '%s\t%s\n' "$dw" "$id"
  fi
done < "$FILES_FILE" \
  | rank_by_count DOWN 30 \
  | tee -a "$SUMMARY_FILE"

echo
echo "✅ Stats écrites dans: $RUN_DIR/summary.txt"
echo "📁 Logs dossier par dossier: $LOG_DIR/"

exit "$EXIT_CODE"