feat: parallelize pipeline with --workers N (ThreadPoolExecutor)

- Fix FAISS index thread safety (Lock + double-check on _loaded; pattern sketched below)
- Fix reranker thread safety (Lock + double-check on _reranker_model)
- main.py: --workers flag, _process_group() extraction, ThreadPoolExecutor
- benchmark_quality.py: --workers flag, pipeline subprocesses run in parallel
- Validated on 10 gold-standard dossiers with --workers 3: zero crashes, identical codes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
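
The first two items describe double-checked locking around lazy initialization. A minimal sketch of the pattern, assuming hypothetical names (get_reranker, load_model) rather than the repository's actual API:

import threading

_lock = threading.Lock()
_reranker_model = None

def load_model():
    # Stand-in for the expensive load (FAISS index, reranker weights, ...).
    return object()

def get_reranker():
    global _reranker_model
    # Fast path: skip the lock entirely once the model is loaded.
    if _reranker_model is None:
        with _lock:
            # Double-check: another thread may have loaded the model
            # while this one was waiting on the lock.
            if _reranker_model is None:
                _reranker_model = load_model()
    return _reranker_model

The unlocked fast path keeps the hot call cheap; the second check under the lock prevents two threads from each loading the model.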
dom committed 2026-02-20 01:30:51 +01:00
parent 0b94299975
commit 5cf7d74fa3
4 changed files with 109 additions and 37 deletions

@@ -594,6 +594,7 @@ def main():
parser.add_argument("--no-reprocess", action="store_true", help="Analyser les outputs existants sans relancer le pipeline")
parser.add_argument("--clean", action="store_true", help="Supprimer les outputs avant retraitement")
parser.add_argument("--seed", type=int, default=42, help="Seed pour la sélection aléatoire")
parser.add_argument("--workers", type=int, default=1, help="Nombre de dossiers traités en parallèle")
args = parser.parse_args()
# Sélection dossiers
@@ -632,23 +633,55 @@ def main():
     # Traitement
     per_dossier = []
-    for i, dossier_id in enumerate(dossiers, 1):
-        print(f" [{i}/{len(dossiers)}] {dossier_id}", end="", flush=True)
-        if args.no_reprocess:
-            duration = 0.0
-            success = find_merged_json(dossier_id) is not None
-            if not success:
-                print(" — pas de JSON")
-            else:
-                print(" — analyse existant")
-        else:
-            print(" — traitement...", end="", flush=True)
-            duration, success = run_pipeline(dossier_id, args.clean)
-            print(f" {duration:.1f}s {'✓' if success else '✗'}")
-        metrics = analyze_dossier(dossier_id, cim10, duration)
-        per_dossier.append(metrics)
+    total = len(dossiers)
+    if args.workers > 1 and not args.no_reprocess:
+        # Mode parallèle : exécuter les pipelines en parallèle puis analyser
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        print(f" Mode parallèle : {args.workers} workers")
+        pipeline_results: dict[str, tuple[float, bool]] = {}
+        done = 0
+        with ThreadPoolExecutor(max_workers=args.workers) as executor:
+            futures = {
+                executor.submit(run_pipeline, dossier_id, args.clean): dossier_id
+                for dossier_id in dossiers
+            }
+            for future in as_completed(futures):
+                dossier_id = futures[future]
+                try:
+                    duration, success = future.result()
+                except Exception as e:
+                    print(f" EXCEPTION {dossier_id}: {e}")
+                    duration, success = 0.0, False
+                pipeline_results[dossier_id] = (duration, success)
+                done += 1
+                mark = "✓" if success else "✗"
+                print(f" [{done}/{total}] {dossier_id} — {duration:.1f}s {mark}")
+        # Analyse séquentielle (ordre stable)
+        for dossier_id in dossiers:
+            duration, success = pipeline_results[dossier_id]
+            metrics = analyze_dossier(dossier_id, cim10, duration)
+            per_dossier.append(metrics)
+    else:
+        # Mode séquentiel (ou --no-reprocess)
+        for i, dossier_id in enumerate(dossiers, 1):
+            print(f" [{i}/{total}] {dossier_id}", end="", flush=True)
+            if args.no_reprocess:
+                duration = 0.0
+                success = find_merged_json(dossier_id) is not None
+                if not success:
+                    print(" — pas de JSON")
+                else:
+                    print(" — analyse existant")
+            else:
+                print(" — traitement...", end="", flush=True)
+                duration, success = run_pipeline(dossier_id, args.clean)
+                print(f" {duration:.1f}s {'✓' if success else '✗'}")
+            metrics = analyze_dossier(dossier_id, cim10, duration)
+            per_dossier.append(metrics)
     # Agrégation
     agg = compute_aggregate(per_dossier)
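
run_pipeline itself is outside this hunk. Per the commit message, benchmark_quality.py runs each dossier's pipeline as a subprocess, which is why a ThreadPoolExecutor suffices here: each worker thread spends its time blocked on a child process, so the GIL is not a constraint. A hedged sketch of such a wrapper, with a made-up command line (the real invocation lives in benchmark_quality.py and is not shown in this diff):

import subprocess
import time

def run_pipeline(dossier_id: str, clean: bool) -> tuple[float, bool]:
    # Hypothetical command: the actual pipeline invocation is not
    # visible in this diff.
    cmd = ["python", "main.py", "--dossier", dossier_id]
    if clean:
        cmd.append("--clean")
    start = time.monotonic()
    result = subprocess.run(cmd, capture_output=True)
    return time.monotonic() - start, result.returncode == 0

With that in place, something like python benchmark_quality.py --workers 3 exercises the parallel path added above, while the per-dossier analysis still runs sequentially afterwards so the report order stays stable.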