feat: parallelize pipeline with --workers N (ThreadPoolExecutor)

- Fix FAISS index thread safety (Lock + double-check on _loaded; pattern sketched below)
- Fix reranker thread safety (Lock + double-check on _reranker_model)
- main.py: --workers flag, _process_group() extraction, ThreadPoolExecutor
- benchmark_quality.py: --workers flag, pipeline subprocesses run in parallel
- Validated on 10 gold-standard dossiers with --workers 3: zero crashes, identical codes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
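
The first two items describe double-checked locking around lazy initialization. A minimal sketch of the pattern, assuming hypothetical names (get_reranker, load_model) rather than the repository's actual API:

import threading

_lock = threading.Lock()
_reranker_model = None

def load_model():
    # Stand-in for the expensive load (FAISS index, reranker weights, ...).
    return object()

def get_reranker():
    global _reranker_model
    # Fast path: skip the lock entirely once the model is loaded.
    if _reranker_model is None:
        with _lock:
            # Double-check: another thread may have loaded the model
            # while this one was waiting on the lock.
            if _reranker_model is None:
                _reranker_model = load_model()
    return _reranker_model

The unlocked fast path keeps the hot call cheap; the second check under the lock prevents two threads from each loading the model.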
dom committed 2026-02-20 01:30:51 +01:00
parent 0b94299975
commit 5cf7d74fa3
4 changed files with 109 additions and 37 deletions

@@ -594,6 +594,7 @@ def main():
parser.add_argument("--no-reprocess", action="store_true", help="Analyser les outputs existants sans relancer le pipeline")
parser.add_argument("--clean", action="store_true", help="Supprimer les outputs avant retraitement")
parser.add_argument("--seed", type=int, default=42, help="Seed pour la sélection aléatoire")
parser.add_argument("--workers", type=int, default=1, help="Nombre de dossiers traités en parallèle")
args = parser.parse_args()
# Sélection dossiers
@@ -632,23 +633,55 @@ def main():
     # Traitement
     per_dossier = []
-    for i, dossier_id in enumerate(dossiers, 1):
-        print(f" [{i}/{len(dossiers)}] {dossier_id}", end="", flush=True)
-        if args.no_reprocess:
-            duration = 0.0
-            success = find_merged_json(dossier_id) is not None
-            if not success:
-                print(" — pas de JSON")
-            else:
-                print(" — analyse existant")
-        else:
-            print(" — traitement...", end="", flush=True)
-            duration, success = run_pipeline(dossier_id, args.clean)
-            print(f" {duration:.1f}s {'✓' if success else '✗'}")
-        metrics = analyze_dossier(dossier_id, cim10, duration)
-        per_dossier.append(metrics)
+    total = len(dossiers)
+    if args.workers > 1 and not args.no_reprocess:
+        # Mode parallèle : exécuter les pipelines en parallèle puis analyser
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        print(f" Mode parallèle : {args.workers} workers")
+        pipeline_results: dict[str, tuple[float, bool]] = {}
+        done = 0
+        with ThreadPoolExecutor(max_workers=args.workers) as executor:
+            futures = {
+                executor.submit(run_pipeline, dossier_id, args.clean): dossier_id
+                for dossier_id in dossiers
+            }
+            for future in as_completed(futures):
+                dossier_id = futures[future]
+                try:
+                    duration, success = future.result()
+                except Exception as e:
+                    print(f" EXCEPTION {dossier_id}: {e}")
+                    duration, success = 0.0, False
+                pipeline_results[dossier_id] = (duration, success)
+                done += 1
+                mark = "✓" if success else "✗"
+                print(f" [{done}/{total}] {dossier_id} — {duration:.1f}s {mark}")
+        # Analyse séquentielle (ordre stable)
+        for dossier_id in dossiers:
+            duration, success = pipeline_results[dossier_id]
+            metrics = analyze_dossier(dossier_id, cim10, duration)
+            per_dossier.append(metrics)
+    else:
+        # Mode séquentiel (ou --no-reprocess)
+        for i, dossier_id in enumerate(dossiers, 1):
+            print(f" [{i}/{total}] {dossier_id}", end="", flush=True)
+            if args.no_reprocess:
+                duration = 0.0
+                success = find_merged_json(dossier_id) is not None
+                if not success:
+                    print(" — pas de JSON")
+                else:
+                    print(" — analyse existant")
+            else:
+                print(" — traitement...", end="", flush=True)
+                duration, success = run_pipeline(dossier_id, args.clean)
+                print(f" {duration:.1f}s {'✓' if success else '✗'}")
+            metrics = analyze_dossier(dossier_id, cim10, duration)
+            per_dossier.append(metrics)
     # Agrégation
     agg = compute_aggregate(per_dossier)
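
run_pipeline itself is outside this hunk. Per the commit message, benchmark_quality.py runs each dossier's pipeline as a subprocess, which is why a ThreadPoolExecutor suffices here: each worker thread spends its time blocked on a child process, so the GIL is not a constraint. A hedged sketch of such a wrapper, with a made-up command line (the real invocation lives in benchmark_quality.py and is not shown in this diff):

import subprocess
import time

def run_pipeline(dossier_id: str, clean: bool) -> tuple[float, bool]:
    # Hypothetical command: the actual pipeline invocation is not
    # visible in this diff.
    cmd = ["python", "main.py", "--dossier", dossier_id]
    if clean:
        cmd.append("--clean")
    start = time.monotonic()
    result = subprocess.run(cmd, capture_output=True)
    return time.monotonic() - start, result.returncode == 0

With that in place, something like python benchmark_quality.py --workers 3 exercises the parallel path added above, while the per-dossier analysis still runs sequentially afterwards so the report order stays stable.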