import gc
import os
import sys
import time

import librosa
import torch
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline


def _pick_speaker(segment, diarization):
    """Return the diarization speaker label with the largest temporal overlap
    with *segment*, or the fallback label "VOIX" when nothing overlaps.

    Args:
        segment: a faster-whisper segment with ``start``/``end`` attributes.
        diarization: a pyannote ``Annotation`` supporting ``itertracks``.
    """
    speaker = "VOIX"
    max_overlap = 0
    for turn, _, speaker_id in diarization.itertracks(yield_label=True):
        # Overlap length between [segment.start, segment.end] and the turn;
        # negative when the intervals are disjoint, so it never wins.
        overlap = min(segment.end, turn.end) - max(segment.start, turn.start)
        if overlap > max_overlap:
            max_overlap = overlap
            speaker = speaker_id
    return speaker


def run_diarization_and_transcription(audio_file, hf_token):
    """Diarize then transcribe *audio_file*, emitting progress markers.

    Version 3.0: ultra-precise feedback for a progress bar. Progress is
    reported on stdout as ``[STATUS] PROGRESS:<pct>`` lines (1 → 30 for
    diarization, 30 → 95 during transcription, 100 on completion).

    Args:
        audio_file: path to the audio file to process.
        hf_token: Hugging Face token used to fetch the pyannote pipeline.

    Returns:
        The full transcript as one newline-joined string of
        ``[HH:MM:SS] SPEAKER: text`` lines, or ``None`` if transcription
        failed. Diarization failure is non-fatal: segments are then labeled
        with the generic "VOIX" speaker.
    """
    duration = librosa.get_duration(path=audio_file)
    print("[STATUS] PROGRESS:1")

    # 1. DIARIZATION (voice analysis) — best-effort: on failure we continue
    # without speaker labels instead of aborting the whole run.
    print("[PHASE 1/3] Diarisation (Analyse des voix)...")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", token=hf_token
        )
        pipeline.to(torch.device("cpu"))
        raw_result = pipeline(audio_file)
        # Some pyannote versions wrap the Annotation in a result object
        # exposing an ``annotation`` attribute; unwrap when present.
        diarization = getattr(raw_result, "annotation", raw_result)
        # Free the pipeline before loading the (large) Whisper model.
        del pipeline
        gc.collect()
    except Exception as e:
        print(f"[ERROR] Diarisation : {e}")
        diarization = None

    print("[STATUS] PROGRESS:30")

    # 2. TRANSCRIPTION with faster-whisper.
    print("[PHASE 2/3] Transcription (Modèle Large-v3-Turbo)...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # int8_float16 only makes sense on GPU; plain int8 on CPU.
    compute_type = "int8_float16" if device == "cuda" else "int8"
    try:
        model = WhisperModel(
            "large-v3-turbo",
            device=device,
            compute_type=compute_type,
            cpu_threads=16,
        )
        segments, _ = model.transcribe(
            audio_file, beam_size=5, language="fr", word_timestamps=True
        )
        whisper_segments = []
        for segment in segments:
            # Dynamic progress mapped onto 30%..95%. Guard against a zero
            # duration (empty/corrupt file) and clamp at 95 because Whisper
            # timestamps can slightly overshoot the measured duration.
            pct = 30 + 65 * (segment.end / duration) if duration else 30
            print(f"[STATUS] PROGRESS:{min(int(pct), 95)}")

            # ``is not None`` — an *empty* Annotation is falsy, but it still
            # means diarization ran; only a failure (None) skips labeling.
            if diarization is not None:
                speaker = _pick_speaker(segment, diarization)
            else:
                speaker = "VOIX"

            timestamp = f"[{time.strftime('%H:%M:%S', time.gmtime(segment.start))}]"
            line = f"{timestamp} {speaker}: {segment.text.strip()}"
            print(line)
            whisper_segments.append(line)

        # Release the model and any GPU memory before returning.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception as e:
        print(f"[ERROR] Transcription : {e}")
        return None

    print("[STATUS] PROGRESS:100")
    return "\n".join(whisper_segments)


if __name__ == "__main__":
    hf_token = os.getenv("HF_TOKEN")
    if len(sys.argv) > 1:
        audio_file = sys.argv[1]
        result = run_diarization_and_transcription(audio_file, hf_token)
        if result:
            output_file = audio_file.rsplit(".", 1)[0] + "_diarized.txt"
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"\n[OK] Fini : {output_file}")
    else:
        # Previously the script exited silently when no argument was given.
        print("Usage: python <script> <audio_file>", file=sys.stderr)
        sys.exit(1)