import torch
from pathlib import Path

import whisper
from pyannote.audio import Pipeline
from pyannote.core import Annotation

AUDIO_FILE = "2024-07-29_audio.wav"
# Base name without directories or extension, used for the output file names.
filename = Path(AUDIO_FILE).stem

# Allow TF32 matmuls/convolutions for faster inference on Ampere+ GPUs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Load Whisper model for transcription
whisper_model = whisper.load_model("large")


# Transcribe audio using Whisper, returning (start, end, text) tuples.
def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path)
    return [
        (segment["start"], segment["end"], segment["text"])
        for segment in result["segments"]
    ]


# Initialize the pyannote pipeline for diarization.
# Note: this pretrained pipeline is gated on Hugging Face; pass
# use_auth_token=<your HF token> to from_pretrained if it is not cached locally.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


# Perform diarization. The pipeline call returns an Annotation, not a Pipeline.
def perform_diarization(audio_path) -> Annotation:
    diarization = pipeline(audio_path, min_speakers=5, max_speakers=7)
    # Dump the diarization output to disk in RTTM format.
    with open(f"diarization_{filename}.rttm", "w") as rttm:
        diarization.write_rttm(rttm)
    print("Finished diarization")
    return diarization


# Run both transcription and diarization on the same file.
transcription_segments = transcribe_audio(AUDIO_FILE)
diarization: Annotation = perform_diarization(AUDIO_FILE)

# Match each transcribed segment to the first speaker turn that overlaps it
# in time, then emit "Speaker <label>: <text>" lines.
print("\nSpeaker and Text Segments:")
diarization_with_text = []
for start, end, text in transcription_segments:
    for spk_segment, _, speaker_label in diarization.itertracks(yield_label=True):
        if spk_segment.start < end and spk_segment.end > start:
            diarization_with_text.append(f"Speaker {speaker_label}: {text}")
            break

with open(f"transcript-diarization_{filename}.txt", "w") as fp:
    fp.writelines(f"{line}\n" for line in diarization_with_text)

print("\n".join(diarization_with_text))
print("Saved diarization")