# instantiate the pipeline
from pyannote.audio import Pipeline
import torch

import matplotlib.pyplot as plt
import librosa
import librosa.display

audio_path = "short_transcript.wav"

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="YOUR_HF_TOKEN",  # replace with your own Hugging Face access token
)

# move the pipeline to GPU when one is available
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))

# run the pipeline on an audio file
diarization = pipeline(audio_path, min_speakers=6, max_speakers=7)

# dump the diarization output to disk using RTTM format
with open("short_transcript.rttm", "w") as rttm:
    diarization.write_rttm(rttm)

# Load the audio file and compute its waveform
audio, sr = librosa.load(audio_path, sr=None)

# Plot the audio waveform
plt.figure(figsize=(10, 6))
librosa.display.waveshow(audio, sr=sr, alpha=0.5, color="gray")
plt.xlabel("Time (s)")
plt.ylabel("Amplitude")
plt.title("Speaker Diarization Results")

# Plot speaker segments, keeping one consistent colour per speaker
colors = plt.cm.tab10.colors
color_map = {label: colors[i % len(colors)] for i, label in enumerate(diarization.labels())}
for segment, _, label in diarization.itertracks(yield_label=True):
    # Get start and end times of each speaker segment
    start, end = segment.start, segment.end
    plt.plot([start, end], [0.9, 0.9], color=color_map[label], linewidth=4, label=label)

# Avoid duplicate labels in legend
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), loc="upper right")
plt.show()
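
# Optional: a minimal sketch of a text summary of the diarization result,
# built only from the itertracks() API already used above. It accumulates
# total speaking time per speaker and prints each speaker's share of the file.
from collections import defaultdict

speaking_time = defaultdict(float)
for segment, _, label in diarization.itertracks(yield_label=True):
    speaking_time[label] += segment.end - segment.start

total = sum(speaking_time.values())
for label, seconds in sorted(speaking_time.items(), key=lambda kv: -kv[1]):
    print(f"{label}: {seconds:6.1f} s ({100 * seconds / total:4.1f}%)")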
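
# Optional: a sketch of reading the RTTM file back for later analysis. This
# assumes pyannote.database is available (it ships as a dependency of
# pyannote.audio); load_rttm returns a dict mapping each file URI to a
# pyannote.core.Annotation, so we take the single annotation stored above.
from pyannote.database.util import load_rttm

annotations = load_rttm("short_transcript.rttm")
restored = next(iter(annotations.values()))  # only one file was written above
print(restored)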