Add speaker diarization using the pyannote pipeline

2024-11-10 23:28:54 +01:00 · 2024-11-10 23:28:54 +01:00 · b71c78c5f2
commit b71c78c5f2
parent cc2d6f8210
1 changed file with 34 additions and 70 deletions
--- a/tavern_talk/diarization.py
+++ b/tavern_talk/diarization.py
@@ -1,82 +1,46 @@
-audio_file = "./tavern_talk/short_transcript.wav"
+# instantiate the pipeline
-
+from pyannote.audio import Pipeline
 import torchaudio
 import torch
 from speechbrain.inference.classifiers import EncoderClassifier
 from scipy.cluster.vq import kmeans2
 import numpy as np
 import matplotlib.pyplot as plt
-# Load the speaker encoder model
+audio_path = "short_transcript.wav"
-classifier = EncoderClassifier.from_hparams(
+
-    source="speechbrain/spkrec-xvect-voxceleb", savedir="tmp_spkrec"
+pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="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",
 )
 # Load the ASR model from torchaudio
 asr_model = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H.get_model()
-# Define the audio file path
+# run the pipeline on an audio file
-signal, fs = torchaudio.load(audio_file)
+diarization = pipeline(audio_path, min_speakers=6, max_speakers=7)
-# Segment the audio into 1-second chunks with a 50% overlap for speaker embeddings
+# dump the diarization output to disk using RTTM format
-window_size = int(fs * 1.0)
+with open("short_transcript.rttm", "w") as rttm:
-overlap = int(fs * 0.5)
+    diarization.write_rttm(rttm)
 segments = []
 embeddings = []
 for start in range(0, signal.shape[1] - window_size, overlap):
    segment = signal[:, start : start + window_size]
    segments.append((start / fs, (start + window_size) / fs))
    embedding = classifier.encode_batch(segment)
    embeddings.append(embedding.squeeze(0).detach().cpu().numpy())
 # Convert embeddings to a 2D numpy array (num_segments x embedding_size)
 embeddings = np.vstack(embeddings)
 # Perform KMeans clustering on 2D embeddings
 centroids, labels = kmeans2(embeddings, k=6)  # Adjust 'k' based on number of speakers
 # Output diarization results with speaker labels and timestamps
 print("Diarization Results:")
 for i, (start, end) in enumerate(segments):
    print(f"{start:.2f}s - {end:.2f}s: Speaker {labels[i]}")
 # Perform ASR on the entire audio file and display the result
 with torch.inference_mode():
    asr_transcription = asr_model(signal)[0]  # Extract only the transcription result
    asr_text = asr_transcription.tolist()
 print("\nTranscription Results:")
 print(asr_text)
-# Optional: plot audio waveform with speaker probabilities
+import matplotlib.pyplot as plt
-def plot_diarization_with_audio(signal, fs, segments, labels):
+import librosa
-    # Plot audio waveform
+import librosa.display
-    plt.figure(figsize=(12, 6))
+
-    time = torch.arange(0, signal.shape[1]) / fs
+# Load the audio file and compute its waveform
-    plt.subplot(2, 1, 1)
+audio, sr = librosa.load(audio_path, sr=None)
-    plt.plot(time, signal.t().numpy())
+
-    plt.title("Audio Waveform")
+# Plot the audio waveform
 plt.figure(figsize=(10, 6))
 librosa.display.waveshow(audio, sr=sr, alpha=0.5, color="gray")
 plt.xlabel("Time (s)")
 plt.ylabel("Amplitude")
 plt.title("Speaker Diarization Results")
-    # Plot speaker diarization
+# Plot speaker segments
-    plt.subplot(2, 1, 2)
+for segment, _, label in diarization.itertracks(yield_label=True):
-    for i, (start, end) in enumerate(segments):
+    # Get start and end times of each speaker segment
-        speaker_label = labels[i]
+    start, end = segment.start, segment.end
-        plt.plot(
+    plt.plot([start, end], [0.9, 0.9], label=f"Speaker {label}")
-            [start, end],
+
-            [speaker_label, speaker_label],
+# Avoid duplicate labels in legend
-            label=f"Speaker {speaker_label}",
+handles, labels = plt.gca().get_legend_handles_labels()
-            linewidth=4,
+by_label = dict(zip(labels, handles))
-        )
+plt.legend(by_label.values(), by_label.keys(), loc="upper right")
    plt.xlabel("Time (s)")
    plt.ylabel("Speaker")
    plt.title("Speaker Diarization with Probability")
 plt.show()
 plot_diarization_with_audio(signal, fs, segments, labels)