import pathlib

import torch
import whisper
from pyannote.audio import Pipeline
from pyannote.core import Annotation

AUDIO_FILE = "2024-07-29_audio.wav"

# Base name of the audio file (no directory, no extension), used for output filenames
filename = pathlib.Path(AUDIO_FILE).stem
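
# Enable TensorFloat-32 matmuls and cuDNN kernels: on Ampere or newer GPUs this
# trades a small amount of numerical precision for noticeably faster inference.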
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Load Whisper model for transcription
whisper_model = whisper.load_model("large")
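# Note: "large" is the most accurate but also the slowest and most memory-hungry
# Whisper checkpoint; a smaller one such as "medium" works if GPU memory is tight.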


# Transcribe audio using Whisper
def transcribe_audio(audio_path):
    result = whisper_model.transcribe(audio_path)
    segments = result["segments"]
    return [(segment["start"], segment["end"], segment["text"]) for segment in segments]


# Initialize Pyannote Pipeline for diarization
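# Note: pyannote/speaker-diarization-3.1 is a gated model on Hugging Face; the
# first download typically requires accepting its terms and authenticating, e.g.
# Pipeline.from_pretrained(..., use_auth_token=<your HF access token>).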
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
pipeline.to(torch.device("cuda"))


# Perform diarization
def perform_diarization(audio_path) -> Annotation:
    diarization = pipeline(audio_path, min_speakers=5, max_speakers=7)

    # Dump the diarization output to disk using RTTM format
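    # (RTTM, "Rich Transcription Time Marked", is a plain-text format with
    # roughly one speech turn per line: file id, onset, duration, speaker label.)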
    with open(f"diarization_{filename}.rttm", "w") as rttm:
        diarization.write_rttm(rttm)

    print("Finished diarization")
    return diarization


# Load audio and perform both transcription and diarization
transcription_segments = transcribe_audio(AUDIO_FILE)
diarization: Annotation = perform_diarization(AUDIO_FILE)

# Print speaker and corresponding text
print("\nSpeaker and Text Segments:")
diarization_with_text = []
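
# For each transcribed segment, take the first diarization turn that overlaps it
# in time; a segment spanning several speakers keeps only the first match.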
for segment in transcription_segments:
    start, end, text = segment
    for spk_segment, _, speaker_label in diarization.itertracks(yield_label=True):
        if spk_segment.start < end and spk_segment.end > start:
            diarization_with_text.append(f"Speaker {speaker_label}: {text}")
            break

with open(f"transcript-diarization_{filename}.txt", "w") as fp:
    fp.writelines(f"{l}\n" for l in diarization_with_text)

print("\n".join(diarization_with_text))
print("Saved speaker-labelled transcript")