From 80a3dbe5148b99f3746956f7adb15c087a28c3fa Mon Sep 17 00:00:00 2001
From: Peter Nguyen
Date: Mon, 5 May 2025 21:02:42 -0400
Subject: [PATCH] Create webapp_wav2txt.py

---
 webapp_wav2txt.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 webapp_wav2txt.py

diff --git a/webapp_wav2txt.py b/webapp_wav2txt.py
new file mode 100644
index 0000000..0e25bc7
--- /dev/null
+++ b/webapp_wav2txt.py
@@ -0,0 +1,68 @@
+'''
+Before you run this, make sure these are installed:
+pip install torch
+pip install torchaudio
+pip install gradio
+pip install transformers
+
+You also need ffmpeg and ffprobe on your PATH: https://www.ffmpeg.org/download.html
+
+Finally, the first run downloads the openai/whisper-medium model, which is about 3 GB.
+'''
+
+import torch
+import torchaudio
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import gradio as gr
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+WHISPER_SAMPLE_RATE = 16000  # Whisper models expect 16 kHz mono audio
+
+processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
+model = WhisperForConditionalGeneration.from_pretrained(
+    "openai/whisper-medium"
+).to(DEVICE)
+
+
+def preprocess_audio(audio_path: str) -> torch.Tensor:
+    """Load an audio file and convert it to 16 kHz mono."""
+    audio, sample_rate = torchaudio.load(audio_path)
+    # Resample if necessary
+    if sample_rate != WHISPER_SAMPLE_RATE:
+        resampler = torchaudio.transforms.Resample(
+            orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
+        )
+        audio = resampler(audio)
+    # Average multi-channel audio down to mono
+    if audio.shape[0] > 1:
+        audio = torch.mean(audio, dim=0)
+    return audio.squeeze()
+
+
+def transcribe(audio_path: str) -> str:
+    audio_input = preprocess_audio(audio_path)
+    # The processor only extracts log-mel features; the transcription
+    # language is chosen at generation time, not here.
+    input_features = processor(
+        audio_input.numpy(),
+        sampling_rate=WHISPER_SAMPLE_RATE,
+        return_tensors="pt",
+    ).input_features.to(DEVICE)
+
+    forced_decoder_ids = processor.get_decoder_prompt_ids(
+        language="japanese", task="transcribe"
+    )
+    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription
+
+
+iface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath"),
+    outputs="text",
+    title="OpenAI Whisper - Speech Recognition",
+)
+
+if __name__ == "__main__":
+    iface.launch()
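
The script pins the decoder to Japanese transcription. Other decoding modes only change what is passed to model.generate; a sketch, reusing the processor, model, and input_features names from transcribe above:

# Transcribe English audio instead of Japanese:
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
# Or translate the Japanese speech into English text:
forced_decoder_ids = processor.get_decoder_prompt_ids(language="japanese", task="translate")
# Or omit forced_decoder_ids entirely and let Whisper auto-detect the language:
predicted_ids = model.generate(input_features)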
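
Whisper's encoder only sees about 30 seconds of audio per pass, so longer clips are silently truncated by the code above. A minimal workaround, shown here as a sketch that reuses the module-level names from the script, is to transcribe fixed 30-second windows and join the pieces (transcribe_long and CHUNK_SAMPLES are hypothetical names; words cut at chunk boundaries are ignored):

CHUNK_SAMPLES = 30 * WHISPER_SAMPLE_RATE  # one 30-second Whisper window

def transcribe_long(audio_path: str) -> str:
    # Transcribe each 30 s window separately and stitch the text together.
    audio = preprocess_audio(audio_path)
    pieces = []
    for start in range(0, audio.numel(), CHUNK_SAMPLES):
        chunk = audio[start:start + CHUNK_SAMPLES]
        features = processor(
            chunk.numpy(), sampling_rate=WHISPER_SAMPLE_RATE, return_tensors="pt"
        ).input_features.to(DEVICE)
        predicted_ids = model.generate(features)
        pieces.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
    return " ".join(piece.strip() for piece in pieces)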
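
For a quick smoke test without the browser UI, the transcription path can be driven directly, since launch() only runs when the file is executed as a script. A sketch, assuming webapp_wav2txt.py sits in the working directory; "sample.wav" is a placeholder for any audio file ffmpeg can decode:

# hypothetical smoke test; "sample.wav" is a placeholder path
from webapp_wav2txt import transcribe

print(transcribe("sample.wav"))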