Create webapp_wav2txt.py
webapp_wav2txt.py | 63 lines | new file
@@ -0,0 +1,63 @@
'''
Before you run this, make sure these are installed:

    pip install torch
    pip install torchaudio
    pip install gradio
    pip install transformers

You also need the following in your PATH environment variable
(https://www.ffmpeg.org/download.html):

    ffmpeg
    ffprobe

Finally, when you first run this, it'll download the openai/whisper-medium
model, which is about 3GB.
'''
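
# Not in the original commit: a small startup sanity check (a sketch) that
# fails fast when ffmpeg/ffprobe are missing from PATH, since decoding the
# uploaded audio depends on them.
import shutil

for tool in ("ffmpeg", "ffprobe"):
    if shutil.which(tool) is None:
        raise RuntimeError(
            f"{tool} not found in PATH; see https://www.ffmpeg.org/download.html"
        )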

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr

# Use the GPU when available; whisper-medium is very slow on CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Whisper models expect 16 kHz input audio.
WHISPER_SAMPLE_RATE = 16000

processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-medium"
).to(DEVICE)
def preprocess_audio(audio_path: str) -> torch.Tensor:
    """Load an audio file and return a mono 16 kHz waveform tensor."""
    audio, sample_rate = torchaudio.load(audio_path)
    # Resample if necessary
    if sample_rate != WHISPER_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
        )
        audio = resampler(audio)
    # Convert to mono by averaging the channels
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0)
    return audio.squeeze()
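
# Example usage (illustrative; "sample.wav" is a hypothetical local file):
#   wav = preprocess_audio("sample.wav")   # 1-D tensor, mono, 16 kHz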


def transcribe(audio_path: str) -> str:
    audio_input = preprocess_audio(audio_path)
    input_features = processor(
        audio_input.numpy(),
        sampling_rate=WHISPER_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features.to(DEVICE)

    # A language hint passed to the processor's feature-extraction call has no
    # effect; Whisper takes language/task via decoder prompt ids at generation
    # time instead.
    prompt_ids = processor.get_decoder_prompt_ids(
        language="japanese", task="transcribe"
    )
    predicted_ids = model.generate(input_features, forced_decoder_ids=prompt_ids)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
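

# Illustrative addition, not in the original commit: the same checkpoint can
# also translate Japanese speech to English by swapping the task in the
# decoder prompt. A minimal sketch:
def translate_to_english(audio_path: str) -> str:
    audio_input = preprocess_audio(audio_path)
    input_features = processor(
        audio_input.numpy(),
        sampling_rate=WHISPER_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features.to(DEVICE)
    prompt_ids = processor.get_decoder_prompt_ids(
        language="japanese", task="translate"
    )
    predicted_ids = model.generate(input_features, forced_decoder_ids=prompt_ids)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]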


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="OpenAI Whisper - Speech Recognition",
)
iface.launch()
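
# To use: run `python webapp_wav2txt.py`, then open the local URL that Gradio
# prints to the console (http://127.0.0.1:7860 by default) and upload or
# record a clip to transcribe.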