You've already forked PythonLib
mirror of
https://github.com/lifebottle/PythonLib.git
synced 2026-02-13 15:25:50 -08:00
added: comments with usage instructions.
This commit is contained in:
148
wav2txt_jp.py
148
wav2txt_jp.py
@@ -1,17 +1,45 @@
|
||||
# Usage:
|
||||
# python wav2txt.py --folder="sounds"
|
||||
# python wav2txt.py --folder="D:\Temp\sounds"
|
||||
#
|
||||
# Ignore UserWarning: 1Torch was not compiled with flash attention.
|
||||
#
|
||||
|
||||
'''
|
||||
Before you run this, make sure these are installed:
|
||||
pip install torch
|
||||
pip install torchaudio
|
||||
pip install transformers
|
||||
pip install PySoundFile
|
||||
|
||||
(Optional) For non-WAV files, you also need the following in your PATH environment variable:
|
||||
https://www.ffmpeg.org/download.html
|
||||
- ffmpeg
|
||||
- ffprobe
|
||||
|
||||
Finally, when you first run this, it'll download the openai/whisper-medium model, which is about 3GB.
|
||||
|
||||
If you have a supported nVIDIA GPU, consider downloading the CUDA Toolkit first:
|
||||
https://developer.nvidia.com/cuda-downloads
|
||||
|
||||
Then install torch from the generated command from here instead:
|
||||
https://pytorch.org/get-started/locally/
|
||||
|
||||
This should speed up the process.
|
||||
'''
|
||||
|
||||
import os
|
||||
import torch
|
||||
import torchaudio
|
||||
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
||||
import json
|
||||
import argparse
|
||||
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
||||
|
||||
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
WHISPER_SAMPLE_RATE = 16000
|
||||
|
||||
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
|
||||
model = WhisperForConditionalGeneration.from_pretrained(
|
||||
"openai/whisper-medium"
|
||||
).to(DEVICE)
|
||||
|
||||
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to(DEVICE)
|
||||
|
||||
def preprocess_audio(audio_path: str) -> torch.Tensor:
|
||||
audio, sample_rate = torchaudio.load(audio_path)
|
||||
@@ -26,7 +54,6 @@ def preprocess_audio(audio_path: str) -> torch.Tensor:
|
||||
audio = torch.mean(audio, dim=0)
|
||||
return audio.squeeze()
|
||||
|
||||
|
||||
def transcribe(audio_path: str) -> str:
|
||||
audio_input = preprocess_audio(audio_path)
|
||||
input_features = processor(
|
||||
@@ -40,60 +67,65 @@ def transcribe(audio_path: str) -> str:
|
||||
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
||||
return transcription
|
||||
|
||||
# Root directory containing the subfolders
|
||||
root_directory = "Logo"
|
||||
def main(root_directory):
|
||||
# Function to extract the numerical part of the filename
|
||||
def extract_number(filename):
|
||||
return int(''.join(filter(str.isdigit, filename)))
|
||||
|
||||
# Function to extract the numerical part of the filename
|
||||
def extract_number(filename):
|
||||
return int(''.join(filter(str.isdigit, filename)))
|
||||
# Function to transcribe audio
|
||||
def transcribe_audio(audio_path):
|
||||
try:
|
||||
transcription = transcribe(audio_path)
|
||||
except Exception as e:
|
||||
transcription = f"Error: {e}"
|
||||
return transcription
|
||||
|
||||
# Function to transcribe audio
|
||||
def transcribe_audio(audio_path):
|
||||
try:
|
||||
transcription = transcribe(audio_path)
|
||||
print(f"{audio_path}: {transcription}")
|
||||
except Exception as e:
|
||||
transcription = f"Error: {e}"
|
||||
print(f"{audio_path}: {transcription}")
|
||||
return transcription
|
||||
|
||||
# If there are no subfolders, process files in the root directory directly
|
||||
if not any(os.path.isdir(os.path.join(root_directory, subdir)) for subdir in os.listdir(root_directory)):
|
||||
subdir_path = root_directory
|
||||
subfolders = [""]
|
||||
else:
|
||||
subfolders = [subdir for subdir in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, subdir))]
|
||||
|
||||
for subdir in subfolders:
|
||||
subdir_path = os.path.join(root_directory, subdir)
|
||||
results = {}
|
||||
i = 1
|
||||
|
||||
# Iterate over .wav files in the current directory
|
||||
for filename in os.listdir(subdir_path):
|
||||
if filename.endswith(".wav"):
|
||||
audio_path = os.path.join(subdir_path, filename)
|
||||
# Transcribe the audio
|
||||
transcription = transcribe_audio(audio_path)
|
||||
|
||||
# Store the result text in the dictionary
|
||||
results[filename] = transcription
|
||||
|
||||
print("Transcribed {} ({}/{})".format(filename, i, len(os.listdir(subdir_path))))
|
||||
i += 1
|
||||
|
||||
# Sort the results by filename numerically
|
||||
sorted_results = {k: results[k] for k in sorted(results, key=extract_number)}
|
||||
|
||||
# Output JSON file path for the current directory
|
||||
if subdir == "":
|
||||
output_file = os.path.join(root_directory + ".json")
|
||||
# If there are no subfolders, process files in the root directory directly
|
||||
if not any(os.path.isdir(os.path.join(root_directory, subdir)) for subdir in os.listdir(root_directory)):
|
||||
subdir_path = root_directory
|
||||
subfolders = [""]
|
||||
else:
|
||||
output_file = os.path.join(root_directory, f"{subdir}.json")
|
||||
subfolders = [subdir for subdir in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, subdir))]
|
||||
|
||||
# Write the sorted results to a JSON file with non-ASCII characters preserved
|
||||
with open(output_file, "w", encoding="utf-8") as json_file:
|
||||
json.dump(sorted_results, json_file, indent=4, ensure_ascii=False)
|
||||
for subdir in subfolders:
|
||||
subdir_path = os.path.join(root_directory, subdir)
|
||||
results = {}
|
||||
i = 1
|
||||
|
||||
print("Transcription results saved to", output_file)
|
||||
|
||||
# Iterate over .wav files in the current directory
|
||||
for filename in os.listdir(subdir_path):
|
||||
if filename.endswith(".wav"):
|
||||
audio_path = os.path.join(subdir_path, filename)
|
||||
# Transcribe the audio
|
||||
transcription = transcribe_audio(audio_path)
|
||||
|
||||
# Store the result text in the dictionary
|
||||
results[filename] = transcription
|
||||
|
||||
print("Transcribed {} ({}/{}): {}".format(filename, i, len(os.listdir(subdir_path)), results[filename]))
|
||||
i += 1
|
||||
|
||||
# Sort the results by filename numerically
|
||||
sorted_results = {k: results[k] for k in sorted(results, key=extract_number)}
|
||||
|
||||
# Output JSON file path for the current directory
|
||||
if subdir == "":
|
||||
output_file = os.path.join(root_directory + ".json")
|
||||
else:
|
||||
output_file = os.path.join(root_directory, f"{subdir}.json")
|
||||
|
||||
# Write the sorted results to a JSON file with non-ASCII characters preserved
|
||||
with open(output_file, "w", encoding="utf-8") as json_file:
|
||||
json.dump(sorted_results, json_file, indent=4, ensure_ascii=False)
|
||||
|
||||
print("Transcription results saved to", output_file)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Transcribe .wav files in a directory and save results to JSON files.")
|
||||
parser.add_argument("--folder", type=str, help="Path to the root folder containing .wav files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.folder:
|
||||
main(args.folder)
|
||||
else:
|
||||
print("Error: Please provide the path to the root folder using the --folder argument.")
|
||||
|
||||
Reference in New Issue
Block a user