From 0cdd1d9ec235b22e6cdf1858a1f5878578577fdb Mon Sep 17 00:00:00 2001
From: Peter Nguyen <peter@datacrunch.ca>
Date: Thu, 16 May 2024 21:51:53 -0400
Subject: [PATCH] added: comments with usage instructions.

---
 wav2txt_jp.py | 148 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 90 insertions(+), 58 deletions(-)

diff --git a/wav2txt_jp.py b/wav2txt_jp.py
index 1f435fa..dd4bd01 100644
--- a/wav2txt_jp.py
+++ b/wav2txt_jp.py
@@ -1,17 +1,45 @@
+# Usage:
+# python wav2txt_jp.py --folder="sounds"
+# python wav2txt_jp.py --folder="D:\Temp\sounds"
+# 
+# Ignore UserWarning: 1Torch was not compiled with flash attention.
+#
+
+'''
+Before you run this, make sure these are installed:
+pip install torch
+pip install torchaudio
+pip install transformers
+pip install PySoundFile
+
+(Optional) For non-WAV files, you also need the following in your PATH environment variable:
+https://www.ffmpeg.org/download.html
+ - ffmpeg
+ - ffprobe
+
+Finally, when you first run this, it'll download the openai/whisper-medium model, which is about 3GB.
+
+If you have a supported NVIDIA GPU, consider downloading the CUDA Toolkit first:
+https://developer.nvidia.com/cuda-downloads
+
+Then install torch using the command generated on this page instead:
+https://pytorch.org/get-started/locally/
+
+This should speed up the process.
+'''
+
 import os
 import torch
 import torchaudio
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import json
+import argparse
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 WHISPER_SAMPLE_RATE = 16000
 
 processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
-model = WhisperForConditionalGeneration.from_pretrained(
-    "openai/whisper-medium"
-).to(DEVICE)
-
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to(DEVICE)
 
 def preprocess_audio(audio_path: str) -> torch.Tensor:
     audio, sample_rate = torchaudio.load(audio_path)
@@ -26,7 +54,6 @@ def preprocess_audio(audio_path: str) -> torch.Tensor:
         audio = torch.mean(audio, dim=0)
     return audio.squeeze()
 
-
 def transcribe(audio_path: str) -> str:
     audio_input = preprocess_audio(audio_path)
     input_features = processor(
@@ -40,60 +67,65 @@ def transcribe(audio_path: str) -> str:
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
     return transcription
 
-# Root directory containing the subfolders
-root_directory = "Logo"
+def main(root_directory):
+    # Function to extract the numerical part of the filename
+    def extract_number(filename):
+        return int(''.join(filter(str.isdigit, filename)))
 
-# Function to extract the numerical part of the filename
-def extract_number(filename):
-    return int(''.join(filter(str.isdigit, filename)))
+    # Function to transcribe audio
+    def transcribe_audio(audio_path):
+        try:
+            transcription = transcribe(audio_path)
+        except Exception as e:
+            transcription = f"Error: {e}"
+        return transcription
 
-# Function to transcribe audio
-def transcribe_audio(audio_path):
-    try:
-        transcription = transcribe(audio_path)
-        print(f"{audio_path}: {transcription}")
-    except Exception as e:
-        transcription = f"Error: {e}"
-        print(f"{audio_path}: {transcription}")
-    return transcription
-
-# If there are no subfolders, process files in the root directory directly
-if not any(os.path.isdir(os.path.join(root_directory, subdir)) for subdir in os.listdir(root_directory)):
-    subdir_path = root_directory
-    subfolders = [""]
-else:
-    subfolders = [subdir for subdir in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, subdir))]
-
-for subdir in subfolders:
-    subdir_path = os.path.join(root_directory, subdir)
-    results = {}
-    i = 1
-
-    # Iterate over .wav files in the current directory
-    for filename in os.listdir(subdir_path):
-        if filename.endswith(".wav"):
-            audio_path = os.path.join(subdir_path, filename)
-            # Transcribe the audio
-            transcription = transcribe_audio(audio_path)
-
-            # Store the result text in the dictionary
-            results[filename] = transcription
-
-            print("Transcribed {} ({}/{})".format(filename, i, len(os.listdir(subdir_path))))
-            i += 1
-
-    # Sort the results by filename numerically
-    sorted_results = {k: results[k] for k in sorted(results, key=extract_number)}
-
-    # Output JSON file path for the current directory
-    if subdir == "":
-        output_file = os.path.join(root_directory + ".json")
+    # If there are no subfolders, process files in the root directory directly
+    if not any(os.path.isdir(os.path.join(root_directory, subdir)) for subdir in os.listdir(root_directory)):
+        subdir_path = root_directory
+        subfolders = [""]
     else:
-        output_file = os.path.join(root_directory, f"{subdir}.json")
+        subfolders = [subdir for subdir in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, subdir))]
 
-    # Write the sorted results to a JSON file with non-ASCII characters preserved
-    with open(output_file, "w", encoding="utf-8") as json_file:
-        json.dump(sorted_results, json_file, indent=4, ensure_ascii=False)
+    for subdir in subfolders:
+        subdir_path = os.path.join(root_directory, subdir)
+        results = {}
+        i = 1
 
-    print("Transcription results saved to", output_file)
-    
+        # Iterate over .wav files in the current directory
+        for filename in os.listdir(subdir_path):
+            if filename.endswith(".wav"):
+                audio_path = os.path.join(subdir_path, filename)
+                # Transcribe the audio
+                transcription = transcribe_audio(audio_path)
+
+                # Store the result text in the dictionary
+                results[filename] = transcription
+
+                print("Transcribed {} ({}/{}): {}".format(filename, i, len(os.listdir(subdir_path)), results[filename]))
+                i += 1
+
+        # Sort the results by filename numerically
+        sorted_results = {k: results[k] for k in sorted(results, key=extract_number)}
+
+        # Output JSON file path for the current directory
+        if subdir == "":
+            output_file = os.path.join(root_directory + ".json")
+        else:
+            output_file = os.path.join(root_directory, f"{subdir}.json")
+
+        # Write the sorted results to a JSON file with non-ASCII characters preserved
+        with open(output_file, "w", encoding="utf-8") as json_file:
+            json.dump(sorted_results, json_file, indent=4, ensure_ascii=False)
+
+        print("Transcription results saved to", output_file)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Transcribe .wav files in a directory and save results to JSON files.")
+    parser.add_argument("--folder", type=str, help="Path to the root folder containing .wav files.")
+    args = parser.parse_args()
+
+    if args.folder:
+        main(args.folder)
+    else:
+        print("Error: Please provide the path to the root folder using the --folder argument.")