Merge pull request #868 from OpenShot/audio-waveformer

New AudioWaveformer Class (for generating fast, graph-friendly audio datasets)
2026-03-02 08:53:52 -08:00 · 2022-11-02 21:54:45 -05:00
parent 670baa5a35 4265d84ff9
commit d7ab1c61ec
7 changed files with 369 additions and 2 deletions
--- a/bindings/python/openshot.i
+++ b/bindings/python/openshot.i
@@ -9,7 +9,7 @@
 //
 // SPDX-License-Identifier: LGPL-3.0-or-later

-%module openshot
+%module("threads"=1) openshot

 /* Suppress warnings about ignored operator= */
 %warnfilter(362);
@@ -45,16 +45,19 @@
 %template() std::map<std::string, int>;
 %template() std::pair<int, int>;
 %template() std::vector<int>;
+%template() std::vector<float>;
 %template() std::pair<double, double>;
 %template() std::pair<float, float>;
 %template() std::pair<std::string, std::string>;
 %template() std::vector<std::pair<std::string, std::string>>;
+%template() std::vector<std::vector<float>>;

 %{
 #include "OpenShotVersion.h"
 #include "ReaderBase.h"
 #include "WriterBase.h"
 #include "AudioDevices.h"
+#include "AudioWaveformer.h"
 #include "CacheBase.h"
 #include "CacheDisk.h"
 #include "CacheMemory.h"
@@ -263,6 +266,7 @@
 %include "ReaderBase.h"
 %include "WriterBase.h"
 %include "AudioDevices.h"
+%include "AudioWaveformer.h"
 %include "CacheBase.h"
 %include "CacheDisk.h"
 %include "CacheMemory.h"
--- a/bindings/ruby/openshot.i
+++ b/bindings/ruby/openshot.i
@@ -9,7 +9,7 @@
 //
 // SPDX-License-Identifier: LGPL-3.0-or-later

-%module openshot
+%module("threads"=1) openshot

 /* Suppress warnings about ignored operator= */
 %warnfilter(362);
@@ -45,10 +45,12 @@
 %template() std::map<std::string, int>;
 %template() std::pair<int, int>;
 %template() std::vector<int>;
+%template() std::vector<float>;
 %template() std::pair<double, double>;
 %template() std::pair<float, float>;
 %template() std::pair<std::string, std::string>;
 %template() std::vector<std::pair<std::string, std::string>>;
+%template() std::vector<std::vector<float>>;

 %{
 /* Ruby and FFmpeg define competing RSHIFT macros,
@@ -63,6 +65,7 @@
 #include "ReaderBase.h"
 #include "WriterBase.h"
 #include "AudioDevices.h"
+#include "AudioWaveformer.h"
 #include "CacheBase.h"
 #include "CacheDisk.h"
 #include "CacheMemory.h"
@@ -133,6 +136,7 @@
 %include "ReaderBase.h"
 %include "WriterBase.h"
 %include "AudioDevices.h"
+%include "AudioWaveformer.h"
 %include "CacheBase.h"
 %include "CacheDisk.h"
 %include "CacheMemory.h"
--- a/src/AudioWaveformer.cpp
+++ b/src/AudioWaveformer.cpp
@@ -0,0 +1,124 @@
+/**
+ * @file
+ * @brief Source file for AudioWaveformer class
+ * @author Jonathan Thomas <jonathan@openshot.org>
+ *
+ * @ref License
+ */
+
+// Copyright (c) 2008-2022 OpenShot Studios, LLC
+//
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#include "AudioWaveformer.h"
+
+
+using namespace std;
+using namespace openshot;
+
+
+// Constructor: remember which reader to pull audio from. Only the pointer is
+// stored; nothing is opened or read at this point.
+AudioWaveformer::AudioWaveformer(ReaderBase* new_reader)
+    : reader{new_reader}
+{}
+
+// Destructor: nothing to release. The stored reader pointer is left untouched
+// (it is neither closed nor deleted here).
+AudioWaveformer::~AudioWaveformer()
+{
+}
+
+// Extract audio samples from any ReaderBase class
+//
+// Each run of (sample_rate / num_per_second) source samples becomes one output value:
+// its largest absolute amplitude (max_samples) and its mean absolute amplitude
+// (rms_samples). channel == -1 combines all channels. Returns an empty dataset when
+// there is no reader or num_per_second is not positive.
+AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
+    AudioWaveformData data;
+    if (!reader || num_per_second <= 0) {
+        return data;
+    }
+
+    // Capture has_video before opening the reader (if needed)
+    const bool does_reader_have_video = reader->info.has_video;
+    if (!reader->IsOpen()) {
+        reader->Open();
+    }
+
+    // Disable video for faster processing. The guard resumes the previous
+    // has_video value on every exit path, including an exception from GetFrame.
+    struct VideoFlagGuard {
+        ReaderBase* target;
+        bool previous;
+        ~VideoFlagGuard() { target->info.has_video = previous; }
+    } video_flag_guard{reader, does_reader_have_video};
+    reader->info.has_video = false;
+
+    // Use at least 1 source sample per output value; the plain division is 0 (and
+    // the cut-off test would then divide by zero) when num_per_second > sample_rate
+    const int sample_divisor = std::max<int>(1, reader->info.sample_rate / num_per_second);
+    const int total_samples = std::max<int>(0, static_cast<int>(num_per_second * (reader->info.duration + 1.0)));
+
+    // Resize and clear audio buffers
+    data.resize(total_samples);
+    data.zero(total_samples);
+
+    // Which channels are we using: [first_channel, end_channel). A channel index
+    // the reader does not have selects nothing, so the output stays zeroed.
+    const int reader_channels = std::max<int>(0, reader->info.channels);
+    const bool use_all_channels = (channel == -1);
+    const int first_channel = use_all_channels ? 0 : channel;
+    const int end_channel = use_all_channels ? reader_channels : channel + ((channel >= 0 && channel < reader_channels) ? 1 : 0);
+    const int channel_count = use_all_channels ? std::max<int>(1, reader_channels) : 1;
+
+    // One pointer slot per reader channel, indexed by the real channel number
+    // (a slot array sized by channel_count overflows for any channel > 0)
+    std::vector<float*> channel_samples(reader_channels, nullptr);
+    int extracted_index = 0;
+    int sample_index = 0;
+    float samples_max = 0.0;
+    float chunk_max = 0.0;
+    float chunk_squared_sum = 0.0;
+
+    // Loop through all frames, stopping early once the output buffers are full
+    // (integer truncation of sample_divisor can yield more values than planned)
+    for (auto f = 1; f <= reader->info.video_length && extracted_index < total_samples; f++) {
+        std::shared_ptr<openshot::Frame> frame = reader->GetFrame(f);
+
+        // Cache channels for this frame, to reduce # of calls to frame->GetAudioSamples
+        for (int channel_index = first_channel; channel_index < end_channel; channel_index++) {
+            channel_samples[channel_index] = frame->GetAudioSamples(channel_index);
+        }
+
+        const int frame_sample_count = frame->GetAudioSamplesCount();
+        for (int s = 0; s < frame_sample_count && extracted_index < total_samples; s++) {
+            for (int channel_index = first_channel; channel_index < end_channel; channel_index++) {
+                // sqrt(x * x) is the absolute amplitude of this sample
+                const float rms_sample_value = std::sqrt(channel_samples[channel_index][s] * channel_samples[channel_index][s]);
+                chunk_squared_sum += rms_sample_value;
+                chunk_max = std::max(chunk_max, rms_sample_value);
+            }
+
+            // Cut-off reached: store this chunk and reset for the next one
+            if (++sample_index == sample_divisor) {
+                data.max_samples[extracted_index] = chunk_max;
+                data.rms_samples[extracted_index] = chunk_squared_sum / (sample_divisor * channel_count);
+                extracted_index++;
+                samples_max = std::max(samples_max, chunk_max);
+                sample_index = 0;
+                chunk_max = 0.0;
+                chunk_squared_sum = 0.0;
+            }
+        }
+    }
+
+    // Scale so that the largest max value becomes 1.0. Skipped for silent audio,
+    // where 1 / samples_max would be infinite and turn the zeros into NaN.
+    if (normalize && samples_max > 0.0f) {
+        data.scale(total_samples, 1.0f / samples_max);
+    }
+
+    return data;
+}
--- a/src/AudioWaveformer.h
+++ b/src/AudioWaveformer.h
@@ -0,0 +1,101 @@
+/**
+ * @file
+ * @brief Header file for AudioWaveformer class
+ * @author Jonathan Thomas <jonathan@openshot.org>
+ *
+ * @ref License
+ */
+
+// Copyright (c) 2008-2022 OpenShot Studios, LLC
+//
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#ifndef OPENSHOT_WAVEFORMER_H
+#define OPENSHOT_WAVEFORMER_H
+
+#include "ReaderBase.h"
+#include "Frame.h"
+#include <vector>
+
+
+namespace openshot {
+
+    /**
+     * @brief This struct holds the extracted waveform data (both the RMS root-mean-squared average, and the max values)
+     *
+     * Because we extract 2 different datasets from the audio, we return this struct with access to both sets of data,
+     * the average root mean squared values, and the max sample values.
+     */
+    struct AudioWaveformData
+    {
+        std::vector<float> max_samples;
+        std::vector<float> rms_samples;
+
+        /// Resize both datasets (a negative total_samples is treated as 0)
+        void resize(int total_samples) {
+            max_samples.resize(total_samples > 0 ? total_samples : 0);
+            rms_samples.resize(total_samples > 0 ? total_samples : 0);
+        }
+
+        /// Zero out # of values in both datasets (# is clamped to each dataset's size)
+        void zero(int total_samples) {
+            for (std::size_t s = 0, end = bounded(total_samples, max_samples.size()); s < end; s++)
+                max_samples[s] = 0.0f;
+            for (std::size_t s = 0, end = bounded(total_samples, rms_samples.size()); s < end; s++)
+                rms_samples[s] = 0.0f;
+        }
+
+        /// Scale # of values by some factor (# is clamped to each dataset's size)
+        void scale(int total_samples, float factor) {
+            for (std::size_t s = 0, end = bounded(total_samples, max_samples.size()); s < end; s++)
+                max_samples[s] *= factor;
+            for (std::size_t s = 0, end = bounded(total_samples, rms_samples.size()); s < end; s++)
+                rms_samples[s] *= factor;
+        }
+
+        /// Clear and free memory of both datasets (swapping with an empty vector releases the storage)
+        void clear() {
+            std::vector<float>().swap(max_samples);
+            std::vector<float>().swap(rms_samples);
+        }
+
+        /// Return a vector of vectors (copies of both datasets: max values first, then RMS values)
+        std::vector<std::vector<float>> vectors() const {
+            return {max_samples, rms_samples};
+        }
+    private:
+        /// How many leading values a request for `requested` values may touch out of `available`
+        static std::size_t bounded(int requested, std::size_t available) {
+            return requested <= 0 ? 0 : (static_cast<std::size_t>(requested) < available ? static_cast<std::size_t>(requested) : available);
+        }
+    };
+
+    /**
+     * @brief This class is used to extract audio data used for generating waveforms.
+     *
+     * Pass in a ReaderBase* with audio data, and this class will iterate the reader,
+     * and sample down the dataset to a much smaller set - more useful for generating
+     * waveforms. For example, take 44100 samples per second, and reduce it to 20
+     * "max" or "average" samples per second - much easier to graph.
+     */
+    class AudioWaveformer {
+    private:
+        ReaderBase* reader; ///< Reader to extract audio from (not closed or deleted by the destructor)
+
+    public:
+        /// Constructor: stores the reader that ExtractSamples will pull audio from
+        AudioWaveformer(ReaderBase* reader);
+
+        /// @brief Extract audio samples from any ReaderBase class (returns both the max and RMS datasets)
+        /// @param channel Which audio channel should we extract data from (-1 == all channels)
+        /// @param num_per_second How many samples per second to return
+        /// @param normalize Should we scale the data range so the largest value is 1.0
+        AudioWaveformData ExtractSamples(int channel, int num_per_second, bool normalize);
+
+        /// Destructor
+        ~AudioWaveformer();
+    };
+
+}
+
+#endif
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -50,6 +50,7 @@ set(OPENSHOT_SOURCES
  AudioDevices.cpp
  AudioReaderSource.cpp
  AudioResampler.cpp
+  AudioWaveformer.cpp
  CacheBase.cpp
  CacheDisk.cpp
  CacheMemory.cpp
--- a/tests/AudioWaveformer.cpp
+++ b/tests/AudioWaveformer.cpp
@@ -0,0 +1,132 @@
+/**
+ * @file
+ * @brief Unit tests for openshot::AudioWaveformer
+ * @author Jonathan Thomas <jonathan@openshot.org>
+ *
+ * @ref License
+ */
+
+// Copyright (c) 2008-2022 OpenShot Studios, LLC
+//
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#include "openshot_catch.h"
+#include "AudioWaveformer.h"
+#include "FFmpegReader.h"
+
+
+using namespace openshot;
+
+TEST_CASE( "Extract waveform data piano.wav", "[libopenshot][audiowaveformer]" )
+{
+    // Point a reader at the piano.wav test file and open it explicitly
+    // before any samples are extracted
+    std::stringstream media_path;
+    media_path << TEST_MEDIA_PATH << "piano.wav";
+    FFmpegReader reader(media_path.str());
+    reader.Open();
+
+    // Reduce every channel to 20 values per second, without normalizing
+    AudioWaveformer waveformer(&reader);
+    for (int ch = 0; ch < reader.info.channels; ++ch) {
+        AudioWaveformData extracted = waveformer.ExtractSamples(ch, 20, false);
+        const auto& rms = extracted.rms_samples;
+
+        // Channels 0 and 1 are expected to yield identical values; any further
+        // channel is extracted but not asserted on
+        if (ch == 0 || ch == 1) {
+            CHECK(rms.size() == 107);
+            CHECK(rms[0] == Approx(0.04879f).margin(0.00001));
+            CHECK(rms[86] == Approx(0.13578f).margin(0.00001));
+            CHECK(rms[87] == Approx(0.0f).margin(0.00001));
+        }
+
+        // Free the datasets before moving on to the next channel
+        extracted.clear();
+    }
+
+    // Clean up
+    reader.Close();
+}
+
+TEST_CASE( "Extract waveform data sintel", "[libopenshot][audiowaveformer]" )
+{
+    // Point a reader at the sintel trailer test file. It is not opened here:
+    // ExtractSamples opens the reader itself when it is not open yet.
+    std::stringstream media_path;
+    media_path << TEST_MEDIA_PATH << "sintel_trailer-720p.mp4";
+    FFmpegReader reader(media_path.str());
+
+    // Reduce every channel to 20 values per second, without normalizing
+    AudioWaveformer waveformer(&reader);
+    for (int ch = 0; ch < reader.info.channels; ++ch) {
+        AudioWaveformData extracted = waveformer.ExtractSamples(ch, 20, false);
+        const auto& rms = extracted.rms_samples;
+
+        // Channels 0 and 1 are expected to yield identical values; any further
+        // channel is extracted but not asserted on
+        if (ch == 0 || ch == 1) {
+            CHECK(rms.size() == 1058);
+            CHECK(rms[0] == Approx(0.00001f).margin(0.00001));
+            CHECK(rms[1037] == Approx(0.00003f).margin(0.00001));
+            CHECK(rms[1038] == Approx(0.0f).margin(0.00001));
+        }
+
+        // Free the datasets before moving on to the next channel
+        extracted.clear();
+    }
+
+    // Clean up
+    reader.Close();
+}
+
+
+TEST_CASE( "Extract waveform data sintel (all channels)", "[libopenshot][audiowaveformer]" )
+{
+    // Point a reader at the sintel trailer test file (left unopened here)
+    std::stringstream media_path;
+    media_path << TEST_MEDIA_PATH << "sintel_trailer-720p.mp4";
+    FFmpegReader reader(media_path.str());
+
+    // Channel -1 combines all channels into one dataset (20 values per second, not normalized)
+    AudioWaveformer waveformer(&reader);
+    AudioWaveformData extracted = waveformer.ExtractSamples(-1, 20, false);
+    const auto& rms = extracted.rms_samples;
+
+    CHECK(rms.size() == 1058);
+    CHECK(rms[0] == Approx(0.00001f).margin(0.00001));
+    CHECK(rms[1037] == Approx(0.00003f).margin(0.00001));
+    CHECK(rms[1038] == Approx(0.0f).margin(0.00001));
+    extracted.clear();
+
+    // Clean up
+    reader.Close();
+}
+
+TEST_CASE( "Normalize & scale waveform data piano.wav", "[libopenshot][audiowaveformer]" )
+{
+    // Point a reader at the piano.wav test file (left unopened here)
+    std::stringstream media_path;
+    media_path << TEST_MEDIA_PATH << "piano.wav";
+    FFmpegReader reader(media_path.str());
+
+    // Reduce every channel to 20 values per second
+    AudioWaveformer waveformer(&reader);
+    for (int ch = 0; ch < reader.info.channels; ++ch) {
+        // Ask for normalized output this time (third argument is true)
+        AudioWaveformData extracted = waveformer.ExtractSamples(ch, 20, true);
+        const auto& rms = extracted.rms_samples;
+        if (ch == 0) {
+            CHECK(rms.size() == 107);
+            CHECK(rms[0] == Approx(0.07524f).margin(0.00001));
+            CHECK(rms[35] == Approx(0.20063f).margin(0.00001));
+            CHECK(rms[86] == Approx(0.2094f).margin(0.00001));
+            CHECK(rms[87] == Approx(0.0f).margin(0.00001));
+        }
+
+        extracted.clear();
+    }
+
+    // Clean up
+    reader.Close();
+}
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -20,6 +20,7 @@ file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/examples/" TEST_MEDIA_PATH)
 ###  TEST SOURCE FILES
 ###
 set(OPENSHOT_TESTS
+  AudioWaveformer
  CacheDisk
  CacheMemory
  Clip