Merge pull request #868 from OpenShot/audio-waveformer

New AudioWaveformer Class (for generating fast, graph-friendly audio datasets)
This commit is contained in:
Jonathan Thomas
2022-11-02 21:54:45 -05:00
committed by GitHub
7 changed files with 369 additions and 2 deletions

View File

@@ -9,7 +9,7 @@
//
// SPDX-License-Identifier: LGPL-3.0-or-later
%module openshot
%module("threads"=1) openshot
/* Suppress warnings about ignored operator= */
%warnfilter(362);
@@ -45,16 +45,19 @@
%template() std::map<std::string, int>;
%template() std::pair<int, int>;
%template() std::vector<int>;
%template() std::vector<float>;
%template() std::pair<double, double>;
%template() std::pair<float, float>;
%template() std::pair<std::string, std::string>;
%template() std::vector<std::pair<std::string, std::string>>;
%template() std::vector<std::vector<float>>;
%{
#include "OpenShotVersion.h"
#include "ReaderBase.h"
#include "WriterBase.h"
#include "AudioDevices.h"
#include "AudioWaveformer.h"
#include "CacheBase.h"
#include "CacheDisk.h"
#include "CacheMemory.h"
@@ -263,6 +266,7 @@
%include "ReaderBase.h"
%include "WriterBase.h"
%include "AudioDevices.h"
%include "AudioWaveformer.h"
%include "CacheBase.h"
%include "CacheDisk.h"
%include "CacheMemory.h"

View File

@@ -9,7 +9,7 @@
//
// SPDX-License-Identifier: LGPL-3.0-or-later
%module openshot
%module("threads"=1) openshot
/* Suppress warnings about ignored operator= */
%warnfilter(362);
@@ -45,10 +45,12 @@
%template() std::map<std::string, int>;
%template() std::pair<int, int>;
%template() std::vector<int>;
%template() std::vector<float>;
%template() std::pair<double, double>;
%template() std::pair<float, float>;
%template() std::pair<std::string, std::string>;
%template() std::vector<std::pair<std::string, std::string>>;
%template() std::vector<std::vector<float>>;
%{
/* Ruby and FFmpeg define competing RSHIFT macros,
@@ -63,6 +65,7 @@
#include "ReaderBase.h"
#include "WriterBase.h"
#include "AudioDevices.h"
#include "AudioWaveformer.h"
#include "CacheBase.h"
#include "CacheDisk.h"
#include "CacheMemory.h"
@@ -133,6 +136,7 @@
%include "ReaderBase.h"
%include "WriterBase.h"
%include "AudioDevices.h"
%include "AudioWaveformer.h"
%include "CacheBase.h"
%include "CacheDisk.h"
%include "CacheMemory.h"

124
src/AudioWaveformer.cpp Normal file
View File

@@ -0,0 +1,124 @@
/**
* @file
* @brief Source file for AudioWaveformer class
* @author Jonathan Thomas <jonathan@openshot.org>
*
* @ref License
*/
// Copyright (c) 2008-2022 OpenShot Studios, LLC
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#include "AudioWaveformer.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
using namespace std;
using namespace openshot;
// Default constructor
AudioWaveformer::AudioWaveformer(ReaderBase* new_reader) : reader(new_reader)
{
}
// Destructor: nothing to release (the reader pointer is left untouched)
AudioWaveformer::~AudioWaveformer() = default;
// Extract audio samples from any ReaderBase class
//
// Iterates every frame of the reader (1..info.video_length) and reduces the audio
// down to roughly `num_per_second` values per second. For every chunk of
// (sample_rate / num_per_second) samples two values are stored:
//   - max_samples: the largest absolute sample value seen in the chunk
//   - rms_samples: the mean of the absolute sample values in the chunk, averaged
//                  over the selected channel(s). NOTE: despite the name this is a
//                  mean-absolute value, not a true root-mean-square.
//
// channel:        index of the channel to read, or -1 to combine all channels.
//                 An index the reader does not have yields all-zero datasets.
// num_per_second: how many values per second to produce. Values <= 0 return an
//                 empty result; values above the sample rate are limited to one
//                 value per audio sample.
// normalize:      scale both datasets so the largest max_samples value becomes 1.0
//                 (skipped for completely silent audio, to avoid dividing by zero).
//
// The reader is opened if needed (and left open). Its info.has_video flag is
// temporarily cleared while frames are read and restored before returning, even
// if reading a frame throws.
AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
    AudioWaveformData data;

    // Nothing to extract without a reader or with a non-positive resolution
    if (!reader || num_per_second <= 0) {
        return data;
    }

    // Remember the has_video flag before touching the reader
    const bool does_reader_have_video = reader->info.has_video;

    // Open reader (if needed)
    if (!reader->IsOpen()) {
        reader->Open();
    }

    // Puts has_video back on every exit path (normal return or exception)
    struct VideoFlagRestorer {
        ReaderBase* target;
        bool previous;
        ~VideoFlagRestorer() { target->info.has_video = previous; }
    } restore_video{reader, does_reader_have_video};

    // Disable video for faster processing
    reader->info.has_video = false;

    // Number of source samples folded into one output value (never 0, so the
    // modulo below is always defined)
    const int sample_rate = reader->info.sample_rate;
    const int sample_divisor = std::max(1, sample_rate / num_per_second);

    // One extra second of head-room, since the duration may not be exact
    const int total_samples = static_cast<int>(num_per_second * (reader->info.duration + 1.0));
    if (total_samples <= 0) {
        return data;
    }

    // Resize and clear audio buffers
    data.resize(total_samples);
    data.zero(total_samples);

    // How many channels contribute to each averaged value
    const int reader_channels = reader->info.channels;
    const int channel_count = std::max(1, (channel == -1) ? reader_channels : 1);
    const float chunk_divisor = static_cast<float>(sample_divisor * channel_count);

    // Sample pointers of the selected channel(s) for the current frame.
    // Allocated once and reused, to reduce # of calls to frame->GetAudioSamples
    std::vector<const float*> selected_channels;
    selected_channels.reserve(channel_count);

    int extracted_index = 0;
    int sample_index = 0;
    float samples_max = 0.0f;
    float chunk_max = 0.0f;
    float chunk_sum = 0.0f;
    bool buffers_full = false;

    // Loop through all frames
    for (int64_t f = 1; f <= reader->info.video_length && !buffers_full; f++) {
        // Get next frame
        std::shared_ptr<openshot::Frame> frame = reader->GetFrame(f);

        // Collect the channel(s) we were asked for
        selected_channels.clear();
        for (int channel_index = 0; channel_index < reader_channels; channel_index++) {
            if (channel == channel_index || channel == -1) {
                selected_channels.push_back(frame->GetAudioSamples(channel_index));
            }
        }

        // Get sample value from a specific channel (or all channels)
        const int frame_sample_count = frame->GetAudioSamplesCount();
        for (int s = 0; s < frame_sample_count; s++) {
            for (const float* samples : selected_channels) {
                const float abs_sample_value = std::fabs(samples[s]);

                // Accumulate sample averages
                chunk_sum += abs_sample_value;
                chunk_max = std::max(chunk_max, abs_sample_value);
            }
            sample_index += 1;

            // Cut-off reached
            if (sample_index % sample_divisor == 0) {
                // Never write past the buffers (audio longer than the reported duration)
                if (extracted_index >= total_samples) {
                    buffers_full = true;
                    break;
                }

                data.max_samples[extracted_index] = chunk_max;
                data.rms_samples[extracted_index] = chunk_sum / chunk_divisor;
                extracted_index++;

                // Track the overall max value (used for normalization)
                samples_max = std::max(samples_max, chunk_max);

                // Reset chunk accumulators and index
                sample_index = 0;
                chunk_max = 0.0f;
                chunk_sum = 0.0f;
            }
        }
    }

    // Scale all values so the largest max value becomes 1.0 (regardless of how small
    // or how large the original audio sample values are). Silent audio is left as-is.
    if (normalize && samples_max > 0.0f) {
        const float scale = 1.0f / samples_max;
        data.scale(total_samples, scale);
    }

    return data;
}

101
src/AudioWaveformer.h Normal file
View File

@@ -0,0 +1,101 @@
/**
* @file
* @brief Header file for AudioWaveformer class
* @author Jonathan Thomas <jonathan@openshot.org>
*
* @ref License
*/
// Copyright (c) 2008-2022 OpenShot Studios, LLC
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#ifndef OPENSHOT_WAVEFORMER_H
#define OPENSHOT_WAVEFORMER_H
#include "ReaderBase.h"
#include "Frame.h"
#include <vector>
namespace openshot {
/**
 * @brief This struct holds the extracted waveform data (both the RMS root-mean-squared average, and the max values)
 *
 * Because we extract 2 different datasets from the audio, we return this struct with access to both sets of data,
 * the average root mean squared values, and the max sample values.
 *
 * All methods taking a count are bounds-safe: a negative count is treated as 0, and a count
 * larger than a dataset only touches the values that dataset actually holds.
 */
struct AudioWaveformData
{
    std::vector<float> max_samples;  ///< Max sample value of each chunk
    std::vector<float> rms_samples;  ///< Averaged sample value of each chunk

    /// Resize both datasets to total_samples values (negative sizes are treated as 0)
    void resize(int total_samples) {
        const std::size_t count = total_samples > 0 ? static_cast<std::size_t>(total_samples) : 0;
        max_samples.resize(count);
        rms_samples.resize(count);
    }

    /// Zero out the first total_samples values in both datasets
    void zero(int total_samples) {
        const std::size_t max_count = prefix_length(max_samples, total_samples);
        for (std::size_t s = 0; s < max_count; s++) {
            max_samples[s] = 0.0f;
        }
        const std::size_t rms_count = prefix_length(rms_samples, total_samples);
        for (std::size_t s = 0; s < rms_count; s++) {
            rms_samples[s] = 0.0f;
        }
    }

    /// Multiply the first total_samples values in both datasets by factor
    void scale(int total_samples, float factor) {
        const std::size_t max_count = prefix_length(max_samples, total_samples);
        for (std::size_t s = 0; s < max_count; s++) {
            max_samples[s] *= factor;
        }
        const std::size_t rms_count = prefix_length(rms_samples, total_samples);
        for (std::size_t s = 0; s < rms_count; s++) {
            rms_samples[s] *= factor;
        }
    }

    /// Clear both datasets and request that their memory be released
    void clear() {
        max_samples.clear();
        max_samples.shrink_to_fit();
        rms_samples.clear();
        rms_samples.shrink_to_fit();
    }

    /// Return a vector of vectors (copies of both datasets: index 0 = max, index 1 = rms)
    std::vector<std::vector<float>> vectors() const {
        std::vector<std::vector<float>> output;
        output.reserve(2);
        output.push_back(max_samples);
        output.push_back(rms_samples);
        return output;
    }

private:
    /// How many leading values of `values` a request for `requested` values may touch
    static std::size_t prefix_length(const std::vector<float>& values, int requested) {
        if (requested <= 0) {
            return 0;
        }
        const std::size_t wanted = static_cast<std::size_t>(requested);
        return wanted < values.size() ? wanted : values.size();
    }
};
/**
* @brief This class is used to extract audio data used for generating waveforms.
*
* Pass in a ReaderBase* with audio data, and this class will iterate the reader,
* and sample down the dataset to a much smaller set - more useful for generating
* waveforms. For example, take 44100 samples per second, and reduce it to 20
* "max" or "average" samples per second - much easier to graph.
*
* The reader is not owned by this class: only the pointer is stored, and it is
* never deleted here, so the caller must keep the reader alive for as long as
* ExtractSamples() may be called.
*/
class AudioWaveformer {
private:
ReaderBase* reader; ///< Reader to pull audio from (not owned)
public:
/// @brief Constructor (stores the reader pointer; does not open the reader)
/// @param reader The reader to extract audio samples from
AudioWaveformer(ReaderBase* reader);
/// @brief Extract audio samples from any ReaderBase class
///
/// Opens the reader if it is not already open (it is not closed afterwards).
/// @param channel Which audio channel should we extract data from (-1 == all channels)
/// @param num_per_second How many samples per second to return
/// @param normalize Should we scale the data range so the largest value is 1.0
/// @return The reduced "max" and "rms" datasets (both empty when no reader was provided)
AudioWaveformData ExtractSamples(int channel, int num_per_second, bool normalize);
/// Destructor
~AudioWaveformer();
};
}
#endif

View File

@@ -50,6 +50,7 @@ set(OPENSHOT_SOURCES
AudioDevices.cpp
AudioReaderSource.cpp
AudioResampler.cpp
AudioWaveformer.cpp
CacheBase.cpp
CacheDisk.cpp
CacheMemory.cpp

132
tests/AudioWaveformer.cpp Normal file
View File

@@ -0,0 +1,132 @@
/**
* @file
* @brief Unit tests for openshot::AudioWaveformer
* @author Jonathan Thomas <jonathan@openshot.org>
*
* @ref License
*/
// Copyright (c) 2008-2022 OpenShot Studios, LLC
//
// SPDX-License-Identifier: LGPL-3.0-or-later
#include "openshot_catch.h"
#include "AudioWaveformer.h"
#include "FFmpegReader.h"
using namespace openshot;
TEST_CASE( "Extract waveform data piano.wav", "[libopenshot][audiowaveformer]" )
{
    // Open the piano.wav test asset
    std::stringstream media_path;
    media_path << TEST_MEDIA_PATH << "piano.wav";
    FFmpegReader reader(media_path.str());
    reader.Open();

    // Reduce every channel to 20 values per second (no normalization)
    AudioWaveformer waveformer(&reader);
    for (int channel = 0; channel < reader.info.channels; ++channel) {
        AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, false);

        // Both stereo channels share the same reference values
        if (channel == 0 || channel == 1) {
            const auto& rms = waveform.rms_samples;
            CHECK(rms.size() == 107);
            CHECK(rms[0] == Approx(0.04879f).margin(0.00001));
            CHECK(rms[86] == Approx(0.13578f).margin(0.00001));
            CHECK(rms[87] == Approx(0.0f).margin(0.00001));
        }

        waveform.clear();
    }

    // Clean up
    reader.Close();
}
TEST_CASE( "Extract waveform data sintel", "[libopenshot][audiowaveformer]" )
{
    // Point a reader at the sintel trailer (ExtractSamples opens it on demand)
    std::stringstream media_path;
    media_path << TEST_MEDIA_PATH << "sintel_trailer-720p.mp4";
    FFmpegReader reader(media_path.str());

    // Reduce every channel to 20 values per second (no normalization)
    AudioWaveformer waveformer(&reader);
    for (int channel = 0; channel < reader.info.channels; ++channel) {
        AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, false);

        // Both stereo channels share the same reference values
        if (channel == 0 || channel == 1) {
            const auto& rms = waveform.rms_samples;
            CHECK(rms.size() == 1058);
            CHECK(rms[0] == Approx(0.00001f).margin(0.00001));
            CHECK(rms[1037] == Approx(0.00003f).margin(0.00001));
            CHECK(rms[1038] == Approx(0.0f).margin(0.00001));
        }

        waveform.clear();
    }

    // Clean up
    reader.Close();
}
TEST_CASE( "Extract waveform data sintel (all channels)", "[libopenshot][audiowaveformer]" )
{
    // Point a reader at the sintel trailer (ExtractSamples opens it on demand)
    std::stringstream media_path;
    media_path << TEST_MEDIA_PATH << "sintel_trailer-720p.mp4";
    FFmpegReader reader(media_path.str());

    // Combine all channels (-1) into 20 values per second, without normalization
    AudioWaveformer waveformer(&reader);
    AudioWaveformData waveform = waveformer.ExtractSamples(-1, 20, false);

    const auto& rms = waveform.rms_samples;
    CHECK(rms.size() == 1058);
    CHECK(rms[0] == Approx(0.00001f).margin(0.00001));
    CHECK(rms[1037] == Approx(0.00003f).margin(0.00001));
    CHECK(rms[1038] == Approx(0.0f).margin(0.00001));

    waveform.clear();

    // Clean up
    reader.Close();
}
TEST_CASE( "Normalize & scale waveform data piano.wav", "[libopenshot][audiowaveformer]" )
{
    // Point a reader at piano.wav (ExtractSamples opens it on demand)
    std::stringstream media_path;
    media_path << TEST_MEDIA_PATH << "piano.wav";
    FFmpegReader reader(media_path.str());

    // Reduce every channel to 20 values per second, normalized so the peak is 1.0
    AudioWaveformer waveformer(&reader);
    for (int channel = 0; channel < reader.info.channels; ++channel) {
        AudioWaveformData waveform = waveformer.ExtractSamples(channel, 20, true);

        // Reference values are only pinned for the first channel
        if (channel == 0) {
            const auto& rms = waveform.rms_samples;
            CHECK(rms.size() == 107);
            CHECK(rms[0] == Approx(0.07524f).margin(0.00001));
            CHECK(rms[35] == Approx(0.20063f).margin(0.00001));
            CHECK(rms[86] == Approx(0.2094f).margin(0.00001));
            CHECK(rms[87] == Approx(0.0f).margin(0.00001));
        }

        waveform.clear();
    }

    // Clean up
    reader.Close();
}

View File

@@ -20,6 +20,7 @@ file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/examples/" TEST_MEDIA_PATH)
### TEST SOURCE FILES
###
set(OPENSHOT_TESTS
AudioWaveformer
CacheDisk
CacheMemory
Clip