/** * @file * @brief Source file for AudioWaveformer class * @author Jonathan Thomas * * @ref License */ // Copyright (c) 2008-2022 OpenShot Studios, LLC // // SPDX-License-Identifier: LGPL-3.0-or-later #include "AudioWaveformer.h" #include #include #include #include #include #include #include "Clip.h" #include "Exceptions.h" #include "FrameMapper.h" #include "FFmpegReader.h" #include "Timeline.h" using namespace std; using namespace openshot; // Default constructor AudioWaveformer::AudioWaveformer(ReaderBase* new_reader) : reader(new_reader), detached_reader(nullptr), resolved_reader(nullptr), source_initialized(false) { } // Destructor AudioWaveformer::~AudioWaveformer() { } // Extract audio samples from any ReaderBase class AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) { // Legacy entry point: resolve a source reader (unwrap Clip/FrameMapper), then extract audio-only. AudioWaveformData data; if (!reader) { return data; } ReaderBase* source = ResolveWaveformReader(); Fraction source_fps = ResolveSourceFPS(source); AudioWaveformData base = ExtractSamplesFromReader(source, channel, num_per_second, false); // If this is a Clip, apply its keyframes using project fps (timeline if available, else reader fps) if (auto clip = dynamic_cast(reader)) { Timeline* timeline = dynamic_cast(clip->ParentTimeline()); Fraction project_fps = timeline ? timeline->info.fps : clip->Reader()->info.fps; return ApplyKeyframes(base, &clip->time, &clip->volume, project_fps, source_fps, source->info.channels, num_per_second, channel, normalize); } // No keyframes to apply if (normalize) { float max_sample = 0.0f; for (auto v : base.max_samples) { max_sample = std::max(max_sample, std::abs(v)); } if (max_sample > 0.0f) { base.scale(static_cast(base.max_samples.size()), 1.0f / max_sample); } } return base; } AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path, int channel, int num_per_second, bool normalize) { FFmpegReader temp_reader(path); temp_reader.Open(); // Disable video for speed bool has_video = temp_reader.info.has_video; temp_reader.info.has_video = false; AudioWaveformData data = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, normalize); temp_reader.info.has_video = has_video; temp_reader.Close(); return data; } AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path, const Keyframe* time_keyframe, const Keyframe* volume_keyframe, const Fraction& project_fps, int channel, int num_per_second, bool normalize) { FFmpegReader temp_reader(path); temp_reader.Open(); bool has_video = temp_reader.info.has_video; temp_reader.info.has_video = false; Fraction source_fps = temp_reader.info.fps; AudioWaveformData base = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, false); temp_reader.info.has_video = has_video; temp_reader.Close(); return ApplyKeyframes(base, time_keyframe, volume_keyframe, project_fps, source_fps, temp_reader.info.channels, num_per_second, channel, normalize); } AudioWaveformData AudioWaveformer::ApplyKeyframes(const AudioWaveformData& base, const Keyframe* time_keyframe, const Keyframe* volume_keyframe, const Fraction& project_fps, const Fraction& source_fps, int source_channels, int num_per_second, int channel, bool normalize) { AudioWaveformData data; if (num_per_second <= 0) { return data; } double project_fps_value = project_fps.ToDouble(); double source_fps_value = source_fps.ToDouble(); if (project_fps_value <= 0.0 || source_fps_value <= 0.0) { return data; } if (channel != -1 && (channel < 0 || channel >= source_channels)) { return data; } size_t base_total = base.max_samples.size(); if (base_total == 0) { return data; } // Determine output duration from time curve (if any). Time curves are in project-frame domain. int64_t output_frames = 0; if (time_keyframe && time_keyframe->GetCount() > 0) { output_frames = time_keyframe->GetLength(); } if (output_frames <= 0) { // Default to source duration derived from base waveform length double source_duration = static_cast(base_total) / static_cast(num_per_second); output_frames = static_cast(std::llround(source_duration * project_fps_value)); } double output_duration_seconds = static_cast(output_frames) / project_fps_value; int total_samples = static_cast(std::ceil(output_duration_seconds * num_per_second)); if (total_samples <= 0) { return data; } data.resize(total_samples); data.zero(total_samples); for (int i = 0; i < total_samples; ++i) { double out_time = static_cast(i) / static_cast(num_per_second); // Time keyframes are defined in project-frame domain; evaluate using project frames double project_frame = out_time * project_fps_value; double mapped_project_frame = time_keyframe ? time_keyframe->GetValue(project_frame) : project_frame; // Convert mapped project frame to seconds (project FPS), then to waveform index double source_time = mapped_project_frame / project_fps_value; double source_index = source_time * static_cast(num_per_second); // Sample base waveform (nearest with simple linear blend) int idx0 = static_cast(std::floor(source_index)); int idx1 = idx0 + 1; double frac = source_index - static_cast(idx0); float max_sample = 0.0f; float rms_sample = 0.0f; if (idx0 >= 0 && idx0 < static_cast(base_total)) { max_sample = base.max_samples[idx0]; rms_sample = base.rms_samples[idx0]; } if (idx1 >= 0 && idx1 < static_cast(base_total)) { max_sample = static_cast((1.0 - frac) * max_sample + frac * base.max_samples[idx1]); rms_sample = static_cast((1.0 - frac) * rms_sample + frac * base.rms_samples[idx1]); } double gain = 1.0; if (volume_keyframe) { double project_frame = out_time * project_fps_value; gain = volume_keyframe->GetValue(project_frame); } max_sample = static_cast(max_sample * gain); rms_sample = static_cast(rms_sample * gain); data.max_samples[i] = max_sample; data.rms_samples[i] = rms_sample; } if (normalize) { float samples_max = 0.0f; for (auto v : data.max_samples) { samples_max = std::max(samples_max, std::abs(v)); } if (samples_max > 0.0f) { data.scale(total_samples, 1.0f / samples_max); } } return data; } AudioWaveformData AudioWaveformer::ExtractSamplesFromReader(ReaderBase* source_reader, int channel, int num_per_second, bool normalize) { AudioWaveformData data; if (!source_reader || num_per_second <= 0) { return data; } // Open reader (if needed) if (!source_reader->IsOpen()) { source_reader->Open(); } const auto retry_delay = std::chrono::milliseconds(100); const auto max_wait_for_open = std::chrono::milliseconds(3000); auto get_frame_with_retry = [&](int64_t frame_number) -> std::shared_ptr { std::chrono::steady_clock::time_point wait_start; bool waiting_for_open = false; while (true) { try { return source_reader->GetFrame(frame_number); } catch (const openshot::ReaderClosed&) { auto now = std::chrono::steady_clock::now(); if (!waiting_for_open) { waiting_for_open = true; wait_start = now; } else if (now - wait_start >= max_wait_for_open) { throw; } std::this_thread::sleep_for(retry_delay); } } }; int sample_rate = source_reader->info.sample_rate; if (sample_rate <= 0) { sample_rate = num_per_second; } int sample_divisor = sample_rate / num_per_second; if (sample_divisor <= 0) { sample_divisor = 1; } // Determine length of video frames (for waveform) int64_t reader_video_length = source_reader->info.video_length; if (reader_video_length < 0) { reader_video_length = 0; } float reader_duration = source_reader->info.duration; double fps_value = source_reader->info.fps.ToDouble(); float frames_duration = 0.0f; if (reader_video_length > 0 && fps_value > 0.0) { frames_duration = static_cast(reader_video_length / fps_value); } if (reader_duration <= 0.0f) { reader_duration = frames_duration; } if (reader_duration < 0.0f) { reader_duration = 0.0f; } if (!source_reader->info.has_audio) { return data; } int total_samples = static_cast(std::ceil(reader_duration * num_per_second)); if (total_samples <= 0 || source_reader->info.channels == 0) { return data; } if (channel != -1 && (channel < 0 || channel >= source_reader->info.channels)) { return data; } // Resize and clear audio buffers data.resize(total_samples); data.zero(total_samples); int extracted_index = 0; int sample_index = 0; float samples_max = 0.0f; float chunk_max = 0.0f; double chunk_squared_sum = 0.0; int channel_count = (channel == -1) ? source_reader->info.channels : 1; std::vector channels(source_reader->info.channels, nullptr); try { for (int64_t f = 1; f <= reader_video_length && extracted_index < total_samples; f++) { std::shared_ptr frame = get_frame_with_retry(f); for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) { if (channel == channel_index || channel == -1) { channels[channel_index] = frame->GetAudioSamples(channel_index); } } int sample_count = frame->GetAudioSamplesCount(); for (int s = 0; s < sample_count; s++) { for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) { if (channel == channel_index || channel == -1) { float *samples = channels[channel_index]; if (!samples) { continue; } float abs_sample = std::abs(samples[s]); chunk_squared_sum += static_cast(samples[s]) * static_cast(samples[s]); chunk_max = std::max(chunk_max, abs_sample); } } sample_index += 1; if (sample_index % sample_divisor == 0) { float avg_squared_sum = 0.0f; if (channel_count > 0) { avg_squared_sum = static_cast(chunk_squared_sum / static_cast(sample_divisor * channel_count)); } if (extracted_index < total_samples) { data.max_samples[extracted_index] = chunk_max; data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum); samples_max = std::max(samples_max, chunk_max); extracted_index++; } sample_index = 0; chunk_max = 0.0f; chunk_squared_sum = 0.0; if (extracted_index >= total_samples) { break; } } } } } catch (...) { throw; } if (sample_index > 0 && extracted_index < total_samples) { float avg_squared_sum = 0.0f; if (channel_count > 0) { avg_squared_sum = static_cast(chunk_squared_sum / static_cast(sample_index * channel_count)); } data.max_samples[extracted_index] = chunk_max; data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum); samples_max = std::max(samples_max, chunk_max); extracted_index++; } if (normalize && samples_max > 0.0f) { float scale = 1.0f / samples_max; data.scale(total_samples, scale); } return data; } ReaderBase* AudioWaveformer::ResolveSourceReader(ReaderBase* source_reader) { if (!source_reader) { return nullptr; } ReaderBase* current = source_reader; while (true) { if (auto clip = dynamic_cast(current)) { current = clip->Reader(); continue; } if (auto mapper = dynamic_cast(current)) { current = mapper->Reader(); continue; } break; } return current; } Fraction AudioWaveformer::ResolveSourceFPS(ReaderBase* source_reader) { if (!source_reader) { return Fraction(0, 1); } return source_reader->info.fps; } // Resolve and cache the reader used for waveform extraction (prefer a detached FFmpegReader clone) ReaderBase* AudioWaveformer::ResolveWaveformReader() { if (source_initialized) { return resolved_reader ? resolved_reader : reader; } source_initialized = true; resolved_reader = ResolveSourceReader(reader); // Prefer a detached, audio-only FFmpegReader clone so we never mutate the live reader used for preview. if (auto ff_reader = dynamic_cast(resolved_reader)) { const Json::Value ff_json = ff_reader->JsonValue(); const std::string path = ff_json.get("path", "").asString(); if (!path.empty()) { try { auto clone = std::make_unique(path, false); clone->SetJsonValue(ff_json); clone->info.has_video = false; // explicitly audio-only for waveform extraction detached_reader = std::move(clone); resolved_reader = detached_reader.get(); } catch (...) { // Fall back to using the original reader if cloning fails detached_reader.reset(); resolved_reader = ResolveSourceReader(reader); } } } return resolved_reader ? resolved_reader : reader; }