libopenshot/src/AudioWaveformer.cpp

/**
 * @file
 * @brief Source file for AudioWaveformer class
 * @author Jonathan Thomas <jonathan@openshot.org>
 *
 * @ref License
 */

// Copyright (c) 2008-2022 OpenShot Studios, LLC
//
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "AudioWaveformer.h"

#include <cmath>

#include <algorithm>
#include <chrono>
#include <memory>
#include <thread>
#include <vector>

#include "Clip.h"
#include "Exceptions.h"
#include "FrameMapper.h"
#include "FFmpegReader.h"
#include "Timeline.h"


using namespace std;
using namespace openshot;


// Default constructor
AudioWaveformer::AudioWaveformer(ReaderBase* new_reader) :
	reader(new_reader),
	detached_reader(nullptr),
	resolved_reader(nullptr),
	source_initialized(false)
{

}

// Destructor
AudioWaveformer::~AudioWaveformer()
{

}

// Extract audio samples from any ReaderBase class
AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
	// Legacy entry point: resolve a source reader (unwrap Clip/FrameMapper), then extract audio-only.
	AudioWaveformData data;
	if (!reader) {
		return data;
	}

	ReaderBase* source = ResolveWaveformReader();

	Fraction source_fps = ResolveSourceFPS(source);

	AudioWaveformData base = ExtractSamplesFromReader(source, channel, num_per_second, false);

	// If this is a Clip, apply its keyframes using project fps (timeline if available, else reader fps)
	if (auto clip = dynamic_cast<Clip*>(reader)) {
		Timeline* timeline = dynamic_cast<Timeline*>(clip->ParentTimeline());
		Fraction project_fps = timeline ? timeline->info.fps : clip->Reader()->info.fps;
		return ApplyKeyframes(base, &clip->time, &clip->volume, project_fps, source_fps, source->info.channels, num_per_second, channel, normalize);
	}

	// No keyframes to apply
	if (normalize) {
		float max_sample = 0.0f;
		for (auto v : base.max_samples) {
			max_sample = std::max(max_sample, std::abs(v));
		}
		if (max_sample > 0.0f) {
			base.scale(static_cast<int>(base.max_samples.size()), 1.0f / max_sample);
		}
	}
	return base;
}

AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path, int channel, int num_per_second, bool normalize) {
	FFmpegReader temp_reader(path);
	temp_reader.Open();
	// Disable video for speed
	bool has_video = temp_reader.info.has_video;
	temp_reader.info.has_video = false;
	AudioWaveformData data = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, normalize);
	temp_reader.info.has_video = has_video;
	temp_reader.Close();
	return data;
}

AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path,
												  const Keyframe* time_keyframe,
												  const Keyframe* volume_keyframe,
												  const Fraction& project_fps,
												  int channel,
												  int num_per_second,
												  bool normalize) {
	FFmpegReader temp_reader(path);
	temp_reader.Open();
	bool has_video = temp_reader.info.has_video;
	temp_reader.info.has_video = false;
	Fraction source_fps = temp_reader.info.fps;
	AudioWaveformData base = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, false);
	temp_reader.info.has_video = has_video;
	temp_reader.Close();
	return ApplyKeyframes(base, time_keyframe, volume_keyframe, project_fps, source_fps, temp_reader.info.channels, num_per_second, channel, normalize);
}

AudioWaveformData AudioWaveformer::ApplyKeyframes(const AudioWaveformData& base,
												  const Keyframe* time_keyframe,
												  const Keyframe* volume_keyframe,
												  const Fraction& project_fps,
												  const Fraction& source_fps,
												  int source_channels,
												  int num_per_second,
												  int channel,
												  bool normalize) {
	AudioWaveformData data;
	if (num_per_second <= 0) {
		return data;
	}

	double project_fps_value = project_fps.ToDouble();
	double source_fps_value = source_fps.ToDouble();
	if (project_fps_value <= 0.0 || source_fps_value <= 0.0) {
		return data;
	}

	if (channel != -1 && (channel < 0 || channel >= source_channels)) {
		return data;
	}

	size_t base_total = base.max_samples.size();
	if (base_total == 0) {
		return data;
	}

	// Determine output duration from time curve (if any). Time curves are in project-frame domain.
	int64_t output_frames = 0;
	if (time_keyframe && time_keyframe->GetCount() > 0) {
		output_frames = time_keyframe->GetLength();
	}
	if (output_frames <= 0) {
		// Default to source duration derived from base waveform length
		double source_duration = static_cast<double>(base_total) / static_cast<double>(num_per_second);
		output_frames = static_cast<int64_t>(std::llround(source_duration * project_fps_value));
	}
	double output_duration_seconds = static_cast<double>(output_frames) / project_fps_value;
	int total_samples = static_cast<int>(std::ceil(output_duration_seconds * num_per_second));

	if (total_samples <= 0) {
		return data;
	}

	data.resize(total_samples);
	data.zero(total_samples);

	for (int i = 0; i < total_samples; ++i) {
		double out_time = static_cast<double>(i) / static_cast<double>(num_per_second);
		// Time keyframes are defined in project-frame domain; evaluate using project frames
		double project_frame = out_time * project_fps_value;
		double mapped_project_frame = time_keyframe ? time_keyframe->GetValue(project_frame) : project_frame;
		// Convert mapped project frame to seconds (project FPS), then to waveform index
		double source_time = mapped_project_frame / project_fps_value;
		double source_index = source_time * static_cast<double>(num_per_second);

		// Sample base waveform (nearest with simple linear blend)
		int idx0 = static_cast<int>(std::floor(source_index));
		int idx1 = idx0 + 1;
		double frac = source_index - static_cast<double>(idx0);

		float max_sample = 0.0f;
		float rms_sample = 0.0f;
		if (idx0 >= 0 && idx0 < static_cast<int>(base_total)) {
			max_sample = base.max_samples[idx0];
			rms_sample = base.rms_samples[idx0];
		}
		if (idx1 >= 0 && idx1 < static_cast<int>(base_total)) {
			max_sample = static_cast<float>((1.0 - frac) * max_sample + frac * base.max_samples[idx1]);
			rms_sample = static_cast<float>((1.0 - frac) * rms_sample + frac * base.rms_samples[idx1]);
		}

		double gain = 1.0;
		if (volume_keyframe) {
			double project_frame = out_time * project_fps_value;
			gain = volume_keyframe->GetValue(project_frame);
		}
		max_sample = static_cast<float>(max_sample * gain);
		rms_sample = static_cast<float>(rms_sample * gain);

		data.max_samples[i] = max_sample;
		data.rms_samples[i] = rms_sample;
	}

	if (normalize) {
		float samples_max = 0.0f;
		for (auto v : data.max_samples) {
			samples_max = std::max(samples_max, std::abs(v));
		}
		if (samples_max > 0.0f) {
			data.scale(total_samples, 1.0f / samples_max);
		}
	}

	return data;
}

AudioWaveformData AudioWaveformer::ExtractSamplesFromReader(ReaderBase* source_reader, int channel, int num_per_second, bool normalize) {
	AudioWaveformData data;

	if (!source_reader || num_per_second <= 0) {
		return data;
	}

	// Open reader (if needed)
	if (!source_reader->IsOpen()) {
		source_reader->Open();
	}

	const auto retry_delay = std::chrono::milliseconds(100);
	const auto max_wait_for_open = std::chrono::milliseconds(3000);

	auto get_frame_with_retry = [&](int64_t frame_number) -> std::shared_ptr<openshot::Frame> {
		std::chrono::steady_clock::time_point wait_start;
		bool waiting_for_open = false;
		while (true) {
			try {
				return source_reader->GetFrame(frame_number);
			} catch (const openshot::ReaderClosed&) {
				auto now = std::chrono::steady_clock::now();
				if (!waiting_for_open) {
					waiting_for_open = true;
					wait_start = now;
				} else if (now - wait_start >= max_wait_for_open) {
					throw;
				}

				std::this_thread::sleep_for(retry_delay);
			}
		}
	};

	int sample_rate = source_reader->info.sample_rate;
	if (sample_rate <= 0) {
		sample_rate = num_per_second;
	}
	int sample_divisor = sample_rate / num_per_second;
	if (sample_divisor <= 0) {
		sample_divisor = 1;
	}

	// Determine length of video frames (for waveform)
	int64_t reader_video_length = source_reader->info.video_length;
	if (reader_video_length < 0) {
		reader_video_length = 0;
	}
	float reader_duration = source_reader->info.duration;
	double fps_value = source_reader->info.fps.ToDouble();
	float frames_duration = 0.0f;
	if (reader_video_length > 0 && fps_value > 0.0) {
		frames_duration = static_cast<float>(reader_video_length / fps_value);
	}
	if (reader_duration <= 0.0f) {
		reader_duration = frames_duration;
	}
	if (reader_duration < 0.0f) {
		reader_duration = 0.0f;
	}

	if (!source_reader->info.has_audio) {
		return data;
	}

	int total_samples = static_cast<int>(std::ceil(reader_duration * num_per_second));
	if (total_samples <= 0 || source_reader->info.channels == 0) {
		return data;
	}

	if (channel != -1 && (channel < 0 || channel >= source_reader->info.channels)) {
		return data;
	}

	// Resize and clear audio buffers
	data.resize(total_samples);
	data.zero(total_samples);

	int extracted_index = 0;
	int sample_index = 0;
	float samples_max = 0.0f;
	float chunk_max = 0.0f;
	double chunk_squared_sum = 0.0;

	int channel_count = (channel == -1) ? source_reader->info.channels : 1;
	std::vector<float*> channels(source_reader->info.channels, nullptr);

	try {
		for (int64_t f = 1; f <= reader_video_length && extracted_index < total_samples; f++) {
			std::shared_ptr<openshot::Frame> frame = get_frame_with_retry(f);

			for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
				if (channel == channel_index || channel == -1) {
					channels[channel_index] = frame->GetAudioSamples(channel_index);
				}
			}

			int sample_count = frame->GetAudioSamplesCount();
			for (int s = 0; s < sample_count; s++) {
				for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
					if (channel == channel_index || channel == -1) {
						float *samples = channels[channel_index];
						if (!samples) {
							continue;
						}
						float abs_sample = std::abs(samples[s]);
						chunk_squared_sum += static_cast<double>(samples[s]) * static_cast<double>(samples[s]);
						chunk_max = std::max(chunk_max, abs_sample);
					}
				}

				sample_index += 1;

				if (sample_index % sample_divisor == 0) {
					float avg_squared_sum = 0.0f;
					if (channel_count > 0) {
						avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_divisor * channel_count));
					}

					if (extracted_index < total_samples) {
						data.max_samples[extracted_index] = chunk_max;
						data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
						samples_max = std::max(samples_max, chunk_max);
						extracted_index++;
					}

					sample_index = 0;
					chunk_max = 0.0f;
					chunk_squared_sum = 0.0;

					if (extracted_index >= total_samples) {
						break;
					}
				}
			}
		}
	} catch (...) {
		throw;
	}

	if (sample_index > 0 && extracted_index < total_samples) {
		float avg_squared_sum = 0.0f;
		if (channel_count > 0) {
			avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_index * channel_count));
		}

		data.max_samples[extracted_index] = chunk_max;
		data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
		samples_max = std::max(samples_max, chunk_max);
		extracted_index++;
	}

	if (normalize && samples_max > 0.0f) {
		float scale = 1.0f / samples_max;
		data.scale(total_samples, scale);
	}

	return data;
}

ReaderBase* AudioWaveformer::ResolveSourceReader(ReaderBase* source_reader) {
	if (!source_reader) {
		return nullptr;
	}

	ReaderBase* current = source_reader;
	while (true) {
		if (auto clip = dynamic_cast<Clip*>(current)) {
			current = clip->Reader();
			continue;
		}
		if (auto mapper = dynamic_cast<FrameMapper*>(current)) {
			current = mapper->Reader();
			continue;
		}
		break;
	}
	return current;
}

Fraction AudioWaveformer::ResolveSourceFPS(ReaderBase* source_reader) {
	if (!source_reader) {
		return Fraction(0, 1);
	}
	return source_reader->info.fps;
}

// Resolve and cache the reader used for waveform extraction (prefer a detached FFmpegReader clone)
ReaderBase* AudioWaveformer::ResolveWaveformReader() {
	if (source_initialized) {
		return resolved_reader ? resolved_reader : reader;
	}
	source_initialized = true;

	resolved_reader = ResolveSourceReader(reader);

	// Prefer a detached, audio-only FFmpegReader clone so we never mutate the live reader used for preview.
	if (auto ff_reader = dynamic_cast<FFmpegReader*>(resolved_reader)) {
		const Json::Value ff_json = ff_reader->JsonValue();
		const std::string path = ff_json.get("path", "").asString();
		if (!path.empty()) {
			try {
				auto clone = std::make_unique<FFmpegReader>(path, false);
				clone->SetJsonValue(ff_json);
				clone->info.has_video = false; // explicitly audio-only for waveform extraction
				detached_reader = std::move(clone);
				resolved_reader = detached_reader.get();
			} catch (...) {
				// Fall back to using the original reader if cloning fails
				detached_reader.reset();
				resolved_reader = ResolveSourceReader(reader);
			}
		}
	}

	return resolved_reader ? resolved_reader : reader;
}