Files
UnrealEngineUWP/Engine/Source/Runtime/SignalProcessing/Private/VoiceProcessing.cpp
phil popp 056d717727 Deprecate AudioFFT in favor of optimized and hardware accelerated version
Deprecating voice processing as it is not tested or needed. Current voice processing needs are handled in EOS with WebRTC.
#rb Aaron.McLeran, Helen.Yang
#jira UE-148715
#preflight 631a3e9ed31788ea3a59f214

[CL 21910441 by phil popp in ue5-main branch]
2022-09-08 21:27:22 -04:00

380 lines
14 KiB
C++

// Copyright Epic Games, Inc. All Rights Reserved.
#include "DSP/VoiceProcessing.h"
#include "DSP/AudioDebuggingUtilities.h"
#include "DSP/FloatArrayMath.h"
namespace Audio
{
// Creates a scalar averager whose circular history holds NumSamples zeroed entries.
// A request for zero samples is promoted to one so the history is never empty.
FMovingAverager::FMovingAverager(uint32 NumSamples)
	: BufferCursor(0)
	, AccumulatedSum(0.0f)
{
	// ProcessInput wraps its cursor with a modulo on the buffer length, so the length must be non-zero.
	const uint32 WindowLength = FMath::Max<uint32>(NumSamples, 1);

	AudioBuffer.Reset();
	AudioBuffer.AddZeroed(WindowLength);
}
// Pushes Input into the circular history and hands back the sample it displaced through Output.
// Returns the running sum of absolute values held in the window (note: the sum, not the sum
// divided by the window length). The whole call runs under ProcessCriticalSection.
float FMovingAverager::ProcessInput(const float& Input, float& Output)
{
	FScopeLock ScopeLock(&ProcessCriticalSection);

	// Exchange the incoming sample with the oldest one, which lives at the cursor.
	float& OldestSlot = AudioBuffer.GetData()[BufferCursor];
	Output = OldestSlot;
	OldestSlot = Input;

	// Advance the cursor, wrapping at the end of the history.
	BufferCursor = (BufferCursor + 1) % AudioBuffer.Num();

	// Rather than re-summing the whole history on every call, apply only the difference between
	// the magnitude that entered the window and the magnitude that left it.
	// TODO: Rounding error accumulates with this incremental approach, so AccumulatedSum will
	// slowly drift; it should periodically be recomputed from the actual contents of AudioBuffer.
	const float MagnitudeDelta = FMath::Abs(Input) - FMath::Abs(Output);
	AccumulatedSum += MagnitudeDelta;

	return AccumulatedSum;
}
// Resizes the history window to NumSamples entries and clears all accumulated state.
// A request for zero samples is promoted to one, matching the constructor, because
// ProcessInput wraps its cursor with a modulo on the buffer length.
void FMovingAverager::SetNumSamples(uint32 NumSamples)
{
	FScopeLock ScopeLock(&ProcessCriticalSection);

	AudioBuffer.Reset();
	AudioBuffer.AddZeroed(FMath::Max<uint32>(NumSamples, 1));

	// The cursor may point past the end of a shrunken buffer, so rewind it along with the sum.
	BufferCursor = 0;
	AccumulatedSum = 0.0f;
}
// Creates a SIMD averager whose circular history covers NumSamples samples, stored as
// NumSamples / 4 four-wide vectors. NumSamples is expected to be a multiple of 4.
// The history always holds at least one vector so that ProcessAudio, which wraps its cursor
// with a modulo on the buffer length, never divides by zero or indexes an empty buffer.
FMovingVectorAverager::FMovingVectorAverager(uint32 NumSamples)
	: BufferCursor(0)
	, AccumulatedSum(VectorZero())
{
	checkf(NumSamples % 4 == 0, TEXT("NumSamples must be divisible by 4!"));

	// Clamp so that NumSamples < 4 cannot produce an empty history.
	const uint32 NumVectors = FMath::Max<uint32>(NumSamples / 4, 1);

	AudioBuffer.Reset();
	AudioBuffer.AddZeroed(NumVectors);
}
float FMovingVectorAverager::ProcessAudio(const VectorRegister4Float& Input, VectorRegister4Float& Output)
{
VectorRegister4Float* BufferPtr = AudioBuffer.GetData();
Output = BufferPtr[BufferCursor];
BufferPtr[BufferCursor] = Input;
BufferCursor = (BufferCursor + 1) % AudioBuffer.Num();
// Instead of summing our entire buffer every tick, we simply add the incoming sample amplitude, and subtract the outgoing amplitude.
// TODO: With this approach AccumulatedSum will start to slowly drift over time from accumulated rounding error. Every so often we will need to
// reset AccumulatedSum to the actual sum of AudioBuffer.
const VectorRegister4Float AbsInput = VectorAbs(Input);
const VectorRegister4Float AbsOutput = VectorAbs(Output);
const VectorRegister4Float TotalAccumulation = VectorSubtract(AbsInput, AbsOutput);
AccumulatedSum = VectorAdd(AccumulatedSum, TotalAccumulation);
alignas(16) float PartionedSums[4];
VectorStore(AccumulatedSum, PartionedSums);
return (PartionedSums[0] + PartionedSums[1] + PartionedSums[2] + PartionedSums[3]) / (AudioBuffer.Num() * 4);
}
// Builds a silence detector.
//   InOnsetThreshold         - amplitude that must be exceeded to start passing audio.
//   InReleaseThreshold       - amplitude below which output is released back to silence.
//   AttackDurationInSamples  - length, in samples, of the averaging window used for amplitude.
//   ReleaseDurationInSamples - time constant, in samples, of the release smoothing.
// ProcessBuffer updates the smoothed amplitude once per four-sample vector, so the release
// duration is converted to vector steps (duration / 4) before deriving the coefficient.
// The conversion is done in floating point: integer division would truncate the time constant
// (and collapse durations below 4 to zero). The duration is clamped to at least one sample so
// a negative value cannot yield a coefficient above 1.
FSilenceDetection::FSilenceDetection(float InOnsetThreshold, float InReleaseThreshold, int32 AttackDurationInSamples, int32 ReleaseDurationInSamples)
	: Averager(AttackDurationInSamples)
	, ReleaseTau(FMath::Exp(-1.0f / (static_cast<float>(FMath::Max<int32>(ReleaseDurationInSamples, 1)) / 4.0f)))
	, OnsetThreshold(InOnsetThreshold)
	, ReleaseThreshold(InReleaseThreshold)
	, CurrentAmplitude(0.0f)
	, bOnsetWasInLastBuffer(false)
{
	ensureMsgf(InOnsetThreshold > InReleaseThreshold, TEXT("The onset threshold should always be louder than the release threshold."));
}
// Runs one buffer of audio through the detector and returns how many samples were written
// to OutAudio.
//  - If the detector starts the call silent, input is only fed into the averager until the
//    amplitude exceeds OnsetThreshold; from the following vector on, the averager's delayed
//    output is packed to the front of OutAudio. Returns the number of samples packed.
//  - If the detector starts the call active, every delayed vector is written out and the
//    smoothed amplitude is tracked; if it ends below ReleaseThreshold the tail of the output
//    is faded to zero. Returns NumSamples.
// Audio is consumed four samples at a time.
// NOTE(review): this appears to assume NumSamples is a multiple of 4 and that the buffers are
// suitable for VectorLoad/VectorStore - confirm against callers.
int32 FSilenceDetection::ProcessBuffer(const float* InAudio, float* OutAudio, int32 NumSamples)
{
	const bool bStartedActive = bOnsetWasInLastBuffer || (CurrentAmplitude > ReleaseThreshold);

	if (!bStartedActive)
	{
		// Silent on entry: keep priming the averager and only start emitting its (delayed)
		// output once an onset has been seen.
		int32 NumSamplesWritten = 0;
		bool bOnsetDetected = false;
		float LatestAmplitude = 0.0f;

		for (int32 ReadIndex = 0; ReadIndex < NumSamples; ReadIndex += 4)
		{
			const VectorRegister4Float Incoming = VectorLoad(&InAudio[ReadIndex]);
			VectorRegister4Float Delayed;
			LatestAmplitude = Averager.ProcessAudio(Incoming, Delayed);

			if (!bOnsetDetected)
			{
				// Still waiting for the onset; the vector that trips the threshold is not itself emitted.
				bOnsetDetected = LatestAmplitude > OnsetThreshold;
				continue;
			}

			VectorStore(Delayed, &OutAudio[NumSamplesWritten]);
			NumSamplesWritten += 4;
		}

		CurrentAmplitude = LatestAmplitude;
		bOnsetWasInLastBuffer = bOnsetDetected;
		check(CurrentAmplitude < 100.0f);
		return NumSamplesWritten;
	}

	// Active on entry: audio keeps flowing for this entire buffer. Whether to go silent is
	// decided from the smoothed amplitude once the buffer has been processed.
	bOnsetWasInLastBuffer = false;

	for (int32 ReadIndex = 0; ReadIndex < NumSamples; ReadIndex += 4)
	{
		const VectorRegister4Float Incoming = VectorLoad(&InAudio[ReadIndex]);
		VectorRegister4Float Delayed;
		const float VectorAmplitude = Averager.ProcessAudio(Incoming, Delayed);

		// One-pole smoothing of the amplitude toward the newest measurement.
		CurrentAmplitude = ReleaseTau * (CurrentAmplitude - VectorAmplitude) + VectorAmplitude;
		VectorStore(Delayed, &OutAudio[ReadIndex]);
	}

	// Releasing back to silence: apply a short fade-out over the tail of this buffer.
	if (CurrentAmplitude < ReleaseThreshold)
	{
		constexpr int32 MaxFadeLength = 32;
		const int32 FadeLength = FMath::Min(NumSamples, MaxFadeLength);
		const int32 FadeStart = NumSamples - FadeLength;

		TArrayView<float> FadeRegion(&OutAudio[FadeStart], FadeLength);
		Audio::ArrayFade(FadeRegion, 1.0f, 0.0f);
	}

	return NumSamples;
}
void FSilenceDetection::SetThreshold(float InThreshold)
{
OnsetThreshold = InThreshold;
}
// Returns the amplitude value stored by the most recent ProcessBuffer call.
float FSilenceDetection::GetCurrentAmplitude()
{
	const float LatestAmplitude = CurrentAmplitude;
	return LatestAmplitude;
}
// Builds a slow automatic gain control.
//   InGainTarget   - numerator of the gain computation (gain = InGainTarget / estimated level).
//   InAdaptiveRate - number of amplitude readings held in the peak detector's window.
//   InGainMin/Max  - bounds the computed gain is clamped to.
// The previous gain starts at unity.
FSlowAdaptiveGainControl::FSlowAdaptiveGainControl(
	float InGainTarget,
	int32 InAdaptiveRate,
	float InGainMin /*= 0.5f*/,
	float InGainMax /*= 2.0f*/)
	: PeakDetector(InAdaptiveRate)
	, GainTarget(InGainTarget)
	, PreviousGain(1.0f)
	, GainMin(InGainMin)
	, GainMax(InGainMax)
{
}
float FSlowAdaptiveGainControl::ProcessAudio(float* InAudio, int32 NumSamples, float InAmplitude)
{
float PeakDetectorOutput = 0.0f; // unused
const float EstimatedPeak = PeakDetector.ProcessInput(InAmplitude, PeakDetectorOutput);
const float TargetGain = GetTargetGain(EstimatedPeak);
TArrayView<float> InAudioView(InAudio, NumSamples);
Audio::ArrayFade(InAudioView, PreviousGain, TargetGain);
PreviousGain = TargetGain;
return TargetGain;
}
// Resizes the peak detector's window to InAdaptiveRate readings by forwarding to
// FMovingAverager::SetNumSamples, which also clears the detector's accumulated sum.
void FSlowAdaptiveGainControl::SetAdaptiveRate(int32 InAdaptiveRate)
{
	this->PeakDetector.SetNumSamples(InAdaptiveRate);
}
// Computes the gain needed to bring InAmplitude to GainTarget, clamped to [GainMin, GainMax].
// An amplitude that is zero, or negative/near zero (the running sum that feeds this value can
// drift slightly from rounding error), is treated as silence and maps to GainMax. This avoids
// dividing by zero and stops a tiny negative amplitude from flipping the result to GainMin.
float FSlowAdaptiveGainControl::GetTargetGain(float InAmplitude)
{
	// Below this level the quotient is meaningless; saturate to the maximum gain instead.
	constexpr float MinMeasurableAmplitude = 1.e-8f;
	if (!(InAmplitude > MinMeasurableAmplitude))
	{
		return GainMax;
	}

	const float UnclampedGain = GainTarget / InAmplitude;
	return FMath::Clamp(UnclampedGain, GainMin, GainMax);
}
// Builds an adaptive filter of FilterLength taps. All frequency-domain buffers are sized to
// the next power of two at or above (2 * FilterLength - 1). The filter starts out with every
// real weight set to 1 and is installed into the convolver immediately.
// AudioCallbackSize is not used by this constructor.
FAdaptiveFilter_DEPRECATED::FAdaptiveFilter_DEPRECATED(int32 FilterLength, int32 AudioCallbackSize)
	: WindowSize(FilterLength)
	, CurrentStepsUntilConvergence(0)
{
	const int32 FFTSize = FFTIntrinsics::NextPowerOf2(FilterLength * 2 - 1);

	// Everything starts zeroed...
	WeightDeltas.InitZeroed(FFTSize);
	CurrentWeights.InitZeroed(FFTSize);
	InputFrequencies.InitZeroed(FFTSize);

	// ...then the real part of the current weights is raised to 1 for passthrough.
	float* RealWeights = CurrentWeights.Real.GetData();
	const int32 NumRealWeights = CurrentWeights.Real.Num();
	for (int32 WeightIndex = 0; WeightIndex < NumRealWeights; ++WeightIndex)
	{
		RealWeights[WeightIndex] = 1.0f;
	}

	Convolver.SetFilter(CurrentWeights, FilterLength);
}
// Advances the filter one step toward its target weights and reinstalls it in the convolver.
// Does nothing once the remaining step count has reached zero.
void FAdaptiveFilter_DEPRECATED::AdaptFilter()
{
	const bool bStillConverging = CurrentStepsUntilConvergence > 0;
	if (bStillConverging)
	{
		IncrementWeights();
		--CurrentStepsUntilConvergence;
		Convolver.SetFilter(CurrentWeights, WindowSize);
	}
}
void FAdaptiveFilter_DEPRECATED::SetWeightDeltas(const float* InWeightsReal, const float* InWeightsImag, int32 NumWeights, float InLearningRate)
{
// ProcessAudio::NumWeights needs to use the same
checkSlow(NumWeights == WeightDeltas.Real.Num());
// If NumWeights is not divisible by four, something is terribly wrong, since we are using an FFT with this.
checkSlow(NumWeights % 4 == 0);
const float* CurrentRealBuffer = CurrentWeights.Real.GetData();
const float* CurrentImagBuffer = CurrentWeights.Imag.GetData();
float* DeltaRealBuffer = WeightDeltas.Real.GetData();
float* DeltaImagBuffer = WeightDeltas.Imag.GetData();
// Our convergence rate is 1 / our NumConvergence steps.
const VectorRegister4Float ConvergenceRate = VectorSetFloat1(InLearningRate);
CurrentStepsUntilConvergence = FMath::CeilToInt(1.0f / InLearningRate);
for (int32 WeightIndex = 0; WeightIndex < NumWeights; WeightIndex+= 4)
{
VectorRegister4Float TargetReal = VectorLoad(&InWeightsReal[WeightIndex]);
VectorRegister4Float CurrentReal = VectorLoad(&CurrentRealBuffer[WeightIndex]);
const VectorRegister4Float DeltaReal = VectorMultiply(VectorSubtract(TargetReal, CurrentReal), ConvergenceRate);
VectorStore(DeltaReal, &DeltaRealBuffer[WeightIndex]);
VectorRegister4Float TargetImag = VectorLoad(&InWeightsImag[WeightIndex]);
VectorRegister4Float CurrentImag = VectorLoad(&CurrentImagBuffer[WeightIndex]);
const VectorRegister4Float DeltaImag = VectorMultiply(VectorSubtract(TargetImag, CurrentImag), ConvergenceRate);
VectorStore(DeltaImag, &DeltaImagBuffer[WeightIndex]);
}
}
void FAdaptiveFilter_DEPRECATED::IncrementWeights()
{
const int32 NumWeights = CurrentWeights.Real.Num();
float* CurrentRealBuffer = CurrentWeights.Real.GetData();
float* CurrentImagBuffer = CurrentWeights.Imag.GetData();
const float* DeltasRealBuffer = WeightDeltas.Real.GetData();
const float* DeltasImagBuffer = WeightDeltas.Imag.GetData();
for (int32 WeightIndex = 0; WeightIndex < NumWeights; WeightIndex += 4)
{
VectorRegister4Float DeltaReal = VectorLoad(&DeltasRealBuffer[WeightIndex]);
VectorRegister4Float CurrentReal = VectorLoad(&CurrentRealBuffer[WeightIndex]);
CurrentReal = VectorAdd(CurrentReal, DeltaReal);
VectorStore(CurrentReal, &CurrentRealBuffer[WeightIndex]);
VectorRegister4Float DeltaImag = VectorLoad(&DeltasImagBuffer[WeightIndex]);
VectorRegister4Float CurrentImag = VectorLoad(&CurrentImagBuffer[WeightIndex]);
CurrentImag = VectorAdd(CurrentImag, DeltaImag);
VectorStore(CurrentImag, &CurrentImagBuffer[WeightIndex]);
}
}
void FAdaptiveFilter_DEPRECATED::ProcessAudio(float* InAudio, int32 NumSamples)
{
AdaptFilter();
Convolver.ProcessAudio(InAudio, NumSamples);
}
void FAdaptiveFilter_DEPRECATED::SetWeights(const FrequencyBuffer& InFilterWeights, int32 FilterLength, float InLearningRate)
{
SetWeightDeltas(InFilterWeights.Real.GetData(), InFilterWeights.Imag.GetData(), InFilterWeights.Real.Num(), InLearningRate);
}
// No explicit setup is performed; all members are default-initialized.
FFDAPFilterComputer_DEPRECATED::FFDAPFilterComputer_DEPRECATED() = default;
void FFDAPFilterComputer_DEPRECATED::GenerateWeights(const float* IncomingSignal, int32 NumIncomingSamples, const float* OutgoingSignal, int32 NumOutgoingSamples, FrequencyBuffer& OutWeights)
{
int32 FFTSize = FFTIntrinsics::NextPowerOf2(NumIncomingSamples + NumOutgoingSamples - 1);
// zero pad inputs appropriately:
ZeroPaddedIncomingBuffer.Reset();
ZeroPaddedIncomingBuffer.AddZeroed(FFTSize);
FMemory::Memcpy(ZeroPaddedIncomingBuffer.GetData(), IncomingSignal, NumIncomingSamples * sizeof(float));
ZeroPaddedOutgoingBuffer.Reset();
ZeroPaddedOutgoingBuffer.AddZeroed(FFTSize);
FMemory::Memcpy(ZeroPaddedOutgoingBuffer.GetData(), OutgoingSignal, NumOutgoingSamples * sizeof(float));
int32 FilterLength = FMath::Max(NumIncomingSamples, NumOutgoingSamples);
PRAGMA_DISABLE_DEPRECATION_WARNINGS
CrossCorrelate(ZeroPaddedIncomingBuffer.GetData(), ZeroPaddedOutgoingBuffer.GetData(), FilterLength, FFTSize, IncomingFrequencies, OutgoingFrequencies, OutWeights);
PRAGMA_ENABLE_DEPRECATION_WARNINGS
// TODO: Perform linear phase conversion on OutWeights.
}
// Builds an echo canceller.
//   InConvergenceRate  - learning rate handed to the adaptive filter on every weight update.
//   CallbackSize       - forwarded to the adaptive filter's constructor.
//   InFilterLength     - block size, in samples, that audio is processed in; expected to be a
//                        power of two.
//   InFilterUpdateRate - initial value of the update countdown; ProcessAudio recomputes the
//                        filter on the block where the countdown is zero.
FAcousticEchoCancellation_DEPRECATED::FAcousticEchoCancellation_DEPRECATED(
	float InConvergenceRate,
	int32 CallbackSize,
	int32 InFilterLength,
	int32 InFilterUpdateRate /*= 1*/)
	: AdaptiveFilter(InFilterLength, CallbackSize)
	, ConvergenceRate(InConvergenceRate)
	, FilterLength(InFilterLength)
	, FilterUpdateRate(InFilterUpdateRate)
	, FitlerUpdateCounter(InFilterUpdateRate)
{
	checkSlow(FMath::IsPowerOfTwo(FilterLength));
}
void FAcousticEchoCancellation_DEPRECATED::ProcessAudio(float* InAudio, int32 NumSamples)
{
checkSlow(FMath::IsPowerOfTwo(NumSamples));
int32 NumBlocks = NumSamples / FilterLength;
for (int32 Index = 0; Index < NumBlocks; Index++)
{
float* BlockAudio = &InAudio[Index * FilterLength];
if (FitlerUpdateCounter == 0)
{
FilterComputerInput.Reset();
FilterComputerInput.AddZeroed(FilterLength);
// Since we are likely sampling audio buffers much less often than we are rendering them,
// we always skip ahead to the latest audio to compute our filter coefficients.
const bool bUseLatestAudio = true;
// Grab our audio from our taps:
int32 PopResult = PatchMixer.PopAudio(FilterComputerInput.GetData(), FilterLength, bUseLatestAudio);
if (PopResult != 0)
{
// Compute our new filter target and pass it to the adaptive filter:
FilterComputer.GenerateWeights(FilterComputerInput.GetData(), FilterLength, BlockAudio, FilterLength, FilterComputerOutput);
AdaptiveFilter.ProcessAudio(BlockAudio, NumSamples);
AdaptiveFilter.SetWeights(FilterComputerOutput, FilterLength, ConvergenceRate);
}
else
{
// If our taps were empty, don't both updating the filter.
AdaptiveFilter.ProcessAudio(BlockAudio, FilterLength);
}
FitlerUpdateCounter = FilterUpdateRate - 1;
}
else
{
AdaptiveFilter.ProcessAudio(BlockAudio, FilterLength);
FitlerUpdateCounter--;
}
}
}
// Registers a new reference-signal input on the patch mixer and returns its handle.
// Both arguments are forwarded unchanged to the mixer.
Audio::FPatchInput FAcousticEchoCancellation_DEPRECATED::AddNewSignalPatch(int32 ExpectedLatency, float Gain /*= 1.0f*/)
{
	return this->PatchMixer.AddNewInput(ExpectedLatency, Gain);
}
void FAcousticEchoCancellation_DEPRECATED::RemoveSignalPatch(const FPatchInput& Patch)
{
PatchMixer.RemovePatch(Patch);
}
}