You've already forked UnrealEngineUWP
mirror of
https://github.com/izzy2lost/UnrealEngineUWP.git
synced 2026-03-26 18:15:20 -07:00
#preflight 6274b51ade15651eeb4ffef5 #ROBOMERGE-AUTHOR: guillaume.abadie #ROBOMERGE-SOURCE: CL 20073343 via CL 20073349 via CL 20073355 #ROBOMERGE-BOT: UE5 (Release-Engine-Staging -> Main) (v943-19904690) [CL 20105784 by guillaume abadie in ue5-main branch]
1202 lines
35 KiB
Plaintext
1202 lines
35 KiB
Plaintext
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "../Common.ush"
|
|
#include "../Random.ush"
|
|
#include "../TextureSampling.ush"
|
|
#include "../FastMath.ush"
|
|
#include "../MonteCarlo.ush"
|
|
#include "../ScreenPass.ush"
|
|
#include "/Engine/Public/DualPixelVectorization.ush"
|
|
#include "/Engine/Public/WaveBroadcastIntrinsics.ush"
|
|
|
|
|
|
//------------------------------------------------------- RECOMPILE HASH
|
|
// 88217342-DCE7-451F-8670-0A14BC8B6109
|
|
|
|
|
|
//------------------------------------------------------- COMPILER CONFIG
|
|
|
|
// Promote vector truncation warnings to errors.
|
|
#pragma warning(error: 3206)
|
|
|
|
|
|
//------------------------------------------------------- CONFIG
|
|
|
|
// Gates the declaration of the DebugOutput resource.
#define DEBUG_OUTPUT 0

// Use 16bit types when the platform supports them (the trailing "&& 1" is a manual switch).
#if PLATFORM_SUPPORTS_REAL_TYPES && 1
	#define CONFIG_COMPILE_FP16 1
#else
	#define CONFIG_COMPILE_FP16 0
#endif

// Enabled when the platform supports wave broadcast (the trailing "&& 1" is a manual switch).
#if PLATFORM_SUPPORTS_WAVE_BROADCAST && 1
	#define CONFIG_BUTTERFLY_KERNEL 1
#else
	#define CONFIG_BUTTERFLY_KERNEL 0
#endif

#define CONFIG_SCENE_COLOR_OVERFLOW 1

// Mirrors POST_PROCESS_ALPHA; selects the 4 channel color types (tsr_halfC) when set.
#if POST_PROCESS_ALPHA
	#define CONFIG_SCENE_COLOR_APLHA 1
#else
	#define CONFIG_SCENE_COLOR_APLHA 0
#endif

// Stochastic quantization is only enabled when the scene color has no alpha channel.
#define CONFIG_ENABLE_STOCASTIC_QUANTIZATION (!CONFIG_SCENE_COLOR_APLHA)
|
|
|
|
//------------------------------------------------------- CONSTANTS
|
|
|
|
/* Maximum number of samples. */
#define MAX_SAMPLE_COUNT 8

#define MAX_FALLBACK_SAMPLE_COUNT 20

// Spatial anti-aliasing encoding settings.
#define SPATIAL_ANTI_ALIASING_OFFSET_RANGE 0.5
#define SPATIAL_ANTI_ALIASING_OFFSET_BITDEPTH_PER_CHANNEL 4

#define COMPRESS_PREV_USE_COUNT 1
#if COMPRESS_PREV_USE_COUNT
	// COMPRESS_PREV_USE_COUNT compresses the use count to 8 bits, so the sum over a 2x2 must not overflow these 8 bits.
	#define PREV_USE_COUNT_QUANTIZATION 63
#else
	#define PREV_USE_COUNT_QUANTIZATION 255
#endif

#define PARALLAX_REJECTION_MASK_THRESHOLD 0.5

#define MAX_PARALLAX_FACTOR 8.0

// Size and payload size of the subpixel grid in the history.
#define SUB_PIXEL_GRID_SIZE 2
#define SUB_PIXEL_COUNT (SUB_PIXEL_GRID_SIZE * SUB_PIXEL_GRID_SIZE)

#define SUB_PIXEL_BIT_COUNT 4
#define SUB_PIXEL_BIT_MASK ((1 << SUB_PIXEL_BIT_COUNT) - 1)

#define SUB_PIXEL_PARALLAX_FACTOR_BIT_COUNT 4
#define SUB_PIXEL_PARALLAX_FACTOR_BIT_OFFSET 0
#define SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK ((1 << SUB_PIXEL_PARALLAX_FACTOR_BIT_COUNT) - 1)

// Minimum number of samples of each texel when reprojecting the high frequency history.
// A value that is too low lets pixels in areas of low sample count dominate the reprojection,
// causing input resolution aliasing on texture details.
#define MIN_REPROJECTION_SAMPLE_COUNT 2.0
|
|
|
|
// Minimum weight of a pixel in history: MIN_REPROJECTION_SAMPLE_COUNT / MAX_SAMPLE_COUNT, rounded up to the next 1/255 step.
// The whole expansion is parenthesized so the macro can be used safely inside a larger expression
// (e.g. as a divisor or next to a higher precedence operator).
#define MIN_IMPLICIT_HISTORY_WEIGHT (ceil(tsr_half(255.0) * rcp(tsr_half(MAX_SAMPLE_COUNT)) * tsr_half(MIN_REPROJECTION_SAMPLE_COUNT)) * rcp(tsr_half(255.0)))
|
|
|
|
|
|
// Offsets of a 3x3 neighborhood, in row major order.
//
// K = Center of the nearest input pixel.
// O = Center of the output pixel.
//
//          |           |
//     0    |     1     |    2
//          |           |
//          |           |
//  --------+-----------+--------
//          |           |
//          | O         |
//     3    |     K     |    5
//          |           |
//          |           |
//  --------+-----------+--------
//          |           |
//          |           |
//     6    |     7     |    8
//          |           |
//
static const int2 kOffsets3x3[9] =
{
	int2(-1, -1), int2(0, -1), int2(1, -1),
	int2(-1,  0), int2(0,  0), int2(1,  0), // middle entry is K
	int2(-1,  1), int2(0,  1), int2(1,  1),
};
|
|
|
|
// Offsets of a 2x2 neighborhood, in row major order.
//
// T = Center of the nearest top left input pixel.
// O = Center of the output pixel.
//
//          |
//     T    |    .
//          |
//       O  |
//  --------+--------
//          |
//          |
//     .    |    .
//          |
static const int2 Offsets2x2[4] =
{
	int2(0, 0), int2(1, 0), // first entry is T
	int2(0, 1), int2(1, 1),
};
|
|
|
|
// Indexes into kOffsets3x3 covering the whole 3x3 square, center (4) first.
static const uint kSquareIndexes3x3[9] =
{
	4,
	0, 1, 2, 3,
	8, 7, 6, 5,
};

// Indexes into kOffsets3x3 forming a plus (+) shape, center (4) first.
static const uint kPlusIndexes3x3[5] =
{
	4,
	1, 3, 7, 5,
};
|
|
|
|
|
|
// Scalar, vector and dual-pixel (Nx2) type aliases. They map to 16bit types when
// CONFIG_COMPILE_FP16 is set, and to their 32bit counterparts otherwise.
#if CONFIG_COMPILE_FP16
	#define tsr_half      half
	#define tsr_half2     half2
	#define tsr_half3     half3
	#define tsr_half4     half4

	#define tsr_short     int16_t
	#define tsr_short2    int16_t2
	#define tsr_short3    int16_t3
	#define tsr_short4    int16_t4

	#define tsr_ushort    uint16_t
	#define tsr_ushort2   uint16_t2
	#define tsr_ushort3   uint16_t3
	#define tsr_ushort4   uint16_t4

	#define tsr_half2x2   half2x2
	#define tsr_half3x2   half3x2
	#define tsr_half4x2   half4x2

	#define tsr_short2x2  int16_t2x2
	#define tsr_short3x2  int16_t3x2
	#define tsr_short4x2  int16_t4x2

	#define tsr_ushort2x2 uint16_t2x2
	#define tsr_ushort3x2 uint16_t3x2
	#define tsr_ushort4x2 uint16_t4x2

#else
	#define tsr_half      float
	#define tsr_half2     float2
	#define tsr_half3     float3
	#define tsr_half4     float4

	#define tsr_short     int
	#define tsr_short2    int2
	#define tsr_short3    int3
	#define tsr_short4    int4

	#define tsr_ushort    uint
	#define tsr_ushort2   uint2
	#define tsr_ushort3   uint3
	#define tsr_ushort4   uint4

	#define tsr_half2x2   float2x2
	#define tsr_half3x2   float3x2
	#define tsr_half4x2   float4x2

	#define tsr_short2x2  int2x2
	#define tsr_short3x2  int3x2
	#define tsr_short4x2  int4x2

	#define tsr_ushort2x2 uint2x2
	#define tsr_ushort3x2 uint3x2
	#define tsr_ushort4x2 uint4x2

#endif
|
|
|
|
// Scene color type: 4 channels when the scene color carries alpha, 3 channels otherwise.
#if CONFIG_SCENE_COLOR_APLHA
	#define tsr_halfC   tsr_half4
	#define tsr_halfCx2 tsr_half4x2

#else
	#define tsr_halfC   tsr_half3
	#define tsr_halfCx2 tsr_half3x2

#endif
|
|
|
|
|
|
// Packed details of all the subpixels of one texel, and the payload of a single subpixel.
#define tsr_subpixel_details  uint
#define tsr_subpixel_details2 uint2
#define tsr_subpixel_payload  uint // TODO: uint16_t
#define tsr_subpixel_payload2 uint2 // TODO: uint16_t
|
|
|
|
// Largest encodable normal number in a half used on console.
static const tsr_half LargestNormalNumber = tsr_half(65504.0);

// Largest encodable scene color values that do not overflow in YCoCg with halfs on console.
// The RGB limits are a quarter of the YCoCg limits.
static const tsr_half3 LargestSceneColorRGB    = (LargestNormalNumber * tsr_half(0.25)).xxx;
static const tsr_half4 LargestSceneColorRGBA   = (LargestNormalNumber * tsr_half(0.25)).xxxx;
static const tsr_half3 LargestSceneColorYCoCg  = LargestNormalNumber.xxx;
static const tsr_half4 LargestSceneColorYCoCgA = LargestNormalNumber.xxxx;
|
|
|
|
|
|
//------------------------------------------------------- PARAMETERS
|
|
|
|
|
|
// InputInfo_*
float2 InputInfo_Extent;
float2 InputInfo_ExtentInverse;
float2 InputInfo_ScreenPosToViewportScale;
float2 InputInfo_ScreenPosToViewportBias;
uint2  InputInfo_ViewportMin;
uint2  InputInfo_ViewportMax;
float2 InputInfo_ViewportSize;
float2 InputInfo_ViewportSizeInverse;
float2 InputInfo_UVViewportMin;
float2 InputInfo_UVViewportMax;
float2 InputInfo_UVViewportSize;
float2 InputInfo_UVViewportSizeInverse;
float2 InputInfo_UVViewportBilinearMin;
float2 InputInfo_UVViewportBilinearMax;
float2 InputJitter;
int2   InputPixelPosMin;
int2   InputPixelPosMax;
FScreenTransform InputPixelPosToScreenPos;
float2 ScreenVelocityToInputPixelVelocity;
float2 InputPixelVelocityToScreenVelocity;

// LowFrequencyInfo_*
float2 LowFrequencyInfo_Extent;
float2 LowFrequencyInfo_ExtentInverse;
uint2  LowFrequencyInfo_ViewportMin;
uint2  LowFrequencyInfo_ViewportMax;
float2 LowFrequencyInfo_ViewportSize;
float2 LowFrequencyInfo_ViewportSizeInverse;
float2 LowFrequencyInfo_UVViewportBilinearMin;
float2 LowFrequencyInfo_UVViewportBilinearMax;

// RejectionInfo_*
float2 RejectionInfo_Extent;
float2 RejectionInfo_ExtentInverse;
uint2  RejectionInfo_ViewportMin;
uint2  RejectionInfo_ViewportMax;
float2 RejectionInfo_ViewportSize;
float2 RejectionInfo_ViewportSizeInverse;
float2 RejectionInfo_UVViewportBilinearMin;
float2 RejectionInfo_UVViewportBilinearMax;

// HistoryInfo_*
float2 HistoryInfo_Extent;
float2 HistoryInfo_ExtentInverse;
uint2  HistoryInfo_ViewportMin;
uint2  HistoryInfo_ViewportMax;
float2 HistoryInfo_ViewportSize;
float2 HistoryInfo_ViewportSizeInverse;
float2 HistoryInfo_UVViewportBilinearMin;
float2 HistoryInfo_UVViewportBilinearMax;

// FTSRPrevHistoryParameters
float2 PrevHistoryInfo_Extent;
float2 PrevHistoryInfo_ExtentInverse;
float2 PrevHistoryInfo_ScreenPosToViewportScale;
float2 PrevHistoryInfo_ScreenPosToViewportBias;
uint2  PrevHistoryInfo_ViewportMin;
uint2  PrevHistoryInfo_ViewportMax;
float2 PrevHistoryInfo_ViewportSize;
float2 PrevHistoryInfo_ViewportSizeInverse;
float2 PrevHistoryInfo_UVViewportMin;
float2 PrevHistoryInfo_UVViewportMax;
float2 PrevHistoryInfo_UVViewportSize;
float2 PrevHistoryInfo_UVViewportSizeInverse;
float2 PrevHistoryInfo_UVViewportBilinearMin;
float2 PrevHistoryInfo_UVViewportBilinearMax;
FScreenTransform ScreenPosToPrevHistoryBufferUV;
FScreenTransform ScreenPosToPrevSubpixelDetails;
float2 PrevSubpixelDetailsExtent;
float  HistoryPreExposureCorrection;

// Non zero on a camera cut (see IsOffScreen()).
uint bCameraCut;

// Only declared when DEBUG_OUTPUT is enabled.
#if DEBUG_OUTPUT
	RWTexture2DArray<float4> DebugOutput;
#endif
|
|
|
|
|
|
//------------------------------------------------------- FUNCTIONS
|
|
|
|
// Applies a screen transform to a dual-pixel vector of 2D positions: scales by AToB.xy, then offsets by AToB.zw.
CALL_SITE_DEBUGLOC
float2x2 ApplyScreenTransform(float2x2 PInA, FScreenTransform AToB)
{
	float2x2 ScaledP = dpv_mul(PInA, AToB.xy);
	float2x2 PInB = dpv_add(ScaledP, AToB.zw);
	return PInB;
}
|
|
|
|
#if PLATFORM_SUPPORTS_REAL_TYPES
|
|
|
|
// 16bit overload: the transform's scale (AToB.xy) and bias (AToB.zw) are converted to half2 before being applied.
CALL_SITE_DEBUGLOC
half2x2 ApplyScreenTransform(half2x2 PInA, FScreenTransform AToB)
{
	half2x2 ScaledP = dpv_mul(PInA, half2(AToB.xy));
	half2x2 PInB = dpv_add(ScaledP, half2(AToB.zw));
	return PInB;
}
|
|
|
|
#endif
|
|
|
|
|
|
/**
 * Computes the index of the current wave within the group, read from the first lane
 * (presumably so it lives in an SGPR), to then recompute the GroupThreadIndex later.
 * Returns 0 when a single wave covers the whole group, or when the compiler does not
 * support wave operations.
 */
CALL_SITE_DEBUGLOC
uint GetGroupWaveIndex(uint GroupThreadIndex, uint GroupSize)
#if COMPILER_SUPPORTS_WAVE_ONCE
{
	const uint LaneCountPerWave = WaveGetLaneCount();

	uint GroupWaveIndex = 0;
	if (LaneCountPerWave < GroupSize)
	{
		// More than one wave per group: every lane of a wave shares the same quotient.
		GroupWaveIndex = WaveReadLaneFirst(GroupThreadIndex / LaneCountPerWave);
	}

	return GroupWaveIndex;
}
#else
{
	return 0;
}
#endif
|
|
|
|
/**
 * Recomputes the GroupThreadIndex from the lane index and the group wave index when
 * possible, to reduce VGPR pressure. Falls back to the provided GroupThreadIndex when
 * the compiler does not support wave operations.
 */
CALL_SITE_DEBUGLOC
uint GetGroupThreadIndex(uint GroupThreadIndex, uint GroupWaveIndex)
#if COMPILER_SUPPORTS_WAVE_ONCE
{
	// Index of the first thread of this wave; shared to save SALU.
	const uint FirstThreadIndexOfWave = WaveGetLaneCount() * GroupWaveIndex;

	// The per lane result is kept inside ISOLATE so that it is not shared.
	uint RebuiltGroupThreadIndex;
	ISOLATE
	{
		RebuiltGroupThreadIndex = FirstThreadIndexOfWave + WaveGetLaneIndex();
	}

	return RebuiltGroupThreadIndex;
}
#else
{
	return GroupThreadIndex;
}
#endif
|
|
|
|
// Clamps a kernel offset so that KernelCenterPixelPos + Offset stays within [MinPixelPos, MaxPixelPos].
// The clamped offset can be shared across multiple samples.
// Dual-pixel version: row [0] follows OffsetDirection.x and row [1] follows OffsetDirection.y.
// A row whose direction is 0 is returned as 0.
CALL_SITE_DEBUGLOC
tsr_short2x2 ClampPixelOffset(
	tsr_short2x2 KernelCenterPixelPos,
	tsr_short2x2 Offset,
	const tsr_short2 OffsetDirection,
	int2 MinPixelPos, int2 MaxPixelPos)
{
	// Lowest and highest offsets that keep the sample within the bounds.
	const tsr_short2x2 LowestOffset = dpv_sub(tsr_short2(MinPixelPos), KernelCenterPixelPos);
	const tsr_short2x2 HighestOffset = dpv_sub(tsr_short2(MaxPixelPos), KernelCenterPixelPos);

	tsr_short2x2 Result = 0;

	// Only clamp against the bound the compile time known direction can cross.
	// This turns into only 1 v_pk_max_i16 and v_pk_min_i16 for a 3x3 kernel.
	if (OffsetDirection.x < 0)
	{
		Result[0] = max(Offset[0], LowestOffset[0]);
	}
	else if (OffsetDirection.x > 0)
	{
		Result[0] = min(Offset[0], HighestOffset[0]);
	}

	if (OffsetDirection.y < 0)
	{
		Result[1] = max(Offset[1], LowestOffset[1]);
	}
	else if (OffsetDirection.y > 0)
	{
		Result[1] = min(Offset[1], HighestOffset[1]);
	}

	return Result;
}
|
|
|
|
// Single pixel version: clamps a kernel offset so that KernelCenterPixelPos + Offset stays
// within [MinPixelPos, MaxPixelPos]. A component whose OffsetDirection is 0 is returned as 0.
CALL_SITE_DEBUGLOC
tsr_short2 ClampPixelOffset(
	tsr_short2 KernelCenterPixelPos,
	tsr_short2 Offset,
	const tsr_short2 OffsetDirection,
	int2 MinPixelPos, int2 MaxPixelPos)
{
	// Lowest and highest offsets that keep the sample within the bounds.
	const tsr_short2 LowestOffset = tsr_short2(MinPixelPos) - KernelCenterPixelPos;
	const tsr_short2 HighestOffset = tsr_short2(MaxPixelPos) - KernelCenterPixelPos;

	tsr_short2 Result = 0;

	// Only clamp against the bound the compile time known direction can cross.
	// This turns into only 1 v_pk_max_i16 and v_pk_min_i16 for a 3x3 kernel.
	if (OffsetDirection.x < 0)
	{
		Result.x = max(Offset.x, LowestOffset.x);
	}
	else if (OffsetDirection.x > 0)
	{
		Result.x = min(Offset.x, HighestOffset.x);
	}

	if (OffsetDirection.y < 0)
	{
		Result.y = max(Offset.y, LowestOffset.y);
	}
	else if (OffsetDirection.y > 0)
	{
		Result.y = min(Offset.y, HighestOffset.y);
	}

	return Result;
}
|
|
|
|
// Clamps both pixel positions of a dual-pixel vector to [MinPixelPos, MaxPixelPos].
CALL_SITE_DEBUGLOC
tsr_short2x2 ClampPixelOffset(
	tsr_short2x2 SamplePixelPos,
	int2 MinPixelPos, int2 MaxPixelPos)
{
	const tsr_short2 LowerBound = tsr_short2(tsr_short(MinPixelPos.x), tsr_short(MinPixelPos.y));
	const tsr_short2 UpperBound = tsr_short2(tsr_short(MaxPixelPos.x), tsr_short(MaxPixelPos.y));

	tsr_short2 PixelPos0 = dpv_lo(SamplePixelPos);
	tsr_short2 PixelPos1 = dpv_hi(SamplePixelPos);

	// min() is applied before max(), so the lower bound wins should the bounds cross.
	PixelPos0 = max(min(PixelPos0, UpperBound), LowerBound);
	PixelPos1 = max(min(PixelPos1, UpperBound), LowerBound);

	return dpv_interleave_registers(PixelPos0, PixelPos1);
}
|
|
|
|
// Clamps a pixel position to [MinPixelPos, MaxPixelPos].
CALL_SITE_DEBUGLOC
tsr_short2 ClampPixelOffset(
	tsr_short2 SamplePixelPos,
	int2 MinPixelPos, int2 MaxPixelPos)
{
	const tsr_short2 LowerBound = tsr_short2(tsr_short(MinPixelPos.x), tsr_short(MinPixelPos.y));
	const tsr_short2 UpperBound = tsr_short2(tsr_short(MaxPixelPos.x), tsr_short(MaxPixelPos.y));

	// min() is applied before max(), so the lower bound wins should the bounds cross.
	return max(min(SamplePixelPos, UpperBound), LowerBound);
}
|
|
|
|
// Adds Offset to KernelCenterPixelPos for both pixels of a dual-pixel vector, and clamps the
// result against the only bound that the compile time known OffsetDirection can cross.
CALL_SITE_DEBUGLOC
tsr_short2x2 AddAndClampPixelOffset(
	tsr_short2x2 KernelCenterPixelPos,
	tsr_short2x2 Offset,
	const tsr_short2 OffsetDirection,
	int2 MinPixelPos, int2 MaxPixelPos)
{
	const tsr_short2x2 UnclampedPixelPos = KernelCenterPixelPos + Offset;

	tsr_short2 PixelPos0 = dpv_lo(UnclampedPixelPos);
	tsr_short2 PixelPos1 = dpv_hi(UnclampedPixelPos);

	// This turns into only 1 v_pk_max_i16 and v_pk_min_i16 for a 3x3 kernel.
	if (OffsetDirection.x > 0)
	{
		const tsr_short MaxX = tsr_short(MaxPixelPos.x);
		PixelPos0.x = min(PixelPos0.x, MaxX);
		PixelPos1.x = min(PixelPos1.x, MaxX);
	}
	else if (OffsetDirection.x < 0)
	{
		const tsr_short MinX = tsr_short(MinPixelPos.x);
		PixelPos0.x = max(PixelPos0.x, MinX);
		PixelPos1.x = max(PixelPos1.x, MinX);
	}

	if (OffsetDirection.y > 0)
	{
		const tsr_short MaxY = tsr_short(MaxPixelPos.y);
		PixelPos0.y = min(PixelPos0.y, MaxY);
		PixelPos1.y = min(PixelPos1.y, MaxY);
	}
	else if (OffsetDirection.y < 0)
	{
		const tsr_short MinY = tsr_short(MinPixelPos.y);
		PixelPos0.y = max(PixelPos0.y, MinY);
		PixelPos1.y = max(PixelPos1.y, MinY);
	}

	return dpv_interleave_registers(PixelPos0, PixelPos1);
}
|
|
|
|
// Adds Offset to KernelCenterPixelPos, and clamps the result against the only bound that the
// compile time known OffsetDirection can cross.
CALL_SITE_DEBUGLOC
tsr_short2 AddAndClampPixelOffset(
	tsr_short2 KernelCenterPixelPos,
	tsr_short2 Offset,
	const tsr_short2 OffsetDirection,
	int2 MinPixelPos, int2 MaxPixelPos)
{
	tsr_short2 PixelPos = KernelCenterPixelPos + Offset;

	// This turns into only 1 v_pk_max_i16 and v_pk_min_i16 for a 3x3 kernel.
	if (OffsetDirection.x < 0)
	{
		PixelPos.x = max(PixelPos.x, tsr_short(MinPixelPos.x));
	}
	else if (OffsetDirection.x > 0)
	{
		PixelPos.x = min(PixelPos.x, tsr_short(MaxPixelPos.x));
	}

	if (OffsetDirection.y < 0)
	{
		PixelPos.y = max(PixelPos.y, tsr_short(MinPixelPos.y));
	}
	else if (OffsetDirection.y > 0)
	{
		PixelPos.y = min(PixelPos.y, tsr_short(MaxPixelPos.y));
	}

	return PixelPos;
}
|
|
|
|
// Dual-pixel version. For each of the two pixels, sets the x coordinate to all ones (~0) when the
// pixel position lies at or beyond ViewportMax on either axis; otherwise returns it unchanged.
CALL_SITE_DEBUGLOC
tsr_short2x2 InvalidateOutputPixelPos(tsr_short2x2 PixelPos, uint2 ViewportMax)
#if 1
{
	// (ViewportMax - 1) - PixelPos goes negative on an axis once the pixel is past the viewport.
	tsr_short2x2 Subtract = dpv_sub(tsr_short2(ViewportMax - 1), PixelPos);

	// OR rows [0] and [1] so the sign bit is set if either of them is negative.
	tsr_ushort2 Override = tsr_ushort2(Subtract[0] | Subtract[1]);

	// Move the sign bit down to bit 0 and negate: 0 stays 0, 1 becomes all ones.
	#if CONFIG_COMPILE_FP16
		PixelPos[0] |= -tsr_short2((Override & uint16_t(0x8000)) >> 15);
	#else
		PixelPos[0] |= -tsr_short2((Override & uint(0x80000000)) >> 31);
	#endif

	return PixelPos;
}
#else
{
	// Reference implementation (disabled). The previous version of this branch was copied from the
	// single pixel overload (PixelPos.x / all(PixelPos < ViewportMax) on a 2x2) and did not apply
	// to the dual-pixel type; each pixel is now tested independently.
	tsr_short2 PixelPos0 = dpv_lo(PixelPos);
	tsr_short2 PixelPos1 = dpv_hi(PixelPos);

	bool bIsValidPixel0 = all(PixelPos0 < ViewportMax);
	bool bIsValidPixel1 = all(PixelPos1 < ViewportMax);

	PixelPos0.x = bIsValidPixel0 ? PixelPos0.x : ~tsr_short(0);
	PixelPos1.x = bIsValidPixel1 ? PixelPos1.x : ~tsr_short(0);

	return dpv_interleave_registers(PixelPos0, PixelPos1);
}
#endif
|
|
|
|
// Sets PixelPos.x to all ones (~0) when the pixel position lies at or beyond ViewportMax on
// either axis; otherwise returns PixelPos unchanged.
CALL_SITE_DEBUGLOC
tsr_short2 InvalidateOutputPixelPos(tsr_short2 PixelPos, uint2 ViewportMax)
#if 1
{
	// (ViewportMax - 1) - PixelPos goes negative on an axis once the pixel is past the viewport.
	const tsr_short2 DistanceToLastPixel = tsr_short2(ViewportMax - 1) - PixelPos;

	// The sign bit is set if either axis is negative.
	const tsr_ushort SignBits = tsr_ushort(DistanceToLastPixel.x | DistanceToLastPixel.y);

	// Move the sign bit down to bit 0 and negate: 0 stays 0, 1 becomes all ones.
	#if CONFIG_COMPILE_FP16
		const tsr_short InvalidMask = -tsr_short((SignBits & uint16_t(0x8000)) >> 15);
	#else
		const tsr_short InvalidMask = -tsr_short((SignBits & uint(0x80000000)) >> 31);
	#endif

	PixelPos.x |= InvalidMask;
	return PixelPos;
}
#else
{
	// Reference implementation (disabled).
	const bool bIsInsideViewport = all(PixelPos < ViewportMax);
	PixelPos.x = bIsInsideViewport ? PixelPos.x : ~tsr_short(0);
	return PixelPos;
}
#endif
|
|
|
|
// Maps a linear group thread index to a 2D coordinate in an 8x8 tile, such that 4 consecutive
// thread indexes cover a 2x2 quad: bit 0 -> x bit 0, bit 1 -> y bit 0, bits 2-3 -> x bits 1-2,
// bits 4-5 -> y bits 1-2.
CALL_SITE_DEBUGLOC
tsr_ushort2 Map8x8Tile2x2Lane(uint GroupThreadIndex)
{
	const tsr_ushort2 ThreadIndex = tsr_ushort(GroupThreadIndex).xx;

	// Coordinate within the 2x2 quad.
	const tsr_ushort2 CoordInQuad = ThreadIndex >> tsr_ushort2(0, 1) & tsr_ushort2(0x01, 0x01);

	// Coordinate of the quad, already multiplied by 2.
	const tsr_ushort2 QuadOrigin = (ThreadIndex >> tsr_ushort2(2 - 1, 4 - 1)) & tsr_ushort2(0x03 << 1, 0x03 << 1);

	return CoordInQuad | QuadOrigin;
}
|
|
|
|
// Maps a linear group thread index to a 2D coordinate in a 16x16 tile, such that 4 consecutive
// thread indexes cover a 2x2 quad: bit 0 -> x bit 0, bit 1 -> y bit 0, bits 2-4 -> x bits 1-3,
// bits 5-7 -> y bits 1-3.
CALL_SITE_DEBUGLOC
tsr_ushort2 Map16x16Tile2x2Lane(uint GroupThreadIndex)
{
	const tsr_ushort ThreadIndex = tsr_ushort(GroupThreadIndex);

	// Coordinate within the 2x2 quad.
	const tsr_ushort QuadX = (ThreadIndex >> tsr_ushort(0)) & tsr_ushort(0x01);
	const tsr_ushort QuadY = (ThreadIndex >> tsr_ushort(1)) & tsr_ushort(0x01);

	// Coordinate of the quad, already multiplied by 2.
	const tsr_ushort OriginX = (ThreadIndex >> tsr_ushort(2 - 1)) & tsr_ushort(0x07 << 1);
	const tsr_ushort OriginY = (ThreadIndex >> tsr_ushort(5 - 1)) & tsr_ushort(0x07 << 1);

	tsr_ushort2 GroupId;
	GroupId.x = QuadX | OriginX;
	GroupId.y = QuadY | OriginY;
	return GroupId;
}
|
|
|
|
// Reciprocal that returns 0 instead of dividing by zero. Note that any x <= 0 returns 0.
CALL_SITE_DEBUGLOC
float SafeRcp(float x)
{
	const bool bIsPositive = x > 0.0;
	return bIsPositive ? rcp(x) : 0.0;
}
|
|
|
|
// Per component SafeRcp().
CALL_SITE_DEBUGLOC
float2 SafeRcp(float2 x)
{
	float2 Result;
	Result.x = SafeRcp(x.x);
	Result.y = SafeRcp(x.y);
	return Result;
}
|
|
|
|
#if CONFIG_COMPILE_FP16
|
|
|
|
// Branchless 16bit SafeRcp(): the reciprocal is capped to MaxHalfFloat, then multiplied by a
// mask that is 0 when x = 0.0 (MaxHalfFloat * 0.0 = 0.0).
CALL_SITE_DEBUGLOC
half SafeRcp(half x)
{
	// 0x7C00 is the bit pattern of +infinity in fp16.
	const half Infinity = asfloat16(uint16_t(0x7C00));

	half CappedRcp = min(rcp(x), half(MaxHalfFloat));

	// NOTE(review): presumably relies on saturate() mapping the NaN of 0 * inf to 0 - confirm.
	half NonNullMask = saturate(x * Infinity);

	return CappedRcp * NonNullMask;
}
|
|
|
|
// Branchless 16bit SafeRcp() on two components: the reciprocal is capped to MaxHalfFloat, then
// multiplied by a mask that is 0 when x = 0.0 (MaxHalfFloat * 0.0 = 0.0).
CALL_SITE_DEBUGLOC
half2 SafeRcp(half2 x)
{
	// 0x7C00 is the bit pattern of +infinity in fp16.
	const half Infinity = asfloat16(uint16_t(0x7C00));

	half2 CappedRcp = min(rcp(x), half(MaxHalfFloat));

	// NOTE(review): presumably relies on saturate() mapping the NaN of 0 * inf to 0 - confirm.
	half2 NonNullMask = saturate(x * Infinity);

	return CappedRcp * NonNullMask;
}
|
|
|
|
#endif
|
|
|
|
// Ramps from 0 at HistoryValidity = 1 / MAX_SAMPLE_COUNT up to 1, with a slope of 6.
CALL_SITE_DEBUGLOC
tsr_half2 ComputeLowFrequencyContribution(tsr_half2 HistoryValidity)
{
	const tsr_half RampStart = tsr_half(1.0 * rcp(MAX_SAMPLE_COUNT));
	const tsr_half RampSlope = tsr_half(6.0);

	return saturate((HistoryValidity - RampStart) * RampSlope);
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half ComputeLowFrequencyContribution(tsr_half HistoryValidity)
{
	tsr_half2 Contribution = ComputeLowFrequencyContribution(dpv_interleave_mono_registers(HistoryValidity));
	return dpv_lo(Contribution);
}
|
|
|
|
// Converts two RGB colors to YCoCg. The output is 4 times larger than the conventional YCoCg:
//   Y  = R + 2G + B
//   Co = 2 (R - B)
//   Cg = 2G - R - B
CALL_SITE_DEBUGLOC
tsr_half3x2 RGBToYCoCg(tsr_half3x2 RGB)
{
	const tsr_half2 Red   = RGB[0];
	const tsr_half2 Green = RGB[1];
	const tsr_half2 Blue  = RGB[2];

	const tsr_half2 RedPlusBlue = Red + Blue;

	const tsr_half2 Y  = 2.0 * Green + RedPlusBlue;
	const tsr_half2 Co = 2.0 * (Red - Blue);
	const tsr_half2 Cg = 2.0 * Green - RedPlusBlue;

	return tsr_half3x2(Y, Co, Cg);
}
|
|
|
|
// Converts two YCoCg colors back to RGB. Every channel is first scaled by 0.25.
CALL_SITE_DEBUGLOC
tsr_half3x2 YCoCgToRGB(tsr_half3x2 YCoCg)
{
	const tsr_half Quarter = tsr_half(0.25);

	const tsr_half2 Y  = YCoCg[0] * Quarter;
	const tsr_half2 Co = YCoCg[1] * Quarter;
	const tsr_half2 Cg = YCoCg[2] * Quarter;

	return tsr_half3x2(
		Y + Co - Cg,
		Y + Cg,
		Y - Co - Cg);
}
|
|
|
|
// RGBA version: converts the color channels and passes alpha through unchanged.
CALL_SITE_DEBUGLOC
tsr_half4x2 RGBToYCoCg(tsr_half4x2 RGBA)
{
	tsr_half3x2 RGB = tsr_half3x2(RGBA[0], RGBA[1], RGBA[2]);
	tsr_half3x2 YCoCg = RGBToYCoCg(RGB);
	return tsr_half4x2(YCoCg, RGBA[3]);
}
|
|
|
|
// YCoCgA version: converts the color channels and passes alpha through unchanged.
CALL_SITE_DEBUGLOC
tsr_half4x2 YCoCgToRGB(tsr_half4x2 YCoCgA)
{
	tsr_half3x2 YCoCg = tsr_half3x2(YCoCgA[0], YCoCgA[1], YCoCgA[2]);
	tsr_half3x2 RGB = YCoCgToRGB(YCoCg);
	return tsr_half4x2(RGB, YCoCgA[3]);
}
|
|
|
|
// Single pixel RGB version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half3 RGBToYCoCg(tsr_half3 RGB)
{
	tsr_half3x2 YCoCg = RGBToYCoCg(dpv_interleave_mono_registers(RGB));
	return dpv_lo(YCoCg);
}
|
|
|
|
// Single pixel YCoCg version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half3 YCoCgToRGB(tsr_half3 YCoCg)
{
	tsr_half3x2 RGB = YCoCgToRGB(dpv_interleave_mono_registers(YCoCg));
	return dpv_lo(RGB);
}
|
|
|
|
// Single pixel RGBA version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half4 RGBToYCoCg(tsr_half4 RGBA)
{
	tsr_half4x2 YCoCgA = RGBToYCoCg(dpv_interleave_mono_registers(RGBA));
	return dpv_lo(YCoCgA);
}
|
|
|
|
// Single pixel YCoCgA version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half4 YCoCgToRGB(tsr_half4 YCoCgA)
{
	tsr_half4x2 RGBA = YCoCgToRGB(dpv_interleave_mono_registers(YCoCgA));
	return dpv_lo(RGBA);
}
|
|
|
|
|
|
// Some bright pixels can cause HdrWeight to get nullified under fp16 representation, so it is clamped to a value close to the minimum positive normal half float value (0.000061).
|
|
#define HDR_WEIGHT_SAFE_MIN_VALUE 0.0001
|
|
|
|
// Faster but less accurate luma computation: R + 2G + B.
// The returned luma therefore includes a scaling by 4.
CALL_SITE_DEBUGLOC
tsr_half2 Luma4(tsr_half3x2 Color)
{
	const tsr_half2 TwiceGreen = Color[1] * tsr_half(2.0);
	const tsr_half2 RedPlusBlue = Color[0] + Color[2];
	return TwiceGreen + RedPlusBlue;
}
|
|
|
|
// RGBA version: the alpha channel is ignored.
CALL_SITE_DEBUGLOC
tsr_half2 Luma4(tsr_half4x2 Color)
{
	tsr_half3x2 RGB = tsr_half3x2(Color[0], Color[1], Color[2]);
	return Luma4(RGB);
}
|
|
|
|
// Single pixel RGB version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half Luma4(tsr_half3 Color)
{
	tsr_half2 Luma = Luma4(dpv_interleave_mono_registers(Color));
	return dpv_lo(Luma);
}
|
|
|
|
// Single pixel RGBA version: the alpha channel is ignored.
CALL_SITE_DEBUGLOC
tsr_half Luma4(tsr_half4 Color)
{
	tsr_half3 RGB = Color.rgb;
	return Luma4(RGB);
}
|
|
|
|
// HDR weight of a luma that includes the scaling by 4 of Luma4(): 1 / (Luma + 4),
// never lower than HDR_WEIGHT_SAFE_MIN_VALUE.
CALL_SITE_DEBUGLOC
tsr_half2 HdrWeightY(tsr_half2 Luma)
{
	const tsr_half Exposure = tsr_half(1.0);
	const tsr_half MinWeight = tsr_half(HDR_WEIGHT_SAFE_MIN_VALUE);

	tsr_half2 Weight = rcp(Luma * Exposure + tsr_half(4.0));
	return max(MinWeight, Weight);
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half HdrWeightY(tsr_half Luma)
{
	tsr_half2 Weight = HdrWeightY(dpv_interleave_mono_registers(Luma));
	return dpv_lo(Weight);
}
|
|
|
|
// Weight that undoes HdrWeightY() on an LDR luma: 4 / (1 - LDRLuma).
CALL_SITE_DEBUGLOC
tsr_half2 HdrWeightInvY(tsr_half2 LDRLuma)
{
	tsr_half2 RcpHeadroom = rcp(tsr_half(1.0) - LDRLuma);
	return tsr_half(4.0) * RcpHeadroom;
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half HdrWeightInvY(tsr_half LDRLuma)
{
	tsr_half2 InvWeight = HdrWeightInvY(dpv_interleave_mono_registers(LDRLuma));
	return dpv_lo(InvWeight);
}
|
|
|
|
// Optimized HDR weighting function: HdrWeightY() of the Luma4() of the color.
CALL_SITE_DEBUGLOC
tsr_half2 HdrWeight4(tsr_half3x2 Color)
{
	tsr_half2 Luma = Luma4(Color);
	return HdrWeightY(Luma);
}
|
|
|
|
// Dual-pixel RGBA version of the HDR weighting function.
CALL_SITE_DEBUGLOC
tsr_half2 HdrWeight4(tsr_half4x2 Color)
{
	tsr_half2 Luma = Luma4(Color);
	return HdrWeightY(Luma);
}
|
|
|
|
// Single pixel RGB version of the HDR weighting function.
CALL_SITE_DEBUGLOC
tsr_half HdrWeight4(tsr_half3 Color)
{
	tsr_half Luma = Luma4(Color);
	return HdrWeightY(Luma);
}
|
|
|
|
// Single pixel RGBA version of the HDR weighting function.
CALL_SITE_DEBUGLOC
tsr_half HdrWeight4(tsr_half4 Color)
{
	tsr_half Luma = Luma4(Color);
	return HdrWeightY(Luma);
}
|
|
|
|
|
|
// Returns the weight of a pixel at a coordinate <PixelDelta> from the PDF highest point.
// MinimalContribution is added to the constant term of the polynomial before the final saturate().
CALL_SITE_DEBUGLOC
tsr_half2 ComputeSampleWeigth(tsr_half2 UpscaleFactor, tsr_half2x2 PixelDelta, const float MinimalContribution)
{
	const tsr_half2 UpscaleFactorSquared = UpscaleFactor * UpscaleFactor;

	// x2 is the squared distance x^2, scaled by the squared upscale factor and saturated.
	const tsr_half2 x2 = saturate(UpscaleFactorSquared * dpv_length2(PixelDelta));

	// 1 - 1.9 * x^2 + 0.9 * x^4
	const tsr_half ConstantTerm = tsr_half(1.0 + MinimalContribution);
	tsr_half2 Polynomial = (tsr_half(0.9) * x2 - tsr_half(1.9)) * x2 + ConstantTerm;

	return saturate(Polynomial);
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half ComputeSampleWeigth(tsr_half UpscaleFactor, tsr_half2 PixelDelta, const float MinimalContribution)
{
	tsr_half2 SampleWeight = ComputeSampleWeigth(
		dpv_interleave_mono_registers(UpscaleFactor),
		dpv_interleave_mono_registers(PixelDelta),
		MinimalContribution);

	return dpv_lo(SampleWeight);
}
|
|
|
|
// Computes the two normalized factors of a weighted lerp between A and B:
//   row [0] = (1 - Blend) * WeightA / Sum
//   row [1] = Blend * WeightB / Sum
// with Sum = (1 - Blend) * WeightA + Blend * WeightB. SafeRcp() guards against a null sum.
CALL_SITE_DEBUGLOC
tsr_half2x2 WeightedLerpFactors(tsr_half2 WeightA, tsr_half2 WeightB, tsr_half2 Blend)
{
	const tsr_half2 UnnormalizedA = (tsr_half(1.0) - Blend) * WeightA;
	const tsr_half2 UnnormalizedB = Blend * WeightB;

	const tsr_half2 RcpSum = SafeRcp(UnnormalizedA + UnnormalizedB);

	return tsr_half2x2(UnnormalizedA * RcpSum, UnnormalizedB * RcpSum);
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
CALL_SITE_DEBUGLOC
tsr_half2 WeightedLerpFactors(tsr_half WeightA, tsr_half WeightB, tsr_half Blend)
{
	tsr_half2x2 Factors = WeightedLerpFactors(
		dpv_interleave_mono_registers(WeightA),
		dpv_interleave_mono_registers(WeightB),
		dpv_interleave_mono_registers(Blend));

	return dpv_lo(Factors);
}
|
|
|
|
// Selects exactly one offset of each pair of opposite offsets (Offset, -Offset): the one with a
// positive y, or with a positive x when y is zero. Returns false for a null offset.
CALL_SITE_DEBUGLOC
bool TakeOnlyOneSamplePair(float2 Offset)
{
	const bool bIsBelow = Offset.y > 0.0;
	const bool bIsOnRightOfSameRow = Offset.x > 0.0 && Offset.y == 0.0;

	return bIsBelow || bIsOnRightOfSameRow;
}
|
|
|
|
// Computes the screen space velocity of a point by reprojecting its clip space position
// (ScreenPos, DeviceZ, 1) with View.ClipToPrevClip: current screen position minus previous one.
float2 ComputeStaticVelocity(float2 ScreenPos, float DeviceZ)
{
	const float4 ThisClip = float4(ScreenPos, DeviceZ, 1);
	const float4 PrevClip = mul(ThisClip, View.ClipToPrevClip);

	// Perspective divide to get the previous frame's screen position.
	const float2 PrevScreen = PrevClip.xy / PrevClip.w;

	return ScreenPos - PrevScreen;
}
|
|
|
|
// Returns saturate(SampleHistoryValidity * MAX_SAMPLE_COUNT - 0.2).
tsr_half ComputePredictionCompleteness(tsr_half SampleHistoryValidity)
{
	const tsr_half ScaledValidity = SampleHistoryValidity * tsr_half(MAX_SAMPLE_COUNT);
	return saturate(ScaledValidity - tsr_half(0.2));
}
|
|
|
|
// Returns true when any of the following holds: there is a camera cut, ScreenPos is outside
// of (-1, 1) on any axis, or ParallaxRejectionMask is below PARALLAX_REJECTION_MASK_THRESHOLD.
CALL_SITE_DEBUGLOC
bool IsOffScreen(uint bCameraCut, float2 ScreenPos, tsr_half ParallaxRejectionMask)
{
	const bool bHasCameraCut = bCameraCut != 0;

	const float LargestAbsCoord = max(abs(ScreenPos.x), abs(ScreenPos.y));
	const bool bOutsideOfScreen = LargestAbsCoord >= 1.0;

	const bool bRejectedByParallax = ParallaxRejectionMask < tsr_half(PARALLAX_REJECTION_MASK_THRESHOLD);

	return (bHasCameraCut || bOutsideOfScreen || bRejectedByParallax);
}
|
|
|
|
// Dual-pixel classification of the screen positions.
//   bIsOffScreen:   there is a camera cut, or the position is outside of (-1, 1) on any axis.
//   bIsDisoccluded: the pixel is not off screen, and ParallaxRejectionMask is below
//                   PARALLAX_REJECTION_MASK_THRESHOLD.
CALL_SITE_DEBUGLOC
void IsOffScreenOrDisoccluded(uint bCameraCut, float2x2 ScreenPos, tsr_half2 ParallaxRejectionMask, out bool2 bIsOffScreen, out bool2 bIsDisoccluded)
{
	const bool bHasCameraCut = bCameraCut != 0;

	const float2 LargestAbsCoord = max(abs(ScreenPos[0]), abs(ScreenPos[1]));
	const bool2 bOutsideOfScreen = LargestAbsCoord >= 1.0;

	const bool2 bRejectedByParallax = ParallaxRejectionMask < tsr_half(PARALLAX_REJECTION_MASK_THRESHOLD);

	bIsOffScreen = or(bHasCameraCut, bOutsideOfScreen);
	bIsDisoccluded = and(!bIsOffScreen, bRejectedByParallax);
}
|
|
|
|
// Quantization error of the backbuffer in LDR: half a step of a 1024 level encoding.
tsr_half MeasureBackbufferLDRQuantizationError()
{
	// Assume the backbuffer is 10bit per channel.
	const tsr_half HalfQuantizationStep = tsr_half(0.5 / 1024.0);
	return HalfQuantizationStep;
}
|
|
|
|
// Measures how much the HDR luma changes when its LDR version (Luma * HdrWeightY(Luma)) is
// offset by the backbuffer's LDR quantization error and then converted back with HdrWeightInvY().
tsr_half2 MeasureBackbufferLDRQuantizationErrorFromHDRLuma(tsr_half2 Luma)
{
	const tsr_half2 LDRError = MeasureBackbufferLDRQuantizationError().xx;

	// LDR luma, offset by the quantization error.
	tsr_half2 OffsetLDRLuma = Luma * HdrWeightY(Luma) + LDRError;

	// Back to HDR, and compare against the original luma.
	tsr_half2 OffsetHDRLuma = OffsetLDRLuma * HdrWeightInvY(OffsetLDRLuma);

	return abs(OffsetHDRLuma - Luma);
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
tsr_half MeasureBackbufferLDRQuantizationErrorFromHDRLuma(tsr_half Luma)
{
	tsr_half2 QuantizationError = MeasureBackbufferLDRQuantizationErrorFromHDRLuma(dpv_interleave_mono_registers(Luma));
	return dpv_lo(QuantizationError);
}
|
|
|
|
// Measures a rejection factor in [0, 1] for two pixels, where 1 means the clamping of the
// history did not remove more energy than the tolerated error. The factor is computed per
// channel, and the minimum over all channels is returned.
tsr_half2 MeasureRejectionFactor(
	tsr_halfCx2 PrevYCoCg,
	tsr_halfCx2 ClampedPrevYCoCg,
	tsr_halfCx2 InputCenterYCoCg,
	tsr_halfCx2 InputMinYCoCg, tsr_halfCx2 InputMaxYCoCg,
	tsr_half2 BackbufferQuantizationError)
{
	const tsr_halfC Null = tsr_half(0.0);

	// Replicate the quantization error on every channel.
	#if CONFIG_SCENE_COLOR_APLHA
		const tsr_halfCx2 QuantizationErrorPerChannel = tsr_half4x2(BackbufferQuantizationError, BackbufferQuantizationError, BackbufferQuantizationError, BackbufferQuantizationError);
	#else
		const tsr_halfCx2 QuantizationErrorPerChannel = tsr_half3x2(BackbufferQuantizationError, BackbufferQuantizationError, BackbufferQuantizationError);
	#endif

	// Size of the input's clamping box.
	const tsr_halfCx2 BoxSize = InputMaxYCoCg - InputMinYCoCg;

	// How much the clamp changed the history, and how far the history is from the input.
	const tsr_halfCx2 ClampedEnergy = abs(ClampedPrevYCoCg - PrevYCoCg);
	const tsr_halfCx2 Delta = abs(InputCenterYCoCg - PrevYCoCg);

	// Tolerated error: a quarter of the box, but at least the backbuffer quantization error.
	const tsr_halfCx2 ClampError = max(
		BoxSize * tsr_half(0.25),
		QuantizationErrorPerChannel);

	// Energy clamped beyond the tolerated error, normalized by max(Delta, BoxSize).
	const tsr_halfCx2 ExcessEnergy = max(ClampedEnergy - ClampError, dpv_interleave_mono_registers(Null));
	const tsr_halfCx2 RcpNormalization = rcp(max(Delta, BoxSize));
	const tsr_halfCx2 Factor = tsr_half(1.0) - saturate(ExcessEnergy * RcpNormalization);

	tsr_half2 Rejection = min3(Factor[0], Factor[1], Factor[2]);

	#if CONFIG_SCENE_COLOR_APLHA
		Rejection = min(Rejection, Factor[3]);
	#endif

	return Rejection;
}
|
|
|
|
// Single pixel version, evaluated through the dual-pixel overload.
tsr_half MeasureRejectionFactor(
	tsr_halfC PrevYCoCg,
	tsr_halfC ClampedPrevYCoCg,
	tsr_halfC InputCenterYCoCg,
	tsr_halfC InputMinYCoCg, tsr_halfC InputMaxYCoCg,
	tsr_half BackbufferQuantizationError)
{
	tsr_half2 Rejection = MeasureRejectionFactor(
		dpv_interleave_mono_registers(PrevYCoCg),
		dpv_interleave_mono_registers(ClampedPrevYCoCg),
		dpv_interleave_mono_registers(InputCenterYCoCg),
		dpv_interleave_mono_registers(InputMinYCoCg),
		dpv_interleave_mono_registers(InputMaxYCoCg),
		dpv_interleave_mono_registers(BackbufferQuantizationError));

	return dpv_lo(Rejection);
}
|
|
|
|
// 2x2 texel neighborhood of the previous frame's subpixel details, as fetched by
// GatherPrevSubpixelNeighborhood().
struct FSubpixelNeighborhood
{
	// Packed subpixel details of the 2x2 quad of texels, in row major order (00, 10, 01, 11).
	tsr_subpixel_details SampleArray[4];

	// Offset of the sampling position within the quad, in subpixels.
	uint2 SubpixelOffset;

	// Per axis, whether the closest texel is the second texel of the quad on that axis.
	bool2 ClosestSubpixelOffset;
};
|
|
|
|
// Fetches the 2x2 quad of texels of PrevSubPixelDetailTexture surrounding PrevHistoryBufferUV,
// and computes where the sampling position lies within that quad.
FSubpixelNeighborhood GatherPrevSubpixelNeighborhood(Texture2D<tsr_subpixel_details> PrevSubPixelDetailTexture, float2 PrevHistoryBufferUV)
{
	const float2 SamplePixelCoord = PrevHistoryBufferUV * PrevSubpixelDetailsExtent;

	// Center of the top left texel of the quad.
	const float2 QuadTopLeftCoord = floor(SamplePixelCoord - 0.5) + 0.5;

	FSubpixelNeighborhood Result;
	#if 1 // TODO
	{
		Result.SampleArray[0] = PrevSubPixelDetailTexture[uint2(QuadTopLeftCoord + float2(0.0, 0.0))];
		Result.SampleArray[1] = PrevSubPixelDetailTexture[uint2(QuadTopLeftCoord + float2(1.0, 0.0))];
		Result.SampleArray[2] = PrevSubPixelDetailTexture[uint2(QuadTopLeftCoord + float2(0.0, 1.0))];
		Result.SampleArray[3] = PrevSubPixelDetailTexture[uint2(QuadTopLeftCoord + float2(1.0, 1.0))];
	}
	#else
	{
		// Gather() returns the samples in the order (-,+),(+,+),(+,-),(-,-):
		/**
		 * 3 2
		 * 0 1
		 */
		uint4 GatheredSamples = PrevSubPixelDetailTexture.Gather(GlobalPointClampedSampler, PrevHistoryBufferUV);

		Result.SampleArray[0] = GatheredSamples[3]; // 00
		Result.SampleArray[1] = GatheredSamples[2]; // 10
		Result.SampleArray[2] = GatheredSamples[0]; // 01
		Result.SampleArray[3] = GatheredSamples[1]; // 11
	}
	#endif

	// Position of the sample relative to the top left texel center, in [0, 1).
	const tsr_half2 Interp = tsr_half2(SamplePixelCoord - QuadTopLeftCoord);

	Result.SubpixelOffset = uint2(round(Interp * 2.0));
	Result.ClosestSubpixelOffset = Interp > 0.5;

	return Result;
}
|
|
|
|
// Returns the SUB_PIXEL_BIT_COUNT bits payload of the subpixel <SubpixelId>, offset by the
// neighborhood's SubpixelOffset within the 2x2 quad of texels.
tsr_subpixel_payload GetSubpixelPayload(FSubpixelNeighborhood Neighborhood, uint SubpixelId)
{
	// 2D coordinate of the requested subpixel within a texel's grid.
	const uint2 LocalSubpixelCoord = uint2(SubpixelId % SUB_PIXEL_GRID_SIZE, SubpixelId / SUB_PIXEL_GRID_SIZE);

	// Coordinate in the subpixel grid of the whole quad, split into texel and subpixel parts.
	const uint2 QuadSpaceSubpixelCoord = Neighborhood.SubpixelOffset + LocalSubpixelCoord;
	const uint2 TexelCoordInQuad = QuadSpaceSubpixelCoord / SUB_PIXEL_GRID_SIZE;
	const uint2 SubpixelCoordInTexel = QuadSpaceSubpixelCoord % SUB_PIXEL_GRID_SIZE;

	// Row major index of the texel in the quad, and of the subpixel in the texel.
	const uint TexelIndex = dot(TexelCoordInQuad, uint2(1, 2));
	const uint SubpixelIndex = dot(SubpixelCoordInTexel, uint2(1, SUB_PIXEL_GRID_SIZE));

	const tsr_subpixel_details PackedDetails = Neighborhood.SampleArray[TexelIndex];
	const uint BitShift = SUB_PIXEL_BIT_COUNT * SubpixelIndex;

	return tsr_subpixel_payload(PackedDetails >> BitShift) & SUB_PIXEL_BIT_MASK;
}
|
|
|
|
// Returns the payload of the subpixel nearest to the neighborhood's lookup position.
tsr_subpixel_payload GetClosestSubpixelPayload(FSubpixelNeighborhood Neighborhood)
{
	// Texel of the quad that contains the lookup position (x + 2 * y).
	uint2 ClosestTexel = uint2(Neighborhood.ClosestSubpixelOffset);
	uint TexelIndex = dot(ClosestTexel, uint2(1, 2));
	tsr_subpixel_details PackedDetails = Neighborhood.SampleArray[TexelIndex];

	// Inside that texel the nearest subpixel is the one facing the quad's center, i.e. the
	// mirrored index: texel 0 -> subpixel 3, texel 1 -> 2, texel 2 -> 1, texel 3 -> 0.
	uint BitShift = SUB_PIXEL_BIT_COUNT * (3 - TexelIndex);

	return tsr_subpixel_payload(PackedDetails >> BitShift) & SUB_PIXEL_BIT_MASK;
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Quantizes a parallax factor in [1, 1 + MAX_PARALLAX_FACTOR] to an integer in
// [0, SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK]. Values outside that range are clamped.
tsr_subpixel_payload CompressParallaxFactor(tsr_half ParallaxFactor)
{
	const tsr_half Scale = tsr_half(float(SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK) / MAX_PARALLAX_FACTOR);
	const tsr_half MaxEncoded = tsr_half(float(SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK));

	// Clamp in floating point BEFORE converting: a factor below 1.0 yields a negative value, and
	// converting an out-of-range float to an unsigned integer is not defined on every backend.
	// Clamping after the conversion cannot recover from that.
	tsr_half Encoded = clamp((ParallaxFactor - tsr_half(1.0)) * Scale, tsr_half(0.0), MaxEncoded);

	return tsr_subpixel_payload(Encoded);
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Inverse of CompressParallaxFactor(): maps the quantized value back to [1, 1 + MAX_PARALLAX_FACTOR].
// Bits outside SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK are ignored.
tsr_half UncompressParallaxFactor(tsr_subpixel_payload CompressedParallaxFactor)
{
	const tsr_half Scale = tsr_half(MAX_PARALLAX_FACTOR / float(SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK));
	const tsr_half Quantized = tsr_half(CompressedParallaxFactor & SUB_PIXEL_PARALLAX_FACTOR_BIT_MASK);

	return Quantized * Scale + tsr_half(1.0);
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Offsets Color by E times a per-channel step before it is written to a float render target.
// The step is Color * QuantizationError with its mantissa bits cleared, so only the sign and
// exponent of that product remain.
tsr_half3x2 QuantizeForFloatRenderTarget(tsr_half3x2 Color, tsr_half E, const float3 QuantizationError)
{
	tsr_half3x2 Step = dpv_mul(Color, tsr_half3(QuantizationError));

	#if CONFIG_COMPILE_FP16
	{
		// 16-bit float: 10 mantissa bits.
		const uint16_t SignExponentMask = uint16_t(~0x03FF);

		Step[0] = asfloat16(asint16(Step[0]) & SignExponentMask);
		Step[1] = asfloat16(asint16(Step[1]) & SignExponentMask);
		Step[2] = asfloat16(asint16(Step[2]) & SignExponentMask);
	}
	#else
	{
		// 32-bit float: 23 mantissa bits.
		Step[0] = asfloat(asuint(Step[0]) & ~0x007FFFFF);
		Step[1] = asfloat(asuint(Step[1]) & ~0x007FFFFF);
		Step[2] = asfloat(asuint(Step[2]) & ~0x007FFFFF);
	}
	#endif

	return Color + Step * E;
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Single-pixel overload: forwards to the dual-pixel implementation and keeps the low result.
tsr_half3 QuantizeForFloatRenderTarget(tsr_half3 Color, tsr_half E, const float3 QuantizationError)
{
	return dpv_lo(
		QuantizeForFloatRenderTarget(
			dpv_interleave_mono_registers(Color),
			E,
			QuantizationError));
}
|
|
|
|
|
|
#if COMPRESS_PREV_USE_COUNT

// Four 8-bit use counts are packed in each PF_R32_UINT texel.
// Outputs the texel that stores the use count of PixelPos, and in Shift which of the four
// counters (0..3) of that texel belongs to it.
void ComputeCompressedUseCountPixelCoordinates(uint2 PixelPos, out uint2 CompressedUseCountPixelPos, out uint Shift)
{
	// Overlap 2x2 consecutive blocks of 8x8 pixels to reduce atomic contention in the scattering pass.
	const uint kTileSize = 8;

	uint2 ViewportPixelPos = PixelPos - InputInfo_ViewportMin;

	// Position inside the 8x8 tile is kept; the tile origin is halved so that 2x2 tiles share texels.
	uint2 PosInTile = ViewportPixelPos % kTileSize;
	uint2 SharedTileOrigin = (ViewportPixelPos >> 1) & ~(kTileSize - 1);
	CompressedUseCountPixelPos = (PosInTile | SharedTileOrigin) + InputInfo_ViewportMin;

	// Parity of the tile coordinate selects the counter: bit 0 from x, bit 1 from y.
	uint2 TileCoord = ViewportPixelPos / kTileSize;
	Shift = (TileCoord.x % 2) | ((TileCoord.y % 2) << 1);
}

#endif
|
|
|
|
CALL_SITE_DEBUGLOC
// Packs a texel offset for two pixels into one integer per pixel.
// Each channel is quantized to SPATIAL_ANTI_ALIASING_OFFSET_BITDEPTH_PER_CHANNEL bits, with
// channel 0 in the low bits and channel 1 directly above it. Codes are always in [1, Mask],
// so an encoded channel is never 0. Offsets beyond +/-SPATIAL_ANTI_ALIASING_OFFSET_RANGE
// saturate to the nearest end of the range.
uint2 EncodeSpatialAntiAliasingOffset(tsr_half2x2 TexelOffset)
{
	const uint Mask = (1 << SPATIAL_ANTI_ALIASING_OFFSET_BITDEPTH_PER_CHANNEL) - 1;
	const uint Zero = Mask / 2;

	tsr_half Multiply = tsr_half(float(Zero) / SPATIAL_ANTI_ALIASING_OFFSET_RANGE);

	const int Bias = int(1 + Zero);
	const int MinCode = 1;
	const int MaxCode = int(Mask);

	// Clamp while the value is still signed. Converting to unsigned first would wrap a sum that
	// went negative (offset below the negative end of the range) to a huge value, and the clamp
	// would then return Mask -- the largest positive offset -- instead of the smallest code.
	int2 Channel0 = clamp(int2(round(TexelOffset[0] * Multiply)) + Bias, MinCode, MaxCode);
	int2 Channel1 = clamp(int2(round(TexelOffset[1] * Multiply)) + Bias, MinCode, MaxCode);

	uint2 EncodedTexelOffset = 0;
	EncodedTexelOffset |= uint2(Channel0) << 0;
	EncodedTexelOffset |= uint2(Channel1) << SPATIAL_ANTI_ALIASING_OFFSET_BITDEPTH_PER_CHANNEL;
	return EncodedTexelOffset;
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Single-pixel overload: forwards to the dual-pixel encoder and keeps the low result.
uint EncodeSpatialAntiAliasingOffset(tsr_half2 TexelOffset)
{
	return dpv_lo(
		EncodeSpatialAntiAliasingOffset(
			dpv_interleave_mono_registers(TexelOffset)));
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Unpacks the two channels written by EncodeSpatialAntiAliasingOffset() for two pixels and
// converts them back to texel offsets.
tsr_half2x2 DecodeSpatialAntiAliasingOffset(tsr_ushort2 EncodedInputTexelOffset)
{
	const tsr_ushort Mask = tsr_ushort(1 << SPATIAL_ANTI_ALIASING_OFFSET_BITDEPTH_PER_CHANNEL) - tsr_ushort(1);
	const tsr_ushort Zero = Mask / tsr_ushort(2);

	tsr_half Multiply = tsr_half(SPATIAL_ANTI_ALIASING_OFFSET_RANGE / float(Zero));

	// Code (1 + Zero) stands for a null offset; remove it after scaling.
	const tsr_half Bias = tsr_half(1 + Zero) * Multiply;

	tsr_ushort2 Channel0 = (EncodedInputTexelOffset >> tsr_ushort(0)) & Mask;
	tsr_ushort2 Channel1 = (EncodedInputTexelOffset >> tsr_ushort(SPATIAL_ANTI_ALIASING_OFFSET_BITDEPTH_PER_CHANNEL)) & Mask;

	tsr_half2x2 TexelOffset;
	TexelOffset[0] = tsr_half2(Channel0) * Multiply - Bias;
	TexelOffset[1] = tsr_half2(Channel1) * Multiply - Bias;
	return TexelOffset;
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Single-pixel overload: forwards to the dual-pixel decoder and keeps the low result.
tsr_half2 DecodeSpatialAntiAliasingOffset(tsr_ushort EncodedInputTexelOffset)
{
	return dpv_lo(
		DecodeSpatialAntiAliasingOffset(
			dpv_interleave_mono_registers(EncodedInputTexelOffset)));
}
|
|
|
|
|
|
// Total number of bits of an encoded hole filling velocity (angle + length).
#define VELOCITY_HOLE_FILLING_BITS 18

// Number of low bits storing the quantized velocity length.
#define VELOCITY_HOLE_FILLING_LENGTH_BITS 13

// Remaining high bits store the quantized velocity direction.
#define VELOCITY_HOLE_FILLING_ANGLE_BITS (VELOCITY_HOLE_FILLING_BITS - VELOCITY_HOLE_FILLING_LENGTH_BITS)

// Number of quantization steps per pixel of velocity length.
#define VELOCITY_HOLE_FILLING_LENGTH_PRECISION (pow(2.0, 5))
|
|
|
|
CALL_SITE_DEBUGLOC
// Encodes a pixel velocity in polar form: quantized direction in the high bits, quantized
// length (rounded up, saturated to the largest representable value) in the low bits.
uint EncodeHoleFillingVelocity(float2 PixelVelocity)
{
	const uint AngleBitDepth = 1 << VELOCITY_HOLE_FILLING_ANGLE_BITS;
	const uint LengthBitDepth = 1 << VELOCITY_HOLE_FILLING_LENGTH_BITS;
	const float LengthPixelPrecision = VELOCITY_HOLE_FILLING_LENGTH_PRECISION;

	// Length, in steps of 1 / LengthPixelPrecision pixel.
	float ScaledLength = length(PixelVelocity) * LengthPixelPrecision;
	uint QuantizedLength = min(uint(ceil(ScaledLength)), LengthBitDepth - 1u);

	// Direction remapped from [-PI, PI] to [0, AngleBitDepth]; the mask wraps the top end back to 0.
	float ScaledAngle = atan2Fast(PixelVelocity.y, PixelVelocity.x) * (0.5 * AngleBitDepth / PI) + (0.5 * AngleBitDepth);
	uint QuantizedAngle = uint(round(ScaledAngle)) & (AngleBitDepth - 1);

	return (QuantizedAngle << VELOCITY_HOLE_FILLING_LENGTH_BITS) | QuantizedLength;
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Decodes a velocity written by EncodeHoleFillingVelocity().
// PixelVelocity receives the reconstructed vector. OutOfRangeFactor is 1 while the decoded
// length is at least one pixel below the representable range and decreases as it gets closer.
void DecodeHoleFillingVelocity(uint EncodedHoleFillingVelocity, out float2 PixelVelocity, out float OutOfRangeFactor)
{
	const uint AngleBitDepth = 1 << VELOCITY_HOLE_FILLING_ANGLE_BITS;
	const uint LengthBitDepth = 1 << VELOCITY_HOLE_FILLING_LENGTH_BITS;
	const float LengthPixelPrecision = VELOCITY_HOLE_FILLING_LENGTH_PRECISION;
	const float LengthRange = float(LengthBitDepth) / LengthPixelPrecision;

	// Split the two bit fields: length in the low bits, direction above it.
	uint QuantizedLength = EncodedHoleFillingVelocity & (LengthBitDepth - 1);
	uint QuantizedAngle = (EncodedHoleFillingVelocity >> VELOCITY_HOLE_FILLING_LENGTH_BITS) & (AngleBitDepth - 1);

	float Length = float(QuantizedLength) * rcp(LengthPixelPrecision);
	float Angle = float(QuantizedAngle) * (PI * 2.0 / float(AngleBitDepth)) - PI;

	// Unit direction scaled by the length.
	sincos(Angle, /* out */ PixelVelocity.y, /* out */ PixelVelocity.x);
	PixelVelocity *= Length;

	OutOfRangeFactor = saturate(LengthRange - Length);
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Returns a per-lane sign (-1 or +1) for each axis: x from bit 0 of the lane index, y from bit 1.
// Without the butterfly kernel every lane gets (+1, +1).
tsr_short2 GetLaneOffsetSign()
{
#if CONFIG_BUTTERFLY_KERNEL
	uint LaneIndex = WaveGetLaneIndex();

	// Move the bit of interest of each axis to position 1, giving 0 or 2 per component.
	const tsr_ushort2 LaneBits = tsr_ushort2(tsr_ushort(LaneIndex) << tsr_ushort(1), LaneIndex) & tsr_ushort(0x2);

	return tsr_short(-1) + tsr_short2(LaneBits);
#else
	return tsr_short(1).xx;
#endif
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Weight in [0, 1] computed from a velocity difference given in pixels: 1 for identical
// velocities, reaching 0 once length2() of the difference is 1 or more.
float ComputePixelVelocityBilateralWeight(float2 PixelVelocityDelta)
{
	float DeltaLength2 = length2(PixelVelocityDelta);
	return saturate(1.0 - DeltaLength2);
}
|
|
|
|
CALL_SITE_DEBUGLOC
// Same weight as ComputePixelVelocityBilateralWeight(), for a velocity difference given in
// screen units; it is first scaled by ScreenVelocityToInputPixelVelocity.
float ComputeScreenVelocityBilateralWeight(float2 ScreenVelocityDelta)
{
	float2 PixelVelocityDelta = ScreenVelocityDelta * ScreenVelocityToInputPixelVelocity;
	return ComputePixelVelocityBilateralWeight(PixelVelocityDelta);
}
|
|
|
|
|