UnrealEngineUWP/Engine/Shaders/PostProcessMotionBlur.usf

// Copyright 1998-2015 Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	PostProcessMotionBlur.usf: PostProcessing MotionBlur
=============================================================================*/

#include "Common.usf"
#include "PostProcessCommon.usf"
#include "DeferredShadingCommon.usf"		// FGBufferData

// Spaces:
//   screen space: -1..1 -1..1
//   normalized_motionblur_velocity: 2d vector where the max motionblur is defined in a unit circle around 0
//   world_space
//   half_res_pixels

// similar implementation to: http://graphics.cs.williams.edu/papers/MotionBlurI3D12/McGuire12Blur.pdf
// but adapted for half resolution and not using randomization/noise

// from the paper: We use SOFT Z EXTENT = 1mm to 10cm for our results
// Used as the divisor in softDepthCompare() and passed as the depth scale to SampleWeight() in MainNewPS.
// NOTE(review): the unit of the value 1 depends on the depth unit produced by ConvertFromDeviceZ() - confirm.
#define SOFT_Z_EXTENT 1

// This,
//  - Gets the MainPS pass to low cost under no motion.
//  - Somewhat cleans up the blur/no blur transition.
//  - Reduces the motion blur on small motion (where it is not needed).
//  - Removes the background blur (seems higher quality).
// When enabled, MainPS uses the unblurred center color as the background layer and
// outputs 0 for pixels whose combined velocity is 2 half res pixels or less.
#define MOTION_BLUR_EARLY_EXIT 1

// 0: use more texture lookups avoiding .Gather() lookups
// 1: SetupPS fetches its 2x2 footprint with GatherRed()/GatherGreen() (enabled for SM5 and above)
#define GATHER_OPTIMIZATION (FEATURE_LEVEL >= FEATURE_LEVEL_SM5)

// 0:off / 1:on, useful to debug the motionblur algorithm
// (enables the body of OverrideWithTestChart() and keeps the output alpha at 1 in MainPS)
#define MOTIONBLUR_TESTCHART 0

// --------------------------------------------------------------------------

#if MOTION_BLUR_QUALITY == 0
	// quality 0 is the visualization mode:
	// the early exit would break VisualizeMotionBlur so we disable it
	#undef MOTION_BLUR_EARLY_EXIT
	#define MOTION_BLUR_EARLY_EXIT 0
#endif

// helper function, from the paper, reduced to a hard step
// (this variant does not divide, so no bias against division by zero is needed)
// @param X first position (the caller in MainPS passes pixels)
// @param Y second position, same space as X
// @param v velocity, same space as X and Y
// @return 1 if the distance between X and Y is smaller than the length of v, 0 otherwise
float cone(float2 X, float2 Y, float2 v)
{
	// better for no background velocity
	return length(X - Y) < length(v);
}

// helper function, from the paper but adjusted to avoid division by zero
// @param X first position
// @param Y second position, same space as X
// @param v velocity, same space as X and Y
// @return 1 while the distance between X and Y is well below the length of v,
//         falling smoothly to 0 between 95% and 105% (+ bias) of that length
float cylinder(float2 X, float2 Y, float2 v)
{
	// keeps the two smoothstep edges apart when v is zero (to avoid div by 0)
	const float EdgeBias = 0.001f;

	const float Speed = length(v);
	const float Distance = length(X - Y);

	return 1 - smoothstep(0.95f * Speed, 1.05f * Speed + EdgeBias, Distance);
}

// helper function, from the paper
// note this assumes negative z values
// is zb closer than za?
// @return 1 when za <= zb, fading linearly to 0 once za exceeds zb by SOFT_Z_EXTENT
float softDepthCompare(float za, float zb)
{
	const float DepthDelta = za - zb;

	return saturate(1 - DepthDelta / SOFT_Z_EXTENT);
}


// ------------------------------------------

// MOTION_BLUR_QUALITY == 0:visualize, 1:low, 2:medium, 3:high, 4:very high

// to scale to normalized motionblur velocity (applied to the decoded velocity in SetupPS)
// xy:includes y flip, zw:unused
float4 VelocityScale;

// Last frame's view projection matrix.
// MainPS multiplies the camera relative world position (ScreenVector * depth) with it
// to get the clip space position of the pixel in the previous frame.
float4x4 PrevViewProjMatrix;

// .xy multiply, .zw:add
// to transform the UV to a normalized view  0..1
// (MainPS skips samples whose transformed UV is outside of 0..1)
float4 TextureViewMad;

// from postprocess settings
// xy: scales a screen space motion to normalized_motionblur_velocity (MainPS background velocity),
//     MainNewPS uses .x alone as its velocity scale
// zw: scales normalized_motionblur_velocity back to screen space (-1..1) in MainPS
float4 MotionBlurParameters;

// BoneMatrices and PreviousBoneMatrices for visualization
// (only read by the MOTION_BLUR_QUALITY == 0 path of MainPS)

// The bone matrix buffer stored as 4x3 (3 float4 texels behind each other), all chunks of a skeletal mesh in one
Buffer<float4> BoneMatrices0;
// The bone matrix buffer stored as 4x3 (3 float4 texels behind each other), all chunks of a skeletal mesh in one
Buffer<float4> BoneMatrices1;

// ------------------------------------------


// debug motionblur (very useful, keep)
// Replaces the inputs with a synthetic chart of 10x5 experiments, each made of 5x5 tiles of 12 pixels
// with a foreground tile in the center. Does nothing unless MOTIONBLUR_TESTCHART is 1.
// @param ScreenPos -1..1 -1..1 for viewport
// @param ObjectVelocity in -1..1 range for full motionblur, overridden inside the chart
// @param BackgroundVelocity in -1..1 range for full motionblur, overridden inside the chart
// @param Color RGB and depth in alpha, overridden inside the chart
// @param AvgObject 0:background, 1:foreground, overridden inside the chart
void OverrideWithTestChart(float2 ScreenPos, inout float2 ObjectVelocity, inout float2 BackgroundVelocity, inout float4 Color, inout float AvgObject)
{
#if MOTIONBLUR_TESTCHART == 1
	// needs to be inside the loop to prevent NVIDIA driver optimization (blinking)
	float2 PixelPos = ScreenPos * ScreenPosToPixel.xy + ScreenPosToPixel.zw + 0.5f - 25;
	float3 BackgroundColor = lerp(0.0, 0.3f, PseudoRandom(PixelPos));
	float3 ForegroundColor = lerp(float3(1, 0, 0), float3(1, 1, 0), PseudoRandom(PixelPos));

	int2 tile = (int2)floor(PixelPos / 12.0f);
	int2 experiment = (int2)floor(tile / 5.0f);

	// pixels outside of the 10x5 experiment grid keep their inputs
	if(experiment.x < 0 || experiment.y < 0 || experiment.x >= 10 || experiment.y >= 5)
	{
		return;
	}

	int2 localtile = uint2(tile) % 5;

	// the center tile of each 5x5 experiment is the moving foreground object
	const bool bForeground = localtile.x == 2 && localtile.y == 2;

	// color in RGB, depth in alpha (foreground is closer)
	Color.rgb = bForeground ? ForegroundColor : BackgroundColor;
	Color.a = bForeground ? 100.0f : 1000.0f;

	// the right half repeats the left half with a different object mask setup
	const bool bLeftSide = experiment.x < 5;
	const int Column = bLeftSide ? experiment.x : (experiment.x - 5);
	const int Row = experiment.y;

	const float AngleStep = 6.283f / 12;

	float ForegroundAngle = (Column - 1) * AngleStep;
	float BackgroundAngle = (Row - 1) * AngleStep + 3.1415f/2;

	// ForegroundR with very small amounts needs extra testing so we do a non linear scale
	float ForegroundR = pow(Column / 5.0f, 2);
	float BackgroundR = pow(Row / 5.0f, 2);

	float2 ForegroundXY = ForegroundR * float2(sin(ForegroundAngle), cos(ForegroundAngle));
	float2 BackgroundXY = BackgroundR * float2(sin(BackgroundAngle), cos(BackgroundAngle));

	BackgroundVelocity.xy = BackgroundXY;

	// left side: object velocity everywhere, mask only on the foreground tile
	// right side: mask everywhere, object velocity only on the foreground tile
	ObjectVelocity.xy = (bLeftSide || bForeground) ? ForegroundXY : BackgroundXY;
	AvgObject = bLeftSide ? (float)bForeground : 1.0f;

#endif
}

// ------------------------------------------

// motion blur setup vertex shader
// @param InPosition, InTexCoord forwarded to DrawRectangle()
// @param OutUV UV computed by DrawRectangle()
// @param OutUVs OutUV moved by half a PostprocessInput1 texel to: 0:lefttop, 1:righttop, 2:leftbottom, 3:rightbottom
// @param OutPosition position computed by DrawRectangle()
void SetupVS(
	in float4 InPosition : ATTRIBUTE0,
	in float2 InTexCoord : ATTRIBUTE1,
	out float2 OutUV: TEXCOORD0,
	out float2 OutUVs[4]: TEXCOORD1,
	out float4 OutPosition : SV_POSITION
	)
{
	DrawRectangle(InPosition, InTexCoord, OutPosition, OutUV);

	// half a texel of PostprocessInput1 in UV space
	const float2 HalfTexel = float2(0.5f, 0.5f) * PostprocessInput1Size.zw;

	// no filtering (2x2 kernel), one UV per source texel: lefttop,righttop,leftbottom,rightbottom
	OutUVs[0] = OutUV - HalfTexel;
	OutUVs[1] = OutUV + float2( HalfTexel.x, -HalfTexel.y);
	OutUVs[2] = OutUV + float2(-HalfTexel.x,  HalfTexel.y);
	OutUVs[3] = OutUV + HalfTexel;
}

// motion blur setup pixel shader, downsamples to half resolution
// MRT0: velocity in normalized_motionblur_velocity and mask in 0:background..1:object (needed for soft masked)
// MRT1: color and depth in alpha
//
// inputs:
//   PostprocessInput0: encoded velocity (x > 0 marks a pixel that has an object velocity)
//   PostprocessInput1: scene color
//   PostprocessInput2: device z
// The 2x2 footprint is split into object and background samples; only the samples of the
// majority layer contribute to the averaged color, depth and velocity.
void SetupPS(
	float2 UV : TEXCOORD0,				// UV into the full resolution source RT
	float2 UVs[4] : TEXCOORD1,			// UV for high quality 4 samples with blurring
	out float4 OutVelocity : SV_Target0,
	out float4 OutColor : SV_Target1)
{
	// detect which of the 4 samples are objects
	// ObjectMask and VelocitySamples are indexed like UVs[]: 0:lefttop, 1:righttop, 2:leftbottom, 3:rightbottom
	float4 ObjectMask;
	float2 VelocitySamples[4];

#if GATHER_OPTIMIZATION
	{
		// using Gather: xyzw in counter clockwise order starting with the sample to the lower left of the queried location
		float4 Red = PostprocessInput0.GatherRed(PostprocessInput0Sampler, UV);
		float4 Green = PostprocessInput0.GatherGreen(PostprocessInput0Sampler, UV);

		// reorder from Gather order to UVs[] order
		VelocitySamples[0] = float2(Red.w, Green.w);
		VelocitySamples[1] = float2(Red.z, Green.z);
		VelocitySamples[2] = float2(Red.x, Green.x);
		VelocitySamples[3] = float2(Red.y, Green.y);
	}
#else
	UNROLL for( int FetchIndex = 0; FetchIndex < 4; FetchIndex++ )
	{
		VelocitySamples[FetchIndex] = PostprocessInput0.SampleLevel(PostprocessInput0Sampler, UVs[FetchIndex], 0).xy;
	}
#endif

	UNROLL for( int MaskIndex = 0; MaskIndex < 4; MaskIndex++ )
	{
		// the mask has to be taken from the encoded value, before decoding
		ObjectMask[MaskIndex] = VelocitySamples[MaskIndex].x > 0;
		VelocitySamples[MaskIndex] = DecodeVelocityFromTexture(VelocitySamples[MaskIndex]);
		VelocitySamples[MaskIndex] = ObjectMask[MaskIndex] ? VelocitySamples[MaskIndex] : 0;
	}

	float AvgObject = dot(ObjectMask, 0.25f);

	// from here on the mask selects the samples of the majority layer (object or background)
	ObjectMask = (AvgObject > 0.5f) ? ObjectMask : (1 - ObjectMask);

	// bias to avoid div by 0
	float InvTotalWeight = 1.0 / ( dot( ObjectMask, 1 ) + 0.000001f );

	float4 SumColorAndDepth = 0;

	// DepthValues is kept in Gather order for both paths:
	// x:leftbottom (UVs[2]), y:rightbottom (UVs[3]), z:righttop (UVs[1]), w:lefttop (UVs[0])
#if GATHER_OPTIMIZATION
	// keep all four gathered values (a .r swizzle here would replicate a single depth to all samples)
	float4 DepthValues = PostprocessInput2.GatherRed(PostprocessInput2Sampler, UV);
#else
	float4 DepthValues = float4(
		PostprocessInput2.SampleLevel(PostprocessInput2Sampler, UVs[2], 0).r,
		PostprocessInput2.SampleLevel(PostprocessInput2Sampler, UVs[3], 0).r,
		PostprocessInput2.SampleLevel(PostprocessInput2Sampler, UVs[1], 0).r,
		PostprocessInput2.SampleLevel(PostprocessInput2Sampler, UVs[0], 0).r);
#endif

	// pair each color sample and its mask with the depth of the same texel (same reorder as the velocity above)
	SumColorAndDepth += float4(PostprocessInput1.SampleLevel(PostprocessInput1Sampler, UVs[0], 0).rgb, DepthValues.w) * ObjectMask.x;
	SumColorAndDepth += float4(PostprocessInput1.SampleLevel(PostprocessInput1Sampler, UVs[1], 0).rgb, DepthValues.z) * ObjectMask.y;
	SumColorAndDepth += float4(PostprocessInput1.SampleLevel(PostprocessInput1Sampler, UVs[2], 0).rgb, DepthValues.x) * ObjectMask.z;
	SumColorAndDepth += float4(PostprocessInput1.SampleLevel(PostprocessInput1Sampler, UVs[3], 0).rgb, DepthValues.y) * ObjectMask.w;

	OutColor = SumColorAndDepth * InvTotalWeight;

	// clamp to avoid artifacts from exceeding fp16 through framebuffer blending of multiple very bright lights
	OutColor.rgb = min(float3(256 * 256, 256 * 256, 256 * 256), OutColor.rgb);

	// the averaged device z is converted once, after averaging
	OutColor.a = ConvertFromDeviceZ(OutColor.a);

	{
		float2 SumVelocity = 0;

		UNROLL for( int SumIndex = 0; SumIndex < 4; SumIndex++ )
		{
			SumVelocity += VelocitySamples[SumIndex] * ObjectMask[SumIndex];
		}

		OutVelocity.xy = SumVelocity * InvTotalWeight * VelocityScale.xy;
	}

	// for debugging
/*
	{
		float2 BackgroundVelocity = 0;

		OverrideWithTestChart(UVAndScreenPos.zw, OutVelocity.xy, BackgroundVelocity, OutColor, AvgObject);
	}
*/

	// 0:background, 1:object layer
	OutVelocity.b = AvgObject > 0.5f;

	// clamp motion vector in a disc from -1 to 1 (the maximum motion vector range)
	{
		// squared length of the velocity
		half LenSqr = dot(OutVelocity.xy, OutVelocity.xy);

		// normalizes velocities that are longer than 1
		float ScaleFix = rsqrt(LenSqr);

		// inside of the unit disc: keep as is
		FLATTEN if(LenSqr < 1)
		{
			ScaleFix = 1.0f;
		}
		// very small motion (length below 0.1) is treated as no motion
		FLATTEN if(LenSqr < 0.01f)
		{
			ScaleFix = 0;
		}

		// the background layer gets no velocity here
		OutVelocity.xy *= ScaleFix * OutVelocity.b;
	}

	// alpha is used to normalize the velocity after blurring (0:background, 1:object)
	OutVelocity.a = 1;

	// debug, uncomment to generate small and large horizontal motion and the objects
//	OutVelocity.xy = lerp(float2(0,0), (InUV.y > 0.8f) ? float2(1,0) : float2(0.02f,0), OutVelocity.b);
}


// used to visualize the motion blur
// Unit sized 3D checkerboard: looks at which half of its unit cell Pos is in on each axis.
// @return 0/1, the parity of the number of axes where frac(Pos) is above 0.5
float Compute3DCheckerBoard(float3 Pos)
{
	// 1 for each axis that is in the upper half of its cell
	const float3 UpperHalf = frac(Pos) > 0.5f;
	const uint UpperHalfCount = (uint)dot(float3(1,1,1), UpperHalf);

	return (float)(UpperHalfCount % 2);
}

// @return the sample count for the compile time MOTION_BLUR_QUALITY:
//         1:4, 2:6, 3:8, everything else (including 4 and the visualization mode 0):16
uint GetStepCountFromQuality()
{
#if MOTION_BLUR_QUALITY == 1
	const uint StepCount = 4;
#elif MOTION_BLUR_QUALITY == 2
	const uint StepCount = 6;
#elif MOTION_BLUR_QUALITY == 3
	const uint StepCount = 8;
#else // MOTION_BLUR_QUALITY == 4 (and any other value)
	const uint StepCount = 16;
#endif

	return StepCount;
}

// motionblur pixel shader
// input:
//   0: RGB:scene color, A: depth in half resolution from MotionBlurSetup
//   1: blurred quarter res velocity from MotionBlurSetup
//   2: half res velocity from MotionBlurSetup
// half resolution output: RGB: color, A:blend to full res factor
//
// Steps:
//   1. reconstruct the camera (background) velocity from the depth and PrevViewProjMatrix
//   2. build the background color layer (a single sample when MOTION_BLUR_EARLY_EXIT is set)
//   3. blend background and blurred object velocity into one combined velocity
//   4. gather StepCount samples along the combined velocity, weighted like in the paper
//   5. with MOTION_BLUR_QUALITY == 0 the result is replaced by a debug visualization
// @param UVAndScreenPos xy: UV 0..1, zw: screen position -1..1
// @param OutColor RGB premultiplied with A, A: how much of the blurred result MainRecombinePS should use
void MainPS(float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0)
{
	// Screen Quad UV 0..1
	float2 UV = UVAndScreenPos.xy;
	// screen position in [-1, 1] screen space
	float2 ScreenSpacePos = UVAndScreenPos.zw;

	OutColor = 0;

	// RGB:color, A:depth
	float4 ColorAndDepth = PostprocessInput0.SampleLevel(PostprocessInput0Sampler, UV.xy, 0);

	// can be moved to VS
	float3 ScreenVector = mul(float4(ScreenSpacePos, 1, 0), View.ScreenToWorld).xyz;

	// world space position of the current pixel (relative to the camera, see AbsWorldPos below)
	float3 OffsetWorldPos = ScreenVector * ColorAndDepth.a;
	// previous frame clip space position of the current pixel
	float4 PrevClipPos = mul(float4(OffsetWorldPos, 1), PrevViewProjMatrix);
	// previous frame screen coordinates of the current pixel
	float3 PrevScreenCoord = PrevClipPos.xyz / PrevClipPos.w;

	// we split the content in a object and background layer

	// background velocity in half_res_pixels
	float2 PixelBackgroundVelocity;
	{
		// the background velocity in normalized_motionblur_velocity
		float2 NormBackgroundVelocity;
		{
			NormBackgroundVelocity = (UVAndScreenPos.zw - PrevScreenCoord.xy) * MotionBlurParameters.xy;

			// for debugging
			// NOTE(review): the local Color is never used, OutColor is passed as the color argument instead
			{
				float2 Velocity = 0;
				float4 Color = 0;
				float AvgObject = 0;

				OverrideWithTestChart(ScreenSpacePos, Velocity, NormBackgroundVelocity, OutColor, AvgObject);
			}

			// clamp the max motionblur within the unit radius
			float MotionLength = length(NormBackgroundVelocity);

			FLATTEN if(MotionLength > 1.0f)
			{
				NormBackgroundVelocity /= MotionLength;
			}
		}

		// in (-1..1 -1..1 for the screen)
		float2 ScreenBackgroundVelocity = NormBackgroundVelocity * MotionBlurParameters.zw;


		// in half_res_pixels
		PixelBackgroundVelocity = ScreenBackgroundVelocity * ScreenPosToPixel.xy;


	}

	// RG: xy motion in normalized_motionblur_velocity, B: object weight 0:background..1:object
	float3 NormSoftMaskedVelocity;
	{
		float4 SoftMaskedTexture = PostprocessInput1.SampleLevel(PostprocessInput1Sampler, UV.xy, 0);

		// alpha normalizes the blurred value, bias to avoid division by 0
		NormSoftMaskedVelocity = SoftMaskedTexture.rgb / (SoftMaskedTexture.a + 1.0f / 255.0f);
	}

	// how many motionblur samples affects quality and performance (we do multiple texture lookups per sample)
	const uint StepCount = GetStepCountFromQuality();

	// we start accumulation with the current pixel with a small weight to define div by near 0 case to be the current pixel color
	const float4 ColorAccumStartValue = float4(ColorAndDepth.rgb, 1) * 0.001f;

	// RGB:weighed Sum, A:weight to compute average color
	float4 BackgroundColorAccum = ColorAccumStartValue;

#if MOTION_BLUR_EARLY_EXIT
	// the early exit variant does not blur the background: the condition is always true,
	// so the center color is used and the accumulation loop in the else branch never runs
	if(true)
	{
		BackgroundColorAccum = float4(PostprocessInput0.SampleLevel(PostprocessInput0Sampler, UV.xy, 0).rgb, 1);
	}
	else
	{
#endif

	// create background motion layer
	UNROLL for(uint i = 0; i < StepCount; ++i)
	{
		// -0.5 .. 0.5
		float delta = (i / (float)(StepCount - 1)) - 0.5f;
		float2 LocalUV = UV + delta * PixelBackgroundVelocity * PostprocessInput0Size.zw;	// can be optimized
		float2 NormalizedView = LocalUV * TextureViewMad.xy + TextureViewMad.zw;

		// ignore samples outside of the view
		FLATTEN if(all(NormalizedView > 0 && NormalizedView < 1))
		{
			float ObjectSample = PostprocessInput2.SampleLevel(PostprocessInput2Sampler, LocalUV.xy, 0).b;

			ObjectSample = saturate(ObjectSample * 4);

			float4 ColorSample = float4(PostprocessInput0.SampleLevel(PostprocessInput0Sampler, LocalUV.xy, 0).rgb, 1);

			// object pixels do not contribute to the background layer
			BackgroundColorAccum += ColorSample * (1 - ObjectSample);
		}
	}

#if MOTION_BLUR_EARLY_EXIT
	}
#endif

	// object velocity in half_res_pixel
	float2 PixelObjectBlurredVelocity;
	{
		// SoftMasked Velocity in normalized_motionblur_velocity, lower resolution and blurred,
		// divide normalizes which is needed because of gaussian blurs, bias to avoid division by 0
		half2 NormBlurredVelocity = NormSoftMaskedVelocity.xy / (NormSoftMaskedVelocity.b + 1.0f / 255.0f);

		// Update the pixel velocity
		float2 ScreenBlurredVelocity = NormBlurredVelocity * MotionBlurParameters.zw;
		PixelObjectBlurredVelocity = ScreenBlurredVelocity * ScreenPosToPixel.xy;
	}


	// --------------------------------------
	// camera and background
	float2 PixelCombinedVelocity = lerp(PixelBackgroundVelocity, PixelObjectBlurredVelocity, NormSoftMaskedVelocity.b);

#if MOTION_BLUR_EARLY_EXIT
	// deBlur: 0 for a combined velocity of up to 2 pixels, rising linearly to 1 at 6 pixels
	float deBlur = dot(PixelCombinedVelocity, PixelCombinedVelocity);
	deBlur = sqrt(max(deBlur, 1.0/65536.0));
	deBlur = saturate((deBlur - 2.0) * (1.0/4.0));
	if(deBlur == 0.0)
	{
		// no blur needed, alpha 0 leaves the full res color untouched in the recombine pass
		OutColor = float4(0.0, 0.0, 0.0, 0.0);
		return;
	}
	else
	{
#endif

	{
		// in pixels
		float2 X = UV * PostprocessInput0Size.xy;
		// NOTE(review): same lookup as ColorAndDepth at the top of the function
		float4 cXzX = PostprocessInput0.SampleLevel(PostprocessInput0Sampler, UV.xy, 0);
		float3 cX = cXzX.rgb;
		float zX = -cXzX.w;		// negated as this was in paper

		// in pixels
		float4 vXvX = PostprocessInput2.SampleLevel(PostprocessInput2Sampler, UV.xy, 0);
		float2 vX = vXvX.xy;
		// object pixels use their own velocity, everything else moves with the background
		vX = (vXvX.b > 0.5f) ? (vX * MotionBlurParameters.zw * ScreenPosToPixel.xy) : PixelBackgroundVelocity;

		// to avoid div by 0
		// NOTE(review): Bias and StartAlpha are not used below
		float Bias = 1.0f;

		float StartAlpha = 1 / (length(vX) + Bias);

		float3 BackgroundColor = BackgroundColorAccum.rgb / BackgroundColorAccum.a;

		// small start weight so the divide at the end is defined when no sample contributes
		float ColorAccumSum = 0.0001f;

		float4 ColorAccum = float4(BackgroundColor, 1) * ColorAccumSum;
		float4 SecondColorAccum = float4(BackgroundColor, 1) * ColorAccumSum;

		UNROLL for(uint e = 0; e < StepCount; ++e)
		{
			// we want to have the samples starting from inside going outwards
			// iSign is unsigned: for even e it wraps around and then acts as -1 in the unsigned math below
			// NOTE(review): e = 0 and e = 1 both result in i = iMid, so i = 0 is never sampled - confirm this is intended
			uint iMid = StepCount / 2;
			uint iSign = (e % 2) * 2 - 1;
			uint i = iMid + (e / 2) * iSign;

			// -0.5 .. 0.5
			float delta = (i / (float)(StepCount - 1)) - 0.5f;
			float2 LocalUV = UV + delta * PixelCombinedVelocity * PostprocessInput0Size.zw;	// can be optimized
			float2 NormalizedView = LocalUV * TextureViewMad.xy + TextureViewMad.zw;

			// ignore samples outside of the view
			FLATTEN if(all(NormalizedView > 0 && NormalizedView < 1))
			{
				// in pixels
				float2 Y = LocalUV * PostprocessInput0Size.xy;
				float4 cYzY = PostprocessInput0.SampleLevel(PostprocessInput0Sampler, LocalUV.xy, 0);
				float3 cY = cYzY.rgb;
				float zY = -cYzY.w;		// negated as this was in paper

				// in pixels
				float4 vYvY = PostprocessInput2.SampleLevel(PostprocessInput2Sampler, LocalUV.xy, 0);
				float2 vY = vYvY.xy;
				vY = (vYvY.b > 0.5f) ? (vY * MotionBlurParameters.zw * ScreenPosToPixel.xy) : PixelBackgroundVelocity;
//				vY = PixelBackgroundVelocity;

				// f: Y in front of X, b: Y behind X (soft compare)
				float f = softDepthCompare(zX, zY);
				float b = softDepthCompare(zY, zX);

				// weight of the sample Y
				float ay = 0;

				// Blurry Y in front of any X
				// the sample we look at (Y) is fast enough to affect me (direction not taken into account)
				ay += f * cone(Y, X, vY);
				// Any Y behind blurry X, estimate background
				ay += b * cone(X, Y, vX);
				// Simultaneously blurry X and Y
				ay += cylinder(Y, X, vY) * cylinder(X, Y, vX) * 2;

				// mask out Y that are not part of the moving direction we current process
//				ay *= 1 - saturate((length(vY - PixelCombinedVelocity) - length(vY - PixelBackgroundVelocity)) * 0.12f);

				ay = saturate(ay);

				ColorAccum += float4(cY.rgb, 0) * ay;
				ColorAccumSum += ay;

				// samples that got rejected (low ay) are collected as the color behind the motion,
				// stops accumulating once the collected weight reaches 1
				float MaskAlreadyFound = saturate(1 - SecondColorAccum.a);
				SecondColorAccum += float4(cY.rgb, 1) * (1 - ay) * MaskAlreadyFound;
			}
		}

		// color content behind motion vector
//		float3 SecondColor = SecondColorAccum / (StepCount - ColorAccumSum);
		float3 SecondColor = SecondColorAccum .rgb / SecondColorAccum.a;

		// camera blurred background with the moving object removed
//		float LerpFactor = length(vX) - 1 > length(PixelBackgroundVelocity);
		float LerpFactor = saturate((length(PixelBackgroundVelocity) - length(vX)) );	// better
		float3 BehindMovingObject = lerp(BackgroundColor, SecondColor, LerpFactor);

		OutColor = float4(ColorAccum / ColorAccumSum);

		// NOTE(review): BackgroundFraction is not used below
		float BackgroundFraction = length(PixelCombinedVelocity) < length(vX) - 1;

		// fraction of the samples that contributed
		float MovingObjectFraction = ColorAccumSum / StepCount;

		// take the faster movement (background or non background)
		OutColor = lerp(float4(BehindMovingObject, 1), OutColor, MovingObjectFraction);

		OutColor.a = 1;

#if MOTIONBLUR_TESTCHART != 1
		// compute how much full res should blend in
		// works but without softedge (seems to be the issues with half res already)
		// comment the next line to see the half res result
		OutColor.a = saturate( lerp(length(PixelBackgroundVelocity), length(PixelCombinedVelocity), MovingObjectFraction) * 0.25f);
#endif

#if MOTION_BLUR_EARLY_EXIT
		// with the early exit the blend factor computed above is replaced by deBlur
		OutColor.a = deBlur;
#endif

		// prepare for the following compositing pass
		OutColor.rgb *= OutColor.a;
	}
	// --------------------------------------

#if MOTION_BLUR_EARLY_EXIT
	}
#endif


#if MOTION_BLUR_QUALITY == 0
	// visualize motion blur
	{
		float3 AbsWorldPos = View.ViewOrigin.xyz + OffsetWorldPos;
		// three checkerboards of different scale on top of each other
		float3 WorldCheckerboard = Compute3DCheckerBoard(AbsWorldPos * 0.02f) * 0.1f + Compute3DCheckerBoard(AbsWorldPos * 0.002f) * 0.3f + Compute3DCheckerBoard(AbsWorldPos * 0.0002f) * 0.6f;

		OutColor = float4(lerp(WorldCheckerboard, OutColor.rgb, 0.7f), 1);

		float2 NewPixelPosCentered = UV * PostprocessInput0Size.xy;

		// -1..1
		float2 NormalizedDirection = PixelObjectBlurredVelocity / ScreenPosToPixel.xy * MotionBlurParameters.xy;
		// -1..1 (position inside of a 16 pixel tile)
		float2 ScreenLocalTilePos = frac(NewPixelPosCentered / 16.0f) * 2 * 1.2f - 1;
		// -1..1
		float2 PerpDirection = float2(ScreenLocalTilePos.y, -ScreenLocalTilePos.x);

		// NOTE(review): normalize() of a zero NormalizedDirection is undefined - debug path only
		float DirectionMask = 1 - saturate(abs(dot(PerpDirection, normalize(NormalizedDirection))) * 6);
		float StrengthMask = 1 - saturate((length(PerpDirection) - length(NormalizedDirection)) * 6);
		float OrientationMask = saturate(dot(normalize(ScreenLocalTilePos), NormalizedDirection) * 6);
		float DiskMask = saturate((length(PerpDirection) - 1.0f) * 6);

		bool bSelectorOpaque = PostprocessInput2.SampleLevel(PostprocessInput2Sampler, UV.xy, 0).b > 0.5f;

		// one line per tile showing direction and strength, red to green along the direction
		float3 LineColor = lerp(float3(1,0,0), float3(0,1,0), OrientationMask);
		float3 LineMask = DirectionMask * StrengthMask;
		// object pixels are tinted blue, background pixels grey
		float3 TintColor = bSelectorOpaque ? float3(0.2f, 0.2f, 0.6f) : float3(0.5f, 0.5f, 0.5f);

		OutColor.rgb = lerp(lerp(WorldCheckerboard, TintColor, 0.5f), LineColor, LineMask);
		OutColor.rgb *= lerp(1.0f, 0.9f, DiskMask);
	}
	// visualize BoneBuffers
	{
		const int2 LeftTop = int2(16, 114);
		const int ElementsPerLine = 32;
		const int Scale = 3;
		const int LinesPerBuffer = 48;
		const int GapBetweenBuffers = 8;
		const int FadeRegion = 16;

		float2 PixelPos = ComputePixelPosCenter(UVAndScreenPos.zw, true);
		// NOTE(review): the subtraction is done with unsigned values, pixels left/above of LeftTop
		// wrap around to large values and are then rejected by the range test below - confirm
		int2 PixelPosInt = ((uint2)PixelPos - LeftTop) / Scale;

		// 0  and 1 are valid buffers
		int LineInBuffer = PixelPosInt.y % (LinesPerBuffer + GapBetweenBuffers);
		int WhichBuffer = PixelPosInt.y / (LinesPerBuffer + GapBetweenBuffers);
		// fades out over the last FadeRegion lines of each buffer
		float Alpha = saturate((LinesPerBuffer - LineInBuffer) / (float)FadeRegion);

		if(PixelPosInt.x >= 0 && PixelPosInt.x < ElementsPerLine && PixelPosInt.y >= 0 && WhichBuffer / 2 == 0 && Alpha > 0)
		{
			uint Index = PixelPosInt.x + LineInBuffer * ElementsPerLine;

			float4 Value = WhichBuffer ? BoneMatrices1[Index] : BoneMatrices0[Index];

			OutColor.rgb = lerp(OutColor.rgb, Value.rgb, Alpha);
		}
	}
#endif
}


// Composites the motion blur output over the full resolution scene color.
// input:
//   0: RGB: full resolution scene color
//   1: RGB: motion blur color already multiplied with A, A: blend factor (0 keeps the full res color)
// @param UVAndScreenPos xy: UV used for both lookups
// @param OutColor RGB: combined color, A: 0
void MainRecombinePS(float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0)
{
	const float2 ScreenUV = UVAndScreenPos.xy;

	const float3 SharpColor = PostprocessInput0.SampleLevel(PostprocessInput0Sampler, ScreenUV, 0).rgb;
	const float4 BlurColorAndMask = PostprocessInput1.SampleLevel(PostprocessInput1Sampler, ScreenUV, 0);

	// the blurred color is premultiplied, so only the sharp color gets weighted here
	OutColor = float4(SharpColor * (1 - BlurColorAndMask.a) + BlurColorAndMask.rgb, 0);
}


// number of tiles in x and y, used to convert the linear quad index into a tile position
uint2 TileCount;

// Velocity scatter vertex shader: emits one quad per tile of PostprocessInput0.
// The quad is stretched along the tile velocity so the velocity gets written to all tiles it can reach.
// Tiles with (nearly) no motion collapse to a degenerate quad at the origin.
// @param VId vertex index inside of the instance (several quads per instance)
// @param IId instance index
// @param OutColor xy and zw: velocity of the tile (float4 to match the input of VelocityScatterPS)
// @param OutPosition clip space position, z is ordered by the velocity length
void VelocityScatterVS(
	uint VId : SV_VertexID,
	uint IId : SV_InstanceID,
	out nointerpolation float4 OutColor : TEXCOORD0,
	out float4 OutPosition : SV_POSITION
	)
{
	OutPosition =  float4(0, 0, 0, 1);

	// needs to be the same on C++ side (faster on NVIDIA and AMD)
	uint QuadsPerInstance = 8;
#if MAC
	// remap the indices to get vertexid to VId and quadid into IId (6 vertices per quad)
	IId = IId * QuadsPerInstance + (VId / 6);
	VId = VId % 6;

	// triangle A: 0:left top, 1:right top, 2: left bottom
	// triangle B: 3:right bottom, 4:left bottom, 5: right top
	float2 CornerOffset = float2(VId % 2, VId > 1 && VId < 5) * 2 - 1;
#else
	// remap the indices to get vertexid to VId and quadid into IId (4 vertices per quad)
	IId = IId * QuadsPerInstance + (VId / 4);
	VId = VId % 4;

	// corner 0:(-1,-1), 1:(1,-1), 2:(-1,1), 3:(1,1)
	float2 CornerOffset = float2(VId % 2, VId / 2) * 2 - 1;
#endif

	uint2 TilePos = uint2( IId % TileCount.x, IId / TileCount.x );

	float2 ScreenPos = ( TilePos - ScreenPosToPixel.zw ) / ScreenPosToPixel.xy;

	float2 Velocity = PostprocessInput0[ TilePos ].xy;
	OutColor = Velocity.xyxy;

	// skip tiles with half a pixel of motion or less (the position stays at the origin)
	// NOTE(review): the threshold assumes a 1920x1080 view instead of using ScreenPosToPixel - confirm
	BRANCH
	if( length( Velocity * 0.5 * float2(1920,1080) ) <= 0.5 )
	{
		return;
	}

	// length and direction with a single rsqrt
	float  VelocityLengthSqr = dot( Velocity, Velocity );
	float  VelocityLengthInv = rsqrt( VelocityLengthSqr );
	float  VelocityLength = VelocityLengthSqr * VelocityLengthInv;
	float2 VelocityDir = Velocity * VelocityLengthInv;

	// +1 pixel radius because shape is swept tile, approx as capsule
	// +1 pixel radius for conservative rasterization
	const float TwoPixelRadius = sqrt( 2.0 );
	CornerOffset *= float2( VelocityLength, 0 ) + TwoPixelRadius.xx / ScreenPosToPixel.xy;

	// Orient along velocity direction
	float2 AxisX = VelocityDir;
	float2 AxisY = float2( -VelocityDir.y, VelocityDir.x );
	CornerOffset = AxisX * CornerOffset.x + AxisY * CornerOffset.y;

	OutPosition.xy = ScreenPos + CornerOffset;

	// Depth ordered by velocity length
	OutPosition.z = saturate( VelocityLength * 0.5 );
}

// Velocity scatter pixel shader: writes the per quad value from the vertex shader unchanged.
// @param InColor not interpolated value from TEXCOORD0
// @param OutColor copy of InColor
void VelocityScatterPS(
	nointerpolation float4 InColor : TEXCOORD0,
	out float4 OutColor : SV_Target0
	)
{
	OutColor = InColor;
}


// @return the longer of the two vectors, v1 if both have the same length
float2 MaxLength( float2 v0, float2 v1 )
{
	// squared lengths are enough for the comparison
	const bool bFirstIsLonger = dot( v0, v0 ) > dot( v1, v1 );

	return bFirstIsLonger ? v0 : v1;
}

// @return the longest of the three vectors, on equal length the later argument is returned
float2 MaxLength( float2 v0, float2 v1, float2 v2 )
{
	// squared lengths are enough for the comparison
	const float Sqr0 = dot( v0, v0 );
	const float Sqr1 = dot( v1, v1 );
	const float Sqr2 = dot( v2, v2 );

	if( Sqr0 > Sqr1 )
	{
		// v1 is out, decide between v0 and v2
		return ( Sqr0 > Sqr2 ) ? v0 : v2;
	}

	// v0 is out, decide between v1 and v2
	return ( Sqr1 > Sqr2 ) ? v1 : v2;
}

// Dilates the velocity: outputs the longest velocity of the 3x3 neighborhood in PostprocessInput0.
// @param UVAndScreenPos xy: UV of the center texel
// @param OutColor xy: longest velocity, zw: 0
void VelocityDilatePS(float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0)
{
	const float2 CenterUV = UVAndScreenPos.xy;

	// longest velocity of the row with the y offset -1
	const float2 UpperRowMax = MaxLength(
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2(-1, -1) ).xy,
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2( 0, -1) ).xy,
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2( 1, -1) ).xy );

	// longest velocity of the center row
	const float2 CenterRowMax = MaxLength(
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2(-1,  0) ).xy,
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0 ).xy,
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2( 1,  0) ).xy );

	// longest velocity of the row with the y offset +1
	const float2 LowerRowMax = MaxLength(
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2(-1,  1) ).xy,
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2( 0,  1) ).xy,
		PostprocessInput0.SampleLevel( PostprocessInput0Sampler, CenterUV, 0, int2( 1,  1) ).xy );

	OutColor = float4( MaxLength( UpperRowMax, CenterRowMax, LowerRowMax ), 0, 0 );
}


// Soft depth comparison between the center and a sample.
// @param DepthScale slope of the transition
// @return x: 0..1, grows with SampleDepth - CenterDepth, y: 0..1, grows with CenterDepth - SampleDepth,
//         both are 0.5 for equal depths
float2 DepthCmp( float CenterDepth, float SampleDepth, float DepthScale )
{
	const float DepthDelta = SampleDepth - CenterDepth;
	const float2 Slope = float2( DepthScale, -DepthScale );

	return saturate( 0.5 + Slope * DepthDelta );
	//return SampleDepth > CenterDepth ? float2(1,0) : float2(0,1);
}

// Tests if a blur of the given spread reaches a sample that is OffsetLength sample steps away.
// @param OffsetLength distance to the sample in sample steps
// @param SpreadLength two spread lengths that get tested at once
// @param PixelToSampleScale converts the spread lengths to sample steps
// @return per spread length 0..1, 1: the sample is fully covered
float2 SpreadCmp( float OffsetLength, float2 SpreadLength, float PixelToSampleScale )
{
	// the first sample step does not reduce the coverage
	const float StepsBeyondFirst = max( OffsetLength - 1, 0 );
	const float2 SpreadInSteps = PixelToSampleScale * SpreadLength;

	return saturate( SpreadInSteps - StepsBeyondFirst );
	//return PixelToSampleScale * SpreadLength > OffsetLength ? 1 : 0;
	//return SpreadLength > SearchLengthPixels * OffsetLength / ( StepCount - 0.5 ) ? 1 : 0;
}

// Weight of a single blur sample, combines DepthCmp() and SpreadCmp():
// the DepthCmp() x term is paired with the spread of the center,
// the DepthCmp() y term is paired with the spread of the sample.
// @return sum of both products
float SampleWeight( float CenterDepth, float SampleDepth, float OffsetLength, float CenterSpreadLength, float SampleSpreadLength, float PixelToSampleScale, float DepthScale )
{
	// x: center, y: sample
	const float2 SpreadLengths = float2( CenterSpreadLength, SampleSpreadLength );

	return dot(
		DepthCmp( CenterDepth, SampleDepth, DepthScale ),
		SpreadCmp( OffsetLength, SpreadLengths, PixelToSampleScale ) );
}

// Cheap per pixel pseudo random number.
// @param PixelPos integer pixel position used as the seed
// @param Magic multiplier, a different value results in a different sequence
// @return result of frac(), 0..1
float RandFast( uint2 PixelPos, float Magic = 3571.0 )
{
	const float2 Seed = ( 1.0 / 4320.0 ) * PixelPos + float2( 0.25, 0.0 );

	// two rounds of square, scale and frac; the second round works on the replicated first result
	float Hash = frac( dot( Seed * Seed, Magic ) );
	Hash = frac( dot( Hash.xx * Hash.xx, Magic ) );

	return Hash;
}


// New motionblur pixel shader, gathers samples in both directions along the max velocity of the tile.
// input:
//   0: RGB: scene color
//   1: R: device z
//   2: RG: encoded velocity
//   3: RG: max velocity per tile
// @param UVAndScreenPos xy: UV 0..1, zw: screen position -1..1
// @param OutColor RGB: blurred color, A: 1 (0 if the pixel takes the early out)
void MainNewPS( float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0 )
{
	// Screen Quad UV 0..1
	float2 UV = UVAndScreenPos.xy;
	// screen position in [-1, 1] screen space
	float2 ScreenSpacePos = UVAndScreenPos.zw;

	OutColor = 0;

	// note: this local hides the global float4 VelocityScale inside of this function
	float VelocityScale = MotionBlurParameters.x;

	// each step takes two samples (one in each direction)
	const uint StepCount = GetStepCountFromQuality() / 2;

	// NOTE(review): Random and Random2 are needed below, the code does not compile with this block disabled;
	// Dither is not used
#if 1
	uint2 PixelPos = uint2(UVAndScreenPos.zw * ScreenPosToPixel.xy + ScreenPosToPixel.zw + 0.5f);
	float2 PosMod = float2( PixelPos & 1 );
	float Dither = ( PosMod.x * 2 - 1 ) * ( PosMod.y * 2 - 1 );
	float Random = RandFast( PixelPos );
	float Random2 = RandFast( PixelPos, 5521 );
#endif

	// -0.25..0.25 texel of the tile texture, hides the tile borders
	float2 TileJitter = ( float2( Random, Random2 ) - 0.5 ) * 0.5;

	float2 MaxVelocity = PostprocessInput3.SampleLevel( PostprocessInput3Sampler, UV + TileJitter * PostprocessInput3Size.zw, 0 ).xy;
	//float2 MaxVelocity = PostprocessInput3[ PixelPos / 16 ].xy;

	float2 MaxVelocityScreen = MaxVelocity.xy * VelocityScale;
	float2 MaxVelocityPixels = MaxVelocityScreen * ScreenPosToPixel.xy;

	float3 CenterColor = PostprocessInput0.SampleLevel( PostprocessInput0Sampler, UV, 0 ).rgb;
	float  CenterDepth = PostprocessInput1.SampleLevel( PostprocessInput1Sampler, UV, 0 ).r;

	CenterDepth = ConvertFromDeviceZ( CenterDepth );

	// early out: half a pixel of motion or less in this tile, keep the center color (alpha stays 0)
	BRANCH
	if( length( MaxVelocityPixels ) <= 0.5 )
	{
		OutColor.rgb = CenterColor;
		return;
	}

	// in pixels
	float2 CenterVelocity = PostprocessInput2.SampleLevel( PostprocessInput2Sampler, UV, 0 ).xy;
	CenterVelocity = DecodeVelocityFromTexture( CenterVelocity ) * VelocityScale * ScreenPosToPixel.xy;
	float CenterVelocityLength = length( CenterVelocity );


	// Clip MaxVelocity to screen rect
	// Intersect.x/.y: fraction (0..1) of the max velocity that stays inside of -1..1 in + / - direction
	// NOTE(review): a zero component in MaxVelocityScreen makes rcp() return infinity - confirm the min/max below cope with it
	float2 InvVelocityScreen = rcp( MaxVelocityScreen );
	float2 MinIntersect = -InvVelocityScreen - ScreenSpacePos * InvVelocityScreen;
	float2 MaxIntersect =  InvVelocityScreen - ScreenSpacePos * InvVelocityScreen;
	float4 FarIntersect = float4( max( MinIntersect, MaxIntersect ), max( -MinIntersect, -MaxIntersect ) );
	float2 Intersect = saturate( min( FarIntersect.xz, FarIntersect.yw ) );

	// +/-
	// xy: search vector in + direction, zw: search vector in - direction
	float4 SearchVectorPixels = MaxVelocityPixels.xyxy * float4( Intersect.xx, -Intersect.yy );
	float2 SearchLengthPixels = { length( SearchVectorPixels.xy ), length( SearchVectorPixels.zw ) };

	// converts pixel length to sample steps
	float2 PixelToSampleScale = ( StepCount - 0.5 ) / SearchLengthPixels;

	// RGB: weighted color sum, A: weight sum
	float4 ColorAccum = 0;

	UNROLL
	for( uint i = 0; i < StepCount; i++ )
	{
		// index 0: sample in + direction, index 1: sample in - direction
		float2 SampleUV[2];
		float3 SampleColor[2];
		float  SampleDepth[2];
		float  SampleVelocityLength[2];
		float  Weight[2];

		// distance in sample steps, randomly moved by up to half a step
		float OffsetLength = (float)i + 0.5 + (Random - 0.5);
		float OffsetFraction = OffsetLength / ( StepCount - 0.5 );

		//float2 TestVector = (i & 1) > 0 ? CenterVelocity : SearchVectorPixels;
		//float2 PixelToSampleScale = ( StepCount - 0.5 ) / length( TestVector );

		// can be optimized
		SampleUV[0] = UV + OffsetFraction * SearchVectorPixels.xy * PostprocessInput0Size.zw;
		SampleUV[1] = UV + OffsetFraction * SearchVectorPixels.zw * PostprocessInput0Size.zw;

		UNROLL
		for( uint j = 0; j < 2; j++ )
		{
			SampleColor[j] = PostprocessInput0.SampleLevel( PostprocessInput0Sampler, SampleUV[j], 0 ).rgb;
			SampleDepth[j] = PostprocessInput1.SampleLevel( PostprocessInput1Sampler, SampleUV[j], 0 ).r;

			SampleDepth[j] = ConvertFromDeviceZ( SampleDepth[j] );

			// in pixels
			float2 SampleVelocity = PostprocessInput2.SampleLevel( PostprocessInput2Sampler, SampleUV[j], 0 ).xy;
			SampleVelocity = DecodeVelocityFromTexture( SampleVelocity ) * VelocityScale * ScreenPosToPixel.xy;
			SampleVelocityLength[j] = length( SampleVelocity );

			// NOTE(review): SOFT_Z_EXTENT is passed as the DepthScale parameter (a scale, not an extent),
			// the same with the current value of 1 - confirm when changing the define
			Weight[j] = SampleWeight( CenterDepth, SampleDepth[j], OffsetLength, CenterVelocityLength, SampleVelocityLength[j], PixelToSampleScale[j], SOFT_Z_EXTENT );

#if 0
			float TestLengthPixels = OffsetFraction * SearchLengthPixels[j];

			float2 DepthWeights = DepthCmp( CenterDepth, SampleDepth[j], SOFT_Z_EXTENT );

			//Weight[j]  = DepthWeights.x * saturate( 1 - TestLengthPixels / CenterVelocityLength );
			//Weight[j] += DepthWeights.y * saturate( 1 - TestLengthPixels / SampleVelocityLength[j] );
			//Weight[j] += ( 1 - smoothstep(0.95f * CenterVelocityLength, 1.05f * CenterVelocityLength, TestLengthPixels ) ) *
			//			( 1 - smoothstep(0.95f * SampleVelocityLength[j], 1.05f * SampleVelocityLength[j], TestLengthPixels ) ) * 2;
#endif
		}

		// mirror the weights between the two directions:
		// sample 0 takes the weight of sample 1 if both conditions are true,
		// sample 1 takes the (possibly updated) weight of sample 0 if none is true
		bool2 Mirror = bool2( SampleDepth[0] > SampleDepth[1], SampleVelocityLength[1] > SampleVelocityLength[0] );
		Weight[0] = all( Mirror ) ? Weight[1] : Weight[0];
		Weight[1] = any( Mirror ) ? Weight[1] : Weight[0];

		ColorAccum += Weight[0] * float4( SampleColor[0].rgb, 1 );
		ColorAccum += Weight[1] * float4( SampleColor[1].rgb, 1 );
	}

	//ColorAccum += float4( CenterColor, 1 ) / CenterVelocityLength;

	// normalize by the sample count (2 samples per step), the missing weight is filled with the center color
	ColorAccum *= 0.5 / StepCount;
	OutColor.rgb = ColorAccum.rgb + ( 1 - ColorAccum.a ) * CenterColor;

#if 0
	// Fudge factor to increase alpha
	ColorAccum.rgb /= ColorAccum.a;
	ColorAccum.a = saturate( ColorAccum.a * 1.2 );
	OutColor.rgb = lerp( CenterColor, ColorAccum.rgb, ColorAccum.a );
#endif

	OutColor.a = 1;

	//OutColor.rgb = ColorAccum.a * 0.2;
}