Files
UnrealEngineUWP/Engine/Shaders/FastMath.usf
Gil Gribb 0711601519 Copying //UE4/Dev-Rendering to Dev-Main (//UE4/Dev-Main)
#lockdown nick.penwarden

==========================
MAJOR FEATURES + CHANGES
==========================

Change 2879377 on 2016/02/24 by Gil.Gribb

	UE4 - Added render thread start and stop delegates. GitHub 2006.
	#Jira UE-26184

Change 2879378 on 2016/02/24 by Gil.Gribb

	UE4 - Avoided using TG_PrePhysics as the first tickgroup so that licensees can add tickgroups.
	https://udn.unrealengine.com/questions/279126/code-assumes-that-tg-prephysics-is-the-first-tick.html
	#Jira UE-26971

Change 2879382 on 2016/02/24 by Gil.Gribb

	UE4 - Tweaked automation test framework by request from UDN post.

Change 2879727 on 2016/02/24 by Martin.Mittring

	adding debug info for Optimus driver detection issue
	#rb:Benjamin.Hyder
	#Test:PC

Change 2879728 on 2016/02/24 by Martin.Mittring

	fixed and improved VisualizeMotionBlur
	#rb:David.Hill
	#test:PC

Change 2879729 on 2016/02/24 by Martin.Mittring

	added AngleBetweenVectors() and variants to the FastMath library
	#rb:David.Hill
	#code_review:Brian.Karis

Change 2880133 on 2016/02/24 by David.Hill

	new r.DepthOfFieldQualitySetting
	for GDC sequencer demo
	#rb:Martin.Mittring
	 - OR-15875

Change 2880314 on 2016/02/24 by Daniel.Wright

	Fixed uses of FDepthDrawingPolicyFactory being affected by bUseAsOccluder
	* This fixes preshadows on HISMC and foliage

Change 2880338 on 2016/02/24 by Martin.Mittring

	added SkinCache.Debug cvar
	#rb:Lina.Halper
	#test:PC

Change 2880344 on 2016/02/24 by Daniel.Wright

	Added the ability to apply DFAO to static indirect lighting, controlled by r.AOApplyToStaticIndirect
	* Lightmaps, stationary skylight and reflection captures are all affected
	* Specular occlusion on reflection captures requires a fair amount of tweaking of r.SkySpecularOcclusionStrength, MinOcclusion and MaxOcclusionDistance for good quality
	* For now, a movable skylight with low intensity (.0001) must be placed to control MaxOcclusionDistance and MinOcclusion

Change 2880346 on 2016/02/24 by Daniel.Wright

	Added several cvars to expose mesh distance field limits, which allows higher quality
	* r.DistanceFields.MaxPerMeshResolution
	* r.DistanceFields.DefaultVoxelDensity
	* r.DistanceFields.AtlasSizeXY
	* r.DistanceFields.AtlasSizeZ

Change 2881304 on 2016/02/25 by Gil.Gribb

	UE4 - Increased the priority of cloth tasks because these are on the critical path.

Change 2881306 on 2016/02/25 by Gil.Gribb

	UE4 - Added cvar to control background tick list cleanup.

Change 2881790 on 2016/02/25 by Daniel.Wright

	Screen size fading is only applied to spot and point lights

Change 2882077 on 2016/02/25 by Daniel.Wright

	DFAO indirect occlusion on static lighting is now correctly applied to IndirectIrradiance

Change 2882391 on 2016/02/25 by Martin.Mittring

	fixed bad caching of SRV for vertexbuffers in SkinCache (caused rendering artifacts and wasteful memory allocations). Finding a SRV is now O(1), was O(n)
	#rb:Olaf.Piesche
	#code_review:Rolando.Caloca,Marcus.Wassmer

Change 2883008 on 2016/02/26 by Gil.Gribb

	UE4 - Fixed recursive shader initialization crash on consoles.

Change 2883253 on 2016/02/26 by Martin.Mittring

	Improved SkinTangent compression
	#rb:Olaf.Piesche

Change 2883295 on 2016/02/26 by Martin.Mittring

	Added RecomputeSkinTangent feature for GPU SkinCache, not enabled by default (r.SkinCache.RecomputeTangents)
	#rb:Olaf.Piesche,Brian.Karis,Lina.Halper,Rolando.Caloca

Change 2883363 on 2016/02/26 by Gil.Gribb

	UE4 - Fixed an issue with recursive shader init on consoles...again.

Change 2883912 on 2016/02/26 by Gil.Gribb

	UE4 - Fixed shadows updating static meshes while the prepass is in progress.

Change 2884829 on 2016/02/27 by Martin.Mittring

	OR-16237 indirect lighting on skin is too dark
	#rb:Martin.Mittring
	#code_review:Brian.Karis

Change 2885096 on 2016/02/28 by Martin.Mittring

	OR-13678

[CL 2890130 by Gil Gribb in Main branch]
2016-03-02 13:38:38 -05:00

235 lines
6.3 KiB
Plaintext

// Copyright 1998-2016 Epic Games, Inc. All Rights Reserved.
/*=============================================================================
FastMath.usf: Fast approximate math functions for shaders
=============================================================================*/
#pragma once
/******************************************************************************
Shader Fast Math Lib (v0.41)
A shader math library for optimized approximate transcendental functions.
Optimized and tested on AMD GCN architecture.
********************************************************************************/
/******************************************************************************
The MIT License (MIT)
Copyright (c) <2014> <Michal Drobot>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
********************************************************************************/
//
// Normalized range [0,1] Constants
//
// Integer magic constants for the bit-pattern approximations, named after the
// operation they belong to: reciprocal (RCP), square root (SQRT) and
// reciprocal square root (RCP_SQRT).
// NOTE(review): none of the functions visible in this file reference these
// macros; the functions use their own literals. IEEE_INT_SQRT_CONST_NR0_SNORM
// has the same value as the literal used by sqrtFast(); the other two differ
// from the literals used by rcpFast()/rsqrtFast() - presumably they are tuned
// for inputs restricted to [0,1]; confirm before substituting them.
#define IEEE_INT_RCP_CONST_NR0_SNORM 0x7EEF370B
#define IEEE_INT_SQRT_CONST_NR0_SNORM 0x1FBD1DF5
#define IEEE_INT_RCP_SQRT_CONST_NR0_SNORM 0x5F341A43
// Approximate 1 / sqrt(x) without any refinement step.
// The float's bit pattern is reinterpreted as an integer, shifted right by one
// and subtracted from a magic constant; the result is reinterpreted as a float.
// When GL3_PROFILE is set the rsqrt() intrinsic is used instead.
// Relative error : ~3.4% over full
// Precise format : ~small float
// Cost           : 2 ALU
float rsqrtFast( float x )
{
#if GL3_PROFILE
	return rsqrt(x);
#else
	int Bits = asint(x);
	int Estimate = 0x5f3759df - (Bits >> 1);
	return asfloat(Estimate);
#endif
}
// Approximate sqrt(x) without any refinement step.
// The float's bit pattern is reinterpreted as an integer, shifted right by one
// and added to a magic constant; the result is reinterpreted as a float.
// When GL3_PROFILE is set the sqrt() intrinsic is used instead.
// Relative error : < 0.7% over full
// Precise format : ~small float
// Cost           : 1 ALU
float sqrtFast( float x )
{
#if GL3_PROFILE
	return sqrt(x);
#else
	int Bits = asint(x);
	int Estimate = 0x1FBD1DF5 + (Bits >> 1);
	return asfloat(Estimate);
#endif
}
// Approximate 1 / x without any refinement step.
// The float's bit pattern is reinterpreted as an integer and subtracted from a
// magic constant; the result is reinterpreted as a float.
// When GL3_PROFILE is set the rcp() intrinsic is used instead.
// Relative error : < 0.4% over full
// Precise format : ~small float
// Cost           : 1 ALU
float rcpFast( float x )
{
#if GL3_PROFILE
	return rcp(x);
#else
	int Bits = asint(x);
	int Estimate = 0x7EF311C2 - Bits;
	return asfloat(Estimate);
#endif
}
// Approximate 1 / x, refined with one Newton-Raphson iteration.
// The initial guess comes from subtracting the float's bit pattern from a magic
// constant (note: this constant is not the same literal as the one in rcpFast());
// the guess r is then improved with r * (2 - r * x).
// When GL3_PROFILE is set the rcp() intrinsic is used instead.
// Relative error : < 0.02% over full
// Precise format : ~half float
// Cost           : 3 ALU
float rcpFastNR1( float x )
{
#if GL3_PROFILE
	return rcp(x);
#else
	// Bit-level initial guess
	float Guess = asfloat(0x7EF311C3 - asint(x));
	// Single Newton-Raphson refinement step
	return Guess * (-Guess * x + 2.0f);
#endif
}
// Approximate length of a 3D vector: sqrtFast() applied to the squared length,
// so the result carries the error of sqrtFast().
float lengthFast( float3 v )
{
	return sqrtFast( dot(v, v) );
}
// Approximately normalizes a 3D vector by scaling it with rsqrtFast() of its
// squared length, so the result carries the error of rsqrtFast().
float3 normalizeFast( float3 v )
{
	float InvLength = rsqrtFast( dot(v, v) );
	return v * InvLength;
}
//
// Trigonometric functions
//
// Approximate acos() using Eberly's degree 1 polynomial (respects bounds).
// The approximation is evaluated on |inX| and mirrored around PI/2 for
// negative inputs.
// max absolute error 9.0x10^-3
// 4 VGPR, 12 FR (8 FR, 1 QR), 1 scalar
// input [-1, 1] and output [0, PI]
float acosFast(float inX)
{
	float AbsX = abs(inX);
	float Angle = -0.156583f * AbsX + (0.5 * PI);
	Angle = Angle * sqrt(1.0f - AbsX);
	// Mirror the result for the negative half of the input range
	return (inX >= 0) ? Angle : PI - Angle;
}
// Approximate asin(), computed as PI/2 - acosFast(x).
// Same cost as acosFast + 1 FR
// Same error as acosFast
// input [-1, 1] and output [-PI/2, PI/2]
float asinFast( float x )
{
	float Complement = acosFast(x);
	return (0.5 * PI) - Complement;
}
// Approximate atan() for non-negative inputs using Eberly's odd polynomial of
// degree 5 (respects bounds).
// Inputs of 1 or more are replaced by their reciprocal before the polynomial is
// evaluated, and the result is then taken as PI/2 minus the polynomial value.
// max absolute error 1.3x10^-3
// 4 VGPR, 14 FR (10 FR, 1 QR), 2 scalar
// input [0, infinity] and output [0, PI/2]
float atanFastPos( float x )
{
	bool bBelowOne = (x < 1.0f);
	float Reduced = bBelowOne ? x : 1.0f / x;
	float ReducedSqr = Reduced * Reduced;

	// Horner evaluation of the odd polynomial in Reduced
	float Approx = -0.301895f + 0.0872929f * ReducedSqr;
	Approx = 1.0f + Approx * ReducedSqr;
	Approx = Approx * Reduced;

	return bBelowOne ? Approx : (0.5 * PI) - Approx;
}
// Approximate atan() for any sign: evaluates atanFastPos() on |x| and restores
// the sign of the input.
// 4 VGPR, 16 FR (12 FR, 1 QR), 2 scalar
// input [-infinity, infinity] and output [-PI/2, PI/2]
float atanFast( float x )
{
	float Magnitude = atanFastPos( abs(x) );
	return (x < 0) ? -Magnitude : Magnitude;
}
// Approximate atan2(y, x).
// The ratio min(|x|,|y|) / max(|x|,|y|) is fed through the same polynomial as
// atanFastPos(); the result is then corrected for |y| > |x|, for negative x and
// for negative y, in that order.
// Note: when both x and y are 0 the ratio is 0 / 0, so the result is not meaningful.
float atan2Fast( float y, float x )
{
	float AbsX = abs(x);
	float AbsY = abs(y);
	float Larger = max( AbsX, AbsY );
	float Smaller = min( AbsX, AbsY );
	float Ratio = Smaller / Larger;
	float RatioSqr = Ratio * Ratio;

	// Same polynomial as atanFastPos
	float Poly = + 0.0872929;
	Poly = Poly * RatioSqr - 0.301895;
	Poly = Poly * RatioSqr + 1.0;
	float Angle = Poly * Ratio;

	// Undo the range reduction, then select the quadrant
	Angle = AbsY > AbsX ? (0.5 * PI) - Angle : Angle;
	Angle = x < 0 ? PI - Angle : Angle;
	Angle = y < 0 ? -Angle : Angle;
	return Angle;
}
// Higher precision acos() approximation: a four-coefficient polynomial in |inX|
// multiplied by sqrt(1 - |inX|), mirrored around PI/2 for negative inputs.
// 4 VGPR, 16 ALU Full Rate
// 7 * 10^-5 radians precision
// Reference : Handbook of Mathematical Functions (chapter : Elementary Transcendental Functions), M. Abramowitz and I.A. Stegun, Ed.
float acosFast4(float inX)
{
	float AbsX = abs(inX);
	float AbsXSqr = AbsX * AbsX;
	float AbsXCube = AbsXSqr * AbsX;

	// Accumulate the polynomial terms from lowest to highest power
	float Poly = -0.2121144f * AbsX + 1.5707288f;
	Poly = 0.0742610f * AbsXSqr + Poly;
	Poly = -0.0187293f * AbsXCube + Poly;

	float Angle = sqrt(1.0f - AbsX) * Poly;

	// acos function mirroring
	// check per platform if this compiles to a selector - no branch needed
	return inX >= 0.0f ? Angle : PI - Angle;
}
// Higher precision asin() approximation, computed as PI/2 - acosFast4(x).
// 4 VGPR, 16 ALU Full Rate
// 7 * 10^-5 radians precision
float asinFast4( float x )
{
	float Complement = acosFast4(x);
	return (0.5 * PI) - Complement;
}
// Cosine of the angle between two vectors, without normalizing them individually.
// @param A doesn't have to be normalized, output could be NaN if this is near 0,0,0
// @param B doesn't have to be normalized, output could be NaN if this is near 0,0,0
// @return can be passed to a acosFast() or acos() to compute an angle
float CosBetweenVectors(float3 A, float3 B)
{
	// unoptimized: dot(normalize(A), normalize(B))
	// A single rsqrt of the product of the squared lengths replaces both normalizations
	float InvLengthProduct = rsqrt(length2(A) * length2(B));
	return dot(A, B) * InvLengthProduct;
}
// Angle between two vectors, using the precise acos() intrinsic.
// @param A doesn't have to be normalized, output is not meaningful (could be NaN) if this is near 0,0,0
// @param B doesn't have to be normalized, output is not meaningful (could be NaN) if this is near 0,0,0
// @return angle in radians, in the range [0, PI]
float AngleBetweenVectors(float3 A, float3 B)
{
	// Rounding in CosBetweenVectors() can push the cosine of (anti)parallel vectors
	// slightly outside [-1, 1], where acos() is undefined, so clamp it first.
	float CosAngle = clamp(CosBetweenVectors(A, B), -1.0f, 1.0f);
	return acos(CosAngle);
}
// Angle between two vectors, using the approximate acosFast() (same error as acosFast()).
// @param A doesn't have to be normalized, output is not meaningful (could be NaN) if this is near 0,0,0
// @param B doesn't have to be normalized, output is not meaningful (could be NaN) if this is near 0,0,0
// @return approximate angle in radians, in the range [0, PI]
float AngleBetweenVectorsFast(float3 A, float3 B)
{
	// Rounding in CosBetweenVectors() can push the cosine of (anti)parallel vectors
	// slightly outside [-1, 1]; acosFast() takes sqrt(1 - |x|), which is negative
	// there, so clamp the cosine first.
	float CosAngle = clamp(CosBetweenVectors(A, B), -1.0f, 1.0f);
	return acosFast(CosAngle);
}