Files
KTexComp/ispc_texcomp/kernel.ispc
2021-06-23 22:39:38 +02:00

567 lines
15 KiB
Plaintext

//
// BC1-BC5 encoding implementation.
//
//
// Copyright (c) 2016 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be included in all copies
// or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
// OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
#include "kernel_shared.isph"
///////////////////////////////////////////////////////////
// BC1/BC2/BC3/BC4/BC5/BC7 shared
///////////////////////////////////////////////////////////
// BC1/BC2/BC3/BC4/BC5 encoding
// Rounded 8-bit multiply: approximates (a*b)/255 with rounding, using only
// adds and shifts (no division).
inline int stb__Mul8Bit(int a, int b)
{
    const int biased = a*b + 128;               // +128 supplies the rounding
    const int folded = biased + (biased >> 8);  // x + x/256 stands in for the /255
    return folded >> 8;
}
// Packs an 8-bit-per-channel colour into 5:6:5 layout
// (red in bits 15..11, green in bits 10..5, blue in bits 4..0).
inline unsigned int16 stb__As16Bit(int r, int g, int b)
{
    const int red5   = stb__Mul8Bit(r, 31);
    const int green6 = stb__Mul8Bit(g, 63);
    const int blue5  = stb__Mul8Bit(b, 31);
    return (red5 << 11) + (green6 << 5) + blue5;
}
// Encodes a float colour (c[0]=red, c[1]=green, c[2]=blue) as a packed 5:6:5
// value. Each channel is truncated to int before packing.
inline unsigned int16 enc_rgb565(float c[3])
{
    const int red   = (int)c[0];
    const int green = (int)c[1];
    const int blue  = (int)c[2];
    return stb__As16Bit(red, green, blue);
}
// Expands a packed 5:6:5 colour `p` into c[0..2] (red, green, blue).
// The top bits of each field are replicated into the low bits so that the
// field maximum (31 or 63) expands to exactly 255.
inline void dec_rgb565(float c[3], int p)
{
    const int red5   = (p >> 11) & 31;
    const int green6 = (p >> 5) & 63;
    const int blue5  = p & 31;

    c[0] = (red5 << 3) + (red5 >> 2);
    c[1] = (green6 << 2) + (green6 >> 4);
    c[2] = (blue5 << 3) + (blue5 >> 2);
}
// Adds the 16 texels of each colour plane (block[p*16 .. p*16+15]) onto c0[p]
// and then divides the result by 16 with an arithmetic shift.
//
// NOTE(review): c0 is accumulated into without being cleared here, so the
// result is the plane average only if the caller passes c0 zeroed - confirm
// against callers. c1 and iaxis are accepted but never read or written.
inline void pick_endpoints_dc(int c0[3], int c1[3], int block[48], int iaxis[3])
{
    for (uniform int plane=0; plane<3; plane++)
    {
        int total = c0[plane];
        for (uniform int k=0; k<16; k++)
            total += block[plane*16+k];
        c0[plane] = total >> 4;
    }
}
// Chooses the two colour endpoints of a block along a given axis.
//
// Every texel (minus the block mean `dc`) is projected onto `axis`; the
// smallest and largest projections are mapped back into colour space and
// clamped to [0,255]:
//   c0 = dc + min_projection * axis / |axis|^2
//   c1 = dc + max_projection * axis / |axis|^2
// block is planar: 16 reds, then 16 greens, then 16 blues.
inline void pick_endpoints(float c0[3], float c1[3], float block[48], float axis[3], float dc[3])
{
    float lo = 256*256;
    float hi = 0;

    // Texel order k = y*4+x, i.e. row by row.
    for (uniform int k=0; k<16; k++)
    {
        float proj = 0;
        for (uniform int p=0; p<3; p++)
            proj += (block[p*16+k]-dc[p])*axis[p];

        lo = min(lo, proj);
        hi = max(hi, proj);
    }

    // Widen a (nearly) degenerate range so the endpoints do not coincide.
    if (hi-lo < 1f)
    {
        lo -= 0.5f;
        hi += 0.5f;
    }

    float axis_len_sq = 0;
    for (uniform int p=0; p<3; p++)
        axis_len_sq += axis[p]*axis[p];

    float inv_len_sq = rcp(axis_len_sq);
    float t_lo = lo*inv_len_sq;
    float t_hi = hi*inv_len_sq;

    for (uniform int p=0; p<3; p++)
    {
        c0[p] = clamp(dc[p]+t_lo*axis[p], 0, 255);
        c1[p] = clamp(dc[p]+t_hi*axis[p], 0, 255);
    }
}
// Quantizes each of the 16 texels of a planar RGB block to a 2-bit position
// on the line between the two 5:6:5 endpoints p0 and p1.
//
// The returned word holds texel k in bits [2k+1:2k]. Positions are *linear*:
// 0 = at p0, 3 = at p1, 1 and 2 are the thirds in between. (They are not yet
// in BC1 index order.)
//
// If both endpoints decode to the same colour the direction vector is zero;
// all texels then get position 0.
inline uint32 fast_quant(float block[48], int p0, int p1)
{
    float c0[3];
    float c1[3];
    dec_rgb565(c0, p0);
    dec_rgb565(c1, p1);

    float dir[3];
    for (uniform int p=0; p<3; p++)
        dir[p] = c1[p]-c0[p];

    float sq_norm = 0;
    for (uniform int p=0; p<3; p++)
        sq_norm += sq(dir[p]);

    // Guard the degenerate p0 == p1 case: rcp(0) would give inf/NaN, and the
    // float->int conversion of a NaN further down is not well defined.
    // A zero scale makes every projection 0, so every texel maps to position 0.
    float rsq_norm = (sq_norm > 0) ? rcp(sq_norm) : 0f;

    // Scale so that dot(texel - c0, dir) runs from 0 (at c0) to 3 (at c1).
    for (uniform int p=0; p<3; p++)
        dir[p] *= rsq_norm*3;

    // Fold the "- c0" offset and the +0.5 rounding term into one constant.
    float bias = 0.5;
    for (uniform int p=0; p<3; p++)
        bias -= c0[p]*dir[p];

    uint32 bits = 0;
    uint32 scaler = 1;
    for (uniform int k=0; k<16; k++)
    {
        float dot = 0;
        for (uniform int p=0; p<3; p++)
            dot += block[k+p*16]*dir[p];

        int q = clamp((int)(dot+bias), 0, 3);

        // Same as bits += q << (k*2), written as a running multiply.
        bits += q*scaler;
        scaler *= 4;
    }
    return bits;
}
// Computes the per-channel mean (dc) and the un-normalised 3x3 colour
// covariance of a planar RGB block (16 reds, 16 greens, 16 blues).
//
// Only the upper triangle of the symmetric matrix is stored:
//   covar[0]=RR  covar[1]=RG  covar[2]=RB
//                covar[3]=GG  covar[4]=GB
//                             covar[5]=BB
// The sums are not divided by the texel count.
inline void compute_covar_dc(float covar[6], float dc[3], float block[48])
{
    // Channel means.
    for (uniform int p=0; p<3; p++)
    {
        dc[p] = 0;
        for (uniform int k=0; k<16; k++)
            dc[p] += block[k+p*16];
        dc[p] /= 16;
    }

    for (uniform int i=0; i<6; i++)
        covar[i] = 0;

    // Accumulate products of the mean-centred channels.
    for (uniform int k=0; k<16; k++)
    {
        float dr = block[k+0*16]-dc[0];
        float dg = block[k+1*16]-dc[1];
        float db = block[k+2*16]-dc[2];

        covar[0] += dr*dr;
        covar[1] += dr*dg;
        covar[2] += dr*db;
        covar[3] += dg*dg;
        covar[4] += dg*db;
        covar[5] += db*db;
    }
}
// Same result layout as compute_covar_dc (channel means in dc, upper triangle
// of the un-normalised covariance in covar), but accumulating into scalar
// locals instead of the output array.
// ugly, but makes BC1 compression 20% faster overall
inline void compute_covar_dc_ugly(float covar[6], float dc[3], float block[48])
{
    // Channel means.
    for (uniform int p=0; p<3; p++)
    {
        float total = 0;
        for (uniform int k=0; k<16; k++)
            total += block[k+p*16];
        dc[p] = total/16;
    }

    float rr = 0f;
    float rg = 0f;
    float rb = 0f;
    float gg = 0f;
    float gb = 0f;
    float bb = 0f;

    for (uniform int k=0; k<16; k++)
    {
        float dr = block[k+0*16]-dc[0];
        float dg = block[k+1*16]-dc[1];
        float db = block[k+2*16]-dc[2];

        rr += dr*dr;
        rg += dr*dg;
        rb += dr*db;
        gg += dg*dg;
        gb += dg*db;
        bb += db*db;
    }

    covar[0] = rr;
    covar[1] = rg;
    covar[2] = rb;
    covar[3] = gg;
    covar[4] = gb;
    covar[5] = bb;
}
// Re-fits the two endpoints to the block for a fixed set of texel positions.
//
// `bits` holds one 2-bit linear position q per texel (0 = first endpoint,
// 3 = second endpoint). Each texel is modelled as ((3-q)*c0 + q*c1)/3 and the
// 2x2 least-squares system for c0/c1 is solved per channel. If every texel
// has the same position the system is singular, so both endpoints are set to
// the block mean `dc` instead. The results are written to pe[0], pe[1] as
// packed 5:6:5 colours.
inline void bc1_refine(int pe[2], float block[48], unsigned int32 bits, float dc[3])
{
    float end0[3];
    float end1[3];

    // XOR-ing with a copy shifted by one texel clears every 2-bit field that
    // equals its lower neighbour; only the lowest field (< 4) can survive when
    // all 16 positions are identical.
    if ((bits ^ (bits*4)) < 4)
    {
        // single color
        for (uniform int p=0; p<3; p++)
        {
            end0[p] = dc[p];
            end1[p] = dc[p];
        }
    }
    else
    {
        float rhs0[3] = {0,0,0};   // sum of (3-q)*texel per channel
        float sum_q = 0;
        float sum_qq = 0;

        unsigned int32 remaining = bits;
        for (uniform int k=0; k<16; k++)
        {
            float q = (int)(remaining&3);
            remaining >>= 2;

            float w0 = 3-q;
            sum_q += q;
            sum_qq += q*q;

            for (uniform int p=0; p<3; p++)
                rhs0[p] += w0*block[k+p*16];
        }

        // sum of q*texel, derived from the channel total because (3-q)+q = 3.
        float rhs1[3];
        for (uniform int p=0; p<3; p++)
        {
            float channel_total = dc[p]*16;
            rhs1[p] = 3*channel_total-rhs0[p];
        }

        // Normal-equation coefficients: sum (3-q)^2, sum q^2, sum (3-q)*q.
        float Cxx = 16*sq(3)-2*3*sum_q+sum_qq;
        float Cyy = sum_qq;
        float Cxy = 3*sum_q-sum_qq;
        float scale = 3f * rcp(Cxx*Cyy - Cxy*Cxy);

        for (uniform int p=0; p<3; p++)
        {
            end0[p] = clamp((rhs0[p]*Cyy - rhs1[p]*Cxy)*scale, 0, 255);
            end1[p] = clamp((rhs1[p]*Cxx - rhs0[p]*Cxy)*scale, 0, 255);
        }
    }

    pe[0] = enc_rgb565(end0);
    pe[1] = enc_rgb565(end1);
}
// Remaps sixteen packed 2-bit fields in parallel:
//   0 -> 0,  1 -> 2,  2 -> 3,  3 -> 1
// i.e. from linear position order to an order where the two end positions
// take codes 0 and 1 and the two intermediate positions take codes 2 and 3.
inline uint32 fix_qbits(uint32 qbits)
{
    uniform const uint32 low_bit_mask  = 0x55555555;
    uniform const uint32 high_bit_mask = 0xAAAAAAAA;

    uint32 low_bits  = qbits&low_bit_mask;
    uint32 high_bits = qbits&high_bit_mask;

    // New low bit = old high bit; new high bit = old high XOR old low.
    uint32 new_low  = high_bits>>1;
    uint32 new_high = high_bits ^ (low_bits<<1);
    return new_low + new_high;
}
///////////////////////////////////////////////////////////
// Low level per block handling
//
// Encodes one planar RGB block (16 reds, 16 greens, 16 blues) as a BC1 colour
// block: BlockOutput[0] receives the two 5:6:5 endpoints (larger value in the
// low 16 bits), BlockOutput[1] the sixteen 2-bit indices.
//
// Steps: covariance -> dominant axis (compute_axis3) -> endpoints along that
// axis -> index assignment -> least-squares endpoint refinement -> index
// reordering.
//
// AllowSecondaryMode is currently not read by this function.
inline void CompressBlockBC1_core( float BlockInput[48], uint32 BlockOutput[2], bool AllowSecondaryMode )
{
    uniform const int powerIterations = 4;
    uniform const int refineIterations = 1;

    float covar[6];
    float dc[3];
    compute_covar_dc_ugly(covar, dc, BlockInput);

    // Small bias on the diagonal (RR, GG, BB) of the covariance matrix.
    float eps = 0.001;
    covar[0] += eps;
    covar[3] += eps;
    covar[5] += eps;

    float axis[3];
    compute_axis3(axis, covar, powerIterations);

    float lo[3];
    float hi[3];
    pick_endpoints(lo, hi, BlockInput, axis, dc);

    int ep[2];
    ep[0] = enc_rgb565(lo);
    ep[1] = enc_rgb565(hi);

    // Iteration 0 packs the initial guess; each further iteration first
    // refines the endpoints against the indices of the previous one.
    for (uniform int iter=0; iter<=refineIterations; iter++)
    {
        if (iter > 0)
            bc1_refine(ep, BlockInput, BlockOutput[1], dc);

        // Keep ep[0] >= ep[1].
        if (ep[0]<ep[1])
            swap_ints(&ep[0], &ep[1], 1);

        BlockOutput[0] = (1<<16)*ep[1]+ep[0];
        BlockOutput[1] = fast_quant(BlockInput, ep[0], ep[1]);
    }

    // fast_quant yields linear positions; convert them to stored index order.
    BlockOutput[1] = fix_qbits(BlockOutput[1]);
}
///////////////////////////////////////////////////////////
// BC2 quantization routine.
//
// BC2 stores a 4 bit unsigned normalized integer for each alpha value.
//
// Maps alpha values into [0,15] and rounds to nearest integer.
// Packs 16 alpha values into 64 bits, 4 bits per texel.
// Each value is scaled from [0,255] to [0,15], rounded and masked to a
// nibble; texel 0 ends up in the lowest nibble of AlphaOutput[0].
inline void QuantizeBlockBC2_alpha( float AlphaInput[16], uint32 AlphaOutput[2] )
{
    int32 Rows[4];

    for ( uniform int Row=0; Row<4; Row++ )
    {
        // Four nibbles per row, column 0 in the lowest bits.
        int32 Packed = 0;
        for ( uniform int Col=0; Col<4; Col++ )
        {
            int32 Nibble = ((int32)round(AlphaInput[4*Row+Col]/255.f*15.f))&0xF;
            Packed |= Nibble << (4*Col);
        }
        Rows[Row] = Packed;
    }

    // Two 16-bit rows per output dword.
    AlphaOutput[0] = (Rows[0]<<0) | (Rows[1]<<16);
    AlphaOutput[1] = (Rows[2]<<0) | (Rows[3]<<16);
}
///////////////////////////////////////////////////////////
// BC3 alpha/BC4/BC5 compression routine.
//
// References:
// https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_compression_s3tc.txt
// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
// https://msdn.microsoft.com/de-de/library/windows/desktop/bb694531(v=vs.85).aspx
//
// The format stores two 1 byte integer endpoints (alpha_0 and alpha_1) which are used to
// create a 3-bit/8 value palette. It offers two modes which are selected depending on
// whether alpha_0 > alpha_1.
//
// The endpoints are followed by 16 3-bit values for each pixel.
//
// The endpoints are signed/unsigned byte depending on whether the format is either
// signed or unsigned. The BC3 alpha is regarded as unsigned.
//
// Depending on the mode, the following values are used for the palette (code taken from the MSDN link above):
//
// if( alpha_0 > alpha_1 )
// {
// // 6 interpolated alpha values.
// alpha_2 = 6/7*alpha_0 + 1/7*alpha_1; // bit code 010
// alpha_3 = 5/7*alpha_0 + 2/7*alpha_1; // bit code 011
// alpha_4 = 4/7*alpha_0 + 3/7*alpha_1; // bit code 100
// alpha_5 = 3/7*alpha_0 + 4/7*alpha_1; // bit code 101
// alpha_6 = 2/7*alpha_0 + 5/7*alpha_1; // bit code 110
// alpha_7 = 1/7*alpha_0 + 6/7*alpha_1; // bit code 111
// }
// else
// {
// // 4 interpolated alpha values.
// alpha_2 = 4/5*alpha_0 + 1/5*alpha_1; // bit code 010
// alpha_3 = 3/5*alpha_0 + 2/5*alpha_1; // bit code 011
// alpha_4 = 2/5*alpha_0 + 3/5*alpha_1; // bit code 100
// alpha_5 = 1/5*alpha_0 + 4/5*alpha_1; // bit code 101
// alpha_6 = 0.0; // bit code 110, for an unsigned format
// alpha_6 =-1.0; // bit code 110, for a signed format
// alpha_7 = 1.0; // bit code 111
// }
//
// In either of them:
// bit code 000 will be alpha_0
// bit code 001 will be alpha_1
//
// Encodes 16 single-channel values as one 64-bit block with two 8-bit
// endpoints and sixteen 3-bit indices, always using the alpha_0 > alpha_1
// (six interpolated values) mode.
//
// Output layout: byte 0 = alpha_0, byte 1 = alpha_1, followed by the 48 index
// bits (texel 0 first), split across AlphaOutput[0] and AlphaOutput[1].
inline void CompressBlockBC3_alpha( float AlphaInput[16], uint32 AlphaOutput[2] )
{
    // Value range of the block.
    float Lowest = 255;
    float Highest = 0;
    for ( uniform int k=0; k<16; k++ )
    {
        Lowest = min( Lowest, AlphaInput[k] );
        Highest = max( Highest, AlphaInput[k] );
    }

    // Integer endpoints that enclose the range.
    float Alpha0 = min(ceil(Highest),255f);
    float Alpha1 = max(0f,floor(Lowest));

    // Force alpha_0 > alpha_1 so the block stays in the 8-value mode.
    if ( Alpha0==Alpha1 )
    {
        if ( Alpha0<255f )
            Alpha0 += 1f;
        else
            Alpha1 -= 1f;
    }

    uint32 Indices[2] = { 0, 0 }; // 8 texels x 3 bits = 24 bits per element.
    float Scale = 7f/(Alpha0-Alpha1);

    for ( uniform int k=0; k<16; k++ )
    {
        // Linear step 0..7 from alpha_1 (0) up to alpha_0 (7), rounded to nearest.
        float Projected = (AlphaInput[k]-Alpha1)*Scale+0.5f;
        int Step = clamp( (int)Projected, 0, 7 );

        // Stored code order: 0 = alpha_0, 1 = alpha_1, 2..7 = interpolants
        // running from near alpha_0 to near alpha_1.
        int Code = (Step==0) ? 1 : ((Step==7) ? 0 : 8-Step);

        Indices[k/8] |= Code << ((k%8)*3);
    }

    // (could be improved by refinement)
    AlphaOutput[0] = clamp((int)Alpha0,0,255) + clamp((int)Alpha1,0,255)*256;
    AlphaOutput[0] |= Indices[0]<<16;   // low 16 of the first 24 index bits
    AlphaOutput[1] = Indices[0]>>16;    // remaining 8 of the first 24
    AlphaOutput[1] |= Indices[1]<<8;    // second 24 index bits
}
///////////////////////////////////////////////////////////
// High level per block handling.
//
// BlockInput[]:
// 0-15: Red
// 16-31: Green
// 32-47: Blue
// 48-63: Alpha
//
// BC1: encodes the 48 planar RGB floats of one block into 2 dwords by
// forwarding to CompressBlockBC1_core with AllowSecondaryMode = false.
inline void CompressBlockBC1( float BlockInput[48], uint32 BlockOutput[2] )
{
CompressBlockBC1_core( BlockInput, BlockOutput, false );
}
// BC1 variant that calls the core encoder with AllowSecondaryMode = true.
// !! I'm using this for secondary mode testing for now.
//
// The output block is only 2 dwords, so the colour data must be written at
// BlockOutput[0..1]. (An offset of 2 is only valid for the 4-dword BC2/BC3
// blocks, where the first two dwords hold alpha.)
inline void CompressBlockBC1PA( float BlockInput[48], uint32 BlockOutput[2] )
{
    CompressBlockBC1_core( BlockInput, BlockOutput, true );
}
// BC2: 4-bit quantized alpha followed by a BC1 colour block.
//   BlockInput[48..63] (alpha) -> BlockOutput[0..1]
//   BlockInput[ 0..47] (RGB)   -> BlockOutput[2..3]
inline void CompressBlockBC2( float BlockInput[64], uint32 BlockOutput[4] )
{
QuantizeBlockBC2_alpha( &BlockInput[48], &BlockOutput[0] );
CompressBlockBC1_core ( &BlockInput[ 0], &BlockOutput[2], false );
}
// BC3: interpolated (endpoint + 3-bit index) alpha followed by a BC1 colour block.
//   BlockInput[48..63] (alpha) -> BlockOutput[0..1]
//   BlockInput[ 0..47] (RGB)   -> BlockOutput[2..3]
inline void CompressBlockBC3( float BlockInput[64], uint32 BlockOutput[4] )
{
CompressBlockBC3_alpha( &BlockInput[48], &BlockOutput[0] );
CompressBlockBC1_core ( &BlockInput[ 0], &BlockOutput[2], false );
}
// BC4: a single channel of 16 values, encoded with the same routine that
// produces the BC3 alpha half-block (2 dwords out).
inline void CompressBlockBC4( float BlockInput[16], uint32 BlockOutput[2] )
{
CompressBlockBC3_alpha( BlockInput, BlockOutput );
}
// BC5: two independent channels, each encoded like a BC3 alpha half-block.
//   BlockInput[ 0..15] -> BlockOutput[0..1]
//   BlockInput[16..31] -> BlockOutput[2..3]
inline void CompressBlockBC5( float BlockInput[32], uint32 BlockOutput[4] )
{
CompressBlockBC3_alpha( &BlockInput[ 0], &BlockOutput[0] );
CompressBlockBC3_alpha( &BlockInput[16], &BlockOutput[2] );
}
///////////////////////////////////////////////////////////
// Dispatcher from input surface to block level.
//
// Templates would be nice here, but not in ispc, maybe preprocessor macros?
// Are there function pointers available in ispc?
//
// Generates one exported entry point that compresses a whole surface.
//
//   export_name       name of the generated exported function
//   compress_routine  per-block encoder, called as (BlockInput, BlockOutput)
//   surface_type      type of InputSurface (must provide Width and Height)
//   load_routine      fills BlockInput for the block at (horizontal, vertical) index
//   num_floats        element count of the per-block float input buffer
//   num_block_dwords  element count of the per-block uint32 output buffer
//
// The block grid is Width/4 by Height/4. Rows are walked by a uniform loop;
// the blocks within a row are distributed by `foreach`. Each block is loaded,
// compressed and written out via StoreCompressedBlock.
// NOTE(review): with integer Width/Height, any remainder that does not fill
// a complete 4x4 block is not visited - confirm callers pad their surfaces.
//
// Do not put `//` comments on the continued lines below: the line
// continuation would pull the following line into the comment.
#define COMPRESSION_ENTRY_POINT(export_name,compress_routine,surface_type,load_routine,num_floats,num_block_dwords) \
export void export_name( uniform surface_type InputSurface[], uniform uint8 OutputBlocks[] ) \
{ \
uniform int NumHorizontalBlocks = InputSurface->Width/4; \
uniform int NumVerticalBlocks = InputSurface->Height/4; \
\
for ( uniform int VerticalBlockIndex=0; VerticalBlockIndex<NumVerticalBlocks; VerticalBlockIndex++ ) \
{ \
foreach ( HorizontalBlockIndex=0...NumHorizontalBlocks ) \
{ \
float BlockInput[num_floats]; \
uint32 BlockOutput[num_block_dwords]; \
\
load_routine( BlockInput, InputSurface, HorizontalBlockIndex, VerticalBlockIndex ); \
compress_routine( BlockInput, BlockOutput ); \
StoreCompressedBlock( OutputBlocks, InputSurface->Width, HorizontalBlockIndex, VerticalBlockIndex, BlockOutput, num_block_dwords ); \
} \
} \
}
// Exported entry points. Arguments: exported name, per-block encoder, surface
// type, block loader, floats per input block, dwords per output block.
//
// BC1: 48 input floats (RGB), 2 output dwords per block.
COMPRESSION_ENTRY_POINT(BC1CompressRGBA8,CompressBlockBC1,Surface_RGBA8,LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA8,48,2)
COMPRESSION_ENTRY_POINT(BC1CompressRGBA16,CompressBlockBC1,Surface_RGBA16,LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA16,48,2)
// BC1 PA: same buffers as BC1, routed through CompressBlockBC1PA.
COMPRESSION_ENTRY_POINT(BC1PACompressRGBA8,CompressBlockBC1PA,Surface_RGBA8,LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA8,48,2)
COMPRESSION_ENTRY_POINT(BC1PACompressRGBA16,CompressBlockBC1PA,Surface_RGBA16,LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA16,48,2)
// BC2: 64 input floats (RGB + alpha), 4 output dwords per block.
COMPRESSION_ENTRY_POINT(BC2CompressRGBA8,CompressBlockBC2,Surface_RGBA8,LoadUncompressed4x4BlockInterleaved_RGBA8,64,4)
COMPRESSION_ENTRY_POINT(BC2CompressRGBA16,CompressBlockBC2,Surface_RGBA16,LoadUncompressed4x4BlockInterleaved_RGBA16,64,4)
// BC3: 64 input floats (RGB + alpha), 4 output dwords per block.
COMPRESSION_ENTRY_POINT(BC3CompressRGBA8,CompressBlockBC3,Surface_RGBA8,LoadUncompressed4x4BlockInterleaved_RGBA8,64,4)
COMPRESSION_ENTRY_POINT(BC3CompressRGBA16,CompressBlockBC3,Surface_RGBA16,LoadUncompressed4x4BlockInterleaved_RGBA16,64,4)
// BC4: 16 input floats (one channel), 2 output dwords per block.
COMPRESSION_ENTRY_POINT(BC4CompressR8,CompressBlockBC4,Surface_R8,LoadUncompressed4x4Block_R8,16,2)
COMPRESSION_ENTRY_POINT(BC4CompressR16,CompressBlockBC4,Surface_R16,LoadUncompressed4x4Block_R16,16,2)
// BC5: 32 input floats (two channels), 4 output dwords per block.
COMPRESSION_ENTRY_POINT(BC5CompressRG8,CompressBlockBC5,Surface_RG8,LoadUncompressed4x4BlockInterleaved_RG8,32,4)
COMPRESSION_ENTRY_POINT(BC5CompressRG16,CompressBlockBC5,Surface_RG16,LoadUncompressed4x4BlockInterleaved_RG16,32,4)