// // BC1-BC5 encoding implementation. // // // Copyright (c) 2016 Intel Corporation // // Permission is hereby granted, free of charge, to any person obtaining a copy of this // software and associated documentation files (the "Software"), to deal in the Software // without restriction, including without limitation the rights to use, copy, modify, // merge, publish, distribute, sublicense, and/or sell copies of the Software, and to // permit persons to whom the Software is furnished to do so, subject to the following // conditions: // // The above copyright notice and this permission notice shall be included in all copies // or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF // CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE // OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
//
#include "kernel_shared.isph"

///////////////////////////////////////////////////////////
// BC1/BC2/BC3/BC4/BC5/BC7 shared
///////////////////////////////////////////////////////////

// BC1/BC2/BC3/BC4/BC5 encoding

// Approximates (a*b)/255 with correct rounding using shifts/adds only.
inline int stb__Mul8Bit(int a, int b)
{
    int t = a*b + 128;
    return (t + (t >> 8)) >> 8;
}

// Packs 8-bit-per-channel RGB into a 5:6:5 16-bit value.
inline unsigned int16 stb__As16Bit(int r, int g, int b)
{
    return (stb__Mul8Bit(r,31) << 11) + (stb__Mul8Bit(g,63) << 5) + stb__Mul8Bit(b,31);
}

// Encodes a float RGB color (each channel in [0,255]) as RGB565.
inline unsigned int16 enc_rgb565(float c[3])
{
    return stb__As16Bit((int)c[0], (int)c[1], (int)c[2]);
}

// Decodes an RGB565 value back to float RGB, replicating the high bits
// into the low bits (standard 565 -> 888 expansion).
inline void dec_rgb565(float c[3], int p)
{
    int c2 = (p>>0)&31;
    int c1 = (p>>5)&63;
    int c0 = (p>>11)&31;

    c[0] = (c0<<3)+(c0>>2);
    c[1] = (c1<<2)+(c1>>4);
    c[2] = (c2<<3)+(c2>>2);
}

// Accumulates the block mean (DC) into c0.
// NOTE(review): c0 is not zeroed here, and c1/iaxis are unused --
// callers must pre-initialize c0; confirm intent against call sites.
inline void pick_endpoints_dc(int c0[3], int c1[3], int block[48], int iaxis[3])
{
    for (uniform int p=0; p<3; p++)
        for (uniform int y=0; y<4; y++)
            for (uniform int x=0; x<4; x++)
                c0[p] += block[p*16+y*4+x];

    for (uniform int p=0; p<3; p++)
        c0[p] >>= 4;
}

// Projects all 16 pixels onto 'axis' (relative to the mean 'dc') and
// places endpoints c0/c1 at the min/max of the projection, clamped to [0,255].
inline void pick_endpoints(float c0[3], float c1[3], float block[48], float axis[3], float dc[3])
{
    float min_dot = 256*256;
    float max_dot = 0;

    for (uniform int y=0; y<4; y++)
    {
        for (uniform int x=0; x<4; x++)
        {
            float dot = 0;
            for (uniform int p=0; p<3; p++)
                dot += (block[p*16+y*4+x]-dc[p])*axis[p];

            min_dot = min(min_dot, dot);
            max_dot = max(max_dot, dot);
        }
    }

    // Degenerate (near single-color) block: force a non-empty interval.
    if (max_dot-min_dot < 1f)
    {
        min_dot -= 0.5f;
        max_dot += 0.5f;
    }

    float norm_sq = 0;
    for (uniform int p=0; p<3; p++)
        norm_sq += axis[p]*axis[p];

    float rnorm_sq = rcp(norm_sq);
    for (uniform int p=0; p<3; p++)
    {
        c0[p] = clamp(dc[p]+min_dot*rnorm_sq*axis[p], 0, 255);
        c1[p] = clamp(dc[p]+max_dot*rnorm_sq*axis[p], 0, 255);
    }
}

// Quantizes the 16 pixels against the line through RGB565 endpoints p0/p1,
// producing 2-bit selectors packed LSB-first into a 32-bit word.
inline uint32 fast_quant(float block[48], int p0, int p1)
{
    float c0[3];
    float c1[3];
    dec_rgb565(c0, p0);
    dec_rgb565(c1, p1);

    float dir[3];
    for (uniform int p=0; p<3; p++)
        dir[p] = c1[p]-c0[p];

    float sq_norm = 0;
    for (uniform int p=0; p<3; p++)
        sq_norm += sq(dir[p]);

    float rsq_norm = rcp(sq_norm);
    for (uniform int p=0; p<3; p++)
        dir[p] *= rsq_norm*3;

    float bias = 0.5;
    for (uniform int p=0; p<3; p++)
        bias -= c0[p]*dir[p];

    uint32 bits = 0;
    uint32 scaler = 1;
    for (uniform int k=0; k<16; k++)
    {
        float dot = 0;
        for (uniform int p=0; p<3; p++)
            dot += block[k+p*16]*dir[p];

        int q = clamp((int)(dot+bias), 0, 3);

        //bits += q<<(k*2);
        bits += q*scaler;
        scaler *= 4;
    }

    return bits;
}

// Computes the 3x3 RGB covariance (6 unique entries) and the mean (dc)
// of the block.
inline void compute_covar_dc(float covar[6], float dc[3], float block[48])
{
    for (uniform int i=0; i<6; i++)
        covar[i] = 0;
    for (uniform int p=0; p<3; p++)
        dc[p] = 0;

    for (uniform int k=0; k<16; k++)
        for (uniform int p=0; p<3; p++)
            dc[p] += block[k+p*16];
    for (uniform int p=0; p<3; p++)
        dc[p] /= 16;

    for (uniform int k=0; k<16; k++)
    {
        float rgb[3];
        for (uniform int p=0; p<3; p++)
            rgb[p] = block[k+p*16]-dc[p];

        covar[0] += rgb[0]*rgb[0];
        covar[1] += rgb[0]*rgb[1];
        covar[2] += rgb[0]*rgb[2];
        covar[3] += rgb[1]*rgb[1];
        covar[4] += rgb[1]*rgb[2];
        covar[5] += rgb[2]*rgb[2];
    }
}

// ugly, but makes BC1 compression 20% faster overall
// Same result as compute_covar_dc, with the accumulators held in scalars.
inline void compute_covar_dc_ugly(float covar[6], float dc[3], float block[48])
{
    for (uniform int p=0; p<3; p++)
    {
        float acc = 0;
        for (uniform int k=0; k<16; k++)
            acc += block[k+p*16];
        dc[p] = acc/16;
    }

    float covar0 = 0f;
    float covar1 = 0f;
    float covar2 = 0f;
    float covar3 = 0f;
    float covar4 = 0f;
    float covar5 = 0f;

    for (uniform int k=0; k<16; k++)
    {
        float rgb0, rgb1, rgb2;
        rgb0 = block[k+0*16]-dc[0];
        rgb1 = block[k+1*16]-dc[1];
        rgb2 = block[k+2*16]-dc[2];

        covar0 += rgb0*rgb0;
        covar1 += rgb0*rgb1;
        covar2 += rgb0*rgb2;
        covar3 += rgb1*rgb1;
        covar4 += rgb1*rgb2;
        covar5 += rgb2*rgb2;
    }

    covar[0] = covar0;
    covar[1] = covar1;
    covar[2] = covar2;
    covar[3] = covar3;
    covar[4] = covar4;
    covar[5] = covar5;
}

// Least-squares refinement of the BC1 endpoints given the 2-bit selectors
// in 'bits'; writes the refined RGB565 endpoints into pe[0]/pe[1].
inline void bc1_refine(int pe[2], float block[48], unsigned int32 bits, float dc[3])
{
    float c0[3];
    float c1[3];

    if ((bits ^ (bits*4)) < 4) // single color
    {
        for (uniform int p=0; p<3; p++)
        {
            c0[p] = dc[p];
            c1[p] = dc[p];
        }
    }
    else
    {
        float Atb1[3] = {0,0,0};
        float sum_q = 0;
        float sum_qq = 0;
        unsigned int32 shifted_bits = bits;

        for (uniform int k=0; k<16; k++)
        {
            float q = (int)(shifted_bits&3);
            shifted_bits >>= 2;

            float x = 3-q;
            float y = q;

            sum_q += q;
            sum_qq += q*q;

            for (uniform int p=0; p<3; p++)
                Atb1[p] += x*block[k+p*16];
        }

        float sum[3];
        float Atb2[3];
        for (uniform int p=0; p<3; p++)
        {
            sum[p] = dc[p]*16;
            Atb2[p] = 3*sum[p]-Atb1[p];
        }

        // Normal-equation solve for the two endpoints.
        float Cxx = 16*sq(3)-2*3*sum_q+sum_qq;
        float Cyy = sum_qq;
        float Cxy = 3*sum_q-sum_qq;
        float scale = 3f * rcp(Cxx*Cyy - Cxy*Cxy);

        for (uniform int p=0; p<3; p++)
        {
            c0[p] = (Atb1[p]*Cyy - Atb2[p]*Cxy)*scale;
            c1[p] = (Atb2[p]*Cxx - Atb1[p]*Cxy)*scale;

            c0[p] = clamp(c0[p], 0, 255);
            c1[p] = clamp(c1[p], 0, 255);
        }
    }

    pe[0] = enc_rgb565(c0);
    pe[1] = enc_rgb565(c1);
}

// Swaps the meaning of selector values 01b and 10b in every 2-bit lane
// (needed after swapping the endpoints).
inline uint32 fix_qbits(uint32 qbits)
{
    uniform const uint32 mask_01b = 0x55555555;
    uniform const uint32 mask_10b = 0xAAAAAAAA;

    uint32 qbits0 = qbits&mask_01b;
    uint32 qbits1 = qbits&mask_10b;
    qbits = (qbits1>>1) + (qbits1 ^ (qbits0<<1));

    return qbits;
}

///////////////////////////////////////////////////////////
// Low level per block handling
//
inline void CompressBlockBC1_core( float BlockInput[48], uint32 BlockOutput[2], bool AllowSecondaryMode )
{
    uniform const int powerIterations = 4;
    uniform const int refineIterations = 1;

    float covar[6];
    float dc[3];
    compute_covar_dc_ugly(covar, dc, BlockInput);

    float eps = 0.001;
    covar[0] += eps;
    covar[3] += eps;
    covar[5] += eps;

    float axis[3];
    compute_axis3(axis, covar, powerIterations);

    float c0[3];
    float c1[3];
    pick_endpoints(c0, c1, BlockInput, axis, dc);

    int p[2];
    p[0] = enc_rgb565(c0);
    p[1] = enc_rgb565(c1);
    if (p[0]
// alpha_1.
//
// The endpoints are followed by 16 3-bit values for each pixel.
//
// The endpoints are signed/unsigned byte depending on whether the format is either
// signed or unsigned. The BC3 alpha is regarded as unsigned.
//
// Depending on mode, the following values are used for the palette (code taken off above msdn link):
//
// if( alpha_0 > alpha_1 )
// {
// // 6 interpolated alpha values.
// alpha_2 = 6/7*alpha_0 + 1/7*alpha_1; // bit code 010
// alpha_3 = 5/7*alpha_0 + 2/7*alpha_1; // bit code 011
// alpha_4 = 4/7*alpha_0 + 3/7*alpha_1; // bit code 100
// alpha_5 = 3/7*alpha_0 + 4/7*alpha_1; // bit code 101
// alpha_6 = 2/7*alpha_0 + 5/7*alpha_1; // bit code 110
// alpha_7 = 1/7*alpha_0 + 6/7*alpha_1; // bit code 111
// }
// else
// {
// // 4 interpolated alpha values.
// alpha_2 = 4/5*alpha_0 + 1/5*alpha_1; // bit code 010
// alpha_3 = 3/5*alpha_0 + 2/5*alpha_1; // bit code 011
// alpha_4 = 2/5*alpha_0 + 3/5*alpha_1; // bit code 100
// alpha_5 = 1/5*alpha_0 + 4/5*alpha_1; // bit code 101
// alpha_6 = 0.0; // bit code 110, for an unsigned format
// alpha_6 =-1.0; // bit code 110, for a signed format
// alpha_7 = 1.0; // bit code 111
// }
//
// In either of them:
// bit code 000 will be alpha_0
// bit code 001 will be alpha_1
//
// Encodes a 16-value alpha block (values in [0,255]) into the 64-bit
// BC3/BC4/BC5 alpha layout: two 8-bit endpoints followed by 16 3-bit
// selectors. Always emits the alpha_0 > alpha_1 (8-value) mode
// described in the comment block above.
inline void CompressBlockBC3_alpha( float AlphaInput[16], uint32 AlphaOutput[2] )
{
float MinValue = 255;
float MaxValue = 0;
// Find min and max float values in block.
for ( uniform int k=0; k<16; k++ )
{
MinValue = min( MinValue, AlphaInput[k] );
MaxValue = max( MaxValue, AlphaInput[k] );
}
// Pick endpoints being integer values in [0,255].
Endpoints[0] = min(ceil(MaxValue),255f); // alpha_0.
Endpoints[1] = max(0f,floor(MinValue)); // alpha_1.
// Make sure alpha0 > alpha1 to stay in the same mode.
if ( Endpoints[0]==Endpoints[1] )
{
if ( Endpoints[0]<255f )
Endpoints[0] += 1f;
else
Endpoints[1] -= 1f;
}
uint32 qblock[2] = { 0, 0 }; // Stores 24-bit/8 indices per element.
float scale = 7f/(Endpoints[0]-Endpoints[1]);
for ( uniform int k=0; k<16; k++ )
{
// Projection into [alpha_1,alpha_0], so that: alpha = q/7*alpha_0 + (7-q)/7*alpha_1
float v = AlphaInput[k];
float proj = (v-Endpoints[1])*scale+0.5f; // +0.5 then truncation == round-to-nearest for proj>=0.
int q = clamp( (int)proj, 0, 7 );
// Remap projection ordering into alpha_0 > alpha_1 mode ordering:
// q==7 (at alpha_0) -> code 0, q==0 (at alpha_1) -> code 1,
// interior steps -> codes 2..7 (alpha_2..alpha_7 of the table above).
q = 7-q;
if (q > 0)
q++;
if (q==8)
q = 1;
// Set index aside: 8 selectors x 3 bits per 24-bit half.
qblock[k/8] |= q << ((k%8)*3);
}
// (could be improved by refinement)
// Pack: dword0 = alpha_0 | alpha_1 << 8 | low 16 selector bits << 16;
// dword1 = top 8 bits of qblock[0] | qblock[1] << 8 (48 selector bits total).
AlphaOutput[0] = clamp((int)Endpoints[0],0,255) + clamp((int)Endpoints[1],0,255)*256;
AlphaOutput[0] |= qblock[0]<<16;
AlphaOutput[1] = qblock[0]>>16;
AlphaOutput[1] |= qblock[1]<<8;
}
///////////////////////////////////////////////////////////
// High level per block handling.
//
// BlockInput[]:
// 0-15: Red
// 16-31: Green
// 32-47: Blue
// 48-63: Alpha
//
// Compresses one 4x4 RGB block (48 floats: 16R,16G,16B) to a BC1 block
// (2 dwords), with the secondary mode disabled.
inline void CompressBlockBC1( float BlockInput[48], uint32 BlockOutput[2] )
{
CompressBlockBC1_core( BlockInput, BlockOutput, false );
}
// !! I'm using this for secondary mode testing for now.
// Compresses one 4x4 RGB block to BC1 with the secondary (punch-through
// alpha) mode allowed. Output is one 2-dword BC1 block.
inline void CompressBlockBC1PA( float BlockInput[48], uint32 BlockOutput[2] )
{
// Fix: the color block must start at BlockOutput[0]; the previous
// &BlockOutput[2] wrote past the end of the declared 2-dword output
// (that offset is only correct for the 4-dword BC2/BC3 layouts).
CompressBlockBC1_core ( &BlockInput[ 0], &BlockOutput[0], true );
}
// Compresses one 4x4 RGBA block (64 floats: R,G,B,A planes of 16) to BC2
// (4 dwords): dwords 0-1 = alpha block (see QuantizeBlockBC2_alpha,
// defined elsewhere), dwords 2-3 = BC1 color block.
inline void CompressBlockBC2( float BlockInput[64], uint32 BlockOutput[4] )
{
QuantizeBlockBC2_alpha( &BlockInput[48], &BlockOutput[0] );
CompressBlockBC1_core ( &BlockInput[ 0], &BlockOutput[2], false );
}
// Compresses one 4x4 RGBA block (64 floats) to BC3 (4 dwords):
// dwords 0-1 = interpolated alpha block, dwords 2-3 = BC1 color block.
inline void CompressBlockBC3( float BlockInput[64], uint32 BlockOutput[4] )
{
CompressBlockBC3_alpha( &BlockInput[48], &BlockOutput[0] );
CompressBlockBC1_core ( &BlockInput[ 0], &BlockOutput[2], false );
}
// Compresses one single-channel 4x4 block (16 floats) to BC4 (2 dwords),
// reusing the BC3 alpha encoder (identical bit layout).
inline void CompressBlockBC4( float BlockInput[16], uint32 BlockOutput[2] )
{
CompressBlockBC3_alpha( BlockInput, BlockOutput );
}
// Compresses a two-channel 4x4 block (32 floats: 16 per channel) to BC5
// (4 dwords): two independent BC4-style alpha encodings, one per channel.
inline void CompressBlockBC5( float BlockInput[32], uint32 BlockOutput[4] )
{
// The two halves are independent, so encode the second channel first...
CompressBlockBC3_alpha( &BlockInput[16], &BlockOutput[2] );
// ...and the first channel into the leading half of the output block.
CompressBlockBC3_alpha( &BlockInput[ 0], &BlockOutput[0] );
}
///////////////////////////////////////////////////////////
// Dispatcher from input surface to block level.
//
// Templates would be nice here, but not in ispc, maybe preprocessor macros?
// Are there function pointers available in ispc?
//
#define COMPRESSION_ENTRY_POINT(export_name,compress_routine,surface_type,load_routine,num_floats,num_block_dwords) \
export void export_name( uniform surface_type InputSurface[], uniform uint8 OutputBlocks[] ) \
{ \
uniform int NumHorizontalBlocks = InputSurface->Width/4; \
uniform int NumVerticalBlocks = InputSurface->Height/4; \
\
for ( uniform int VerticalBlockIndex=0; VerticalBlockIndex