mirror of
https://github.com/OldUnreal/KTexComp.git
synced 2026-04-02 21:28:59 -07:00
623 lines
19 KiB
Plaintext
623 lines
19 KiB
Plaintext
//
|
|
// Shared and general purpose code for the encoding implementations.
|
|
//
|
|
|
|
//
|
|
// Copyright (c) 2016 Intel Corporation
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
|
// software and associated documentation files (the "Software"), to deal in the Software
|
|
// without restriction, including without limitation the rights to use, copy, modify,
|
|
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
|
|
// permit persons to whom the Software is furnished to do so, subject to the following
|
|
// conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all copies
|
|
// or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
|
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
|
|
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
|
|
// OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Typedefs.
|
|
//
|
|
|
|
// Short aliases for the unsigned fixed-width integer types used by the
// helpers in this file.
typedef unsigned int8   uint8;   // unsigned 8-bit integer
typedef unsigned int16  uint16;  // unsigned 16-bit integer
typedef unsigned int32  uint32;  // unsigned 32-bit integer
typedef unsigned int64  uint64;  // unsigned 64-bit integer
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// The following helpers isolate performance warnings
|
|
//
|
|
|
|
// int32
|
|
// Reads one int32 per program instance from a uniform array.
//   ptr - base of the array to read from.
//   idx - per-instance element index.
// Returns the element at ptr[idx].
// Wrapping the indexed read in a helper keeps the expected gather
// performance warning confined to this one place.
inline int32 gather_int(const uniform int32* const uniform ptr, int idx)
{
    const int32 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}
|
|
// Writes `value` into element `idx` of a varying int32 array.
//   ptr   - base of the destination array.
//   idx   - per-instance element index.
//   value - value to store; it is passed as uint32 and converted to int32
//           on assignment.
// Wrapping the indexed write in a helper keeps the expected scatter
// performance warning confined to this one place.
inline void scatter_int(varying int32* uniform ptr, int idx, uint32 value)
{
    const int32 stored = value;
    ptr[idx] = stored; // (perf warning expected)
}
|
|
|
|
// uint8
|
|
// Reads one unsigned 8-bit value per program instance from a uniform array.
//   ptr - base of the array to read from.
//   idx - per-instance element index.
// Returns the element at ptr[idx].
inline unsigned int8 gather_uint8(const uniform unsigned int8* const uniform ptr, int idx)
{
    const unsigned int8 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}

// Same as above, but for an array whose elements are varying.
inline unsigned int8 gather_uint8(const varying unsigned int8* const uniform ptr, int idx)
{
    const unsigned int8 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}
|
|
|
|
// uint16
|
|
// Reads one unsigned 16-bit value per program instance from a uniform array.
//   ptr - base of the array to read from.
//   idx - per-instance element index.
// Returns the element at ptr[idx].
inline unsigned int16 gather_uint16(const uniform unsigned int16* const uniform ptr, int idx)
{
    const unsigned int16 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}

// Same as above, but for an array whose elements are varying.
inline unsigned int16 gather_uint16(const varying unsigned int16* const uniform ptr, int idx)
{
    const unsigned int16 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}
|
|
|
|
// uint32
|
|
// Reads one unsigned 32-bit value per program instance from a uniform array.
//   ptr - base of the array to read from.
//   idx - per-instance element index.
// Returns the element at ptr[idx].
inline unsigned int32 gather_uint(const uniform unsigned int32* const uniform ptr, int idx)
{
    const unsigned int32 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}

// Same as above, but for an array whose elements are varying.
inline unsigned int32 gather_uint(const varying unsigned int32* const uniform ptr, int idx)
{
    const unsigned int32 fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}
|
|
// Writes `value` into element `idx` of an array of uniform uint32.
//   ptr   - base of the destination array.
//   idx   - per-instance element index.
//   value - value to store.
// Wrapping the indexed write in a helper keeps the expected scatter
// performance warning confined to this one place.
inline void scatter_uint(uniform unsigned int32* ptr, int idx, uint32 value)
{
    const uint32 stored = value;
    ptr[idx] = stored; // (perf warning expected)
}
|
|
// Logical right shift of a varying uint32 by a uniform bit count.
//   v    - value to shift.
//   bits - number of bit positions to shift by.
// Returns v >> bits.
inline uint32 shift_right(uint32 v, const uniform int bits)
{
    const uint32 shifted = v >> bits; // (perf warning expected)
    return shifted;
}
|
|
|
|
// float
|
|
// Reads one float per program instance from a uniform array.
//   ptr - base of the array to read from.
//   idx - per-instance element index.
// Returns the element at ptr[idx].
inline float gather_float(uniform float* uniform ptr, int idx)
{
    const float fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}

// Same as above, but for an array whose elements are varying.
inline float gather_float(varying float* uniform ptr, int idx)
{
    const float fetched = ptr[idx]; // (perf warning expected)
    return fetched;
}
|
|
// Writes `value` into element `idx` of a uniform float array.
//   ptr   - base of the destination array.
//   idx   - per-instance element index.
//   value - value to store.
inline void scatter_float(uniform float* uniform ptr, int idx, float value)
{
    const float stored = value;
    ptr[idx] = stored; // (perf warning expected)
}

// Same as above, but for an array whose elements are varying.
inline void scatter_float(varying float* uniform ptr, int idx, float value)
{
    const float stored = value;
    ptr[idx] = stored; // (perf warning expected)
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Swapping helpers
|
|
|
|
// Exchanges the values of two floats.
//   a, b - values to exchange (passed by reference).
// The temporary must be a float: an integer temporary would truncate the
// value moved into `b`.
inline void swap(float& a, float& b)
{
    float t = a;
    a = b;
    b = t;
}
|
|
|
|
// Exchanges the values of two ints.
//   a, b - values to exchange (passed by reference).
inline void swap(int& a, int& b)
{
    const int saved = a;
    a = b;
    b = saved;
}
|
|
|
|
// Exchanges the values of two uint32s.
//   a, b - values to exchange (passed by reference).
inline void swap(uint32& a, uint32& b)
{
    const uint32 saved = a;
    a = b;
    b = saved;
}
|
|
|
|
// Exchanges the values of two uint8s.
//   a, b - values to exchange (passed by reference).
inline void swap(uint8& a, uint8& b)
{
    const uint8 saved = a;
    a = b;
    b = saved;
}
|
|
|
|
// Exchanges the first `n` elements of two int arrays, element by element.
//   u, v - arrays whose contents are exchanged.
//   n    - number of leading elements to exchange.
inline void swap_ints(int u[], int v[], uniform int n)
{
    uniform int k = 0;
    while (k < n)
    {
        const int saved = u[k];
        u[k] = v[k];
        v[k] = saved;
        k++;
    }
}
|
|
|
|
// Exchanges the first `n` elements of two uint32 arrays, element by element.
//   u, v - arrays whose contents are exchanged.
//   n    - number of leading elements to exchange.
inline void swap_uints(uint32 u[], uint32 v[], uniform int n)
{
    uniform int k = 0;
    while (k < n)
    {
        const uint32 saved = u[k];
        u[k] = v[k];
        v[k] = saved;
        k++;
    }
}
|
|
|
|
// Reverses the byte order of a 32-bit value.
//   v - value to byte-swap.
// Returns v with byte 0 and byte 3 exchanged and byte 1 and byte 2 exchanged.
inline uint32 bswap32(uint32 v)
{
    // Pull out each byte, most significant first.
    const uint32 byte3 = (v >> 24) & 255;
    const uint32 byte2 = (v >> 16) & 255;
    const uint32 byte1 = (v >>  8) & 255;
    const uint32 byte0 = (v >>  0) & 255;

    // The four fields occupy disjoint bits, so OR-ing them together gives
    // the same value as summing them.
    return (byte3 << 0) | (byte2 << 8) | (byte1 << 16) | (byte0 << 24);
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Scalar math helpers
|
|
|
|
// Returns the square of v.
inline float sq(float v)
{
    const float squared = v * v;
    return squared;
}
|
|
|
|
// Returns 2 raised to the power x, computed as a left shift of 1.
//   x - exponent, used directly as the shift count.
inline int pow2(int x)
{
    const int one = 1;
    return one << x;
}
|
|
|
|
// Clamps a float to a range given by integer bounds.
//   v - value to clamp.
//   a - lower bound.
//   b - upper bound.
// The bounds are converted to float and the all-float clamp overload does
// the work.
inline float clamp(float v, int a, int b)
{
    float lower = (float)a;
    float upper = (float)b;
    return clamp(v, lower, upper);
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Vector/Matrix/Tensor math helpers
|
|
|
|
// vector dot product.
|
|
// Dot product of two 3-component vectors.
//   a, b - input vectors.
// Returns a[0]*b[0] + a[1]*b[1] + a[2]*b[2].
inline float dot3(float a[3], float b[3])
{
    const float term0 = a[0]*b[0];
    const float term1 = a[1]*b[1];
    const float term2 = a[2]*b[2];
    return term0 + term1 + term2;
}
|
|
// Dot product of two 4-component vectors.
//   a, b - input vectors.
// Returns a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3].
inline float dot4(float a[4], float b[4])
{
    const float term0 = a[0]*b[0];
    const float term1 = a[1]*b[1];
    const float term2 = a[2]*b[2];
    const float term3 = a[3]*b[3];
    return term0 + term1 + term2 + term3;
}
|
|
|
|
// a = covar*b, where a,b are vectors and covar is a symmetric matrix stored as its packed upper triangle.
|
|
// Multiplies a symmetric 3x3 matrix by a 3-vector: a = covar * b.
//   a     - output vector.
//   covar - packed upper triangle of the matrix, laid out row by row as
//           [m00 m01 m02 m11 m12 m22].
//   b     - input vector.
// Each output component is stored before the next row is evaluated.
inline void ssymv(float a[3], float covar[6], float b[3])
{
    const float row0 = covar[0]*b[0]+covar[1]*b[1]+covar[2]*b[2];
    a[0] = row0;

    const float row1 = covar[1]*b[0]+covar[3]*b[1]+covar[4]*b[2];
    a[1] = row1;

    const float row2 = covar[2]*b[0]+covar[4]*b[1]+covar[5]*b[2];
    a[2] = row2;
}
|
|
// Multiplies the upper-left 3x3 part of a packed symmetric 4x4 matrix by the
// first three components of b, writing a[0..2]. a[3] and b[3] are not touched.
//   a     - output vector.
//   covar - packed upper triangle of the 4x4 matrix, laid out row by row as
//           [m00 m01 m02 m03 m11 m12 m13 m22 m23 m33].
//   b     - input vector.
// Each output component is stored before the next row is evaluated.
inline void ssymv3(float a[4], float covar[10], float b[4])
{
    const float row0 = covar[0]*b[0]+covar[1]*b[1]+covar[2]*b[2];
    a[0] = row0;

    const float row1 = covar[1]*b[0]+covar[4]*b[1]+covar[5]*b[2];
    a[1] = row1;

    const float row2 = covar[2]*b[0]+covar[5]*b[1]+covar[7]*b[2];
    a[2] = row2;
}
|
|
// Multiplies a symmetric 4x4 matrix by a 4-vector: a = covar * b.
//   a     - output vector.
//   covar - packed upper triangle of the matrix, laid out row by row as
//           [m00 m01 m02 m03 m11 m12 m13 m22 m23 m33].
//   b     - input vector.
// Each output component is stored before the next row is evaluated.
inline void ssymv4(float a[4], float covar[10], float b[4])
{
    const float row0 = covar[0]*b[0]+covar[1]*b[1]+covar[2]*b[2]+covar[3]*b[3];
    a[0] = row0;

    const float row1 = covar[1]*b[0]+covar[4]*b[1]+covar[5]*b[2]+covar[6]*b[3];
    a[1] = row1;

    const float row2 = covar[2]*b[0]+covar[5]*b[1]+covar[7]*b[2]+covar[8]*b[3];
    a[2] = row2;

    const float row3 = covar[3]*b[0]+covar[6]*b[1]+covar[8]*b[2]+covar[9]*b[3];
    a[3] = row3;
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Bit manipulation helpers.
|
|
//
|
|
|
|
// Appends the low `bits` bits of `v` to a bit stream packed into 32-bit words.
//   data - destination words; new bits are OR-ed in, so the target bits are
//          expected to be zero beforehand.
//   pos  - current write position in bits; advanced by `bits` on return.
//   bits - number of bits to append.
//   v    - value to append; asserted to be smaller than 2^bits.
inline void put_bits(uint32 data[5], uniform int* uniform pos, uniform int bits, int v)
{
    assert(v<pow2(bits));

    const uniform int word   = *pos/32;   // word holding the first bit
    const uniform int offset = *pos%32;   // bit offset inside that word

    data[word] |= ((uint32)v) << offset;

    // A value that straddles a word boundary spills its high bits into the
    // next word.
    if (offset+bits>32)
        data[word+1] |= shift_right(v, 32-offset);

    *pos += bits;
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Helper for pixel access in variable sized block.
|
|
//
|
|
|
|
// Reads one value from a planar pixel buffer.
//   pixels - buffer laid out as planes of 64 floats, each plane holding rows
//            of 8 floats.
//   p      - plane index.
//   x, y   - column and row inside the plane.
// Returns pixels[64*p + 8*y + x].
inline float get_pixel(float pixels[], uniform int p, uniform int x, uniform int y)
{
    uniform static const int ystride = 8;
    uniform static const int pstride = 64;

    const uniform int offset = pstride * p + ystride * y + x;
    return pixels[offset];
}
|
|
|
|
// Writes one value into a planar pixel buffer.
//   pixels - buffer laid out as planes of 64 floats, each plane holding rows
//            of 8 floats.
//   p      - plane index.
//   x, y   - column and row inside the plane.
//   value  - value stored at pixels[64*p + 8*y + x].
inline void set_pixel(float pixels[], uniform int p, uniform int x, uniform int y, float value)
{
    uniform static const int ystride = 8;
    uniform static const int pstride = 64;

    const uniform int offset = pstride * p + ystride * y + x;
    pixels[offset] = value;
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Generic helpers
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Axis computation.
|
|
//
|
|
|
|
// Repeatedly multiplies a start vector of (1,1,1) by a packed symmetric 3x3
// matrix (power iteration) and returns the resulting direction.
//   axis            - output vector; also used as scratch for each product.
//   covar           - packed symmetric matrix, as consumed by ssymv.
//   powerIterations - number of matrix multiplications to perform.
// The running vector is rescaled to unit length on every second iteration.
inline void compute_axis3(float axis[3], float covar[6], uniform const int powerIterations)
{
    float estimate[3] = {1,1,1};

    for (uniform int iter=0; iter<powerIterations; iter++)
    {
        ssymv(axis, covar, estimate);
        estimate[0] = axis[0];
        estimate[1] = axis[1];
        estimate[2] = axis[2];

        const uniform bool renormalize = (iter%2==1); // every other iteration
        if (renormalize)
        {
            float lengthSq = 0;
            lengthSq += axis[0]*axis[0];
            lengthSq += axis[1]*axis[1];
            lengthSq += axis[2]*axis[2];

            const float invLength = rsqrt(lengthSq);
            estimate[0] *= invLength;
            estimate[1] *= invLength;
            estimate[2] *= invLength;
        }
    }

    axis[0] = estimate[0];
    axis[1] = estimate[1];
    axis[2] = estimate[2];
}
|
|
|
|
// Repeatedly multiplies a start vector of all ones by a packed symmetric
// matrix (power iteration) and returns the resulting direction, for 3 or 4
// channels.
//   axis            - output vector; also used as scratch for each product.
//   covar           - packed symmetric 4x4 matrix, as consumed by
//                     ssymv3 / ssymv4.
//   powerIterations - number of matrix multiplications to perform.
//   channels        - number of components to process; the product is only
//                     evaluated for 3 or 4.
// The running vector is rescaled to unit length on every second iteration.
inline void compute_axis(float axis[4], float covar[10], uniform const int powerIterations, uniform int channels)
{
    float estimate[4] = {1,1,1,1};

    for (uniform int iter=0; iter<powerIterations; iter++)
    {
        if (channels == 3)
        {
            ssymv3(axis, covar, estimate);
        }
        else if (channels == 4)
        {
            ssymv4(axis, covar, estimate);
        }

        for (uniform int c=0; c<channels; c++)
        {
            estimate[c] = axis[c];
        }

        const uniform bool renormalize = (iter%2==1); // every other iteration
        if (renormalize)
        {
            float lengthSq = 0;
            for (uniform int c=0; c<channels; c++)
            {
                lengthSq += axis[c]*axis[c];
            }

            const float invLength = rsqrt(lengthSq);
            for (uniform int c=0; c<channels; c++)
            {
                estimate[c] *= invLength;
            }
        }
    }

    for (uniform int c=0; c<channels; c++)
    {
        axis[c] = estimate[c];
    }
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Sorting helpers
|
|
//
|
|
|
|
// Selection-sorts the smallest `partial_count` values of `list` into its
// first `partial_count` slots, in ascending order.
//   list          - values to reorder in place.
//   length        - number of valid elements in `list`.
//   partial_count - number of leading slots to fill with sorted minima.
inline void partial_sort_list(int list[], uniform int length, uniform int partial_count)
{
    for (uniform int slot=0; slot<partial_count; slot++)
    {
        // Find the smallest value in list[slot..length).
        int min_value = list[slot];
        int min_pos = slot;
        for (uniform int probe=slot+1; probe<length; probe++)
        {
            if (list[probe] < min_value)
            {
                min_value = list[probe];
                min_pos = probe;
            }
        }

        // Exchange it with the current slot. The position of the minimum
        // differs per program instance, hence the scatter.
        scatter_int(list, min_pos, list[slot]);
        list[slot] = min_value;
    }
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Input surface structures.
|
|
//
|
|
|
|
// Describes an input image passed to the block loaders.
struct rgba_surface
{
    uint8* ptr;   // first byte of the pixel data
    int width;
    int height;
    int stride;   // the loaders multiply a row number by this to get a byte offset
};
|
|
|
|
// Input image descriptor with the same field layout as rgba_surface.
// NOTE(review): presumably for two-channel data, going by the name; confirm
// against callers.
struct rg_surface
{
    uint8* ptr;
    int width;
    int height;
    int stride;
};
|
|
|
|
// Input image descriptor with the same field layout as rgba_surface.
// NOTE(review): presumably for single-channel data, going by the name;
// confirm against callers.
struct red_surface
{
    uint8* ptr;
    int width;
    int height;
    int stride;
};
|
|
|
|
// Surface definitions.
|
|
// Declares a surface descriptor struct called `name` with the fields
//   Ptr    - first byte of the pixel data,
//   Width  - NOTE(review): presumably the width in pixels; not read here,
//   Height - NOTE(review): presumably the height in pixels; not read here,
//   Stride - multiplied by a row number by the loaders to get a byte offset.
// All surface types share this layout; the distinct struct names only keep
// the loaders type-safe.
#define DECLARE_SURFACE(name) \
struct name \
{ \
    uint8* Ptr; \
    int32 Width; \
    int32 Height; \
    int32 Stride; \
};

// Backward-compatible alias for the original, misspelled macro name.
#define DELARE_SURFACE(name) DECLARE_SURFACE(name)

// 8 bit unsigned int/component
DECLARE_SURFACE(Surface_R8)
DECLARE_SURFACE(Surface_RG8)
DECLARE_SURFACE(Surface_RGBA8)

// 8 bit signed int/component
DECLARE_SURFACE(Surface_R8S)
DECLARE_SURFACE(Surface_RG8S)

// 16 bit unsigned int/component
DECLARE_SURFACE(Surface_R16)
DECLARE_SURFACE(Surface_RG16)
DECLARE_SURFACE(Surface_RGBA16)

// 16 bit signed int/component
DECLARE_SURFACE(Surface_R16S)
DECLARE_SURFACE(Surface_RG16S)

// 16 bit float/component (stored as uint16_t for lack of a built-in type).
DECLARE_SURFACE(Surface_R16F)
DECLARE_SURFACE(Surface_RG16F)
DECLARE_SURFACE(Surface_RGBA16F)

// 32 bit float/component
DECLARE_SURFACE(Surface_R32F)
DECLARE_SURFACE(Surface_RG32F)
DECLARE_SURFACE(Surface_RGBA32F)
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Load data from input surface structures.
|
|
//
|
|
|
|
// Loads one 4x4 block of 8-bit single-channel pixels into floats.
//   Block                - receives the 16 values, row-major (Row*4 + Col),
//                          each in the range 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words.
inline void LoadUncompressed4x4Block_R8( float Block[16], uniform Surface_R8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    // Rows are counted against BlockHeight, matching the other loaders.
    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        uniform uint32* uniform SurfaceRowStartPtr = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        // Four 8-bit pixels fill exactly one 32-bit word, so the word index
        // is the block index. NOTE: the original author reported that scaling
        // the index by (BlockWidth/sizeof(uint32)) misbehaved; the plain
        // block index is kept.
        uint32 RRRR = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex );

        Block[Row*BlockWidth+0] = (int)((RRRR>> 0)&0xFF);
        Block[Row*BlockWidth+1] = (int)((RRRR>> 8)&0xFF);
        Block[Row*BlockWidth+2] = (int)((RRRR>>16)&0xFF);
        Block[Row*BlockWidth+3] = (int)((RRRR>>24)&0xFF);
    }
}
|
|
|
|
// Loads one 4x4 block of 8-bit two-channel pixels into two float planes.
//   InterleavedBlock     - receives 16 R values at [0..15] followed by 16 G
//                          values at [16..31], each plane row-major and in
//                          the range 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, two pixels per word.
inline void LoadUncompressed4x4BlockInterleaved_RG8( float InterleavedBlock[32], uniform Surface_RG8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    // Rows are counted against BlockHeight, matching the other loaders.
    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        uniform uint32* uniform SurfaceRowStartPtr = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        // A block row is four 2-byte pixels, i.e. two 32-bit words.
        uint32 RG0RG1 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+0 );
        uint32 RG2RG3 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+1 );

        InterleavedBlock[16*0+Row*BlockWidth+0] = (int)((RG0RG1>> 0)&0xFF);
        InterleavedBlock[16*1+Row*BlockWidth+0] = (int)((RG0RG1>> 8)&0xFF);
        InterleavedBlock[16*0+Row*BlockWidth+1] = (int)((RG0RG1>>16)&0xFF);
        InterleavedBlock[16*1+Row*BlockWidth+1] = (int)((RG0RG1>>24)&0xFF);
        InterleavedBlock[16*0+Row*BlockWidth+2] = (int)((RG2RG3>> 0)&0xFF);
        InterleavedBlock[16*1+Row*BlockWidth+2] = (int)((RG2RG3>> 8)&0xFF);
        InterleavedBlock[16*0+Row*BlockWidth+3] = (int)((RG2RG3>>16)&0xFF);
        InterleavedBlock[16*1+Row*BlockWidth+3] = (int)((RG2RG3>>24)&0xFF);
    }
}
|
|
|
|
// Loads one 4x4 block of 8-bit four-channel pixels into four float planes.
//   InterleavedBlock     - receives the R, G, B and A planes at offsets 0, 16,
//                          32 and 48, each row-major and in the range 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, one pixel per word.
inline void LoadUncompressed4x4BlockInterleaved_RGBA8( float InterleavedBlock[64], uniform Surface_RGBA8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        // The row start does not depend on the column.
        uniform uint32* uniform RowWords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        for ( uniform int Col=0; Col<BlockWidth; Col++ )
        {
            uint32 Packed = gather_uint( RowWords, HorizontalBlockIndex*BlockWidth+Col );
            const uniform int Texel = Row*BlockWidth+Col;

            // Channel c lives in bits [8c, 8c+8) of the word and goes to plane c.
            for ( uniform int Channel=0; Channel<4; Channel++ )
            {
                InterleavedBlock[16*Channel+Texel] = (int)((Packed>>(8*Channel))&0xFF);
            }
        }
    }
}
|
|
|
|
// Loads one 4x4 block of 8-bit four-channel pixels into three float planes,
// ignoring the fourth channel.
//   InterleavedBlock     - receives the R, G and B planes at offsets 0, 16 and
//                          32, each row-major and in the range 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, one pixel per word.
inline void LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA8( float InterleavedBlock[48], uniform Surface_RGBA8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        // The row start does not depend on the column.
        uniform uint32* uniform RowWords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        for ( uniform int Col=0; Col<BlockWidth; Col++ )
        {
            uint32 Packed = gather_uint( RowWords, HorizontalBlockIndex*BlockWidth+Col );
            const uniform int Texel = Row*BlockWidth+Col;

            // Only the low three bytes are unpacked; the top byte is dropped.
            for ( uniform int Channel=0; Channel<3; Channel++ )
            {
                InterleavedBlock[16*Channel+Texel] = (int)((Packed>>(8*Channel))&0xFF);
            }
        }
    }
}
|
|
|
|
// Loads one BlockWidth x BlockHeight block of 8-bit four-channel pixels into
// the planar layout used by set_pixel.
//   pixels               - destination buffer; channel c of pixel (x, y) is
//                          stored through set_pixel(pixels, c, x, y, ...).
//   InputSurface         - source surface; stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - per-instance block row.
//   BlockWidth           - block width in pixels.
//   BlockHeight          - block height in pixels.
// Values are stored in the range 0..255.
inline void LoadUncompressedVariableBlockInterleaved_RGBA8( float pixels[], uniform rgba_surface InputSurface[], int HorizontalBlockIndex, int VerticalBlockIndex, uniform int BlockWidth, uniform int BlockHeight )
{
    for (uniform int y = 0; y < BlockHeight; y++)
    {
        for (uniform int x = 0; x < BlockWidth; x++)
        {
            // Byte offset of the pixel, turned into a 32-bit word index.
            uint32 rgba = gather_uint((uint32*)InputSurface->ptr, ((VerticalBlockIndex * BlockHeight + y)*InputSurface->stride + (HorizontalBlockIndex * BlockWidth + x) * 4)/4);

            set_pixel(pixels, 0, x, y, (int)((rgba >> 0) & 255));
            set_pixel(pixels, 1, x, y, (int)((rgba >> 8) & 255));
            set_pixel(pixels, 2, x, y, (int)((rgba >> 16) & 255));
            set_pixel(pixels, 3, x, y, (int)((rgba >> 24) & 255));
        }
    }
}
|
|
|
|
// Loads one 4x4 block of 16-bit single-channel pixels into floats.
//   Block                - receives the 16 values, row-major (Row*4 + Col),
//                          rescaled from 0..65535 to 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, two pixels per word.
inline void LoadUncompressed4x4Block_R16( float Block[16], uniform Surface_R16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    // Rows are counted against BlockHeight, matching the other loaders.
    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        uniform uint32* uniform SurfaceRowStartPtr = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        // A block row is four 2-byte pixels, i.e. two 32-bit words.
        uint32 R0R1 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+0 );
        uint32 R2R3 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+1 );

        Block[Row*BlockWidth+0] = ((int)((R0R1>> 0)&0xFFFFu))*(255f/65535f);
        Block[Row*BlockWidth+1] = ((int)((R0R1>>16)&0xFFFFu))*(255f/65535f);
        Block[Row*BlockWidth+2] = ((int)((R2R3>> 0)&0xFFFFu))*(255f/65535f);
        Block[Row*BlockWidth+3] = ((int)((R2R3>>16)&0xFFFFu))*(255f/65535f);
    }
}
|
|
|
|
// Loads one 4x4 block of 16-bit two-channel pixels into two float planes.
//   InterleavedBlock     - receives 16 R values at [0..15] followed by 16 G
//                          values at [16..31], each plane row-major and
//                          rescaled from 0..65535 to 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, one pixel per word.
inline void LoadUncompressed4x4BlockInterleaved_RG16( float InterleavedBlock[32], uniform Surface_RG16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        // The row start does not depend on the column.
        uniform uint32* uniform RowWords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        for ( uniform int Col=0; Col<BlockWidth; Col++ )
        {
            uint32 Packed = gather_uint( RowWords, HorizontalBlockIndex*BlockWidth+Col );
            const uniform int Texel = Row*BlockWidth+Col;

            // Low half-word is R, high half-word is G.
            const int Red   = (int)((Packed>> 0)&0xFFFFu);
            const int Green = (int)((Packed>>16)&0xFFFFu);

            InterleavedBlock[16*0+Texel] = Red*(255f/65535f);
            InterleavedBlock[16*1+Texel] = Green*(255f/65535f);
        }
    }
}
|
|
|
|
// Loads one 4x4 block of 16-bit four-channel pixels into four float planes.
//   InterleavedBlock     - receives the R, G, B and A planes at offsets 0, 16,
//                          32 and 48, each row-major and rescaled from
//                          0..65535 to 0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, two words per pixel.
inline void LoadUncompressed4x4BlockInterleaved_RGBA16( float InterleavedBlock[64], uniform Surface_RGBA16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        // The row start does not depend on the column.
        uniform uint32* uniform RowWords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        for ( uniform int Col=0; Col<BlockWidth; Col++ )
        {
            // First word holds R (low) and G (high); second holds B and A.
            uint32 PackedRG = gather_uint( RowWords, (HorizontalBlockIndex*BlockWidth+Col)*2+0 );
            uint32 PackedBA = gather_uint( RowWords, (HorizontalBlockIndex*BlockWidth+Col)*2+1 );
            const uniform int Texel = Row*BlockWidth+Col;

            const int Red   = (int)((PackedRG>> 0)&0xFFFFu);
            const int Green = (int)((PackedRG>>16)&0xFFFFu);
            const int Blue  = (int)((PackedBA>> 0)&0xFFFFu);
            const int Alpha = (int)((PackedBA>>16)&0xFFFFu);

            InterleavedBlock[16*0+Texel] = Red*(255f/65535f);
            InterleavedBlock[16*1+Texel] = Green*(255f/65535f);
            InterleavedBlock[16*2+Texel] = Blue*(255f/65535f);
            InterleavedBlock[16*3+Texel] = Alpha*(255f/65535f);
        }
    }
}
|
|
|
|
// Loads one 4x4 block of 16-bit four-channel pixels into three float planes,
// ignoring the fourth channel.
//   InterleavedBlock     - receives the R, G and B planes at offsets 0, 16 and
//                          32, each row-major and rescaled from 0..65535 to
//                          0..255.
//   InputSurface         - source surface; Stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
// The surface rows are read as 32-bit words, two words per pixel.
inline void LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA16( float InterleavedBlock[48], uniform Surface_RGBA16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    const uniform int BlockHeight = 4;
    const uniform int BlockWidth  = 4;

    for ( uniform int Row=0; Row<BlockHeight; Row++ )
    {
        // The row start does not depend on the column.
        uniform uint32* uniform RowWords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

        for ( uniform int Col=0; Col<BlockWidth; Col++ )
        {
            // First word holds R (low) and G (high); the second holds B in its
            // low half, and its high half is ignored.
            uint32 PackedRG = gather_uint( RowWords, (HorizontalBlockIndex*BlockWidth+Col)*2+0 );
            uint32 PackedBX = gather_uint( RowWords, (HorizontalBlockIndex*BlockWidth+Col)*2+1 );
            const uniform int Texel = Row*BlockWidth+Col;

            const int Red   = (int)((PackedRG>> 0)&0xFFFFu);
            const int Green = (int)((PackedRG>>16)&0xFFFFu);
            const int Blue  = (int)((PackedBX>> 0)&0xFFFFu);

            InterleavedBlock[16*0+Texel] = Red*(255f/65535f);
            InterleavedBlock[16*1+Texel] = Green*(255f/65535f);
            InterleavedBlock[16*2+Texel] = Blue*(255f/65535f);
        }
    }
}
|
|
|
|
// Loads one 4x4 block of pixels with 16-bit channels (8 bytes per pixel) into
// four float planes, without rescaling.
//   InterleavedBlock     - receives the R, G and B planes at offsets 0, 16 and
//                          32 holding the raw 16-bit values, plus a fourth
//                          plane at offset 48 that is filled with zeros. The
//                          buffer must therefore hold 64 floats; the declared
//                          extent used to say 48 although [48..63] is written.
//   InputSurface         - source surface; stride is applied as a byte offset
//                          per row.
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
inline void load_4x4_block_interleaved_16bit( float InterleavedBlock[64], uniform rgba_surface* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
    for (uniform int y = 0; y<4; y++)
    {
        // Row base pointers, offset by 0, 2 and 4 bytes so that the low
        // 16 bits of each 32-bit load are the R, G and B channel respectively.
        // They do not depend on x, so they are computed once per row.
        uniform uint32* uniform src_ptr_r = (uint32*)&InputSurface->ptr[(VerticalBlockIndex * 4 + y)*InputSurface->stride + 0];
        uniform uint32* uniform src_ptr_g = (uint32*)&InputSurface->ptr[(VerticalBlockIndex * 4 + y)*InputSurface->stride + 2];
        uniform uint32* uniform src_ptr_b = (uint32*)&InputSurface->ptr[(VerticalBlockIndex * 4 + y)*InputSurface->stride + 4];

        for (uniform int x = 0; x<4; x++)
        {
            // Each pixel spans two 32-bit words, hence the index scale of 2.
            uint32 xr = gather_uint(src_ptr_r, (HorizontalBlockIndex * 4 + x) * 2);
            uint32 xg = gather_uint(src_ptr_g, (HorizontalBlockIndex * 4 + x) * 2); // 2 byte aligned load?
            uint32 xb = gather_uint(src_ptr_b, (HorizontalBlockIndex * 4 + x) * 2);

            InterleavedBlock[16 * 0 + y * 4 + x] = (int)(xr & 0xFFFF);
            InterleavedBlock[16 * 1 + y * 4 + x] = (int)(xg & 0xFFFF);
            InterleavedBlock[16 * 2 + y * 4 + x] = (int)(xb & 0xFFFF);
            InterleavedBlock[16 * 3 + y * 4 + x] = 0;
        }
    }
}
|
|
|
|
///////////////////////////////////////////////////////////
|
|
// Store data.
|
|
//
|
|
|
|
// Writes one compressed block into the output buffer.
//   OutputBase           - start of the output buffer, addressed in bytes.
//   TextureWidth         - scales the byte offset of a block row
//                          (VerticalBlockIndex*TextureWidth*NumDataDwords).
//   HorizontalBlockIndex - per-instance block column.
//   VerticalBlockIndex   - block row.
//   Data                 - the block's 32-bit words.
//   NumDataDwords        - number of 32-bit words per block.
inline void StoreCompressedBlock( uniform uint8 OutputBase[], int TextureWidth, int HorizontalBlockIndex, uniform int VerticalBlockIndex, uint32 Data[], int NumDataDwords )
{
    // Start of the block row, viewed as 32-bit words; it is the same for
    // every word of the block.
    uniform uint32* BlockRow = (uint32*)&OutputBase[(VerticalBlockIndex)*TextureWidth*NumDataDwords];

    for ( uniform int Dword=0; Dword<NumDataDwords; Dword++ )
    {
        scatter_uint( BlockRow, HorizontalBlockIndex*NumDataDwords+Dword, Data[Dword]);
    }
}
|