// ispc_texcomp/kernel_shared.isph

//
// Shared and general purpose code for the encoding implementations.
//

//
//  Copyright (c) 2016 Intel Corporation
//
//  Permission is hereby granted, free of charge, to any person obtaining a copy of this 
//  software and associated documentation files (the "Software"), to deal in the Software 
//  without restriction, including without limitation the rights to use, copy, modify, 
//  merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 
//  permit persons to whom the Software is furnished to do so, subject to the following 
//  conditions: 
//
//  The above copyright notice and this permission notice shall be included in all copies 
//  or substantial portions of the Software.  
//
//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
//  INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 
//  PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
//  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
//  CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
//  OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//

///////////////////////////////////////////////////////////
// Typedefs.
//

// Short aliases for ISPC's sized unsigned integer types.
typedef unsigned int8  uint8;  // 8 bit unsigned
typedef unsigned int16 uint16; // 16 bit unsigned
typedef unsigned int32 uint32; // 32 bit unsigned
typedef unsigned int64 uint64; // 64 bit unsigned


///////////////////////////////////////////////////////////
// The following helpers isolate performance warnings 
//

// int32
// Reads ptr[idx] from a uniform int32 array, with a separate index per
// program instance. Kept in its own helper so the gather performance
// warning shows up in one place only.
inline int32 gather_int(const uniform int32* const uniform ptr, int idx)
{
	const int32 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Writes value into element idx of a varying int32 array, with a separate
// index per program instance. The unsigned value is converted to int32 on
// store.
inline void scatter_int(varying int32* uniform ptr, int idx, uint32 value)
{
	ptr[idx] = (int32)value; // (perf warning expected)
}

// uint8
// Reads ptr[idx] from a uniform 8 bit array, with a separate index per
// program instance.
inline unsigned int8 gather_uint8(const uniform unsigned int8* const uniform ptr, int idx)
{
	const unsigned int8 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Reads element idx of a varying 8 bit array, with a separate index per
// program instance.
inline unsigned int8 gather_uint8(const varying unsigned int8* const uniform ptr, int idx)
{
	const unsigned int8 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}

// uint16
// Reads ptr[idx] from a uniform 16 bit array, with a separate index per
// program instance.
inline unsigned int16 gather_uint16(const uniform unsigned int16* const uniform ptr, int idx)
{
	const unsigned int16 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Reads element idx of a varying 16 bit array, with a separate index per
// program instance.
inline unsigned int16 gather_uint16(const varying unsigned int16* const uniform ptr, int idx)
{
	const unsigned int16 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}

// uint32
// Reads ptr[idx] from a uniform 32 bit unsigned array, with a separate index
// per program instance.
inline unsigned int32 gather_uint(const uniform unsigned int32* const uniform ptr, int idx)
{
	const unsigned int32 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Reads element idx of a varying 32 bit unsigned array, with a separate
// index per program instance.
inline unsigned int32 gather_uint(const varying unsigned int32* const uniform ptr, int idx)
{
	const unsigned int32 gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Writes value to ptr[idx] in a uniform 32 bit unsigned array, with a
// separate index per program instance.
inline void scatter_uint(uniform unsigned int32* ptr, int idx, uint32 value)
{
	uniform unsigned int32* target = ptr + idx;
	*target = value; // (perf warning expected)
}
// Shifts v right by a uniform number of bits. v is unsigned, so the shift is
// logical.
inline uint32 shift_right(uint32 v, const uniform int bits)
{
	const uint32 shifted = v >> bits; // (perf warning expected)
	return shifted;
}

// float
// Reads ptr[idx] from a uniform float array, with a separate index per
// program instance.
inline float gather_float(uniform float* uniform ptr, int idx)
{
	const float gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Reads element idx of a varying float array, with a separate index per
// program instance.
inline float gather_float(varying float* uniform ptr, int idx)
{
	const float gathered = ptr[idx]; // (perf warning expected)
	return gathered;
}
// Writes value to ptr[idx] in a uniform float array, with a separate index
// per program instance.
inline void scatter_float(uniform float* uniform ptr, int idx, float value)
{
	uniform float* target = ptr + idx;
	*target = value; // (perf warning expected)
}
// Writes value into element idx of a varying float array, with a separate
// index per program instance.
inline void scatter_float(varying float* uniform ptr, int idx, float value)
{
	varying float* target = ptr + idx;
	*target = value; // (perf warning expected)
}


///////////////////////////////////////////////////////////
// Swapping helpers

// Exchanges two floats.
// The temporary has to be a float: an int temporary would truncate the
// value that ends up in b.
inline void swap(float& a, float& b)
{
	float t = a;
	a = b; b = t;
}

// Exchanges two ints.
inline void swap(int& a, int& b)
{
	const int old_b = b;
	b = a;
	a = old_b;
}

// Exchanges two 32 bit unsigned values.
inline void swap(uint32& a, uint32& b)
{
	const uint32 old_b = b;
	b = a;
	a = old_b;
}

// Exchanges two 8 bit unsigned values.
inline void swap(uint8& a, uint8& b)
{
	const uint8 old_b = b;
	b = a;
	a = old_b;
}

// Exchanges the first n elements of u and v, element by element.
inline void swap_ints(int u[], int v[], uniform int n)
{
	for (uniform int k = 0; k < n; ++k)
	{
		const int from_v = v[k];
		v[k] = u[k];
		u[k] = from_v;
	}
}

// Exchanges the first n elements of u and v, element by element.
inline void swap_uints(uint32 u[], uint32 v[], uniform int n)
{
	for (uniform int k = 0; k < n; ++k)
	{
		const uint32 from_v = v[k];
		v[k] = u[k];
		u[k] = from_v;
	}
}

// Reverses the byte order of a 32 bit value.
inline uint32 bswap32(uint32 v)
{
	const uint32 top    = (v >> 24) & 255;
	const uint32 upper  = (v >> 16) & 255;
	const uint32 lower  = (v >> 8) & 255;
	const uint32 bottom = (v >> 0) & 255;

	// The four bytes land in disjoint bit ranges, so OR is the same as adding.
	return (top << 0) | (upper << 8) | (lower << 16) | (bottom << 24);
}


///////////////////////////////////////////////////////////
// Scalar math helpers

// Returns v squared.
inline float sq(float v)
{
	const float squared = v * v;
	return squared;
}

// Returns 2 to the power x, computed as a left shift of 1.
inline int pow2(int x)
{
	const int one = 1;
	return one << x;
}

// Clamps v to [a, b] with integer bounds by forwarding to the float/float
// clamp overload.
inline float clamp(float v, int a, int b)
{
	const float low = (float)a;
	const float high = (float)b;
	return clamp(v, low, high);
}


///////////////////////////////////////////////////////////
// Vector/Matrix/Tensor math helpers

// vector dot product.
// Dot product of two 3 component vectors.
inline float dot3(float a[3], float b[3])
{
	const float xx = a[0]*b[0];
	const float yy = a[1]*b[1];
	const float zz = a[2]*b[2];
	return xx+yy+zz;
}
// Dot product of two 4 component vectors.
inline float dot4(float a[4], float b[4])
{
	const float xx = a[0]*b[0];
	const float yy = a[1]*b[1];
	const float zz = a[2]*b[2];
	const float ww = a[3]*b[3];
	return xx+yy+zz+ww;
}

// a = covar*b, where a,b are vectors and covar is a symmetric matrix (only its upper triangle is stored).
// Multiplies a symmetric 3x3 matrix by a vector: a = covar*b.
// covar holds the upper triangle row by row:
//   [0] [1] [2]
//       [3] [4]
//           [5]
// Each output is stored as soon as it is computed, before the next row is
// evaluated.
inline void ssymv(float a[3], float covar[6], float b[3])
{
	const float row0 = covar[0]*b[0]+covar[1]*b[1]+covar[2]*b[2];
	a[0] = row0;

	const float row1 = covar[1]*b[0]+covar[3]*b[1]+covar[4]*b[2];
	a[1] = row1;

	const float row2 = covar[2]*b[0]+covar[4]*b[1]+covar[5]*b[2];
	a[2] = row2;
}
// Multiplies the upper left 3x3 part of a symmetric 4x4 matrix by the first
// three components of b. covar holds the 4x4 upper triangle row by row
// (10 values); a[3] is not written.
inline void ssymv3(float a[4], float covar[10], float b[4])
{
	const float row0 = covar[0]*b[0]+covar[1]*b[1]+covar[2]*b[2];
	a[0] = row0;

	const float row1 = covar[1]*b[0]+covar[4]*b[1]+covar[5]*b[2];
	a[1] = row1;

	const float row2 = covar[2]*b[0]+covar[5]*b[1]+covar[7]*b[2];
	a[2] = row2;
}
// Multiplies a symmetric 4x4 matrix by a vector: a = covar*b.
// covar holds the upper triangle row by row:
//   [0] [1] [2] [3]
//       [4] [5] [6]
//           [7] [8]
//               [9]
inline void ssymv4(float a[4], float covar[10], float b[4])
{
	const float row0 = covar[0]*b[0]+covar[1]*b[1]+covar[2]*b[2]+covar[3]*b[3];
	a[0] = row0;

	const float row1 = covar[1]*b[0]+covar[4]*b[1]+covar[5]*b[2]+covar[6]*b[3];
	a[1] = row1;

	const float row2 = covar[2]*b[0]+covar[5]*b[1]+covar[7]*b[2]+covar[8]*b[3];
	a[2] = row2;

	const float row3 = covar[3]*b[0]+covar[6]*b[1]+covar[8]*b[2]+covar[9]*b[3];
	a[3] = row3;
}


///////////////////////////////////////////////////////////
// Bit manipulation helpers.
//

// Appends the low `bits` bits of v to a bit stream stored in 32 bit words.
//   data - the words being filled (bits are ORed in)
//   pos  - current bit position; advanced by `bits` on return
//   bits - number of bits to append
//   v    - value to append; asserted to be below 2^bits
// A value that straddles a word boundary has its upper part ORed into the
// following word.
inline void put_bits(uint32 data[5], uniform int* uniform pos, uniform int bits, int v)
{
	assert(v<pow2(bits));

	const uniform int word   = *pos/32;
	const uniform int offset = *pos%32;

	data[word] |= ((uint32)v) << offset;

	const uniform bool spills = (offset+bits>32);
	if (spills)
		data[word+1] |= shift_right(v, 32-offset);

	*pos += bits;
}


///////////////////////////////////////////////////////////
// Helper for pixel access in variable sized block.
//

// Reads the value of channel plane p at (x, y) from a planar pixel buffer
// laid out with 8 values per row and 64 values per plane.
inline float get_pixel(float pixels[], uniform int p, uniform int x, uniform int y)
{
	uniform static const int row_stride = 8;
	uniform static const int plane_stride = 64;

	const uniform int offset = plane_stride * p + row_stride * y + x;
	return pixels[offset];
}

// Writes value to channel plane p at (x, y) in a planar pixel buffer laid
// out with 8 values per row and 64 values per plane.
inline void set_pixel(float pixels[], uniform int p, uniform int x, uniform int y, float value)
{
	uniform static const int row_stride = 8;
	uniform static const int plane_stride = 64;

	const uniform int offset = plane_stride * p + row_stride * y + x;
	pixels[offset] = value;
}


///////////////////////////////////////////////////////////
// Generic helpers


///////////////////////////////////////////////////////////
// Axis computation.
//

// Power iteration on a symmetric 3x3 matrix, starting from (1,1,1).
//   axis            - receives the resulting vector; also used as scratch
//   covar           - upper triangle of the matrix, in the layout ssymv expects
//   powerIterations - number of matrix multiplications to perform
// The vector is rescaled to unit length on every second iteration only.
inline void compute_axis3(float axis[3], float covar[6], uniform const int powerIterations)
{
	float current[3] = {1,1,1};

	for (uniform int iter=0; iter<powerIterations; iter++)
	{
		ssymv(axis, covar, current);
		for (uniform int c=0; c<3; c++)
			current[c] = axis[c];

		const uniform bool renormalize = (iter & 1) != 0; // odd iterations only
		if (renormalize)
		{
			float length_sq = 0;
			for (uniform int c=0; c<3; c++)
				length_sq += axis[c]*axis[c];

			const float inv_length = rsqrt(length_sq);
			for (uniform int c=0; c<3; c++)
				current[c] *= inv_length;
		}
	}

	for (uniform int c=0; c<3; c++)
		axis[c] = current[c];
}

// Power iteration on a symmetric matrix for 3 or 4 channels, starting from
// an all-ones vector.
//   axis            - receives the resulting vector; also used as scratch
//   covar           - upper triangle of a 4x4 matrix (10 values)
//   powerIterations - number of matrix multiplications to perform
//   channels        - 3 or 4; no multiplication is done for other values
// The vector is rescaled to unit length on every second iteration only.
inline void compute_axis(float axis[4], float covar[10], uniform const int powerIterations, uniform int channels)
{
	float current[4] = {1,1,1,1};

	for (uniform int iter=0; iter<powerIterations; iter++)
	{
		if (channels == 3)
			ssymv3(axis, covar, current);
		else if (channels == 4)
			ssymv4(axis, covar, current);

		for (uniform int c=0; c<channels; c++)
			current[c] = axis[c];

		const uniform bool renormalize = (iter & 1) != 0; // odd iterations only
		if (renormalize)
		{
			float length_sq = 0;
			for (uniform int c=0; c<channels; c++)
				length_sq += axis[c]*axis[c];

			const float inv_length = rsqrt(length_sq);
			for (uniform int c=0; c<channels; c++)
				current[c] *= inv_length;
		}
	}

	for (uniform int c=0; c<channels; c++)
		axis[c] = current[c];
}


///////////////////////////////////////////////////////////
// Sorting helpers
//

// Selection sort that stops early: afterwards the first partial_count
// entries of list hold its smallest values in ascending order. The remaining
// entries are left in unspecified order.
inline void partial_sort_list(int list[], uniform int length, uniform int partial_count)
{
	for (uniform int slot=0; slot<partial_count; slot++)
	{
		// Find the smallest value in list[slot..length).
		int min_value = list[slot];
		int min_pos = slot;
		for (uniform int probe=slot+1; probe<length; probe++)
		{
			if (list[probe] < min_value)
			{
				min_pos = probe;
				min_value = list[probe];
			}
		}

		// Swap it into place; min_pos differs per program instance, hence the scatter.
		scatter_int(list, min_pos, list[slot]);
		list[slot] = min_value;
	}
}


///////////////////////////////////////////////////////////
// Input surface structures.
//

// Input image description used by the rgba_surface based loaders.
struct rgba_surface
{
	uint8* ptr;  // first byte of the image data
	int width;
	int height;
	int stride;  // the loaders use this as the byte offset between rows
};

// Input image description; same field layout as rgba_surface.
struct rg_surface
{
	uint8* ptr;  // first byte of the image data
	int width;
	int height;
	int stride;  // presumably the byte offset between rows - TODO confirm
};

// Input image description; same field layout as rgba_surface.
struct red_surface
{
	uint8* ptr;  // first byte of the image data
	int width;
	int height;
	int stride;  // presumably the byte offset between rows - TODO confirm
};

// Surface definitions.
// Declares a struct called `name` holding a byte pointer and three int32
// fields (Width, Height, Stride). All surface formats share this layout; the
// distinct type names only tell the loaders how to interpret the bytes.
// NOTE(review): the macro name is misspelled ("DELARE"); it is left as is
// because other files may refer to it.
#define DELARE_SURFACE(name) \
	struct name \
	{ \
		uint8* Ptr; \
		int32 Width; \
		int32 Height; \
		int32 Stride; \
	};

// 8 bit unsigned int/component
DELARE_SURFACE(Surface_R8)
DELARE_SURFACE(Surface_RG8)
DELARE_SURFACE(Surface_RGBA8)

// 8 bit signed int/component
DELARE_SURFACE(Surface_R8S)
DELARE_SURFACE(Surface_RG8S)

// 16 bit unsigned int/component
DELARE_SURFACE(Surface_R16)
DELARE_SURFACE(Surface_RG16)
DELARE_SURFACE(Surface_RGBA16)

// 16 bit signed int/component
DELARE_SURFACE(Surface_R16S)
DELARE_SURFACE(Surface_RG16S)

// 16 bit float/component (uint16_t, for lack of a built-in type).
DELARE_SURFACE(Surface_R16F)
DELARE_SURFACE(Surface_RG16F)
DELARE_SURFACE(Surface_RGBA16F)

// 32 bit float/component
DELARE_SURFACE(Surface_R32F)
DELARE_SURFACE(Surface_RG32F)
DELARE_SURFACE(Surface_RGBA32F)

///////////////////////////////////////////////////////////
// Load data from input surface structures.
//

// Loads one 4x4 block of an 8 bit single channel surface into Block as
// floats in the range 0..255, row by row (Block[Row*4+Col]).
//   Block                - receives the 16 texel values
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4Block_R8( float Block[16], uniform Surface_R8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	// Iterate over the rows of the block, so the bound is the block height.
	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		uniform uint32* uniform SurfaceRowStartPtr = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		// Four 8 bit texels are exactly one dword, so the dword index equals the block index.
		// (Original note: multiplying by BlockWidth/sizeof(uint32) breaks things for unknown reasons.)
		uint32 RRRR = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex );

		Block[Row*BlockWidth+0] = (int)((RRRR>> 0)&0xFF);
		Block[Row*BlockWidth+1] = (int)((RRRR>> 8)&0xFF);
		Block[Row*BlockWidth+2] = (int)((RRRR>>16)&0xFF);
		Block[Row*BlockWidth+3] = (int)((RRRR>>24)&0xFF);
	}
}

// Loads one 4x4 block of an 8 bit two channel surface into planar form:
// InterleavedBlock[0..15] receives R, InterleavedBlock[16..31] receives G,
// each as floats in the range 0..255 in row-major order.
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4BlockInterleaved_RG8( float InterleavedBlock[32], uniform Surface_RG8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	// Iterate over the rows of the block, so the bound is the block height.
	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		uniform uint32* uniform SurfaceRowStartPtr = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		// One block row is 4 texels * 2 bytes = two dwords, each holding two RG pairs.
		uint32 RG0RG1 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+0 );
		uint32 RG2RG3 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+1 );

		InterleavedBlock[16*0+Row*BlockWidth+0] = (int)((RG0RG1>> 0)&0xFF);
		InterleavedBlock[16*1+Row*BlockWidth+0] = (int)((RG0RG1>> 8)&0xFF);
		InterleavedBlock[16*0+Row*BlockWidth+1] = (int)((RG0RG1>>16)&0xFF);
		InterleavedBlock[16*1+Row*BlockWidth+1] = (int)((RG0RG1>>24)&0xFF);
		InterleavedBlock[16*0+Row*BlockWidth+2] = (int)((RG2RG3>> 0)&0xFF);
		InterleavedBlock[16*1+Row*BlockWidth+2] = (int)((RG2RG3>> 8)&0xFF);
		InterleavedBlock[16*0+Row*BlockWidth+3] = (int)((RG2RG3>>16)&0xFF);
		InterleavedBlock[16*1+Row*BlockWidth+3] = (int)((RG2RG3>>24)&0xFF);
	}
}

// Loads one 4x4 block of an 8 bit RGBA surface into planar form: four
// consecutive 16 value planes (R, G, B, A), each as floats in the range
// 0..255 in row-major order.
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4BlockInterleaved_RGBA8( float InterleavedBlock[64], uniform Surface_RGBA8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		// The row pointer does not depend on the column.
		uniform uint32* uniform RowDwords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		for ( uniform int Col=0; Col<BlockWidth; Col++ )
		{
			const uniform int Texel = Row*BlockWidth+Col;

			// One texel is one dword.
			uint32 Packed = gather_uint( RowDwords, HorizontalBlockIndex*BlockWidth+Col );

			InterleavedBlock[16*0+Texel] = (int)((Packed>> 0)&0xFF);
			InterleavedBlock[16*1+Texel] = (int)((Packed>> 8)&0xFF);
			InterleavedBlock[16*2+Texel] = (int)((Packed>>16)&0xFF);
			InterleavedBlock[16*3+Texel] = (int)((Packed>>24)&0xFF);
		}
	}
}

// Loads one 4x4 block of an 8 bit RGBA surface into planar form, dropping
// the fourth channel: three consecutive 16 value planes (R, G, B), each as
// floats in the range 0..255 in row-major order.
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA8( float InterleavedBlock[48], uniform Surface_RGBA8* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		// The row pointer does not depend on the column.
		uniform uint32* uniform RowDwords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		for ( uniform int Col=0; Col<BlockWidth; Col++ )
		{
			const uniform int Texel = Row*BlockWidth+Col;

			// One texel is one dword; its top byte is ignored.
			uint32 Packed = gather_uint( RowDwords, HorizontalBlockIndex*BlockWidth+Col );

			InterleavedBlock[16*0+Texel] = (int)((Packed>> 0)&0xFF);
			InterleavedBlock[16*1+Texel] = (int)((Packed>> 8)&0xFF);
			InterleavedBlock[16*2+Texel] = (int)((Packed>>16)&0xFF);
		}
	}
}

// Loads one BlockWidth x BlockHeight block of an 8 bit RGBA surface into a
// planar pixel buffer through set_pixel (planes 0..3 receive the four bytes
// of each texel, lowest byte first, as floats in the range 0..255).
//   pixels               - destination buffer in the layout set_pixel writes
//   InputSurface         - source surface; stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, one per program instance
//   BlockWidth/Height    - block dimensions in texels
inline void LoadUncompressedVariableBlockInterleaved_RGBA8( float pixels[], uniform rgba_surface InputSurface[], int HorizontalBlockIndex, int VerticalBlockIndex, uniform int BlockWidth, uniform int BlockHeight )
{
	for (uniform int y = 0; y < BlockHeight; y++)
	{
		for (uniform int x = 0; x < BlockWidth; x++)
		{
			// Byte offset of the texel, divided by 4 to index the surface as dwords.
			uint32 rgba = gather_uint((uint32*)InputSurface->ptr, ((VerticalBlockIndex * BlockHeight + y)*InputSurface->stride + (HorizontalBlockIndex * BlockWidth + x) * 4)/4);

			set_pixel(pixels, 0, x, y, (int)((rgba >> 0) & 255));
			set_pixel(pixels, 1, x, y, (int)((rgba >> 8) & 255));
			set_pixel(pixels, 2, x, y, (int)((rgba >> 16) & 255));
			set_pixel(pixels, 3, x, y, (int)((rgba >> 24) & 255));
		}
	}
}

// Loads one 4x4 block of a 16 bit single channel surface into Block in
// row-major order, rescaling each value from 0..65535 to 0..255.
//   Block                - receives the 16 texel values
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4Block_R16( float Block[16], uniform Surface_R16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	// Iterate over the rows of the block, so the bound is the block height.
	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		uniform uint32* uniform SurfaceRowStartPtr = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		// One block row is 4 texels * 2 bytes = two dwords, each holding two texels.
		uint32 R0R1 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+0 );
		uint32 R2R3 = gather_uint( SurfaceRowStartPtr, HorizontalBlockIndex*2+1 );

		Block[Row*BlockWidth+0] = ((int)((R0R1>> 0)&0xFFFFu))*(255f/65535f);
		Block[Row*BlockWidth+1] = ((int)((R0R1>>16)&0xFFFFu))*(255f/65535f);
		Block[Row*BlockWidth+2] = ((int)((R2R3>> 0)&0xFFFFu))*(255f/65535f);
		Block[Row*BlockWidth+3] = ((int)((R2R3>>16)&0xFFFFu))*(255f/65535f);
	}
}

// Loads one 4x4 block of a 16 bit two channel surface into planar form:
// InterleavedBlock[0..15] receives R, InterleavedBlock[16..31] receives G,
// in row-major order and rescaled from 0..65535 to 0..255.
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4BlockInterleaved_RG16( float InterleavedBlock[32], uniform Surface_RG16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		// The row pointer does not depend on the column.
		uniform uint32* uniform RowDwords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		for ( uniform int Col=0; Col<BlockWidth; Col++ )
		{
			const uniform int Texel = Row*BlockWidth+Col;

			// One texel is one dword: R in the low half, G in the high half.
			uint32 Packed = gather_uint( RowDwords, HorizontalBlockIndex*BlockWidth+Col );

			InterleavedBlock[16*0+Texel] = ((int)((Packed>> 0)&0xFFFFu))*(255f/65535f);
			InterleavedBlock[16*1+Texel] = ((int)((Packed>>16)&0xFFFFu))*(255f/65535f);
		}
	}
}

// Loads one 4x4 block of a 16 bit RGBA surface into planar form: four
// consecutive 16 value planes (R, G, B, A) in row-major order, rescaled from
// 0..65535 to 0..255.
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4BlockInterleaved_RGBA16( float InterleavedBlock[64], uniform Surface_RGBA16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		// The row pointer does not depend on the column.
		uniform uint32* uniform RowDwords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		for ( uniform int Col=0; Col<BlockWidth; Col++ )
		{
			const uniform int Texel = Row*BlockWidth+Col;

			// One texel is two dwords: R|G followed by B|A.
			uint32 LowPair  = gather_uint( RowDwords, (HorizontalBlockIndex*BlockWidth+Col)*2+0 );
			uint32 HighPair = gather_uint( RowDwords, (HorizontalBlockIndex*BlockWidth+Col)*2+1 );

			InterleavedBlock[16*0+Texel] = ((int)((LowPair>> 0)&0xFFFFu))*(255f/65535f);
			InterleavedBlock[16*1+Texel] = ((int)((LowPair>>16)&0xFFFFu))*(255f/65535f);
			InterleavedBlock[16*2+Texel] = ((int)((HighPair>> 0)&0xFFFFu))*(255f/65535f);
			InterleavedBlock[16*3+Texel] = ((int)((HighPair>>16)&0xFFFFu))*(255f/65535f);
		}
	}
}

// Loads one 4x4 block of a 16 bit RGBA surface into planar form, dropping
// the fourth channel: three consecutive 16 value planes (R, G, B) in
// row-major order, rescaled from 0..65535 to 0..255.
//   InputSurface         - source surface; Stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
inline void LoadUncompressed4x4BlockInterleavedNoAlpha_RGBA16( float InterleavedBlock[48], uniform Surface_RGBA16* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	const uniform int BlockHeight = 4;
	const uniform int BlockWidth  = 4;

	for ( uniform int Row=0; Row<BlockHeight; Row++ )
	{
		// The row pointer does not depend on the column.
		uniform uint32* uniform RowDwords = (uint32*)&InputSurface->Ptr[(VerticalBlockIndex*BlockHeight+Row)*InputSurface->Stride];

		for ( uniform int Col=0; Col<BlockWidth; Col++ )
		{
			const uniform int Texel = Row*BlockWidth+Col;

			// One texel is two dwords: R|G followed by B|X (upper half ignored).
			uint32 LowPair  = gather_uint( RowDwords, (HorizontalBlockIndex*BlockWidth+Col)*2+0 );
			uint32 HighPair = gather_uint( RowDwords, (HorizontalBlockIndex*BlockWidth+Col)*2+1 );

			InterleavedBlock[16*0+Texel] = ((int)((LowPair>> 0)&0xFFFFu))*(255f/65535f);
			InterleavedBlock[16*1+Texel] = ((int)((LowPair>>16)&0xFFFFu))*(255f/65535f);
			InterleavedBlock[16*2+Texel] = ((int)((HighPair>> 0)&0xFFFFu))*(255f/65535f);
		}
	}
}

// Loads one 4x4 block of a surface with four 16 bit channels per texel into
// planar form. Planes 0..2 receive the raw 16 bit values of the first three
// channels (no rescaling); plane 3 is filled with zeros.
//   InputSurface         - source surface; stride is used as the byte offset between rows
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
// Each texel is read as two aligned dwords rather than through dword
// pointers offset by 2 and 4 bytes; the extracted 16 bit values are the same.
// NOTE(review): the parameter is declared as [48] but plane 3 writes indices
// 48..63, so callers must pass at least 64 floats - confirm against callers.
inline void load_4x4_block_interleaved_16bit( float InterleavedBlock[48], uniform rgba_surface* uniform InputSurface, int HorizontalBlockIndex, uniform int VerticalBlockIndex )
{
	for (uniform int y = 0; y<4; y++)
	{
		// The row pointer does not depend on the column.
		uniform uint32* uniform src_row = (uint32*)&InputSurface->ptr[(VerticalBlockIndex * 4 + y)*InputSurface->stride];

		for (uniform int x = 0; x<4; x++)
		{
			// A texel is 8 bytes: dword 0 holds channels 0 and 1, dword 1 holds channel 2.
			uint32 rg = gather_uint(src_row, (HorizontalBlockIndex * 4 + x) * 2 + 0);
			uint32 bx = gather_uint(src_row, (HorizontalBlockIndex * 4 + x) * 2 + 1);

			InterleavedBlock[16 * 0 + y * 4 + x] = (int)(rg & 0xFFFF);
			InterleavedBlock[16 * 1 + y * 4 + x] = (int)((rg >> 16) & 0xFFFF);
			InterleavedBlock[16 * 2 + y * 4 + x] = (int)(bx & 0xFFFF);
			InterleavedBlock[16 * 3 + y * 4 + x] = 0;
		}
	}
}

///////////////////////////////////////////////////////////
// Store data.
//

// Writes one compressed block to the output buffer.
//   OutputBase           - start of the output buffer
//   TextureWidth         - used with NumDataDwords to compute the byte offset of a block row
//   HorizontalBlockIndex - block column, one per program instance
//   VerticalBlockIndex   - block row, shared by all program instances
//   Data                 - the block's dwords
//   NumDataDwords        - number of dwords per block
// A block row starts at byte VerticalBlockIndex*TextureWidth*NumDataDwords;
// within it, block i occupies dwords [i*NumDataDwords, (i+1)*NumDataDwords).
inline void StoreCompressedBlock( uniform uint8 OutputBase[], int TextureWidth, int HorizontalBlockIndex, uniform int VerticalBlockIndex, uint32 Data[], int NumDataDwords )
{
	// The row pointer does not depend on the dword being written.
	uniform uint32* RowDwords = (uint32*)&OutputBase[(VerticalBlockIndex)*TextureWidth*NumDataDwords];
	const int FirstDword = HorizontalBlockIndex*NumDataDwords;

	for ( uniform int k=0; k<NumDataDwords; k++ )
	{
		scatter_uint( RowDwords, FirstDword+k, Data[k]);
	}
}