// Copyright Epic Games, Inc. All Rights Reserved.
#if VECTORVM_SUPPORTS_EXPERIMENTAL
#include "VectorVMExperimental.h"
#include "VectorVM.h"
#include "HAL/ConsoleManager.h"
//prototypes for internal functions to the VVM
#if VECTORVM_SUPPORTS_SERIALIZATION
void VVMSer_serializeInstruction(FVectorVMSerializeState *SerializeState, FVectorVMSerializeState *CmpSerializeState,
    FVectorVMState *VVMState, FVectorVMBatchState *BatchState,
    int StartInstance, int NumInstancesThisChunk, int NumLoops,
    int OpStart, int NumOps, uint64 StartInsCycles, uint64 EndInsCycles);
uint32 VVMSer_initSerializationState_(FVectorVMSerializeState *SerializeState, FVectorVMExecContext *ExecCtx, const FVectorVMOptimizeContext *OptimizeContext, uint32 Flags);
int VVMGetRegisterType(FVectorVMState *VVMState, uint16 RegIdx, uint16 *OutAbsReg);
uint32 *VVMSer_getRegPtrTablePtrFromIns(FVectorVMSerializeState *SerializeState, FVectorVMSerializeInstruction *Ins, uint16 RegIdx);
#endif
#if VECTORVM_DEBUG_PRINTF
#define VECTORVM_PRINTF(fmt, ...) \
{ \
    char buff[1024]; \
    snprintf(buff, sizeof(buff), fmt, __VA_ARGS__); \
    wchar_t buff16[1024]; \
    char *p = buff; \
    wchar_t *p16 = buff16; \
    while (*p) *p16++ = *p++; \
    *p16 = 0; \
    UE_LOG(LogVectorVM, Warning, TEXT("%s"), (TCHAR *)buff16); \
}
#define VECTORVM_PRINT_NEW_LINE()
int VVMGetRegisterType(FVectorVMState *VVMState, uint16 RegIdx, uint16 *OutAbsReg);
#endif
void *VVMDefaultRealloc(void *Ptr, size_t NumBytes, const char *Filename, int LineNumber)
{
    return FMemory::Realloc(Ptr, NumBytes);
}
void VVMDefaultFree(void *Ptr, const char *Filename, int LineNumber)
{
    return FMemory::Free(Ptr);
}
//cvar
static int32 GVVMPageSizeInKB = 64;
static FAutoConsoleVariableRef CVarVVMPageSizeInKB(
    TEXT("vm.PageSizeInKB"),
    GVVMPageSizeInKB,
    TEXT("Minimum allocation per VM instance. There are 64 of these, so multiply GVVMPageSizeInKB * 64 * 1024 to get the total number of bytes used by the VVM\n"),
    ECVF_ReadOnly
);
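//e.g. at the default of 64: 64 KB per page * 64 pages = 4 MiB reserved for the VVM in total.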
#if UE_BUILD_DEBUG
#define VM_FORCEINLINE FORCENOINLINE
#else
#define VM_FORCEINLINE FORCEINLINE
#endif
static const char VVM_RT_CHAR[] = {
    'R', 'C', 'I', 'O', 'X'
};
#define VVM_CHUNK_FIXED_OVERHEAD_SIZE 512
//to avoid memset/memcpy when statically initializing sse variables
#define VVMSet_m128Const(Name, V) static const MS_ALIGN(16) float VVMConstVec4_##Name##4[4] GCC_ALIGN(16) = { V, V, V, V }
#define VVMSet_m128Const4(Name, V0, V1, V2, V3) static const MS_ALIGN(16) float VVMConstVec4_##Name##4[4] GCC_ALIGN(16) = { V0, V1, V2, V3 }
#define VVMSet_m128iConst(Name, V) static const MS_ALIGN(16) uint32 VVMConstVec4_##Name##4i[4] GCC_ALIGN(16) = { V, V, V, V }
#define VVMSet_m128iConst4(Name, V0, V1, V2, V3) static const MS_ALIGN(16) uint32 VVMConstVec4_##Name##4i[4] GCC_ALIGN(16) = { V0, V1, V2, V3 } /* equiv to setr */
#define VVM_m128Const(Name) (*(VectorRegister4f *)&(VVMConstVec4_##Name##4))
#define VVM_m128iConst(Name) (*(VectorRegister4i *)&(VVMConstVec4_##Name##4i))
VVMSet_m128Const(One, 1.f);
VVMSet_m128Const(NegativeOne, -1.f);
VVMSet_m128Const(OneHalf, 0.5f);
VVMSet_m128Const(Epsilon, 1.e-8f);
VVMSet_m128Const(HalfPi, 3.14159265359f * 0.5f);
VVMSet_m128Const(QuarterPi, 3.14159265359f * 0.25f);
VVMSet_m128Const(FastSinA, 7.5894663844f);
VVMSet_m128Const(FastSinB, 1.6338434578f);
VVMSet_m128Const(Log2, 0.6931471806f);
VVMSet_m128Const(OneOverLog2, 1.4426950409f);
VVMSet_m128iConst(FMask, 0xFFFFFFFF);
VVMSet_m128iConst4(ZeroOneTwoThree, 0, 1, 2, 3);
VVMSet_m128iConst4(ZeroTwoFourSix, 0, 2, 4, 6);
VVMSet_m128Const4(ZeroOneTwoThree, 0.f, 1.f, 2.f, 3.f);
VVMSet_m128iConst(RegOffsetMask, 0x7FFF);
VVMSet_m128Const(RegOneOverTwoPi, 1.f / 2.f / 3.14159265359f);
VVMSet_m128iConst(AlmostTwoBits, 0x3fffffff);
//the output instructions work on 4 32-bit values at a time. The acquireindex instruction writes 1 byte for every
//4 instances. The lower 4 bits correspond to whether the instance should be kept or not: 1 for keep, 0 for discard.
//For instance, if we have 4 instances and we're keeping the first and third, then acquireindex would write 0101.
//This is 5 in base 10. The output instructions would use the 5th element of this table to shuffle the values we
//want to write into the correct position. On x64 this is used by the _mm_shuffle_epi8 intrinsic,
//and vqtbl1q_u8 on ARM. These are used via the macro VVM_pshufb (pshufb is the x64 instruction that corresponds to
//_mm_shuffle_epi8)
static const MS_ALIGN(16) uint8 VVM_PSHUFB_OUTPUT_TABLE[] GCC_ALIGN(16) =
{
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // xxxx
    0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0xxx
    0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 1xxx
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 01xx
    0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 2xxx
    0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 02xx
    0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 12xx
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, // 012x
    0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 3xxx
    0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 03xx
    0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 13xx
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, // 013x
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 23xx
    0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, // 023x
    0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, // 123x
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  // 0123
};
static const MS_ALIGN(16) uint8 VVM_PSHUFB_OUTPUT_TABLE16[] GCC_ALIGN(16) =
{
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x04, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x04, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x02, 0x03, 0x04, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x02, 0x03, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
static const uint8 VVM_OUTPUT_ADVANCE_TABLE[] =
{
    0,  4,  4,  8,
    4,  8,  8, 12,
    4,  8,  8, 12,
    8, 12, 12, 16
};
static const uint8 VVM_OUTPUT_ADVANCE_TABLE16[] =
{
    0, 2, 2, 4,
    2, 4, 4, 6,
    2, 4, 4, 6,
    4, 6, 6, 8
};
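//Illustrative sketch (editorial, not engine code): how an acquireindex keep-mask picks a row of
//VVM_PSHUFB_OUTPUT_TABLE to left-pack the surviving instances. Wrapped in #if 0 because
//VVM_pshufb is only defined further down, per platform.
#if 0
static void VVM_Example_CompactKeptInstances()
{
    MS_ALIGN(16) uint32 Src[4] GCC_ALIGN(16) = { 10, 20, 30, 40 };
    MS_ALIGN(16) uint32 Dst[4] GCC_ALIGN(16) = { 0, 0, 0, 0 };
    uint8 KeepMask = 0x5; //binary 0101: keep instances 0 and 2, discard 1 and 3
    VectorRegister4i Shuf = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE)[KeepMask];
    VectorRegister4i Packed = VVM_pshufb(VectorIntLoad(Src), Shuf); //0xFF mask bytes shuffle in zeroes
    VectorIntStore(Packed, Dst); //Dst is now { 10, 30, 0, 0 }
    //VVM_OUTPUT_ADVANCE_TABLE[KeepMask] == 8: the write cursor advances by the 8 bytes (2 values) kept
}
#endif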
struct FVVM_VUI4
{
    union
    {
        VectorRegister4i v;
        uint32 u4[4];
        int32 i4[4];
    };
};
VM_FORCEINLINE VectorRegister4i VVMf2i(VectorRegister4i v0)
{
    FVecReg u;
    u.i = v0;
    VectorRegister4i res = VectorFloatToInt(u.v);
    return res;
}
VM_FORCEINLINE VectorRegister4f VVMi2f(VectorRegister4f v0)
{
    FVecReg u;
    u.v = v0;
    VectorRegister4f res = VectorIntToFloat(u.i);
    return res;
}
#define VVMDebugBreakIf(expr) if ((expr)) { PLATFORM_BREAK(); }
VM_FORCEINLINE uint16 float_to_half_fast3_rtne(uint32 f_in)
{
    uint16 h_out;
    FPlatformMath::StoreHalf(&h_out, *reinterpret_cast<float *>(&f_in));
    return h_out;
}
void VVMMemCpy(void *dst, void *src, size_t bytes)
{
    unsigned char *RESTRICT d = (unsigned char *)dst;
    unsigned char *RESTRICT s = (unsigned char *)src;
    unsigned char *RESTRICT s_end = s + bytes;
    ptrdiff_t ofs_to_dest = d - s;
    if (bytes < 16)
    {
        if (bytes)
        {
            do
            {
                s[ofs_to_dest] = s[0];
                ++s;
            } while (s < s_end);
        }
    }
    else
    {
        // do one unaligned to get us aligned for the stream out below
        VectorRegister4i i0 = VectorIntLoad(s);
        VectorIntStore(i0, d);
        s += 16 + 16 - ((size_t)d & 15); // s is 16 bytes ahead
        while (s <= s_end)
        {
            i0 = VectorIntLoad(s - 16);
            VectorIntStoreAligned(i0, s - 16 + ofs_to_dest);
            s += 16;
        }
        // do one unaligned to finish the copy
        i0 = VectorIntLoad(s_end - 16);
        VectorIntStore(i0, s_end + ofs_to_dest - 16);
    }
}
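//Editorial note on the copy above: for bytes >= 16 the unaligned head store, the aligned
//middle stream, and the unaligned tail store may overlap; each overlapping range is written
//with identical bytes, so the result is correct and no scalar fixup loop is needed.
//VVMMemSet32 and VVMMemSet16 below use the same overlapping-tail trick.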
void VVMMemSet32(void *dst, uint32 val, size_t num_vals)
{
    if (num_vals <= 4)
    {
        uint32 *dst32 = (uint32 *)dst;
        switch (num_vals)
        {
            case 4: VectorIntStore(VectorIntSet1(val), dst); break;
            case 3: dst32[2] = val; //intentional fallthrough
            case 2: dst32[1] = val; //intentional fallthrough
            case 1: dst32[0] = val; //intentional fallthrough
            case 0: break;
        }
    }
    else
    {
        VectorRegister4i v4 = VectorIntSet1(val);
        char *RESTRICT ptr = (char *)dst;
        char *RESTRICT end_ptr = ptr + num_vals * sizeof(val) - sizeof(v4);
        while (ptr < end_ptr) {
            VectorIntStore(v4, ptr);
            ptr += sizeof(v4);
        }
        VectorIntStore(v4, end_ptr);
    }
}
void VVMMemSet16(void *dst, uint16 val, size_t num_vals)
{
    VectorRegister4i Val4 = VectorIntLoad1_16(&val);
    if (num_vals <= 8)
    {
        uint16 *dst16 = (uint16 *)dst;
        switch (num_vals)
        {
            case 8: VectorIntStore(Val4, dst16);
                    break;
            case 7: VectorIntStore_16(Val4, dst16);
                    VectorIntStore_16(Val4, dst16 + 3);
                    break;
            case 6: VectorIntStore_16(Val4, dst16);
                    VectorIntStore_16(Val4, dst16 + 2);
                    break;
            case 5: VectorIntStore_16(Val4, dst16);
                    VectorIntStore_16(Val4, dst16 + 1);
                    break;
            case 4: VectorIntStore_16(Val4, dst16);
                    break;
            case 3: dst16[2] = val; //intentional fallthrough
            case 2: dst16[1] = val; //intentional fallthrough
            case 1: dst16[0] = val; //intentional fallthrough
            case 0: break;
        }
    }
    else
    {
        char *RESTRICT ptr = (char *)dst;
        char *RESTRICT end_ptr = ptr + num_vals * sizeof(val) - sizeof(Val4);
        while (ptr < end_ptr) {
            VectorIntStore(Val4, ptr);
            ptr += sizeof(Val4);
        }
        VectorIntStore(Val4, end_ptr);
    }
}
#if PLATFORM_CPU_X86_FAMILY || defined(__SSE3__)
#define VVM_pshufb(Src, Mask) _mm_shuffle_epi8(Src, Mask)
// Fabian's round-to-nearest-even float to half
static void VVM_floatToHalf(void *output, float const *input)
{
    static const MS_ALIGN(16) unsigned int mask_sign[4] GCC_ALIGN(16) = { 0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u };
    static const MS_ALIGN(16) int c_f16max[4] GCC_ALIGN(16) = { (127 + 16) << 23, (127 + 16) << 23, (127 + 16) << 23, (127 + 16) << 23 }; // all FP32 values >= this round to +inf
    static const MS_ALIGN(16) int c_nanbit[4] GCC_ALIGN(16) = { 0x200, 0x200, 0x200, 0x200 };
    static const MS_ALIGN(16) int c_infty_as_fp16[4] GCC_ALIGN(16) = { 0x7c00, 0x7c00, 0x7c00, 0x7c00 };
    static const MS_ALIGN(16) int c_min_normal[4] GCC_ALIGN(16) = { (127 - 14) << 23, (127 - 14) << 23, (127 - 14) << 23, (127 - 14) << 23 }; // smallest FP32 that yields a normalized FP16
    static const MS_ALIGN(16) int c_subnorm_magic[4] GCC_ALIGN(16) = { ((127 - 15) + (23 - 10) + 1) << 23, ((127 - 15) + (23 - 10) + 1) << 23, ((127 - 15) + (23 - 10) + 1) << 23, ((127 - 15) + (23 - 10) + 1) << 23 };
    static const MS_ALIGN(16) int c_normal_bias[4] GCC_ALIGN(16) = { 0xfff - ((127 - 15) << 23), 0xfff - ((127 - 15) << 23), 0xfff - ((127 - 15) << 23), 0xfff - ((127 - 15) << 23) }; // adjust exponent and add mantissa rounding
    __m128 f = _mm_loadu_ps(input);
    __m128 msign = _mm_castsi128_ps(*(VectorRegister4i *)mask_sign);
    __m128 justsign = _mm_and_ps(msign, f);
    __m128 absf = _mm_xor_ps(f, justsign);
    __m128i absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no throughput hit)
    __m128i f16max = *(VectorRegister4i *)c_f16max;
    __m128 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
    __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
    __m128i nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), *(VectorRegister4i *)c_nanbit);
    __m128i inf_or_nan = _mm_or_si128(nanbit, *(VectorRegister4i *)c_infty_as_fp16); // output for specials
    __m128i min_normal = *(VectorRegister4i *)c_min_normal;
    __m128i b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
    // "result is subnormal" path
    __m128 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(*(VectorRegister4i *)c_subnorm_magic)); // magic value to round output mantissa
    __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), *(VectorRegister4i *)c_subnorm_magic); // subtract out bias
    // "result is normal" path
    __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
    __m128i mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
    __m128i round1 = _mm_add_epi32(absf_int, *(VectorRegister4i *)c_normal_bias);
    __m128i round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
    __m128i normal = _mm_srli_epi32(round2, 13); // rounded result
    // combine the two non-specials
    __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
    // merge in specials as well
    __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
    __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
    __m128i res = _mm_or_si128(joined, sign_shift);
    res = _mm_packs_epi32(res, res);
    _mm_storeu_si64(output, res);
}
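//e.g. 1.0f (0x3F800000) packs to half 0x3C00; any |x| >= 65536.0f fails the b_isregular
//test and produces 0x7C00 (+inf), with NaNs additionally carrying the quiet bit from c_nanbit.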
VM_FORCEINLINE VectorRegister4i VVMIntRShift(VectorRegister4i v0, VectorRegister4i v1)
{
    uint32 *v0_4 = (uint32 *)&v0;
    uint32 *v1_4 = (uint32 *)&v1;
    FVVM_VUI4 res;
    res.u4[0] = v0_4[0] >> v1_4[0];
    res.u4[1] = v0_4[1] >> v1_4[1];
    res.u4[2] = v0_4[2] >> v1_4[2];
    res.u4[3] = v0_4[3] >> v1_4[3];
    return res.v;
}
VM_FORCEINLINE VectorRegister4i VVMIntLShift(VectorRegister4i v0, VectorRegister4i v1)
{
    uint32 *v0_4 = (uint32 *)&v0;
    uint32 *v1_4 = (uint32 *)&v1;
    FVVM_VUI4 res;
    res.u4[0] = v0_4[0] << v1_4[0];
    res.u4[1] = v0_4[1] << v1_4[1];
    res.u4[2] = v0_4[2] << v1_4[2];
    res.u4[3] = v0_4[3] << v1_4[3];
    return res.v;
}
#elif PLATFORM_CPU_ARM_FAMILY
#define VVM_pshufb(Src, Mask) vqtbl1q_u8(Src, Mask)
static FORCEINLINE void VVM_floatToHalf(void *output, float const *input)
{
    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
    vst1_s16((int16_t *)((char *)output + 0), out0);
}
VM_FORCEINLINE VectorRegister4i VVMIntRShift(VectorRegister4i v0, VectorRegister4i v1)
{
    VectorRegister4i res = vshlq_u32(v0, vmulq_s32(v1, vdupq_n_s32(-1)));
    return res;
}
VM_FORCEINLINE VectorRegister4i VVMIntLShift(VectorRegister4i v0, VectorRegister4i v1)
{
    VectorRegister4i res = vshlq_u32(v0, v1);
    return res;
}
#endif
static void SetupBatchStatePtrs(FVectorVMExecContext *ExecCtx, FVectorVMBatchState *BatchState)
{
    uint8 *BatchDataPtr = (uint8 *)VVM_ALIGN_64((size_t)BatchState + sizeof(FVectorVMBatchState));
    size_t NumPtrRegsInTable = ExecCtx->VVMState->NumTempRegisters + ExecCtx->VVMState->NumConstBuffers + ExecCtx->VVMState->NumInputBuffers * 2 + ExecCtx->VVMState->NumOutputBuffers;
    uint32 NumLoops = ExecCtx->Internal.MaxInstancesPerChunk >> 2;
    BatchState->RegisterData = (FVecReg *)BatchDataPtr;   BatchDataPtr += VVM_REG_SIZE * ExecCtx->VVMState->NumTempRegisters * NumLoops;
    BatchState->RegPtrTable = (uint8 **)BatchDataPtr;     BatchDataPtr += NumPtrRegsInTable * sizeof(uint32 *);
    BatchState->RegIncTable = (uint8 *)BatchDataPtr;      BatchDataPtr += NumPtrRegsInTable;
    BatchState->OutputMaskIdx = (uint8 *)BatchDataPtr;    BatchDataPtr += ExecCtx->VVMState->MaxOutputDataSet * NumLoops;
    BatchState->ChunkLocalData.StartingOutputIdxPerDataSet = (uint32 *)BatchDataPtr; BatchDataPtr += ExecCtx->VVMState->ChunkLocalDataOutputIdxNumBytes;
    BatchState->ChunkLocalData.NumOutputPerDataSet = (uint32 *)BatchDataPtr;         BatchDataPtr += ExecCtx->VVMState->ChunkLocalNumOutputNumBytes;
    BatchState->ChunkLocalData.OutputMaskIdx = (uint8 **)BatchDataPtr;               BatchDataPtr += ExecCtx->VVMState->ChunkLocalOutputMaskIdxNumBytes;
    BatchState->ChunkLocalData.RandCounters = nullptr; //these get malloc'd separately if they're ever used... which they very rarely are
    for (uint32 i = 0; i < ExecCtx->VVMState->MaxOutputDataSet; ++i) {
        BatchState->ChunkLocalData.OutputMaskIdx[i] = BatchState->OutputMaskIdx + i * NumLoops;
    }
    { //deal with the external function register decoding buffer
        size_t PtrBeforeExtFnDecodeReg = (size_t)BatchDataPtr;
        BatchState->ChunkLocalData.ExtFnDecodedReg.RegData = (uint32 **)BatchDataPtr; BatchDataPtr += sizeof(FVecReg *) * ExecCtx->VVMState->MaxExtFnRegisters;
        BatchState->ChunkLocalData.ExtFnDecodedReg.RegInc = (uint8 *)BatchDataPtr;    BatchDataPtr += sizeof(uint8) * ExecCtx->VVMState->MaxExtFnRegisters;
        BatchDataPtr = (uint8 *)VVM_PTR_ALIGN(BatchDataPtr);
        BatchState->ChunkLocalData.ExtFnDecodedReg.DummyRegs = (FVecReg *)BatchDataPtr; BatchDataPtr += sizeof(FVecReg) * ExecCtx->VVMState->NumDummyRegsRequired;
    }
    size_t PtrStart = (size_t)BatchState;
    size_t PtrAfterExtFnDecodeReg = (size_t)BatchDataPtr;
    check(PtrAfterExtFnDecodeReg - PtrStart <= ExecCtx->VVMState->BatchOverheadSize + ExecCtx->Internal.PerBatchRegisterDataBytesRequired);
    { //build the register pointer table which contains pointers (in order) to:
        uint32 **TempRegPtr = (uint32 **)BatchState->RegPtrTable;                 //1. Temp Registers
        uint32 **ConstBuffPtr = TempRegPtr + ExecCtx->VVMState->NumTempRegisters; //2. Constant Buffers
        uint32 **InputPtr = ConstBuffPtr + ExecCtx->VVMState->NumConstBuffers;    //3. Input Registers
        uint32 **OutputPtr = InputPtr + ExecCtx->VVMState->NumInputBuffers * 2;   //4. Output Buffers
        static_assert(sizeof(FVecReg) == 16);
        FMemory::Memset(BatchState->RegIncTable, sizeof(FVecReg), NumPtrRegsInTable);
        //temp registers
        for (uint32 i = 0; i < ExecCtx->VVMState->NumTempRegisters; ++i) {
            TempRegPtr[i] = (uint32 *)(BatchState->RegisterData + i * NumLoops);
        }
        //constant buffers
        for (uint32 i = 0; i < ExecCtx->VVMState->NumConstBuffers; ++i) {
            ConstBuffPtr[i] = (uint32 *)(ExecCtx->VVMState->ConstantBuffers + i);
            BatchState->RegIncTable[ExecCtx->VVMState->NumTempRegisters + i] = 0;
        }
        //inputs
        int NoAdvCounter = 0;
        for (uint32 i = 0; i < ExecCtx->VVMState->NumInputBuffers; ++i)
        {
            uint8 DataSetIdx = ExecCtx->VVMState->InputMapCacheIdx[i];
            uint16 InputMapSrcIdx = ExecCtx->VVMState->InputMapCacheSrc[i];
            uint32 **DataSetInputBuffers = (uint32 **)ExecCtx->DataSets[DataSetIdx].InputRegisters.GetData();
            int32 InstanceOffset = ExecCtx->DataSets[DataSetIdx].InstanceOffset;
            const bool bNoAdvanceInput = InputMapSrcIdx & 0x8000;
            const bool bHalfInput = InputMapSrcIdx & 0x4000;
            InputMapSrcIdx = InputMapSrcIdx & 0x3FFF;
            const int32 InputRegisterIndex = ExecCtx->VVMState->NumTempRegisters + ExecCtx->VVMState->NumConstBuffers + i;
            if (bNoAdvanceInput) //this is a noadvance input. It points to data after the constant buffers
            {
                InputPtr[i] = (uint32 *)(ExecCtx->VVMState->ConstantBuffers + ExecCtx->VVMState->NumConstBuffers + NoAdvCounter);
                ++NoAdvCounter;
                BatchState->RegIncTable[InputRegisterIndex] = 0; //no advance inputs... don't advance obviously
                if (bHalfInput) //half input (@TODO: has never been tested)
                {
                    uint16 *Ptr = (uint16 *)DataSetInputBuffers[InputMapSrcIdx] + InstanceOffset;
                    float val = FPlatformMath::LoadHalf(Ptr);
                    VectorRegister4f InputVal = VectorSet1(val);
                    VectorStore(InputVal, (float *)InputPtr[i]);
                }
                else
                {
                    uint32 *Ptr = (uint32 *)DataSetInputBuffers[InputMapSrcIdx] + InstanceOffset;
                    VectorRegister4i InputVal4 = VectorIntSet1(*Ptr);
                    VectorIntStore(InputVal4, InputPtr[i]);
                }
            }
            else //regular input, point directly to the input buffer
            {
                const uint32 DataTypeStride = bHalfInput ? 2 : 4;
                const uint32 OffsetBytes = InstanceOffset * DataTypeStride;
                // Note that we don't update RegIncTable because it is handled by the op being invoked
                // (it will assume that the register is half as appropriate)
                InputPtr[i] = reinterpret_cast<uint32 *>(
                    reinterpret_cast<uint8 *>(DataSetInputBuffers[InputMapSrcIdx]) + OffsetBytes);
            }
            //second copy of the "base" ptr so each chunk can start them at their correct starting offset
            InputPtr[i + ExecCtx->VVMState->NumInputBuffers] = InputPtr[i];
        }
        //outputs
        for (uint32 i = 0; i < ExecCtx->VVMState->NumOutputBuffers; ++i)
        {
            const uint8 DataSetIdx = ExecCtx->VVMState->OutputRemapDataSetIdx[i];
            check(DataSetIdx < ExecCtx->DataSets.Num());
            const uint16 OutputDataType = ExecCtx->VVMState->OutputRemapDataType[i];
            check(OutputDataType < UE_ARRAY_COUNT(FDataSetMeta::OutputRegisterTypeOffsets));
            const uint16 OutputMapDst = ExecCtx->VVMState->OutputRemapDst[i];
            if (OutputMapDst == 0xFFFF)
            {
                OutputPtr[i] = nullptr;
            }
            else
            {
                const uint32 TypeOffset = ExecCtx->DataSets[DataSetIdx].OutputRegisterTypeOffsets[OutputDataType];
                const uint32 OutputBufferIdx = TypeOffset + OutputMapDst;
                const uint32 DataTypeStride = OutputDataType == 2 ? 2 : 4;
                const uint32 InstanceOffsetBytes = ExecCtx->DataSets[DataSetIdx].InstanceOffset * DataTypeStride;
                OutputPtr[i] = reinterpret_cast<uint32 *>(ExecCtx->DataSets[DataSetIdx].OutputRegisters[OutputBufferIdx] + InstanceOffsetBytes);
            }
        }
    }
}
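//Editorial recap of the batch-state layout established above:
//  FVectorVMBatchState | RegisterData | RegPtrTable | RegIncTable | OutputMaskIdx |
//  StartingOutputIdxPerDataSet | NumOutputPerDataSet | OutputMaskIdx ptrs |
//  ExtFn RegData | ExtFn RegInc | ExtFn DummyRegs
//RegPtrTable entries are ordered: temp registers, constant buffers, inputs (stored twice:
//working pointer + per-chunk base), then outputs. RegIncTable is 16 (sizeof(FVecReg)) for
//advancing registers and 0 for constants and no-advance inputs.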
void FreeVectorVMState(FVectorVMState *VVMState)
{
    if (VVMState != nullptr)
    {
        FMemory::Free(VVMState);
    }
}
static void SetupRandStateForBatch(FVectorVMBatchState *BatchState)
{
    uint64 pcg_state = FPlatformTime::Cycles64();
    uint64 pcg_inc = (((uint64)BatchState << 32) ^ 0xCAFEF00DD15EA5E5U) | 1;
    pcg_state ^= (FPlatformTime::Cycles64() << 32ULL);
    //use pseudo-PCG to set up a state for xorwow
    for (int i = 0; i < 5; ++i) //loop for xorwow internal state
    {
        MS_ALIGN(16) uint32 Values[4] GCC_ALIGN(16);
        for (int j = 0; j < 4; ++j)
        {
            uint64 old_state = pcg_state;
            pcg_state = old_state * 6364136223846793005ULL + pcg_inc;
            uint32 xor_shifted = (uint32)(((old_state >> 18U) ^ old_state) >> 27U);
            uint32 rot = old_state >> 59U;
            Values[j] = (xor_shifted >> rot) | (xor_shifted << ((0U - rot) & 31));
        }
        VectorIntStore(*(VectorRegister4i *)Values, BatchState->RandState.State + i);
    }
    BatchState->RandState.Counters = MakeVectorRegisterInt64(pcg_inc, pcg_state);
    BatchState->RandStream.GenerateNewSeed();
}
static VM_FORCEINLINE VectorRegister4i VVMXorwowStep(FVectorVMBatchState *BatchState)
{
    VectorRegister4i t = BatchState->RandState.State[4];
    VectorRegister4i s = BatchState->RandState.State[0];
    BatchState->RandState.State[4] = BatchState->RandState.State[3];
    BatchState->RandState.State[3] = BatchState->RandState.State[2];
    BatchState->RandState.State[2] = BatchState->RandState.State[1];
    BatchState->RandState.State[1] = s;
    t = VectorIntXor(t, VectorShiftRightImmLogical(t, 2));
    t = VectorIntXor(t, VectorShiftLeftImm(t, 1));
    t = VectorIntXor(t, VectorIntXor(s, VectorIntXor(s, VectorShiftLeftImm(s, 4))));
    BatchState->RandState.State[0] = t;
    BatchState->RandState.Counters = VectorIntAdd(BatchState->RandState.Counters, VectorIntSet1(362437));
    VectorRegister4i Result = VectorIntAdd(t, VectorIntLoad(&BatchState->RandState.Counters));
    return Result;
}
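//Scalar sketch of one xorwow lane exactly as the vector code above computes it (editorial,
//hence #if 0). Note the final xor mirrors the vector code literally: s ^ (s ^ (s << 4))
//reduces to (s << 4), which differs from Marsaglia's canonical t ^= s ^ (s << 4).
#if 0
static uint32 VVM_Example_XorwowStepScalar(uint32 State[5], uint32 &Counter)
{
    uint32 t = State[4];
    uint32 s = State[0];
    State[4] = State[3]; State[3] = State[2]; State[2] = State[1]; State[1] = s;
    t ^= t >> 2;
    t ^= t << 1;
    t ^= s ^ (s ^ (s << 4)); //== t ^= (s << 4)
    State[0] = t;
    Counter += 362437;
    return t + Counter;
}
#endif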
FVectorVMState *AllocVectorVMState(FVectorVMOptimizeContext *OptimizeCtx)
{
    if (OptimizeCtx == nullptr)
    {
        return nullptr;
    }
#if WITH_EDITORONLY_DATA
    if (OptimizeCtx->Error.Line != 0)
    {
        return nullptr;
    }
#endif
    //compute the number of overhead bytes for this VVM State
    uint32 ConstBufferOffset = VVM_ALIGN_32(sizeof(FVectorVMState));
    size_t ConstantBufferNumBytes = VVM_PTR_ALIGN(VVM_REG_SIZE * (OptimizeCtx->NumConstsRemapped + OptimizeCtx->NumNoAdvanceInputs));
    size_t ExtFnTableNumBytes = VVM_PTR_ALIGN(sizeof(FVectorVMExtFunctionData) * (OptimizeCtx->MaxExtFnUsed + 1));
    size_t OutputPerDataSetNumBytes = VVM_PTR_ALIGN(sizeof(int32) * OptimizeCtx->MaxOutputDataSet);
    size_t ConstMapCacheNumBytes = VVM_PTR_ALIGN((sizeof(uint8) + sizeof(uint16)) * OptimizeCtx->NumConstsRemapped);
    size_t InputMapCacheNumBytes = VVM_PTR_ALIGN((sizeof(uint8) + sizeof(uint16)) * OptimizeCtx->NumInputsRemapped);
    size_t VVMStateTotalNumBytes = ConstBufferOffset + ConstantBufferNumBytes + ExtFnTableNumBytes + OutputPerDataSetNumBytes + ConstMapCacheNumBytes + InputMapCacheNumBytes;
    uint8 *StatePtr = (uint8 *)FMemory::Malloc(VVMStateTotalNumBytes, 16);
    if (StatePtr == nullptr)
    {
        return nullptr;
    }
    FVectorVMState *VVMState = (FVectorVMState *)StatePtr;
    VVMState->TotalNumBytes = (uint32)VVMStateTotalNumBytes;
    { //setup the pointers that are allocated in conjunction with the VVMState
        uint32 ExtFnTableOffset = (uint32)(ConstBufferOffset + ConstantBufferNumBytes);
        uint32 OutputPerDataSetOffset = (uint32)(ExtFnTableOffset + ExtFnTableNumBytes);
        uint32 ConstMapCacheOffset = (uint32)(OutputPerDataSetOffset + OutputPerDataSetNumBytes);
        uint32 InputMapCacheOffset = (uint32)(ConstMapCacheOffset + ConstMapCacheNumBytes);
        VVMState->ConstantBuffers = (FVecReg *)(StatePtr + ConstBufferOffset);
        VVMState->ExtFunctionTable = (FVectorVMExtFunctionData *)(StatePtr + ExtFnTableOffset);
        VVMState->NumOutputPerDataSet = (int32 *)(StatePtr + OutputPerDataSetOffset);
        VVMState->ConstMapCacheIdx = (uint8 *)(StatePtr + ConstMapCacheOffset);
        VVMState->InputMapCacheIdx = (uint8 *)(StatePtr + InputMapCacheOffset);
        VVMState->ConstMapCacheSrc = (uint16 *)(VVMState->ConstMapCacheIdx + OptimizeCtx->NumConstsRemapped);
        VVMState->InputMapCacheSrc = (uint16 *)(VVMState->InputMapCacheIdx + OptimizeCtx->NumInputsRemapped);
        check((size_t)((uint8 *)VVMState->InputMapCacheSrc - StatePtr) + sizeof(uint16) * OptimizeCtx->NumInputsRemapped <= VVMStateTotalNumBytes);
        VVMState->NumInstancesExecCached = 0;
        for (uint32 i = 0; i < OptimizeCtx->NumExtFns; ++i)
        {
            VVMState->ExtFunctionTable[i].Function = nullptr;
            VVMState->ExtFunctionTable[i].NumInputs = OptimizeCtx->ExtFnTable[i].NumInputs;
            VVMState->ExtFunctionTable[i].NumOutputs = OptimizeCtx->ExtFnTable[i].NumOutputs;
        }
    }
    { //setup the pointers from the optimize context
        VVMState->ConstRemapTable = OptimizeCtx->ConstRemap[1];
        VVMState->InputRemapTable = OptimizeCtx->InputRemapTable;
        VVMState->InputDataSetOffsets = OptimizeCtx->InputDataSetOffsets;
        VVMState->OutputRemapDataSetIdx = OptimizeCtx->OutputRemapDataSetIdx;
        VVMState->OutputRemapDataType = OptimizeCtx->OutputRemapDataType;
        VVMState->OutputRemapDst = OptimizeCtx->OutputRemapDst;
        VVMState->NumTempRegisters = OptimizeCtx->NumTempRegisters;
        VVMState->NumConstBuffers = OptimizeCtx->NumConstsRemapped;
        VVMState->NumInputBuffers = OptimizeCtx->NumInputsRemapped;
        VVMState->NumOutputsRemapped = OptimizeCtx->NumOutputsRemapped;
        VVMState->NumOutputBuffers = OptimizeCtx->NumOutputInstructions;
        VVMState->Bytecode = OptimizeCtx->OutputBytecode;
        VVMState->NumBytecodeBytes = OptimizeCtx->NumBytecodeBytes;
        VVMState->NumInputDataSets = OptimizeCtx->NumInputDataSets;
        VVMState->MaxOutputDataSet = OptimizeCtx->MaxOutputDataSet;
        VVMState->NumDummyRegsRequired = OptimizeCtx->NumDummyRegsReq;
        VVMState->Flags = OptimizeCtx->Flags & ~VVMFlag_DataMapCacheSetup;
        VVMState->OptimizerHashId = OptimizeCtx->HashId;
        VVMState->NumExtFunctions = OptimizeCtx->NumExtFns;
        VVMState->MaxExtFnRegisters = OptimizeCtx->MaxExtFnRegisters;
    }
    { //compute fixed batch size
        const size_t NumPtrRegsInTable = VVMState->NumTempRegisters +
                                         VVMState->NumConstBuffers +
                                         VVMState->NumInputBuffers * 2 +
                                         VVMState->NumOutputBuffers;
        const size_t ChunkLocalDataOutputIdxNumBytes = sizeof(uint32) * VVMState->MaxOutputDataSet;
        const size_t ChunkLocalNumOutputNumBytes = sizeof(uint32) * VVMState->MaxOutputDataSet;
        const size_t ChunkLocalOutputMaskIdxNumBytes = sizeof(uint8 *) * VVMState->MaxOutputDataSet;
        const size_t ChunkLocalNumExtFnDecodeRegNumBytes = (sizeof(FVecReg *) + sizeof(uint8)) * VVMState->MaxExtFnRegisters + VVM_REG_SIZE * VVMState->NumDummyRegsRequired;
        const size_t RegPtrTableNumBytes = (sizeof(uint32 *) + sizeof(uint8)) * NumPtrRegsInTable;
        const size_t BatchOverheadSize = VVM_ALIGN_64(sizeof(FVectorVMBatchState)) +
                                         ChunkLocalDataOutputIdxNumBytes +
                                         ChunkLocalNumOutputNumBytes +
                                         ChunkLocalOutputMaskIdxNumBytes +
                                         ChunkLocalNumExtFnDecodeRegNumBytes +
                                         RegPtrTableNumBytes +
                                         VVM_CHUNK_FIXED_OVERHEAD_SIZE;
        VVMState->BatchOverheadSize = (uint32)BatchOverheadSize;
        VVMState->ChunkLocalDataOutputIdxNumBytes = (uint32)ChunkLocalDataOutputIdxNumBytes;
        VVMState->ChunkLocalNumOutputNumBytes = (uint32)ChunkLocalNumOutputNumBytes;
        VVMState->ChunkLocalOutputMaskIdxNumBytes = (uint32)ChunkLocalOutputMaskIdxNumBytes;
    }
    return VVMState;
}
static void VVMBuildMapTableCaches(FVectorVMExecContext *ExecCtx)
{
    //constant buffers
    check(ExecCtx->ConstantTableCount <= 0xFF);
    check(ExecCtx->VVMState->NumConstBuffers <= 0xFFFF);
    for (uint32 i = 0; i < ExecCtx->VVMState->NumConstBuffers; ++i)
    {
        uint32 RemappedIdx = ExecCtx->VVMState->ConstRemapTable[i];
        uint32 ConstCountAcc = 0;
        for (int j = 0; j < ExecCtx->ConstantTableCount; ++j)
        {
            const uint32 NumDWords = (uint32)ExecCtx->ConstantTableNumBytes[j] >> 2;
            if (ConstCountAcc + NumDWords > RemappedIdx)
            {
                const uint32 *SrcConstArray = (uint32 *)ExecCtx->ConstantTableData[j];
                uint32 Idx = RemappedIdx - ConstCountAcc;
                check(Idx < 0xFFFF);
                ExecCtx->VVMState->ConstMapCacheIdx[i] = j;
                ExecCtx->VVMState->ConstMapCacheSrc[i] = (uint16)(Idx);
                break;
            }
            ConstCountAcc += NumDWords;
        }
    }
    //inputs
    if (ExecCtx->VVMState->NumInputDataSets > 0)
    {
        int InputCounter = 0;
        int NumInputDataSets = VVM_MIN(ExecCtx->VVMState->NumInputDataSets, (uint32)ExecCtx->DataSets.Num()); //Niagara can pass in any number of datasets, but we only care about the highest one actually used as determined by the optimizer
        for (int i = 0; i < NumInputDataSets; ++i)
        {
            uint32 **DataSetInputBuffers = (uint32 **)ExecCtx->DataSets[i].InputRegisters.GetData();
            //regular inputs: float, int and half
            for (int j = 0; j < 3; ++j)
            {
                int NumInputsThisType = ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + j + 1] - ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + j];
                int TypeOffset = ExecCtx->DataSets[i].InputRegisterTypeOffsets[j];
                for (int k = 0; k < NumInputsThisType; ++k)
                {
                    int RemapIdx = ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + j] + k;
                    ExecCtx->VVMState->InputMapCacheIdx[InputCounter] = i;
                    ExecCtx->VVMState->InputMapCacheSrc[InputCounter] = (TypeOffset + ExecCtx->VVMState->InputRemapTable[RemapIdx]) | (((j & 2) << 13));
                    ++InputCounter;
                }
            }
            if (ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + 7] > 0)
            {
                //no advance inputs: float, int and half
                //no advance inputs point directly after the constant buffers
                for (int j = 0; j < 3; ++j)
                {
                    int NumInputsThisType = ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + j + 4] - ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + j + 3];
                    int TypeOffset = ExecCtx->DataSets[i].InputRegisterTypeOffsets[j];
                    for (int k = 0; k < NumInputsThisType; ++k)
                    {
                        int RemapIdx = ExecCtx->VVMState->InputDataSetOffsets[(i << 3) + j + 3] + k;
                        ExecCtx->VVMState->InputMapCacheIdx[InputCounter] = i;
                        ExecCtx->VVMState->InputMapCacheSrc[InputCounter] = (TypeOffset + ExecCtx->VVMState->InputRemapTable[RemapIdx]) | 0x8000 | (((j & 2) << 13)); //high bit is no advance input, 2nd high bit is whether it's half
                        ++InputCounter;
                    }
                }
            }
        }
        check(InputCounter == ExecCtx->VVMState->NumInputBuffers);
    }
}
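//InputMapCacheSrc packing recap (editorial; matches the writes above and the decode in
//SetupBatchStatePtrs): bit 15 = no-advance input, bit 14 = half-typed input ((j & 2) << 13
//is nonzero only for the half loop, j == 2), bits 0-13 = input buffer index.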
#define VVM_OUTPUT_FUNCTION_HEADER(CT_InputInsOutputTypeOpCode) \
    uint32 *NumOutputPerDataSet = BatchState->ChunkLocalData.NumOutputPerDataSet; \
    uint32 *StartingOutputIdxPerDataSet = BatchState->ChunkLocalData.StartingOutputIdxPerDataSet; \
    uint8 **RegPtrTable = BatchState->RegPtrTable; \
    uint8 *RegIncTable = BatchState->RegIncTable; \
    uint8 **OutputMaskIdx = BatchState->ChunkLocalData.OutputMaskIdx; \
    TArrayView<FDataSetMeta> DataSets = ExecCtx->DataSets; \
    uint8 RegType = (uint8)(CT_InputInsOutputTypeOpCode) - (uint8)EVectorVMOp::outputdata_float; \
    int NumOutputLoops = InsPtr[0]; \
    uint8 DataSetIdx = InsPtr[1]; \
    uint32 NumOutputInstances = NumOutputPerDataSet[DataSetIdx]; \
    uint32 RegTypeOffset = DataSets[DataSetIdx].OutputRegisterTypeOffsets[RegType]; \
    const uint16 *RESTRICT SrcIndices = (uint16 *)(InsPtr + 2); \
    const uint16 *RESTRICT DstIndices = SrcIndices + NumOutputLoops; \
    InsPtr += 3 + 4 * NumOutputLoops;
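//Assumed encoding for output ops, inferred from the header macro above (not verified
//against the bytecode emitter): [NumOutputLoops : u8][DataSetIdx : u8][SrcIndices : u16 x N]
//[DstIndices : u16 x N], i.e. 2 + 4*N payload bytes; the 3 + 4*N advance appears to step
//over one additional byte in the instruction stream.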
#if VECTORVM_DEBUG_PRINTF
#define VVM_OUTPUT_FUNCTION_FOOTER \
    VECTORVM_PRINTF("\tNum Loops: %d, DataSetIndex: %d\n", NumOutputLoops, DataSetIdx); \
    for (int j = 0; j < NumOutputLoops; ++j) \
    { \
        uint16 AbsRegIdx; \
        int ThisRegType = VVMGetRegisterType(ExecCtx->VVMState, SrcIndices[j], &AbsRegIdx); \
        uint32 *DstReg = (uint32 *)RegPtrTable[DstIndices[j]]; \
        VECTORVM_PRINTF("\t%c%d -> %d [0x%p]\n", VVM_RT_CHAR[ThisRegType], AbsRegIdx, DstIndices[j], DstReg); \
    } \
    return InsPtr
#else //VECTORVM_DEBUG_PRINTF
#define VVM_OUTPUT_FUNCTION_FOOTER return InsPtr
#endif //VECTORVM_DEBUG_PRINTF
static const uint8 *VVM_Output32_from_16(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, FVectorVMExecContext *ExecCtx)
{
    VVM_OUTPUT_FUNCTION_HEADER(EVectorVMOp::outputdata_float); //-V501
    VVM_OUTPUT_FUNCTION_FOOTER;
}
static const uint8 *VVM_Output16_from_16(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, FVectorVMExecContext *ExecCtx)
{
    VVM_OUTPUT_FUNCTION_HEADER(EVectorVMOp::outputdata_half);
    if (CT_MultipleLoops)
    {
        if (NumOutputInstances == BatchState->ChunkLocalData.NumInstancesThisChunk) //all outputs written
        {
            for (int j = 0; j < NumOutputLoops; ++j)
            {
                int SrcInc = RegIncTable[SrcIndices[j]];
                uint32 *SrcReg = (uint32 *)RegPtrTable[SrcIndices[j]];
                uint32 *DstReg = (uint32 *)RegPtrTable[DstIndices[j]];
                if (SrcReg != DstReg)
                { //temp registers can be aliased to outputs
                    if (SrcInc == 0) //setting from a constant
                    {
                        VVMMemSet16(DstReg, *SrcReg, NumOutputInstances);
                    }
                    else
                    {
                        VVMMemCpy(DstReg, SrcReg, sizeof(uint16) * NumOutputInstances);
                    }
                }
            }
        }
        else if (NumOutputInstances > 0) //not all outputs are being written
        {
            for (int j = 0; j < NumOutputLoops; ++j)
            {
                int SrcInc = RegIncTable[SrcIndices[j]];
                uint64 *DstReg = (uint64 *)RegPtrTable[DstIndices[j]];
                if (SrcInc == 0) //setting from a constant
                {
                    uint64 Val = *(uint64 *)RegPtrTable[SrcIndices[j]];
                    uint64 *RESTRICT DstEnd = DstReg + NumOutputInstances;
                    uint64 *RESTRICT Ptr = DstReg;
                    while (Ptr < DstEnd) {
                        *Ptr = Val;
                        Ptr++;
                    }
                }
                else
                {
                    int NumLoops = (int)((BatchState->ChunkLocalData.NumInstancesThisChunk + 3) & ~3) >> 2; //assumes 4-wide ops
                    char *SrcPtr = (char *)RegPtrTable[SrcIndices[j]]; //src and dst can alias
                    char *DstPtr = (char *)DstReg;
                    uint8 *RESTRICT TblIdxPtr = OutputMaskIdx[DataSetIdx];
                    uint8 *RESTRICT TblIdxEndPtr = TblIdxPtr + NumLoops;
                    while (TblIdxPtr < TblIdxEndPtr)
                    {
                        uint8 TblIdx = *TblIdxPtr++;
                        VectorRegister4i Mask = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE16)[TblIdx];
                        VectorRegister4i Src = VectorIntLoad_16(SrcPtr);
                        VectorRegister4i Val = VVM_pshufb(Src, Mask);
                        VectorIntStore_16(Val, DstPtr);
                        SrcPtr += sizeof(uint16) * 4;
                        DstPtr += VVM_OUTPUT_ADVANCE_TABLE16[TblIdx];
                    }
                }
            }
        }
    }
    else
    {
        uint8 OutputMask = OutputMaskIdx[DataSetIdx][0];
        for (int j = 0; j < NumOutputLoops; ++j)
        {
            uint32 *SrcReg = (uint32 *)RegPtrTable[SrcIndices[j]];
            uint32 *DstReg = (uint32 *)RegPtrTable[DstIndices[j]];
            VectorRegister4i Mask = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE16)[OutputMask];
            VectorRegister4i Src = VectorIntLoad(SrcReg);
            VectorRegister4i Val = VVM_pshufb(Src, Mask);
            VectorIntStore_16(Val, DstReg);
        }
    }
    VVM_OUTPUT_FUNCTION_FOOTER;
}
static const uint8 *VVM_Output16(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, FVectorVMExecContext *ExecCtx)
{
    VVM_OUTPUT_FUNCTION_HEADER(EVectorVMOp::outputdata_half);
    if (CT_MultipleLoops)
    {
        if (NumOutputInstances == BatchState->ChunkLocalData.NumInstancesThisChunk) //all outputs written
        {
            for (int j = 0; j < NumOutputLoops; ++j)
            {
                int SrcInc = RegIncTable[SrcIndices[j]];
                uint32 *RESTRICT SrcReg = (uint32 *)RegPtrTable[SrcIndices[j]];
                uint16 *RESTRICT DstReg = (uint16 *)RegPtrTable[DstIndices[j]];
                check((void *)SrcReg != (void *)DstReg); //half floats can't alias outputs
                if (SrcInc == 0) //setting from a constant
                {
                    //constants are 32 bits so convert
                    uint16 Val[8];
                    VVM_floatToHalf(&Val, (float *)SrcReg);
                    VVMMemSet16(DstReg, Val[0], NumOutputInstances);
                }
                else
                {
                    uint16 *RESTRICT DstEnd = DstReg + NumOutputInstances; //this may go over the 16 byte boundary that's alloced if there's an offset... not sure!
                    while (DstReg < DstEnd)
                    {
                        VVM_floatToHalf(DstReg, (float *)SrcReg);
                        SrcReg += 4;
                        DstReg += 4;
                    }
                }
            }
        }
        else if (NumOutputInstances > 0) //not all outputs are being written
        {
            for (int j = 0; j < NumOutputLoops; ++j)
            {
                int SrcInc = RegIncTable[SrcIndices[j]];
                char *DstReg = (char *)RegPtrTable[DstIndices[j]];
                if (SrcInc == 0) //setting from a constant
                {
                    uint16 Val[8];
                    VVM_floatToHalf(&Val, (float *)RegPtrTable[SrcIndices[j]]);
                    VVMMemSet16(DstReg, Val[0], NumOutputInstances);
                }
                else
                {
                    int NumLoops = (int)((BatchState->ChunkLocalData.NumInstancesThisChunk + 3) & ~3) >> 2; //assumes 4-wide ops
                    char *SrcPtr = (char *)RegPtrTable[SrcIndices[j]]; //src and dst can alias
                    char *DstPtr = DstReg;
                    uint8 *RESTRICT TblIdxPtr = OutputMaskIdx[DataSetIdx];
                    uint8 *RESTRICT TblIdxEndPtr = TblIdxPtr + NumLoops;
                    while (TblIdxPtr < TblIdxEndPtr) {
                        uint8 TblIdx = *TblIdxPtr++;
                        VectorRegister4i Mask = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE16)[TblIdx];
                        VectorRegister4i Src = VectorIntLoad((VectorRegister4i *)SrcPtr); //loading from a temp register and they're always aligned
                        VectorRegister4i Val;
                        FPlatformMath::VectorStoreHalf((uint16 *)&Val, (float *)SrcPtr);
                        VectorIntStore_16(VVM_pshufb(Val, Mask), DstPtr);
                        SrcPtr += sizeof(VectorRegister4i);
                        DstPtr += VVM_OUTPUT_ADVANCE_TABLE16[TblIdx];
                    }
                }
            }
        }
    }
    else
    {
        uint8 OutputMask = OutputMaskIdx[DataSetIdx][0];
        for (int j = 0; j < NumOutputLoops; ++j)
        {
            float *SrcReg = (float *)RegPtrTable[SrcIndices[j]];
            uint16 *DstReg = (uint16 *)RegPtrTable[DstIndices[j]];
            //convert 4 values at once then shift them in place
            VectorRegister4i Mask = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE16)[OutputMask];
            VectorRegister4i HalfVals;
            FPlatformMath::VectorStoreHalf((uint16 *)&HalfVals, SrcReg);
            VectorIntStore_16(VVM_pshufb(HalfVals, Mask), DstReg);
        }
    }
    VVM_OUTPUT_FUNCTION_FOOTER;
}
static const uint8 *VVM_Output32(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, FVectorVMExecContext *ExecCtx)
{
    VVM_OUTPUT_FUNCTION_HEADER(InsPtr[-1]);
    if (CT_MultipleLoops)
    {
        if (NumOutputInstances == BatchState->ChunkLocalData.NumInstancesThisChunk) //all outputs written
        {
            for (int j = 0; j < NumOutputLoops; ++j)
            {
                int SrcInc = RegIncTable[SrcIndices[j]];
                uint32 *SrcReg = (uint32 *)RegPtrTable[SrcIndices[j]];
                uint32 *DstReg = (uint32 *)RegPtrTable[DstIndices[j]];
                if (SrcReg != DstReg)
                { //temp registers can be aliased to outputs
                    if (SrcInc == 0) //setting from a constant
                    {
                        VVMMemSet32(DstReg, *SrcReg, NumOutputInstances);
                    }
                    else
                    {
                        VVMMemCpy(DstReg, SrcReg, sizeof(uint32) * NumOutputInstances);
                    }
                }
            }
        }
        else if (NumOutputInstances > 0) //not all outputs are being written
        {
            for (int j = 0; j < NumOutputLoops; ++j)
            {
                int SrcInc = RegIncTable[SrcIndices[j]];
                char *DstReg = (char *)RegPtrTable[DstIndices[j]];
                if (SrcInc == 0) //setting from a constant
                {
                    VectorRegister4i Val = *(VectorRegister4i *)RegPtrTable[SrcIndices[j]];
                    char *RESTRICT PtrEnd = DstReg + sizeof(uint32) * NumOutputInstances;
                    char *RESTRICT Ptr = DstReg;
                    while (Ptr < PtrEnd)
                    {
                        VectorIntStore(Val, Ptr);
                        Ptr += sizeof(VectorRegister4i);
                    }
                }
                else
                {
                    int NumLoops = (int)((BatchState->ChunkLocalData.NumInstancesThisChunk + 3) & ~3) >> 2; //assumes 4-wide ops
                    char *SrcPtr = (char *)RegPtrTable[SrcIndices[j]]; //src and dst can alias
                    char *DstPtr = DstReg;
                    uint8 *RESTRICT TblIdxPtr = OutputMaskIdx[DataSetIdx];
                    uint8 *RESTRICT TblIdxEndPtr = TblIdxPtr + NumLoops;
                    while (TblIdxPtr < TblIdxEndPtr)
                    {
                        uint8 TblIdx = *TblIdxPtr++;
                        VectorRegister4i Mask = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE)[TblIdx];
                        VectorRegister4i Src = VectorIntLoad(SrcPtr);
                        VectorRegister4i Val = VVM_pshufb(Src, Mask);
                        VectorIntStore(Val, DstPtr);
                        SrcPtr += sizeof(VectorRegister4i);
                        DstPtr += VVM_OUTPUT_ADVANCE_TABLE[TblIdx];
                    }
                }
            }
        }
    }
    else
    {
        uint8 OutputMask = OutputMaskIdx[DataSetIdx][0];
        for (int j = 0; j < NumOutputLoops; ++j)
        {
            uint32 *SrcReg = (uint32 *)RegPtrTable[SrcIndices[j]];
            uint32 *DstReg = (uint32 *)RegPtrTable[DstIndices[j]];
            VectorRegister4i Mask = ((VectorRegister4i *)VVM_PSHUFB_OUTPUT_TABLE)[OutputMask];
            VectorRegister4i Src = VectorIntLoad(SrcReg);
            VectorRegister4i Val = VVM_pshufb(Src, Mask);
            VectorIntStore(Val, DstReg);
        }
    }
    VVM_OUTPUT_FUNCTION_FOOTER;
}
static const uint8 *VVM_acquireindex(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, FVectorVMExecContext *ExecCtx)
{
    uint32 *InputPtr = (uint32 *)BatchState->RegPtrTable[((uint16 *)InsPtr)[0]];
    uint32 InputInc = ((uint32)BatchState->RegIncTable[((uint16 *)InsPtr)[0]]) >> 2;
    uint8 DataSetIdx = InsPtr[2];
    uint8 *OutputPtr = BatchState->ChunkLocalData.OutputMaskIdx[DataSetIdx];
    uint32 NumOutputInstances = 0;
    if (CT_MultipleLoops)
    {
        int NumFullLoops = BatchState->ChunkLocalData.NumInstancesThisChunk >> 2;
        int NumLeftoverInstances = BatchState->ChunkLocalData.NumInstancesThisChunk - (NumFullLoops << 2);
        for (int i = 0; i < NumFullLoops; ++i) {
            uint32 idx = ((InputPtr[0] & (1 << 31)) >> 31) |
                         ((InputPtr[1] & (1 << 31)) >> 30) |
                         ((InputPtr[2] & (1 << 31)) >> 29) |
                         ((InputPtr[3] & (1 << 31)) >> 28);
            InputPtr += InputInc;
            OutputPtr[i] = (uint8)idx;
            NumOutputInstances += VVM_OUTPUT_ADVANCE_TABLE[idx];
        }
        if (NumLeftoverInstances > 0)
        {
            uint32 Index = 0;
            switch (NumLeftoverInstances) {
                case 1: Index = ((InputPtr[0] & (1 << 31)) >> 31); break;
                case 2: Index = ((InputPtr[0] & (1 << 31)) >> 31) | ((InputPtr[1] & (1 << 31)) >> 30); break;
                case 3: Index = ((InputPtr[0] & (1 << 31)) >> 31) | ((InputPtr[1] & (1 << 31)) >> 30) | ((InputPtr[2] & (1 << 31)) >> 29); break;
            }
            OutputPtr[NumFullLoops] = Index;
            NumOutputInstances += VVM_OUTPUT_ADVANCE_TABLE[Index];
        }
        NumOutputInstances >>= 2;
    }
    else
    {
        uint32 Index = 0;
        switch (BatchState->ChunkLocalData.NumInstancesThisChunk)
        {
            case 1: Index = ((InputPtr[0] & (1 << 31)) >> 31); break;
            case 2: Index = ((InputPtr[0] & (1 << 31)) >> 31) | ((InputPtr[1] & (1 << 31)) >> 30); break;
            case 3: Index = ((InputPtr[0] & (1 << 31)) >> 31) | ((InputPtr[1] & (1 << 31)) >> 30) | ((InputPtr[2] & (1 << 31)) >> 29); break;
            case 4: Index = ((InputPtr[0] & (1 << 31)) >> 31) | ((InputPtr[1] & (1 << 31)) >> 30) | ((InputPtr[2] & (1 << 31)) >> 29) | ((InputPtr[3] & (1 << 31)) >> 28); break;
        }
        OutputPtr[0] = Index;
        NumOutputInstances += VVM_OUTPUT_ADVANCE_TABLE[Index] >> 2;
    }
    BatchState->ChunkLocalData.StartingOutputIdxPerDataSet[DataSetIdx] = ExecCtx->DataSets[DataSetIdx].InstanceOffset + FPlatformAtomics::InterlockedAdd(ExecCtx->VVMState->NumOutputPerDataSet + DataSetIdx, NumOutputInstances);
    BatchState->ChunkLocalData.NumOutputPerDataSet[DataSetIdx] += NumOutputInstances;
    return InsPtr + 4;
}
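//Scalar sketch of the keep-mask construction above (editorial, #if 0): bit k of the mask is
//the sign bit of input lane k, so "keep" is any lane whose value has the high bit set.
#if 0
static uint8 VVM_Example_KeepMaskFrom4(const uint32 In[4])
{
    return (uint8)(((In[0] >> 31) << 0) |
                   ((In[1] >> 31) << 1) |
                   ((In[2] >> 31) << 2) |
                   ((In[3] >> 31) << 3));
    //VVM_OUTPUT_ADVANCE_TABLE[mask] is then 4 * popcount(mask): the byte advance for the
    //kept 32-bit values, which acquireindex sums and shifts right by 2 to count instances
}
#endif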
VM_FORCEINLINE const uint8 *VVM_exec_index(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, int NumLoops)
{
    if (CT_MultipleLoops)
    {
        uint8 *P0 = BatchState->RegPtrTable[((uint16 *)InsPtr)[0]];
        VectorRegister4i Val = VectorIntAdd(VectorIntSet1(BatchState->ChunkLocalData.StartInstanceThisChunk), VVM_m128iConst(ZeroOneTwoThree));
        VectorRegister4i Four = VectorIntSet1(4);
        uint8 *End = P0 + sizeof(FVecReg) * NumLoops;
        do
        {
            VectorIntStore(Val, P0);
            Val = VectorIntAdd(Val, Four);
            P0 += sizeof(FVecReg);
        } while (P0 < End);
    }
    else
    {
        VectorIntStore(VVM_m128iConst(ZeroOneTwoThree), BatchState->RegPtrTable[((uint16 *)InsPtr)[0]]);
    }
    return InsPtr + 3;
}
VM_FORCEINLINE const uint8 *VVM_exec_indexf(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, int NumLoops)
{
    if (CT_MultipleLoops)
    {
        uint8 *P0 = BatchState->RegPtrTable[((uint16 *)InsPtr)[0]];
        VectorRegister4i Val = VectorIntAdd(VectorIntSet1(BatchState->ChunkLocalData.StartInstanceThisChunk), VVM_m128iConst(ZeroOneTwoThree));
        VectorRegister4i Four = VectorIntSet1(4);
        uint8 *End = P0 + sizeof(FVecReg) * NumLoops;
        do
        {
            //this is faster than doing the work natively in floating point
            VectorStore(VectorIntToFloat(Val), (float *)P0);
            Val = VectorIntAdd(Val, Four);
            P0 += sizeof(FVecReg);
        } while (P0 < End);
    }
    else
    {
        VectorStore(VVM_m128Const(ZeroOneTwoThree), (float *)BatchState->RegPtrTable[((uint16 *)InsPtr)[0]]);
    }
    return InsPtr + 3;
}
VM_FORCEINLINE const uint8 *VVM_exec_index_addi(const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, int NumLoops)
{
    if (CT_MultipleLoops)
    {
        uint8 *P0 = BatchState->RegPtrTable[((uint16 *)InsPtr)[0]];
        uint8 *P1 = BatchState->RegPtrTable[((uint16 *)InsPtr)[1]];
        uint32 Inc0 = (uint32)BatchState->RegIncTable[((uint16 *)InsPtr)[0]];
        VectorRegister4i Val = VectorIntAdd(VectorIntSet1(BatchState->ChunkLocalData.StartInstanceThisChunk), VVM_m128iConst(ZeroOneTwoThree));
        VectorRegister4i Four = VectorIntSet1(4);
        uint8 *End = P1 + sizeof(FVecReg) * NumLoops;
        do {
            VectorRegister4i R0 = VectorIntLoad(P0);
            P0 += Inc0;
            VectorIntStore(VectorIntAdd(Val, R0), P1);
            Val = VectorIntAdd(Val, Four);
            P1 += sizeof(FVecReg);
        } while (P1 < End);
    }
    else
    {
        VectorRegister4i R0 = VectorIntLoad(BatchState->RegPtrTable[((uint16 *)InsPtr)[0]]);
        VectorRegister4i Res = VectorIntAdd(R0, VVM_m128iConst(ZeroOneTwoThree));
        VectorIntStore(Res, BatchState->RegPtrTable[((uint16 *)InsPtr)[1]]);
    }
    return InsPtr + 5;
}
VM_FORCEINLINE VectorRegister4f VVM_nextRandom(FVectorVMBatchState *BatchState, VectorRegister4f a)
{
    return VectorMultiply(VectorSubtract(VectorRegister4f(VectorCastIntToFloat(VectorIntOr(VectorShiftRightImmLogical(VVMXorwowStep(BatchState), 9), VectorIntSet1(0x3F800000)))), VVM_m128Const(One)), a);
}
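//Scalar sketch of the bit trick above (editorial, #if 0): 23 xorwow bits become the mantissa
//of a float in [1, 2), so subtracting 1.0 gives a uniform value in [0, 1), scaled by a.
#if 0
static float VVM_Example_NextRandomScalar(uint32 XorwowBits, float a)
{
    uint32 Bits = (XorwowBits >> 9) | 0x3F800000; //exponent of [1, 2) + 23 random mantissa bits
    float OneToTwo;
    FMemory::Memcpy(&OneToTwo, &Bits, sizeof(OneToTwo));
    return (OneToTwo - 1.0f) * a; //uniform in [0, a)
}
#endif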
# if VECTORVM_SUPPORTS_SERIALIZATION
bool VVM_serSyncRandom ( const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , int OutputRegPtrIdx , EVectorVMOp RandomOpCode )
{
if ( SerializeState & & ( SerializeState - > Flags & VVMSer_SyncRandom ) & & CmpSerializeState & & CmpSerializeState - > Flags & VVMSer_OptimizedBytecode )
{
int InsByteCodeOffset = ( int ) ( SerializeState - > Running . StartOpPtr - ExecCtx - > VVMState - > Bytecode ) ;
if ( CmpSerializeState - > NumInstructions > SerializeState - > NumInstructions & &
CmpSerializeState - > Instructions [ SerializeState - > NumInstructions ] . OpStart = = InsByteCodeOffset & &
( EVectorVMOp ) CmpSerializeState - > Bytecode [ InsByteCodeOffset - 1 ] = = RandomOpCode )
{
uint16 RegIdx = ( ( uint16 * ) InsPtr ) [ OutputRegPtrIdx ] ;
FVectorVMSerializeInstruction * CmpIns = CmpSerializeState - > Instructions + SerializeState - > NumInstructions ;
float * SrcReg = ( float * ) VVMSer_getRegPtrTablePtrFromIns ( CmpSerializeState , CmpIns , ( ( uint16 * ) ( CmpSerializeState - > Bytecode + InsByteCodeOffset ) ) [ OutputRegPtrIdx ] ) ;
float * DstReg = ( float * ) BatchState - > RegPtrTable [ RegIdx ] ;
int DstInc = ( int ) BatchState - > RegIncTable [ RegIdx ] ;
if ( DstInc ! = 0 )
{
VVMMemCpy ( DstReg , SrcReg , sizeof ( float ) * BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
}
}
return true ;
}
else if ( SerializeState & & ( SerializeState - > Flags & VVMSer_SyncRandom ) & & CmpSerializeState & & SerializeState - > NumInstances = = CmpSerializeState - > NumInstances & & ( CmpSerializeState - > NumInstructions > SerializeState - > NumInstructions ) )
{
FVectorVMSerializeInstruction * CmpIns = nullptr ; //this instruction
FVectorVMOptimizeInstruction * OptIns = nullptr ;
for ( uint32 i = 0 ; i < SerializeState - > OptimizeCtx - > Intermediate . NumInstructions ; + + i )
{
if ( SerializeState - > OptimizeCtx - > Intermediate . Instructions [ i ] . OpCode = = RandomOpCode & & SerializeState - > OptimizeCtx - > Intermediate . Instructions [ i ] . PtrOffsetInOptimizedBytecode = = ( int ) ( SerializeState - > Running . StartOpPtr - ExecCtx - > VVMState - > Bytecode ) )
{
OptIns = SerializeState - > OptimizeCtx - > Intermediate . Instructions + i ;
break ;
}
}
if ( OptIns )
{
for ( uint32 i = 0 ; i < CmpSerializeState - > NumInstructions ; + + i )
{
if ( CmpSerializeState - > Bytecode [ CmpSerializeState - > Instructions [ i ] . OpStart ] = = ( uint8 ) RandomOpCode & & OptIns - > PtrOffsetInOrigBytecode = = CmpSerializeState - > Instructions [ i ] . OpStart )
{
CmpIns = CmpSerializeState - > Instructions + i ;
break ;
}
}
}
		check(false); //this sync path is not implemented yet; the commented-out block below sketches the intent
if ( OptIns & & CmpIns )
{
//for (int i = 0; i < NumRegisters; ++i)
//{
// uint8 RegIdx = InsPtr[1 + i];
// if (RegIdx != 0xFF) {
// uint32 * DstReg = (uint32 *)BatchState->RegPtrTable[RegIdx];
// int DstInc = (int)BatchState->RegIncTable[RegIdx];
// if (DstInc != 0) { //skip constants
// int SrcRegIdx = 0xFFFF;
// if (OptIns && (SerializeState->Flags & VVMSer_OptimizedBytecode) && !(CmpSerializeState->Flags & VVMSer_OptimizedBytecode))
// {
// SrcRegIdx = ((uint16 *)(CmpSerializeState->Bytecode + OptIns->PtrOffsetInOrigBytecode + 2))[i] & 0x7FFF; //high bit is register in original bytecode
// }
// if (SrcRegIdx != 0x7FFF) //invalid register, skipped by external function in the execution
// {
// //can only be temp registers because there's no output aliasing in the original VM
// uint8 *SrcReg = (uint8 *)(CmpIns->RegisterTable + SrcRegIdx * CmpSerializeState->NumInstances + StartInstanceThisChunk);
// VVMMemCpy(DstReg, SrcReg, sizeof(uint32) * NumInstancesThisChunk);
// }
// }
// }
//}
}
return true ;
}
return false ;
}
# else //VECTORVM_SUPPORTS_SERIALIZATION
VM_FORCEINLINE bool VVM_serSyncRandom ( const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , int OutputRegPtrIdx , EVectorVMOp RandomOpCode )
{
return false ;
}
# endif //VECTORVM_SUPPORTS_SERIALIZATION
static VM_FORCEINLINE int32 VVM_SafeIntDivide ( int32 Numerator , int32 Denominator )
{
static constexpr int32 MinIntValue = std : : numeric_limits < int32 > : : min ( ) ;
static constexpr int32 MaxIntValue = std : : numeric_limits < int32 > : : max ( ) ;
if ( Denominator = = 0 )
{
return 0 ;
}
else if ( ( Denominator = = - 1 ) & & ( Numerator = = MinIntValue ) )
{
return MaxIntValue ;
}
return Numerator / Denominator ;
}
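//Worked edge cases for VVM_SafeIntDivide (both traps would fault in a hardware idiv):
//	VVM_SafeIntDivide( 6,  2)          ==  3           //ordinary truncating division
//	VVM_SafeIntDivide( 6,  0)          ==  0           //divide-by-zero clamps to 0
//	VVM_SafeIntDivide(MinIntValue, -1) ==  MaxIntValue //the lone signed-overflow case clamps to INT32_MAX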
static VM_FORCEINLINE VectorRegister4i VVMIntDiv ( VectorRegister4i v0 , VectorRegister4i v1 )
{
const int32 * v0_4 = reinterpret_cast < const int32 * > ( & v0 ) ;
const int32 * v1_4 = reinterpret_cast < const int32 * > ( & v1 ) ;
FVVM_VUI4 res ;
res . i4 [ 0 ] = VVM_SafeIntDivide ( v0_4 [ 0 ] , v1_4 [ 0 ] ) ;
res . i4 [ 1 ] = VVM_SafeIntDivide ( v0_4 [ 1 ] , v1_4 [ 1 ] ) ;
res . i4 [ 2 ] = VVM_SafeIntDivide ( v0_4 [ 2 ] , v1_4 [ 2 ] ) ;
res . i4 [ 3 ] = VVM_SafeIntDivide ( v0_4 [ 3 ] , v1_4 [ 3 ] ) ;
return res . v ;
}
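//(The scalar loop above exists because SSE and NEON have no packed 32-bit integer divide;
//each lane goes through VVM_SafeIntDivide individually.)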
static VM_FORCEINLINE const uint8 * VVM_random ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SS , FVectorVMSerializeState * CmpSS , int NumLoops )
{
VVMSer_instruction ( 0 , 1 ) ;
if ( ! VVM_serSyncRandom ( InsPtr , BatchState , ExecCtx , SS , CmpSS , 1 , EVectorVMOp : : random ) )
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
if ( CT_MultipleLoops )
{
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
P0 + = Inc0 ;
VectorRegister4f Res = VVM_nextRandom ( BatchState , R0 ) ;
VectorStore ( Res , ( float * ) P1 ) ;
P1 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f Res = VVM_nextRandom ( BatchState , R0 ) ;
2023-08-30 10:40:47 -04:00
VectorStore ( Res , ( float * ) P1 ) ;
2023-08-03 07:16:11 -04:00
}
}
return InsPtr + 5 ;
}
static VM_FORCEINLINE const uint8 * VVM_randomi ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SS , FVectorVMSerializeState * CmpSS , int NumLoops )
{
VVMSer_instruction ( 1 , 1 ) ;
if ( ! VVM_serSyncRandom ( InsPtr , BatchState , ExecCtx , SS , CmpSS , 1 , EVectorVMOp : : randomi ) )
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
if ( CT_MultipleLoops )
{
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
P0 + = Inc0 ;
VectorRegister4f Res = VVM_nextRandom ( BatchState , R0 ) ;
				VectorIntStore(VectorFloatToInt(Res), P1); //write to the output register; P0 is the input
P1 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f Res = VVM_nextRandom ( BatchState , R0 ) ;
			VectorIntStore(VectorFloatToInt(Res), P1);
}
}
return InsPtr + 5 ;
}
static VM_FORCEINLINE const uint8 * VVM_random_add ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SS , FVectorVMSerializeState * CmpSS , int NumLoops )
{
VVMSer_instruction ( 0 , 2 ) ;
if ( ! VVM_serSyncRandom ( InsPtr , BatchState , ExecCtx , SS , CmpSS , 2 , EVectorVMOp : : random_add ) )
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
if ( CT_MultipleLoops )
{
uint8 * End = P2 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
VectorRegister4f Res = VectorAdd ( VVM_nextRandom ( BatchState , R0 ) , R1 ) ;
VectorStore ( Res , ( float * ) P2 ) ;
P2 + = sizeof ( FVecReg ) ;
} while ( P2 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f Res = VectorAdd ( VVM_nextRandom ( BatchState , R0 ) , R1 ) ;
VectorStore ( Res , ( float * ) P2 ) ;
}
}
return InsPtr + 7 ;
}
static VM_FORCEINLINE const uint8 * VVM_random_2x ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SS , FVectorVMSerializeState * CmpSS , int NumLoops )
{
VVMSer_instruction ( 0 , 2 ) ;
	//sync both outputs unconditionally; a short-circuiting && would leave the second output un-synced
	bool bSynced0 = VVM_serSyncRandom(InsPtr, BatchState, ExecCtx, SS, CmpSS, 1, EVectorVMOp::random_2x);
	bool bSynced1 = VVM_serSyncRandom(InsPtr, BatchState, ExecCtx, SS, CmpSS, 2, EVectorVMOp::random_2x);
	if (!bSynced0 && !bSynced1)
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
if ( CT_MultipleLoops )
{
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
P0 + = Inc0 ;
VectorRegister4f Res0 = VVM_nextRandom ( BatchState , R0 ) ;
VectorRegister4f Res1 = VVM_nextRandom ( BatchState , R0 ) ;
VectorStore ( Res0 , ( float * ) P1 ) ;
VectorStore ( Res1 , ( float * ) P2 ) ;
P1 + = sizeof ( FVecReg ) ;
P2 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f Res0 = VVM_nextRandom ( BatchState , R0 ) ;
VectorRegister4f Res1 = VVM_nextRandom ( BatchState , R0 ) ;
VectorStore ( Res0 , ( float * ) P1 ) ;
VectorStore ( Res1 , ( float * ) P2 ) ;
}
}
return InsPtr + 7 ;
}
VM_FORCEINLINE const uint8 * VVM_half_to_float ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , int NumLoops )
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
	uint32 Inc0 = (uint32)BatchState->RegIncTable[RegIndices[0]] >> 1; //halve the stride: the input elements are 2-byte halfs instead of 4-byte floats
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do {
FPlatformMath : : VectorLoadHalf ( ( float * ) P1 , ( uint16 * ) P0 ) ;
P0 + = Inc0 ;
P1 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
return InsPtr + 5 ;
}
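//Stride sketch for the loop above: each pass converts one FVecReg of output (4 floats, 16 bytes)
//from 4 packed fp16 inputs (8 bytes), so the input pointer advances at half the output's byte
//rate when the input is varying, and not at all when it is a constant (Inc0 == 0).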
static const uint8 * VVM_update_id ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , int NumLoops )
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
int32 * R1 = ( int32 * ) BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
int32 * R2 = ( int32 * ) BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 DataSetIdx = InsPtr [ 4 ] ;
	check(DataSetIdx < (uint32)ExecCtx->DataSets.Num()); //validate the index before using it
	FDataSetMeta *DataSet = &ExecCtx->DataSets[DataSetIdx];
check ( DataSet - > IDTable ) ;
check ( DataSet - > IDTable - > Num ( ) > = DataSet - > InstanceOffset + BatchState - > ChunkLocalData . StartInstanceThisChunk + BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
int NumOutputInstances = BatchState - > ChunkLocalData . NumOutputPerDataSet [ DataSetIdx ] ;
int NumFreed = BatchState - > ChunkLocalData . NumInstancesThisChunk - BatchState - > ChunkLocalData . NumOutputPerDataSet [ DataSetIdx ] ;
//compute this chunk's MaxID
int MaxID = - 1 ;
if ( NumOutputInstances > 4 )
{
int NumOutput4 = ( int ) ( ( ( ( ( uint32 ) NumOutputInstances + 3U ) & ~ 3U ) - 1 ) > > 2 ) ;
VectorRegister4i Max4 = VectorIntSet1 ( - 1 ) ;
for ( int i = 0 ; i < NumOutput4 ; + + i )
{
VectorRegister4i R4 = VectorIntLoad ( R1 + ( uint64 ) ( ( uint64 ) i < < 2ULL ) ) ;
Max4 = VectorIntMax ( Max4 , R4 ) ;
}
VectorRegister4i Last4 = VectorIntLoad ( R1 + NumOutputInstances - 4 ) ;
Max4 = VectorIntMax ( Last4 , Max4 ) ;
int M4 [ 4 ] ;
VectorIntStore ( Max4 , M4 ) ;
int m0 = M4 [ 0 ] > M4 [ 1 ] ? M4 [ 0 ] : M4 [ 1 ] ;
int m1 = M4 [ 2 ] > M4 [ 3 ] ? M4 [ 2 ] : M4 [ 3 ] ;
int m = m0 > m1 ? m0 : m1 ;
if ( m > MaxID )
{
MaxID = m ;
}
}
else
{
for ( int i = 0 ; i < NumOutputInstances ; + + i )
{
if ( R1 [ i ] > MaxID )
{
MaxID = R1 [ i ] ;
}
}
}
int StartNumFreed = FPlatformAtomics : : InterlockedAdd ( ( volatile int32 * ) DataSet - > NumFreeIDs , NumFreed ) ;
int NumFullLoops = BatchState - > ChunkLocalData . NumInstancesThisChunk > > 2 ;
int NumLeftoverInstances = BatchState - > ChunkLocalData . NumInstancesThisChunk - ( NumFullLoops < < 2 ) ;
uint8 * OutputMaskIdx = BatchState - > ChunkLocalData . OutputMaskIdx [ DataSetIdx ] ;
int * IDTable = DataSet - > IDTable - > GetData ( ) ;
int * FreeTable = DataSet - > FreeIDTable - > GetData ( ) + StartNumFreed ;
int NumInstancesCounted = 0 ;
int NumInstancesFreed = 0 ;
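	//Encoding sketch, inferred from the switch cases below: OutputMaskIdx[] stores one 4-bit survival
	//mask per group of 4 instances. Bit b set means instance (LoopCount + b) survived and takes the
	//next compacted output slot in IDTable; bit b clear returns that instance's ID to FreeTable.
	//e.g. Idx == 5 (0b0101): instances 0 and 2 get slots +0 and +1, instances 1 and 3 are freed.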
{
for ( int i = 0 ; i < NumFullLoops ; + + i ) {
int LoopCount = i < < 2 ;
uint8 Idx = OutputMaskIdx [ i ] ;
int InsCnt = VVM_OUTPUT_ADVANCE_TABLE [ Idx ] > > 2 ;
int StartingOutputIdx = BatchState - > ChunkLocalData . StartingOutputIdxPerDataSet [ DataSetIdx ] + NumInstancesCounted ;
int StartingNumFreed = NumInstancesFreed ;
NumInstancesCounted + = InsCnt ;
NumInstancesFreed + = 4 - InsCnt ;
switch ( Idx ) {
case 0 : FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ StartingNumFreed + 2 ] = R1 [ LoopCount + 2 ] ;
FreeTable [ StartingNumFreed + 3 ] = R1 [ LoopCount + 3 ] ;
break ;
case 1 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 2 ] ;
FreeTable [ StartingNumFreed + 2 ] = R1 [ LoopCount + 3 ] ;
break ;
case 2 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 2 ] ;
FreeTable [ StartingNumFreed + 2 ] = R1 [ LoopCount + 3 ] ;
break ;
case 3 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 2 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 3 ] ;
break ;
case 4 : IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 0 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ StartingNumFreed + 2 ] = R1 [ LoopCount + 3 ] ;
break ;
case 5 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 1 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 3 ] ;
break ;
case 6 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 1 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 3 ] ;
break ;
case 7 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 2 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 3 ] ;
break ;
case 8 : IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 0 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ StartingNumFreed + 2 ] = R1 [ LoopCount + 2 ] ;
break ;
case 9 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 1 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 2 ] ;
break ;
case 10 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 1 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 2 ] ;
break ;
case 11 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 2 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 2 ] ;
break ;
case 12 : IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 1 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ StartingNumFreed + 1 ] = R1 [ LoopCount + 1 ] ;
break ;
case 13 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 1 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 2 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 1 ] ;
break ;
case 14 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 1 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 2 ;
FreeTable [ StartingNumFreed + 0 ] = R1 [ LoopCount + 0 ] ;
break ;
case 15 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 2 ;
IDTable [ R1 [ LoopCount + 3 ] ] = StartingOutputIdx + 3 ;
break ;
}
}
//if (NumLeftoverInstances != 0) {
{
int StartingOutputIdx = BatchState - > ChunkLocalData . StartingOutputIdxPerDataSet [ DataSetIdx ] + NumInstancesCounted ;
int LoopCount = NumFullLoops < < 2 ;
uint8 Idx = OutputMaskIdx [ NumFullLoops ] ;
switch ( NumLeftoverInstances ) {
case 0 : break ;
case 1 :
if ( Idx & 1 ) {
IDTable [ R1 [ LoopCount ] ] = StartingOutputIdx ;
+ + NumInstancesCounted ;
} else {
FreeTable [ NumInstancesFreed ] = R1 [ LoopCount ] ;
+ + NumInstancesFreed ;
}
break ;
case 2 :
switch ( Idx & 3 ) {
case 0 : FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ NumInstancesFreed + 1 ] = R1 [ LoopCount + 1 ] ;
NumInstancesFreed + = 2 ;
break ;
case 1 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 1 ] ;
+ + NumInstancesCounted ;
+ + NumInstancesFreed ;
break ;
case 2 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 0 ] ;
+ + NumInstancesCounted ;
+ + NumInstancesFreed ;
break ;
case 3 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
NumInstancesCounted + = 2 ;
break ;
}
break ;
case 3 :
switch ( Idx & 7 ) {
case 0 : FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ NumInstancesFreed + 1 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ NumInstancesFreed + 2 ] = R1 [ LoopCount + 2 ] ;
NumInstancesFreed + = 3 ;
break ;
case 1 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 1 ] ;
FreeTable [ NumInstancesFreed + 1 ] = R1 [ LoopCount + 2 ] ;
+ + NumInstancesCounted ;
NumInstancesFreed + = 2 ;
break ;
case 2 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ NumInstancesFreed + 1 ] = R1 [ LoopCount + 2 ] ;
+ + NumInstancesCounted ;
NumInstancesFreed + = 2 ;
break ;
case 3 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 2 ] ;
NumInstancesCounted + = 2 ;
+ + NumInstancesFreed ;
break ;
case 4 : IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 0 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 0 ] ;
FreeTable [ NumInstancesFreed + 1 ] = R1 [ LoopCount + 1 ] ;
+ + NumInstancesCounted ;
NumInstancesFreed + = 2 ;
break ;
case 5 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 1 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 1 ] ;
NumInstancesCounted + = 2 ;
+ + NumInstancesFreed ;
break ;
case 6 : IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 1 ;
FreeTable [ NumInstancesFreed + 0 ] = R1 [ LoopCount + 0 ] ;
NumInstancesCounted + = 2 ;
+ + NumInstancesFreed ;
break ;
case 7 : IDTable [ R1 [ LoopCount + 0 ] ] = StartingOutputIdx + 0 ;
IDTable [ R1 [ LoopCount + 1 ] ] = StartingOutputIdx + 1 ;
IDTable [ R1 [ LoopCount + 2 ] ] = StartingOutputIdx + 2 ;
NumInstancesCounted + = 3 ;
break ;
}
break ;
}
}
}
//Set the DataSet's MaxID if this chunk's MaxID is bigger
if ( MaxID ! = - 1 )
{
int SanityCount = 0 ;
do {
int OldMaxID = FPlatformAtomics : : AtomicRead ( DataSet - > MaxUsedID ) ;
if ( MaxID < = OldMaxID )
{
break ;
}
int NewMaxID = FPlatformAtomics : : InterlockedCompareExchange ( ( volatile int32 * ) DataSet - > MaxUsedID , MaxID , OldMaxID ) ;
if ( NewMaxID = = OldMaxID )
{
break ;
}
} while ( SanityCount + + < ( 1 < < 30 ) ) ;
VVMDebugBreakIf ( SanityCount > ( 1 < < 30 ) - 1 ) ;
}
return InsPtr + 6 ;
}
static const uint8 * VVM_acquire_id ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , int NumLoops )
{
VVMSer_instruction ( 1 , 2 ) ;
uint8 DataSetIdx = InsPtr [ 4 ] ;
check ( DataSetIdx < ( uint32 ) ExecCtx - > DataSets . Num ( ) ) ;
FDataSetMeta * DataSet = & ExecCtx - > DataSets [ DataSetIdx ] ;
{ //1. Get the free IDs into the temp register
int SanityCount = 0 ;
do
{
int OldNumFreeIDs = FPlatformAtomics : : AtomicRead ( DataSet - > NumFreeIDs ) ;
check ( OldNumFreeIDs > = BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
int * OutPtr = ( int * ) BatchState - > RegPtrTable [ ( ( uint16 * ) InsPtr ) [ 0 ] ] ;
int * InPtr = DataSet - > FreeIDTable - > GetData ( ) + OldNumFreeIDs - BatchState - > ChunkLocalData . NumInstancesThisChunk ;
for ( int i = 0 ; i < BatchState - > ChunkLocalData . NumInstancesThisChunk ; + + i )
{
OutPtr [ i ] = InPtr [ BatchState - > ChunkLocalData . NumInstancesThisChunk - i - 1 ] ;
}
int NewNumFreeIDs = FPlatformAtomics : : InterlockedCompareExchange ( ( volatile int32 * ) DataSet - > NumFreeIDs , OldNumFreeIDs - BatchState - > ChunkLocalData . NumInstancesThisChunk , OldNumFreeIDs ) ;
if ( NewNumFreeIDs = = OldNumFreeIDs )
{
break ;
}
} while ( SanityCount + + < ( 1 < < 30 ) ) ;
VVMDebugBreakIf ( SanityCount > = ( 1 < < 30 ) - 1 ) ;
}
{ //2. append the IDs we acquired in step 1 to the end of the free table array, representing spawned IDs
//FreeID table is write-only as far as this invocation of the VM is concerned, so the interlocked add w/o filling
//in the data is fine
int StartNumSpawned = FPlatformAtomics : : InterlockedAdd ( DataSet - > NumSpawnedIDs , BatchState - > ChunkLocalData . NumInstancesThisChunk ) + BatchState - > ChunkLocalData . NumInstancesThisChunk ;
check ( StartNumSpawned < = DataSet - > FreeIDTable - > Max ( ) ) ;
VVMMemCpy ( DataSet - > FreeIDTable - > GetData ( ) + DataSet - > FreeIDTable - > Max ( ) - StartNumSpawned , BatchState - > RegPtrTable [ ( ( uint16 * ) InsPtr ) [ 0 ] ] , sizeof ( int32 ) * BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
}
//3. set the tag
VVMMemSet32 ( BatchState - > RegPtrTable [ ( ( uint16 * ) InsPtr ) [ 1 ] ] , DataSet - > IDAcquireTag , BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
return InsPtr + 6 ;
}
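//FreeIDTable layout sketch, inferred from steps 1-3 above: free IDs are popped from the front
//count (NumFreeIDs) while IDs acquired this chunk are appended mirror-style from Max() downward,
//so the shrinking free region and the growing spawned region approach each other without overlap.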
const uint8 * VVM_external_func_call ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , int NumLoops )
{
int FnIdx = ( int ) * ( uint16 * ) ( InsPtr ) ;
FVectorVMExtFunctionData * ExtFnData = ExecCtx - > VVMState - > ExtFunctionTable + FnIdx ;
# if VECTORVM_DEBUG_PRINTF
	VECTORVM_PRINTF("\tFnIdx: %d, #Inputs: %d, #Outputs: %d\n\t", FnIdx, ExtFnData->NumInputs, ExtFnData->NumOutputs);
for ( int i = 0 ; i < ExtFnData - > NumInputs + ExtFnData - > NumOutputs ; + + i ) {
uint16 AbsRegIdx ;
int RegType = VVMGetRegisterType ( ExecCtx - > VVMState , ( ( ( uint16 * ) InsPtr ) + 1 ) [ i ] , & AbsRegIdx ) ;
VECTORVM_PRINTF ( " %c%d, " , VVM_RT_CHAR [ RegType ] , AbsRegIdx ) ;
}
VECTORVM_PRINT_NEW_LINE ( ) ;
# endif //VECTORVM_DEBUG_PRINTF
# if VECTORVM_SUPPORTS_SERIALIZATION && !defined(VVM_SERIALIZE_NO_WRITE)
if ( SerializeState & & ( SerializeState - > Flags & VVMSer_SyncExtFns ) & & CmpSerializeState & & SerializeState - > NumInstances = = CmpSerializeState - > NumInstances & & ( CmpSerializeState - > NumInstructions > SerializeState - > NumInstructions ) )
{
		//If we hit this branch we are using the output from the comparison state instead of running the external function itself.
		//AFAIK the VM is not specified to have the inputs and outputs in a particular order, and even if it were, we shouldn't rely on
		//3rd-party external function writers to follow the spec. Therefore we don't just sync what we think is the output; we sync
		//all temp registers that are used in the function.
int NumRegisters = ExtFnData - > NumInputs + ExtFnData - > NumOutputs ;
FVectorVMSerializeInstruction * CmpIns = nullptr ; //this instruction
FVectorVMOptimizeInstruction * OptIns = nullptr ;
if ( SerializeState - > OptimizeCtx & & SerializeState - > OptimizeCtx - > Intermediate . NumInstructions )
{
//instructions have been re-ordered, can't binary search, must linear search
for ( uint32 i = 0 ; i < SerializeState - > OptimizeCtx - > Intermediate . NumInstructions ; + + i )
{
if ( SerializeState - > OptimizeCtx - > Intermediate . Instructions [ i ] . OpCode = = EVectorVMOp : : external_func_call & & SerializeState - > OptimizeCtx - > Intermediate . Instructions [ i ] . PtrOffsetInOptimizedBytecode = = ( int ) ( SerializeState - > Running . StartOpPtr - ExecCtx - > VVMState - > Bytecode ) )
{
OptIns = SerializeState - > OptimizeCtx - > Intermediate . Instructions + i ;
break ;
}
}
if ( OptIns )
{
for ( uint32 i = 0 ; i < CmpSerializeState - > NumInstructions ; + + i )
{
if ( CmpSerializeState - > Bytecode [ CmpSerializeState - > Instructions [ i ] . OpStart ] = = ( uint8 ) EVectorVMOp : : external_func_call & & OptIns - > PtrOffsetInOrigBytecode = = CmpSerializeState - > Instructions [ i ] . OpStart )
{
CmpIns = CmpSerializeState - > Instructions + i ;
break ;
}
}
}
if ( OptIns & & CmpIns )
{
for ( int i = 0 ; i < NumRegisters ; + + i )
{
uint8 RegIdx = InsPtr [ 1 + i ] ;
if ( RegIdx ! = 0xFF ) {
uint32 * DstReg = ( uint32 * ) BatchState - > RegPtrTable [ RegIdx ] ;
int DstInc = ( int ) BatchState - > RegIncTable [ RegIdx ] ;
if ( DstInc ! = 0 ) { //skip constants
int SrcRegIdx = 0xFFFF ;
if ( OptIns & & ( SerializeState - > Flags & VVMSer_OptimizedBytecode ) & & ! ( CmpSerializeState - > Flags & VVMSer_OptimizedBytecode ) )
{
SrcRegIdx = ( ( uint16 * ) ( CmpSerializeState - > Bytecode + OptIns - > PtrOffsetInOrigBytecode + 2 ) ) [ i ] & 0x7FFF ; //high bit is register in original bytecode
}
if ( SrcRegIdx ! = 0x7FFF ) //invalid register, skipped by external function in the execution
{
//can only be temp registers because there's no output aliasing in the original VM
uint8 * SrcReg = ( uint8 * ) ( CmpIns - > RegisterTable + SrcRegIdx * CmpSerializeState - > NumInstances + BatchState - > ChunkLocalData . StartInstanceThisChunk ) ;
VVMMemCpy ( DstReg , SrcReg , sizeof ( uint32 ) * BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
}
}
}
}
}
}
else if ( ( SerializeState - > Flags & VVMSer_OptimizedBytecode ) & & ( CmpSerializeState - > Flags & VVMSer_OptimizedBytecode ) )
{
		//we do not have the optimized internal state (we're reading serialized data from a cooked build), so we're starting from an
		//optimized script and comparing it to a (possibly different) optimized script; the instructions should therefore line up
//CmpIns = CmpSerializeState->Instructions + SerializeState->Running.NumInstructionsThisChunk;
int ExtFnInsIdx = 0 ;
for ( int i = 0 ; i < SerializeState - > Running . NumInstructionsThisChunk ; + + i )
{
int OpCodeIdx = SerializeState - > Instructions [ i ] . OpStart - 1 ;
if ( OpCodeIdx > = 0 & & OpCodeIdx < ( int ) SerializeState - > NumBytecodeBytes )
{
EVectorVMOp InsOpCode = ( EVectorVMOp ) SerializeState - > Bytecode [ OpCodeIdx ] ;
if ( InsOpCode = = EVectorVMOp : : external_func_call )
{
+ + ExtFnInsIdx ;
}
}
}
//now find the instruction in the comparison serialize state
int SearchExtFnInsIdx = 0 ;
for ( uint32 i = 0 ; i < CmpSerializeState - > NumInstructions ; + + i )
{
int OpCodeIdx = CmpSerializeState - > Instructions [ i ] . OpStart - 1 ;
if ( OpCodeIdx > = 0 & & OpCodeIdx < ( int ) CmpSerializeState - > NumBytecodeBytes )
{
EVectorVMOp InsOpCode = ( EVectorVMOp ) CmpSerializeState - > Bytecode [ OpCodeIdx ] ;
if ( InsOpCode = = EVectorVMOp : : external_func_call )
{
if ( SearchExtFnInsIdx = = ExtFnInsIdx ) {
CmpIns = CmpSerializeState - > Instructions + i ;
break ;
}
+ + SearchExtFnInsIdx ;
}
}
}
if ( CmpIns )
{
uint16 * SrcRegIndices = ( uint16 * ) ( CmpSerializeState - > Bytecode + CmpIns - > OpStart + sizeof ( uint16 ) ) ;
uint16 * DstRegIndices = ( ( uint16 * ) InsPtr ) + 1 ;
for ( int i = 0 ; i < NumRegisters ; + + i )
{
uint16 SrcRegIdx = SrcRegIndices [ i ] ;
uint16 DstRegIdx = DstRegIndices [ i ] ;
if ( SrcRegIdx ! = 0xFFFF & & DstRegIdx ! = 0xFFFF )
{
uint8 DstInc = BatchState - > RegIncTable [ DstRegIdx ] ;
if ( DstInc ! = 0 ) //skip constants
{
int SrcRegType = VVMGetRegisterType ( ExecCtx - > VVMState , SrcRegIdx , & SrcRegIdx ) ;
uint32 * DstReg = ( uint32 * ) BatchState - > RegPtrTable [ DstRegIdx ] ;
uint32 * SrcReg = NULL ;
if ( SrcRegType = = VVM_RT_TEMPREG )
{
SrcReg = ( uint32 * ) CmpIns - > RegisterTable + SrcRegIdx * CmpSerializeState - > NumInstances + BatchState - > ChunkLocalData . StartInstanceThisChunk ;
}
else if ( SrcRegType = = VVM_RT_OUTPUT )
{
SrcReg = ( uint32 * ) CmpIns - > RegisterTable + ( CmpSerializeState - > NumTempRegisters + SrcRegIdx ) * CmpSerializeState - > NumInstances + BatchState - > ChunkLocalData . StartInstanceThisChunk ;
}
if ( SrcReg )
{
VVMMemCpy ( DstReg , SrcReg , sizeof ( uint32 ) * BatchState - > ChunkLocalData . NumInstancesThisChunk ) ;
}
}
}
}
}
}
}
else
{
# else //VECTORVM_SUPPORTS_SERIALIZATION
{
# endif //VECTORVM_SUPPORTS_SERIALIZATION
		check(FnIdx < (int)ExecCtx->VVMState->NumExtFunctions); //check the full 16-bit function index, not just its low byte
check ( ( uint32 ) ( ExtFnData - > NumInputs + ExtFnData - > NumOutputs ) < = ExecCtx - > VVMState - > MaxExtFnRegisters ) ;
const uint16 * RegIndices = ( ( uint16 * ) InsPtr ) + 1 ;
uint32 DummyRegCount = 0 ;
for ( int i = 0 ; i < ExtFnData - > NumInputs + ExtFnData - > NumOutputs ; + + i ) {
if ( RegIndices [ i ] ! = 0xFFFF )
{
BatchState - > ChunkLocalData . ExtFnDecodedReg . RegData [ i ] = ( uint32 * ) BatchState - > RegPtrTable [ RegIndices [ i ] ] ;
				BatchState->ChunkLocalData.ExtFnDecodedReg.RegInc[i] = BatchState->RegIncTable[RegIndices[i]] >> 4; //external functions step one 32-bit value at a time: the 16-byte FVecReg stride becomes 1, constants stay 0
}
else
{
BatchState - > ChunkLocalData . ExtFnDecodedReg . RegData [ i ] = ( uint32 * ) ( BatchState - > ChunkLocalData . ExtFnDecodedReg . DummyRegs + DummyRegCount + + ) ;
BatchState - > ChunkLocalData . ExtFnDecodedReg . RegInc [ i ] = 0 ;
}
}
check ( DummyRegCount < = ExecCtx - > VVMState - > NumDummyRegsRequired ) ;
FVectorVMExternalFunctionContextExperimental ExtFnCtx ;
ExtFnCtx . RegisterData = BatchState - > ChunkLocalData . ExtFnDecodedReg . RegData ;
ExtFnCtx . RegInc = BatchState - > ChunkLocalData . ExtFnDecodedReg . RegInc ;
ExtFnCtx . RegReadCount = 0 ;
ExtFnCtx . NumRegisters = ExtFnData - > NumInputs + ExtFnData - > NumOutputs ;
ExtFnCtx . StartInstance = BatchState - > ChunkLocalData . StartInstanceThisChunk ;
ExtFnCtx . NumInstances = BatchState - > ChunkLocalData . NumInstancesThisChunk ;
ExtFnCtx . NumLoops = NumLoops ;
ExtFnCtx . PerInstanceFnInstanceIdx = 0 ;
ExtFnCtx . UserPtrTable = ExecCtx - > UserPtrTable . GetData ( ) ;
ExtFnCtx . NumUserPtrs = ExecCtx - > UserPtrTable . Num ( ) ;
ExtFnCtx . RandStream = & BatchState - > RandStream ;
ExtFnCtx . RandCounters = & BatchState - > ChunkLocalData . RandCounters ;
ExtFnCtx . DataSets = ExecCtx - > DataSets ;
# if VECTORVM_SUPPORTS_LEGACY
FVectorVMExternalFunctionContextProxy ProxyExtFnCtx ( ExtFnCtx ) ;
ExtFnData - > Function - > Execute ( ProxyExtFnCtx ) ;
# else // VECTORVM_SUPPORTS_LEGACY
ExtFnData - > Function - > Execute ( ExtFnCtx ) ;
# endif //VECTORVM_SUPPORTS_LEGACY
}
return InsPtr + 3 + 2 * ( ExtFnData - > NumInputs + ExtFnData - > NumOutputs ) ;
}
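//Instruction-encoding sketch for external_func_call, inferred from the pointer math above:
//	[uint16 FnIdx][uint16 RegIdx x (NumInputs + NumOutputs)] ...next opcode
//0xFFFF register indices are unbound and get routed to dummy registers; the returned pointer
//skips 3 + 2 * (NumInputs + NumOutputs) bytes, matching the +5 that two-register ops return.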
static VM_FORCEINLINE const uint8 * VVM_sincos ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , int NumLoops )
{
uint16 * RegIndices = ( uint16 * ) InsPtr ;
if ( CT_MultipleLoops )
{
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
P0 + = Inc0 ;
VectorSinCos ( ( VectorRegister4f * ) P1 , ( VectorRegister4f * ) P2 , & R0 ) ;
P1 + = sizeof ( FVecReg ) ;
P2 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
}
else
{
VectorRegister4f r0 = VectorLoad ( ( float * ) BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ) ;
VectorSinCos ( ( VectorRegister4f * ) BatchState - > RegPtrTable [ RegIndices [ 1 ] ] , ( VectorRegister4f * ) BatchState - > RegPtrTable [ RegIndices [ 2 ] ] , & r0 ) ;
}
return InsPtr + 7 ;
}
# define VVM_NULL_FN_ARGS const bool CT_MultipleLoops, const uint8 *InsPtr, FVectorVMBatchState *BatchState, FVectorVMExecContext *ExecCtx, FVectorVMSerializeState *SerializeState, FVectorVMSerializeState *CmpSerializeState, int NumLoops
typedef const uint8 * ( VVMFn_null ) ( VVM_NULL_FN_ARGS ) ;
typedef VectorRegister4f ( VVMFn_1f ) ( FVectorVMBatchState * , VectorRegister4f a ) ;
typedef VectorRegister4f ( VVMFn_2f ) ( FVectorVMBatchState * , VectorRegister4f a , VectorRegister4f b ) ;
typedef VectorRegister4f ( VVMFn_3f ) ( FVectorVMBatchState * , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) ;
typedef VectorRegister4f ( VVMFn_4f ) ( FVectorVMBatchState * , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) ;
typedef VectorRegister4f ( VVMFn_5f ) ( FVectorVMBatchState * , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d , VectorRegister4f e ) ;
typedef VectorRegister4i ( VVMFn_1i ) ( FVectorVMBatchState * , VectorRegister4i a ) ;
typedef VectorRegister4i ( VVMFn_2i ) ( FVectorVMBatchState * , VectorRegister4i a , VectorRegister4i b ) ;
typedef VectorRegister4i ( VVMFn_3i ) ( FVectorVMBatchState * , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) ;
//Dispatch functions: these rely on VM_FORCEINLINE to actually force the function to be inlined
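//Naming sketch (inferred from the bodies below): VVM_Dispatch_execFn<N><t>_<M><t> decodes N input
//and M output registers of type <t> ('f' = float, 'i' = int, 'null' = the wrapped function decodes
//its own operands), loads the inputs, applies the kernel one 4-wide register at a time, and stores
//the result(s) to every output.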
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn0null_0null ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_null fn , int NumLoops ) {
VVMSer_instruction ( 0 , 0 ) ;
return fn ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn1null_0null ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_null fn , int NumLoops ) {
VVMSer_instruction ( 0 , 0 ) ;
return fn ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn1null_1null ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_null fn , int NumLoops ) {
VVMSer_instruction ( 0 , 1 ) ;
return fn ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn1null_2null ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_null fn , int NumLoops ) {
VVMSer_instruction ( 0 , 2 ) ;
return fn ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn2null_1null ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_null fn , int NumLoops ) {
VVMSer_instruction ( 0 , 2 ) ;
return fn ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn1f_1f ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_1f fn , int NumLoops ) {
VVMSer_instruction ( 1 , 1 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
P0 + = Inc0 ;
VectorRegister4f Res = fn ( BatchState , R0 ) ;
VectorStore ( Res , ( float * ) P1 ) ;
P1 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f Res = fn ( BatchState , R0 ) ;
VectorStore ( Res , ( float * ) P1 ) ;
}
return InsPtr + 5 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn2f_1f ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_2f fn , int NumLoops ) {
VVMSer_instruction ( 1 , 2 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
uint8 * End = P2 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 ) ;
VectorStore ( Res , ( float * ) P2 ) ;
P2 + = sizeof ( FVecReg ) ;
} while ( P2 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 ) ;
VectorStore ( Res , ( float * ) P2 ) ;
}
return InsPtr + 7 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn3f_1f ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_3f fn , int NumLoops ) {
VVMSer_instruction ( 1 , 3 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint8 * P3 = BatchState - > RegPtrTable [ RegIndices [ 3 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
uint32 Inc2 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 2 ] ] ;
uint8 * End = P3 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f R2 = VectorLoad ( ( float * ) P2 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
P2 + = Inc2 ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 , R2 ) ;
VectorStore ( Res , ( float * ) P3 ) ;
P3 + = sizeof ( FVecReg ) ;
} while ( P3 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f R2 = VectorLoad ( ( float * ) P2 ) ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 , R2 ) ;
VectorStore ( Res , ( float * ) P3 ) ;
}
return InsPtr + 9 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn4f_1f ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_4f fn , int NumLoops ) {
VVMSer_instruction ( 1 , 4 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint8 * P3 = BatchState - > RegPtrTable [ RegIndices [ 3 ] ] ;
uint8 * P4 = BatchState - > RegPtrTable [ RegIndices [ 4 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
uint32 Inc2 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 2 ] ] ;
uint32 Inc3 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 3 ] ] ;
uint8 * End = P4 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f R2 = VectorLoad ( ( float * ) P2 ) ;
VectorRegister4f R3 = VectorLoad ( ( float * ) P3 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
P2 + = Inc2 ;
P3 + = Inc3 ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 , R2 , R3 ) ;
VectorStore ( Res , ( float * ) P4 ) ;
P4 + = sizeof ( FVecReg ) ;
} while ( P4 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f R2 = VectorLoad ( ( float * ) P2 ) ;
VectorRegister4f R3 = VectorLoad ( ( float * ) P3 ) ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 , R2 , R3 ) ;
VectorStore ( Res , ( float * ) P4 ) ;
}
return InsPtr + 11 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn5f_1f ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_5f fn , int NumLoops ) {
VVMSer_instruction ( 1 , 5 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint8 * P3 = BatchState - > RegPtrTable [ RegIndices [ 3 ] ] ;
uint8 * P4 = BatchState - > RegPtrTable [ RegIndices [ 4 ] ] ;
uint8 * P5 = BatchState - > RegPtrTable [ RegIndices [ 5 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
uint32 Inc2 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 2 ] ] ;
uint32 Inc3 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 3 ] ] ;
uint32 Inc4 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 4 ] ] ;
uint8 * End = P5 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f R2 = VectorLoad ( ( float * ) P2 ) ;
VectorRegister4f R3 = VectorLoad ( ( float * ) P3 ) ;
VectorRegister4f R4 = VectorLoad ( ( float * ) P4 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
P2 + = Inc2 ;
P3 + = Inc3 ;
P4 + = Inc4 ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 , R2 , R3 , R4 ) ;
VectorStore ( Res , ( float * ) P5 ) ;
P5 + = sizeof ( FVecReg ) ;
} while ( P5 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f R2 = VectorLoad ( ( float * ) P2 ) ;
VectorRegister4f R3 = VectorLoad ( ( float * ) P3 ) ;
VectorRegister4f R4 = VectorLoad ( ( float * ) P4 ) ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 , R2 , R3 , R4 ) ;
VectorStore ( Res , ( float * ) P5 ) ;
}
return InsPtr + 13 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn2f_2f ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_2f fn , int NumLoops ) {
VVMSer_instruction ( 1 , 3 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint8 * P3 = BatchState - > RegPtrTable [ RegIndices [ 3 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
ptrdiff_t P23d = P3 - P2 ;
		uint8 *End = P2 + sizeof(FVecReg) * NumLoops; //bound on P2, the pointer the loop advances and tests
do
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 ) ;
VectorStore ( Res , ( float * ) P2 ) ;
VectorStore ( Res , ( float * ) ( P2 + P23d ) ) ;
P2 + = sizeof ( FVecReg ) ;
} while ( P2 < End ) ;
}
else
{
VectorRegister4f R0 = VectorLoad ( ( float * ) P0 ) ;
VectorRegister4f R1 = VectorLoad ( ( float * ) P1 ) ;
VectorRegister4f Res = fn ( BatchState , R0 , R1 ) ;
VectorStore ( Res , ( float * ) P2 ) ;
VectorStore ( Res , ( float * ) P3 ) ;
}
return InsPtr + 9 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn1i_1i ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_1i fn , int NumLoops ) {
VVMSer_instruction ( 1 , 1 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
P0 + = Inc0 ;
VectorRegister4i Res = fn ( BatchState , R0 ) ;
VectorIntStore ( Res , P1 ) ;
P1 + = sizeof ( FVecReg ) ;
} while ( P1 < End ) ;
}
else
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
VectorRegister4i Res = fn ( BatchState , R0 ) ;
VectorIntStore ( Res , P1 ) ;
}
return InsPtr + 5 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn2i_1i ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_2i fn , int NumLoops ) {
VVMSer_instruction ( 1 , 2 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
uint8 * End = P2 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
VectorRegister4i R1 = VectorIntLoad ( P1 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
VectorRegister4i Res = fn ( BatchState , R0 , R1 ) ;
VectorIntStore ( Res , P2 ) ;
P2 + = sizeof ( FVecReg ) ;
} while ( P2 < End ) ;
}
else
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
VectorRegister4i R1 = VectorIntLoad ( P1 ) ;
VectorRegister4i Res = fn ( BatchState , R0 , R1 ) ;
VectorIntStore ( Res , P2 ) ;
}
return InsPtr + 7 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn3i_1i ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_3i fn , int NumLoops ) {
VVMSer_instruction ( 1 , 3 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
uint8 * P3 = BatchState - > RegPtrTable [ RegIndices [ 3 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint32 Inc1 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 1 ] ] ;
uint32 Inc2 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 2 ] ] ;
uint8 * End = P3 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
VectorRegister4i R1 = VectorIntLoad ( P1 ) ;
VectorRegister4i R2 = VectorIntLoad ( P2 ) ;
P0 + = Inc0 ;
P1 + = Inc1 ;
P2 + = Inc2 ;
VectorRegister4i Res = fn ( BatchState , R0 , R1 , R2 ) ;
VectorIntStore ( Res , P3 ) ;
P3 + = sizeof ( FVecReg ) ;
} while ( P3 < End ) ;
}
else
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
VectorRegister4i R1 = VectorIntLoad ( P1 ) ;
VectorRegister4i R2 = VectorIntLoad ( P2 ) ;
VectorRegister4i Res = fn ( BatchState , R0 , R1 , R2 ) ;
VectorIntStore ( Res , P3 ) ;
}
return InsPtr + 9 ;
}
VM_FORCEINLINE const uint8 * VVM_Dispatch_execFn1i_2i ( const bool CT_MultipleLoops , const uint8 * InsPtr , FVectorVMBatchState * BatchState , FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState , VVMFn_1i fn , int NumLoops ) {
VVMSer_instruction ( 1 , 2 ) ;
uint16 * RegIndices = ( uint16 * ) InsPtr ;
uint8 * P0 = BatchState - > RegPtrTable [ RegIndices [ 0 ] ] ;
uint8 * P1 = BatchState - > RegPtrTable [ RegIndices [ 1 ] ] ;
uint8 * P2 = BatchState - > RegPtrTable [ RegIndices [ 2 ] ] ;
if ( CT_MultipleLoops )
{
uint32 Inc0 = ( uint32 ) BatchState - > RegIncTable [ RegIndices [ 0 ] ] ;
uint8 * End = P1 + sizeof ( FVecReg ) * NumLoops ;
do
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
P0 + = Inc0 ;
VectorRegister4i Res = fn ( BatchState , R0 ) ;
VectorIntStore ( Res , P1 ) ;
VectorIntStore ( Res , P2 ) ;
P1 + = sizeof ( FVecReg ) ;
P2 + = sizeof ( FVecReg ) ;
		} while (P1 < End); //End is derived from P1, so test P1
}
else
{
VectorRegister4i R0 = VectorIntLoad ( P0 ) ;
VectorRegister4i Res = fn ( BatchState , R0 ) ;
VectorIntStore ( Res , P1 ) ;
VectorIntStore ( Res , P2 ) ;
}
return InsPtr + 7 ;
}
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_add ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorAdd ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_sub ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorSubtract ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorMultiply ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_div ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorSelect ( VectorCompareGT ( VectorAbs ( b ) , VVM_m128Const ( Epsilon ) ) , VectorDivide ( a , b ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mad ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMultiplyAdd ( a , b , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_lerp ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorLerp ( a , b , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_rcp ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSelect ( VectorCompareGT ( VectorAbs ( a ) , VVM_m128Const ( Epsilon ) ) , VectorReciprocalEstimate ( a ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_rsq ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSelect ( VectorCompareGT ( a , VVM_m128Const ( Epsilon ) ) , VectorReciprocalSqrt ( a ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_sqrt ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSelect ( VectorCompareGT ( a , VVM_m128Const ( Epsilon ) ) , VectorSqrt ( a ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_neg ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorNegate ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_abs ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorAbs ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_exp ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorExp ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_exp2 ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorExp2 ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_log ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSelect ( VectorCompareGT ( a , VectorZeroFloat ( ) ) , VectorLog ( a ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_log2 ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorLog2 ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_sin ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSin ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_cos ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorCos ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_tan ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorTan ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_acos ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorATan2 ( VVM_Exec1f_sqrt ( BatchState , VectorMultiply ( VectorSubtract ( VVM_m128Const ( One ) , a ) , VectorAdd ( VVM_m128Const ( One ) , a ) ) ) , a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_asin ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSubtract ( VVM_m128Const ( HalfPi ) , VVM_Exec1f_acos ( BatchState , a ) ) ; } //asin(x) == pi/2 - acos(x), so HalfPi, not QuarterPi
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_atan ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorATan ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_atan2 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorATan2 ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_ceil ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorCeil ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_floor ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorFloor ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_fmod ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorMod ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_frac ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorFractional ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_trunc ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorTruncate ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_clamp ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorClamp ( a , b , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_min ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorMin ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_max ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorMax ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_pow ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorSelect ( VectorCompareGT ( a , VVM_m128Const ( Epsilon ) ) , VectorPow ( a , b ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_round ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorRound ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_sign ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSign ( a ) ; }
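//naming convention for the exec lambdas: VVM_Exec<N><t>_<op> takes N operands of
//type t ('f' = VectorRegister4f, 'i' = VectorRegister4i, 'null' = the op does its
//own register I/O and returns the next instruction pointer). Every variant takes
//BatchState, even when unused, so all ops share one uniform signature for the
//dispatch macros below.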
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_step ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorStep ( VectorSubtract ( b , a ) ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_random ( VVM_NULL_FN_ARGS ) { return VVM_random ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_noise ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
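//the *null variants bypass the generic register load/store wrapper: they parse
//their own operands from the instruction stream and return the advanced InsPtr.
//The noise ops appear to be unimplemented stubs here, returning InsPtr unchanged
//and consuming no operand bytes.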
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_cmplt ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareLT ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_cmple ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareLE ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_cmpgt ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareGT ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_cmpge ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareGE ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_cmpeq ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareEQ ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_cmpneq ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareNE ( a , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorSelect ( a , b , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_addi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntAdd ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_subi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntSubtract ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_muli ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntMultiply ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_divi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VVMIntDiv ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_clampi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntClamp ( a , b , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_mini ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntMin ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_maxi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntMax ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_absi ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntAbs ( a ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_negi ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntNegate ( a ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_signi ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntSign ( a ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_randomi ( VVM_NULL_FN_ARGS ) { return VVM_randomi ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_cmplti ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntCompareLT ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_cmplei ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntCompareLE ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_cmpgti ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntCompareGT ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_cmpgei ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntCompareGE ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_cmpeqi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntCompareEQ ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_cmpneqi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntCompareNEQ ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_bit_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntAnd ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_bit_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntOr ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_bit_xor ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntXor ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_bit_not ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntNot ( a ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_bit_lshift ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VVMIntLShift ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_bit_rshift ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VVMIntRShift ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntAnd ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntOr ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_logic_xor ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntXor ( a , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_logic_not ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntNot ( a ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_f2i ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VVMf2i ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_i2f ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VVMi2f ( a ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_f2b ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorCompareGT ( a , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec1f_b2f ( FVectorVMBatchState * BatchState , VectorRegister4f a ) { return VectorSelect ( a , VVM_m128Const ( One ) , VectorZeroFloat ( ) ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_i2b ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntCompareGT ( a , VectorSetZero ( ) ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_b2i ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntSelect ( a , VectorIntSet1 ( 1 ) , VectorSetZero ( ) ) ; }
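//booleans are lane masks: true is all bits set (the result of a vector compare),
//false is zero. b2f/b2i therefore select 1.0f/1 per lane, while f2b/i2b
//regenerate a mask by comparing against zero.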
VM_FORCEINLINE const uint8 * VVM_Exec0null_outputdata_float ( VVM_NULL_FN_ARGS ) { return VVM_Output32 ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_outputdata_int32 ( VVM_NULL_FN_ARGS ) { return VVM_Output32 ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_outputdata_half ( VVM_NULL_FN_ARGS ) { return VVM_Output16 ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_acquireindex ( VVM_NULL_FN_ARGS ) { return VVM_acquireindex ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_external_func_call ( VVM_NULL_FN_ARGS ) { return VVM_external_func_call ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_exec_index ( VVM_NULL_FN_ARGS ) { return VVM_exec_index ( CT_MultipleLoops , InsPtr , BatchState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_noise2D ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_noise3D ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_enter_stat_scope ( VVM_NULL_FN_ARGS ) { return InsPtr + 2 ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_exit_stat_scope ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_update_id ( VVM_NULL_FN_ARGS ) { return VVM_update_id ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_acquire_id ( VVM_NULL_FN_ARGS ) { return VVM_acquire_id ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_half_to_float ( VVM_NULL_FN_ARGS ) { return VVM_half_to_float ( CT_MultipleLoops , InsPtr , BatchState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_fasi ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_iasf ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_exec_indexf ( VVM_NULL_FN_ARGS ) { return VVM_exec_indexf ( CT_MultipleLoops , InsPtr , BatchState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_exec_index_addi ( VVM_NULL_FN_ARGS ) { return VVM_exec_index_addi ( CT_MultipleLoops , InsPtr , BatchState , NumLoops ) ; }
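//most ops from here down are fused instruction pairs, presumably merged by the
//bytecode optimizer so the intermediate value never round-trips through a temp
//register. The *i_select variants reinterpret the float lanes as ints with a
//bitwise cast (no numeric conversion), so integer compares work on registers
//dispatched through the float path.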
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_cmplt_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSelect ( VectorCompareLT ( a , b ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_cmple_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSelect ( VectorCompareLE ( a , b ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_cmpeq_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSelect ( VectorCompareEQ ( a , b ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_cmplti_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSelect ( VectorCastIntToFloat ( VectorIntCompareLT ( * ( VectorRegister4i * ) & a , * ( VectorRegister4i * ) & b ) ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_cmplei_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSelect ( VectorCastIntToFloat ( VectorIntCompareLE ( * ( VectorRegister4i * ) & a , * ( VectorRegister4i * ) & b ) ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_cmpeqi_select ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSelect ( VectorCastIntToFloat ( VectorIntCompareEQ ( * ( VectorRegister4i * ) & a , * ( VectorRegister4i * ) & b ) ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmplt_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( VectorCompareLT ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmple_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( VectorCompareLE ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpgt_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( VectorCompareGT ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpge_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( VectorCompareGE ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpeq_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( VectorCompareEQ ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpne_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( VectorCompareNE ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmplti_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VectorIntCompareLT ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmplei_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VectorIntCompareLE ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpgti_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VectorIntCompareGT ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpgei_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VectorIntCompareGE ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpeqi_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VectorIntCompareEQ ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpnei_logic_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VectorIntCompareNEQ ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmplt_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntOr ( VectorCastFloatToInt ( VectorCompareLT ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmple_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntOr ( VectorCastFloatToInt ( VectorCompareLE ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpgt_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntOr ( VectorCastFloatToInt ( VectorCompareGT ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpge_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntOr ( VectorCastFloatToInt ( VectorCompareGE ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpeq_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntOr ( VectorCastFloatToInt ( VectorCompareEQ ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_cmpne_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCastIntToFloat ( VectorIntOr ( VectorCastFloatToInt ( VectorCompareNE ( a , b ) ) , * ( VectorRegister4i * ) & c ) ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmplti_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VectorIntCompareLT ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmplei_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VectorIntCompareLE ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpgti_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VectorIntCompareGT ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpgei_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VectorIntCompareGE ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpeqi_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VectorIntCompareEQ ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_cmpnei_logic_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VectorIntCompareNEQ ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_mad_add ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorAdd ( VectorMultiplyAdd ( a , b , c ) , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_mad_sub0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSubtract ( VectorMultiplyAdd ( a , b , c ) , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_mad_sub1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorSubtract ( d , VectorMultiplyAdd ( a , b , c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_mad_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorMultiply ( VectorMultiplyAdd ( a , b , c ) , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mad_sqrt ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorSqrt ( VectorMultiplyAdd ( a , b , c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec5f_mad_mad0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d , VectorRegister4f e ) { return VectorMultiplyAdd ( d , e , VectorMultiplyAdd ( a , b , c ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec5f_mad_mad1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d , VectorRegister4f e ) { return VectorMultiplyAdd ( VectorMultiplyAdd ( a , b , c ) , d , e ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_mul_mad0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorMultiplyAdd ( VectorMultiply ( a , b ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_mul_mad1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorMultiplyAdd ( c , d , VectorMultiply ( a , b ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mul_add ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorAdd ( VectorMultiply ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mul_sub0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorSubtract ( VectorMultiply ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mul_sub1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorSubtract ( c , VectorMultiply ( a , b ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mul_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMultiply ( VectorMultiply ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_mul_max ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMax ( VectorMultiply ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_mul_2x ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorMultiply ( a , b ) ; }
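//mul_2x has the same body as plain mul; the _2x suffix variants appear to be
//dispatched through a two-output wrapper (note the NumOutputs token in the
//VVM_OP_XM expansion below), writing the same lane result to two destinations.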
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_add_mad1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorMultiplyAdd ( c , d , VectorAdd ( a , b ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_add_add ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorAdd ( VectorAdd ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_sub_cmplt1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorCompareLT ( c , VectorSubtract ( a , b ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_sub_neg ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorNegate ( VectorSubtract ( a , b ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_sub_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMultiply ( VectorSubtract ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_div_mad0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorMultiplyAdd ( VVM_Exec2f_div ( BatchState , a , b ) , c , d ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_div_f2i ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorFloatToInt ( VVM_Exec2f_div ( BatchState , VectorCastIntToFloat ( a ) , VectorCastIntToFloat ( b ) ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_div_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMultiply ( VVM_Exec2f_div ( BatchState , a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_muli_addi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAdd ( VectorIntMultiply ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_addi_bit_rshift ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VVMIntRShift ( VectorIntAdd ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_addi_muli ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntMultiply ( VectorIntAdd ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec1i_b2i_2x ( FVectorVMBatchState * BatchState , VectorRegister4i a ) { return VectorIntSelect ( a , VectorIntSet1 ( 1 ) , VectorSetZero ( ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_i2f_div0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VVM_Exec2f_div ( BatchState , VectorIntToFloat ( VectorCastFloatToInt ( a ) ) , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_i2f_div1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VVM_Exec2f_div ( BatchState , b , VectorIntToFloat ( VectorCastFloatToInt ( a ) ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_i2f_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorMultiply ( VectorIntToFloat ( VectorCastFloatToInt ( a ) ) , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_i2f_mad0 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMultiplyAdd ( VectorIntToFloat ( VectorCastFloatToInt ( a ) ) , b , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_i2f_mad1 ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorMultiplyAdd ( a , b , VectorIntToFloat ( VectorCastFloatToInt ( c ) ) ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_f2i_select1 ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntSelect ( a , VectorFloatToInt ( VectorCastIntToFloat ( b ) ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_f2i_maxi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntMax ( VectorFloatToInt ( VectorCastIntToFloat ( a ) ) , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_f2i_addi ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorIntAdd ( VectorFloatToInt ( VectorCastIntToFloat ( a ) ) , b ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec3f_fmod_add ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c ) { return VectorAdd ( VectorMod ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_bit_and_i2f ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorIntToFloat ( VectorIntAnd ( VectorCastFloatToInt ( a ) , VectorCastFloatToInt ( b ) ) ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_bit_rshift_bit_and ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntAnd ( VVMIntRShift ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec2f_neg_cmplt ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b ) { return VectorCompareLT ( VectorNegate ( a ) , b ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_bit_or_muli ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntMultiply ( VectorIntOr ( a , b ) , c ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec3i_bit_lshift_bit_or ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b , VectorRegister4i c ) { return VectorIntOr ( VVMIntLShift ( a , b ) , c ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec2null_random_add ( VVM_NULL_FN_ARGS ) { return VVM_random_add ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_random_2x ( VVM_NULL_FN_ARGS ) { return VVM_random_2x ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx , SerializeState , CmpSerializeState , NumLoops ) ; }
VM_FORCEINLINE VectorRegister4i VVM_Exec2i_max_f2i ( FVectorVMBatchState * BatchState , VectorRegister4i a , VectorRegister4i b ) { return VectorFloatToInt ( VectorMax ( VectorCastIntToFloat ( a ) , VectorCastIntToFloat ( b ) ) ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_select_mul ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorMultiply ( VectorSelect ( a , b , c ) , d ) ; }
VM_FORCEINLINE VectorRegister4f VVM_Exec4f_select_add ( FVectorVMBatchState * BatchState , VectorRegister4f a , VectorRegister4f b , VectorRegister4f c , VectorRegister4f d ) { return VectorAdd ( VectorSelect ( a , b , c ) , d ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec1null_sin_cos ( VVM_NULL_FN_ARGS ) { return VVM_sincos ( CT_MultipleLoops , InsPtr , BatchState , NumLoops ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_inputdata_float ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_inputdata_int32 ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_inputdata_half ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_inputdata_noadvance_float ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_inputdata_noadvance_int32 ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_inputdata_noadvance_half ( VVM_NULL_FN_ARGS ) { return InsPtr ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_outputdata_float_from_half ( VVM_NULL_FN_ARGS ) { return VVM_Output32_from_16 ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx ) ; }
VM_FORCEINLINE const uint8 * VVM_Exec0null_outputdata_half_from_half ( VVM_NULL_FN_ARGS ) { return VVM_Output16_from_16 ( CT_MultipleLoops , InsPtr , BatchState , ExecCtx ) ; }
//this macro is how the infinite decode loop over the bytecode exits most efficiently: the terminating op is declared with 0 inputs and type "done", so it dispatches through this instead of the loop testing an end condition on every instruction
# define VVM_Dispatch_execFn0done_0done(...) NULL; goto done_loop;
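//e.g. for the terminating opcode the generic VVM_OP_XM expansion used in the
//ExecChunk*Loop() functions degenerates to:
//    InsPtr = NULL; goto done_loop;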
# if VECTORVM_SUPPORTS_COMPUTED_GOTO
# define VVM_OP_CASE(op) jmp_lbl_##op:
# define VVM_OP_NEXT goto *jmp_tbl[InsPtr[-1]]
# define VVM_OP_START VVM_OP_NEXT;
static const void * jmp_tbl [ ] = {
# define VVM_OP_XM(op, ...) &&jmp_lbl_##op,
VVM_OP_XM_LIST
# undef VVM_OP_XM
} ;
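//computed-goto dispatch: jmp_tbl holds one label address per opcode (a GCC/Clang
//extension), and VVM_OP_NEXT jumps straight to the next op's handler. InsPtr
//always points one byte past the opcode (it starts at Bytecode + 1), so
//InsPtr[-1] is the opcode being dispatched. A minimal sketch of the same
//pattern, assuming nothing about the real opcode layout:
//    static const void *tbl[] = { &&lbl_add, &&lbl_done };
//    const uint8 *ip = bytecode;
//    goto *tbl[*ip++];
//    lbl_add:  /*...*/ goto *tbl[*ip++];
//    lbl_done: return;
//this removes the switch's bounds check and gives each handler its own indirect
//branch, which predicts better than one shared dispatch branch.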
# else
# define VVM_OP_START switch ((EVectorVMOp)InsPtr[-1])
# if VECTORVM_DEBUG_PRINTF
# define VVM_OP_CASE(op) case EVectorVMOp::op: VECTORVM_PRINTF("%d, %s", DbgPfInsCount++, VVM_OP_NAMES_EXP[InsPtr[-1]]);
# else
# define VVM_OP_CASE(op) case EVectorVMOp::op:
# endif
# define VVM_OP_NEXT break
# endif
# if VECTORVM_DEBUG_PRINTF
static const char * VVM_OP_NAMES_EXP [ ] = {
# define VVM_OP_XM(OpCode, ...) #OpCode,
VVM_OP_XM_LIST
# undef VVM_OP_XM
} ;
# endif //VECTORVM_DEBUG_PRINTF
static void SetBatchPointersForCorrectChunkOffsets ( FVectorVMExecContext * ExecCtx , FVectorVMBatchState * BatchState )
{
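//RegPtrTable layout implied by the offset math below:
//[temp registers][const buffers][input dst slots][input src base ptrs][output buffers]
//the input dst slots are rebased from the src base pointers for each chunk's start instance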
{ //input buffers
int DstOffset = ExecCtx - > VVMState - > NumTempRegisters + ExecCtx - > VVMState - > NumConstBuffers ;
int SrcOffset = DstOffset + ExecCtx - > VVMState - > NumInputBuffers ;
for ( int i = 0 ; i < ( int ) ExecCtx - > VVMState - > NumInputBuffers ; + + i )
{
if ( BatchState - > RegIncTable [ DstOffset + i ] ! = 0 ) //don't offset the no-advance inputs
{
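//bit 0x4000 in InputMapCacheSrc appears to flag 16-bit (half) inputs; all other inputs are 4 bytes wide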
const uint32 DataTypeStride = ExecCtx - > VVMState - > InputMapCacheSrc [ i ] & 0x4000 ? 2 : 4 ;
const uint32 OffsetBytes = BatchState - > ChunkLocalData . StartInstanceThisChunk * DataTypeStride ;
BatchState - > RegPtrTable [ DstOffset + i ] = BatchState - > RegPtrTable [ SrcOffset + i ] + OffsetBytes ;
}
}
}
if ( BatchState - > ChunkLocalData . StartInstanceThisChunk ! = 0 )
{ //output buffers
int Offset = ExecCtx - > VVMState - > NumTempRegisters + ExecCtx - > VVMState - > NumConstBuffers + ExecCtx - > VVMState - > NumInputBuffers * 2 ;
for ( int i = 0 ; i < ( int ) ExecCtx - > VVMState - > NumOutputBuffers ; + + i )
{
const uint32 DataTypeStride = ExecCtx - > VVMState - > OutputRemapDataType [ i ] = = 2 ? 2 : 4 ;
const uint32 OffsetBytes = BatchState - > ChunkLocalData . NumOutputPerDataSet [ 0 ] * DataTypeStride ;
BatchState - > RegPtrTable [ Offset + i ] = BatchState - > RegPtrTable [ Offset + i ] + OffsetBytes ;
}
}
for ( uint32 i = 0 ; i < ExecCtx - > VVMState - > MaxOutputDataSet ; + + i )
{
BatchState - > ChunkLocalData . StartingOutputIdxPerDataSet [ i ] = 0 ;
BatchState - > ChunkLocalData . NumOutputPerDataSet [ i ] = 0 ;
}
}
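//two instantiations of the same interpreter loop: ExecChunkSingleLoop hardwires
//NumLoops = 1 and CT_MultipleLoops = false so the compiler can strip the per-op
//looping machinery for tiny chunks; ExecChunkMultipleLoops keeps the runtime count.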
static void ExecChunkSingleLoop ( FVectorVMExecContext * ExecCtx , FVectorVMBatchState * BatchState , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState )
{
static const int NumLoops = 1 ;
static constexpr bool CT_MultipleLoops = false ;
SetBatchPointersForCorrectChunkOffsets ( ExecCtx , BatchState ) ;
const uint8 * InsPtr = ExecCtx - > VVMState - > Bytecode + 1 ;
const uint8 * InsPtrEnd = InsPtr + ExecCtx - > VVMState - > NumBytecodeBytes ;
VVMSer_chunkStartExp ( SerializeState , ChunkIdx , 0 )
# if VECTORVM_DEBUG_PRINTF
int DbgPfInsCount = 0 ;
# endif //VECTORVM_DEBUG_PRINTF
for ( ; ; )
{
VVMSer_insStartExp ( SerializeState )
VVM_OP_START {
# define VVM_OP_XM(OpCode, Cat, NumInputs, NumOutputs, Type, ...) VVM_OP_CASE(OpCode) InsPtr = VVM_Dispatch_execFn##NumInputs##Type##_##NumOutputs##Type(CT_MultipleLoops, InsPtr, BatchState, ExecCtx, SerializeState, CmpSerializeState, VVM_Exec##NumInputs##Type##_##OpCode, NumLoops); VVM_OP_NEXT;
VVM_OP_XM_LIST
# undef VVM_OP_XM
}
VVMSer_insEndExp ( SerializeState , ( int ) ( SerializeState - > Running . StartOpPtr - ExecCtx - > VVMState - > Bytecode ) , ( int ) ( InsPtr - SerializeState - > Running . StartOpPtr ) ) ;
}
done_loop :
VVMSer_chunkEndExp ( SerializeState ) ;
}
static void ExecChunkMultipleLoops ( FVectorVMExecContext * ExecCtx , FVectorVMBatchState * BatchState , int NumLoops , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState )
{
static constexpr bool CT_MultipleLoops = true ;
SetBatchPointersForCorrectChunkOffsets ( ExecCtx , BatchState ) ;
const uint8 * InsPtr = ExecCtx - > VVMState - > Bytecode + 1 ;
const uint8 * InsPtrEnd = InsPtr + ExecCtx - > VVMState - > NumBytecodeBytes ;
VVMSer_chunkStartExp ( SerializeState , ChunkIdx , 0 )
# if VECTORVM_DEBUG_PRINTF
int DbgPfInsCount = 0 ;
# endif //VECTORVM_DEBUG_PRINTF
for ( ; ; )
{
VVMSer_insStartExp ( SerializeState )
VVM_OP_START {
# define VVM_OP_XM(OpCode, Cat, NumInputs, NumOutputs, Type, ...) VVM_OP_CASE(OpCode) InsPtr = VVM_Dispatch_execFn##NumInputs##Type##_##NumOutputs##Type(CT_MultipleLoops, InsPtr, BatchState, ExecCtx, SerializeState, CmpSerializeState, VVM_Exec##NumInputs##Type##_##OpCode, NumLoops); VVM_OP_NEXT;
VVM_OP_XM_LIST
# undef VVM_OP_XM
}
VVMSer_insEndExp ( SerializeState , ( int ) ( SerializeState - > Running . StartOpPtr - ExecCtx - > VVMState - > Bytecode ) , ( int ) ( InsPtr - SerializeState - > Running . StartOpPtr ) ) ;
}
done_loop :
VVMSer_chunkEndExp ( SerializeState ) ;
}
# undef VVM_Dispatch_execFn0done_0done
# undef VVM_OP_CASE
# undef VVM_OP_NEXT
void ExecVectorVMState ( FVectorVMExecContext * ExecCtx , FVectorVMSerializeState * SerializeState , FVectorVMSerializeState * CmpSerializeState )
{
if ( ExecCtx - > ExtFunctionTable . Num ( ) ! = ExecCtx - > VVMState - > NumExtFunctions ) {
return ;
}
for ( uint32 i = 0 ; i < ExecCtx - > VVMState - > NumExtFunctions ; + + i ) {
ExecCtx - > VVMState - > ExtFunctionTable [ i ] . Function = ExecCtx - > ExtFunctionTable [ i ] ;
}
for ( uint32 i = 0 ; i < ExecCtx - > VVMState - > MaxOutputDataSet ; + + i ) {
ExecCtx - > VVMState - > NumOutputPerDataSet [ i ] = 0 ;
}
//cache the mappings from Niagara data buffers to the internal minimized set
if ( ! ( ExecCtx - > VVMState - > Flags & VVMFlag_DataMapCacheSetup ) ) {
VVMBuildMapTableCaches ( ExecCtx ) ;
ExecCtx - > VVMState - > Flags | = VVMFlag_DataMapCacheSetup ;
}
for ( uint32 i = 0 ; i < ExecCtx - > VVMState - > NumConstBuffers ; + + i ) {
ExecCtx - > VVMState - > ConstantBuffers [ i ] . i = VectorIntSet1 ( ( ( uint32 * ) ExecCtx - > ConstantTableData [ ExecCtx - > VVMState - > ConstMapCacheIdx [ i ] ] ) [ ExecCtx - > VVMState - > ConstMapCacheSrc [ i ] ] ) ;
}
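//each scalar constant is splatted across all 4 lanes once per exec, so op
//handlers can read constants exactly like any other 4-wide register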
//if the number of instances hasn't changed since the last exec, we don't have to re-compute the internal state
if ( ExecCtx - > NumInstances = = ExecCtx - > VVMState - > NumInstancesExecCached )
{
ExecCtx - > Internal . NumBytesRequiredPerBatch = ExecCtx - > VVMState - > ExecCtxCache . NumBytesRequiredPerBatch ;
ExecCtx - > Internal . PerBatchRegisterDataBytesRequired = ExecCtx - > VVMState - > ExecCtxCache . PerBatchRegisterDataBytesRequired ;
ExecCtx - > Internal . MaxChunksPerBatch = ExecCtx - > VVMState - > ExecCtxCache . MaxChunksPerBatch ;
ExecCtx - > Internal . MaxInstancesPerChunk = ExecCtx - > VVMState - > ExecCtxCache . MaxInstancesPerChunk ;
}
else
{ //calculate Batch & Chunk division and all internal execution state required before executing
static const uint32 MaxChunksPerBatch = 4 ; //*MUST BE POW 2* arbitrary: 4 chunks per batch. This is hard to load balance because the ideal count depends on the CPU cores available during execution
static_assert ( MaxChunksPerBatch > 0 & & ( MaxChunksPerBatch & ( MaxChunksPerBatch - 1 ) ) = = 0 ) ;
size_t PageSizeInBytes = ( uint64_t ) GVVMPageSizeInKB < < 10 ;
size_t PerBatchRegisterDataBytesRequired = 0 ;
int NumBatches = 1 ;
int NumChunksPerBatch = ( int ) MaxChunksPerBatch ;
uint32 MaxLoopsPerChunk = 0 ;
{ //compute the number of bytes required per batch
const uint32 TotalNumLoopsRequired = VVM_MAX ( ( ( uint32 ) ExecCtx - > NumInstances + 3 ) > > 2 , 1 ) ;
const size_t NumBytesRequiredPerLoop = VVM_REG_SIZE * ExecCtx - > VVMState - > NumTempRegisters + ExecCtx - > VVMState - > MaxOutputDataSet * sizeof ( uint8 ) ;
size_t NumBytesPerBatchAvailableForTempRegs = PageSizeInBytes - ExecCtx - > VVMState - > BatchOverheadSize ;
size_t TotalNumLoopBytesRequired = VVM_ALIGN ( TotalNumLoopsRequired , MaxChunksPerBatch ) * NumBytesRequiredPerLoop ;
if ( NumBytesPerBatchAvailableForTempRegs < TotalNumLoopBytesRequired )
{
//not everything fits into a single chunk, so split the required loops across as many chunks as needed
int NumChunksRequired = ( int ) ( TotalNumLoopBytesRequired + NumBytesPerBatchAvailableForTempRegs - 1 ) / ( int ) NumBytesPerBatchAvailableForTempRegs ;
MaxLoopsPerChunk = ( TotalNumLoopsRequired + NumChunksRequired - 1 ) / NumChunksRequired ;
NumChunksPerBatch = NumChunksRequired ;
}
else
{
//everything fits into a single chunk
NumChunksPerBatch = 1 ;
MaxLoopsPerChunk = TotalNumLoopsRequired ;
}
PerBatchRegisterDataBytesRequired = MaxLoopsPerChunk * NumBytesRequiredPerLoop ;
}
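//illustrative sizing example (all numbers assumed, not taken from real data):
//with 64KB pages, ~512 bytes of batch overhead, 64 temp registers at 16 bytes
//per 4-wide register (assuming VVM_REG_SIZE is 16) and one output data set,
//NumBytesRequiredPerLoop = 64*16 + 1 = 1025 and ~65024 bytes are available per
//batch, i.e. ~63 loops (252 instances) per chunk; 10,000 instances (2,500
//loops) would then need ~40 chunks.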
size_t NumBytesRequiredPerBatch = ExecCtx - > VVMState - > BatchOverheadSize + PerBatchRegisterDataBytesRequired ;
ExecCtx - > Internal . NumBytesRequiredPerBatch = ( uint32 ) NumBytesRequiredPerBatch ;
ExecCtx - > Internal . PerBatchRegisterDataBytesRequired = ( uint32 ) PerBatchRegisterDataBytesRequired ;
ExecCtx - > Internal . MaxChunksPerBatch = NumChunksPerBatch ;
ExecCtx - > Internal . MaxInstancesPerChunk = MaxLoopsPerChunk < < 2 ;
ExecCtx - > VVMState - > ExecCtxCache . NumBytesRequiredPerBatch = ExecCtx - > Internal . NumBytesRequiredPerBatch ;
ExecCtx - > VVMState - > ExecCtxCache . PerBatchRegisterDataBytesRequired = ExecCtx - > Internal . PerBatchRegisterDataBytesRequired ;
ExecCtx - > VVMState - > ExecCtxCache . MaxChunksPerBatch = ExecCtx - > Internal . MaxChunksPerBatch ;
ExecCtx - > VVMState - > ExecCtxCache . MaxInstancesPerChunk = ExecCtx - > Internal . MaxInstancesPerChunk ;
ExecCtx - > VVMState - > NumInstancesExecCached = ExecCtx - > NumInstances ;
}
VVMSer_initSerializationState ( ExecCtx , SerializeState , SerializeState - > OptimizeCtx , SerializeState - > Flags | VVMSer_OptimizedBytecode ) ;
# if VECTORVM_SUPPORTS_SERIALIZATION && !defined(VVM_SERIALIZE_NO_WRITE)
uint64 StartTime = FPlatformTime : : Cycles64 ( ) ;
if ( SerializeState )
{
SerializeState - > ExecDt = 0 ;
SerializeState - > SerializeDt = 0 ;
}
# endif
SerializeVectorVMInputDataSets ( SerializeState , ExecCtx ) ;
{
if ( ExecCtx - > VVMState - > Bytecode = = nullptr ) {
return ;
}
FVectorVMBatchState * BatchState = ( FVectorVMBatchState * ) FMemory : : Malloc ( ExecCtx - > VVMState - > BatchOverheadSize + ExecCtx - > Internal . PerBatchRegisterDataBytesRequired ) ;
SetupBatchStatePtrs ( ExecCtx , BatchState ) ;
if ( ExecCtx - > VVMState - > Flags & VVMFlag_HasRandInstruction )
{
SetupRandStateForBatch ( BatchState ) ;
}
int StartInstanceThisChunk = 0 ;
int NumChunksThisBatch = ( ExecCtx - > NumInstances + ExecCtx - > Internal . MaxInstancesPerChunk - 1 ) / ExecCtx - > Internal . MaxInstancesPerChunk ;
for ( int ChunkIdxThisBatch = 0 ; ChunkIdxThisBatch < NumChunksThisBatch ; + + ChunkIdxThisBatch , StartInstanceThisChunk + = ExecCtx - > Internal . MaxInstancesPerChunk )
{
int NumInstancesThisChunk = VVM_MIN ( ( int ) ExecCtx - > Internal . MaxInstancesPerChunk , ExecCtx - > NumInstances - StartInstanceThisChunk ) ;
int NumLoops = ( int ) ( ( NumInstancesThisChunk + 3 ) & ~ 3 ) > > 2 ; //assumes 4-wide ops
BatchState - > ChunkLocalData . ChunkIdx = ChunkIdxThisBatch ;
BatchState - > ChunkLocalData . StartInstanceThisChunk = StartInstanceThisChunk ;
BatchState - > ChunkLocalData . NumInstancesThisChunk = NumInstancesThisChunk ;
if ( NumLoops = = 1 )
{
ExecChunkSingleLoop ( ExecCtx , BatchState , SerializeState , CmpSerializeState ) ;
}
else if ( NumLoops > 1 )
{
ExecChunkMultipleLoops ( ExecCtx , BatchState , NumLoops , SerializeState , CmpSerializeState ) ;
}
}
VVMSer_batchEndExp ( SerializeState ) ;
if ( BatchState - > ChunkLocalData . RandCounters )
{
FMemory : : Free ( BatchState - > ChunkLocalData . RandCounters ) ;
BatchState - > ChunkLocalData . RandCounters = NULL ;
}
FMemory : : Free ( BatchState ) ;
}
for ( uint32 i = 0 ; i < ExecCtx - > VVMState - > MaxOutputDataSet ; + + i )
{
ExecCtx - > DataSets [ i ] . DataSetAccessIndex = ExecCtx - > VVMState - > NumOutputPerDataSet [ i ] - 1 ;
}
# if VECTORVM_SUPPORTS_SERIALIZATION && !defined(VVM_SERIALIZE_NO_WRITE)
uint64 EndTime = FPlatformTime : : Cycles64 ( ) ;
if ( SerializeState )
{
SerializeState - > ExecDt = EndTime - StartTime ;
}
# endif //VECTORVM_SUPPORTS_SERIALIZATION
SerializeVectorVMOutputDataSets ( SerializeState , ExecCtx ) ;
}
# undef VVM_MIN
# undef VVM_MAX
# undef VVM_CLAMP
# undef VVM_ALIGN
# undef VVM_ALIGN_4
# undef VVM_ALIGN_16
# undef VVM_ALIGN_32
# undef VVM_ALIGN_64
# undef VVMSet_m128Const
# undef VVMSet_m128iConst
# undef VVMSet_m128iConst4
# undef VVM_m128Const
# undef VVM_m128iConst
uint16 VVMDbgGetRemappedInputIdx ( FVectorVMState * VectorVMState , int InputRegisterIdx , int DataSetIdx , bool IntReg , uint16 * OptOutIdxInRegTable )
{
//@TODO: implement
return 0xFFFF ;
}
uint16 VVMDbgGetRemappedOutputIdx ( FVectorVMState * VectorVMState , int OutputRegisterIdx , int DataSetIdx , bool IntReg , uint16 * OptOutIdxInRegTable )
{
for ( uint32 i = 0 ; i < VectorVMState - > NumOutputBuffers ; + + i ) {
if ( VectorVMState - > OutputRemapDst [ i ] = = OutputRegisterIdx & &
VectorVMState - > OutputRemapDataType [ i ] = = ( uint16 ) IntReg & &
VectorVMState - > OutputRemapDataSetIdx [ i ] = = DataSetIdx )
{
if ( OptOutIdxInRegTable ) {
* OptOutIdxInRegTable = ( uint16 ) ( VectorVMState - > NumTempRegisters + VectorVMState - > NumConstBuffers + VectorVMState - > NumInputBuffers * 2 + i ) ;
}
return ( uint16 ) i ;
}
}
return 0xFFFF ;
}
uint16 VVMDbgGetRemappedConstIdx ( FVectorVMState * VectorVMState , int ConstTableIdx , int ConstBuffIdx , uint16 * OptOutIdxInRegTable )
{
//@TODO: implement
return 0xFFFF ;
}
# else //VECTORVM_SUPPORTS_EXPERIMENTAL
struct FVectorVMState * AllocVectorVMState ( struct FVectorVMOptimizeContext * OptimizeCtx )
{
return nullptr ;
}
void FreeVectorVMState ( struct FVectorVMState * VVMState )
{
}
void ExecVectorVMState ( struct FVectorVMExecContext * ExecCtx , struct FVectorVMSerializeState * SerializeState , struct FVectorVMSerializeState * CmpSerializeState )
{
}
# endif //VECTORVM_SUPPORTS_EXPERIMENTAL