##############################################
# LZ4T - Fast decompressor in assembly
# Written by aglab2, inspired by Rasky's LZ4 and devwizard's YAZ0
##############################################
# NOTE: to optimize for speed, this decompressor can write up to 8 bytes
# past the end of the output buffer. The output buffer must be sized
# accordingly to accommodate this.
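#
# Header and token layout, as inferred from the code below (a reading of
# this routine, not an authoritative LZ4T spec):
#   byte  8       offset-nibble mask; nonzero selects 12-bit offsets whose
#                 top nibble is recycled as the next control nibble
#   byte  9       base value added to every decoded match length
#   bytes 12-15   first word of control nibbles
#   byte  16...   start of the compressed payload
# Control nibbles are consumed from the top of a 32-bit word; an all-zero
# control word terminates the stream.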
#define MINMATCH 4
#define inbuf $s0 # read cursor in the compressed stream
#define nibbles $s1 # current word of control nibbles, consumed from the top
#define outbuf $s2 # write cursor in the decompressed output
#define match_combo_mask $s3 # header byte 8 << 28; zero means full 16-bit offsets
#define len $s4 # decoded length field of the current token
#define match_lim $s5 # nibble value that signals an extended length byte
#define match_min $s6 # base added to every match length (from the header)
#define v0_st $s7 # copy source cursor (match source / literal run)
#define dma_ctx $s8 # DMA context passed to dma_async_ctx_read
#define dma_ptr $v0 # input is valid below this pointer (updated by dma_async_ctx_read)
#define shift $t9 # bit position for the 7-bit varint decoder
#define len_add $t8 # (unused in this routine)
#define match_len $t7 # total bytes to copy for the current match
#define match_off $t6 # backwards distance of the match (minus one, see the loads)
#define match_combo $t5 # raw 32 bits holding the offset field plus extension byte
#define off_nibble $t4 # control nibble recovered from the offset field
.section .text.lz4t_unpack_fast
.p2align 5
.globl lz4t_unpack_fast
.func lz4t_unpack_fast
.set at
.set noreorder
lz4t_unpack_fast:
addiu $sp, $sp, -0x40
sw $ra, 0x14($sp)
sw $s0, 0x18($sp)
sw $s1, 0x1c($sp)
sw $s2, 0x20($sp)
sw $s3, 0x24($sp)
sw $s4, 0x28($sp)
sw $s5, 0x2C($sp)
sw $s6, 0x30($sp)
sw $s7, 0x34($sp)
sw $s8, 0x38($sp)
move inbuf, $a0
lw nibbles, 12($a0) # first control word is stored in the header at offset 12
move outbuf, $a1
move dma_ctx, $a2
lbu match_combo_mask, 8($a0) # header byte 8: offset-nibble mask
sll match_combo_mask, 28 # position the mask over bits 31-28
lbu match_min, 9($a0) # header byte 9: minimum match length
move dma_ptr, $a0 # start at the buffer base so the first iteration waits
addiu inbuf, 16 # compressed payload starts after the 16-byte header
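# Main dispatch loop: refill the control word when it is exhausted, then
# decode one token per nibble. Top bit clear: a match, length nibble 0-7
# (7 selects an extended length). Top bit set: literals, low 3 bits give
# the run length (0 selects an extended run).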
.Lloop:
sub $t0, inbuf, dma_ptr # check if we need to wait for dma
bgezal $t0, dma_async_ctx_read # if inbuf >= dma_ptr, wait for dma
move $a0, dma_ctx
bnez nibbles, .Lprocess_nibbles # control nibbles left from the last word?
li match_lim, 7 # match length nibbles saturate at 7 here
.Lload_nibbles:
lwl nibbles, 0(inbuf) # unaligned load of the next control word
lwr nibbles, 3(inbuf)
beqz nibbles, .Lend # all-zero control word: end of stream
add inbuf, 4
.Lprocess_nibbles:
bgez nibbles, .Lmatches # top bit clear: the nibble encodes a match
srl len, nibbles, 28 # peel off the top control nibble
.Lliterals:
andi len, 7 # literal run length, 1-7 bytes
beqz len, .Llarge_literals # 0 escapes to the extended run decoder
nop
.Lsmall_literal:
ldl $t0, 0(inbuf) # copy 8 bytes speculatively; only len are kept
ldr $t0, 7(inbuf)
add inbuf, len
sdl $t0, 0(outbuf)
sdr $t0, 7(outbuf)
sll nibbles, 4 # consume the nibble
beq len, match_lim, .Lloop # a 7-byte run has no implicit match after it
add outbuf, len
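# Every literal run is immediately followed by a match, except an exactly
# 7-byte short run, which returns to the main loop instead. The match
# length nibble decoded here uses the full 0-15 range (15 selects an
# extended length).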
.Lmatches_ex:
sub $t0, inbuf, dma_ptr # check if we need to wait for dma
bgezal $t0, dma_async_ctx_read # if inbuf >= dma_ptr, wait for dma
move $a0, dma_ctx
bnez nibbles, .Lprocess_ex_match_nibble
li match_lim, 15 # the full 0-15 nibble range is a match length here
.Lload_nibbles2:
lwl nibbles, 0(inbuf)
lwr nibbles, 3(inbuf)
beqz nibbles, .Lend
add inbuf, 4
.Lprocess_ex_match_nibble:
srl len, nibbles, 28
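# Match token: a 2-byte big-endian offset, plus an optional signed
# extension byte when the length nibble hit match_lim. In combo mode the
# offset's top nibble is not part of the distance; it is spliced back into
# the control word and becomes the next token's nibble.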
.Lmatches:
lwl match_combo, 0(inbuf) # offset halfword plus potential extension byte
lwr match_combo, 3(inbuf)
addiu inbuf, 2 # only the two offset bytes are consumed for now
srl match_off, match_combo, 16
beqz match_combo_mask, .Lfull_offset
sll nibbles, 4 # consume the match nibble (delay slot: runs on both paths)
srl nibbles, 4 # combo mode: open a slot at the top of the control word
and off_nibble, match_combo, match_combo_mask # nibble hidden in the offset field
or nibbles, off_nibble # it becomes the next control nibble
andi match_off, 0xfff # the distance keeps only 12 bits
.Lfull_offset:
bne len, match_lim, .Lmatch
addu match_len, len, match_min # match_len = nibble + header base
# len is the sign-extended extension byte, match_combo[15:8]
sll match_combo, 16
sra len, match_combo, 24
add inbuf, 1 # consume the extension byte
bltzal len, .Lread_large_amount # negative: more varint bytes follow
andi len, 0x7f
add match_len, len
.Lmatch:
ble match_off, match_len, .Lmatch1_loop # possible overlap: use the byte copy
sub v0_st, outbuf, match_off # match source (the loads apply a -1 bias)
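# Copy strategy: when the source cannot overlap the bytes still being
# written (offset greater than length), copy 8 bytes per iteration and
# roll outbuf back by the overshoot; otherwise fall back to byte-by-byte
# copies so matches that reread freshly written bytes expand correctly.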
.Lmatch8_loop: # 8-byte copy loop
ldl $t0, -1(v0_st) # load 8 bytes
ldr $t0, 6(v0_st)
addiu v0_st, 8
sdl $t0, 0(outbuf) # store 8 bytes
sdr $t0, 7(outbuf)
addiu match_len, -8
bgtz match_len, .Lmatch8_loop # loop while bytes remain
addiu outbuf, 8
b .Lloop # back to the main loop
addu outbuf, match_len # match_len is negative here: roll back the overshoot
.Lmatch1_loop: # 1-byte copy loop
lbu $t0, -1(v0_st) # load 1 byte
addiu v0_st, 1
sb $t0, 0(outbuf) # store 1 byte
addiu match_len, -1
bgtz match_len, .Lmatch1_loop # loop while bytes remain
addiu outbuf, 1
b .Lloop # jump to main loop
nop
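# Extended literal runs: a signed byte (varint-continued while negative)
# encodes the run length minus 22; the bytes are then streamed in 8-byte
# chunks, waiting on the DMA cursor before each load.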
.Llarge_literals:
lb len, 0(inbuf) # signed length byte; negative means varint continuation
add inbuf, 1
bltzal len, .Lread_large_amount
andi len, 0x7f
move v0_st, inbuf # store start of literals into v0_st
addiu len, 22 # extended runs encode the length minus 22
add inbuf, len # advance inbuf to end of literals
.Lcopy_lit:
sub $t0, v0_st, dma_ptr # check if all the literals have been DMA'd
bgezal $t0, dma_async_ctx_read # if not, wait for DMA
move $a0, dma_ctx
ldl $t0, 0(v0_st) # load 8 bytes of literals
ldr $t0, 7(v0_st)
addiu v0_st, 8
sdl $t0, 0(outbuf) # store 8 bytes of literals
sdr $t0, 7(outbuf)
addiu len, -8
bgez len, .Lcopy_lit # loop while literal bytes remain
addiu outbuf, 8
addu outbuf, len # len is negative here: roll back the extra copied bytes
b .Lmatches_ex
sll nibbles, 4
.Lend:
lw $ra, 0x14($sp)
lw $s0, 0x18($sp)
lw $s1, 0x1c($sp)
lw $s2, 0x20($sp)
lw $s3, 0x24($sp)
lw $s4, 0x28($sp)
lw $s5, 0x2C($sp)
lw $s6, 0x30($sp)
lw $s7, 0x34($sp)
lw $s8, 0x38($sp)
jr $ra
addiu $sp, $sp, 0x40
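# Shared length extension, a little-endian base-128 varint: the caller has
# already taken the low 7 bits, each further byte contributes 7 more, and
# a set sign bit flags another byte to come.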
.Lread_large_amount:
li shift, 7 # the caller already took the low 7 bits
.Lread_large_amount_loop:
lb $t0, 0(inbuf) # next varint byte
add inbuf, 1
andi $t1, $t0, 0x7f # 7 payload bits per byte
sllv $t1, $t1, shift
or len, $t1
bltz $t0, .Lread_large_amount_loop # sign bit set: another byte follows
add shift, 7
jr $ra
nop
.endfunc