##############################################
# LZ4T - Fast decompressor in assembly
# Written by aglab2, inspired by Rasky's LZ4 and devwizard's YAZ0
##############################################

# NOTE: to optimize for speed, this decompressor can write up to 8 bytes
# past the end of the output buffer. The output buffer must have been sized
# accordingly to accommodate this.
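#
# A minimal caller-side sketch of that requirement (hypothetical C; the
# prototype, names and allocator are assumptions, not part of this file):
#
#     u8 *dst = alloc(uncompressed_size + 8);   /* +8 bytes of over-write slack */
#     lz4t_unpack_fast(compressed, dst, &dma_ctx);
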
#define MINMATCH 4

#define inbuf $s0             # read cursor into the compressed stream
#define nibbles $s1           # queue of control nibbles, current one in the top 4 bits
#define outbuf $s2            # write cursor into the output buffer
#define match_combo_mask $s3  # offset-nibble mask built from header byte 8 (0 = full 16-bit offsets)
#define len $s4               # length field of the current token
#define match_lim $s5         # maximum value of the current length nibble (7 in the main loop, 15 after literals)
#define match_min $s6         # minimum match length, from header byte 9
#define v0_st $s7             # scratch source pointer for match/literal copies

#define dma_ctx $s8           # DMA context passed through to dma_async_ctx_read
#define dma_ptr $v0           # end of the input DMA'd so far (returned by dma_async_ctx_read)

#define shift $t9             # bit position while decoding multi-byte lengths
#define len_add $t8           # (unused below)
#define match_len $t7         # total match length in bytes
#define match_off $t6         # match offset field (copy source is outbuf - match_off - 1)
#define match_combo $t5       # raw 32-bit window holding the match offset field
#define off_nibble $t4        # control nibble packed into the top of a 12-bit offset

.section .text.lz4t_unpack_fast
.p2align 5
.globl lz4t_unpack_fast
.func lz4t_unpack_fast
.set at
.set noreorder

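# Interface, as used by the code below (argument roles inferred from register
# usage; the exact C prototype is an assumption, not part of this file):
#   $a0 - compressed data, beginning with a 16-byte header:
#           byte 8    offset-nibble mask source (zero selects full 16-bit offsets)
#           byte 9    minimum match length added to every match
#           word 12   first word of control nibbles
#           byte 16   start of the token/literal stream
#   $a1 - destination buffer, sized with the 8-byte over-write slack noted above
#   $a2 - DMA context; dma_async_ctx_read($a2) appears to return in $v0 the
#         address up to which the input has been made available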
lz4t_unpack_fast:
    addiu $sp, $sp, -0x40
    sw $ra, 0x14($sp)
    sw $s0, 0x18($sp)
    sw $s1, 0x1c($sp)
    sw $s2, 0x20($sp)
    sw $s3, 0x24($sp)
    sw $s4, 0x28($sp)
    sw $s5, 0x2C($sp)
    sw $s6, 0x30($sp)
    sw $s7, 0x34($sp)
    sw $s8, 0x38($sp)

    move $s0, $a0                 # inbuf = start of compressed data
    lw $s1, 12($a0)               # preload first control-nibble word from the header
    move $s2, $a1                 # outbuf = destination buffer
    move dma_ctx, $a2
    lbu match_combo_mask, 8($a0)  # header byte 8: offset-nibble mask source
    sll match_combo_mask, 28      # move it into the top nibble (0 = full 16-bit offsets)
    lbu match_min, 9($a0)         # header byte 9: minimum match length

    move dma_ptr, $a0             # dma_ptr starts at the buffer base, forcing an initial DMA wait
    addiu $s0, 16                 # skip the 16-byte header

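# Control nibbles drive the decoder (layout inferred from the code below).
# Each refill pulls 4 bytes from the stream and consumes them one nibble at a
# time, most significant first; a refill word of zero terminates the stream.
#   - nibble with the high bit set (8..15): literal run; the low 3 bits are the
#     count (0 escapes to a byte-encoded count). Counts 0..6 are followed
#     immediately by a match token; a count of 7 returns to this loop.
#   - nibble with the high bit clear (0..7): match; the nibble is the length
#     field added to match_min (7 escapes to an extra length byte).
# After a literal run the next nibble is always a match length and may use the
# full 0..15 range (see .Lmatches_ex).
# For illustration (assumed values): nibble 0xA copies 2 literal bytes and then
# decodes a match; nibble 0x3 decodes a match of 3 + match_min bytes.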
.Lloop:
    sub $t0, inbuf, dma_ptr            # check if we need to wait for dma
    bgezal $t0, dma_async_ctx_read     # if inbuf >= dma_ptr, wait for dma
    move $a0, dma_ctx

    bnez nibbles, .Lprocess_nibbles    # still have control nibbles queued?
    li match_lim, 7                    # length-field escape value for this pass

.Lload_nibbles:
    lwl nibbles, 0(inbuf)              # refill: next 4 bytes of control nibbles
    lwr nibbles, 3(inbuf)
    beqz nibbles, .Lend                # a zero control word ends the stream
    add inbuf, 4

.Lprocess_nibbles:
    bgez nibbles, .Lmatches            # high bit clear: the nibble is a match length
    srl len, nibbles, 28               # (delay slot) len = current control nibble

.Lliterals:
    andi len, 7                        # literal count is the low 3 bits of the nibble
    beqz len, .Llarge_literals         # 0 escapes to a byte-encoded literal count
    nop

.Lsmall_literal:
    ldl $t0, 0(inbuf)                  # copy 8 bytes unconditionally (may over-copy)
    ldr $t0, 7(inbuf)
    add inbuf, len                     # but only advance by the real literal count
    sdl $t0, 0(outbuf)
    sdr $t0, 7(outbuf)
    sll nibbles, 4                     # consume the literal nibble
    beq len, match_lim, .Lloop         # a count of 7 is not followed by an inline match
    add outbuf, len

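# The unconditional 8-byte ldl/ldr + sdl/sdr copies above (and in the match and
# literal loops below) are why the output buffer needs the 8 bytes of slack
# described in the NOTE at the top of this file.
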
.Lmatches_ex:                          # a match that immediately follows literals
    sub $t0, inbuf, dma_ptr            # check if we need to wait for dma
    bgezal $t0, dma_async_ctx_read     # if inbuf >= dma_ptr, wait for dma
    move $a0, dma_ctx

    bnez nibbles, .Lprocess_ex_match_nibble
    li match_lim, 15                   # here the full nibble (0..15) is a match length

.Lload_nibbles2:
    lwl nibbles, 0(inbuf)              # refill control nibbles
    lwr nibbles, 3(inbuf)
    beqz nibbles, .Lend                # a zero control word ends the stream
    add inbuf, 4

.Lprocess_ex_match_nibble:
    srl len, nibbles, 28               # len = current control nibble

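# Match token, as decoded below (field meanings inferred from the code):
# two bytes hold the offset field. With a non-zero offset-nibble mask only the
# low 12 bits are the offset and the top 4 bits become the next control nibble;
# otherwise all 16 bits are the offset. The copy source is
# outbuf - match_off - 1, i.e. the stored offset appears to be biased by one.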
.Lmatches:
    lwl match_combo, 0(inbuf)                  # read the 16-bit offset field (plus trailing bytes)
    lwr match_combo, 3(inbuf)
    addiu inbuf, 2
    srl match_off, match_combo, 16             # offset = top 2 bytes

    beqz match_combo_mask, .Lfull_offset       # 16-bit offsets: nothing packed in the top nibble
    sll nibbles, 4                             # (delay slot) consume the match nibble (16-bit mode)
    srl nibbles, 4                             # 12-bit mode: undo the shift but clear the top nibble
    and off_nibble, match_combo, match_combo_mask
    or nibbles, off_nibble                     # replace it with the nibble packed in the offset field
    andi match_off, 0xfff                      # keep only the 12-bit offset
.Lfull_offset:

    bne len, match_lim, .Lmatch            # short match: length fits in the nibble
    addu match_len, len, match_min

    # len == match_lim: an extended length byte follows the 2-byte offset.
    # len is the sign-extended byte match_combo[8:15]; its low 7 bits are the
    # extra length and a set high bit means more length bytes follow.
    sll match_combo, 16
    sra len, match_combo, 24
    add inbuf, 1
    bltzal len, .Lread_large_amount        # high bit set: keep reading length bytes
    andi len, 0x7f                         # (delay slot) keep the low 7 bits

    add match_len, len

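    # For illustration (values assumed, not from any real stream): with
    # match_min = 4, a main-loop nibble of 7 followed by an extra byte 0x0A
    # decodes to match_len = 7 + 4 + 10 = 21.
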
.Lmatch:
    ble match_off, match_len, .Lmatch1_loop # check if we can do 8-byte copy (source may overlap the output)
    sub v0_st, outbuf, match_off            # calculate start of match (source byte is v0_st - 1)
.Lmatch8_loop:                              # 8-byte copy loop
    ldl $t0, -1(v0_st)                      # load 8 bytes
    ldr $t0, 6(v0_st)
    addiu v0_st, 8
    sdl $t0, 0(outbuf)                      # store 8 bytes
    sdr $t0, 7(outbuf)
    addiu match_len, -8
    bgtz match_len, .Lmatch8_loop           # loop while bytes remain to copy
    addiu outbuf, 8
    b .Lloop                                # jump to main loop
    addu outbuf, match_len                  # roll back the over-copied bytes (match_len <= 0 here)

.Lmatch1_loop:                              # 1-byte copy loop (handles overlapping matches)
    lbu $t0, -1(v0_st)                      # load 1 byte
    addiu v0_st, 1
    sb $t0, 0(outbuf)                       # store 1 byte
    addiu match_len, -1
    bgtz match_len, .Lmatch1_loop           # loop while bytes remain to copy
    addiu outbuf, 1
    b .Lloop                                # jump to main loop
    nop

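# Large literal runs: the count byte(s) use the same 7-bit continuation scheme
# as the match lengths, and the run length is the decoded value + 22.
# For illustration (byte assumed): a single count byte of 0x03 encodes
# 3 + 22 = 25 literal bytes.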
.Llarge_literals:
    lb len, 0(inbuf)
    add inbuf, 1
    bltzal len, .Lread_large_amount        # high bit set: more length bytes follow
    andi len, 0x7f                         # (delay slot) keep the low 7 bits

    move v0_st, inbuf                      # store start of literals into v0_st
    addiu len, 22                          # actual run length is the decoded value + 22
    add inbuf, len                         # advance inbuf to end of literals
.Lcopy_lit:
    sub $t0, v0_st, dma_ptr            # check if all the literals have been DMA'd
    bgezal $t0, dma_async_ctx_read     # if not, wait for DMA
    move $a0, dma_ctx
    ldl $t0, 0(v0_st)                  # load 8 bytes of literals
    ldr $t0, 7(v0_st)
    addiu v0_st, 8
    sdl $t0, 0(outbuf)                 # store 8 bytes of literals
    sdr $t0, 7(outbuf)
    addiu len, -8
    bgez len, .Lcopy_lit               # check if we went past the end of literals
    addiu outbuf, 8
    addu outbuf, len                   # adjust outbuf to roll back extra copied bytes

    b .Lmatches_ex                     # a large literal run is always followed by a match
    sll nibbles, 4                     # (delay slot) consume the literal nibble

.Lend:
    lw $ra, 0x14($sp)
    lw $s0, 0x18($sp)
    lw $s1, 0x1c($sp)
    lw $s2, 0x20($sp)
    lw $s3, 0x24($sp)
    lw $s4, 0x28($sp)
    lw $s5, 0x2C($sp)
    lw $s6, 0x30($sp)
    lw $s7, 0x34($sp)
    lw $s8, 0x38($sp)
    jr $ra
    addiu $sp, $sp, 0x40

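# Lengths too large for one byte continue here, 7 bits per extra byte.
# For illustration (bytes assumed): a first length byte of 0x85 followed by
# 0x02 decodes to (0x85 & 0x7f) | (0x02 << 7) = 261.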
.Lread_large_amount:
    li shift, 7
.Lread_large_amount_loop:
    lb $t0, 0(inbuf)
    add inbuf, 1
    andi $t1, $t0, 0x7f                # low 7 bits are the payload
    sllv $t1, $t1, shift
    or len, $t1                        # merge into the length decoded so far
    bltz $t0, .Lread_large_amount_loop # continuation bit set: keep reading
    add shift, 7
    jr $ra
    nop

.endfunc