Add aglab2 lz4t

This commit is contained in:
a
2025-06-23 12:17:38 -04:00
parent a527dc78e8
commit 74cf4e1ad4
18 changed files with 7303 additions and 123 deletions

View File

@@ -1,104 +1,216 @@
// assembler directives
.set noat // allow manual use of $at
.set noreorder // don't insert nops after branches
##############################################
# LZ4T - Fast decompressor in assembly
# Written by aglab2 inspired by Rasky LZ4 and devwizard YAZ0
##############################################
#include "macros.inc"
# NOTE: to optimize for speed, this decompressor can write up to 8 bytes
# after the end of the output buffer. The outut buffer must have been sized
# accordingly to accomodate for this.
#define MINMATCH 4
.section .text, "ax"
#define inbuf $s0
#define nibbles $s1
#define outbuf $s2
#define match_combo_mask $s3
#define len $s4
#define match_lim $s5
#define match_min $s6
#define v0_st $s7
// This file is handwritten.
#define dma_ctx $s8
#define dma_ptr $v0
glabel decompress
#if !defined(VERSION_JP) && !defined(VERSION_US)
lw $a3, 8($a0)
lw $t9, 0xc($a0)
lw $t8, 4($a0)
add $a3, $a3, $a0
add $t9, $t9, $a0
or $a2, $zero, $zero
addi $a0, $a0, 0x10
add $t8, $t8, $a1
.L8026ED80:
bnezl $a2, .L8026ED98
slt $t1, $t0, $zero
lw $t0, ($a0)
li $a2, 32
addi $a0, $a0, 4
slt $t1, $t0, $zero
.L8026ED98:
beql $t1, $zero, .L8026EDB8
lhu $t2, ($a3)
lb $t2, ($t9)
addi $t9, $t9, 1
addi $a1, $a1, 1
b .L8026EDE4
sb $t2, -1($a1)
lhu $t2, ($a3)
.L8026EDB8:
addi $a3, $a3, 2
srl $t3, $t2, 0xc
andi $t2, $t2, 0xfff
sub $t1, $a1, $t2
addi $t3, $t3, 3
.L8026EDCC:
lb $t2, -1($t1)
addi $t3, $t3, -1
addi $t1, $t1, 1
addi $a1, $a1, 1
bnez $t3, .L8026EDCC
sb $t2, -1($a1)
.L8026EDE4:
sll $t0, $t0, 1
bne $a1, $t8, .L8026ED80
addi $a2, $a2, -1
jr $ra
#define shift $t9
#define len_add $t8
#define match_len $t7
#define match_off $t6
#define match_combo $t5
#define off_nibble $t4
.section .text.lz4t_unpack_fast
.p2align 5
.globl lz4t_unpack_fast
.func lz4t_unpack_fast
.set at
.set noreorder
lz4t_unpack_fast:
addiu $sp, $sp, -0x40
sw $ra, 0x14($sp)
sw $s0, 0x18($sp)
sw $s1, 0x1c($sp)
sw $s2, 0x20($sp)
sw $s3, 0x24($sp)
sw $s4, 0x28($sp)
sw $s5, 0x2C($sp)
sw $s6, 0x30($sp)
sw $s7, 0x34($sp)
sw $s8, 0x38($sp)
move $s0, $a0
lw $s1, 12($a0)
move $s2, $a1
move dma_ctx, $a2
lbu match_combo_mask, 8($a0)
sll match_combo_mask, 28
lbu match_min, 9($a0)
move dma_ptr, $a0
addiu $s0, 16
.Lloop:
sub $t0, inbuf, dma_ptr # check if we need to wait for dma
bgezal $t0, dma_async_ctx_read # if inbuf >= dma_ptr, wait for dma
move $a0, dma_ctx
bnez nibbles, .Lprocess_nibbles
li match_lim, 7
.Lload_nibbles:
lwl nibbles, 0(inbuf)
lwr nibbles, 3(inbuf)
beqz nibbles, .Lend
add inbuf, 4
.Lprocess_nibbles:
bgez nibbles, .Lmatches
srl len, nibbles, 28
.Lliterals:
andi len, 7
beqz len, .Llarge_literals
nop
#else
lw $t8, 4($a0)
lw $a3, 8($a0)
lw $t9, 0xc($a0)
move $a2, $zero
add $t8, $t8, $a1
add $a3, $a3, $a0
add $t9, $t9, $a0
addi $a0, $a0, 0x10
.L8027EF50:
bnez $a2, .L8027EF64
.Lsmall_literal:
ldl $t0, 0(inbuf)
ldr $t0, 7(inbuf)
add inbuf, len
sdl $t0, 0(outbuf)
sdr $t0, 7(outbuf)
sll nibbles, 4
beq len, match_lim, .Lloop
add outbuf, len
.Lmatches_ex:
sub $t0, inbuf, dma_ptr # check if we need to wait for dma
bgezal $t0, dma_async_ctx_read # if inbuf >= dma_ptr, wait for dma
move $a0, dma_ctx
bnez nibbles, .Lprocess_ex_match_nibble
li match_lim, 15
.Lload_nibbles2:
lwl nibbles, 0(inbuf)
lwr nibbles, 3(inbuf)
beqz nibbles, .Lend
add inbuf, 4
.Lprocess_ex_match_nibble:
srl len, nibbles, 28
.Lmatches:
lwl match_combo, 0(inbuf)
lwr match_combo, 3(inbuf)
addiu inbuf, 2
srl match_off, match_combo, 16
beqz match_combo_mask, .Lfull_offset
sll nibbles, 4
srl nibbles, 4
and off_nibble, match_combo, match_combo_mask
or nibbles, off_nibble
andi match_off, 0xfff
.Lfull_offset:
bne len, match_lim, .Lmatch
addu match_len, len, match_min
# len is sign extended match_combo[8:15]
sll match_combo, 16
sra len, match_combo, 24
add inbuf, 1
bltzal len, .Lread_large_amount
andi len, 0x7f
add match_len, len
.Lmatch:
ble match_off, match_len, .Lmatch1_loop # check if we can do 8-byte copy
sub v0_st, outbuf, match_off # calculate start of match
.Lmatch8_loop: # 8-byte copy loop
ldl $t0, -1(v0_st) # load 8 bytes
ldr $t0, 6(v0_st)
addiu v0_st, 8
sdl $t0, 0(outbuf) # store 8 bytes
sdr $t0, 7(outbuf)
addiu match_len, -8
bgtz match_len, .Lmatch8_loop # check we went past match_len
addiu outbuf, 8
b .Lloop # jump to main loop
addu outbuf, match_len # adjust pointer remove extra bytes
.Lmatch1_loop: # 1-byte copy loop
lbu $t0, -1(v0_st) # load 1 byte
addiu v0_st, 1
sb $t0, 0(outbuf) # store 1 byte
addiu match_len, -1
bgtz match_len, .Lmatch1_loop # check we went past match_len
addiu outbuf, 1
b .Lloop # jump to main loop
nop
lw $t0, ($a0)
li $a2, 32
addi $a0, $a0, 4
.L8027EF64:
slt $t1, $t0, $zero
beqz $t1, .L8027EF88
nop
lb $t2, ($t9)
addi $t9, $t9, 1
sb $t2, ($a1)
addi $a1, $a1, 1
b .L8027EFBC
nop
.L8027EF88:
lhu $t2, ($a3)
addi $a3, $a3, 2
srl $t3, $t2, 0xc
andi $t2, $t2, 0xfff
sub $t1, $a1, $t2
addi $t3, $t3, 3
.L8027EFA0:
lb $t2, -1($t1)
addi $t3, $t3, -1
addi $t1, $t1, 1
sb $t2, ($a1)
addi $a1, $a1, 1
bnez $t3, .L8027EFA0
nop
.L8027EFBC:
sll $t0, $t0, 1
addi $a2, $a2, -1
bne $a1, $t8, .L8027EF50
nop
jr $ra
nop
#endif
.Llarge_literals:
lb len, 0(inbuf)
add inbuf, 1
bltzal len, .Lread_large_amount
andi len, 0x7f
move v0_st, inbuf # store start of literals into v0_st
addiu len, 22
add inbuf, len # advance inbuf to end of literals
.Lcopy_lit:
sub $t0, v0_st, dma_ptr # check if all the literals have been DMA'd
bgezal $t0, dma_async_ctx_read # if not, wait for DMA
move $a0, dma_ctx
ldl $t0, 0(v0_st) # load 8 bytes of literals
ldr $t0, 7(v0_st)
addiu v0_st, 8
sdl $t0, 0(outbuf) # store 8 bytes of literals
sdr $t0, 7(outbuf)
addiu len, -8
bgez len, .Lcopy_lit # check if we went past the end of literals
addiu outbuf, 8
addu outbuf, len # adjust outbuf to roll back extra copied bytes
b .Lmatches_ex
sll nibbles, 4
.Lend:
lw $ra, 0x14($sp)
lw $s0, 0x18($sp)
lw $s1, 0x1c($sp)
lw $s2, 0x20($sp)
lw $s3, 0x24($sp)
lw $s4, 0x28($sp)
lw $s5, 0x2C($sp)
lw $s6, 0x30($sp)
lw $s7, 0x34($sp)
lw $s8, 0x38($sp)
jr $ra
addiu $sp, $sp, 0x40
.Lread_large_amount:
li shift, 7
.Lread_large_amount_loop:
lb $t0, 0(inbuf)
add inbuf, 1
andi $t1, $t0, 0x7f
sllv $t1, $t1, shift
or len, $t1
bltz $t0, .Lread_large_amount_loop
add shift, 7
jr $ra
nop
.endfunc