Add aglab2 lz4t

2026-01-21 10:38:08 -08:00 · 2025-06-23 12:17:38 -04:00
parent a527dc78e8
commit 74cf4e1ad4
18 changed files with 7303 additions and 123 deletions
--- a/asm/decompress.s
+++ b/asm/decompress.s
@@ -1,104 +1,216 @@
-// assembler directives
-.set noat      // allow manual use of $at
-.set noreorder // don't insert nops after branches
+##############################################
+#  LZ4T - Fast decompressor in assembly
+#  Written by aglab2 inspired by Rasky LZ4 and devwizard YAZ0
+##############################################

-#include "macros.inc"
+# NOTE: to optimize for speed, this decompressor can write up to 8 bytes
+# after the end of the output buffer. The output buffer must have been sized
+# accordingly to accommodate this.

+#define MINMATCH    4  /* NOTE(review): not referenced in the visible code; match_min is read from header byte 9 instead */

-.section .text, "ax"
+#define inbuf       $s0  /* read cursor into the compressed input */
+#define nibbles     $s1  /* pending 4-bit control codes, consumed from the top nibble down */
+#define outbuf      $s2  /* write cursor into the output buffer */
+#define match_combo_mask $s3  /* header byte 8 shifted left by 28; zero selects full 16-bit match offsets */
+#define len         $s4  /* length of the literal run or match currently being decoded */
+#define match_lim   $s5  /* nibble limit compared against len: 7 in .Lloop, 15 in .Lmatches_ex */
+#define match_min   $s6  /* header byte 9, added to every match length */
+#define v0_st       $s7  /* copy source cursor for match and large-literal copies */

-// This file is handwritten.
+#define dma_ctx     $s8  /* third argument ($a2), handed to dma_async_ctx_read in $a0 */
+#define dma_ptr     $v0  /* input address compared against the read cursor; presumably refreshed by dma_async_ctx_read through its return value (TODO confirm) */

-glabel decompress
-#if !defined(VERSION_JP) && !defined(VERSION_US)
-    lw    $a3, 8($a0)
-    lw    $t9, 0xc($a0)
-    lw    $t8, 4($a0)
-    add   $a3, $a3, $a0
-    add   $t9, $t9, $a0
-    or    $a2, $zero, $zero
-    addi  $a0, $a0, 0x10
-    add   $t8, $t8, $a1
-.L8026ED80:
-    bnezl $a2, .L8026ED98
-     slt   $t1, $t0, $zero
-    lw    $t0, ($a0)
-    li    $a2, 32
-    addi  $a0, $a0, 4
-    slt   $t1, $t0, $zero
-.L8026ED98:
-    beql  $t1, $zero, .L8026EDB8
-     lhu   $t2, ($a3)
-    lb    $t2, ($t9)
-    addi  $t9, $t9, 1
-    addi  $a1, $a1, 1
-    b     .L8026EDE4
-     sb    $t2, -1($a1)
-    lhu   $t2, ($a3)
-.L8026EDB8:
-    addi  $a3, $a3, 2
-    srl   $t3, $t2, 0xc
-    andi  $t2, $t2, 0xfff
-    sub   $t1, $a1, $t2
-    addi  $t3, $t3, 3
-.L8026EDCC:
-    lb    $t2, -1($t1)
-    addi  $t3, $t3, -1
-    addi  $t1, $t1, 1
-    addi  $a1, $a1, 1
-    bnez  $t3, .L8026EDCC
-     sb    $t2, -1($a1)
-.L8026EDE4:
-    sll   $t0, $t0, 1
-    bne   $a1, $t8, .L8026ED80
-     addi  $a2, $a2, -1
-    jr    $ra
+#define shift       $t9  /* bit position of the next 7-bit group in .Lread_large_amount */
+#define len_add     $t8  /* NOTE(review): not referenced in the visible code */
+#define match_len   $t7  /* bytes left to copy for the current match */
+#define match_off   $t6  /* match offset field; the copy source starts at outbuf - match_off - 1 */
+#define match_combo $t5  /* raw 4 bytes loaded at a match: offset halfword plus optional length byte */
+#define off_nibble  $t4  /* match_combo masked with match_combo_mask, merged into nibbles */
+
+    .section .text.lz4t_unpack_fast
+	.p2align 5                                  # align the entry point to 2^5 = 32 bytes
+    .globl lz4t_unpack_fast                     # exported symbol
+    .func lz4t_unpack_fast
+    .set at                                     # allow the assembler to use $at when expanding macros
+    .set noreorder                              # the instruction after every branch/jump is its delay slot
+
+lz4t_unpack_fast:                               # in: $a0 = compressed input (16-byte header first), $a1 = output, $a2 = value passed on to dma_async_ctx_read
+    addiu $sp, $sp, -0x40                       # open a 0x40-byte stack frame
+    sw $ra, 0x14($sp)                           # $ra is overwritten by the bgezal/bltzal links below
+    sw $s0, 0x18($sp)                           # save $s0-$s8, all of which are repurposed by the aliases above
+    sw $s1, 0x1c($sp)
+    sw $s2, 0x20($sp)
+    sw $s3, 0x24($sp)
+    sw $s4, 0x28($sp)
+    sw $s5, 0x2C($sp)
+    sw $s6, 0x30($sp)
+    sw $s7, 0x34($sp)
+    sw $s8, 0x38($sp)
+
+    move $s0, $a0                               # inbuf = start of input (advanced past the header below)
+    lw $s1, 12($a0)                             # nibbles = word at header offset 12 (first control word)
+    move $s2, $a1                               # outbuf = start of output
+    move dma_ctx, $a2                           # keep the third argument across calls
+    lbu match_combo_mask, 8($a0)                # header byte 8 ...
+    sll match_combo_mask, 28                    # ... moved into the top nibble position
+    lbu match_min, 9($a0)                       # header byte 9 = base match length
+
+    move dma_ptr, $a0                           # dma_ptr starts at the input base, so the first check in .Lloop calls dma_async_ctx_read
+    addiu $s0, 16                               # inbuf skips the 16-byte header
+
+.Lloop:                                         # main loop: decode the next control nibble
+    sub $t0, inbuf, dma_ptr                     # check if we need to wait for dma
+    bgezal $t0, dma_async_ctx_read                    # if inbuf >= dma_ptr, wait for dma
+     move $a0, dma_ctx                          # delay slot: pass dma_ctx in $a0
+
+    bnez nibbles, .Lprocess_nibbles             # control nibbles still pending: skip the reload
+     li match_lim, 7                            # delay slot (runs on both paths): limit is 7 on this path
+     
+.Lload_nibbles:
+    lwl nibbles, 0(inbuf)                       # unaligned 32-bit load of the next control word
+    lwr nibbles, 3(inbuf)
+    beqz nibbles, .Lend                         # an all-zero control word ends decompression
+    add inbuf, 4                                # delay slot: step past the 4 bytes just read
+
+.Lprocess_nibbles:
+    bgez nibbles, .Lmatches                     # top bit clear -> this nibble describes a match
+     srl len, nibbles, 28                       # delay slot: len = top nibble of nibbles
+
+.Lliterals:
+    andi len, 7                                 # drop the literal flag bit, keep the low 3 bits
+    beqz len, .Llarge_literals                  # 0 -> the length follows as bytes in the input
     nop
-#else
-    lw    $t8, 4($a0)
-    lw    $a3, 8($a0)
-    lw    $t9, 0xc($a0)
-    move  $a2, $zero
-    add   $t8, $t8, $a1
-    add   $a3, $a3, $a0
-    add   $t9, $t9, $a0
-    addi  $a0, $a0, 0x10
-.L8027EF50:
-    bnez  $a2, .L8027EF64
+
+.Lsmall_literal:                                # copy 1..7 literal bytes with one 8-byte move
+    ldl $t0, 0(inbuf)                           # unaligned 8-byte load from the input
+    ldr $t0, 7(inbuf)
+    add inbuf, len                              # consume only len input bytes
+    sdl $t0, 0(outbuf)                          # unaligned 8-byte store; bytes beyond len get overwritten by later output
+    sdr $t0, 7(outbuf)
+    sll nibbles, 4                              # consume the literal nibble
+    beq len, match_lim, .Lloop                  # len == 7 (match_lim on this path): decode the next nibble normally
+    add outbuf, len                             # delay slot: advance output by len; otherwise fall through to .Lmatches_ex
+
+.Lmatches_ex:                                   # match that follows a literal run: the nibble carries a full 4-bit length
+    sub $t0, inbuf, dma_ptr                     # check if we need to wait for dma
+    bgezal $t0, dma_async_ctx_read                    # if inbuf >= dma_ptr, wait for dma
+     move $a0, dma_ctx                          # delay slot: pass dma_ctx in $a0
+
+    bnez nibbles, .Lprocess_ex_match_nibble     # control nibbles still pending: skip the reload
+     li match_lim, 15                           # delay slot (runs on both paths): limit is 15 on this path
+     
+.Lload_nibbles2:
+    lwl nibbles, 0(inbuf)                       # unaligned 32-bit load of the next control word
+    lwr nibbles, 3(inbuf)
+    beqz nibbles, .Lend                         # an all-zero control word ends decompression
+    add inbuf, 4                                # delay slot: step past the 4 bytes just read
+
+.Lprocess_ex_match_nibble:
+    srl len, nibbles, 28                        # len = top nibble (0..15), no flag bit on this path
+
+.Lmatches:                                      # decode match offset and length; len and match_lim are set by the caller path
+    lwl match_combo, 0(inbuf)                   # unaligned 32-bit load: offset halfword plus following bytes
+    lwr match_combo, 3(inbuf)
+    addiu inbuf, 2                              # consume the two offset bytes
+    srl match_off, match_combo, 16              # match_off = upper halfword of match_combo
+
+    beqz match_combo_mask, .Lfull_offset        # mask == 0: keep all 16 offset bits
+    sll nibbles, 4                              # delay slot (runs on both paths): consume the match nibble
+    srl nibbles, 4                              # shift back so the top nibble is clear
+    and off_nibble, match_combo, match_combo_mask   # bits of match_combo selected by the mask
+    or nibbles, off_nibble                      # merge them into nibbles
+    andi match_off, 0xfff                       # offset keeps only its low 12 bits
+.Lfull_offset:
+
+    bne len, match_lim, .Lmatch                 # len below the limit: no extra length byte
+     addu match_len, len, match_min             # delay slot: match_len = len + match_min
+
+    # len is sign extended match_combo[8:15]
+    sll match_combo, 16                         # move the third loaded byte to the top ...
+    sra len, match_combo, 24                    # ... and sign-extend it into len
+    add inbuf, 1                                # consume the extra length byte
+    bltzal len, .Lread_large_amount             # sign bit set: more 7-bit length groups follow
+     andi len, 0x7f                             # delay slot: keep the low 7 bits of this byte
+
+    add match_len, len                          # add the extended length
+
+.Lmatch:                                        # copy match_len bytes from earlier output
+    ble match_off, match_len, .Lmatch1_loop     # match_off <= match_len: use the 1-byte loop, else copy 8 bytes at a time
+     sub v0_st, outbuf, match_off                 # delay slot: calculate start of match (loads below add a further -1)
+.Lmatch8_loop:                                  # 8-byte copy loop
+    ldl $t0, -1(v0_st)                             # unaligned load of 8 bytes starting at v0_st - 1
+    ldr $t0, 6(v0_st)
+    addiu v0_st, 8                              # advance source
+    sdl $t0, 0(outbuf)                          # unaligned store of 8 bytes
+    sdr $t0, 7(outbuf)
+    addiu match_len, -8                         # 8 bytes done
+    bgtz match_len, .Lmatch8_loop               # loop while bytes remain
+     addiu outbuf, 8                            # delay slot: advance destination
+    b .Lloop                                    # jump to main loop
+     addu outbuf, match_len                     # delay slot: match_len <= 0 here, rolls outbuf back over the extra bytes
+
+.Lmatch1_loop:                                  # 1-byte copy loop
+    lbu $t0, -1(v0_st)                             # load 1 byte from v0_st - 1
+    addiu v0_st, 1                              # advance source
+    sb $t0, 0(outbuf)                           # store 1 byte
+    addiu match_len, -1                         # 1 byte done
+    bgtz match_len, .Lmatch1_loop               # loop while bytes remain
+     addiu outbuf, 1                            # delay slot: advance destination
+    b .Lloop                                    # jump to main loop
     nop
-    lw    $t0, ($a0)
-    li    $a2, 32
-    addi  $a0, $a0, 4
-.L8027EF64:
-    slt   $t1, $t0, $zero
-    beqz  $t1, .L8027EF88
-     nop
-    lb    $t2, ($t9)
-    addi  $t9, $t9, 1
-    sb    $t2, ($a1)
-    addi  $a1, $a1, 1
-    b     .L8027EFBC
-     nop
-.L8027EF88:
-    lhu   $t2, ($a3)
-    addi  $a3, $a3, 2
-    srl   $t3, $t2, 0xc
-    andi  $t2, $t2, 0xfff
-    sub   $t1, $a1, $t2
-    addi  $t3, $t3, 3
-.L8027EFA0:
-    lb    $t2, -1($t1)
-    addi  $t3, $t3, -1
-    addi  $t1, $t1, 1
-    sb    $t2, ($a1)
-    addi  $a1, $a1, 1
-    bnez  $t3, .L8027EFA0
-     nop
-.L8027EFBC:
-    sll   $t0, $t0, 1
-    addi  $a2, $a2, -1
-    bne   $a1, $t8, .L8027EF50
-     nop
-    jr    $ra
-     nop
-#endif
+
+.Llarge_literals:                               # literal run whose length is stored as bytes in the input
+    lb len, 0(inbuf)                            # first length byte, sign-extended
+    add inbuf, 1                                # consume it
+    bltzal len, .Lread_large_amount             # sign bit set: more 7-bit length groups follow
+     andi len, 0x7f                             # delay slot: keep the low 7 bits of this byte
+
+    move v0_st, inbuf                            # store start of literals into v0_st
+    addiu len, 22                               # stored length is biased: actual run = value + 22
+    add inbuf, len                        # advance inbuf to end of literals
+.Lcopy_lit:
+    sub $t0, v0_st, dma_ptr                     # check if all the literals have been DMA'd
+    bgezal $t0, dma_async_ctx_read                       # if not, wait for DMA
+     move $a0, dma_ctx                          # delay slot: pass dma_ctx in $a0
+    ldl $t0, 0(v0_st)                             # load 8 bytes of literals
+    ldr $t0, 7(v0_st)
+    addiu v0_st, 8                              # advance source
+    sdl $t0, 0(outbuf)                          # store 8 bytes of literals
+    sdr $t0, 7(outbuf)
+    addiu len, -8                               # 8 bytes done
+    bgez len, .Lcopy_lit                  # check if we went past the end of literals
+     addiu outbuf, 8                            # delay slot: advance destination
+    addu outbuf, len                      # adjust outbuf to roll back extra copied bytes
+
+    b .Lmatches_ex                              # continue with the match that follows the literals
+    sll nibbles, 4                              # delay slot: consume the literal nibble
+
+.Lend:                                          # epilogue: reached when a freshly loaded control word is zero
+    lw $ra, 0x14($sp)                           # restore the return address saved in the prologue
+    lw $s0, 0x18($sp)                           # restore $s0-$s8
+    lw $s1, 0x1c($sp)
+    lw $s2, 0x20($sp)
+    lw $s3, 0x24($sp)
+    lw $s4, 0x28($sp)
+    lw $s5, 0x2C($sp)
+    lw $s6, 0x30($sp)
+    lw $s7, 0x34($sp)
+    lw $s8, 0x38($sp)
+    jr $ra                                      # return to the caller
+    addiu $sp, $sp, 0x40                        # delay slot: release the stack frame
+
+.Lread_large_amount:                            # helper reached via bltzal: len already holds the low 7 bits, $ra = resume address
+    li shift, 7                                 # the next group lands at bit 7
+.Lread_large_amount_loop:
+    lb $t0, 0(inbuf)                            # next length byte, sign-extended
+    add inbuf, 1                                # consume it
+    andi $t1, $t0, 0x7f                         # 7 payload bits
+    sllv $t1, $t1, shift                        # move them to their position
+    or len, $t1                                 # accumulate into len
+    bltz $t0, .Lread_large_amount_loop          # sign bit set: another byte follows
+    add shift, 7                                # delay slot: each further group sits 7 bits higher
+    jr $ra                                      # return to the instruction after the calling delay slot
+    nop
+
+.endfunc