##############################################
# LZ4T - Fast decompressor in assembly
# Written by aglab2, inspired by Rasky's LZ4 and devwizard's YAZ0
##############################################

# NOTE: to optimize for speed, this decompressor can write up to 8 bytes
# past the end of the output buffer. The output buffer must have been sized
# accordingly to accommodate this (see the caller sketch at the end of
# this file).

#define MINMATCH 4

#define inbuf            $s0
#define nibbles          $s1
#define outbuf           $s2
#define match_combo_mask $s3
#define len              $s4
#define match_lim        $s5
#define match_min        $s6
#define v0_st            $s7
#define dma_ctx          $s8
#define dma_ptr          $v0

#define shift            $t9
#define len_add          $t8
#define match_len        $t7
#define match_off        $t6
#define match_combo      $t5
#define off_nibble       $t4

.section .text.lz4t_unpack_fast
.p2align 5
.globl lz4t_unpack_fast
.func lz4t_unpack_fast
.set at
.set noreorder

lz4t_unpack_fast:
    # save callee-saved registers
    addiu $sp, $sp, -0x40
    sw $ra, 0x14($sp)
    sw $s0, 0x18($sp)
    sw $s1, 0x1c($sp)
    sw $s2, 0x20($sp)
    sw $s3, 0x24($sp)
    sw $s4, 0x28($sp)
    sw $s5, 0x2C($sp)
    sw $s6, 0x30($sp)
    sw $s7, 0x34($sp)
    sw $s8, 0x38($sp)

    move inbuf, $a0                  # a0 = compressed data (header + stream)
    lw nibbles, 12($a0)              # preload first control word (header offset 12)
    move outbuf, $a1                 # a1 = output buffer
    move dma_ctx, $a2                # a2 = DMA context for dma_async_ctx_read
    lbu match_combo_mask, 8($a0)     # header byte 8: extra offset-nibble mask...
    sll match_combo_mask, 28         # ...moved into the top-nibble position
    lbu match_min, 9($a0)            # header byte 9: minimum match length
    move dma_ptr, $a0                # DMA progress pointer starts at the buffer base
    addiu inbuf, 16                  # skip the 16-byte header

.Lloop:
    sub $t0, inbuf, dma_ptr          # check if we need to wait for dma
    bgezal $t0, dma_async_ctx_read   # if inbuf >= dma_ptr, wait for dma
     move $a0, dma_ctx
    bnez nibbles, .Lprocess_nibbles  # control nibbles left in the current word?
     li match_lim, 7                 # small tokens carry 3 length bits (limit 7)

.Lload_nibbles:
    lwl nibbles, 0(inbuf)            # refill the 32-bit control word (unaligned)
    lwr nibbles, 3(inbuf)
    beqz nibbles, .Lend              # an all-zero control word ends the stream
     add inbuf, 4

.Lprocess_nibbles:
    bgez nibbles, .Lmatches          # top bit clear: match token
     srl len, nibbles, 28            # len = current control nibble

.Lliterals:
    andi len, 7                      # literal length = low 3 bits of the nibble
    beqz len, .Llarge_literals       # 0 selects an extended literal run
     nop

.Lsmall_literal:
    ldl $t0, 0(inbuf)                # copy 8 literal bytes at once; only the
    ldr $t0, 7(inbuf)                # first `len` of them count (may overshoot)
    add inbuf, len
    sdl $t0, 0(outbuf)
    sdr $t0, 7(outbuf)
    sll nibbles, 4                   # consume the nibble
    beq len, match_lim, .Lloop       # len == 7: back to the generic token loop
     add outbuf, len
    # fall through: a short literal run is always followed by a match token

.Lmatches_ex:
    sub $t0, inbuf, dma_ptr          # check if we need to wait for dma
    bgezal $t0, dma_async_ctx_read   # if inbuf >= dma_ptr, wait for dma
     move $a0, dma_ctx
    bnez nibbles, .Lprocess_ex_match_nibble
     li match_lim, 15                # match tokens here use the full nibble (limit 15)

.Lload_nibbles2:
    lwl nibbles, 0(inbuf)            # refill the 32-bit control word (unaligned)
    lwr nibbles, 3(inbuf)
    beqz nibbles, .Lend              # an all-zero control word ends the stream
     add inbuf, 4

.Lprocess_ex_match_nibble:
    srl len, nibbles, 28             # len = current control nibble

.Lmatches:
    lwl match_combo, 0(inbuf)        # load 16-bit offset plus the two bytes
    lwr match_combo, 3(inbuf)        # that may follow it (unaligned)
    addiu inbuf, 2                   # consume the two offset bytes
    srl match_off, match_combo, 16   # match offset = top 16 bits
    beqz match_combo_mask, .Lfull_offset  # does the format recycle offset bits?
     sll nibbles, 4                  # consume the nibble
    srl nibbles, 4                   # clear the top nibble slot...
    and off_nibble, match_combo, match_combo_mask
    or nibbles, off_nibble           # ...and inject offset[15:12] as the next control nibble
    andi match_off, 0xfff            # offset is only 12 bits in this mode

.Lfull_offset:
    bne len, match_lim, .Lmatch      # len < limit: length fits in the nibble
     addu match_len, len, match_min  # apply the minimum match length bias
    # extended length: len = sign-extended match_combo[15:8]
    sll match_combo, 16
    sra len, match_combo, 24
    add inbuf, 1                     # consume the extra length byte
    bltzal len, .Lread_large_amount  # bit 7 set: more length bytes follow
     andi len, 0x7f
    add match_len, len

.Lmatch:
    ble match_off, match_len, .Lmatch1_loop  # check if we can do 8-byte copy
     sub v0_st, outbuf, match_off    # calculate start of match

.Lmatch8_loop:                       # 8-byte copy loop
    ldl $t0, -1(v0_st)               # load 8 bytes
    ldr $t0, 6(v0_st)
    addiu v0_st, 8
    sdl $t0, 0(outbuf)               # store 8 bytes
    sdr $t0, 7(outbuf)
    addiu match_len, -8
    bgtz match_len, .Lmatch8_loop    # loop until match_len is exhausted
     addiu outbuf, 8
    b .Lloop                         # jump to main loop
     addu outbuf, match_len          # adjust pointer to drop the extra copied bytes

.Lmatch1_loop:                       # 1-byte copy loop
    lbu $t0, -1(v0_st)               # load 1 byte
    addiu v0_st, 1
    sb $t0, 0(outbuf)                # store 1 byte
    addiu match_len, -1
    bgtz match_len, .Lmatch1_loop    # loop until match_len is exhausted
     addiu outbuf, 1
    b .Lloop                         # jump to main loop
     nop

.Llarge_literals:
    lb len, 0(inbuf)                 # extended literal length byte
    add inbuf, 1
    bltzal len, .Lread_large_amount  # bit 7 set: more length bytes follow
     andi len, 0x7f
    move v0_st, inbuf                # store start of literals into v0_st
    addiu len, 22                    # apply the format's literal length bias
    add inbuf, len                   # advance inbuf to end of literals

.Lcopy_lit:
    sub $t0, v0_st, dma_ptr          # check if all the literals have been DMA'd
    bgezal $t0, dma_async_ctx_read   # if not, wait for DMA
     move $a0, dma_ctx
    ldl $t0, 0(v0_st)                # load 8 bytes of literals
    ldr $t0, 7(v0_st)
    addiu v0_st, 8
    sdl $t0, 0(outbuf)               # store 8 bytes of literals
    sdr $t0, 7(outbuf)
    addiu len, -8
    bgez len, .Lcopy_lit             # check if we went past the end of literals
     addiu outbuf, 8
    addu outbuf, len                 # adjust outbuf to roll back extra copied bytes
    b .Lmatches_ex                   # a literal run is always followed by a match token
     sll nibbles, 4                  # consume the nibble

.Lend:
    # restore callee-saved registers and return
    lw $ra, 0x14($sp)
    lw $s0, 0x18($sp)
    lw $s1, 0x1c($sp)
    lw $s2, 0x20($sp)
    lw $s3, 0x24($sp)
    lw $s4, 0x28($sp)
    lw $s5, 0x2C($sp)
    lw $s6, 0x30($sp)
    lw $s7, 0x34($sp)
    lw $s8, 0x38($sp)
    jr $ra
     addiu $sp, $sp, 0x40

# Varint-style length continuation: while bit 7 of the previous byte is set,
# each following byte contributes 7 more bits to len (shifted by 7, 14, 21, ...).
# Entered via bltzal, so it returns with jr $ra to the main flow.
.Lread_large_amount:
    li shift, 7
.Lread_large_amount_loop:
    lb $t0, 0(inbuf)                 # read the next length byte
    add inbuf, 1
    andi $t1, $t0, 0x7f              # keep its low 7 bits...
    sllv $t1, $t1, shift
    or len, $t1                      # ...and merge them into len
    bltz $t0, .Lread_large_amount_loop  # bit 7 set: keep reading
     add shift, 7
    jr $ra
     nop

.endfunc
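
##############################################
# Caller sketch (illustrative only, not part of the decompressor): a minimal
# C caller honoring the NOTE at the top of this file. The names
# `compressed`, `decompressed_size`, and `dma_ctx` are assumptions for
# illustration; the real header layout and DMA plumbing come from the
# surrounding codebase.
#
#   /* Assumed prototype, matching the a0/a1/a2 usage above: */
#   void lz4t_unpack_fast(const void *src, void *dst, void *dma_ctx);
#
#   /* The destination needs 8 bytes of slack because the sdl/sdr stores in
#      this routine can write up to 8 bytes past the last output byte. */
#   uint8_t *dst = malloc(decompressed_size + 8);
#   lz4t_unpack_fast(compressed, dst, &dma_ctx);
##############################################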