// +build !appengine
// +build gc
// +build !noasm

#include "go_asm.h"
#include "textflag.h"

// AX scratch
// BX scratch
// CX literal and match lengths
// DX token, match offset
//
// DI  &dst
// SI  &src
// R8  &dst + len(dst)
// R9  &src + len(src)
// R11 &dst
// R12 short output end
// R13 short input end
// R14 &dict
// R15 len(dict)

// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOSPLIT, $48-80
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	CMPQ R9, $0
	JE err_corrupt
	ADDQ SI, R9

	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13

	XORL CX, CX

loop:
	// token := uint32(src[si])
	MOVBLZX (SI), DX
	INCQ SI

	// lit_len = token >> 4
	// if lit_len > 0
	// CX = lit_len
	MOVL DX, CX
	SHRL $4, CX

	// if lit_len != 0xF
	CMPL CX, $0xF
	JEQ lit_len_loop
	CMPQ DI, R12
	JAE copy_literal
	CMPQ SI, R13
	JAE copy_literal

	// copy shortcut

	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
	// manner; but we ensure that there's enough space in the output for
	// those 18 bytes earlier, upon entering the shortcut (in other words,
	// there is a combined check for both stages).

	// copy literal
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	MOVL DX, CX
	ANDL $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(data[:2])
	MOVWLZX (SI), DX
	TESTL DX, DX
	JE err_corrupt
	ADDQ $2, SI
	JC err_short_buf

	MOVQ DI, AX
	SUBQ DX, AX
	JC err_corrupt
	CMPQ AX, DI
	JA err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length, we already have the offset.
	CMPL CX, $0xF
	JEQ match_len_loop_pre
	CMPL DX, $8
	JLT match_len_loop_pre
	CMPQ AX, R11
	JB match_len_loop_pre

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	LEAQ const_minMatch(DI)(CX*1), DI

	// shortcut complete, load next token
	JMP loopcheck

	// Read the rest of the literal length:
	// do { BX = src[si++]; lit_len += BX } while (BX == 0xFF).
lit_len_loop:
	CMPQ SI, R9
	JAE err_short_buf

	MOVBLZX (SI), BX
	INCQ SI
	ADDQ BX, CX

	CMPB BX, $0xFF
	JE lit_len_loop

copy_literal:
	// bounds check src and dst
	MOVQ SI, AX
	ADDQ CX, AX
	JC err_short_buf
	CMPQ AX, R9
	JA err_short_buf

	MOVQ DI, BX
	ADDQ CX, BX
	JC err_short_buf
	CMPQ BX, R8
	JA err_short_buf

	// Copy literals of <=48 bytes through the XMM registers.
	CMPQ CX, $48
	JGT memmove_lit

	// if len(dst[di:]) < 48
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $48
	JLT memmove_lit

	// if len(src[si:]) < 48
	MOVQ R9, BX
	SUBQ SI, BX
	CMPQ BX, $48
	JLT memmove_lit

	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU 32(SI), X2
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)

	ADDQ CX, SI
	ADDQ CX, DI

	JMP finish_lit_copy
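	// For orientation, a rough Go sketch of the literal step handled above
	// (a non-authoritative reading aid; si, di, token and litLen mirror the
	// register comments and are not identifiers from the pure-Go decoder):
	//
	//	litLen := int(token >> 4)
	//	if litLen == 0xF {
	//		for {
	//			b := int(src[si])
	//			si++
	//			litLen += b
	//			if b != 0xFF {
	//				break
	//			}
	//		}
	//	}
	//	copy(dst[di:di+litLen], src[si:si+litLen]) // bounds-checked against R8/R9 above
	//	si += litLen
	//	di += litLen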
memmove_lit:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)

	// Spill registers. Increment SI, DI now so we don't need to save CX.
	ADDQ CX, DI
	ADDQ CX, SI
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVL DX, 40(SP)

	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVL 40(SP), DX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	// CX := mLen
	// free up DX to use for offset
	MOVL DX, CX
	ANDL $0xF, CX

	CMPQ SI, R9
	JAE end

	// offset
	// si += 2
	// DX := int(src[si-2]) | int(src[si-1])<<8
	ADDQ $2, SI
	JC err_short_buf
	CMPQ SI, R9
	JA err_short_buf

	MOVWQZX -2(SI), DX

	// 0 offset is invalid
	TESTL DX, DX
	JEQ err_corrupt

match_len_loop_pre:
	// if mlen != 0xF
	CMPB CX, $0xF
	JNE copy_match

	// do { BX = src[si++]; mlen += BX } while (BX == 0xFF).
match_len_loop:
	CMPQ SI, R9
	JAE err_short_buf

	MOVBLZX (SI), BX
	INCQ SI
	ADDQ BX, CX

	CMPB BX, $0xFF
	JE match_len_loop

copy_match:
	ADDQ $const_minMatch, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	JC err_short_buf
	CMPQ AX, R8
	JA err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	JC copy_match_from_dict
	CMPQ BX, R11
	JBE copy_match_from_dict

	// if (di-offset)+match_len < di, the match ends before di and does not
	// overlap the bytes still being written: a bulk copy is safe.
	LEAQ (BX)(CX*1), AX
	CMPQ DI, AX
	JA copy_interior_match

	// AX := len(dst[:di])
	// MOVQ DI, AX
	// SUBQ R11, AX

	// copy 16 bytes at a time
	// if di-offset < 16 copy 16-(di-offset) bytes to di
	// then do the remaining

copy_match_loop:
	// for match_len >= 0
	// dst[di] = dst[i]
	// di++
	// i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX
	JNZ copy_match_loop

	JMP loopcheck

copy_interior_match:
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	XORL CX, CX
	JMP loopcheck

copy_match_from_dict:
	// CX = match_len
	// BX = &dst + (di - offset)

	// AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
	MOVQ R11, AX
	SUBQ BX, AX

	// BX = len(dict) - dict_bytes_available
	MOVQ R15, BX
	SUBQ AX, BX
	JS err_short_dict
	ADDQ R14, BX

	// if match_len < dict_bytes_available, the match fits entirely within the
	// external dictionary: just copy it.
	CMPQ CX, AX
	JLT memmove_match

	// The match stretches over the dictionary and our block
	// 1) copy what comes from the dictionary
	// AX = dict_bytes_available = copy_size
	// BX = &dict_end - copy_size
	// CX = match_len

	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ AX, 16(SP)

	// store extra stuff we want to recover
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)

	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 16(SP), AX // copy_size
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX // match_len

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	// di += copy_size
	ADDQ AX, DI

	// 2) copy the rest from the current block
	// CX = match_len - copy_size = rest_size
	SUBQ AX, CX
	MOVQ R11, BX

	// check if we have a copy overlap
	// AX = &dst + rest_size
	MOVQ CX, AX
	ADDQ BX, AX
	// if &dst + rest_size > di, copy byte by byte
	CMPQ AX, DI

	JA copy_match_loop
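	// For orientation, a rough Go sketch of the dictionary-spanning match copy
	// above (a non-authoritative reading aid; dictAvail, start, matchLen and
	// rest mirror the register comments and are not identifiers from the
	// pure-Go decoder):
	//
	//	dictAvail := offset - di          // bytes covered by the external dictionary
	//	start := len(dict) - dictAvail    // where the match begins inside dict
	//	if matchLen < dictAvail {
	//		copy(dst[di:], dict[start:start+matchLen]) // match lies entirely in dict
	//		di += matchLen
	//	} else {
	//		copy(dst[di:], dict[start:])  // 1) the part that comes from the dictionary
	//		di += dictAvail
	//		rest := matchLen - dictAvail  // 2) the remainder comes from the current block,
	//		for i := 0; i < rest; i++ {   //    byte by byte in case it overlaps di
	//			dst[di+i] = dst[i]        //    (the assembly uses memmove when there is no overlap)
	//		}
	//		di += rest
	//	}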
memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)

	// Spill registers. Increment DI now so we don't need to save CX.
	ADDQ CX, DI
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)

	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13
	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15
	XORL CX, CX

loopcheck:
	// for si < len(src)
	CMPQ SI, R9
	JB loop

end:
	// Remaining length must be zero.
	TESTQ CX, CX
	JNE err_corrupt

	SUBQ R11, DI
	MOVQ DI, ret+72(FP)
	RET

err_corrupt:
	MOVQ $-1, ret+72(FP)
	RET

err_short_buf:
	MOVQ $-2, ret+72(FP)
	RET

err_short_dict:
	MOVQ $-3, ret+72(FP)
	RET
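// For orientation, a rough Go skeleton of the loop and return contract
// implemented by this function (a non-authoritative reading aid; the names
// are illustrative, not identifiers from the package):
//
//	di, si := 0, 0
//	for si < len(src) {
//		token := src[si]
//		si++
//		// copy literals: length token>>4, extended while the next byte is 0xFF
//		// read the 2-byte little-endian offset, which must be non-zero
//		// copy the match: length minMatch + token&0xF, extended likewise
//	}
//	return di // bytes written; -1 corrupt input, -2 short buffer, -3 short dict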