// +build gc
// +build !noasm

#include "go_asm.h"
#include "textflag.h"

// Register allocation.
//
// dst and src are the write and read cursors; dstend and srcend point one
// past the end of the respective slices. dstorig keeps the initial value
// of dst so that the number of bytes written can be computed on return
// and so that matches can be classified as in-buffer or in-dictionary.
#define dst	R0
#define dstorig	R1
#define src	R2
#define dstend	R3
#define srcend	R4
#define match	R5	// Match address.
#define dictend	R6
#define token	R7
#define len	R8	// Literal and match lengths.
#define offset	R7	// Match offset; overlaps with token.
#define tmp1	R9
#define tmp2	R11
#define tmp3	R12

// func decodeBlock(dst, src, dict []byte) int
//
// Decodes the compressed block in src into dst. The input is a series of
// sequences, each consisting of
//   - a token byte: high nibble = literal length, low nibble = match length;
//     a nibble of 15 is extended by following bytes, each added to the
//     length, continuing for as long as the byte read is 255;
//   - the literal bytes;
//   - a two-byte little-endian match offset (zero is rejected as corrupt);
//   - optional match length extension bytes.
// The stored match length excludes const_minMatch. The input must end
// right after a literal run whose token has a zero match nibble.
// (NOTE(review): this looks like the LZ4 block format; confirm against
// the Go side of the package.)
//
// A match whose offset reaches back before the start of dst is read from
// the tail of dict, continuing at the start of dst if it runs off the end
// of dict.
//
// Returns the number of bytes written to dst, or -1 when src is empty or
// truncated, dst is too small, dict is too short for a match, or the
// input is otherwise malformed.
//
// Argument frame (32-bit slices are 12 bytes each):
//   dst_base +0, dst_len +4, src_base +12, src_len +16,
//   dict_base +24, dict_len +28, ret +36.
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $-4-40
	MOVW dst_base +0(FP), dst
	MOVW dst_len  +4(FP), dstend
	MOVW src_base +12(FP), src
	MOVW src_len  +16(FP), srcend

	// An empty source is an error, not an empty block.
	CMP $0, srcend
	BEQ shortSrc

	// Turn the lengths into end pointers.
	ADD dst, dstend
	ADD src, srcend

	MOVW dst, dstorig

loop:
	// Read token. Extract literal length.
	MOVBU.P 1(src), token
	MOVW    token >> 4, len
	CMP     $15, len
	BNE     readLitlenDone

readLitlenLoop:
	// Extended literal length: keep adding bytes while they equal 255.
	CMP     src, srcend
	BEQ     shortSrc
	MOVBU.P 1(src), tmp1
	ADD.S   tmp1, len
	BVS     shortDst // Length overflowed a signed 32-bit value.
	CMP     $255, tmp1
	BEQ     readLitlenLoop

readLitlenDone:
	CMP $0, len
	BEQ copyLiteralDone

	// Bounds check dst+len and src+len.
	// The second ADD only runs if the first did not carry, so C is set
	// afterwards if either addition wrapped around. Likewise the second
	// CMP only runs if dst+len <= dstend, so HI holds afterwards if
	// either end pointer was exceeded.
	ADD.S    dst, len, tmp1
	ADD.CC.S src, len, tmp2
	BCS      shortSrc
	CMP      dstend, tmp1
	//BHI    shortDst // Uncomment for distinct error codes.
	CMP.LS   srcend, tmp2
	BHI      shortSrc

	// Copy literal.
	CMP $4, len
	BLO copyLiteralFinish

	// Copy 0-3 bytes until src is aligned.
	TST        $1, src
	MOVBU.NE.P 1(src), tmp1
	MOVB.NE.P  tmp1, 1(dst)
	SUB.NE     $1, len

	// src is now 2-byte aligned, so a halfword load is safe here.
	TST        $2, src
	MOVHU.NE.P 2(src), tmp2
	MOVB.NE.P  tmp2, 1(dst)
	MOVW.NE    tmp2 >> 8, tmp1
	MOVB.NE.P  tmp1, 1(dst)
	SUB.NE     $2, len

	B copyLiteralLoopCond

copyLiteralLoop:
	// Aligned load, unaligned write.
	MOVW.P 4(src), tmp1
	MOVW   tmp1 >>  8, tmp2
	MOVB   tmp2, 1(dst)
	MOVW   tmp1 >> 16, tmp3
	MOVB   tmp3, 2(dst)
	MOVW   tmp1 >> 24, tmp2
	MOVB   tmp2, 3(dst)
	MOVB.P tmp1, 4(dst)
copyLiteralLoopCond:
	// Loop until len-4 < 0.
	SUB.S  $4, len
	BPL    copyLiteralLoop

copyLiteralFinish:
	// Copy remaining 0-3 bytes.
	// At this point, len may be < 0, but len&3 is still accurate.
	TST       $1, len
	MOVB.NE.P 1(src), tmp3
	MOVB.NE.P tmp3, 1(dst)
	// For the two-byte case, advance both cursors by two with the first
	// byte and then copy the second byte relative to the new cursors.
	TST       $2, len
	MOVB.NE.P 2(src), tmp1
	MOVB.NE.P tmp1, 2(dst)
	MOVB.NE   -1(src), tmp2
	MOVB.NE   tmp2, -1(dst)

copyLiteralDone:
	// Initial part of match length.
	// This frees up the token register for reuse as offset.
	AND $15, token, len

	// Input exhausted after the literals: this was the final sequence.
	CMP src, srcend
	BEQ end

	// Read offset.
	ADD.S $2, src
	BCS   shortSrc
	CMP   srcend, src
	BHI   shortSrc
	MOVBU -2(src), offset
	MOVBU -1(src), tmp1
	ORR.S tmp1 << 8, offset
	BEQ   corrupt // A zero offset is invalid.

	// Read rest of match length.
	CMP $15, len
	BNE readMatchlenDone

readMatchlenLoop:
	CMP     src, srcend
	BEQ     shortSrc
	MOVBU.P 1(src), tmp1
	ADD.S   tmp1, len
	BVS     shortDst // Length overflowed a signed 32-bit value.
	CMP     $255, tmp1
	BEQ     readMatchlenLoop

readMatchlenDone:
	// Bounds check dst+len+minMatch.
	ADD.S    dst, len, tmp1
	ADD.CC.S $const_minMatch, tmp1
	BCS      shortDst
	CMP      dstend, tmp1
	BHI      shortDst

	// The match starts at dst-offset. It lies entirely within the output
	// written so far iff offset <= dst-dstorig. Compare these two byte
	// counts as unsigned values instead of comparing the addresses
	// match and dstorig with a signed condition: addresses at or above
	// 0x80000000 are negative when interpreted as signed, which would
	// misclassify an in-buffer match as a dictionary match whenever dst
	// straddles that boundary.
	RSB dst, offset, match  // match = dst - offset
	RSB dst, dstorig, tmp1  // tmp1 = dst - dstorig
	CMP offset, tmp1
	BHS copyMatch4

	// offset > dst-dstorig means the match starts in the dictionary,
	// at len(dict) - offset + (dst - dstorig).
	MOVW dict_base+24(FP), match
	MOVW dict_len +28(FP), dictend

	ADD $const_minMatch, len

	// tmp1 still holds dst - dstorig from the check above.
	RSB   dictend, offset, tmp2
	ADD.S tmp2, tmp1
	BMI   shortDict // The match starts before the dictionary does.
	ADD   match, dictend
	ADD   tmp1, match

copyDict:
	// Byte-at-a-time copy until the match is complete (len == 0) or the
	// end of the dictionary is reached. The CMP only runs when len != 0,
	// so Z is set on exit in either case.
	MOVBU.P 1(match), tmp1
	MOVB.P  tmp1, 1(dst)
	SUB.S   $1, len
	CMP.NE  match, dictend
	BNE     copyDict

	// If the match extends beyond the dictionary, the rest is at dstorig.
	CMP  $0, len
	BEQ  copyMatchDone
	MOVW dstorig, match
	B    copyMatch

	// Copy a regular match.
	// Since len+minMatch is at least four, we can do a 4× unrolled
	// byte copy loop. Using MOVW instead of four byte loads is faster,
	// but to remain portable we'd have to align match first, which is
	// too expensive. By alternating loads and stores, we also handle
	// the case offset < 4.
copyMatch4:
	// The flags set by SUB.S survive the byte moves below and are
	// consumed by the BPL at the end of the loop body.
	SUB.S   $4, len
	MOVBU.P 4(match), tmp1
	MOVB.P  tmp1, 4(dst)
	MOVBU   -3(match), tmp2
	MOVB    tmp2, -3(dst)
	MOVBU   -2(match), tmp3
	MOVB    tmp3, -2(dst)
	MOVBU   -1(match), tmp1
	MOVB    tmp1, -1(dst)
	BPL     copyMatch4

	// Restore len, which is now negative.
	ADD.S $4, len
	BEQ   copyMatchDone

copyMatch:
	// Finish with a byte-at-a-time copy.
	SUB.S   $1, len
	MOVBU.P 1(match), tmp2
	MOVB.P  tmp2, 1(dst)
	BNE     copyMatch

copyMatchDone:
	CMP src, srcend
	BNE loop

end:
	// The final sequence must not announce a match.
	CMP  $0, len
	BNE  corrupt
	SUB  dstorig, dst, tmp1
	MOVW tmp1, ret+36(FP)
	RET

	// The error cases have distinct labels so we can put different
	// return codes here when debugging, or if the error returns need to
	// be changed.
shortDict:
shortDst:
shortSrc:
corrupt:
	MOVW $-1, tmp1
	MOVW tmp1, ret+36(FP)
	RET