#ifndef AWS_COMMON_MATH_GCC_ARM64_ASM_INL
#define AWS_COMMON_MATH_GCC_ARM64_ASM_INL

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

/*
 * This header is already included, but include it again to make editor
 * highlighting happier.
 */
#include <aws/common/common.h>
#include <aws/common/math.h>

/* clang-format off */

AWS_EXTERN_C_BEGIN

/**
 * Multiplies a * b. If the result overflows, returns 2^64 - 1.
 */
AWS_STATIC_IMPL uint64_t aws_mul_u64_saturating(uint64_t a, uint64_t b) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a high-mul
     * and checking the upper 64 bits of a 64x64->128b multiply are zero */

    uint64_t tmp = 0, res = 0;

    __asm__("umulh %x[hmul], %x[arga], %x[argb]\n"
            "mul %x[res], %x[arga], %x[argb]\n"
            "cmp %x[hmul], #0\n"
            "csinv %x[res], %x[res], xzr, eq\n"
            : /* inout: hmul is upper 64b, res is the result */
              [hmul] "+&r"(tmp), [res] "+&r"(res)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b)
            : /* clobbers: cc (cmp clobbers condition codes) */
              "cc");

    return res;
}

/**
 * If a * b overflows, returns AWS_OP_ERR; otherwise multiplies
 * a * b, returns the result in *r, and returns AWS_OP_SUCCESS.
 */
AWS_STATIC_IMPL int aws_mul_u64_checked(uint64_t a, uint64_t b, uint64_t *r) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a high-mul
     * and checking the upper 64 bits of a 64x64->128b multiply are zero */

    uint64_t tmp, res;

    __asm__("umulh %x[hmul], %x[arga], %x[argb]\n"
            "mul %x[res], %x[arga], %x[argb]\n"
            : /* out: hmul is upper 64b, res is the result */
              [hmul] "=&r"(tmp), [res] "=&r"(res)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b));

    *r = res;

    if (tmp) {
        return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
    }
    return AWS_OP_SUCCESS;
}

/**
 * Multiplies a * b. If the result overflows, returns 2^32 - 1.
 */
AWS_STATIC_IMPL uint32_t aws_mul_u32_saturating(uint32_t a, uint32_t b) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a high-mul
     * and checking the upper 32 bits of a 32x32->64b multiply are zero */

    uint64_t res = 0;

    __asm__("umull %x[res], %w[arga], %w[argb]\n"
            "cmp xzr, %x[res], lsr #32\n"
            "csinv %w[res], %w[res], wzr, eq\n"
            : /* inout: res contains both lower/upper 32b */
              [res] "+&r"(res)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b)
            : /* clobbers: cc (cmp clobbers condition codes) */
              "cc");

    return res & 0xffffffff;
}

/**
 * If a * b overflows, returns AWS_OP_ERR; otherwise multiplies
 * a * b, returns the result in *r, and returns AWS_OP_SUCCESS.
 */
AWS_STATIC_IMPL int aws_mul_u32_checked(uint32_t a, uint32_t b, uint32_t *r) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a high-mul
     * and checking the upper 32 bits of a 32x32->64b multiply are zero */

    uint64_t res;

    __asm__("umull %x[res], %w[arga], %w[argb]\n"
            : /* out: res is both upper/lower 32b */
              [res] "=r"(res)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b));

    *r = res & 0xffffffff;

    if (res >> 32) {
        return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
    }
    return AWS_OP_SUCCESS;
}
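/*
 * For reference only: the multiply helpers above are the inline-assembly form of
 * the portable check sketched below, which widens to 128 bits and treats a
 * nonzero high half as overflow. This sketch is not part of the header; it
 * assumes the compiler provides the unsigned __int128 extension (GCC and Clang
 * do on arm64), and the *_portable_sketch names are hypothetical, used purely
 * to illustrate what the UMULH/MUL sequences compute.
 */
#if 0 /* illustrative sketch, intentionally not compiled */
static uint64_t aws_mul_u64_saturating_portable_sketch(uint64_t a, uint64_t b) {
    unsigned __int128 prod = (unsigned __int128)a * b;
    /* A nonzero upper 64 bits means the 64-bit product overflowed: saturate. */
    return (uint64_t)(prod >> 64) ? UINT64_MAX : (uint64_t)prod;
}

static int aws_mul_u64_checked_portable_sketch(uint64_t a, uint64_t b, uint64_t *r) {
    unsigned __int128 prod = (unsigned __int128)a * b;
    *r = (uint64_t)prod;
    /* Same test as the asm: the high half of the 128-bit product must be zero. */
    return (uint64_t)(prod >> 64) ? aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED) : AWS_OP_SUCCESS;
}
#endif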
/**
 * If a + b overflows, returns AWS_OP_ERR; otherwise adds
 * a + b, returns the result in *r, and returns AWS_OP_SUCCESS.
 */
AWS_STATIC_IMPL int aws_add_u64_checked(uint64_t a, uint64_t b, uint64_t *r) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a
     * 64b + 64b add and checking the carry out */

    uint64_t res, flag;

    __asm__("adds %x[res], %x[arga], %x[argb]\n"
            "csinv %x[flag], xzr, xzr, cc\n"
            : /* out: res is the result of addition; flag is -1 if carry happened */
              [res] "=&r"(res), [flag] "=r"(flag)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b)
            : /* clobbers: cc (adds sets condition codes) */
              "cc");

    *r = res;

    if (flag) {
        return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
    }
    return AWS_OP_SUCCESS;
}

/**
 * Adds a + b. If the result overflows, returns 2^64 - 1.
 */
AWS_STATIC_IMPL uint64_t aws_add_u64_saturating(uint64_t a, uint64_t b) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a
     * 64b + 64b add and checking the carry out */

    uint64_t res;

    __asm__("adds %x[res], %x[arga], %x[argb]\n"
            "csinv %x[res], %x[res], xzr, cc\n"
            : /* out: res is the result */
              [res] "=&r"(res)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b)
            : /* clobbers: cc (adds sets condition codes) */
              "cc");

    return res;
}

/**
 * If a + b overflows, returns AWS_OP_ERR; otherwise adds
 * a + b, returns the result in *r, and returns AWS_OP_SUCCESS.
 */
AWS_STATIC_IMPL int aws_add_u32_checked(uint32_t a, uint32_t b, uint32_t *r) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a
     * 32b + 32b add and checking the carry out */

    uint32_t res, flag;

    __asm__("adds %w[res], %w[arga], %w[argb]\n"
            "csinv %w[flag], wzr, wzr, cc\n"
            : /* out: res is 32b result; flag is -1 if carry happened */
              [res] "=&r"(res), [flag] "=r"(flag)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b)
            : /* clobbers: cc (adds sets condition codes) */
              "cc");

    *r = res;

    if (flag) {
        return aws_raise_error(AWS_ERROR_OVERFLOW_DETECTED);
    }
    return AWS_OP_SUCCESS;
}

/**
 * Adds a + b. If the result overflows, returns 2^32 - 1.
 */
AWS_STATIC_IMPL uint32_t aws_add_u32_saturating(uint32_t a, uint32_t b) {
    /* We can use inline assembly to do this efficiently on arm64 by doing a
     * 32b + 32b add and checking the carry out */

    uint32_t res = 0;

    __asm__("adds %w[res], %w[arga], %w[argb]\n"
            "csinv %w[res], %w[res], wzr, cc\n"
            : /* inout: res is the result */
              [res] "+&r"(res)
            : /* in: a and b */
              [arga] "r"(a), [argb] "r"(b)
            : /* clobbers: cc (adds sets condition codes) */
              "cc");

    return res;
}

AWS_EXTERN_C_END
/* clang-format on */

#endif /* AWS_COMMON_MATH_GCC_ARM64_ASM_INL */
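/*
 * For reference only: the addition helpers rely on the carry flag set by ADDS,
 * which is the inline-assembly form of the portable wrap-around test sketched
 * below (an unsigned add overflows exactly when the result is smaller than an
 * operand). This block is not part of the header; the *_portable_sketch name is
 * hypothetical and used purely for illustration.
 */
#if 0 /* illustrative sketch, intentionally not compiled */
static uint64_t aws_add_u64_saturating_portable_sketch(uint64_t a, uint64_t b) {
    uint64_t res = a + b;
    /* res < a iff the add wrapped, i.e. iff ADDS would have set the carry flag. */
    return (res < a) ? UINT64_MAX : res;
}
#endif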