diff --git a/crc/Makefile.am b/crc/Makefile.am index ecf8b00..b6faf47 100644 --- a/crc/Makefile.am +++ b/crc/Makefile.am @@ -37,6 +37,10 @@ lsrc += \ crc/crc64_multibinary.asm \ crc/crc64_ecma_refl_by8.asm \ crc/crc64_ecma_norm_by8.asm \ + crc/crc64_iso_refl_by8.asm \ + crc/crc64_iso_norm_by8.asm \ + crc/crc64_jones_refl_by8.asm \ + crc/crc64_jones_norm_by8.asm \ crc/crc64_base.c \ crc/crc_multibinary.asm \ crc/crc_base.c @@ -47,9 +51,9 @@ extern_hdrs += include/crc.h include/crc64.h other_src += include/reg_sizes.asm include/types.h include/test.h check_tests += crc/crc16_t10dif_test crc/crc32_ieee_test crc/crc32_iscsi_test \ - crc/crc64_ecma_refl_test crc/crc64_ecma_norm_test + crc/crc64_funcs_test perf_tests += crc/crc16_t10dif_perf crc/crc32_ieee_perf crc/crc32_iscsi_perf \ - crc/crc64_ecma_refl_perf crc/crc64_ecma_norm_perf + crc/crc64_funcs_perf -examples += crc/crc_simple_test +examples += crc/crc_simple_test crc/crc64_example diff --git a/crc/crc64_base.c b/crc/crc64_base.c index 1c53dfc..29166f9 100644 --- a/crc/crc64_base.c +++ b/crc/crc64_base.c @@ -34,7 +34,7 @@ // crc64_ecma baseline function // Slow crc64 from the definition. Can be sped up with a lookup table. -uint64_t crc64_ecma_refl_base(uint64_t seed, uint8_t * buf, uint64_t len) +uint64_t crc64_ecma_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len) { uint64_t rem = ~seed; unsigned int i, j; @@ -50,7 +50,7 @@ uint64_t crc64_ecma_refl_base(uint64_t seed, uint8_t * buf, uint64_t len) return ~rem; } -uint64_t crc64_ecma_norm_base(uint64_t seed, uint8_t * buf, uint64_t len) +uint64_t crc64_ecma_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len) { uint64_t rem = ~seed; unsigned int i, j; @@ -66,6 +66,74 @@ uint64_t crc64_ecma_norm_base(uint64_t seed, uint8_t * buf, uint64_t len) return ~rem; } +// crc64_iso baseline function +// Slow crc64 from the definition. Can be sped up with a lookup table. +uint64_t crc64_iso_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len) +{ + uint64_t rem = ~seed; + unsigned int i, j; + + uint64_t poly = 0xD800000000000000ULL; // ISO standard reflected + + for (i = 0; i < len; i++) { + rem = rem ^ (uint64_t) buf[i]; + for (j = 0; j < MAX_ITER; j++) { + rem = (rem & 0x1ULL ? poly : 0) ^ (rem >> 1); + } + } + return ~rem; +} + +uint64_t crc64_iso_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len) +{ + uint64_t rem = ~seed; + unsigned int i, j; + + uint64_t poly = 0x000000000000001BULL; // ISO standard + + for (i = 0; i < len; i++) { + rem = rem ^ ((uint64_t) buf[i] << 56); + for (j = 0; j < MAX_ITER; j++) { + rem = (rem & 0x8000000000000000ULL ? poly : 0) ^ (rem << 1); + } + } + return ~rem; +} + +// crc64_jones baseline function +// Slow crc64 from the definition. Can be sped up with a lookup table. +uint64_t crc64_jones_refl_base(uint64_t seed, const uint8_t * buf, uint64_t len) +{ + uint64_t rem = ~seed; + unsigned int i, j; + + uint64_t poly = 0x95ac9329ac4bc9b5ULL; // Jones coefficients reflected + + for (i = 0; i < len; i++) { + rem = rem ^ (uint64_t) buf[i]; + for (j = 0; j < MAX_ITER; j++) { + rem = (rem & 0x1ULL ? poly : 0) ^ (rem >> 1); + } + } + return ~rem; +} + +uint64_t crc64_jones_norm_base(uint64_t seed, const uint8_t * buf, uint64_t len) +{ + uint64_t rem = ~seed; + unsigned int i, j; + + uint64_t poly = 0xad93d23594c935a9ULL; // Jones coefficients + + for (i = 0; i < len; i++) { + rem = rem ^ ((uint64_t) buf[i] << 56); + for (j = 0; j < MAX_ITER; j++) { + rem = (rem & 0x8000000000000000ULL ? poly : 0) ^ (rem << 1); + } + } + return ~rem; +} + struct slver { unsigned short snum; unsigned char ver; @@ -77,3 +145,15 @@ struct slver crc64_ecma_refl_base_slver = { 0x001c, 0x00, 0x00 }; struct slver crc64_ecma_norm_base_slver_00000019; struct slver crc64_ecma_norm_base_slver = { 0x0019, 0x00, 0x00 }; + +struct slver crc64_iso_refl_base_slver_00000022; +struct slver crc64_iso_refl_base_slver = { 0x0022, 0x00, 0x00 }; + +struct slver crc64_iso_norm_base_slver_0000001f; +struct slver crc64_iso_norm_base_slver = { 0x001f, 0x00, 0x00 }; + +struct slver crc64_jones_refl_base_slver_00000028; +struct slver crc64_jones_refl_base_slver = { 0x0028, 0x00, 0x00 }; + +struct slver crc64_jones_norm_base_slver_00000025; +struct slver crc64_jones_norm_base_slver = { 0x0025, 0x00, 0x00 }; diff --git a/crc/crc64_ecma_refl_test.c b/crc/crc64_ecma_refl_test.c deleted file mode 100644 index a858aa0..0000000 --- a/crc/crc64_ecma_refl_test.c +++ /dev/null @@ -1,174 +0,0 @@ -/********************************************************************** - Copyright(c) 2011-2016 Intel Corporation All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************/ - -#include -#include -#include -#include -#include "crc64.h" -#include "types.h" - -#ifndef TEST_SEED -# define TEST_SEED 0x1234 -#endif - -#define MAX_BUF 512 -#define TEST_SIZE 20 - -typedef uint64_t u64; -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint8_t u8; - -// Generates pseudo-random data - -void rand_buffer(unsigned char *buf, long buffer_size) -{ - long i; - for (i = 0; i < buffer_size; i++) - buf[i] = rand(); -} - -int main(int argc, char *argv[]) -{ - int fail = 0; - u64 r; - int verbose = argc - 1; - int i, s, ret; - void *buf_alloc; - unsigned char *buf; - - printf("Test crc64_ecma_refl "); - - // Align to MAX_BUF boundary - ret = posix_memalign(&buf_alloc, MAX_BUF, MAX_BUF * TEST_SIZE); - if (ret) { - printf("alloc error: Fail"); - return -1; - } - buf = (unsigned char *)buf_alloc; - - srand(TEST_SEED); - - // Test of all zeros - memset(buf, 0, MAX_BUF * 10); - u64 crc = crc64_ecma_refl(TEST_SEED, buf, MAX_BUF); - u64 crc_ref = crc64_ecma_refl_base(TEST_SEED, buf, MAX_BUF); - if (crc != crc_ref) { - fail++; - printf("\n opt ref\n"); - printf(" ------ ------\n"); - printf("crc zero = 0x%16lx 0x%16lx \n", crc, crc_ref); - } else - printf("."); - - // Another simple test pattern - memset(buf, 0x8a, MAX_BUF); - crc = crc64_ecma_refl(TEST_SEED, buf, MAX_BUF); - crc_ref = crc64_ecma_refl_base(TEST_SEED, buf, MAX_BUF); - if (crc != crc_ref) - fail++; - if (verbose) - printf("crc all 8a = 0x%16lx 0x%16lx\n", crc, crc_ref); - else - printf("."); - - // Do a few random tests - r = rand(); - rand_buffer(buf, MAX_BUF * TEST_SIZE); - - for (i = 0; i < TEST_SIZE; i++) { - crc = crc64_ecma_refl(r, buf, MAX_BUF); - crc_ref = crc64_ecma_refl_base(r, buf, MAX_BUF); - if (crc != crc_ref) - fail++; - if (verbose) - printf("crc rand%3d = 0x%16lx 0x%16lx\n", i, crc, crc_ref); - else - printf("."); - buf += MAX_BUF; - } - - // Do a few random sizes - buf = (unsigned char *)buf_alloc; //reset buf - r = rand(); - - for (i = MAX_BUF; i >= 0; i--) { - crc = crc64_ecma_refl(r, buf, i); - crc_ref = crc64_ecma_refl_base(r, buf, i); - if (crc != crc_ref) { - fail++; - printf("fail random size%i 0x%16lx 0x%16lx\n", i, crc, crc_ref); - } else - printf("."); - } - - // Try different seeds - for (s = 0; s < 20; s++) { - buf = (unsigned char *)buf_alloc; //reset buf - - r = rand(); // just to get a new seed - rand_buffer(buf, MAX_BUF * TEST_SIZE); // new pseudo-rand data - - if (verbose) - printf("seed = 0x%lx\n", r); - - for (i = 0; i < TEST_SIZE; i++) { - crc = crc64_ecma_refl(r, buf, MAX_BUF); - crc_ref = crc64_ecma_refl_base(r, buf, MAX_BUF); - if (crc != crc_ref) - fail++; - if (verbose) - printf("crc rand%3d = 0x%16lx 0x%16lx\n", i, crc, crc_ref); - else - printf("."); - buf += MAX_BUF; - } - } - - // Run tests at end of buffer - buf = (unsigned char *)buf_alloc; //reset buf - buf = buf + ((MAX_BUF - 1) * TEST_SIZE); //Line up TEST_SIZE from end - for (i = 0; i < TEST_SIZE; i++) { - crc = crc64_ecma_refl(TEST_SEED, buf + i, TEST_SIZE - i); - crc_ref = crc64_ecma_refl_base(TEST_SEED, buf + i, TEST_SIZE - i); - if (crc != crc_ref) - fail++; - if (verbose) - printf("crc eob rand%3d = 0x%16lx 0x%16lx\n", i, crc, crc_ref); - else - printf("."); - } - - printf("Test done: %s\n", fail ? "Fail" : "Pass"); - if (fail) - printf("\nFailed %d tests\n", fail); - - return fail; -} diff --git a/crc/crc64_ecma_refl_perf.c b/crc/crc64_example.c similarity index 61% rename from crc/crc64_ecma_refl_perf.c rename to crc/crc64_example.c index d1ad7cb..64763a1 100644 --- a/crc/crc64_ecma_refl_perf.c +++ b/crc/crc64_example.c @@ -26,63 +26,43 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************/ - #include #include -#include -#include -#include +#include #include "crc64.h" -#include "test.h" -//#define CACHED_TEST -#ifdef CACHED_TEST -// Cached test, loop many times over small dataset -# define TEST_LEN 8*1024 -# define TEST_LOOPS 400000 -# define TEST_TYPE_STR "_warm" -#else -// Uncached test. Pull from large mem base. -# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ -# define TEST_LEN (2 * GT_L3_CACHE) -# define TEST_LOOPS 100 -# define TEST_TYPE_STR "_cold" -#endif - -#ifndef TEST_SEED -# define TEST_SEED 0x1234 -#endif - -#define TEST_MEM TEST_LEN +#define BUF_SIZE 8192 +#define INIT_SEED 0x12345678 int main(int argc, char *argv[]) { - int i; - void *buf; - uint64_t crc; - struct perf start, stop; + uint8_t inbuf[BUF_SIZE]; + uint64_t avail_in, total_in = 0; + uint64_t crc64_checksum; + FILE *in; - printf("crc64_ecma_refl_perf:\n"); - - if (posix_memalign(&buf, 1024, TEST_LEN)) { - printf("alloc error: Fail"); - return -1; + if (argc != 2) { + fprintf(stderr, "Usage: crc64_example infile\n"); + exit(0); + } + in = fopen(argv[1], "rb"); + if (!in) { + fprintf(stderr, "Can't open %s for reading\n", argv[1]); + exit(0); } - memset(buf, (char)TEST_SEED, TEST_LEN); - printf("Start timed tests\n"); + printf("crc64_example -- crc64_ecma_refl:\n"); fflush(0); - crc = crc64_ecma_refl(TEST_SEED, buf, TEST_LEN); - perf_start(&start); - for (i = 0; i < TEST_LOOPS; i++) { - crc = crc64_ecma_refl(TEST_SEED, buf, TEST_LEN); + crc64_checksum = INIT_SEED; + while ((avail_in = fread(inbuf, 1, BUF_SIZE, in))) { + // crc update mode + crc64_checksum = crc64_ecma_refl(crc64_checksum, inbuf, avail_in); + total_in += avail_in; } - perf_stop(&stop); - printf("crc64_ecma_refl" TEST_TYPE_STR ": "); - perf_print(stop, start, (long long)TEST_LEN * i); - printf("finish 0x%lx\n", crc); + fclose(in); + printf("total length is %ld, checksum is 0x%lx\n", total_in, crc64_checksum); return 0; } diff --git a/crc/crc64_ecma_norm_perf.c b/crc/crc64_funcs_perf.c similarity index 69% rename from crc/crc64_ecma_norm_perf.c rename to crc/crc64_funcs_perf.c index 8d42045..04135bf 100644 --- a/crc/crc64_ecma_norm_perf.c +++ b/crc/crc64_funcs_perf.c @@ -55,14 +55,30 @@ #define TEST_MEM TEST_LEN +typedef uint64_t(*crc64_func_t) (uint64_t, const uint8_t *, uint64_t); + +typedef struct func_case { + char *note; + crc64_func_t crc64_func_call; + crc64_func_t crc64_ref_call; +} func_case_t; + +func_case_t test_funcs[] = { + {"crc64_ecma_norm", crc64_ecma_norm, crc64_ecma_norm_base}, + {"crc64_ecma_refl", crc64_ecma_refl, crc64_ecma_refl_base}, + {"crc64_iso_norm", crc64_iso_norm, crc64_iso_norm_base}, + {"crc64_iso_refl", crc64_iso_refl, crc64_iso_refl_base}, + {"crc64_jones_norm", crc64_jones_norm, crc64_jones_norm_base}, + {"crc64_jones_refl", crc64_jones_refl, crc64_jones_refl_base} +}; + int main(int argc, char *argv[]) { - int i; + int i, j; void *buf; uint64_t crc; struct perf start, stop; - - printf("crc64_ecma_norm_perf:\n"); + func_case_t *test_func; if (posix_memalign(&buf, 1024, TEST_LEN)) { printf("alloc error: Fail"); @@ -70,19 +86,24 @@ int main(int argc, char *argv[]) } memset(buf, (char)TEST_SEED, TEST_LEN); - printf("Start timed tests\n"); - fflush(0); + for (j = 0; j < sizeof(test_funcs) / sizeof(test_funcs[0]); j++) { + test_func = &test_funcs[j]; + printf("%s_perf:\n", test_func->note); - crc = crc64_ecma_norm(TEST_SEED, buf, TEST_LEN); - perf_start(&start); - for (i = 0; i < TEST_LOOPS; i++) { - crc = crc64_ecma_norm(TEST_SEED, buf, TEST_LEN); + printf("Start timed tests\n"); + fflush(0); + + crc = test_func->crc64_func_call(TEST_SEED, buf, TEST_LEN); + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + crc = test_func->crc64_func_call(TEST_SEED, buf, TEST_LEN); + } + perf_stop(&stop); + printf("%s" TEST_TYPE_STR ": ", test_func->note); + perf_print(stop, start, (long long)TEST_LEN * i); + + printf("finish 0x%lx\n", crc); } - perf_stop(&stop); - printf("crc64_ecma_norm" TEST_TYPE_STR ": "); - perf_print(stop, start, (long long)TEST_LEN * i); - - printf("finish 0x%lx\n", crc); return 0; } diff --git a/crc/crc64_ecma_norm_test.c b/crc/crc64_funcs_test.c similarity index 52% rename from crc/crc64_ecma_norm_test.c rename to crc/crc64_funcs_test.c index 7d46e7c..f638f0f 100644 --- a/crc/crc64_ecma_norm_test.c +++ b/crc/crc64_funcs_test.c @@ -46,6 +46,23 @@ typedef uint32_t u32; typedef uint16_t u16; typedef uint8_t u8; +typedef uint64_t(*crc64_func_t) (uint64_t, const uint8_t *, uint64_t); + +typedef struct func_case { + char *note; + crc64_func_t crc64_func_call; + crc64_func_t crc64_ref_call; +} func_case_t; + +func_case_t test_funcs[] = { + {"crc64_ecma_norm", crc64_ecma_norm, crc64_ecma_norm_base}, + {"crc64_ecma_refl", crc64_ecma_refl, crc64_ecma_refl_base}, + {"crc64_iso_norm", crc64_iso_norm, crc64_iso_norm_base}, + {"crc64_iso_refl", crc64_iso_refl, crc64_iso_refl_base}, + {"crc64_jones_norm", crc64_jones_norm, crc64_jones_norm_base}, + {"crc64_jones_refl", crc64_jones_refl, crc64_jones_refl_base} +}; + // Generates pseudo-random data void rand_buffer(unsigned char *buf, long buffer_size) @@ -55,16 +72,27 @@ void rand_buffer(unsigned char *buf, long buffer_size) buf[i] = rand(); } +// Test cases +int zeros_test(func_case_t * test_func); + +int simple_pattern_test(func_case_t * test_func); + +int seeds_sizes_test(func_case_t * test_func); + +int eob_test(func_case_t * test_func); + +int update_test(func_case_t * test_func); + +int verbose = 0; +void *buf_alloc = NULL; + int main(int argc, char *argv[]) { - int fail = 0; - u64 r; - int verbose = argc - 1; - int i, s, ret; - void *buf_alloc; - unsigned char *buf; + int fail = 0, fail_case; + int i, ret; + func_case_t *test_func; - printf("Test crc64_ecma_norm "); + verbose = argc - 1; // Align to MAX_BUF boundary ret = posix_memalign(&buf_alloc, MAX_BUF, MAX_BUF * TEST_SIZE); @@ -72,14 +100,44 @@ int main(int argc, char *argv[]) printf("alloc error: Fail"); return -1; } - buf = (unsigned char *)buf_alloc; - srand(TEST_SEED); + printf("CRC64 Tests\n"); - // Test of all zeros + for (i = 0; i < sizeof(test_funcs) / sizeof(test_funcs[0]); i++) { + fail_case = 0; + test_func = &test_funcs[i]; + + printf("Test %s ", test_func->note); + fail_case += zeros_test(test_func); + fail_case += simple_pattern_test(test_func); + fail_case += seeds_sizes_test(test_func); + fail_case += eob_test(test_func); + fail_case += update_test(test_func); + printf("Test %s done: %s\n", test_func->note, fail_case ? "Fail" : "Pass"); + + if (fail_case) { + printf("\n%s Failed %d tests\n", test_func->note, fail_case); + fail++; + } + } + + printf("CRC64 Tests all done: %s\n", fail ? "Fail" : "Pass"); + + return fail; +} + +// Test of all zeros +int zeros_test(func_case_t * test_func) +{ + uint64_t crc, crc_ref; + int fail = 0; + unsigned char *buf = NULL; + + buf = (unsigned char *)buf_alloc; memset(buf, 0, MAX_BUF * 10); - u64 crc = crc64_ecma_norm(TEST_SEED, buf, MAX_BUF); - u64 crc_ref = crc64_ecma_norm_base(TEST_SEED, buf, MAX_BUF); + crc = test_func->crc64_func_call(TEST_SEED, buf, MAX_BUF * 10); + crc_ref = test_func->crc64_ref_call(TEST_SEED, buf, MAX_BUF * 10); + if (crc != crc_ref) { fail++; printf("\n opt ref\n"); @@ -88,10 +146,20 @@ int main(int argc, char *argv[]) } else printf("."); - // Another simple test pattern + return fail; +} + +// Another simple test pattern +int simple_pattern_test(func_case_t * test_func) +{ + uint64_t crc, crc_ref; + int fail = 0; + unsigned char *buf = NULL; + + buf = (unsigned char *)buf_alloc; memset(buf, 0x8a, MAX_BUF); - crc = crc64_ecma_norm(TEST_SEED, buf, MAX_BUF); - crc_ref = crc64_ecma_norm_base(TEST_SEED, buf, MAX_BUF); + crc = test_func->crc64_func_call(TEST_SEED, buf, MAX_BUF); + crc_ref = test_func->crc64_ref_call(TEST_SEED, buf, MAX_BUF); if (crc != crc_ref) fail++; if (verbose) @@ -99,13 +167,25 @@ int main(int argc, char *argv[]) else printf("."); + return fail; +} + +int seeds_sizes_test(func_case_t * test_func) +{ + uint64_t crc, crc_ref; + int fail = 0; + int i; + uint64_t r, s; + unsigned char *buf = NULL; + // Do a few random tests + buf = (unsigned char *)buf_alloc; //reset buf r = rand(); rand_buffer(buf, MAX_BUF * TEST_SIZE); for (i = 0; i < TEST_SIZE; i++) { - crc = crc64_ecma_norm(r, buf, MAX_BUF); - crc_ref = crc64_ecma_norm_base(r, buf, MAX_BUF); + crc = test_func->crc64_func_call(r, buf, MAX_BUF); + crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF); if (crc != crc_ref) fail++; if (verbose) @@ -120,8 +200,8 @@ int main(int argc, char *argv[]) r = rand(); for (i = MAX_BUF; i >= 0; i--) { - crc = crc64_ecma_norm(r, buf, i); - crc_ref = crc64_ecma_norm_base(r, buf, i); + crc = test_func->crc64_func_call(r, buf, i); + crc_ref = test_func->crc64_ref_call(r, buf, i); if (crc != crc_ref) { fail++; printf("fail random size%i 0x%16lx 0x%16lx\n", i, crc, crc_ref); @@ -140,8 +220,8 @@ int main(int argc, char *argv[]) printf("seed = 0x%lx\n", r); for (i = 0; i < TEST_SIZE; i++) { - crc = crc64_ecma_norm(r, buf, MAX_BUF); - crc_ref = crc64_ecma_norm_base(r, buf, MAX_BUF); + crc = test_func->crc64_func_call(r, buf, MAX_BUF); + crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF); if (crc != crc_ref) fail++; if (verbose) @@ -152,12 +232,22 @@ int main(int argc, char *argv[]) } } - // Run tests at end of buffer + return fail; +} + +// Run tests at end of buffer +int eob_test(func_case_t * test_func) +{ + uint64_t crc, crc_ref; + int fail = 0; + int i; + unsigned char *buf = NULL; + buf = (unsigned char *)buf_alloc; //reset buf buf = buf + ((MAX_BUF - 1) * TEST_SIZE); //Line up TEST_SIZE from end for (i = 0; i < TEST_SIZE; i++) { - crc = crc64_ecma_norm(TEST_SEED, buf + i, TEST_SIZE - i); - crc_ref = crc64_ecma_norm_base(TEST_SEED, buf + i, TEST_SIZE - i); + crc = test_func->crc64_func_call(TEST_SEED, buf + i, TEST_SIZE - i); + crc_ref = test_func->crc64_ref_call(TEST_SEED, buf + i, TEST_SIZE - i); if (crc != crc_ref) fail++; if (verbose) @@ -166,9 +256,35 @@ int main(int argc, char *argv[]) printf("."); } - printf("Test done: %s\n", fail ? "Fail" : "Pass"); - if (fail) - printf("\nFailed %d tests\n", fail); + return fail; +} + +int update_test(func_case_t * test_func) +{ + uint64_t crc, crc_ref; + int fail = 0; + int i; + uint64_t r; + unsigned char *buf = NULL; + + buf = (unsigned char *)buf_alloc; //reset buf + r = rand(); + // Process the whole buf with reference func single call. + crc_ref = test_func->crc64_ref_call(r, buf, MAX_BUF * TEST_SIZE); + // Process buf with update method. + for (i = 0; i < TEST_SIZE; i++) { + crc = test_func->crc64_func_call(r, buf, MAX_BUF); + // Update crc seeds and buf pointer. + r = crc; + buf += MAX_BUF; + } + + if (crc != crc_ref) + fail++; + if (verbose) + printf("crc rand%3d = 0x%16lx 0x%16lx\n", i, crc, crc_ref); + else + printf("."); return fail; } diff --git a/crc/crc64_iso_norm_by8.asm b/crc/crc64_iso_norm_by8.asm new file mode 100644 index 0000000..f227d23 --- /dev/null +++ b/crc/crc64_iso_norm_by8.asm @@ -0,0 +1,575 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Function API: +; uint64_t crc64_iso_norm_by8( +; uint64_t init_crc, //initial CRC value, 64 bits +; const unsigned char *buf, //buffer pointer to calculate CRC on +; uint64_t len //buffer length in bytes (64-bit data) +; ); +; +%include "reg_sizes.asm" + +[bits 64] +default rel + +section .text + +%ifidn __OUTPUT_FORMAT__, win64 + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx +%endif + +%define TMP 16*0 +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_SAVE 16*2 + %define VARIABLE_OFFSET 16*10+8 +%else + %define VARIABLE_OFFSET 16*2+8 +%endif +align 16 +global crc64_iso_norm_by8:function +crc64_iso_norm_by8: + + not arg1 ;~init_crc + + sub rsp,VARIABLE_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + ; push the xmm registers into the stack to maintain + movdqa [rsp + XMM_SAVE + 16*0], xmm6 + movdqa [rsp + XMM_SAVE + 16*1], xmm7 + movdqa [rsp + XMM_SAVE + 16*2], xmm8 + movdqa [rsp + XMM_SAVE + 16*3], xmm9 + movdqa [rsp + XMM_SAVE + 16*4], xmm10 + movdqa [rsp + XMM_SAVE + 16*5], xmm11 + movdqa [rsp + XMM_SAVE + 16*6], xmm12 + movdqa [rsp + XMM_SAVE + 16*7], xmm13 +%endif + + + ; check if smaller than 256 + cmp arg3, 256 + + ; for sizes less than 256, we can't fold 128B at a time... + jl _less_than_256 + + + ; load the initial crc value + movq xmm10, arg1 ; initial crc + + ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register. + ; because data will be byte-reflected and will align with initial crc at correct place. + pslldq xmm10, 8 + + movdqa xmm11, [SHUF_MASK] + ; receive the initial 128B data, xor the initial crc value + movdqu xmm0, [arg2+16*0] + movdqu xmm1, [arg2+16*1] + movdqu xmm2, [arg2+16*2] + movdqu xmm3, [arg2+16*3] + movdqu xmm4, [arg2+16*4] + movdqu xmm5, [arg2+16*5] + movdqu xmm6, [arg2+16*6] + movdqu xmm7, [arg2+16*7] + + pshufb xmm0, xmm11 + ; XOR the initial_crc value + pxor xmm0, xmm10 + pshufb xmm1, xmm11 + pshufb xmm2, xmm11 + pshufb xmm3, xmm11 + pshufb xmm4, xmm11 + pshufb xmm5, xmm11 + pshufb xmm6, xmm11 + pshufb xmm7, xmm11 + + movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4 + ;imm value of pclmulqdq instruction will determine which constant to use + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; we subtract 256 instead of 128 to save one instruction from the loop + sub arg3, 256 + + ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop + ; loop will fold 128B at a time until we have 128+y Bytes of buffer + + + ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel +_fold_128_B_loop: + + ; update the buffer pointer + add arg2, 128 ; buf += 128; + + movdqu xmm9, [arg2+16*0] + movdqu xmm12, [arg2+16*1] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm0 + movdqa xmm13, xmm1 + pclmulqdq xmm0, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm1, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm0, xmm9 + xorps xmm0, xmm8 + pxor xmm1, xmm12 + xorps xmm1, xmm13 + + movdqu xmm9, [arg2+16*2] + movdqu xmm12, [arg2+16*3] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm2 + movdqa xmm13, xmm3 + pclmulqdq xmm2, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm3, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm2, xmm9 + xorps xmm2, xmm8 + pxor xmm3, xmm12 + xorps xmm3, xmm13 + + movdqu xmm9, [arg2+16*4] + movdqu xmm12, [arg2+16*5] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm4 + movdqa xmm13, xmm5 + pclmulqdq xmm4, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm5, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm4, xmm9 + xorps xmm4, xmm8 + pxor xmm5, xmm12 + xorps xmm5, xmm13 + + movdqu xmm9, [arg2+16*6] + movdqu xmm12, [arg2+16*7] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm6 + movdqa xmm13, xmm7 + pclmulqdq xmm6, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm7, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm6, xmm9 + xorps xmm6, xmm8 + pxor xmm7, xmm12 + xorps xmm7, xmm13 + + sub arg3, 128 + + ; check if there is another 128B in the buffer to be able to fold + jge _fold_128_B_loop + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add arg2, 128 + ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 + ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + + + ; fold the 8 xmm registers to 1 xmm register with different constants + + movdqa xmm10, [rk9] + movdqa xmm8, xmm0 + pclmulqdq xmm0, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm0 + + movdqa xmm10, [rk11] + movdqa xmm8, xmm1 + pclmulqdq xmm1, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm1 + + movdqa xmm10, [rk13] + movdqa xmm8, xmm2 + pclmulqdq xmm2, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + + movdqa xmm10, [rk15] + movdqa xmm8, xmm3 + pclmulqdq xmm3, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm3 + + movdqa xmm10, [rk17] + movdqa xmm8, xmm4 + pclmulqdq xmm4, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm4 + + movdqa xmm10, [rk19] + movdqa xmm8, xmm5 + pclmulqdq xmm5, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm5 + + movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2 + + movdqa xmm8, xmm6 + pclmulqdq xmm6, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm6 + + + ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop + ; instead of a cmp instruction, we use the negative flag with the jl instruction + add arg3, 128-16 + jl _final_reduction_for_128 + + ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory + ; we can fold 16 bytes at a time if y>=16 + ; continue folding 16B at a time + +_16B_reduction_loop: + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + movdqu xmm0, [arg2] + pshufb xmm0, xmm11 + pxor xmm7, xmm0 + add arg2, 16 + sub arg3, 16 + ; instead of a cmp instruction, we utilize the flags with the jge instruction + ; equivalent of: cmp arg3, 16-16 + ; check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + ;now we have 16+z bytes left to reduce, where 0<= z < 16. + ;first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + ; check if any more data to fold. If not, compute the CRC of the final 128 bits + add arg3, 16 + je _128_done + + ; here we are getting data that is less than 16 bytes. + ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes. + ; after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa xmm2, xmm7 + + movdqu xmm1, [arg2 - 16 + arg3] + pshufb xmm1, xmm11 + + ; get rid of the extra data that was loaded before + ; load the shift constant + lea rax, [pshufb_shf_table + 16] + sub rax, arg3 + movdqu xmm0, [rax] + + ; shift xmm2 to the left by arg3 bytes + pshufb xmm2, xmm0 + + ; shift xmm7 to the right by 16-arg3 bytes + pxor xmm0, [mask1] + pshufb xmm7, xmm0 + pblendvb xmm1, xmm2 ;xmm0 is implicit + + ; fold 16 Bytes + movdqa xmm2, xmm1 + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + +_128_done: + ; compute crc of a 128-bit value + movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10 + movdqa xmm0, xmm7 + + ;64b fold + pclmulqdq xmm7, xmm10, 0x01 ; H*L + pslldq xmm0, 8 + pxor xmm7, xmm0 + + ;barrett reduction +_barrett: + movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10 + movdqa xmm0, xmm7 + + movdqa xmm1, xmm7 + pand xmm1, [mask3] + pclmulqdq xmm7, xmm10, 0x01 + pxor xmm7, xmm1 + + pclmulqdq xmm7, xmm10, 0x11 + pxor xmm7, xmm0 + pextrq rax, xmm7, 0 + +_cleanup: + not rax +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + XMM_SAVE + 16*0] + movdqa xmm7, [rsp + XMM_SAVE + 16*1] + movdqa xmm8, [rsp + XMM_SAVE + 16*2] + movdqa xmm9, [rsp + XMM_SAVE + 16*3] + movdqa xmm10, [rsp + XMM_SAVE + 16*4] + movdqa xmm11, [rsp + XMM_SAVE + 16*5] + movdqa xmm12, [rsp + XMM_SAVE + 16*6] + movdqa xmm13, [rsp + XMM_SAVE + 16*7] +%endif + add rsp, VARIABLE_OFFSET + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +align 16 +_less_than_256: + + ; check if there is enough buffer to be able to fold 16B at a time + cmp arg3, 32 + jl _less_than_32 + movdqa xmm11, [SHUF_MASK] + + ; if there is, load the constants + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + + movq xmm0, arg1 ; get the initial crc value + pslldq xmm0, 8 ; align it to its correct place + movdqu xmm7, [arg2] ; load the plaintext + pshufb xmm7, xmm11 ; byte-reflect the plaintext + pxor xmm7, xmm0 + + + ; update the buffer pointer + add arg2, 16 + + ; update the counter. subtract 32 instead of 16 to save one instruction from the loop + sub arg3, 32 + + jmp _16B_reduction_loop +align 16 +_less_than_32: + ; mov initial crc to the return value. this is necessary for zero-length buffers. + mov rax, arg1 + test arg3, arg3 + je _cleanup + + movdqa xmm11, [SHUF_MASK] + + movq xmm0, arg1 ; get the initial crc value + pslldq xmm0, 8 ; align it to its correct place + + cmp arg3, 16 + je _exact_16_left + jl _less_than_16_left + + movdqu xmm7, [arg2] ; load the plaintext + pshufb xmm7, xmm11 ; byte-reflect the plaintext + pxor xmm7, xmm0 ; xor the initial crc value + add arg2, 16 + sub arg3, 16 + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + jmp _get_last_two_xmms +align 16 +_less_than_16_left: + ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first. + pxor xmm1, xmm1 + mov r11, rsp + movdqa [r11], xmm1 + + ; backup the counter value + mov r9, arg3 + cmp arg3, 8 + jl _less_than_8_left + + ; load 8 Bytes + mov rax, [arg2] + mov [r11], rax + add r11, 8 + sub arg3, 8 + add arg2, 8 +_less_than_8_left: + + cmp arg3, 4 + jl _less_than_4_left + + ; load 4 Bytes + mov eax, [arg2] + mov [r11], eax + add r11, 4 + sub arg3, 4 + add arg2, 4 +_less_than_4_left: + + cmp arg3, 2 + jl _less_than_2_left + + ; load 2 Bytes + mov ax, [arg2] + mov [r11], ax + add r11, 2 + sub arg3, 2 + add arg2, 2 +_less_than_2_left: + cmp arg3, 1 + jl _zero_left + + ; load 1 Byte + mov al, [arg2] + mov [r11], al +_zero_left: + movdqa xmm7, [rsp] + pshufb xmm7, xmm11 + pxor xmm7, xmm0 ; xor the initial crc value + + ; shl r9, 4 + lea rax, [pshufb_shf_table + 16] + sub rax, r9 + + cmp r9, 8 + jl _end_1to7 + +_end_8to15: + movdqu xmm0, [rax] + pxor xmm0, [mask1] + + pshufb xmm7, xmm0 + jmp _128_done + +_end_1to7: + ; Right shift (8-length) bytes in XMM + add rax, 8 + movdqu xmm0, [rax] + pshufb xmm7,xmm0 + + jmp _barrett +align 16 +_exact_16_left: + movdqu xmm7, [arg2] + pshufb xmm7, xmm11 + pxor xmm7, xmm0 ; xor the initial crc value + + jmp _128_done + +section .data + +; precomputed constants +align 16 + +rk1: +DQ 0x0000000000000145 +rk2: +DQ 0x0000000000001db7 +rk3: +DQ 0x000100000001001a +rk4: +DQ 0x001b0000001b015e +rk5: +DQ 0x0000000000000145 +rk6: +DQ 0x0000000000000000 +rk7: +DQ 0x000000000000001b +rk8: +DQ 0x000000000000001b +rk9: +DQ 0x0150145145145015 +rk10: +DQ 0x1c71db6db6db71c7 +rk11: +DQ 0x0001110110110111 +rk12: +DQ 0x001aab1ab1ab1aab +rk13: +DQ 0x0000014445014445 +rk14: +DQ 0x00001daab71daab7 +rk15: +DQ 0x0000000101000101 +rk16: +DQ 0x0000001b1b001b1b +rk17: +DQ 0x0000000001514515 +rk18: +DQ 0x000000001c6db6c7 +rk19: +DQ 0x0000000000011011 +rk20: +DQ 0x00000000001ab1ab + +mask1: +dq 0x8080808080808080, 0x8080808080808080 +mask2: +dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF +mask3: +dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF + +SHUF_MASK: +dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 +dq 0x8080808080808080, 0x0f0e0d0c0b0a0908 +dq 0x8080808080808080, 0x8080808080808080 + +;;; func core, ver, snum +slversion crc64_iso_norm_by8, 01, 00, 0020 diff --git a/crc/crc64_iso_refl_by8.asm b/crc/crc64_iso_refl_by8.asm new file mode 100644 index 0000000..7ecd924 --- /dev/null +++ b/crc/crc64_iso_refl_by8.asm @@ -0,0 +1,538 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Function API: +; uint64_t crc64_iso_refl_by8( +; uint64_t init_crc, //initial CRC value, 64 bits +; const unsigned char *buf, //buffer pointer to calculate CRC on +; uint64_t len //buffer length in bytes (64-bit data) +; ); +; +%include "reg_sizes.asm" + +[bits 64] +default rel + +section .text + + +%ifidn __OUTPUT_FORMAT__, win64 + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx +%endif + +%define TMP 16*0 +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_SAVE 16*2 + %define VARIABLE_OFFSET 16*10+8 +%else + %define VARIABLE_OFFSET 16*2+8 +%endif + + +align 16 +global crc64_iso_refl_by8:function +crc64_iso_refl_by8: + ; uint64_t c = crc ^ 0xffffffff,ffffffffL; + not arg1 + sub rsp, VARIABLE_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + ; push the xmm registers into the stack to maintain + movdqa [rsp + XMM_SAVE + 16*0], xmm6 + movdqa [rsp + XMM_SAVE + 16*1], xmm7 + movdqa [rsp + XMM_SAVE + 16*2], xmm8 + movdqa [rsp + XMM_SAVE + 16*3], xmm9 + movdqa [rsp + XMM_SAVE + 16*4], xmm10 + movdqa [rsp + XMM_SAVE + 16*5], xmm11 + movdqa [rsp + XMM_SAVE + 16*6], xmm12 + movdqa [rsp + XMM_SAVE + 16*7], xmm13 +%endif + + ; check if smaller than 256B + cmp arg3, 256 + + ; for sizes less than 256, we can't fold 128B at a time... + jl _less_than_256 + + + ; load the initial crc value + movq xmm10, arg1 ; initial crc + ; receive the initial 128B data, xor the initial crc value + movdqu xmm0, [arg2+16*0] + movdqu xmm1, [arg2+16*1] + movdqu xmm2, [arg2+16*2] + movdqu xmm3, [arg2+16*3] + movdqu xmm4, [arg2+16*4] + movdqu xmm5, [arg2+16*5] + movdqu xmm6, [arg2+16*6] + movdqu xmm7, [arg2+16*7] + + ; XOR the initial_crc value + pxor xmm0, xmm10 + movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4 + ;imm value of pclmulqdq instruction will determine which constant to use + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; we subtract 256 instead of 128 to save one instruction from the loop + sub arg3, 256 + + ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop + ; loop will fold 128B at a time until we have 128+y Bytes of buffer + + + ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel +_fold_128_B_loop: + + ; update the buffer pointer + add arg2, 128 + + movdqu xmm9, [arg2+16*0] + movdqu xmm12, [arg2+16*1] + movdqa xmm8, xmm0 + movdqa xmm13, xmm1 + pclmulqdq xmm0, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm1, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm0, xmm9 + xorps xmm0, xmm8 + pxor xmm1, xmm12 + xorps xmm1, xmm13 + + movdqu xmm9, [arg2+16*2] + movdqu xmm12, [arg2+16*3] + movdqa xmm8, xmm2 + movdqa xmm13, xmm3 + pclmulqdq xmm2, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm3, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm2, xmm9 + xorps xmm2, xmm8 + pxor xmm3, xmm12 + xorps xmm3, xmm13 + + movdqu xmm9, [arg2+16*4] + movdqu xmm12, [arg2+16*5] + movdqa xmm8, xmm4 + movdqa xmm13, xmm5 + pclmulqdq xmm4, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm5, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm4, xmm9 + xorps xmm4, xmm8 + pxor xmm5, xmm12 + xorps xmm5, xmm13 + + movdqu xmm9, [arg2+16*6] + movdqu xmm12, [arg2+16*7] + movdqa xmm8, xmm6 + movdqa xmm13, xmm7 + pclmulqdq xmm6, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm7, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm6, xmm9 + xorps xmm6, xmm8 + pxor xmm7, xmm12 + xorps xmm7, xmm13 + + sub arg3, 128 + + ; check if there is another 128B in the buffer to be able to fold + jge _fold_128_B_loop + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add arg2, 128 + ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 + ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + + + ; fold the 8 xmm registers to 1 xmm register with different constants + ; xmm0 to xmm7 + movdqa xmm10, [rk9] + movdqa xmm8, xmm0 + pclmulqdq xmm0, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm0 + ;xmm1 to xmm7 + movdqa xmm10, [rk11] + movdqa xmm8, xmm1 + pclmulqdq xmm1, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm1 + + movdqa xmm10, [rk13] + movdqa xmm8, xmm2 + pclmulqdq xmm2, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + + movdqa xmm10, [rk15] + movdqa xmm8, xmm3 + pclmulqdq xmm3, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm3 + + movdqa xmm10, [rk17] + movdqa xmm8, xmm4 + pclmulqdq xmm4, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm4 + + movdqa xmm10, [rk19] + movdqa xmm8, xmm5 + pclmulqdq xmm5, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm5 + ; xmm6 to xmm7 + movdqa xmm10, [rk1] + movdqa xmm8, xmm6 + pclmulqdq xmm6, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm6 + + + ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop + ; instead of a cmp instruction, we use the negative flag with the jl instruction + add arg3, 128-16 + jl _final_reduction_for_128 + + ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory + ; we can fold 16 bytes at a time if y>=16 + ; continue folding 16B at a time + +_16B_reduction_loop: + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + movdqu xmm0, [arg2] + pxor xmm7, xmm0 + add arg2, 16 + sub arg3, 16 + ; instead of a cmp instruction, we utilize the flags with the jge instruction + ; equivalent of: cmp arg3, 16-16 + ; check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + ;now we have 16+z bytes left to reduce, where 0<= z < 16. + ;first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + add arg3, 16 + je _128_done + ; here we are getting data that is less than 16 bytes. + ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes. + ; after that the registers need to be adjusted. +_get_last_two_xmms: + + + movdqa xmm2, xmm7 + movdqu xmm1, [arg2 - 16 + arg3] + + ; get rid of the extra data that was loaded before + ; load the shift constant + lea rax, [pshufb_shf_table] + add rax, arg3 + movdqu xmm0, [rax] + + + pshufb xmm7, xmm0 + pxor xmm0, [mask3] + pshufb xmm2, xmm0 + + pblendvb xmm2, xmm1 ;xmm0 is implicit + ;;;;;;;;;; + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x1 + + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + +_128_done: + ; compute crc of a 128-bit value + movdqa xmm10, [rk5] + movdqa xmm0, xmm7 + + ;64b fold + pclmulqdq xmm7, xmm10, 0 + psrldq xmm0, 8 + pxor xmm7, xmm0 + + ;barrett reduction +_barrett: + movdqa xmm1, xmm7 + movdqa xmm10, [rk7] + + pclmulqdq xmm7, xmm10, 0 + movdqa xmm2, xmm7 + pclmulqdq xmm7, xmm10, 0x10 + pslldq xmm2, 8 + pxor xmm7, xmm2 + pxor xmm7, xmm1 + pextrq rax, xmm7, 1 + +_cleanup: + ; return c ^ 0xffffffff, ffffffffL; + not rax + + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + XMM_SAVE + 16*0] + movdqa xmm7, [rsp + XMM_SAVE + 16*1] + movdqa xmm8, [rsp + XMM_SAVE + 16*2] + movdqa xmm9, [rsp + XMM_SAVE + 16*3] + movdqa xmm10, [rsp + XMM_SAVE + 16*4] + movdqa xmm11, [rsp + XMM_SAVE + 16*5] + movdqa xmm12, [rsp + XMM_SAVE + 16*6] + movdqa xmm13, [rsp + XMM_SAVE + 16*7] +%endif + add rsp, VARIABLE_OFFSET + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +align 16 +_less_than_256: + + ; check if there is enough buffer to be able to fold 16B at a time + cmp arg3, 32 + jl _less_than_32 + + ; if there is, load the constants + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + + movq xmm0, arg1 ; get the initial crc value + movdqu xmm7, [arg2] ; load the plaintext + pxor xmm7, xmm0 + + ; update the buffer pointer + add arg2, 16 + + ; update the counter. subtract 32 instead of 16 to save one instruction from the loop + sub arg3, 32 + + jmp _16B_reduction_loop + +align 16 +_less_than_32: + ; mov initial crc to the return value. this is necessary for zero-length buffers. + mov rax, arg1 + test arg3, arg3 + je _cleanup + + movq xmm0, arg1 ; get the initial crc value + + cmp arg3, 16 + je _exact_16_left + jl _less_than_16_left + + movdqu xmm7, [arg2] ; load the plaintext + pxor xmm7, xmm0 ; xor the initial crc value + add arg2, 16 + sub arg3, 16 + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +align 16 +_less_than_16_left: + ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first. + + pxor xmm1, xmm1 + mov r11, rsp + movdqa [r11], xmm1 + + ; backup the counter value + mov r9, arg3 + cmp arg3, 8 + jl _less_than_8_left + + ; load 8 Bytes + mov rax, [arg2] + mov [r11], rax + add r11, 8 + sub arg3, 8 + add arg2, 8 +_less_than_8_left: + + cmp arg3, 4 + jl _less_than_4_left + + ; load 4 Bytes + mov eax, [arg2] + mov [r11], eax + add r11, 4 + sub arg3, 4 + add arg2, 4 +_less_than_4_left: + + cmp arg3, 2 + jl _less_than_2_left + + ; load 2 Bytes + mov ax, [arg2] + mov [r11], ax + add r11, 2 + sub arg3, 2 + add arg2, 2 +_less_than_2_left: + cmp arg3, 1 + jl _zero_left + + ; load 1 Byte + mov al, [arg2] + mov [r11], al + +_zero_left: + movdqa xmm7, [rsp] + pxor xmm7, xmm0 ; xor the initial crc value + + lea rax,[pshufb_shf_table] + + cmp r9, 8 + jl _end_1to7 + +_end_8to15: + movdqu xmm0, [rax + r9] + pshufb xmm7,xmm0 + jmp _128_done + +_end_1to7: + ; Left shift (8-length) bytes in XMM + movdqu xmm0, [rax + r9 + 8] + pshufb xmm7,xmm0 + + jmp _barrett + +align 16 +_exact_16_left: + movdqu xmm7, [arg2] + pxor xmm7, xmm0 ; xor the initial crc value + + jmp _128_done + +section .data + +; precomputed constants +align 16 +; rk7 = floor(2^128/Q) +; rk8 = Q +rk1: +DQ 0xf500000000000001 +rk2: +DQ 0x6b70000000000001 +rk3: +DQ 0xb001000000010000 +rk4: +DQ 0xf501b0000001b000 +rk5: +DQ 0xf500000000000001 +rk6: +DQ 0x0000000000000000 +rk7: +DQ 0xb000000000000001 +rk8: +DQ 0xb000000000000000 +rk9: +DQ 0xe014514514501501 +rk10: +DQ 0x771db6db6db71c71 +rk11: +DQ 0xa101101101110001 +rk12: +DQ 0x1ab1ab1ab1aab001 +rk13: +DQ 0xf445014445000001 +rk14: +DQ 0x6aab71daab700001 +rk15: +DQ 0xb100010100000001 +rk16: +DQ 0x01b001b1b0000001 +rk17: +DQ 0xe145150000000001 +rk18: +DQ 0x76db6c7000000001 +rk19: +DQ 0xa011000000000001 +rk20: +DQ 0x1b1ab00000000001 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + + +mask: +dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 +mask2: +dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF +mask3: +dq 0x8080808080808080, 0x8080808080808080 + +;;; func core, ver, snum +slversion crc64_iso_refl_by8, 01, 00, 0023 diff --git a/crc/crc64_jones_norm_by8.asm b/crc/crc64_jones_norm_by8.asm new file mode 100644 index 0000000..6cd358a --- /dev/null +++ b/crc/crc64_jones_norm_by8.asm @@ -0,0 +1,575 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Function API: +; uint64_t crc64_jones_norm_by8( +; uint64_t init_crc, //initial CRC value, 64 bits +; const unsigned char *buf, //buffer pointer to calculate CRC on +; uint64_t len //buffer length in bytes (64-bit data) +; ); +; +%include "reg_sizes.asm" + +[bits 64] +default rel + +section .text + +%ifidn __OUTPUT_FORMAT__, win64 + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx +%endif + +%define TMP 16*0 +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_SAVE 16*2 + %define VARIABLE_OFFSET 16*10+8 +%else + %define VARIABLE_OFFSET 16*2+8 +%endif +align 16 +global crc64_jones_norm_by8:function +crc64_jones_norm_by8: + + not arg1 ;~init_crc + + sub rsp,VARIABLE_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + ; push the xmm registers into the stack to maintain + movdqa [rsp + XMM_SAVE + 16*0], xmm6 + movdqa [rsp + XMM_SAVE + 16*1], xmm7 + movdqa [rsp + XMM_SAVE + 16*2], xmm8 + movdqa [rsp + XMM_SAVE + 16*3], xmm9 + movdqa [rsp + XMM_SAVE + 16*4], xmm10 + movdqa [rsp + XMM_SAVE + 16*5], xmm11 + movdqa [rsp + XMM_SAVE + 16*6], xmm12 + movdqa [rsp + XMM_SAVE + 16*7], xmm13 +%endif + + + ; check if smaller than 256 + cmp arg3, 256 + + ; for sizes less than 256, we can't fold 128B at a time... + jl _less_than_256 + + + ; load the initial crc value + movq xmm10, arg1 ; initial crc + + ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register. + ; because data will be byte-reflected and will align with initial crc at correct place. + pslldq xmm10, 8 + + movdqa xmm11, [SHUF_MASK] + ; receive the initial 128B data, xor the initial crc value + movdqu xmm0, [arg2+16*0] + movdqu xmm1, [arg2+16*1] + movdqu xmm2, [arg2+16*2] + movdqu xmm3, [arg2+16*3] + movdqu xmm4, [arg2+16*4] + movdqu xmm5, [arg2+16*5] + movdqu xmm6, [arg2+16*6] + movdqu xmm7, [arg2+16*7] + + pshufb xmm0, xmm11 + ; XOR the initial_crc value + pxor xmm0, xmm10 + pshufb xmm1, xmm11 + pshufb xmm2, xmm11 + pshufb xmm3, xmm11 + pshufb xmm4, xmm11 + pshufb xmm5, xmm11 + pshufb xmm6, xmm11 + pshufb xmm7, xmm11 + + movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4 + ;imm value of pclmulqdq instruction will determine which constant to use + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; we subtract 256 instead of 128 to save one instruction from the loop + sub arg3, 256 + + ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop + ; loop will fold 128B at a time until we have 128+y Bytes of buffer + + + ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel +_fold_128_B_loop: + + ; update the buffer pointer + add arg2, 128 ; buf += 128; + + movdqu xmm9, [arg2+16*0] + movdqu xmm12, [arg2+16*1] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm0 + movdqa xmm13, xmm1 + pclmulqdq xmm0, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm1, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm0, xmm9 + xorps xmm0, xmm8 + pxor xmm1, xmm12 + xorps xmm1, xmm13 + + movdqu xmm9, [arg2+16*2] + movdqu xmm12, [arg2+16*3] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm2 + movdqa xmm13, xmm3 + pclmulqdq xmm2, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm3, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm2, xmm9 + xorps xmm2, xmm8 + pxor xmm3, xmm12 + xorps xmm3, xmm13 + + movdqu xmm9, [arg2+16*4] + movdqu xmm12, [arg2+16*5] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm4 + movdqa xmm13, xmm5 + pclmulqdq xmm4, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm5, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm4, xmm9 + xorps xmm4, xmm8 + pxor xmm5, xmm12 + xorps xmm5, xmm13 + + movdqu xmm9, [arg2+16*6] + movdqu xmm12, [arg2+16*7] + pshufb xmm9, xmm11 + pshufb xmm12, xmm11 + movdqa xmm8, xmm6 + movdqa xmm13, xmm7 + pclmulqdq xmm6, xmm10, 0x0 + pclmulqdq xmm8, xmm10 , 0x11 + pclmulqdq xmm7, xmm10, 0x0 + pclmulqdq xmm13, xmm10 , 0x11 + pxor xmm6, xmm9 + xorps xmm6, xmm8 + pxor xmm7, xmm12 + xorps xmm7, xmm13 + + sub arg3, 128 + + ; check if there is another 128B in the buffer to be able to fold + jge _fold_128_B_loop + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add arg2, 128 + ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 + ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + + + ; fold the 8 xmm registers to 1 xmm register with different constants + + movdqa xmm10, [rk9] + movdqa xmm8, xmm0 + pclmulqdq xmm0, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm0 + + movdqa xmm10, [rk11] + movdqa xmm8, xmm1 + pclmulqdq xmm1, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm1 + + movdqa xmm10, [rk13] + movdqa xmm8, xmm2 + pclmulqdq xmm2, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + + movdqa xmm10, [rk15] + movdqa xmm8, xmm3 + pclmulqdq xmm3, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm3 + + movdqa xmm10, [rk17] + movdqa xmm8, xmm4 + pclmulqdq xmm4, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm4 + + movdqa xmm10, [rk19] + movdqa xmm8, xmm5 + pclmulqdq xmm5, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + xorps xmm7, xmm5 + + movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2 + + movdqa xmm8, xmm6 + pclmulqdq xmm6, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm6 + + + ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop + ; instead of a cmp instruction, we use the negative flag with the jl instruction + add arg3, 128-16 + jl _final_reduction_for_128 + + ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory + ; we can fold 16 bytes at a time if y>=16 + ; continue folding 16B at a time + +_16B_reduction_loop: + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + movdqu xmm0, [arg2] + pshufb xmm0, xmm11 + pxor xmm7, xmm0 + add arg2, 16 + sub arg3, 16 + ; instead of a cmp instruction, we utilize the flags with the jge instruction + ; equivalent of: cmp arg3, 16-16 + ; check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + ;now we have 16+z bytes left to reduce, where 0<= z < 16. + ;first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + ; check if any more data to fold. If not, compute the CRC of the final 128 bits + add arg3, 16 + je _128_done + + ; here we are getting data that is less than 16 bytes. + ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes. + ; after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa xmm2, xmm7 + + movdqu xmm1, [arg2 - 16 + arg3] + pshufb xmm1, xmm11 + + ; get rid of the extra data that was loaded before + ; load the shift constant + lea rax, [pshufb_shf_table + 16] + sub rax, arg3 + movdqu xmm0, [rax] + + ; shift xmm2 to the left by arg3 bytes + pshufb xmm2, xmm0 + + ; shift xmm7 to the right by 16-arg3 bytes + pxor xmm0, [mask1] + pshufb xmm7, xmm0 + pblendvb xmm1, xmm2 ;xmm0 is implicit + + ; fold 16 Bytes + movdqa xmm2, xmm1 + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x11 + pclmulqdq xmm8, xmm10, 0x0 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + +_128_done: + ; compute crc of a 128-bit value + movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10 + movdqa xmm0, xmm7 + + ;64b fold + pclmulqdq xmm7, xmm10, 0x01 ; H*L + pslldq xmm0, 8 + pxor xmm7, xmm0 + + ;barrett reduction +_barrett: + movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10 + movdqa xmm0, xmm7 + + movdqa xmm1, xmm7 + pand xmm1, [mask3] + pclmulqdq xmm7, xmm10, 0x01 + pxor xmm7, xmm1 + + pclmulqdq xmm7, xmm10, 0x11 + pxor xmm7, xmm0 + pextrq rax, xmm7, 0 + +_cleanup: + not rax +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + XMM_SAVE + 16*0] + movdqa xmm7, [rsp + XMM_SAVE + 16*1] + movdqa xmm8, [rsp + XMM_SAVE + 16*2] + movdqa xmm9, [rsp + XMM_SAVE + 16*3] + movdqa xmm10, [rsp + XMM_SAVE + 16*4] + movdqa xmm11, [rsp + XMM_SAVE + 16*5] + movdqa xmm12, [rsp + XMM_SAVE + 16*6] + movdqa xmm13, [rsp + XMM_SAVE + 16*7] +%endif + add rsp, VARIABLE_OFFSET + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +align 16 +_less_than_256: + + ; check if there is enough buffer to be able to fold 16B at a time + cmp arg3, 32 + jl _less_than_32 + movdqa xmm11, [SHUF_MASK] + + ; if there is, load the constants + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + + movq xmm0, arg1 ; get the initial crc value + pslldq xmm0, 8 ; align it to its correct place + movdqu xmm7, [arg2] ; load the plaintext + pshufb xmm7, xmm11 ; byte-reflect the plaintext + pxor xmm7, xmm0 + + + ; update the buffer pointer + add arg2, 16 + + ; update the counter. subtract 32 instead of 16 to save one instruction from the loop + sub arg3, 32 + + jmp _16B_reduction_loop +align 16 +_less_than_32: + ; mov initial crc to the return value. this is necessary for zero-length buffers. + mov rax, arg1 + test arg3, arg3 + je _cleanup + + movdqa xmm11, [SHUF_MASK] + + movq xmm0, arg1 ; get the initial crc value + pslldq xmm0, 8 ; align it to its correct place + + cmp arg3, 16 + je _exact_16_left + jl _less_than_16_left + + movdqu xmm7, [arg2] ; load the plaintext + pshufb xmm7, xmm11 ; byte-reflect the plaintext + pxor xmm7, xmm0 ; xor the initial crc value + add arg2, 16 + sub arg3, 16 + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + jmp _get_last_two_xmms +align 16 +_less_than_16_left: + ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first. + pxor xmm1, xmm1 + mov r11, rsp + movdqa [r11], xmm1 + + ; backup the counter value + mov r9, arg3 + cmp arg3, 8 + jl _less_than_8_left + + ; load 8 Bytes + mov rax, [arg2] + mov [r11], rax + add r11, 8 + sub arg3, 8 + add arg2, 8 +_less_than_8_left: + + cmp arg3, 4 + jl _less_than_4_left + + ; load 4 Bytes + mov eax, [arg2] + mov [r11], eax + add r11, 4 + sub arg3, 4 + add arg2, 4 +_less_than_4_left: + + cmp arg3, 2 + jl _less_than_2_left + + ; load 2 Bytes + mov ax, [arg2] + mov [r11], ax + add r11, 2 + sub arg3, 2 + add arg2, 2 +_less_than_2_left: + cmp arg3, 1 + jl _zero_left + + ; load 1 Byte + mov al, [arg2] + mov [r11], al +_zero_left: + movdqa xmm7, [rsp] + pshufb xmm7, xmm11 + pxor xmm7, xmm0 ; xor the initial crc value + + ; shl r9, 4 + lea rax, [pshufb_shf_table + 16] + sub rax, r9 + + cmp r9, 8 + jl _end_1to7 + +_end_8to15: + movdqu xmm0, [rax] + pxor xmm0, [mask1] + + pshufb xmm7, xmm0 + jmp _128_done + +_end_1to7: + ; Right shift (8-length) bytes in XMM + add rax, 8 + movdqu xmm0, [rax] + pshufb xmm7,xmm0 + + jmp _barrett +align 16 +_exact_16_left: + movdqu xmm7, [arg2] + pshufb xmm7, xmm11 + pxor xmm7, xmm0 ; xor the initial crc value + + jmp _128_done + +section .data + +; precomputed constants +align 16 + +rk1: +DQ 0x4445ed2750017038 +rk2: +DQ 0x698b74157cfbd736 +rk3: +DQ 0x0cfcfb5101c4b775 +rk4: +DQ 0x65403fd47cbec866 +rk5: +DQ 0x4445ed2750017038 +rk6: +DQ 0x0000000000000000 +rk7: +DQ 0xddf3eeb298be6cf8 +rk8: +DQ 0xad93d23594c935a9 +rk9: +DQ 0xd8dc208e2ba527b4 +rk10: +DQ 0xf032cfec76bb2bc5 +rk11: +DQ 0xb536044f357f4238 +rk12: +DQ 0xfdbf104d938ba67a +rk13: +DQ 0xeeddad9297a843e7 +rk14: +DQ 0x3550bce629466473 +rk15: +DQ 0x4e501e58ca43d25e +rk16: +DQ 0x13c961588f27f643 +rk17: +DQ 0x3b60d00dcb1099bc +rk18: +DQ 0x44bf1f468c53b9a3 +rk19: +DQ 0x96f2236e317179ee +rk20: +DQ 0xf00839aa0dd64bac + +mask1: +dq 0x8080808080808080, 0x8080808080808080 +mask2: +dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF +mask3: +dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF + +SHUF_MASK: +dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 +dq 0x8080808080808080, 0x0f0e0d0c0b0a0908 +dq 0x8080808080808080, 0x8080808080808080 + +;;; func core, ver, snum +slversion crc64_jones_norm_by8, 01, 00, 0026 diff --git a/crc/crc64_jones_refl_by8.asm b/crc/crc64_jones_refl_by8.asm new file mode 100644 index 0000000..33938c2 --- /dev/null +++ b/crc/crc64_jones_refl_by8.asm @@ -0,0 +1,538 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Function API: +; uint64_t crc64_jones_refl_by8( +; uint64_t init_crc, //initial CRC value, 64 bits +; const unsigned char *buf, //buffer pointer to calculate CRC on +; uint64_t len //buffer length in bytes (64-bit data) +; ); +; +%include "reg_sizes.asm" + +[bits 64] +default rel + +section .text + + +%ifidn __OUTPUT_FORMAT__, win64 + %xdefine arg1 rcx + %xdefine arg2 rdx + %xdefine arg3 r8 +%else + %xdefine arg1 rdi + %xdefine arg2 rsi + %xdefine arg3 rdx +%endif + +%define TMP 16*0 +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_SAVE 16*2 + %define VARIABLE_OFFSET 16*10+8 +%else + %define VARIABLE_OFFSET 16*2+8 +%endif + + +align 16 +global crc64_jones_refl_by8:function +crc64_jones_refl_by8: + ; uint64_t c = crc ^ 0xffffffff,ffffffffL; + not arg1 + sub rsp, VARIABLE_OFFSET + +%ifidn __OUTPUT_FORMAT__, win64 + ; push the xmm registers into the stack to maintain + movdqa [rsp + XMM_SAVE + 16*0], xmm6 + movdqa [rsp + XMM_SAVE + 16*1], xmm7 + movdqa [rsp + XMM_SAVE + 16*2], xmm8 + movdqa [rsp + XMM_SAVE + 16*3], xmm9 + movdqa [rsp + XMM_SAVE + 16*4], xmm10 + movdqa [rsp + XMM_SAVE + 16*5], xmm11 + movdqa [rsp + XMM_SAVE + 16*6], xmm12 + movdqa [rsp + XMM_SAVE + 16*7], xmm13 +%endif + + ; check if smaller than 256B + cmp arg3, 256 + + ; for sizes less than 256, we can't fold 128B at a time... + jl _less_than_256 + + + ; load the initial crc value + movq xmm10, arg1 ; initial crc + ; receive the initial 128B data, xor the initial crc value + movdqu xmm0, [arg2+16*0] + movdqu xmm1, [arg2+16*1] + movdqu xmm2, [arg2+16*2] + movdqu xmm3, [arg2+16*3] + movdqu xmm4, [arg2+16*4] + movdqu xmm5, [arg2+16*5] + movdqu xmm6, [arg2+16*6] + movdqu xmm7, [arg2+16*7] + + ; XOR the initial_crc value + pxor xmm0, xmm10 + movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4 + ;imm value of pclmulqdq instruction will determine which constant to use + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; we subtract 256 instead of 128 to save one instruction from the loop + sub arg3, 256 + + ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop + ; loop will fold 128B at a time until we have 128+y Bytes of buffer + + + ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel +_fold_128_B_loop: + + ; update the buffer pointer + add arg2, 128 + + movdqu xmm9, [arg2+16*0] + movdqu xmm12, [arg2+16*1] + movdqa xmm8, xmm0 + movdqa xmm13, xmm1 + pclmulqdq xmm0, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm1, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm0, xmm9 + xorps xmm0, xmm8 + pxor xmm1, xmm12 + xorps xmm1, xmm13 + + movdqu xmm9, [arg2+16*2] + movdqu xmm12, [arg2+16*3] + movdqa xmm8, xmm2 + movdqa xmm13, xmm3 + pclmulqdq xmm2, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm3, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm2, xmm9 + xorps xmm2, xmm8 + pxor xmm3, xmm12 + xorps xmm3, xmm13 + + movdqu xmm9, [arg2+16*4] + movdqu xmm12, [arg2+16*5] + movdqa xmm8, xmm4 + movdqa xmm13, xmm5 + pclmulqdq xmm4, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm5, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm4, xmm9 + xorps xmm4, xmm8 + pxor xmm5, xmm12 + xorps xmm5, xmm13 + + movdqu xmm9, [arg2+16*6] + movdqu xmm12, [arg2+16*7] + movdqa xmm8, xmm6 + movdqa xmm13, xmm7 + pclmulqdq xmm6, xmm10, 0x10 + pclmulqdq xmm8, xmm10 , 0x1 + pclmulqdq xmm7, xmm10, 0x10 + pclmulqdq xmm13, xmm10 , 0x1 + pxor xmm6, xmm9 + xorps xmm6, xmm8 + pxor xmm7, xmm12 + xorps xmm7, xmm13 + + sub arg3, 128 + + ; check if there is another 128B in the buffer to be able to fold + jge _fold_128_B_loop + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + add arg2, 128 + ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 + ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + + + ; fold the 8 xmm registers to 1 xmm register with different constants + ; xmm0 to xmm7 + movdqa xmm10, [rk9] + movdqa xmm8, xmm0 + pclmulqdq xmm0, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm0 + ;xmm1 to xmm7 + movdqa xmm10, [rk11] + movdqa xmm8, xmm1 + pclmulqdq xmm1, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm1 + + movdqa xmm10, [rk13] + movdqa xmm8, xmm2 + pclmulqdq xmm2, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + + movdqa xmm10, [rk15] + movdqa xmm8, xmm3 + pclmulqdq xmm3, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm3 + + movdqa xmm10, [rk17] + movdqa xmm8, xmm4 + pclmulqdq xmm4, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm4 + + movdqa xmm10, [rk19] + movdqa xmm8, xmm5 + pclmulqdq xmm5, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + xorps xmm7, xmm5 + ; xmm6 to xmm7 + movdqa xmm10, [rk1] + movdqa xmm8, xmm6 + pclmulqdq xmm6, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm6 + + + ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop + ; instead of a cmp instruction, we use the negative flag with the jl instruction + add arg3, 128-16 + jl _final_reduction_for_128 + + ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory + ; we can fold 16 bytes at a time if y>=16 + ; continue folding 16B at a time + +_16B_reduction_loop: + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x1 + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + movdqu xmm0, [arg2] + pxor xmm7, xmm0 + add arg2, 16 + sub arg3, 16 + ; instead of a cmp instruction, we utilize the flags with the jge instruction + ; equivalent of: cmp arg3, 16-16 + ; check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + ;now we have 16+z bytes left to reduce, where 0<= z < 16. + ;first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + add arg3, 16 + je _128_done + ; here we are getting data that is less than 16 bytes. + ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes. + ; after that the registers need to be adjusted. +_get_last_two_xmms: + + + movdqa xmm2, xmm7 + movdqu xmm1, [arg2 - 16 + arg3] + + ; get rid of the extra data that was loaded before + ; load the shift constant + lea rax, [pshufb_shf_table] + add rax, arg3 + movdqu xmm0, [rax] + + + pshufb xmm7, xmm0 + pxor xmm0, [mask3] + pshufb xmm2, xmm0 + + pblendvb xmm2, xmm1 ;xmm0 is implicit + ;;;;;;;;;; + movdqa xmm8, xmm7 + pclmulqdq xmm7, xmm10, 0x1 + + pclmulqdq xmm8, xmm10, 0x10 + pxor xmm7, xmm8 + pxor xmm7, xmm2 + +_128_done: + ; compute crc of a 128-bit value + movdqa xmm10, [rk5] + movdqa xmm0, xmm7 + + ;64b fold + pclmulqdq xmm7, xmm10, 0 + psrldq xmm0, 8 + pxor xmm7, xmm0 + + ;barrett reduction +_barrett: + movdqa xmm1, xmm7 + movdqa xmm10, [rk7] + + pclmulqdq xmm7, xmm10, 0 + movdqa xmm2, xmm7 + pclmulqdq xmm7, xmm10, 0x10 + pslldq xmm2, 8 + pxor xmm7, xmm2 + pxor xmm7, xmm1 + pextrq rax, xmm7, 1 + +_cleanup: + ; return c ^ 0xffffffff, ffffffffL; + not rax + + +%ifidn __OUTPUT_FORMAT__, win64 + movdqa xmm6, [rsp + XMM_SAVE + 16*0] + movdqa xmm7, [rsp + XMM_SAVE + 16*1] + movdqa xmm8, [rsp + XMM_SAVE + 16*2] + movdqa xmm9, [rsp + XMM_SAVE + 16*3] + movdqa xmm10, [rsp + XMM_SAVE + 16*4] + movdqa xmm11, [rsp + XMM_SAVE + 16*5] + movdqa xmm12, [rsp + XMM_SAVE + 16*6] + movdqa xmm13, [rsp + XMM_SAVE + 16*7] +%endif + add rsp, VARIABLE_OFFSET + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +align 16 +_less_than_256: + + ; check if there is enough buffer to be able to fold 16B at a time + cmp arg3, 32 + jl _less_than_32 + + ; if there is, load the constants + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + + movq xmm0, arg1 ; get the initial crc value + movdqu xmm7, [arg2] ; load the plaintext + pxor xmm7, xmm0 + + ; update the buffer pointer + add arg2, 16 + + ; update the counter. subtract 32 instead of 16 to save one instruction from the loop + sub arg3, 32 + + jmp _16B_reduction_loop + +align 16 +_less_than_32: + ; mov initial crc to the return value. this is necessary for zero-length buffers. + mov rax, arg1 + test arg3, arg3 + je _cleanup + + movq xmm0, arg1 ; get the initial crc value + + cmp arg3, 16 + je _exact_16_left + jl _less_than_16_left + + movdqu xmm7, [arg2] ; load the plaintext + pxor xmm7, xmm0 ; xor the initial crc value + add arg2, 16 + sub arg3, 16 + movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +align 16 +_less_than_16_left: + ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first. + + pxor xmm1, xmm1 + mov r11, rsp + movdqa [r11], xmm1 + + ; backup the counter value + mov r9, arg3 + cmp arg3, 8 + jl _less_than_8_left + + ; load 8 Bytes + mov rax, [arg2] + mov [r11], rax + add r11, 8 + sub arg3, 8 + add arg2, 8 +_less_than_8_left: + + cmp arg3, 4 + jl _less_than_4_left + + ; load 4 Bytes + mov eax, [arg2] + mov [r11], eax + add r11, 4 + sub arg3, 4 + add arg2, 4 +_less_than_4_left: + + cmp arg3, 2 + jl _less_than_2_left + + ; load 2 Bytes + mov ax, [arg2] + mov [r11], ax + add r11, 2 + sub arg3, 2 + add arg2, 2 +_less_than_2_left: + cmp arg3, 1 + jl _zero_left + + ; load 1 Byte + mov al, [arg2] + mov [r11], al + +_zero_left: + movdqa xmm7, [rsp] + pxor xmm7, xmm0 ; xor the initial crc value + + lea rax,[pshufb_shf_table] + + cmp r9, 8 + jl _end_1to7 + +_end_8to15: + movdqu xmm0, [rax + r9] + pshufb xmm7,xmm0 + jmp _128_done + +_end_1to7: + ; Left shift (8-length) bytes in XMM + movdqu xmm0, [rax + r9 + 8] + pshufb xmm7,xmm0 + + jmp _barrett + +align 16 +_exact_16_left: + movdqu xmm7, [arg2] + pxor xmm7, xmm0 ; xor the initial crc value + + jmp _128_done + +section .data + +; precomputed constants +align 16 +; rk7 = floor(2^128/Q) +; rk8 = Q +rk1: +DQ 0x381d0015c96f4444 +rk2: +DQ 0xd9d7be7d505da32c +rk3: +DQ 0x768361524d29ed0b +rk4: +DQ 0xcc26fa7c57f8054c +rk5: +DQ 0x381d0015c96f4444 +rk6: +DQ 0x0000000000000000 +rk7: +DQ 0x3e6cfa329aef9f77 +rk8: +DQ 0x2b5926535897936a +rk9: +DQ 0x5bc94ba8e2087636 +rk10: +DQ 0x6cf09c8f37710b75 +rk11: +DQ 0x3885fd59e440d95a +rk12: +DQ 0xbccba3936411fb7e +rk13: +DQ 0xe4dd0d81cbfce585 +rk14: +DQ 0xb715e37b96ed8633 +rk15: +DQ 0xf49784a634f014e4 +rk16: +DQ 0xaf86efb16d9ab4fb +rk17: +DQ 0x7b3211a760160db8 +rk18: +DQ 0xa062b2319d66692f +rk19: +DQ 0xef3d1d18ed889ed2 +rk20: +DQ 0x6ba4d760ab38201e + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + + +mask: +dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 +mask2: +dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF +mask3: +dq 0x8080808080808080, 0x8080808080808080 + +;;; func core, ver, snum +slversion crc64_jones_refl_by8, 01, 00, 0029 diff --git a/crc/crc64_multibinary.asm b/crc/crc64_multibinary.asm index 3ee988d..a20c8a7 100644 --- a/crc/crc64_multibinary.asm +++ b/crc/crc64_multibinary.asm @@ -49,85 +49,41 @@ extern crc64_ecma_refl_base extern crc64_ecma_norm_by8 extern crc64_ecma_norm_base -section .data -;;; *_mbinit are initial values for *_dispatched; is updated on first call. -;;; Therefore, *_dispatch_init is only executed on first call. +extern crc64_iso_refl_by8 +extern crc64_iso_refl_base -crc64_ecma_refl_dispatched: - dq crc64_ecma_refl_mbinit -crc64_ecma_norm_dispatched: - dq crc64_ecma_norm_mbinit +extern crc64_iso_norm_by8 +extern crc64_iso_norm_base + +extern crc64_jones_refl_by8 +extern crc64_jones_refl_base + +extern crc64_jones_norm_by8 +extern crc64_jones_norm_base section .text -;;;; -; crc64_ecma_refl multibinary function -;;;; -global crc64_ecma_refl:function -crc64_ecma_refl_mbinit: - call crc64_ecma_refl_dispatch_init -crc64_ecma_refl: - jmp qword [crc64_ecma_refl_dispatched] +%include "multibinary.asm" -crc64_ecma_refl_dispatch_init: - push rax - push rbx - push rcx - push rdx - push rsi - lea rsi, [crc64_ecma_refl_base WRT_OPT] ; Default +mbin_interface crc64_ecma_refl +mbin_dispatch_init_clmul crc64_ecma_refl, crc64_ecma_refl_base, crc64_ecma_refl_by8 +mbin_interface crc64_ecma_norm +mbin_dispatch_init_clmul crc64_ecma_norm, crc64_ecma_norm_base, crc64_ecma_norm_by8 - mov eax, 1 - cpuid - lea rbx, [crc64_ecma_refl_by8 WRT_OPT] +mbin_interface crc64_iso_refl +mbin_dispatch_init_clmul crc64_iso_refl, crc64_iso_refl_base, crc64_iso_refl_by8 +mbin_interface crc64_iso_norm +mbin_dispatch_init_clmul crc64_iso_norm, crc64_iso_norm_base, crc64_iso_norm_by8 - test ecx, FLAG_CPUID1_ECX_SSE3 - jz use_ecma_refl_base - test ecx, FLAG_CPUID1_ECX_CLMUL - cmovne rsi, rbx -use_ecma_refl_base: - mov [crc64_ecma_refl_dispatched], rsi - pop rsi - pop rdx - pop rcx - pop rbx - pop rax - ret - -;;;; -; crc64_ecma_norm multibinary function -;;;; -global crc64_ecma_norm:function -crc64_ecma_norm_mbinit: - call crc64_ecma_norm_dispatch_init -crc64_ecma_norm: - jmp qword [crc64_ecma_norm_dispatched] - -crc64_ecma_norm_dispatch_init: - push rax - push rbx - push rcx - push rdx - push rsi - lea rsi, [crc64_ecma_norm_base WRT_OPT] ; Default - - mov eax, 1 - cpuid - lea rbx, [crc64_ecma_norm_by8 WRT_OPT] - - test ecx, FLAG_CPUID1_ECX_SSE3 - jz use_ecma_norm_base - test ecx, FLAG_CPUID1_ECX_CLMUL - cmovne rsi, rbx -use_ecma_norm_base: - mov [crc64_ecma_norm_dispatched], rsi - pop rsi - pop rdx - pop rcx - pop rbx - pop rax - ret +mbin_interface crc64_jones_refl +mbin_dispatch_init_clmul crc64_jones_refl, crc64_jones_refl_base, crc64_jones_refl_by8 +mbin_interface crc64_jones_norm +mbin_dispatch_init_clmul crc64_jones_norm, crc64_jones_norm_base, crc64_jones_norm_by8 ;;; func core, ver, snum -slversion crc64_ecma_refl, 00, 00, 0018 -slversion crc64_ecma_norm, 00, 00, 001e +slversion crc64_ecma_refl, 00, 00, 001b +slversion crc64_ecma_norm, 00, 00, 0018 +slversion crc64_iso_refl, 00, 00, 0021 +slversion crc64_iso_norm, 00, 00, 001e +slversion crc64_jones_refl, 00, 00, 0027 +slversion crc64_jones_norm, 00, 00, 0024 diff --git a/include/crc64.h b/include/crc64.h index 743b347..8d7d81f 100644 --- a/include/crc64.h +++ b/include/crc64.h @@ -74,6 +74,62 @@ uint64_t crc64_ecma_norm( uint64_t len //!< buffer length in bytes (64-bit data) ); +/** + * @brief Generate CRC from ISO standard in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_iso_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_iso_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_jones_refl( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format, runs + * appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * @returns 64 bit CRC + */ +uint64_t crc64_jones_norm( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + /* Arch specific versions */ /** @@ -107,9 +163,9 @@ uint64_t crc64_ecma_norm_by8( * @returns 64 bit CRC */ uint64_t crc64_ecma_refl_base( - uint64_t seed, //!< initial CRC value, 64 bits - uint8_t *buf, //!< buffer to calculate CRC on - uint64_t len //!< buffer length in bytes (64-bit data) + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) ); /** @@ -117,11 +173,102 @@ uint64_t crc64_ecma_refl_base( * @returns 64 bit CRC */ uint64_t crc64_ecma_norm_base( - uint64_t seed, //!< initial CRC value, 64 bits - uint8_t *buf, //!< buffer to calculate CRC on - uint64_t len //!< buffer length in bytes (64-bit data) + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) ); +/** + * @brief Generate CRC from ISO standard in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_iso_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_iso_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_iso_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from ISO standard in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_iso_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_jones_refl_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format. + * @requires SSE3, CLMUL + * + * @returns 64 bit CRC + */ + +uint64_t crc64_jones_norm_by8( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in reflected format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_jones_refl_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); + +/** + * @brief Generate CRC from "Jones" coefficients in normal format, runs baseline version + * @returns 64 bit CRC + */ +uint64_t crc64_jones_norm_base( + uint64_t init_crc, //!< initial CRC value, 64 bits + const unsigned char *buf, //!< buffer to calculate CRC on + uint64_t len //!< buffer length in bytes (64-bit data) + ); #ifdef __cplusplus } diff --git a/include/multibinary.asm b/include/multibinary.asm index 8863d7a..7fca3a1 100644 --- a/include/multibinary.asm +++ b/include/multibinary.asm @@ -142,6 +142,42 @@ ret %endmacro +;;;;; +; mbin_dispatch_init_clmul 3 parameters +; Use this case for CRC which needs both SSE4_1 and CLMUL +; 1-> function name +; 2-> base function +; 3-> SSE4_1 and CLMUL optimized function +;;;;; +%macro mbin_dispatch_init_clmul 3 + section .text + %1_dispatch_init: + push mbin_rsi + push mbin_rax + push mbin_rbx + push mbin_rcx + push mbin_rdx + lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function + + mov eax, 1 + cpuid + lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func + + ; Test for SSE4.2 + test ecx, FLAG_CPUID1_ECX_SSE4_1 + jz _%1_init_done + test ecx, FLAG_CPUID1_ECX_CLMUL + cmovne mbin_rsi, mbin_rbx + _%1_init_done: + pop mbin_rdx + pop mbin_rcx + pop mbin_rbx + pop mbin_rax + mov [%1_dispatched], mbin_rsi + pop mbin_rsi + ret +%endmacro + ;;;;; ; mbin_dispatch_init5 parameters ; 1-> function name