diff --git a/Makefile.am b/Makefile.am index d804cb0..27dec33 100644 --- a/Makefile.am +++ b/Makefile.am @@ -25,6 +25,7 @@ perf_tests32= # Include units include erasure_code/Makefile.am +include raid/Makefile.am # LIB version info not necessarily the same as package version LIBISAL_CURRENT=2 diff --git a/Makefile.nmake b/Makefile.nmake index 4985ec8..74905e3 100644 --- a/Makefile.nmake +++ b/Makefile.nmake @@ -27,9 +27,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ######################################################################## -objs = bin\ec_base.obj bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj -INCLUDES = -I./ -Ierasure_code/ -Iinclude/ +objs = bin\ec_base.obj 
bin\ec_highlevel_func.obj bin\ec_multibinary.obj bin\gf_2vect_dot_prod_avx.obj bin\gf_2vect_dot_prod_avx2.obj bin\gf_2vect_dot_prod_avx512.obj bin\gf_2vect_dot_prod_sse.obj bin\gf_2vect_mad_avx.obj bin\gf_2vect_mad_avx2.obj bin\gf_2vect_mad_avx512.obj bin\gf_2vect_mad_sse.obj bin\gf_3vect_dot_prod_avx.obj bin\gf_3vect_dot_prod_avx2.obj bin\gf_3vect_dot_prod_avx512.obj bin\gf_3vect_dot_prod_sse.obj bin\gf_3vect_mad_avx.obj bin\gf_3vect_mad_avx2.obj bin\gf_3vect_mad_avx512.obj bin\gf_3vect_mad_sse.obj bin\gf_4vect_dot_prod_avx.obj bin\gf_4vect_dot_prod_avx2.obj bin\gf_4vect_dot_prod_avx512.obj bin\gf_4vect_dot_prod_sse.obj bin\gf_4vect_mad_avx.obj bin\gf_4vect_mad_avx2.obj bin\gf_4vect_mad_avx512.obj bin\gf_4vect_mad_sse.obj bin\gf_5vect_dot_prod_avx.obj bin\gf_5vect_dot_prod_avx2.obj bin\gf_5vect_dot_prod_sse.obj bin\gf_5vect_mad_avx.obj bin\gf_5vect_mad_avx2.obj bin\gf_5vect_mad_sse.obj bin\gf_6vect_dot_prod_avx.obj bin\gf_6vect_dot_prod_avx2.obj bin\gf_6vect_dot_prod_sse.obj bin\gf_6vect_mad_avx.obj bin\gf_6vect_mad_avx2.obj bin\gf_6vect_mad_sse.obj bin\gf_vect_dot_prod_avx.obj bin\gf_vect_dot_prod_avx2.obj bin\gf_vect_dot_prod_avx512.obj bin\gf_vect_dot_prod_sse.obj bin\gf_vect_mad_avx.obj bin\gf_vect_mad_avx2.obj bin\gf_vect_mad_avx512.obj bin\gf_vect_mad_sse.obj bin\gf_vect_mul_avx.obj bin\gf_vect_mul_sse.obj bin\pq_check_sse.obj bin\pq_gen_avx.obj bin\pq_gen_avx2.obj bin\pq_gen_sse.obj bin\raid_base.obj bin\raid_multibinary.obj bin\xor_check_sse.obj bin\xor_gen_avx.obj bin\xor_gen_sse.obj + +INCLUDES = -I./ -Ierasure_code/ -Iraid/ -Iinclude/ LINKFLAGS = /nologo CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D) AFLAGS = -f win64 $(INCLUDES) $(D) @@ -53,13 +54,24 @@ isa-l.dll: $(objs) {erasure_code}.asm.obj: $(AS) $(AFLAGS) -o $@ $? +{raid}.c.obj: + $(CC) $(CFLAGS) /c -Fo$@ $? +{raid}.asm.obj: + $(AS) $(AFLAGS) -o $@ $? 
+# Examples +ex = xor_example.exe +ex: lib $(ex) + +$(ex): $(@B).obj + .obj.exe: link /out:$@ $(LINKFLAGS) isa-l.lib $? # Check tests -checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe +checks = erasure_code_test.exe erasure_code_update_test.exe gf_inverse_test.exe gf_vect_mul_test.exe \ + pq_check_test.exe pq_gen_test.exe xor_check_test.exe xor_gen_test.exe checks: lib $(checks) $(checks): $(@B).obj @@ -73,7 +85,7 @@ tests: lib $(tests) $(tests): $(@B).obj # Performance tests -perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe +perfs = erasure_code_base_perf.exe erasure_code_perf.exe erasure_code_sse_perf.exe erasure_code_update_perf.exe gf_2vect_dot_prod_sse_perf.exe gf_3vect_dot_prod_sse_perf.exe gf_4vect_dot_prod_sse_perf.exe gf_5vect_dot_prod_sse_perf.exe gf_6vect_dot_prod_sse_perf.exe gf_vect_dot_prod_1tbl.exe gf_vect_dot_prod_avx_perf.exe gf_vect_dot_prod_perf.exe gf_vect_dot_prod_sse_perf.exe gf_vect_mad_perf.exe gf_vect_mul_avx_perf.exe gf_vect_mul_perf.exe gf_vect_mul_sse_perf.exe pq_gen_perf.exe xor_gen_perf.exe perfs: lib $(perfs) $(perfs): $(@B).obj diff --git a/Makefile.unx b/Makefile.unx index bc457b1..709f21a 100644 --- a/Makefile.unx +++ b/Makefile.unx @@ -27,7 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
######################################################################## -units = erasure_code +units = erasure_code raid default: lib diff --git a/include/raid.h b/include/raid.h new file mode 100644 index 0000000..192fca2 --- /dev/null +++ b/include/raid.h @@ -0,0 +1,302 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + + +#ifndef _RAID_H_ +#define _RAID_H_ + +/** + * @file raid.h + * @brief Interface to RAID functions - XOR and P+Q calculation. + * + * This file defines the interface to optimized XOR calculation (RAID5) or P+Q + * dual parity (RAID6). Operations are carried out on an array of pointers to + * sources and output arrays. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Multi-binary functions */ + +/** + * @brief Generate XOR parity vector from N sources, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 32B aligned. + * @param array Array of pointers to source and dest. 
For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int pq_gen(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs appropriate version. + * + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + * are assumed to be the last two pointers in the array. + * All pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_check(int vects, int len, void **array); + + +/* Arch specific versions */ + +/** + * @brief Generate XOR parity vector from N sources. + * @requires SSE4.1 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_sse(int vects, int len, void **array); + + +/** + * @brief Generate XOR parity vector from N sources. + * @requires AVX + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_avx(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors. 
+ * @requires SSE4.1 + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int xor_check_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires SSE4.1 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires AVX + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_avx(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources. + * @requires AVX2 + * + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 32B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest + * pointers must be aligned to 32B. 
+ * + * @returns 0 pass, other fail + */ + +int pq_gen_avx2(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors. + * @requires SSE4.1 + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + are assumed to be the last two pointers in the array. + All pointers must be aligned to 16B. + * @returns 0 pass, other fail + */ + +int pq_check_sse(int vects, int len, void **array); + + +/** + * @brief Generate P+Q parity vectors from N sources, runs baseline version. + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and dest. For P+Q the dest + * is the last two pointers. ie array[vects-2], + * array[vects-1]. P and Q parity vectors are + * written to these last two pointers. Src and dest pointers + * must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_gen_base(int vects, int len, void **array); + + +/** + * @brief Generate XOR parity vector from N sources, runs baseline version. + * @param vects Number of source+dest vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to source and dest. For XOR the dest is + * the last pointer. ie array[vects-1]. Src and dest pointers + * must be aligned to 32B. + * + * @returns 0 pass, other fail + */ + +int xor_gen_base(int vects, int len, void **array); + + +/** + * @brief Checks that array has XOR parity sum of 0 across all vectors, runs baseline version. + * + * @param vects Number of vectors in array. + * @param len Length of each vector in bytes. + * @param array Array of pointers to vectors. Src and dest pointers + * must be aligned to 16B. 
+ * + * @returns 0 pass, other fail + */ + +int xor_check_base(int vects, int len, void **array); + + +/** + * @brief Checks that array of N sources, P and Q are consistent across all vectors, runs baseline version. + * + * @param vects Number of vectors in array including P&Q. + * @param len Length of each vector in bytes. Must be 16B aligned. + * @param array Array of pointers to source and P, Q. P and Q parity + * are assumed to be the last two pointers in the array. + * All pointers must be aligned to 16B. + * + * @returns 0 pass, other fail + */ + +int pq_check_base(int vects, int len, void **array); + +#ifdef __cplusplus +} +#endif + +#endif //_RAID_H_ diff --git a/isa-l.def b/isa-l.def index 2b316c6..431d98e 100644 --- a/isa-l.def +++ b/isa-l.def @@ -54,3 +54,18 @@ gf_vect_mul @50 ec_encode_data_update @51 gf_vect_dot_prod @52 gf_vect_mad @53 +xor_gen @54 +xor_check @55 +pq_gen @56 +pq_check @57 +xor_gen_sse @58 +xor_gen_avx @59 +xor_check_sse @60 +pq_gen_sse @61 +pq_gen_avx @62 +pq_gen_avx2 @63 +pq_check_sse @64 +pq_gen_base @65 +xor_gen_base @66 +xor_check_base @67 +pq_check_base @68 diff --git a/raid/Makefile.am b/raid/Makefile.am new file mode 100644 index 0000000..7f9d6cf --- /dev/null +++ b/raid/Makefile.am @@ -0,0 +1,45 @@ +######################################################################## +# Copyright(c) 2011-2015 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## + +lsrc += raid/xor_gen_sse.asm raid/pq_gen_sse.asm raid/xor_check_sse.asm \ + raid/pq_check_sse.asm raid/pq_gen_avx.asm \ + raid/xor_gen_avx.asm raid/pq_gen_avx2.asm \ + raid/raid_base.c raid/raid_multibinary.asm + +extern_hdrs += include/raid.h + +other_src += include/test.h include/types.h + +check_tests += raid/xor_gen_test raid/pq_gen_test raid/xor_check_test raid/pq_check_test + +perf_tests += raid/xor_gen_perf raid/pq_gen_perf + +examples += raid/xor_example + +lsrc32 += xor_gen_sse.asm pq_gen_sse_i32.asm xor_check_sse.asm pq_check_sse_i32.asm raid_base.c diff --git a/raid/pq_check_sse.asm b/raid/pq_check_sse.asm new file mode 100644 index 0000000..96a8177 --- /dev/null +++ b/raid/pq_check_sse.asm @@ -0,0 +1,277 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. 
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq check of N source vectors using SSE4.1
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Checks P+Q parity of N (vects-2) sources in array of pointers
+;;; (**array). Last two pointers are the stored P and Q parity respectively.
+;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ %define stack_size  7*16 + 8 	; 7 xmm save slots; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm8, 2*16
+	save_xmm128	xmm9, 3*16
+	save_xmm128	xmm10, 4*16
+	save_xmm128	xmm11, 5*16
+	save_xmm128	xmm15, 6*16
+	end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm8, [rsp + 2*16]
+	movdqa	xmm9, [rsp + 3*16]
+	movdqa	xmm10, [rsp + 4*16]
+	movdqa	xmm11, [rsp + 5*16]
+	movdqa	xmm15, [rsp + 6*16]	; same slot FUNC_SAVE stored xmm15 in
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+%define xp3   xmm8
+%define xq3   xmm9
+%define xtmp3 xmm10
+%define xs3   xmm11
+
+%define xpoly xmm15
+
+;;; Use Non-temporal load/stor
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movdqa
+ %define XSTR movntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+	movdqa	xpoly, [poly]
+	cmp	len, 48
+	jl	loop16
+
+len_aligned_32bytes:
+	sub	len, 48			;Do end of vec first and run backward
+
+loop48:
+	mov	ptr, [arg2+8+vec*8] 	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	XLDR	xp2, [ptr+pos+16]	;Initialize xp2 with P2 src + 16B ahead
+	XLDR	xp3, [ptr+pos+32]	;Initialize xp3 with P3 src + 32B ahead
+	pxor	xq1, xq1		;q1 = 0
+	pxor	xq2, xq2		;q2 = 0
+	pxor	xq3, xq3		;q3 = 0
+
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+32]	;Preload last vector (source)
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector; sets flags for jg below
+	mov	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	pxor	xp1, xs1		; p1 ^= s1
+	pxor	xp2, xs2		; p2 ^= s2
+	pxor	xp3, xs3		; p3 ^= s3
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xq2, xs2		; q2 ^= s2
+	pxor	xq3, xs3		; q3 ^= s3
+	pxor	xtmp1, xtmp1		; xtmp1 = 0 - for compare to 0
+	pxor	xtmp2, xtmp2		; xtmp2 = 0
+	pxor	xtmp3, xtmp3		; xtmp3 = 0
+	pcmpgtb	xtmp1, xq1		; xtmp1 = mask 0xff or 0x00 if bit7 set
+	pcmpgtb	xtmp2, xq2		; xtmp2 = mask 0xff or 0x00 if bit7 set
+	pcmpgtb	xtmp3, xq3		; xtmp3 = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp1 = poly or 0x00
+	pand	xtmp2, xpoly		; xtmp2 = poly or 0x00
+	pand	xtmp3, xpoly		; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+32]	; Get next vector (source data3)
+	paddb	xq1, xq1		; q1 = q1<<1
+	paddb	xq2, xq2		; q2 = q2<<1
+	paddb	xq3, xq3		; q3 = q3<<1
+	pxor	xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	pxor	xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	pxor	xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0
+
+	pxor	xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	pxor	xq1, xs1		;q1 ^= 1 * s1[0]
+	pxor	xp2, xs2		;p2 ^= s2[0]
+	pxor	xq2, xs2		;q2 ^= 1 * s2[0]
+	pxor	xp3, xs3		;p3 ^= s3[0]
+	pxor	xq3, xs3		;q3 ^= 1 * s3[0]
+
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;Load stored Q1 into xtmp1
+	XLDR	xtmp2, [tmp+pos+16]	;Load stored Q2 into xtmp2, 16B ahead
+	XLDR	xtmp3, [tmp+pos+32]	;Load stored Q3 into xtmp3, 32B ahead
+
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+	pxor	xq2, xtmp2
+	pxor	xq3, xtmp3
+
+	por	xp1, xq1		;Confirm that all P&Q parity are 0
+	por	xp1, xp2
+	por	xp1, xq2
+	por	xp1, xp3
+	por	xp1, xq3
+	ptest	xp1, xp1
+	jnz	return_fail
+	add	pos, 48
+	cmp	pos, len
+	jle	loop48
+
+
+	;; ------------------------------
+	;; Do last 16 or 32 Bytes remaining
+	add	len, 48
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov	ptr, [arg2+8+vec*8] 	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	pxor	xq1, xq1		;q = 0
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+
+next_vect16:
+	sub	tmp, 1		  	;Inner loop for each source vector; sets flags for jg below
+	mov	ptr, [arg2+tmp*8] 	; get pointer to next vect
+	pxor	xq1, xs1		; q ^= s
+	pxor	xtmp1, xtmp1		; xtmp = 0
+	pcmpgtb	xtmp1, xq1		; xtmp = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp = poly or 0x00
+	pxor	xp1, xs1		; p ^= s
+	paddb	xq1, xq1		; q = q<<1
+	pxor	xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0
+
+	pxor	xp1, xs1		;p ^= s[0] - last source is already loaded
+	pxor	xq1, xs1		;q ^= 1 * s[0]
+
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;Load stored Q into xtmp1
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+
+	por	xp1, xq1		;Confirm that all P&Q parity are = 0
+	ptest	xp1, xp1
+	jnz	return_fail
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;;       func        core, ver, snum
+slversion pq_check_sse, 00,   06,  0033
diff --git a/raid/pq_check_sse_i32.asm b/raid/pq_check_sse_i32.asm
new file mode 100644
index 0000000..6c5915f
--- /dev/null
+++ b/raid/pq_check_sse_i32.asm
@@ -0,0 +1,282 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized P+Q parity check of N source vectors using SSE (ptest: SSE4.1)
+;;; int pq_check_sse(int vects, int len, void **array)
+
+;;; Recomputes P+Q parity from the N (vects-2) sources in the array of pointers
+;;; (**array) and compares with the last two pointers (P, Q). Returns 0 if ok.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define return rax
+ %define PS 8
+ %define tmp   r11
+ %define stack_size  3*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm15, 2*16	; xpoly lives in xmm15, callee-saved on win64
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm15, [rsp + 2*16]
+	add	rsp, stack_size
+ %endmacro
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0  edx
+ %define arg1  ecx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2  edi	; must sav/restore
+ %define arg3  esi
+ %define tmp   ebx
+
+ %macro FUNC_SAVE 0
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg0, arg(0)
+	mov	arg1, arg(1)
+	mov	arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp	;if has frame pointer?
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos return
+
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+%ifidn PS,8			; 64-bit code
+ default rel
+  [bits 64]
+ %define xpoly xmm15
+%elifidn PS,4			; 32-bit code
+ %define xpoly [poly]
+%endif
+
+;;; Use Non-temporal load/stor
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global pq_check_sse:function
+func(pq_check_sse)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+%ifidn PS,8
+	movdqa	xpoly, [poly]	;For 64-bit, load poly into high xmm reg
+%endif
+	cmp	len, 32
+	jl	loop16			;Less than 32 bytes: 16-byte loop only
+
+len_aligned_32bytes:
+	sub	len, 32			;Do end of vec first and run backward
+
+loop32:
+	mov	ptr, [arg2+PS+vec*PS]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Q address (tmp is overwritten below; reloaded after next_vect)
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	XLDR	xp2, [ptr+pos+16]	;Initialize xp2 with P2 src + 16B ahead
+	pxor	xq1, xq1		;q1 = 0
+	pxor	xq2, xq2		;q2 = 0
+
+	mov	ptr, [arg2+vec*PS]	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+
+next_vect:
+	sub	tmp, 1			;Inner loop for each source vector (sets flags for jg below)
+	mov	ptr, [arg2+tmp*PS]	; get pointer to next vect
+	pxor	xp1, xs1		; p1 ^= s1
+	pxor	xp2, xs2		; p2 ^= s2
+	pxor	xq1, xs1		; q1 ^= s1
+	pxor	xq2, xs2		; q2 ^= s2
+	pxor	xtmp1, xtmp1		; xtmp1 = 0 - for compare to 0
+	pxor	xtmp2, xtmp2		; xtmp2 = 0
+	pcmpgtb	xtmp1, xq1		; xtmp1 = mask 0xff or 0x00 if bit7 set
+	pcmpgtb	xtmp2, xq2		; xtmp2 = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp1 = poly or 0x00
+	pand	xtmp2, xpoly		; xtmp2 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	paddb	xq1, xq1		; q1 = q1<<1
+	paddb	xq2, xq2		; q2 = q2<<1
+	pxor	xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	pxor	xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0
+
+	pxor	xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	pxor	xq1, xs1		;q1 ^= 1 * s1[0]
+	pxor	xp2, xs2		;p2 ^= s2[0]
+	pxor	xq2, xs2		;q2 ^= 1 * s2[0]
+
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;Load stored Q1
+	XLDR	xtmp2, [tmp+pos+16]	;Load stored Q2, 16B ahead
+
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+	pxor	xq2, xtmp2
+
+	por	xp1, xq1		;Confirm that all P&Q parity are 0
+	por	xp1, xp2
+	por	xp1, xq2
+	ptest	xp1, xp1
+	jnz	return_fail
+	add	pos, 32
+	cmp	pos, len
+	jle	loop32
+
+
+	;; ------------------------------
+	;; Do last 16 Bytes remaining
+	add	len, 32
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov	ptr, [arg2+PS+vec*PS]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Q address (tmp is overwritten below; reloaded after next_vect16)
+	XLDR	xp1, [ptr+pos]		;Initialize xp1 with P1 src
+	pxor	xq1, xq1		;q = 0
+	mov	ptr, [arg2+vec*PS]	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+
+next_vect16:
+	sub	tmp, 1			;Inner loop for each source vector (sets flags for jg below)
+	mov	ptr, [arg2+tmp*PS]	; get pointer to next vect
+	pxor	xq1, xs1		; q ^= s
+	pxor	xtmp1, xtmp1		; xtmp = 0
+	pcmpgtb	xtmp1, xq1		; xtmp = mask 0xff or 0x00 if bit7 set
+	pand	xtmp1, xpoly		; xtmp = poly or 0x00
+	pxor	xp1, xs1		; p ^= s
+	paddb	xq1, xq1		; q = q<<1
+	pxor	xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0
+
+	pxor	xp1, xs1		;p ^= s[0] - last source is already loaded
+	pxor	xq1, xs1		;q ^= 1 * s[0]
+
+	mov	tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector
+	XLDR	xtmp1, [tmp+pos]	;Load stored Q
+	pxor	xq1, xtmp1		;xq1 = q1 calculated ^ q1 saved
+
+	por	xp1, xq1		;Confirm that all P&Q parity are = 0
+	ptest	xp1, xp1
+	jnz	return_fail
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;;       func         core, ver, snum
+slversion pq_check_sse, 00,   06,  0033
diff --git a/raid/pq_check_test.c b/raid/pq_check_test.c
new file mode 100644
index 0000000..8b6d0a1
--- /dev/null
+++ b/raid/pq_check_test.c
@@ -0,0 +1,304 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +int ref_multi_pq(int vects, int len, void **array) +{ + int i, j; + unsigned char p, q, s; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + q = p = src[vects - 3][i]; + + for (j = vects - 4; j >= 0; j--) { + p ^= s = src[j][i]; + q = s ^ ((q << 1) ^ ((q & 0x80) ? 
0x1d : 0)); // mult by GF{2} + } + + src[vects - 2][i] = p; // second to last pointer is p + src[vects - 1][i] = q; // last pointer is q + } + return 0; +} + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 2]; + char c; + char *tmp_buf[TEST_SOURCES + 2]; + int serr, lerr; + + printf("Test pq_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN); + + srand(TEST_SEED); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 2; i++) { + void *buf; + if (posix_memalign(&buf, 16, TEST_LEN)) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros + for (i = 0; i < TEST_SOURCES + 2; i++) + memset(buffs[i], 0, TEST_LEN); + + ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs); + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("\nfail zero test %d\n", ret); + } + + ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt buffer test %d\n", ret); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer + + // Test corrupted buffer any location on all sources + for (j = 0; j < TEST_SOURCES + 2; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + ((char *)buffs[j])[i] = 0x5; // corrupt buffer + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt zero buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = 0; // un-corrupt buffer + } + putchar('.'); + } + + // Test rand1 + for (i = 0; i < TEST_SOURCES + 2; i++) + rand_buffer(buffs[i], TEST_LEN); + + ref_multi_pq(TEST_SOURCES + 2, TEST_LEN, buffs); + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("fail first 
rand test %d\n", ret); + } + + c = ((char *)(buffs[0]))[TEST_LEN - 2]; + ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1; + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nFail corrupt buffer test, passed when should have failed\n"); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer + + // Test corrupted buffer any location on all sources w/ random data + for (j = 0; j < TEST_SOURCES + 2; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + // Check it still passes + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret != 0) { // should pass + fail++; + printf + ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n", + j, i); + return 1; + } + c = ((char *)buffs[j])[i]; + ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer + ret = pq_check(TEST_SOURCES + 2, TEST_LEN, buffs); + if (ret == 0) { // Check it now fails + fail++; + printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = c; // un-corrupt buffer + } + putchar('.'); + } + + // Test various number of sources, full length + for (j = 4; j <= TEST_SOURCES + 2; j++) { + // New random data + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + // Generate p,q parity for this number of sources + ref_multi_pq(j, TEST_LEN, buffs); + + // Set errors up in each source and len position + for (i = 0; i < j; i++) { + for (k = 0; k < TEST_LEN; k++) { + // See if it still passes + ret = pq_check(j, TEST_LEN, buffs); + if (ret != 0) { // Should pass + printf("\nfail rand fixed len test %d sources\n", j); + fail++; + return 1; + } + + c = ((char *)buffs[i])[k]; + ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer + + ret = pq_check(j, TEST_LEN, buffs); + if (ret == 0) { // Should fail + printf + ("\nfail rand fixed len test corrupted buffer %d sources\n", + j); + fail++; + return 1; + } + ((char *)buffs[i])[k] = c; // un-corrupt buffer + } + } + putchar('.'); + } + + fflush(0); + + // Test various number of sources 
and len + k = 16; + while (k <= TEST_LEN) { + char *tmp; + for (j = 4; j <= TEST_SOURCES + 2; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + // Generate p,q parity for this number of sources + ref_multi_pq(j, k, buffs); + + // Inject errors at various source and len positions + for (lerr = 0; lerr < k; lerr++) { + for (serr = 0; serr < j; serr++) { + // See if it still passes + ret = pq_check(j, k, buffs); + if (ret != 0) { // Should pass + printf + ("\nfail rand var src, len test %d sources, len=%d\n", + j, k); + fail++; + return 1; + } + + tmp = (char *)buffs[serr]; + c = tmp[lerr]; + ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer + + ret = pq_check(j, k, buffs); + if (ret == 0) { // Should fail + printf + ("\nfail rand var src, len test corrupted buffer " + "%d sources, len=%d, ret=%d\n", j, k, + ret); + fail++; + return 1; + } + ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer + } + } + putchar('.'); + fflush(0); + } + k += 16; + } + + // Test at the end of buffer + for (i = 0; i < TEST_LEN; i += 16) { + for (j = 0; j < TEST_SOURCES + 2; j++) { + rand_buffer(buffs[j], TEST_LEN - i); + tmp_buf[j] = (char *)buffs[j] + i; + } + + pq_gen_base(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); + + // Test good data + ret = pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); + if (ret != 0) { + printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i); + fail++; + return 1; + } + // Test bad data + for (serr = 0; serr < TEST_SOURCES + 2; serr++) { + for (lerr = 0; lerr < (TEST_LEN - i); lerr++) { + c = tmp_buf[serr][lerr]; + tmp_buf[serr][lerr] = c ^ 1; + + ret = + pq_check(TEST_SOURCES + 2, TEST_LEN - i, (void *)tmp_buf); + if (ret == 0) { + printf("fail end test corrupted buffer - " + "offset: %d, len: %d, ret: %d\n", i, + TEST_LEN - i, ret); + fail++; + return 1; + } + + tmp_buf[serr][lerr] = c; + } + } + + putchar('.'); + fflush(0); + } + + if (fail == 0) + printf("Pass\n"); + + return fail; + +} diff --git 
a/raid/pq_gen_avx.asm b/raid/pq_gen_avx.asm new file mode 100644 index 0000000..43c31a5 --- /dev/null +++ b/raid/pq_gen_avx.asm @@ -0,0 +1,254 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX
+;;; int pq_gen_avx(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array).  Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 16 bytes.  Length must be 16 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ %define stack_size  8*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	save_xmm128	xmm8, 2*16
+	save_xmm128	xmm9, 3*16
+	save_xmm128	xmm10, 4*16
+	save_xmm128	xmm11, 5*16
+	save_xmm128	xmm14, 6*16
+	save_xmm128	xmm15, 7*16
+	end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	movdqa	xmm8, [rsp + 2*16]
+	movdqa	xmm9, [rsp + 3*16]
+	movdqa	xmm10, [rsp + 4*16]
+	movdqa	xmm11, [rsp + 5*16]
+	movdqa	xmm14, [rsp + 6*16]
+	movdqa	xmm15, [rsp + 7*16]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1   xmm0
+%define xq1   xmm1
+%define xtmp1 xmm2
+%define xs1   xmm3
+
+%define xp2   xmm4
+%define xq2   xmm5
+%define xtmp2 xmm6
+%define xs2   xmm7
+
+%define xp3   xmm8
+%define xq3   xmm9
+%define xtmp3 xmm10
+%define xs3   xmm11
+
+%define xzero xmm14
+%define xpoly xmm15
+
+;;; Use Non-temporal load/stor
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx:function
+func(pq_gen_avx)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (16-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+	vmovdqa	xpoly, [poly]		;Every byte = 0x1d
+	vpxor	xzero, xzero, xzero
+	cmp	len, 48
+	jl	loop16			;Less than 48 bytes: 16-byte loop only
+
+len_aligned_32bytes:
+	sub	len, 48			;Len points to last block
+
+loop48:
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+16]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+32]	;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p1 = 0
+	vpxor	xp2, xp2, xp2		;p2 = 0
+	vpxor	xp3, xp3, xp3		;p3 = 0
+	vpxor	xq1, xq1, xq1		;q1 = 0
+	vpxor	xq2, xq2, xq2		;q2 = 0
+	vpxor	xq3, xq3, xq3		;q3 = 0
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector (sets flags for jg below)
+	mov 	ptr, [arg2+tmp*8]       ; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpxor	xq2, xq2, xs2		; q2 ^= s2
+	vpxor	xq3, xq3, xs3		; q3 ^= s3
+	vpxor	xp1, xp1, xs1		; p1 ^= s1
+	vpxor	xp2, xp2, xs2		; p2 ^= s2
+	vpxor	xp3, xp3, xs3		; p3 ^= s3
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+	vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+16]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+32]	; Get next vector (source data3)
+	vpaddb	xq1, xq1, xq1		; q1 = q1<<1
+	vpaddb	xq2, xq2, xq2		; q2 = q2<<1
+	vpaddb	xq3, xq3, xq3		; q3 = q3<<1
+	vpxor	xq1, xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	vpxor	xq2, xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	vpxor	xq3, xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q1 ^= 1 * s1[0]
+	vpxor	xp2, xp2, xs2		;p2 ^= s2[0]
+	vpxor	xq2, xq2, xs2		;q2 ^= 1 * s2[0]
+	vpxor	xp3, xp3, xs3		;p3 ^= s3[0]
+	vpxor	xq3, xq3, xs3		;q3 ^= 1 * s3[0]
+	XSTR	[ptr+pos], xp1		;Write parity P1 vector
+	XSTR	[ptr+pos+16], xp2	;Write parity P2 vector
+	XSTR	[ptr+pos+32], xp3	;Write parity P3 vector
+	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
+	XSTR	[tmp+pos+16], xq2	;Write parity Q2 vector
+	XSTR	[tmp+pos+32], xq3	;Write parity Q3 vector
+	add	pos, 48
+	cmp	pos, len
+	jle	loop48
+
+	;; ------------------------------
+	;; Do last 16 or 32 Bytes remaining
+	add	len, 48			;Restore original len
+	cmp	pos, len
+	je	return_pass
+
+loop16:
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p = 0
+	vpxor	xq1, xq1, xq1		;q = 0
+
+next_vect16:
+	sub	tmp, 1		  	;Inner loop for each source vector (sets flags for jg below)
+	mov 	ptr, [arg2+tmp*8]       ; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpxor	xp1, xp1, xs1		; p ^= s
+	vpaddb	xq1, xq1, xq1		; q = q<<1
+	vpxor	xq1, xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect16		; Loop for each vect except 0
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p ^= s[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q ^= 1 * s[0]
+	XSTR	[ptr+pos], xp1		;Write parity P vector
+	XSTR	[tmp+pos], xq1		;Write parity Q vector
+	add	pos, 16
+	cmp	pos, len
+	jl	loop16
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+align 16
+poly:
+dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d
+
+;;; func core, ver,
snum +slversion pq_gen_avx, 02, 0a, 0039 diff --git a/raid/pq_gen_avx2.asm b/raid/pq_gen_avx2.asm new file mode 100644 index 0000000..96797a6 --- /dev/null +++ b/raid/pq_gen_avx2.asm @@ -0,0 +1,256 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Optimized pq of N source vectors using AVX2
+;;; int pq_gen_avx2(int vects, int len, void **array)
+
+;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers
+;;; (**array).  Last two pointers are the P and Q destinations respectively.
+;;; Vectors must be aligned to 32 bytes.  Length must be 32 byte aligned.
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define return rax
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define return rax
+ %define stack_size  8*32 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+	alloc_stack	stack_size
+	;; Until a sav_ymm256 is defined
+	vmovdqu	[rsp + 0*32], ymm6
+	vmovdqu	[rsp + 1*32], ymm7
+	vmovdqu	[rsp + 2*32], ymm8
+	vmovdqu	[rsp + 3*32], ymm9
+	vmovdqu	[rsp + 4*32], ymm10
+	vmovdqu	[rsp + 5*32], ymm11
+	vmovdqu	[rsp + 6*32], ymm14
+	vmovdqu	[rsp + 7*32], ymm15
+	end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	vmovdqu	ymm6, [rsp + 0*32]
+	vmovdqu	ymm7, [rsp + 1*32]
+	vmovdqu	ymm8, [rsp + 2*32]
+	vmovdqu	ymm9, [rsp + 3*32]
+	vmovdqu	ymm10, [rsp + 4*32]
+	vmovdqu	ymm11, [rsp + 5*32]
+	vmovdqu	ymm14, [rsp + 6*32]
+	vmovdqu	ymm15, [rsp + 7*32]
+	add	rsp, stack_size
+ %endmacro
+%endif
+
+%define vec arg0
+%define len arg1
+%define ptr arg3
+%define pos rax
+
+%define xp1   ymm0
+%define xq1   ymm1
+%define xtmp1 ymm2
+%define xs1   ymm3
+
+%define xp2   ymm4
+%define xq2   ymm5
+%define xtmp2 ymm6
+%define xs2   ymm7
+
+%define xp3   ymm8
+%define xq3   ymm9
+%define xtmp3 ymm10
+%define xs3   ymm11
+
+%define xzero ymm14
+%define xpoly ymm15
+
+;;; Use Non-temporal load/stor
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+align 16
+global pq_gen_avx2:function
+func(pq_gen_avx2)
+	FUNC_SAVE
+	sub	vec, 3			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources
+	cmp	len, 0
+	je	return_pass
+	test	len, (32-1)		;Check alignment of length
+	jnz	return_fail
+	mov	pos, 0
+	vmovdqa	xpoly, [poly]		;Every byte = 0x1d
+	vpxor	xzero, xzero, xzero
+	cmp	len, 96
+	jl	loop32			;Less than 96 bytes: 32-byte loop only
+
+len_aligned_32bytes:
+	sub	len, 3*32		;Len points to last block
+
+loop96:
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	XLDR	xs2, [ptr+pos+32]	;Preload last vector (source)
+	XLDR	xs3, [ptr+pos+64]	;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p1 = 0
+	vpxor	xp2, xp2, xp2		;p2 = 0
+	vpxor	xp3, xp3, xp3		;p3 = 0
+	vpxor	xq1, xq1, xq1		;q1 = 0
+	vpxor	xq2, xq2, xq2		;q2 = 0
+	vpxor	xq3, xq3, xq3		;q3 = 0
+
+next_vect:
+	sub	tmp, 1		  	;Inner loop for each source vector (sets flags for jg below)
+	mov 	ptr, [arg2+tmp*8]       ; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpxor	xq2, xq2, xs2		; q2 ^= s2
+	vpxor	xq3, xq3, xs3		; q3 ^= s3
+	vpxor	xp1, xp1, xs1		; p1 ^= s1
+	vpxor	xp2, xp2, xs2		; p2 ^= s2
+	vpxor	xp3, xp3, xs3		; p3 ^= s3
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpblendvb xtmp2, xzero, xpoly, xq2 ; xtmp2 = poly or 0x00
+	vpblendvb xtmp3, xzero, xpoly, xq3 ; xtmp3 = poly or 0x00
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data1)
+	XLDR	xs2, [ptr+pos+32]	; Get next vector (source data2)
+	XLDR	xs3, [ptr+pos+64]	; Get next vector (source data3)
+	vpaddb	xq1, xq1, xq1		; q1 = q1<<1
+	vpaddb	xq2, xq2, xq2		; q2 = q2<<1
+	vpaddb	xq3, xq3, xq3		; q3 = q3<<1
+	vpxor	xq1, xq1, xtmp1		; q1 = q1<<1 ^ poly_masked
+	vpxor	xq2, xq2, xtmp2		; q2 = q2<<1 ^ poly_masked
+	vpxor	xq3, xq3, xtmp3		; q3 = q3<<1 ^ poly_masked
+	jg	next_vect		; Loop for each vect except 0
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p1 ^= s1[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q1 ^= 1 * s1[0]
+	vpxor	xp2, xp2, xs2		;p2 ^= s2[0]
+	vpxor	xq2, xq2, xs2		;q2 ^= 1 * s2[0]
+	vpxor	xp3, xp3, xs3		;p3 ^= s3[0]
+	vpxor	xq3, xq3, xs3		;q3 ^= 1 * s3[0]
+	XSTR	[ptr+pos], xp1		;Write parity P1 vector
+	XSTR	[ptr+pos+32], xp2	;Write parity P2 vector
+	XSTR	[ptr+pos+64], xp3	;Write parity P3 vector
+	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
+	XSTR	[tmp+pos+32], xq2	;Write parity Q2 vector
+	XSTR	[tmp+pos+64], xq3	;Write parity Q3 vector
+	add	pos, 3*32
+	cmp	pos, len
+	jle	loop96
+
+	;; ------------------------------
+	;; Do last 32 or 64 Bytes remaining
+	add	len, 3*32		;Restore original len
+	cmp	pos, len
+	je	return_pass
+
+loop32:
+	mov	ptr, [arg2+vec*8] 	;Fetch last source pointer
+	mov	tmp, vec		;Set tmp to point back to last vector
+	XLDR	xs1, [ptr+pos]		;Preload last vector (source)
+	vpxor	xp1, xp1, xp1		;p = 0
+	vpxor	xq1, xq1, xq1		;q = 0
+
+next_vect32:
+	sub	tmp, 1		  	;Inner loop for each source vector (sets flags for jg below)
+	mov 	ptr, [arg2+tmp*8]       ; get pointer to next vect
+	vpxor	xq1, xq1, xs1		; q1 ^= s1
+	vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
+	vpxor	xp1, xp1, xs1		; p ^= s
+	vpaddb	xq1, xq1, xq1		; q = q<<1
+	vpxor	xq1, xq1, xtmp1		; q = q<<1 ^ poly_masked
+	XLDR	xs1, [ptr+pos]		; Get next vector (source data)
+	jg	next_vect32		; Loop for each vect except 0
+
+	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
+	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
+	vpxor	xp1, xp1, xs1		;p ^= s[0] - last source is already loaded
+	vpxor	xq1, xq1, xs1		;q ^= 1 * s[0]
+	XSTR	[ptr+pos], xp1		;Write parity P vector
+	XSTR	[tmp+pos], xq1		;Write parity Q vector
+	add	pos, 32
+	cmp	pos, len
+	jl	loop32
+
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+ +align 32 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_avx2, 04, 03, 0041 diff --git a/raid/pq_gen_perf.c b/raid/pq_gen_perf.c new file mode 100644 index 0000000..7d9289b --- /dev/null +++ b/raid/pq_gen_perf.c @@ -0,0 +1,97 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include +#include +#include "raid.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_SOURCES 10 +# define TEST_LEN 8*1024 +# define TEST_LOOPS 40000 +# define TEST_TYPE_STR "_warm" +#else +# ifndef TEST_CUSTOM +// Uncached test. Pull from large mem base. +# define TEST_SOURCES 10 +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN ((GT_L3_CACHE / TEST_SOURCES) & ~(64-1)) +# define TEST_LOOPS 1000 +# define TEST_TYPE_STR "_cold" +# else +# define TEST_TYPE_STR "_cus" +# ifndef TEST_LOOPS +# define TEST_LOOPS 1000 +# endif +# endif +#endif + +#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN)) + +int main(int argc, char *argv[]) +{ + int i; + void *buffs[TEST_SOURCES + 2]; + struct perf start, stop; + + printf("Test pq_gen_perf %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 2; i++) { + int ret; + void *buf; + ret = posix_memalign(&buf, 32, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Setup data + for (i = 0; i < TEST_SOURCES + 2; i++) + memset(buffs[i], 0, TEST_LEN); + + // Warm up + pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs); + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs); + perf_stop(&stop); + printf("pq_gen" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + return 0; +} diff --git a/raid/pq_gen_sse.asm b/raid/pq_gen_sse.asm new file mode 100644 index 0000000..1426f3f --- /dev/null +++ b/raid/pq_gen_sse.asm @@ -0,0 +1,258 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using SSE3 +;;; int pq_gen_sse(int vects, int len, void **array) + +;;; Generates P+Q parity vector from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be 16 byte aligned. 
+ +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp3 arg4 + %define return rax + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define tmp r11 + %define tmp3 r10 + %define return rax + %define stack_size 7*16 + 8 ; room for the 7 xmm registers saved below; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm8, 2*16 + save_xmm128 xmm9, 3*16 + save_xmm128 xmm10, 4*16 + save_xmm128 xmm11, 5*16 + save_xmm128 xmm15, 6*16 + end_prolog + %endmacro + + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm8, [rsp + 2*16] + movdqa xmm9, [rsp + 3*16] + movdqa xmm10, [rsp + 4*16] + movdqa xmm11, [rsp + 5*16] + movdqa xmm15, [rsp + 6*16] + add rsp, stack_size + %endmacro +%endif + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos rax + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%define xp3 xmm8 +%define xq3 xmm9 +%define xtmp3 xmm10 +%define xs3 xmm11 + +%define xpoly xmm15 + +;;; Use non-temporal load/store unless NO_NT_LDST is defined +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +default rel + +[bits 64] +section .text + +align 16 +global pq_gen_sse:function +func(pq_gen_sse) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass ;Zero length: nothing to do + test len, (16-1) ;Length must be a multiple of 16 + jnz return_fail + mov pos, 0 ;pos = byte offset into every vector + movdqa xpoly, [poly] ;0x1d in every byte lane + cmp len, 48 + jl loop16 ;Fewer than 48 bytes: 16-byte loop only + +len_aligned_32bytes: ;NOTE: despite the name, this path works on 48-byte blocks + sub len, 48 ;Len = offset of the last full 48-byte block + 
+loop48: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] ;Preload last vector (source) + XLDR xs3, [ptr+pos+32] ;Preload last vector (source) + pxor xp1, xp1 ;p1 = 0 + pxor xp2, xp2 ;p2 = 0 + pxor xp3, xp3 ;p3 = 0 + pxor xq1, xq1 ;q1 = 0 + pxor xq2, xq2 ;q2 = 0 + pxor xq3, xq3 ;q3 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below + mov ptr, [arg2+tmp*8] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xq2, xs2 ; q2 ^= s2 + pxor xq3, xs3 ; q3 ^= s3 + pxor xp1, xs1 ; p1 ^= s1 + pxor xp2, xs2 ; p2 ^= s2 + pxor xp3, xs3 ; p3 ^= s3 + pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0 + pxor xtmp2, xtmp2 ; xtmp2 = 0 + pxor xtmp3, xtmp3 ; xtmp3 = 0 + pcmpgtb xtmp1, xq1 ; xtmp1 = 0xff in each byte where q1 has bit7 set (signed 0 > q1), else 0x00 + pcmpgtb xtmp2, xq2 ; xtmp2 = 0xff in each byte where q2 has bit7 set, else 0x00 + pcmpgtb xtmp3, xq3 ; xtmp3 = 0xff in each byte where q3 has bit7 set, else 0x00 + pand xtmp1, xpoly ; xtmp1 = poly or 0x00 + pand xtmp2, xpoly ; xtmp2 = poly or 0x00 + pand xtmp3, xpoly ; xtmp3 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + XLDR xs3, [ptr+pos+32] ; Get next vector (source data3) + paddb xq1, xq1 ; q1 = q1<<1 + paddb xq2, xq2 ; q2 = q2<<1 + paddb xq3, xq3 ; q3 = q3<<1 + pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + pxor xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 (flags still from 'sub tmp, 1'; mov and SSE ops leave them alone) + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + pxor xp1, xs1 ;p1 ^= s1[0] - source 0 was loaded by the final loop pass + pxor xq1, xs1 ;q1 ^= 1 * s1[0] + pxor xp2, xs2 ;p2 ^= s2[0] + pxor xq2, xs2 ;q2 ^= 1 * s2[0] + pxor xp3, xs3 ;p3 ^= s3[0] + pxor xq3, xs3 ;q3 ^= 1 * s3[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+16], xp2 ;Write parity P2 vector + 
XSTR [ptr+pos+32], xp3 ;Write parity P3 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector + XSTR [tmp+pos+32], xq3 ;Write parity Q3 vector + add pos, 48 ;Advance to the next 48-byte block + cmp pos, len + jle loop48 ;Another full 48-byte block remains + + ;; ------------------------------ + ;; Do last 16 or 32 Bytes remaining + add len, 48 ;Restore the true length + cmp pos, len + je return_pass ;Nothing left over + +loop16: + mov ptr, [arg2+vec*8] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + pxor xp1, xp1 ;p = 0 + pxor xq1, xq1 ;q = 0 + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below + mov ptr, [arg2+tmp*8] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xtmp1, xtmp1 ; xtmp = 0 + pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp = poly or 0x00 + pxor xp1, xs1 ; p ^= s + paddb xq1, xq1 ; q = q<<1 + pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + mov ptr, [arg2+8+vec*8] ;Get address of P parity vector + mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector + pxor xp1, xs1 ;p ^= s[0] - source 0 was loaded by the final loop pass + pxor xq1, xs1 ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1 ;Write parity P vector + XSTR [tmp+pos], xq1 ;Write parity Q vector + add pos, 16 + cmp pos, len + jl loop16 ;More 16-byte blocks remain + + +return_pass: + mov return, 0 ;0 = success + FUNC_RESTORE + ret + +return_fail: + mov return, 1 ;1 = bad arguments (too few vectors or len not a multiple of 16) + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_sse, 00, 09, 0032 diff --git a/raid/pq_gen_sse_i32.asm b/raid/pq_gen_sse_i32.asm new file mode 100644 index 0000000..16093d5 --- /dev/null +++ b/raid/pq_gen_sse_i32.asm @@ -0,0 +1,264 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized pq of N source vectors using SSE (SSE2 integer ops; the default non-temporal load, movntdqa, is SSE4.1) +;;; int pq_gen_sse(int vects, int len, void **array) - returns 0 on success, 1 on bad arguments + +;;; Generates P+Q parity vectors from N (vects-2) sources in array of pointers +;;; (**array). Last two pointers are the P and Q destinations respectively. +;;; Vectors must be aligned to 16 bytes. Length must be a multiple of 16 bytes. 
+ +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define PS 8 + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define return rax + %define PS 8 + %define tmp r10 + %define stack_size 3*16 + 8 ; save area for xmm6, xmm7, xmm15; must be an odd multiple of 8 + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + save_xmm128 xmm15, 2*16 ; xpoly is xmm15 in 64-bit builds; xmm6-xmm15 are callee-saved on win64 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + movdqa xmm15, [rsp + 2*16] + add rsp, stack_size + %endmacro +%elifidn __OUTPUT_FORMAT__, elf32 + %define arg0 edx + %define arg1 ecx + %define return eax + %define PS 4 + %define func(x) x: + %define arg(x) [ebp+8+PS*x] + %define arg2 edi ; must save/restore + %define arg3 esi + %define tmp ebx + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg0, arg(0) + mov arg1, arg(1) + mov arg2, arg(2) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp ;if has frame pointer? 
+ pop ebp + %endmacro + +%endif ; output formats + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos return + +%define xp1 xmm0 +%define xq1 xmm1 +%define xtmp1 xmm2 +%define xs1 xmm3 + +%define xp2 xmm4 +%define xq2 xmm5 +%define xtmp2 xmm6 +%define xs2 xmm7 + +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] + %define xpoly xmm15 +%elifidn PS,4 ; 32-bit code + %define xpoly [poly] ; xmm0-xmm7 are all in use above, so poly is read from memory +%endif + +;;; Use non-temporal load/store unless NO_NT_LDST is defined +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +section .text + +align 16 +global pq_gen_sse:function +func(pq_gen_sse) + FUNC_SAVE + sub vec, 3 ;Keep as offset to last source + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass ;Zero length: nothing to do + test len, (16-1) ;Length must be a multiple of 16 + jnz return_fail + mov pos, 0 ;pos = byte offset into every vector +%ifidn PS,8 + movdqa xpoly, [poly] ;For 64-bit, load poly into high xmm reg +%endif + cmp len, 32 + jl loop16 ;Fewer than 32 bytes: 16-byte loop only + +len_aligned_32bytes: + sub len, 32 ;Len = offset of the last full 32-byte block (pos runs forward from 0) + +loop32: + mov ptr, [arg2+vec*PS] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + XLDR xs2, [ptr+pos+16] ;Preload last vector (source) + pxor xp1, xp1 ;p1 = 0 + pxor xq1, xq1 ;q1 = 0 + pxor xp2, xp2 ;p2 = 0 + pxor xq2, xq2 ;q2 = 0 + +next_vect: + sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below + mov ptr, [arg2+tmp*PS] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xq2, xs2 ; q2 ^= s2 + pxor xp1, xs1 ; p1 ^= s1 + pxor xp2, xs2 ; p2 ^= s2 + pxor xtmp1, xtmp1 ; xtmp1 = 0 - for compare to 0 + pxor xtmp2, xtmp2 ; xtmp2 = 0 + pcmpgtb xtmp1, xq1 ; xtmp1 = mask 0xff or 0x00 if bit7 set + pcmpgtb xtmp2, xq2 ; xtmp2 = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp1 = poly or 0x00 + pand xtmp2, xpoly ; xtmp2 = poly or 0x00 + XLDR xs1, [ptr+pos] ; Get next vector (source data1) + XLDR xs2, [ptr+pos+16] ; Get next vector (source data2) + paddb 
xq1, xq1 ; q1 = q1<<1 + paddb xq2, xq2 ; q2 = q2<<1 + pxor xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked + pxor xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked + jg next_vect ; Loop for each vect except 0 (flags still from 'sub tmp, 1'; mov and SSE ops leave them alone) + + mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + pxor xp1, xs1 ;p1 ^= s1[0] - source 0 was loaded by the final loop pass + pxor xq1, xs1 ;q1 ^= 1 * s1[0] + pxor xp2, xs2 ;p2 ^= s2[0] + pxor xq2, xs2 ;q2 ^= 1 * s2[0] + XSTR [ptr+pos], xp1 ;Write parity P1 vector + XSTR [ptr+pos+16], xp2 ;Write parity P2 vector + XSTR [tmp+pos], xq1 ;Write parity Q1 vector + XSTR [tmp+pos+16], xq2 ;Write parity Q2 vector + add pos, 32 ;Advance to the next 32-byte block + cmp pos, len + jle loop32 ;Another full 32-byte block remains + + ;; ------------------------------ + ;; Do last 16 Bytes remaining + add len, 32 ;Restore the true length + cmp pos, len + je return_pass ;Nothing left over + +loop16: + mov ptr, [arg2+vec*PS] ;Fetch last source pointer + mov tmp, vec ;Set tmp to point back to last vector + XLDR xs1, [ptr+pos] ;Preload last vector (source) + pxor xp1, xp1 ;p = 0 + pxor xq1, xq1 ;q = 0 + +next_vect16: + sub tmp, 1 ;Inner loop for each source vector; sets the flags tested by jg below + mov ptr, [arg2+tmp*PS] ; get pointer to next vect + pxor xq1, xs1 ; q1 ^= s1 + pxor xtmp1, xtmp1 ; xtmp = 0 + pcmpgtb xtmp1, xq1 ; xtmp = mask 0xff or 0x00 if bit7 set + pand xtmp1, xpoly ; xtmp = poly or 0x00 + pxor xp1, xs1 ; p ^= s + paddb xq1, xq1 ; q = q<<1 + pxor xq1, xtmp1 ; q = q<<1 ^ poly_masked + XLDR xs1, [ptr+pos] ; Get next vector (source data) + jg next_vect16 ; Loop for each vect except 0 + + mov ptr, [arg2+PS+vec*PS] ;Get address of P parity vector + mov tmp, [arg2+(2*PS)+vec*PS] ;Get address of Q parity vector + pxor xp1, xs1 ;p ^= s[0] - source 0 was loaded by the final loop pass + pxor xq1, xs1 ;q ^= 1 * s[0] + XSTR [ptr+pos], xp1 ;Write parity P vector + XSTR [tmp+pos], xq1 ;Write parity Q vector + add pos, 16 + cmp pos, len + jl loop16 ;More 16-byte blocks remain + + +return_pass: + mov return, 0 ;0 = success + FUNC_RESTORE + ret + + +return_fail: + mov return, 1 ;1 = bad arguments (too few vectors or len not a multiple of 16) + FUNC_RESTORE + ret + +endproc_frame + +section .data + 
+align 16 +poly: +dq 0x1d1d1d1d1d1d1d1d, 0x1d1d1d1d1d1d1d1d + +;;; func core, ver, snum +slversion pq_gen_sse, 00, 08, 0032 diff --git a/raid/pq_gen_test.c b/raid/pq_gen_test.c new file mode 100644 index 0000000..d084496 --- /dev/null +++ b/raid/pq_gen_test.c @@ -0,0 +1,194 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include +#include +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 2)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +// Fills buf[0 .. buffer_size) with pseudo-random bytes taken from rand() + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int dump(unsigned char *buf, int len) // Prints len bytes of buf in hex, 16 per row; always returns 0 +{ + int i; + for (i = 0; i < len;) { + printf(" %2x", buf[i++]); + if (i % 16 == 0) + printf("\n"); + } + printf("\n"); + return 0; +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 2]; // Pointers to src and dest + char *tmp_buf[TEST_SOURCES + 2]; + + printf("Test pq_gen_test "); + + srand(TEST_SEED); + + // Allocate the arrays: TEST_SOURCES + 2 buffers of TEST_LEN bytes, 32-byte aligned + for (i = 0; i < TEST_SOURCES + 2; i++) { + void *buf; + ret = posix_memalign(&buf, 32, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros: both output buffers must stay all zero + for (i = 0; i < TEST_SOURCES + 2; i++) + memset(buffs[i], 0, TEST_LEN); + + pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs); + + for (i = 0; i < TEST_LEN; i++) { + if (((char *)buffs[TEST_SOURCES])[i] != 0) + fail++; + } + + for (i = 0; i < TEST_LEN; i++) { + if (((char *)buffs[TEST_SOURCES + 1])[i] != 0) + fail++; + } + + if (fail > 0) { + printf("fail zero test %d\n", fail); + return 1; + } else + putchar('.'); + + // Test rand1: random data in every buffer, result verified with pq_check_base + for (i = 0; i < TEST_SOURCES + 2; i++) + rand_buffer(buffs[i], TEST_LEN); + + ret = pq_gen(TEST_SOURCES + 2, TEST_LEN, buffs); + fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN, buffs); + + if (fail > 0) { + int t; + printf(" Fail rand test1 fail=%d, ret=%d\n", fail, ret); + for (t = 0; t < TEST_SOURCES + 2; t++) + dump(buffs[t], 15); + + printf(" reference function p,q\n"); + pq_gen_base(TEST_SOURCES + 2, TEST_LEN, buffs); + for (t = TEST_SOURCES; 
t < TEST_SOURCES + 2; t++) + dump(buffs[t], 15); + + return 1; + } else + putchar('.'); + + // Test various number of sources (vects = 4 .. TEST_SOURCES + 2) + for (j = 4; j <= TEST_SOURCES + 2; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + pq_gen(j, TEST_LEN, buffs); + fail |= pq_check_base(j, TEST_LEN, buffs); + + if (fail > 0) { + printf("fail rand test %d sources\n", j); + return 1; + } else + putchar('.'); + } + + fflush(0); + + // Test various number of sources and len (len = 0 .. TEST_LEN in steps of 32) + k = 0; + while (k <= TEST_LEN) { + for (j = 4; j <= TEST_SOURCES + 2; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + ret = pq_gen(j, k, buffs); + fail |= pq_check_base(j, k, buffs); + + if (fail > 0) { + printf("fail rand test %d sources, len=%d, fail=" + "%d, ret=%d\n", j, k, fail, ret); + return 1; + } + } + putchar('.'); + k += 32; + } + + // Test at the end of buffer: every pointer is advanced by k so each vector ends where its allocation ends + k = 0; + while (k <= TEST_LEN) { + for (j = 0; j < (TEST_SOURCES + 2); j++) { + rand_buffer(buffs[j], TEST_LEN - k); // NOTE(review): refreshes bytes [0, TEST_LEN - k) of the allocation, not the tmp_buf window [k, TEST_LEN) - confirm intent + tmp_buf[j] = (char *)buffs[j] + k; + } + + ret = pq_gen(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf); + fail |= pq_check_base(TEST_SOURCES + 2, TEST_LEN - k, (void *)tmp_buf); + + if (fail > 0) { + printf("fail end test - offset: %d, len: %d, fail: %d, " + "ret: %d\n", k, TEST_LEN - k, fail, ret); + return 1; + } + + putchar('.'); + fflush(0); + k += 32; + } + + if (!fail) + printf(" done: Pass\n"); + + return fail; +} diff --git a/raid/raid_base.c b/raid/raid_base.c new file mode 100644 index 0000000..25c1933 --- /dev/null +++ b/raid/raid_base.c @@ -0,0 +1,147 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include + +#if __WORDSIZE == 64 || _WIN64 || __x86_64__ +# define notbit0 0xfefefefefefefefeULL // all bits except bit 0 of each byte +# define bit7 0x8080808080808080ULL // bit 7 of each byte +# define gf8poly 0x1d1d1d1d1d1d1d1dULL // 0x1d in each byte +#else +# define notbit0 0xfefefefeUL +# define bit7 0x80808080UL +# define gf8poly 0x1d1d1d1dUL +#endif + +int pq_gen_base(int vects, int len, void **array) // sources array[0 .. vects-3]; writes P to array[vects-2], Q to array[vects-1]; always returns 0 +{ + int i, j; + unsigned long p, q, s; + unsigned long **src = (unsigned long **)array; + int blocks = len / sizeof(long); // whole longs only: any trailing len % sizeof(long) bytes are not processed + + for (i = 0; i < blocks; i++) { + q = p = src[vects - 3][i]; + + for (j = vects - 4; j >= 0; j--) { + p ^= s = src[j][i]; + q = s ^ (((q << 1) & notbit0) ^ // shift each byte left by one, dropping bits carried in from the byte below + ((((q & bit7) << 1) - ((q & bit7) >> 7)) // 0xff in every byte whose bit 7 was set + & gf8poly)); // apply poly + } + + src[vects - 2][i] = p; // second to last pointer is p + src[vects - 1][i] = q; // last pointer is q + } + return 0; +} + +int pq_check_base(int vects, int len, void **array) // recomputes P and Q byte by byte and compares with array[vects-2] / array[vects-1] +{ + int i, j; + unsigned char p, q, s; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + q = p = src[vects - 3][i]; + + for (j = vects - 4; j >= 0; j--) { + s = src[j][i]; + p ^= s; + + // q = 2*q in GF(2^8) (shift left, xor 0x1d when bit 7 was set), then add s + q = s ^ ((q << 1) ^ ((q & 0x80) ? 
0x1d : 0)); + } + + if (src[vects - 2][i] != p) // second to last pointer is p + return i | 1; // P mismatch: nonzero code (byte index with bit 0 forced on) + if (src[vects - 1][i] != q) // last pointer is q + return i | 2; // Q mismatch: nonzero code (byte index with bit 1 forced on) + } + return 0; +} + +int xor_gen_base(int vects, int len, void **array) // XORs array[0 .. vects-2] byte by byte into array[vects-1]; always returns 0 +{ + int i, j; + unsigned char parity; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + parity = src[0][i]; + for (j = 1; j < vects - 1; j++) + parity ^= src[j][i]; + + src[vects - 1][i] = parity; // last pointer is dest + + } + + return 0; +} + +int xor_check_base(int vects, int len, void **array) // returns 0 when all vects vectors XOR to zero at every byte, otherwise len +{ + int i, j, fail = 0; + + unsigned char parity; + unsigned char **src = (unsigned char **)array; + + for (i = 0; i < len; i++) { + parity = 0; + for (j = 0; j < vects; j++) + parity ^= src[j][i]; + + if (parity != 0) { + fail = 1; + break; + } + } + if (fail && len > 0) + return len; + return fail; +} + +struct slver { + unsigned short snum; + unsigned char ver; + unsigned char core; +}; + +struct slver pq_gen_base_slver_0001012a; +struct slver pq_gen_base_slver = { 0x012a, 0x01, 0x00 }; + +struct slver xor_gen_base_slver_0001012b; +struct slver xor_gen_base_slver = { 0x012b, 0x01, 0x00 }; + +struct slver pq_check_base_slver_0001012c; +struct slver pq_check_base_slver = { 0x012c, 0x01, 0x00 }; + +struct slver xor_check_base_slver_0001012d; +struct slver xor_check_base_slver = { 0x012d, 0x01, 0x00 }; diff --git a/raid/raid_multibinary.asm b/raid/raid_multibinary.asm new file mode 100644 index 0000000..f079656 --- /dev/null +++ b/raid/raid_multibinary.asm @@ -0,0 +1,140 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifidn __OUTPUT_FORMAT__, elf64 +%define WRT_OPT wrt ..plt +%else +%define WRT_OPT +%endif + +%include "reg_sizes.asm" +%include "multibinary.asm" + +default rel +[bits 64] + +extern pq_gen_base +extern pq_gen_sse +extern pq_gen_avx +extern pq_gen_avx2 + +extern xor_gen_base +extern xor_gen_sse +extern xor_gen_avx + +extern pq_check_base +extern pq_check_sse + +extern xor_check_base +extern xor_check_sse + +mbin_interface xor_gen +mbin_interface pq_gen + +mbin_dispatch_init5 xor_gen, xor_gen_base, xor_gen_sse, xor_gen_avx, xor_gen_avx +mbin_dispatch_init5 pq_gen, pq_gen_base, pq_gen_sse, pq_gen_avx, pq_gen_avx2 + + +section .data + +xor_check_dispatched: + dq xor_check_mbinit +pq_check_dispatched: + dq pq_check_mbinit + +section .text + +;;;; +; pq_check multibinary function: jumps through pq_check_dispatched, which starts out pointing at pq_check_mbinit +;;;; +global pq_check:function +pq_check_mbinit: + call pq_check_dispatch_init ; first call: select an implementation, then fall through to the jump +pq_check: + jmp qword [pq_check_dispatched] + +pq_check_dispatch_init: + push rax + push rbx + push rcx + push rdx + push rsi + lea rsi, [pq_check_base WRT_OPT] ; Default + + mov eax, 1 + cpuid ; overwrites eax, ebx, ecx, edx (all saved above) + test ecx, FLAG_CPUID1_ECX_SSE4_1 + lea rbx, [pq_check_sse WRT_OPT] ; lea leaves the flags from 'test' intact + cmovne rsi, rbx ; take the sse version when the tested flag bit is set + + mov [pq_check_dispatched], rsi ; later calls jump straight to the selected function + pop rsi + pop rdx + pop rcx + pop rbx + pop rax + ret + + +;;;; +; xor_check multibinary function: jumps through xor_check_dispatched, which starts out pointing at xor_check_mbinit +;;;; +global xor_check:function +xor_check_mbinit: + call xor_check_dispatch_init ; first call: select an implementation, then fall through to the jump +xor_check: + jmp qword [xor_check_dispatched] + +xor_check_dispatch_init: + push rax + push rbx + push rcx + push rdx + push rsi + lea rsi, [xor_check_base WRT_OPT] ; Default + + mov eax, 1 + cpuid ; overwrites eax, ebx, ecx, edx (all saved above) + test ecx, FLAG_CPUID1_ECX_SSE4_1 + lea rbx, [xor_check_sse WRT_OPT] ; lea leaves the flags from 'test' intact + cmovne rsi, rbx ; take the sse version when the tested flag bit is set + + mov [xor_check_dispatched], rsi ; later calls jump straight to the selected function + pop rsi + pop rdx + pop rcx + pop rbx + pop rax + ret + +;;; func core, ver, snum +slversion xor_gen, 00, 03, 0126 +slversion xor_check, 00, 03, 0127 +slversion pq_gen, 00, 03, 0128 +slversion pq_check, 00, 03, 0129 diff --git 
a/raid/xor_check_sse.asm b/raid/xor_check_sse.asm new file mode 100644 index 0000000..65ae2f7 --- /dev/null +++ b/raid/xor_check_sse.asm @@ -0,0 +1,285 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor check of N vectors using SSE (ptest, and movntdqa by default, are SSE4.1 instructions) +;;; int xor_check_sse(int vects, int len, void **array) + +;;; Checks that the N (vects) vectors in the array of pointers (**array) +;;; XOR to zero. Returns 0 if they do, 1 if they do not or if vects < 2. +;;; Vectors must be aligned to 16 bytes. Length can be any value. + +%include "reg_sizes.asm" + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 rax + %define tmp2.b al + %define tmp3 arg4 + %define return rax + %define PS 8 + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE + +%elifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define return rax + %define tmp2 rax + %define tmp2.b al + %define PS 8 + %define tmp r11 + %define tmp3 r10 + %define stack_size 2*16 + 8 ; room for xmm6 and xmm7; must be an odd multiple of 8 + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + alloc_stack stack_size + save_xmm128 xmm6, 0*16 + save_xmm128 xmm7, 1*16 + end_prolog + %endmacro + %macro FUNC_RESTORE 0 + movdqa xmm6, [rsp + 0*16] + movdqa xmm7, [rsp + 1*16] + add rsp, stack_size + %endmacro + + +%elifidn __OUTPUT_FORMAT__, elf32 + %define arg0 arg(0) + %define arg1 ecx + %define tmp2 eax + %define tmp2.b al + %define tmp3 edx + %define return eax + %define PS 4 + %define func(x) x: + %define arg(x) [ebp+8+PS*x] + %define arg2 edi ; must save/restore + %define arg3 esi + %define tmp ebx + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg1, arg(1) + mov arg2, arg(2) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp ;if has frame pointer + pop ebp + %endmacro + +%endif ; output formats + + +%define vec arg0 +%define len arg1 +%define ptr arg3 +%define pos tmp3 + +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif + +;;; Use 
Non-temporal load/stor +%ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa +%else + %define XLDR movntdqa + %define XSTR movntdq +%endif + +section .text + +align 16 +global xor_check_sse:function +func(xor_check_sse) + FUNC_SAVE +%ifidn PS,8 ;64-bit code + sub vec, 1 ; Keep as offset to last source +%else ;32-bit code + mov tmp, arg(0) ; Update vec length arg to last source + sub tmp, 1 + mov arg(0), tmp +%endif + + jng return_fail ;Must have at least 2 sources + cmp len, 0 + je return_pass ;Zero length: nothing to check + test len, (128-1) ;Check alignment of length + jnz len_not_aligned + + +len_aligned_128bytes: + sub len, 128 ;Len = offset of the last 128-byte block + mov pos, 0 + mov tmp, vec ;Preset to last vector + +loop128: + mov tmp2, [arg2+tmp*PS] ;Fetch last pointer in array + sub tmp, 1 ;Next vect + XLDR xmm0, [tmp2+pos] ;Start with the last vector in the array + XLDR xmm1, [tmp2+pos+16] ;Keep xor parity in xmm0-7 + XLDR xmm2, [tmp2+pos+(2*16)] + XLDR xmm3, [tmp2+pos+(3*16)] + XLDR xmm4, [tmp2+pos+(4*16)] + XLDR xmm5, [tmp2+pos+(5*16)] + XLDR xmm6, [tmp2+pos+(6*16)] + XLDR xmm7, [tmp2+pos+(7*16)] + +next_vect: + mov ptr, [arg2+tmp*PS] + sub tmp, 1 ;Sets the flags tested by jge below + xorpd xmm0, [ptr+pos] ;Get next vector (source) + xorpd xmm1, [ptr+pos+16] + xorpd xmm2, [ptr+pos+(2*16)] + xorpd xmm3, [ptr+pos+(3*16)] + xorpd xmm4, [ptr+pos+(4*16)] + xorpd xmm5, [ptr+pos+(5*16)] + xorpd xmm6, [ptr+pos+(6*16)] + xorpd xmm7, [ptr+pos+(7*16)] +;;; prefetch [ptr+pos+(8*16)] + jge next_vect ;Loop for each vect + + ;; End of vects, check that all parity regs = 0 + mov tmp, vec ;Back to last vector + por xmm0, xmm1 + por xmm0, xmm2 + por xmm0, xmm3 + por xmm0, xmm4 + por xmm0, xmm5 + por xmm0, xmm6 + por xmm0, xmm7 + ptest xmm0, xmm0 ;ZF is set only when the combined parity is all zero + jnz return_fail + + add pos, 128 + cmp pos, len + jle loop128 ;Another 128-byte block remains + +return_pass: + FUNC_RESTORE + mov return, 0 ;0 = all vectors XOR to zero + ret + + + +;;; Do one byte at a time for no alignment case + +xor_gen_byte: + mov tmp, vec ;Preset to last vector + +loop_1byte: + mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array + mov tmp2.b, [ptr+len-1] ;Get 
array n + sub tmp, 1 +nextvect_1byte: + mov ptr, [arg2+tmp*PS] + xor tmp2.b, [ptr+len-1] + sub tmp, 1 + jge nextvect_1byte + + mov tmp, vec ;Back to last vector + cmp tmp2.b, 0 + jne return_fail + sub len, 1 + test len, (8-1) + jnz loop_1byte + + cmp len, 0 + je return_pass + test len, (128-1) ;If not 0 and 128bit aligned + jz len_aligned_128bytes ; then do aligned case. len = y * 128 + + ;; else we are 8-byte aligned so fall through to recheck + + + ;; Unaligned length cases +len_not_aligned: + test len, (PS-1) + jne xor_gen_byte + mov tmp3, len + and tmp3, (128-1) ;Do the unaligned bytes 4-8 at a time + mov tmp, vec ;Preset to last vector + + ;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes +loopN_bytes: + mov ptr, [arg2+tmp*PS] ;Fetch last pointer in array + mov tmp2, [ptr+len-PS] ;Get array n + sub tmp, 1 +nextvect_Nbytes: + mov ptr, [arg2+tmp*PS] ;Get pointer to next vector + xor tmp2, [ptr+len-PS] + sub tmp, 1 + jge nextvect_Nbytes ;Loop for each source + + mov tmp, vec ;Back to last vector + cmp tmp2, 0 + jne return_fail + sub len, PS + sub tmp3, PS + jg loopN_bytes + + cmp len, 128 ;Now len is aligned to 128B + jge len_aligned_128bytes ;We can do the rest aligned + + cmp len, 0 + je return_pass + +return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +;;; func core, ver, snum +slversion xor_check_sse, 00, 03, 0031 + diff --git a/raid/xor_check_test.c b/raid/xor_check_test.c new file mode 100644 index 0000000..dfb571a --- /dev/null +++ b/raid/xor_check_test.c @@ -0,0 +1,280 @@ +/********************************************************************** + Copyright(c) 2011-2015 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include +#include +#include +#include +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 1024 +#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN)) +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +// Generates pseudo-random data + +void rand_buffer(unsigned char *buf, long buffer_size) +{ + long i; + for (i = 0; i < buffer_size; i++) + buf[i] = rand(); +} + +int main(int argc, char *argv[]) +{ + int i, j, k, ret, fail = 0; + void *buffs[TEST_SOURCES + 1]; + char c; + int serr, lerr; + char *tmp_buf[TEST_SOURCES + 1]; + + printf("Test xor_check_test %d sources X %d bytes\n", TEST_SOURCES, TEST_LEN); + + srand(TEST_SEED); + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + if (posix_memalign(&buf, 16, TEST_LEN)) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Test of all zeros + for (i = 0; i < TEST_SOURCES + 1; i++) + memset(buffs[i], 0, TEST_LEN); + + xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs); + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("\nfail zero test %d\n", ret); + } + + ((char *)(buffs[0]))[TEST_LEN - 2] = 0x7; // corrupt buffer + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt buffer test %d\n", ret); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = 0; // un-corrupt buffer + + // Test corrupted buffer any location on all sources + for (j = 0; j < TEST_SOURCES + 1; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + ((char *)buffs[j])[i] = 0x5; // corrupt buffer + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = 0; // un-corrupt buffer + } + putchar('.'); + } + + // Test rand1 + for (i = 0; i < TEST_SOURCES + 1; i++) + rand_buffer(buffs[i], TEST_LEN); + + 
xor_gen_base(TEST_SOURCES + 1, TEST_LEN, buffs); + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret != 0) { + fail++; + printf("fail first rand test %d\n", ret); + } + + c = ((char *)(buffs[0]))[TEST_LEN - 2]; + ((char *)(buffs[0]))[TEST_LEN - 2] = c ^ 0x1; + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { + fail++; + printf("\nFail corrupt buffer test, passed when should have failed\n"); + } + ((char *)(buffs[0]))[TEST_LEN - 2] = c; // un-corrupt buffer + + // Test corrupted buffer any location on all sources w/ random data + for (j = 0; j < TEST_SOURCES + 1; j++) { + for (i = TEST_LEN - 1; i >= 0; i--) { + // Check it still passes + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret != 0) { // should pass + fail++; + printf + ("\nFail rand test with un-corrupted buffer j=%d, i=%d\n", + j, i); + return 1; + } + c = ((char *)buffs[j])[i]; + ((char *)buffs[j])[i] = c ^ 1; // corrupt buffer + ret = xor_check(TEST_SOURCES + 1, TEST_LEN, buffs); + if (ret == 0) { // Check it now fails + fail++; + printf("\nfail corrupt buffer test j=%d, i=%d\n", j, i); + return 1; + } + ((char *)buffs[j])[i] = c; // un-corrupt buffer + } + putchar('.'); + } + + // Test various number of sources, full length + for (j = 3; j <= TEST_SOURCES + 1; j++) { + // New random data + for (i = 0; i < j; i++) + rand_buffer(buffs[i], TEST_LEN); + + // Generate xor parity for this number of sources + xor_gen_base(j, TEST_LEN, buffs); + + // Set errors up in each source and len position + for (i = 0; i < j; i++) { + for (k = 0; k < TEST_LEN; k++) { + // See if it still passes + ret = xor_check(j, TEST_LEN, buffs); + if (ret != 0) { // Should pass + printf("\nfail rand test %d sources\n", j); + fail++; + return 1; + } + + c = ((char *)buffs[i])[k]; + ((char *)buffs[i])[k] = c ^ 1; // corrupt buffer + + ret = xor_check(j, TEST_LEN, buffs); + if (ret == 0) { // Should fail + printf + ("\nfail rand test corrupted buffer %d sources\n", + j); + fail++; + return 
1; + } + ((char *)buffs[i])[k] = c; // un-corrupt buffer + } + } + putchar('.'); + } + + fflush(0); + + // Test various number of sources and len + k = 1; + while (k <= TEST_LEN) { + for (j = 3; j <= TEST_SOURCES + 1; j++) { + for (i = 0; i < j; i++) + rand_buffer(buffs[i], k); + + // Generate xor parity for this number of sources + xor_gen_base(j, k, buffs); + + // Inject errors at various source and len positions + for (lerr = 0; lerr < k; lerr += 10) { + for (serr = 0; serr < j; serr++) { + + // See if it still passes + ret = xor_check(j, k, buffs); + if (ret != 0) { // Should pass + printf("\nfail rand test %d sources\n", j); + fail++; + return 1; + } + + c = ((char *)buffs[serr])[lerr]; + ((char *)buffs[serr])[lerr] = c ^ 1; // corrupt buffer + + ret = xor_check(j, k, buffs); + if (ret == 0) { // Should fail + printf("\nfail rand test corrupted buffer " + "%d sources, len=%d, ret=%d\n", j, k, + ret); + fail++; + return 1; + } + ((char *)buffs[serr])[lerr] = c; // un-corrupt buffer + } + } + } + putchar('.'); + fflush(0); + k += 1; + } + + // Test at the end of buffer + for (i = 0; i < TEST_LEN; i += 32) { + for (j = 0; j < TEST_SOURCES + 1; j++) { + rand_buffer(buffs[j], TEST_LEN - i); + tmp_buf[j] = (char *)buffs[j] + i; + } + + xor_gen_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + + // Test good data + ret = xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + if (ret != 0) { + printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i); + fail++; + return 1; + } + // Test bad data + for (serr = 0; serr < TEST_SOURCES + 1; serr++) { + for (lerr = 0; lerr < (TEST_LEN - i); lerr++) { + c = tmp_buf[serr][lerr]; + tmp_buf[serr][lerr] = c ^ 1; + + ret = + xor_check(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf); + if (ret == 0) { + printf("fail end test corrupted buffer - " + "offset: %d, len: %d, ret: %d\n", i, + TEST_LEN - i, ret); + fail++; + return 1; + } + + tmp_buf[serr][lerr] = c; + } + } + + putchar('.'); + fflush(0); + } + + if 
(fail == 0) + printf("Pass\n"); + + return fail; + +} diff --git a/raid/xor_example.c b/raid/xor_example.c new file mode 100644 index 0000000..d328c31 --- /dev/null +++ b/raid/xor_example.c @@ -0,0 +1,70 @@ +/********************************************************************** + Copyright(c) 2011-2013 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include +#include +#include "raid.h" +#include "types.h" + +#define TEST_SOURCES 16 +#define TEST_LEN 16*1024 + +int main(int argc, char *argv[]) +{ + int i, j, should_pass, should_fail; + void *buffs[TEST_SOURCES + 1]; + + printf("XOR example\n"); + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + if (posix_memalign(&buf, 16, TEST_LEN)) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + printf("Make random data\n"); + for (i = 0; i < TEST_SOURCES + 1; i++) + for (j = 0; j < TEST_LEN; j++) + ((char *)buffs[i])[j] = rand(); + + printf("Generate xor parity\n"); + xor_gen_sse(TEST_SOURCES + 1, TEST_LEN, buffs); + + printf("Check parity: "); + should_pass = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs); + printf("%s\n", should_pass == 0 ? "Pass" : "Fail"); + + printf("Find corruption: "); + ((char *)buffs[TEST_SOURCES / 2])[TEST_LEN / 2] ^= 1; // flip one bit + should_fail = xor_check_sse(TEST_SOURCES + 1, TEST_LEN, buffs); //recheck + printf("%s\n", should_fail != 0 ? "Pass" : "Fail"); + + return 0; +} diff --git a/raid/xor_gen_avx.asm b/raid/xor_gen_avx.asm new file mode 100644 index 0000000..536ab3e --- /dev/null +++ b/raid/xor_gen_avx.asm @@ -0,0 +1,228 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor of N source vectors using AVX +;;; int xor_gen_avx(int vects, int len, void **array) + +;;; Generates xor parity vector from N (vects-1) sources in array of pointers +;;; (**array). Last pointer is the dest. +;;; Vectors must be aligned to 32 bytes. Length can be any value. 
+;;; Returns 0 on success, 1 on failure (e.g. fewer than 3 vectors: 2 sources plus the parity dest).
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp3  arg4
+ %define func(x) x:
+ %define return rax
+ %define FUNC_SAVE			;nothing to save for elf64
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define tmp   r11
+ %define tmp3  r10
+ %define func(x) proc_frame x
+ %define return rax
+ %define stack_size  2*32 + 8 	;must be an odd multiple of 8
+
+ %macro FUNC_SAVE 0			;ymm6/ymm7 are used as scratch in next_vect, so save them
+	alloc_stack	stack_size
+	vmovdqu	[rsp + 0*32], ymm6
+	vmovdqu	[rsp + 1*32], ymm7
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	vmovdqu	ymm6, [rsp + 0*32]
+	vmovdqu	ymm7, [rsp + 1*32]
+	add	rsp, stack_size
+ %endmacro
+
+%endif	;output formats
+
+
+%define vec  arg0
+%define	len  arg1
+%define ptr  arg3
+%define tmp2 rax
+%define tmp2.b al
+%define pos  tmp3
+%define PS   8
+
+;;; Use non-temporal stores unless NO_NT_LDST is defined (loads are vmovdqa either way)
+%ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+%else
+ %define XLDR vmovdqa
+ %define XSTR vmovntdq
+%endif
+
+
+default rel
+[bits 64]
+
+section .text
+
+align 16
+global xor_gen_avx:function
+func(xor_gen_avx)
+
+	FUNC_SAVE
+	sub	vec, 2			;Keep as offset to last source
+	jng	return_fail		;Must have at least 2 sources (uses flags of the sub above)
+	cmp	len, 0
+	je	return_pass		;Zero length: nothing to generate
+	test	len, (128-1)		;Check alignment of length
+	jnz	len_not_aligned		;Not a multiple of 128: consume the tail first
+
+
+len_aligned_128bytes:
+	sub	len, 128		;len is now the offset of the last 128-byte block
+	mov	pos, 0
+
+loop128:
+	mov	tmp, vec		;Back to last vector
+	mov	tmp2, [arg2+vec*PS]	;Fetch last pointer in array
+	sub	tmp, 1			;Next vect
+	XLDR	ymm0, [tmp2+pos]	;Start with end of array in last vector
+	XLDR	ymm1, [tmp2+pos+32]	;Keep xor parity in ymm0-3
+	XLDR	ymm2, [tmp2+pos+(2*32)]
+	XLDR	ymm3, [tmp2+pos+(3*32)]
+
+next_vect:
+	mov 	ptr, [arg2+tmp*PS]
+	sub	tmp, 1			;Sets the flags tested by jge below; the AVX moves/xors do not modify flags
+	XLDR	ymm4, [ptr+pos]		;Get next vector (source)
+	XLDR	ymm5, [ptr+pos+32]
+	XLDR	ymm6, [ptr+pos+(2*32)]
+	XLDR	ymm7, [ptr+pos+(3*32)]
+	vxorpd	ymm0, ymm0, ymm4	;Add to xor parity
+	vxorpd	ymm1, ymm1, ymm5
+	vxorpd	ymm2, ymm2, ymm6
+	vxorpd	ymm3, ymm3, ymm7
+	jge	next_vect		;Loop for each source
+
+	mov	ptr, [arg2+PS+vec*PS]	;Address of parity vector
+	XSTR	[ptr+pos], ymm0		;Write parity xor vector
+	XSTR	[ptr+pos+(1*32)], ymm1
+	XSTR	[ptr+pos+(2*32)], ymm2
+	XSTR	[ptr+pos+(3*32)], ymm3
+	add	pos, 128
+	cmp	pos, len		;len was reduced by 128 above, so <= includes the final block
+	jle	loop128
+
+return_pass:
+	FUNC_RESTORE
+	mov	return, 0
+	ret
+
+
+;;; Do one byte at a time for no alignment case
+loop_1byte:
+	mov	tmp, vec		;Back to last vector
+	mov 	ptr, [arg2+vec*PS]	;Fetch last pointer in array
+	mov	tmp2.b, [ptr+len-1]	;Get byte at offset len-1 of the last source
+	sub	tmp, 1
+nextvect_1byte:
+	mov 	ptr, [arg2+tmp*PS]
+	xor	tmp2.b, [ptr+len-1]	;Fold in the same byte of the next source
+	sub	tmp, 1
+	jge	nextvect_1byte
+
+	mov	tmp, vec
+	add	tmp, 1		  	;Add back to point to last vec
+	mov	ptr, [arg2+tmp*PS]
+	mov	[ptr+len-1], tmp2.b	;Write parity
+	sub	len, 1
+	test	len, (PS-1)		;Keep going byte-wise until len is a multiple of PS
+	jnz	loop_1byte
+
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;If not 0 and 128-byte aligned
+	jz	len_aligned_128bytes	; then do aligned case. len = y * 128
+
+	;; else we are 8-byte aligned so fall through to recheck
+
+
+	;; Unaligned length cases
+len_not_aligned:
+	test	len, (PS-1)		;Length not a multiple of 8?
+	jne	loop_1byte		; then peel single bytes first
+	mov	tmp3, len
+	and	tmp3, (128-1)		;Do the unaligned bytes 8 at a time
+
+	;; Run backwards 8 bytes at a time for (tmp3) bytes
+loop8_bytes:
+	mov	tmp, vec		;Back to last vector
+	mov 	ptr, [arg2+vec*PS]	;Fetch last pointer in array
+	mov	tmp2, [ptr+len-PS]	;Get last 8 bytes of the last source
+	sub	tmp, 1
+nextvect_8bytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
+	xor	tmp2, [ptr+len-PS]
+	sub	tmp, 1
+	jge	nextvect_8bytes		;Loop for each source
+
+	mov	tmp, vec
+	add	tmp, 1		  	;Add back to point to last vec
+	mov	ptr, [arg2+tmp*PS]
+	mov	[ptr+len-PS], tmp2	;Write parity
+	sub	len, PS
+	sub	tmp3, PS		;tmp3 counts the remaining tail bytes
+	jg	loop8_bytes
+
+	cmp	len, 128		;Now len is aligned to 128B
+	jge	len_aligned_128bytes	;We can do the rest aligned
+
+	cmp	len, 0
+	je	return_pass		;Otherwise fall through to return_fail
+
+return_fail:
+	FUNC_RESTORE
+	mov	return, 1
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func        core, ver, snum
+slversion xor_gen_avx, 02,   05,  0037
+
diff --git a/raid/xor_gen_perf.c b/raid/xor_gen_perf.c
new file mode 100644
index 0000000..53a963d
--- /dev/null
+++ b/raid/xor_gen_perf.c
@@ -0,0 +1,98 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include +#include +#include +#include +#include +#include "raid.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Loop many times over same +# define TEST_SOURCES 10 +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define TEST_SOURCES 10 +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN GT_L3_CACHE / TEST_SOURCES +# define TEST_LOOPS 1000 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN)) + +int main(int argc, char *argv[]) +{ + int i, ret, fail = 0; + void **buffs; + void *buff; + struct perf start, stop; + + printf("Test xor_gen_perf\n"); + + ret = posix_memalign((void **)&buff, 8, sizeof(int *) * (TEST_SOURCES + 6)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs = buff; + + // Allocate the arrays + for (i = 0; i < TEST_SOURCES + 1; i++) { + void *buf; + ret = posix_memalign(&buf, 32, TEST_LEN); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + buffs[i] = buf; + } + + // Setup data + for (i = 0; i < TEST_SOURCES + 1; i++) + memset(buffs[i], 0, TEST_LEN); + + // Warm up + xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs); + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs); + perf_stop(&stop); + printf("xor_gen" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_MEM * i); + + return fail; +} diff --git a/raid/xor_gen_sse.asm b/raid/xor_gen_sse.asm new file mode 100644 index 0000000..2fd6fae --- /dev/null +++ b/raid/xor_gen_sse.asm @@ -0,0 +1,284 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; Optimized xor of N source vectors using SSE +;;; int xor_gen_sse(int vects, int len, void **array) + +;;; Generates xor parity vector from N (vects-1) sources in array of pointers +;;; (**array). Last pointer is the dest. +;;; Vectors must be aligned to 16 bytes. Length can be any value. 
+;;; Returns 0 on success, 1 on failure (e.g. fewer than 3 vectors: 2 sources plus the parity dest).
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0  rdi
+ %define arg1  rsi
+ %define arg2  rdx
+ %define arg3  rcx
+ %define arg4  r8
+ %define arg5  r9
+ %define tmp   r11
+ %define tmp2  rax
+ %define tmp2.b al
+ %define tmp3  arg4
+ %define return rax
+ %define PS 8
+ %define func(x) x:
+ %define FUNC_SAVE			;nothing to save for elf64
+ %define FUNC_RESTORE
+
+%elifidn __OUTPUT_FORMAT__, win64
+ %define arg0  rcx
+ %define arg1  rdx
+ %define arg2  r8
+ %define arg3  r9
+ %define return rax
+ %define tmp2 rax
+ %define tmp2.b al
+ %define PS 8
+ %define tmp r11
+ %define tmp3 r10
+ %define stack_size 2*16 + 8 	; must be an odd multiple of 8
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0			;xmm6/xmm7 are used as accumulators below, so save them
+	alloc_stack	stack_size
+	save_xmm128	xmm6, 0*16
+	save_xmm128	xmm7, 1*16
+	end_prolog
+ %endmacro
+ %macro FUNC_RESTORE 0
+	movdqa	xmm6, [rsp + 0*16]
+	movdqa	xmm7, [rsp + 1*16]
+	add	rsp, stack_size
+ %endmacro
+
+
+%elifidn __OUTPUT_FORMAT__, elf32
+ %define arg0  arg(0)
+ %define arg1  ecx
+ %define tmp2 eax
+ %define tmp2.b al
+ %define tmp3 edx
+ %define return eax
+ %define PS 4
+ %define func(x) x:
+ %define arg(x) [ebp+8+PS*x]
+ %define arg2 edi	; must save/restore
+ %define arg3 esi
+ %define tmp ebx
+
+ %macro FUNC_SAVE 0			;set up ebp frame, save esi/edi/ebx, load len and array
+	push	ebp
+	mov	ebp, esp
+	push	esi
+	push	edi
+	push	ebx
+	mov	arg1, arg(1)
+	mov	arg2, arg(2)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+	pop	ebx
+	pop	edi
+	pop	esi
+	mov	esp, ebp	;if has frame pointer
+	pop	ebp
+ %endmacro
+
+%endif	; output formats
+
+
+%define vec  arg0
+%define	len  arg1
+%define ptr  arg3
+%define pos  tmp3
+
+%ifidn PS,8				; 64-bit code
+ default rel
+  [bits 64]
+%endif
+
+;;; Use non-temporal load/store unless NO_NT_LDST is defined
+%ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+%else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+%endif
+
+section .text
+
+align 16
+global xor_gen_sse:function
+func(xor_gen_sse)
+	FUNC_SAVE
+%ifidn PS,8				;64-bit code
+	sub	vec, 2			; Keep as offset to last source
+%else					;32-bit code
+	mov	tmp, arg(0)		; Update vec length arg to last source
+	sub	tmp, 2
+	mov	arg(0), tmp		; mov leaves the flags set by sub intact
+%endif
+
+	jng	return_fail		;Must have at least 2 sources (uses flags of the sub above)
+	cmp	len, 0
+	je	return_pass		;Zero length: nothing to generate
+	test	len, (128-1)		;Check alignment of length
+	jnz	len_not_aligned		;Not a multiple of 128: consume the tail first
+
+
+len_aligned_128bytes:
+	sub	len, 128		;len is now the offset of the last 128-byte block
+	mov	pos, 0
+	mov	tmp, vec		;Preset to last vector
+
+loop128:
+	mov	tmp2, [arg2+tmp*PS]	;Fetch last pointer in array
+	sub	tmp, 1			;Next vect
+	XLDR	xmm0, [tmp2+pos]	;Start with end of array in last vector
+	XLDR	xmm1, [tmp2+pos+16]	;Keep xor parity in xmm0-7
+	XLDR	xmm2, [tmp2+pos+(2*16)]
+	XLDR	xmm3, [tmp2+pos+(3*16)]
+	XLDR	xmm4, [tmp2+pos+(4*16)]
+	XLDR	xmm5, [tmp2+pos+(5*16)]
+	XLDR	xmm6, [tmp2+pos+(6*16)]
+	XLDR	xmm7, [tmp2+pos+(7*16)]
+
+next_vect:
+	mov 	ptr, [arg2+tmp*PS]
+	sub	tmp, 1			;Sets the flags tested by jge below; xorpd does not modify flags
+	xorpd	xmm0, [ptr+pos]		;Get next vector (source)
+	xorpd	xmm1, [ptr+pos+16]
+	xorpd	xmm2, [ptr+pos+(2*16)]
+	xorpd	xmm3, [ptr+pos+(3*16)]
+	xorpd	xmm4, [ptr+pos+(4*16)]
+	xorpd	xmm5, [ptr+pos+(5*16)]
+	xorpd	xmm6, [ptr+pos+(6*16)]
+	xorpd	xmm7, [ptr+pos+(7*16)]
+;;;	prefetch [ptr+pos+(8*16)]
+	jge	next_vect		;Loop for each vect
+
+
+	mov	tmp, vec		;Back to last vector
+	mov	ptr, [arg2+PS+tmp*PS]	;Address of parity vector
+	XSTR	[ptr+pos], xmm0		;Write parity xor vector
+	XSTR	[ptr+pos+(1*16)], xmm1
+	XSTR	[ptr+pos+(2*16)], xmm2
+	XSTR	[ptr+pos+(3*16)], xmm3
+	XSTR	[ptr+pos+(4*16)], xmm4
+	XSTR	[ptr+pos+(5*16)], xmm5
+	XSTR	[ptr+pos+(6*16)], xmm6
+	XSTR	[ptr+pos+(7*16)], xmm7
+	add	pos, 128
+	cmp	pos, len		;len was reduced by 128 above, so <= includes the final block
+	jle	loop128
+
+return_pass:
+	mov	return, 0
+	FUNC_RESTORE
+	ret
+
+
+
+;;; Do one byte at a time for no alignment case
+
+xor_gen_byte:
+	mov	tmp, vec		;Preset to last vector
+
+loop_1byte:
+	mov 	ptr, [arg2+tmp*PS]	;Fetch last pointer in array
+	mov	tmp2.b, [ptr+len-1]	;Get byte at offset len-1 of the last source
+	sub	tmp, 1
+nextvect_1byte:
+	mov 	ptr, [arg2+tmp*PS]
+	xor	tmp2.b, [ptr+len-1]	;Fold in the same byte of the next source
+	sub	tmp, 1
+	jge	nextvect_1byte
+
+	mov	tmp, vec		;Back to last vector
+	mov 	ptr, [arg2+PS+tmp*PS]	;Get last vec
+	mov	[ptr+len-1], tmp2.b	;Write parity
+	sub	len, 1
+	test	len, (8-1)		;Keep going byte-wise until len is a multiple of 8
+	jnz	loop_1byte
+
+	cmp	len, 0
+	je	return_pass
+	test	len, (128-1)		;If not 0 and 128-byte aligned
+	jz	len_aligned_128bytes	; then do aligned case. len = y * 128
+
+	;; else we are 8-byte aligned so fall through to recheck
+
+
+	;; Unaligned length cases
+len_not_aligned:
+	test	len, (PS-1)		;Length not a multiple of the pointer size?
+	jne	xor_gen_byte		; then peel single bytes first
+	mov	tmp3, len
+	and	tmp3, (128-1)		;Do the unaligned bytes 4-8 at a time
+	mov	tmp, vec		;Preset to last vector
+
+	;; Run backwards 8 bytes (4B for 32bit) at a time for (tmp3) bytes
+loopN_bytes:
+	mov 	ptr, [arg2+tmp*PS]	;Fetch last pointer in array
+	mov	tmp2, [ptr+len-PS]	;Get last PS bytes of the last source
+	sub	tmp, 1
+nextvect_Nbytes:
+	mov 	ptr, [arg2+tmp*PS] 	;Get pointer to next vector
+	xor	tmp2, [ptr+len-PS]
+	sub	tmp, 1
+	jge	nextvect_Nbytes		;Loop for each source
+
+	mov	tmp, vec		;Back to last vector
+	mov 	ptr, [arg2+PS+tmp*PS]	;Get last vec
+	mov	[ptr+len-PS], tmp2	;Write parity
+	sub	len, PS
+	sub	tmp3, PS		;tmp3 counts the remaining tail bytes
+	jg	loopN_bytes
+
+	cmp	len, 128		;Now len is aligned to 128B
+	jge	len_aligned_128bytes	;We can do the rest aligned
+
+	cmp	len, 0
+	je	return_pass		;Otherwise fall through to return_fail
+
+return_fail:
+	mov	return, 1
+	FUNC_RESTORE
+	ret
+
+endproc_frame
+
+section .data
+
+;;;       func        core, ver, snum
+slversion xor_gen_sse, 00,   0c,  0030
+
diff --git a/raid/xor_gen_test.c b/raid/xor_gen_test.c
new file mode 100644
index 0000000..f158f94
--- /dev/null
+++ b/raid/xor_gen_test.c
@@ -0,0 +1,165 @@
+/**********************************************************************
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include "raid.h"
+#include "types.h"
+
+#define TEST_SOURCES 16
+#define TEST_LEN     1024
+#define TEST_MEM ((TEST_SOURCES + 1)*(TEST_LEN))
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Fills buf[0 .. buffer_size) with pseudo-random bytes taken from rand().
+// main() seeds the generator with TEST_SEED before the first call.
+void rand_buffer(unsigned char *buf, long buffer_size)
+{
+	long i;
+	for (i = 0; i < buffer_size; i++)
+		buf[i] = rand();
+}
+// Checks parity written by xor_gen() using xor_check_base(); returns non-zero on failure.
+int main(int argc, char *argv[])
+{
+	int i, j, k, ret, fail = 0;
+	void *buffs[TEST_SOURCES + 1];
+	char *tmp_buf[TEST_SOURCES + 1];
+
+	printf("Test xor_gen_test ");
+
+	srand(TEST_SEED);
+
+	// Allocate the arrays (32-byte aligned, TEST_LEN bytes each)
+	for (i = 0; i < TEST_SOURCES + 1; i++) {
+		void *buf;
+		ret = posix_memalign(&buf, 32, TEST_LEN);
+		if (ret) {
+			printf("alloc error: Fail");
+			return 1;
+		}
+		buffs[i] = buf;
+	}
+
+	// Test of all zeros
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		memset(buffs[i], 0, TEST_LEN);
+
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	for (i = 0; i < TEST_LEN; i++) {	// parity of all-zero sources must be all zero
+		if (((char *)buffs[TEST_SOURCES])[i] != 0)
+			fail++;
+	}
+
+	if (fail > 0) {
+		printf("fail zero test");
+		return 1;
+	} else
+		putchar('.');
+
+	// Test rand1
+	for (i = 0; i < TEST_SOURCES + 1; i++)
+		rand_buffer(buffs[i], TEST_LEN);
+
+	xor_gen(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN, buffs);
+
+	if (fail > 0) {
+		printf("fail rand test %d\n", fail);
+		return 1;
+	} else
+		putchar('.');
+
+	// Test various number of sources
+	for (j = 3; j <= TEST_SOURCES + 1; j++) {
+		for (i = 0; i < j; i++)
+			rand_buffer(buffs[i], TEST_LEN);
+
+		xor_gen(j, TEST_LEN, buffs);
+		fail |= xor_check_base(j, TEST_LEN, buffs);
+
+		if (fail > 0) {
+			printf("fail rand test %d sources\n", j);
+			return 1;
+		} else
+			putchar('.');
+	}
+
+	fflush(0);
+
+	// Test various number of sources and len (every length from 0 to TEST_LEN)
+	k = 0;
+	while (k <= TEST_LEN) {
+		for (j = 3; j <= TEST_SOURCES + 1; j++) {
+			for (i = 0; i < j; i++)
+				rand_buffer(buffs[i], k);
+
+			xor_gen(j, k, buffs);
+			fail |= xor_check_base(j, k, buffs);
+
+			if (fail > 0) {
+				printf("fail rand test %d sources, len=%d, ret=%d\n", j, k,
+				       fail);
+				return 1;
+			}
+		}
+		putchar('.');
+		k += 1;
+	}
+
+	// Test at the end of buffer: the window is buffs[j] + i, TEST_LEN - i bytes long
+	for (i = 0; i < TEST_LEN; i += 32) {
+		for (j = 0; j < TEST_SOURCES + 1; j++) {
+			rand_buffer((unsigned char *)buffs[j] + i, TEST_LEN - i);
+			tmp_buf[j] = (char *)buffs[j] + i;
+		}
+
+		xor_gen(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+		fail |= xor_check_base(TEST_SOURCES + 1, TEST_LEN - i, (void *)tmp_buf);
+
+		if (fail > 0) {
+			printf("fail end test - offset: %d, len: %d\n", i, TEST_LEN - i);
+			return 1;
+		}
+
+		putchar('.');
+		fflush(0);
+	}
+
+	if (!fail)
+		printf(" done: Pass\n");
+
+	return fail;
+}