From e59eef6be92357f10169d88875c3c04c047aff3b Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 3 Jan 2023 07:09:49 +0200
Subject: [PATCH] Moved SIMD mem* functions to separate files

---
Reviewer notes (this text sits in the patch commentary area and is not part
of the commit message):

* This copy of the patch had lost its line breaks, its indentation and the
  header name after every #include. Line breaks are restored below. The
  indentation (4 spaces) and the position of blank context lines are a best
  guess, so apply with `git apply --ignore-whitespace`, and fill in the
  #include targets from the original commit first.
* Changes relative to the original patch (added lines only):
  - every inline-asm copy now declares a "memory" clobber, because it
    stores through a pointer the compiler only sees as a register input;
  - memcpy_ssse3 loaded 16(%0) and used palignr $8, which stores source
    bytes 8..23 instead of 0..15 and reads 16 bytes past each chunk; it now
    forwards to memcpy_sse2;
  - memcpy_sse4_1 did `d += 16` on CPU::__m128i pointers, i.e. 16 elements
    per 16-byte chunk (256 bytes if CPU::__m128i is a 16-byte type, which
    is not visible in this patch); it now forwards to memcpy_sse2;
  - memcpy_sse4_2 executed pcmpistri, which writes ECX and the flags
    without declaring them as clobbers; it now forwards to memcpy_sse2;
  - memset_* keep only the memset_unsafe() call; everything after that
    return was unreachable;
  - the dispatch switches in __memset_chk / __memmove_chk drop the
    unreachable break, error() and CPU::Stop() statements.
* Hunk counts and the diffstat are updated to match. The post-image blob
  ids on the "index" lines are still those of the original patch.

 Library/Convert.cpp        | 201 +++++++------------------------
 Library/MemoryCopySIMD.cpp | 110 ++++++++++++++++++++
 Library/MemoryMoveSIMD.cpp |  44 ++++++++
 Library/MemorySetSIMD.cpp  |  39 +++++++
 include/convert.h          |  21 ++++
 5 files changed, 252 insertions(+), 163 deletions(-)
 create mode 100644 Library/MemoryCopySIMD.cpp
 create mode 100644 Library/MemoryMoveSIMD.cpp
 create mode 100644 Library/MemorySetSIMD.cpp

diff --git a/Library/Convert.cpp b/Library/Convert.cpp
index f6e4dff..93cc660 100644
--- a/Library/Convert.cpp
+++ b/Library/Convert.cpp
@@ -5,167 +5,6 @@
 #include
 #include
 
-EXTERNC void *memcpy_sse(void *dest, const void *src, size_t n)
-{
-    char *d = (char *)dest;
-    const char *s = (const char *)src;
-
-    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
-    {
-        size_t num_vectors = n / 16;
-        for (size_t i = 0; i < num_vectors; i++)
-        {
-            asmv("movaps (%0), %%xmm0\n"
-                 "movaps %%xmm0, (%1)\n"
-                 :
-                 : "r"(s), "r"(d)
-                 : "xmm0");
-            d += 16;
-            s += 16;
-        }
-
-        n -= num_vectors * 16;
-    }
-
-    memcpy_unsafe(d, s, n);
-    return dest;
-}
-
-EXTERNC void *memcpy_sse2(void *dest, const void *src, size_t n)
-{
-    char *d = (char *)dest;
-    const char *s = (const char *)src;
-
-    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
-    {
-        size_t num_vectors = n / 16;
-        for (size_t i = 0; i < num_vectors; i++)
-        {
-            asmv("movdqa (%0), %%xmm0\n"
-                 "movdqa %%xmm0, (%1)\n"
-                 :
-                 : "r"(s), "r"(d)
-                 : "xmm0");
-            d += 16;
-            s += 16;
-        }
-
-        n -= num_vectors * 16;
-    }
-
-    memcpy_unsafe(d, s, n);
-    return dest;
-}
-
-EXTERNC void *memcpy_sse3(void *dest, const void *src, size_t n)
-{
-    char *d = (char *)dest;
-    const char *s = (const char *)src;
-
-    if ((((uintptr_t)d | (uintptr_t)s) & 0x7) == 0)
-    {
-        size_t num_vectors = n / 8;
-        for (size_t i = 0; i < num_vectors; i++)
-        {
-            asmv("movq (%0), %%xmm0\n"
-                 "movddup %%xmm0, %%xmm1\n"
-                 "movq %%xmm1, (%1)\n"
-                 :
-                 : "r"(s), "r"(d)
-                 : "xmm0", "xmm1");
-            d += 8;
-            s += 8;
-        }
-
-        n -= num_vectors * 8;
-    }
-
-    memcpy_unsafe(d, s, n);
-    return dest;
-}
-
-EXTERNC void *memcpy_ssse3(void *dest, const void *src, size_t n)
-{
-    char *d = (char *)dest;
-    const char *s = (const char *)src;
-
-    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
-    {
-        size_t num_vectors = n / 16;
-        for (size_t i = 0; i < num_vectors; i++)
-        {
-            asmv("movdqa (%0), %%xmm0\n"
-                 "movdqa 16(%0), %%xmm1\n"
-                 "palignr $8, %%xmm0, %%xmm1\n"
-                 "movdqa %%xmm1, (%1)\n"
-                 :
-                 : "r"(s), "r"(d)
-                 : "xmm0", "xmm1");
-            d += 16;
-            s += 16;
-        }
-
-        n -= num_vectors * 16;
-    }
-
-    memcpy_unsafe(d, s, n);
-    return dest;
-}
-
-EXTERNC void *memcpy_sse4_1(void *dest, const void *src, size_t n)
-{
-    CPU::__m128i *d = (CPU::__m128i *)dest;
-    const CPU::__m128i *s = (const CPU::__m128i *)src;
-
-    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
-    {
-        size_t num_vectors = n / 16;
-        for (size_t i = 0; i < num_vectors; i++)
-        {
-            // movntdqa
-            asmv("movdqa (%0), %%xmm0\n"
-                 "movdqa %%xmm0, (%1)\n"
-                 :
-                 : "r"(s), "r"(d)
-                 : "xmm0");
-            d += 16;
-            s += 16;
-        }
-
-        n -= num_vectors * 16;
-    }
-
-    memcpy_unsafe(d, s, n);
-    return dest;
-}
-
-EXTERNC void *memcpy_sse4_2(void *dest, const void *src, size_t n)
-{
-    char *d = (char *)dest;
-    const char *s = (const char *)src;
-
-    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
-    {
-        size_t num_vectors = n / 16;
-        for (size_t i = 0; i < num_vectors; i++)
-        {
-            asmv("movdqa (%0), %%xmm0\n"
-                 "pcmpistri $0, (%0), %%xmm0\n"
-                 "movdqa %%xmm0, (%1)\n"
-                 :
-                 : "r"(s), "r"(d)
-                 : "xmm0");
-            d += 16;
-            s += 16;
-        }
-
-        n -= num_vectors * 16;
-    }
-
-    memcpy_unsafe(d, s, n);
-    return dest;
-}
-
 EXTERNC int memcmp(const void *vl, const void *vr, size_t n)
 {
     const unsigned char *l = (unsigned char *)vl, *r = (unsigned char *)vr;
@@ -848,7 +687,25 @@ EXTERNC __no_stack_protector void *__memset_chk(void *dest, int val, size_t len,
 
     if (unlikely(len > slen))
         __chk_fail();
-    return memset_unsafe(dest, val, len);
+
+    // Use the memset variant for the SIMD level reported by CPU::CheckSIMD().
+    switch (CPU::CheckSIMD())
+    {
+    case CPU::x86SIMDType::SIMD_SSE:
+        return memset_sse(dest, val, len);
+    case CPU::x86SIMDType::SIMD_SSE2:
+        return memset_sse2(dest, val, len);
+    case CPU::x86SIMDType::SIMD_SSE3:
+        return memset_sse3(dest, val, len);
+    case CPU::x86SIMDType::SIMD_SSSE3:
+        return memset_ssse3(dest, val, len);
+    case CPU::x86SIMDType::SIMD_SSE41:
+        return memset_sse4_1(dest, val, len);
+    case CPU::x86SIMDType::SIMD_SSE42:
+        return memset_sse4_2(dest, val, len);
+    default:
+        return memset_unsafe(dest, val, len);
+    }
 }
 
 EXTERNC __no_stack_protector void *__memmove_chk(void *dest, const void *src, size_t len, size_t slen)
@@ -882,7 +739,25 @@ EXTERNC __no_stack_protector void *__memmove_chk(void *dest, const void *src, si
 
     if (unlikely(len > slen))
         __chk_fail();
-    return memmove_unsafe(dest, src, len);
+
+    // Use the memmove variant for the SIMD level reported by CPU::CheckSIMD().
+    switch (CPU::CheckSIMD())
+    {
+    case CPU::x86SIMDType::SIMD_SSE:
+        return memmove_sse(dest, src, len);
+    case CPU::x86SIMDType::SIMD_SSE2:
+        return memmove_sse2(dest, src, len);
+    case CPU::x86SIMDType::SIMD_SSE3:
+        return memmove_sse3(dest, src, len);
+    case CPU::x86SIMDType::SIMD_SSSE3:
+        return memmove_ssse3(dest, src, len);
+    case CPU::x86SIMDType::SIMD_SSE41:
+        return memmove_sse4_1(dest, src, len);
+    case CPU::x86SIMDType::SIMD_SSE42:
+        return memmove_sse4_2(dest, src, len);
+    default:
+        return memmove_unsafe(dest, src, len);
+    }
 }
 
 EXTERNC __no_stack_protector char *__strcat_chk(char *dest, const char *src, size_t slen)
diff --git a/Library/MemoryCopySIMD.cpp b/Library/MemoryCopySIMD.cpp
new file mode 100644
index 0000000..28c855b
--- /dev/null
+++ b/Library/MemoryCopySIMD.cpp
@@ -0,0 +1,110 @@
+#include
+
+#include
+#include
+#include
+#include
+
+// memcpy variants, one per SSE generation.
+//
+// When both pointers meet a variant's alignment requirement it copies whole
+// chunks through XMM registers; memcpy_unsafe() then copies whatever is left
+// (the entire buffer when the pointers are not aligned). All return dest.
+
+// Copies 16-byte chunks with movaps when both pointers are 16-byte aligned.
+EXTERNC void *memcpy_sse(void *dest, const void *src, size_t n)
+{
+    char *d = (char *)dest;
+    const char *s = (const char *)src;
+
+    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
+    {
+        size_t num_vectors = n / 16;
+        for (size_t i = 0; i < num_vectors; i++)
+        {
+            // "memory" clobber: the asm reads and writes through s and d.
+            asmv("movaps (%0), %%xmm0\n"
+                 "movaps %%xmm0, (%1)\n"
+                 :
+                 : "r"(s), "r"(d)
+                 : "xmm0", "memory");
+            d += 16;
+            s += 16;
+        }
+        n -= num_vectors * 16;
+    }
+
+    memcpy_unsafe(d, s, n);
+    return dest;
+}
+
+// Same loop as memcpy_sse, using the SSE2 integer move movdqa.
+EXTERNC void *memcpy_sse2(void *dest, const void *src, size_t n)
+{
+    char *d = (char *)dest;
+    const char *s = (const char *)src;
+
+    if ((((uintptr_t)d | (uintptr_t)s) & 0xF) == 0)
+    {
+        size_t num_vectors = n / 16;
+        for (size_t i = 0; i < num_vectors; i++)
+        {
+            asmv("movdqa (%0), %%xmm0\n"
+                 "movdqa %%xmm0, (%1)\n"
+                 :
+                 : "r"(s), "r"(d)
+                 : "xmm0", "memory");
+            d += 16;
+            s += 16;
+        }
+        n -= num_vectors * 16;
+    }
+
+    memcpy_unsafe(d, s, n);
+    return dest;
+}
+
+// Copies 8-byte chunks (movq/movddup) when both pointers are 8-byte aligned.
+EXTERNC void *memcpy_sse3(void *dest, const void *src, size_t n)
+{
+    char *d = (char *)dest;
+    const char *s = (const char *)src;
+
+    if ((((uintptr_t)d | (uintptr_t)s) & 0x7) == 0)
+    {
+        size_t num_vectors = n / 8;
+        for (size_t i = 0; i < num_vectors; i++)
+        {
+            asmv("movq (%0), %%xmm0\n"
+                 "movddup %%xmm0, %%xmm1\n"
+                 "movq %%xmm1, (%1)\n"
+                 :
+                 : "r"(s), "r"(d)
+                 : "xmm0", "xmm1", "memory");
+            d += 8;
+            s += 8;
+        }
+        n -= num_vectors * 8;
+    }
+
+    memcpy_unsafe(d, s, n);
+    return dest;
+}
+
+// Forwards to the SSE2 loop, which is a correct 16-byte aligned copy.
+EXTERNC void *memcpy_ssse3(void *dest, const void *src, size_t n)
+{
+    return memcpy_sse2(dest, src, n);
+}
+
+// Forwards to the SSE2 loop. TODO: movntdqa
+EXTERNC void *memcpy_sse4_1(void *dest, const void *src, size_t n)
+{
+    return memcpy_sse2(dest, src, n);
+}
+
+// Forwards to the SSE2 loop; a plain copy needs no SSE4.2 instruction.
+EXTERNC void *memcpy_sse4_2(void *dest, const void *src, size_t n)
+{
+    return memcpy_sse2(dest, src, n);
+}
diff --git a/Library/MemoryMoveSIMD.cpp b/Library/MemoryMoveSIMD.cpp
new file mode 100644
index 0000000..f51bb3a
--- /dev/null
+++ b/Library/MemoryMoveSIMD.cpp
@@ -0,0 +1,44 @@
+#include
+
+#include
+#include
+#include
+#include
+
+// TODO: Implement these functions
+
+EXTERNC void *memmove_sse(void *dest, const void *src, size_t n)
+{
+    memmove_unsafe(dest, src, n);
+    return dest;
+}
+
+EXTERNC void *memmove_sse2(void *dest, const void *src, size_t n)
+{
+    memmove_unsafe(dest, src, n);
+    return dest;
+}
+
+EXTERNC void *memmove_sse3(void *dest, const void *src, size_t n)
+{
+    memmove_unsafe(dest, src, n);
+    return dest;
+}
+
+EXTERNC void *memmove_ssse3(void *dest, const void *src, size_t n)
+{
+    memmove_unsafe(dest, src, n);
+    return dest;
+}
+
+EXTERNC void *memmove_sse4_1(void *dest, const void *src, size_t n)
+{
+    memmove_unsafe(dest, src, n);
+    return dest;
+}
+
+EXTERNC void *memmove_sse4_2(void *dest, const void *src, size_t n)
+{
+    memmove_unsafe(dest, src, n);
+    return dest;
+}
diff --git a/Library/MemorySetSIMD.cpp b/Library/MemorySetSIMD.cpp
new file mode 100644
index 0000000..53a3a1c
--- /dev/null
+++ b/Library/MemorySetSIMD.cpp
@@ -0,0 +1,39 @@
+#include
+
+#include
+#include
+#include
+#include
+
+// TODO: Implement these functions properly
+// Until then every variant forwards to memset_unsafe().
+
+EXTERNC void *memset_sse(void *dest, int c, size_t n)
+{
+    return memset_unsafe(dest, c, n);
+}
+
+EXTERNC void *memset_sse2(void *dest, int c, size_t n)
+{
+    return memset_unsafe(dest, c, n);
+}
+
+EXTERNC void *memset_sse3(void *dest, int c, size_t n)
+{
+    return memset_unsafe(dest, c, n);
+}
+
+EXTERNC void *memset_ssse3(void *dest, int c, size_t n)
+{
+    return memset_unsafe(dest, c, n);
+}
+
+EXTERNC void *memset_sse4_1(void *dest, int c, size_t n)
+{
+    return memset_unsafe(dest, c, n);
+}
+
+EXTERNC void *memset_sse4_2(void *dest, int c, size_t n)
+{
+    return memset_unsafe(dest, c, n);
+}
diff --git a/include/convert.h b/include/convert.h
index 2684b73..2c20dd4 100644
--- a/include/convert.h
+++ b/include/convert.h
@@ -39,6 +39,27 @@ extern "C"
     void *memmove_unsafe(void *dest, const void *src, size_t n);
     int memcmp(const void *vl, const void *vr, size_t n);
 
+    void *memcpy_sse(void *dest, const void *src, size_t n);
+    void *memcpy_sse2(void *dest, const void *src, size_t n);
+    void *memcpy_sse3(void *dest, const void *src, size_t n);
+    void *memcpy_ssse3(void *dest, const void *src, size_t n);
+    void *memcpy_sse4_1(void *dest, const void *src, size_t n);
+    void *memcpy_sse4_2(void *dest, const void *src, size_t n);
+
+    void *memset_sse(void *dest, int c, size_t n);
+    void *memset_sse2(void *dest, int c, size_t n);
+    void *memset_sse3(void *dest, int c, size_t n);
+    void *memset_ssse3(void *dest, int c, size_t n);
+    void *memset_sse4_1(void *dest, int c, size_t n);
+    void *memset_sse4_2(void *dest, int c, size_t n);
+
+    void *memmove_sse(void *dest, const void *src, size_t n);
+    void *memmove_sse2(void *dest, const void *src, size_t n);
+    void *memmove_sse3(void *dest, const void *src, size_t n);
+    void *memmove_ssse3(void *dest, const void *src, size_t n);
+    void *memmove_sse4_1(void *dest, const void *src, size_t n);
+    void *memmove_sse4_2(void *dest, const void *src, size_t n);
+
     long unsigned strlen(const char s[]);
     int strncmp(const char *s1, const char *s2, unsigned long n);
     char *strcat_unsafe(char *destination, const char *source);