TODO: implement a faster mem*

This commit is contained in:
Alex 2023-01-04 06:45:22 +02:00
parent fd477325a3
commit f91503d704
Signed by untrusted user who does not match committer: enderice2
GPG Key ID: EACC3AD603BAB4DD
3 changed files with 28 additions and 118 deletions

View File

@ -5,6 +5,11 @@
#include <debug.h> #include <debug.h>
#include <cpu.hpp> #include <cpu.hpp>
/*
TODO: Replace these functions with even more optimized versions.
The current versions are fast, but not as fast as they could be; we also need AVX implementations, not only SSE.
*/
EXTERNC void *memcpy_sse(void *dest, const void *src, size_t n) EXTERNC void *memcpy_sse(void *dest, const void *src, size_t n)
{ {
char *d = (char *)dest; char *d = (char *)dest;

View File

@ -5,40 +5,39 @@
#include <debug.h> #include <debug.h>
#include <cpu.hpp> #include <cpu.hpp>
/*
TODO: Replace these functions with even more optimized versions.
The current versions are fast, but not as fast as they could be; we also need AVX implementations, not only SSE.
*/
// TODO: Implement these functions // TODO: Implement these functions
EXTERNC void *memmove_sse(void *dest, const void *src, size_t n) EXTERNC void *memmove_sse(void *dest, const void *src, size_t n)
{ {
memmove_unsafe(dest, src, n); return memmove_unsafe(dest, src, n);
return dest;
} }
EXTERNC void *memmove_sse2(void *dest, const void *src, size_t n) EXTERNC void *memmove_sse2(void *dest, const void *src, size_t n)
{ {
memmove_unsafe(dest, src, n); return memmove_unsafe(dest, src, n);
return dest;
} }
EXTERNC void *memmove_sse3(void *dest, const void *src, size_t n) EXTERNC void *memmove_sse3(void *dest, const void *src, size_t n)
{ {
memmove_unsafe(dest, src, n); return memmove_unsafe(dest, src, n);
return dest;
} }
EXTERNC void *memmove_ssse3(void *dest, const void *src, size_t n) EXTERNC void *memmove_ssse3(void *dest, const void *src, size_t n)
{ {
memmove_unsafe(dest, src, n); return memmove_unsafe(dest, src, n);
return dest;
} }
EXTERNC void *memmove_sse4_1(void *dest, const void *src, size_t n) EXTERNC void *memmove_sse4_1(void *dest, const void *src, size_t n)
{ {
memmove_unsafe(dest, src, n); return memmove_unsafe(dest, src, n);
return dest;
} }
EXTERNC void *memmove_sse4_2(void *dest, const void *src, size_t n) EXTERNC void *memmove_sse4_2(void *dest, const void *src, size_t n)
{ {
memmove_unsafe(dest, src, n); return memmove_unsafe(dest, src, n);
return dest;
} }

View File

@ -5,110 +5,40 @@
#include <debug.h> #include <debug.h>
#include <cpu.hpp> #include <cpu.hpp>
// TODO: Implement these functions properly /*
TODO: Replace these functions with even more optimized versions.
The current versions are fast, but not as fast as they could be; we also need AVX implementations, not only SSE.
*/
// TODO: Implement these functions
EXTERNC void *memset_sse(void *dest, int c, size_t n) EXTERNC void *memset_sse(void *dest, int c, size_t n)
{ {
return memset_unsafe(dest, c, n); return memset_unsafe(dest, c, n);
char *d = (char *)dest;
if (((uintptr_t)d & 0xF) == 0)
{
size_t num_vectors = n / 16;
for (size_t i = 0; i < num_vectors; i++)
{
asmv("movaps (%0), %%xmm0\n"
"movaps %%xmm0, (%1)\n"
:
: "r"(c), "r"(d)
: "xmm0");
d += 16;
}
n -= num_vectors * 16;
}
memset_unsafe(d, c, n);
return dest;
} }
EXTERNC void *memset_sse2(void *dest, int c, size_t n) EXTERNC void *memset_sse2(void *dest, int c, size_t n)
{ {
return memset_unsafe(dest, c, n); return memset_unsafe(dest, c, n);
char *d = (char *)dest;
if (((uintptr_t)d & 0xF) == 0)
{
size_t num_vectors = n / 16;
for (size_t i = 0; i < num_vectors; i++)
{
asmv("movdqa (%0), %%xmm0\n"
"movdqa %%xmm0, (%1)\n"
:
: "r"(c), "r"(d)
: "xmm0");
d += 16;
}
n -= num_vectors * 16;
}
memset_unsafe(d, c, n);
return dest;
} }
EXTERNC void *memset_sse3(void *dest, int c, size_t n) EXTERNC void *memset_sse3(void *dest, int c, size_t n)
{ {
return memset_unsafe(dest, c, n); return memset_unsafe(dest, c, n);
char *d = (char *)dest;
if (((uintptr_t)d & 0x7) == 0)
{
size_t num_vectors = n / 8;
for (size_t i = 0; i < num_vectors; i++)
{
asmv("movq (%0), %%xmm0\n"
"movddup %%xmm0, %%xmm1\n"
"movq %%xmm1, (%1)\n"
:
: "r"(c), "r"(d)
: "xmm0", "xmm1");
d += 16;
}
n -= num_vectors * 16;
}
memset_unsafe(d, c, n);
return dest;
} }
EXTERNC void *memset_ssse3(void *dest, int c, size_t n) EXTERNC void *memset_ssse3(void *dest, int c, size_t n)
{ {
return memset_unsafe(dest, c, n); return memset_unsafe(dest, c, n);
char *d = (char *)dest;
if (((uintptr_t)d & 0xF) == 0)
{
size_t num_vectors = n / 16;
for (size_t i = 0; i < num_vectors; i++)
{
asmv("movdqa (%0), %%xmm0\n"
"movdqa 16(%0), %%xmm1\n"
"palignr $8, %%xmm0, %%xmm1\n"
"movdqa %%xmm1, (%1)\n"
:
: "r"(c), "r"(d)
: "xmm0", "xmm1");
d += 16;
}
n -= num_vectors * 16;
}
memset_unsafe(d, c, n);
return dest;
} }
EXTERNC void *memset_sse4_1(void *dest, int c, size_t n) EXTERNC void *memset_sse4_1(void *dest, int c, size_t n)
{ {
return memset_unsafe(dest, c, n); return memset_unsafe(dest, c, n);
}
EXTERNC void *memset_sse4_2(void *dest, int c, size_t n)
{
char *d = (char *)dest; char *d = (char *)dest;
if (((uintptr_t)d & 0xF) == 0) if (((uintptr_t)d & 0xF) == 0)
@ -116,7 +46,8 @@ EXTERNC void *memset_sse4_1(void *dest, int c, size_t n)
size_t num_vectors = n / 16; size_t num_vectors = n / 16;
for (size_t i = 0; i < num_vectors; i++) for (size_t i = 0; i < num_vectors; i++)
{ {
asmv("movdqa (%0), %%xmm0\n" asmv("movd %0, %%xmm0\n"
"pshufd $0, %%xmm0, %%xmm0\n"
"movdqa %%xmm0, (%1)\n" "movdqa %%xmm0, (%1)\n"
: :
: "r"(c), "r"(d) : "r"(c), "r"(d)
@ -129,28 +60,3 @@ EXTERNC void *memset_sse4_1(void *dest, int c, size_t n)
memset_unsafe(d, c, n); memset_unsafe(d, c, n);
return dest; return dest;
} }
EXTERNC void *memset_sse4_2(void *dest, int c, size_t n)
{
return memset_unsafe(dest, c, n);
char *d = (char *)dest;
if (((uintptr_t)d & 0xF) == 0)
{
size_t num_vectors = n / 16;
for (size_t i = 0; i < num_vectors; i++)
{
asmv("movdqa (%0), %%xmm0\n"
"pcmpistri $0, (%0), %%xmm0\n"
"movdqa %%xmm0, (%1)\n"
:
: "r"(d), "r"(c)
: "xmm0");
d += 16;
}
n -= num_vectors * 16;
}
memset_unsafe(d, c, n);
return dest;
}