Konwertuj 0x1234 na 0x11223344

Konwertuj 0x1234 na 0x11223344


Jak w wydajny sposób rozszerzyć liczbę szesnastkową 0x1234 do 0x11223344?


/* Expand every hex digit of the 16-bit value 0xARGB into a doubled digit,
 * producing the 32-bit value 0xAARRGGBB. */
unsigned int c = 0x1234, b;
b = (c & 0xff) << 4 | (c & 0xf) | (c & 0xff0) << 8
  | (c & 0xff00) << 12 | (c & 0xf000) << 16;
/* %#x prints an unsigned int in hexadecimal with a 0x prefix.
 * %p is only defined for void * arguments, so it must not be used here. */
printf("%#x -> %#x\n", c, b);

Wyjście:


0x1234 -> 0x11223344

Potrzebuję tego do konwersji kolorów. Użytkownicy podają swoje dane w postaci 0xARGB, a ja muszę je przekonwertować na 0xAARRGGBB . I tak, mogą być miliony, ponieważ każdy może być pikselem. 1000x1000 pikseli to milion.


Rzeczywista sprawa jest jeszcze bardziej skomplikowana, ponieważ pojedyncza 32-bitowa wartość zawiera zarówno kolor pierwszego planu, jak i kolor tła. Zatem 0xARGBargb ma zostać przekształcone w: [ 0xAARRGGBB, 0xaarrggbb ]


O tak, jeszcze jedno: w prawdziwej aplikacji neguję również kanał alfa, ponieważ w OpenGL 0xFF oznacza pełną nieprzezroczystość, a 0x00 pełną przezroczystość, co w większości przypadków jest niewygodne — zwykle potrzebna jest tylko część RGB i zakłada się, że przezroczystości nie ma.


Odpowiedzi:


Można to zrobić za pomocą SSE2 w następujący sposób:


// Expands the four 16-bit 0xARGB colors packed in `in` into four 32-bit
// 0xAARRGGBB colors using SSE2 instructions only.
//
// in    - four packed 16-bit colors
// outLo - receives the expansion of the low 32 bits of `in`
// outHi - receives the expansion of the high 32 bits of `in`
void ExpandSSE2(unsigned __int64 in, unsigned __int64 &outLo, unsigned __int64 &outHi) {
    __m128i const mask = _mm_set1_epi16((short)0xF00F);
    __m128i const mul0 = _mm_set1_epi16(0x0011);
    __m128i const mul1 = _mm_set1_epi16(0x1000);
    __m128i v;
    v = _mm_cvtsi64_si128(in);      // Move the 64-bit value to a 128-bit register
    v = _mm_unpacklo_epi8(v, v);    // 0x12   -> 0x1212
    v = _mm_and_si128(v, mask);     // 0x1212 -> 0x1002
    v = _mm_mullo_epi16(v, mul0);   // 0x1002 -> 0x1022
    v = _mm_mulhi_epu16(v, mul1);   // 0x1022 -> 0x0102 (high half of x * 0x1000 == x >> 4)
    v = _mm_mullo_epi16(v, mul0);   // 0x0102 -> 0x1122
    // _mm_extract_epi64 is an SSE4.1 intrinsic; these two extractions stay
    // within SSE2 so the function really only needs what its name promises.
    outLo = _mm_cvtsi128_si64(v);
    outHi = _mm_cvtsi128_si64(_mm_unpackhi_epi64(v, v));
}

Oczywiście ciało funkcji warto umieścić w wewnętrznej pętli, a stałe wyciągnąć poza nią. Warto też pominąć rejestry x64 i ładować wartości bezpośrednio do 128-bitowych rejestrów SSE. Przykład, jak to zrobić, znajduje się w implementacji SSE2 w teście wydajności poniżej.


Podstawą jest pięć instrukcji, które wykonują operację na czterech wartościach kolorów naraz. Czyli jest to tylko około 1,25 instrukcji na wartość koloru. Należy również zauważyć, że SSE2 jest dostępne wszędzie tam, gdzie jest dostępny x64.


Test wydajności różnych zaproponowanych tutaj rozwiązań
Kilka osób wspomniało, że jedynym sposobem, aby dowiedzieć się, co jest szybsze, jest uruchomienie kodu, i jest to bezdyskusyjnie prawda. Zebrałem więc kilka rozwiązań w jednym teście wydajności, abyśmy mogli porównać jabłka z jabłkami. Wybrałem rozwiązania, które moim zdaniem różniły się od pozostałych na tyle, że warto było je przetestować. Wszystkie rozwiązania odczytują dane z pamięci, operują na nich i zapisują wynik z powrotem do pamięci. W praktyce niektóre rozwiązania SSE będą wymagały dodatkowej uwagi przy wyrównaniu danych i obsłudze przypadków, gdy w danych wejściowych nie pozostało już pełnych 16 bajtów do przetworzenia. Testowany przeze mnie kod to x64 skompilowany w konfiguracji Release przy użyciu programu Visual Studio 2013, uruchomiony na procesorze Core i7 4+ GHz.


Oto moje wyniki:


ExpandOrig:               56.234 seconds  // From asker's original question
ExpandSmallLUT: 30.209 seconds // From Dmitry's answer
ExpandLookupSmallOneLUT: 33.689 seconds // from Dmitry's answer
ExpandLookupLarge: 51.312 seconds // A straightforward lookup table
ExpandAShelly: 43.829 seconds // From AShelly's answer
ExpandAShellyMulOp: 43.580 seconds // AShelly's answer with an optimization
ExpandSSE4: 17.854 seconds // My original SSE4 answer
ExpandSSE4Unroll: 17.405 seconds // My original SSE4 answer with loop unrolling
ExpandSSE2: 17.281 seconds // My current SSE2 answer
ExpandSSE2Unroll: 17.152 seconds // My current SSE2 answer with loop unrolling

W powyższych wynikach testu uwzględniłem kod pytającego oraz trzy implementacje tabeli przeglądowej, w tym implementację małej tabeli przeglądowej zaproponowaną w odpowiedzi Dmitrija. W zestawie jest też rozwiązanie AShelly, a także jego wersja z dokonaną przeze mnie optymalizacją (jedną operację można wyeliminować). Dołączyłem moją oryginalną implementację SSE4, a także lepszą wersję SSE2, którą stworzyłem później (teraz przedstawioną jako odpowiedź), oraz rozwinięte wersje obu, ponieważ były tutaj najszybsze i chciałem zobaczyć, jak bardzo rozwinięcie pętli je przyspieszy. Dołączyłem również implementację SSE4 odpowiedzi AShelly.


Jak dotąd muszę ogłosić się zwycięzcą. Ale źródło znajduje się poniżej, więc każdy może przetestować je na swojej platformie i włączyć do testów własne rozwiązanie, aby sprawdzić, czy stworzył rozwiązanie, które jest jeszcze szybsze.


#define DATA_SIZE_IN  ((unsigned)(1024 * 1024 * 128))
#define DATA_SIZE_OUT ((unsigned)(2 * DATA_SIZE_IN))
#define RERUN_COUNT 500
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <utility>
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4
// Expands packed 4-bit-per-channel colors to 8 bits per channel with plain
// shifts and masks (the formula from the original question).
//
// in   - first input byte; every 4-byte group is read as one unsigned whose
//        low and high 16-bit halves are two 0xARGB colors
// past - one past the last input byte; (past - in) must be a multiple of 4
// out  - receives 8 bytes per 4 input bytes: the expanded low half followed
//        by the expanded high half
void ExpandOrig(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    // The condition is tested before the first iteration so that an empty
    // range (in == past) touches neither buffer.
    while (in != past) {
        unsigned u, v;
        // Read in data. memcpy avoids dereferencing the byte buffer through
        // an unsigned pointer, which is unsafe for alignment and aliasing.
        memcpy(&u, in, sizeof u);
        v = u >> 16;
        u &= 0x0000FFFF;
        // Do computation: each term drops one or two source nibbles into
        // their doubled positions; overlapping terms carry identical bits.
        u = (u & 0x00FF) << 4
          | (u & 0x000F)
          | (u & 0x0FF0) << 8
          | (u & 0xFF00) << 12
          | (u & 0xF000) << 16;
        v = (v & 0x00FF) << 4
          | (v & 0x000F)
          | (v & 0x0FF0) << 8
          | (v & 0xFF00) << 12
          | (v & 0xF000) << 16;
        // Store data
        memcpy(out, &u, sizeof u);
        memcpy(out + 4, &v, sizeof v);
        in += 4;
        out += 8;
    }
}
// Byte-indexed lookup tables filled by MakeLutLo() and MakeLutHi().
unsigned LutLo[256];
unsigned LutHi[256];

// Fills LutLo so that LutLo[0xHL] == 0x0000HHLL: each nibble of the index is
// doubled and the result sits in the low 16 bits.
void MakeLutLo(void) {
    unsigned idx = 0;
    while (idx < 256) {
        unsigned const spread = ((idx & 0xF0) << 4) | (idx & 0x0F); // 0xHL   -> 0x0H0L
        LutLo[idx] = spread | (spread << 4);                        // 0x0H0L -> 0xHHLL
        ++idx;
    }
}
// Fills LutHi so that LutHi[0xHL] == 0xHHLL0000: each nibble of the index is
// doubled and the result sits in the high 16 bits.
void MakeLutHi(void) {
    unsigned idx = 0;
    while (idx < 256) {
        unsigned const spread = ((idx & 0xF0) << 20) | ((idx & 0x0F) << 16); // 0xHL -> 0x0H0L0000
        LutHi[idx] = spread | (spread << 4);                                 // -> 0xHHLL0000
        ++idx;
    }
}
// Expands packed 4-bit-per-channel colors using two 256-entry tables:
// LutHi supplies the expansion of a color's upper byte, LutLo of its lower
// byte. Both tables must have been filled (MakeLutHi / MakeLutLo) beforehand.
//
// in   - first input byte; every 4-byte group holds two 16-bit colors
// past - one past the last input byte; (past - in) must be a multiple of 4
// out  - receives 8 bytes per 4 input bytes
void ExpandLookupSmall(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    // Pre-tested loop: an empty range (in == past) touches neither buffer.
    while (in != past) {
        unsigned u, v;
        // Read in data. memcpy avoids dereferencing the byte buffer through
        // an unsigned pointer, which is unsafe for alignment and aliasing.
        memcpy(&u, in, sizeof u);
        v = u >> 16;
        u &= 0x0000FFFF;
        // Do computation
        u = LutHi[u >> 8] | LutLo[u & 0xFF];
        v = LutHi[v >> 8] | LutLo[v & 0xFF];
        // Store data
        memcpy(out, &u, sizeof u);
        memcpy(out + 4, &v, sizeof v);
        in += 4;
        out += 8;
    }
}
// Expands packed 4-bit-per-channel colors using only the LutLo table: the
// lookup for a color's upper byte is shifted left by 16 instead of coming
// from a second table. LutLo must have been filled (MakeLutLo) beforehand.
//
// in   - first input byte; every 4-byte group holds two 16-bit colors
// past - one past the last input byte; (past - in) must be a multiple of 4
// out  - receives 8 bytes per 4 input bytes
void ExpandLookupSmallOneLUT(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    // Pre-tested loop: an empty range (in == past) touches neither buffer.
    while (in != past) {
        unsigned u, v;
        // Read in data. memcpy avoids dereferencing the byte buffer through
        // an unsigned pointer, which is unsafe for alignment and aliasing.
        memcpy(&u, in, sizeof u);
        v = u >> 16;
        u &= 0x0000FFFF;
        // Do computation
        u = ((LutLo[u >> 8] << 16) | LutLo[u & 0xFF]);
        v = ((LutLo[v >> 8] << 16) | LutLo[v & 0xFF]);
        // Store data
        memcpy(out, &u, sizeof u);
        memcpy(out + 4, &v, sizeof v);
        in += 4;
        out += 8;
    }
}
// Lookup table with one entry for every 16-bit input value.
unsigned LutLarge[256 * 256];

// Fills LutLarge by combining the two byte tables: the entry for 0xABCD is
// LutHi[0xAB] | LutLo[0xCD]. LutHi and LutLo must already be filled.
void MakeLutLarge(void) {
    for (unsigned upper = 0; upper < 256; ++upper) {
        unsigned const highPart = LutHi[upper];
        for (unsigned lower = 0; lower < 256; ++lower) {
            LutLarge[(upper << 8) | lower] = highPart | LutLo[lower];
        }
    }
}
// Expands packed 4-bit-per-channel colors with a single lookup per color in
// the 65536-entry LutLarge table, which must have been filled (MakeLutLarge)
// beforehand.
//
// in   - first input byte; every 4-byte group holds two 16-bit colors
// past - one past the last input byte; (past - in) must be a multiple of 4
// out  - receives 8 bytes per 4 input bytes
void ExpandLookupLarge(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    // Pre-tested loop: an empty range (in == past) touches neither buffer.
    while (in != past) {
        unsigned u, v;
        // Read in data. memcpy avoids dereferencing the byte buffer through
        // an unsigned pointer, which is unsafe for alignment and aliasing.
        memcpy(&u, in, sizeof u);
        v = u >> 16;
        u &= 0x0000FFFF;
        // Do computation
        u = LutLarge[u];
        v = LutLarge[v];
        // Store data
        memcpy(out, &u, sizeof u);
        memcpy(out + 4, &v, sizeof v);
        in += 4;
        out += 8;
    }
}
// Expands packed 4-bit-per-channel colors with multiply-and-mask arithmetic
// (AShelly's answer): the nibbles are first spread out to 0x0A0R0G0B, then
// each one is duplicated into the empty nibble above it.
//
// in   - first input byte; every 4-byte group holds two 16-bit colors
// past - one past the last input byte; (past - in) must be a multiple of 4
// out  - receives 8 bytes per 4 input bytes
void ExpandAShelly(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    // Pre-tested loop: an empty range (in == past) touches neither buffer.
    while (in != past) {
        unsigned u, v, w, x;
        // Read in data. memcpy avoids dereferencing the byte buffer through
        // an unsigned pointer, which is unsafe for alignment and aliasing.
        memcpy(&u, in, sizeof u);
        v = u >> 16;
        u &= 0x0000FFFF;
        // Do computation: 0xARGB -> 0x0A0R0G0B
        w = (((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00);
        x = (((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00);
        // 0x0A0R0G0B -> 0xAARRGGBB
        w += w * 0x10;
        x += x * 0x10;
        // Store data
        memcpy(out, &w, sizeof w);
        memcpy(out + 4, &x, sizeof x);
        in += 4;
        out += 8;
    }
}
// Variant of ExpandAShelly in which the final "x += x * 0x10" step is folded
// into a single multiplication by 0x11.
//
// in   - first input byte; every 4-byte group holds two 16-bit colors
// past - one past the last input byte; (past - in) must be a multiple of 4
// out  - receives 8 bytes per 4 input bytes
void ExpandAShellyMulOp(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    // Pre-tested loop: an empty range (in == past) touches neither buffer.
    while (in != past) {
        unsigned u, v;
        // Read in data. memcpy avoids dereferencing the byte buffer through
        // an unsigned pointer, which is unsafe for alignment and aliasing.
        memcpy(&u, in, sizeof u);
        v = u >> 16;
        u &= 0x0000FFFF;
        // Do computation: 0xARGB -> 0x0A0R0G0B, then * 0x11 -> 0xAARRGGBB
        u = ((((u & 0xF0F) * 0x101) & 0xF000F) + (((u & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;
        v = ((((v & 0xF0F) * 0x101) & 0xF000F) + (((v & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;
        // Store data
        memcpy(out, &u, sizeof u);
        memcpy(out + 4, &v, sizeof v);
        in += 4;
        out += 8;
    }
}
void ExpandSSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask0 = _mm_set1_epi16((short)0x8000),
mask1 = _mm_set1_epi8(0x0F),
mul = _mm_set1_epi16(0x0011);
__m128i u, v, w, x;
do {
// Read input into low 8 bytes of u and v
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi8(u, u); // Expand each single byte to two bytes
u = _mm_unpacklo_epi8(u, u); // Do it again for v
w = _mm_srli_epi16(u, 4); // Copy the value into w and shift it right half a byte
x = _mm_srli_epi16(v, 4); // Do it again for v
u = _mm_blendv_epi8(u, w, mask0); // Select odd bytes from w, and even bytes from v, giving the the desired value in the upper nibble of each byte
v = _mm_blendv_epi8(v, x, mask0); // Do it again for v
u = _mm_and_si128(u, mask1); // Clear the all the upper nibbles
v = _mm_and_si128(v, mask1); // Do it again for v
u = _mm_mullo_epi16(u, mul); // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v = _mm_mullo_epi16(v, mul); // Do it again for v
// Write output
_mm_store_si128((__m128i*)(out ), u);
_mm_store_si128((__m128i*)(out + 16), v);
in += 16;
out += 32;
} while (in != past);
}
// Same computation as ExpandSSE4, unrolled by hand four times: each iteration
// loads 64 input bytes and stores 128 output bytes with the aligned load/store
// intrinsics. The body runs once before `in` is compared with `past`.
// Per input byte 0xXY the steps are: 0xXYXY -> 0x0XXY -> 0x0X0Y -> 0xXXYY.
void ExpandSSE4Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask0 = _mm_set1_epi16((short)0x8000),
mask1 = _mm_set1_epi8(0x0F),
mul = _mm_set1_epi16(0x0011);
__m128i u0, v0, w0, x0,
u1, v1, w1, x1,
u2, v2, w2, x2,
u3, v3, w3, x3;
do {
// Load four consecutive 16-byte groups of input
u0 = _mm_load_si128((__m128i const*)(in ));
u1 = _mm_load_si128((__m128i const*)(in + 16));
u2 = _mm_load_si128((__m128i const*)(in + 32));
u3 = _mm_load_si128((__m128i const*)(in + 48));
v0 = _mm_unpackhi_epi8(u0, u0); // Duplicate each of the upper 8 bytes into a 16-bit word (0xXY -> 0xXYXY)
u0 = _mm_unpacklo_epi8(u0, u0); // Same for the lower 8 bytes
v1 = _mm_unpackhi_epi8(u1, u1); // Again for v1
u1 = _mm_unpacklo_epi8(u1, u1); // Again for u1
v2 = _mm_unpackhi_epi8(u2, u2); // Again for v2
u2 = _mm_unpacklo_epi8(u2, u2); // Again for u2
v3 = _mm_unpackhi_epi8(u3, u3); // Again for v3
u3 = _mm_unpacklo_epi8(u3, u3); // Again for u3
w0 = _mm_srli_epi16(u0, 4); // w0 = u0 with every 16-bit word shifted right by half a byte
x0 = _mm_srli_epi16(v0, 4); // Same for v0 into x0
w1 = _mm_srli_epi16(u1, 4); // Again for u1
x1 = _mm_srli_epi16(v1, 4); // Again for v1
w2 = _mm_srli_epi16(u2, 4); // Again for u2
x2 = _mm_srli_epi16(v2, 4); // Again for v2
w3 = _mm_srli_epi16(u3, 4); // Again for u3
x3 = _mm_srli_epi16(v3, 4); // Again for v3
u0 = _mm_blendv_epi8(u0, w0, mask0); // Take the upper byte of each word from w0 and the lower byte from u0, so each byte has its wanted nibble in the low position
v0 = _mm_blendv_epi8(v0, x0, mask0); // Same for v0 and x0
u1 = _mm_blendv_epi8(u1, w1, mask0); // Again for u1
v1 = _mm_blendv_epi8(v1, x1, mask0); // Again for v1
u2 = _mm_blendv_epi8(u2, w2, mask0); // Again for u2
v2 = _mm_blendv_epi8(v2, x2, mask0); // Again for v2
u3 = _mm_blendv_epi8(u3, w3, mask0); // Again for u3
v3 = _mm_blendv_epi8(v3, x3, mask0); // Again for v3
u0 = _mm_and_si128(u0, mask1); // Clear the upper nibble of every byte
v0 = _mm_and_si128(v0, mask1); // Same for v0
u1 = _mm_and_si128(u1, mask1); // Again for u1
v1 = _mm_and_si128(v1, mask1); // Again for v1
u2 = _mm_and_si128(u2, mask1); // Again for u2
v2 = _mm_and_si128(v2, mask1); // Again for v2
u3 = _mm_and_si128(u3, mask1); // Again for u3
v3 = _mm_and_si128(v3, mask1); // Again for v3
u0 = _mm_mullo_epi16(u0, mul); // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v0 = _mm_mullo_epi16(v0, mul); // Same for v0
u1 = _mm_mullo_epi16(u1, mul); // Again for u1
v1 = _mm_mullo_epi16(v1, mul); // Again for v1
u2 = _mm_mullo_epi16(u2, mul); // Again for u2
v2 = _mm_mullo_epi16(v2, mul); // Again for v2
u3 = _mm_mullo_epi16(u3, mul); // Again for u3
v3 = _mm_mullo_epi16(v3, mul); // Again for v3
// Write output: for each input group, the expanded lower 8 bytes (u) then the expanded upper 8 bytes (v)
_mm_store_si128((__m128i*)(out ), u0);
_mm_store_si128((__m128i*)(out + 16), v0);
_mm_store_si128((__m128i*)(out + 32), u1);
_mm_store_si128((__m128i*)(out + 48), v1);
_mm_store_si128((__m128i*)(out + 64), u2);
_mm_store_si128((__m128i*)(out + 80), v2);
_mm_store_si128((__m128i*)(out + 96), u3);
_mm_store_si128((__m128i*)(out + 112), v3);
in += 64;
out += 128;
} while (in != past);
}
// Expands packed 4-bit-per-channel colors using SSE2 instructions only.
// Each iteration loads 16 input bytes and stores 32 output bytes using the
// aligned load/store intrinsics. The body runs once before `in` is compared
// with `past`, so at least one 16-byte group is always processed.
void ExpandSSE2(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    __m128i const outerNibbles = _mm_set1_epi16((short)0xF00F);
    __m128i const times17 = _mm_set1_epi16(0x0011);
    // The high half of x * 0x1000 equals x >> 4; the original author measured
    // the multiply as faster than a shift.
    __m128i const shiftRight4 = _mm_set1_epi16(0x1000);

    for (;;) {
        __m128i const packed = _mm_load_si128((__m128i const*)in);

        // Duplicate every byte into a word and keep the outer nibbles:
        // 0xXY -> 0xXYXY -> 0xX00Y
        __m128i lo = _mm_and_si128(_mm_unpacklo_epi8(packed, packed), outerNibbles);
        __m128i hi = _mm_and_si128(_mm_unpackhi_epi8(packed, packed), outerNibbles);

        // 0xX00Y -> 0xX0YY (low 16 bits of the product)
        lo = _mm_mullo_epi16(lo, times17);
        hi = _mm_mullo_epi16(hi, times17);

        // 0xX0YY -> 0x0X0Y
        lo = _mm_mulhi_epu16(lo, shiftRight4);
        hi = _mm_mulhi_epu16(hi, shiftRight4);

        // 0x0X0Y -> 0xXXYY
        lo = _mm_mullo_epi16(lo, times17);
        hi = _mm_mullo_epi16(hi, times17);

        _mm_store_si128((__m128i*)(out), lo);
        _mm_store_si128((__m128i*)(out + 16), hi);

        in += 16;
        out += 32;
        if (in == past)
            break;
    }
}
// Same computation as ExpandSSE2, unrolled by hand twice: each iteration
// loads 32 input bytes and stores 64 output bytes with the aligned load/store
// intrinsics. The body runs once before `in` is compared with `past`.
// Per input byte 0xXY the steps are: 0xXYXY -> 0xX00Y -> 0xX0YY -> 0x0X0Y -> 0xXXYY.
void ExpandSSE2Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask = _mm_set1_epi16((short)0xF00F),
mul0 = _mm_set1_epi16(0x0011),
mul1 = _mm_set1_epi16(0x1000);
__m128i u0, v0,
u1, v1;
do {
// Load two consecutive 16-byte groups of input
u0 = _mm_load_si128((__m128i const*)(in ));
u1 = _mm_load_si128((__m128i const*)(in + 16));
v0 = _mm_unpackhi_epi8(u0, u0); // Duplicate each of the upper 8 bytes into a 16-bit word (0xXY -> 0xXYXY)
u0 = _mm_unpacklo_epi8(u0, u0); // Same for the lower 8 bytes
v1 = _mm_unpackhi_epi8(u1, u1); // Again for v1
u1 = _mm_unpacklo_epi8(u1, u1); // Again for u1
// Keep only the outer nibbles of each word: 0xXYXY -> 0xX00Y
u0 = _mm_and_si128(u0, mask);
v0 = _mm_and_si128(v0, mask);
u1 = _mm_and_si128(u1, mask);
v1 = _mm_and_si128(v1, mask);
// Low 16 bits of word * 0x0011: 0xX00Y -> 0xX0YY
u0 = _mm_mullo_epi16(u0, mul0);
v0 = _mm_mullo_epi16(v0, mul0);
u1 = _mm_mullo_epi16(u1, mul0);
v1 = _mm_mullo_epi16(v1, mul0);
// High 16 bits of word * 0x1000, i.e. word >> 4: 0xX0YY -> 0x0X0Y
u0 = _mm_mulhi_epu16(u0, mul1);
v0 = _mm_mulhi_epu16(v0, mul1);
u1 = _mm_mulhi_epu16(u1, mul1);
v1 = _mm_mulhi_epu16(v1, mul1);
// Duplicate each nibble: 0x0X0Y -> 0xXXYY
u0 = _mm_mullo_epi16(u0, mul0);
v0 = _mm_mullo_epi16(v0, mul0);
u1 = _mm_mullo_epi16(u1, mul0);
v1 = _mm_mullo_epi16(v1, mul0);
// Write output: for each input group, the expanded lower 8 bytes (u) then the expanded upper 8 bytes (v)
_mm_store_si128((__m128i*)(out ), u0);
_mm_store_si128((__m128i*)(out + 16), v0);
_mm_store_si128((__m128i*)(out + 32), u1);
_mm_store_si128((__m128i*)(out + 48), v1);
in += 32;
out += 64;
} while (in != past);
}
void ExpandAShellySSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const zero = _mm_setzero_si128(),
v0F0F = _mm_set1_epi32(0x0F0F),
vF0F0 = _mm_set1_epi32(0xF0F0),
v0101 = _mm_set1_epi32(0x0101),
v1010 = _mm_set1_epi32(0x1010),
v000F000F = _mm_set1_epi32(0x000F000F),
v0F000F00 = _mm_set1_epi32(0x0F000F00),
v0011 = _mm_set1_epi32(0x0011);
__m128i u, v, w, x;
do {
// Read in data
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi16(u, zero);
u = _mm_unpacklo_epi16(u, zero);
// original source: ((((a & 0xF0F) * 0x101) & 0xF000F) + (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;
w = _mm_and_si128(u, v0F0F);
x = _mm_and_si128(v, v0F0F);
u = _mm_and_si128(u, vF0F0);
v = _mm_and_si128(v, vF0F0);
w = _mm_mullo_epi32(w, v0101); // _mm_mullo_epi32 is what makes this require SSE4 instead of SSE2
x = _mm_mullo_epi32(x, v0101);
u = _mm_mullo_epi32(u, v1010);
v = _mm_mullo_epi32(v, v1010);
w = _mm_and_si128(w, v000F000F);
x = _mm_and_si128(x, v000F000F);
u = _mm_and_si128(u, v0F000F00);
v = _mm_and_si128(v, v0F000F00);
u = _mm_add_epi32(u, w);
v = _mm_add_epi32(v, x);
u = _mm_mullo_epi32(u, v0011);
v = _mm_mullo_epi32(v, v0011);
// write output
_mm_store_si128((__m128i*)(out ), u);
_mm_store_si128((__m128i*)(out + 16), v);
in += 16;
out += 32;
} while (in != past);
}
int main() {
unsigned char *const indat = new unsigned char[DATA_SIZE_IN ],
*const outdat0 = new unsigned char[DATA_SIZE_OUT],
*const outdat1 = new unsigned char[DATA_SIZE_OUT],
* curout = outdat0,
* lastout = outdat1,
* place;
unsigned start,
stop;
place = indat + DATA_SIZE_IN - 1;
do {
*place = (unsigned char)rand();
} while (place-- != indat);
MakeLutLo();
MakeLutHi();
MakeLutLarge();
for (unsigned testcount = 0; testcount < 1000; ++testcount) {
// Solution posted by the asker
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandOrig(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandOrig:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
// Dmitry's small lookup table solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupSmall(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSmallLUT:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// Dmitry's small lookup table solution using only one lookup table
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupSmallOneLUT(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandLookupSmallOneLUT:\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// Large lookup table solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupLarge(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandLookupLarge:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// AShelly's Interleave bits by Binary Magic Numbers solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShelly(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShelly:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// AShelly's Interleave bits by Binary Magic Numbers solution optimizing out an addition
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShellyMulOp(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShellyMulOp:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE4 solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE4(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE4:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE4 solution unrolled
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE4Unroll(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE4Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE2 solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE2(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE2:\t\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE2 solution unrolled
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE2Unroll(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE2Unroll:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// AShelly's Interleave bits by Binary Magic Numbers solution implemented using SSE2
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShellySSE4(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShellySSE4:\t\t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
}
delete[] indat;
delete[] outdat0;
delete[] outdat1;
return 0;
}

UWAGA:


Początkowo miałem tutaj implementację SSE4. Znalazłem sposób na zaimplementowanie tego za pomocą SSE2, który jest lepszy, ponieważ będzie działał na większej liczbie platform. Implementacja SSE2 jest również szybsza. Tak więc rozwiązanie przedstawione na górze jest teraz implementacją SSE2, a nie SSE4. Implementację SSE4 można nadal zobaczyć w testach wydajności lub w historii edycji.


Niektóre odpowiedzi na kod


unsigned int c = 0x1234, b;
b = (c &
0xff) <<
4 | c &
0xf | (c &
0xff0) <<
8
| (c &
0xff00) <<
12 | (c &
0xf000) <<
16;
printf("%p ->
%p\n", c, b);
0x1234 -> 0x11223344
void ExpandSSE2(unsigned __int64 in, unsigned __int64 &outLo, unsigned __int64 &outHi) {   __m128i const mask = _mm_set1_epi16((short)0xF00F);
__m128i const mul0 = _mm_set1_epi16(0x0011);
__m128i const mul1 = _mm_set1_epi16(0x1000);
__m128i
v;
v = _mm_cvtsi64_si128(in);
// Move the 64-bit value to a 128-bit register v = _mm_unpacklo_epi8(v, v);
// 0x12 ->
0x1212 v = _mm_and_si128(v, mask);
// 0x1212 ->
0x1002 v = _mm_mullo_epi16(v, mul0);
// 0x1002 ->
0x1022 v = _mm_mulhi_epu16(v, mul1);
// 0x1022 ->
0x0102 v = _mm_mullo_epi16(v, mul0);
// 0x0102 ->
0x1122 outLo = _mm_extract_epi64(v, 0);
outHi = _mm_extract_epi64(v, 1);
}
ExpandOrig:               56.234 seconds  // From asker's original question
ExpandSmallLUT:           30.209 seconds  // From Dmitry's answer
ExpandLookupSmallOneLUT:  33.689 seconds  // from Dmitry's answer
ExpandLookupLarge:        51.312 seconds  // A straightforward lookup table
ExpandAShelly:            43.829 seconds  // From AShelly's answer
ExpandAShellyMulOp:       43.580 seconds  // AShelly's answer with an optimization
ExpandSSE4:               17.854 seconds  // My original SSE4 answer
ExpandSSE4Unroll:         17.405 seconds  // My original SSE4 answer with loop unrolling
ExpandSSE2:               17.281 seconds  // My current SSE2 answer
ExpandSSE2Unroll:         17.152 seconds  // My current SSE2 answer with loop unrolling
#define DATA_SIZE_IN  ((unsigned)(1024 * 1024 * 128)) #define DATA_SIZE_OUT ((unsigned)(2 * DATA_SIZE_IN)) #define RERUN_COUNT   500  #include <cstdlib>
#include <ctime>
#include <iostream>
#include <utility>
#include <emmintrin.h>
// SSE2 #include <tmmintrin.h>
// SSSE3 #include <smmintrin.h>
// SSE4 void ExpandOrig(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v;
do {
// Read in data
u = *(unsigned const*)in;
v = u >>
16;
u &= 0x0000FFFF;
// Do computation
u = (u &
0x00FF) <<
4
| (u &
0x000F)
| (u &
0x0FF0) <<
8
| (u &
0xFF00) <<
12
| (u &
0xF000) <<
16;
v = (v &
0x00FF) <<
4
| (v &
0x000F)
| (v &
0x0FF0) <<
8
| (v &
0xFF00) <<
12
| (v &
0xF000) <<
16;
// Store data
*(unsigned*)(out)
= u;
*(unsigned*)(out + 4) = v;
in
+= 4;
out
+= 8;
} while (in != past);
} unsigned LutLo[256],
LutHi[256];
void MakeLutLo(void) { for (unsigned i = 0, x;
i <
256;
++i) {
x
= i;
x
= ((x &
0xF0) <<
4) | (x &
0x0F);
x
|= (x <<
4);
LutLo[i] = x;
} } void MakeLutHi(void) { for (unsigned i = 0, x;
i <
256;
++i) {
x
= i;
x
= ((x &
0xF0) <<
20) | ((x &
0x0F) <<
16);
x
|= (x <<
4);
LutHi[i] = x;
} } void ExpandLookupSmall(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v;
do {
// Read in data
u = *(unsigned const*)in;
v = u >>
16;
u &= 0x0000FFFF;
// Do computation
u = LutHi[u >>
8] | LutLo[u &
0xFF];
v = LutHi[v >>
8] | LutLo[v &
0xFF];
// Store data
*(unsigned*)(out)
= u;
*(unsigned*)(out + 4) = v;
in
+= 4;
out
+= 8;
} while (in != past);
} void ExpandLookupSmallOneLUT(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v;
do {
// Read in data
u = *(unsigned const*)in;
v = u >>
16;
u &= 0x0000FFFF;
// Do computation
u = ((LutLo[u >>
8] <<
16) | LutLo[u &
0xFF]);
v = ((LutLo[v >>
8] <<
16) | LutLo[v &
0xFF]);
// Store data
*(unsigned*)(out) = u;
*(unsigned*)(out + 4) = v;
in += 4;
out += 8;
} while (in != past);
} unsigned LutLarge[256 * 256];
void MakeLutLarge(void) { for (unsigned i = 0;
i <
(256 * 256);
++i)
LutLarge[i] = LutHi[i >>
8] | LutLo[i &
0xFF];
} void ExpandLookupLarge(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v;
do {
// Read in data
u = *(unsigned const*)in;
v = u >>
16;
u &= 0x0000FFFF;
// Do computation
u = LutLarge[u];
v = LutLarge[v];
// Store data
*(unsigned*)(out)
= u;
*(unsigned*)(out + 4) = v;
in
+= 4;
out
+= 8;
} while (in != past);
} void ExpandAShelly(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v, w, x;
do {
// Read in data
u = *(unsigned const*)in;
v = u >>
16;
u &= 0x0000FFFF;
// Do computation
w = (((u &
0xF0F) * 0x101) &
0xF000F) + (((u &
0xF0F0) * 0x1010) &
0xF000F00);
x = (((v &
0xF0F) * 0x101) &
0xF000F) + (((v &
0xF0F0) * 0x1010) &
0xF000F00);
w += w * 0x10;
x += x * 0x10;
// Store data
*(unsigned*)(out)
= w;
*(unsigned*)(out + 4) = x;
in
+= 4;
out
+= 8;
} while (in != past);
} void ExpandAShellyMulOp(unsigned char const *in, unsigned char const *past, unsigned char *out) { unsigned u, v;
do {
// Read in data
u = *(unsigned const*)in;
v = u >>
16;
u &= 0x0000FFFF;
// Do computation
u = ((((u &
0xF0F) * 0x101) &
0xF000F) + (((u &
0xF0F0) * 0x1010) &
0xF000F00)) * 0x11;
v = ((((v &
0xF0F) * 0x101) &
0xF000F) + (((v &
0xF0F0) * 0x1010) &
0xF000F00)) * 0x11;
// Store data
*(unsigned*)(out) = u;
*(unsigned*)(out + 4) = v;
in += 4;
out += 8;
} while (in != past);
} void ExpandSSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask0 = _mm_set1_epi16((short)0x8000),
mask1 = _mm_set1_epi8(0x0F),
mul = _mm_set1_epi16(0x0011);
__m128i
u, v, w, x;
do {
// Read input into low 8 bytes of u and v
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi8(u, u);
// Expand each single byte to two bytes
u = _mm_unpacklo_epi8(u, u);
// Do it again for v
w = _mm_srli_epi16(u, 4);
// Copy the value into w and shift it right half a byte
x = _mm_srli_epi16(v, 4);
// Do it again for v
u = _mm_blendv_epi8(u, w, mask0);
// Select odd bytes from w, and even bytes from v, giving the the desired value in the upper nibble of each byte
v = _mm_blendv_epi8(v, x, mask0);
// Do it again for v
u = _mm_and_si128(u, mask1);
// Clear the all the upper nibbles
v = _mm_and_si128(v, mask1);
// Do it again for v
u = _mm_mullo_epi16(u, mul);
// Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v = _mm_mullo_epi16(v, mul);
// Do it again for v
// Write output
_mm_store_si128((__m128i*)(out
), u);
_mm_store_si128((__m128i*)(out + 16), v);
in += 16;
out += 32;
} while (in != past);
} void ExpandSSE4Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask0 = _mm_set1_epi16((short)0x8000),
mask1 = _mm_set1_epi8(0x0F),
mul = _mm_set1_epi16(0x0011);
__m128i
u0, v0, w0, x0,
u1, v1, w1, x1,
u2, v2, w2, x2,
u3, v3, w3, x3;
do {
// Read input into low 8 bytes of u and v
u0 = _mm_load_si128((__m128i const*)(in
));
u1 = _mm_load_si128((__m128i const*)(in + 16));
u2 = _mm_load_si128((__m128i const*)(in + 32));
u3 = _mm_load_si128((__m128i const*)(in + 48));
v0 = _mm_unpackhi_epi8(u0, u0);
// Expand each single byte to two bytes
u0 = _mm_unpacklo_epi8(u0, u0);
// Do it again for v
v1 = _mm_unpackhi_epi8(u1, u1);
// Do it again
u1 = _mm_unpacklo_epi8(u1, u1);
// Again for u1
v2 = _mm_unpackhi_epi8(u2, u2);
// Again for v1
u2 = _mm_unpacklo_epi8(u2, u2);
// Again for u2
v3 = _mm_unpackhi_epi8(u3, u3);
// Again for v2
u3 = _mm_unpacklo_epi8(u3, u3);
// Again for u3
w0 = _mm_srli_epi16(u0, 4);
// Copy the value into w and shift it right half a byte
x0 = _mm_srli_epi16(v0, 4);
// Do it again for v
w1 = _mm_srli_epi16(u1, 4);
// Again for u1
x1 = _mm_srli_epi16(v1, 4);
// Again for v1
w2 = _mm_srli_epi16(u2, 4);
// Again for u2
x2 = _mm_srli_epi16(v2, 4);
// Again for v2
w3 = _mm_srli_epi16(u3, 4);
// Again for u3
x3 = _mm_srli_epi16(v3, 4);
// Again for v3
u0 = _mm_blendv_epi8(u0, w0, mask0);
// Select even bytes from w, and odd bytes from v, giving the the desired value in the upper nibble of each byte
v0 = _mm_blendv_epi8(v0, x0, mask0);
// Do it again for v
u1 = _mm_blendv_epi8(u1, w1, mask0);
// Again for u1
v1 = _mm_blendv_epi8(v1, x1, mask0);
// Again for v1
u2 = _mm_blendv_epi8(u2, w2, mask0);
// Again for u2
v2 = _mm_blendv_epi8(v2, x2, mask0);
// Again for v2
u3 = _mm_blendv_epi8(u3, w3, mask0);
// Again for u3
v3 = _mm_blendv_epi8(v3, x3, mask0);
// Again for v3
u0 = _mm_and_si128(u0, mask1);
// Clear the all the upper nibbles
v0 = _mm_and_si128(v0, mask1);
// Do it again for v
u1 = _mm_and_si128(u1, mask1);
// Again for u1
v1 = _mm_and_si128(v1, mask1);
// Again for v1
u2 = _mm_and_si128(u2, mask1);
// Again for u2
v2 = _mm_and_si128(v2, mask1);
// Again for v2
u3 = _mm_and_si128(u3, mask1);
// Again for u3
v3 = _mm_and_si128(v3, mask1);
// Again for v3
u0 = _mm_mullo_epi16(u0, mul);
// Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v0 = _mm_mullo_epi16(v0, mul);
// Do it again for v
u1 = _mm_mullo_epi16(u1, mul);
// Again for u1
v1 = _mm_mullo_epi16(v1, mul);
// Again for v1
u2 = _mm_mullo_epi16(u2, mul);
// Again for u2
v2 = _mm_mullo_epi16(v2, mul);
// Again for v2
u3 = _mm_mullo_epi16(u3, mul);
// Again for u3
v3 = _mm_mullo_epi16(v3, mul);
// Again for v3
// Write output
_mm_store_si128((__m128i*)(out
), u0);
_mm_store_si128((__m128i*)(out + 16), v0);
_mm_store_si128((__m128i*)(out + 32), u1);
_mm_store_si128((__m128i*)(out + 48), v1);
_mm_store_si128((__m128i*)(out + 64), u2);
_mm_store_si128((__m128i*)(out + 80), v2);
_mm_store_si128((__m128i*)(out + 96), u3);
_mm_store_si128((__m128i*)(out + 112), v3);
in += 64;
out += 128;
} while (in != past);
} void ExpandSSE2(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask = _mm_set1_epi16((short)0xF00F),
mul0 = _mm_set1_epi16(0x0011),
mul1 = _mm_set1_epi16(0x1000);
__m128i
u, v;
do {
// Read input into low 8 bytes of u and v
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi8(u, u);
// Expand each single byte to two bytes
u = _mm_unpacklo_epi8(u, u);
// Do it again for v
u = _mm_and_si128(u, mask);
v = _mm_and_si128(v, mask);
u = _mm_mullo_epi16(u, mul0);
v = _mm_mullo_epi16(v, mul0);
u = _mm_mulhi_epu16(u, mul1);
// This can also be done with a right shift of 4 bits, but this seems to mesure faster
v = _mm_mulhi_epu16(v, mul1);
u = _mm_mullo_epi16(u, mul0);
v = _mm_mullo_epi16(v, mul0);
// write output
_mm_store_si128((__m128i*)(out
), u);
_mm_store_si128((__m128i*)(out + 16), v);
in += 16;
out += 32;
} while (in != past);
} void ExpandSSE2Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) { __m128i const mask = _mm_set1_epi16((short)0xF00F),
// ExpandSSE2Unroll: same nibble-duplicating expansion as the non-unrolled SSE2
// routine (0x12 -> 0x1122 per 16-bit lane), but with two 16-byte input vectors
// processed per iteration: 32 bytes are read from `in` and 64 bytes are
// written to `out` until `in` reaches `past`.
// NOTE(review): the aligned load/store intrinsics require 16-byte aligned
// pointers, and the do-while with `in != past` only terminates correctly for a
// non-empty range whose length is a multiple of 32 -- presumably guaranteed by
// the caller; confirm.
// mask (0xF00F): keeps the high nibble of the upper byte and the low nibble of
// the lower byte of each 16-bit lane.
// mul0 (0x0011): multiplying by it adds a copy of the value shifted left by 4.
// mul1 (0x1000): the high half of a 16-bit multiply by it is the value >> 4.
mul0 = _mm_set1_epi16(0x0011),
mul1 = _mm_set1_epi16(0x1000);
__m128i
u0, v0,
u1, v1;
do {
// Load 2 x 16 input bytes into u0 and u1
u0 = _mm_load_si128((__m128i const*)(in
));
u1 = _mm_load_si128((__m128i const*)(in + 16));
// v0: each of the upper 8 bytes of u0 doubled into a 16-bit lane (0x12 -> 0x1212)
v0 = _mm_unpackhi_epi8(u0, u0);
// u0: the same for its lower 8 bytes
u0 = _mm_unpacklo_epi8(u0, u0);
// v1: upper 8 bytes of u1, doubled
v1 = _mm_unpackhi_epi8(u1, u1);
// u1: lower 8 bytes of u1, doubled
u1 = _mm_unpacklo_epi8(u1, u1);
// Keep one nibble per byte (0x1212 -> 0x1002)
u0 = _mm_and_si128(u0, mask);
v0 = _mm_and_si128(v0, mask);
u1 = _mm_and_si128(u1, mask);
v1 = _mm_and_si128(v1, mask);
// Duplicate the low nibble into the nibble above it (0x1002 -> 0x1022)
u0 = _mm_mullo_epi16(u0, mul0);
v0 = _mm_mullo_epi16(v0, mul0);
u1 = _mm_mullo_epi16(u1, mul0);
v1 = _mm_mullo_epi16(v1, mul0);
// Shift each lane right by 4 bits via the high half of the product (0x1022 -> 0x0102)
u0 = _mm_mulhi_epu16(u0, mul1);
v0 = _mm_mulhi_epu16(v0, mul1);
u1 = _mm_mulhi_epu16(u1, mul1);
v1 = _mm_mulhi_epu16(v1, mul1);
// Duplicate every nibble into the empty nibble above it (0x0102 -> 0x1122)
u0 = _mm_mullo_epi16(u0, mul0);
v0 = _mm_mullo_epi16(v0, mul0);
u1 = _mm_mullo_epi16(u1, mul0);
v1 = _mm_mullo_epi16(v1, mul0);
// Write 64 output bytes
_mm_store_si128((__m128i*)(out
), u0);
_mm_store_si128((__m128i*)(out + 16), v0);
_mm_store_si128((__m128i*)(out + 32), u1);
_mm_store_si128((__m128i*)(out + 48), v1);
in += 32;
out += 64;
} while (in != past);
}

/*
 * ExpandAShellySSE4 - vectorised form of the scalar "binary magic numbers"
 * expression
 *
 *     ((((a & 0xF0F) * 0x101) & 0xF000F) +
 *      (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11
 *
 * applied to four 16-bit inputs at a time, each widened to a 32-bit lane.
 *
 * in   - start of the input; 16 bytes are read per iteration.
 * past - one past the end of the input; the loop stops when `in` equals it.
 * out  - destination; 32 bytes are written per iteration.
 *
 * NOTE(review): the aligned load/store intrinsics require 16-byte aligned
 * pointers, and the do-while with `in != past` needs a non-empty input whose
 * length is a multiple of 16 -- presumably guaranteed by the caller; confirm.
 */
void ExpandAShellySSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
    __m128i const zero      = _mm_setzero_si128(),
                  v0F0F     = _mm_set1_epi32(0x0F0F),
                  vF0F0     = _mm_set1_epi32(0xF0F0),
                  v0101     = _mm_set1_epi32(0x0101),
                  v1010     = _mm_set1_epi32(0x1010),
                  v000F000F = _mm_set1_epi32(0x000F000F),
                  v0F000F00 = _mm_set1_epi32(0x0F000F00),
                  v0011     = _mm_set1_epi32(0x0011);
    __m128i u, v, w, x;

    do {
        // Read in data and zero-extend every 16-bit input to a 32-bit lane:
        // u takes the lower four inputs, v the upper four.
        u = _mm_load_si128((__m128i const*)in);
        v = _mm_unpackhi_epi16(u, zero);
        u = _mm_unpacklo_epi16(u, zero);

        // w/x: the (a & 0x0F0F) half of the expression; u/v: the (a & 0xF0F0) half.
        w = _mm_and_si128(u, v0F0F);
        x = _mm_and_si128(v, v0F0F);
        u = _mm_and_si128(u, vF0F0);
        v = _mm_and_si128(v, vF0F0);

        // _mm_mullo_epi32 is what makes this require SSE4 instead of SSE2
        w = _mm_mullo_epi32(w, v0101);
        x = _mm_mullo_epi32(x, v0101);
        u = _mm_mullo_epi32(u, v1010);
        v = _mm_mullo_epi32(v, v1010);

        // Keep exactly one nibble per output byte in each half ...
        w = _mm_and_si128(w, v000F000F);
        x = _mm_and_si128(x, v000F000F);
        u = _mm_and_si128(u, v0F000F00);
        v = _mm_and_si128(v, v0F000F00);

        // ... merge the halves, then duplicate every nibble (* 0x11).
        u = _mm_add_epi32(u, w);
        v = _mm_add_epi32(v, x);
        u = _mm_mullo_epi32(u, v0011);
        v = _mm_mullo_epi32(v, v0011);

        // write output
        _mm_store_si128((__m128i*)(out), u);
        _mm_store_si128((__m128i*)(out + 16), v);
        in += 16;
        out += 32;
    } while (in != past);
}

/*
 * Times one expansion routine and reports the result.
 *
 * Calls expand(in, past, curout) RERUN_COUNT times, prints `label` followed by
 * the elapsed CPU time as minutes:seconds:.milliseconds, then swaps `curout`
 * and `lastout` so the next routine writes into the other buffer. When
 * `verify` is true the two buffers are compared afterwards and
 * "INCORRECT OUTPUT" is printed if they differ.
 *
 * clock() counts in units of 1/CLOCKS_PER_SEC seconds, so the tick difference
 * is converted to milliseconds explicitly instead of assuming 1000 ticks per
 * second. The millisecond field is zero-padded to three digits so that e.g.
 * 5 ms is printed as ".005" rather than ".5".
 */
template <typename ExpandFn>
static void RunBenchmark(char const *label, ExpandFn expand,
                         unsigned char *in, unsigned char *past,
                         unsigned char *&curout, unsigned char *&lastout,
                         bool verify) {
    clock_t const start = clock();
    for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
        expand(in, past, curout);
    clock_t const stop = clock();

    unsigned long const elapsedMs =
        (unsigned long)((double)(stop - start) * 1000.0 / CLOCKS_PER_SEC);
    unsigned long const ms = elapsedMs % 1000;

    std::cout << label
              << ((elapsedMs / 1000) / 60) << ':'
              << ((elapsedMs / 1000) % 60) << ":."
              << (ms < 100 ? "0" : "") << (ms < 10 ? "0" : "") << ms
              << std::endl;

    std::swap(curout, lastout);
    if (verify && memcmp(curout, lastout, DATA_SIZE_OUT))
        std::cout << "INCORRECT OUTPUT" << std::endl;
}

/*
 * Benchmark driver: fills the input buffer with rand() bytes, builds the
 * lookup tables, then repeatedly times every expansion routine. The first
 * routine produces the reference output; every later routine's output is
 * compared against the output of the routine that ran just before it.
 *
 * NOTE(review): the SSE routines use aligned loads/stores; buffers obtained
 * from new[] are presumably sufficiently aligned on the target platform --
 * confirm.
 */
int main() {
    unsigned char *const indat   = new unsigned char[DATA_SIZE_IN],
                  *const outdat0 = new unsigned char[DATA_SIZE_OUT],
                  *const outdat1 = new unsigned char[DATA_SIZE_OUT],
                  *curout  = outdat0,
                  *lastout = outdat1;
    unsigned char *const inEnd = indat + DATA_SIZE_IN;

    // Fill from the last byte down to the first (same order as before) without
    // ever forming a pointer in front of the array.
    for (unsigned char *place = inEnd; place != indat; )
        *--place = (unsigned char)rand();

    MakeLutLo();
    MakeLutHi();
    MakeLutLarge();

    for (unsigned testcount = 0; testcount < 1000; ++testcount) {
        // Solution posted by the asker (reference output, nothing to compare against yet)
        RunBenchmark("ExpandOrig:\t\t\t", ExpandOrig, indat, inEnd, curout, lastout, false);
        // Dmitry's small lookup table solution
        RunBenchmark("ExpandSmallLUT:\t\t\t", ExpandLookupSmall, indat, inEnd, curout, lastout, true);
        // Dmitry's small lookup table solution using only one lookup table
        RunBenchmark("ExpandLookupSmallOneLUT:\t", ExpandLookupSmallOneLUT, indat, inEnd, curout, lastout, true);
        // Large lookup table solution
        RunBenchmark("ExpandLookupLarge:\t\t", ExpandLookupLarge, indat, inEnd, curout, lastout, true);
        // AShelly's Interleave bits by Binary Magic Numbers solution
        RunBenchmark("ExpandAShelly:\t\t\t", ExpandAShelly, indat, inEnd, curout, lastout, true);
        // AShelly's Interleave bits by Binary Magic Numbers solution optimizing out an addition
        RunBenchmark("ExpandAShellyMulOp:\t\t", ExpandAShellyMulOp, indat, inEnd, curout, lastout, true);
        // My SSE4 solution
        RunBenchmark("ExpandSSE4:\t\t\t", ExpandSSE4, indat, inEnd, curout, lastout, true);
        // My SSE4 solution unrolled
        RunBenchmark("ExpandSSE4Unroll:\t\t", ExpandSSE4Unroll, indat, inEnd, curout, lastout, true);
        // My SSE2 solution
        RunBenchmark("ExpandSSE2:\t\t\t", ExpandSSE2, indat, inEnd, curout, lastout, true);
        // My SSE2 solution unrolled
        RunBenchmark("ExpandSSE2Unroll:\t\t", ExpandSSE2Unroll, indat, inEnd, curout, lastout, true);
        // AShelly's Interleave bits by Binary Magic Numbers solution implemented using SSE4
        RunBenchmark("ExpandAShellySSE4:\t\t", ExpandAShellySSE4, indat, inEnd, curout, lastout, true);
    }

    delete[] indat;
    delete[] outdat0;
    delete[] outdat1;
    return 0;
}
#include <stdio.h>
/*
 * Demonstrates the shift-and-mask expansion of 0x1234 into 0x11223344 and
 * prints the result.
 */
int main() {
    unsigned x = 0x1234;

    /* Place a copy of the value 8 bits higher; overlapping bits are masked out next. */
    x = (x << 8) | x;
    /* Keep one nibble per byte: 0x01020304. */
    x = ((x & 0x00f000f0) << 4) | (x & 0x000f000f);
    /* Duplicate each nibble into the empty nibble above it: 0x11223344. */
    x = (x << 4) | x;

    printf("0x1234 -> 0x%08x\n", x);
    return 0;
}
/*
 * Allocates and fills a 65536-entry table in which entry i holds i with every
 * hex digit doubled (0x1234 -> 0x11223344).
 *
 * Returns the table, or NULL when the allocation fails. The caller owns the
 * memory and releases it with free().
 *
 * To invert the high byte as mentioned in the edit, XOR each entry with
 * 0xff000000 before storing it.
 */
unsigned *makeLookupTable(void)
{
    const unsigned entry_count = 65536u;
    unsigned *table = malloc(entry_count * sizeof *table);
    unsigned index;

    if (table == NULL)
        return NULL;

    for (index = 0; index < entry_count; ++index) {
        /* copy the value 8 bits up, then keep one nibble per byte */
        unsigned spread = index | (index << 8);
        spread = ((spread & 0x00f000f0) << 4) | (spread & 0x000f000f);
        /* double every nibble */
        table[index] = spread | (spread << 4);
    }
    return table;
}
/* Direct lookup. NOTE(review): only in bounds while input < 65536, assuming
 * lookuptable has 65536 entries -- confirm against its allocation. */
result = lookuptable[input];
/* Masked lookup: only the low 16 bits of input are used as the index. */
result = lookuptable[input &
0xffff];
/*
 * Allocates and fills a 256-entry table in which entry i holds the byte i with
 * both hex digits doubled, in the low 16 bits (0x34 -> 0x3344).
 *
 * Returns the table, or NULL when the allocation fails. The caller owns the
 * memory and releases it with free().
 */
unsigned *makeLookupTableLow(void)
{
    const unsigned entry_count = 256u;
    unsigned *table = malloc(entry_count * sizeof *table);
    unsigned byte;

    if (table == NULL)
        return NULL;

    for (byte = 0; byte < entry_count; ++byte) {
        /* move the high nibble up by 4 so each nibble sits in its own byte */
        const unsigned spread = ((byte & 0xf0) << 4) | (byte & 0x0f);
        /* double every nibble */
        table[byte] = spread | (spread << 4);
    }
    return table;
}
/*
 * Allocates and fills a 256-entry table in which entry i holds the byte i with
 * both hex digits doubled, placed in the upper 16 bits (0x12 -> 0x11220000).
 *
 * Returns the table, or NULL when the allocation fails. The caller owns the
 * memory and releases it with free().
 *
 * To invert the high byte, XOR each entry with 0xff000000 before storing it.
 */
unsigned *makeLookupTableHigh(void)
{
    const unsigned entry_count = 256u;
    unsigned *table = malloc(entry_count * sizeof *table);
    unsigned byte;

    if (table == NULL)
        return NULL;

    for (byte = 0; byte < entry_count; ++byte) {
        /* one nibble per byte, positioned in the upper half of the word */
        const unsigned spread = ((byte & 0xf0) << 20) | ((byte & 0x0f) << 16);
        /* double every nibble */
        table[byte] = spread | (spread << 4);
    }
    return table;
}
/* Two-table variant: the hightable entry for (input >> 8) is OR-ed with the
 * lowtable entry for the low 8 bits of input.
 * NOTE(review): input >> 8 is only a valid index for a 256-entry table while
 * input < 0x10000 -- confirm against the table sizes. */
result = hightable[input >>
8] | lowtable[input &
0xff];
/* One-table variant: lowtable serves both halves; the entry for (input >> 8)
 * is shifted left by 16 bits before being OR-ed in. */
result = (lowtable[input >>
8] <<
16) | lowtable[input &
0xff];
/* Optional: XOR with 0xff000000 flips all bits of the most significant byte. */
result ^= 0xff000000;
/* to invert high byte */
                            64b SSE4.1  32b SSE4.1  32b no SSE
--------------------------  ----------  ----------  ----------
ExpandOrig           time:    3.502 s     3.501 s     6.260 s
ExpandLookupSmall    time:    3.530 s     3.997 s     3.996 s
ExpandLookupLarge    time:    3.434 s     3.419 s     3.427 s
ExpandIsalamon       time:    3.654 s     3.673 s     8.870 s
ExpandIsalamonOpt    time:    3.784 s     3.720 s     8.719 s
ExpandChronoKitsune  time:    3.658 s     3.463 s     6.546 s
ExpandEvgenyKluev    time:    6.790 s     7.697 s    13.383 s
ExpandIammilind      time:    3.485 s     3.498 s     6.436 s
ExpandDmitri         time:    3.457 s     3.477 s     5.461 s
ExpandNitish712      time:    3.574 s     3.800 s     6.789 s
ExpandAdamLiss       time:    3.673 s     5.680 s     6.969 s
ExpandAShelly        time:    3.524 s     4.295 s     5.867 s
ExpandAShellyMulOp   time:    3.527 s     4.295 s     5.852 s
ExpandSSE4           time:    3.428 s
ExpandSSE4Unroll     time:    3.333 s
ExpandSSE2           time:    3.392 s
ExpandSSE2Unroll     time:    3.318 s
ExpandAShellySSE4    time:    3.392 s
// Spread the four nibbles of c to one nibble per byte: the low nibble of each
// input byte goes through the first term, the high nibble through the second
// (c = 0x1234 gives 0x01020304).
b = (((c &
0x0F0F) * 0x0101) &
0x00F000F) +
(((c &
0xF0F0) * 0x1010) &
0xF000F00);
// Same as b *= 0x11: copies every nibble into the empty nibble above it.
b += b * 0x10;
printf("%x\n",b);
// For c = 0x1234 this prints 11223344 (%x emits no "0x" prefix)
// Step 1: OR in a copy of the value shifted 8 bits up.
constexpr unsigned int transform1(unsigned int x) {
    return ((x << 8) | x);
}

// Step 2: keep one nibble per byte (high nibbles move up by 4 bits).
constexpr unsigned int transform2(unsigned int x) {
    return (((x & 0x00f000f0) << 4) | (x & 0x000f000f));
}

// Step 3: duplicate every nibble into the empty nibble above it.
constexpr unsigned int transform3(unsigned int x) {
    return ((x << 4) | x);
}

// Full 0xARGB -> 0xAARRGGBB expansion, e.g. 0x1234 -> 0x11223344.
constexpr unsigned int transform(unsigned int x) {
    return transform3(transform2(transform1(x)));
}

// Dimitri version, using constexprs: aarrggbb_dimitri<argb>::value is the
// expansion of argb, computed at compile time.
template <unsigned int argb>
struct aarrggbb_dimitri {
    static const unsigned int value = transform(argb);
};
// Adam Liss version: each 4-bit field of argb is moved to its target byte and
// doubled by a single multiplication; the four products occupy disjoint bytes,
// so adding them assembles 0xAARRGGBB (0x1234 -> 0x11223344).
template <unsigned int argb>
struct aarrggbb_adamLiss {
    static const unsigned int value =
        (argb & 0xf000) * 0x11000 +
        (argb & 0x0f00) * 0x01100 +
        (argb & 0x00f0) * 0x00110 +
        (argb & 0x000f) * 0x00011;
};
/*
 * Deliberately abbreviated sketch of how a compile-time lookup table can be
 * generated: EXPAND16(x) lists consecutive aarrggbb<x + k>::value entries and
 * EXPAND chains EXPAND16 blocks. The elided parts are marked with comments and
 * must be written out before this compiles.
 * NOTE(review): `aarrggbb` presumably stands for one of the expansion
 * templates (e.g. aarrggbb_dimitri or aarrggbb_adamLiss) -- confirm.
 */
#define EXPAND16(x) aarrggbb<x + 0>::value, \
                    aarrggbb<x + 1>::value, \
                    aarrggbb<x + 2>::value, \
                    aarrggbb<x + 3>::value, \
                    aarrggbb<x + 4>::value, \
                    aarrggbb<x + 5>::value, \
                    aarrggbb<x + 6>::value, \
                    /* ... and so on */

#define EXPAND EXPAND16(0), \
               EXPAND16(0x10), \
               EXPAND16(0x20), \
               EXPAND16(0x30), \
               EXPAND16(0x40), \
               /* ... and so on */

/* ... and so on */
// 64-bit multiply-and-mask expansion of 0x1234 into 0x11223344.
uint64_t x = 0x1234;
// Replicate the 16-bit value into all four 16-bit lanes: 0x1234123412341234.
x *= 0x0001000100010001ull;
// Keep a different nibble in each lane: 0x1000020000300004.
x &= 0xF0000F0000F0000Full;
// Add copies shifted by 12, 24 and 36 bits so that the four nibbles line up,
// one per byte, in the upper 32 bits.
x *= 0x0000001001001001ull;
// Discard everything except those four nibbles: 0x1020304000000000.
x &= 0xF0F0F0F000000000ull;
// Shift down to 0x01020304, then double every nibble by multiplying with 0x11.
x = (x >>
36) * 0x11;
// Prints 11223344.
std::cout <<
std::hex <<
x <<
'\n';
#include <stdio.h>
#include <stdlib.h>
/*
 * Expands 0x1234 to 0x11223344 with four multiplications: each masked nibble
 * is moved to its target byte and doubled by one multiply, and the disjoint
 * products are added. Prints "1234 -> 11223344".
 */
int main(void) {
    unsigned int c = 0x1234, b;

    b = (c & 0xf000) * 0x11000 + (c & 0x0f00) * 0x01100 +
        (c & 0x00f0) * 0x00110 + (c & 0x000f) * 0x00011;
    printf("%x -> %x\n", c, b);
    return 0;
}
/* Shift/or expansion of 0x1234 into 0x11223344. */
unsigned int c = 0x1234;
/* Move each nibble of c into the low nibble of its own byte: 0x01020304. */
unsigned int b = (c &
0xf) | ((c &
0xf0) <<
4) |
((c &
0xf00) <<
8) | ((c &
0xf000) <<
12);
/* Copy every nibble into the empty nibble above it: 0x11223344. */
b |= (b <<
4);
// OrVal: advances the nibble mask `nible_pos` by one nibble (starting at the
// lowest nibble when it is still 0), extracts that nibble from `input_val` and
// ORs it into `temp_val` twice: shifted left by `shift` and by `shift + 4`.
// Returns the updated accumulator; `nible_pos` is updated through the reference.
DWORD OrVal(DWORD &nible_pos, DWORD input_val, DWORD temp_val, int shift) {
    if (nible_pos != 0)
        nible_pos <<= 4;
    else
        nible_pos = 0x0000000F;

    const DWORD selected = input_val & nible_pos;
    return temp_val | (selected << shift) | (selected << (shift + 4));
}

// Converter2: doubles each of the four low nibbles of `input_val`
// (0x1234 -> 0x11223344) by calling OrVal once per nibble, lowest first.
DWORD Converter2(DWORD input_val) {
    DWORD mask = 0x00000000;
    DWORD expanded = 0x00000000;

    for (int shift = 0; shift <= 12; shift += 4)
        expanded = OrVal(mask, input_val, expanded, shift);

    return expanded;
}

DWORD val2 = Converter2(0x1234);
 // Converter3: doubles each of the four low nibbles of `input_val`
 // (0x1234 -> 0x11223344). The nibble at bit offset `shift` is copied to
 // offsets 2 * shift and 2 * shift + 4 of the result.
 DWORD Converter3(DWORD input_val) {
     DWORD temp_val = 0;

     for (int shift = 0; shift < 16; shift += 4) {
         // Mask for the current nibble: 0xF, 0xF0, 0xF00, 0xF000.
         const DWORD nibble = input_val & ((DWORD)0xF << shift);
         temp_val |= (nibble << shift);
         temp_val |= (nibble << (shift + 4));
     }
     return temp_val;
 }
unsigned int g = 0x1234;
unsigned int ans = 0;
ans = ( ( g &
0xf000 ) <<
16) + ( (g &
0xf00 ) <<
12)
+ ( ( g&0xf0 ) <<
8) + ( ( g&0xf ) <<
4);
ans = ( ans | ans>>4 );
printf("%p ->
%p\n", g, ans);
/*
 * Expands the 16-bit value 0xARGB in the low bits of n to 0xAARRGGBB
 * (0x1234 -> 0x11223344). Bits of n above the low 16 are ignored.
 */
unsigned long transform(unsigned long n)
{
    const unsigned long upper = (n & 0xff00) << 8;   /* 00AR 0000 */
    const unsigned long lower = n & 0x00ff;          /* 0000 00GB */
    unsigned long spread = (upper | lower) << 4;     /* 0AR0 0GB0 */

    spread |= (spread & 0x0f000f00L) << 4;           /* AAR0 GGB0 */
    spread |= (spread & 0x00f000f0L) >> 4;           /* AARR GGBB */
    return spread;
}
// toAARRGGBB: string version of the expansion. Skips the first two characters
// of `argb` (the "0x" prefix of an input such as "0x####") and returns "0x"
// followed by every remaining character written twice, e.g.
// "0xACED" -> "0xAACCEEDD". Inputs shorter than three characters yield "0x".
std::string toAARRGGBB(const std::string &argb) {
    std::string ret("0x");
    const std::string::size_type start = 2;  // "0x####": the leading "0x" is skipped

    for (std::string::size_type i = start; i < argb.length(); ++i)
        ret.append(2, argb[i]);

    return ret;
}

int main() {
    std::string argb = toAARRGGBB("0xACED");  //!!! yields "0xAACCEEDD"
}