20 #ifndef UTIL_BLOCK_H__ 21 #define UTIL_BLOCK_H__ 27 #include <xmmintrin.h> 28 #include <emmintrin.h> 29 #include <smmintrin.h> 30 #include <wmmintrin.h> 38 const block * dest = nblocks+x;
44 const block * dest = nblocks+x;
50 const block * dest = nblocks+x;
57 const block * dest = nblocks+x;
59 __m128i vcmp = _mm_xor_si128(*(x++), *(y++));
60 if(!_mm_testz_si128(vcmp, vcmp))
66 #define zero_block() _mm_setzero_si128() 67 #define one_block() makeBlock(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL) 68 #define getLSB(x) (*((unsigned short *)&x)&1) 69 #define makeBlock(X,Y) _mm_set_epi64((__m64)(X), (__m64)(Y)) 72 return _mm_testz_si128(*b,*b);
76 __m128i neq = _mm_xor_si128(*b,
one_block());
77 return _mm_testz_si128(neq, neq);
85 sse_trans(uint8_t *out, uint8_t
const *inp,
int nrows,
int ncols)
87 # define INP(x,y) inp[(x)*ncols/8 + (y)/8] 88 # define OUT(x,y) out[(y)*nrows/8 + (x)/8] 90 union { __m128i x; uint8_t b[16]; } tmp;
92 assert(nrows % 8 == 0 && ncols % 8 == 0);
95 for (rr = 0; rr <= nrows - 16; rr += 16) {
96 for (cc = 0; cc < ncols; cc += 8) {
98 INP(rr+15,cc),
INP(rr+14,cc),
INP(rr+13,cc),
INP(rr+12,cc),
INP(rr+11,cc),
INP(rr+10,cc),
INP(rr+9,cc),
99 INP(rr+8,cc),
INP(rr+7,cc),
INP(rr+6,cc),
INP(rr+5,cc),
INP(rr+4,cc),
INP(rr+3,cc),
INP(rr+2,cc),
INP(rr+1,cc),
101 for (i = 8; --i >= 0; vec = _mm_slli_epi64(vec, 1))
102 *(uint16_t*)&
OUT(rr,cc+i)= _mm_movemask_epi8(vec);
105 if (rr == nrows)
return;
109 for (cc = 0; cc <= ncols - 16; cc += 16) {
111 *(uint16_t
const*)&
INP(rr + 7, cc), *(uint16_t
const*)&
INP(rr + 6, cc),
112 *(uint16_t
const*)&
INP(rr + 5, cc), *(uint16_t
const*)&
INP(rr + 4, cc),
113 *(uint16_t
const*)&
INP(rr + 3, cc), *(uint16_t
const*)&
INP(rr + 2, cc),
114 *(uint16_t
const*)&
INP(rr + 1, cc), *(uint16_t
const*)&
INP(rr + 0, cc));
115 for (i = 8; --i >= 0; vec = _mm_slli_epi64(vec, 1)) {
116 OUT(rr, cc + i) = h = _mm_movemask_epi8(vec);
117 OUT(rr, cc + i + 8) = h >> 8;
120 if (cc == ncols)
return;
123 for (i = 0; i < 8; ++i)
124 tmp.b[i] =
INP(rr + i, cc);
125 for (i = 8; --i >= 0; tmp.x = _mm_slli_epi64(tmp.x, 1))
126 OUT(rr, cc + i) = _mm_movemask_epi8(tmp.x);
130 const char fix_key[] =
"\x61\x7e\x8d\xa2\xa0\x51\x1e\x96" 131 "\x5e\x41\xc2\x9b\x15\x3f\xc7\x7a";
158 const __m128i mask = _mm_set_epi32(135,1,1,1);
159 __m128i tmp = _mm_srai_epi32(bl, 31);
160 tmp = _mm_and_si128(tmp, mask);
161 tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));
162 bl = _mm_slli_epi32(bl, 1);
163 return _mm_xor_si128(bl,tmp);
167 const __m128i mask = _mm_set_epi32(0,0, (1<<31),0);
168 __m128i tmp = _mm_and_si128(bl,mask);
169 bl = _mm_slli_epi64(bl, 1);
170 return _mm_xor_si128(bl,tmp);
173 const __m128i mask = _mm_set_epi32(0,1,0,0);
174 __m128i tmp = _mm_and_si128(bl,mask);
175 bl = _mm_slli_epi64(bl, 1);
176 return _mm_xor_si128(bl,tmp);
178 #endif//UTIL_BLOCK_H__ __m128i block
Definition: block.h:8
bool isZero(const block *b)
Definition: block.h:71
block xorBlocks(block x, block y)
Definition: block.h:35
__m128i block_tpl[2]
Definition: block.h:34
bool isOne(const block *b)
Definition: block.h:75
void sse_trans(uint8_t *out, uint8_t const *inp, int nrows, int ncols)
Definition: block.h:85
void xorBlocks_arr2(block *res, const block *x, const block *y, int nblocks)
Definition: block.h:49
const char fix_key[]
Definition: block.h:130
bool block_cmp(const block *x, const block *y, int nblocks)
Definition: block.h:56
#define one_block()
Definition: block.h:67
void xorBlocks_arr(block *res, const block *x, const block *y, int nblocks)
Definition: block.h:37
block andBlocks(block x, block y)
Definition: block.h:36