35 #include "m4ri_config.h"
42 #include <emmintrin.h>
56 #define __M4RI_SSE2_CUTOFF 10
65 #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)
75 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048)
162 #if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed.
204 static uint8_t
const mzd_flag_nonzero_offset = 0x1;
205 static uint8_t
const mzd_flag_nonzero_excess = 0x2;
206 static uint8_t
const mzd_flag_windowed_zerooffset = 0x4;
207 static uint8_t
const mzd_flag_windowed_zeroexcess = 0x8;
208 static uint8_t
const mzd_flag_windowed_ownsblocks = 0x10;
209 static uint8_t
const mzd_flag_multiple_blocks = 0x20;
219 return M->
flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
243 assert(M->
nrows == 0 || result == M->
rows[0]);
299 return n ? 0 : M->
nrows;
313 word* result = M->
blocks[0].begin + big_vector;
316 result = M->
blocks[n].begin + big_vector - n * (M->
blocks[0].size /
sizeof(
word));
318 assert(result == M->
rows[row]);
340 void mzd_free(
mzd_t *A);
374 return mzd_init_window((
mzd_t*)M, lowr, lowc, highr, highc);
383 #define mzd_free_window mzd_free
395 if ((rowa == rowb) || (startblock >= M->
width))
404 word *a = M->
rows[rowa] + startblock;
405 word *b = M->
rows[rowb] + startblock;
410 for(
wi_t i = 0; i < width; ++i) {
416 tmp = (a[width] ^ b[width]) & mask_end;
420 __M4RI_DD_ROW(M, rowa);
421 __M4RI_DD_ROW(M, rowb);
442 word tmp = (a[0] ^ b[0]) & mask_begin;
447 for(
wi_t i = 1; i < width; ++i) {
452 tmp = (a[width] ^ b[width]) & mask_end;
462 __M4RI_DD_ROW(M, rowa);
463 __M4RI_DD_ROW(M, rowb);
514 int max_bit =
MAX(a_bit, b_bit);
515 int count_remaining = stop_row - start_row;
516 int min_bit = a_bit + b_bit - max_bit;
518 int offset = max_bit - min_bit;
525 if (a_word == b_word) {
527 count_remaining -= count;
529 int fast_count = count / 4;
530 int rest_count = count - 4 * fast_count;
533 while (fast_count--) {
535 xor_v[1] = ptr[rowstride];
536 xor_v[2] = ptr[2 * rowstride];
537 xor_v[3] = ptr[3 * rowstride];
538 xor_v[0] ^= xor_v[0] >> offset;
539 xor_v[1] ^= xor_v[1] >> offset;
540 xor_v[2] ^= xor_v[2] >> offset;
541 xor_v[3] ^= xor_v[3] >> offset;
546 xor_v[0] |= xor_v[0] << offset;
547 xor_v[1] |= xor_v[1] << offset;
548 xor_v[2] |= xor_v[2] << offset;
549 xor_v[3] |= xor_v[3] << offset;
551 ptr[rowstride] ^= xor_v[1];
552 ptr[2 * rowstride] ^= xor_v[2];
553 ptr[3 * rowstride] ^= xor_v[3];
554 ptr += 4 * rowstride;
556 while (rest_count--) {
558 xor_v ^= xor_v >> offset;
560 *ptr ^= xor_v | (xor_v << offset);
570 if (min_bit == a_bit) {
571 min_ptr = ptr + a_word;
572 max_offset = b_word - a_word;
574 min_ptr = ptr + b_word;
575 max_offset = a_word - b_word;
578 count_remaining -= count;
581 word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
583 min_ptr[max_offset] ^= xor_v << offset;
584 min_ptr += rowstride;
589 if (min_bit == a_bit)
590 min_ptr = ptr + a_word;
592 min_ptr = ptr + b_word;
644 M->
rows[x][block] ^= values << spot;
647 M->
rows[x][block + 1] ^= values >> space;
666 M->
rows[x][block] &= values << spot;
669 M->
rows[x][block + 1] &= values >> space;
686 M->
rows[x][block] &= ~(values << spot);
689 M->
rows[x][block + 1] &= ~(values >> space);
705 assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
709 word *src = M->
rows[srcrow] + startblock;
710 word *dst = M->
rows[dstrow] + startblock;
714 *dst++ ^= *src++ & mask_begin;
719 if (wide > not_aligned + 1)
726 __m128i* __src = (__m128i*)src;
727 __m128i* __dst = (__m128i*)dst;
728 __m128i*
const eof = (__m128i*)((
unsigned long)(src + wide) & ~0xFUL);
731 __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
734 while(++__src < eof);
737 wide = ((
sizeof(
word)*wide)%16)/
sizeof(
word);
750 dst[i - 1] ^= src[i - 1] & ~mask_end;
752 __M4RI_DD_ROW(M, dstrow);
766 void mzd_row_add(
mzd_t *M,
rci_t const sourcerow,
rci_t const destrow);
850 void mzd_randomize(
mzd_t *M);
866 void mzd_set_ui(
mzd_t *M,
unsigned int const value);
883 rci_t mzd_gauss_delayed(
mzd_t *M,
rci_t const startcol,
int const full);
900 rci_t mzd_echelonize_naive(
mzd_t *M,
int const full);
911 int mzd_equal(
mzd_t const *A,
mzd_t const *B);
1044 #define mzd_sub mzd_add
1056 #define _mzd_sub _mzd_add
1073 word temp = (spill <= 0) ? M->
rows[x][block] << -spill : (M->
rows[x][block + 1] << (m4ri_radix - spill)) | (M->
rows[x][block] >> spill);
1074 return temp >> (m4ri_radix - n);
1097 void mzd_combine(
mzd_t *DST,
rci_t const row3,
wi_t const startblock3,
1161 wi_t wide = A->
width - a_startblock - 1;
1163 word *a = A->
rows[a_row] + a_startblock;
1164 word *b = B->
rows[b_row] + b_startblock;
1166 #if __M4RI_HAVE_SSE2
1175 __m128i *a128 = (__m128i*)a;
1176 __m128i *b128 = (__m128i*)b;
1177 const __m128i *eof = (__m128i*)((
unsigned long)(a + wide) & ~0xFUL);
1180 *a128 = _mm_xor_si128(*a128, *b128);
1183 }
while(a128 < eof);
1187 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
1190 #endif // __M4RI_HAVE_SSE2
1193 wi_t n = (wide + 7) / 8;
1195 case 0:
do { *(a++) ^= *(b++);
1196 case 7: *(a++) ^= *(b++);
1197 case 6: *(a++) ^= *(b++);
1198 case 5: *(a++) ^= *(b++);
1199 case 4: *(a++) ^= *(b++);
1200 case 3: *(a++) ^= *(b++);
1201 case 2: *(a++) ^= *(b++);
1202 case 1: *(a++) ^= *(b++);
1236 wi_t wide = A->
width - a_startblock - 1;
1237 word *a = A->
rows[a_row] + a_startblock;
1238 word *b = B->
rows[b_row] + b_startblock;
1239 word *c = C->
rows[c_row] + c_startblock;
1250 #if __M4RI_HAVE_SSE2
1259 __m128i *a128 = (__m128i*)a;
1260 __m128i *b128 = (__m128i*)b;
1261 __m128i *c128 = (__m128i*)c;
1262 const __m128i *eof = (__m128i*)((
unsigned long)(a + wide) & ~0xFUL);
1265 *c128 = _mm_xor_si128(*a128, *b128);
1269 }
while(a128 < eof);
1274 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
1277 #endif // __M4RI_HAVE_SSE2
1280 wi_t n = (wide + 7) / 8;
1282 case 0:
do { *(c++) = *(a++) ^ *(b++);
1283 case 7: *(c++) = *(a++) ^ *(b++);
1284 case 6: *(c++) = *(a++) ^ *(b++);
1285 case 5: *(c++) = *(a++) ^ *(b++);
1286 case 4: *(c++) = *(a++) ^ *(b++);
1287 case 3: *(c++) = *(a++) ^ *(b++);
1288 case 2: *(c++) = *(a++) ^ *(b++);
1289 case 1: *(c++) = *(a++) ^ *(b++);
1319 int mzd_is_zero(
mzd_t const *A);
1329 void mzd_row_clear_offset(
mzd_t *M,
rci_t const row,
rci_t const coloffset);
1362 double mzd_density(
mzd_t const *A,
wi_t res);
1398 unsigned long long hash = 0;