/**
 * \file mzd.h
 *
 * \brief Dense matrices over GF(2) (M4RI 1.0.1).
 */
#ifndef M4RI_MZD
#define M4RI_MZD

/*******************************************************************
*
* M4RI: Linear Algebra over GF(2)
*
* Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
* Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
* Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
*
* Distributed under the terms of the GNU General Public License (GPL)
* version 2 or higher.
*
* This code is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* The full text of the GPL is available at:
*
* http://www.gnu.org/licenses/
*
********************************************************************/

#include "m4ri_config.h"

#include <math.h>
#include <assert.h>
#include <stdio.h>
#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include "misc.h"
#include "debug_dump.h"

#if __M4RI_HAVE_SSE2
/* Minimum row length (in words) for which the SSE2 code paths are used. */
#define __M4RI_SSE2_CUTOFF 10
#endif

/* Maximum size of a single memory block used to store matrix data. */
#define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)

/* Blocking parameter for matrix multiplication, derived from the L2 cache size. */
#define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048)
/* A block of memory holding (part of) the entries of a matrix. */
typedef struct {
  size_t size;  /* Size of the block in bytes. */
  word* begin;  /* Pointer to the first word of the block. */
  word* end;    /* Pointer one past the last word of the block. */
} mzd_block_t;
/* A dense matrix over GF(2), its entries packed into words. */
typedef struct mzd_t {
  rci_t nrows;            /* Number of rows. */
  rci_t ncols;            /* Number of columns. */
  wi_t width;             /* Number of words with valid bits per row. */
  wi_t rowstride;         /* Offset in words between consecutive rows. */
  wi_t offset_vector;     /* Offset in words of the first row from blocks[0].begin. */
  wi_t row_offset;        /* Number of rows to the first row, counting from the start of the first block. */

  uint16_t offset;        /* Offset in bits of the first column within the first word of each row. */

  uint8_t flags;          /* Status flags; see the mzd_flag_* constants below. */

  uint8_t blockrows_log;  /* log2 of the number of rows stored in one block. */

#if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed.
  int blockrows_mask;     /* Number of rows per block minus one. */
#endif

  word high_bitmask;      /* Mask for valid bits in the word with the highest index (width - 1). */
  word low_bitmask;       /* Mask for valid bits in the word with the lowest index (0). */

  mzd_block_t *blocks;    /* Memory blocks holding the matrix entries. */
  word **rows;            /* rows[i] points to the first word of row i. */
} mzd_t;

/* Rows with fewer than mzd_paddingwidth words are not padded to an even word count. */
static wi_t const mzd_paddingwidth = 3;

/* Values for mzd_t.flags. */
static uint8_t const mzd_flag_nonzero_offset = 0x1;
static uint8_t const mzd_flag_nonzero_excess = 0x2;
static uint8_t const mzd_flag_windowed_zerooffset = 0x4;
static uint8_t const mzd_flag_windowed_zeroexcess = 0x8;
static uint8_t const mzd_flag_windowed_ownsblocks = 0x10;
static uint8_t const mzd_flag_multiple_blocks = 0x20;

/* Returns TRUE if M is a window into another matrix. */
static inline int mzd_is_windowed(mzd_t const *M) {
  return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
}

/* Returns TRUE if M owns the memory blocks it points to. */
static inline int mzd_owns_blocks(mzd_t const *M) {
  return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks)));
}

/* Returns a pointer to the first word of the first row. */
static inline word* mzd_first_row(mzd_t const *M) {
  word* result = M->blocks[0].begin + M->offset_vector;
  assert(M->nrows == 0 || result == M->rows[0]);
  return result;
}

/* Returns a pointer to the first word of the first row stored in block n (n > 0). */
static inline word* mzd_first_row_next_block(mzd_t const* M, int n) {
  assert(n > 0);
  return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride;
}

/* Returns the index of the block containing the given row. */
static inline int mzd_row_to_block(mzd_t const* M, rci_t row) {
  return (M->row_offset + row) >> M->blockrows_log;
}

/* Returns the number of rows of M stored in block n. */
static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) {
  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
    if (__M4RI_UNLIKELY(n == 0)) {
      return (1 << M->blockrows_log) - M->row_offset;
    } else {
      int const last_block = mzd_row_to_block(M, M->nrows - 1);
      if (n < last_block)
        return (1 << M->blockrows_log);
      return M->nrows + M->row_offset - (n << M->blockrows_log);
    }
  }
  return n ? 0 : M->nrows;
}

/* Returns a pointer to the first word of the given row. */
static inline word* mzd_row(mzd_t const* M, rci_t row) {
  wi_t big_vector = M->offset_vector + row * M->rowstride;
  word* result = M->blocks[0].begin + big_vector;
  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
    int const n = (M->row_offset + row) >> M->blockrows_log;
    result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word));
  }
  assert(result == M->rows[row]);
  return result;
}

/* Creates a new r x c matrix, initialized to zero. */
mzd_t *mzd_init(rci_t const r, rci_t const c);

/* Frees a matrix created with mzd_init(). */
void mzd_free(mzd_t *A);

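/*
 * Example (an illustrative sketch, not from the original documentation):
 * allocate a matrix, set and read a single entry, and release it again.
 *
 *   mzd_t *A = mzd_init(100, 100);   // 100 x 100 zero matrix
 *   mzd_write_bit(A, 3, 7, 1);       // A[3,7] = 1
 *   BIT b = mzd_read_bit(A, 3, 7);   // b == 1
 *   mzd_free(A);
 */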

/* Creates a window (a view sharing memory with M) with upper-left corner
 * (lowr, lowc) and lower-right corner (highr, highc), the latter exclusive. */
mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);

/* Const variant of mzd_init_window(). */
static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc)
{
  return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc);
}

/* Frees a matrix window created with mzd_init_window(). */
#define mzd_free_window mzd_free

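/*
 * Example (an illustrative sketch): create a 64 x 64 window into the top-left
 * corner of a larger matrix. The window shares memory with A, so writes to W
 * are visible in A; free the window before freeing A.
 *
 *   mzd_t *A = mzd_init(256, 256);
 *   mzd_t *W = mzd_init_window(A, 0, 0, 64, 64);
 *   mzd_randomize(W);                // fills the top-left 64 x 64 block of A
 *   mzd_free_window(W);
 *   mzd_free(A);
 */
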
/* Swaps rows rowa and rowb of M, touching only words from startblock onwards. */
static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) {
  if ((rowa == rowb) || (startblock >= M->width))
    return;

  /* This is the case since we're only called from _mzd_ple_mmpf,
   * which makes the same assumption. Therefore we don't need
   * to take a mask_begin into account. */
  assert(M->offset == 0);

  wi_t width = M->width - startblock - 1;
  word *a = M->rows[rowa] + startblock;
  word *b = M->rows[rowb] + startblock;
  word tmp;
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  if (width != 0) {
    for(wi_t i = 0; i < width; ++i) {
      tmp = a[i];
      a[i] = b[i];
      b[i] = tmp;
    }
  }
  tmp = (a[width] ^ b[width]) & mask_end;
  a[width] ^= tmp;
  b[width] ^= tmp;

  __M4RI_DD_ROW(M, rowa);
  __M4RI_DD_ROW(M, rowb);
}

/* Swaps rows rowa and rowb of M. */
static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) {
  if(rowa == rowb)
    return;

  wi_t width = M->width - 1;
  word *a = M->rows[rowa];
  word *b = M->rows[rowb];
  word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset);
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  word tmp = (a[0] ^ b[0]) & mask_begin;
  if (width != 0) {
    a[0] ^= tmp;
    b[0] ^= tmp;

    for(wi_t i = 1; i < width; ++i) {
      tmp = a[i];
      a[i] = b[i];
      b[i] = tmp;
    }
    tmp = (a[width] ^ b[width]) & mask_end;
    a[width] ^= tmp;
    b[width] ^= tmp;

  } else {
    tmp &= mask_end;
    a[0] ^= tmp;
    b[0] ^= tmp;
  }

  __M4RI_DD_ROW(M, rowa);
  __M4RI_DD_ROW(M, rowb);
}

/* Copies row j of A into row i of B. */
void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j);

/* Swaps columns cola and colb of M. */
void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb);

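/*
 * Example (an illustrative sketch; A and B are assumed to be existing,
 * compatible mzd_t matrices):
 *
 *   mzd_row_swap(A, 0, 5);     // exchange rows 0 and 5 of A
 *   mzd_col_swap(A, 2, 3);     // exchange columns 2 and 3 of A
 *   mzd_copy_row(B, 0, A, 5);  // copy row 5 of A into row 0 of B
 */
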
/* Swaps columns cola and colb, but only in rows start_row up to (and excluding) stop_row. */
static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) {
  if (cola == colb)
    return;

  rci_t const _cola = cola + M->offset;
  rci_t const _colb = colb + M->offset;

  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_row(M, start_row);
  int max_bit = MAX(a_bit, b_bit);
  int count_remaining = stop_row - start_row;
  int min_bit = a_bit + b_bit - max_bit;
  int block = mzd_row_to_block(M, start_row);
  int offset = max_bit - min_bit;
  word mask = m4ri_one << min_bit;
  int count = MIN(mzd_rows_in_block(M, 0), count_remaining);
  // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me).
  if (count <= 0)
    return;

  if (a_word == b_word) {
    while(1) {
      count_remaining -= count;
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor_v[4];
      wi_t const rowstride = M->rowstride;
      while (fast_count--) {
        xor_v[0] = ptr[0];
        xor_v[1] = ptr[rowstride];
        xor_v[2] = ptr[2 * rowstride];
        xor_v[3] = ptr[3 * rowstride];
        xor_v[0] ^= xor_v[0] >> offset;
        xor_v[1] ^= xor_v[1] >> offset;
        xor_v[2] ^= xor_v[2] >> offset;
        xor_v[3] ^= xor_v[3] >> offset;
        xor_v[0] &= mask;
        xor_v[1] &= mask;
        xor_v[2] &= mask;
        xor_v[3] &= mask;
        xor_v[0] |= xor_v[0] << offset;
        xor_v[1] |= xor_v[1] << offset;
        xor_v[2] |= xor_v[2] << offset;
        xor_v[3] |= xor_v[3] << offset;
        ptr[0] ^= xor_v[0];
        ptr[rowstride] ^= xor_v[1];
        ptr[2 * rowstride] ^= xor_v[2];
        ptr[3 * rowstride] ^= xor_v[3];
        ptr += 4 * rowstride;
      }
      while (rest_count--) {
        word xor_v = *ptr;
        xor_v ^= xor_v >> offset;
        xor_v &= mask;
        *ptr ^= xor_v | (xor_v << offset);
        ptr += rowstride;
      }
      if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      count_remaining -= count;
      wi_t const rowstride = M->rowstride;
      while(count--) {
        word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor_v;
        min_ptr[max_offset] ^= xor_v << offset;
        min_ptr += rowstride;
      }
      if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}

/* Returns the entry of M at row 'row', column 'col'. */
static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col) {
  return __M4RI_GET_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix);
}

/* Writes 'value' to the entry of M at row 'row', column 'col'. */
static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) {
  __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value);
}


/* XORs the n bits of 'values' (stored in its least significant bits) into row x, starting at column y. */
static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] ^= values << spot;
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] ^= values >> space;
}

/* ANDs n bits from 'values' into row x, starting at column y. */
static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
  /* This is the best way, since this shift will drop out once we invert the bits in values: */
  values >>= (m4ri_radix - n); /* Move the bits to the lowest columns */

  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] &= values << spot;
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] &= values >> space;
}

/* Clears (sets to zero) n bits of row x, starting at column y. */
static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
  assert(n > 0 && n <= m4ri_radix);
  word values = m4ri_ffff >> (m4ri_radix - n);
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] &= ~(values << spot);
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] &= ~(values >> space);
}

/* Adds (XORs) row srcrow to row dstrow of M, starting at column coloffset. */
static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) {
  assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
  coloffset += M->offset;
  wi_t const startblock = coloffset / m4ri_radix;
  wi_t wide = M->width - startblock;
  word *src = M->rows[srcrow] + startblock;
  word *dst = M->rows[dstrow] + startblock;
  word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix);
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  *dst++ ^= *src++ & mask_begin;
  --wide;

#if __M4RI_HAVE_SSE2
  int not_aligned = __M4RI_ALIGNMENT(src, 16) != 0; /* 0: Aligned, 1: Not aligned */
  if (wide > not_aligned + 1) /* Speed up for small matrices */
  {
    if (not_aligned) {
      *dst++ ^= *src++;
      --wide;
    }
    /* Now wide > 1 */
    __m128i* __src = (__m128i*)src;
    __m128i* __dst = (__m128i*)dst;
    __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL);
    do
    {
      __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
      *__dst++ = xmm1;
    }
    while(++__src < eof);
    src = (word*)__src;
    dst = (word*)__dst;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
#endif
  wi_t i = -1;
  while(++i < wide)
    dst[i] ^= src[i];
  /*
   * Revert possibly non-zero excess bits.
   * Note that i == wide here, and wide can be 0.
   * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;)
   * We use i - 1 here to let the compiler know these are the same addresses
   * that we last accessed, in the previous loop.
   */
  dst[i - 1] ^= src[i - 1] & ~mask_end;

  __M4RI_DD_ROW(M, dstrow);
}

/* Adds (XORs) row sourcerow to row destrow of M. */
void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow);

/* Transposes A into DST: DST = A^T. */
mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A);

/* Naive cubic matrix multiplication: C = A * B. */
mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);

/* Naive cubic matrix multiply-and-accumulate: C = C + A * B. */
mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);

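/*
 * Example (an illustrative sketch): multiply two random matrices with the
 * school-book routines declared above. Over GF(2), adding A*B to C = A*B
 * cancels, so C ends up zero again.
 *
 *   mzd_t *A = mzd_init(64, 128);
 *   mzd_t *B = mzd_init(128, 64);
 *   mzd_t *C = mzd_init(64, 64);
 *   mzd_randomize(A);
 *   mzd_randomize(B);
 *   mzd_mul_naive(C, A, B);      // C  = A * B
 *   mzd_addmul_naive(C, A, B);   // C += A * B, i.e. C is now zero
 *   mzd_free(A); mzd_free(B); mzd_free(C);
 */
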
/* Internal multiplication routine; 'clear' indicates whether C should be cleared first. */
mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear);

/* Internal vector-matrix product C = v * A; 'clear' indicates whether C should be cleared first. */
mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear);

/* Fills M with uniformly random entries. */
void mzd_randomize(mzd_t *M);

/* Sets M to a diagonal matrix with 'value' (mod 2) on the main diagonal. */
void mzd_set_ui(mzd_t *M, unsigned int const value);

/* Gaussian elimination on M starting at column startcol; returns the rank.
 * If 'full' is nonzero, the reduced row echelon form is computed. */
rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full);

/* Transforms M into (reduced) row echelon form by naive Gaussian elimination; returns the rank. */
rci_t mzd_echelonize_naive(mzd_t *M, int const full);

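/*
 * Example (an illustrative sketch): the return value of mzd_echelonize_naive()
 * is the rank of the matrix; passing full = 1 asks for the reduced form.
 *
 *   mzd_t *A = mzd_init(100, 100);
 *   mzd_randomize(A);
 *   rci_t r = mzd_echelonize_naive(A, 1);   // A is now in reduced row echelon form
 *   mzd_free(A);
 */
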
/* Returns TRUE if A == B. */
int mzd_equal(mzd_t const *A, mzd_t const *B);

/* Three-way comparison of A and B; returns 0 if they are equal. */
int mzd_cmp(mzd_t const *A, mzd_t const *B);

/* Copies A into DST. */
mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);

/* Concatenates A and B side by side: C = [A | B]. */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);

/* Stacks A on top of B: C = [A ; B]. */
mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);

/* Copies the submatrix of M with upper-left corner (lowr, lowc) and lower-right
 * corner (highr, highc), the latter exclusive, into S. */
mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);

/* Inverts the matrix A by Gaussian elimination, where I is the identity matrix. */
mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I);

/* C = A + B (entrywise XOR). */
mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);

/* Internal version of mzd_add() without sanity checks. */
mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);

/* Over GF(2), subtraction equals addition. */
#define mzd_sub mzd_add

#define _mzd_sub _mzd_add

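/*
 * Example (an illustrative sketch): since addition and subtraction coincide
 * over GF(2), mzd_add and mzd_sub produce the same result.
 *
 *   mzd_t *C = mzd_init(A->nrows, A->ncols);
 *   mzd_add(C, A, B);   // C = A + B
 *   mzd_sub(C, A, B);   // identical: C = A - B = A + B
 *   mzd_free(C);
 */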

/* Returns n bits of row x starting at column y, packed into the lowest n bits of
 * the returned word (the bit at column y ends up in the least significant bit). */
static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  int const spill = spot + n - m4ri_radix;
  word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill);
  return temp >> (m4ri_radix - n);
}

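/*
 * Example (an illustrative sketch): read a nibble of row 0 starting at column 8.
 *
 *   word w = mzd_read_bits(A, 0, 8, 4);      // bits A[0,8..11], A[0,8] in bit 0
 *   int  v = mzd_read_bits_int(A, 0, 8, 4);  // the same value as an int
 */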

/* Computes DST[row3] = SC1[row1] + SC2[row2], starting at the given word offsets. */
void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3,
                 mzd_t const *SC1, rci_t const row1, wi_t const startblock1,
                 mzd_t const *SC2, rci_t const row2, wi_t const startblock2);


/* As mzd_combine(), but implemented with bit-level reads and writes; used for
 * the misaligned cases (non-zero offsets). */
static inline void mzd_combine_weird(mzd_t *C, rci_t const c_row, wi_t const c_startblock,
                                     mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
                                     mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
  word tmp;
  rci_t i = 0;

  for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) {
    tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix);
    mzd_clear_bits(C, c_row, i, m4ri_radix);
    mzd_xor_bits(C, c_row, i, m4ri_radix, tmp);
  }
  if(A->ncols - i) {
    tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i));
    mzd_clear_bits(C, c_row, i, (C->ncols - i));
    mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp);
  }

  __M4RI_DD_MZD(C);
}

/* Adds row b_row of B into row a_row of A in place (A[a_row] ^= B[b_row]),
 * starting at the given word offsets; both rows are assumed to be word-aligned. */
static inline void mzd_combine_even_in_place(mzd_t *A, rci_t const a_row, wi_t const a_startblock,
                                             mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {

  wi_t wide = A->width - a_startblock - 1;

  word *a = A->rows[a_row] + a_startblock;
  word *b = B->rows[b_row] + b_startblock;

#if __M4RI_HAVE_SSE2
  if(wide > __M4RI_SSE2_CUTOFF) {
    /* Align a to a 16-byte boundary. */
    if (__M4RI_ALIGNMENT(a, 16)) {
      *a++ ^= *b++;
      wide--;
    }

    if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) {
      __m128i *a128 = (__m128i*)a;
      __m128i *b128 = (__m128i*)b;
      const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);

      do {
        *a128 = _mm_xor_si128(*a128, *b128);
        ++b128;
        ++a128;
      } while(a128 < eof);

      a = (word*)a128;
      b = (word*)b128;
      wide = ((sizeof(word) * wide) % 16) / sizeof(word);
    }
  }
#endif // __M4RI_HAVE_SSE2

  if (wide > 0) {
    /* Process the remaining words, eight per iteration (Duff's device). */
    wi_t n = (wide + 7) / 8;
    switch (wide % 8) {
    case 0: do { *(a++) ^= *(b++);
    case 7:      *(a++) ^= *(b++);
    case 6:      *(a++) ^= *(b++);
    case 5:      *(a++) ^= *(b++);
    case 4:      *(a++) ^= *(b++);
    case 3:      *(a++) ^= *(b++);
    case 2:      *(a++) ^= *(b++);
    case 1:      *(a++) ^= *(b++);
            } while (--n > 0);
    }
  }

  *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix);

  __M4RI_DD_MZD(A);
}


/* Computes C[c_row] = A[a_row] + B[b_row], starting at the given word offsets;
 * the rows are assumed to be word-aligned. */
static inline void mzd_combine_even(mzd_t *C, rci_t const c_row, wi_t const c_startblock,
                                    mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
                                    mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {

  wi_t wide = A->width - a_startblock - 1;
  word *a = A->rows[a_row] + a_startblock;
  word *b = B->rows[b_row] + b_startblock;
  word *c = C->rows[c_row] + c_startblock;

  /* /\* this is a corner case triggered by Strassen multiplication */
  /*  * which assumes certain (virtual) matrix sizes */
  /*  * 2011/03/07: I don't think this was ever correct *\/ */
  /* if (a_row >= A->nrows) { */
  /*   assert(a_row < A->nrows); */
  /*   for(wi_t i = 0; i < wide; ++i) { */
  /*     c[i] = b[i]; */
  /*   } */
  /* } else { */
#if __M4RI_HAVE_SSE2
  if(wide > __M4RI_SSE2_CUTOFF) {
    /* Align a to a 16-byte boundary. */
    if (__M4RI_ALIGNMENT(a,16)) {
      *c++ = *b++ ^ *a++;
      wide--;
    }

    if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) {
      __m128i *a128 = (__m128i*)a;
      __m128i *b128 = (__m128i*)b;
      __m128i *c128 = (__m128i*)c;
      const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);

      do {
        *c128 = _mm_xor_si128(*a128, *b128);
        ++c128;
        ++b128;
        ++a128;
      } while(a128 < eof);

      a = (word*)a128;
      b = (word*)b128;
      c = (word*)c128;
      wide = ((sizeof(word) * wide) % 16) / sizeof(word);
    }
  }
#endif // __M4RI_HAVE_SSE2

  if (wide > 0) {
    /* Process the remaining words, eight per iteration (Duff's device). */
    wi_t n = (wide + 7) / 8;
    switch (wide % 8) {
    case 0: do { *(c++) = *(a++) ^ *(b++);
    case 7:      *(c++) = *(a++) ^ *(b++);
    case 6:      *(c++) = *(a++) ^ *(b++);
    case 5:      *(c++) = *(a++) ^ *(b++);
    case 4:      *(c++) = *(a++) ^ *(b++);
    case 3:      *(c++) = *(a++) ^ *(b++);
    case 2:      *(c++) = *(a++) ^ *(b++);
    case 1:      *(c++) = *(a++) ^ *(b++);
            } while (--n > 0);
    }
  }
  *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols % m4ri_radix));

  __M4RI_DD_MZD(C);
}

/* As mzd_read_bits(), but returns the result as an int. */
static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n)
{
  return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n));
}

/* Returns TRUE if A is the zero matrix. */
int mzd_is_zero(mzd_t const *A);

/* Clears row 'row' of M from column 'coloffset' onwards. */
void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset);

/* Searches for a nonzero entry in M at or after (start_row, start_col);
 * on success, stores its position in *r and *c and returns a nonzero value. */
int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c);

/* Returns the density (proportion of nonzero entries) of A; res controls the sampling resolution. */
double mzd_density(mzd_t const *A, wi_t res);

/* As mzd_density(), but considering only the submatrix starting at row r and column c. */
double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c);

/* Returns the index of the first row of A which is zero. */
rci_t mzd_first_zero_row(mzd_t const *A);

/* Returns a hash value for the matrix A. */
static inline unsigned long long mzd_hash(mzd_t const *A) {
  unsigned long long hash = 0;
  for (rci_t r = 0; r < A->nrows; ++r)
    hash ^= rotate_word(calculate_hash(A->rows[r], A->width), r % m4ri_radix);
  return hash;
}

/* Extracts the upper triangular part of A into U. */
mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A);

/* Extracts the lower triangular part of A into L. */
mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A);

#endif // M4RI_MZD