M4RI  1.0.1
xor.h
/**
 * \file xor.h
 * \brief Functions for adding vectors, i.e. XOR-ing rows of machine words.
 */

#ifndef M4RI_XOR_H
#define M4RI_XOR_H

/*******************************************************************
 *
 * M4RI: Linear Algebra over GF(2)
 *
 * Copyright (C) 2008-2010 Martin Albrecht <martinralbrecht@googlemail.com>
 *
 * Distributed under the terms of the GNU General Public License (GPL)
 * version 2 or higher.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * The full text of the GPL is available at:
 *
 * http://www.gnu.org/licenses/
 *
 ********************************************************************/

#include "m4ri_config.h"

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include "misc.h"

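/*
 * Note (added explanation): `word` is M4RI's 64-bit machine word and `wi_t`
 * its signed type for counting and indexing words; both come from misc.h.
 * Every function below computes c = c + t1 + ... + tk over GF(2), i.e. it
 * XORs k source rows into the destination row c, word by word.
 */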
/**
 * Compute c = c + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 over GF(2), i.e.
 * XOR eight rows of wide_in machine words each into the row c.
 */
static inline void _mzd_combine8(word *c, word const *t1, word const *t2, word const *t3, word const *t4,
                                 word const *t5, word const *t6, word const *t7, word const *t8, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t8 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    __m128i *__t5 = (__m128i*)t5;
    __m128i *__t6 = (__m128i*)t6;
    __m128i *__t7 = (__m128i*)t7;
    __m128i *__t8 = (__m128i*)t8;
    /* end pointer, rounded down to a 16-byte boundary */
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      xmm1 = _mm_xor_si128(xmm1, *__t5++);
      xmm1 = _mm_xor_si128(xmm1, *__t6++);
      xmm1 = _mm_xor_si128(xmm1, *__t7++);
      xmm1 = _mm_xor_si128(xmm1, *__t8++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    t5 = (word*)__t5;
    t6 = (word*)__t6;
    t7 = (word*)__t7;
    t8 = (word*)__t8;
    /* words left over after the 16-byte chunks */
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
#endif
  for(wi_t i = 0; i < wide; ++i) {
    c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
  }

  __M4RI_DD_RAWROW(c, wide_in);
}

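/*
 * Usage sketch (illustrative only, not part of the original header): XOR
 * eight source rows of four words each into a destination row.  The plain
 * stack arrays below are hypothetical; in M4RI proper the pointers would be
 * 16-byte-aligned rows of an mzd_t, as the SSE2 path above assumes.
 *
 *   word c[4] = {0, 0, 0, 0};
 *   word t[8][4];
 *   // ... fill t[0] .. t[7] ...
 *   _mzd_combine8(c, t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], 4);
 *   // afterwards c[i] == t[0][i] ^ t[1][i] ^ ... ^ t[7][i] for i = 0..3
 */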
/**
 * Compute c = c + t1 + t2 + t3 + t4 over GF(2), i.e. XOR four rows of
 * wide_in machine words each into the row c.
 */
static inline void _mzd_combine4(word *c, word const *t1, word const *t2, word const *t3, word const *t4, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t4 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  /* Duff's device: unrolled tail loop, see the note after this function. */
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

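/*
 * Note (added explanation): the switch/do-while interleaving above is
 * Duff's device.  It unrolls the scalar loop eight-fold and jumps into the
 * middle of the unrolled body to dispose of the wide % 8 leftover words,
 * so for wide > 0 it computes exactly
 *
 *   for (wi_t i = 0; i < wide; ++i)
 *     c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i];
 *
 * but with one loop-condition test per eight words.  The same pattern is
 * used by _mzd_combine3, _mzd_combine2 and _mzd_combine below.
 */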
/**
 * Compute c = c + t1 + t2 + t3 over GF(2), i.e. XOR three rows of wide_in
 * machine words each into the row c.
 */
static inline void _mzd_combine3(word *c, word const *t1, word const *t2, word const *t3, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t3 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c = c + t1 + t2 over GF(2), i.e. XOR two rows of wide_in machine
 * words each into the row c.
 */
static inline void _mzd_combine2(word *c, word const *t1, word const *t2, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 and t2 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++;
  case 7:      *c++ ^= *t1++ ^ *t2++;
  case 6:      *c++ ^= *t1++ ^ *t2++;
  case 5:      *c++ ^= *t1++ ^ *t2++;
  case 4:      *c++ ^= *t1++ ^ *t2++;
  case 3:      *c++ ^= *t1++ ^ *t2++;
  case 2:      *c++ ^= *t1++ ^ *t2++;
  case 1:      *c++ ^= *t1++ ^ *t2++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c = c + t1 over GF(2), i.e. XOR one row of wide_in machine words
 * into the row c.
 */
static inline void _mzd_combine(word *c, word const *t1, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming c and t1 are aligned the same way */

  if (__M4RI_ALIGNMENT(c,16)==8 && wide) {
    /* peel off one word so both pointers reach a 16-byte boundary */
    *c++ ^= *t1++;
    wide--;
  }

  __m128i *__c  = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  /* main loop processes two 128-bit words per iteration */
  while(__c < eof-1) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  if(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  c  = (word*)__c;
  t1 = (word*)__t1;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2

  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++;
  case 7:      *c++ ^= *t1++;
  case 6:      *c++ ^= *t1++;
  case 5:      *c++ ^= *t1++;
  case 4:      *c++ ^= *t1++;
  case 3:      *c++ ^= *t1++;
  case 2:      *c++ ^= *t1++;
  case 1:      *c++ ^= *t1++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

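/*
 * Note (added explanation): _mzd_combine requires that c and t1 have the
 * same address modulo 16.  If both start at offset 8, one scalar XOR brings
 * them onto a 16-byte boundary, after which the SSE2 loop runs two 128-bit
 * XORs per iteration.  For wide_in > 0 the net effect is simply
 *
 *   for (wi_t i = 0; i < wide_in; ++i)
 *     c[i] ^= t1[i];
 */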
#ifdef __M4RI_M4RM_GRAY8
#define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide)
#else // __M4RI_M4RM_GRAY8
#define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide)
#endif // __M4RI_M4RM_GRAY8
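
/*
 * Note (added explanation): _MZD_COMBINE is meant to be expanded where the
 * variables it names are already in scope, matching the Gray code table
 * width selected by __M4RI_M4RM_GRAY8.  A hypothetical expansion site:
 *
 *   word *c;                        // target row
 *   word const *t1, *t2, *t3, *t4;  // Gray code table rows
 *   // ... plus t5 .. t8 when __M4RI_M4RM_GRAY8 is defined ...
 *   wi_t wide;                      // row length in words
 *   _MZD_COMBINE;
 */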

#endif // M4RI_XOR_H