Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8:   #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
 14:   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
 15:   const PetscInt *idx;
 16:   PetscInt        start, end, *ai, *aj, bs, *nidx2;
 17:   PetscBT         table;

 19:   m  = a->mbs;
 20:   ai = a->i;
 21:   aj = a->j;
 22:   bs = A->rmap->bs;


 26:   PetscBTCreate(m, &table);
 27:   PetscMalloc1(m + 1, &nidx);
 28:   PetscMalloc1(A->rmap->N + 1, &nidx2);

 30:   for (i = 0; i < is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz = 0;
 33:     PetscBTMemzero(m, table);

 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i], &idx);
 37:     ISGetLocalSize(is[i], &n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j = 0; j < n; ++j) {
 41:       ival = idx[j] / bs; /* convert the indices into block indices */
 43:       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
 44:     }
 45:     ISRestoreIndices(is[i], &idx);
 46:     ISDestroy(&is[i]);

 48:     k = 0;
 49:     for (j = 0; j < ov; j++) { /* for each overlap*/
 50:       n = isz;
 51:       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row + 1];
 55:         for (l = start; l < end; l++) {
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j = 0; j < isz; j++) {
 63:       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
 64:     }
 65:     ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i);
 66:   }
 67:   PetscBTDestroy(&table);
 68:   PetscFree(nidx);
 69:   PetscFree(nidx2);
 70:   return 0;
 71: }

 73: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
 74: {
 75:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
 76:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
 77:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
 78:   const PetscInt *irow, *icol;
 79:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
 80:   PetscInt       *aj = a->j, *ai = a->i;
 81:   MatScalar      *mat_a;
 82:   Mat             C;
 83:   PetscBool       flag;

 85:   ISGetIndices(isrow, &irow);
 86:   ISGetIndices(iscol, &icol);
 87:   ISGetLocalSize(isrow, &nrows);
 88:   ISGetLocalSize(iscol, &ncols);

 90:   PetscCalloc1(1 + oldcols, &smap);
 91:   ssmap = smap;
 92:   PetscMalloc1(1 + nrows, &lens);
 93:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
 94:   /* determine lens of each row */
 95:   for (i = 0; i < nrows; i++) {
 96:     kstart  = ai[irow[i]];
 97:     kend    = kstart + a->ilen[irow[i]];
 98:     lens[i] = 0;
 99:     for (k = kstart; k < kend; k++) {
100:       if (ssmap[aj[k]]) lens[i]++;
101:     }
102:   }
103:   /* Create and fill new matrix */
104:   if (scall == MAT_REUSE_MATRIX) {
105:     c = (Mat_SeqBAIJ *)((*B)->data);

108:     PetscArraycmp(c->ilen, lens, c->mbs, &flag);
110:     PetscArrayzero(c->ilen, c->mbs);
111:     C = *B;
112:   } else {
113:     MatCreate(PetscObjectComm((PetscObject)A), &C);
114:     MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
115:     MatSetType(C, ((PetscObject)A)->type_name);
116:     MatSeqBAIJSetPreallocation(C, bs, 0, lens);
117:   }
118:   c = (Mat_SeqBAIJ *)(C->data);
119:   for (i = 0; i < nrows; i++) {
120:     row      = irow[i];
121:     kstart   = ai[row];
122:     kend     = kstart + a->ilen[row];
123:     mat_i    = c->i[i];
124:     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
125:     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
126:     mat_ilen = c->ilen + i;
127:     for (k = kstart; k < kend; k++) {
128:       if ((tcol = ssmap[a->j[k]])) {
129:         *mat_j++ = tcol - 1;
130:         PetscArraycpy(mat_a, a->a + k * bs2, bs2);
131:         mat_a += bs2;
132:         (*mat_ilen)++;
133:       }
134:     }
135:   }
136:   /* sort */
137:   if (c->j && c->a) {
138:     MatScalar *work;
139:     PetscMalloc1(bs2, &work);
140:     for (i = 0; i < nrows; i++) {
141:       PetscInt ilen;
142:       mat_i = c->i[i];
143:       mat_j = c->j + mat_i;
144:       mat_a = c->a + mat_i * bs2;
145:       ilen  = c->ilen[i];
146:       PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
147:     }
148:     PetscFree(work);
149:   }

151:   /* Free work space */
152:   ISRestoreIndices(iscol, &icol);
153:   PetscFree(smap);
154:   PetscFree(lens);
155:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
156:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

158:   ISRestoreIndices(isrow, &irow);
159:   *B = C;
160:   return 0;
161: }

163: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
166:   IS              is1, is2;
167:   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
168:   const PetscInt *irow, *icol;

170:   ISGetIndices(isrow, &irow);
171:   ISGetIndices(iscol, &icol);
172:   ISGetLocalSize(isrow, &nrows);
173:   ISGetLocalSize(iscol, &ncols);

175:   /* Verify if the indices corespond to each element in a block
176:    and form the IS with compressed IS */
177:   maxmnbs = PetscMax(a->mbs, a->nbs);
178:   PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
179:   PetscArrayzero(vary, a->mbs);
180:   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
182:   count = 0;
183:   for (i = 0; i < nrows; i++) {
184:     j = irow[i] / bs;
185:     if ((vary[j]--) == bs) iary[count++] = j;
186:   }
187:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);

189:   PetscArrayzero(vary, a->nbs);
190:   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
192:   count = 0;
193:   for (i = 0; i < ncols; i++) {
194:     j = icol[i] / bs;
195:     if ((vary[j]--) == bs) iary[count++] = j;
196:   }
197:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
198:   ISRestoreIndices(isrow, &irow);
199:   ISRestoreIndices(iscol, &icol);
200:   PetscFree2(vary, iary);

202:   MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B);
203:   ISDestroy(&is1);
204:   ISDestroy(&is2);
205:   return 0;
206: }

208: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
209: {
210:   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
211:   Mat_SubSppt *submatj = c->submatis1;

213:   (*submatj->destroy)(C);
214:   MatDestroySubMatrix_Private(submatj);
215:   return 0;
216: }

218: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
219: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
220: {
221:   PetscInt     i;
222:   Mat          C;
223:   Mat_SeqBAIJ *c;
224:   Mat_SubSppt *submatj;

226:   for (i = 0; i < n; i++) {
227:     C       = (*mat)[i];
228:     c       = (Mat_SeqBAIJ *)C->data;
229:     submatj = c->submatis1;
230:     if (submatj) {
231:       if (--((PetscObject)C)->refct <= 0) {
232:         PetscFree(C->factorprefix);
233:         (*submatj->destroy)(C);
234:         MatDestroySubMatrix_Private(submatj);
235:         PetscFree(C->defaultvectype);
236:         PetscFree(C->defaultrandtype);
237:         PetscLayoutDestroy(&C->rmap);
238:         PetscLayoutDestroy(&C->cmap);
239:         PetscHeaderDestroy(&C);
240:       }
241:     } else {
242:       MatDestroy(&C);
243:     }
244:   }

246:   /* Destroy Dummy submatrices created for reuse */
247:   MatDestroySubMatrices_Dummy(n, mat);

249:   PetscFree(*mat);
250:   return 0;
251: }

253: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
254: {
255:   PetscInt i;

257:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);

259:   for (i = 0; i < n; i++) MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
260:   return 0;
261: }

263: /* -------------------------------------------------------*/
264: /* Should check that shapes of vectors and matrices match */
265: /* -------------------------------------------------------*/

267: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
268: {
269:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
270:   PetscScalar       *z, sum;
271:   const PetscScalar *x;
272:   const MatScalar   *v;
273:   PetscInt           mbs, i, n;
274:   const PetscInt    *idx, *ii, *ridx = NULL;
275:   PetscBool          usecprow = a->compressedrow.use;

277:   VecGetArrayRead(xx, &x);
278:   VecGetArrayWrite(zz, &z);

280:   if (usecprow) {
281:     mbs  = a->compressedrow.nrows;
282:     ii   = a->compressedrow.i;
283:     ridx = a->compressedrow.rindex;
284:     PetscArrayzero(z, a->mbs);
285:   } else {
286:     mbs = a->mbs;
287:     ii  = a->i;
288:   }

290:   for (i = 0; i < mbs; i++) {
291:     n   = ii[1] - ii[0];
292:     v   = a->a + ii[0];
293:     idx = a->j + ii[0];
294:     ii++;
295:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
296:     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
297:     sum = 0.0;
298:     PetscSparseDensePlusDot(sum, x, v, idx, n);
299:     if (usecprow) {
300:       z[ridx[i]] = sum;
301:     } else {
302:       z[i] = sum;
303:     }
304:   }
305:   VecRestoreArrayRead(xx, &x);
306:   VecRestoreArrayWrite(zz, &z);
307:   PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt);
308:   return 0;
309: }

311: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
312: {
313:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
314:   PetscScalar       *z = NULL, sum1, sum2, *zarray;
315:   const PetscScalar *x, *xb;
316:   PetscScalar        x1, x2;
317:   const MatScalar   *v;
318:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
319:   PetscBool          usecprow = a->compressedrow.use;

321:   VecGetArrayRead(xx, &x);
322:   VecGetArrayWrite(zz, &zarray);

324:   idx = a->j;
325:   v   = a->a;
326:   if (usecprow) {
327:     mbs  = a->compressedrow.nrows;
328:     ii   = a->compressedrow.i;
329:     ridx = a->compressedrow.rindex;
330:     PetscArrayzero(zarray, 2 * a->mbs);
331:   } else {
332:     mbs = a->mbs;
333:     ii  = a->i;
334:     z   = zarray;
335:   }

337:   for (i = 0; i < mbs; i++) {
338:     n = ii[1] - ii[0];
339:     ii++;
340:     sum1 = 0.0;
341:     sum2 = 0.0;
342:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
343:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
344:     for (j = 0; j < n; j++) {
345:       xb = x + 2 * (*idx++);
346:       x1 = xb[0];
347:       x2 = xb[1];
348:       sum1 += v[0] * x1 + v[2] * x2;
349:       sum2 += v[1] * x1 + v[3] * x2;
350:       v += 4;
351:     }
352:     if (usecprow) z = zarray + 2 * ridx[i];
353:     z[0] = sum1;
354:     z[1] = sum2;
355:     if (!usecprow) z += 2;
356:   }
357:   VecRestoreArrayRead(xx, &x);
358:   VecRestoreArrayWrite(zz, &zarray);
359:   PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt);
360:   return 0;
361: }

363: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
364: {
365:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
366:   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
367:   const PetscScalar *x, *xb;
368:   const MatScalar   *v;
369:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
370:   PetscBool          usecprow = a->compressedrow.use;

372: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
373:   #pragma disjoint(*v, *z, *xb)
374: #endif

376:   VecGetArrayRead(xx, &x);
377:   VecGetArrayWrite(zz, &zarray);

379:   idx = a->j;
380:   v   = a->a;
381:   if (usecprow) {
382:     mbs  = a->compressedrow.nrows;
383:     ii   = a->compressedrow.i;
384:     ridx = a->compressedrow.rindex;
385:     PetscArrayzero(zarray, 3 * a->mbs);
386:   } else {
387:     mbs = a->mbs;
388:     ii  = a->i;
389:     z   = zarray;
390:   }

392:   for (i = 0; i < mbs; i++) {
393:     n = ii[1] - ii[0];
394:     ii++;
395:     sum1 = 0.0;
396:     sum2 = 0.0;
397:     sum3 = 0.0;
398:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
399:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
400:     for (j = 0; j < n; j++) {
401:       xb = x + 3 * (*idx++);
402:       x1 = xb[0];
403:       x2 = xb[1];
404:       x3 = xb[2];

406:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
407:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
408:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
409:       v += 9;
410:     }
411:     if (usecprow) z = zarray + 3 * ridx[i];
412:     z[0] = sum1;
413:     z[1] = sum2;
414:     z[2] = sum3;
415:     if (!usecprow) z += 3;
416:   }
417:   VecRestoreArrayRead(xx, &x);
418:   VecRestoreArrayWrite(zz, &zarray);
419:   PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt);
420:   return 0;
421: }

423: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
424: {
425:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
426:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
427:   const PetscScalar *x, *xb;
428:   const MatScalar   *v;
429:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
430:   PetscBool          usecprow = a->compressedrow.use;

432:   VecGetArrayRead(xx, &x);
433:   VecGetArrayWrite(zz, &zarray);

435:   idx = a->j;
436:   v   = a->a;
437:   if (usecprow) {
438:     mbs  = a->compressedrow.nrows;
439:     ii   = a->compressedrow.i;
440:     ridx = a->compressedrow.rindex;
441:     PetscArrayzero(zarray, 4 * a->mbs);
442:   } else {
443:     mbs = a->mbs;
444:     ii  = a->i;
445:     z   = zarray;
446:   }

448:   for (i = 0; i < mbs; i++) {
449:     n = ii[1] - ii[0];
450:     ii++;
451:     sum1 = 0.0;
452:     sum2 = 0.0;
453:     sum3 = 0.0;
454:     sum4 = 0.0;

456:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
457:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
458:     for (j = 0; j < n; j++) {
459:       xb = x + 4 * (*idx++);
460:       x1 = xb[0];
461:       x2 = xb[1];
462:       x3 = xb[2];
463:       x4 = xb[3];
464:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
465:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
466:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
467:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
468:       v += 16;
469:     }
470:     if (usecprow) z = zarray + 4 * ridx[i];
471:     z[0] = sum1;
472:     z[1] = sum2;
473:     z[2] = sum3;
474:     z[3] = sum4;
475:     if (!usecprow) z += 4;
476:   }
477:   VecRestoreArrayRead(xx, &x);
478:   VecRestoreArrayWrite(zz, &zarray);
479:   PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt);
480:   return 0;
481: }

483: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
484: {
485:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
486:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
487:   const PetscScalar *xb, *x;
488:   const MatScalar   *v;
489:   const PetscInt    *idx, *ii, *ridx = NULL;
490:   PetscInt           mbs, i, j, n;
491:   PetscBool          usecprow = a->compressedrow.use;

493:   VecGetArrayRead(xx, &x);
494:   VecGetArrayWrite(zz, &zarray);

496:   idx = a->j;
497:   v   = a->a;
498:   if (usecprow) {
499:     mbs  = a->compressedrow.nrows;
500:     ii   = a->compressedrow.i;
501:     ridx = a->compressedrow.rindex;
502:     PetscArrayzero(zarray, 5 * a->mbs);
503:   } else {
504:     mbs = a->mbs;
505:     ii  = a->i;
506:     z   = zarray;
507:   }

509:   for (i = 0; i < mbs; i++) {
510:     n = ii[1] - ii[0];
511:     ii++;
512:     sum1 = 0.0;
513:     sum2 = 0.0;
514:     sum3 = 0.0;
515:     sum4 = 0.0;
516:     sum5 = 0.0;
517:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
518:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
519:     for (j = 0; j < n; j++) {
520:       xb = x + 5 * (*idx++);
521:       x1 = xb[0];
522:       x2 = xb[1];
523:       x3 = xb[2];
524:       x4 = xb[3];
525:       x5 = xb[4];
526:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
527:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
528:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
529:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
530:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
531:       v += 25;
532:     }
533:     if (usecprow) z = zarray + 5 * ridx[i];
534:     z[0] = sum1;
535:     z[1] = sum2;
536:     z[2] = sum3;
537:     z[3] = sum4;
538:     z[4] = sum5;
539:     if (!usecprow) z += 5;
540:   }
541:   VecRestoreArrayRead(xx, &x);
542:   VecRestoreArrayWrite(zz, &zarray);
543:   PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt);
544:   return 0;
545: }

547: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
548: {
549:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
550:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
551:   const PetscScalar *x, *xb;
552:   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
553:   const MatScalar   *v;
554:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
555:   PetscBool          usecprow = a->compressedrow.use;

557:   VecGetArrayRead(xx, &x);
558:   VecGetArrayWrite(zz, &zarray);

560:   idx = a->j;
561:   v   = a->a;
562:   if (usecprow) {
563:     mbs  = a->compressedrow.nrows;
564:     ii   = a->compressedrow.i;
565:     ridx = a->compressedrow.rindex;
566:     PetscArrayzero(zarray, 6 * a->mbs);
567:   } else {
568:     mbs = a->mbs;
569:     ii  = a->i;
570:     z   = zarray;
571:   }

573:   for (i = 0; i < mbs; i++) {
574:     n = ii[1] - ii[0];
575:     ii++;
576:     sum1 = 0.0;
577:     sum2 = 0.0;
578:     sum3 = 0.0;
579:     sum4 = 0.0;
580:     sum5 = 0.0;
581:     sum6 = 0.0;

583:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
584:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585:     for (j = 0; j < n; j++) {
586:       xb = x + 6 * (*idx++);
587:       x1 = xb[0];
588:       x2 = xb[1];
589:       x3 = xb[2];
590:       x4 = xb[3];
591:       x5 = xb[4];
592:       x6 = xb[5];
593:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
594:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
595:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
596:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
597:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
598:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
599:       v += 36;
600:     }
601:     if (usecprow) z = zarray + 6 * ridx[i];
602:     z[0] = sum1;
603:     z[1] = sum2;
604:     z[2] = sum3;
605:     z[3] = sum4;
606:     z[4] = sum5;
607:     z[5] = sum6;
608:     if (!usecprow) z += 6;
609:   }

611:   VecRestoreArrayRead(xx, &x);
612:   VecRestoreArrayWrite(zz, &zarray);
613:   PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt);
614:   return 0;
615: }

617: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
618: {
619:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
620:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
621:   const PetscScalar *x, *xb;
622:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
623:   const MatScalar   *v;
624:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
625:   PetscBool          usecprow = a->compressedrow.use;

627:   VecGetArrayRead(xx, &x);
628:   VecGetArrayWrite(zz, &zarray);

630:   idx = a->j;
631:   v   = a->a;
632:   if (usecprow) {
633:     mbs  = a->compressedrow.nrows;
634:     ii   = a->compressedrow.i;
635:     ridx = a->compressedrow.rindex;
636:     PetscArrayzero(zarray, 7 * a->mbs);
637:   } else {
638:     mbs = a->mbs;
639:     ii  = a->i;
640:     z   = zarray;
641:   }

643:   for (i = 0; i < mbs; i++) {
644:     n = ii[1] - ii[0];
645:     ii++;
646:     sum1 = 0.0;
647:     sum2 = 0.0;
648:     sum3 = 0.0;
649:     sum4 = 0.0;
650:     sum5 = 0.0;
651:     sum6 = 0.0;
652:     sum7 = 0.0;

654:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
655:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
656:     for (j = 0; j < n; j++) {
657:       xb = x + 7 * (*idx++);
658:       x1 = xb[0];
659:       x2 = xb[1];
660:       x3 = xb[2];
661:       x4 = xb[3];
662:       x5 = xb[4];
663:       x6 = xb[5];
664:       x7 = xb[6];
665:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
666:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
667:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
668:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
669:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
670:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
671:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
672:       v += 49;
673:     }
674:     if (usecprow) z = zarray + 7 * ridx[i];
675:     z[0] = sum1;
676:     z[1] = sum2;
677:     z[2] = sum3;
678:     z[3] = sum4;
679:     z[4] = sum5;
680:     z[5] = sum6;
681:     z[6] = sum7;
682:     if (!usecprow) z += 7;
683:   }

685:   VecRestoreArrayRead(xx, &x);
686:   VecRestoreArrayWrite(zz, &zarray);
687:   PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt);
688:   return 0;
689: }

691: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
692: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
693: {
694:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
695:   PetscScalar       *z = NULL, *work, *workt, *zarray;
696:   const PetscScalar *x, *xb;
697:   const MatScalar   *v;
698:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
699:   const PetscInt    *idx, *ii, *ridx = NULL;
700:   PetscInt           k;
701:   PetscBool          usecprow = a->compressedrow.use;

703:   __m256d a0, a1, a2, a3, a4, a5;
704:   __m256d w0, w1, w2, w3;
705:   __m256d z0, z1, z2;
706:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

708:   VecGetArrayRead(xx, &x);
709:   VecGetArrayWrite(zz, &zarray);

711:   idx = a->j;
712:   v   = a->a;
713:   if (usecprow) {
714:     mbs  = a->compressedrow.nrows;
715:     ii   = a->compressedrow.i;
716:     ridx = a->compressedrow.rindex;
717:     PetscArrayzero(zarray, bs * a->mbs);
718:   } else {
719:     mbs = a->mbs;
720:     ii  = a->i;
721:     z   = zarray;
722:   }

724:   if (!a->mult_work) {
725:     k = PetscMax(A->rmap->n, A->cmap->n);
726:     PetscMalloc1(k + 1, &a->mult_work);
727:   }

729:   work = a->mult_work;
730:   for (i = 0; i < mbs; i++) {
731:     n = ii[1] - ii[0];
732:     ii++;
733:     workt = work;
734:     for (j = 0; j < n; j++) {
735:       xb = x + bs * (*idx++);
736:       for (k = 0; k < bs; k++) workt[k] = xb[k];
737:       workt += bs;
738:     }
739:     if (usecprow) z = zarray + bs * ridx[i];

741:     z0 = _mm256_setzero_pd();
742:     z1 = _mm256_setzero_pd();
743:     z2 = _mm256_setzero_pd();

745:     for (j = 0; j < n; j++) {
746:       /* first column of a */
747:       w0 = _mm256_set1_pd(work[j * 9]);
748:       a0 = _mm256_loadu_pd(&v[j * 81]);
749:       z0 = _mm256_fmadd_pd(a0, w0, z0);
750:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
751:       z1 = _mm256_fmadd_pd(a1, w0, z1);
752:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
753:       z2 = _mm256_fmadd_pd(a2, w0, z2);

755:       /* second column of a */
756:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
757:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
758:       z0 = _mm256_fmadd_pd(a0, w1, z0);
759:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
760:       z1 = _mm256_fmadd_pd(a1, w1, z1);
761:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
762:       z2 = _mm256_fmadd_pd(a2, w1, z2);

764:       /* third column of a */
765:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
766:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
767:       z0 = _mm256_fmadd_pd(a3, w2, z0);
768:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
769:       z1 = _mm256_fmadd_pd(a4, w2, z1);
770:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
771:       z2 = _mm256_fmadd_pd(a5, w2, z2);

773:       /* fourth column of a */
774:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
775:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
776:       z0 = _mm256_fmadd_pd(a0, w3, z0);
777:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
778:       z1 = _mm256_fmadd_pd(a1, w3, z1);
779:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
780:       z2 = _mm256_fmadd_pd(a2, w3, z2);

782:       /* fifth column of a */
783:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
784:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
785:       z0 = _mm256_fmadd_pd(a3, w0, z0);
786:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
787:       z1 = _mm256_fmadd_pd(a4, w0, z1);
788:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
789:       z2 = _mm256_fmadd_pd(a5, w0, z2);

791:       /* sixth column of a */
792:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
793:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
794:       z0 = _mm256_fmadd_pd(a0, w1, z0);
795:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
796:       z1 = _mm256_fmadd_pd(a1, w1, z1);
797:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
798:       z2 = _mm256_fmadd_pd(a2, w1, z2);

800:       /* seventh column of a */
801:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
802:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
803:       z0 = _mm256_fmadd_pd(a0, w2, z0);
804:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
805:       z1 = _mm256_fmadd_pd(a1, w2, z1);
806:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
807:       z2 = _mm256_fmadd_pd(a2, w2, z2);

809:       /* eighth column of a */
810:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
811:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
812:       z0 = _mm256_fmadd_pd(a3, w3, z0);
813:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
814:       z1 = _mm256_fmadd_pd(a4, w3, z1);
815:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
816:       z2 = _mm256_fmadd_pd(a5, w3, z2);

818:       /* ninth column of a */
819:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
820:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
821:       z0 = _mm256_fmadd_pd(a0, w0, z0);
822:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
823:       z1 = _mm256_fmadd_pd(a1, w0, z1);
824:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
825:       z2 = _mm256_fmadd_pd(a2, w0, z2);
826:     }

828:     _mm256_storeu_pd(&z[0], z0);
829:     _mm256_storeu_pd(&z[4], z1);
830:     _mm256_maskstore_pd(&z[8], mask1, z2);

832:     v += n * bs2;
833:     if (!usecprow) z += bs;
834:   }
835:   VecRestoreArrayRead(xx, &x);
836:   VecRestoreArrayWrite(zz, &zarray);
837:   PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
838:   return 0;
839: }
840: #endif

842: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
843: {
844:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
845:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
846:   const PetscScalar *x, *xb;
847:   PetscScalar       *zarray, xv;
848:   const MatScalar   *v;
849:   const PetscInt    *ii, *ij = a->j, *idx;
850:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
851:   PetscBool          usecprow = a->compressedrow.use;

853:   VecGetArrayRead(xx, &x);
854:   VecGetArrayWrite(zz, &zarray);

856:   v = a->a;
857:   if (usecprow) {
858:     mbs  = a->compressedrow.nrows;
859:     ii   = a->compressedrow.i;
860:     ridx = a->compressedrow.rindex;
861:     PetscArrayzero(zarray, 11 * a->mbs);
862:   } else {
863:     mbs = a->mbs;
864:     ii  = a->i;
865:     z   = zarray;
866:   }

868:   for (i = 0; i < mbs; i++) {
869:     n     = ii[i + 1] - ii[i];
870:     idx   = ij + ii[i];
871:     sum1  = 0.0;
872:     sum2  = 0.0;
873:     sum3  = 0.0;
874:     sum4  = 0.0;
875:     sum5  = 0.0;
876:     sum6  = 0.0;
877:     sum7  = 0.0;
878:     sum8  = 0.0;
879:     sum9  = 0.0;
880:     sum10 = 0.0;
881:     sum11 = 0.0;

883:     for (j = 0; j < n; j++) {
884:       xb = x + 11 * (idx[j]);

886:       for (k = 0; k < 11; k++) {
887:         xv = xb[k];
888:         sum1 += v[0] * xv;
889:         sum2 += v[1] * xv;
890:         sum3 += v[2] * xv;
891:         sum4 += v[3] * xv;
892:         sum5 += v[4] * xv;
893:         sum6 += v[5] * xv;
894:         sum7 += v[6] * xv;
895:         sum8 += v[7] * xv;
896:         sum9 += v[8] * xv;
897:         sum10 += v[9] * xv;
898:         sum11 += v[10] * xv;
899:         v += 11;
900:       }
901:     }
902:     if (usecprow) z = zarray + 11 * ridx[i];
903:     z[0]  = sum1;
904:     z[1]  = sum2;
905:     z[2]  = sum3;
906:     z[3]  = sum4;
907:     z[4]  = sum5;
908:     z[5]  = sum6;
909:     z[6]  = sum7;
910:     z[7]  = sum8;
911:     z[8]  = sum9;
912:     z[9]  = sum10;
913:     z[10] = sum11;

915:     if (!usecprow) z += 11;
916:   }

918:   VecRestoreArrayRead(xx, &x);
919:   VecRestoreArrayWrite(zz, &zarray);
920:   PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt);
921:   return 0;
922: }

924: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
925: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
926: {
927:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
928:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
929:   const PetscScalar *x, *xb;
930:   PetscScalar       *zarray, xv;
931:   const MatScalar   *v;
932:   const PetscInt    *ii, *ij = a->j, *idx;
933:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
934:   PetscBool          usecprow = a->compressedrow.use;

936:   VecGetArrayRead(xx, &x);
937:   VecGetArrayWrite(zz, &zarray);

939:   v = a->a;
940:   if (usecprow) {
941:     mbs  = a->compressedrow.nrows;
942:     ii   = a->compressedrow.i;
943:     ridx = a->compressedrow.rindex;
944:     PetscArrayzero(zarray, 12 * a->mbs);
945:   } else {
946:     mbs = a->mbs;
947:     ii  = a->i;
948:     z   = zarray;
949:   }

951:   for (i = 0; i < mbs; i++) {
952:     n     = ii[i + 1] - ii[i];
953:     idx   = ij + ii[i];
954:     sum1  = 0.0;
955:     sum2  = 0.0;
956:     sum3  = 0.0;
957:     sum4  = 0.0;
958:     sum5  = 0.0;
959:     sum6  = 0.0;
960:     sum7  = 0.0;
961:     sum8  = 0.0;
962:     sum9  = 0.0;
963:     sum10 = 0.0;
964:     sum11 = 0.0;
965:     sum12 = 0.0;

967:     for (j = 0; j < n; j++) {
968:       xb = x + 12 * (idx[j]);

970:       for (k = 0; k < 12; k++) {
971:         xv = xb[k];
972:         sum1 += v[0] * xv;
973:         sum2 += v[1] * xv;
974:         sum3 += v[2] * xv;
975:         sum4 += v[3] * xv;
976:         sum5 += v[4] * xv;
977:         sum6 += v[5] * xv;
978:         sum7 += v[6] * xv;
979:         sum8 += v[7] * xv;
980:         sum9 += v[8] * xv;
981:         sum10 += v[9] * xv;
982:         sum11 += v[10] * xv;
983:         sum12 += v[11] * xv;
984:         v += 12;
985:       }
986:     }
987:     if (usecprow) z = zarray + 12 * ridx[i];
988:     z[0]  = sum1;
989:     z[1]  = sum2;
990:     z[2]  = sum3;
991:     z[3]  = sum4;
992:     z[4]  = sum5;
993:     z[5]  = sum6;
994:     z[6]  = sum7;
995:     z[7]  = sum8;
996:     z[8]  = sum9;
997:     z[9]  = sum10;
998:     z[10] = sum11;
999:     z[11] = sum12;
1000:     if (!usecprow) z += 12;
1001:   }
1002:   VecRestoreArrayRead(xx, &x);
1003:   VecRestoreArrayWrite(zz, &zarray);
1004:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1005:   return 0;
1006: }

1008: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1009: {
1010:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1011:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1012:   const PetscScalar *x, *xb;
1013:   PetscScalar       *zarray, *yarray, xv;
1014:   const MatScalar   *v;
1015:   const PetscInt    *ii, *ij = a->j, *idx;
1016:   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1017:   PetscBool          usecprow = a->compressedrow.use;

1019:   VecGetArrayRead(xx, &x);
1020:   VecGetArrayPair(yy, zz, &yarray, &zarray);

1022:   v = a->a;
1023:   if (usecprow) {
1024:     if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1025:     mbs  = a->compressedrow.nrows;
1026:     ii   = a->compressedrow.i;
1027:     ridx = a->compressedrow.rindex;
1028:   } else {
1029:     ii = a->i;
1030:     y  = yarray;
1031:     z  = zarray;
1032:   }

1034:   for (i = 0; i < mbs; i++) {
1035:     n   = ii[i + 1] - ii[i];
1036:     idx = ij + ii[i];

1038:     if (usecprow) {
1039:       y = yarray + 12 * ridx[i];
1040:       z = zarray + 12 * ridx[i];
1041:     }
1042:     sum1  = y[0];
1043:     sum2  = y[1];
1044:     sum3  = y[2];
1045:     sum4  = y[3];
1046:     sum5  = y[4];
1047:     sum6  = y[5];
1048:     sum7  = y[6];
1049:     sum8  = y[7];
1050:     sum9  = y[8];
1051:     sum10 = y[9];
1052:     sum11 = y[10];
1053:     sum12 = y[11];

1055:     for (j = 0; j < n; j++) {
1056:       xb = x + 12 * (idx[j]);

1058:       for (k = 0; k < 12; k++) {
1059:         xv = xb[k];
1060:         sum1 += v[0] * xv;
1061:         sum2 += v[1] * xv;
1062:         sum3 += v[2] * xv;
1063:         sum4 += v[3] * xv;
1064:         sum5 += v[4] * xv;
1065:         sum6 += v[5] * xv;
1066:         sum7 += v[6] * xv;
1067:         sum8 += v[7] * xv;
1068:         sum9 += v[8] * xv;
1069:         sum10 += v[9] * xv;
1070:         sum11 += v[10] * xv;
1071:         sum12 += v[11] * xv;
1072:         v += 12;
1073:       }
1074:     }

1076:     z[0]  = sum1;
1077:     z[1]  = sum2;
1078:     z[2]  = sum3;
1079:     z[3]  = sum4;
1080:     z[4]  = sum5;
1081:     z[5]  = sum6;
1082:     z[6]  = sum7;
1083:     z[7]  = sum8;
1084:     z[8]  = sum9;
1085:     z[9]  = sum10;
1086:     z[10] = sum11;
1087:     z[11] = sum12;
1088:     if (!usecprow) {
1089:       y += 12;
1090:       z += 12;
1091:     }
1092:   }
1093:   VecRestoreArrayRead(xx, &x);
1094:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1095:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1096:   return 0;
1097: }

1099: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1100: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1101: {
1102:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1103:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1104:   const PetscScalar *x, *xb;
1105:   PetscScalar        x1, x2, x3, x4, *zarray;
1106:   const MatScalar   *v;
1107:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1108:   PetscInt           mbs, i, j, n;
1109:   PetscBool          usecprow = a->compressedrow.use;

1111:   VecGetArrayRead(xx, &x);
1112:   VecGetArrayWrite(zz, &zarray);

1114:   v = a->a;
1115:   if (usecprow) {
1116:     mbs  = a->compressedrow.nrows;
1117:     ii   = a->compressedrow.i;
1118:     ridx = a->compressedrow.rindex;
1119:     PetscArrayzero(zarray, 12 * a->mbs);
1120:   } else {
1121:     mbs = a->mbs;
1122:     ii  = a->i;
1123:     z   = zarray;
1124:   }

1126:   for (i = 0; i < mbs; i++) {
1127:     n   = ii[i + 1] - ii[i];
1128:     idx = ij + ii[i];

1130:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1131:     for (j = 0; j < n; j++) {
1132:       xb = x + 12 * (idx[j]);
1133:       x1 = xb[0];
1134:       x2 = xb[1];
1135:       x3 = xb[2];
1136:       x4 = xb[3];

1138:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150:       v += 48;

1152:       x1 = xb[4];
1153:       x2 = xb[5];
1154:       x3 = xb[6];
1155:       x4 = xb[7];

1157:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1158:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1159:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1160:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1161:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1162:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1163:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1164:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1165:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1166:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1167:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1168:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1169:       v += 48;

1171:       x1 = xb[8];
1172:       x2 = xb[9];
1173:       x3 = xb[10];
1174:       x4 = xb[11];
1175:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1176:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1177:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1178:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1179:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1180:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1181:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1182:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1183:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1184:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1185:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1186:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1187:       v += 48;
1188:     }
1189:     if (usecprow) z = zarray + 12 * ridx[i];
1190:     z[0]  = sum1;
1191:     z[1]  = sum2;
1192:     z[2]  = sum3;
1193:     z[3]  = sum4;
1194:     z[4]  = sum5;
1195:     z[5]  = sum6;
1196:     z[6]  = sum7;
1197:     z[7]  = sum8;
1198:     z[8]  = sum9;
1199:     z[9]  = sum10;
1200:     z[10] = sum11;
1201:     z[11] = sum12;
1202:     if (!usecprow) z += 12;
1203:   }
1204:   VecRestoreArrayRead(xx, &x);
1205:   VecRestoreArrayWrite(zz, &zarray);
1206:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1207:   return 0;
1208: }

1210: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1211: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1212: {
1213:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1214:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1215:   const PetscScalar *x, *xb;
1216:   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1217:   const MatScalar   *v;
1218:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1219:   PetscInt           mbs      = a->mbs, i, j, n;
1220:   PetscBool          usecprow = a->compressedrow.use;

1222:   VecGetArrayRead(xx, &x);
1223:   VecGetArrayPair(yy, zz, &yarray, &zarray);

1225:   v = a->a;
1226:   if (usecprow) {
1227:     if (zz != yy) PetscArraycpy(zarray, yarray, 12 * mbs);
1228:     mbs  = a->compressedrow.nrows;
1229:     ii   = a->compressedrow.i;
1230:     ridx = a->compressedrow.rindex;
1231:   } else {
1232:     ii = a->i;
1233:     y  = yarray;
1234:     z  = zarray;
1235:   }

1237:   for (i = 0; i < mbs; i++) {
1238:     n   = ii[i + 1] - ii[i];
1239:     idx = ij + ii[i];

1241:     if (usecprow) {
1242:       y = yarray + 12 * ridx[i];
1243:       z = zarray + 12 * ridx[i];
1244:     }
1245:     sum1  = y[0];
1246:     sum2  = y[1];
1247:     sum3  = y[2];
1248:     sum4  = y[3];
1249:     sum5  = y[4];
1250:     sum6  = y[5];
1251:     sum7  = y[6];
1252:     sum8  = y[7];
1253:     sum9  = y[8];
1254:     sum10 = y[9];
1255:     sum11 = y[10];
1256:     sum12 = y[11];

1258:     for (j = 0; j < n; j++) {
1259:       xb = x + 12 * (idx[j]);
1260:       x1 = xb[0];
1261:       x2 = xb[1];
1262:       x3 = xb[2];
1263:       x4 = xb[3];

1265:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1266:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1267:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1268:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1269:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1270:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1271:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1272:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1273:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1274:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1275:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1276:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1277:       v += 48;

1279:       x1 = xb[4];
1280:       x2 = xb[5];
1281:       x3 = xb[6];
1282:       x4 = xb[7];

1284:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296:       v += 48;

1298:       x1 = xb[8];
1299:       x2 = xb[9];
1300:       x3 = xb[10];
1301:       x4 = xb[11];
1302:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1303:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1304:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1305:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1306:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1307:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1308:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1309:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1310:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1311:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1312:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1313:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1314:       v += 48;
1315:     }
1316:     z[0]  = sum1;
1317:     z[1]  = sum2;
1318:     z[2]  = sum3;
1319:     z[3]  = sum4;
1320:     z[4]  = sum5;
1321:     z[5]  = sum6;
1322:     z[6]  = sum7;
1323:     z[7]  = sum8;
1324:     z[8]  = sum9;
1325:     z[9]  = sum10;
1326:     z[10] = sum11;
1327:     z[11] = sum12;
1328:     if (!usecprow) {
1329:       y += 12;
1330:       z += 12;
1331:     }
1332:   }
1333:   VecRestoreArrayRead(xx, &x);
1334:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
1335:   PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt);
1336:   return 0;
1337: }

1339: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1340: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1341: {
1342:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1343:   PetscScalar       *z = NULL, *zarray;
1344:   const PetscScalar *x, *work;
1345:   const MatScalar   *v = a->a;
1346:   PetscInt           mbs, i, j, n;
1347:   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1348:   PetscBool          usecprow = a->compressedrow.use;
1349:   const PetscInt     bs = 12, bs2 = 144;

1351:   __m256d a0, a1, a2, a3, a4, a5;
1352:   __m256d w0, w1, w2, w3;
1353:   __m256d z0, z1, z2;

1355:   VecGetArrayRead(xx, &x);
1356:   VecGetArrayWrite(zz, &zarray);

1358:   if (usecprow) {
1359:     mbs  = a->compressedrow.nrows;
1360:     ii   = a->compressedrow.i;
1361:     ridx = a->compressedrow.rindex;
1362:     PetscArrayzero(zarray, bs * a->mbs);
1363:   } else {
1364:     mbs = a->mbs;
1365:     ii  = a->i;
1366:     z   = zarray;
1367:   }

1369:   for (i = 0; i < mbs; i++) {
1370:     z0 = _mm256_setzero_pd();
1371:     z1 = _mm256_setzero_pd();
1372:     z2 = _mm256_setzero_pd();

1374:     n = ii[1] - ii[0];
1375:     ii++;
1376:     for (j = 0; j < n; j++) {
1377:       work = x + bs * (*idx++);

1379:       /* first column of a */
1380:       w0 = _mm256_set1_pd(work[0]);
1381:       a0 = _mm256_loadu_pd(v + 0);
1382:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1383:       a1 = _mm256_loadu_pd(v + 4);
1384:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1385:       a2 = _mm256_loadu_pd(v + 8);
1386:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1388:       /* second column of a */
1389:       w1 = _mm256_set1_pd(work[1]);
1390:       a3 = _mm256_loadu_pd(v + 12);
1391:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1392:       a4 = _mm256_loadu_pd(v + 16);
1393:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1394:       a5 = _mm256_loadu_pd(v + 20);
1395:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1397:       /* third column of a */
1398:       w2 = _mm256_set1_pd(work[2]);
1399:       a0 = _mm256_loadu_pd(v + 24);
1400:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1401:       a1 = _mm256_loadu_pd(v + 28);
1402:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1403:       a2 = _mm256_loadu_pd(v + 32);
1404:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1406:       /* fourth column of a */
1407:       w3 = _mm256_set1_pd(work[3]);
1408:       a3 = _mm256_loadu_pd(v + 36);
1409:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1410:       a4 = _mm256_loadu_pd(v + 40);
1411:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1412:       a5 = _mm256_loadu_pd(v + 44);
1413:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1415:       /* fifth column of a */
1416:       w0 = _mm256_set1_pd(work[4]);
1417:       a0 = _mm256_loadu_pd(v + 48);
1418:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1419:       a1 = _mm256_loadu_pd(v + 52);
1420:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1421:       a2 = _mm256_loadu_pd(v + 56);
1422:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1424:       /* sixth column of a */
1425:       w1 = _mm256_set1_pd(work[5]);
1426:       a3 = _mm256_loadu_pd(v + 60);
1427:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1428:       a4 = _mm256_loadu_pd(v + 64);
1429:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1430:       a5 = _mm256_loadu_pd(v + 68);
1431:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1433:       /* seventh column of a */
1434:       w2 = _mm256_set1_pd(work[6]);
1435:       a0 = _mm256_loadu_pd(v + 72);
1436:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1437:       a1 = _mm256_loadu_pd(v + 76);
1438:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1439:       a2 = _mm256_loadu_pd(v + 80);
1440:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1442:       /* eighth column of a */
1443:       w3 = _mm256_set1_pd(work[7]);
1444:       a3 = _mm256_loadu_pd(v + 84);
1445:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1446:       a4 = _mm256_loadu_pd(v + 88);
1447:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1448:       a5 = _mm256_loadu_pd(v + 92);
1449:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1451:       /* ninth column of a */
1452:       w0 = _mm256_set1_pd(work[8]);
1453:       a0 = _mm256_loadu_pd(v + 96);
1454:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1455:       a1 = _mm256_loadu_pd(v + 100);
1456:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1457:       a2 = _mm256_loadu_pd(v + 104);
1458:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1460:       /* tenth column of a */
1461:       w1 = _mm256_set1_pd(work[9]);
1462:       a3 = _mm256_loadu_pd(v + 108);
1463:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1464:       a4 = _mm256_loadu_pd(v + 112);
1465:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1466:       a5 = _mm256_loadu_pd(v + 116);
1467:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1469:       /* eleventh column of a */
1470:       w2 = _mm256_set1_pd(work[10]);
1471:       a0 = _mm256_loadu_pd(v + 120);
1472:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1473:       a1 = _mm256_loadu_pd(v + 124);
1474:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1475:       a2 = _mm256_loadu_pd(v + 128);
1476:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1478:       /* twelveth column of a */
1479:       w3 = _mm256_set1_pd(work[11]);
1480:       a3 = _mm256_loadu_pd(v + 132);
1481:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1482:       a4 = _mm256_loadu_pd(v + 136);
1483:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1484:       a5 = _mm256_loadu_pd(v + 140);
1485:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1487:       v += bs2;
1488:     }
1489:     if (usecprow) z = zarray + bs * ridx[i];
1490:     _mm256_storeu_pd(&z[0], z0);
1491:     _mm256_storeu_pd(&z[4], z1);
1492:     _mm256_storeu_pd(&z[8], z2);
1493:     if (!usecprow) z += bs;
1494:   }
1495:   VecRestoreArrayRead(xx, &x);
1496:   VecRestoreArrayWrite(zz, &zarray);
1497:   PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
1498:   return 0;
1499: }
1500: #endif

1502: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1503: /* Default MatMult for block size 15 */
1504: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1505: {
1506:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1507:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1508:   const PetscScalar *x, *xb;
1509:   PetscScalar       *zarray, xv;
1510:   const MatScalar   *v;
1511:   const PetscInt    *ii, *ij = a->j, *idx;
1512:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1513:   PetscBool          usecprow = a->compressedrow.use;

1515:   VecGetArrayRead(xx, &x);
1516:   VecGetArrayWrite(zz, &zarray);

1518:   v = a->a;
1519:   if (usecprow) {
1520:     mbs  = a->compressedrow.nrows;
1521:     ii   = a->compressedrow.i;
1522:     ridx = a->compressedrow.rindex;
1523:     PetscArrayzero(zarray, 15 * a->mbs);
1524:   } else {
1525:     mbs = a->mbs;
1526:     ii  = a->i;
1527:     z   = zarray;
1528:   }

1530:   for (i = 0; i < mbs; i++) {
1531:     n     = ii[i + 1] - ii[i];
1532:     idx   = ij + ii[i];
1533:     sum1  = 0.0;
1534:     sum2  = 0.0;
1535:     sum3  = 0.0;
1536:     sum4  = 0.0;
1537:     sum5  = 0.0;
1538:     sum6  = 0.0;
1539:     sum7  = 0.0;
1540:     sum8  = 0.0;
1541:     sum9  = 0.0;
1542:     sum10 = 0.0;
1543:     sum11 = 0.0;
1544:     sum12 = 0.0;
1545:     sum13 = 0.0;
1546:     sum14 = 0.0;
1547:     sum15 = 0.0;

1549:     for (j = 0; j < n; j++) {
1550:       xb = x + 15 * (idx[j]);

1552:       for (k = 0; k < 15; k++) {
1553:         xv = xb[k];
1554:         sum1 += v[0] * xv;
1555:         sum2 += v[1] * xv;
1556:         sum3 += v[2] * xv;
1557:         sum4 += v[3] * xv;
1558:         sum5 += v[4] * xv;
1559:         sum6 += v[5] * xv;
1560:         sum7 += v[6] * xv;
1561:         sum8 += v[7] * xv;
1562:         sum9 += v[8] * xv;
1563:         sum10 += v[9] * xv;
1564:         sum11 += v[10] * xv;
1565:         sum12 += v[11] * xv;
1566:         sum13 += v[12] * xv;
1567:         sum14 += v[13] * xv;
1568:         sum15 += v[14] * xv;
1569:         v += 15;
1570:       }
1571:     }
1572:     if (usecprow) z = zarray + 15 * ridx[i];
1573:     z[0]  = sum1;
1574:     z[1]  = sum2;
1575:     z[2]  = sum3;
1576:     z[3]  = sum4;
1577:     z[4]  = sum5;
1578:     z[5]  = sum6;
1579:     z[6]  = sum7;
1580:     z[7]  = sum8;
1581:     z[8]  = sum9;
1582:     z[9]  = sum10;
1583:     z[10] = sum11;
1584:     z[11] = sum12;
1585:     z[12] = sum13;
1586:     z[13] = sum14;
1587:     z[14] = sum15;

1589:     if (!usecprow) z += 15;
1590:   }

1592:   VecRestoreArrayRead(xx, &x);
1593:   VecRestoreArrayWrite(zz, &zarray);
1594:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1595:   return 0;
1596: }

1598: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1599: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1600: {
1601:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1602:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1603:   const PetscScalar *x, *xb;
1604:   PetscScalar        x1, x2, x3, x4, *zarray;
1605:   const MatScalar   *v;
1606:   const PetscInt    *ii, *ij = a->j, *idx;
1607:   PetscInt           mbs, i, j, n, *ridx = NULL;
1608:   PetscBool          usecprow = a->compressedrow.use;

1610:   VecGetArrayRead(xx, &x);
1611:   VecGetArrayWrite(zz, &zarray);

1613:   v = a->a;
1614:   if (usecprow) {
1615:     mbs  = a->compressedrow.nrows;
1616:     ii   = a->compressedrow.i;
1617:     ridx = a->compressedrow.rindex;
1618:     PetscArrayzero(zarray, 15 * a->mbs);
1619:   } else {
1620:     mbs = a->mbs;
1621:     ii  = a->i;
1622:     z   = zarray;
1623:   }

1625:   for (i = 0; i < mbs; i++) {
1626:     n     = ii[i + 1] - ii[i];
1627:     idx   = ij + ii[i];
1628:     sum1  = 0.0;
1629:     sum2  = 0.0;
1630:     sum3  = 0.0;
1631:     sum4  = 0.0;
1632:     sum5  = 0.0;
1633:     sum6  = 0.0;
1634:     sum7  = 0.0;
1635:     sum8  = 0.0;
1636:     sum9  = 0.0;
1637:     sum10 = 0.0;
1638:     sum11 = 0.0;
1639:     sum12 = 0.0;
1640:     sum13 = 0.0;
1641:     sum14 = 0.0;
1642:     sum15 = 0.0;

1644:     for (j = 0; j < n; j++) {
1645:       xb = x + 15 * (idx[j]);
1646:       x1 = xb[0];
1647:       x2 = xb[1];
1648:       x3 = xb[2];
1649:       x4 = xb[3];

1651:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1652:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1653:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1654:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1655:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1656:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1657:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1658:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1659:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1660:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1661:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1662:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1663:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1664:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1665:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;

1667:       v += 60;

1669:       x1 = xb[4];
1670:       x2 = xb[5];
1671:       x3 = xb[6];
1672:       x4 = xb[7];

1674:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1675:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1676:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1677:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1678:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1679:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1680:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1681:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1682:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1683:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1684:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1685:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1686:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1687:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1688:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1689:       v += 60;

1691:       x1 = xb[8];
1692:       x2 = xb[9];
1693:       x3 = xb[10];
1694:       x4 = xb[11];
1695:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1696:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1697:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1698:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1699:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1700:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1701:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1702:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1703:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1704:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1705:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1706:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1707:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1708:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1709:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1710:       v += 60;

1712:       x1 = xb[12];
1713:       x2 = xb[13];
1714:       x3 = xb[14];
1715:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1716:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1717:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1718:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1719:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1720:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1721:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1722:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1723:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1724:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1725:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1726:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1727:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1728:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1729:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1730:       v += 45;
1731:     }
1732:     if (usecprow) z = zarray + 15 * ridx[i];
1733:     z[0]  = sum1;
1734:     z[1]  = sum2;
1735:     z[2]  = sum3;
1736:     z[3]  = sum4;
1737:     z[4]  = sum5;
1738:     z[5]  = sum6;
1739:     z[6]  = sum7;
1740:     z[7]  = sum8;
1741:     z[8]  = sum9;
1742:     z[9]  = sum10;
1743:     z[10] = sum11;
1744:     z[11] = sum12;
1745:     z[12] = sum13;
1746:     z[13] = sum14;
1747:     z[14] = sum15;

1749:     if (!usecprow) z += 15;
1750:   }

1752:   VecRestoreArrayRead(xx, &x);
1753:   VecRestoreArrayWrite(zz, &zarray);
1754:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1755:   return 0;
1756: }

1758: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1759: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1760: {
1761:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1762:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1763:   const PetscScalar *x, *xb;
1764:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1765:   const MatScalar   *v;
1766:   const PetscInt    *ii, *ij = a->j, *idx;
1767:   PetscInt           mbs, i, j, n, *ridx = NULL;
1768:   PetscBool          usecprow = a->compressedrow.use;

1770:   VecGetArrayRead(xx, &x);
1771:   VecGetArrayWrite(zz, &zarray);

1773:   v = a->a;
1774:   if (usecprow) {
1775:     mbs  = a->compressedrow.nrows;
1776:     ii   = a->compressedrow.i;
1777:     ridx = a->compressedrow.rindex;
1778:     PetscArrayzero(zarray, 15 * a->mbs);
1779:   } else {
1780:     mbs = a->mbs;
1781:     ii  = a->i;
1782:     z   = zarray;
1783:   }

1785:   for (i = 0; i < mbs; i++) {
1786:     n     = ii[i + 1] - ii[i];
1787:     idx   = ij + ii[i];
1788:     sum1  = 0.0;
1789:     sum2  = 0.0;
1790:     sum3  = 0.0;
1791:     sum4  = 0.0;
1792:     sum5  = 0.0;
1793:     sum6  = 0.0;
1794:     sum7  = 0.0;
1795:     sum8  = 0.0;
1796:     sum9  = 0.0;
1797:     sum10 = 0.0;
1798:     sum11 = 0.0;
1799:     sum12 = 0.0;
1800:     sum13 = 0.0;
1801:     sum14 = 0.0;
1802:     sum15 = 0.0;

1804:     for (j = 0; j < n; j++) {
1805:       xb = x + 15 * (idx[j]);
1806:       x1 = xb[0];
1807:       x2 = xb[1];
1808:       x3 = xb[2];
1809:       x4 = xb[3];
1810:       x5 = xb[4];
1811:       x6 = xb[5];
1812:       x7 = xb[6];
1813:       x8 = xb[7];

1815:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1816:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1817:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1818:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1819:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1820:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1821:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1822:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1823:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1824:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1825:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1826:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1827:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1828:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1829:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1830:       v += 120;

1832:       x1 = xb[8];
1833:       x2 = xb[9];
1834:       x3 = xb[10];
1835:       x4 = xb[11];
1836:       x5 = xb[12];
1837:       x6 = xb[13];
1838:       x7 = xb[14];

1840:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1841:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1842:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1843:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1844:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1845:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1846:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1847:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1848:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1849:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1850:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1851:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1852:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1853:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1854:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1855:       v += 105;
1856:     }
1857:     if (usecprow) z = zarray + 15 * ridx[i];
1858:     z[0]  = sum1;
1859:     z[1]  = sum2;
1860:     z[2]  = sum3;
1861:     z[3]  = sum4;
1862:     z[4]  = sum5;
1863:     z[5]  = sum6;
1864:     z[6]  = sum7;
1865:     z[7]  = sum8;
1866:     z[8]  = sum9;
1867:     z[9]  = sum10;
1868:     z[10] = sum11;
1869:     z[11] = sum12;
1870:     z[12] = sum13;
1871:     z[13] = sum14;
1872:     z[14] = sum15;

1874:     if (!usecprow) z += 15;
1875:   }

1877:   VecRestoreArrayRead(xx, &x);
1878:   VecRestoreArrayWrite(zz, &zarray);
1879:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1880:   return 0;
1881: }

1883: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1884: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1885: {
1886:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1887:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1888:   const PetscScalar *x, *xb;
1889:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1890:   const MatScalar   *v;
1891:   const PetscInt    *ii, *ij = a->j, *idx;
1892:   PetscInt           mbs, i, j, n, *ridx = NULL;
1893:   PetscBool          usecprow = a->compressedrow.use;

1895:   VecGetArrayRead(xx, &x);
1896:   VecGetArrayWrite(zz, &zarray);

1898:   v = a->a;
1899:   if (usecprow) {
1900:     mbs  = a->compressedrow.nrows;
1901:     ii   = a->compressedrow.i;
1902:     ridx = a->compressedrow.rindex;
1903:     PetscArrayzero(zarray, 15 * a->mbs);
1904:   } else {
1905:     mbs = a->mbs;
1906:     ii  = a->i;
1907:     z   = zarray;
1908:   }

1910:   for (i = 0; i < mbs; i++) {
1911:     n     = ii[i + 1] - ii[i];
1912:     idx   = ij + ii[i];
1913:     sum1  = 0.0;
1914:     sum2  = 0.0;
1915:     sum3  = 0.0;
1916:     sum4  = 0.0;
1917:     sum5  = 0.0;
1918:     sum6  = 0.0;
1919:     sum7  = 0.0;
1920:     sum8  = 0.0;
1921:     sum9  = 0.0;
1922:     sum10 = 0.0;
1923:     sum11 = 0.0;
1924:     sum12 = 0.0;
1925:     sum13 = 0.0;
1926:     sum14 = 0.0;
1927:     sum15 = 0.0;

1929:     for (j = 0; j < n; j++) {
1930:       xb  = x + 15 * (idx[j]);
1931:       x1  = xb[0];
1932:       x2  = xb[1];
1933:       x3  = xb[2];
1934:       x4  = xb[3];
1935:       x5  = xb[4];
1936:       x6  = xb[5];
1937:       x7  = xb[6];
1938:       x8  = xb[7];
1939:       x9  = xb[8];
1940:       x10 = xb[9];
1941:       x11 = xb[10];
1942:       x12 = xb[11];
1943:       x13 = xb[12];
1944:       x14 = xb[13];
1945:       x15 = xb[14];

1947:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1948:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1949:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1950:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1951:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1952:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1953:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1954:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1955:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1956:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1957:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1958:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1959:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1960:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1961:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1962:       v += 225;
1963:     }
1964:     if (usecprow) z = zarray + 15 * ridx[i];
1965:     z[0]  = sum1;
1966:     z[1]  = sum2;
1967:     z[2]  = sum3;
1968:     z[3]  = sum4;
1969:     z[4]  = sum5;
1970:     z[5]  = sum6;
1971:     z[6]  = sum7;
1972:     z[7]  = sum8;
1973:     z[8]  = sum9;
1974:     z[9]  = sum10;
1975:     z[10] = sum11;
1976:     z[11] = sum12;
1977:     z[12] = sum13;
1978:     z[13] = sum14;
1979:     z[14] = sum15;

1981:     if (!usecprow) z += 15;
1982:   }

1984:   VecRestoreArrayRead(xx, &x);
1985:   VecRestoreArrayWrite(zz, &zarray);
1986:   PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt);
1987:   return 0;
1988: }

1990: /*
1991:     This will not work with MatScalar == float because it calls the BLAS
1992: */
1993: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1994: {
1995:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1996:   PetscScalar       *z = NULL, *work, *workt, *zarray;
1997:   const PetscScalar *x, *xb;
1998:   const MatScalar   *v;
1999:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2000:   const PetscInt    *idx, *ii, *ridx = NULL;
2001:   PetscInt           ncols, k;
2002:   PetscBool          usecprow = a->compressedrow.use;

2004:   VecGetArrayRead(xx, &x);
2005:   VecGetArrayWrite(zz, &zarray);

2007:   idx = a->j;
2008:   v   = a->a;
2009:   if (usecprow) {
2010:     mbs  = a->compressedrow.nrows;
2011:     ii   = a->compressedrow.i;
2012:     ridx = a->compressedrow.rindex;
2013:     PetscArrayzero(zarray, bs * a->mbs);
2014:   } else {
2015:     mbs = a->mbs;
2016:     ii  = a->i;
2017:     z   = zarray;
2018:   }

2020:   if (!a->mult_work) {
2021:     k = PetscMax(A->rmap->n, A->cmap->n);
2022:     PetscMalloc1(k + 1, &a->mult_work);
2023:   }
2024:   work = a->mult_work;
2025:   for (i = 0; i < mbs; i++) {
2026:     n = ii[1] - ii[0];
2027:     ii++;
2028:     ncols = n * bs;
2029:     workt = work;
2030:     for (j = 0; j < n; j++) {
2031:       xb = x + bs * (*idx++);
2032:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2033:       workt += bs;
2034:     }
2035:     if (usecprow) z = zarray + bs * ridx[i];
2036:     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2037:     v += n * bs2;
2038:     if (!usecprow) z += bs;
2039:   }
2040:   VecRestoreArrayRead(xx, &x);
2041:   VecRestoreArrayWrite(zz, &zarray);
2042:   PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt);
2043:   return 0;
2044: }

2046: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2047: {
2048:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2049:   const PetscScalar *x;
2050:   PetscScalar       *y, *z, sum;
2051:   const MatScalar   *v;
2052:   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2053:   const PetscInt    *idx, *ii;
2054:   PetscBool          usecprow = a->compressedrow.use;

2056:   VecGetArrayRead(xx, &x);
2057:   VecGetArrayPair(yy, zz, &y, &z);

2059:   idx = a->j;
2060:   v   = a->a;
2061:   if (usecprow) {
2062:     if (zz != yy) PetscArraycpy(z, y, mbs);
2063:     mbs  = a->compressedrow.nrows;
2064:     ii   = a->compressedrow.i;
2065:     ridx = a->compressedrow.rindex;
2066:   } else {
2067:     ii = a->i;
2068:   }

2070:   for (i = 0; i < mbs; i++) {
2071:     n = ii[1] - ii[0];
2072:     ii++;
2073:     if (!usecprow) {
2074:       sum = y[i];
2075:     } else {
2076:       sum = y[ridx[i]];
2077:     }
2078:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2079:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2080:     PetscSparseDensePlusDot(sum, x, v, idx, n);
2081:     v += n;
2082:     idx += n;
2083:     if (usecprow) {
2084:       z[ridx[i]] = sum;
2085:     } else {
2086:       z[i] = sum;
2087:     }
2088:   }
2089:   VecRestoreArrayRead(xx, &x);
2090:   VecRestoreArrayPair(yy, zz, &y, &z);
2091:   PetscLogFlops(2.0 * a->nz);
2092:   return 0;
2093: }

2095: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2096: {
2097:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2098:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2099:   const PetscScalar *x, *xb;
2100:   PetscScalar        x1, x2, *yarray, *zarray;
2101:   const MatScalar   *v;
2102:   PetscInt           mbs = a->mbs, i, n, j;
2103:   const PetscInt    *idx, *ii, *ridx = NULL;
2104:   PetscBool          usecprow = a->compressedrow.use;

2106:   VecGetArrayRead(xx, &x);
2107:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2109:   idx = a->j;
2110:   v   = a->a;
2111:   if (usecprow) {
2112:     if (zz != yy) PetscArraycpy(zarray, yarray, 2 * mbs);
2113:     mbs  = a->compressedrow.nrows;
2114:     ii   = a->compressedrow.i;
2115:     ridx = a->compressedrow.rindex;
2116:   } else {
2117:     ii = a->i;
2118:     y  = yarray;
2119:     z  = zarray;
2120:   }

2122:   for (i = 0; i < mbs; i++) {
2123:     n = ii[1] - ii[0];
2124:     ii++;
2125:     if (usecprow) {
2126:       z = zarray + 2 * ridx[i];
2127:       y = yarray + 2 * ridx[i];
2128:     }
2129:     sum1 = y[0];
2130:     sum2 = y[1];
2131:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2132:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2133:     for (j = 0; j < n; j++) {
2134:       xb = x + 2 * (*idx++);
2135:       x1 = xb[0];
2136:       x2 = xb[1];

2138:       sum1 += v[0] * x1 + v[2] * x2;
2139:       sum2 += v[1] * x1 + v[3] * x2;
2140:       v += 4;
2141:     }
2142:     z[0] = sum1;
2143:     z[1] = sum2;
2144:     if (!usecprow) {
2145:       z += 2;
2146:       y += 2;
2147:     }
2148:   }
2149:   VecRestoreArrayRead(xx, &x);
2150:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2151:   PetscLogFlops(4.0 * a->nz);
2152:   return 0;
2153: }

2155: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2156: {
2157:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2158:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2159:   const PetscScalar *x, *xb;
2160:   const MatScalar   *v;
2161:   PetscInt           mbs = a->mbs, i, j, n;
2162:   const PetscInt    *idx, *ii, *ridx = NULL;
2163:   PetscBool          usecprow = a->compressedrow.use;

2165:   VecGetArrayRead(xx, &x);
2166:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2168:   idx = a->j;
2169:   v   = a->a;
2170:   if (usecprow) {
2171:     if (zz != yy) PetscArraycpy(zarray, yarray, 3 * mbs);
2172:     mbs  = a->compressedrow.nrows;
2173:     ii   = a->compressedrow.i;
2174:     ridx = a->compressedrow.rindex;
2175:   } else {
2176:     ii = a->i;
2177:     y  = yarray;
2178:     z  = zarray;
2179:   }

2181:   for (i = 0; i < mbs; i++) {
2182:     n = ii[1] - ii[0];
2183:     ii++;
2184:     if (usecprow) {
2185:       z = zarray + 3 * ridx[i];
2186:       y = yarray + 3 * ridx[i];
2187:     }
2188:     sum1 = y[0];
2189:     sum2 = y[1];
2190:     sum3 = y[2];
2191:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2192:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2193:     for (j = 0; j < n; j++) {
2194:       xb = x + 3 * (*idx++);
2195:       x1 = xb[0];
2196:       x2 = xb[1];
2197:       x3 = xb[2];
2198:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2199:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2200:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2201:       v += 9;
2202:     }
2203:     z[0] = sum1;
2204:     z[1] = sum2;
2205:     z[2] = sum3;
2206:     if (!usecprow) {
2207:       z += 3;
2208:       y += 3;
2209:     }
2210:   }
2211:   VecRestoreArrayRead(xx, &x);
2212:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2213:   PetscLogFlops(18.0 * a->nz);
2214:   return 0;
2215: }

2217: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2218: {
2219:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2220:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2221:   const PetscScalar *x, *xb;
2222:   const MatScalar   *v;
2223:   PetscInt           mbs = a->mbs, i, j, n;
2224:   const PetscInt    *idx, *ii, *ridx = NULL;
2225:   PetscBool          usecprow = a->compressedrow.use;

2227:   VecGetArrayRead(xx, &x);
2228:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2230:   idx = a->j;
2231:   v   = a->a;
2232:   if (usecprow) {
2233:     if (zz != yy) PetscArraycpy(zarray, yarray, 4 * mbs);
2234:     mbs  = a->compressedrow.nrows;
2235:     ii   = a->compressedrow.i;
2236:     ridx = a->compressedrow.rindex;
2237:   } else {
2238:     ii = a->i;
2239:     y  = yarray;
2240:     z  = zarray;
2241:   }

2243:   for (i = 0; i < mbs; i++) {
2244:     n = ii[1] - ii[0];
2245:     ii++;
2246:     if (usecprow) {
2247:       z = zarray + 4 * ridx[i];
2248:       y = yarray + 4 * ridx[i];
2249:     }
2250:     sum1 = y[0];
2251:     sum2 = y[1];
2252:     sum3 = y[2];
2253:     sum4 = y[3];
2254:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2255:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2256:     for (j = 0; j < n; j++) {
2257:       xb = x + 4 * (*idx++);
2258:       x1 = xb[0];
2259:       x2 = xb[1];
2260:       x3 = xb[2];
2261:       x4 = xb[3];
2262:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2263:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2264:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2265:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2266:       v += 16;
2267:     }
2268:     z[0] = sum1;
2269:     z[1] = sum2;
2270:     z[2] = sum3;
2271:     z[3] = sum4;
2272:     if (!usecprow) {
2273:       z += 4;
2274:       y += 4;
2275:     }
2276:   }
2277:   VecRestoreArrayRead(xx, &x);
2278:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2279:   PetscLogFlops(32.0 * a->nz);
2280:   return 0;
2281: }

2283: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2284: {
2285:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2286:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2287:   const PetscScalar *x, *xb;
2288:   PetscScalar       *yarray, *zarray;
2289:   const MatScalar   *v;
2290:   PetscInt           mbs = a->mbs, i, j, n;
2291:   const PetscInt    *idx, *ii, *ridx = NULL;
2292:   PetscBool          usecprow = a->compressedrow.use;

2294:   VecGetArrayRead(xx, &x);
2295:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2297:   idx = a->j;
2298:   v   = a->a;
2299:   if (usecprow) {
2300:     if (zz != yy) PetscArraycpy(zarray, yarray, 5 * mbs);
2301:     mbs  = a->compressedrow.nrows;
2302:     ii   = a->compressedrow.i;
2303:     ridx = a->compressedrow.rindex;
2304:   } else {
2305:     ii = a->i;
2306:     y  = yarray;
2307:     z  = zarray;
2308:   }

2310:   for (i = 0; i < mbs; i++) {
2311:     n = ii[1] - ii[0];
2312:     ii++;
2313:     if (usecprow) {
2314:       z = zarray + 5 * ridx[i];
2315:       y = yarray + 5 * ridx[i];
2316:     }
2317:     sum1 = y[0];
2318:     sum2 = y[1];
2319:     sum3 = y[2];
2320:     sum4 = y[3];
2321:     sum5 = y[4];
2322:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2323:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2324:     for (j = 0; j < n; j++) {
2325:       xb = x + 5 * (*idx++);
2326:       x1 = xb[0];
2327:       x2 = xb[1];
2328:       x3 = xb[2];
2329:       x4 = xb[3];
2330:       x5 = xb[4];
2331:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2332:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2333:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2334:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2335:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2336:       v += 25;
2337:     }
2338:     z[0] = sum1;
2339:     z[1] = sum2;
2340:     z[2] = sum3;
2341:     z[3] = sum4;
2342:     z[4] = sum5;
2343:     if (!usecprow) {
2344:       z += 5;
2345:       y += 5;
2346:     }
2347:   }
2348:   VecRestoreArrayRead(xx, &x);
2349:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2350:   PetscLogFlops(50.0 * a->nz);
2351:   return 0;
2352: }

2354: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2355: {
2356:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2357:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2358:   const PetscScalar *x, *xb;
2359:   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2360:   const MatScalar   *v;
2361:   PetscInt           mbs = a->mbs, i, j, n;
2362:   const PetscInt    *idx, *ii, *ridx = NULL;
2363:   PetscBool          usecprow = a->compressedrow.use;

2365:   VecGetArrayRead(xx, &x);
2366:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2368:   idx = a->j;
2369:   v   = a->a;
2370:   if (usecprow) {
2371:     if (zz != yy) PetscArraycpy(zarray, yarray, 6 * mbs);
2372:     mbs  = a->compressedrow.nrows;
2373:     ii   = a->compressedrow.i;
2374:     ridx = a->compressedrow.rindex;
2375:   } else {
2376:     ii = a->i;
2377:     y  = yarray;
2378:     z  = zarray;
2379:   }

2381:   for (i = 0; i < mbs; i++) {
2382:     n = ii[1] - ii[0];
2383:     ii++;
2384:     if (usecprow) {
2385:       z = zarray + 6 * ridx[i];
2386:       y = yarray + 6 * ridx[i];
2387:     }
2388:     sum1 = y[0];
2389:     sum2 = y[1];
2390:     sum3 = y[2];
2391:     sum4 = y[3];
2392:     sum5 = y[4];
2393:     sum6 = y[5];
2394:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2395:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2396:     for (j = 0; j < n; j++) {
2397:       xb = x + 6 * (*idx++);
2398:       x1 = xb[0];
2399:       x2 = xb[1];
2400:       x3 = xb[2];
2401:       x4 = xb[3];
2402:       x5 = xb[4];
2403:       x6 = xb[5];
2404:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2405:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2406:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2407:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2408:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2409:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2410:       v += 36;
2411:     }
2412:     z[0] = sum1;
2413:     z[1] = sum2;
2414:     z[2] = sum3;
2415:     z[3] = sum4;
2416:     z[4] = sum5;
2417:     z[5] = sum6;
2418:     if (!usecprow) {
2419:       z += 6;
2420:       y += 6;
2421:     }
2422:   }
2423:   VecRestoreArrayRead(xx, &x);
2424:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2425:   PetscLogFlops(72.0 * a->nz);
2426:   return 0;
2427: }

2429: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2430: {
2431:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2432:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2433:   const PetscScalar *x, *xb;
2434:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2435:   const MatScalar   *v;
2436:   PetscInt           mbs = a->mbs, i, j, n;
2437:   const PetscInt    *idx, *ii, *ridx = NULL;
2438:   PetscBool          usecprow = a->compressedrow.use;

2440:   VecGetArrayRead(xx, &x);
2441:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2443:   idx = a->j;
2444:   v   = a->a;
2445:   if (usecprow) {
2446:     if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2447:     mbs  = a->compressedrow.nrows;
2448:     ii   = a->compressedrow.i;
2449:     ridx = a->compressedrow.rindex;
2450:   } else {
2451:     ii = a->i;
2452:     y  = yarray;
2453:     z  = zarray;
2454:   }

2456:   for (i = 0; i < mbs; i++) {
2457:     n = ii[1] - ii[0];
2458:     ii++;
2459:     if (usecprow) {
2460:       z = zarray + 7 * ridx[i];
2461:       y = yarray + 7 * ridx[i];
2462:     }
2463:     sum1 = y[0];
2464:     sum2 = y[1];
2465:     sum3 = y[2];
2466:     sum4 = y[3];
2467:     sum5 = y[4];
2468:     sum6 = y[5];
2469:     sum7 = y[6];
2470:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2471:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2472:     for (j = 0; j < n; j++) {
2473:       xb = x + 7 * (*idx++);
2474:       x1 = xb[0];
2475:       x2 = xb[1];
2476:       x3 = xb[2];
2477:       x4 = xb[3];
2478:       x5 = xb[4];
2479:       x6 = xb[5];
2480:       x7 = xb[6];
2481:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2482:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2483:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2484:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2485:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2486:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2487:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2488:       v += 49;
2489:     }
2490:     z[0] = sum1;
2491:     z[1] = sum2;
2492:     z[2] = sum3;
2493:     z[3] = sum4;
2494:     z[4] = sum5;
2495:     z[5] = sum6;
2496:     z[6] = sum7;
2497:     if (!usecprow) {
2498:       z += 7;
2499:       y += 7;
2500:     }
2501:   }
2502:   VecRestoreArrayRead(xx, &x);
2503:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2504:   PetscLogFlops(98.0 * a->nz);
2505:   return 0;
2506: }

2508: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2509: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2510: {
2511:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2512:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2513:   const PetscScalar *x, *xb;
2514:   const MatScalar   *v;
2515:   PetscInt           mbs, i, j, n;
2516:   PetscInt           k;
2517:   PetscBool          usecprow = a->compressedrow.use;
2518:   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;

2520:   __m256d a0, a1, a2, a3, a4, a5;
2521:   __m256d w0, w1, w2, w3;
2522:   __m256d z0, z1, z2;
2523:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

2525:   VecCopy(yy, zz);
2526:   VecGetArrayRead(xx, &x);
2527:   VecGetArray(zz, &zarray);

2529:   idx = a->j;
2530:   v   = a->a;
2531:   if (usecprow) {
2532:     mbs  = a->compressedrow.nrows;
2533:     ii   = a->compressedrow.i;
2534:     ridx = a->compressedrow.rindex;
2535:   } else {
2536:     mbs = a->mbs;
2537:     ii  = a->i;
2538:     z   = zarray;
2539:   }

2541:   if (!a->mult_work) {
2542:     k = PetscMax(A->rmap->n, A->cmap->n);
2543:     PetscMalloc1(k + 1, &a->mult_work);
2544:   }

2546:   work = a->mult_work;
2547:   for (i = 0; i < mbs; i++) {
2548:     n = ii[1] - ii[0];
2549:     ii++;
2550:     workt = work;
2551:     for (j = 0; j < n; j++) {
2552:       xb = x + bs * (*idx++);
2553:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2554:       workt += bs;
2555:     }
2556:     if (usecprow) z = zarray + bs * ridx[i];

2558:     z0 = _mm256_loadu_pd(&z[0]);
2559:     z1 = _mm256_loadu_pd(&z[4]);
2560:     z2 = _mm256_set1_pd(z[8]);

2562:     for (j = 0; j < n; j++) {
2563:       /* first column of a */
2564:       w0 = _mm256_set1_pd(work[j * 9]);
2565:       a0 = _mm256_loadu_pd(&v[j * 81]);
2566:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2567:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2568:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2569:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2570:       z2 = _mm256_fmadd_pd(a2, w0, z2);

2572:       /* second column of a */
2573:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2574:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2575:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2576:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2577:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2578:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2579:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2581:       /* third column of a */
2582:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2583:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2584:       z0 = _mm256_fmadd_pd(a3, w2, z0);
2585:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2586:       z1 = _mm256_fmadd_pd(a4, w2, z1);
2587:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2588:       z2 = _mm256_fmadd_pd(a5, w2, z2);

2590:       /* fourth column of a */
2591:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2592:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2593:       z0 = _mm256_fmadd_pd(a0, w3, z0);
2594:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2595:       z1 = _mm256_fmadd_pd(a1, w3, z1);
2596:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2597:       z2 = _mm256_fmadd_pd(a2, w3, z2);

2599:       /* fifth column of a */
2600:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2601:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2602:       z0 = _mm256_fmadd_pd(a3, w0, z0);
2603:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2604:       z1 = _mm256_fmadd_pd(a4, w0, z1);
2605:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2606:       z2 = _mm256_fmadd_pd(a5, w0, z2);

2608:       /* sixth column of a */
2609:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2610:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2611:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2612:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2613:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2614:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2615:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2617:       /* seventh column of a */
2618:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2619:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2620:       z0 = _mm256_fmadd_pd(a0, w2, z0);
2621:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2622:       z1 = _mm256_fmadd_pd(a1, w2, z1);
2623:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2624:       z2 = _mm256_fmadd_pd(a2, w2, z2);

2626:       /* eighth column of a */
2627:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2628:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2629:       z0 = _mm256_fmadd_pd(a3, w3, z0);
2630:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2631:       z1 = _mm256_fmadd_pd(a4, w3, z1);
2632:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2633:       z2 = _mm256_fmadd_pd(a5, w3, z2);

2635:       /* ninth column of a */
2636:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2637:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2638:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2639:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2640:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2641:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2642:       z2 = _mm256_fmadd_pd(a2, w0, z2);
2643:     }

2645:     _mm256_storeu_pd(&z[0], z0);
2646:     _mm256_storeu_pd(&z[4], z1);
2647:     _mm256_maskstore_pd(&z[8], mask1, z2);

2649:     v += n * bs2;
2650:     if (!usecprow) z += bs;
2651:   }
2652:   VecRestoreArrayRead(xx, &x);
2653:   VecRestoreArray(zz, &zarray);
2654:   PetscLogFlops(162.0 * a->nz);
2655:   return 0;
2656: }
2657: #endif

2659: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2660: {
2661:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2662:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2663:   const PetscScalar *x, *xb;
2664:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2665:   const MatScalar   *v;
2666:   PetscInt           mbs = a->mbs, i, j, n;
2667:   const PetscInt    *idx, *ii, *ridx = NULL;
2668:   PetscBool          usecprow = a->compressedrow.use;

2670:   VecGetArrayRead(xx, &x);
2671:   VecGetArrayPair(yy, zz, &yarray, &zarray);

2673:   idx = a->j;
2674:   v   = a->a;
2675:   if (usecprow) {
2676:     if (zz != yy) PetscArraycpy(zarray, yarray, 7 * mbs);
2677:     mbs  = a->compressedrow.nrows;
2678:     ii   = a->compressedrow.i;
2679:     ridx = a->compressedrow.rindex;
2680:   } else {
2681:     ii = a->i;
2682:     y  = yarray;
2683:     z  = zarray;
2684:   }

2686:   for (i = 0; i < mbs; i++) {
2687:     n = ii[1] - ii[0];
2688:     ii++;
2689:     if (usecprow) {
2690:       z = zarray + 11 * ridx[i];
2691:       y = yarray + 11 * ridx[i];
2692:     }
2693:     sum1  = y[0];
2694:     sum2  = y[1];
2695:     sum3  = y[2];
2696:     sum4  = y[3];
2697:     sum5  = y[4];
2698:     sum6  = y[5];
2699:     sum7  = y[6];
2700:     sum8  = y[7];
2701:     sum9  = y[8];
2702:     sum10 = y[9];
2703:     sum11 = y[10];
2704:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2705:     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2706:     for (j = 0; j < n; j++) {
2707:       xb  = x + 11 * (*idx++);
2708:       x1  = xb[0];
2709:       x2  = xb[1];
2710:       x3  = xb[2];
2711:       x4  = xb[3];
2712:       x5  = xb[4];
2713:       x6  = xb[5];
2714:       x7  = xb[6];
2715:       x8  = xb[7];
2716:       x9  = xb[8];
2717:       x10 = xb[9];
2718:       x11 = xb[10];
2719:       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2720:       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2721:       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2722:       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2723:       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2724:       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2725:       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2726:       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2727:       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2728:       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2729:       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2730:       v += 121;
2731:     }
2732:     z[0]  = sum1;
2733:     z[1]  = sum2;
2734:     z[2]  = sum3;
2735:     z[3]  = sum4;
2736:     z[4]  = sum5;
2737:     z[5]  = sum6;
2738:     z[6]  = sum7;
2739:     z[7]  = sum8;
2740:     z[8]  = sum9;
2741:     z[9]  = sum10;
2742:     z[10] = sum11;
2743:     if (!usecprow) {
2744:       z += 11;
2745:       y += 11;
2746:     }
2747:   }
2748:   VecRestoreArrayRead(xx, &x);
2749:   VecRestoreArrayPair(yy, zz, &yarray, &zarray);
2750:   PetscLogFlops(242.0 * a->nz);
2751:   return 0;
2752: }

2754: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2755: {
2756:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2757:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2758:   const PetscScalar *x, *xb;
2759:   const MatScalar   *v;
2760:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2761:   PetscInt           ncols, k;
2762:   const PetscInt    *ridx     = NULL, *idx, *ii;
2763:   PetscBool          usecprow = a->compressedrow.use;

2765:   VecCopy(yy, zz);
2766:   VecGetArrayRead(xx, &x);
2767:   VecGetArray(zz, &zarray);

2769:   idx = a->j;
2770:   v   = a->a;
2771:   if (usecprow) {
2772:     mbs  = a->compressedrow.nrows;
2773:     ii   = a->compressedrow.i;
2774:     ridx = a->compressedrow.rindex;
2775:   } else {
2776:     mbs = a->mbs;
2777:     ii  = a->i;
2778:     z   = zarray;
2779:   }

2781:   if (!a->mult_work) {
2782:     k = PetscMax(A->rmap->n, A->cmap->n);
2783:     PetscMalloc1(k + 1, &a->mult_work);
2784:   }
2785:   work = a->mult_work;
2786:   for (i = 0; i < mbs; i++) {
2787:     n = ii[1] - ii[0];
2788:     ii++;
2789:     ncols = n * bs;
2790:     workt = work;
2791:     for (j = 0; j < n; j++) {
2792:       xb = x + bs * (*idx++);
2793:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2794:       workt += bs;
2795:     }
2796:     if (usecprow) z = zarray + bs * ridx[i];
2797:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2798:     v += n * bs2;
2799:     if (!usecprow) z += bs;
2800:   }
2801:   VecRestoreArrayRead(xx, &x);
2802:   VecRestoreArray(zz, &zarray);
2803:   PetscLogFlops(2.0 * a->nz * bs2);
2804:   return 0;
2805: }

2807: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2808: {
2809:   PetscScalar zero = 0.0;

2811:   VecSet(zz, zero);
2812:   MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2813:   return 0;
2814: }

2816: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2817: {
2818:   PetscScalar zero = 0.0;

2820:   VecSet(zz, zero);
2821:   MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz);
2822:   return 0;
2823: }

2825: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2826: {
2827:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2828:   PetscScalar       *z, x1, x2, x3, x4, x5;
2829:   const PetscScalar *x, *xb = NULL;
2830:   const MatScalar   *v;
2831:   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2832:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2833:   Mat_CompressedRow  cprow    = a->compressedrow;
2834:   PetscBool          usecprow = cprow.use;

2836:   if (yy != zz) VecCopy(yy, zz);
2837:   VecGetArrayRead(xx, &x);
2838:   VecGetArray(zz, &z);

2840:   idx = a->j;
2841:   v   = a->a;
2842:   if (usecprow) {
2843:     mbs  = cprow.nrows;
2844:     ii   = cprow.i;
2845:     ridx = cprow.rindex;
2846:   } else {
2847:     mbs = a->mbs;
2848:     ii  = a->i;
2849:     xb  = x;
2850:   }

2852:   switch (bs) {
2853:   case 1:
2854:     for (i = 0; i < mbs; i++) {
2855:       if (usecprow) xb = x + ridx[i];
2856:       x1 = xb[0];
2857:       ib = idx + ii[0];
2858:       n  = ii[1] - ii[0];
2859:       ii++;
2860:       for (j = 0; j < n; j++) {
2861:         rval = ib[j];
2862:         z[rval] += PetscConj(*v) * x1;
2863:         v++;
2864:       }
2865:       if (!usecprow) xb++;
2866:     }
2867:     break;
2868:   case 2:
2869:     for (i = 0; i < mbs; i++) {
2870:       if (usecprow) xb = x + 2 * ridx[i];
2871:       x1 = xb[0];
2872:       x2 = xb[1];
2873:       ib = idx + ii[0];
2874:       n  = ii[1] - ii[0];
2875:       ii++;
2876:       for (j = 0; j < n; j++) {
2877:         rval = ib[j] * 2;
2878:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2879:         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2880:         v += 4;
2881:       }
2882:       if (!usecprow) xb += 2;
2883:     }
2884:     break;
2885:   case 3:
2886:     for (i = 0; i < mbs; i++) {
2887:       if (usecprow) xb = x + 3 * ridx[i];
2888:       x1 = xb[0];
2889:       x2 = xb[1];
2890:       x3 = xb[2];
2891:       ib = idx + ii[0];
2892:       n  = ii[1] - ii[0];
2893:       ii++;
2894:       for (j = 0; j < n; j++) {
2895:         rval = ib[j] * 3;
2896:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2897:         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2898:         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2899:         v += 9;
2900:       }
2901:       if (!usecprow) xb += 3;
2902:     }
2903:     break;
2904:   case 4:
2905:     for (i = 0; i < mbs; i++) {
2906:       if (usecprow) xb = x + 4 * ridx[i];
2907:       x1 = xb[0];
2908:       x2 = xb[1];
2909:       x3 = xb[2];
2910:       x4 = xb[3];
2911:       ib = idx + ii[0];
2912:       n  = ii[1] - ii[0];
2913:       ii++;
2914:       for (j = 0; j < n; j++) {
2915:         rval = ib[j] * 4;
2916:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2917:         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2918:         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2919:         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2920:         v += 16;
2921:       }
2922:       if (!usecprow) xb += 4;
2923:     }
2924:     break;
2925:   case 5:
2926:     for (i = 0; i < mbs; i++) {
2927:       if (usecprow) xb = x + 5 * ridx[i];
2928:       x1 = xb[0];
2929:       x2 = xb[1];
2930:       x3 = xb[2];
2931:       x4 = xb[3];
2932:       x5 = xb[4];
2933:       ib = idx + ii[0];
2934:       n  = ii[1] - ii[0];
2935:       ii++;
2936:       for (j = 0; j < n; j++) {
2937:         rval = ib[j] * 5;
2938:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2939:         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2940:         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2941:         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2942:         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2943:         v += 25;
2944:       }
2945:       if (!usecprow) xb += 5;
2946:     }
2947:     break;
2948:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2949:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2950: #if 0
2951:     {
2952:       PetscInt          ncols,k,bs2=a->bs2;
2953:       PetscScalar       *work,*workt,zb;
2954:       const PetscScalar *xtmp;
2955:       if (!a->mult_work) {
2956:         k    = PetscMax(A->rmap->n,A->cmap->n);
2957:         PetscMalloc1(k+1,&a->mult_work);
2958:       }
2959:       work = a->mult_work;
2960:       xtmp = x;
2961:       for (i=0; i<mbs; i++) {
2962:         n     = ii[1] - ii[0]; ii++;
2963:         ncols = n*bs;
2964:         PetscArrayzero(work,ncols);
2965:         if (usecprow) xtmp = x + bs*ridx[i];
2966:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2967:         v += n*bs2;
2968:         if (!usecprow) xtmp += bs;
2969:         workt = work;
2970:         for (j=0; j<n; j++) {
2971:           zb = z + bs*(*idx++);
2972:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2973:           workt += bs;
2974:         }
2975:       }
2976:     }
2977: #endif
2978:   }
2979:   VecRestoreArrayRead(xx, &x);
2980:   VecRestoreArray(zz, &z);
2981:   PetscLogFlops(2.0 * a->nz * a->bs2);
2982:   return 0;
2983: }

2985: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2986: {
2987:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2988:   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2989:   const PetscScalar *x, *xb = NULL;
2990:   const MatScalar   *v;
2991:   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2992:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2993:   Mat_CompressedRow  cprow    = a->compressedrow;
2994:   PetscBool          usecprow = cprow.use;

2996:   if (yy != zz) VecCopy(yy, zz);
2997:   VecGetArrayRead(xx, &x);
2998:   VecGetArray(zz, &z);

3000:   idx = a->j;
3001:   v   = a->a;
3002:   if (usecprow) {
3003:     mbs  = cprow.nrows;
3004:     ii   = cprow.i;
3005:     ridx = cprow.rindex;
3006:   } else {
3007:     mbs = a->mbs;
3008:     ii  = a->i;
3009:     xb  = x;
3010:   }

3012:   switch (bs) {
3013:   case 1:
3014:     for (i = 0; i < mbs; i++) {
3015:       if (usecprow) xb = x + ridx[i];
3016:       x1 = xb[0];
3017:       ib = idx + ii[0];
3018:       n  = ii[1] - ii[0];
3019:       ii++;
3020:       for (j = 0; j < n; j++) {
3021:         rval = ib[j];
3022:         z[rval] += *v * x1;
3023:         v++;
3024:       }
3025:       if (!usecprow) xb++;
3026:     }
3027:     break;
3028:   case 2:
3029:     for (i = 0; i < mbs; i++) {
3030:       if (usecprow) xb = x + 2 * ridx[i];
3031:       x1 = xb[0];
3032:       x2 = xb[1];
3033:       ib = idx + ii[0];
3034:       n  = ii[1] - ii[0];
3035:       ii++;
3036:       for (j = 0; j < n; j++) {
3037:         rval = ib[j] * 2;
3038:         z[rval++] += v[0] * x1 + v[1] * x2;
3039:         z[rval++] += v[2] * x1 + v[3] * x2;
3040:         v += 4;
3041:       }
3042:       if (!usecprow) xb += 2;
3043:     }
3044:     break;
3045:   case 3:
3046:     for (i = 0; i < mbs; i++) {
3047:       if (usecprow) xb = x + 3 * ridx[i];
3048:       x1 = xb[0];
3049:       x2 = xb[1];
3050:       x3 = xb[2];
3051:       ib = idx + ii[0];
3052:       n  = ii[1] - ii[0];
3053:       ii++;
3054:       for (j = 0; j < n; j++) {
3055:         rval = ib[j] * 3;
3056:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3057:         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3058:         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3059:         v += 9;
3060:       }
3061:       if (!usecprow) xb += 3;
3062:     }
3063:     break;
3064:   case 4:
3065:     for (i = 0; i < mbs; i++) {
3066:       if (usecprow) xb = x + 4 * ridx[i];
3067:       x1 = xb[0];
3068:       x2 = xb[1];
3069:       x3 = xb[2];
3070:       x4 = xb[3];
3071:       ib = idx + ii[0];
3072:       n  = ii[1] - ii[0];
3073:       ii++;
3074:       for (j = 0; j < n; j++) {
3075:         rval = ib[j] * 4;
3076:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3077:         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3078:         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3079:         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3080:         v += 16;
3081:       }
3082:       if (!usecprow) xb += 4;
3083:     }
3084:     break;
3085:   case 5:
3086:     for (i = 0; i < mbs; i++) {
3087:       if (usecprow) xb = x + 5 * ridx[i];
3088:       x1 = xb[0];
3089:       x2 = xb[1];
3090:       x3 = xb[2];
3091:       x4 = xb[3];
3092:       x5 = xb[4];
3093:       ib = idx + ii[0];
3094:       n  = ii[1] - ii[0];
3095:       ii++;
3096:       for (j = 0; j < n; j++) {
3097:         rval = ib[j] * 5;
3098:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3099:         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3100:         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3101:         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3102:         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3103:         v += 25;
3104:       }
3105:       if (!usecprow) xb += 5;
3106:     }
3107:     break;
3108:   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3109:     PetscInt           ncols, k;
3110:     PetscScalar       *work, *workt;
3111:     const PetscScalar *xtmp;
3112:     if (!a->mult_work) {
3113:       k = PetscMax(A->rmap->n, A->cmap->n);
3114:       PetscMalloc1(k + 1, &a->mult_work);
3115:     }
3116:     work = a->mult_work;
3117:     xtmp = x;
3118:     for (i = 0; i < mbs; i++) {
3119:       n = ii[1] - ii[0];
3120:       ii++;
3121:       ncols = n * bs;
3122:       PetscArrayzero(work, ncols);
3123:       if (usecprow) xtmp = x + bs * ridx[i];
3124:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3125:       v += n * bs2;
3126:       if (!usecprow) xtmp += bs;
3127:       workt = work;
3128:       for (j = 0; j < n; j++) {
3129:         zb = z + bs * (*idx++);
3130:         for (k = 0; k < bs; k++) zb[k] += workt[k];
3131:         workt += bs;
3132:       }
3133:     }
3134:   }
3135:   }
3136:   VecRestoreArrayRead(xx, &x);
3137:   VecRestoreArray(zz, &z);
3138:   PetscLogFlops(2.0 * a->nz * a->bs2);
3139:   return 0;
3140: }

3142: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3143: {
3144:   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3145:   PetscInt     totalnz = a->bs2 * a->nz;
3146:   PetscScalar  oalpha  = alpha;
3147:   PetscBLASInt one     = 1, tnz;

3149:   PetscBLASIntCast(totalnz, &tnz);
3150:   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3151:   PetscLogFlops(totalnz);
3152:   return 0;
3153: }

3155: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3156: {
3157:   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3158:   MatScalar   *v   = a->a;
3159:   PetscReal    sum = 0.0;
3160:   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;

3162:   if (type == NORM_FROBENIUS) {
3163: #if defined(PETSC_USE_REAL___FP16)
3164:     PetscBLASInt one = 1, cnt = bs2 * nz;
3165:     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3166: #else
3167:     for (i = 0; i < bs2 * nz; i++) {
3168:       sum += PetscRealPart(PetscConj(*v) * (*v));
3169:       v++;
3170:     }
3171: #endif
3172:     *norm = PetscSqrtReal(sum);
3173:     PetscLogFlops(2.0 * bs2 * nz);
3174:   } else if (type == NORM_1) { /* maximum column sum */
3175:     PetscReal *tmp;
3176:     PetscInt  *bcol = a->j;
3177:     PetscCalloc1(A->cmap->n + 1, &tmp);
3178:     for (i = 0; i < nz; i++) {
3179:       for (j = 0; j < bs; j++) {
3180:         k1 = bs * (*bcol) + j; /* column index */
3181:         for (k = 0; k < bs; k++) {
3182:           tmp[k1] += PetscAbsScalar(*v);
3183:           v++;
3184:         }
3185:       }
3186:       bcol++;
3187:     }
3188:     *norm = 0.0;
3189:     for (j = 0; j < A->cmap->n; j++) {
3190:       if (tmp[j] > *norm) *norm = tmp[j];
3191:     }
3192:     PetscFree(tmp);
3193:     PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3194:   } else if (type == NORM_INFINITY) { /* maximum row sum */
3195:     *norm = 0.0;
3196:     for (k = 0; k < bs; k++) {
3197:       for (j = 0; j < a->mbs; j++) {
3198:         v   = a->a + bs2 * a->i[j] + k;
3199:         sum = 0.0;
3200:         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3201:           for (k1 = 0; k1 < bs; k1++) {
3202:             sum += PetscAbsScalar(*v);
3203:             v += bs;
3204:           }
3205:         }
3206:         if (sum > *norm) *norm = sum;
3207:       }
3208:     }
3209:     PetscLogFlops(PetscMax(bs2 * nz - 1, 0));
3210:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3211:   return 0;
3212: }

3214: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3215: {
3216:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;

3218:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3219:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3220:     *flg = PETSC_FALSE;
3221:     return 0;
3222:   }

3224:   /* if the a->i are the same */
3225:   PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
3226:   if (!*flg) return 0;

3228:   /* if a->j are the same */
3229:   PetscArraycmp(a->j, b->j, a->nz, flg);
3230:   if (!*flg) return 0;

3232:   /* if a->a are the same */
3233:   PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg);
3234:   return 0;
3235: }

3237: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3238: {
3239:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3240:   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3241:   PetscScalar *x, zero = 0.0;
3242:   MatScalar   *aa, *aa_j;

3245:   bs   = A->rmap->bs;
3246:   aa   = a->a;
3247:   ai   = a->i;
3248:   aj   = a->j;
3249:   ambs = a->mbs;
3250:   bs2  = a->bs2;

3252:   VecSet(v, zero);
3253:   VecGetArray(v, &x);
3254:   VecGetLocalSize(v, &n);
3256:   for (i = 0; i < ambs; i++) {
3257:     for (j = ai[i]; j < ai[i + 1]; j++) {
3258:       if (aj[j] == i) {
3259:         row  = i * bs;
3260:         aa_j = aa + j * bs2;
3261:         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3262:         break;
3263:       }
3264:     }
3265:   }
3266:   VecRestoreArray(v, &x);
3267:   return 0;
3268: }

3270: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3271: {
3272:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3273:   const PetscScalar *l, *r, *li, *ri;
3274:   PetscScalar        x;
3275:   MatScalar         *aa, *v;
3276:   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3277:   const PetscInt    *ai, *aj;

3279:   ai  = a->i;
3280:   aj  = a->j;
3281:   aa  = a->a;
3282:   m   = A->rmap->n;
3283:   n   = A->cmap->n;
3284:   bs  = A->rmap->bs;
3285:   mbs = a->mbs;
3286:   bs2 = a->bs2;
3287:   if (ll) {
3288:     VecGetArrayRead(ll, &l);
3289:     VecGetLocalSize(ll, &lm);
3291:     for (i = 0; i < mbs; i++) { /* for each block row */
3292:       M  = ai[i + 1] - ai[i];
3293:       li = l + i * bs;
3294:       v  = aa + bs2 * ai[i];
3295:       for (j = 0; j < M; j++) { /* for each block */
3296:         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3297:       }
3298:     }
3299:     VecRestoreArrayRead(ll, &l);
3300:     PetscLogFlops(a->nz);
3301:   }

3303:   if (rr) {
3304:     VecGetArrayRead(rr, &r);
3305:     VecGetLocalSize(rr, &rn);
3307:     for (i = 0; i < mbs; i++) { /* for each block row */
3308:       iai = ai[i];
3309:       M   = ai[i + 1] - iai;
3310:       v   = aa + bs2 * iai;
3311:       for (j = 0; j < M; j++) { /* for each block */
3312:         ri = r + bs * aj[iai + j];
3313:         for (k = 0; k < bs; k++) {
3314:           x = ri[k];
3315:           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3316:           v += bs;
3317:         }
3318:       }
3319:     }
3320:     VecRestoreArrayRead(rr, &r);
3321:     PetscLogFlops(a->nz);
3322:   }
3323:   return 0;
3324: }

3326: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3327: {
3328:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3330:   info->block_size   = a->bs2;
3331:   info->nz_allocated = a->bs2 * a->maxnz;
3332:   info->nz_used      = a->bs2 * a->nz;
3333:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3334:   info->assemblies   = A->num_ass;
3335:   info->mallocs      = A->info.mallocs;
3336:   info->memory       = 0; /* REVIEW ME */
3337:   if (A->factortype) {
3338:     info->fill_ratio_given  = A->info.fill_ratio_given;
3339:     info->fill_ratio_needed = A->info.fill_ratio_needed;
3340:     info->factor_mallocs    = A->info.factor_mallocs;
3341:   } else {
3342:     info->fill_ratio_given  = 0;
3343:     info->fill_ratio_needed = 0;
3344:     info->factor_mallocs    = 0;
3345:   }
3346:   return 0;
3347: }

3349: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3350: {
3351:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3353:   PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
3354:   return 0;
3355: }

3357: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3358: {
3359:   MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
3360:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3361:   return 0;
3362: }

3364: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3365: {
3366:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3367:   PetscScalar       *z = NULL, sum1;
3368:   const PetscScalar *xb;
3369:   PetscScalar        x1;
3370:   const MatScalar   *v, *vv;
3371:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3372:   PetscBool          usecprow = a->compressedrow.use;

3374:   idx = a->j;
3375:   v   = a->a;
3376:   if (usecprow) {
3377:     mbs  = a->compressedrow.nrows;
3378:     ii   = a->compressedrow.i;
3379:     ridx = a->compressedrow.rindex;
3380:   } else {
3381:     mbs = a->mbs;
3382:     ii  = a->i;
3383:     z   = c;
3384:   }

3386:   for (i = 0; i < mbs; i++) {
3387:     n = ii[1] - ii[0];
3388:     ii++;
3389:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3390:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3391:     if (usecprow) z = c + ridx[i];
3392:     jj = idx;
3393:     vv = v;
3394:     for (k = 0; k < cn; k++) {
3395:       idx  = jj;
3396:       v    = vv;
3397:       sum1 = 0.0;
3398:       for (j = 0; j < n; j++) {
3399:         xb = b + (*idx++);
3400:         x1 = xb[0 + k * bm];
3401:         sum1 += v[0] * x1;
3402:         v += 1;
3403:       }
3404:       z[0 + k * cm] = sum1;
3405:     }
3406:     if (!usecprow) z += 1;
3407:   }
3408:   return 0;
3409: }

3411: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3412: {
3413:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3414:   PetscScalar       *z = NULL, sum1, sum2;
3415:   const PetscScalar *xb;
3416:   PetscScalar        x1, x2;
3417:   const MatScalar   *v, *vv;
3418:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3419:   PetscBool          usecprow = a->compressedrow.use;

3421:   idx = a->j;
3422:   v   = a->a;
3423:   if (usecprow) {
3424:     mbs  = a->compressedrow.nrows;
3425:     ii   = a->compressedrow.i;
3426:     ridx = a->compressedrow.rindex;
3427:   } else {
3428:     mbs = a->mbs;
3429:     ii  = a->i;
3430:     z   = c;
3431:   }

3433:   for (i = 0; i < mbs; i++) {
3434:     n = ii[1] - ii[0];
3435:     ii++;
3436:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3437:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3438:     if (usecprow) z = c + 2 * ridx[i];
3439:     jj = idx;
3440:     vv = v;
3441:     for (k = 0; k < cn; k++) {
3442:       idx  = jj;
3443:       v    = vv;
3444:       sum1 = 0.0;
3445:       sum2 = 0.0;
3446:       for (j = 0; j < n; j++) {
3447:         xb = b + 2 * (*idx++);
3448:         x1 = xb[0 + k * bm];
3449:         x2 = xb[1 + k * bm];
3450:         sum1 += v[0] * x1 + v[2] * x2;
3451:         sum2 += v[1] * x1 + v[3] * x2;
3452:         v += 4;
3453:       }
3454:       z[0 + k * cm] = sum1;
3455:       z[1 + k * cm] = sum2;
3456:     }
3457:     if (!usecprow) z += 2;
3458:   }
3459:   return 0;
3460: }

3462: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3463: {
3464:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3465:   PetscScalar       *z = NULL, sum1, sum2, sum3;
3466:   const PetscScalar *xb;
3467:   PetscScalar        x1, x2, x3;
3468:   const MatScalar   *v, *vv;
3469:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3470:   PetscBool          usecprow = a->compressedrow.use;

3472:   idx = a->j;
3473:   v   = a->a;
3474:   if (usecprow) {
3475:     mbs  = a->compressedrow.nrows;
3476:     ii   = a->compressedrow.i;
3477:     ridx = a->compressedrow.rindex;
3478:   } else {
3479:     mbs = a->mbs;
3480:     ii  = a->i;
3481:     z   = c;
3482:   }

3484:   for (i = 0; i < mbs; i++) {
3485:     n = ii[1] - ii[0];
3486:     ii++;
3487:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3488:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3489:     if (usecprow) z = c + 3 * ridx[i];
3490:     jj = idx;
3491:     vv = v;
3492:     for (k = 0; k < cn; k++) {
3493:       idx  = jj;
3494:       v    = vv;
3495:       sum1 = 0.0;
3496:       sum2 = 0.0;
3497:       sum3 = 0.0;
3498:       for (j = 0; j < n; j++) {
3499:         xb = b + 3 * (*idx++);
3500:         x1 = xb[0 + k * bm];
3501:         x2 = xb[1 + k * bm];
3502:         x3 = xb[2 + k * bm];
3503:         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3504:         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3505:         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3506:         v += 9;
3507:       }
3508:       z[0 + k * cm] = sum1;
3509:       z[1 + k * cm] = sum2;
3510:       z[2 + k * cm] = sum3;
3511:     }
3512:     if (!usecprow) z += 3;
3513:   }
3514:   return 0;
3515: }

3517: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3518: {
3519:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3520:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3521:   const PetscScalar *xb;
3522:   PetscScalar        x1, x2, x3, x4;
3523:   const MatScalar   *v, *vv;
3524:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3525:   PetscBool          usecprow = a->compressedrow.use;

3527:   idx = a->j;
3528:   v   = a->a;
3529:   if (usecprow) {
3530:     mbs  = a->compressedrow.nrows;
3531:     ii   = a->compressedrow.i;
3532:     ridx = a->compressedrow.rindex;
3533:   } else {
3534:     mbs = a->mbs;
3535:     ii  = a->i;
3536:     z   = c;
3537:   }

3539:   for (i = 0; i < mbs; i++) {
3540:     n = ii[1] - ii[0];
3541:     ii++;
3542:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3543:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3544:     if (usecprow) z = c + 4 * ridx[i];
3545:     jj = idx;
3546:     vv = v;
3547:     for (k = 0; k < cn; k++) {
3548:       idx  = jj;
3549:       v    = vv;
3550:       sum1 = 0.0;
3551:       sum2 = 0.0;
3552:       sum3 = 0.0;
3553:       sum4 = 0.0;
3554:       for (j = 0; j < n; j++) {
3555:         xb = b + 4 * (*idx++);
3556:         x1 = xb[0 + k * bm];
3557:         x2 = xb[1 + k * bm];
3558:         x3 = xb[2 + k * bm];
3559:         x4 = xb[3 + k * bm];
3560:         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3561:         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3562:         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3563:         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3564:         v += 16;
3565:       }
3566:       z[0 + k * cm] = sum1;
3567:       z[1 + k * cm] = sum2;
3568:       z[2 + k * cm] = sum3;
3569:       z[3 + k * cm] = sum4;
3570:     }
3571:     if (!usecprow) z += 4;
3572:   }
3573:   return 0;
3574: }

3576: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3577: {
3578:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3579:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3580:   const PetscScalar *xb;
3581:   PetscScalar        x1, x2, x3, x4, x5;
3582:   const MatScalar   *v, *vv;
3583:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3584:   PetscBool          usecprow = a->compressedrow.use;

3586:   idx = a->j;
3587:   v   = a->a;
3588:   if (usecprow) {
3589:     mbs  = a->compressedrow.nrows;
3590:     ii   = a->compressedrow.i;
3591:     ridx = a->compressedrow.rindex;
3592:   } else {
3593:     mbs = a->mbs;
3594:     ii  = a->i;
3595:     z   = c;
3596:   }

3598:   for (i = 0; i < mbs; i++) {
3599:     n = ii[1] - ii[0];
3600:     ii++;
3601:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3602:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3603:     if (usecprow) z = c + 5 * ridx[i];
3604:     jj = idx;
3605:     vv = v;
3606:     for (k = 0; k < cn; k++) {
3607:       idx  = jj;
3608:       v    = vv;
3609:       sum1 = 0.0;
3610:       sum2 = 0.0;
3611:       sum3 = 0.0;
3612:       sum4 = 0.0;
3613:       sum5 = 0.0;
3614:       for (j = 0; j < n; j++) {
3615:         xb = b + 5 * (*idx++);
3616:         x1 = xb[0 + k * bm];
3617:         x2 = xb[1 + k * bm];
3618:         x3 = xb[2 + k * bm];
3619:         x4 = xb[3 + k * bm];
3620:         x5 = xb[4 + k * bm];
3621:         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3622:         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3623:         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3624:         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3625:         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3626:         v += 25;
3627:       }
3628:       z[0 + k * cm] = sum1;
3629:       z[1 + k * cm] = sum2;
3630:       z[2 + k * cm] = sum3;
3631:       z[3 + k * cm] = sum4;
3632:       z[4 + k * cm] = sum5;
3633:     }
3634:     if (!usecprow) z += 5;
3635:   }
3636:   return 0;
3637: }

3639: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3640: {
3641:   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3642:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3643:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3644:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3645:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3646:   PetscBLASInt     bbs, bcn, bbm, bcm;
3647:   PetscScalar     *z = NULL;
3648:   PetscScalar     *c, *b;
3649:   const MatScalar *v;
3650:   const PetscInt  *idx, *ii, *ridx = NULL;
3651:   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3652:   PetscBool        usecprow = a->compressedrow.use;

3654:   if (!cm || !cn) return 0;
3658:   b = bd->v;
3659:   if (a->nonzerorowcnt != A->rmap->n) MatZeroEntries(C);
3660:   MatDenseGetArray(C, &c);
3661:   switch (bs) {
3662:   case 1:
3663:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3664:     break;
3665:   case 2:
3666:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3667:     break;
3668:   case 3:
3669:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3670:     break;
3671:   case 4:
3672:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3673:     break;
3674:   case 5:
3675:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3676:     break;
3677:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3678:     PetscBLASIntCast(bs, &bbs);
3679:     PetscBLASIntCast(cn, &bcn);
3680:     PetscBLASIntCast(bm, &bbm);
3681:     PetscBLASIntCast(cm, &bcm);
3682:     idx = a->j;
3683:     v   = a->a;
3684:     if (usecprow) {
3685:       mbs  = a->compressedrow.nrows;
3686:       ii   = a->compressedrow.i;
3687:       ridx = a->compressedrow.rindex;
3688:     } else {
3689:       mbs = a->mbs;
3690:       ii  = a->i;
3691:       z   = c;
3692:     }
3693:     for (i = 0; i < mbs; i++) {
3694:       n = ii[1] - ii[0];
3695:       ii++;
3696:       if (usecprow) z = c + bs * ridx[i];
3697:       if (n) {
3698:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3699:         v += bs2;
3700:       }
3701:       for (j = 1; j < n; j++) {
3702:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3703:         v += bs2;
3704:       }
3705:       if (!usecprow) z += bs;
3706:     }
3707:   }
3708:   MatDenseRestoreArray(C, &c);
3709:   PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn);
3710:   return 0;
3711: }