Actual source code: sbaij2.c


  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  5: #include <petsc/private/kernels/blockinvert.h>
  6: #include <petscbt.h>
  7: #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 10: {
 11:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
 12:   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2;
 13:   const PetscInt *idx;
 14:   PetscBT         table_out, table_in;

 17:   mbs = a->mbs;
 18:   ai  = a->i;
 19:   aj  = a->j;
 20:   bs  = A->rmap->bs;
 21:   PetscBTCreate(mbs, &table_out);
 22:   PetscMalloc1(mbs + 1, &nidx);
 23:   PetscMalloc1(A->rmap->N + 1, &nidx2);
 24:   PetscBTCreate(mbs, &table_in);

 26:   for (i = 0; i < is_max; i++) { /* for each is */
 27:     isz = 0;
 28:     PetscBTMemzero(mbs, table_out);

 30:     /* Extract the indices, assume there can be duplicate entries */
 31:     ISGetIndices(is[i], &idx);
 32:     ISGetLocalSize(is[i], &n);

 34:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 35:     bcol_max = 0;
 36:     for (j = 0; j < n; ++j) {
 37:       brow = idx[j] / bs; /* convert the indices into block indices */
 39:       if (!PetscBTLookupSet(table_out, brow)) {
 40:         nidx[isz++] = brow;
 41:         if (bcol_max < brow) bcol_max = brow;
 42:       }
 43:     }
 44:     ISRestoreIndices(is[i], &idx);
 45:     ISDestroy(&is[i]);

 47:     k = 0;
 48:     for (j = 0; j < ov; j++) { /* for each overlap */
 49:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 50:       PetscBTMemzero(mbs, table_in);
 51:       for (l = k; l < isz; l++) PetscBTSet(table_in, nidx[l]);

 53:       n = isz; /* length of the updated is[i] */
 54:       for (brow = 0; brow < mbs; brow++) {
 55:         start = ai[brow];
 56:         end   = ai[brow + 1];
 57:         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 58:           for (l = start; l < end; l++) {
 59:             bcol = aj[l];
 60:             if (!PetscBTLookupSet(table_out, bcol)) {
 61:               nidx[isz++] = bcol;
 62:               if (bcol_max < bcol) bcol_max = bcol;
 63:             }
 64:           }
 65:           k++;
 66:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 67:         } else {             /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
 68:           for (l = start; l < end; l++) {
 69:             bcol = aj[l];
 70:             if (bcol > bcol_max) break;
 71:             if (PetscBTLookup(table_in, bcol)) {
 72:               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
 73:               break; /* for l = start; l<end ; l++) */
 74:             }
 75:           }
 76:         }
 77:       }
 78:     } /* for each overlap */

 80:     /* expand the Index Set */
 81:     for (j = 0; j < isz; j++) {
 82:       for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
 83:     }
 84:     ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i);
 85:   }
 86:   PetscBTDestroy(&table_out);
 87:   PetscFree(nidx);
 88:   PetscFree(nidx2);
 89:   PetscBTDestroy(&table_in);
 90:   return 0;
 91: }

 93: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 94:         Zero some ops' to avoid invalid usse */
 95: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 96: {
 97:   MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE);
 98:   Bseq->ops->mult                   = NULL;
 99:   Bseq->ops->multadd                = NULL;
100:   Bseq->ops->multtranspose          = NULL;
101:   Bseq->ops->multtransposeadd       = NULL;
102:   Bseq->ops->lufactor               = NULL;
103:   Bseq->ops->choleskyfactor         = NULL;
104:   Bseq->ops->lufactorsymbolic       = NULL;
105:   Bseq->ops->choleskyfactorsymbolic = NULL;
106:   Bseq->ops->getinertia             = NULL;
107:   return 0;
108: }

110: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
111: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
112: {
113:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
114:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
115:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
116:   const PetscInt *irow, *icol;
117:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
118:   PetscInt       *aj = a->j, *ai = a->i;
119:   MatScalar      *mat_a;
120:   Mat             C;
121:   PetscBool       flag;


124:   ISGetIndices(isrow, &irow);
125:   ISGetIndices(iscol, &icol);
126:   ISGetLocalSize(isrow, &nrows);
127:   ISGetLocalSize(iscol, &ncols);

129:   PetscCalloc1(1 + oldcols, &smap);
130:   ssmap = smap;
131:   PetscMalloc1(1 + nrows, &lens);
132:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
133:   /* determine lens of each row */
134:   for (i = 0; i < nrows; i++) {
135:     kstart  = ai[irow[i]];
136:     kend    = kstart + a->ilen[irow[i]];
137:     lens[i] = 0;
138:     for (k = kstart; k < kend; k++) {
139:       if (ssmap[aj[k]]) lens[i]++;
140:     }
141:   }
142:   /* Create and fill new matrix */
143:   if (scall == MAT_REUSE_MATRIX) {
144:     c = (Mat_SeqSBAIJ *)((*B)->data);

147:     PetscArraycmp(c->ilen, lens, c->mbs, &flag);
149:     PetscArrayzero(c->ilen, c->mbs);
150:     C = *B;
151:   } else {
152:     MatCreate(PetscObjectComm((PetscObject)A), &C);
153:     MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
154:     MatSetType(C, ((PetscObject)A)->type_name);
155:     MatSeqSBAIJSetPreallocation(C, bs, 0, lens);
156:   }
157:   c = (Mat_SeqSBAIJ *)(C->data);
158:   for (i = 0; i < nrows; i++) {
159:     row      = irow[i];
160:     kstart   = ai[row];
161:     kend     = kstart + a->ilen[row];
162:     mat_i    = c->i[i];
163:     mat_j    = c->j + mat_i;
164:     mat_a    = c->a + mat_i * bs2;
165:     mat_ilen = c->ilen + i;
166:     for (k = kstart; k < kend; k++) {
167:       if ((tcol = ssmap[a->j[k]])) {
168:         *mat_j++ = tcol - 1;
169:         PetscArraycpy(mat_a, a->a + k * bs2, bs2);
170:         mat_a += bs2;
171:         (*mat_ilen)++;
172:       }
173:     }
174:   }
175:   /* sort */
176:   {
177:     MatScalar *work;

179:     PetscMalloc1(bs2, &work);
180:     for (i = 0; i < nrows; i++) {
181:       PetscInt ilen;
182:       mat_i = c->i[i];
183:       mat_j = c->j + mat_i;
184:       mat_a = c->a + mat_i * bs2;
185:       ilen  = c->ilen[i];
186:       PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
187:     }
188:     PetscFree(work);
189:   }

191:   /* Free work space */
192:   ISRestoreIndices(iscol, &icol);
193:   PetscFree(smap);
194:   PetscFree(lens);
195:   MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
196:   MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);

198:   ISRestoreIndices(isrow, &irow);
199:   *B = C;
200:   return 0;
201: }

203: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
204: {
205:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
206:   IS              is1, is2;
207:   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
208:   const PetscInt *irow, *icol;

210:   ISGetIndices(isrow, &irow);
211:   ISGetIndices(iscol, &icol);
212:   ISGetLocalSize(isrow, &nrows);
213:   ISGetLocalSize(iscol, &ncols);

215:   /* Verify if the indices corespond to each element in a block
216:    and form the IS with compressed IS */
217:   maxmnbs = PetscMax(a->mbs, a->nbs);
218:   PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
219:   PetscArrayzero(vary, a->mbs);
220:   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
222:   count = 0;
223:   for (i = 0; i < nrows; i++) {
224:     PetscInt j = irow[i] / bs;
225:     if ((vary[j]--) == bs) iary[count++] = j;
226:   }
227:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);

229:   PetscArrayzero(vary, a->nbs);
230:   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
232:   count = 0;
233:   for (i = 0; i < ncols; i++) {
234:     PetscInt j = icol[i] / bs;
235:     if ((vary[j]--) == bs) iary[count++] = j;
236:   }
237:   ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
238:   ISRestoreIndices(isrow, &irow);
239:   ISRestoreIndices(iscol, &icol);
240:   PetscFree2(vary, iary);

242:   MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B);
243:   ISDestroy(&is1);
244:   ISDestroy(&is2);

246:   if (isrow != iscol) {
247:     PetscBool isequal;
248:     ISEqual(isrow, iscol, &isequal);
249:     if (!isequal) MatSeqSBAIJZeroOps_Private(*B);
250:   }
251:   return 0;
252: }

254: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
255: {
256:   PetscInt i;

258:   if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);

260:   for (i = 0; i < n; i++) MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
261:   return 0;
262: }

264: /* -------------------------------------------------------*/
265: /* Should check that shapes of vectors and matrices match */
266: /* -------------------------------------------------------*/

268: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
269: {
270:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
271:   PetscScalar       *z, x1, x2, zero = 0.0;
272:   const PetscScalar *x, *xb;
273:   const MatScalar   *v;
274:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
275:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
276:   PetscInt           nonzerorow = 0;

278:   VecSet(zz, zero);
279:   if (!a->nz) return 0;
280:   VecGetArrayRead(xx, &x);
281:   VecGetArray(zz, &z);

283:   v  = a->a;
284:   xb = x;

286:   for (i = 0; i < mbs; i++) {
287:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
288:     x1   = xb[0];
289:     x2   = xb[1];
290:     ib   = aj + *ai;
291:     jmin = 0;
292:     nonzerorow += (n > 0);
293:     if (*ib == i) { /* (diag of A)*x */
294:       z[2 * i] += v[0] * x1 + v[2] * x2;
295:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
296:       v += 4;
297:       jmin++;
298:     }
299:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
300:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
301:     for (j = jmin; j < n; j++) {
302:       /* (strict lower triangular part of A)*x  */
303:       cval = ib[j] * 2;
304:       z[cval] += v[0] * x1 + v[1] * x2;
305:       z[cval + 1] += v[2] * x1 + v[3] * x2;
306:       /* (strict upper triangular part of A)*x  */
307:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
308:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
309:       v += 4;
310:     }
311:     xb += 2;
312:     ai++;
313:   }

315:   VecRestoreArrayRead(xx, &x);
316:   VecRestoreArray(zz, &z);
317:   PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
318:   return 0;
319: }

321: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
322: {
323:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
324:   PetscScalar       *z, x1, x2, x3, zero = 0.0;
325:   const PetscScalar *x, *xb;
326:   const MatScalar   *v;
327:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
328:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
329:   PetscInt           nonzerorow = 0;

331:   VecSet(zz, zero);
332:   if (!a->nz) return 0;
333:   VecGetArrayRead(xx, &x);
334:   VecGetArray(zz, &z);

336:   v  = a->a;
337:   xb = x;

339:   for (i = 0; i < mbs; i++) {
340:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
341:     x1   = xb[0];
342:     x2   = xb[1];
343:     x3   = xb[2];
344:     ib   = aj + *ai;
345:     jmin = 0;
346:     nonzerorow += (n > 0);
347:     if (*ib == i) { /* (diag of A)*x */
348:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
349:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
350:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
351:       v += 9;
352:       jmin++;
353:     }
354:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
355:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356:     for (j = jmin; j < n; j++) {
357:       /* (strict lower triangular part of A)*x  */
358:       cval = ib[j] * 3;
359:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
360:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
361:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
362:       /* (strict upper triangular part of A)*x  */
363:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
364:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
365:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
366:       v += 9;
367:     }
368:     xb += 3;
369:     ai++;
370:   }

372:   VecRestoreArrayRead(xx, &x);
373:   VecRestoreArray(zz, &z);
374:   PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
375:   return 0;
376: }

378: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
379: {
380:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
381:   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
382:   const PetscScalar *x, *xb;
383:   const MatScalar   *v;
384:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
385:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
386:   PetscInt           nonzerorow = 0;

388:   VecSet(zz, zero);
389:   if (!a->nz) return 0;
390:   VecGetArrayRead(xx, &x);
391:   VecGetArray(zz, &z);

393:   v  = a->a;
394:   xb = x;

396:   for (i = 0; i < mbs; i++) {
397:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
398:     x1   = xb[0];
399:     x2   = xb[1];
400:     x3   = xb[2];
401:     x4   = xb[3];
402:     ib   = aj + *ai;
403:     jmin = 0;
404:     nonzerorow += (n > 0);
405:     if (*ib == i) { /* (diag of A)*x */
406:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
407:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
408:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
409:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
410:       v += 16;
411:       jmin++;
412:     }
413:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
414:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
415:     for (j = jmin; j < n; j++) {
416:       /* (strict lower triangular part of A)*x  */
417:       cval = ib[j] * 4;
418:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
419:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
420:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
421:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
422:       /* (strict upper triangular part of A)*x  */
423:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
424:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
425:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
426:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
427:       v += 16;
428:     }
429:     xb += 4;
430:     ai++;
431:   }

433:   VecRestoreArrayRead(xx, &x);
434:   VecRestoreArray(zz, &z);
435:   PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
436:   return 0;
437: }

439: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
440: {
441:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
442:   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
443:   const PetscScalar *x, *xb;
444:   const MatScalar   *v;
445:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
446:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
447:   PetscInt           nonzerorow = 0;

449:   VecSet(zz, zero);
450:   if (!a->nz) return 0;
451:   VecGetArrayRead(xx, &x);
452:   VecGetArray(zz, &z);

454:   v  = a->a;
455:   xb = x;

457:   for (i = 0; i < mbs; i++) {
458:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
459:     x1   = xb[0];
460:     x2   = xb[1];
461:     x3   = xb[2];
462:     x4   = xb[3];
463:     x5   = xb[4];
464:     ib   = aj + *ai;
465:     jmin = 0;
466:     nonzerorow += (n > 0);
467:     if (*ib == i) { /* (diag of A)*x */
468:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
469:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
470:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
471:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
472:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
473:       v += 25;
474:       jmin++;
475:     }
476:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
477:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
478:     for (j = jmin; j < n; j++) {
479:       /* (strict lower triangular part of A)*x  */
480:       cval = ib[j] * 5;
481:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
482:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
483:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
484:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
485:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
486:       /* (strict upper triangular part of A)*x  */
487:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
488:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
489:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
490:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
491:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
492:       v += 25;
493:     }
494:     xb += 5;
495:     ai++;
496:   }

498:   VecRestoreArrayRead(xx, &x);
499:   VecRestoreArray(zz, &z);
500:   PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
501:   return 0;
502: }

504: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
505: {
506:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
507:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
508:   const PetscScalar *x, *xb;
509:   const MatScalar   *v;
510:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
511:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
512:   PetscInt           nonzerorow = 0;

514:   VecSet(zz, zero);
515:   if (!a->nz) return 0;
516:   VecGetArrayRead(xx, &x);
517:   VecGetArray(zz, &z);

519:   v  = a->a;
520:   xb = x;

522:   for (i = 0; i < mbs; i++) {
523:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
524:     x1   = xb[0];
525:     x2   = xb[1];
526:     x3   = xb[2];
527:     x4   = xb[3];
528:     x5   = xb[4];
529:     x6   = xb[5];
530:     ib   = aj + *ai;
531:     jmin = 0;
532:     nonzerorow += (n > 0);
533:     if (*ib == i) { /* (diag of A)*x */
534:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
535:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
536:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
537:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
538:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
539:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
540:       v += 36;
541:       jmin++;
542:     }
543:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
544:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545:     for (j = jmin; j < n; j++) {
546:       /* (strict lower triangular part of A)*x  */
547:       cval = ib[j] * 6;
548:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
549:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
550:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
551:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
552:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
553:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
554:       /* (strict upper triangular part of A)*x  */
555:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
556:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
557:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
558:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
559:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
560:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
561:       v += 36;
562:     }
563:     xb += 6;
564:     ai++;
565:   }

567:   VecRestoreArrayRead(xx, &x);
568:   VecRestoreArray(zz, &z);
569:   PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
570:   return 0;
571: }

573: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
574: {
575:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
576:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
577:   const PetscScalar *x, *xb;
578:   const MatScalar   *v;
579:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
580:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
581:   PetscInt           nonzerorow = 0;

583:   VecSet(zz, zero);
584:   if (!a->nz) return 0;
585:   VecGetArrayRead(xx, &x);
586:   VecGetArray(zz, &z);

588:   v  = a->a;
589:   xb = x;

591:   for (i = 0; i < mbs; i++) {
592:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
593:     x1   = xb[0];
594:     x2   = xb[1];
595:     x3   = xb[2];
596:     x4   = xb[3];
597:     x5   = xb[4];
598:     x6   = xb[5];
599:     x7   = xb[6];
600:     ib   = aj + *ai;
601:     jmin = 0;
602:     nonzerorow += (n > 0);
603:     if (*ib == i) { /* (diag of A)*x */
604:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
605:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
606:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
607:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
608:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
609:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
610:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
611:       v += 49;
612:       jmin++;
613:     }
614:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
615:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616:     for (j = jmin; j < n; j++) {
617:       /* (strict lower triangular part of A)*x  */
618:       cval = ib[j] * 7;
619:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
620:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
621:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
622:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
623:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
624:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
625:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
626:       /* (strict upper triangular part of A)*x  */
627:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
628:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
629:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
630:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
631:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
632:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
633:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
634:       v += 49;
635:     }
636:     xb += 7;
637:     ai++;
638:   }
639:   VecRestoreArrayRead(xx, &x);
640:   VecRestoreArray(zz, &z);
641:   PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
642:   return 0;
643: }

645: /*
646:     This will not work with MatScalar == float because it calls the BLAS
647: */
648: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
649: {
650:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
651:   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
652:   const PetscScalar *x, *x_ptr, *xb;
653:   const MatScalar   *v;
654:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
655:   const PetscInt    *idx, *aj, *ii;
656:   PetscInt           nonzerorow = 0;

658:   VecSet(zz, zero);
659:   if (!a->nz) return 0;
660:   VecGetArrayRead(xx, &x);
661:   VecGetArray(zz, &z);

663:   x_ptr = x;
664:   z_ptr = z;

666:   aj = a->j;
667:   v  = a->a;
668:   ii = a->i;

670:   if (!a->mult_work) PetscMalloc1(A->rmap->N + 1, &a->mult_work);
671:   work = a->mult_work;

673:   for (i = 0; i < mbs; i++) {
674:     n     = ii[1] - ii[0];
675:     ncols = n * bs;
676:     workt = work;
677:     idx   = aj + ii[0];
678:     nonzerorow += (n > 0);

680:     /* upper triangular part */
681:     for (j = 0; j < n; j++) {
682:       xb = x_ptr + bs * (*idx++);
683:       for (k = 0; k < bs; k++) workt[k] = xb[k];
684:       workt += bs;
685:     }
686:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
687:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

689:     /* strict lower triangular part */
690:     idx = aj + ii[0];
691:     if (n && *idx == i) {
692:       ncols -= bs;
693:       v += bs2;
694:       idx++;
695:       n--;
696:     }

698:     if (ncols > 0) {
699:       workt = work;
700:       PetscArrayzero(workt, ncols);
701:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
702:       for (j = 0; j < n; j++) {
703:         zb = z_ptr + bs * (*idx++);
704:         for (k = 0; k < bs; k++) zb[k] += workt[k];
705:         workt += bs;
706:       }
707:     }
708:     x += bs;
709:     v += n * bs2;
710:     z += bs;
711:     ii++;
712:   }

714:   VecRestoreArrayRead(xx, &x);
715:   VecRestoreArray(zz, &z);
716:   PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow);
717:   return 0;
718: }

720: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
721: {
722:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
723:   PetscScalar       *z, x1;
724:   const PetscScalar *x, *xb;
725:   const MatScalar   *v;
726:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
727:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
728:   PetscInt           nonzerorow = 0;
729: #if defined(PETSC_USE_COMPLEX)
730:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
731: #else
732:   const int aconj = 0;
733: #endif

735:   VecCopy(yy, zz);
736:   if (!a->nz) return 0;
737:   VecGetArrayRead(xx, &x);
738:   VecGetArray(zz, &z);
739:   v  = a->a;
740:   xb = x;

742:   for (i = 0; i < mbs; i++) {
743:     n    = ai[1] - ai[0]; /* length of i_th row of A */
744:     x1   = xb[0];
745:     ib   = aj + *ai;
746:     jmin = 0;
747:     nonzerorow += (n > 0);
748:     if (n && *ib == i) { /* (diag of A)*x */
749:       z[i] += *v++ * x[*ib++];
750:       jmin++;
751:     }
752:     if (aconj) {
753:       for (j = jmin; j < n; j++) {
754:         cval = *ib;
755:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
756:         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
757:       }
758:     } else {
759:       for (j = jmin; j < n; j++) {
760:         cval = *ib;
761:         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
762:         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
763:       }
764:     }
765:     xb++;
766:     ai++;
767:   }

769:   VecRestoreArrayRead(xx, &x);
770:   VecRestoreArray(zz, &z);

772:   PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
773:   return 0;
774: }

776: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
777: {
778:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
779:   PetscScalar       *z, x1, x2;
780:   const PetscScalar *x, *xb;
781:   const MatScalar   *v;
782:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
783:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
784:   PetscInt           nonzerorow = 0;

786:   VecCopy(yy, zz);
787:   if (!a->nz) return 0;
788:   VecGetArrayRead(xx, &x);
789:   VecGetArray(zz, &z);

791:   v  = a->a;
792:   xb = x;

794:   for (i = 0; i < mbs; i++) {
795:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
796:     x1   = xb[0];
797:     x2   = xb[1];
798:     ib   = aj + *ai;
799:     jmin = 0;
800:     nonzerorow += (n > 0);
801:     if (n && *ib == i) { /* (diag of A)*x */
802:       z[2 * i] += v[0] * x1 + v[2] * x2;
803:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
804:       v += 4;
805:       jmin++;
806:     }
807:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
808:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
809:     for (j = jmin; j < n; j++) {
810:       /* (strict lower triangular part of A)*x  */
811:       cval = ib[j] * 2;
812:       z[cval] += v[0] * x1 + v[1] * x2;
813:       z[cval + 1] += v[2] * x1 + v[3] * x2;
814:       /* (strict upper triangular part of A)*x  */
815:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
816:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
817:       v += 4;
818:     }
819:     xb += 2;
820:     ai++;
821:   }
822:   VecRestoreArrayRead(xx, &x);
823:   VecRestoreArray(zz, &z);

825:   PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow));
826:   return 0;
827: }

829: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
830: {
831:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
832:   PetscScalar       *z, x1, x2, x3;
833:   const PetscScalar *x, *xb;
834:   const MatScalar   *v;
835:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
836:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
837:   PetscInt           nonzerorow = 0;

839:   VecCopy(yy, zz);
840:   if (!a->nz) return 0;
841:   VecGetArrayRead(xx, &x);
842:   VecGetArray(zz, &z);

844:   v  = a->a;
845:   xb = x;

847:   for (i = 0; i < mbs; i++) {
848:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
849:     x1   = xb[0];
850:     x2   = xb[1];
851:     x3   = xb[2];
852:     ib   = aj + *ai;
853:     jmin = 0;
854:     nonzerorow += (n > 0);
855:     if (n && *ib == i) { /* (diag of A)*x */
856:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
857:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
858:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
859:       v += 9;
860:       jmin++;
861:     }
862:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
863:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864:     for (j = jmin; j < n; j++) {
865:       /* (strict lower triangular part of A)*x  */
866:       cval = ib[j] * 3;
867:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
868:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
869:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
870:       /* (strict upper triangular part of A)*x  */
871:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
872:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
873:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
874:       v += 9;
875:     }
876:     xb += 3;
877:     ai++;
878:   }

880:   VecRestoreArrayRead(xx, &x);
881:   VecRestoreArray(zz, &z);

883:   PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow));
884:   return 0;
885: }

887: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
888: {
889:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
890:   PetscScalar       *z, x1, x2, x3, x4;
891:   const PetscScalar *x, *xb;
892:   const MatScalar   *v;
893:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
894:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
895:   PetscInt           nonzerorow = 0;

897:   VecCopy(yy, zz);
898:   if (!a->nz) return 0;
899:   VecGetArrayRead(xx, &x);
900:   VecGetArray(zz, &z);

902:   v  = a->a;
903:   xb = x;

905:   for (i = 0; i < mbs; i++) {
906:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
907:     x1   = xb[0];
908:     x2   = xb[1];
909:     x3   = xb[2];
910:     x4   = xb[3];
911:     ib   = aj + *ai;
912:     jmin = 0;
913:     nonzerorow += (n > 0);
914:     if (n && *ib == i) { /* (diag of A)*x */
915:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
916:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
917:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
918:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
919:       v += 16;
920:       jmin++;
921:     }
922:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
923:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
924:     for (j = jmin; j < n; j++) {
925:       /* (strict lower triangular part of A)*x  */
926:       cval = ib[j] * 4;
927:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
928:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
929:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
930:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
931:       /* (strict upper triangular part of A)*x  */
932:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
933:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
934:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
935:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
936:       v += 16;
937:     }
938:     xb += 4;
939:     ai++;
940:   }

942:   VecRestoreArrayRead(xx, &x);
943:   VecRestoreArray(zz, &z);

945:   PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow));
946:   return 0;
947: }

949: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
950: {
951:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
952:   PetscScalar       *z, x1, x2, x3, x4, x5;
953:   const PetscScalar *x, *xb;
954:   const MatScalar   *v;
955:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
956:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
957:   PetscInt           nonzerorow = 0;

959:   VecCopy(yy, zz);
960:   if (!a->nz) return 0;
961:   VecGetArrayRead(xx, &x);
962:   VecGetArray(zz, &z);

964:   v  = a->a;
965:   xb = x;

967:   for (i = 0; i < mbs; i++) {
968:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
969:     x1   = xb[0];
970:     x2   = xb[1];
971:     x3   = xb[2];
972:     x4   = xb[3];
973:     x5   = xb[4];
974:     ib   = aj + *ai;
975:     jmin = 0;
976:     nonzerorow += (n > 0);
977:     if (n && *ib == i) { /* (diag of A)*x */
978:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
979:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
980:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
981:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
982:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
983:       v += 25;
984:       jmin++;
985:     }
986:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
987:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
988:     for (j = jmin; j < n; j++) {
989:       /* (strict lower triangular part of A)*x  */
990:       cval = ib[j] * 5;
991:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
992:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
993:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
994:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
995:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
996:       /* (strict upper triangular part of A)*x  */
997:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
998:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
999:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1000:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1001:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1002:       v += 25;
1003:     }
1004:     xb += 5;
1005:     ai++;
1006:   }

1008:   VecRestoreArrayRead(xx, &x);
1009:   VecRestoreArray(zz, &z);

1011:   PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow));
1012:   return 0;
1013: }

1015: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1016: {
1017:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1018:   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1019:   const PetscScalar *x, *xb;
1020:   const MatScalar   *v;
1021:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1022:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1023:   PetscInt           nonzerorow = 0;

1025:   VecCopy(yy, zz);
1026:   if (!a->nz) return 0;
1027:   VecGetArrayRead(xx, &x);
1028:   VecGetArray(zz, &z);

1030:   v  = a->a;
1031:   xb = x;

1033:   for (i = 0; i < mbs; i++) {
1034:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1035:     x1   = xb[0];
1036:     x2   = xb[1];
1037:     x3   = xb[2];
1038:     x4   = xb[3];
1039:     x5   = xb[4];
1040:     x6   = xb[5];
1041:     ib   = aj + *ai;
1042:     jmin = 0;
1043:     nonzerorow += (n > 0);
1044:     if (n && *ib == i) { /* (diag of A)*x */
1045:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1046:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1047:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1048:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1049:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1050:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1051:       v += 36;
1052:       jmin++;
1053:     }
1054:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1055:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1056:     for (j = jmin; j < n; j++) {
1057:       /* (strict lower triangular part of A)*x  */
1058:       cval = ib[j] * 6;
1059:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1060:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1061:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1062:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1063:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1064:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1065:       /* (strict upper triangular part of A)*x  */
1066:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1067:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1068:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1069:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1070:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1071:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1072:       v += 36;
1073:     }
1074:     xb += 6;
1075:     ai++;
1076:   }

1078:   VecRestoreArrayRead(xx, &x);
1079:   VecRestoreArray(zz, &z);

1081:   PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow));
1082:   return 0;
1083: }

1085: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1086: {
1087:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1088:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1089:   const PetscScalar *x, *xb;
1090:   const MatScalar   *v;
1091:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1092:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1093:   PetscInt           nonzerorow = 0;

1095:   VecCopy(yy, zz);
1096:   if (!a->nz) return 0;
1097:   VecGetArrayRead(xx, &x);
1098:   VecGetArray(zz, &z);

1100:   v  = a->a;
1101:   xb = x;

1103:   for (i = 0; i < mbs; i++) {
1104:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1105:     x1   = xb[0];
1106:     x2   = xb[1];
1107:     x3   = xb[2];
1108:     x4   = xb[3];
1109:     x5   = xb[4];
1110:     x6   = xb[5];
1111:     x7   = xb[6];
1112:     ib   = aj + *ai;
1113:     jmin = 0;
1114:     nonzerorow += (n > 0);
1115:     if (n && *ib == i) { /* (diag of A)*x */
1116:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1117:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1118:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1119:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1120:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1121:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1122:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1123:       v += 49;
1124:       jmin++;
1125:     }
1126:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1127:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1128:     for (j = jmin; j < n; j++) {
1129:       /* (strict lower triangular part of A)*x  */
1130:       cval = ib[j] * 7;
1131:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1132:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1133:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1134:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1135:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1136:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1137:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1138:       /* (strict upper triangular part of A)*x  */
1139:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1140:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1141:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1142:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1143:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1144:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1145:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1146:       v += 49;
1147:     }
1148:     xb += 7;
1149:     ai++;
1150:   }

1152:   VecRestoreArrayRead(xx, &x);
1153:   VecRestoreArray(zz, &z);

1155:   PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow));
1156:   return 0;
1157: }

1159: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1160: {
1161:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1162:   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1163:   const PetscScalar *x, *x_ptr, *xb;
1164:   const MatScalar   *v;
1165:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1166:   const PetscInt    *idx, *aj, *ii;
1167:   PetscInt           nonzerorow = 0;

1169:   VecCopy(yy, zz);
1170:   if (!a->nz) return 0;
1171:   VecGetArrayRead(xx, &x);
1172:   x_ptr = x;
1173:   VecGetArray(zz, &z);
1174:   z_ptr = z;

1176:   aj = a->j;
1177:   v  = a->a;
1178:   ii = a->i;

1180:   if (!a->mult_work) PetscMalloc1(A->rmap->n + 1, &a->mult_work);
1181:   work = a->mult_work;

1183:   for (i = 0; i < mbs; i++) {
1184:     n     = ii[1] - ii[0];
1185:     ncols = n * bs;
1186:     workt = work;
1187:     idx   = aj + ii[0];
1188:     nonzerorow += (n > 0);

1190:     /* upper triangular part */
1191:     for (j = 0; j < n; j++) {
1192:       xb = x_ptr + bs * (*idx++);
1193:       for (k = 0; k < bs; k++) workt[k] = xb[k];
1194:       workt += bs;
1195:     }
1196:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1197:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

1199:     /* strict lower triangular part */
1200:     idx = aj + ii[0];
1201:     if (n && *idx == i) {
1202:       ncols -= bs;
1203:       v += bs2;
1204:       idx++;
1205:       n--;
1206:     }
1207:     if (ncols > 0) {
1208:       workt = work;
1209:       PetscArrayzero(workt, ncols);
1210:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1211:       for (j = 0; j < n; j++) {
1212:         zb = z_ptr + bs * (*idx++);
1213:         for (k = 0; k < bs; k++) zb[k] += workt[k];
1214:         workt += bs;
1215:       }
1216:     }

1218:     x += bs;
1219:     v += n * bs2;
1220:     z += bs;
1221:     ii++;
1222:   }

1224:   VecRestoreArrayRead(xx, &x);
1225:   VecRestoreArray(zz, &z);

1227:   PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
1228:   return 0;
1229: }

1231: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1232: {
1233:   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1234:   PetscScalar   oalpha = alpha;
1235:   PetscBLASInt  one    = 1, totalnz;

1237:   PetscBLASIntCast(a->bs2 * a->nz, &totalnz);
1238:   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1239:   PetscLogFlops(totalnz);
1240:   return 0;
1241: }

1243: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1244: {
1245:   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1246:   const MatScalar *v        = a->a;
1247:   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1248:   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1249:   const PetscInt  *aj = a->j, *col;

1251:   if (!a->nz) {
1252:     *norm = 0.0;
1253:     return 0;
1254:   }
1255:   if (type == NORM_FROBENIUS) {
1256:     for (k = 0; k < mbs; k++) {
1257:       jmin = a->i[k];
1258:       jmax = a->i[k + 1];
1259:       col  = aj + jmin;
1260:       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1261:         for (i = 0; i < bs2; i++) {
1262:           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1263:           v++;
1264:         }
1265:         jmin++;
1266:       }
1267:       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1268:         for (i = 0; i < bs2; i++) {
1269:           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1270:           v++;
1271:         }
1272:       }
1273:     }
1274:     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1275:     PetscLogFlops(2.0 * bs2 * a->nz);
1276:   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1277:     PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl);
1278:     for (i = 0; i < mbs; i++) jl[i] = mbs;
1279:     il[0] = 0;

1281:     *norm = 0.0;
1282:     for (k = 0; k < mbs; k++) { /* k_th block row */
1283:       for (j = 0; j < bs; j++) sum[j] = 0.0;
1284:       /*-- col sum --*/
1285:       i = jl[k]; /* first |A(i,k)| to be added */
1286:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1287:                   at step k */
1288:       while (i < mbs) {
1289:         nexti = jl[i]; /* next block row to be added */
1290:         ik    = il[i]; /* block index of A(i,k) in the array a */
1291:         for (j = 0; j < bs; j++) {
1292:           v = a->a + ik * bs2 + j * bs;
1293:           for (k1 = 0; k1 < bs; k1++) {
1294:             sum[j] += PetscAbsScalar(*v);
1295:             v++;
1296:           }
1297:         }
1298:         /* update il, jl */
1299:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1300:         jmax = a->i[i + 1];
1301:         if (jmin < jmax) {
1302:           il[i] = jmin;
1303:           j     = a->j[jmin];
1304:           jl[i] = jl[j];
1305:           jl[j] = i;
1306:         }
1307:         i = nexti;
1308:       }
1309:       /*-- row sum --*/
1310:       jmin = a->i[k];
1311:       jmax = a->i[k + 1];
1312:       for (i = jmin; i < jmax; i++) {
1313:         for (j = 0; j < bs; j++) {
1314:           v = a->a + i * bs2 + j;
1315:           for (k1 = 0; k1 < bs; k1++) {
1316:             sum[j] += PetscAbsScalar(*v);
1317:             v += bs;
1318:           }
1319:         }
1320:       }
1321:       /* add k_th block row to il, jl */
1322:       col = aj + jmin;
1323:       if (jmax - jmin > 0 && *col == k) jmin++;
1324:       if (jmin < jmax) {
1325:         il[k] = jmin;
1326:         j     = a->j[jmin];
1327:         jl[k] = jl[j];
1328:         jl[j] = k;
1329:       }
1330:       for (j = 0; j < bs; j++) {
1331:         if (sum[j] > *norm) *norm = sum[j];
1332:       }
1333:     }
1334:     PetscFree3(sum, il, jl);
1335:     PetscLogFlops(PetscMax(mbs * a->nz - 1, 0));
1336:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1337:   return 0;
1338: }

1340: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1341: {
1342:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;

1344:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1345:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1346:     *flg = PETSC_FALSE;
1347:     return 0;
1348:   }

1350:   /* if the a->i are the same */
1351:   PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
1352:   if (!*flg) return 0;

1354:   /* if a->j are the same */
1355:   PetscArraycmp(a->j, b->j, a->nz, flg);
1356:   if (!*flg) return 0;

1358:   /* if a->a are the same */
1359:   PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg);
1360:   return 0;
1361: }

1363: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1364: {
1365:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1366:   PetscInt         i, j, k, row, bs, ambs, bs2;
1367:   const PetscInt  *ai, *aj;
1368:   PetscScalar     *x, zero = 0.0;
1369:   const MatScalar *aa, *aa_j;

1371:   bs = A->rmap->bs;

1374:   aa   = a->a;
1375:   ambs = a->mbs;

1377:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1378:     PetscInt *diag = a->diag;
1379:     aa             = a->a;
1380:     ambs           = a->mbs;
1381:     VecGetArray(v, &x);
1382:     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1383:     VecRestoreArray(v, &x);
1384:     return 0;
1385:   }

1387:   ai  = a->i;
1388:   aj  = a->j;
1389:   bs2 = a->bs2;
1390:   VecSet(v, zero);
1391:   if (!a->nz) return 0;
1392:   VecGetArray(v, &x);
1393:   for (i = 0; i < ambs; i++) {
1394:     j = ai[i];
1395:     if (aj[j] == i) { /* if this is a diagonal element */
1396:       row  = i * bs;
1397:       aa_j = aa + j * bs2;
1398:       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1399:     }
1400:   }
1401:   VecRestoreArray(v, &x);
1402:   return 0;
1403: }

1405: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1406: {
1407:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1408:   PetscScalar        x;
1409:   const PetscScalar *l, *li, *ri;
1410:   MatScalar         *aa, *v;
1411:   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1412:   const PetscInt    *ai, *aj;
1413:   PetscBool          flg;

1415:   if (ll != rr) {
1416:     VecEqual(ll, rr, &flg);
1418:   }
1419:   if (!ll) return 0;
1420:   ai  = a->i;
1421:   aj  = a->j;
1422:   aa  = a->a;
1423:   m   = A->rmap->N;
1424:   bs  = A->rmap->bs;
1425:   mbs = a->mbs;
1426:   bs2 = a->bs2;

1428:   VecGetArrayRead(ll, &l);
1429:   VecGetLocalSize(ll, &lm);
1431:   for (i = 0; i < mbs; i++) { /* for each block row */
1432:     M  = ai[i + 1] - ai[i];
1433:     li = l + i * bs;
1434:     v  = aa + bs2 * ai[i];
1435:     for (j = 0; j < M; j++) { /* for each block */
1436:       ri = l + bs * aj[ai[i] + j];
1437:       for (k = 0; k < bs; k++) {
1438:         x = ri[k];
1439:         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1440:       }
1441:     }
1442:   }
1443:   VecRestoreArrayRead(ll, &l);
1444:   PetscLogFlops(2.0 * a->nz);
1445:   return 0;
1446: }

1448: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1449: {
1450:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1452:   info->block_size   = a->bs2;
1453:   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1454:   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1455:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1456:   info->assemblies   = A->num_ass;
1457:   info->mallocs      = A->info.mallocs;
1458:   info->memory       = 0; /* REVIEW ME */
1459:   if (A->factortype) {
1460:     info->fill_ratio_given  = A->info.fill_ratio_given;
1461:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1462:     info->factor_mallocs    = A->info.factor_mallocs;
1463:   } else {
1464:     info->fill_ratio_given  = 0;
1465:     info->fill_ratio_needed = 0;
1466:     info->factor_mallocs    = 0;
1467:   }
1468:   return 0;
1469: }

1471: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1472: {
1473:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1475:   PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
1476:   return 0;
1477: }

1479: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1480: {
1481:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1482:   PetscInt         i, j, n, row, col, bs, mbs;
1483:   const PetscInt  *ai, *aj;
1484:   PetscReal        atmp;
1485:   const MatScalar *aa;
1486:   PetscScalar     *x;
1487:   PetscInt         ncols, brow, bcol, krow, kcol;

1491:   bs  = A->rmap->bs;
1492:   aa  = a->a;
1493:   ai  = a->i;
1494:   aj  = a->j;
1495:   mbs = a->mbs;

1497:   VecSet(v, 0.0);
1498:   VecGetArray(v, &x);
1499:   VecGetLocalSize(v, &n);
1501:   for (i = 0; i < mbs; i++) {
1502:     ncols = ai[1] - ai[0];
1503:     ai++;
1504:     brow = bs * i;
1505:     for (j = 0; j < ncols; j++) {
1506:       bcol = bs * (*aj);
1507:       for (kcol = 0; kcol < bs; kcol++) {
1508:         col = bcol + kcol; /* col index */
1509:         for (krow = 0; krow < bs; krow++) {
1510:           atmp = PetscAbsScalar(*aa);
1511:           aa++;
1512:           row = brow + krow; /* row index */
1513:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1514:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1515:         }
1516:       }
1517:       aj++;
1518:     }
1519:   }
1520:   VecRestoreArray(v, &x);
1521:   return 0;
1522: }

1524: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1525: {
1526:   MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
1527:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1528:   return 0;
1529: }

1531: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1532: {
1533:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1534:   PetscScalar       *z = c;
1535:   const PetscScalar *xb;
1536:   PetscScalar        x1;
1537:   const MatScalar   *v   = a->a, *vv;
1538:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1539: #if defined(PETSC_USE_COMPLEX)
1540:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1541: #else
1542:   const int aconj = 0;
1543: #endif

1545:   for (i = 0; i < mbs; i++) {
1546:     n = ii[1] - ii[0];
1547:     ii++;
1548:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1549:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1550:     jj = idx;
1551:     vv = v;
1552:     for (k = 0; k < cn; k++) {
1553:       idx = jj;
1554:       v   = vv;
1555:       for (j = 0; j < n; j++) {
1556:         xb = b + (*idx);
1557:         x1 = xb[0 + k * bm];
1558:         z[0 + k * cm] += v[0] * x1;
1559:         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1560:         v += 1;
1561:         ++idx;
1562:       }
1563:     }
1564:     z += 1;
1565:   }
1566:   return 0;
1567: }

1569: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1570: {
1571:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1572:   PetscScalar       *z = c;
1573:   const PetscScalar *xb;
1574:   PetscScalar        x1, x2;
1575:   const MatScalar   *v   = a->a, *vv;
1576:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1578:   for (i = 0; i < mbs; i++) {
1579:     n = ii[1] - ii[0];
1580:     ii++;
1581:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1582:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583:     jj = idx;
1584:     vv = v;
1585:     for (k = 0; k < cn; k++) {
1586:       idx = jj;
1587:       v   = vv;
1588:       for (j = 0; j < n; j++) {
1589:         xb = b + 2 * (*idx);
1590:         x1 = xb[0 + k * bm];
1591:         x2 = xb[1 + k * bm];
1592:         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1593:         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1594:         if (*idx != i) {
1595:           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1596:           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1597:         }
1598:         v += 4;
1599:         ++idx;
1600:       }
1601:     }
1602:     z += 2;
1603:   }
1604:   return 0;
1605: }

1607: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1608: {
1609:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1610:   PetscScalar       *z = c;
1611:   const PetscScalar *xb;
1612:   PetscScalar        x1, x2, x3;
1613:   const MatScalar   *v   = a->a, *vv;
1614:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1616:   for (i = 0; i < mbs; i++) {
1617:     n = ii[1] - ii[0];
1618:     ii++;
1619:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1620:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1621:     jj = idx;
1622:     vv = v;
1623:     for (k = 0; k < cn; k++) {
1624:       idx = jj;
1625:       v   = vv;
1626:       for (j = 0; j < n; j++) {
1627:         xb = b + 3 * (*idx);
1628:         x1 = xb[0 + k * bm];
1629:         x2 = xb[1 + k * bm];
1630:         x3 = xb[2 + k * bm];
1631:         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1632:         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1633:         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1634:         if (*idx != i) {
1635:           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1636:           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1637:           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1638:         }
1639:         v += 9;
1640:         ++idx;
1641:       }
1642:     }
1643:     z += 3;
1644:   }
1645:   return 0;
1646: }

1648: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1649: {
1650:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1651:   PetscScalar       *z = c;
1652:   const PetscScalar *xb;
1653:   PetscScalar        x1, x2, x3, x4;
1654:   const MatScalar   *v   = a->a, *vv;
1655:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1657:   for (i = 0; i < mbs; i++) {
1658:     n = ii[1] - ii[0];
1659:     ii++;
1660:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1661:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1662:     jj = idx;
1663:     vv = v;
1664:     for (k = 0; k < cn; k++) {
1665:       idx = jj;
1666:       v   = vv;
1667:       for (j = 0; j < n; j++) {
1668:         xb = b + 4 * (*idx);
1669:         x1 = xb[0 + k * bm];
1670:         x2 = xb[1 + k * bm];
1671:         x3 = xb[2 + k * bm];
1672:         x4 = xb[3 + k * bm];
1673:         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1674:         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1675:         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1676:         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1677:         if (*idx != i) {
1678:           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1679:           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1680:           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1681:           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1682:         }
1683:         v += 16;
1684:         ++idx;
1685:       }
1686:     }
1687:     z += 4;
1688:   }
1689:   return 0;
1690: }

1692: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1693: {
1694:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1695:   PetscScalar       *z = c;
1696:   const PetscScalar *xb;
1697:   PetscScalar        x1, x2, x3, x4, x5;
1698:   const MatScalar   *v   = a->a, *vv;
1699:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1701:   for (i = 0; i < mbs; i++) {
1702:     n = ii[1] - ii[0];
1703:     ii++;
1704:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1705:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1706:     jj = idx;
1707:     vv = v;
1708:     for (k = 0; k < cn; k++) {
1709:       idx = jj;
1710:       v   = vv;
1711:       for (j = 0; j < n; j++) {
1712:         xb = b + 5 * (*idx);
1713:         x1 = xb[0 + k * bm];
1714:         x2 = xb[1 + k * bm];
1715:         x3 = xb[2 + k * bm];
1716:         x4 = xb[3 + k * bm];
1717:         x5 = xb[4 + k * cm];
1718:         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1719:         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1720:         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1721:         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1722:         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1723:         if (*idx != i) {
1724:           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1725:           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1726:           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1727:           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1728:           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1729:         }
1730:         v += 25;
1731:         ++idx;
1732:       }
1733:     }
1734:     z += 5;
1735:   }
1736:   return 0;
1737: }

1739: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1740: {
1741:   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1742:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1743:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1744:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1745:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1746:   PetscBLASInt     bbs, bcn, bbm, bcm;
1747:   PetscScalar     *z = NULL;
1748:   PetscScalar     *c, *b;
1749:   const MatScalar *v;
1750:   const PetscInt  *idx, *ii;
1751:   PetscScalar      _DOne = 1.0;

1753:   if (!cm || !cn) return 0;
1757:   b = bd->v;
1758:   MatZeroEntries(C);
1759:   MatDenseGetArray(C, &c);
1760:   switch (bs) {
1761:   case 1:
1762:     MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1763:     break;
1764:   case 2:
1765:     MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1766:     break;
1767:   case 3:
1768:     MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1769:     break;
1770:   case 4:
1771:     MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1772:     break;
1773:   case 5:
1774:     MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1775:     break;
1776:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1777:     PetscBLASIntCast(bs, &bbs);
1778:     PetscBLASIntCast(cn, &bcn);
1779:     PetscBLASIntCast(bm, &bbm);
1780:     PetscBLASIntCast(cm, &bcm);
1781:     idx = a->j;
1782:     v   = a->a;
1783:     mbs = a->mbs;
1784:     ii  = a->i;
1785:     z   = c;
1786:     for (i = 0; i < mbs; i++) {
1787:       n = ii[1] - ii[0];
1788:       ii++;
1789:       for (j = 0; j < n; j++) {
1790:         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1791:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1792:         v += bs2;
1793:       }
1794:       z += bs;
1795:     }
1796:   }
1797:   MatDenseRestoreArray(C, &c);
1798:   PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn);
1799:   return 0;
1800: }