Actual source code: mpiaijkok.kokkos.cxx

  1: #include <petscvec_kokkos.hpp>
  2: #include <petscsf.h>
  3: #include <petsc/private/sfimpl.h>
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
  6: #include <KokkosSparse_spadd.hpp>

  8: PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A, MatAssemblyType mode)
  9: {
 10:   Mat_SeqAIJKokkos *aijkok;

 12:   MatAssemblyEnd_MPIAIJ(A, mode);
 13:   aijkok = static_cast<Mat_SeqAIJKokkos *>(((Mat_MPIAIJ *)A->data)->A->spptr); /* Access spptr after MatAssemblyEnd_MPIAIJ(), which might have deleted old spptr */
 14:   if (aijkok && aijkok->device_mat_d.data()) {
 15:     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
 16:   }

 18:   return 0;
 19: }

 21: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
 22: {
 23:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;

 25:   PetscLayoutSetUp(mat->rmap);
 26:   PetscLayoutSetUp(mat->cmap);
 27: #if defined(PETSC_USE_DEBUG)
 28:   if (d_nnz) {
 29:     PetscInt i;
 30:     for (i = 0; i < mat->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
 31:   }
 32:   if (o_nnz) {
 33:     PetscInt i;
 34:     for (i = 0; i < mat->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
 35:   }
 36: #endif
 37: #if defined(PETSC_USE_CTABLE)
 38:   PetscTableDestroy(&mpiaij->colmap);
 39: #else
 40:   PetscFree(mpiaij->colmap);
 41: #endif
 42:   PetscFree(mpiaij->garray);
 43:   VecDestroy(&mpiaij->lvec);
 44:   VecScatterDestroy(&mpiaij->Mvctx);
 45:   /* Because the B will have been resized we simply destroy it and create a new one each time */
 46:   MatDestroy(&mpiaij->B);

 48:   if (!mpiaij->A) {
 49:     MatCreate(PETSC_COMM_SELF, &mpiaij->A);
 50:     MatSetSizes(mpiaij->A, mat->rmap->n, mat->cmap->n, mat->rmap->n, mat->cmap->n);
 51:   }
 52:   if (!mpiaij->B) {
 53:     PetscMPIInt size;
 54:     MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
 55:     MatCreate(PETSC_COMM_SELF, &mpiaij->B);
 56:     MatSetSizes(mpiaij->B, mat->rmap->n, size > 1 ? mat->cmap->N : 0, mat->rmap->n, size > 1 ? mat->cmap->N : 0);
 57:   }
 58:   MatSetType(mpiaij->A, MATSEQAIJKOKKOS);
 59:   MatSetType(mpiaij->B, MATSEQAIJKOKKOS);
 60:   MatSeqAIJSetPreallocation(mpiaij->A, d_nz, d_nnz);
 61:   MatSeqAIJSetPreallocation(mpiaij->B, o_nz, o_nnz);
 62:   mat->preallocated = PETSC_TRUE;
 63:   return 0;
 64: }
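/* Example usage of the preallocation routine above (an illustrative sketch, not part of this file;
   the global size 100 and the per-row estimates 5 and 2 are made-up numbers):

     Mat A;
     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
     MatSetType(A, MATMPIAIJKOKKOS);
     MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL);

   Here 5 estimates the nonzeros per row of the diagonal block and 2 those of the off-diagonal block;
   exact per-row counts can be supplied through the d_nnz[]/o_nnz[] arrays instead. */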

 66: PetscErrorCode MatMult_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
 67: {
 68:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
 69:   PetscInt    nt;

 71:   VecGetLocalSize(xx, &nt);
 72:   PetscCheck(nt == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", mat->cmap->n, nt);
 73:   VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 74:   (*mpiaij->A->ops->mult)(mpiaij->A, xx, yy);
 75:   VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 76:   (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy);
 77:   return 0;
 78: }

 80: PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat, Vec xx, Vec yy, Vec zz)
 81: {
 82:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
 83:   PetscInt    nt;

 85:   VecGetLocalSize(xx, &nt);
 86:   PetscCheck(nt == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", mat->cmap->n, nt);
 87:   VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 88:   (*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz);
 89:   VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
 90:   (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz);
 91:   return 0;
 92: }

 94: PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
 95: {
 96:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
 97:   PetscInt    nt;

 99:   VecGetLocalSize(xx, &nt);
100:   PetscCheck(nt == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", mat->rmap->n, nt);
101:   (*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec);
102:   (*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy);
103:   VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
104:   VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
105:   return 0;
106: }
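/* The three mult routines above follow the standard MPIAIJ split y = Ad*x_local + Ao*x_ghost, where Ad
   (mpiaij->A) is the diagonal block and Ao (mpiaij->B) is the off-diagonal block with compacted local
   col ids. A sketch of the forward product, assuming the scatter fills lvec with the ghost entries of x:

     y = Ad * x_local          (overlapped with the scatter that fills lvec)
     y = y + Ao * lvec         (after VecScatterEnd)

   For the transpose, the roles reverse: Ao^T * x produces contributions destined for other ranks, which
   are added back to their owners with a reverse scatter (ADD_VALUES, SCATTER_REVERSE). */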

108: /* Merge the "A, B" matrices of mat into a matrix C.  mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
109:    A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n).
110:    C still uses local column ids. Their corresponding global column ids are returned in glob.
111: */
112: PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat, MatReuse reuse, IS *glob, Mat *C)
113: {
114:   Mat             Ad, Ao;
115:   const PetscInt *cmap;

117:   MatMPIAIJGetSeqAIJ(mat, &Ad, &Ao, &cmap);
118:   MatSeqAIJKokkosMergeMats(Ad, Ao, reuse, C);
119:   if (glob) {
120:     PetscInt cst, i, dn, on, *gidx;
121:     MatGetLocalSize(Ad, NULL, &dn);
122:     MatGetLocalSize(Ao, NULL, &on);
123:     MatGetOwnershipRangeColumn(mat, &cst, NULL);
124:     PetscMalloc1(dn + on, &gidx);
125:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
126:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
127:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
128:   }
129:   return 0;
130: }
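/* A worked example of the index set built above (made-up numbers): suppose this rank owns global
   columns [cst, cst+dn) = [10, 14), so dn = 4, and the off-diagonal block references global columns
   cmap[] = {2, 7, 20}, so on = 3. Then

     gidx = {10, 11, 12, 13, 2, 7, 20}

   i.e. local column j of the merged matrix C corresponds to global column gidx[j]. */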

132: /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
133: struct MatMatStruct {
134:   MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */
135:   PetscSF             sf;      /* SF to send/recv matrix entries */
136:   MatScalarKokkosView abuf;    /* buf of mat values in send/recv */
137:   Mat                 C1, C2, B_local;
138:   KokkosCsrMatrix     C1_global, C2_global, C_global;
139:   KernelHandle        kh;
140:   MatMatStruct()
141:   {
142:     C1 = C2 = B_local = NULL;
143:     sf                = NULL;
144:   }

146:   ~MatMatStruct()
147:   {
148:     MatDestroy(&C1);
149:     MatDestroy(&C2);
150:     MatDestroy(&B_local);
151:     PetscSFDestroy(&sf);
152:     kh.destroy_spadd_handle();
153:   }
154: };

156: struct MatMatStruct_AB : public MatMatStruct {
157:   MatColIdxKokkosView rows;
158:   MatRowMapKokkosView rowoffset;
159:   Mat                 B_other, C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */

161:   MatMatStruct_AB() : B_other(NULL), C_petsc(NULL) { }
162:   ~MatMatStruct_AB()
163:   {
164:     MatDestroy(&B_other);
165:     MatDestroy(&C_petsc);
166:   }
167: };

169: struct MatMatStruct_AtB : public MatMatStruct {
170:   MatRowMapKokkosView srcrowoffset, dstrowoffset;
171: };

173: struct MatProductData_MPIAIJKokkos {
174:   MatMatStruct_AB  *mmAB;
175:   MatMatStruct_AtB *mmAtB;
176:   PetscBool         reusesym;

178:   MatProductData_MPIAIJKokkos() : mmAB(NULL), mmAtB(NULL), reusesym(PETSC_FALSE) { }
179:   ~MatProductData_MPIAIJKokkos()
180:   {
181:     delete mmAB;
182:     delete mmAtB;
183:   }
184: };

186: static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
187: {
188:   delete static_cast<MatProductData_MPIAIJKokkos *>(data);
189:   return 0;
190: }

192: /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix

194:    Input Parameters:
195: +  A       - the MATSEQAIJKOKKOS matrix
196: .  N       - new column size for the returned Kokkos matrix
197: -  l2g     - a map that maps old col ids to new col ids

199:    Output Parameters:
200: .  csrmat  - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
201:  */
202: static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A, PetscInt N, const ConstMatColIdxKokkosView &l2g, KokkosCsrMatrix &csrmat)
203: {
204:   KokkosCsrMatrix    &orig = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
205:   MatColIdxKokkosView jg("jg", orig.nnz()); /* New j array for csrmat */

207:   PetscCallCXX(Kokkos::parallel_for(
208:     orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) { jg(i) = l2g(orig.graph.entries(i)); }));
209:   csrmat = KokkosCsrMatrix("csrmat", orig.numRows(), N, orig.nnz(), orig.values, orig.graph.row_map, jg);
210:   return 0;
211: }
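/* A small worked example of the remap above (made-up numbers): if A has 3 columns and l2g = {5, 9, 42},
   every entry of A's column array is mapped through l2g, so a row with local col ids {0, 2} appears in
   csrmat with global col ids {5, 42}. The row map and values are shared with A; only j is reallocated. */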

213: /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
214:    It is similar to MatCreateMPIAIJWithSplitArrays.

216:   Input Parameters:
217: +  mat   - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
218: .  A     - the diag matrix using local col ids
219: -  B     - the offdiag matrix using global col ids

221:   Output Parameters:
222: .  mat   - the updated MATMPIAIJKOKKOS matrix
223: */
224: static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat, Mat A, Mat B)
225: {
226:   Mat_MPIAIJ       *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
227:   PetscInt          m, n, M, N, Am, An, Bm, Bn;
228:   Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);

230:   MatGetSize(mat, &M, &N);
231:   MatGetLocalSize(mat, &m, &n);
232:   MatGetLocalSize(A, &Am, &An);
233:   MatGetLocalSize(B, &Bm, &Bn);

239:   mpiaij->A = A;
240:   mpiaij->B = B;

242:   mat->preallocated     = PETSC_TRUE;
243:   mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */

245:   MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE);
246:   MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
247:   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
248:     also gets mpiaij->B compacted, with its col ids and size reduced
249:   */
250:   MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
251:   MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE);
252:   MatSetOption(mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);

254:   /* Update bkok with new local col ids (stored on host) and size */
255:   bkok->j_dual.modify_host();
256:   bkok->j_dual.sync_device();
257:   bkok->SetColSize(mpiaij->B->cmap->n);
258:   return 0;
259: }
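/* Typical call sequence for the routine above (an illustrative sketch; Cd and Co stand for already
   assembled SEQAIJKOKKOS matrices, Cd with local col ids and Co with global col ids):

     Mat C;
     MatCreate(comm, &C);
     MatSetSizes(C, m, n, M, N);
     MatSetType(C, MATMPIAIJKOKKOS);
     MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C, Cd, Co);

   Ownership of Cd and Co passes to C; the MatAssemblyEnd() inside compacts Co's column space and
   converts its col ids to local ones. */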

 261: /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrix (B) to form a SEQAIJKOKKOS matrix (C).

263:    It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
264:    In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
 265:    If C's j-th row is connected to a root identified by PetscSFNode (k,i), then we will bcast the i-th row of B on rank k
266:    to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).

268:    Collective on comm of ownerSF

270:    Input Parameters:
271: +   B       - the SEQAIJKOKKOS matrix, using local col ids
272: .   reuse   - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
 273: .   N       - global col ids are in range of [0,N). N must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
274: .   l2g     - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
275: .   ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)

 277:    Input/Output Parameters (out when reuse = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
278: +   bcastSF   - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
279: .   abuf      - buffer for sending matrix values
280: .   rows      - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
281:                 Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
 282: .   rowoffset - for each row in rows[], the offset in abuf[] at which that row's values are placed
283: -   C         -  the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
284: */
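/* A concrete (made-up) example of the ownerSF semantics above: with two ranks, suppose rank 1 needs
   rows {0, 2} of B owned by rank 0 and row 1 of B owned by itself. On rank 1 the leaves of ownerSF are
   C's rows 0, 1, 2 and their roots are (rank 0, row 0), (rank 1, row 1), (rank 0, row 2). After the
   bcast, row j of C on rank 1 is a copy of the corresponding row of B with col ids mapped through l2g,
   while abuf, rows[] and rowoffset[] describe the packed send buffer on the owning side. */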
285: static PetscErrorCode MatSeqAIJKokkosBcast(Mat B, MatReuse reuse, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &bcastSF, MatScalarKokkosView &abuf, MatColIdxKokkosView &rows, MatRowMapKokkosView &rowoffset, Mat &C)
286: {
287:   Mat_SeqAIJKokkos *bkok, *ckok;

289:   MatSeqAIJKokkosSyncDevice(B); /* Make sure B->spptr is accessible */
290:   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);

292:   if (reuse == MAT_REUSE_MATRIX) {
293:     ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

295:     const auto &Ba = bkok->a_dual.view_device();
296:     const auto &Bi = bkok->i_dual.view_device();
297:     const auto &Ca = ckok->a_dual.view_device();

299:     /* Copy Ba to abuf */
300:     Kokkos::parallel_for(
301:       Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
302:         PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
303:         PetscInt r    = rows(i);
304:         PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
305:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) { abuf(base + k) = Ba(Bi(r) + k); });
306:       });

 308:     /* Send abuf to Ca through bcastSF and then mark C as updated on device */
309:     PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE); /* TODO: get memtype for abuf */
310:     PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE);
311:     ckok->a_dual.modify_device();
312:   } else if (reuse == MAT_INITIAL_MATRIX) {
313:     MPI_Comm    comm;
314:     PetscMPIInt tag;
315:     PetscInt    k, Cm, Cn, Cnnz, *Ci_h, nroots, nleaves;

317:     PetscObjectGetComm((PetscObject)ownerSF, &comm);
318:     PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
319:     Cm = nleaves; /* row size of C */
320:     Cn = N;       /* col size of C, which initially uses global ids, so we can safely set its col size as N */

322:     /* Get row lens (nz) of B's rows for later fast query */
323:     PetscInt       *Browlens;
324:     const PetscInt *tmp = bkok->i_host_data();
325:     PetscMalloc1(nroots, &Browlens);
326:     for (k = 0; k < nroots; k++) Browlens[k] = tmp[k + 1] - tmp[k];

328:     /* By ownerSF, each proc gets lens of rows of C */
329:     MatRowMapKokkosDualView Ci("i", Cm + 1); /* C's rowmap */
330:     Ci_h    = Ci.view_host().data();
331:     Ci_h[0] = 0;
332:     PetscSFBcastWithMemTypeBegin(ownerSF, MPIU_INT, PETSC_MEMTYPE_HOST, Browlens, PETSC_MEMTYPE_HOST, &Ci_h[1], MPI_REPLACE);
333:     PetscSFBcastEnd(ownerSF, MPIU_INT, Browlens, &Ci_h[1], MPI_REPLACE);
334:     for (k = 1; k < Cm + 1; k++) Ci_h[k] += Ci_h[k - 1]; /* Convert lens to CSR */
335:     Cnnz = Ci_h[Cm];
336:     Ci.modify_host();
337:     Ci.sync_device();

339:     /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
340:     MatColIdxKokkosDualView Cj("j", Cnnz);
341:     MatScalarKokkosDualView Ca("a", Cnnz);

343:     /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
344:     const PetscMPIInt *iranks, *ranks;
345:     const PetscInt    *ioffset, *irootloc, *roffset;
346:     PetscInt           i, j, niranks, nranks, *sdisp, *rdisp, *rowptr;
347:     MPI_Request       *reqs;

349:     PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &irootloc);                      /* irootloc[] contains indices of rows I need to send to each receiver */
350:     PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* recv info */

352:     /* figure out offsets at the send buffer, to build the SF
353:       sdisp[]  - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
354:       rowptr[] - stores offsets for data of each row in abuf

356:       rdisp[]  - to receive sdisp[]
357:     */
358:     PetscMalloc3(niranks + 1, &sdisp, nranks, &rdisp, niranks + nranks, &reqs);
359:     MatRowMapKokkosViewHost rowptr_h("rowptr_h", ioffset[niranks] + 1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
360:     rowptr = rowptr_h.data();

362:     sdisp[0]  = 0;
363:     rowptr[0] = 0;
364:     for (i = 0; i < niranks; i++) { /* for each receiver */
365:       PetscInt len, nz = 0;
366:       for (j = ioffset[i]; j < ioffset[i + 1]; j++) { /* for each row to this receiver */
367:         len           = Browlens[irootloc[j]];
368:         rowptr[j + 1] = rowptr[j] + len;
369:         nz += len;
370:       }
371:       sdisp[i + 1] = sdisp[i] + nz;
372:     }
373:     PetscCommGetNewTag(comm, &tag);
374:     for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
375:     for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
376:     MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);

378:     PetscInt     nleaves2 = Cnnz;           /* leaves are the nonzeros I will receive */
379:     PetscInt     nroots2  = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
380:     PetscSFNode *iremote;
381:     PetscMalloc1(nleaves2, &iremote);
382:     for (i = 0; i < nranks; i++) { /* for each sender */
383:       k = 0;
384:       for (j = Ci_h[roffset[i]]; j < Ci_h[roffset[i + 1]]; j++) {
385:         iremote[j].rank  = ranks[i];
386:         iremote[j].index = rdisp[i] + k;
387:         k++;
388:       }
389:     }
390:     /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
391:     PetscSFCreate(comm, &bcastSF);
392:     PetscSFSetGraph(bcastSF, nroots2, nleaves2, NULL /*ilocal*/, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);

394:     /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
395:       from local to global. Then use bcastSF to fill Ca, Cj.
396:     */
397:     ConstMatColIdxKokkosViewHost rows_h(irootloc, ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
398:     MatColIdxKokkosView          rows("rows", ioffset[niranks]);
 399:     Kokkos::deep_copy(rows, rows_h); /* Use deep copy since irootloc is managed by PetscSF and we want 'rows' to be standalone */

401:     rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */

403:     MatColIdxKokkosView jbuf("jbuf", sdisp[niranks]);   /* send buf for (global) col ids */
404:     abuf = MatScalarKokkosView("abuf", sdisp[niranks]); /* send buf for mat values */

406:     const auto &Ba = bkok->a_dual.view_device();
407:     const auto &Bi = bkok->i_dual.view_device();
408:     const auto &Bj = bkok->j_dual.view_device();

410:     /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */
411:     Kokkos::parallel_for(
412:       Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
413:         PetscInt i    = t.league_rank(); /* rows[i] is r-th row of B */
414:         PetscInt r    = rows(i);
415:         PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
416:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) {
417:           abuf(base + k) = Ba(Bi(r) + k);
418:           jbuf(base + k) = l2g(Bj(Bi(r) + k));
419:         });
420:       });

422:     /* Send abuf & jbuf to fill Ca, Cj */
423:     PetscSFBcastBegin(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
424:     PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
425:     PetscSFBcastEnd(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
426:     PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
427:     Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
428:     Cj.sync_host();
429:     Ca.modify_device();

431:     /* Construct C with Ca, Ci, Cj */
432:     auto ckok = new Mat_SeqAIJKokkos(Cm, Cn, Cnnz, Ci, Cj, Ca);
433:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, &C);
434:     PetscFree3(sdisp, rdisp, reqs);
435:     PetscFree(Browlens);
436:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
437:   return 0;
438: }

440: /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)

442:   It is the reverse of MatSeqAIJKokkosBcast in some sense.

 444:   Think of each row of A as a leaf; the given ownerSF then specifies the roots of the leaves. Roots may connect to multiple leaves.
445:   In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
446:   contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF.

448:   Input Parameters:
449: +  A        - the SEQAIJKOKKOS matrix to be reduced
450: .  reuse    - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
451: .  local    - true if A uses local col ids; false if A is already in global col ids.
452: .  N        - if local, N is A's global col size
453: .  l2g      - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
454: -  ownerSF  - the SF specifies ownership (root) of rows in A

456:   Output Parameters:
457: +  reduceSF    - the SF to reduce A's rows to contiguous buffers at the receiver side
458: .  abuf         - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
459: .  srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
460: .  dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to
 461:                   unrelated places in Ca, so dstrowoffset is not in the CSR-like format that srcrowoffset uses.
462: -  C            - the matrix made up by rows sent to me from other ranks, using global col ids

 464:    TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide an opportunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
465:  */
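/* A concrete (made-up) example of the reduce semantics above: with two ranks, suppose rows 3 and 5 of A
   on rank 1 both have root (rank 0, row 2) in ownerSF. After the reduce, row 2 of C on rank 0 is the
   concatenation of those two rows, possibly with repeated global col ids that a later spadd will sum.
   srcrowoffset records where each received row starts in abuf/jbuf, and dstrowoffset records where it
   is copied to inside C's row. */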
466: static PetscErrorCode MatSeqAIJKokkosReduce(Mat A, MatReuse reuse, PetscBool local, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &reduceSF, MatScalarKokkosView &abuf, MatRowMapKokkosView &srcrowoffset, MatRowMapKokkosView &dstrowoffset, KokkosCsrMatrix &C)
467: {
468:   PetscInt          i, r, Am, An, Annz, Cnnz, nrows;
469:   const PetscInt   *Ai;
470:   Mat_SeqAIJKokkos *akok;

472:   MatSeqAIJKokkosSyncDevice(A); /* So that A's latest data is on device */
473:   MatGetSize(A, &Am, &An);
474:   Ai   = static_cast<Mat_SeqAIJ *>(A->data)->i;
475:   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
476:   Annz = Ai[Am];

478:   if (reuse == MAT_REUSE_MATRIX) {
479:     /* Send Aa to abuf */
480:     PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
481:     PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);

483:     /* Copy abuf to Ca */
484:     const MatScalarKokkosView &Ca = C.values;
485:     nrows                         = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
486:     Kokkos::parallel_for(
487:       Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
488:         PetscInt i   = t.league_rank();
489:         PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
490:         PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
491:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) { Ca(dst + k) = abuf(src + k); });
492:       });
493:   } else if (reuse == MAT_INITIAL_MATRIX) {
494:     MPI_Comm     comm;
495:     MPI_Request *reqs;
496:     PetscMPIInt  tag;
497:     PetscInt     Cm;

499:     PetscObjectGetComm((PetscObject)ownerSF, &comm);
500:     PetscCommGetNewTag(comm, &tag);

502:     PetscInt           niranks, nranks, nroots, nleaves;
503:     const PetscMPIInt *iranks, *ranks;
504:     const PetscInt    *ioffset, *rows, *roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
505:     PetscSFSetUp(ownerSF);
506:     PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &rows);                          /* recv info: iranks[] will send rows to me */
507:     PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* send info */
508:     PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
510:     Cm    = nroots;
511:     nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */

 513:     /* Tell owners the length of each row I will send */
514:     PetscInt               *srowlens;                              /* send buf of row lens */
515:     MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h", nrows + 1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
516:     PetscInt               *rrowlens = rrowlens_h.data();

518:     PetscMalloc2(Am, &srowlens, niranks + nranks, &reqs);
519:     for (i = 0; i < Am; i++) srowlens[i] = Ai[i + 1] - Ai[i];
520:     rrowlens[0] = 0;
521:     rrowlens++; /* shift the pointer to make the following expression more readable */
522:     for (i = 0; i < niranks; i++) MPI_Irecv(&rrowlens[ioffset[i]], ioffset[i + 1] - ioffset[i], MPIU_INT, iranks[i], tag, comm, &reqs[i]);
523:     for (i = 0; i < nranks; i++) MPI_Isend(&srowlens[roffset[i]], roffset[i + 1] - roffset[i], MPIU_INT, ranks[i], tag, comm, &reqs[niranks + i]);
524:     MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);

526:     /* Owner builds Ci on host by histogramming rrowlens[] */
527:     MatRowMapKokkosViewHost Ci_h("i", Cm + 1);
528:     Kokkos::deep_copy(Ci_h, 0); /* Zero Ci */
529:     MatRowMapType *Ci_ptr = Ci_h.data();

531:     for (i = 0; i < nrows; i++) {
532:       r = rows[i]; /* local row id of i-th received row */
 533: #if defined(PETSC_USE_DEBUG)
 534:       PetscCheck(r >= 0 && r < Cm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Received row id %" PetscInt_FMT " is out of range [0,%" PetscInt_FMT ")", r, Cm);
 535: #endif
536:       Ci_ptr[r + 1] += rrowlens[i]; /* add to length of row r in C */
537:     }
538:     for (i = 0; i < Cm; i++) Ci_ptr[i + 1] += Ci_ptr[i]; /* to CSR format */
539:     Cnnz = Ci_ptr[Cm];

541:     /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
542:     MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h", nrows);
543:     PetscInt               *dstrowoffset_hptr = dstrowoffset_h.data();
544:     PetscInt               *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */

546:     PetscCalloc1(Cm, &currowlens);           /* Init with zero, to be added to */
547:     for (i = 0; i < nrows; i++) {                       /* for each row I receive */
548:       r                    = rows[i];                   /* row id in C */
549:       dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
550:       currowlens[r] += rrowlens[i];                     /* accumulate to length of row r in C */
551:     }
552:     PetscFree(currowlens);

554:     rrowlens--;
555:     for (i = 0; i < nrows; i++) rrowlens[i + 1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
556:     dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), dstrowoffset_h);
557:     srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */

559:     /* Build the reduceSF, which performs buffer to buffer send/recv */
560:     PetscInt *sdisp, *rdisp; /* buffer to send offsets of roots, and buffer to recv them */
561:     PetscMalloc2(niranks, &sdisp, nranks, &rdisp);
562:     for (i = 0; i < niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
563:     for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
564:     for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
565:     MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);

567:     /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
568:     PetscInt     nroots2 = Cnnz, nleaves2 = Annz;
569:     PetscSFNode *iremote;
570:     PetscMalloc1(nleaves2, &iremote); /* no free, since memory will be given to reduceSF */
571:     for (i = 0; i < nranks; i++) {
572:       PetscInt rootbase = rdisp[i];                      /* root offset at this root rank */
573:       PetscInt leafbase = Ai[roffset[i]];                /* leaf base */
574:       PetscInt nz       = Ai[roffset[i + 1]] - leafbase; /* I will send nz nonzeros to this root rank */
575:       for (PetscInt k = 0; k < nz; k++) {
576:         iremote[leafbase + k].rank  = ranks[i];
577:         iremote[leafbase + k].index = rootbase + k;
578:       }
579:     }
580:     PetscSFCreate(comm, &reduceSF);
581:     PetscSFSetGraph(reduceSF, nroots2, nleaves2, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
582:     PetscFree2(sdisp, rdisp);

584:     /* Reduce Aa, Ajg to abuf and jbuf */

586:     /* If A uses local col ids, convert them to global ones before sending */
587:     MatColIdxKokkosView Ajg;
588:     if (local) {
589:       Ajg                           = MatColIdxKokkosView("j", Annz);
590:       const MatColIdxKokkosView &Aj = akok->j_dual.view_device();
591:       Kokkos::parallel_for(
592:         Annz, KOKKOS_LAMBDA(const PetscInt i) { Ajg(i) = l2g(Aj(i)); });
593:     } else {
594:       Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
595:     }

597:     MatColIdxKokkosView jbuf("jbuf", Cnnz);
598:     abuf = MatScalarKokkosView("abuf", Cnnz);
599:     PetscSFReduceBegin(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
600:     PetscSFReduceEnd(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
601:     PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
602:     PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);

604:     /* Copy data from abuf, jbuf to Ca, Cj */
605:     MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ci_h); /* Ci is an alias of Ci_h if no device */
606:     MatColIdxKokkosView Cj("j", Cnnz);
607:     MatScalarKokkosView Ca("a", Cnnz);

609:     Kokkos::parallel_for(
610:       Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
611:         PetscInt i   = t.league_rank();
612:         PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
613:         PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
614:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) {
615:           Ca(dst + k) = abuf(src + k);
616:           Cj(dst + k) = jbuf(src + k);
617:         });
618:       });

620:     /* Build C with Ca, Ci, Cj */
621:     C = KokkosCsrMatrix("csrmat", Cm, N, Cnnz, Ca, Ci, Cj);
622:     PetscFree2(srowlens, reqs);
623:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
624:   return 0;
625: }

627: /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a `MATMPIAIJKOKKOS` matrix by splitting a KokkosCsrMatrix

629:   Input Parameters:
630: +  C        - the `MATMPIAIJKOKKOS` matrix, of size m,n,M,N
631: .  reuse    - indicate whether the matrix has called this function before
632: .  csrmat   - the KokkosCsrMatrix, of size m,N
 633: -  Cdstart  - when reuse == `MAT_REUSE_MATRIX`, it is an input parameter. For each row in csrmat, it stores the position of the first
 634:               entry of the diag block of C within that row of csrmat's j array. E.g., if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
 635:               entry is 5, then Cdstart[i] = 3.

637:   Output Parameters:
638: +  C        - the updated `MATMPIAIJKOKKOS` matrix
639: -  Cdstart - when reuse == `MAT_INITIAL_MATRIX`, it is an output parameter

641:   Note:
642:    Between calls with `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`, csrmat must have the same nonzero pattern

644: .seealso: `MATMPIAIJKOKKOS`
645:  */
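/* Continuing the example in the comment above (made-up numbers): if the owned column range is
   [cstart, cend) = [5, 8), then the row with global col ids {0, 3, 4, 5, 7, 9} is split as

     left off-diag block : {0, 3, 4}   so Cdstart[i] = 3
     diag block          : {5, 7}      so Cd's row length is 2, stored with local col ids {0, 2}
     right off-diag block: {9}

   giving Cd two entries and Co four entries from this row. */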
646: static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C, MatReuse reuse, const KokkosCsrMatrix &csrmat, MatRowMapKokkosView &Cdstart)
647: {
648:   const MatScalarKokkosView      &Ca = csrmat.values;
649:   const ConstMatRowMapKokkosView &Ci = csrmat.graph.row_map;
650:   PetscInt                        m, n, N;

652:   MatGetLocalSize(C, &m, &n);
653:   MatGetSize(C, NULL, &N);

655:   if (reuse == MAT_REUSE_MATRIX) {
656:     Mat_MPIAIJ                *mpiaij = static_cast<Mat_MPIAIJ *>(C->data);
657:     Mat_SeqAIJKokkos          *akok   = static_cast<Mat_SeqAIJKokkos *>(mpiaij->A->spptr);
658:     Mat_SeqAIJKokkos          *bkok   = static_cast<Mat_SeqAIJKokkos *>(mpiaij->B->spptr);
659:     const MatScalarKokkosView &Cda = akok->a_dual.view_device(), Coa = bkok->a_dual.view_device();
660:     const MatRowMapKokkosView &Cdi = akok->i_dual.view_device(), Coi = bkok->i_dual.view_device();

662:     /* Fill 'a' of Cd and Co on device */
663:     Kokkos::parallel_for(
664:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
665:         PetscInt i       = t.league_rank();     /* row i */
666:         PetscInt clen    = Ci(i + 1) - Ci(i);   /* len of row i of C */
667:         PetscInt cdlen   = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
668:         PetscInt cdstart = Cdstart(i);          /* [start, end) of row i of Cd in C */
669:         PetscInt cdend   = cdstart + cdlen;
670:         /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
671:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
672:           if (k < cdstart) { /* k in [0, cdstart) */
673:             Coa(Coi(i) + k) = Ca(Ci(i) + k);
674:           } else if (k < cdend) { /* k in [cdstart, cdend) */
675:             Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
676:           } else { /* k in [cdend, clen) */
677:             Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
678:           }
679:         });
680:       });

682:     akok->a_dual.modify_device();
683:     bkok->a_dual.modify_device();
684:   } else if (reuse == MAT_INITIAL_MATRIX) {
685:     Mat                        Cd, Co;
686:     const MatColIdxKokkosView &Cj = csrmat.graph.entries;
687:     MatRowMapKokkosDualView    Cdi_dual("i", m + 1), Coi_dual("i", m + 1);
688:     MatRowMapKokkosView        Cdi = Cdi_dual.view_device(), Coi = Coi_dual.view_device();
689:     PetscInt                   cstart, cend;

691:     /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
 692:        left to the diag block, the diag block itself, and right to the diag block. The diag block has col ids in [cstart,cend).
 693:        Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
 694:        such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalently
 695:        stores the values of cdstart and cdend-cdstart (aka Cdi[]) instead.
696:      */
697:     Cdstart = MatRowMapKokkosView("Cdstart", m);
698:     PetscLayoutGetRange(C->cmap, &cstart, &cend); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */

700:     /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
701:       CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
702:      */
 703:     Kokkos::parallel_for(
 704:       Kokkos::TeamPolicy<>(m, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
 705:         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
 706:           PetscInt i = t.league_rank();            /* row i */
 707:           PetscInt j, first, count, step;

 709:           if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
 710:             Cdi(0) = 0;
 711:             Coi(0) = 0;
 712:           }

 714:           /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
 715:              in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
 716:           */
 717:           count = Ci(i + 1) - Ci(i);
 718:           first = Ci(i);
 719:           while (count > 0) {
 720:             j    = first;
 721:             step = count / 2;
 722:             j += step;
 723:             if (Cj(j) < cstart) {
 724:               first = ++j;
 725:               count -= step + 1;
 726:             } else count = step;
 727:           }
 728:           Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */

 730:           /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
 731:           count = Ci(i + 1) - first;
 732:           while (count > 0) {
 733:             j    = first;
 734:             step = count / 2;
 735:             j += step;
 736:             if (Cj(j) < cend) {
 737:               first = ++j;
 738:               count -= step + 1;
 739:             } else count = step;
 740:           }
 741:           Cdi(i + 1) = first - (Ci(i) + Cdstart(i));     /* 'first' is the while-loop's output */
 742:           Coi(i + 1) = (Ci(i + 1) - Ci(i)) - Cdi(i + 1); /* Co's row len = C's row len - Cd's row len */
 743:         });
 744:       });

746:     /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
747:     Kokkos::parallel_scan(
748:       m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
749:         update += Cdi(i);
750:         if (final) Cdi(i) = update;
751:       });
752:     Kokkos::parallel_scan(
753:       m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
754:         update += Coi(i);
755:         if (final) Coi(i) = update;
756:       });

758:     /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
759:        MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
760:     */
761:     Cdi_dual.modify_device();
762:     Coi_dual.modify_device();
763:     Cdi_dual.sync_host();
764:     Coi_dual.sync_host();
765:     PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
766:     PetscInt Co_nnz = Coi_dual.view_host().data()[m];

768:     /* With nnz, allocate a, j for Cd and Co */
769:     MatColIdxKokkosDualView Cdj_dual("j", Cd_nnz), Coj_dual("j", Co_nnz);
770:     MatScalarKokkosDualView Cda_dual("a", Cd_nnz), Coa_dual("a", Co_nnz);

772:     /* Fill a, j of Cd and Co on device */
773:     MatColIdxKokkosView Cdj = Cdj_dual.view_device(), Coj = Coj_dual.view_device();
774:     MatScalarKokkosView Cda = Cda_dual.view_device(), Coa = Coa_dual.view_device();

776:     Kokkos::parallel_for(
777:       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
778:         PetscInt i       = t.league_rank();     /* row i */
779:         PetscInt clen    = Ci(i + 1) - Ci(i);   /* len of row i of C */
780:         PetscInt cdlen   = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
781:         PetscInt cdstart = Cdstart(i);          /* [start, end) of row i of Cd in C */
782:         PetscInt cdend   = cdstart + cdlen;
783:         /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
784:         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
785:           if (k < cdstart) { /* k in [0, cdstart) */
786:             Coa(Coi(i) + k) = Ca(Ci(i) + k);
787:             Coj(Coi(i) + k) = Cj(Ci(i) + k);
788:           } else if (k < cdend) { /* k in [cdstart, cdend) */
789:             Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
790:             Cdj(Cdi(i) + (k - cdstart)) = Cj(Ci(i) + k) - cstart; /* Use local col ids in Cdj */
791:           } else {                                                /* k in [cdend, clen) */
792:             Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
793:             Coj(Coi(i) + k - cdlen) = Cj(Ci(i) + k);
794:           }
795:         });
796:       });

798:     Cdj_dual.modify_device();
799:     Cda_dual.modify_device();
800:     Coj_dual.modify_device();
801:     Coa_dual.modify_device();
802:     /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
803:     auto cdkok = new Mat_SeqAIJKokkos(m, n, Cd_nnz, Cdi_dual, Cdj_dual, Cda_dual);
804:     auto cokok = new Mat_SeqAIJKokkos(m, N, Co_nnz, Coi_dual, Coj_dual, Coa_dual);
805:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cdkok, &Cd);
806:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cokok, &Co);
807:     MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C, Cd, Co); /* Coj will be converted to local ids within */
808:   }
809:   return 0;
810: }

 812: /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKOS matrix's global col ids.

814:   It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view.

 816:   Input Parameter:
 817: .  C        - the SEQAIJKOKKOS matrix, whose col ids are currently global

 823:   Output Parameters:
 824: +  C        - the updated matrix, using compacted local col ids and a reduced col size
 825: -  l2g      - a Kokkos view mapping C's new local col ids back to the original global col ids

827:   Note:
828:   the input matrix's col ids and col size will be changed.
829: */
830: static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C, MatColIdxKokkosView &l2g)
831: {
832:   Mat_SeqAIJKokkos      *ckok;
833:   ISLocalToGlobalMapping l2gmap;
834:   const PetscInt        *garray;
835:   PetscInt               sz;

 837:   /* Compact C's (e.g., P_other's) global col ids and col size. We do this since we guess KK might be more memory scalable with local col ids */
838:   MatSeqAIJCompactOutExtraColumns_SeqAIJ(C, &l2gmap);
839:   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
840:   ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */
841:   ckok->j_dual.sync_device();
842:   ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */

844:   /* Build l2g -- the local to global mapping of C's cols */
845:   ISLocalToGlobalMappingGetIndices(l2gmap, &garray);
846:   ISLocalToGlobalMappingGetSize(l2gmap, &sz);

849:   ConstMatColIdxKokkosViewHost tmp(garray, sz);
850:   l2g = MatColIdxKokkosView("l2g", sz);
851:   Kokkos::deep_copy(l2g, tmp);

853:   ISLocalToGlobalMappingRestoreIndices(l2gmap, &garray);
854:   ISLocalToGlobalMappingDestroy(&l2gmap);
855:   return 0;
856: }
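/* A small worked example of the compaction above (made-up numbers): if C's j array currently holds the
   global col ids {3, 100, 7, 7, 100}, then after compaction C uses local col ids {0, 2, 1, 1, 2}, its
   col size becomes 3, and l2g = {3, 7, 100}. */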

858: /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos

860:   Input Parameters:
861: +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
862: .  A        - an MPIAIJKOKKOS matrix
863: .  B        - an MPIAIJKOKKOS matrix
 864: -  mm       - a struct used to stash intermediate data when computing AB. It persists from the symbolic to the numeric operation.

866:   Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
867: */
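/* In matrix terms, with A split into its diag/offdiag blocks [Ad Ao] and B's rows partitioned across
   ranks, the local part of C = A*B is assembled as

     C_local = Ad * B_local + Ao * B_other

   where B_local merges Bd and Bo on this rank and B_other holds the rows of B owned by other ranks that
   Ao's columns reference. C1 and C2 below are these two products; they are added with
   KokkosSparse::spadd using global col ids. */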
868: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product, Mat A, Mat B, MatMatStruct_AB *mm)
869: {
870:   Mat_MPIAIJ              *a  = static_cast<Mat_MPIAIJ *>(A->data);
871:   Mat                      Ad = a->A, Ao = a->B; /* diag and offdiag of A */
872:   IS                       glob = NULL;
873:   const PetscInt          *garray;
874:   PetscInt                 N = B->cmap->N, sz;
875:   ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */
876:   MatColIdxKokkosView      l2g2;
877:   Mat                      C1, C2; /* intermediate matrices */

 879:   /* C1 = Ad * B_local. B_local is a matrix obtained by merging Bd and Bo, and it uses local col ids */
880:   MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &mm->B_local);
881:   MatProductCreate(Ad, mm->B_local, NULL, &C1);
882:   MatProductSetType(C1, MATPRODUCT_AB);
883:   MatProductSetFill(C1, product->fill);
884:   C1->product->api_user = product->api_user;
885:   MatProductSetFromOptions(C1);
886:   PetscUseTypeMethod(C1, productsymbolic);

888:   ISGetIndices(glob, &garray);
889:   ISGetSize(glob, &sz);
890:   const auto &tmp = ConstMatColIdxKokkosViewHost(garray, sz);                       /* wrap garray as a view */
891:   l2g1            = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
892:   MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g1, mm->C1_global);

894:   /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
895:   MatSeqAIJKokkosBcast(mm->B_local, MAT_INITIAL_MATRIX, N, l2g1, a->Mvctx, mm->sf, mm->abuf, mm->rows, mm->rowoffset, mm->B_other);

 897:   /* Compact B_other to use local col ids, as we guess KK spgemm is more memory scalable with that; we could skip the compaction to simplify the code */
898:   MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other, l2g2);
899:   MatProductCreate(Ao, mm->B_other, NULL, &C2);
900:   MatProductSetType(C2, MATPRODUCT_AB);
901:   MatProductSetFill(C2, product->fill);
902:   C2->product->api_user = product->api_user;
903:   MatProductSetFromOptions(C2);
904:   PetscUseTypeMethod(C2, productsymbolic);
905:   MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2, N, l2g2, mm->C2_global);

907:   /* C = C1 + C2.  We actually use their global col ids versions in adding */
908:   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
909:   KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
910:   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
911:   KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);

913:   mm->C1 = C1;
914:   mm->C2 = C2;
915:   ISRestoreIndices(glob, &garray);
916:   ISDestroy(&glob);
917:   return 0;
918: }

920: /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos

922:   Input Parameters:
923: +  product  - Mat_Product which carried out the computation. Passed in to access info about this mat product.
924: .  A        - an MPIAIJKOKKOS matrix
925: .  B        - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
926: .  localB   - Does B use local col ids? If false, then B is already in global col ids.
927: .  N        - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
928: .  l2g      - If localB, then l2g maps B's local col ids to global ones.
929: -  mm       - a struct used to stash intermediate data in AtB

931:   Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
932: */
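/* In matrix terms, with A split into [Ad Ao], the local part of C = A^t * B is assembled as

     C_local = Ad^t * B + reduce(Ao^t * B)

   where the rows of Ao^t * B belong to other ranks; MatSeqAIJKokkosReduce ships them to their owners
   (using a->Mvctx as the ownerSF) and concatenates them there. C1 and C2 below are these two products,
   added with KokkosSparse::spadd using global col ids. */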
933: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product, Mat A, Mat B, PetscBool localB, PetscInt N, const ConstMatColIdxKokkosView &l2g, MatMatStruct_AtB *mm)
934: {
935:   Mat_MPIAIJ *a  = static_cast<Mat_MPIAIJ *>(A->data);
936:   Mat         Ad = a->A, Ao = a->B; /* diag and offdiag of A */
937:   Mat         C1, C2;               /* intermediate matrices */

939:   /* C1 = Ad^t * B */
940:   MatProductCreate(Ad, B, NULL, &C1);
941:   MatProductSetType(C1, MATPRODUCT_AtB);
942:   MatProductSetFill(C1, product->fill);
943:   C1->product->api_user = product->api_user;
944:   MatProductSetFromOptions(C1);
945:   PetscUseTypeMethod(C1, productsymbolic);

947:   if (localB) MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g, mm->C1_global);
948:   else mm->C1_global = static_cast<Mat_SeqAIJKokkos *>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */

950:   /* C2 = Ao^t * B */
951:   MatProductCreate(Ao, B, NULL, &C2);
952:   MatProductSetType(C2, MATPRODUCT_AtB);
953:   MatProductSetFill(C2, product->fill);
954:   C2->product->api_user = product->api_user;
955:   MatProductSetFromOptions(C2);
956:   PetscUseTypeMethod(C2, productsymbolic);

958:   MatSeqAIJKokkosReduce(C2, MAT_INITIAL_MATRIX, localB, N, l2g, a->Mvctx, mm->sf, mm->abuf, mm->srcrowoffset, mm->dstrowoffset, mm->C2_global);

 960:   mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may not be */
961:   KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
962:   /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
963:   KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);
964:   mm->C1 = C1;
965:   mm->C2 = C2;
966:   return 0;
967: }

969: PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
970: {
971:   Mat_Product                 *product = C->product;
972:   MatProductType               ptype;
973:   MatProductData_MPIAIJKokkos *mmdata;
974:   MatMatStruct                *mm = NULL;
975:   MatMatStruct_AB             *ab;
976:   MatMatStruct_AtB            *atb;
977:   Mat                          A, B, Ad, Ao, Bd, Bo;
 978:   const MatScalarType          one = 1.0; /* Do not use the literal 1.0 directly, to avoid a wrong template instantiation in KokkosSparse::spadd_numeric */

980:   MatCheckProduct(C, 1);
981:   mmdata = static_cast<MatProductData_MPIAIJKokkos *>(product->data);
982:   ptype  = product->type;
983:   A      = product->A;
984:   B      = product->B;
985:   Ad     = static_cast<Mat_MPIAIJ *>(A->data)->A;
986:   Ao     = static_cast<Mat_MPIAIJ *>(A->data)->B;
987:   Bd     = static_cast<Mat_MPIAIJ *>(B->data)->A;
988:   Bo     = static_cast<Mat_MPIAIJ *>(B->data)->B;

990:   if (mmdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
991:     mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
992:     ab               = mmdata->mmAB;
993:     atb              = mmdata->mmAtB;
994:     if (ab) {
995:       static_cast<MatProductData_SeqAIJKokkos *>(ab->C1->product->data)->reusesym = PETSC_FALSE;
996:       static_cast<MatProductData_SeqAIJKokkos *>(ab->C2->product->data)->reusesym = PETSC_FALSE;
997:     }
998:     if (atb) {
999:       static_cast<MatProductData_SeqAIJKokkos *>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1000:       static_cast<MatProductData_SeqAIJKokkos *>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1001:     }
1002:     return 0;
1003:   }

1005:   if (ptype == MATPRODUCT_AB) {
1006:     ab = mmdata->mmAB;
1007:     /* C1 = Ad * B_local */
1009:     MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);
1011:     if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1012:     (*ab->C1->ops->productnumeric)(ab->C1);
1013:     MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1014:     /* C2 = Ao * B_other */
1016:     if (ab->C1->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1017:     (*ab->C2->ops->productnumeric)(ab->C2);
1018:     /* C = C1_global + C2_global */
1019:     KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);
1020:     mm = static_cast<MatMatStruct *>(ab);
1021:   } else if (ptype == MATPRODUCT_AtB) {
1022:     atb = mmdata->mmAtB;
1024:     /* C1 = Ad^t * B_local */
1025:     MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &atb->B_local);
1027:     if (atb->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, atb->C1);
1028:     (*atb->C1->ops->productnumeric)(atb->C1);

1030:     /* C2 = Ao^t * B_local */
1032:     if (atb->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, atb->C2);
1033:     (*atb->C2->ops->productnumeric)(atb->C2);
1034:     /* Form C2_global */
1035:     MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_TRUE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1036:     /* C = C1_global + C2_global */
1037:     KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1038:     mm = static_cast<MatMatStruct *>(atb);
1039:   } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1040:     ab = mmdata->mmAB;
1041:     MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);

1043:     /* ab->C1 = Ad * B_local */
1045:     if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1046:     (*ab->C1->ops->productnumeric)(ab->C1);
1047:     MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1048:     /* ab->C2 = Ao * B_other */
1049:     if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1050:     (*ab->C2->ops->productnumeric)(ab->C2); /* C2 = Ao * B_other */
1051:     KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);

1053:     /* atb->C1 = Bd^t * ab->C_petsc */
1054:     atb = mmdata->mmAtB;
1056:     if (atb->C1->product->A != Bd) MatProductReplaceMats(Bd, NULL, NULL, atb->C1);
1057:     (*atb->C1->ops->productnumeric)(atb->C1);
1058:     /* atb->C2 = Bo^t * ab->C_petsc */
1059:     if (atb->C2->product->A != Bo) MatProductReplaceMats(Bo, NULL, NULL, atb->C2);
1060:     (*atb->C2->ops->productnumeric)(atb->C2);
1061:     MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_FALSE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /* ownerSF */, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1062:     KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1063:     mm = static_cast<MatMatStruct *>(atb);
1064:   }
1065:   /* Split C_global to form C */
1066:   MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_REUSE_MATRIX, mm->C_global, mm->Cdstart);
1067:   return 0;
1068: }

1070: PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1071: {
1072:   Mat                          A, B;
1073:   Mat_Product                 *product = C->product;
1074:   MatProductType               ptype;
1075:   MatProductData_MPIAIJKokkos *mmdata;
1076:   MatMatStruct                *mm   = NULL;
1077:   IS                           glob = NULL;
1078:   const PetscInt              *garray;
1079:   PetscInt                     m, n, M, N, sz;
1080:   ConstMatColIdxKokkosView     l2g; /* map local col ids to global ones */

1082:   MatCheckProduct(C, 1);
1084:   ptype = product->type;
1085:   A     = product->A;
1086:   B     = product->B;

1088:   switch (ptype) {
1089:   case MATPRODUCT_AB:
1090:     m = A->rmap->n;
1091:     n = B->cmap->n;
1092:     M = A->rmap->N;
1093:     N = B->cmap->N;
1094:     break;
1095:   case MATPRODUCT_AtB:
1096:     m = A->cmap->n;
1097:     n = B->cmap->n;
1098:     M = A->cmap->N;
1099:     N = B->cmap->N;
1100:     break;
1101:   case MATPRODUCT_PtAP:
1102:     m = B->cmap->n;
1103:     n = B->cmap->n;
1104:     M = B->cmap->N;
1105:     N = B->cmap->N;
1106:     break; /* BtAB */
1107:   default:
1108:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
1109:   }

1111:   MatSetSizes(C, m, n, M, N);
1112:   PetscLayoutSetUp(C->rmap);
1113:   PetscLayoutSetUp(C->cmap);
1114:   MatSetType(C, ((PetscObject)A)->type_name);

1116:   mmdata           = new MatProductData_MPIAIJKokkos();
1117:   mmdata->reusesym = product->api_user;

1119:   if (ptype == MATPRODUCT_AB) {
1120:     mmdata->mmAB = new MatMatStruct_AB();
1121:     MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, mmdata->mmAB);
1122:     mm = static_cast<MatMatStruct *>(mmdata->mmAB);
1123:   } else if (ptype == MATPRODUCT_AtB) {
1124:     mmdata->mmAtB = new MatMatStruct_AtB();
1125:     auto atb      = mmdata->mmAtB;
1126:     MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &atb->B_local);
1127:     ISGetIndices(glob, &garray);
1128:     ISGetSize(glob, &sz);
1129:     l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatColIdxKokkosViewHost(garray, sz));
1130:     MatProductSymbolic_MPIAIJKokkos_AtB(product, A, atb->B_local, PETSC_TRUE, N, l2g, atb);
1131:     ISRestoreIndices(glob, &garray);
1132:     ISDestroy(&glob);
1133:     mm = static_cast<MatMatStruct *>(atb);
1134:   } else if (ptype == MATPRODUCT_PtAP) {    /* BtAB */
1135:     mmdata->mmAB  = new MatMatStruct_AB();  /* tmp=A*B */
1136:     mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1137:     auto ab       = mmdata->mmAB;
1138:     auto atb      = mmdata->mmAtB;
1139:     MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, ab);
1140:     auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1141:     MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, tmp, &ab->C_petsc);
1142:     MatProductSymbolic_MPIAIJKokkos_AtB(product, B, ab->C_petsc, PETSC_FALSE, N, l2g /*not used*/, atb);
1143:     mm = static_cast<MatMatStruct *>(atb);
1144:   }
1145:   /* Split the C_global into petsc A, B format */
1146:   MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_INITIAL_MATRIX, mm->C_global, mm->Cdstart);
1147:   C->product->data       = mmdata;
1148:   C->product->destroy    = MatProductDataDestroy_MPIAIJKokkos;
1149:   C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos;
1150:   return 0;
1151: }
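
/* For reference, a typical caller-side sequence that lands in the routines above is the generic
   MatProduct API (a sketch; A and B are assumed to be assembled MATMPIAIJKOKKOS matrices):

     Mat C;
     MatProductCreate(A, B, NULL, &C);
     MatProductSetType(C, MATPRODUCT_AB);   // or MATPRODUCT_AtB, MATPRODUCT_PtAP
     MatProductSetFromOptions(C);           // dispatches to MatProductSetFromOptions_MPIAIJKokkos() below
     MatProductSymbolic(C);                 // calls MatProductSymbolic_MPIAIJKokkos() above
     MatProductNumeric(C);                  // calls MatProductNumeric_MPIAIJKokkos()
     MatDestroy(&C);
*/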

1153: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1154: {
1155:   Mat_Product *product = mat->product;
1156:   PetscBool    match   = PETSC_FALSE;
1157:   PetscBool    usecpu  = PETSC_FALSE;

1159:   MatCheckProduct(mat, 1);
1160:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match);
1161:   if (match) { /* we can always fall back to the CPU if requested */
1162:     switch (product->type) {
1163:     case MATPRODUCT_AB:
1164:       if (product->api_user) {
1165:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
1166:         PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1167:         PetscOptionsEnd();
1168:       } else {
1169:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
1170:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1171:         PetscOptionsEnd();
1172:       }
1173:       break;
1174:     case MATPRODUCT_AtB:
1175:       if (product->api_user) {
1176:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
1177:         PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1178:         PetscOptionsEnd();
1179:       } else {
1180:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
1181:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1182:         PetscOptionsEnd();
1183:       }
1184:       break;
1185:     case MATPRODUCT_PtAP:
1186:       if (product->api_user) {
1187:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
1188:         PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1189:         PetscOptionsEnd();
1190:       } else {
1191:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
1192:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1193:         PetscOptionsEnd();
1194:       }
1195:       break;
1196:     default:
1197:       break;
1198:     }
1199:     match = (PetscBool)!usecpu;
1200:   }
1201:   if (match) {
1202:     switch (product->type) {
1203:     case MATPRODUCT_AB:
1204:     case MATPRODUCT_AtB:
1205:     case MATPRODUCT_PtAP:
1206:       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1207:       break;
1208:     default:
1209:       break;
1210:     }
1211:   }
1212:   /* fall back to MPIAIJ ops */
1213:   if (!mat->ops->productsymbolic) MatProductSetFromOptions_MPIAIJ(mat);
1214:   return 0;
1215: }
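
/* Example (an illustrative command line, not taken from this file): keep the matrices in Kokkos format
   but force the CPU fallback for C = A*B:

     ./app -mat_type aijkokkos -matmatmult_backend_cpu

   When the product is configured through the MatProduct API instead of MatMatMult() (api_user false),
   the equivalent option is -mat_product_algorithm_backend_cpu; for A^t*B and P^t*A*P the corresponding
   api_user options are -mattransposematmult_backend_cpu and -matptap_backend_cpu. */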

1217: static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1218: {
1219:   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ *)mat->data;
1220:   Mat_MPIAIJKokkos *mpikok;

1222:   MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
1223:   mat->preallocated = PETSC_TRUE;
1224:   MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
1225:   MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
1226:   MatZeroEntries(mat);
1227:   mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1228:   delete mpikok;
1229:   mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
1230:   return 0;
1231: }

1233: static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat, const PetscScalar v[], InsertMode imode)
1234: {
1235:   Mat_MPIAIJ                 *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
1236:   Mat_MPIAIJKokkos           *mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1237:   Mat                         A = mpiaij->A, B = mpiaij->B;
1238:   PetscCount                  Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
1239:   MatScalarKokkosView         Aa, Ba;
1240:   MatScalarKokkosView         v1;
1241:   MatScalarKokkosView        &vsend  = mpikok->sendbuf_d;
1242:   const MatScalarKokkosView  &v2     = mpikok->recvbuf_d;
1243:   const PetscCountKokkosView &Ajmap1 = mpikok->Ajmap1_d, Ajmap2 = mpikok->Ajmap2_d, Aimap2 = mpikok->Aimap2_d;
1244:   const PetscCountKokkosView &Bjmap1 = mpikok->Bjmap1_d, Bjmap2 = mpikok->Bjmap2_d, Bimap2 = mpikok->Bimap2_d;
1245:   const PetscCountKokkosView &Aperm1 = mpikok->Aperm1_d, Aperm2 = mpikok->Aperm2_d, Bperm1 = mpikok->Bperm1_d, Bperm2 = mpikok->Bperm2_d;
1246:   const PetscCountKokkosView &Cperm1 = mpikok->Cperm1_d;
1247:   PetscMemType                memtype;

1249:   PetscGetMemType(v, &memtype); /* Return PETSC_MEMTYPE_HOST when v is NULL */
1250:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] in host memory, copy it to the device (if one is in use) */
1251:     v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), MatScalarKokkosViewHost((PetscScalar *)v, mpiaij->coo_n));
1252:   } else {
1253:     v1 = MatScalarKokkosView((PetscScalar *)v, mpiaij->coo_n); /* Directly use v[]'s memory */
1254:   }

1256:   if (imode == INSERT_VALUES) {
1257:     MatSeqAIJGetKokkosViewWrite(A, &Aa); /* write matrix values */
1258:     MatSeqAIJGetKokkosViewWrite(B, &Ba);
1259:   } else {
1260:     MatSeqAIJGetKokkosView(A, &Aa); /* read & write matrix values */
1261:     MatSeqAIJGetKokkosView(B, &Ba);
1262:   }

1264:   /* Pack entries to be sent to remote */
1265:   Kokkos::parallel_for(
1266:     vsend.extent(0), KOKKOS_LAMBDA(const PetscCount i) { vsend(i) = v1(Cperm1(i)); });

1268:   /* Send remote entries to their owner and overlap the communication with local computation */
1269:   PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_KOKKOS, vsend.data(), PETSC_MEMTYPE_KOKKOS, v2.data(), MPI_REPLACE);
1270:   /* Add local entries to A and B in one kernel */
1271:   Kokkos::parallel_for(
1272:     Annz + Bnnz, KOKKOS_LAMBDA(PetscCount i) {
1273:       PetscScalar sum = 0.0;
1274:       if (i < Annz) {
1275:         for (PetscCount k = Ajmap1(i); k < Ajmap1(i + 1); k++) sum += v1(Aperm1(k));
1276:         Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1277:       } else {
1278:         i -= Annz;
1279:         for (PetscCount k = Bjmap1(i); k < Bjmap1(i + 1); k++) sum += v1(Bperm1(k));
1280:         Ba(i) = (imode == INSERT_VALUES ? 0.0 : Ba(i)) + sum;
1281:       }
1282:     });
1283:   PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend.data(), v2.data(), MPI_REPLACE);

1285:   /* Add received remote entries to A and B in one kernel */
1286:   Kokkos::parallel_for(
1287:     Annz2 + Bnnz2, KOKKOS_LAMBDA(PetscCount i) {
1288:       if (i < Annz2) {
1289:         for (PetscCount k = Ajmap2(i); k < Ajmap2(i + 1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));
1290:       } else {
1291:         i -= Annz2;
1292:         for (PetscCount k = Bjmap2(i); k < Bjmap2(i + 1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));
1293:       }
1294:     });

1296:   if (imode == INSERT_VALUES) {
1297:     MatSeqAIJRestoreKokkosViewWrite(A, &Aa); /* Increase A & B's state etc. */
1298:     MatSeqAIJRestoreKokkosViewWrite(B, &Ba);
1299:   } else {
1300:     MatSeqAIJRestoreKokkosView(A, &Aa);
1301:     MatSeqAIJRestoreKokkosView(B, &Ba);
1302:   }
1303:   return 0;
1304: }
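
/* Usage sketch for the two COO routines above (the index/value arrays are illustrative assumptions):

     PetscInt    i[] = {0, 0, 1, 1};
     PetscInt    j[] = {0, 1, 0, 1};
     PetscScalar v[] = {2.0, -1.0, -1.0, 2.0};

     MatSetPreallocationCOO(A, 4, i, j);  // dispatches to MatSetPreallocationCOO_MPIAIJKokkos()
     MatSetValuesCOO(A, v, ADD_VALUES);   // dispatches to MatSetValuesCOO_MPIAIJKokkos(); v may be a host or device pointer
     // no MatAssemblyBegin()/MatAssemblyEnd() is required after MatSetValuesCOO()
*/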

1306: PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1307: {
1308:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

1310:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
1311:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
1312:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
1313:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
1314:   delete (Mat_MPIAIJKokkos *)mpiaij->spptr;
1315:   MatDestroy_MPIAIJ(A);
1316:   return 0;
1317: }

1319: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
1320: {
1321:   Mat         B;
1322:   Mat_MPIAIJ *a;

1324:   if (reuse == MAT_INITIAL_MATRIX) {
1325:     MatDuplicate(A, MAT_COPY_VALUES, newmat);
1326:   } else if (reuse == MAT_REUSE_MATRIX) {
1327:     MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
1328:   }
1329:   B = *newmat;

1331:   B->boundtocpu = PETSC_FALSE;
1332:   PetscFree(B->defaultvectype);
1333:   PetscStrallocpy(VECKOKKOS, &B->defaultvectype);
1334:   PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJKOKKOS);

1336:   a = static_cast<Mat_MPIAIJ *>(A->data);
1337:   if (a->A) MatSetType(a->A, MATSEQAIJKOKKOS);
1338:   if (a->B) MatSetType(a->B, MATSEQAIJKOKKOS);
1339:   if (a->lvec) VecSetType(a->lvec, VECSEQKOKKOS);

1341:   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1342:   B->ops->mult                  = MatMult_MPIAIJKokkos;
1343:   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1344:   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1345:   B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1346:   B->ops->destroy               = MatDestroy_MPIAIJKokkos;

1348:   PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJKokkos);
1349:   PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);
1350:   PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);
1351:   PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);
1352:   return 0;
1353: }
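
/* Conversion sketch (illustrative): an existing MATMPIAIJ matrix A can be moved to Kokkos with

     MatConvert(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A);

   or simply created in Kokkos format from the start with -mat_type aijkokkos and MatSetFromOptions(). */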
1354: /*MC
1355:    MATAIJKOKKOS - "mpiaijkokkos", a matrix type to be used for CSR sparse matrices with Kokkos

1357:    A matrix type that uses the Kokkos-Kernels CrsMatrix type for portability across different device types

1359:    Options Database Keys:
1360: .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
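
   Example Usage:
   A minimal sketch (m and n stand for the local sizes; the preallocation numbers are illustrative):
.vb
    Mat A;
    MatCreate(PETSC_COMM_WORLD, &A);
    MatSetSizes(A, m, n, PETSC_DETERMINE, PETSC_DETERMINE);
    MatSetFromOptions(A);                           /* picks up -mat_type aijkokkos */
    MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL);
    /* MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual; MatMult() etc. then run on the device */
.ve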

1362:   Level: beginner

1364: .seealso: `MatCreateAIJKokkos()`, `MATSEQAIJKOKKOS`, `MATSEQAIJ`, `MATMPIAIJ`
1365: M*/
1366: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1367: {
1368:   PetscKokkosInitializeCheck();
1369:   MatCreate_MPIAIJ(A);
1370:   MatConvert_MPIAIJ_MPIAIJKokkos(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A);
1371:   return 0;
1372: }

1374: /*@C
1375:    MatCreateAIJKokkos - Creates a sparse matrix in `MATAIJKOKKOS` (compressed row) format
1376:    (the default parallel PETSc format). This matrix will ultimately be pushed down
1377:    to Kokkos for calculations. For good matrix assembly performance the user should
1378:    preallocate the matrix storage by setting the parameters d_nz and o_nz (or the
1379:    arrays d_nnz and o_nnz). By setting these parameters accurately, performance
1380:    during matrix assembly can be increased by more than a factor of 50.

1382:    Collective

1384:    Input Parameters:
1385: +  comm - MPI communicator
1386: .  m - number of local rows (or `PETSC_DECIDE` to have it computed if M is given)
1387: .  n - number of local columns (or `PETSC_DECIDE` to have it computed if N is given)
1388: .  M - number of global rows (or `PETSC_DETERMINE` to have it computed if m is given)
1389: .  N - number of global columns (or `PETSC_DETERMINE` to have it computed if n is given)
1390: .  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
      .  d_nnz - array containing the number of nonzeros per row in the diagonal portion of the local submatrix
                 (possibly different for each row) or NULL
      .  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
      -  o_nnz - array containing the number of nonzeros per row in the off-diagonal portion of the local submatrix
                 (possibly different for each row) or NULL

1392:    Output Parameter:
1393: .  A - the matrix

1395:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1396:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1397:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1399:    Notes:
1400:    If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored

1402:    The AIJ format (also called compressed row storage) is fully compatible with standard Fortran 77
1403:    storage.  That is, the stored row and column indices can begin at
1404:    either one (as in Fortran) or zero.  See the users' manual for details.

1406:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
1407:    Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic
1408:    memory allocation. For large problems you MUST preallocate memory or you
1409:    will get TERRIBLE performance; see the users' manual chapter on matrices.

1411:    By default, this format uses inodes (identical nodes) when possible, to
1412:    improve numerical efficiency of matrix-vector products and solves. We
1413:    search for consecutive rows with the same nonzero structure, thereby
1414:    reusing matrix information to achieve increased efficiency.
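
   Example Usage:
   A minimal sketch (the sizes and nonzero counts below are illustrative assumptions):
.vb
    Mat A;
    /* 100 local rows/columns; at most 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block */
    MatCreateAIJKokkos(PETSC_COMM_WORLD, 100, 100, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, &A);
.ve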

1416:    Level: intermediate

1418: .seealso: `MATAIJKOKKOS`, `MATSEQAIJKOKKOS`, `MATMPIAIJKOKKOS`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1419: @*/
1420: PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
1421: {
1422:   PetscMPIInt size;

1424:   MatCreate(comm, A);
1425:   MatSetSizes(*A, m, n, M, N);
1426:   MPI_Comm_size(comm, &size);
1427:   if (size > 1) {
1428:     MatSetType(*A, MATMPIAIJKOKKOS);
1429:     MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
1430:   } else {
1431:     MatSetType(*A, MATSEQAIJKOKKOS);
1432:     MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
1433:   }
1434:   return 0;
1435: }

1437: // Get a device (GPU) pointer to a stripped-down Mat, for both Seq and MPI matrices.
1438: PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1439: {
1440:   PetscMPIInt                size, rank;
1441:   MPI_Comm                   comm;
1442:   PetscSplitCSRDataStructure d_mat = NULL;

1444:   PetscObjectGetComm((PetscObject)A, &comm);
1445:   MPI_Comm_size(comm, &size);
1446:   MPI_Comm_rank(comm, &rank);
1447:   if (size == 1) {
1448:     MatSeqAIJKokkosGetDeviceMat(A, &d_mat);
1449:     MatSeqAIJKokkosModifyDevice(A); /* Since we are going to modify matrix values on device */
1450:   } else {
1451:     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
1452:     MatSeqAIJKokkosGetDeviceMat(aij->A, &d_mat);
1453:     MatSeqAIJKokkosModifyDevice(aij->A);
1454:     MatSeqAIJKokkosModifyDevice(aij->B);
1456:   }
1457:   // mimic MatSetValues() state handling, since MatSetValues() is not called on the host
1458:   if (A->assembled) {
1459:     if (A->was_assembled) PetscInfo(A, "Matrix was already assembled more than once\n");
1460:     A->was_assembled = PETSC_TRUE; // normally set lazily in MatAssembly; also done in AIJ AssemblyEnd, so is it still needed here?
1461:   } else {
1462:     PetscInfo(A, "Warning: matrix is not assembled (assembled=%d)\n", (int)A->assembled);
1463:   }
1464:   if (!d_mat) {
1465:     struct _n_SplitCSRMat h_mat; /* host container */
1466:     Mat_SeqAIJKokkos     *aijkokA;
1467:     Mat_SeqAIJ           *jaca;
1468:     PetscInt              n = A->rmap->n, nnz;
1469:     Mat                   Amat;
1470:     PetscInt             *colmap;

1472:     /* create and copy h_mat */
1473:     h_mat.M = A->cmap->N; // use for debug build
1474:     PetscInfo(A, "Create device matrix in Kokkos\n");
1475:     if (size == 1) {
1476:       Amat            = A;
1477:       jaca            = (Mat_SeqAIJ *)A->data;
1478:       h_mat.rstart    = 0;
1479:       h_mat.rend      = A->rmap->n;
1480:       h_mat.cstart    = 0;
1481:       h_mat.cend      = A->cmap->n;
1482:       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1483:       h_mat.offdiag.a                   = NULL;
1484:       aijkokA                           = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1485:     } else {
1486:       Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)A->data;
1487:       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ *)aij->B->data;
1488:       PetscInt          ii;
1489:       Mat_SeqAIJKokkos *aijkokB;

1491:       Amat    = aij->A;
1492:       aijkokA = static_cast<Mat_SeqAIJKokkos *>(aij->A->spptr);
1493:       aijkokB = static_cast<Mat_SeqAIJKokkos *>(aij->B->spptr);
1494:       jaca    = (Mat_SeqAIJ *)aij->A->data;
1497:       aij->donotstash          = PETSC_TRUE;
1498:       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1499:       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1500:       PetscCalloc1(A->cmap->N, &colmap);
1501:       for (ii = 0; ii < aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii + 1;
1502:       // allocate B copy data
1503:       h_mat.rstart = A->rmap->rstart;
1504:       h_mat.rend   = A->rmap->rend;
1505:       h_mat.cstart = A->cmap->rstart;
1506:       h_mat.cend   = A->cmap->rend;
1507:       nnz          = jacb->i[n];
1508:       if (jacb->compressedrow.use) {
1509:         const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_i_k(jacb->i, n + 1);
1510:         aijkokB->i_uncompressed_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_i_k));
1511:         Kokkos::deep_copy(aijkokB->i_uncompressed_d, h_i_k);
1512:         h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1513:       } else {
1514:         h_mat.offdiag.i = aijkokB->i_device_data();
1515:       }
1516:       h_mat.offdiag.j = aijkokB->j_device_data();
1517:       h_mat.offdiag.a = aijkokB->a_device_data();
1518:       {
1519:         Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_colmap_k(colmap, A->cmap->N);
1520:         aijkokB->colmap_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_colmap_k));
1521:         Kokkos::deep_copy(aijkokB->colmap_d, h_colmap_k);
1522:         h_mat.colmap = aijkokB->colmap_d.data();
1523:         PetscFree(colmap);
1524:       }
1525:       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1526:       h_mat.offdiag.n                 = n;
1527:     }
1528:     // allocate A copy data
1529:     nnz                          = jaca->i[n];
1530:     h_mat.diag.n                 = n;
1531:     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1532:     MPI_Comm_rank(comm, &h_mat.rank);
1534:     h_mat.diag.i = aijkokA->i_device_data();
1535:     h_mat.diag.j = aijkokA->j_device_data();
1536:     h_mat.diag.a = aijkokA->a_device_data();
1537:     // copy pointers and metadata to device
1538:     MatSeqAIJKokkosSetDeviceMat(Amat, &h_mat);
1539:     MatSeqAIJKokkosGetDeviceMat(Amat, &d_mat);
1540:     PetscInfo(A, "Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n", h_mat.diag.n, nnz);
1541:   }
1542:   *B           = d_mat;       // return the device matrix to the caller (it has also been set on the Mat above)
1543:   A->assembled = PETSC_FALSE; // ready to be written with MatSetValues(); normally this is set lazily inside MatSetValues()
1544:   return 0;
1545: }
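
/* Usage sketch for the routine above (illustrative, not code from this file): fetch the device CSR handle
   once on the host, then insert values from a device kernel with MatSetValuesDevice().

     PetscSplitCSRDataStructure d_mat;
     MatKokkosGetDeviceMatWrite(A, &d_mat);
     // ... launch a Kokkos kernel whose body calls MatSetValuesDevice(d_mat, m, idxm, n, idxn, v, ADD_VALUES) ...
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
*/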

1547: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1548: {
1549:   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

1551:   if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1552:   else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1553:   else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1554:   else *mask = "PETSC_OFFLOAD_BOTH";
1555:   return 0;
1556: }
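
/* The mapping above follows the Kokkos DualView convention: if the host copy needs a sync, the freshest
   data lives on the device (PETSC_OFFLOAD_GPU); if the device copy needs a sync, it lives on the host
   (PETSC_OFFLOAD_CPU); if neither needs a sync, the two copies agree (PETSC_OFFLOAD_BOTH). */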

1558: PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1559: {
1560:   PetscMPIInt size;
1561:   Mat         Ad, Ao;
1562:   const char *amask, *bmask;

1564:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);

1566:   if (size == 1) {
1567:     MatSeqAIJKokkosGetOffloadMask(A, &amask);
1568:     PetscPrintf(PETSC_COMM_SELF, "%s\n", amask);
1569:   } else {
1570:     Ad = ((Mat_MPIAIJ *)A->data)->A;
1571:     Ao = ((Mat_MPIAIJ *)A->data)->B;
1572:     MatSeqAIJKokkosGetOffloadMask(Ad, &amask);
1573:     MatSeqAIJKokkosGetOffloadMask(Ao, &bmask);
1574:     PetscPrintf(PETSC_COMM_SELF, "Diag : Off-diag = %s : %s\n", amask, bmask);
1575:   }
1576:   return 0;
1577: }