Actual source code: mpiaijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscsf.h>
3: #include <petsc/private/sfimpl.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp>
6: #include <KokkosSparse_spadd.hpp>
8: PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A, MatAssemblyType mode)
9: {
10: Mat_SeqAIJKokkos *aijkok;
12: MatAssemblyEnd_MPIAIJ(A, mode);
13: aijkok = static_cast<Mat_SeqAIJKokkos *>(((Mat_MPIAIJ *)A->data)->A->spptr); /* Access spptr after MatAssemblyEnd_MPIAIJ(), which might have deleted old spptr */
14: if (aijkok && aijkok->device_mat_d.data()) {
15: A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
16: }
18: return 0;
19: }
21: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
22: {
23: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
25: PetscLayoutSetUp(mat->rmap);
26: PetscLayoutSetUp(mat->cmap);
27: #if defined(PETSC_USE_DEBUG)
28: if (d_nnz) {
29: PetscInt i;
31: }
32: if (o_nnz) {
33: PetscInt i;
35: }
36: #endif
37: #if defined(PETSC_USE_CTABLE)
38: PetscTableDestroy(&mpiaij->colmap);
39: #else
40: PetscFree(mpiaij->colmap);
41: #endif
42: PetscFree(mpiaij->garray);
43: VecDestroy(&mpiaij->lvec);
44: VecScatterDestroy(&mpiaij->Mvctx);
45: /* Because B will have been resized, we simply destroy it and create a new one each time */
46: MatDestroy(&mpiaij->B);
48: if (!mpiaij->A) {
49: MatCreate(PETSC_COMM_SELF, &mpiaij->A);
50: MatSetSizes(mpiaij->A, mat->rmap->n, mat->cmap->n, mat->rmap->n, mat->cmap->n);
51: }
52: if (!mpiaij->B) {
53: PetscMPIInt size;
54: MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
55: MatCreate(PETSC_COMM_SELF, &mpiaij->B);
56: MatSetSizes(mpiaij->B, mat->rmap->n, size > 1 ? mat->cmap->N : 0, mat->rmap->n, size > 1 ? mat->cmap->N : 0);
57: }
58: MatSetType(mpiaij->A, MATSEQAIJKOKKOS);
59: MatSetType(mpiaij->B, MATSEQAIJKOKKOS);
60: MatSeqAIJSetPreallocation(mpiaij->A, d_nz, d_nnz);
61: MatSeqAIJSetPreallocation(mpiaij->B, o_nz, o_nnz);
62: mat->preallocated = PETSC_TRUE;
63: return 0;
64: }
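/* Illustrative sketch (not part of this file): the typical user-level setup that ends up in
   MatMPIAIJSetPreallocation_MPIAIJKokkos() above. The helper name, sizes and nnz estimates are
   made-up placeholders; error checking is omitted to match the style of this listing. */
static PetscErrorCode ExampleCreateMPIAIJKokkos(MPI_Comm comm, Mat *A)
{
  MatCreate(comm, A);
  MatSetSizes(*A, 4, 4, PETSC_DECIDE, PETSC_DECIDE); /* 4 local rows and cols per rank */
  MatSetType(*A, MATMPIAIJKOKKOS);
  MatMPIAIJSetPreallocation(*A, 3, NULL, 2, NULL); /* at most 3 diag and 2 offdiag nonzeros per row */
  return 0;
}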
66: PetscErrorCode MatMult_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
67: {
68: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
69: PetscInt nt;
71: VecGetLocalSize(xx, &nt);
73: VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
74: (*mpiaij->A->ops->mult)(mpiaij->A, xx, yy);
75: VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
76: (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy);
77: return 0;
78: }
80: PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat, Vec xx, Vec yy, Vec zz)
81: {
82: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
83: PetscInt nt;
85: VecGetLocalSize(xx, &nt);
87: VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
88: (*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz);
89: VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD);
90: (*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz);
91: return 0;
92: }
94: PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat, Vec xx, Vec yy)
95: {
96: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
97: PetscInt nt;
99: VecGetLocalSize(xx, &nt);
101: (*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec);
102: (*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy);
103: VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
104: VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
105: return 0;
106: }
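/* Illustrative sketch (hypothetical helper): exercising the three mult kernels above through the
   public API. MatMult/MatMultAdd/MatMultTranspose dispatch to the routines in this file when A has
   type MATMPIAIJKOKKOS. Error checking is omitted. */
static PetscErrorCode ExampleMultKernels(Mat A)
{
  Vec x, y, z;

  MatCreateVecs(A, &x, &y); /* x matches A's column layout, y matches A's row layout */
  VecDuplicate(y, &z);
  VecSet(x, 1.0);
  MatMult(A, x, y);          /* y = A*x     -> MatMult_MPIAIJKokkos */
  MatMultAdd(A, x, y, z);    /* z = y + A*x -> MatMultAdd_MPIAIJKokkos */
  MatMultTranspose(A, y, x); /* x = A^T*y   -> MatMultTranspose_MPIAIJKokkos */
  VecDestroy(&x);
  VecDestroy(&y);
  VecDestroy(&z);
  return 0;
}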
108: /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS.
109: A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n).
110: C still uses local column ids. Their corresponding global column ids are returned in glob.
111: */
112: PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat, MatReuse reuse, IS *glob, Mat *C)
113: {
114: Mat Ad, Ao;
115: const PetscInt *cmap;
117: MatMPIAIJGetSeqAIJ(mat, &Ad, &Ao, &cmap);
118: MatSeqAIJKokkosMergeMats(Ad, Ao, reuse, C);
119: if (glob) {
120: PetscInt cst, i, dn, on, *gidx;
121: MatGetLocalSize(Ad, NULL, &dn);
122: MatGetLocalSize(Ao, NULL, &on);
123: MatGetOwnershipRangeColumn(mat, &cst, NULL);
124: PetscMalloc1(dn + on, &gidx);
125: for (i = 0; i < dn; i++) gidx[i] = cst + i;
126: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
127: ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
128: }
129: return 0;
130: }
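/* Illustrative sketch (hypothetical helper): how a caller might use MatMPIAIJGetLocalMatMerge(),
   which dispatches to the routine above for MATMPIAIJKOKKOS, to get the merged local matrix and the
   local-to-global column map. Error checking is omitted. */
static PetscErrorCode ExampleGetLocalMatMerge(Mat A)
{
  Mat             Aloc;
  IS              glob;
  const PetscInt *gidx;
  PetscInt        nlocal;

  MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &Aloc);
  ISGetLocalSize(glob, &nlocal); /* nlocal = (diag part's ncols) + (offdiag part's ncols) */
  ISGetIndices(glob, &gidx);     /* gidx[k] is the global col id of Aloc's local col k */
  /* ... use Aloc (a MATSEQAIJKOKKOS matrix with local col ids) together with gidx ... */
  ISRestoreIndices(glob, &gidx);
  ISDestroy(&glob);
  MatDestroy(&Aloc);
  return 0;
}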
132: /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */
133: struct MatMatStruct {
134: MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */
135: PetscSF sf; /* SF to send/recv matrix entries */
136: MatScalarKokkosView abuf; /* buf of mat values in send/recv */
137: Mat C1, C2, B_local;
138: KokkosCsrMatrix C1_global, C2_global, C_global;
139: KernelHandle kh;
140: MatMatStruct()
141: {
142: C1 = C2 = B_local = NULL;
143: sf = NULL;
144: }
146: ~MatMatStruct()
147: {
148: MatDestroy(&C1);
149: MatDestroy(&C2);
150: MatDestroy(&B_local);
151: PetscSFDestroy(&sf);
152: kh.destroy_spadd_handle();
153: }
154: };
156: struct MatMatStruct_AB : public MatMatStruct {
157: MatColIdxKokkosView rows;
158: MatRowMapKokkosView rowoffset;
159: Mat B_other, C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */
161: MatMatStruct_AB() : B_other(NULL), C_petsc(NULL) { }
162: ~MatMatStruct_AB()
163: {
164: MatDestroy(&B_other);
165: MatDestroy(&C_petsc);
166: }
167: };
169: struct MatMatStruct_AtB : public MatMatStruct {
170: MatRowMapKokkosView srcrowoffset, dstrowoffset;
171: };
173: struct MatProductData_MPIAIJKokkos {
174: MatMatStruct_AB *mmAB;
175: MatMatStruct_AtB *mmAtB;
176: PetscBool reusesym;
178: MatProductData_MPIAIJKokkos() : mmAB(NULL), mmAtB(NULL), reusesym(PETSC_FALSE) { }
179: ~MatProductData_MPIAIJKokkos()
180: {
181: delete mmAB;
182: delete mmAtB;
183: }
184: };
186: static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data)
187: {
188: delete static_cast<MatProductData_MPIAIJKokkos *>(data);
189: return 0;
190: }
192: /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix
194: Input Parameters:
195: + A - the MATSEQAIJKOKKOS matrix
196: . N - new column size for the returned Kokkos matrix
197: - l2g - a map that maps old col ids to new col ids
199: Output Parameters:
200: . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A.
201: */
202: static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A, PetscInt N, const ConstMatColIdxKokkosView &l2g, KokkosCsrMatrix &csrmat)
203: {
204: KokkosCsrMatrix &orig = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
205: MatColIdxKokkosView jg("jg", orig.nnz()); /* New j array for csrmat */
207: PetscCallCXX(Kokkos::parallel_for(
208: orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) { jg(i) = l2g(orig.graph.entries(i)); }));
209: csrmat = KokkosCsrMatrix("csrmat", orig.numRows(), N, orig.nnz(), orig.values, orig.graph.row_map, jg);
210: return 0;
211: }
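/* Worked example with made-up numbers: if A has 3 local cols and l2g = {2, 5, 9}, an A row with local
   col ids {0, 2} shows up in csrmat as a row with global col ids {2, 9}. Only the j array is rebuilt
   (with a column space of size N); the values and the row map are shared with A. */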
213: /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix.
214: It is similar to MatCreateMPIAIJWithSplitArrays.
216: Input Parameters:
217: + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set
218: . A - the diag matrix using local col ids
219: - B - the offdiag matrix using global col ids
221: Output Parameters:
222: . mat - the updated MATMPIAIJKOKKOS matrix
223: */
224: static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat, Mat A, Mat B)
225: {
226: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
227: PetscInt m, n, M, N, Am, An, Bm, Bn;
228: Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
230: MatGetSize(mat, &M, &N);
231: MatGetLocalSize(mat, &m, &n);
232: MatGetLocalSize(A, &Am, &An);
233: MatGetLocalSize(B, &Bm, &Bn);
239: mpiaij->A = A;
240: mpiaij->B = B;
242: mat->preallocated = PETSC_TRUE;
243: mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
245: MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE);
246: MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
247: /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
248: also gets mpiaij->B compacted, with its col ids and size reduced
249: */
250: MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
251: MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE);
252: MatSetOption(mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
254: /* Update bkok with new local col ids (stored on host) and size */
255: bkok->j_dual.modify_host();
256: bkok->j_dual.sync_device();
257: bkok->SetColSize(mpiaij->B->cmap->n);
258: return 0;
259: }
261: /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrix (B) to form a SEQAIJKOKKOS matrix (C).
263: It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF.
264: In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves.
265: Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i); this means we will bcast the i-th row of B on rank k
266: to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph).
268: Collective on comm of ownerSF
270: Input Parameters:
271: + B - the SEQAIJKOKKOS matrix, using local col ids
272: . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
273: . N - global col ids are in range of [0,N). N must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX)
274: . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX)
275: . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX)
277: Input/Output Parameters (out when reuse = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX)
278: + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros.
279: . abuf - buffer for sending matrix values
280: . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[].
281: Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors.
282: . rowoffset - for each row in rows[], the offset in abuf[] at which that row's values are copied
283: - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids.
284: */
285: static PetscErrorCode MatSeqAIJKokkosBcast(Mat B, MatReuse reuse, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &bcastSF, MatScalarKokkosView &abuf, MatColIdxKokkosView &rows, MatRowMapKokkosView &rowoffset, Mat &C)
286: {
287: Mat_SeqAIJKokkos *bkok, *ckok;
289: MatSeqAIJKokkosSyncDevice(B); /* Make sure B->spptr is accessible */
290: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
292: if (reuse == MAT_REUSE_MATRIX) {
293: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
295: const auto &Ba = bkok->a_dual.view_device();
296: const auto &Bi = bkok->i_dual.view_device();
297: const auto &Ca = ckok->a_dual.view_device();
299: /* Copy Ba to abuf */
300: Kokkos::parallel_for(
301: Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
302: PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */
303: PetscInt r = rows(i);
304: PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
305: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) { abuf(base + k) = Ba(Bi(r) + k); });
306: });
308: /* Send abuf to Ca through bcastSF and then mark C is updated on device */
309: PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE); /* TODO: get memtype for abuf */
310: PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.data(), MPI_REPLACE);
311: ckok->a_dual.modify_device();
312: } else if (reuse == MAT_INITIAL_MATRIX) {
313: MPI_Comm comm;
314: PetscMPIInt tag;
315: PetscInt k, Cm, Cn, Cnnz, *Ci_h, nroots, nleaves;
317: PetscObjectGetComm((PetscObject)ownerSF, &comm);
318: PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
319: Cm = nleaves; /* row size of C */
320: Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */
322: /* Get row lens (nz) of B's rows for later fast query */
323: PetscInt *Browlens;
324: const PetscInt *tmp = bkok->i_host_data();
325: PetscMalloc1(nroots, &Browlens);
326: for (k = 0; k < nroots; k++) Browlens[k] = tmp[k + 1] - tmp[k];
328: /* By ownerSF, each proc gets lens of rows of C */
329: MatRowMapKokkosDualView Ci("i", Cm + 1); /* C's rowmap */
330: Ci_h = Ci.view_host().data();
331: Ci_h[0] = 0;
332: PetscSFBcastWithMemTypeBegin(ownerSF, MPIU_INT, PETSC_MEMTYPE_HOST, Browlens, PETSC_MEMTYPE_HOST, &Ci_h[1], MPI_REPLACE);
333: PetscSFBcastEnd(ownerSF, MPIU_INT, Browlens, &Ci_h[1], MPI_REPLACE);
334: for (k = 1; k < Cm + 1; k++) Ci_h[k] += Ci_h[k - 1]; /* Convert lens to CSR */
335: Cnnz = Ci_h[Cm];
336: Ci.modify_host();
337: Ci.sync_device();
339: /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */
340: MatColIdxKokkosDualView Cj("j", Cnnz);
341: MatScalarKokkosDualView Ca("a", Cnnz);
343: /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */
344: const PetscMPIInt *iranks, *ranks;
345: const PetscInt *ioffset, *irootloc, *roffset;
346: PetscInt i, j, niranks, nranks, *sdisp, *rdisp, *rowptr;
347: MPI_Request *reqs;
349: PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &irootloc); /* irootloc[] contains indices of rows I need to send to each receiver */
350: PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* recv info */
352: /* figure out offsets at the send buffer, to build the SF
353: sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver.
354: rowptr[] - stores offsets for data of each row in abuf
356: rdisp[] - to receive sdisp[]
357: */
358: PetscMalloc3(niranks + 1, &sdisp, nranks, &rdisp, niranks + nranks, &reqs);
359: MatRowMapKokkosViewHost rowptr_h("rowptr_h", ioffset[niranks] + 1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */
360: rowptr = rowptr_h.data();
362: sdisp[0] = 0;
363: rowptr[0] = 0;
364: for (i = 0; i < niranks; i++) { /* for each receiver */
365: PetscInt len, nz = 0;
366: for (j = ioffset[i]; j < ioffset[i + 1]; j++) { /* for each row to this receiver */
367: len = Browlens[irootloc[j]];
368: rowptr[j + 1] = rowptr[j] + len;
369: nz += len;
370: }
371: sdisp[i + 1] = sdisp[i] + nz;
372: }
373: PetscCommGetNewTag(comm, &tag);
374: for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
375: for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
376: MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);
378: PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */
379: PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */
380: PetscSFNode *iremote;
381: PetscMalloc1(nleaves2, &iremote);
382: for (i = 0; i < nranks; i++) { /* for each sender */
383: k = 0;
384: for (j = Ci_h[roffset[i]]; j < Ci_h[roffset[i + 1]]; j++) {
385: iremote[j].rank = ranks[i];
386: iremote[j].index = rdisp[i] + k;
387: k++;
388: }
389: }
390: /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */
391: PetscSFCreate(comm, &bcastSF);
392: PetscSFSetGraph(bcastSF, nroots2, nleaves2, NULL /*ilocal*/, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
394: /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted
395: from local to global. Then use bcastSF to fill Ca, Cj.
396: */
397: ConstMatColIdxKokkosViewHost rows_h(irootloc, ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */
398: MatColIdxKokkosView rows("rows", ioffset[niranks]);
399: Kokkos::deep_copy(rows, rows_h); /* Use deep copy since irootloc is managed by PetscSF and we want 'rows' to be standalone */
401: rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */
403: MatColIdxKokkosView jbuf("jbuf", sdisp[niranks]); /* send buf for (global) col ids */
404: abuf = MatScalarKokkosView("abuf", sdisp[niranks]); /* send buf for mat values */
406: const auto &Ba = bkok->a_dual.view_device();
407: const auto &Bi = bkok->i_dual.view_device();
408: const auto &Bj = bkok->j_dual.view_device();
410: /* Copy Ba, Bj to abuf, jbuf, changing col ids from local to global */
411: Kokkos::parallel_for(
412: Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
413: PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */
414: PetscInt r = rows(i);
415: PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */
416: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, Bi(r + 1) - Bi(r)), [&](PetscInt k) {
417: abuf(base + k) = Ba(Bi(r) + k);
418: jbuf(base + k) = l2g(Bj(Bi(r) + k));
419: });
420: });
422: /* Send abuf & jbuf to fill Ca, Cj */
423: PetscSFBcastBegin(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
424: PetscSFBcastBegin(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
425: PetscSFBcastEnd(bcastSF, MPIU_INT, jbuf.data(), Cj.view_device().data(), MPI_REPLACE);
426: PetscSFBcastEnd(bcastSF, MPIU_SCALAR, abuf.data(), Ca.view_device().data(), MPI_REPLACE);
427: Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */
428: Cj.sync_host();
429: Ca.modify_device();
431: /* Construct C with Ca, Ci, Cj */
432: auto ckok = new Mat_SeqAIJKokkos(Cm, Cn, Cnnz, Ci, Cj, Ca);
433: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, &C);
434: PetscFree3(sdisp, rdisp, reqs);
435: PetscFree(Browlens);
436: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
437: return 0;
438: }
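/* Worked example of the ownerSF semantics above (made-up 2-rank layout): suppose rank 0 owns B rows
   {0,1} and rank 1 owns B rows {0,1,2}. If rank 0 needs C rows built from (rank 1, row 0) and
   (rank 1, row 2), its ownerSF has nleaves = 2 with iremote = {(1,0), (1,2)}; after the bcast, rank 0's
   C row 0 is a copy of rank 1's B row 0 and C row 1 is a copy of rank 1's B row 2, with col ids mapped
   through l2g. */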
440: /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C)
442: It is the reverse of MatSeqAIJKokkosBcast in some sense.
444: Think of each row of A as a leaf; the given ownerSF then specifies the roots of the leaves. Roots may connect to multiple leaves.
445: In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might
446: contain repeats, which do not matter since they will be summed up by other routines. C's row size will be the nroots of ownerSF.
448: Input Parameters:
449: + A - the SEQAIJKOKKOS matrix to be reduced
450: . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
451: . local - true if A uses local col ids; false if A is already in global col ids.
452: . N - if local, N is A's global col size
453: . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N).
454: - ownerSF - the SF specifies ownership (root) of rows in A
456: Output Parameters:
457: + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side
458: . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows.
459: . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len.
460: . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Rows i and i+1 in abuf[] may go to
461: unrelated places in Ca, so dstrowoffset is not in a CSR-like format the way srcrowoffset is.
462: - C - the matrix made up by rows sent to me from other ranks, using global col ids
464: TODO: we could even have MatSeqAIJKokkosReduceBegin/End to provide an opportunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX.
465: */
466: static PetscErrorCode MatSeqAIJKokkosReduce(Mat A, MatReuse reuse, PetscBool local, PetscInt N, const ConstMatColIdxKokkosView &l2g, PetscSF ownerSF, PetscSF &reduceSF, MatScalarKokkosView &abuf, MatRowMapKokkosView &srcrowoffset, MatRowMapKokkosView &dstrowoffset, KokkosCsrMatrix &C)
467: {
468: PetscInt i, r, Am, An, Annz, Cnnz, nrows;
469: const PetscInt *Ai;
470: Mat_SeqAIJKokkos *akok;
472: MatSeqAIJKokkosSyncDevice(A); /* So that A's latest data is on device */
473: MatGetSize(A, &Am, &An);
474: Ai = static_cast<Mat_SeqAIJ *>(A->data)->i;
475: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
476: Annz = Ai[Am];
478: if (reuse == MAT_REUSE_MATRIX) {
479: /* Send Aa to abuf */
480: PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
481: PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
483: /* Copy abuf to Ca */
484: const MatScalarKokkosView &Ca = C.values;
485: nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */
486: Kokkos::parallel_for(
487: Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
488: PetscInt i = t.league_rank();
489: PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
490: PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
491: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) { Ca(dst + k) = abuf(src + k); });
492: });
493: } else if (reuse == MAT_INITIAL_MATRIX) {
494: MPI_Comm comm;
495: MPI_Request *reqs;
496: PetscMPIInt tag;
497: PetscInt Cm;
499: PetscObjectGetComm((PetscObject)ownerSF, &comm);
500: PetscCommGetNewTag(comm, &tag);
502: PetscInt niranks, nranks, nroots, nleaves;
503: const PetscMPIInt *iranks, *ranks;
504: const PetscInt *ioffset, *rows, *roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */
505: PetscSFSetUp(ownerSF);
506: PetscSFGetLeafRanks(ownerSF, &niranks, &iranks, &ioffset, &rows); /* recv info: iranks[] will send rows to me */
507: PetscSFGetRootRanks(ownerSF, &nranks, &ranks, &roffset, NULL /*rmine*/, NULL /*rremote*/); /* send info */
508: PetscSFGetGraph(ownerSF, &nroots, &nleaves, NULL, NULL);
510: Cm = nroots;
511: nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */
513: /* Tell owners the length of each row I will send */
514: PetscInt *srowlens; /* send buf of row lens */
515: MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h", nrows + 1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */
516: PetscInt *rrowlens = rrowlens_h.data();
518: PetscMalloc2(Am, &srowlens, niranks + nranks, &reqs);
519: for (i = 0; i < Am; i++) srowlens[i] = Ai[i + 1] - Ai[i];
520: rrowlens[0] = 0;
521: rrowlens++; /* shift the pointer to make the following expression more readable */
522: for (i = 0; i < niranks; i++) MPI_Irecv(&rrowlens[ioffset[i]], ioffset[i + 1] - ioffset[i], MPIU_INT, iranks[i], tag, comm, &reqs[i]);
523: for (i = 0; i < nranks; i++) MPI_Isend(&srowlens[roffset[i]], roffset[i + 1] - roffset[i], MPIU_INT, ranks[i], tag, comm, &reqs[niranks + i]);
524: MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);
526: /* Owner builds Ci on host by histogramming rrowlens[] */
527: MatRowMapKokkosViewHost Ci_h("i", Cm + 1);
528: Kokkos::deep_copy(Ci_h, 0); /* Zero Ci */
529: MatRowMapType *Ci_ptr = Ci_h.data();
531: for (i = 0; i < nrows; i++) {
532: r = rows[i]; /* local row id of i-th received row */
533: #if defined(PETSC_USE_DEBUG)
535: #endif
536: Ci_ptr[r + 1] += rrowlens[i]; /* add to length of row r in C */
537: }
538: for (i = 0; i < Cm; i++) Ci_ptr[i + 1] += Ci_ptr[i]; /* to CSR format */
539: Cnnz = Ci_ptr[Cm];
541: /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */
542: MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h", nrows);
543: PetscInt *dstrowoffset_hptr = dstrowoffset_h.data();
544: PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */
546: PetscCalloc1(Cm, &currowlens); /* Init with zero, to be added to */
547: for (i = 0; i < nrows; i++) { /* for each row I receive */
548: r = rows[i]; /* row id in C */
549: dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */
550: currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */
551: }
552: PetscFree(currowlens);
554: rrowlens--;
555: for (i = 0; i < nrows; i++) rrowlens[i + 1] += rrowlens[i]; /* Change rrowlens[] to CSR format */
556: dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), dstrowoffset_h);
557: srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */
559: /* Build the reduceSF, which performs buffer to buffer send/recv */
560: PetscInt *sdisp, *rdisp; /* buffer to send offsets of roots, and buffer to recv them */
561: PetscMalloc2(niranks, &sdisp, nranks, &rdisp);
562: for (i = 0; i < niranks; i++) sdisp[i] = rrowlens[ioffset[i]];
563: for (i = 0; i < nranks; i++) MPI_Irecv(&rdisp[i], 1, MPIU_INT, ranks[i], tag, comm, &reqs[i]);
564: for (i = 0; i < niranks; i++) MPI_Isend(&sdisp[i], 1, MPIU_INT, iranks[i], tag, comm, &reqs[nranks + i]);
565: MPI_Waitall(niranks + nranks, reqs, MPI_STATUSES_IGNORE);
567: /* Nonzeros in abuf/jbuf are roots and those in A are leaves */
568: PetscInt nroots2 = Cnnz, nleaves2 = Annz;
569: PetscSFNode *iremote;
570: PetscMalloc1(nleaves2, &iremote); /* no free, since memory will be given to reduceSF */
571: for (i = 0; i < nranks; i++) {
572: PetscInt rootbase = rdisp[i]; /* root offset at this root rank */
573: PetscInt leafbase = Ai[roffset[i]]; /* leaf base */
574: PetscInt nz = Ai[roffset[i + 1]] - leafbase; /* I will send nz nonzeros to this root rank */
575: for (PetscInt k = 0; k < nz; k++) {
576: iremote[leafbase + k].rank = ranks[i];
577: iremote[leafbase + k].index = rootbase + k;
578: }
579: }
580: PetscSFCreate(comm, &reduceSF);
581: PetscSFSetGraph(reduceSF, nroots2, nleaves2, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER);
582: PetscFree2(sdisp, rdisp);
584: /* Reduce Aa, Ajg to abuf and jbuf */
586: /* If A uses local col ids, convert them to global ones before sending */
587: MatColIdxKokkosView Ajg;
588: if (local) {
589: Ajg = MatColIdxKokkosView("j", Annz);
590: const MatColIdxKokkosView &Aj = akok->j_dual.view_device();
591: Kokkos::parallel_for(
592: Annz, KOKKOS_LAMBDA(const PetscInt i) { Ajg(i) = l2g(Aj(i)); });
593: } else {
594: Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */
595: }
597: MatColIdxKokkosView jbuf("jbuf", Cnnz);
598: abuf = MatScalarKokkosView("abuf", Cnnz);
599: PetscSFReduceBegin(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
600: PetscSFReduceEnd(reduceSF, MPIU_INT, Ajg.data(), jbuf.data(), MPI_REPLACE);
601: PetscSFReduceBegin(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
602: PetscSFReduceEnd(reduceSF, MPIU_SCALAR, akok->a_device_data(), abuf.data(), MPI_REPLACE);
604: /* Copy data from abuf, jbuf to Ca, Cj */
605: MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ci_h); /* Ci is an alias of Ci_h if no device */
606: MatColIdxKokkosView Cj("j", Cnnz);
607: MatScalarKokkosView Ca("a", Cnnz);
609: Kokkos::parallel_for(
610: Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
611: PetscInt i = t.league_rank();
612: PetscInt src = srcrowoffset(i), dst = dstrowoffset(i);
613: PetscInt len = srcrowoffset(i + 1) - srcrowoffset(i);
614: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt k) {
615: Ca(dst + k) = abuf(src + k);
616: Cj(dst + k) = jbuf(src + k);
617: });
618: });
620: /* Build C with Ca, Ci, Cj */
621: C = KokkosCsrMatrix("csrmat", Cm, N, Cnnz, Ca, Ci, Cj);
622: PetscFree2(srowlens, reqs);
623: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unsupported MatReuse enum %d", reuse);
624: return 0;
625: }
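/* Worked example with made-up numbers: suppose this rank receives 3 rows of lens {2, 4, 1} that reduce
   into C rows {5, 2, 5} respectively. Then srcrowoffset = {0, 2, 6, 7} (CSR over abuf/jbuf), while
   dstrowoffset = {Ci[5], Ci[2], Ci[5]+2}: the two partial rows destined for C row 5 land back to back
   in Ca/Cj, which is why dstrowoffset is a plain per-row offset array rather than CSR. */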
627: /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a `MATMPIAIJKOKKOS` matrix by splitting a KokkosCsrMatrix
629: Input Parameters:
630: + C - the `MATMPIAIJKOKKOS` matrix, of size m,n,M,N
631: . reuse - indicate whether the matrix has called this function before
632: . csrmat - the KokkosCsrMatrix, of size m,N
633: - Cdstart - when reuse == `MAT_REUSE_MATRIX`, it is an input parameter. For each row in csrmat, it stores the start of the first
634: entry of the diag block of C in csrmat's j array. E.g., if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag
635: entry is 5, then Cdstart[i] = 3.
637: Output Parameters:
638: + C - the updated `MATMPIAIJKOKKOS` matrix
639: - Cdstart - when reuse == `MAT_INITIAL_MATRIX`, it is an output parameter
641: Note:
642: Between calls with `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`, csrmat must have the same nonzero pattern
644: .seealso: `MATMPIAIJKOKKOS`
645: */
646: static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C, MatReuse reuse, const KokkosCsrMatrix &csrmat, MatRowMapKokkosView &Cdstart)
647: {
648: const MatScalarKokkosView &Ca = csrmat.values;
649: const ConstMatRowMapKokkosView &Ci = csrmat.graph.row_map;
650: PetscInt m, n, N;
652: MatGetLocalSize(C, &m, &n);
653: MatGetSize(C, NULL, &N);
655: if (reuse == MAT_REUSE_MATRIX) {
656: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(C->data);
657: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(mpiaij->A->spptr);
658: Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos *>(mpiaij->B->spptr);
659: const MatScalarKokkosView &Cda = akok->a_dual.view_device(), Coa = bkok->a_dual.view_device();
660: const MatRowMapKokkosView &Cdi = akok->i_dual.view_device(), Coi = bkok->i_dual.view_device();
662: /* Fill 'a' of Cd and Co on device */
663: Kokkos::parallel_for(
664: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
665: PetscInt i = t.league_rank(); /* row i */
666: PetscInt clen = Ci(i + 1) - Ci(i); /* len of row i of C */
667: PetscInt cdlen = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
668: PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
669: PetscInt cdend = cdstart + cdlen;
670: /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
671: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
672: if (k < cdstart) { /* k in [0, cdstart) */
673: Coa(Coi(i) + k) = Ca(Ci(i) + k);
674: } else if (k < cdend) { /* k in [cdstart, cdend) */
675: Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
676: } else { /* k in [cdend, clen) */
677: Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
678: }
679: });
680: });
682: akok->a_dual.modify_device();
683: bkok->a_dual.modify_device();
684: } else if (reuse == MAT_INITIAL_MATRIX) {
685: Mat Cd, Co;
686: const MatColIdxKokkosView &Cj = csrmat.graph.entries;
687: MatRowMapKokkosDualView Cdi_dual("i", m + 1), Coi_dual("i", m + 1);
688: MatRowMapKokkosView Cdi = Cdi_dual.view_device(), Coi = Coi_dual.view_device();
689: PetscInt cstart, cend;
691: /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks:
692: left of the diag block, the diag block, and right of the diag block. The diag block has col ids in [cstart,cend).
693: Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend,
694: such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalently
695: stores values of cdstart and cdend-cdstart (aka Cdi[]) instead.
696: */
697: Cdstart = MatRowMapKokkosView("Cdstart", m);
698: PetscLayoutGetRange(C->cmap, &cstart, &cend); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */
700: /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a
701: CUDA warp would completely diverge. So I use TeamPolicy with a team size 1.
702: */
703: Kokkos::parallel_for(
704: Kokkos::TeamPolicy<>(m, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
705: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
706: PetscInt i = t.league_rank(); /* row i */
707: PetscInt j, first, count, step;
709: if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */
710: Cdi(0) = 0;
711: Coi(0) = 0;
712: }
714: /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns
715: in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found.
716: */
717: count = Ci(i + 1) - Ci(i);
718: first = Ci(i);
719: while (count > 0) {
720: j = first;
721: step = count / 2;
722: j += step;
723: if (Cj(j) < cstart) {
724: first = ++j;
725: count -= step + 1;
726: } else count = step;
727: }
728: Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */
730: /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */
731: count = Ci(i + 1) - first;
732: while (count > 0) {
733: j = first;
734: step = count / 2;
735: j += step;
736: if (Cj(j) < cend) {
737: first = ++j;
738: count -= step + 1;
739: } else count = step;
740: }
741: Cdi(i + 1) = first - (Ci(i) + Cdstart(i)); /* 'first' is the while-loop's output */
742: Coi(i + 1) = (Ci(i + 1) - Ci(i)) - Cdi(i + 1); /* Co's row len = C's row len - Cd's row len */
743: });
744: });
746: /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */
747: Kokkos::parallel_scan(
748: m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
749: update += Cdi(i);
750: if (final) Cdi(i) = update;
751: });
752: Kokkos::parallel_scan(
753: m + 1, KOKKOS_LAMBDA(const PetscInt i, PetscInt &update, const bool final) {
754: update += Coi(i);
755: if (final) Coi(i) = update;
756: });
758: /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in
759: MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co.
760: */
761: Cdi_dual.modify_device();
762: Coi_dual.modify_device();
763: Cdi_dual.sync_host();
764: Coi_dual.sync_host();
765: PetscInt Cd_nnz = Cdi_dual.view_host().data()[m];
766: PetscInt Co_nnz = Coi_dual.view_host().data()[m];
768: /* With nnz, allocate a, j for Cd and Co */
769: MatColIdxKokkosDualView Cdj_dual("j", Cd_nnz), Coj_dual("j", Co_nnz);
770: MatScalarKokkosDualView Cda_dual("a", Cd_nnz), Coa_dual("a", Co_nnz);
772: /* Fill a, j of Cd and Co on device */
773: MatColIdxKokkosView Cdj = Cdj_dual.view_device(), Coj = Coj_dual.view_device();
774: MatScalarKokkosView Cda = Cda_dual.view_device(), Coa = Coa_dual.view_device();
776: Kokkos::parallel_for(
777: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
778: PetscInt i = t.league_rank(); /* row i */
779: PetscInt clen = Ci(i + 1) - Ci(i); /* len of row i of C */
780: PetscInt cdlen = Cdi(i + 1) - Cdi(i); /* len of row i of Cd */
781: PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */
782: PetscInt cdend = cdstart + cdlen;
783: /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */
784: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) {
785: if (k < cdstart) { /* k in [0, cdstart) */
786: Coa(Coi(i) + k) = Ca(Ci(i) + k);
787: Coj(Coi(i) + k) = Cj(Ci(i) + k);
788: } else if (k < cdend) { /* k in [cdstart, cdend) */
789: Cda(Cdi(i) + (k - cdstart)) = Ca(Ci(i) + k);
790: Cdj(Cdi(i) + (k - cdstart)) = Cj(Ci(i) + k) - cstart; /* Use local col ids in Cdj */
791: } else { /* k in [cdend, clen) */
792: Coa(Coi(i) + k - cdlen) = Ca(Ci(i) + k);
793: Coj(Coi(i) + k - cdlen) = Cj(Ci(i) + k);
794: }
795: });
796: });
798: Cdj_dual.modify_device();
799: Cda_dual.modify_device();
800: Coj_dual.modify_device();
801: Coa_dual.modify_device();
802: /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */
803: auto cdkok = new Mat_SeqAIJKokkos(m, n, Cd_nnz, Cdi_dual, Cdj_dual, Cda_dual);
804: auto cokok = new Mat_SeqAIJKokkos(m, N, Co_nnz, Coi_dual, Coj_dual, Coa_dual);
805: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cdkok, &Cd);
806: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, cokok, &Co);
807: MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C, Cd, Co); /* Coj will be converted to local ids within */
808: }
809: return 0;
810: }
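/* Worked example with made-up numbers: take the row with global col ids = {0, 3, 4, 5, 7, 9} from the
   comment above and suppose the owned column range is [cstart, cend) = [5, 8). The two lower_bound
   searches give cdstart = 3 and cdend = 5, so the row splits into Co-left = {0, 3, 4}, Cd = {5, 7}
   (stored with local ids {0, 2} after subtracting cstart), and Co-right = {9}; hence this row
   contributes a len of 2 to Cdi and 4 to Coi. */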
812: /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKOS matrix's global col ids.
814: It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map as a Kokkos view.
816: Input/Output Parameter:
817: . C - the MATSEQAIJKOKKOS matrix; on output, its col ids are compacted into a contiguous local range and its col size is reduced
823: Output Parameter:
824: . l2g - a Kokkos view mapping C's new local col ids back to the old global col ids
827: Note:
828: the input matrix's col ids and col size will be changed.
829: */
830: static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C, MatColIdxKokkosView &l2g)
831: {
832: Mat_SeqAIJKokkos *ckok;
833: ISLocalToGlobalMapping l2gmap;
834: const PetscInt *garray;
835: PetscInt sz;
837: /* Compact the matrix's global col ids and col size. We do this since we guess that KokkosKernels might be more memory scalable with local ids */
838: MatSeqAIJCompactOutExtraColumns_SeqAIJ(C, &l2gmap);
839: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
840: ckok->j_dual.modify_host(); /* C's j is modified on host; we need to sync it to the device */
841: ckok->j_dual.sync_device();
842: ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */
844: /* Build l2g -- the local to global mapping of C's cols */
845: ISLocalToGlobalMappingGetIndices(l2gmap, &garray);
846: ISLocalToGlobalMappingGetSize(l2gmap, &sz);
849: ConstMatColIdxKokkosViewHost tmp(garray, sz);
850: l2g = MatColIdxKokkosView("l2g", sz);
851: Kokkos::deep_copy(l2g, tmp);
853: ISLocalToGlobalMappingRestoreIndices(l2gmap, &garray);
854: ISLocalToGlobalMappingDestroy(&l2gmap);
855: return 0;
856: }
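/* Worked example with made-up numbers: if C's j[] references global cols {3, 42, 7, 3} and its col size
   is N = 100, the compaction rewrites j[] to local ids {0, 2, 1, 0}, shrinks the col size to 3, and
   returns l2g = {3, 7, 42}, so that l2g(local id) recovers the original global col id. */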
858: /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos
860: Input Parameters:
861: + product - Mat_Product which carried out the computation. Passed in to access info about this mat product.
862: . A - an MPIAIJKOKKOS matrix
863: . B - an MPIAIJKOKKOS matrix
864: - mm - a struct used to stash intermediate data when computing AB. It persists from the symbolic to the numeric operation.
866: Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
867: */
868: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product, Mat A, Mat B, MatMatStruct_AB *mm)
869: {
870: Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ *>(A->data);
871: Mat Ad = a->A, Ao = a->B; /* diag and offdiag of A */
872: IS glob = NULL;
873: const PetscInt *garray;
874: PetscInt N = B->cmap->N, sz;
875: ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */
876: MatColIdxKokkosView l2g2;
877: Mat C1, C2; /* intermediate matrices */
879: /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */
880: MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &mm->B_local);
881: MatProductCreate(Ad, mm->B_local, NULL, &C1);
882: MatProductSetType(C1, MATPRODUCT_AB);
883: MatProductSetFill(C1, product->fill);
884: C1->product->api_user = product->api_user;
885: MatProductSetFromOptions(C1);
886: PetscUseTypeMethod(C1, productsymbolic);
888: ISGetIndices(glob, &garray);
889: ISGetSize(glob, &sz);
890: const auto &tmp = ConstMatColIdxKokkosViewHost(garray, sz); /* wrap garray as a view */
891: l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), tmp); /* maybe just an alias to tmp, so we restore garray at the very end */
892: MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g1, mm->C1_global);
894: /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */
895: MatSeqAIJKokkosBcast(mm->B_local, MAT_INITIAL_MATRIX, N, l2g1, a->Mvctx, mm->sf, mm->abuf, mm->rows, mm->rowoffset, mm->B_other);
897: /* Compact B_other to use local ids, as we guess KK spgemm is more memory scalable that way; we could skip the compaction to simplify the code */
898: MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other, l2g2);
899: MatProductCreate(Ao, mm->B_other, NULL, &C2);
900: MatProductSetType(C2, MATPRODUCT_AB);
901: MatProductSetFill(C2, product->fill);
902: C2->product->api_user = product->api_user;
903: MatProductSetFromOptions(C2);
904: PetscUseTypeMethod(C2, productsymbolic);
905: MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2, N, l2g2, mm->C2_global);
907: /* C = C1 + C2. We actually add their global-col-id versions */
908: mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */
909: KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
910: /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
911: KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);
913: mm->C1 = C1;
914: mm->C2 = C2;
915: ISRestoreIndices(glob, &garray);
916: ISDestroy(&glob);
917: return 0;
918: }
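/* The decomposition used above, written out: with A split into its diag/offdiag blocks [Ad Ao], and B's
   needed rows split into the locally owned part (B_local) and the part fetched from other ranks
   (B_other), the local rows of C = A*B are C1 + C2 = Ad*B_local + Ao*B_other, added with
   KokkosSparse::spadd using global col ids. */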
920: /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos
922: Input Parameters:
923: + product - Mat_Product which carried out the computation. Passed in to access info about this mat product.
924: . A - an MPIAIJKOKKOS matrix
925: . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank.
926: . localB - Does B use local col ids? If false, then B is already in global col ids.
927: . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator.
928: . l2g - If localB, then l2g maps B's local col ids to global ones.
929: - mm - a struct used to stash intermediate data in AtB
931: Note: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids.
932: */
933: static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product, Mat A, Mat B, PetscBool localB, PetscInt N, const ConstMatColIdxKokkosView &l2g, MatMatStruct_AtB *mm)
934: {
935: Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ *>(A->data);
936: Mat Ad = a->A, Ao = a->B; /* diag and offdiag of A */
937: Mat C1, C2; /* intermediate matrices */
939: /* C1 = Ad^t * B */
940: MatProductCreate(Ad, B, NULL, &C1);
941: MatProductSetType(C1, MATPRODUCT_AtB);
942: MatProductSetFill(C1, product->fill);
943: C1->product->api_user = product->api_user;
944: MatProductSetFromOptions(C1);
945: PetscUseTypeMethod(C1, productsymbolic);
947: if (localB) MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1, N, l2g, mm->C1_global);
948: else mm->C1_global = static_cast<Mat_SeqAIJKokkos *>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */
950: /* C2 = Ao^t * B */
951: MatProductCreate(Ao, B, NULL, &C2);
952: MatProductSetType(C2, MATPRODUCT_AtB);
953: MatProductSetFill(C2, product->fill);
954: C2->product->api_user = product->api_user;
955: MatProductSetFromOptions(C2);
956: PetscUseTypeMethod(C2, productsymbolic);
958: MatSeqAIJKokkosReduce(C2, MAT_INITIAL_MATRIX, localB, N, l2g, a->Mvctx, mm->sf, mm->abuf, mm->srcrowoffset, mm->dstrowoffset, mm->C2_global);
960: mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */
961: KokkosSparse::spadd_symbolic(&mm->kh, mm->C1_global, mm->C2_global, mm->C_global);
962: /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */
963: KokkosSparse::spadd_numeric(&mm->kh, (MatScalarType)1.0, mm->C1_global, (MatScalarType)1.0, mm->C2_global, mm->C_global);
964: mm->C1 = C1;
965: mm->C2 = C2;
966: return 0;
967: }
969: PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C)
970: {
971: Mat_Product *product = C->product;
972: MatProductType ptype;
973: MatProductData_MPIAIJKokkos *mmdata;
974: MatMatStruct *mm = NULL;
975: MatMatStruct_AB *ab;
976: MatMatStruct_AtB *atb;
977: Mat A, B, Ad, Ao, Bd, Bo;
978: const MatScalarType one = 1.0; /* Do not use the literal 1.0 directly, to avoid a wrong template instantiation in KokkosSparse::spadd_numeric */
980: MatCheckProduct(C, 1);
981: mmdata = static_cast<MatProductData_MPIAIJKokkos *>(product->data);
982: ptype = product->type;
983: A = product->A;
984: B = product->B;
985: Ad = static_cast<Mat_MPIAIJ *>(A->data)->A;
986: Ao = static_cast<Mat_MPIAIJ *>(A->data)->B;
987: Bd = static_cast<Mat_MPIAIJ *>(B->data)->A;
988: Bo = static_cast<Mat_MPIAIJ *>(B->data)->B;
990: if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
991: mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */
992: ab = mmdata->mmAB;
993: atb = mmdata->mmAtB;
994: if (ab) {
995: static_cast<MatProductData_SeqAIJKokkos *>(ab->C1->product->data)->reusesym = PETSC_FALSE;
996: static_cast<MatProductData_SeqAIJKokkos *>(ab->C2->product->data)->reusesym = PETSC_FALSE;
997: }
998: if (atb) {
999: static_cast<MatProductData_SeqAIJKokkos *>(atb->C1->product->data)->reusesym = PETSC_FALSE;
1000: static_cast<MatProductData_SeqAIJKokkos *>(atb->C2->product->data)->reusesym = PETSC_FALSE;
1001: }
1002: return 0;
1003: }
1005: if (ptype == MATPRODUCT_AB) {
1006: ab = mmdata->mmAB;
1007: /* C1 = Ad * B_local */
1009: MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);
1011: if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1012: (*ab->C1->ops->productnumeric)(ab->C1);
1013: MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1014: /* C2 = Ao * B_other */
1016: if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1017: (*ab->C2->ops->productnumeric)(ab->C2);
1018: /* C = C1_global + C2_global */
1019: KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);
1020: mm = static_cast<MatMatStruct *>(ab);
1021: } else if (ptype == MATPRODUCT_AtB) {
1022: atb = mmdata->mmAtB;
1024: /* C1 = Ad^t * B_local */
1025: MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &atb->B_local);
1027: if (atb->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, atb->C1);
1028: (*atb->C1->ops->productnumeric)(atb->C1);
1030: /* C2 = Ao^t * B_local */
1032: if (atb->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, atb->C2);
1033: (*atb->C2->ops->productnumeric)(atb->C2);
1034: /* Form C2_global */
1035: MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_TRUE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1036: /* C = C1_global + C2_global */
1037: KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1038: mm = static_cast<MatMatStruct *>(atb);
1039: } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1040: ab = mmdata->mmAB;
1041: MatMPIAIJGetLocalMatMerge(B, MAT_REUSE_MATRIX, NULL /* glob */, &ab->B_local);
1043: /* ab->C1 = Ad * B_local */
1045: if (ab->C1->product->A != Ad) MatProductReplaceMats(Ad, NULL, NULL, ab->C1);
1046: (*ab->C1->ops->productnumeric)(ab->C1);
1047: MatSeqAIJKokkosBcast(ab->B_local, MAT_REUSE_MATRIX, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /*ownerSF*/, ab->sf, ab->abuf, ab->rows, ab->rowoffset, ab->B_other);
1048: /* ab->C2 = Ao * B_other */
1049: if (ab->C2->product->A != Ao) MatProductReplaceMats(Ao, NULL, NULL, ab->C2);
1050: (*ab->C2->ops->productnumeric)(ab->C2); /* C2 = Ao * B_other */
1051: KokkosSparse::spadd_numeric(&ab->kh, one, ab->C1_global, one, ab->C2_global, ab->C_global);
1053: /* atb->C1 = Bd^t * ab->C_petsc */
1054: atb = mmdata->mmAtB;
1056: if (atb->C1->product->A != Bd) MatProductReplaceMats(Bd, NULL, NULL, atb->C1);
1057: (*atb->C1->ops->productnumeric)(atb->C1);
1058: /* atb->C2 = Bo^t * ab->C_petsc */
1059: if (atb->C2->product->A != Bo) MatProductReplaceMats(Bo, NULL, NULL, atb->C2);
1060: (*atb->C2->ops->productnumeric)(atb->C2);
1061: MatSeqAIJKokkosReduce(atb->C2, MAT_REUSE_MATRIX, PETSC_FALSE, 0 /* N */, MatColIdxKokkosView() /*l2g*/, NULL /* ownerSF */, atb->sf, atb->abuf, atb->srcrowoffset, atb->dstrowoffset, atb->C2_global);
1062: KokkosSparse::spadd_numeric(&atb->kh, one, atb->C1_global, one, atb->C2_global, atb->C_global);
1063: mm = static_cast<MatMatStruct *>(atb);
1064: }
1065: /* Split C_global to form C */
1066: MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_REUSE_MATRIX, mm->C_global, mm->Cdstart);
1067: return 0;
1068: }
1070: PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C)
1071: {
1072: Mat A, B;
1073: Mat_Product *product = C->product;
1074: MatProductType ptype;
1075: MatProductData_MPIAIJKokkos *mmdata;
1076: MatMatStruct *mm = NULL;
1077: IS glob = NULL;
1078: const PetscInt *garray;
1079: PetscInt m, n, M, N, sz;
1080: ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */
1082: MatCheckProduct(C, 1);
1084: ptype = product->type;
1085: A = product->A;
1086: B = product->B;
1088: switch (ptype) {
1089: case MATPRODUCT_AB:
1090: m = A->rmap->n;
1091: n = B->cmap->n;
1092: M = A->rmap->N;
1093: N = B->cmap->N;
1094: break;
1095: case MATPRODUCT_AtB:
1096: m = A->cmap->n;
1097: n = B->cmap->n;
1098: M = A->cmap->N;
1099: N = B->cmap->N;
1100: break;
1101: case MATPRODUCT_PtAP:
1102: m = B->cmap->n;
1103: n = B->cmap->n;
1104: M = B->cmap->N;
1105: N = B->cmap->N;
1106: break; /* BtAB */
1107: default:
1108: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
1109: }
1111: MatSetSizes(C, m, n, M, N);
1112: PetscLayoutSetUp(C->rmap);
1113: PetscLayoutSetUp(C->cmap);
1114: MatSetType(C, ((PetscObject)A)->type_name);
1116: mmdata = new MatProductData_MPIAIJKokkos();
1117: mmdata->reusesym = product->api_user;
1119: if (ptype == MATPRODUCT_AB) {
1120: mmdata->mmAB = new MatMatStruct_AB();
1121: MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, mmdata->mmAB);
1122: mm = static_cast<MatMatStruct *>(mmdata->mmAB);
1123: } else if (ptype == MATPRODUCT_AtB) {
1124: mmdata->mmAtB = new MatMatStruct_AtB();
1125: auto atb = mmdata->mmAtB;
1126: MatMPIAIJGetLocalMatMerge(B, MAT_INITIAL_MATRIX, &glob, &atb->B_local);
1127: ISGetIndices(glob, &garray);
1128: ISGetSize(glob, &sz);
1129: l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatColIdxKokkosViewHost(garray, sz));
1130: MatProductSymbolic_MPIAIJKokkos_AtB(product, A, atb->B_local, PETSC_TRUE, N, l2g, atb);
1131: ISRestoreIndices(glob, &garray);
1132: ISDestroy(&glob);
1133: mm = static_cast<MatMatStruct *>(atb);
1134: } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */
1135: mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */
1136: mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */
1137: auto ab = mmdata->mmAB;
1138: auto atb = mmdata->mmAtB;
1139: MatProductSymbolic_MPIAIJKokkos_AB(product, A, B, ab);
1140: auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */
1141: MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, tmp, &ab->C_petsc);
1142: MatProductSymbolic_MPIAIJKokkos_AtB(product, B, ab->C_petsc, PETSC_FALSE, N, l2g /*not used*/, atb);
1143: mm = static_cast<MatMatStruct *>(atb);
1144: }
1145: /* Split the C_global into petsc A, B format */
1146: MatSetMPIAIJKokkosWithGlobalCSRMatrix(C, MAT_INITIAL_MATRIX, mm->C_global, mm->Cdstart);
1147: C->product->data = mmdata;
1148: C->product->destroy = MatProductDataDestroy_MPIAIJKokkos;
1149: C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos;
1150: return 0;
1151: }
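/* Illustrative sketch (hypothetical helper): the public MatProduct sequence that reaches
   MatProductSymbolic_MPIAIJKokkos() / MatProductNumeric_MPIAIJKokkos() above when A and B are
   MATMPIAIJKOKKOS. Error checking is omitted. */
static PetscErrorCode ExampleProductAB(Mat A, Mat B, Mat *C)
{
  MatProductCreate(A, B, NULL, C);
  MatProductSetType(*C, MATPRODUCT_AB); /* or MATPRODUCT_AtB, MATPRODUCT_PtAP */
  MatProductSetFromOptions(*C);
  MatProductSymbolic(*C);
  MatProductNumeric(*C);
  return 0;
}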
1153: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat)
1154: {
1155: Mat_Product *product = mat->product;
1156: PetscBool match = PETSC_FALSE;
1157: PetscBool usecpu = PETSC_FALSE;
1159: MatCheckProduct(mat, 1);
1160: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match);
1161: if (match) { /* we can always fallback to the CPU if requested */
1162: switch (product->type) {
1163: case MATPRODUCT_AB:
1164: if (product->api_user) {
1165: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
1166: PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1167: PetscOptionsEnd();
1168: } else {
1169: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
1170: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
1171: PetscOptionsEnd();
1172: }
1173: break;
1174: case MATPRODUCT_AtB:
1175: if (product->api_user) {
1176: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
1177: PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1178: PetscOptionsEnd();
1179: } else {
1180: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
1181: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
1182: PetscOptionsEnd();
1183: }
1184: break;
1185: case MATPRODUCT_PtAP:
1186: if (product->api_user) {
1187: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
1188: PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1189: PetscOptionsEnd();
1190: } else {
1191: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
1192: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
1193: PetscOptionsEnd();
1194: }
1195: break;
1196: default:
1197: break;
1198: }
1199: match = (PetscBool)!usecpu;
1200: }
1201: if (match) {
1202: switch (product->type) {
1203: case MATPRODUCT_AB:
1204: case MATPRODUCT_AtB:
1205: case MATPRODUCT_PtAP:
1206: mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos;
1207: break;
1208: default:
1209: break;
1210: }
1211: }
1212: /* fallback to MPIAIJ ops */
1213: if (!mat->ops->productsymbolic) MatProductSetFromOptions_MPIAIJ(mat);
1214: return 0;
1215: }
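/* Example of the CPU fallback handled above, assuming the product is requested through MatMatMult()
   and the matrix has no options prefix: run with -matmatmult_backend_cpu, or set it programmatically
   before the product is formed, e.g.
     PetscOptionsSetValue(NULL, "-matmatmult_backend_cpu", "1");
   in which case productsymbolic is left unset here and the MPIAIJ host implementation is used. */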
1217: static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1218: {
1219: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
1220: Mat_MPIAIJKokkos *mpikok;
1222: MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
1223: mat->preallocated = PETSC_TRUE;
1224: MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
1225: MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
1226: MatZeroEntries(mat);
1227: mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1228: delete mpikok;
1229: mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij);
1230: return 0;
1231: }
1233: static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat, const PetscScalar v[], InsertMode imode)
1234: {
1235: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
1236: Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos *>(mpiaij->spptr);
1237: Mat A = mpiaij->A, B = mpiaij->B;
1238: PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
1239: MatScalarKokkosView Aa, Ba;
1240: MatScalarKokkosView v1;
1241: MatScalarKokkosView &vsend = mpikok->sendbuf_d;
1242: const MatScalarKokkosView &v2 = mpikok->recvbuf_d;
1243: const PetscCountKokkosView &Ajmap1 = mpikok->Ajmap1_d, Ajmap2 = mpikok->Ajmap2_d, Aimap2 = mpikok->Aimap2_d;
1244: const PetscCountKokkosView &Bjmap1 = mpikok->Bjmap1_d, Bjmap2 = mpikok->Bjmap2_d, Bimap2 = mpikok->Bimap2_d;
1245: const PetscCountKokkosView &Aperm1 = mpikok->Aperm1_d, Aperm2 = mpikok->Aperm2_d, Bperm1 = mpikok->Bperm1_d, Bperm2 = mpikok->Bperm2_d;
1246: const PetscCountKokkosView &Cperm1 = mpikok->Cperm1_d;
1247: PetscMemType memtype;
1249: PetscGetMemType(v, &memtype); /* Return PETSC_MEMTYPE_HOST when v is NULL */
1250:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] in host memory, copy it to the device */
1251: v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), MatScalarKokkosViewHost((PetscScalar *)v, mpiaij->coo_n));
1252: } else {
1253: v1 = MatScalarKokkosView((PetscScalar *)v, mpiaij->coo_n); /* Directly use v[]'s memory */
1254: }
1256: if (imode == INSERT_VALUES) {
1257: MatSeqAIJGetKokkosViewWrite(A, &Aa); /* write matrix values */
1258: MatSeqAIJGetKokkosViewWrite(B, &Ba);
1259: } else {
1260: MatSeqAIJGetKokkosView(A, &Aa); /* read & write matrix values */
1261: MatSeqAIJGetKokkosView(B, &Ba);
1262: }
1264:   /* Pack the entries that must be sent to remote processes */
1265: Kokkos::parallel_for(
1266: vsend.extent(0), KOKKOS_LAMBDA(const PetscCount i) { vsend(i) = v1(Cperm1(i)); });
1268: /* Send remote entries to their owner and overlap the communication with local computation */
1269: PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_KOKKOS, vsend.data(), PETSC_MEMTYPE_KOKKOS, v2.data(), MPI_REPLACE);
1270: /* Add local entries to A and B in one kernel */
1271: Kokkos::parallel_for(
1272: Annz + Bnnz, KOKKOS_LAMBDA(PetscCount i) {
1273: PetscScalar sum = 0.0;
1274: if (i < Annz) {
1275: for (PetscCount k = Ajmap1(i); k < Ajmap1(i + 1); k++) sum += v1(Aperm1(k));
1276: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1277: } else {
1278: i -= Annz;
1279: for (PetscCount k = Bjmap1(i); k < Bjmap1(i + 1); k++) sum += v1(Bperm1(k));
1280: Ba(i) = (imode == INSERT_VALUES ? 0.0 : Ba(i)) + sum;
1281: }
1282: });
1283: PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend.data(), v2.data(), MPI_REPLACE);
1285: /* Add received remote entries to A and B in one kernel */
1286: Kokkos::parallel_for(
1287: Annz2 + Bnnz2, KOKKOS_LAMBDA(PetscCount i) {
1288: if (i < Annz2) {
1289: for (PetscCount k = Ajmap2(i); k < Ajmap2(i + 1); k++) Aa(Aimap2(i)) += v2(Aperm2(k));
1290: } else {
1291: i -= Annz2;
1292: for (PetscCount k = Bjmap2(i); k < Bjmap2(i + 1); k++) Ba(Bimap2(i)) += v2(Bperm2(k));
1293: }
1294: });
1296: if (imode == INSERT_VALUES) {
1297: MatSeqAIJRestoreKokkosViewWrite(A, &Aa); /* Increase A & B's state etc. */
1298: MatSeqAIJRestoreKokkosViewWrite(B, &Ba);
1299: } else {
1300: MatSeqAIJRestoreKokkosView(A, &Aa);
1301: MatSeqAIJRestoreKokkosView(B, &Ba);
1302: }
1303: return 0;
1304: }
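/* Usage sketch (illustrative): the user-level COO assembly path implemented by the two routines
   above. The sizes, indices and values below are assumed for a tiny example; v[] may reside in
   either host or device memory, as handled by the PetscGetMemType() check above. */
static PetscErrorCode DemoCOOAssembly(MPI_Comm comm)
{
  Mat         A;
  PetscInt    coo_i[3] = {0, 0, 1}, coo_j[3] = {0, 1, 1}; /* COO indices; repeated and off-process entries are allowed */
  PetscScalar v[3]     = {1.0, 2.0, 3.0};

  MatCreate(comm, &A);
  MatSetSizes(A, 2, 2, PETSC_DECIDE, PETSC_DECIDE);
  MatSetType(A, MATAIJKOKKOS);
  MatSetPreallocationCOO(A, 3, coo_i, coo_j); /* builds the maps and SF consumed by MatSetValuesCOO_MPIAIJKokkos() */
  MatSetValuesCOO(A, v, INSERT_VALUES);       /* or ADD_VALUES to accumulate into existing values */
  MatDestroy(&A);
  return 0;
}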
1306: PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A)
1307: {
1308: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
1310: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
1311: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
1312: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
1313: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
1314: delete (Mat_MPIAIJKokkos *)mpiaij->spptr;
1315: MatDestroy_MPIAIJ(A);
1316: return 0;
1317: }
1319: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
1320: {
1321: Mat B;
1322: Mat_MPIAIJ *a;
1324: if (reuse == MAT_INITIAL_MATRIX) {
1325: MatDuplicate(A, MAT_COPY_VALUES, newmat);
1326: } else if (reuse == MAT_REUSE_MATRIX) {
1327: MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
1328: }
1329: B = *newmat;
1331: B->boundtocpu = PETSC_FALSE;
1332: PetscFree(B->defaultvectype);
1333: PetscStrallocpy(VECKOKKOS, &B->defaultvectype);
1334: PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJKOKKOS);
1336: a = static_cast<Mat_MPIAIJ *>(A->data);
1337: if (a->A) MatSetType(a->A, MATSEQAIJKOKKOS);
1338: if (a->B) MatSetType(a->B, MATSEQAIJKOKKOS);
1339: if (a->lvec) VecSetType(a->lvec, VECSEQKOKKOS);
1341: B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos;
1342: B->ops->mult = MatMult_MPIAIJKokkos;
1343: B->ops->multadd = MatMultAdd_MPIAIJKokkos;
1344: B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos;
1345: B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos;
1346: B->ops->destroy = MatDestroy_MPIAIJKokkos;
1348: PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJKokkos);
1349: PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJKokkos);
1350: PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos);
1351: PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos);
1352: return 0;
1353: }
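/* Usage sketch (illustrative): retyping an existing MATMPIAIJ matrix through the public
   MatConvert() interface, which dispatches to the conversion routine above. */
static PetscErrorCode DemoConvertToKokkos(Mat A, Mat *Akok)
{
  MatConvert(A, MATMPIAIJKOKKOS, MAT_INITIAL_MATRIX, Akok); /* out-of-place copy with the Kokkos type */
  /* in-place alternative: MatConvert(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A); */
  return 0;
}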
1354: /*MC
1355: MATAIJKOKKOS - "mpiaijkokkos", a matrix type to be used for CSR sparse matrices with Kokkos
1357:    A matrix type that uses the Kokkos-Kernels CrsMatrix format for portability across different device types
1359: Options Database Keys:
1360: . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
1362: Level: beginner
1364: .seealso: `MatCreateAIJKokkos()`, `MATSEQAIJKOKKOS`, `MATSEQAIJ`, `MATMPIAIJ`
1365: M*/
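/* Usage sketch (illustrative): selecting this type from the options database with
   -mat_type aijkokkos, following the usual MatCreate()/MatSetFromOptions()/preallocation
   paradigm. The sizes and nonzero estimates are assumed for the example. */
static PetscErrorCode DemoTypeFromOptions(MPI_Comm comm, Mat *A)
{
  MatCreate(comm, A);
  MatSetSizes(*A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
  MatSetFromOptions(*A);                           /* picks up -mat_type aijkokkos */
  MatSeqAIJSetPreallocation(*A, 5, NULL);          /* honored when the matrix ends up sequential */
  MatMPIAIJSetPreallocation(*A, 5, NULL, 2, NULL); /* honored when it ends up parallel */
  return 0;
}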
1366: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1367: {
1368: PetscKokkosInitializeCheck();
1369: MatCreate_MPIAIJ(A);
1370: MatConvert_MPIAIJ_MPIAIJKokkos(A, MATMPIAIJKOKKOS, MAT_INPLACE_MATRIX, &A);
1371: return 0;
1372: }
1374: /*@C
1375:    MatCreateAIJKokkos - Creates a sparse matrix in `MATAIJKOKKOS` (compressed row) format
1376:    (the default parallel PETSc format). This matrix will ultimately be pushed down
1377:    to Kokkos for calculations. For good matrix
1378:    assembly performance the user should preallocate the matrix storage by setting
1379:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
1380:    performance during matrix assembly can be increased by more than a factor of 50.
1382: Collective
1384: Input Parameters:
1385: +  comm - MPI communicator
1386: .  m - number of local rows (or `PETSC_DECIDE`)
1387: .  n - number of local columns (or `PETSC_DECIDE`)
1388: .  M, N - number of global rows and columns (or `PETSC_DETERMINE` to have them computed from m and n)
1389: .  d_nz, d_nnz - number of nonzeros per row (or a per-row array, possibly different for each row, or NULL) for the diagonal portion of the local submatrix
1390: -  o_nz, o_nnz - number of nonzeros per row (or a per-row array, possibly different for each row, or NULL) for the off-diagonal portion of the local submatrix
1392: Output Parameter:
1393: . A - the matrix
1395: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1396: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1397: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1399: Notes:
1400:    If d_nnz (or o_nnz) is given then d_nz (or o_nz) is ignored
1402:    The AIJ format (also called compressed row storage) is fully compatible with standard Fortran 77
1403:    storage. That is, the stored row and column indices can begin at
1404:    either one (as in Fortran) or zero. See the users' manual for details.
1406:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
1407:    Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
1408:    allocation. For large problems you MUST preallocate memory or you
1409:    will get TERRIBLE performance; see the users' manual chapter on matrices.
1411: By default, this format uses inodes (identical nodes) when possible, to
1412: improve numerical efficiency of matrix-vector products and solves. We
1413: search for consecutive rows with the same nonzero structure, thereby
1414: reusing matrix information to achieve increased efficiency.
1416: Level: intermediate
1418: .seealso: `MATAIJKOKKOS`, `MATSEQAIJKOKKOS`, `MATMPIAIJKOKKOS`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1419: @*/
1420: PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
1421: {
1422: PetscMPIInt size;
1424: MatCreate(comm, A);
1425: MatSetSizes(*A, m, n, M, N);
1426: MPI_Comm_size(comm, &size);
1427: if (size > 1) {
1428: MatSetType(*A, MATMPIAIJKOKKOS);
1429: MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
1430: } else {
1431: MatSetType(*A, MATSEQAIJKOKKOS);
1432: MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
1433: }
1434: return 0;
1435: }
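/* Usage sketch (illustrative): direct creation with preallocation for the diagonal and
   off-diagonal blocks; the local sizes and nonzero estimates are assumed for the example. */
static PetscErrorCode DemoCreateAIJKokkos(MPI_Comm comm, Mat *A)
{
  /* 100 local rows and columns, at most 5 nonzeros per row in the diagonal block
     and 2 in the off-diagonal block */
  MatCreateAIJKokkos(comm, 100, 100, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, A);
  return 0;
}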
1437: // Get a device (GPU) pointer to a stripped-down CSR view of the matrix; works for both Seq and MPI Mat.
1438: PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
1439: {
1440: PetscMPIInt size, rank;
1441: MPI_Comm comm;
1442: PetscSplitCSRDataStructure d_mat = NULL;
1444: PetscObjectGetComm((PetscObject)A, &comm);
1445: MPI_Comm_size(comm, &size);
1446: MPI_Comm_rank(comm, &rank);
1447: if (size == 1) {
1448: MatSeqAIJKokkosGetDeviceMat(A, &d_mat);
1449: MatSeqAIJKokkosModifyDevice(A); /* Since we are going to modify matrix values on device */
1450: } else {
1451: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
1452: MatSeqAIJKokkosGetDeviceMat(aij->A, &d_mat);
1453: MatSeqAIJKokkosModifyDevice(aij->A);
1454: MatSeqAIJKokkosModifyDevice(aij->B);
1456: }
1457:   // Mimic MatSetValues() bookkeeping here, since the values will be set on the device rather than on the host
1458:   if (A->assembled) {
1459:     if (A->was_assembled) PetscInfo(A, "Assembled more than once already\n");
1460:     A->was_assembled = PETSC_TRUE; // normally done lazily in MatAssemble / AIJ MatAssemblyEnd, which are not called on this path
1461:   } else {
1462:     PetscInfo(A, "Warning: matrix is not assembled (assembled = %d)\n", (int)A->assembled);
1463: }
1464: if (!d_mat) {
1465: struct _n_SplitCSRMat h_mat; /* host container */
1466: Mat_SeqAIJKokkos *aijkokA;
1467: Mat_SeqAIJ *jaca;
1468: PetscInt n = A->rmap->n, nnz;
1469: Mat Amat;
1470: PetscInt *colmap;
1472: /* create and copy h_mat */
1473:     h_mat.M = A->cmap->N; // global number of columns; used only in debug builds
1474: PetscInfo(A, "Create device matrix in Kokkos\n");
1475: if (size == 1) {
1476: Amat = A;
1477: jaca = (Mat_SeqAIJ *)A->data;
1478: h_mat.rstart = 0;
1479: h_mat.rend = A->rmap->n;
1480: h_mat.cstart = 0;
1481: h_mat.cend = A->cmap->n;
1482: h_mat.offdiag.i = h_mat.offdiag.j = NULL;
1483: h_mat.offdiag.a = NULL;
1484: aijkokA = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1485: } else {
1486: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
1487: Mat_SeqAIJ *jacb = (Mat_SeqAIJ *)aij->B->data;
1488: PetscInt ii;
1489: Mat_SeqAIJKokkos *aijkokB;
1491: Amat = aij->A;
1492: aijkokA = static_cast<Mat_SeqAIJKokkos *>(aij->A->spptr);
1493: aijkokB = static_cast<Mat_SeqAIJKokkos *>(aij->B->spptr);
1494: jaca = (Mat_SeqAIJ *)aij->A->data;
1497: aij->donotstash = PETSC_TRUE;
1498: aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
1499: jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
1500: PetscCalloc1(A->cmap->N, &colmap);
1501: for (ii = 0; ii < aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii + 1;
1502: // allocate B copy data
1503: h_mat.rstart = A->rmap->rstart;
1504: h_mat.rend = A->rmap->rend;
1505: h_mat.cstart = A->cmap->rstart;
1506: h_mat.cend = A->cmap->rend;
1507: nnz = jacb->i[n];
1508: if (jacb->compressedrow.use) {
1509: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_i_k(jacb->i, n + 1);
1510: aijkokB->i_uncompressed_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_i_k));
1511: Kokkos::deep_copy(aijkokB->i_uncompressed_d, h_i_k);
1512: h_mat.offdiag.i = aijkokB->i_uncompressed_d.data();
1513: } else {
1514: h_mat.offdiag.i = aijkokB->i_device_data();
1515: }
1516: h_mat.offdiag.j = aijkokB->j_device_data();
1517: h_mat.offdiag.a = aijkokB->a_device_data();
1518: {
1519: Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_colmap_k(colmap, A->cmap->N);
1520: aijkokB->colmap_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_colmap_k));
1521: Kokkos::deep_copy(aijkokB->colmap_d, h_colmap_k);
1522: h_mat.colmap = aijkokB->colmap_d.data();
1523: PetscFree(colmap);
1524: }
1525: h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
1526: h_mat.offdiag.n = n;
1527: }
1528: // allocate A copy data
1529: nnz = jaca->i[n];
1530: h_mat.diag.n = n;
1531: h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1532: MPI_Comm_rank(comm, &h_mat.rank);
1534: h_mat.diag.i = aijkokA->i_device_data();
1535: h_mat.diag.j = aijkokA->j_device_data();
1536: h_mat.diag.a = aijkokA->a_device_data();
1537:     // copy pointers and metadata to device
1538: MatSeqAIJKokkosSetDeviceMat(Amat, &h_mat);
1539: MatSeqAIJKokkosGetDeviceMat(Amat, &d_mat);
1540: PetscInfo(A, "Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n", h_mat.diag.n, nnz);
1541: }
1542:   *B = d_mat;                 // return the device matrix; it has also been cached in the Mat
1543:   A->assembled = PETSC_FALSE; // ready for device-side writes; normally this is done lazily inside MatSetValues()
1544: return 0;
1545: }
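/* Usage sketch (hypothetical): the returned pointer refers to device memory and is intended to be
   captured in device kernels. The scaling kernel below is an assumed example, but it touches only
   fields populated above (diag.i, diag.a) and dereferences d_mat exclusively in device code. */
static PetscErrorCode DemoScaleDiagBlockOnDevice(Mat A, PetscScalar alpha)
{
  PetscSplitCSRDataStructure d_mat;
  PetscInt                   nloc;

  MatGetLocalSize(A, &nloc, NULL);
  MatKokkosGetDeviceMatWrite(A, &d_mat); /* also marks the device copy as modified */
  Kokkos::parallel_for(
    nloc, KOKKOS_LAMBDA(const PetscInt r) {
      for (PetscInt k = d_mat->diag.i[r]; k < d_mat->diag.i[r + 1]; k++) d_mat->diag.a[k] *= alpha;
    });
  return 0;
}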
1547: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask)
1548: {
1549: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1551: if (!aijkok) *mask = "AIJKOK_UNALLOCATED";
1552: else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU";
1553: else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU";
1554: else *mask = "PETSC_OFFLOAD_BOTH";
1555: return 0;
1556: }
1558: PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A)
1559: {
1560: PetscMPIInt size;
1561: Mat Ad, Ao;
1562: const char *amask, *bmask;
1564: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
1566: if (size == 1) {
1567: MatSeqAIJKokkosGetOffloadMask(A, &amask);
1568: PetscPrintf(PETSC_COMM_SELF, "%s\n", amask);
1569: } else {
1570: Ad = ((Mat_MPIAIJ *)A->data)->A;
1571: Ao = ((Mat_MPIAIJ *)A->data)->B;
1572: MatSeqAIJKokkosGetOffloadMask(Ad, &amask);
1573: MatSeqAIJKokkosGetOffloadMask(Ao, &bmask);
1574: PetscPrintf(PETSC_COMM_SELF, "Diag : Off-diag = %s : %s\n", amask, bmask);
1575: }
1576: return 0;
1577: }