Actual source code: cupminterface.hpp

  1: #ifndef PETSCCUPMINTERFACE_HPP
  2: #define PETSCCUPMINTERFACE_HPP

  4: #if defined(__cplusplus)
  5: #include <petsc/private/cpputil.hpp>
  6: #include <petsc/private/petscadvancedmacros.h>
  7: #include <petscdevice_cupm.h>

  9:   #include <array>

 11: namespace Petsc
 12: {

 14: namespace device
 15: {

 17: namespace cupm
 18: {

 20: // enum describing available cupm devices, this is used as the template parameter to any
 21: // class subclassing the Interface or using it as a member variable
 22: enum class DeviceType : int {
 23:   CUDA,
 24:   HIP
 25: };

 27: // clang-format off
 28: static constexpr std::array<const char *const, 5> DeviceTypes = {
 29:   "cuda",
 30:   "hip",
 31:   "Petsc::Device::CUPM::DeviceType",
 32:   "Petsc::Device::CUPM::DeviceType::",
 33:   nullptr
 34: };
 35: // clang-format on

 37: namespace impl
 38: {

 40:   // A backend agnostic PetscCallCUPM() function, this will only work inside the member
 41:   // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
 42:   // functions can also be wrapped inline:
 43:   //
 44:   // foo<int,char,bool>();
 45:   #define PetscCallCUPM(...) \
 46:     do { \
 47:       const cupmError_t cerr_p_ = __VA_ARGS__; \
 49:     } while (0)

 51:   #define PetscCallCUPMAbort(comm_, ...) \
 52:     do { \
 53:       const cupmError_t cerr_abort_p_ = __VA_ARGS__; \
 55:     } while (0)

 57:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT() - declaration to alias a cuda/hip integral constant
 58:   // value
 59:   //
 60:   // input params:
 61:   // our_prefix   - the prefix of the alias
 62:   // our_suffix   - the suffix of the alias
 63:   // their_prefix - the prefix of the variable being aliased
 64:   // their_suffix - the suffix of the variable being aliased
 65:   //
 66:   // example usage:
 67:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,cuda,AllGood); ->
 68:   // static const auto cupmSuccess = cudaAllGood;
 69:   //
 70:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,hip,AllRight); ->
 71:   // static const auto cupmSuccess = hipAllRight;
 72:   #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) static const auto PetscConcat(our_prefix, our_suffix) = PetscConcat(their_prefix, their_suffix)

 74:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON() - declaration to alias a cuda/hip integral constant
 75:   // value
 76:   //
 77:   // input params:
 78:   // our_suffix   - the suffix of the alias
 79:   // their_suffix - the suffix of the variable being aliased
 80:   //
 81:   // notes:
 82:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix
 83:   //
 84:   // example usage:
 85:   // #define PETSC_CUPM_PREFIX_L cuda
 86:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllGood); ->
 87:   // static const auto cupmSuccess = cudaAllGood;
 88:   //
 89:   // #define PETSC_CUPM_PREFIX_L hip
 90:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllRight); ->
 91:   // static const auto cupmSuccess = hipAllRight;
 92:   #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(our_suffix, their_suffix) PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)

 94:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE() - declaration to alias a cuda/hip integral constant value
 95:   //
 96:   // input param:
 97:   // suffix - the common suffix shared between cuda, hip, and cupm
 98:   //
 99:   // notes:
100:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix
101:   //
102:   // example usage:
103:   // #define PETSC_CUPM_PREFIX_L cuda
104:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = cudaSuccess;
105:   //
106:   // #define PETSC_CUPM_PREFIX_L hip
107:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = hipSuccess;
108:   #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE(suffix) PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(suffix, suffix)

110:   // PETSC_CUPM_ALIAS_FUNCTION_EXACT() - declaration to alias a cuda/hip function
111:   //
112:   // input params:
113:   // our_prefix   - the prefix of the alias
114:   // our_suffix   - the suffix of the alias
115:   // their_prefix - the prefix of the function being aliased
116:   // their_suffix - the suffix of the function being aliased
117:   //
118:   // notes:
119:   // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
120:   //
121:   // example usage:
122:   // PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm,Malloc,cuda,Malloc) ->
123:   // template <typename... T>
124:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
125:   // {
126:   //   return cudaMalloc(std::forward<T>(args)...);
127:   // }
128:   #define PETSC_CUPM_ALIAS_FUNCTION_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) PETSC_ALIAS_FUNCTION(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix))

130:   // PETSC_CUPM_ALIAS_FUNCTION_COMMON() - declaration to alias a cuda/hip function
131:   //
132:   // input params:
133:   // our_suffix   - the suffix of the alias
134:   // their_suffix - the common suffix of the cuda/hip function being aliased
135:   //
136:   // notes:
137:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
138:   // aliased. see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
139:   //
140:   // example usage:
141:   // #define PETSC_CUPM_PREFIX_L cuda
142:   // PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
143:   // template <typename... T>
144:   // static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
145:   // {
146:   //   return cudaMalloc(std::forward<T>(args)...);
147:   // }
148:   //
149:   // #define PETSC_CUPM_PREFIX_L hip
150:   // PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
151:   // template <typename... T>
152:   // static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
153:   // {
154:   //   return hipMalloc(std::forward<T>(args)...);
155:   // }
156:   #define PETSC_CUPM_ALIAS_FUNCTION_COMMON(our_suffix, their_suffix) PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)

158:   // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
159:   //
160:   // input param:
161:   // suffix - the common suffix for hip, cuda and the alias
162:   //
163:   // notes:
164:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
165:   // aliased. see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
166:   //
167:   // example usage:
168:   // #define PETSC_CUPM_PREFIX_L cuda
169:   // PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
170:   // template <typename... T>
171:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
172:   // {
173:   //   return cudaMalloc(std::forward<T>(args)...);
174:   // }
175:   //
176:   // #define PETSC_CUPM_PREFIX_L hip
177:   // PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
178:   // template <typename... T>
179:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
180:   // {
181:   //   return hipMalloc(std::forward<T>(args)...);
182:   // }
183:   #define PETSC_CUPM_ALIAS_FUNCTION(suffix) PETSC_CUPM_ALIAS_FUNCTION_COMMON(suffix, suffix)

185:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT() - declaration to alias a cuda/hip function but
186:   // discard the last N arguments
187:   //
188:   // input params:
189:   // our_prefix   - the prefix of the alias
190:   // our_suffix   - the suffix of the alias
191:   // their_prefix - the prefix of the function being aliased
192:   // their_suffix - the suffix of the function being aliased
193:   // N            - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
194:   //
195:   // notes:
196:   // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
197:   //
198:   // example use:
199:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm,MallocAsync,cuda,Malloc,1) ->
200:   // template <typename... T, typename Tend>
201:   // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
202:   // return type deduction*
203:   // {
204:   //   (void)argend;
205:   //   return cudaMalloc(std::forward<T>(args)...);
206:   // }
207:   #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix), N)

209:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON() - declaration to alias a cuda/hip function but
210:   // discard the last N arguments
211:   //
212:   // input params:
213:   // our_suffix   - the suffix of the alias
214:   // their_suffix - the suffix of the function being aliased
215:   // N            - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
216:   //
217:   // notes:
218:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
219:   // aliased. see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the
220:   // expansion
221:   //
222:   // example use:
223:   // #define PETSC_CUPM_PREFIX_L cuda
224:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync,Malloc,1) ->
225:   // template <typename... T, typename Tend>
226:   // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
227:   // return type deduction*
228:   // {
229:   //   (void)argend;
230:   //   return cudaMalloc(std::forward<T>(args)...);
231:   // }
232:   #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(our_suffix, their_suffix, N) PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix, N)

234: // Base class that holds functions and variables that don't require CUDA or HIP to be present
235: // on the system
236: template <DeviceType T>
237: struct InterfaceBase {
238:   static const DeviceType type = T;

240:   PETSC_NODISCARD static constexpr const char *cupmName() noexcept
241:   {
242:     static_assert(util::integral_value(DeviceType::CUDA) == 0, "");
243:     static_assert(util::integral_value(DeviceType::HIP) == 1, "");
244:     return std::get<util::integral_value(T)>(DeviceTypes);
245:   }

247:   PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }

249:   PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
250: };

252: // declare the base class static member variables
253: template <DeviceType T>
254: const DeviceType InterfaceBase<T>::type;

256:   #define PETSC_CUPM_BASE_CLASS_HEADER(base_name, DEVICE_TYPE) \
257:     using base_name = ::Petsc::device::cupm::impl::InterfaceBase<DEVICE_TYPE>; \
258:     using base_name::type; \
259:     using base_name::cupmName; \
260:     using base_name::PETSC_DEVICE_CUPM; \
261:     using base_name::PETSC_MEMTYPE_CUPM

263: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
264: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
265: // import them into classes.
266: template <DeviceType>
267: struct InterfaceImpl;

269:   #if PetscDefined(HAVE_CUDA)
270:     #define PETSC_CUPM_PREFIX_L cuda
271:     #define PETSC_CUPM_PREFIX_U CUDA
272: template <>
273: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
274:   PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::CUDA);

276:   // typedefs
277:   using cupmError_t             = cudaError_t;
278:   using cupmEvent_t             = cudaEvent_t;
279:   using cupmStream_t            = cudaStream_t;
280:   using cupmDeviceProp_t        = cudaDeviceProp;
281:   using cupmMemcpyKind_t        = cudaMemcpyKind;
282:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
283:   using cupmPointerAttributes_t = struct cudaPointerAttributes;
284:   using cupmMemoryType_t        = enum cudaMemoryType;
285:   using cupmDim3                = dim3;
286:   using cupmHostFn_t            = cudaHostFn_t;
287:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
288:   using cupmMemPool_t   = cudaMemPool_t;
289:   using cupmMemPoolAttr = cudaMemPoolAttr;
290:     #else
291:   using cupmMemPool_t   = void *;
292:   using cupmMemPoolAttr = unsigned int;
293:     #endif

295:   // values
296:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
297:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
298:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorDeviceAlreadyInUse);
299:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
300:     #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
301:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorStubLibrary);
302:     #else
303:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
304:     #endif
305:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
306:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
307:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
308:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
309:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
310:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
311:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
312:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
313:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
314:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
315:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
316:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeManaged);
317:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
318:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocDefault);
319:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocWriteCombined);
320:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
321:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
322:     #else
323:   static const cupmMemPoolAttr       cupmMemPoolAttrReleaseThreshold = 0;
324:     #endif

326:   // error functions
327:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
328:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
329:   PETSC_CUPM_ALIAS_FUNCTION(GetLastError)

331:   // device management
332:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
333:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
334:   PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
335:   PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
336:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
337:   PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
338:   PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
339:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
340:   PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
341:   PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
342:     #else
343:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
344:   {
345:     *pool = nullptr;
346:     return cupmSuccess;
347:   }

349:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
350:     #endif
351:   // CUDA has no cudaInit() to match hipInit()
352:   PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }

354:   // stream management
355:   PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
356:   PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
357:   PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
358:   PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
359:   PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
360:   PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
361:   PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
362:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
363:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
364:   PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
365:   PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
366:   PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
367:   PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
368:   PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
369:   PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
370:   PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)

372:   // memory management
373:   PETSC_CUPM_ALIAS_FUNCTION(Free)
374:   PETSC_CUPM_ALIAS_FUNCTION(Malloc)
375:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
376:   PETSC_CUPM_ALIAS_FUNCTION(FreeAsync)
377:   PETSC_CUPM_ALIAS_FUNCTION(MallocAsync)
378:     #else
379:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
380:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
381:     #endif
382:   PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
383:   PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
384:   PETSC_CUPM_ALIAS_FUNCTION(MallocHost)
385:   PETSC_CUPM_ALIAS_FUNCTION(FreeHost)
386:   PETSC_CUPM_ALIAS_FUNCTION(Memset)
387:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
388:   PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)
389:     #else
390:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MemsetAsync, Memset, 1)
391:     #endif

393:   // launch control
394:   PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc)
395:   template <typename FunctionT, typename... KernelArgsT>
396:   PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
397:   {
398:     void *args[] = {(void *)&kernelArgs...};
399:     return cudaLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
400:   }
401: };
402:     #undef PETSC_CUPM_PREFIX_L
403:     #undef PETSC_CUPM_PREFIX_U
404:   #endif // PetscDefined(HAVE_CUDA)

406:   #if PetscDefined(HAVE_HIP)
407:     #define PETSC_CUPM_PREFIX_L hip
408:     #define PETSC_CUPM_PREFIX_U HIP
409: template <>
410: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
411:   PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::HIP);

413:   // typedefs
414:   using cupmError_t             = hipError_t;
415:   using cupmEvent_t             = hipEvent_t;
416:   using cupmStream_t            = hipStream_t;
417:   using cupmDeviceProp_t        = hipDeviceProp_t;
418:   using cupmMemcpyKind_t        = hipMemcpyKind;
419:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
420:   using cupmPointerAttributes_t = hipPointerAttribute_t;
421:   using cupmMemoryType_t        = enum hipMemoryType;
422:   using cupmDim3                = dim3;
423:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
424:   using cupmHostFn_t    = hipHostFn_t;
425:   using cupmMemPool_t   = hipMemPool_t;
426:   using cupmMemPoolAttr = hipMemPoolAttr;
427:     #else
428:   using cupmHostFn_t                                                 = void (*)(void *);
429:   using cupmMemPool_t                                                = void *;
430:   using cupmMemPoolAttr                                              = unsigned int;
431:     #endif

433:   // values
434:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
435:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
436:   // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
437:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorDeviceAlreadyInUse, ErrorContextAlreadyInUse);
438:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
439:   // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
440:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
441:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
442:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
443:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
444:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
445:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
446:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
447:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
448:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
449:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
450:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
451:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
452:   // see
453:   // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
454:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(MemoryTypeManaged, MemoryTypeUnified);
455:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
456:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocDefault, HostMallocDefault);
457:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocWriteCombined, HostMallocWriteCombined);
458:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
459:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
460:     #else
461:   static const cupmMemPoolAttr       cupmMemPoolAttrReleaseThreshold = 0;
462:     #endif

464:   // error functions
465:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
466:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
467:   PETSC_CUPM_ALIAS_FUNCTION(GetLastError)

469:   // device management
470:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
471:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
472:   PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
473:   PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
474:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
475:   PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
476:   PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
477:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
478:   PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
479:   PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
480:     #else
481:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
482:   {
483:     *pool = nullptr;
484:     return cupmSuccess;
485:   }

487:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
488:     #endif
489:   PETSC_CUPM_ALIAS_FUNCTION(Init)

491:   // stream management
492:   PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
493:   PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
494:   PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
495:   PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
496:   PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
497:   PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
498:   PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
499:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
500:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
501:   PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
502:   PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
503:   PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
504:   PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
505:   PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
506:   PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
507:   PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)

509:   // memory management
510:   PETSC_CUPM_ALIAS_FUNCTION(Free)
511:   PETSC_CUPM_ALIAS_FUNCTION(Malloc)
512:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
513:   PETSC_CUPM_ALIAS_FUNCTION(MallocAsync)
514:   PETSC_CUPM_ALIAS_FUNCTION(FreeAsync)
515:     #else
516:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
517:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
518:     #endif
519:   PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
520:   PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
521:   // hipMallocHost is deprecated
522:   PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocHost, HostMalloc)
523:   // hipFreeHost is deprecated
524:   PETSC_CUPM_ALIAS_FUNCTION_COMMON(FreeHost, HostFree)
525:   PETSC_CUPM_ALIAS_FUNCTION(Memset)
526:   PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)

528:       // launch control
529:       // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
530:       // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
531:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
532:   PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc)
533:     #else
534:   PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
535:   {
536:     // the only correct way to spoof this function is to do it synchronously...
537:     auto herr = hipStreamSynchronize(stream);
538:     if (PetscUnlikely(herr != hipSuccess)) return herr;
539:     fn(ctx);
540:     return herr;
541:   }
542:     #endif

544:   template <typename FunctionT, typename... KernelArgsT>
545:   PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
546:   {
547:     void *args[] = {(void *)&kernelArgs...};
548:     return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
549:   }
550: };
551:     #undef PETSC_CUPM_PREFIX_L
552:     #undef PETSC_CUPM_PREFIX_U
553:   #endif // PetscDefined(HAVE_HIP)

555:   // shorthand for bringing all of the typedefs from the base Interface class into your own,
556:   // it's annoying that c++ doesn't have a way to do this automatically
557:   #define PETSC_CUPM_IMPL_CLASS_HEADER(base_name, T) \
558:     PETSC_CUPM_BASE_CLASS_HEADER(PetscConcat(base_, base_name), T); \
559:     using base_name = ::Petsc::device::cupm::impl::InterfaceImpl<T>; \
560:     /* types */ \
561:     using typename base_name::cupmComplex_t; \
562:     using typename base_name::cupmError_t; \
563:     using typename base_name::cupmEvent_t; \
564:     using typename base_name::cupmStream_t; \
565:     using typename base_name::cupmDeviceProp_t; \
566:     using typename base_name::cupmMemcpyKind_t; \
567:     using typename base_name::cupmPointerAttributes_t; \
568:     using typename base_name::cupmMemoryType_t; \
569:     using typename base_name::cupmDim3; \
570:     using typename base_name::cupmMemPool_t; \
571:     using typename base_name::cupmMemPoolAttr; \
572:     /* variables */ \
573:     using base_name::cupmSuccess; \
574:     using base_name::cupmErrorNotReady; \
575:     using base_name::cupmErrorDeviceAlreadyInUse; \
576:     using base_name::cupmErrorSetOnActiveProcess; \
577:     using base_name::cupmErrorStubLibrary; \
578:     using base_name::cupmErrorNoDevice; \
579:     using base_name::cupmStreamDefault; \
580:     using base_name::cupmStreamNonBlocking; \
581:     using base_name::cupmDeviceMapHost; \
582:     using base_name::cupmMemcpyHostToDevice; \
583:     using base_name::cupmMemcpyDeviceToHost; \
584:     using base_name::cupmMemcpyDeviceToDevice; \
585:     using base_name::cupmMemcpyHostToHost; \
586:     using base_name::cupmMemcpyDefault; \
587:     using base_name::cupmMemoryTypeHost; \
588:     using base_name::cupmMemoryTypeDevice; \
589:     using base_name::cupmMemoryTypeManaged; \
590:     using base_name::cupmEventDisableTiming; \
591:     using base_name::cupmHostAllocDefault; \
592:     using base_name::cupmHostAllocWriteCombined; \
593:     using base_name::cupmMemPoolAttrReleaseThreshold; \
594:     /* functions */ \
595:     using base_name::cupmGetErrorName; \
596:     using base_name::cupmGetErrorString; \
597:     using base_name::cupmGetLastError; \
598:     using base_name::cupmGetDeviceCount; \
599:     using base_name::cupmGetDeviceProperties; \
600:     using base_name::cupmGetDevice; \
601:     using base_name::cupmSetDevice; \
602:     using base_name::cupmGetDeviceFlags; \
603:     using base_name::cupmSetDeviceFlags; \
604:     using base_name::cupmPointerGetAttributes; \
605:     using base_name::cupmDeviceGetMemPool; \
606:     using base_name::cupmMemPoolSetAttribute; \
607:     using base_name::cupmInit; \
608:     using base_name::cupmEventCreate; \
609:     using base_name::cupmEventCreateWithFlags; \
610:     using base_name::cupmEventDestroy; \
611:     using base_name::cupmEventRecord; \
612:     using base_name::cupmEventSynchronize; \
613:     using base_name::cupmEventElapsedTime; \
614:     using base_name::cupmEventQuery; \
615:     using base_name::cupmStreamCreate; \
616:     using base_name::cupmStreamCreateWithFlags; \
617:     using base_name::cupmStreamGetFlags; \
618:     using base_name::cupmStreamDestroy; \
619:     using base_name::cupmStreamWaitEvent; \
620:     using base_name::cupmStreamQuery; \
621:     using base_name::cupmStreamSynchronize; \
622:     using base_name::cupmDeviceSynchronize; \
623:     using base_name::cupmGetSymbolAddress; \
624:     using base_name::cupmMalloc; \
625:     using base_name::cupmMallocAsync; \
626:     using base_name::cupmMemcpy; \
627:     using base_name::cupmMemcpyAsync; \
628:     using base_name::cupmMallocHost; \
629:     using base_name::cupmMemset; \
630:     using base_name::cupmMemsetAsync; \
631:     using base_name::cupmLaunchHostFunc

633: template <DeviceType>
634: struct Interface;

636: // The actual interface class
637: template <DeviceType T>
638: struct Interface : InterfaceImpl<T> {
639:   PETSC_CUPM_IMPL_CLASS_HEADER(interface_type, T);

641:   using cupmReal_t   = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
642:   using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;

644:   // REVIEW ME: this needs to be cleaned up, it is unreadable
645:   PETSC_NODISCARD static constexpr cupmScalar_t makeCupmScalar(PetscScalar s) noexcept
646:   {
647:   #if PetscDefined(USE_COMPLEX)
648:     return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
649:   #else
650:     return static_cast<cupmReal_t>(s);
651:   #endif
652:   }

654:   PETSC_NODISCARD static constexpr const cupmScalar_t *cupmScalarCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }

656:   PETSC_NODISCARD static constexpr cupmScalar_t *cupmScalarCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }

658:   PETSC_NODISCARD static constexpr const cupmReal_t *cupmRealCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }

660:   PETSC_NODISCARD static constexpr cupmReal_t *cupmRealCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }

662:   #if !defined(PETSC_PKG_CUDA_VERSION_GE)
663:     #define PETSC_PKG_CUDA_VERSION_GE(...) 0
664:     #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
665:   #endif
666:   PETSC_NODISCARD static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
667:   {
668:     cupmPointerAttributes_t attr;
669:     cupmError_t             cerr;

672:     if (registered) {
674:       *registered = PETSC_FALSE;
675:     }
676:     if (managed) {
678:       *managed = PETSC_FALSE;
679:     }
680:     // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
681:     // a host pointer returns cudaErrorInvalidValue
682:     cerr = cupmPointerGetAttributes(&attr, data);
683:     cerr = cupmGetLastError();
684:       // HIP seems to always have used memoryType though
685:   #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
686:     const auto mtype = attr.memoryType;
687:     if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
688:   #else
689:     if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) cerr;
690:     const auto mtype = attr.type;
691:     if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
692:   #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
693:     if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
694:     if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
695:     return 0;
696:   }
697:   #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
698:     #undef PETSC_PKG_CUDA_VERSION_GE
699:   #endif

701:   PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
702:   {
703:     switch (mode) {
704:     case PETSC_DEVICE_COPY_HTOH:
705:       return cupmMemcpyHostToHost;
706:     case PETSC_DEVICE_COPY_HTOD:
707:       return cupmMemcpyHostToDevice;
708:     case PETSC_DEVICE_COPY_DTOD:
709:       return cupmMemcpyDeviceToDevice;
710:     case PETSC_DEVICE_COPY_DTOH:
711:       return cupmMemcpyDeviceToHost;
712:     case PETSC_DEVICE_COPY_AUTO:
713:       return cupmMemcpyDefault;
714:     }
715:     PetscUnreachable();
716:     return cupmMemcpyDefault;
717:   }

719:   // these change what the arguments mean, so need to namespace these
720:   template <typename M>
721:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
722:   {
723:     static_assert(!std::is_void<M>::value, "");

726:     if (PetscLikely(n)) {
727:       cupmMallocAsync(reinterpret_cast<void **>(ptr), n * sizeof(M), stream);
728:     } else {
729:       *ptr = nullptr;
730:     }
731:     return 0;
732:   }

734:   template <typename M>
735:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
736:   {
737:     PetscCUPMMallocAsync(ptr, n);
738:     return 0;
739:   }

741:   template <typename M>
742:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
743:   {
744:     static_assert(!std::is_void<M>::value, "");

747:     *ptr = nullptr;
748:     cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags);
749:     return 0;
750:   }

752:   template <typename D, typename S = D>
753:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
754:   {
755:     static_assert(sizeof(D) == sizeof(S), "");
756:     static_assert(!std::is_void<D>::value && !std::is_void<S>::value, "");
757:     const auto size = n * sizeof(D);

759:     if (PetscUnlikely(!n)) return 0;
763:     // do early return after nullptr check since we need to check that they arent both nullptrs
764:     if (PetscUnlikely(dest == src)) return 0;
765:     if (kind == cupmMemcpyHostToHost) {
766:       // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
767:       // than it is to just call the vendor functions. This assumes of course that the stream
768:       // accounts for both memory regions being "idle"
769:       if (cupmStreamQuery(stream) == cupmSuccess) {
770:         PetscMemcpy(dest, src, size);
771:         return 0;
772:       }
773:       // need to clear the potential cupmErrorNotReady generated by query above...
774:       auto cerr = cupmGetLastError();

776:       if (PetscUnlikely(cerr != cupmErrorNotReady)) cerr;
777:     }
778:     if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
779:       cupmMemcpyAsync(dest, src, size, kind, stream);
780:     } else {
781:       cupmMemcpy(dest, src, size, kind);
782:     }

784:     // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
785:     // (yet) or don't know the direction
786:     if (kind == cupmMemcpyDeviceToHost) {
787:       PetscLogGpuToCpu(size);
788:     } else if (kind == cupmMemcpyHostToDevice) {
789:       PetscLogCpuToGpu(size);
790:     }
791:     return 0;
792:   }

794:   template <typename D, typename S = D>
795:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemcpy(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
796:   {
797:     PetscCUPMMemcpyAsync(dest, src, n, kind);
798:     return 0;
799:   }

801:   template <typename M>
802:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
803:   {
804:     static_assert(!std::is_void<M>::value, "");

806:     if (PetscLikely(n)) {
807:       const auto bytes = n * sizeof(M);

810:       if (stream || use_async) {
811:         cupmMemsetAsync(ptr, value, bytes, stream);
812:       } else {
813:         cupmMemset(ptr, value, bytes);
814:       }
815:     }
816:     return 0;
817:   }

819:   template <typename M>
820:   PETSC_NODISCARD static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
821:   {
822:     PetscCUPMMemsetAsync(ptr, value, n);
823:     return 0;
824:   }

826:   // these we can transparently wrap, no need to namespace it to Petsc
827:   template <typename M>
828:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &&ptr, cupmStream_t stream = nullptr) noexcept
829:   {
830:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");

832:     if (ptr) {
833:       auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);

835:       ptr = nullptr;
836:       if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
837:     }
838:     return cupmSuccess;
839:   }

841:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) noexcept { return interface_type::cupmFreeAsync(ptr, stream); }

843:   template <typename M>
844:   PETSC_NODISCARD static cupmError_t cupmFree(M &&ptr) noexcept
845:   {
846:     return cupmFreeAsync(std::forward<M>(ptr));
847:   }

849:   PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) noexcept { return cupmFreeAsync(ptr); }

851:   template <typename M>
852:   PETSC_NODISCARD static cupmError_t cupmFreeHost(M &&ptr) noexcept
853:   {
854:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
855:     const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
856:     ptr             = nullptr;
857:     return cerr;
858:   }

860:   PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) noexcept { return interface_type::cupmFreeHost(ptr); }

862:   // specific wrapper for device launch function, as the real function is a C routine and
863:   // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
864:   // boils down to the fact that ultimately we pass a
865:   //
866:   // void *args[] = {(void*)&kernel_args...};
867:   //
868:   // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
869:   // happen to the kernel arguments so we must do it ourselves here. This function does this in
870:   // 3 stages:
871:   // 1. Enumerate the kernel arguments (cupmLaunchKernel)
872:   // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
873:   //    expected by func() using the enumeration above (deduceKernelCall)
874:   // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
875:   //    it. (interface_type::cupmLaunchKernel)
876:   template <typename F, typename... Args>
877:   PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
878:   {
879:     return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
880:   }

882:   template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
883:   PETSC_NODISCARD static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
884:   {
885:     static_assert(block_size > 0, "");
886:     static_assert(warp_size > 0, "");
887:     // want block_size to be a multiple of the warp_size
888:     static_assert(block_size % warp_size == 0, "");
889:     const auto nthread = std::min(n, block_size);
890:     const auto nblock  = (n + block_size - 1) / block_size;

892:     // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
893:     // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
894:     PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
895:     cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...);
896:     return 0;
897:   }

899: private:
900:   template <typename S, typename D, typename = void>
901:   struct is_static_castable : std::false_type { };

903:   template <typename S, typename D>
904:   struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };

906:   template <typename D, typename S>
907:   static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
908:   {
909:     return static_cast<D>(std::forward<S>(src));
910:   }

912:   template <typename D, typename S>
913:   static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
914:   {
915:     return const_cast<D>(std::forward<S>(src));
916:   }

918:   template <typename F, typename... Args, std::size_t... Idx>
919:   PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
920:   {
921:     // clang-format off
922:     return interface_type::template cupmLaunchKernel(
923:       std::forward<F>(func),
924:       std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
925:       // can't static_cast() here since the function argument type may be cv-qualified, in
926:       // which case we would need to const_cast(). But you can only const_cast()
927:       // indirect types (pointers, references) and I don't want to add a
928:       // static_cast_that_becomes_a_const_cast() SFINAE monster to this template mess. C-style
929:       // casts luckily work here since it tries the following and uses the first one that
930:       // succeeds:
931:       // 1. const_cast()
932:       // 2. static_cast()
933:       // 3. static_cast() then const_cast()
934:       // 4. reinterpret_cast()...
935:       // hopefully we never get to reinterpret_cast() land
936:       //(typename util::func_traits<F>::template arg<Idx>::type)(kernelArgs)...
937:       cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
938:     );
939:     // clang-format on
940:   }
941: };

943:   #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(base_name, T) \
944:     PETSC_CUPM_IMPL_CLASS_HEADER(PetscConcat(base_name, _impl), T); \
945:     using base_name = ::Petsc::device::cupm::impl::Interface<T>; \
946:     using typename base_name::cupmReal_t; \
947:     using typename base_name::cupmScalar_t; \
948:     using base_name::makeCupmScalar; \
949:     using base_name::cupmScalarCast; \
950:     using base_name::cupmRealCast; \
951:     using base_name::PetscCUPMGetMemType; \
952:     using base_name::PetscCUPMMemset; \
953:     using base_name::PetscCUPMMemsetAsync; \
954:     using base_name::PetscCUPMMalloc; \
955:     using base_name::PetscCUPMMallocAsync; \
956:     using base_name::PetscCUPMMallocHost; \
957:     using base_name::PetscCUPMMemcpy; \
958:     using base_name::PetscCUPMMemcpyAsync; \
959:     using base_name::cupmFree; \
960:     using base_name::cupmFreeAsync; \
961:     using base_name::cupmFreeHost; \
962:     using base_name::cupmLaunchKernel; \
963:     using base_name::PetscCUPMLaunchKernel1D; \
964:     using base_name::PetscDeviceCopyModeToCUPMMemcpyKind

966: } // namespace impl

968: } // namespace cupm

970: } // namespace device

972: } // namespace Petsc

974: #endif /* __cplusplus */

976: #endif /* PETSCCUPMINTERFACE_HPP */