Actual source code: cupminterface.hpp
1: #ifndef PETSCCUPMINTERFACE_HPP
2: #define PETSCCUPMINTERFACE_HPP
4: #if defined(__cplusplus)
5: #include <petsc/private/cpputil.hpp>
6: #include <petsc/private/petscadvancedmacros.h>
7: #include <petscdevice_cupm.h>
9: #include <array>
11: namespace Petsc
12: {
14: namespace device
15: {
17: namespace cupm
18: {
20: // enum describing available cupm devices, this is used as the template parameter to any
21: // class subclassing the Interface or using it as a member variable
22: enum class DeviceType : int {
23: CUDA,
24: HIP
25: };
27: // clang-format off
28: static constexpr std::array<const char *const, 5> DeviceTypes = {
29: "cuda",
30: "hip",
31: "Petsc::Device::CUPM::DeviceType",
32: "Petsc::Device::CUPM::DeviceType::",
33: nullptr
34: };
35: // clang-format on
37: namespace impl
38: {
40: // A backend agnostic PetscCallCUPM() function, this will only work inside the member
41: // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
42: // functions can also be wrapped inline:
43: //
44: // foo<int,char,bool>();
45: #define PetscCallCUPM(...) \
46: do { \
47: const cupmError_t cerr_p_ = __VA_ARGS__; \
49: } while (0)
51: #define PetscCallCUPMAbort(comm_, ...) \
52: do { \
53: const cupmError_t cerr_abort_p_ = __VA_ARGS__; \
55: } while (0)
57: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT() - declaration to alias a cuda/hip integral constant
58: // value
59: //
60: // input params:
61: // our_prefix - the prefix of the alias
62: // our_suffix - the suffix of the alias
63: // their_prefix - the prefix of the variable being aliased
64: // their_suffix - the suffix of the variable being aliased
65: //
66: // example usage:
67: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,cuda,AllGood); ->
68: // static const auto cupmSuccess = cudaAllGood;
69: //
70: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,hip,AllRight); ->
71: // static const auto cupmSuccess = hipAllRight;
72: #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) static const auto PetscConcat(our_prefix, our_suffix) = PetscConcat(their_prefix, their_suffix)
74: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON() - declaration to alias a cuda/hip integral constant
75: // value
76: //
77: // input params:
78: // our_suffix - the suffix of the alias
79: // their_suffix - the suffix of the variable being aliased
80: //
81: // notes:
82: // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix
83: //
84: // example usage:
85: // #define PETSC_CUPM_PREFIX_L cuda
86: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllGood); ->
87: // static const auto cupmSuccess = cudaAllGood;
88: //
89: // #define PETSC_CUPM_PREFIX_L hip
90: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllRight); ->
91: // static const auto cupmSuccess = hipAllRight;
92: #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(our_suffix, their_suffix) PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)
94: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE() - declaration to alias a cuda/hip integral constant value
95: //
96: // input param:
97: // suffix - the common suffix shared between cuda, hip, and cupm
98: //
99: // notes:
100: // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix
101: //
102: // example usage:
103: // #define PETSC_CUPM_PREFIX_L cuda
104: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = cudaSuccess;
105: //
106: // #define PETSC_CUPM_PREFIX_L hip
107: // PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = hipSuccess;
108: #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE(suffix) PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(suffix, suffix)
110: // PETSC_CUPM_ALIAS_FUNCTION_EXACT() - declaration to alias a cuda/hip function
111: //
112: // input params:
113: // our_prefix - the prefix of the alias
114: // our_suffix - the suffix of the alias
115: // their_prefix - the prefix of the function being aliased
116: // their_suffix - the suffix of the function being aliased
117: //
118: // notes:
119: // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
120: //
121: // example usage:
122: // PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm,Malloc,cuda,Malloc) ->
123: // template <typename... T>
124: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
125: // {
126: // return cudaMalloc(std::forward<T>(args)...);
127: // }
128: #define PETSC_CUPM_ALIAS_FUNCTION_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) PETSC_ALIAS_FUNCTION(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix))
130: // PETSC_CUPM_ALIAS_FUNCTION_COMMON() - declaration to alias a cuda/hip function
131: //
132: // input params:
133: // our_suffix - the suffix of the alias
134: // their_suffix - the common suffix of the cuda/hip function being aliased
135: //
136: // notes:
137: // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
138: // aliased. see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
139: //
140: // example usage:
141: // #define PETSC_CUPM_PREFIX_L cuda
142: // PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
143: // template <typename... T>
144: // static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
145: // {
146: // return cudaMalloc(std::forward<T>(args)...);
147: // }
148: //
149: // #define PETSC_CUPM_PREFIX_L hip
150: // PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
151: // template <typename... T>
152: // static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
153: // {
154: // return hipMalloc(std::forward<T>(args)...);
155: // }
156: #define PETSC_CUPM_ALIAS_FUNCTION_COMMON(our_suffix, their_suffix) PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)
158: // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
159: //
160: // input param:
161: // suffix - the common suffix for hip, cuda and the alias
162: //
163: // notes:
164: // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
165: // aliased. see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
166: //
167: // example usage:
168: // #define PETSC_CUPM_PREFIX_L cuda
169: // PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
170: // template <typename... T>
171: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
172: // {
173: // return cudaMalloc(std::forward<T>(args)...);
174: // }
175: //
176: // #define PETSC_CUPM_PREFIX_L hip
177: // PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
178: // template <typename... T>
179: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
180: // {
181: // return hipMalloc(std::forward<T>(args)...);
182: // }
183: #define PETSC_CUPM_ALIAS_FUNCTION(suffix) PETSC_CUPM_ALIAS_FUNCTION_COMMON(suffix, suffix)
185: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT() - declaration to alias a cuda/hip function but
186: // discard the last N arguments
187: //
188: // input params:
189: // our_prefix - the prefix of the alias
190: // our_suffix - the suffix of the alias
191: // their_prefix - the prefix of the function being aliased
192: // their_suffix - the suffix of the function being aliased
193: // N - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
194: //
195: // notes:
196: // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
197: //
198: // example use:
199: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm,MallocAsync,cuda,Malloc,1) ->
200: // template <typename... T, typename Tend>
201: // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
202: // return type deduction*
203: // {
204: // (void)argend;
205: // return cudaMalloc(std::forward<T>(args)...);
206: // }
207: #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix), N)
209: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON() - declaration to alias a cuda/hip function but
210: // discard the last N arguments
211: //
212: // input params:
213: // our_suffix - the suffix of the alias
214: // their_suffix - the suffix of the function being aliased
215: // N - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
216: //
217: // notes:
218: // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
219: // aliased. see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the
220: // expansion
221: //
222: // example use:
223: // #define PETSC_CUPM_PREFIX_L cuda
224: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync,Malloc,1) ->
225: // template <typename... T, typename Tend>
226: // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
227: // return type deduction*
228: // {
229: // (void)argend;
230: // return cudaMalloc(std::forward<T>(args)...);
231: // }
232: #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(our_suffix, their_suffix, N) PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix, N)
234: // Base class that holds functions and variables that don't require CUDA or HIP to be present
235: // on the system
236: template <DeviceType T>
237: struct InterfaceBase {
238: static const DeviceType type = T;
240: PETSC_NODISCARD static constexpr const char *cupmName() noexcept
241: {
242: static_assert(util::integral_value(DeviceType::CUDA) == 0, "");
243: static_assert(util::integral_value(DeviceType::HIP) == 1, "");
244: return std::get<util::integral_value(T)>(DeviceTypes);
245: }
247: PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }
249: PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
250: };
252: // declare the base class static member variables
253: template <DeviceType T>
254: const DeviceType InterfaceBase<T>::type;
256: #define PETSC_CUPM_BASE_CLASS_HEADER(base_name, DEVICE_TYPE) \
257: using base_name = ::Petsc::device::cupm::impl::InterfaceBase<DEVICE_TYPE>; \
258: using base_name::type; \
259: using base_name::cupmName; \
260: using base_name::PETSC_DEVICE_CUPM; \
261: using base_name::PETSC_MEMTYPE_CUPM
263: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
264: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
265: // import them into classes.
266: template <DeviceType>
267: struct InterfaceImpl;
269: #if PetscDefined(HAVE_CUDA)
270: #define PETSC_CUPM_PREFIX_L cuda
271: #define PETSC_CUPM_PREFIX_U CUDA
272: template <>
273: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
274: PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::CUDA);
276: // typedefs
277: using cupmError_t = cudaError_t;
278: using cupmEvent_t = cudaEvent_t;
279: using cupmStream_t = cudaStream_t;
280: using cupmDeviceProp_t = cudaDeviceProp;
281: using cupmMemcpyKind_t = cudaMemcpyKind;
282: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
283: using cupmPointerAttributes_t = struct cudaPointerAttributes;
284: using cupmMemoryType_t = enum cudaMemoryType;
285: using cupmDim3 = dim3;
286: using cupmHostFn_t = cudaHostFn_t;
287: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
288: using cupmMemPool_t = cudaMemPool_t;
289: using cupmMemPoolAttr = cudaMemPoolAttr;
290: #else
291: using cupmMemPool_t = void *;
292: using cupmMemPoolAttr = unsigned int;
293: #endif
295: // values
296: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
297: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
298: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorDeviceAlreadyInUse);
299: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
300: #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
301: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorStubLibrary);
302: #else
303: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
304: #endif
305: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
306: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
307: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
308: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
309: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
310: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
311: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
312: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
313: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
314: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
315: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
316: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeManaged);
317: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
318: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocDefault);
319: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocWriteCombined);
320: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
321: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
322: #else
323: static const cupmMemPoolAttr cupmMemPoolAttrReleaseThreshold = 0;
324: #endif
326: // error functions
327: PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
328: PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
329: PETSC_CUPM_ALIAS_FUNCTION(GetLastError)
331: // device management
332: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
333: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
334: PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
335: PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
336: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
337: PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
338: PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
339: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
340: PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
341: PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
342: #else
343: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
344: {
345: *pool = nullptr;
346: return cupmSuccess;
347: }
349: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
350: #endif
351: // CUDA has no cudaInit() to match hipInit()
352: PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }
354: // stream management
355: PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
356: PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
357: PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
358: PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
359: PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
360: PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
361: PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
362: PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
363: PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
364: PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
365: PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
366: PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
367: PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
368: PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
369: PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
370: PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)
372: // memory management
373: PETSC_CUPM_ALIAS_FUNCTION(Free)
374: PETSC_CUPM_ALIAS_FUNCTION(Malloc)
375: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
376: PETSC_CUPM_ALIAS_FUNCTION(FreeAsync)
377: PETSC_CUPM_ALIAS_FUNCTION(MallocAsync)
378: #else
379: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
380: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
381: #endif
382: PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
383: PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
384: PETSC_CUPM_ALIAS_FUNCTION(MallocHost)
385: PETSC_CUPM_ALIAS_FUNCTION(FreeHost)
386: PETSC_CUPM_ALIAS_FUNCTION(Memset)
387: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
388: PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)
389: #else
390: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MemsetAsync, Memset, 1)
391: #endif
393: // launch control
394: PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc)
395: template <typename FunctionT, typename... KernelArgsT>
396: PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
397: {
398: void *args[] = {(void *)&kernelArgs...};
399: return cudaLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
400: }
401: };
402: #undef PETSC_CUPM_PREFIX_L
403: #undef PETSC_CUPM_PREFIX_U
404: #endif // PetscDefined(HAVE_CUDA)
406: #if PetscDefined(HAVE_HIP)
407: #define PETSC_CUPM_PREFIX_L hip
408: #define PETSC_CUPM_PREFIX_U HIP
409: template <>
410: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
411: PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::HIP);
413: // typedefs
414: using cupmError_t = hipError_t;
415: using cupmEvent_t = hipEvent_t;
416: using cupmStream_t = hipStream_t;
417: using cupmDeviceProp_t = hipDeviceProp_t;
418: using cupmMemcpyKind_t = hipMemcpyKind;
419: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
420: using cupmPointerAttributes_t = hipPointerAttribute_t;
421: using cupmMemoryType_t = enum hipMemoryType;
422: using cupmDim3 = dim3;
423: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
424: using cupmHostFn_t = hipHostFn_t;
425: using cupmMemPool_t = hipMemPool_t;
426: using cupmMemPoolAttr = hipMemPoolAttr;
427: #else
428: using cupmHostFn_t = void (*)(void *);
429: using cupmMemPool_t = void *;
430: using cupmMemPoolAttr = unsigned int;
431: #endif
433: // values
434: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
435: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
436: // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
437: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorDeviceAlreadyInUse, ErrorContextAlreadyInUse);
438: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
439: // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
440: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
441: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
442: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
443: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
444: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
445: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
446: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
447: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
448: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
449: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
450: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
451: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
452: // see
453: // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
454: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(MemoryTypeManaged, MemoryTypeUnified);
455: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
456: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocDefault, HostMallocDefault);
457: PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocWriteCombined, HostMallocWriteCombined);
458: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
459: PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
460: #else
461: static const cupmMemPoolAttr cupmMemPoolAttrReleaseThreshold = 0;
462: #endif
464: // error functions
465: PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
466: PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
467: PETSC_CUPM_ALIAS_FUNCTION(GetLastError)
469: // device management
470: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
471: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
472: PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
473: PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
474: PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
475: PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
476: PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
477: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
478: PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
479: PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
480: #else
481: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
482: {
483: *pool = nullptr;
484: return cupmSuccess;
485: }
487: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
488: #endif
489: PETSC_CUPM_ALIAS_FUNCTION(Init)
491: // stream management
492: PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
493: PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
494: PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
495: PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
496: PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
497: PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
498: PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
499: PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
500: PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
501: PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
502: PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
503: PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
504: PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
505: PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
506: PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
507: PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)
509: // memory management
510: PETSC_CUPM_ALIAS_FUNCTION(Free)
511: PETSC_CUPM_ALIAS_FUNCTION(Malloc)
512: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
513: PETSC_CUPM_ALIAS_FUNCTION(MallocAsync)
514: PETSC_CUPM_ALIAS_FUNCTION(FreeAsync)
515: #else
516: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
517: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
518: #endif
519: PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
520: PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
521: // hipMallocHost is deprecated
522: PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocHost, HostMalloc)
523: // hipFreeHost is deprecated
524: PETSC_CUPM_ALIAS_FUNCTION_COMMON(FreeHost, HostFree)
525: PETSC_CUPM_ALIAS_FUNCTION(Memset)
526: PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)
528: // launch control
529: // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
530: // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
531: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
532: PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc)
533: #else
534: PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
535: {
536: // the only correct way to spoof this function is to do it synchronously...
537: auto herr = hipStreamSynchronize(stream);
538: if (PetscUnlikely(herr != hipSuccess)) return herr;
539: fn(ctx);
540: return herr;
541: }
542: #endif
544: template <typename FunctionT, typename... KernelArgsT>
545: PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
546: {
547: void *args[] = {(void *)&kernelArgs...};
548: return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
549: }
550: };
551: #undef PETSC_CUPM_PREFIX_L
552: #undef PETSC_CUPM_PREFIX_U
553: #endif // PetscDefined(HAVE_HIP)
555: // shorthand for bringing all of the typedefs from the base Interface class into your own,
556: // it's annoying that c++ doesn't have a way to do this automatically
557: #define PETSC_CUPM_IMPL_CLASS_HEADER(base_name, T) \
558: PETSC_CUPM_BASE_CLASS_HEADER(PetscConcat(base_, base_name), T); \
559: using base_name = ::Petsc::device::cupm::impl::InterfaceImpl<T>; \
560: /* types */ \
561: using typename base_name::cupmComplex_t; \
562: using typename base_name::cupmError_t; \
563: using typename base_name::cupmEvent_t; \
564: using typename base_name::cupmStream_t; \
565: using typename base_name::cupmDeviceProp_t; \
566: using typename base_name::cupmMemcpyKind_t; \
567: using typename base_name::cupmPointerAttributes_t; \
568: using typename base_name::cupmMemoryType_t; \
569: using typename base_name::cupmDim3; \
570: using typename base_name::cupmMemPool_t; \
571: using typename base_name::cupmMemPoolAttr; \
572: /* variables */ \
573: using base_name::cupmSuccess; \
574: using base_name::cupmErrorNotReady; \
575: using base_name::cupmErrorDeviceAlreadyInUse; \
576: using base_name::cupmErrorSetOnActiveProcess; \
577: using base_name::cupmErrorStubLibrary; \
578: using base_name::cupmErrorNoDevice; \
579: using base_name::cupmStreamDefault; \
580: using base_name::cupmStreamNonBlocking; \
581: using base_name::cupmDeviceMapHost; \
582: using base_name::cupmMemcpyHostToDevice; \
583: using base_name::cupmMemcpyDeviceToHost; \
584: using base_name::cupmMemcpyDeviceToDevice; \
585: using base_name::cupmMemcpyHostToHost; \
586: using base_name::cupmMemcpyDefault; \
587: using base_name::cupmMemoryTypeHost; \
588: using base_name::cupmMemoryTypeDevice; \
589: using base_name::cupmMemoryTypeManaged; \
590: using base_name::cupmEventDisableTiming; \
591: using base_name::cupmHostAllocDefault; \
592: using base_name::cupmHostAllocWriteCombined; \
593: using base_name::cupmMemPoolAttrReleaseThreshold; \
594: /* functions */ \
595: using base_name::cupmGetErrorName; \
596: using base_name::cupmGetErrorString; \
597: using base_name::cupmGetLastError; \
598: using base_name::cupmGetDeviceCount; \
599: using base_name::cupmGetDeviceProperties; \
600: using base_name::cupmGetDevice; \
601: using base_name::cupmSetDevice; \
602: using base_name::cupmGetDeviceFlags; \
603: using base_name::cupmSetDeviceFlags; \
604: using base_name::cupmPointerGetAttributes; \
605: using base_name::cupmDeviceGetMemPool; \
606: using base_name::cupmMemPoolSetAttribute; \
607: using base_name::cupmInit; \
608: using base_name::cupmEventCreate; \
609: using base_name::cupmEventCreateWithFlags; \
610: using base_name::cupmEventDestroy; \
611: using base_name::cupmEventRecord; \
612: using base_name::cupmEventSynchronize; \
613: using base_name::cupmEventElapsedTime; \
614: using base_name::cupmEventQuery; \
615: using base_name::cupmStreamCreate; \
616: using base_name::cupmStreamCreateWithFlags; \
617: using base_name::cupmStreamGetFlags; \
618: using base_name::cupmStreamDestroy; \
619: using base_name::cupmStreamWaitEvent; \
620: using base_name::cupmStreamQuery; \
621: using base_name::cupmStreamSynchronize; \
622: using base_name::cupmDeviceSynchronize; \
623: using base_name::cupmGetSymbolAddress; \
624: using base_name::cupmMalloc; \
625: using base_name::cupmMallocAsync; \
626: using base_name::cupmMemcpy; \
627: using base_name::cupmMemcpyAsync; \
628: using base_name::cupmMallocHost; \
629: using base_name::cupmMemset; \
630: using base_name::cupmMemsetAsync; \
631: using base_name::cupmLaunchHostFunc
633: template <DeviceType>
634: struct Interface;
636: // The actual interface class
637: template <DeviceType T>
638: struct Interface : InterfaceImpl<T> {
639: PETSC_CUPM_IMPL_CLASS_HEADER(interface_type, T);
641: using cupmReal_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
642: using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;
644: // REVIEW ME: this needs to be cleaned up, it is unreadable
645: PETSC_NODISCARD static constexpr cupmScalar_t makeCupmScalar(PetscScalar s) noexcept
646: {
647: #if PetscDefined(USE_COMPLEX)
648: return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
649: #else
650: return static_cast<cupmReal_t>(s);
651: #endif
652: }
654: PETSC_NODISCARD static constexpr const cupmScalar_t *cupmScalarCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }
656: PETSC_NODISCARD static constexpr cupmScalar_t *cupmScalarCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }
658: PETSC_NODISCARD static constexpr const cupmReal_t *cupmRealCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }
660: PETSC_NODISCARD static constexpr cupmReal_t *cupmRealCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }
662: #if !defined(PETSC_PKG_CUDA_VERSION_GE)
663: #define PETSC_PKG_CUDA_VERSION_GE(...) 0
664: #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
665: #endif
666: PETSC_NODISCARD static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
667: {
668: cupmPointerAttributes_t attr;
669: cupmError_t cerr;
672: if (registered) {
674: *registered = PETSC_FALSE;
675: }
676: if (managed) {
678: *managed = PETSC_FALSE;
679: }
680: // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
681: // a host pointer returns cudaErrorInvalidValue
682: cerr = cupmPointerGetAttributes(&attr, data);
683: cerr = cupmGetLastError();
684: // HIP seems to always have used memoryType though
685: #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
686: const auto mtype = attr.memoryType;
687: if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
688: #else
689: if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) cerr;
690: const auto mtype = attr.type;
691: if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
692: #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
693: if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
694: if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
695: return 0;
696: }
697: #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
698: #undef PETSC_PKG_CUDA_VERSION_GE
699: #endif
701: PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
702: {
703: switch (mode) {
704: case PETSC_DEVICE_COPY_HTOH:
705: return cupmMemcpyHostToHost;
706: case PETSC_DEVICE_COPY_HTOD:
707: return cupmMemcpyHostToDevice;
708: case PETSC_DEVICE_COPY_DTOD:
709: return cupmMemcpyDeviceToDevice;
710: case PETSC_DEVICE_COPY_DTOH:
711: return cupmMemcpyDeviceToHost;
712: case PETSC_DEVICE_COPY_AUTO:
713: return cupmMemcpyDefault;
714: }
715: PetscUnreachable();
716: return cupmMemcpyDefault;
717: }
719: // these change what the arguments mean, so need to namespace these
720: template <typename M>
721: PETSC_NODISCARD static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
722: {
723: static_assert(!std::is_void<M>::value, "");
726: if (PetscLikely(n)) {
727: cupmMallocAsync(reinterpret_cast<void **>(ptr), n * sizeof(M), stream);
728: } else {
729: *ptr = nullptr;
730: }
731: return 0;
732: }
734: template <typename M>
735: PETSC_NODISCARD static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
736: {
737: PetscCUPMMallocAsync(ptr, n);
738: return 0;
739: }
741: template <typename M>
742: PETSC_NODISCARD static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
743: {
744: static_assert(!std::is_void<M>::value, "");
747: *ptr = nullptr;
748: cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags);
749: return 0;
750: }
752: template <typename D, typename S = D>
753: PETSC_NODISCARD static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
754: {
755: static_assert(sizeof(D) == sizeof(S), "");
756: static_assert(!std::is_void<D>::value && !std::is_void<S>::value, "");
757: const auto size = n * sizeof(D);
759: if (PetscUnlikely(!n)) return 0;
763: // do early return after nullptr check since we need to check that they arent both nullptrs
764: if (PetscUnlikely(dest == src)) return 0;
765: if (kind == cupmMemcpyHostToHost) {
766: // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
767: // than it is to just call the vendor functions. This assumes of course that the stream
768: // accounts for both memory regions being "idle"
769: if (cupmStreamQuery(stream) == cupmSuccess) {
770: PetscMemcpy(dest, src, size);
771: return 0;
772: }
773: // need to clear the potential cupmErrorNotReady generated by query above...
774: auto cerr = cupmGetLastError();
776: if (PetscUnlikely(cerr != cupmErrorNotReady)) cerr;
777: }
778: if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
779: cupmMemcpyAsync(dest, src, size, kind, stream);
780: } else {
781: cupmMemcpy(dest, src, size, kind);
782: }
784: // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
785: // (yet) or don't know the direction
786: if (kind == cupmMemcpyDeviceToHost) {
787: PetscLogGpuToCpu(size);
788: } else if (kind == cupmMemcpyHostToDevice) {
789: PetscLogCpuToGpu(size);
790: }
791: return 0;
792: }
794: template <typename D, typename S = D>
795: PETSC_NODISCARD static PetscErrorCode PetscCUPMMemcpy(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
796: {
797: PetscCUPMMemcpyAsync(dest, src, n, kind);
798: return 0;
799: }
801: template <typename M>
802: PETSC_NODISCARD static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
803: {
804: static_assert(!std::is_void<M>::value, "");
806: if (PetscLikely(n)) {
807: const auto bytes = n * sizeof(M);
810: if (stream || use_async) {
811: cupmMemsetAsync(ptr, value, bytes, stream);
812: } else {
813: cupmMemset(ptr, value, bytes);
814: }
815: }
816: return 0;
817: }
819: template <typename M>
820: PETSC_NODISCARD static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
821: {
822: PetscCUPMMemsetAsync(ptr, value, n);
823: return 0;
824: }
826: // these we can transparently wrap, no need to namespace it to Petsc
827: template <typename M>
828: PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &&ptr, cupmStream_t stream = nullptr) noexcept
829: {
830: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
832: if (ptr) {
833: auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);
835: ptr = nullptr;
836: if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
837: }
838: return cupmSuccess;
839: }
841: PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) noexcept { return interface_type::cupmFreeAsync(ptr, stream); }
843: template <typename M>
844: PETSC_NODISCARD static cupmError_t cupmFree(M &&ptr) noexcept
845: {
846: return cupmFreeAsync(std::forward<M>(ptr));
847: }
849: PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) noexcept { return cupmFreeAsync(ptr); }
851: template <typename M>
852: PETSC_NODISCARD static cupmError_t cupmFreeHost(M &&ptr) noexcept
853: {
854: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
855: const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
856: ptr = nullptr;
857: return cerr;
858: }
860: PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) noexcept { return interface_type::cupmFreeHost(ptr); }
862: // specific wrapper for device launch function, as the real function is a C routine and
863: // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
864: // boils down to the fact that ultimately we pass a
865: //
866: // void *args[] = {(void*)&kernel_args...};
867: //
868: // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
869: // happen to the kernel arguments so we must do it ourselves here. This function does this in
870: // 3 stages:
871: // 1. Enumerate the kernel arguments (cupmLaunchKernel)
872: // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
873: // expected by func() using the enumeration above (deduceKernelCall)
874: // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
875: // it. (interface_type::cupmLaunchKernel)
876: template <typename F, typename... Args>
877: PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
878: {
879: return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
880: }
882: template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
883: PETSC_NODISCARD static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
884: {
885: static_assert(block_size > 0, "");
886: static_assert(warp_size > 0, "");
887: // want block_size to be a multiple of the warp_size
888: static_assert(block_size % warp_size == 0, "");
889: const auto nthread = std::min(n, block_size);
890: const auto nblock = (n + block_size - 1) / block_size;
892: // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
893: // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
894: PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
895: cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...);
896: return 0;
897: }
899: private:
900: template <typename S, typename D, typename = void>
901: struct is_static_castable : std::false_type { };
903: template <typename S, typename D>
904: struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };
906: template <typename D, typename S>
907: static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
908: {
909: return static_cast<D>(std::forward<S>(src));
910: }
912: template <typename D, typename S>
913: static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
914: {
915: return const_cast<D>(std::forward<S>(src));
916: }
918: template <typename F, typename... Args, std::size_t... Idx>
919: PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
920: {
921: // clang-format off
922: return interface_type::template cupmLaunchKernel(
923: std::forward<F>(func),
924: std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
925: // can't static_cast() here since the function argument type may be cv-qualified, in
926: // which case we would need to const_cast(). But you can only const_cast()
927: // indirect types (pointers, references) and I don't want to add a
928: // static_cast_that_becomes_a_const_cast() SFINAE monster to this template mess. C-style
929: // casts luckily work here since it tries the following and uses the first one that
930: // succeeds:
931: // 1. const_cast()
932: // 2. static_cast()
933: // 3. static_cast() then const_cast()
934: // 4. reinterpret_cast()...
935: // hopefully we never get to reinterpret_cast() land
936: //(typename util::func_traits<F>::template arg<Idx>::type)(kernelArgs)...
937: cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
938: );
939: // clang-format on
940: }
941: };
943: #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(base_name, T) \
944: PETSC_CUPM_IMPL_CLASS_HEADER(PetscConcat(base_name, _impl), T); \
945: using base_name = ::Petsc::device::cupm::impl::Interface<T>; \
946: using typename base_name::cupmReal_t; \
947: using typename base_name::cupmScalar_t; \
948: using base_name::makeCupmScalar; \
949: using base_name::cupmScalarCast; \
950: using base_name::cupmRealCast; \
951: using base_name::PetscCUPMGetMemType; \
952: using base_name::PetscCUPMMemset; \
953: using base_name::PetscCUPMMemsetAsync; \
954: using base_name::PetscCUPMMalloc; \
955: using base_name::PetscCUPMMallocAsync; \
956: using base_name::PetscCUPMMallocHost; \
957: using base_name::PetscCUPMMemcpy; \
958: using base_name::PetscCUPMMemcpyAsync; \
959: using base_name::cupmFree; \
960: using base_name::cupmFreeAsync; \
961: using base_name::cupmFreeHost; \
962: using base_name::cupmLaunchKernel; \
963: using base_name::PetscCUPMLaunchKernel1D; \
964: using base_name::PetscDeviceCopyModeToCUPMMemcpyKind
966: } // namespace impl
968: } // namespace cupm
970: } // namespace device
972: } // namespace Petsc
974: #endif /* __cplusplus */
976: #endif /* PETSCCUPMINTERFACE_HPP */