/*
 * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
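/*
 * Internal helpers backing the nvcuda::experimental pipeline interface.  On
 * devices with hardware asynchronous copy support (__CUDA_ARCH__ >= 800) the
 * memcpy helpers below lower to cp.async PTX instructions; on earlier
 * architectures they fall back to plain synchronous loads and stores, and the
 * commit/wait/arrive helpers compile to no-ops.
 */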
#ifndef _CUDA_PIPELINE_HELPERS_H_
#define _CUDA_PIPELINE_HELPERS_H_

#define _CUDA_PIPELINE_NAMESPACE       nvcuda::experimental
#define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
#define _CUDA_PIPELINE_END_NAMESPACE   } }

#define _CUDA_PIPELINE_INTERNAL_NAMESPACE       _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
#define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
#define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE   } _CUDA_PIPELINE_END_NAMESPACE

#if !defined(_CUDA_PIPELINE_QUALIFIER)
# define _CUDA_PIPELINE_QUALIFIER inline __device__
#endif

#if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
# define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
#endif

#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
# define _CUDA_PIPELINE_ARCH_700_OR_LATER
#endif

#if (__CUDA_ARCH__ >= 800)
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
#else
# define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
#endif

#if !defined(_CUDA_PIPELINE_MAX_STAGES)
# define _CUDA_PIPELINE_MAX_STAGES 8
#endif

#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
# define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
#endif

#if !defined(_CUDA_PIPELINE_DEBUG)
# if defined(__CUDACC_DEBUG__)
#  define _CUDA_PIPELINE_DEBUG 1
# else
#  define _CUDA_PIPELINE_DEBUG 0
# endif
#endif

#if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
# if !defined(__CUDACC_RTC__)
#  include <assert.h>
# endif
# define _CUDA_PIPELINE_ASSERT(x) assert((x));
# define _CUDA_PIPELINE_ABORT() assert(0);
#else
# define _CUDA_PIPELINE_ASSERT(x)
# define _CUDA_PIPELINE_ABORT() __trap();
#endif

#if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
#else
# define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
#endif

#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
#else
# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
#endif

#if defined(__CUDACC_RTC__)
typedef unsigned int       uint32_t;
typedef unsigned long long uint64_t;
typedef uint64_t           uintptr_t;
#else
# include <stdint.h>
#endif

_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE

_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2,  "Size mismatch for type 'short'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int)   == 4,  "Size mismatch for type 'int'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2)  == 8,  "Size mismatch for type 'int2'");
_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4)  == 16, "Size mismatch for type 'int4'");

extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void*);

template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

    char* const d = reinterpret_cast<char*>(dst);
    const char* const s = reinterpret_cast<const char*>(src);

    /* Copy in the largest unit that evenly covers SourceSize; bytes beyond
       SourceSize are zero-filled up to CopySize. */
    size_t copy_step_size;
    if (SourceSize == 0) {
        copy_step_size = CopySize;
    } else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16) {
        copy_step_size = SourceSize;
    } else {
        copy_step_size = 1;
    }

    for (size_t i = 0; i < CopySize; i += copy_step_size) {
        const bool copy_source = SourceSize && (i < SourceSize);

        switch (copy_step_size) {
        case 1:
            d[i] = copy_source ? s[i] : char();
            break;
        case 2:
            *reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short();
            break;
        case 4:
            *reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int();
            break;
        case 8:
            *reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2();
            break;
        case 16:
            *reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4();
            break;
        }
    }
}
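/*
 * Usage sketch for pipeline_memcpy_sync (assumes 16-byte-aligned source and
 * destination; gmem_src is an illustrative global-memory pointer): copy 8
 * valid bytes into a 16-byte destination, zero-filling the remaining 8 bytes.
 *
 *   __shared__ int4 slot;
 *   pipeline_memcpy_sync<16, 8>(&slot, gmem_src);
 */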
template<bool UseAsyncCopy>
struct ImplementationChooser;

/* SM 8.0+ implementation: lower the copy to hardware asynchronous copy
   (cp.async) PTX instructions. */
template<>
struct ImplementationChooser<true> {
    template<size_t CopySize, size_t SourceSize>
    struct CpAsyncChooser {
        _CUDA_PIPELINE_STATIC_QUALIFIER
        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
        {
            asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
                :
                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src),
                  "n"(CopySize), "n"(SourceSize)
                : "memory");
        }
    };

    /* 16-byte copies use the cache-global (.cg) variant. */
    template<size_t SourceSize>
    struct CpAsyncChooser<16, SourceSize> {
        _CUDA_PIPELINE_STATIC_QUALIFIER
        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
        {
            asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
                :
                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src),
                  "n"(16), "n"(SourceSize)
                : "memory");
        }
    };

    template<size_t CopySize, size_t SourceSize>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
    {
        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
        _CUDA_PIPELINE_ASSERT(__isShared(dst));
        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

        CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
    }

    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_commit()
    {
        asm volatile ("cp.async.commit_group;");
    }

    template<unsigned N>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_wait_prior()
    {
        asm volatile ("cp.async.wait_group %0;"
            :
            : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
    }

    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_arrive_on(uint64_t* barrier)
    {
        _CUDA_PIPELINE_ASSERT(__isShared(barrier));

        asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
            :
            : "r"(__nvvm_get_smem_pointer(barrier)));
    }
};
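/*
 * Semantics sketch for the cp.async path above: each pipeline_commit() closes
 * the current group of asynchronous copies, and pipeline_wait_prior<N>()
 * blocks until at most N committed groups are still in flight, so
 * pipeline_wait_prior<0>() waits for every outstanding copy.  For example,
 * using the dispatching wrappers defined below (pointers are illustrative):
 *
 *   pipeline_memcpy_async<16, 16>(smem_a, gmem_a);   // group 0
 *   pipeline_commit();
 *   pipeline_memcpy_async<16, 16>(smem_b, gmem_b);   // group 1
 *   pipeline_commit();
 *   pipeline_wait_prior<1>();                        // group 0 has completed
 */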
/* Pre-SM 8.0 fallback: the "asynchronous" copy is performed synchronously and
   the commit/wait/arrive operations are no-ops. */
template<>
struct ImplementationChooser<false> {
    template<size_t CopySize, size_t SourceSize>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
    {
        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
        _CUDA_PIPELINE_ASSERT(__isShared(dst));
        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
    }

    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_commit()
    {
    }

    template<unsigned N>
    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_wait_prior()
    {
    }

    _CUDA_PIPELINE_STATIC_QUALIFIER
    void pipeline_arrive_on(uint64_t* barrier)
    {
    }
};

/* Dispatching wrappers: select the hardware or fallback implementation at
   compile time via _CUDA_PIPELINE_HAS_ASYNC_COPY. */
template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
    _CUDA_PIPELINE_ASSERT(__isShared(dst));
    _CUDA_PIPELINE_ASSERT(__isGlobal(src));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));

    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
}

_CUDA_PIPELINE_QUALIFIER
void pipeline_commit()
{
    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
}

template<unsigned N>
_CUDA_PIPELINE_QUALIFIER
void pipeline_wait_prior()
{
    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
}

_CUDA_PIPELINE_QUALIFIER
void pipeline_arrive_on(uint64_t* barrier)
{
    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
}

template<size_t CopySize, size_t SourceSize>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));

    /* Only global-to-shared copies can use the asynchronous path. */
    if (__isGlobal(src) && __isShared(dst)) {
        pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
    } else {
        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
    }
}
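/*
 * Usage sketch: pipeline_copy_strict works in either direction, but only
 * global-to-shared copies take the asynchronous path; everything else is
 * copied synchronously (gmem_in/gmem_out are illustrative global pointers).
 *
 *   __shared__ int4 stage;
 *   pipeline_copy_strict<16, 16>(&stage, gmem_in);    // global -> shared, async-capable
 *   // ... commit and wait before reusing the staged data ...
 *   pipeline_copy_strict<16, 16>(gmem_out, &stage);   // shared -> global, synchronous
 */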
template<size_t CopySize, size_t Align>
_CUDA_PIPELINE_QUALIFIER
void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
{
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));

    const char* s = reinterpret_cast<const char*>(src);
    char* d = reinterpret_cast<char*>(dst);
    size_t remaining = CopySize;

    /* Copy in the widest chunks the guaranteed alignment and the remaining
       byte count allow, advancing the working pointers after each chunk. */
    while (remaining) {
        if ((Align >= 16) && (remaining >= 16)) {
            pipeline_copy_strict<16, 16>(d, s);
            d += 16; s += 16; remaining -= 16;
        } else if ((Align >= 8) && (remaining >= 8)) {
            pipeline_copy_strict<8, 8>(d, s);
            d += 8; s += 8; remaining -= 8;
        } else if ((Align >= 4) && (remaining >= 4)) {
            pipeline_copy_strict<4, 4>(d, s);
            d += 4; s += 4; remaining -= 4;
        } else if ((Align >= 2) && (remaining >= 2)) {
            *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
            d += 2; s += 2; remaining -= 2;
        } else {
            *d = *s;
            d += 1; s += 1; remaining -= 1;
        }
    }
}

_CUDA_PIPELINE_END_INTERNAL_NAMESPACE

#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
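/*
 * End-to-end sketch of a double-buffered kernel driving these helpers.  All
 * names, tile sizes, and launch parameters below are illustrative; the helpers
 * live in nvcuda::experimental::__pipeline_internal.
 *
 *   using namespace nvcuda::experimental::__pipeline_internal;
 *
 *   // Launched with 128 threads per block; each tile is 128 int4 elements.
 *   __global__ void copy_tiles(const int4* in, int4* out, size_t n_tiles)
 *   {
 *       __shared__ int4 buf[2][128];
 *       if (n_tiles == 0) return;
 *
 *       // Prime the pipeline with tile 0.
 *       pipeline_memcpy_async<16, 16>(&buf[0][threadIdx.x], &in[threadIdx.x]);
 *       pipeline_commit();
 *
 *       for (size_t t = 0; t < n_tiles; ++t) {
 *           const size_t cur = t & 1;
 *           if (t + 1 < n_tiles) {
 *               // Prefetch the next tile while the current one is consumed.
 *               pipeline_memcpy_async<16, 16>(&buf[(t + 1) & 1][threadIdx.x],
 *                                             &in[(t + 1) * 128 + threadIdx.x]);
 *               pipeline_commit();
 *               pipeline_wait_prior<1>();   // current tile has landed, next may be in flight
 *           } else {
 *               pipeline_wait_prior<0>();   // drain everything for the last tile
 *           }
 *           // Each thread consumes only the element it fetched itself; a kernel
 *           // reading other threads' elements would need __syncthreads() here.
 *           out[t * 128 + threadIdx.x] = buf[cur][threadIdx.x];
 *       }
 *   }
 */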