init - initialize project
modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp (637 lines, new file)
@@ -0,0 +1,637 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP

#include "cudnn.hpp"
#include "activation.hpp"

#include "../pointer.hpp"
#include "../workspace.hpp"

#include <cudnn.h>

#include <cstddef>
#include <array>
#include <algorithm>
#include <vector>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** describes convolution filters
     *
     * @tparam T type of elements in the kernels
     */
    template <class T>
    class FilterDescriptor {
    public:
        FilterDescriptor() noexcept : descriptor{ nullptr } { }
        FilterDescriptor(const FilterDescriptor&) = delete;
        FilterDescriptor(FilterDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a filter descriptor from the filter dimensions provided in \p shape
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        FilterDescriptor(const SequenceContainer& shape) {
            constructor(shape.begin(), shape.end());
        }

        /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
        FilterDescriptor(ForwardItr begin, ForwardItr end) {
            constructor(begin, end);
        }

        /** constructs a filter descriptor from the filter dimensions provided as arguments
         *
         * Shape dimensions:
         * 0: number of filters
         * 1: number of input feature maps
         * 2..n: kernel dimensions
         *
         * Exception Guarantee: Strong
         */
        template <class ...Sizes>
        FilterDescriptor(Sizes ...sizes) {
            static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
            static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
            std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
            constructor(std::begin(dims), std::end(dims));
        }

        ~FilterDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
            }
        }

        FilterDescriptor& operator=(const FilterDescriptor&) = delete;
        FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnFilterDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class ForwardItr>
        void constructor(ForwardItr start, ForwardItr end) {
            CV_Assert(start != end);
            CV_Assert(std::distance(start, end) >= 3);
            CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);

            CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
            try {
                const auto rank = std::distance(start, end);
                if (rank == 4) {
                    std::array<int, 4> dims;
                    std::copy(start, end, std::begin(dims));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetFilter4dDescriptor(
                            descriptor,
                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
                            dims[0], dims[1], dims[2], dims[3]
                        )
                    );
                } else {
                    std::vector<int> dims(start, end);
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetFilterNdDescriptor(
                            descriptor,
                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
                            dims.size(), dims.data()
                        )
                    );
                }
            } catch (...) {
                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
                throw;
            }
        }

        cudnnFilterDescriptor_t descriptor;
    };
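
    /* A minimal usage sketch: constructing a descriptor for 32 filters with 5x5
     * kernels over 3 input feature maps; the shape values are illustrative.
     *
     * @code
     * using namespace cv::dnn::cuda4dnn::csl::cudnn;
     * FilterDescriptor<float> filters(32, 3, 5, 5); // filters, input maps, kh, kw
     *
     * std::vector<int> shape = { 32, 3, 5, 5 };     // equivalent container form
     * FilterDescriptor<float> filters2(shape);
     * @endcode
     */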

    /** describes a convolution operation
     *
     * @tparam T type of element participating in convolution
     */
    template <class T>
    class ConvolutionDescriptor {
    public:
        ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
        ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
        ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
            : descriptor{ other.descriptor } {
            other.descriptor = nullptr;
        }

        /** constructs a convolution descriptor
         *
         * Pre-conditions:
         * - \p zero_padding, \p stride and \p dilation must have the same size
         *
         * The length of the containers is interpreted as the order of the convolution.
         *
         * Exception Guarantee: Strong
         */
        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
        ConvolutionDescriptor(
            const SequenceContainer& zero_padding,
            const SequenceContainer& stride,
            const SequenceContainer& dilation,
            std::size_t group_count)
        {
            constructor(zero_padding, stride, dilation, group_count);
        }

        ~ConvolutionDescriptor() noexcept {
            if (descriptor != nullptr) {
                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
            }
        }

        ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
        ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
            descriptor = other.descriptor;
            other.descriptor = nullptr;
            return *this;
        };

        cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }

    private:
        template <class SequenceContainer>
        void constructor(
            const SequenceContainer& zero_padding,
            const SequenceContainer& stride,
            const SequenceContainer& dilation,
            std::size_t group_count)
        {
            CV_Assert(zero_padding.size() == stride.size());
            CV_Assert(zero_padding.size() == dilation.size());

            CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
            try {
                const auto rank = zero_padding.size();
                if (rank == 2) {
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetConvolution2dDescriptor(
                            descriptor,
                            zero_padding[0], zero_padding[1],
                            stride[0], stride[1],
                            dilation[0], dilation[1],
                            CUDNN_CROSS_CORRELATION,
                            detail::get_data_type<T>()
                        )
                    );
                } else {
                    std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
                    std::vector<int> istride(std::begin(stride), std::end(stride));
                    std::vector<int> idilation(std::begin(dilation), std::end(dilation));
                    CUDA4DNN_CHECK_CUDNN(
                        cudnnSetConvolutionNdDescriptor(
                            descriptor,
                            rank, ipadding.data(), istride.data(), idilation.data(),
                            CUDNN_CROSS_CORRELATION,
                            detail::get_data_type<T>()
                        )
                    );
                }
                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count));

#if CUDNN_MAJOR >= 8
                /* cuDNN 7 and below use FMA math by default. cuDNN 8 includes TF32 Tensor Ops
                 * in the default setting. TF32 convolutions have lower precision than FP32.
                 * Hence, we set the math type to CUDNN_FMA_MATH to reproduce old behavior.
                 */
                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_FMA_MATH));
#endif

                if (std::is_same<T, half>::value)
                    CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
            } catch (...) {
                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
                throw;
            }
        }

        cudnnConvolutionDescriptor_t descriptor;
    };
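
    /* A minimal sketch describing a 2D convolution with one pixel of zero padding,
     * unit stride, no dilation and a single group; all values are illustrative.
     *
     * @code
     * std::vector<std::size_t> padding  = { 1, 1 };
     * std::vector<std::size_t> stride   = { 1, 1 };
     * std::vector<std::size_t> dilation = { 1, 1 };
     * ConvolutionDescriptor<float> conv(padding, stride, dilation, 1);
     * @endcode
     */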

    /** wrapper around a convolution algorithm
     *
     * @tparam T type of elements being convolved
     */
    template <class T>
    class ConvolutionAlgorithm {
    public:
        ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { }
        ConvolutionAlgorithm(ConvolutionAlgorithm&) = default;
        ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;

        /** selects a good algorithm for convolution for given configuration
         *
         * Exception Guarantee: Strong
         */
        ConvolutionAlgorithm(
            const Handle& handle,
            const ConvolutionDescriptor<T>& convDesc,
            const FilterDescriptor<T>& filterDesc,
            const TensorDescriptor<T>& inputDesc,
            const TensorDescriptor<T>& outputDesc)
        {
#if CUDNN_MAJOR >= 8
            int requestedAlgoCount = 0, returnedAlgoCount = 0;
            CUDA4DNN_CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle.get(), &requestedAlgoCount));
            std::vector<cudnnConvolutionFwdAlgoPerf_t> results(requestedAlgoCount);
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardAlgorithm_v7(
                    handle.get(),
                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                    requestedAlgoCount,
                    &returnedAlgoCount,
                    &results[0]
                )
            );

            size_t free_memory, total_memory;
            CUDA4DNN_CHECK_CUDA(cudaMemGetInfo(&free_memory, &total_memory));

            bool found_conv_algorithm = false;
            for (int i = 0; i < returnedAlgoCount; i++)
            {
                if (results[i].status == CUDNN_STATUS_SUCCESS &&
                    results[i].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
                    results[i].memory < free_memory)
                {
                    found_conv_algorithm = true;
                    algo = results[i].algo;
                    workspace_size = results[i].memory;
                    break;
                }
            }

            if (!found_conv_algorithm)
                CV_Error(cv::Error::GpuApiCallError, "cuDNN did not return a suitable algorithm for convolution.");
#else
            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardAlgorithm(
                    handle.get(),
                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                    CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                    0, /* no memory limit */
                    &algo
                )
            );

            CUDA4DNN_CHECK_CUDNN(
                cudnnGetConvolutionForwardWorkspaceSize(
                    handle.get(),
                    inputDesc.get(), filterDesc.get(), convDesc.get(), outputDesc.get(),
                    algo, &workspace_size
                )
            );
#endif
        }

        ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
        ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;

        cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }

        /** number of bytes of workspace memory required by the algorithm */
        std::size_t get_workspace_size() const noexcept { return workspace_size; }

    private:
        cudnnConvolutionFwdAlgo_t algo;
        std::size_t workspace_size;
    };
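
    /* A sketch of algorithm selection; `handle`, `conv`, `filters`, `input_desc`
     * and `output_desc` are assumed to be valid objects built as shown above. The
     * reported size is used to allocate scratch memory before calling convolve().
     *
     * @code
     * ConvolutionAlgorithm<float> algo(handle, conv, filters, input_desc, output_desc);
     * std::size_t scratch_bytes = algo.get_workspace_size();
     * @endcode
     */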

    /** gives the shape of the output tensor of convolution
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void getConvolutionForwardOutputDim(
        const ConvolutionDescriptor<T>& convDesc,
        const FilterDescriptor<T>& filterDesc,
        const TensorDescriptor<T>& inputDesc,
        std::vector<int>& output)
    {
        output.clear();
        output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */

        std::vector<int> temp(CUDNN_DIM_MAX);
        cudnnDataType_t tempDataType;
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetTensorNdDescriptor(
                inputDesc.get(),
                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
                &tempDataType,
                output.data(),
                temp.data(),
                temp.data()
            )
        );

        const auto rank = output[0];
        output.resize(rank);
        CUDA4DNN_CHECK_CUDNN(
            cudnnGetConvolutionNdForwardOutputDim(
                convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data()
            )
        );
    }
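
    /* A sketch of querying the output shape; for a 4D NCHW input this yields
     * {N, K, H_out, W_out} where K is the number of filters. `TensorDescriptor`
     * is assumed to be constructible from a shape container, as in cudnn.hpp.
     *
     * @code
     * std::vector<int> output_shape;
     * getConvolutionForwardOutputDim(conv, filters, input_desc, output_shape);
     * TensorDescriptor<float> output_desc(output_shape);
     * @endcode
     */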

    /** @brief performs convolution
     *
     * dstValue = alpha * result + beta * priorDstValue
     *
     * @tparam T convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param convAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param alpha result scale factor
     * @param beta previous value scale factor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve(
        const Handle& handle,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        T alpha, T beta,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionForward(
                handle.get(),
                &alpha, inputDesc.get(), inputPtr.get(),
                filterDesc.get(), filterPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta, outputDesc.get(), outputPtr.get()
            )
        );
    }

    template <> inline
    void convolve(
        const Handle& handle,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        half alpha, half beta,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha_ = alpha, beta_ = beta;
        CUDA4DNN_CHECK_CUDNN(
            cudnnConvolutionForward(
                handle.get(),
                &alpha_, inputDesc.get(), inputPtr.get(),
                filterDesc.get(), filterPtr.get(),
                convDesc.get(), convAlgo.get(),
                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
                &beta_, outputDesc.get(), outputPtr.get()
            )
        );
    }
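
    /* A sketch of a plain forward convolution; with alpha = 1 and beta = 0 the
     * output is simply overwritten with the convolution result. The device
     * pointers are assumed to reference properly sized buffers.
     *
     * @code
     * convolve<float>(handle, conv, algo, workspace,
     *                 filters, filter_ptr,
     *                 input_desc, input_ptr,
     *                 1.0f, 0.0f,
     *                 output_desc, output_ptr);
     * @endcode
     */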

    /** @brief performs convolution, bias addition and activation simultaneously
     *
     * dstValue = act(alpha * conv(input) + bias)
     *
     * @tparam T convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param convAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param alpha convolution scale factor
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param biasDesc tensor descriptor describing the bias
     * @param[in] biasPtr pointer to bias tensor in device memory
     * @param actDesc activation descriptor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve_with_bias_activation(
        const Handle& handle,
        T alpha,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        const TensorDescriptor<T>& biasDesc,
        DevicePtr<const T> biasPtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        T alpha2 = 0.0;
        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2, outputDesc.get(), outputPtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }

    template <> inline
    void convolve_with_bias_activation(
        const Handle& handle,
        half alpha,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        const TensorDescriptor<half>& biasDesc,
        DevicePtr<const half> biasPtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        float alpha_ = alpha, alpha2 = 0.0;
        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha_, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2, outputDesc.get(), outputPtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }
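
    /* A sketch of the fused convolution + bias + activation path; `bias_desc`,
     * `bias_ptr` and `act` are assumed to be set up beforehand. cuDNN restricts
     * this fused call to a limited set of algorithms and activations, so the
     * selected `algo` must be compatible.
     *
     * @code
     * convolve_with_bias_activation<float>(handle, 1.0f, conv, algo, workspace,
     *                                      filters, filter_ptr,
     *                                      input_desc, input_ptr,
     *                                      bias_desc, bias_ptr,
     *                                      act,
     *                                      output_desc, output_ptr);
     * @endcode
     */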

    /** @brief performs convolution, bias addition, eltwise addition and activation simultaneously
     *
     * dstValue = act(alpha1 * conv(input) + bias + alpha2 * eltwise)
     *
     * @tparam T convolution element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN Handle
     * @param convDesc convolution description
     * @param convAlgo algorithm to use for convolution
     * @param workspace workspace memory which meets the requirements of \p convAlgo
     * @param filterDesc filter descriptor
     * @param[in] filterPtr pointer to device memory containing the filters
     * @param alpha1 convolution scale factor
     * @param inputDesc tensor descriptor describing the input
     * @param[in] inputPtr pointer to input tensor in device memory
     * @param biasDesc tensor descriptor describing the bias
     * @param[in] biasPtr pointer to bias tensor in device memory
     * @param alpha2 eltwise scale factor
     * @param eltwiseDesc tensor descriptor describing the eltwise tensor
     * @param[in] eltwisePtr pointer to the eltwise tensor in device memory
     * @param actDesc activation descriptor
     * @param outputDesc tensor descriptor describing the output
     * @param[out] outputPtr pointer to output tensor in device memory
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void convolve_with_bias_eltwise_activation(
        const Handle& handle,
        T alpha1,
        const ConvolutionDescriptor<T>& convDesc,
        const ConvolutionAlgorithm<T>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<T>& filterDesc,
        DevicePtr<const T> filterPtr,
        const TensorDescriptor<T>& inputDesc,
        DevicePtr<const T> inputPtr,
        const TensorDescriptor<T>& biasDesc,
        DevicePtr<const T> biasPtr,
        T alpha2,
        const TensorDescriptor<T>& eltwiseDesc,
        DevicePtr<const T> eltwisePtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<T>& outputDesc,
        DevicePtr<T> outputPtr)
    {
        CV_Assert(handle);

        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha1, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2, eltwiseDesc.get(), eltwisePtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }

    template <> inline
    void convolve_with_bias_eltwise_activation(
        const Handle& handle,
        half alpha1,
        const ConvolutionDescriptor<half>& convDesc,
        const ConvolutionAlgorithm<half>& convAlgo,
        WorkspaceInstance workspace,
        const FilterDescriptor<half>& filterDesc,
        DevicePtr<const half> filterPtr,
        const TensorDescriptor<half>& inputDesc,
        DevicePtr<const half> inputPtr,
        const TensorDescriptor<half>& biasDesc,
        DevicePtr<const half> biasPtr,
        half alpha2,
        const TensorDescriptor<half>& eltwiseDesc,
        DevicePtr<const half> eltwisePtr,
        const ActivationDescriptor& actDesc,
        const TensorDescriptor<half>& outputDesc,
        DevicePtr<half> outputPtr)
    {
        CV_Assert(handle);

        float alpha1_ = alpha1, alpha2_ = alpha2;
        CUDA4DNN_CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
            handle.get(),
            &alpha1_, inputDesc.get(), inputPtr.get(),
            filterDesc.get(), filterPtr.get(),
            convDesc.get(), convAlgo.get(),
            static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
            &alpha2_, eltwiseDesc.get(), eltwisePtr.get(),
            biasDesc.get(), biasPtr.get(),
            actDesc.get(),
            outputDesc.get(), outputPtr.get()));
    }
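
    /* A sketch of the fused path with an extra eltwise (e.g. residual) addend;
     * the eltwise tensor is assumed to match the output shape, and `eltwise_desc`
     * and `eltwise_ptr` are illustrative names for it.
     *
     * @code
     * convolve_with_bias_eltwise_activation<float>(handle, 1.0f, conv, algo, workspace,
     *                                              filters, filter_ptr,
     *                                              input_desc, input_ptr,
     *                                              bias_desc, bias_ptr,
     *                                              1.0f, eltwise_desc, eltwise_ptr,
     *                                              act,
     *                                              output_desc, output_ptr);
     * @endcode
     */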

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */