57#if defined(USE_OPENCL) || defined(USE_CUDA)
58 auto params = OpKernel::params_->conv();
62 const tensor_t&
W =
context.input(1);
63 const tensor_t& bias =
context.input(2);
70 CLCudaAPI::Program
program = ProgramManager::getInstance()
81 CLCudaAPI::Context
ctx =
context.device()->context();
85 for (serial_size_t
i = 0;
i <
in_data.size(); ++
i) {
94 W[0].begin(),
W[0].end());
97 bias[0].begin(), bias[0].end());
117 serial_size_t
res = device->device().MaxWorkGroupSize() % 16;
118 serial_size_t size = device->device().MaxWorkGroupSize() -
res;
120 auto global = std::vector<size_t>{size};
121 auto local = std::vector<size_t>{16};
124 auto event = CLCudaAPI::Event();
129 nn_info(
"## Running the kernel ...");
134 nn_info(
" > Took " + to_string(
event.GetElapsedTime()) +
" ms");
137 std::vector<float_t> out(
out_data[
i].size(), 0);
142 for (serial_size_t
j = 0;
j < out.size(); ++
j) {
143 std::cout << out[
j] <<
" ";
145 std::cout << std::endl;
148 std::copy(std::begin(out), std::end(out), std::back_inserter(
out_data[
i]));
151 throw nn_error(
"Not compiled with OpenCL");
Definition op_kernel.h:72