cxx11_tensor_argmax_gpu.cu (8886B)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


#define EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

using Eigen::Tensor;

// NOTE: every GPU runtime call whose result matters is checked with
// VERIFY_IS_EQUAL(..., gpuSuccess) rather than assert(...). Wrapping the call
// in assert() would remove the call itself (copy / synchronize) whenever
// NDEBUG is defined, leaving the host buffers unpopulated.

// Assigns `value` to every coefficient of `tensor` whose index along
// dimension `dim` equals `pos`; all other coefficients are left untouched.
// E.g. with dim == 1 and pos == 0 this sets tensor(i, 0, k, l) for all i, k, l.
template <int DataLayout>
static void fill_slice_along_dim(Tensor<float, 4, DataLayout>& tensor, int dim,
                                 DenseIndex pos, float value)
{
  Eigen::array<DenseIndex, 4> ix;
  for (ix[0] = 0; ix[0] < tensor.dimension(0); ++ix[0]) {
    for (ix[1] = 0; ix[1] < tensor.dimension(1); ++ix[1]) {
      for (ix[2] = 0; ix[2] < tensor.dimension(2); ++ix[2]) {
        for (ix[3] = 0; ix[3] < tensor.dimension(3); ++ix[3]) {
          if (ix[dim] == pos) tensor(ix) = value;
        }
      }
    }
  }
}

// Full (all-dimension) argmax/argmin on the GPU device.
//
// A 72x53x97 tensor of scaled random doubles gets -1000 planted at its first
// coefficient and +1000 at its last one; these sentinels are intended to be
// the global extrema. The test then checks that argmin() reports flat index 0
// and argmax() reports flat index size-1.
template <int Layout>
void test_gpu_simple_argmax()
{
  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
  in.setRandom();
  in *= in.constant(100.0);
  in(0, 0, 0) = -1000.0;
  in(71, 52, 96) = 1000.0;

  std::size_t in_bytes = in.size() * sizeof(double);
  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);

  double* d_in;
  DenseIndex* d_out_max;
  DenseIndex* d_out_min;
  VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_in), in_bytes), gpuSuccess);
  VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_out_max), out_bytes), gpuSuccess);
  VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_out_min), out_bytes), gpuSuccess);

  VERIFY_IS_EQUAL(gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice), gpuSuccess);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));

  gpu_out_max.device(gpu_device) = gpu_in.argmax();
  gpu_out_min.device(gpu_device) = gpu_in.argmin();

  // Copy both results back on the device's stream, then wait for completion
  // before touching the host buffers.
  VERIFY_IS_EQUAL(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()), gpuSuccess);
  VERIFY_IS_EQUAL(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()), gpuSuccess);
  VERIFY_IS_EQUAL(gpuStreamSynchronize(gpu_device.stream()), gpuSuccess);

  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);

  gpuFree(d_in);
  gpuFree(d_out_max);
  gpuFree(d_out_min);
}

// Per-dimension argmax on the GPU device.
//
// For each of the 4 dimensions of a 2x3x5x7 float tensor:
//   1. plant 10.0 in the slice at index 0 of that dimension and expect
//      argmax(dim) to be 0 everywhere;
//   2. additionally plant 20.0 in the slice at the last index of that
//      dimension and expect argmax(dim) to be dimension(dim) - 1 everywhere.
template <int DataLayout>
void test_gpu_argmax_dim()
{
  Tensor<float, 4, DataLayout> tensor(2,3,5,7);

  for (int dim = 0; dim < 4; ++dim) {
    tensor.setRandom();
    tensor = (tensor + tensor.constant(0.5)).log();

    // The result has the input's shape with dimension `dim` removed.
    Eigen::array<DenseIndex, 3> out_shape;
    for (int d = 0; d < 3; ++d) out_shape[d] = tensor.dimension((d < dim) ? d : d + 1);

    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);

    // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
    fill_slice_along_dim<DataLayout>(tensor, dim, 0, 10.0f);

    std::size_t in_bytes = tensor.size() * sizeof(float);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);

    float* d_in;
    DenseIndex* d_out;
    VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_in), in_bytes), gpuSuccess);
    VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_out), out_bytes), gpuSuccess);

    VERIFY_IS_EQUAL(gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice), gpuSuccess);

    Eigen::GpuStreamDevice stream;
    Eigen::GpuDevice gpu_device(&stream);

    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);

    gpu_out.device(gpu_device) = gpu_in.argmax(dim);

    VERIFY_IS_EQUAL(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()), gpuSuccess);
    VERIFY_IS_EQUAL(gpuStreamSynchronize(gpu_device.stream()), gpuSuccess);

    VERIFY_IS_EQUAL(tensor_arg.size(),
                    tensor.size() / tensor.dimension(dim));

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }

    // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
    fill_slice_along_dim<DataLayout>(tensor, dim, tensor.dimension(dim) - 1, 20.0f);

    VERIFY_IS_EQUAL(gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice), gpuSuccess);

    gpu_out.device(gpu_device) = gpu_in.argmax(dim);

    VERIFY_IS_EQUAL(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()), gpuSuccess);
    VERIFY_IS_EQUAL(gpuStreamSynchronize(gpu_device.stream()), gpuSuccess);

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the last index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
    }

    gpuFree(d_in);
    gpuFree(d_out);
  }
}

// Per-dimension argmin on the GPU device.
//
// Mirror image of test_gpu_argmax_dim: -10.0 is planted in the slice at
// index 0 of the reduced dimension (expect argmin == 0), then -20.0 in the
// slice at its last index (expect argmin == dimension(dim) - 1).
template <int DataLayout>
void test_gpu_argmin_dim()
{
  Tensor<float, 4, DataLayout> tensor(2,3,5,7);

  for (int dim = 0; dim < 4; ++dim) {
    tensor.setRandom();
    tensor = (tensor + tensor.constant(0.5)).log();

    // The result has the input's shape with dimension `dim` removed.
    Eigen::array<DenseIndex, 3> out_shape;
    for (int d = 0; d < 3; ++d) out_shape[d] = tensor.dimension((d < dim) ? d : d + 1);

    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);

    // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
    fill_slice_along_dim<DataLayout>(tensor, dim, 0, -10.0f);

    std::size_t in_bytes = tensor.size() * sizeof(float);
    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);

    float* d_in;
    DenseIndex* d_out;
    VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_in), in_bytes), gpuSuccess);
    VERIFY_IS_EQUAL(gpuMalloc((void**)(&d_out), out_bytes), gpuSuccess);

    VERIFY_IS_EQUAL(gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice), gpuSuccess);

    Eigen::GpuStreamDevice stream;
    Eigen::GpuDevice gpu_device(&stream);

    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);

    gpu_out.device(gpu_device) = gpu_in.argmin(dim);

    VERIFY_IS_EQUAL(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()), gpuSuccess);
    VERIFY_IS_EQUAL(gpuStreamSynchronize(gpu_device.stream()), gpuSuccess);

    VERIFY_IS_EQUAL(tensor_arg.size(),
                    tensor.size() / tensor.dimension(dim));

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect min to be in the first index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
    }

    // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
    fill_slice_along_dim<DataLayout>(tensor, dim, tensor.dimension(dim) - 1, -20.0f);

    VERIFY_IS_EQUAL(gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice), gpuSuccess);

    gpu_out.device(gpu_device) = gpu_in.argmin(dim);

    VERIFY_IS_EQUAL(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()), gpuSuccess);
    VERIFY_IS_EQUAL(gpuStreamSynchronize(gpu_device.stream()), gpuSuccess);

    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect min to be in the last index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
    }

    gpuFree(d_in);
    gpuFree(d_out);
  }
}

// Test entry point: runs each scenario for both storage orders.
EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
{
  CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
  CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
  CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
  CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
  CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
  CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
}