cxx11_tensor_gpu.cu (58446B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #define EIGEN_TEST_NO_LONGDOUBLE 11 #define EIGEN_TEST_NO_COMPLEX 12 13 #define EIGEN_USE_GPU 14 15 #include "main.h" 16 #include <unsupported/Eigen/CXX11/Tensor> 17 18 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> 19 20 #define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11 21 22 using Eigen::Tensor; 23 24 void test_gpu_nullary() { 25 Tensor<float, 1, 0, int> in1(2); 26 Tensor<float, 1, 0, int> in2(2); 27 in1.setRandom(); 28 in2.setRandom(); 29 30 std::size_t tensor_bytes = in1.size() * sizeof(float); 31 32 float* d_in1; 33 float* d_in2; 34 gpuMalloc((void**)(&d_in1), tensor_bytes); 35 gpuMalloc((void**)(&d_in2), tensor_bytes); 36 gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice); 37 gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice); 38 39 Eigen::GpuStreamDevice stream; 40 Eigen::GpuDevice gpu_device(&stream); 41 42 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1( 43 d_in1, 2); 44 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2( 45 d_in2, 2); 46 47 gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f); 48 gpu_in2.device(gpu_device) = gpu_in2.random(); 49 50 Tensor<float, 1, 0, int> new1(2); 51 Tensor<float, 1, 0, int> new2(2); 52 53 assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost, 54 gpu_device.stream()) == gpuSuccess); 55 assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost, 56 gpu_device.stream()) == gpuSuccess); 57 58 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 59 60 for (int i = 0; i < 2; ++i) { 61 VERIFY_IS_APPROX(new1(i), 3.14f); 62 VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); 63 } 64 65 gpuFree(d_in1); 66 gpuFree(d_in2); 67 } 68 69 void test_gpu_elementwise_small() { 70 Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2)); 71 Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2)); 72 Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2)); 73 in1.setRandom(); 74 in2.setRandom(); 75 76 std::size_t in1_bytes = in1.size() * sizeof(float); 77 std::size_t in2_bytes = in2.size() * sizeof(float); 78 std::size_t out_bytes = out.size() * sizeof(float); 79 80 float* d_in1; 81 float* d_in2; 82 float* d_out; 83 gpuMalloc((void**)(&d_in1), in1_bytes); 84 gpuMalloc((void**)(&d_in2), in2_bytes); 85 gpuMalloc((void**)(&d_out), out_bytes); 86 87 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); 88 gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); 89 90 Eigen::GpuStreamDevice stream; 91 Eigen::GpuDevice gpu_device(&stream); 92 93 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( 94 d_in1, Eigen::array<Eigen::DenseIndex, 1>(2)); 95 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2( 96 d_in2, Eigen::array<Eigen::DenseIndex, 1>(2)); 97 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out( 98 d_out, Eigen::array<Eigen::DenseIndex, 1>(2)); 99 100 gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; 101 102 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, 103 gpu_device.stream()) == gpuSuccess); 104 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 105 106 for (int i = 0; i < 2; ++i) { 107 VERIFY_IS_APPROX( 108 out(Eigen::array<Eigen::DenseIndex, 1>(i)), 109 in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i))); 110 } 111 112 gpuFree(d_in1); 113 gpuFree(d_in2); 114 gpuFree(d_out); 115 } 116 117 void test_gpu_elementwise() 118 { 119 Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 120 Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 121 Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 122 Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 123 in1.setRandom(); 124 in2.setRandom(); 125 in3.setRandom(); 126 127 std::size_t in1_bytes = in1.size() * sizeof(float); 128 std::size_t in2_bytes = in2.size() * sizeof(float); 129 std::size_t in3_bytes = in3.size() * sizeof(float); 130 std::size_t out_bytes = out.size() * sizeof(float); 131 132 float* d_in1; 133 float* d_in2; 134 float* d_in3; 135 float* d_out; 136 gpuMalloc((void**)(&d_in1), in1_bytes); 137 gpuMalloc((void**)(&d_in2), in2_bytes); 138 gpuMalloc((void**)(&d_in3), in3_bytes); 139 gpuMalloc((void**)(&d_out), out_bytes); 140 141 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); 142 gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); 143 gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice); 144 145 Eigen::GpuStreamDevice stream; 146 Eigen::GpuDevice gpu_device(&stream); 147 148 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 149 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 150 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 151 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); 152 153 gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; 154 155 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 156 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 157 158 for (int i = 0; i < 72; ++i) { 159 for (int j = 0; j < 53; ++j) { 160 for (int k = 0; k < 97; ++k) { 161 VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k))); 162 } 163 } 164 } 165 166 gpuFree(d_in1); 167 gpuFree(d_in2); 168 gpuFree(d_in3); 169 gpuFree(d_out); 170 } 171 172 void test_gpu_props() { 173 Tensor<float, 1> in1(200); 174 Tensor<bool, 1> out(200); 175 in1.setRandom(); 176 177 std::size_t in1_bytes = in1.size() * sizeof(float); 178 std::size_t out_bytes = out.size() * sizeof(bool); 179 180 float* d_in1; 181 bool* d_out; 182 gpuMalloc((void**)(&d_in1), in1_bytes); 183 gpuMalloc((void**)(&d_out), out_bytes); 184 185 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); 186 187 Eigen::GpuStreamDevice stream; 188 Eigen::GpuDevice gpu_device(&stream); 189 190 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( 191 d_in1, 200); 192 Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out( 193 d_out, 200); 194 195 gpu_out.device(gpu_device) = (gpu_in1.isnan)(); 196 197 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, 198 gpu_device.stream()) == gpuSuccess); 199 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 200 201 for (int i = 0; i < 200; ++i) { 202 VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); 203 } 204 205 gpuFree(d_in1); 206 gpuFree(d_out); 207 } 208 209 void test_gpu_reduction() 210 { 211 Tensor<float, 4> in1(72,53,97,113); 212 Tensor<float, 2> out(72,97); 213 in1.setRandom(); 214 215 std::size_t in1_bytes = in1.size() * sizeof(float); 216 std::size_t out_bytes = out.size() * sizeof(float); 217 218 float* d_in1; 219 float* d_out; 220 gpuMalloc((void**)(&d_in1), in1_bytes); 221 gpuMalloc((void**)(&d_out), out_bytes); 222 223 gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); 224 225 Eigen::GpuStreamDevice stream; 226 Eigen::GpuDevice gpu_device(&stream); 227 228 Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); 229 Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); 230 231 array<Eigen::DenseIndex, 2> reduction_axis; 232 reduction_axis[0] = 1; 233 reduction_axis[1] = 3; 234 235 gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); 236 237 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 238 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 239 240 for (int i = 0; i < 72; ++i) { 241 for (int j = 0; j < 97; ++j) { 242 float expected = 0; 243 for (int k = 0; k < 53; ++k) { 244 for (int l = 0; l < 113; ++l) { 245 expected = 246 std::max<float>(expected, in1(i, k, j, l)); 247 } 248 } 249 VERIFY_IS_APPROX(out(i,j), expected); 250 } 251 } 252 253 gpuFree(d_in1); 254 gpuFree(d_out); 255 } 256 257 template<int DataLayout> 258 void test_gpu_contraction() 259 { 260 // with these dimensions, the output has 300 * 140 elements, which is 261 // more than 30 * 1024, which is the number of threads in blocks on 262 // a 15 SM GK110 GPU 263 Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31); 264 Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1)); 265 Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1)); 266 267 t_left.setRandom(); 268 t_right.setRandom(); 269 270 std::size_t t_left_bytes = t_left.size() * sizeof(float); 271 std::size_t t_right_bytes = t_right.size() * sizeof(float); 272 std::size_t t_result_bytes = t_result.size() * sizeof(float); 273 274 float* d_t_left; 275 float* d_t_right; 276 float* d_t_result; 277 278 gpuMalloc((void**)(&d_t_left), t_left_bytes); 279 gpuMalloc((void**)(&d_t_right), t_right_bytes); 280 gpuMalloc((void**)(&d_t_result), t_result_bytes); 281 282 gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); 283 gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); 284 285 Eigen::GpuStreamDevice stream; 286 Eigen::GpuDevice gpu_device(&stream); 287 288 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); 289 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1); 290 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1); 291 292 typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf; 293 MapXf m_left(t_left.data(), 300, 93); 294 MapXf m_right(t_right.data(), 93, 140); 295 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140); 296 297 typedef Tensor<float, 1>::DimensionPair DimPair; 298 Eigen::array<DimPair, 2> dims; 299 dims[0] = DimPair(2, 0); 300 dims[1] = DimPair(3, 1); 301 302 m_result = m_left * m_right; 303 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); 304 305 gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); 306 307 for (DenseIndex i = 0; i < t_result.size(); i++) { 308 if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { 309 std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl; 310 assert(false); 311 } 312 } 313 314 gpuFree(d_t_left); 315 gpuFree(d_t_right); 316 gpuFree(d_t_result); 317 } 318 319 template<int DataLayout> 320 void test_gpu_convolution_1d() 321 { 322 Tensor<float, 4, DataLayout> input(74,37,11,137); 323 Tensor<float, 1, DataLayout> kernel(4); 324 Tensor<float, 4, DataLayout> out(74,34,11,137); 325 input = input.constant(10.0f) + input.random(); 326 kernel = kernel.constant(7.0f) + kernel.random(); 327 328 std::size_t input_bytes = input.size() * sizeof(float); 329 std::size_t kernel_bytes = kernel.size() * sizeof(float); 330 std::size_t out_bytes = out.size() * sizeof(float); 331 332 float* d_input; 333 float* d_kernel; 334 float* d_out; 335 gpuMalloc((void**)(&d_input), input_bytes); 336 gpuMalloc((void**)(&d_kernel), kernel_bytes); 337 gpuMalloc((void**)(&d_out), out_bytes); 338 339 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); 340 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); 341 342 Eigen::GpuStreamDevice stream; 343 Eigen::GpuDevice gpu_device(&stream); 344 345 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); 346 Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4); 347 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137); 348 349 Eigen::array<Eigen::DenseIndex, 1> dims(1); 350 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); 351 352 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 353 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 354 355 for (int i = 0; i < 74; ++i) { 356 for (int j = 0; j < 34; ++j) { 357 for (int k = 0; k < 11; ++k) { 358 for (int l = 0; l < 137; ++l) { 359 const float result = out(i,j,k,l); 360 const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) + 361 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3); 362 VERIFY_IS_APPROX(result, expected); 363 } 364 } 365 } 366 } 367 368 gpuFree(d_input); 369 gpuFree(d_kernel); 370 gpuFree(d_out); 371 } 372 373 void test_gpu_convolution_inner_dim_col_major_1d() 374 { 375 Tensor<float, 4, ColMajor> input(74,9,11,7); 376 Tensor<float, 1, ColMajor> kernel(4); 377 Tensor<float, 4, ColMajor> out(71,9,11,7); 378 input = input.constant(10.0f) + input.random(); 379 kernel = kernel.constant(7.0f) + kernel.random(); 380 381 std::size_t input_bytes = input.size() * sizeof(float); 382 std::size_t kernel_bytes = kernel.size() * sizeof(float); 383 std::size_t out_bytes = out.size() * sizeof(float); 384 385 float* d_input; 386 float* d_kernel; 387 float* d_out; 388 gpuMalloc((void**)(&d_input), input_bytes); 389 gpuMalloc((void**)(&d_kernel), kernel_bytes); 390 gpuMalloc((void**)(&d_out), out_bytes); 391 392 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); 393 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); 394 395 Eigen::GpuStreamDevice stream; 396 Eigen::GpuDevice gpu_device(&stream); 397 398 Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); 399 Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4); 400 Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7); 401 402 Eigen::array<Eigen::DenseIndex, 1> dims(0); 403 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); 404 405 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 406 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 407 408 for (int i = 0; i < 71; ++i) { 409 for (int j = 0; j < 9; ++j) { 410 for (int k = 0; k < 11; ++k) { 411 for (int l = 0; l < 7; ++l) { 412 const float result = out(i,j,k,l); 413 const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) + 414 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3); 415 VERIFY_IS_APPROX(result, expected); 416 } 417 } 418 } 419 } 420 421 gpuFree(d_input); 422 gpuFree(d_kernel); 423 gpuFree(d_out); 424 } 425 426 void test_gpu_convolution_inner_dim_row_major_1d() 427 { 428 Tensor<float, 4, RowMajor> input(7,9,11,74); 429 Tensor<float, 1, RowMajor> kernel(4); 430 Tensor<float, 4, RowMajor> out(7,9,11,71); 431 input = input.constant(10.0f) + input.random(); 432 kernel = kernel.constant(7.0f) + kernel.random(); 433 434 std::size_t input_bytes = input.size() * sizeof(float); 435 std::size_t kernel_bytes = kernel.size() * sizeof(float); 436 std::size_t out_bytes = out.size() * sizeof(float); 437 438 float* d_input; 439 float* d_kernel; 440 float* d_out; 441 gpuMalloc((void**)(&d_input), input_bytes); 442 gpuMalloc((void**)(&d_kernel), kernel_bytes); 443 gpuMalloc((void**)(&d_out), out_bytes); 444 445 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); 446 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); 447 448 Eigen::GpuStreamDevice stream; 449 Eigen::GpuDevice gpu_device(&stream); 450 451 Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); 452 Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4); 453 Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71); 454 455 Eigen::array<Eigen::DenseIndex, 1> dims(3); 456 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); 457 458 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 459 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 460 461 for (int i = 0; i < 7; ++i) { 462 for (int j = 0; j < 9; ++j) { 463 for (int k = 0; k < 11; ++k) { 464 for (int l = 0; l < 71; ++l) { 465 const float result = out(i,j,k,l); 466 const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) + 467 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3); 468 VERIFY_IS_APPROX(result, expected); 469 } 470 } 471 } 472 } 473 474 gpuFree(d_input); 475 gpuFree(d_kernel); 476 gpuFree(d_out); 477 } 478 479 template<int DataLayout> 480 void test_gpu_convolution_2d() 481 { 482 Tensor<float, 4, DataLayout> input(74,37,11,137); 483 Tensor<float, 2, DataLayout> kernel(3,4); 484 Tensor<float, 4, DataLayout> out(74,35,8,137); 485 input = input.constant(10.0f) + input.random(); 486 kernel = kernel.constant(7.0f) + kernel.random(); 487 488 std::size_t input_bytes = input.size() * sizeof(float); 489 std::size_t kernel_bytes = kernel.size() * sizeof(float); 490 std::size_t out_bytes = out.size() * sizeof(float); 491 492 float* d_input; 493 float* d_kernel; 494 float* d_out; 495 gpuMalloc((void**)(&d_input), input_bytes); 496 gpuMalloc((void**)(&d_kernel), kernel_bytes); 497 gpuMalloc((void**)(&d_out), out_bytes); 498 499 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); 500 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); 501 502 Eigen::GpuStreamDevice stream; 503 Eigen::GpuDevice gpu_device(&stream); 504 505 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); 506 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4); 507 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137); 508 509 Eigen::array<Eigen::DenseIndex, 2> dims(1,2); 510 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); 511 512 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 513 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 514 515 for (int i = 0; i < 74; ++i) { 516 for (int j = 0; j < 35; ++j) { 517 for (int k = 0; k < 8; ++k) { 518 for (int l = 0; l < 137; ++l) { 519 const float result = out(i,j,k,l); 520 const float expected = input(i,j+0,k+0,l) * kernel(0,0) + 521 input(i,j+1,k+0,l) * kernel(1,0) + 522 input(i,j+2,k+0,l) * kernel(2,0) + 523 input(i,j+0,k+1,l) * kernel(0,1) + 524 input(i,j+1,k+1,l) * kernel(1,1) + 525 input(i,j+2,k+1,l) * kernel(2,1) + 526 input(i,j+0,k+2,l) * kernel(0,2) + 527 input(i,j+1,k+2,l) * kernel(1,2) + 528 input(i,j+2,k+2,l) * kernel(2,2) + 529 input(i,j+0,k+3,l) * kernel(0,3) + 530 input(i,j+1,k+3,l) * kernel(1,3) + 531 input(i,j+2,k+3,l) * kernel(2,3); 532 VERIFY_IS_APPROX(result, expected); 533 } 534 } 535 } 536 } 537 538 gpuFree(d_input); 539 gpuFree(d_kernel); 540 gpuFree(d_out); 541 } 542 543 template<int DataLayout> 544 void test_gpu_convolution_3d() 545 { 546 Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17)); 547 Tensor<float, 3, DataLayout> kernel(3,4,2); 548 Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17)); 549 input = input.constant(10.0f) + input.random(); 550 kernel = kernel.constant(7.0f) + kernel.random(); 551 552 std::size_t input_bytes = input.size() * sizeof(float); 553 std::size_t kernel_bytes = kernel.size() * sizeof(float); 554 std::size_t out_bytes = out.size() * sizeof(float); 555 556 float* d_input; 557 float* d_kernel; 558 float* d_out; 559 gpuMalloc((void**)(&d_input), input_bytes); 560 gpuMalloc((void**)(&d_kernel), kernel_bytes); 561 gpuMalloc((void**)(&d_out), out_bytes); 562 563 gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); 564 gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); 565 566 Eigen::GpuStreamDevice stream; 567 Eigen::GpuDevice gpu_device(&stream); 568 569 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); 570 Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2); 571 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17); 572 573 Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3); 574 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); 575 576 assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 577 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 578 579 for (int i = 0; i < 74; ++i) { 580 for (int j = 0; j < 35; ++j) { 581 for (int k = 0; k < 8; ++k) { 582 for (int l = 0; l < 136; ++l) { 583 for (int m = 0; m < 17; ++m) { 584 const float result = out(i,j,k,l,m); 585 const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) + 586 input(i,j+1,k+0,l+0,m) * kernel(1,0,0) + 587 input(i,j+2,k+0,l+0,m) * kernel(2,0,0) + 588 input(i,j+0,k+1,l+0,m) * kernel(0,1,0) + 589 input(i,j+1,k+1,l+0,m) * kernel(1,1,0) + 590 input(i,j+2,k+1,l+0,m) * kernel(2,1,0) + 591 input(i,j+0,k+2,l+0,m) * kernel(0,2,0) + 592 input(i,j+1,k+2,l+0,m) * kernel(1,2,0) + 593 input(i,j+2,k+2,l+0,m) * kernel(2,2,0) + 594 input(i,j+0,k+3,l+0,m) * kernel(0,3,0) + 595 input(i,j+1,k+3,l+0,m) * kernel(1,3,0) + 596 input(i,j+2,k+3,l+0,m) * kernel(2,3,0) + 597 input(i,j+0,k+0,l+1,m) * kernel(0,0,1) + 598 input(i,j+1,k+0,l+1,m) * kernel(1,0,1) + 599 input(i,j+2,k+0,l+1,m) * kernel(2,0,1) + 600 input(i,j+0,k+1,l+1,m) * kernel(0,1,1) + 601 input(i,j+1,k+1,l+1,m) * kernel(1,1,1) + 602 input(i,j+2,k+1,l+1,m) * kernel(2,1,1) + 603 input(i,j+0,k+2,l+1,m) * kernel(0,2,1) + 604 input(i,j+1,k+2,l+1,m) * kernel(1,2,1) + 605 input(i,j+2,k+2,l+1,m) * kernel(2,2,1) + 606 input(i,j+0,k+3,l+1,m) * kernel(0,3,1) + 607 input(i,j+1,k+3,l+1,m) * kernel(1,3,1) + 608 input(i,j+2,k+3,l+1,m) * kernel(2,3,1); 609 VERIFY_IS_APPROX(result, expected); 610 } 611 } 612 } 613 } 614 } 615 616 gpuFree(d_input); 617 gpuFree(d_kernel); 618 gpuFree(d_out); 619 } 620 621 622 #if EIGEN_GPU_TEST_C99_MATH 623 template <typename Scalar> 624 void test_gpu_lgamma(const Scalar stddev) 625 { 626 Tensor<Scalar, 2> in(72,97); 627 in.setRandom(); 628 in *= in.constant(stddev); 629 Tensor<Scalar, 2> out(72,97); 630 out.setZero(); 631 632 std::size_t bytes = in.size() * sizeof(Scalar); 633 634 Scalar* d_in; 635 Scalar* d_out; 636 gpuMalloc((void**)(&d_in), bytes); 637 gpuMalloc((void**)(&d_out), bytes); 638 639 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); 640 641 Eigen::GpuStreamDevice stream; 642 Eigen::GpuDevice gpu_device(&stream); 643 644 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); 645 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); 646 647 gpu_out.device(gpu_device) = gpu_in.lgamma(); 648 649 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 650 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 651 652 for (int i = 0; i < 72; ++i) { 653 for (int j = 0; j < 97; ++j) { 654 VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j))); 655 } 656 } 657 658 gpuFree(d_in); 659 gpuFree(d_out); 660 } 661 #endif 662 663 template <typename Scalar> 664 void test_gpu_digamma() 665 { 666 Tensor<Scalar, 1> in(7); 667 Tensor<Scalar, 1> out(7); 668 Tensor<Scalar, 1> expected_out(7); 669 out.setZero(); 670 671 in(0) = Scalar(1); 672 in(1) = Scalar(1.5); 673 in(2) = Scalar(4); 674 in(3) = Scalar(-10.5); 675 in(4) = Scalar(10000.5); 676 in(5) = Scalar(0); 677 in(6) = Scalar(-1); 678 679 expected_out(0) = Scalar(-0.5772156649015329); 680 expected_out(1) = Scalar(0.03648997397857645); 681 expected_out(2) = Scalar(1.2561176684318); 682 expected_out(3) = Scalar(2.398239129535781); 683 expected_out(4) = Scalar(9.210340372392849); 684 expected_out(5) = std::numeric_limits<Scalar>::infinity(); 685 expected_out(6) = std::numeric_limits<Scalar>::infinity(); 686 687 std::size_t bytes = in.size() * sizeof(Scalar); 688 689 Scalar* d_in; 690 Scalar* d_out; 691 gpuMalloc((void**)(&d_in), bytes); 692 gpuMalloc((void**)(&d_out), bytes); 693 694 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); 695 696 Eigen::GpuStreamDevice stream; 697 Eigen::GpuDevice gpu_device(&stream); 698 699 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); 700 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); 701 702 gpu_out.device(gpu_device) = gpu_in.digamma(); 703 704 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 705 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 706 707 for (int i = 0; i < 5; ++i) { 708 VERIFY_IS_APPROX(out(i), expected_out(i)); 709 } 710 for (int i = 5; i < 7; ++i) { 711 VERIFY_IS_EQUAL(out(i), expected_out(i)); 712 } 713 714 gpuFree(d_in); 715 gpuFree(d_out); 716 } 717 718 template <typename Scalar> 719 void test_gpu_zeta() 720 { 721 Tensor<Scalar, 1> in_x(6); 722 Tensor<Scalar, 1> in_q(6); 723 Tensor<Scalar, 1> out(6); 724 Tensor<Scalar, 1> expected_out(6); 725 out.setZero(); 726 727 in_x(0) = Scalar(1); 728 in_x(1) = Scalar(1.5); 729 in_x(2) = Scalar(4); 730 in_x(3) = Scalar(-10.5); 731 in_x(4) = Scalar(10000.5); 732 in_x(5) = Scalar(3); 733 734 in_q(0) = Scalar(1.2345); 735 in_q(1) = Scalar(2); 736 in_q(2) = Scalar(1.5); 737 in_q(3) = Scalar(3); 738 in_q(4) = Scalar(1.0001); 739 in_q(5) = Scalar(-2.5); 740 741 expected_out(0) = std::numeric_limits<Scalar>::infinity(); 742 expected_out(1) = Scalar(1.61237534869); 743 expected_out(2) = Scalar(0.234848505667); 744 expected_out(3) = Scalar(1.03086757337e-5); 745 expected_out(4) = Scalar(0.367879440865); 746 expected_out(5) = Scalar(0.054102025820864097); 747 748 std::size_t bytes = in_x.size() * sizeof(Scalar); 749 750 Scalar* d_in_x; 751 Scalar* d_in_q; 752 Scalar* d_out; 753 gpuMalloc((void**)(&d_in_x), bytes); 754 gpuMalloc((void**)(&d_in_q), bytes); 755 gpuMalloc((void**)(&d_out), bytes); 756 757 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); 758 gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice); 759 760 Eigen::GpuStreamDevice stream; 761 Eigen::GpuDevice gpu_device(&stream); 762 763 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); 764 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6); 765 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); 766 767 gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); 768 769 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 770 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 771 772 VERIFY_IS_EQUAL(out(0), expected_out(0)); 773 VERIFY((std::isnan)(out(3))); 774 775 for (int i = 1; i < 6; ++i) { 776 if (i != 3) { 777 VERIFY_IS_APPROX(out(i), expected_out(i)); 778 } 779 } 780 781 gpuFree(d_in_x); 782 gpuFree(d_in_q); 783 gpuFree(d_out); 784 } 785 786 template <typename Scalar> 787 void test_gpu_polygamma() 788 { 789 Tensor<Scalar, 1> in_x(7); 790 Tensor<Scalar, 1> in_n(7); 791 Tensor<Scalar, 1> out(7); 792 Tensor<Scalar, 1> expected_out(7); 793 out.setZero(); 794 795 in_n(0) = Scalar(1); 796 in_n(1) = Scalar(1); 797 in_n(2) = Scalar(1); 798 in_n(3) = Scalar(17); 799 in_n(4) = Scalar(31); 800 in_n(5) = Scalar(28); 801 in_n(6) = Scalar(8); 802 803 in_x(0) = Scalar(2); 804 in_x(1) = Scalar(3); 805 in_x(2) = Scalar(25.5); 806 in_x(3) = Scalar(4.7); 807 in_x(4) = Scalar(11.8); 808 in_x(5) = Scalar(17.7); 809 in_x(6) = Scalar(30.2); 810 811 expected_out(0) = Scalar(0.644934066848); 812 expected_out(1) = Scalar(0.394934066848); 813 expected_out(2) = Scalar(0.0399946696496); 814 expected_out(3) = Scalar(293.334565435); 815 expected_out(4) = Scalar(0.445487887616); 816 expected_out(5) = Scalar(-2.47810300902e-07); 817 expected_out(6) = Scalar(-8.29668781082e-09); 818 819 std::size_t bytes = in_x.size() * sizeof(Scalar); 820 821 Scalar* d_in_x; 822 Scalar* d_in_n; 823 Scalar* d_out; 824 gpuMalloc((void**)(&d_in_x), bytes); 825 gpuMalloc((void**)(&d_in_n), bytes); 826 gpuMalloc((void**)(&d_out), bytes); 827 828 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); 829 gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice); 830 831 Eigen::GpuStreamDevice stream; 832 Eigen::GpuDevice gpu_device(&stream); 833 834 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); 835 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7); 836 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7); 837 838 gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); 839 840 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 841 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 842 843 for (int i = 0; i < 7; ++i) { 844 VERIFY_IS_APPROX(out(i), expected_out(i)); 845 } 846 847 gpuFree(d_in_x); 848 gpuFree(d_in_n); 849 gpuFree(d_out); 850 } 851 852 template <typename Scalar> 853 void test_gpu_igamma() 854 { 855 Tensor<Scalar, 2> a(6, 6); 856 Tensor<Scalar, 2> x(6, 6); 857 Tensor<Scalar, 2> out(6, 6); 858 out.setZero(); 859 860 Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; 861 Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; 862 863 for (int i = 0; i < 6; ++i) { 864 for (int j = 0; j < 6; ++j) { 865 a(i, j) = a_s[i]; 866 x(i, j) = x_s[j]; 867 } 868 } 869 870 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); 871 Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, 872 {0.0, 0.6321205588285578, 0.7768698398515702, 873 0.9816843611112658, 9.999500016666262e-05, 1.0}, 874 {0.0, 0.4275932955291202, 0.608374823728911, 875 0.9539882943107686, 7.522076445089201e-07, 1.0}, 876 {0.0, 0.01898815687615381, 0.06564245437845008, 877 0.5665298796332909, 4.166333347221828e-18, 1.0}, 878 {0.0, 0.9999780593618628, 0.9999899967080838, 879 0.9999996219837988, 0.9991370418689945, 1.0}, 880 {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; 881 882 883 884 std::size_t bytes = a.size() * sizeof(Scalar); 885 886 Scalar* d_a; 887 Scalar* d_x; 888 Scalar* d_out; 889 assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess); 890 assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess); 891 assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); 892 893 gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); 894 gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); 895 896 Eigen::GpuStreamDevice stream; 897 Eigen::GpuDevice gpu_device(&stream); 898 899 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); 900 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); 901 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); 902 903 gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); 904 905 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 906 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 907 908 for (int i = 0; i < 6; ++i) { 909 for (int j = 0; j < 6; ++j) { 910 if ((std::isnan)(igamma_s[i][j])) { 911 VERIFY((std::isnan)(out(i, j))); 912 } else { 913 VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]); 914 } 915 } 916 } 917 918 gpuFree(d_a); 919 gpuFree(d_x); 920 gpuFree(d_out); 921 } 922 923 template <typename Scalar> 924 void test_gpu_igammac() 925 { 926 Tensor<Scalar, 2> a(6, 6); 927 Tensor<Scalar, 2> x(6, 6); 928 Tensor<Scalar, 2> out(6, 6); 929 out.setZero(); 930 931 Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; 932 Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; 933 934 for (int i = 0; i < 6; ++i) { 935 for (int j = 0; j < 6; ++j) { 936 a(i, j) = a_s[i]; 937 x(i, j) = x_s[j]; 938 } 939 } 940 941 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); 942 Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, 943 {1.0, 0.36787944117144233, 0.22313016014842982, 944 0.018315638888734182, 0.9999000049998333, 0.0}, 945 {1.0, 0.5724067044708798, 0.3916251762710878, 946 0.04601170568923136, 0.9999992477923555, 0.0}, 947 {1.0, 0.9810118431238462, 0.9343575456215499, 948 0.4334701203667089, 1.0, 0.0}, 949 {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 950 3.7801620118431334e-07, 0.0008629581310054535, 951 0.0}, 952 {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; 953 954 std::size_t bytes = a.size() * sizeof(Scalar); 955 956 Scalar* d_a; 957 Scalar* d_x; 958 Scalar* d_out; 959 gpuMalloc((void**)(&d_a), bytes); 960 gpuMalloc((void**)(&d_x), bytes); 961 gpuMalloc((void**)(&d_out), bytes); 962 963 gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); 964 gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); 965 966 Eigen::GpuStreamDevice stream; 967 Eigen::GpuDevice gpu_device(&stream); 968 969 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); 970 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6); 971 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6); 972 973 gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); 974 975 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 976 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 977 978 for (int i = 0; i < 6; ++i) { 979 for (int j = 0; j < 6; ++j) { 980 if ((std::isnan)(igammac_s[i][j])) { 981 VERIFY((std::isnan)(out(i, j))); 982 } else { 983 VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]); 984 } 985 } 986 } 987 988 gpuFree(d_a); 989 gpuFree(d_x); 990 gpuFree(d_out); 991 } 992 993 #if EIGEN_GPU_TEST_C99_MATH 994 template <typename Scalar> 995 void test_gpu_erf(const Scalar stddev) 996 { 997 Tensor<Scalar, 2> in(72,97); 998 in.setRandom(); 999 in *= in.constant(stddev); 1000 Tensor<Scalar, 2> out(72,97); 1001 out.setZero(); 1002 1003 std::size_t bytes = in.size() * sizeof(Scalar); 1004 1005 Scalar* d_in; 1006 Scalar* d_out; 1007 assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess); 1008 assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); 1009 1010 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); 1011 1012 Eigen::GpuStreamDevice stream; 1013 Eigen::GpuDevice gpu_device(&stream); 1014 1015 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); 1016 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); 1017 1018 gpu_out.device(gpu_device) = gpu_in.erf(); 1019 1020 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 1021 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1022 1023 for (int i = 0; i < 72; ++i) { 1024 for (int j = 0; j < 97; ++j) { 1025 VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j))); 1026 } 1027 } 1028 1029 gpuFree(d_in); 1030 gpuFree(d_out); 1031 } 1032 1033 template <typename Scalar> 1034 void test_gpu_erfc(const Scalar stddev) 1035 { 1036 Tensor<Scalar, 2> in(72,97); 1037 in.setRandom(); 1038 in *= in.constant(stddev); 1039 Tensor<Scalar, 2> out(72,97); 1040 out.setZero(); 1041 1042 std::size_t bytes = in.size() * sizeof(Scalar); 1043 1044 Scalar* d_in; 1045 Scalar* d_out; 1046 gpuMalloc((void**)(&d_in), bytes); 1047 gpuMalloc((void**)(&d_out), bytes); 1048 1049 gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); 1050 1051 Eigen::GpuStreamDevice stream; 1052 Eigen::GpuDevice gpu_device(&stream); 1053 1054 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); 1055 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97); 1056 1057 gpu_out.device(gpu_device) = gpu_in.erfc(); 1058 1059 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 1060 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1061 1062 for (int i = 0; i < 72; ++i) { 1063 for (int j = 0; j < 97; ++j) { 1064 VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j))); 1065 } 1066 } 1067 1068 gpuFree(d_in); 1069 gpuFree(d_out); 1070 } 1071 #endif 1072 template <typename Scalar> 1073 void test_gpu_ndtri() 1074 { 1075 Tensor<Scalar, 1> in_x(8); 1076 Tensor<Scalar, 1> out(8); 1077 Tensor<Scalar, 1> expected_out(8); 1078 out.setZero(); 1079 1080 in_x(0) = Scalar(1); 1081 in_x(1) = Scalar(0.); 1082 in_x(2) = Scalar(0.5); 1083 in_x(3) = Scalar(0.2); 1084 in_x(4) = Scalar(0.8); 1085 in_x(5) = Scalar(0.9); 1086 in_x(6) = Scalar(0.1); 1087 in_x(7) = Scalar(0.99); 1088 in_x(8) = Scalar(0.01); 1089 1090 expected_out(0) = std::numeric_limits<Scalar>::infinity(); 1091 expected_out(1) = -std::numeric_limits<Scalar>::infinity(); 1092 expected_out(2) = Scalar(0.0); 1093 expected_out(3) = Scalar(-0.8416212335729142); 1094 expected_out(4) = Scalar(0.8416212335729142); 1095 expected_out(5) = Scalar(1.2815515655446004); 1096 expected_out(6) = Scalar(-1.2815515655446004); 1097 expected_out(7) = Scalar(2.3263478740408408); 1098 expected_out(8) = Scalar(-2.3263478740408408); 1099 1100 std::size_t bytes = in_x.size() * sizeof(Scalar); 1101 1102 Scalar* d_in_x; 1103 Scalar* d_out; 1104 gpuMalloc((void**)(&d_in_x), bytes); 1105 gpuMalloc((void**)(&d_out), bytes); 1106 1107 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); 1108 1109 Eigen::GpuStreamDevice stream; 1110 Eigen::GpuDevice gpu_device(&stream); 1111 1112 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); 1113 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6); 1114 1115 gpu_out.device(gpu_device) = gpu_in_x.ndtri(); 1116 1117 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 1118 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1119 1120 VERIFY_IS_EQUAL(out(0), expected_out(0)); 1121 VERIFY((std::isnan)(out(3))); 1122 1123 for (int i = 1; i < 6; ++i) { 1124 if (i != 3) { 1125 VERIFY_IS_APPROX(out(i), expected_out(i)); 1126 } 1127 } 1128 1129 gpuFree(d_in_x); 1130 gpuFree(d_out); 1131 } 1132 1133 template <typename Scalar> 1134 void test_gpu_betainc() 1135 { 1136 Tensor<Scalar, 1> in_x(125); 1137 Tensor<Scalar, 1> in_a(125); 1138 Tensor<Scalar, 1> in_b(125); 1139 Tensor<Scalar, 1> out(125); 1140 Tensor<Scalar, 1> expected_out(125); 1141 out.setZero(); 1142 1143 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); 1144 1145 Array<Scalar, 1, Dynamic> x(125); 1146 Array<Scalar, 1, Dynamic> a(125); 1147 Array<Scalar, 1, Dynamic> b(125); 1148 Array<Scalar, 1, Dynamic> v(125); 1149 1150 a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1151 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1152 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1153 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1154 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1155 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1156 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1157 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1158 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1159 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1160 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1161 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1162 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 1163 31.62177660168379, 31.62177660168379, 31.62177660168379, 1164 31.62177660168379, 31.62177660168379, 31.62177660168379, 1165 31.62177660168379, 31.62177660168379, 31.62177660168379, 1166 31.62177660168379, 31.62177660168379, 31.62177660168379, 1167 31.62177660168379, 31.62177660168379, 31.62177660168379, 1168 31.62177660168379, 31.62177660168379, 31.62177660168379, 1169 31.62177660168379, 31.62177660168379, 31.62177660168379, 1170 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, 1171 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 1172 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 1173 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; 1174 1175 b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 1176 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, 1177 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, 1178 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, 1179 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 1180 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1181 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 1182 31.62177660168379, 31.62177660168379, 31.62177660168379, 1183 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 1184 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 1185 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, 1186 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, 1187 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, 1188 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 1189 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 1190 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 1191 31.62177660168379, 31.62177660168379, 31.62177660168379, 1192 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 1193 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 1194 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, 1195 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, 1196 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, 1197 999.999, 999.999, 999.999; 1198 1199 x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1200 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 1201 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 1202 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 1203 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, 1204 -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1205 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 1206 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 1207 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; 1208 1209 v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1210 nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1211 nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, 1212 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 1213 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, 1214 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, 1215 nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, 1216 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, 1217 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 1218 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 1219 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, 1220 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, 1221 nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, 1222 nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, 1223 nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, 1224 nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, 1225 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, 1226 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 1227 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, 1228 0.5000000000004947, 0.9999999999999999, nan; 1229 1230 for (int i = 0; i < 125; ++i) { 1231 in_x(i) = x(i); 1232 in_a(i) = a(i); 1233 in_b(i) = b(i); 1234 expected_out(i) = v(i); 1235 } 1236 1237 std::size_t bytes = in_x.size() * sizeof(Scalar); 1238 1239 Scalar* d_in_x; 1240 Scalar* d_in_a; 1241 Scalar* d_in_b; 1242 Scalar* d_out; 1243 gpuMalloc((void**)(&d_in_x), bytes); 1244 gpuMalloc((void**)(&d_in_a), bytes); 1245 gpuMalloc((void**)(&d_in_b), bytes); 1246 gpuMalloc((void**)(&d_out), bytes); 1247 1248 gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); 1249 gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice); 1250 gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice); 1251 1252 Eigen::GpuStreamDevice stream; 1253 Eigen::GpuDevice gpu_device(&stream); 1254 1255 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); 1256 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125); 1257 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125); 1258 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125); 1259 1260 gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); 1261 1262 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); 1263 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1264 1265 for (int i = 1; i < 125; ++i) { 1266 if ((std::isnan)(expected_out(i))) { 1267 VERIFY((std::isnan)(out(i))); 1268 } else { 1269 VERIFY_IS_APPROX(out(i), expected_out(i)); 1270 } 1271 } 1272 1273 gpuFree(d_in_x); 1274 gpuFree(d_in_a); 1275 gpuFree(d_in_b); 1276 gpuFree(d_out); 1277 } 1278 1279 template <typename Scalar> 1280 void test_gpu_i0e() 1281 { 1282 Tensor<Scalar, 1> in_x(21); 1283 Tensor<Scalar, 1> out(21); 1284 Tensor<Scalar, 1> expected_out(21); 1285 out.setZero(); 1286 1287 Array<Scalar, 1, Dynamic> in_x_array(21); 1288 Array<Scalar, 1, Dynamic> expected_out_array(21); 1289 1290 in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, 1291 -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; 1292 1293 expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361, 1294 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857, 1295 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554, 1296 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163, 1297 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128, 1298 0.0897803118848; 1299 1300 for (int i = 0; i < 21; ++i) { 1301 in_x(i) = in_x_array(i); 1302 expected_out(i) = expected_out_array(i); 1303 } 1304 1305 std::size_t bytes = in_x.size() * sizeof(Scalar); 1306 1307 Scalar* d_in; 1308 Scalar* d_out; 1309 gpuMalloc((void**)(&d_in), bytes); 1310 gpuMalloc((void**)(&d_out), bytes); 1311 1312 gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); 1313 1314 Eigen::GpuStreamDevice stream; 1315 Eigen::GpuDevice gpu_device(&stream); 1316 1317 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); 1318 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21); 1319 1320 gpu_out.device(gpu_device) = gpu_in.bessel_i0e(); 1321 1322 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, 1323 gpu_device.stream()) == gpuSuccess); 1324 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1325 1326 for (int i = 0; i < 21; ++i) { 1327 VERIFY_IS_APPROX(out(i), expected_out(i)); 1328 } 1329 1330 gpuFree(d_in); 1331 gpuFree(d_out); 1332 } 1333 1334 template <typename Scalar> 1335 void test_gpu_i1e() 1336 { 1337 Tensor<Scalar, 1> in_x(21); 1338 Tensor<Scalar, 1> out(21); 1339 Tensor<Scalar, 1> expected_out(21); 1340 out.setZero(); 1341 1342 Array<Scalar, 1, Dynamic> in_x_array(21); 1343 Array<Scalar, 1, Dynamic> expected_out_array(21); 1344 1345 in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, 1346 -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; 1347 1348 expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565, 1349 -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, 1350 -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, 1351 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, 1352 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, 1353 0.0875062221833; 1354 1355 for (int i = 0; i < 21; ++i) { 1356 in_x(i) = in_x_array(i); 1357 expected_out(i) = expected_out_array(i); 1358 } 1359 1360 std::size_t bytes = in_x.size() * sizeof(Scalar); 1361 1362 Scalar* d_in; 1363 Scalar* d_out; 1364 gpuMalloc((void**)(&d_in), bytes); 1365 gpuMalloc((void**)(&d_out), bytes); 1366 1367 gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); 1368 1369 Eigen::GpuStreamDevice stream; 1370 Eigen::GpuDevice gpu_device(&stream); 1371 1372 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); 1373 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21); 1374 1375 gpu_out.device(gpu_device) = gpu_in.bessel_i1e(); 1376 1377 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, 1378 gpu_device.stream()) == gpuSuccess); 1379 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1380 1381 for (int i = 0; i < 21; ++i) { 1382 VERIFY_IS_APPROX(out(i), expected_out(i)); 1383 } 1384 1385 gpuFree(d_in); 1386 gpuFree(d_out); 1387 } 1388 1389 template <typename Scalar> 1390 void test_gpu_igamma_der_a() 1391 { 1392 Tensor<Scalar, 1> in_x(30); 1393 Tensor<Scalar, 1> in_a(30); 1394 Tensor<Scalar, 1> out(30); 1395 Tensor<Scalar, 1> expected_out(30); 1396 out.setZero(); 1397 1398 Array<Scalar, 1, Dynamic> in_a_array(30); 1399 Array<Scalar, 1, Dynamic> in_x_array(30); 1400 Array<Scalar, 1, Dynamic> expected_out_array(30); 1401 1402 // See special_functions.cpp for the Python code that generates the test data. 1403 1404 in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1405 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, 1406 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; 1407 1408 in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, 1409 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, 1410 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, 1411 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, 1412 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, 1413 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, 1414 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; 1415 1416 expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263, 1417 -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645, 1418 -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314, 1419 -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921, 1420 -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446, 1421 -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363, 1422 -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042, 1423 -0.0125850719397, -0.00455500206958, -0.00476436993148; 1424 1425 for (int i = 0; i < 30; ++i) { 1426 in_x(i) = in_x_array(i); 1427 in_a(i) = in_a_array(i); 1428 expected_out(i) = expected_out_array(i); 1429 } 1430 1431 std::size_t bytes = in_x.size() * sizeof(Scalar); 1432 1433 Scalar* d_a; 1434 Scalar* d_x; 1435 Scalar* d_out; 1436 gpuMalloc((void**)(&d_a), bytes); 1437 gpuMalloc((void**)(&d_x), bytes); 1438 gpuMalloc((void**)(&d_out), bytes); 1439 1440 gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice); 1441 gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice); 1442 1443 Eigen::GpuStreamDevice stream; 1444 Eigen::GpuDevice gpu_device(&stream); 1445 1446 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30); 1447 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30); 1448 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30); 1449 1450 gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x); 1451 1452 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, 1453 gpu_device.stream()) == gpuSuccess); 1454 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1455 1456 for (int i = 0; i < 30; ++i) { 1457 VERIFY_IS_APPROX(out(i), expected_out(i)); 1458 } 1459 1460 gpuFree(d_a); 1461 gpuFree(d_x); 1462 gpuFree(d_out); 1463 } 1464 1465 template <typename Scalar> 1466 void test_gpu_gamma_sample_der_alpha() 1467 { 1468 Tensor<Scalar, 1> in_alpha(30); 1469 Tensor<Scalar, 1> in_sample(30); 1470 Tensor<Scalar, 1> out(30); 1471 Tensor<Scalar, 1> expected_out(30); 1472 out.setZero(); 1473 1474 Array<Scalar, 1, Dynamic> in_alpha_array(30); 1475 Array<Scalar, 1, Dynamic> in_sample_array(30); 1476 Array<Scalar, 1, Dynamic> expected_out_array(30); 1477 1478 // See special_functions.cpp for the Python code that generates the test data. 1479 1480 in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1481 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 1482 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; 1483 1484 in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, 1485 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, 1486 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, 1487 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, 1488 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, 1489 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, 1490 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; 1491 1492 expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738, 1493 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243, 1494 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302, 1495 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534, 1496 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812, 1497 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061, 1498 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206, 1499 1.00106492525, 0.97734200649, 1.02198794179; 1500 1501 for (int i = 0; i < 30; ++i) { 1502 in_alpha(i) = in_alpha_array(i); 1503 in_sample(i) = in_sample_array(i); 1504 expected_out(i) = expected_out_array(i); 1505 } 1506 1507 std::size_t bytes = in_alpha.size() * sizeof(Scalar); 1508 1509 Scalar* d_alpha; 1510 Scalar* d_sample; 1511 Scalar* d_out; 1512 gpuMalloc((void**)(&d_alpha), bytes); 1513 gpuMalloc((void**)(&d_sample), bytes); 1514 gpuMalloc((void**)(&d_out), bytes); 1515 1516 gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice); 1517 gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice); 1518 1519 Eigen::GpuStreamDevice stream; 1520 Eigen::GpuDevice gpu_device(&stream); 1521 1522 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30); 1523 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30); 1524 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30); 1525 1526 gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample); 1527 1528 assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, 1529 gpu_device.stream()) == gpuSuccess); 1530 assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); 1531 1532 for (int i = 0; i < 30; ++i) { 1533 VERIFY_IS_APPROX(out(i), expected_out(i)); 1534 } 1535 1536 gpuFree(d_alpha); 1537 gpuFree(d_sample); 1538 gpuFree(d_out); 1539 } 1540 1541 EIGEN_DECLARE_TEST(cxx11_tensor_gpu) 1542 { 1543 CALL_SUBTEST_1(test_gpu_nullary()); 1544 CALL_SUBTEST_1(test_gpu_elementwise_small()); 1545 CALL_SUBTEST_1(test_gpu_elementwise()); 1546 CALL_SUBTEST_1(test_gpu_props()); 1547 CALL_SUBTEST_1(test_gpu_reduction()); 1548 CALL_SUBTEST_2(test_gpu_contraction<ColMajor>()); 1549 CALL_SUBTEST_2(test_gpu_contraction<RowMajor>()); 1550 CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>()); 1551 CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>()); 1552 CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d()); 1553 CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d()); 1554 CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>()); 1555 CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>()); 1556 #if !defined(EIGEN_USE_HIP) 1557 // disable these tests on HIP for now. 1558 // they hang..need to investigate and fix 1559 CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>()); 1560 CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>()); 1561 #endif 1562 1563 #if EIGEN_GPU_TEST_C99_MATH 1564 // std::erf, std::erfc, and so on where only added in c++11. We use them 1565 // as a golden reference to validate the results produced by Eigen. Therefore 1566 // we can only run these tests if we use a c++11 compiler. 1567 CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f)); 1568 CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f)); 1569 CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f)); 1570 CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f)); 1571 1572 CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0)); 1573 CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0)); 1574 CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01)); 1575 CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001)); 1576 1577 CALL_SUBTEST_4(test_gpu_erf<float>(1.0f)); 1578 CALL_SUBTEST_4(test_gpu_erf<float>(100.0f)); 1579 CALL_SUBTEST_4(test_gpu_erf<float>(0.01f)); 1580 CALL_SUBTEST_4(test_gpu_erf<float>(0.001f)); 1581 1582 CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f)); 1583 // CALL_SUBTEST(test_gpu_erfc<float>(100.0f)); 1584 CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs 1585 CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f)); 1586 CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f)); 1587 1588 CALL_SUBTEST_4(test_gpu_erf<double>(1.0)); 1589 CALL_SUBTEST_4(test_gpu_erf<double>(100.0)); 1590 CALL_SUBTEST_4(test_gpu_erf<double>(0.01)); 1591 CALL_SUBTEST_4(test_gpu_erf<double>(0.001)); 1592 1593 CALL_SUBTEST_4(test_gpu_erfc<double>(1.0)); 1594 // CALL_SUBTEST(test_gpu_erfc<double>(100.0)); 1595 CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs 1596 CALL_SUBTEST_4(test_gpu_erfc<double>(0.01)); 1597 CALL_SUBTEST_4(test_gpu_erfc<double>(0.001)); 1598 1599 #if !defined(EIGEN_USE_HIP) 1600 // disable these tests on HIP for now. 1601 1602 CALL_SUBTEST_5(test_gpu_ndtri<float>()); 1603 CALL_SUBTEST_5(test_gpu_ndtri<double>()); 1604 1605 CALL_SUBTEST_5(test_gpu_digamma<float>()); 1606 CALL_SUBTEST_5(test_gpu_digamma<double>()); 1607 1608 CALL_SUBTEST_5(test_gpu_polygamma<float>()); 1609 CALL_SUBTEST_5(test_gpu_polygamma<double>()); 1610 1611 CALL_SUBTEST_5(test_gpu_zeta<float>()); 1612 CALL_SUBTEST_5(test_gpu_zeta<double>()); 1613 #endif 1614 1615 CALL_SUBTEST_5(test_gpu_igamma<float>()); 1616 CALL_SUBTEST_5(test_gpu_igammac<float>()); 1617 1618 CALL_SUBTEST_5(test_gpu_igamma<double>()); 1619 CALL_SUBTEST_5(test_gpu_igammac<double>()); 1620 1621 #if !defined(EIGEN_USE_HIP) 1622 // disable these tests on HIP for now. 1623 CALL_SUBTEST_6(test_gpu_betainc<float>()); 1624 CALL_SUBTEST_6(test_gpu_betainc<double>()); 1625 1626 CALL_SUBTEST_6(test_gpu_i0e<float>()); 1627 CALL_SUBTEST_6(test_gpu_i0e<double>()); 1628 1629 CALL_SUBTEST_6(test_gpu_i1e<float>()); 1630 CALL_SUBTEST_6(test_gpu_i1e<double>()); 1631 1632 CALL_SUBTEST_6(test_gpu_i1e<float>()); 1633 CALL_SUBTEST_6(test_gpu_i1e<double>()); 1634 1635 CALL_SUBTEST_6(test_gpu_igamma_der_a<float>()); 1636 CALL_SUBTEST_6(test_gpu_igamma_der_a<double>()); 1637 1638 CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>()); 1639 CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>()); 1640 #endif 1641 1642 #endif 1643 }