cxx11_tensor_complex_gpu.cu (6636B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #define EIGEN_TEST_NO_LONGDOUBLE 11 12 #define EIGEN_USE_GPU 13 14 #include "main.h" 15 #include <unsupported/Eigen/CXX11/Tensor> 16 17 using Eigen::Tensor; 18 19 void test_cuda_nullary() { 20 Tensor<std::complex<float>, 1, 0, int> in1(2); 21 Tensor<std::complex<float>, 1, 0, int> in2(2); 22 in1.setRandom(); 23 in2.setRandom(); 24 25 std::size_t float_bytes = in1.size() * sizeof(float); 26 std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>); 27 28 std::complex<float>* d_in1; 29 std::complex<float>* d_in2; 30 float* d_out2; 31 cudaMalloc((void**)(&d_in1), complex_bytes); 32 cudaMalloc((void**)(&d_in2), complex_bytes); 33 cudaMalloc((void**)(&d_out2), float_bytes); 34 cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); 35 cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); 36 37 Eigen::GpuStreamDevice stream; 38 Eigen::GpuDevice gpu_device(&stream); 39 40 Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( 41 d_in1, 2); 42 Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2( 43 d_in2, 2); 44 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2( 45 d_out2, 2); 46 47 gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f)); 48 gpu_out2.device(gpu_device) = gpu_in2.abs(); 49 50 Tensor<std::complex<float>, 1, 0, int> new1(2); 51 Tensor<float, 1, 0, int> new2(2); 52 53 assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost, 54 gpu_device.stream()) == cudaSuccess); 55 assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost, 56 gpu_device.stream()) == cudaSuccess); 57 58 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); 59 60 for (int i = 0; i < 2; ++i) { 61 VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f)); 62 VERIFY_IS_APPROX(new2(i), std::abs(in2(i))); 63 } 64 65 cudaFree(d_in1); 66 cudaFree(d_in2); 67 cudaFree(d_out2); 68 } 69 70 71 static void test_cuda_sum_reductions() { 72 73 Eigen::GpuStreamDevice stream; 74 Eigen::GpuDevice gpu_device(&stream); 75 76 const int num_rows = internal::random<int>(1024, 5*1024); 77 const int num_cols = internal::random<int>(1024, 5*1024); 78 79 Tensor<std::complex<float>, 2> in(num_rows, num_cols); 80 in.setRandom(); 81 82 Tensor<std::complex<float>, 0> full_redux; 83 full_redux = in.sum(); 84 85 std::size_t in_bytes = in.size() * sizeof(std::complex<float>); 86 std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); 87 std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); 88 std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); 89 gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); 90 91 TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); 92 TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); 93 94 out_gpu.device(gpu_device) = in_gpu.sum(); 95 96 Tensor<std::complex<float>, 0> full_redux_gpu; 97 gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); 98 gpu_device.synchronize(); 99 100 // Check that the CPU and GPU reductions return the same result. 101 VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); 102 103 gpu_device.deallocate(gpu_in_ptr); 104 gpu_device.deallocate(gpu_out_ptr); 105 } 106 107 static void test_cuda_mean_reductions() { 108 109 Eigen::GpuStreamDevice stream; 110 Eigen::GpuDevice gpu_device(&stream); 111 112 const int num_rows = internal::random<int>(1024, 5*1024); 113 const int num_cols = internal::random<int>(1024, 5*1024); 114 115 Tensor<std::complex<float>, 2> in(num_rows, num_cols); 116 in.setRandom(); 117 118 Tensor<std::complex<float>, 0> full_redux; 119 full_redux = in.mean(); 120 121 std::size_t in_bytes = in.size() * sizeof(std::complex<float>); 122 std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); 123 std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); 124 std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); 125 gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); 126 127 TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); 128 TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); 129 130 out_gpu.device(gpu_device) = in_gpu.mean(); 131 132 Tensor<std::complex<float>, 0> full_redux_gpu; 133 gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); 134 gpu_device.synchronize(); 135 136 // Check that the CPU and GPU reductions return the same result. 137 VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); 138 139 gpu_device.deallocate(gpu_in_ptr); 140 gpu_device.deallocate(gpu_out_ptr); 141 } 142 143 static void test_cuda_product_reductions() { 144 145 Eigen::GpuStreamDevice stream; 146 Eigen::GpuDevice gpu_device(&stream); 147 148 const int num_rows = internal::random<int>(1024, 5*1024); 149 const int num_cols = internal::random<int>(1024, 5*1024); 150 151 Tensor<std::complex<float>, 2> in(num_rows, num_cols); 152 in.setRandom(); 153 154 Tensor<std::complex<float>, 0> full_redux; 155 full_redux = in.prod(); 156 157 std::size_t in_bytes = in.size() * sizeof(std::complex<float>); 158 std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); 159 std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); 160 std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); 161 gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); 162 163 TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); 164 TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); 165 166 out_gpu.device(gpu_device) = in_gpu.prod(); 167 168 Tensor<std::complex<float>, 0> full_redux_gpu; 169 gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); 170 gpu_device.synchronize(); 171 172 // Check that the CPU and GPU reductions return the same result. 173 VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); 174 175 gpu_device.deallocate(gpu_in_ptr); 176 gpu_device.deallocate(gpu_out_ptr); 177 } 178 179 180 EIGEN_DECLARE_TEST(test_cxx11_tensor_complex) 181 { 182 CALL_SUBTEST(test_cuda_nullary()); 183 CALL_SUBTEST(test_cuda_sum_reductions()); 184 CALL_SUBTEST(test_cuda_mean_reductions()); 185 CALL_SUBTEST(test_cuda_product_reductions()); 186 }