cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cxx11_tensor_complex_gpu.cu (6636B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #define EIGEN_TEST_NO_LONGDOUBLE
     11 
     12 #define EIGEN_USE_GPU
     13 
     14 #include "main.h"
     15 #include <unsupported/Eigen/CXX11/Tensor>
     16 
     17 using Eigen::Tensor;
     18 
     19 void test_cuda_nullary() {
     20   Tensor<std::complex<float>, 1, 0, int> in1(2);
     21   Tensor<std::complex<float>, 1, 0, int> in2(2);
     22   in1.setRandom();
     23   in2.setRandom();
     24 
     25   std::size_t float_bytes = in1.size() * sizeof(float);
     26   std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
     27 
     28   std::complex<float>* d_in1;
     29   std::complex<float>* d_in2;
     30   float* d_out2;
     31   cudaMalloc((void**)(&d_in1), complex_bytes);
     32   cudaMalloc((void**)(&d_in2), complex_bytes);
     33   cudaMalloc((void**)(&d_out2), float_bytes);
     34   cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
     35   cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
     36 
     37   Eigen::GpuStreamDevice stream;
     38   Eigen::GpuDevice gpu_device(&stream);
     39 
     40   Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
     41       d_in1, 2);
     42   Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
     43       d_in2, 2);
     44   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
     45       d_out2, 2);
     46 
     47   gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
     48   gpu_out2.device(gpu_device) = gpu_in2.abs();
     49 
     50   Tensor<std::complex<float>, 1, 0, int> new1(2);
     51   Tensor<float, 1, 0, int> new2(2);
     52 
     53   assert(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
     54                          gpu_device.stream()) == cudaSuccess);
     55   assert(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
     56                          gpu_device.stream()) == cudaSuccess);
     57 
     58   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
     59 
     60   for (int i = 0; i < 2; ++i) {
     61     VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
     62     VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
     63   }
     64 
     65   cudaFree(d_in1);
     66   cudaFree(d_in2);
     67   cudaFree(d_out2);
     68 }
     69 
     70 
     71 static void test_cuda_sum_reductions() {
     72 
     73   Eigen::GpuStreamDevice stream;
     74   Eigen::GpuDevice gpu_device(&stream);
     75 
     76   const int num_rows = internal::random<int>(1024, 5*1024);
     77   const int num_cols = internal::random<int>(1024, 5*1024);
     78 
     79   Tensor<std::complex<float>, 2> in(num_rows, num_cols);
     80   in.setRandom();
     81 
     82   Tensor<std::complex<float>, 0> full_redux;
     83   full_redux = in.sum();
     84 
     85   std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
     86   std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
     87   std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
     88   std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
     89   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
     90 
     91   TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
     92   TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
     93 
     94   out_gpu.device(gpu_device) = in_gpu.sum();
     95 
     96   Tensor<std::complex<float>, 0> full_redux_gpu;
     97   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
     98   gpu_device.synchronize();
     99 
    100   // Check that the CPU and GPU reductions return the same result.
    101   VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
    102 
    103   gpu_device.deallocate(gpu_in_ptr);
    104   gpu_device.deallocate(gpu_out_ptr);
    105 }
    106 
    107 static void test_cuda_mean_reductions() {
    108 
    109   Eigen::GpuStreamDevice stream;
    110   Eigen::GpuDevice gpu_device(&stream);
    111 
    112   const int num_rows = internal::random<int>(1024, 5*1024);
    113   const int num_cols = internal::random<int>(1024, 5*1024);
    114 
    115   Tensor<std::complex<float>, 2> in(num_rows, num_cols);
    116   in.setRandom();
    117 
    118   Tensor<std::complex<float>, 0> full_redux;
    119   full_redux = in.mean();
    120 
    121   std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
    122   std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
    123   std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
    124   std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
    125   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
    126 
    127   TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
    128   TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
    129 
    130   out_gpu.device(gpu_device) = in_gpu.mean();
    131 
    132   Tensor<std::complex<float>, 0> full_redux_gpu;
    133   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
    134   gpu_device.synchronize();
    135 
    136   // Check that the CPU and GPU reductions return the same result.
    137   VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
    138 
    139   gpu_device.deallocate(gpu_in_ptr);
    140   gpu_device.deallocate(gpu_out_ptr);
    141 }
    142 
    143 static void test_cuda_product_reductions() {
    144 
    145   Eigen::GpuStreamDevice stream;
    146   Eigen::GpuDevice gpu_device(&stream);
    147 
    148   const int num_rows = internal::random<int>(1024, 5*1024);
    149   const int num_cols = internal::random<int>(1024, 5*1024);
    150 
    151   Tensor<std::complex<float>, 2> in(num_rows, num_cols);
    152   in.setRandom();
    153 
    154   Tensor<std::complex<float>, 0> full_redux;
    155   full_redux = in.prod();
    156 
    157   std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
    158   std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
    159   std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
    160   std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
    161   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
    162 
    163   TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
    164   TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
    165 
    166   out_gpu.device(gpu_device) = in_gpu.prod();
    167 
    168   Tensor<std::complex<float>, 0> full_redux_gpu;
    169   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
    170   gpu_device.synchronize();
    171 
    172   // Check that the CPU and GPU reductions return the same result.
    173   VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
    174 
    175   gpu_device.deallocate(gpu_in_ptr);
    176   gpu_device.deallocate(gpu_out_ptr);
    177 }
    178 
    179 
    180 EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
    181 {
    182   CALL_SUBTEST(test_cuda_nullary());
    183   CALL_SUBTEST(test_cuda_sum_reductions());
    184   CALL_SUBTEST(test_cuda_mean_reductions());
    185   CALL_SUBTEST(test_cuda_product_reductions());
    186 }