CUDA::基本::並列処理2

現在地 >> メニュー >> CUDA >> CUDA::基本::並列処理 >> CUDA::基本::並列処理2

→ CUDA::基本::並列処理3

問題

CUDA::基本::並列処理のプログラムで、ブロック数やスレッド数を以下のように変更せよ。

条件：
　ブロック数：2個
　スレッド数：50個
　
　→　(ブロック2個)×(スレッド50個) = 100要素

答え

main.cu

#include <iostream>
#include <cutil.h>

using namespace std;

// Kernel defined in main_kernel.cu: adds 1000 to every element of `input`
// (a device pointer), one element per launched thread.
extern __global__ void ADD_DATA_ON_GPU(float *input);

/**
 * Host driver: fills a host array with 0..N-1, copies it to the device,
 * runs ADD_DATA_ON_GPU over it with 2 blocks of 50 threads (one thread per
 * element), copies the result back and prints one value per line.
 *
 * @param argc  argument count, forwarded to CUT_EXIT
 * @param argv  argument vector, forwarded to CUT_EXIT
 * @return 0 on normal completion
 */
int main(int argc, char **argv)
{
    // Launch geometry. The element count is derived from it so the array
    // size and the number of launched threads can never drift apart.
    const int    kBlocks          = 2;
    const int    kThreadsPerBlock = 50;
    const int    kNumElements     = kBlocks * kThreadsPerBlock; // 100
    const size_t kNumBytes        = sizeof(float) * kNumElements;

    CUT_DEVICE_INIT(); // initialise the device

    /** Allocate buffers **/
    float* x = new float[kNumElements]; // host
    for (int i = 0; i < kNumElements; i++)
    {
        x[i] = static_cast<float>(i);
    }

    float* gpu_x = NULL; // device
    CUDA_SAFE_CALL( cudaMalloc( reinterpret_cast<void**>(&gpu_x), kNumBytes ) );

    /* host => device copy */
    CUDA_SAFE_CALL( cudaMemcpy( gpu_x, x, kNumBytes, cudaMemcpyHostToDevice ) );

    dim3 block(kBlocks, 1, 1);             // 2 blocks
    dim3 threads(kThreadsPerBlock, 1, 1);  // 50 threads x 2 blocks = 100 elements

    /* Compute on the GPU.
     * The third launch argument is the dynamic shared memory size per block.
     * It is sized for the full element count so it is large enough whether
     * the kernel indexes its shared buffer by block-local thread index or by
     * global element id. */
    ADD_DATA_ON_GPU<<<block, threads, kNumBytes>>>(gpu_x);

    // A kernel launch returns no status itself; an invalid configuration is
    // only reported through cudaGetLastError().
    CUDA_SAFE_CALL( cudaGetLastError() );

    /* device => host copy (blocking, so the kernel has finished afterwards) */
    CUDA_SAFE_CALL( cudaMemcpy( x, gpu_x, kNumBytes, cudaMemcpyDeviceToHost ) );

    for (int i = 0; i < kNumElements; i++) // print the results
    {
        cout << x[i]<<"\n";
    }

    /* Release memory */
    delete[] x; x = NULL;
    CUDA_SAFE_CALL( cudaFree(gpu_x) ); gpu_x = NULL;

    CUT_EXIT(argc, argv); // cutil shutdown

    return 0;
}

main_kernel.cu

#include <iostream>
#include <cutil.h>

using namespace std;

// Retained for any other code that may still refer to it; the kernel below
// now reads the block count from gridDim.x instead of relying on this value.
#define BLOCK_NUM_X 2

/**
 * Adds 1000 to every element of `input`, staging each value through
 * dynamic shared memory.
 *
 * Launch layout: one thread per element, using the x and y dimensions of the
 * grid and block. The data is treated as a row-major 2D array whose row width
 * is gridDim.x * blockDim.x.
 *
 * Shared memory: the launch must supply at least
 * blockDim.x * blockDim.y * sizeof(float) bytes of dynamic shared memory.
 *
 * There is no bounds check: the launch must cover exactly the elements that
 * `input` holds.
 *
 * @param input  device pointer to the float array, updated in place
 */
__global__ void ADD_DATA_ON_GPU(float *input)
{
    extern __shared__ float shared[];

    // Global 2D coordinates of this thread.
    const int X = blockIdx.x * blockDim.x + threadIdx.x;
    const int Y = blockIdx.y * blockDim.y + threadIdx.y;

    // Flattened index into global memory. The row pitch comes from the actual
    // launch configuration rather than a compile-time constant.
    const int element_id = X + Y * blockDim.x * gridDim.x;

    // Shared memory is private to each block, so it is indexed with the
    // block-local thread index, not the global element id.
    const int local_id = threadIdx.x + threadIdx.y * blockDim.x;

    /* Load into shared memory (one element per thread) */
    shared[local_id] = input[element_id];

    __syncthreads(); // block-wide barrier

    /* Each thread adds to its own element */
    shared[local_id] = shared[local_id] + 1000.0f;

    input[element_id] = shared[local_id];
}