Basic example#
This is a simple vector-sum kernel using CUDA.
The example:
- Generates test data on the host
- Sends data to the device
- Launches a kernel on the device
- Receives data back from the device
- Checks that the data is correct
Build and run the example by following the general instructions.
Example source code#
#include <vector>
#include <iostream>
// The kernel we are going to launch.
//
// Element-wise sum: out[i] = a[i] + b[i] for every i in [0, n).
//   a, b - input arrays, read by the kernel (at least n elements each)
//   n    - number of elements to process
//   out  - output array, written by the kernel (at least n elements)
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one element,
// so gridDim.x * blockDim.x must be at least n for every element to be written.
// Threads whose global index is >= n do nothing.
__global__ void basicSum(const int * a, const int * b, size_t n, int * out) {
    // Build the global index in size_t: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit arithmetic and would wrap (or turn negative when stored in an int)
    // for very large grids, silently skipping elements.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        out[idx] = a[idx] + b[idx];
    }
}
// A generic helper function to simplify error handling.
//
// Returns normally when `error` is cudaSuccess. Otherwise prints the CUDA error
// string together with the given source location and terminates the process
// with exit status 1.
//   error - status code returned by a CUDA runtime call
//   file  - source file name to report
//   line  - source line number to report
void check(cudaError_t error, const char * file, size_t line) {
    if (error == cudaSuccess)
    {
        return;
    }
    // Diagnostics go to stderr so they are not mixed into the program's regular output.
    std::cerr << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl;
    exit(1);
}
// Convenience wrapper around check(): forwards the status of a CUDA call together
// with the file name and line number of the call site, so that a failure reports
// where in the source it happened.
#define CHECK(call) check((call), __FILE__, __LINE__)
// Host-side driver for the example: generates the inputs, copies them to the
// device, launches basicSum, copies the result back and verifies it on the host.
// Returns 0 when every element matches the host-computed sum, 1 otherwise.
// Any failing CUDA call terminates the program through CHECK.
int main(int argc, char ** argv) {
    // Command-line arguments are not used by this example.
    (void)argc;
    (void)argv;

    const size_t N = 4096;
    const size_t BYTES = N * sizeof(int);
    const unsigned int THREADS_PER_BLOCK = 256;
    // Ceil-division: enough blocks to cover N elements, without launching a
    // surplus block when N is an exact multiple of the block size.
    const unsigned int BLOCKS = static_cast<unsigned int>((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);

    std::vector<int> a(N);
    std::vector<int> b(N);
    std::vector<int> out(N);

    // Generate input data
    for (size_t i = 0; i < N; i++) {
        a[i] = static_cast<int>(i * 2);
        b[i] = static_cast<int>(N - i);
    }

    int * devA;
    int * devB;
    int * devOut;

    // Allocate memory for the inputs and the output
    CHECK(cudaMalloc(&devA, BYTES));
    CHECK(cudaMalloc(&devB, BYTES));
    CHECK(cudaMalloc(&devOut, BYTES));

    // Copy the input data to the device
    CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice));

    // Launch the kernel
    basicSum<<<BLOCKS, THREADS_PER_BLOCK>>>(devA, devB, N, devOut);
    // A kernel launch returns no status itself: query launch errors first,
    // then wait for the kernel to finish to surface errors raised while it ran.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Copy the output data back to host
    CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost));

    // Free up the memory we allocated for the inputs and the output
    CHECK(cudaFree(devA));
    CHECK(cudaFree(devB));
    CHECK(cudaFree(devOut));

    // Test that the output matches our expectations
    size_t mismatches = 0;
    for (size_t i = 0; i < N; i++) {
        if (a[i] + b[i] != out[i]) {
            std::cout << "Incorrect sum: " << a[i] << " + " << b[i] << " = " << out[i] << " ?\n";
            mismatches++;
        }
    }

    std::cout << "Example finished" << std::endl;
    // Report verification failures through the exit status so scripts can detect them.
    return mismatches == 0 ? 0 : 1;
}
CMakeLists.txt used#
# Require CMake 3.17 or newer. FATAL_ERROR is accepted but ignored by CMake 2.6 and later.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
# Declare the project with CUDA as its only enabled language.
project(example_basic LANGUAGES CUDA)
# Build the example_basic executable from the single source file basic.cu.
add_executable(example_basic basic.cu)