Skip to content

SCALE documentation

Basic example

Basic example#

This is simple vector-sum kernel using CUDA.

The example:

Generates test data on the host
Sends data to the device
Launches a kernel on the device
Receives data back from the device
Checks that the data is correct

Build and run the example by following the general instructions.

Example source code#

#include <vector>
#include <iostream>


// The kernel we are going to launch
__global__ void basicSum(const int * a, const int * b, size_t n, int * out) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < n)
    {
        out[idx] = a[idx] + b[idx];
    }
}


// A generic helper function to simplify error handling.
void check(cudaError_t error, const char * file, size_t line) {
    if (error != cudaSuccess)
    {
        std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl;
        exit(1);
    }
}


// A wrapper for the helper function above to include the filename and line number
// where the error occurs into the output.
#define CHECK(error) check(error, __FILE__, __LINE__)


int main(int argc, char ** argv) {

    const size_t N = 4096;
    const size_t BYTES = N * sizeof(int);

    std::vector<int> a(N);
    std::vector<int> b(N);
    std::vector<int> out(N);

    // Generate input data
    for (size_t i = 0; i < N; i++) {
        a[i] = i * 2;
        b[i] = N - i;
    }

    int * devA;
    int * devB;
    int * devOut;

    // Allocate memory for the inputs and the output
    CHECK(cudaMalloc(&devA, BYTES));
    CHECK(cudaMalloc(&devB, BYTES));
    CHECK(cudaMalloc(&devOut, BYTES));

    // Copy the input data to the device
    CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice));

    // Launch the kernel
    basicSum<<<N / 256 + 1, 256>>>(devA, devB, N, devOut);
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaGetLastError());

    // Copy the output data back to host
    CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost));

    // Free up the memory we allocated for the inputs and the output
    CHECK(cudaFree(devA));
    CHECK(cudaFree(devB));
    CHECK(cudaFree(devOut));

    // Test that the output matches our expectations
    for (size_t i = 0; i < N; i++) {
        if (a[i] + b[i] != out[i]) {
            std::cout << "Incorrect sum: " << a[i] << " + " << b[i] << " = " << out[i] << " ?\n";
        }
    }

    std::cout << "Example finished" << std::endl;

    return 0;
}

`CMakeLists.txt` used#

cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(example_basic LANGUAGES CUDA)

add_executable(example_basic basic.cu)