You are on page 1 of 3

COURSE : BCS3413 Parallel Programming

STUDENT NAME : Azrul Bin Hazizan (B01150007)


ASSIGNMENT : Assignment Individual
SUBMISSION DATE : 23rd November 2017

ASSIGNMENT TOPIC : NVIDIA CUDA Programming

STUDENT DECLARATION

I declare that this material, which I now submit for assessment, is entirely my own work and has
not been taken from the work of others, save and to the extent that such work has been cited and
acknowledged within the text of my work.

I understand that plagiarism, collusion, and copying are grave and serious offences in the university
and accept the penalties that would be imposed should I engage in plagiarism, collusion or copying.
I have read and understood the Assignment Regulations set out in the assignment documentation.

I have identified and included the source of all facts, ideas, opinions, and viewpoints of others in
the assignment references. Direct quotations from books, journal articles, internet sources, module
text, or any other source whatsoever are acknowledged and the source cited are identified in the
assignment references.

This assignment, or any part of it, has not been previously submitted by me or any other person for
assessment on this or any other course of study.

DATE : 23rd November 2017


SIGNATURE :

MARKS :
COMMENT :
1. Write a CUDA code for adding two vectors based on the above code to be run on a GPGPU.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

/*
 * Element-wise vector addition kernel: C[i] = A[i] + B[i] for i in [0, n).
 *
 * Expects a 1-D grid of 1-D blocks. Uses a grid-stride loop, so any launch
 * configuration is correct (including a single block for debugging); each
 * thread handles every n-th element starting at its global index.
 * __restrict__ promises the three arrays do not alias, enabling the
 * read-only data cache for A and B.
 */
__global__ void vectorAdd(const double *__restrict__ A,
                          const double *__restrict__ B,
                          double *__restrict__ C, int n)
{
    int stride = blockDim.x * gridDim.x;

    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride)
    {
        C[i] = A[i] + B[i];
    }
}

/*
 * Helper: abort with a descriptive message if a CUDA runtime call failed.
 * Kernel launches and earlier async work report errors through the same
 * cudaError_t channel, so every runtime call is routed through here.
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds two n-element vectors with h_A[i] = sin^2(i) and
 * h_B[i] = cos^2(i), adds them on the GPU, verifies every element of the
 * result is 1.0 (since sin^2 + cos^2 == 1), and prints the mean (expected
 * to be 1.000000). Returns 0 on success, exits with EXIT_FAILURE on any
 * allocation, CUDA, or verification error.
 */
int main(void)
{
    int n = 100000;
    size_t bytes = n * sizeof(double);
    printf("[Vector addition of %d elements]\n", n);

    /* Host allocations -- malloc can fail, so check all three pointers. */
    double *h_A = (double *)malloc(bytes);
    double *h_B = (double *)malloc(bytes);
    double *h_C = (double *)malloc(bytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < n; ++i)
    {
        h_A[i] = sin(i) * sin(i);
        h_B[i] = cos(i) * cos(i);
    }

    /* Device allocations, each checked individually. */
    double *d_A = NULL;
    double *d_B = NULL;
    double *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_A, bytes), "cudaMalloc d_A");
    checkCuda(cudaMalloc((void **)&d_B, bytes), "cudaMalloc d_B");
    checkCuda(cudaMalloc((void **)&d_C, bytes), "cudaMalloc d_C");

    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "copy B to device");

    int threadsPerBlock = 256;
    /* Ceiling division so the grid covers all n elements. */
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n);
    /* Launches are async and return no status; fetch launch errors explicitly. */
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    printf("Copy output data from the CUDA device to the host memory\n");
    /* This blocking copy also synchronizes with the kernel's completion. */
    checkCuda(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "copy C to host");

    /* Verify: sin^2(i) + cos^2(i) == 1, so each element must be ~1.0. */
    for (int i = 0; i < n; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d\n", i);
            exit(EXIT_FAILURE);
        }
    }

    double sum = 0;
    for (int i = 0; i < n; i++)
    {
        sum += h_C[i];
    }
    sum = sum / n;
    printf("Final result: %f\n", sum);

    /* Release device memory. */
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    /* Free host memory. */
    free(h_A);
    free(h_B);
    free(h_C);

    /* Reset the device and exit. */
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");

    printf("Done\n");
    return 0;
}

Output:

You might also like