
2015-16

MALLA REDDY ENGINEERING COLLEGE (AUTONOMOUS)


Course Code: 50531    L T P: - - 4    Credits: 2
B. Tech - VII Semester
ADVANCED COMPUTER ARCHITECTURE LAB

Objectives:
1. This course presents advanced concepts of computer organization and architecture, together
with process and environment behaviour aimed at minimizing resource usage.

2. To understand the principles that are shaping computing and the organizational paradigms that
determine the capabilities, performance and success of computer systems.

List of Programs:
1. Installation and configuring CUDA
2. Getting started with CUDA
A) Write a program for printing "Hello, world" using CUDA.
B) Write a program that adds two numbers together using a kernel function.
3. Write a program that implements the following atomic operations using CUDA:
i. Add (add), Sub (subtract), Inc (increment), Dec (decrement)
ii. And (bit-wise and), Or (bit-wise or), Xor (bit-wise exclusive or)
iii. Exch (exchange)
4. How are Heterogeneous Computing concepts implemented in CUDA? Write a program
for the same.
5. Write a program for data movement in CUDA.
6. Implement a parallel program using CUDA.
7. Write a program for the de Casteljau algorithm using CUDA.
8. Write a program for implementing a simple thread.
9. Illustrate how matrix multiplication is implemented with shared memory.
10. Illustrate how matrix multiplication is implemented without shared memory.
11. Implement asynchronous concurrent execution using streams.
12. Write a program for testing race conditions using CUDA.
Outcomes:
Upon completion of the course, the students are expected to:
• Understand and apply concepts and principles of cache memory and virtual memory to
high-performance computer architecture.
• Understand pipelining and its speed advantage, and design pipelined logic.
• Be proficient in fault-tolerant design techniques and examine various methods of error
detection and correction, such as TMR and Hamming codes.
• Identify tradeoffs between complex instruction set computers (CISC) and reduced
instruction set computers (RISC).
• Analyze and perform tradeoffs between the cost, performance, and reliability of
alternative computer architectures.

1. Installation and configuring CUDA


Prerequisites for CUDA installation:
i3 processor
8 GB RAM
NVIDIA graphics card inserted on the motherboard
Ubuntu 16.04 operating system
Internet connection
To install CUDA on Ubuntu 16.04, the command is
$ sudo apt-get install nvidia-cuda-toolkit
To check whether CUDA is installed on Ubuntu 16.04, the command is
$ nvcc --version
It then shows the version of CUDA installed on Ubuntu 16.04:
cse2@cse2-Vostro-3653:~$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
cse2@cse2-Vostro-3653:~$
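
After the toolkit reports its version, it is worth confirming that the runtime can actually see a GPU. The following is a minimal sketch of our own (not part of the original exercises) that only queries the device count:

#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
int count = 0;
cudaError_t err = cudaGetDeviceCount(&count); // ask the runtime how many GPUs it sees
if (err != cudaSuccess) {
printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
return 1;
}
printf("CUDA devices found: %d\n", count);
return 0;
}

It compiles and runs like every other program in this manual: $ nvcc verify.cu && ./a.out
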
2. Getting started with CUDA
A) Write a program for printing "Hello, world" using CUDA.
#include <stdio.h>

__device__ const char *STR = "HELLO WORLD!";
const char STR_LENGTH = 12;

__global__ void hello()
{
printf("%c\n", STR[threadIdx.x % STR_LENGTH]); // each thread prints one character
}

int main(void)
{
int num_threads = STR_LENGTH;
int num_blocks = 1;
hello<<<num_blocks,num_threads>>>(); // one block of 12 threads, one per character
cudaDeviceSynchronize(); // wait for the device-side printf output
return 0;
}
WEEK 2A Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc week2a.cu
cse1@cse1-Vostro-3653:~$ ./a.out
H
E
L
L
O

W
O
R
L
D
!
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

B) Write a program that adds two numbers together using a kernel function:
#include <stdio.h>
__global__ void add(int a, int b, int *c)
{
*c = a + b; // the device computes the sum into device memory
}
int main()
{
int a,b,c;
int *dev_c;
printf("enter a and b values");
scanf(" %d%d",&a,&b);
cudaMalloc((void**)&dev_c, sizeof(int)); // the result lives on the device
add<<<1,1>>>(a,b,dev_c); // one thread is enough for a single addition
cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d + %d is %d\n", a, b, c);
cudaFree(dev_c);
return 0;
}

WEEK 2B Output
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ gedit week2b.cu
cse1@cse1-Vostro-3653:~$ nvcc week2b.cu
cse1@cse1-Vostro-3653:~$ ./a.out
enter a and b values3
4
3 + 4 is 7
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
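
The program above does not check the return codes of cudaMalloc and cudaMemcpy. A small convenience macro can save debugging time in the later exercises; the sketch below is our own (the name CHECK is an assumption, not a toolkit facility):

#include <stdio.h>
#include <stdlib.h>
// CHECK is our own helper macro, not part of the CUDA toolkit
#define CHECK(call) do { \
cudaError_t e = (call); \
if (e != cudaSuccess) { \
fprintf(stderr, "CUDA error %s at line %d\n", cudaGetErrorString(e), __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
// usage: CHECK(cudaMalloc((void**)&dev_c, sizeof(int)));

Week 9 defines a similar macro, funcCheck, for the same purpose.
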
3. Write a program that implements the following atomic operations using CUDA: i. Add (add)

#include <stdio.h>
#include "cuda.h"
__global__ void Sum( int *sum , int size, int* index)
{
register int i = atomicAdd(index,1); // atomically claim a unique slot in sum[]
int idx = blockIdx.x * blockDim.x + threadIdx.x;
sum[i] = idx;
}
int main(int argc, char* argv[])
{
int W = 5;
int H = 5;
int *hSum ,*dSum , size = 5;
int* d_index=0;
int h_index=0;
hSum = (int*)malloc(sizeof(int)*W*H);
memset( hSum, 0, sizeof(int)*W*H);
cudaMalloc( (void**) &dSum, sizeof(int)*W*H );
cudaMalloc( (void**) &d_index, sizeof(int) );
cudaMemcpy(dSum, hSum , sizeof(int)*W*H, cudaMemcpyHostToDevice);
cudaMemcpy(d_index, &h_index , sizeof(int), cudaMemcpyHostToDevice);
Sum<<<W,H>>>( dSum , size, d_index );
cudaMemcpy(hSum, dSum, sizeof(int)*W*H, cudaMemcpyDeviceToHost);
cudaMemcpy(&h_index , d_index, sizeof(int), cudaMemcpyDeviceToHost);
fprintf(stderr, "%d\n", h_index);
for( int i=0; i<W*H; ++i )
fprintf( stdout, " %d %d\n", i, hSum[i] );
free(hSum);
cudaFree(dSum);
cudaFree(d_index);
return 0;
}
WEEK 3 Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc week3.cu
cse1@cse1-Vostro-3653:~$ ./a.out
25
0 5
1 6
2 7
3 8
4 9
5 0
6 1
7 2
8 3
9 4
10 15
11 16
12 17
13 18
14 19
15 10
16 11
17 12
18 13
19 14
20 20
21 21
22 22
23 23
24 24
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
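
The program above exercises only atomicAdd; parts i-iii of the problem statement also ask for Sub, Inc, Dec, And, Or, Xor and Exch. The following is a minimal sketch of our own covering the remaining intrinsics (the initial values and the 1024 wrap limit are arbitrary assumptions):

#include <stdio.h>
__global__ void atomics(int *v, unsigned int *u)
{
atomicSub(&v[0], 1); // Sub: subtract 1
atomicInc(&u[0], 1024); // Inc: increment, wrapping to 0 above the limit
atomicDec(&u[1], 1024); // Dec: decrement, wrapping to the limit below 0
atomicAnd(&v[1], 0x0F); // And: bit-wise and
atomicOr(&v[2], 0xF0); // Or: bit-wise or
atomicXor(&v[3], 0xFF); // Xor: bit-wise exclusive or
atomicExch(&v[4], threadIdx.x); // Exch: exchange, last writer wins
}
int main(void)
{
int h_v[5] = {100, 0xFF, 0x0F, 0xAA, -1}, *d_v;
unsigned int h_u[2] = {0, 5}, *d_u;
cudaMalloc((void**)&d_v, sizeof(h_v));
cudaMalloc((void**)&d_u, sizeof(h_u));
cudaMemcpy(d_v, h_v, sizeof(h_v), cudaMemcpyHostToDevice);
cudaMemcpy(d_u, h_u, sizeof(h_u), cudaMemcpyHostToDevice);
atomics<<<1, 32>>>(d_v, d_u);
cudaMemcpy(h_v, d_v, sizeof(h_v), cudaMemcpyDeviceToHost);
cudaMemcpy(h_u, d_u, sizeof(h_u), cudaMemcpyDeviceToHost);
printf("sub=%d and=%#x or=%#x xor=%#x exch=%d inc=%u dec=%u\n",
h_v[0], h_v[1], h_v[2], h_v[3], h_v[4], h_u[0], h_u[1]);
cudaFree(d_v); cudaFree(d_u);
return 0;
}
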

4. How are Heterogeneous Computing concepts implemented in CUDA? Write a program
for the same.
#include<stdio.h>
#define N 16
__global__ void increment_gpu(float *a, float b, int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n)
a[idx] = a[idx] + b; // device (GPU) side of the computation
}
int main(void)
{
float h_a[N] = {0}, *d_a, b = 1.0f;
int c=5,d=7;
cudaMalloc((void**)&d_a, N*sizeof(float));
cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice);
increment_gpu<<<4,4>>>(d_a, b, N); // GPU work is launched asynchronously...
printf("%d\n",c+d); // ...so this host (CPU) work runs concurrently with it
cudaMemcpy(h_a, d_a, N*sizeof(float), cudaMemcpyDeviceToHost); // implicit sync
cudaFree(d_a);
return 0;
}

WEEK 4 Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ gedit week4.cu
cse1@cse1-Vostro-3653:~$ nvcc week4.cu
cse1@cse1-Vostro-3653:~$ ./a.out
12
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////

5. Write a program for data movement in CUDA

#include<stdio.h>
#include<string.h>
int main()
{
const unsigned int X=1048576; // 1M elements (4 MB of int data)
printf("The value of X in Host %d\n",X);
const unsigned int bytes = X*sizeof(int);
int *hostArray= (int*)malloc(bytes);
int *deviceArray;
cudaMalloc((void**)&deviceArray,bytes);
memset(hostArray,0,bytes);
cudaMemcpy(deviceArray,hostArray,bytes,cudaMemcpyHostToDevice); // host -> device
printf("The value of X moved from host to Device%d\n",X);
cudaMemcpy(hostArray,deviceArray,bytes,cudaMemcpyDeviceToHost); // device -> host
printf("The value of X moved from Device to host %d\n",X);
free(hostArray);
cudaFree(deviceArray);
return 0;
}

WEEK 5 Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc week5.cu
cse1@cse1-Vostro-3653:~$ ./a.out
The value of X in Host 1048576
The value of X moved from host to Device1048576
The value of X moved from Device to host 1048576
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
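
The transfers above go through pageable host memory; page-locked (pinned) memory usually moves data faster and is required for asynchronous copies. A variant of the same exercise using cudaMallocHost (a sketch of ours, not from the manual):

#include <stdio.h>
#include <string.h>
int main()
{
const unsigned int X = 1048576;
const unsigned int bytes = X * sizeof(int);
int *pinnedArray, *deviceArray;
cudaMallocHost((void**)&pinnedArray, bytes); // page-locked host buffer
cudaMalloc((void**)&deviceArray, bytes);
memset(pinnedArray, 0, bytes);
cudaMemcpy(deviceArray, pinnedArray, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(pinnedArray, deviceArray, bytes, cudaMemcpyDeviceToHost);
cudaFreeHost(pinnedArray); // pinned memory must be freed with cudaFreeHost
cudaFree(deviceArray);
return 0;
}
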

6. Implement a parallel program using CUDA

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define N (1024*1024)
// fill an array with pseudo-random values below 5000
void random_ints(int *a, int n)
{
int i;
for (i = 0; i < n; ++i)
a[i] = rand() %5000;
}
// Vector addition on the device: each block handles one element
__global__ void add1(int *a, int *b, int *c)
{
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
int main(void)
{
int *a, *b, *c;
// host copies of a, b, c
int *d_a, *d_b, *d_c;
// device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size);
printf("\n the two numbers are %d\t,%d\t",*a,*b);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch the add1() kernel on the GPU with N blocks, one element per block
add1<<<N,1>>>(d_a, d_b, d_c);
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
printf("\n result is %d",*c);
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
WEEK6 Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
the two numbers are 4383 ,4562
result is 8945
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
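The kernel above needs one block per element, so the launch configuration must match N exactly. A grid-stride loop (a common CUDA idiom, sketched here as a drop-in replacement for add1) lets any launch size cover the whole vector:

__global__ void addStride(const int *a, const int *b, int *c, int n)
{
// each thread strides through the vector, so any grid size covers all n elements
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
c[i] = a[i] + b[i];
}

A launch such as addStride<<<128,256>>>(d_a, d_b, d_c, N) would then process all N elements.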
7. Write a program for the de Casteljau algorithm using CUDA

#include <stdio.h>
void WaitForEnter()
{
printf("Press Enter to quit");
fflush(stdout);
getchar();
}

// degree 1: linear interpolation between a and b
float mix(float a, float b, float t)
{
return a * (1.0f - t) + b*t;
}

// degree 2
float BezierQuadratic(float A, float B, float C, float t)
{
float AB = mix(A, B, t);
float BC = mix(B, C, t);
return mix(AB, BC, t);
}

// degree 3
float BezierCubic(float A, float B, float C, float D, float t)
{
float ABC = BezierQuadratic(A, B, C, t);
float BCD = BezierQuadratic(B, C, D, t);
return mix(ABC, BCD, t);
}

// degree 4
float BezierQuartic(float A, float B, float C, float D, float E, float t)
{
float ABCD = BezierCubic(A, B, C, D, t);
float BCDE = BezierCubic(B, C, D, E, t);
return mix(ABCD, BCDE, t);
}

// degree 5
float BezierQuintic(float A, float B, float C, float D, float E, float F, float t)
{
float ABCDE = BezierQuartic(A, B, C, D, E, t);
float BCDEF = BezierQuartic(B, C, D, E, F, t);
return mix(ABCDE, BCDEF, t);
}

// degree 6
float BezierSextic(float A, float B, float C, float D, float E, float F, float G, float t)
{
float ABCDEF = BezierQuintic(A, B, C, D, E, F, t);
float BCDEFG = BezierQuintic(B, C, D, E, F, G, t);
return mix(ABCDEF, BCDEFG, t);
}

int main(int argc, char **argv)
{
struct SPoint
{
float x;
float y;
};

SPoint controlPoints[7] =
{
{ 0.0f, 1.1f },
{ 2.0f, 8.3f },
{ 0.5f, 6.5f },
{ 5.1f, 4.7f },
{ 3.3f, 3.1f },
{ 1.4f, 7.5f },
{ 2.1f, 0.0f },
};

// calculate some points on a sextic curve!
const int c_numPoints = 10;
for (int i = 0; i < c_numPoints; ++i)
{
float t = ((float)i) / (float(c_numPoints - 1));
SPoint p;
p.x = BezierSextic(controlPoints[0].x, controlPoints[1].x, controlPoints[2].x,
controlPoints[3].x, controlPoints[4].x, controlPoints[5].x, controlPoints[6].x, t);
p.y = BezierSextic(controlPoints[0].y, controlPoints[1].y, controlPoints[2].y,
controlPoints[3].y, controlPoints[4].y, controlPoints[5].y, controlPoints[6].y, t);
printf("point at time %0.2f = (%0.2f, %0.2f)\n", t, p.x, p.y);
}

WaitForEnter();
}
Week7 output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse2@cse2-Vostro-3653:~$ nvcc ee.cu
cse2@cse2-Vostro-3653:~$ ./a.out
point at time 0.00 = (0.00, 1.10)
point at time 0.11 = (0.90, 4.46)
point at time 0.22 = (1.50, 5.73)
point at time 0.33 = (2.11, 5.83)
point at time 0.44 = (2.65, 5.45)
point at time 0.56 = (2.93, 5.01)
point at time 0.67 = (2.83, 4.70)
point at time 0.78 = (2.43, 4.34)
point at time 0.89 = (2.03, 3.24)
point at time 1.00 = (2.10, 0.00)
Press Enter to quit^C
cse2@cse2-Vostro-3653:~$
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
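Note that the program above runs entirely on the host despite the exercise title. A minimal sketch of how the evaluation could be parallelised with one thread per curve point (the kernel name and the restriction to the quadratic case are our simplification):

__device__ float mixd(float a, float b, float t)
{
return a * (1.0f - t) + b * t; // degree-1 de Casteljau step
}

__global__ void bezierQuadraticKernel(float A, float B, float C, float *out, int numPoints)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < numPoints) {
float t = (float)i / (float)(numPoints - 1); // each thread evaluates its own t
out[i] = mixd(mixd(A, B, t), mixd(B, C, t), t); // degree-2 de Casteljau
}
}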
8. Write a program for implementing a Simple Thread.

#include <stdio.h>
#include "cuda.h"
__global__ void Sum( int *sum , int size, int* index)
{
register int i = atomicAdd(index,1); // atomically claim a unique slot in sum[]
int idx = blockIdx.x * blockDim.x + threadIdx.x;
sum[i] = idx;
}
int main(int argc, char* argv[])
{
int W = 5;
int H = 5;
int *hSum ,*dSum , size = 5;
int* d_index=0;
int h_index=0;
hSum = (int*)malloc(sizeof(int)*W*H);
memset( hSum, 0, sizeof(int)*W*H);
cudaMalloc( (void**) &dSum, sizeof(int)*W*H );
cudaMalloc( (void**) &d_index, sizeof(int) );
cudaMemcpy(dSum, hSum , sizeof(int)*W*H, cudaMemcpyHostToDevice);
cudaMemcpy(d_index, &h_index , sizeof(int), cudaMemcpyHostToDevice);
Sum<<<W,H>>>( dSum , size, d_index );
cudaMemcpy(hSum, dSum, sizeof(int)*W*H, cudaMemcpyDeviceToHost);
cudaMemcpy(&h_index , d_index, sizeof(int), cudaMemcpyDeviceToHost);
fprintf(stderr, "%d\n", h_index);
for( int i=0; i<W*H; ++i )
fprintf( stdout, " %d %d\n", i, hSum[i] );
free(hSum);
cudaFree(dSum);
cudaFree(d_index);
return 0;
}
WEEK 8 Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc week8.cu
cse1@cse1-Vostro-3653:~$ ./a.out
25
0 5
1 6
2 7
3 8
4 9
5 0
6 1
7 2
8 3
9 4
10 15
11 16
12 17
13 18
14 19
15 10
16 11
17 12
18 13
19 14
20 20
21 21
22 22
23 23
24 24
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
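
Since this week reuses the week-3 program, a more direct illustration of a simple thread may help; the sketch below (ours, not from the manual) simply has every thread report its own coordinates:

#include <stdio.h>
__global__ void whoami(void)
{
// each thread prints its block and thread index
printf("block %d thread %d\n", blockIdx.x, threadIdx.x);
}
int main(void)
{
whoami<<<2, 4>>>(); // 2 blocks of 4 threads
cudaDeviceSynchronize(); // wait for the device-side printf to flush
return 0;
}
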

9. Illustrate how matrix multiplication is implemented with shared memory.

#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>

// This code assumes that your device support block size of 1024
#define MAX_RANGE 9999

#define funcCheck(stmt) do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
printf( "Failed to run stmt %d ", __LINE__); \
printf( "Got CUDA error ... %s ", cudaGetErrorString(err)); \
return -1; \
} \
} while(0)

// Compute C = A * B
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
__shared__ float sA[32][32]; // Tile size of 32x32
__shared__ float sB[32][32];

int Row = blockDim.y*blockIdx.y + threadIdx.y;
int Col = blockDim.x*blockIdx.x + threadIdx.x;
float Cvalue = 0.0;
sA[threadIdx.y][threadIdx.x] = 0.0;
sB[threadIdx.y][threadIdx.x] = 0.0;

for (int k = 0; k < (((numAColumns - 1)/ 32) + 1); k++)
{
if ( (Row < numARows) && (threadIdx.x + (k*32)) < numAColumns)
{
sA[threadIdx.y][threadIdx.x] = A[(Row*numAColumns) + threadIdx.x + (k*32)];
}
else
{
sA[threadIdx.y][threadIdx.x] = 0.0;
}
if ( Col < numBColumns && (threadIdx.y + k*32) < numBRows)
{
sB[threadIdx.y][threadIdx.x] = B[(threadIdx.y + k*32)*numBColumns + Col];
}
else
{
sB[threadIdx.y][threadIdx.x] = 0.0;
}
__syncthreads(); // wait until both tiles are fully loaded

for (int j = 0; j < 32; ++j)
{
Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x];
}
__syncthreads(); // ensure the tile is consumed before the next iteration reloads it
}
if (Row < numCRows && Col < numCColumns)
{
C[Row*numCColumns + Col] = Cvalue;
}
}

void matMultiplyOnHost(float * A, float * B, float * C, int numARows,
int numAColumns, int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
for (int i=0; i < numCRows; i++)
{
for (int j = 0; j < numCColumns; j++)
{
C[i*numCColumns + j ] = 0.0;
for (int k = 0; k < numAColumns; k++) // k runs over the shared dimension
{
C[i*numCColumns + j ] += A[i*numAColumns + k] * B [k*numBColumns + j];
}
}
}
return;
}

int main(int argc, char ** argv) {
float * hostA; // The A matrix
float * hostB; // The B matrix
float * hostC; // The output C matrix
float * hostComputedC;
float * deviceA;
float * deviceB;
float * deviceC;

// Please adjust rows and columns according to your need.
int numARows = 512; // number of rows in the matrix A
int numAColumns = 512; // number of columns in the matrix A
int numBRows = 512; // number of rows in the matrix B
int numBColumns = 512; // number of columns in the matrix B

int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)

hostA = (float *) malloc(sizeof(float)*numARows*numAColumns);
hostB = (float *) malloc(sizeof(float)*numBRows*numBColumns);

for (int i = 0; i < numARows*numAColumns; i++)
{
hostA[i] = (rand() % MAX_RANGE) / 2.0;
}
for (int i = 0; i < numBRows*numBColumns; i++)
{
hostB[i] = (rand() % MAX_RANGE) / 2.0;
}

// Setting numCRows and numCColumns
numCRows = numARows;
numCColumns = numBColumns;

hostC = (float *) malloc(sizeof(float)*numCRows*numCColumns);
hostComputedC = (float *) malloc(sizeof(float)*numCRows*numCColumns);

// Allocating GPU memory
funcCheck(cudaMalloc((void **)&deviceA, sizeof(float)*numARows*numAColumns));
funcCheck(cudaMalloc((void **)&deviceB, sizeof(float)*numBRows*numBColumns));
funcCheck(cudaMalloc((void **)&deviceC, sizeof(float)*numCRows*numCColumns));

// Copy memory to the GPU
funcCheck(cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns,
cudaMemcpyHostToDevice));
funcCheck(cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns,
cudaMemcpyHostToDevice));

// Initialize the grid and block dimensions
dim3 dimBlock(32, 32, 1);
dim3 dimGrid((numCColumns/32) + 1, (numCRows/32) + 1, 1);
//@@ Launch the GPU Kernel here
matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

cudaError_t err1 = cudaPeekAtLastError();
cudaDeviceSynchronize();
printf( "Got CUDA error ... %s \n", cudaGetErrorString(err1));

// Copy the results in GPU memory back to the CPU
funcCheck(cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns,
cudaMemcpyDeviceToHost));

matMultiplyOnHost(hostA, hostB, hostComputedC, numARows, numAColumns,
numBRows, numBColumns, numCRows, numCColumns);

for (int i=0; i < numCColumns*numCRows; i++)
{
if (hostComputedC[i] != hostC[i] )
{
printf("Mismatch at Row = %d Col = %d hostComputed[] = %f --device[] %f\n", i
/ numCColumns, i % numCColumns, hostComputedC[i], hostC[i]);
break;
}
}
// Free the GPU memory
funcCheck(cudaFree(deviceA));
funcCheck(cudaFree(deviceB));
funcCheck(cudaFree(deviceC));

free(hostA);
free(hostB);
free(hostC);
free(hostComputedC);

return 0;
}

WEEK 9 Output
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc sm9.cu
cse1@cse1-Vostro-3653:~$ ./a.out
Got CUDA error ... no error
Mismatch at Row = 0 Col = 0 hostComputed[] = 3429991680.000000 --device[]
3429916160.000000
cse1@cse1-Vostro-3653:~$

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
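
The reported mismatch is expected: the GPU accumulates the 512-element dot products in a different order than the CPU, so the exact equality test trips on floating-point rounding, not on a logic error. A tolerant comparison (a sketch of ours; the 1e-4 relative threshold is an arbitrary choice) avoids the false alarm:

#include <math.h>
// returns 1 when a and b agree to within a relative tolerance
static int nearlyEqual(float a, float b)
{
float diff = fabsf(a - b);
float scale = fmaxf(fabsf(a), fabsf(b));
return diff <= 1e-4f * scale;
}

In the verification loop, replace hostComputedC[i] != hostC[i] with !nearlyEqual(hostComputedC[i], hostC[i]).
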

10. Illustrate how matrix multiplication is implemented without shared memory.
#include "stdio.h"
#define COLUMNS 3
#define ROWS 2
// NOTE: this kernel performs element-wise addition with 2-D block indexing;
// a true multiplication kernel is sketched after the output below
__global__ void add(int *a, int *b, int *c)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
}
int main()
{
int a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
for (int y = 0; y < ROWS; y++)
// Fill Arrays
for (int x = 0; x < COLUMNS; x++)
{
a[y][x] = x; b[y][x] = y;
}
cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int), cudaMemcpyHostToDevice);
dim3 grid(COLUMNS,ROWS);
add<<<grid,1>>>(dev_a, dev_b, dev_c); // one block per element of the 2-D grid
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int), cudaMemcpyDeviceToHost);
for (int y = 0; y < ROWS; y++)
// Output Arrays
{
for (int x = 0; x < COLUMNS; x++)
{
printf("[%d][%d]=%d ",y,x,c[y][x]);
}
printf("\n");
}
return 0;
}
Week 10 output:
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[0][0]=0 [0][1]=1 [0][2]=2
[1][0]=1 [1][1]=2 [1][2]=3
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
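As noted in the kernel comment, the program above performs element-wise addition; true matrix multiplication without shared memory computes one dot product per thread directly from global memory. A sketch of ours (assuming square N×N matrices, which the 2×3 arrays above are not):

__global__ void matMul(const float *A, const float *B, float *C, int N)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < N && col < N) {
float sum = 0.0f;
for (int k = 0; k < N; ++k)
sum += A[row * N + k] * B[k * N + col]; // dot product of row and column
C[row * N + col] = sum;
}
}

This is the same computation as the week-9 kernel, minus the shared-memory tiles, so every operand is re-read from global memory.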
11. Implement Asynchronous Concurrent Execution using streams
const int N = 1 << 20;

__global__ void kernel(float *x, int n)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
x[i] = sqrt(pow(3.14159,i)); // some arbitrary work per element
}
}

int main()
{
const int num_streams = 8;

cudaStream_t streams[num_streams];
float *data[num_streams];

for (int i = 0; i < num_streams; i++) {
cudaStreamCreate(&streams[i]);
cudaMalloc(&data[i], N * sizeof(float));

// launch one worker kernel per stream
kernel<<<1, 64, 0, streams[i]>>>(data[i], N);

// launch a dummy kernel on the default stream; with the legacy default
// stream this serializes the workers, which is what the exercise shows
kernel<<<1, 1>>>(0, 0);
}

cudaDeviceReset(); // waits for all work, then releases streams and memory

return 0;
}
WEEK 11 Output
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc w11.cu
cse1@cse1-Vostro-3653:~$ ./a.out
cse1@cse1-Vostro-3653:~$ nvcc ./w11.cu -o stream_legacy
cse1@cse1-Vostro-3653:~$
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
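
The program above only overlaps kernels; streams also let copies overlap computation when the host buffer is pinned. A self-contained sketch of ours using cudaMemcpyAsync:

#include <cuda_runtime.h>

__global__ void scale(float *x, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) x[i] *= 2.0f;
}

int main(void)
{
const int n = 1 << 20;
float *h_a, *d_a, *d_b;
cudaStream_t s0, s1;
cudaStreamCreate(&s0);
cudaStreamCreate(&s1);
cudaMallocHost((void**)&h_a, n * sizeof(float)); // pinned: required for truly async copies
cudaMalloc((void**)&d_a, n * sizeof(float));
cudaMalloc((void**)&d_b, n * sizeof(float));
cudaMemset(d_b, 0, n * sizeof(float));
cudaMemcpyAsync(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice, s0); // copy in s0
scale<<<(n + 255) / 256, 256, 0, s1>>>(d_b, n); // kernel in s1 can overlap the copy
cudaStreamSynchronize(s0);
cudaStreamSynchronize(s1);
cudaStreamDestroy(s0);
cudaStreamDestroy(s1);
cudaFreeHost(h_a);
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
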

12. Write a program for testing Race conditions using CUDA

#include<stdio.h>

__device__ void print_list(int* data){
if(threadIdx.x==0) {
for(int i=0; i<64; i++) {
printf("%d ", data[i]);
}
printf("\n\n");
}
}

__device__ void swap(int* a, int* b) {
int temp;
if(*a > *b) {
temp = *a;
*a = *b;
*b = temp;
}
}

// Odd-even transposition sort; it is only correct when the threads are
// synchronized between the two compare-and-swap phases.
__device__ void sort32(int* data) {
for(int i=0; i<16; i++) {
swap(&data[2*threadIdx.x],&data[(2*threadIdx.x)+1]);
// __syncthreads();
swap(&data[(2*threadIdx.x)+1],&data[(2*threadIdx.x)+2]);
// __syncthreads();
// These syncthreads cause the program to produce the correct result
}
}

__global__ void testKernel(void) {
__shared__ int data[65];

// Generate some un-sorted data...
data[2*threadIdx.x] = threadIdx.x+20;
data[(2*threadIdx.x)+1] = 100-threadIdx.x;
data[64]=99999; // easier than dealing with edge case

sort32(data);

print_list(data); // Should be sorted here, but the race leaves it unsorted.
}

int main(void) {
testKernel<<<1,32>>>(); // Just 1 warp!
printf("%d\n", cudaDeviceSynchronize());
return 0;
}
WEEK 12 Output
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
cse1@cse1-Vostro-3653:~$ nvcc w12a.cu
cse1@cse1-Vostro-3653:~$ ./a.out
20 21 100 22 99 23 98 24 97 25 96 26 95 27 94 28 93 29 92 30 91 31 90 32 89 33 88 34 87 35 86
36 85 37 84 38 83 39 82 40 81 41 80 42 79 43 78 44 77 45 76 46 75 47 74 48 73 49 72 50 71 51
70 69
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
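
Races like this one can also be detected automatically. On the CUDA 7.5 toolchain used in this manual, the cuda-memcheck utility ships a race checker; a typical invocation (its output is not reproduced here) is:

$ nvcc -lineinfo w12a.cu
$ cuda-memcheck --tool racecheck ./a.out

Re-running after uncommenting the two __syncthreads() calls should make the hazard reports disappear along with the incorrect output.
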
