Nothing Special   »   [go: up one dir, main page]

Input: Output: 1. Sub String Program

Download as doc, pdf, or txt
Download as doc, pdf, or txt
You are on page 1 of 8

#define SIZE 4

/*
 * Accumulates the dot product of a and b into *c.
 *
 * Launch layout: thread i (threadIdx.x) multiplies a[i] by b[i], so the
 * block size must equal the number of elements; the kernel takes no length
 * argument and therefore performs no bounds check.
 * *c is added to, not overwritten: the caller must initialise it (normally
 * to 0) before the launch.
 */
__global__ void dotProduct(int *a, int *b, int *c)
{
    int i = threadIdx.x;

    // Every thread updates the same address. A plain "*c += ..." is an
    // unsynchronised read-modify-write, so concurrent threads can overwrite
    // each other's contribution. atomicAdd makes each update indivisible.
    atomicAdd(c, a[i] * b[i]);
}

/*
 * Host driver for dotProduct: uploads two SIZE-element vectors and a zeroed
 * accumulator, launches one block of SIZE threads, and prints the result.
 * Returns 0 on success, 1 if the launch or the result copy fails.
 */
int main()
{
    int a[SIZE] = {1, 2, 3, 4};
    int b[SIZE] = {1, 2, 3, 4};
    int c = 0;

    int *da = 0, *db = 0, *dc = 0;
    int size = SIZE * sizeof(int);

    cudaMalloc((void **)&da, size);
    cudaMalloc((void **)&db, size);
    cudaMalloc((void **)&dc, sizeof(int));

    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    // The kernel accumulates into *dc, so the accumulator must start at 0.
    cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

    dotProduct<<<1, SIZE>>>(da, db, dc);

    // A bad launch configuration is only reported through cudaGetLastError;
    // the blocking copy below then surfaces any fault raised while running.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);

    // Release device buffers on every path (the original leaked them).
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    if (status != cudaSuccess)
    {
        cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    cout<<"Dot Product : "<<c;
    return 0;
}

Input:
Output:
Dot Product : 30

1. Sub String

Program:

/*
 * Counts occurrences of the pattern db (length lenb) in the text da
 * (length lena). Overlapping occurrences are counted separately.
 *
 * Launch layout: thread threadIdx.x tests the alignment that starts at text
 * index threadIdx.x. Threads whose alignment would run past the end of the
 * text return immediately, so launching more threads than valid start
 * positions is harmless.
 * *count is incremented once per match; the caller must zero it first.
 */
__global__ void subString(char *da, char *db, int lena, int lenb, int *count)
{
    int start = threadIdx.x;

    // Guard against reading da[] beyond lena for start positions near the end.
    if (start + lenb > lena)
        return;

    bool matches = true;
    for (int j = 0; j < lenb; j++)
    {
        if (da[start + j] != db[j])
        {
            matches = false;
            break;
        }
    }

    // The original "count++" advanced the local pointer and never touched the
    // counter. Several threads may match at once, so the increment must be
    // atomic.
    if (matches)
        atomicAdd(count, 1);
}
/*
 * Host driver for subString: counts how often b occurs in a and prints the
 * count. One thread is launched per candidate start position, all in a single
 * block, so the text length is limited by the device's maximum block size.
 * Returns 0 on success, 1 on a CUDA error.
 */
int main()
{
    char a[] = "HaiHelloHowru", b[] = "H";
    int lena = strlen(a), lenb = strlen(b);
    cout << a << "\t" << lena << endl << b << "\t" << lenb << endl;

    char *da = 0, *db = 0;
    int c = 0, *dc = 0;

    cudaMalloc((void**)&da, lena);
    cudaMalloc((void**)&db, lenb);
    cudaMalloc((void **)&dc, sizeof(int));

    cudaMemcpy(da, a, lena, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, lenb, cudaMemcpyHostToDevice);
    cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

    // Valid start positions are 0 .. lena - lenb inclusive, i.e.
    // lena - lenb + 1 of them. The original launched one thread too few and
    // could never report a match at the very end of the text.
    int positions = lena - lenb + 1;

    cudaError_t status = cudaSuccess;
    if (positions > 0)
    {
        // A pattern longer than the text cannot match; skipping the launch
        // also avoids a non-positive block size.
        subString << <1, positions >> >(da, db, lena, lenb, dc);
        status = cudaGetLastError();
    }

    // cudaMemcpy blocks until the kernel has finished, so no separate
    // cudaDeviceSynchronize is required before reading the result.
    if (status == cudaSuccess)
        status = cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    if (status != cudaSuccess)
    {
        cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    cout << "\nCount : " << c;

    return 0;
}

Input:
Output:
HaiHelloHowru 13 H 1
Count : 3

2. Matrix Multiplication

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
#define SIZE 20
#define TILESIZE 2
#define WINDOW 2

/*
 * Row-per-thread product of two SIZE x SIZE row-major int matrices.
 *
 * The global thread index along x selects the output row; threads whose
 * index is >= SIZE do nothing. Each product term is added onto the existing
 * contents of c, so c has to be zeroed by the caller for a plain a * b.
 */
__global__ void matMul(int *a, int *b, int *c)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= SIZE)
        return;

    int *outRow = c + row * SIZE;
    int *lhsRow = a + row * SIZE;

    for (int col = 0; col < SIZE; ++col)
    {
        for (int inner = 0; inner < SIZE; ++inner)
        {
            outRow[col] += lhsRow[inner] * b[inner * SIZE + col];
        }
    }
}

/*
 * 2D-launch product of two SIZE x SIZE row-major int matrices in which every
 * thread produces a WINDOW x WINDOW patch of the output.
 *
 * The thread's global (y, x) index scaled by WINDOW gives the top-left corner
 * of its patch; patches that start outside the matrix are skipped and patches
 * that straddle the edge are clipped to SIZE. Results are accumulated onto
 * the existing contents of c, so the caller must zero c beforehand.
 */
__global__ void matMul2D(int *a, int *b, int *c)
{
    const int rowBegin = (blockIdx.y * blockDim.y + threadIdx.y) * WINDOW;
    const int colBegin = (blockIdx.x * blockDim.x + threadIdx.x) * WINDOW;

    if (rowBegin >= SIZE || colBegin >= SIZE)
        return;

    // Clip the patch against the matrix border.
    const int rowEnd = (rowBegin + WINDOW < SIZE) ? rowBegin + WINDOW : SIZE;
    const int colEnd = (colBegin + WINDOW < SIZE) ? colBegin + WINDOW : SIZE;

    for (int row = rowBegin; row < rowEnd; ++row)
    {
        for (int col = colBegin; col < colEnd; ++col)
        {
            for (int inner = 0; inner < SIZE; ++inner)
            {
                c[row * SIZE + col] += a[row * SIZE + inner] * b[inner * SIZE + col];
            }
        }
    }
}
/*
 * Writes a SIZE x SIZE row-major int matrix to cout: every element is
 * followed by a tab and every row is terminated with endl.
 */
void printMatrix(int *a)
{
    for (int idx = 0; idx < SIZE * SIZE; ++idx)
    {
        cout << a[idx] << "\t";

        // Last column of the current row reached: finish the line.
        if ((idx + 1) % SIZE == 0)
            cout << endl;
    }
}
//
/*
 * Host driver for the matrix-multiplication kernels.
 *
 * Fills a with ones and b with 0, 1, 2, ... in row-major order, uploads them
 * together with a zeroed result buffer, runs matMul2D and copies the product
 * into d. The 1D matMul path and the printing of both results are kept as
 * commented-out code, as in the original.
 * Returns 0 on success, 1 on a CUDA error.
 */
int main()
{
    int a[SIZE*SIZE], b[SIZE*SIZE], c[SIZE*SIZE], d[SIZE*SIZE];
    int i, j, k = 0;

    for (i = 0; i<SIZE; i++)
        for (j = 0; j<SIZE; j++)
        {
            a[i*SIZE + j] = 1;
            b[i*SIZE + j] = k++;
            c[i*SIZE + j] = d[i*SIZE + j] = 0;
        }

    int *da = 0, *db = 0, *dc = 0;
    int size = SIZE*SIZE*sizeof(int);
    cudaMalloc(&da, size);
    cudaMalloc(&db, size);
    cudaMalloc(&dc, size);

    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    // The kernels accumulate with +=, so the result buffer is uploaded zeroed.
    cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice);

    /* dim3 block(TILESIZE);
    dim3 grid(SIZE/TILESIZE+1);
    matMul<<<grid,block>>>(da, db, dc);
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

    cudaMemcpy(dc, d, size, cudaMemcpyHostToDevice);*/

    // Each thread covers WINDOW output rows/columns and each block holds
    // TILESIZE threads per axis, so one block spans TILESIZE*WINDOW rows and
    // columns. Ceiling division gives exactly enough blocks; "/ ... + 1"
    // launched a surplus row and column of blocks whenever SIZE divided evenly.
    const int span = TILESIZE*WINDOW;
    const int blocksPerAxis = (SIZE + span - 1) / span;

    dim3 block2D(TILESIZE, TILESIZE);
    dim3 grid2D(blocksPerAxis, blocksPerAxis);
    matMul2D << <grid2D, block2D >> >(da, db, dc);

    // Launch-configuration errors are only visible through cudaGetLastError;
    // the blocking copy reports faults raised while the kernel was running.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaMemcpy(d, dc, size, cudaMemcpyDeviceToHost);

    // Release device buffers on every path (the original leaked them).
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    if (status != cudaSuccess)
    {
        cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
        return 1;
    }

    /* cout<<endl;
    printMatrix(c);
    cout<<endl;
    printMatrix(d);*/

    return 0;
}

Input
Matrix A
22222
22222
22222
22222
22222

Matrix B
11111
11111
11111
11111
11111

Output:
Matrix C
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10

3. Quick Sort
Program

#define N 5

using namespace std;

/*
 * One quicksort partitioning pass over several disjoint segments of x.
 *
 * Thread idx (threadIdx.x) owns the inclusive index range
 * [dfirst[idx], dlast[idx]] of x. It partitions that range around the
 * element at dfirst[idx] and then reports through list[idx]:
 *   0      - the range is empty, has one element, or is now in ascending order
 *   p + 1  - the range is still unsorted; p is the pivot's final index in x
 * The "+ 1" lets index 0 be told apart from "nothing to do".
 *
 * The block size must equal the number of segments, and the segments must
 * not overlap, because threads modify x without any synchronisation.
 */
__global__ void quickSort(int *x, int *dfirst, int *dlast, int *list)
{
    int idx = threadIdx.x;
    int first = dfirst[idx];
    int last = dlast[idx];
    list[idx] = 0;

    if (first < last)
    {
        int pivot = first;
        int i = first;
        int j = last;
        int temp;

        while (i < j)
        {
            // Move i right past elements not greater than the pivot ...
            while (x[i] <= x[pivot] && i < last)
                i++;
            // ... and j left past greater ones; the pivot itself, sitting at
            // "first", stops this scan inside the segment.
            while (x[j] > x[pivot])
                j--;
            if (i < j)
            {
                temp = x[i];
                x[i] = x[j];
                x[j] = temp;
            }
        }

        // Put the pivot at its final position j.
        temp = x[pivot];
        x[pivot] = x[j];
        x[j] = temp;

        // Check whether the segment is already sorted. Only adjacent pairs
        // inside [first, last] are compared: the original loop ran to
        // i == last and read x[last + 1], which is outside the array for the
        // final segment and outside this thread's segment otherwise.
        for (i = first; i < last; i++)
        {
            if (x[i] > x[i + 1])
            {
                list[idx] = j + 1;
                break;
            }
        }
    }
}

/*
 * Host driver for the quickSort kernel.
 *
 * Repeatedly launches the kernel with one thread per segment. After each
 * pass the pivot positions reported for still-unsorted segments are used as
 * the boundaries of the next pass's segments; the loop stops once no segment
 * reports itself as unsorted. Finally the sorted array is printed,
 * tab-separated.
 * Returns 0 on success, 1 on a CUDA error.
 */
int main()
{
    int a[N] = {1, 5, 9, 3, 6};
    int *da = 0;
    int len = 0;
    cudaError_t status = cudaSuccess;

    cudaMalloc(&da, N*sizeof(int));
    cudaMemcpy(da, a, N*sizeof(int), cudaMemcpyHostToDevice);

    vector<int> v;   // pivot indices reported by the previous pass

    while(true)
    {
        // len is v.size() at this point, so there are v.size() + 1 segments.
        ++len;
        int size = len*sizeof(int);

        // Host-side segment tables; vectors release themselves each pass
        // (the original malloc'ed three buffers per pass and never freed them).
        vector<int> first(len), last(len), list(len);

        first[0] = 0;
        last[len-1] = N-1;

        // Segment i ends just before pivot i, segment i+1 starts just after it.
        for(size_t s=0; s<v.size(); s++)
        {
            first[s+1] = v[s]+1;
            last[s] = v[s]-1;
        }

        int *dfirst = 0, *dlast = 0, *dlist = 0;
        cudaMalloc(&dfirst, size);
        cudaMalloc(&dlast, size);
        cudaMalloc(&dlist, size);
        cudaMemcpy(dfirst, &first[0], size, cudaMemcpyHostToDevice);
        cudaMemcpy(dlast, &last[0], size, cudaMemcpyHostToDevice);

        quickSort<<<1,len>>>(da, dfirst, dlast, dlist);

        status = cudaGetLastError();
        if(status == cudaSuccess)
            status = cudaMemcpy(&list[0], dlist, size, cudaMemcpyDeviceToHost);

        // Per-pass device buffers are released before the next pass
        // allocates new ones (the original leaked them on every iteration).
        cudaFree(dfirst);
        cudaFree(dlast);
        cudaFree(dlist);

        if(status != cudaSuccess)
            break;

        // Entries are pivot index + 1; 0 means that segment is finished.
        v.clear();
        for(int s=0; s<len; s++)
            if(list[s] != 0)
                v.push_back(list[s]-1);
        len = v.size();

        if(len == 0)
            break;
    }

    if(status == cudaSuccess)
        status = cudaMemcpy(a, da, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(da);

    if(status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for(int s=0; s<N; s++)
        printf("%d\t", a[s]);

    return 0;
}

Input:
1 5 9 3 6
Output:
1 3 5 6 9

4. Gauss Elimination

Program
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define N 4
#define M N+1

/*
 * One forward-elimination step of Gaussian elimination on the N x (N + 1)
 * row-major augmented matrix A, using row i as the pivot row.
 *
 * Blocks are distributed over the rows below the pivot (j = i + 1 .. N - 1,
 * stepping by gridDim.x) and the threads of a block over the columns
 * i .. N of that row (stepping by blockDim.x). Each row j has
 * ratio * row i subtracted from it, where ratio = A[j][i] / A[i][i].
 *
 * No pivoting is performed: A[i][i] is used as the divisor as-is.
 */
__global__ void ge(float *A, int i)
{
    for (int j = blockIdx.x + i + 1; j<N; j += gridDim.x)
    {
        float ratio = A[j*(N + 1) + i] / A[i*(N + 1) + i];

        // The thread handling column k == i overwrites A[j][i], the very
        // value every thread of the block has just read to form "ratio".
        // Without a barrier a slower thread could read the already-updated
        // element. The loop bounds depend only on block-uniform values, so
        // all threads of the block reach this barrier the same number of
        // times.
        __syncthreads();

        for (int k = threadIdx.x + i; k <= N; k += blockDim.x)
            A[j*(N + 1) + k] -= ratio*A[i*(N + 1) + k];
    }
}

/*
 * Prints the N x (N + 1) row-major matrix A with "%f " per element, one
 * matrix row per output line, followed by an empty line.
 */
void print(float *A)
{
    const int cols = N + 1;

    for (int row = 0; row < N; ++row)
    {
        const float *rowPtr = A + row * cols;
        for (int col = 0; col < cols; ++col)
        {
            printf("%f ", rowPtr[col]);
        }
        printf("\n");
    }
    printf("\n");
}

/*
 * Solves a 4 x 4 linear system given as an N x (N + 1) augmented matrix.
 *
 * Forward elimination runs on the device, one ge launch per pivot row; back
 * substitution runs on the host. The matrix is printed before and after the
 * elimination and the unknowns are printed as x3 .. x0.
 * Returns 0 on success, 1 on a CUDA error.
 */
int main()
{
    float A[N*(N + 1)] = { 2, 1, -1, 2, 5, 4, 5, -3, 6, 9, -2, 5, -2, 6,
        4, 4, 11, -4, 8, 2 };
    float *dev_a = 0;
    int size = N*(N + 1)*sizeof(float), i, j;

    print(A);

    cudaMalloc(&dev_a, size);
    cudaMemcpy(dev_a, A, size, cudaMemcpyHostToDevice);

    // Launches on the same stream execute in order, so step i + 1 sees the
    // rows produced by step i. One block per remaining row, one thread per
    // remaining column (including the right-hand side).
    cudaError_t status = cudaSuccess;
    for (i = 0; i<N - 1 && status == cudaSuccess; i++)
    {
        ge << <N - 1 - i, N - i + 1 >> >(dev_a, i);
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(A, dev_a, size, cudaMemcpyDeviceToHost);

    // Release the device buffer on every path (the original leaked it).
    cudaFree(dev_a);

    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    print(A);

    // Back substitution. Once x_i is known, column i of every row above is
    // multiplied by x_i, so those entries hold A[j][i] * x_i and later rows
    // only need to subtract them from their right-hand side.
    for (i = N - 1; i >= 0; i--)
    {
        for (j = N - 1; j >= i + 1; j--)
            A[i*(N + 1) + N] -= A[i*(N + 1) + j];

        // Divide by the diagonal element. The original indexed it through
        // the loop variable j, which only equals i because of how the loop
        // above terminates.
        A[i*(N + 1) + N] /= A[i*(N + 1) + i];
        printf("x%d=%.1f\n", i, A[i*(N + 1) + N]);

        for (j = i - 1; j >= 0; j--)
            A[j*(N + 1) + i] *= A[i*(N + 1) + N];
    }
    return 0;
}

You might also like