Nothing Special   »   [go: up one dir, main page]

HPC Codes-2

Download as pdf or txt
Download as pdf or txt
You are on page 1of 15

Assignment No. 1:
Design and implement Parallel Breadth-First Search and Depth First Search based on
existing algorithms using OpenMP. Use a Tree or an undirected graph for BFS and DFS.

Code:
%%cu
#include <iostream>
#include <vector>
#include <queue>
#include <omp.h>

using namespace std;

// Graph class representing the adjacency list


// Graph stored as an adjacency list, with OpenMP-parallel BFS and DFS
// traversals that print each vertex as it is visited.
//
// FIXES vs. the original:
//  - parallelDFSUtil checked/set `visited` concurrently from many threads;
//    vector<bool> is bit-packed, so concurrent writes to neighbouring bits
//    race. The check-and-set is now an atomic "claim" in a critical section,
//    which also prevents two threads from descending into the same subtree.
//  - parallelBFS pushed into a std::queue from inside a parallel for;
//    std::queue is not thread-safe. The mark-and-enqueue step is serialized.
class Graph {
    int V;                              // number of vertices (0..V-1)
    std::vector<std::vector<int>> adj;  // adj[v] = out-neighbours of v

public:
    // Construct a graph with V vertices and no edges.
    Graph(int V) : V(V), adj(V) {}

    // Add a directed edge v -> w (add both directions for an undirected graph).
    void addEdge(int v, int w) {
        adj[v].push_back(w);
    }

    // Parallel Depth-First Search from startVertex.
    // NOTE: with OpenMP enabled the visit order is nondeterministic.
    void parallelDFS(int startVertex) {
        std::vector<bool> visited(V, false);
        parallelDFSUtil(startVertex, visited);
    }

    // DFS worker: marks v visited, prints it, then explores its unvisited
    // neighbours in parallel, recursing into each one it successfully claims.
    void parallelDFSUtil(int v, std::vector<bool>& visited) {
        #pragma omp critical(dfs_state)
        visited[v] = true;
        // Serialize output so the "<int><space>" tokens never interleave.
        #pragma omp critical(dfs_output)
        std::cout << v << " ";

        #pragma omp parallel for
        for (int i = 0; i < (int)adj[v].size(); ++i) {
            int n = adj[v][i];
            bool claimed = false;
            // Atomically claim the neighbour: only one thread may win.
            #pragma omp critical(dfs_state)
            {
                if (!visited[n]) {
                    visited[n] = true;
                    claimed = true;
                }
            }
            if (claimed)
                parallelDFSUtil(n, visited);
        }
    }

    // Parallel Breadth-First Search from startVertex.
    void parallelBFS(int startVertex) {
        std::vector<bool> visited(V, false);
        std::queue<int> q;

        visited[startVertex] = true;
        q.push(startVertex);

        while (!q.empty()) {
            int v = q.front();
            q.pop();
            std::cout << v << " ";

            // Scan this level's neighbours in parallel; the shared queue and
            // visited vector are only touched inside the critical section.
            #pragma omp parallel for
            for (int i = 0; i < (int)adj[v].size(); ++i) {
                int n = adj[v][i];
                #pragma omp critical(bfs_state)
                {
                    if (!visited[n]) {
                        visited[n] = true;
                        q.push(n);
                    }
                }
            }
        }
    }
};

// Demo driver: build a small 7-vertex tree and run both parallel
// traversals starting from vertex 0.
int main() {
    Graph g(7);
    g.addEdge(0, 1);
    g.addEdge(0, 2);
    g.addEdge(1, 3);
    g.addEdge(1, 4);
    g.addEdge(2, 5);
    g.addEdge(2, 6);

    /*
       The edges above form this tree (all edges directed downward):

                 0
                / \
               1   2
              / \ / \
             3  4 5  6
    */
    std::cout << "Depth-First Search (DFS): ";
    g.parallelDFS(0);
    std::cout << std::endl;

    std::cout << "Breadth-First Search (BFS): ";
    g.parallelBFS(0);
    std::cout << std::endl;

    return 0;
}
Output:
Depth-First Search (DFS): 0 1 3 4 2 5 6
Breadth-First Search (BFS): 0 1 2 3 4 5 6
Assignment No. 2:
Write a program to implement Parallel Bubble Sort and Merge sort using OpenMP. Use
existing algorithms and measure the performance of sequential and parallel algorithms.

Code - Parallel Bubble Sort:


#include<iostream>
#include<omp.h>

using namespace std;

// Sequential bubble sort of array[0..n-1] into ascending order.
// Each pass bubbles the largest remaining element to the end of the
// unsorted prefix.
void bubble(int array[], int n){
    for (int pass = 0; pass < n - 1; ++pass) {
        // Elements past n - pass - 1 are already in their final position.
        for (int k = 0; k + 1 < n - pass; ++k) {
            if (array[k] > array[k + 1])
                std::swap(array[k], array[k + 1]);
        }
    }
}

// Parallel odd-even transposition sort of array[0..n-1].
//
// BUG FIX: the original used orphaned "#pragma omp for" directives with no
// enclosing "#pragma omp parallel" region, so each phase bound to a team of
// one thread and the whole sort ran sequentially. The outer loop is now
// executed inside a parallel region; the worksharing "for" directives split
// each phase across the team, and their implicit end-of-construct barriers
// keep the odd and even phases in lockstep (the explicit barrier the
// original carried was redundant for the same reason).
void pBubble(int array[], int n){
    #pragma omp parallel
    for (int i = 0; i < n; ++i) {
        // Odd phase: compare/swap the disjoint pairs (0,1), (2,3), ...
        #pragma omp for
        for (int j = 1; j < n; j += 2) {
            if (array[j] < array[j - 1])
                std::swap(array[j], array[j - 1]);
        }
        // implicit barrier here synchronizes the team between phases

        // Even phase: compare/swap the disjoint pairs (1,2), (3,4), ...
        #pragma omp for
        for (int j = 2; j < n; j += 2) {
            if (array[j] < array[j - 1])
                std::swap(array[j], array[j - 1]);
        }
    }
}

// Print the first n elements of arr, each followed by a space,
// then a newline.
void printArray(int arr[], int n){
    for (int idx = 0; idx < n; ++idx)
        std::cout << arr[idx] << " ";
    std::cout << "\n";
}

// Benchmark driver: sorts a reverse-ordered 10-element array with the
// sequential and the parallel bubble sort and reports wall-clock times.
//
// FIXES vs. the original: `n` is now a compile-time constant so `arr` is
// not a non-standard variable-length array, and the never-used array `brr`
// was removed.
int main(){
    constexpr int n = 10;
    int arr[n];
    double start_time, end_time;

    // Fill with n..1 — reverse sorted, the worst case for bubble sort.
    for (int i = 0, j = n; i < n; i++, j--) arr[i] = j;

    // Time the sequential sort.
    start_time = omp_get_wtime();
    bubble(arr, n);
    end_time = omp_get_wtime();
    std::cout << "Sequential Bubble Sort took : " << end_time - start_time << " seconds.\n";
    printArray(arr, n);

    // Reset to the reverse-sorted input.
    for (int i = 0, j = n; i < n; i++, j--) arr[i] = j;

    // Time the parallel sort.
    start_time = omp_get_wtime();
    pBubble(arr, n);
    end_time = omp_get_wtime();
    std::cout << "Parallel Bubble Sort took : " << end_time - start_time << " seconds.\n";
    printArray(arr, n);
}

Output:
Sequential Bubble Sort took : 0.00957767 seconds.
Parallel Bubble Sort took : 0.00988083 seconds.

Code - Parallel Merge Sort:


#include <iostream>
#include <omp.h>

using namespace std;

// Merge the two sorted runs arr[low..mid] and arr[mid+1..high] (inclusive)
// back into arr in ascending order.
//
// FIX: the original copied the runs into C variable-length arrays
// (`int left[n1]`), a non-standard extension in C++ that can overflow the
// stack for large runs; std::vector is used instead.
void merge(int arr[], int low, int mid, int high) {
    const int n1 = mid - low + 1;   // length of the left run
    const int n2 = high - mid;      // length of the right run

    // Snapshot both runs before overwriting arr.
    std::vector<int> left(arr + low, arr + low + n1);
    std::vector<int> right(arr + mid + 1, arr + mid + 1 + n2);

    int i = 0, j = 0, k = low;

    // Repeatedly take the smaller head element; `<=` keeps the merge stable.
    while (i < n1 && j < n2) {
        if (left[i] <= right[j])
            arr[k++] = left[i++];
        else
            arr[k++] = right[j++];
    }

    // Copy whichever run still has elements left.
    while (i < n1) arr[k++] = left[i++];
    while (j < n2) arr[k++] = right[j++];
}

// Merge sort of arr[low..high] with the two halves sorted in parallel via
// OpenMP sections.
//
// FIXES vs. the original:
//  - A parallel region was spawned at *every* recursion level, so near the
//    leaves the team start-up cost dwarfed the work. Subarrays at or below
//    PAR_CUTOFF elements now recurse sequentially.
//  - mid is computed as low + (high - low) / 2 instead of (low + high) / 2,
//    which can overflow int for large index ranges.
void parallelMergeSort(int arr[], int low, int high) {
    if (low >= high) return;

    const int PAR_CUTOFF = 1 << 12;  // below this, parallel overhead loses
    int mid = low + (high - low) / 2;

    if (high - low + 1 <= PAR_CUTOFF) {
        // Small subarray: plain sequential recursion.
        parallelMergeSort(arr, low, mid);
        parallelMergeSort(arr, mid + 1, high);
    } else {
        // Sort the two halves concurrently.
        #pragma omp parallel sections
        {
            #pragma omp section
            parallelMergeSort(arr, low, mid);
            #pragma omp section
            parallelMergeSort(arr, mid + 1, high);
        }
    }
    merge(arr, low, mid, high);
}

// Sequential top-down merge sort of arr[low..high] (inclusive).
// FIX: mid is computed as low + (high - low) / 2 rather than
// (low + high) / 2, which can overflow int for large index ranges.
void mergeSort(int arr[], int low, int high) {
    if (low < high) {
        int mid = low + (high - low) / 2;
        mergeSort(arr, low, mid);       // sort left half
        mergeSort(arr, mid + 1, high);  // sort right half
        merge(arr, low, mid, high);     // combine the sorted halves
    }
}

// Benchmark driver: sorts a reverse-ordered 1000-element array with the
// sequential and parallel merge sorts and reports wall-clock times.
//
// FIXES vs. the original:
//  - The "seconds" string literal was split across two source lines, which
//    does not compile; it is restored to a single literal.
//  - `int arr[n]` was a non-standard variable-length array; std::vector is
//    used instead.
int main() {
    const int n = 1000;
    std::vector<int> arr(n);
    double start_time, end_time;

    // Fill with n..1 so the array starts reverse-sorted (worst case).
    for (int i = 0, j = n; i < n; i++, j--) arr[i] = j;

    // Time the sequential algorithm.
    start_time = omp_get_wtime();
    mergeSort(arr.data(), 0, n - 1);
    end_time = omp_get_wtime();
    std::cout << "Time taken by sequential algorithm: "
              << end_time - start_time << " seconds\n";

    // Reset to the reverse-sorted input.
    for (int i = 0, j = n; i < n; i++, j--) arr[i] = j;

    // Time the parallel algorithm.
    start_time = omp_get_wtime();
    parallelMergeSort(arr.data(), 0, n - 1);
    end_time = omp_get_wtime();
    std::cout << "Time taken by parallel algorithm: "
              << end_time - start_time << " seconds";

    return 0;
}
Output:
Time taken by sequential algorithm: 0.000135859 seconds
Time taken by parallel algorithm: 0.000123855 seconds
Assignment No. 3:
Implement Min, Max, Sum and Average operations using Parallel Reduction.

.cpp Code:
%%cu
/*
Windows does not support user defined reductions.
This program may not run on MVSC++ compilers for Windows.
Please use Linux Environment.[You can try using "windows subsystem for linux"]
*/

#include<iostream>
#include<omp.h>

using namespace std;


// Smallest element of arr[0..n-1], computed with an OpenMP min-reduction.
// Assumes n >= 1 (arr[0] seeds the reduction).
int minval(int arr[], int n){
    int best = arr[0];
    #pragma omp parallel for reduction(min : best)
    for (int i = 0; i < n; i++) {
        best = (arr[i] < best) ? arr[i] : best;
    }
    return best;
}

// Largest element of arr[0..n-1], computed with an OpenMP max-reduction.
// Assumes n >= 1 (arr[0] seeds the reduction).
int maxval(int arr[], int n){
    int best = arr[0];
    #pragma omp parallel for reduction(max : best)
    for (int i = 0; i < n; i++) {
        best = (arr[i] > best) ? arr[i] : best;
    }
    return best;
}

// Sum of arr[0..n-1], computed with an OpenMP + reduction.
int sum(int arr[], int n){
    int total = 0;
    #pragma omp parallel for reduction(+ : total)
    for (int i = 0; i < n; i++)
        total += arr[i];
    return total;
}

// Arithmetic mean of arr[0..n-1]. The division is done in double, but the
// int return type truncates the result toward zero — same contract as the
// original. The parallel summation is inlined here instead of calling the
// sibling sum() helper; the reduction is identical.
int average(int arr[], int n){
    int total = 0;
    #pragma omp parallel for reduction(+ : total)
    for (int i = 0; i < n; i++)
        total += arr[i];
    return (double)total / n;
}

// Demo driver: run each parallel reduction over a small fixed array and
// print the results.
int main(){
    const int n = 5;
    int arr[] = {1, 2, 3, 4, 5};
    std::cout << "The minimum value is: " << minval(arr, n) << '\n'
              << "The maximum value is: " << maxval(arr, n) << '\n'
              << "The summation is: " << sum(arr, n) << '\n'
              << "The average is: " << average(arr, n) << '\n';
    return 0;
}

Output:
The minimum value is: 1
The maximum value is: 5
The summation is: 15
The average is: 3
Assignment No. 4:
Write a CUDA Program for :
1. Addition of two large vectors
2. Matrix Multiplication using CUDA C

Code - Addition of Two large Vectors:


%%cu
#include <iostream>
using namespace std;

// Element-wise vector addition kernel: C[i] = A[i] + B[i].
// Launch with a 1D grid, one thread per element; the bounds check guards
// the tail block when size is not a multiple of blockDim.x.
__global__ void add(int* A, int* B, int* C, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        C[i] = A[i] + B[i];
    }
}

// Fill vector[0..size-1] with pseudo-random digits 0-9 (uses rand(), so
// results depend on the current srand() seed).
void initialize(int* vector, int size) {
    for (int idx = 0; idx < size; idx++)
        vector[idx] = rand() % 10;
}

// Print size elements, each followed by a space, ending with a newline.
void print(int* vector, int size) {
    for (int idx = 0; idx < size; idx++)
        std::cout << vector[idx] << " ";
    std::cout << std::endl;
}

// Demo driver: builds two random N-element host vectors, adds them on the
// GPU, and prints the result. Returns nonzero on any CUDA failure.
//
// FIX vs. the original: every CUDA API return code was ignored, and the
// kernel launch was never checked — errors would surface, if at all, as
// garbage output much later. Each call is now checked.
int main() {
    int N = 4;
    int* A, * B, * C;

    int vectorSize = N;
    size_t vectorBytes = vectorSize * sizeof(int);

    // Report-and-fail helper for CUDA API calls.
    auto ok = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            std::cerr << what << " failed: " << cudaGetErrorString(err) << "\n";
        return err == cudaSuccess;
    };

    A = new int[vectorSize];
    B = new int[vectorSize];
    C = new int[vectorSize];

    initialize(A, vectorSize);
    initialize(B, vectorSize);
    std::cout << "Vector A: ";
    print(A, N);
    std::cout << "Vector B: ";
    print(B, N);

    int* X, * Y, * Z;
    if (!ok(cudaMalloc(&X, vectorBytes), "cudaMalloc X")) return 1;
    if (!ok(cudaMalloc(&Y, vectorBytes), "cudaMalloc Y")) return 1;
    if (!ok(cudaMalloc(&Z, vectorBytes), "cudaMalloc Z")) return 1;

    if (!ok(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice), "copy A->X")) return 1;
    if (!ok(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice), "copy B->Y")) return 1;

    int threadsPerBlock = 256;
    // Ceiling division so a partial tail block still covers every element.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);
    // Launches are asynchronous: configuration errors only appear via
    // cudaGetLastError(), and execution errors surface at the next
    // synchronizing call (the blocking cudaMemcpy below).
    if (!ok(cudaGetLastError(), "kernel launch")) return 1;

    if (!ok(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost), "copy Z->C")) return 1;

    std::cout << "Addition: ";
    print(C, N);

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;
}

Output:
Vector A: 3 6 7 5
Vector B: 3 5 6 2
Addition: 6 11 13 7

Code - Matrix Multiplication using CUDA C:


%%cu
#include <iostream>
using namespace std;

// CUDA code to multiply matrices


// Naive square matrix multiply kernel: C = A * B, all size x size,
// row-major. Launch with a 2D grid; each thread computes one output
// element from its (row, col) grid coordinates.
__global__ void multiply(int* A, int* B, int* C, int size) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Ignore threads that fall outside the matrix when the grid overshoots.
    if (row >= size || col >= size)
        return;

    // Dot product of row `row` of A with column `col` of B.
    int acc = 0;
    for (int k = 0; k < size; k++)
        acc += A[row * size + k] * B[k * size + col];
    C[row * size + col] = acc;
}

// Fill a size x size row-major matrix with pseudo-random digits 0-9
// (uses rand(), so results depend on the current srand() seed).
void initialize(int* matrix, int size) {
    int total = size * size;
    for (int idx = 0; idx < total; idx++)
        matrix[idx] = rand() % 10;
}

// Print a size x size row-major matrix, one row per line, with a trailing
// blank line after the last row.
void print(int* matrix, int size) {
    for (int r = 0; r < size; r++) {
        for (int c = 0; c < size; c++)
            std::cout << matrix[r * size + c] << " ";
        std::cout << '\n';
    }
    std::cout << '\n';
}

// Demo driver: multiplies two random N x N matrices on the GPU and prints
// the operands and the product. Returns nonzero on any CUDA failure.
//
// FIXES vs. the original:
//  - removed the unused `blockSize` variable;
//  - BLOCKS is computed with ceiling division, so N no longer has to be
//    divisible by THREADS (the kernel bounds-checks the overshoot);
//  - every CUDA API call and the kernel launch are error-checked instead
//    of having their return codes silently ignored.
int main() {
    int* A, * B, * C;

    int N = 2;

    int matrixSize = N * N;
    size_t matrixBytes = matrixSize * sizeof(int);

    // Report-and-fail helper for CUDA API calls.
    auto ok = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            std::cerr << what << " failed: " << cudaGetErrorString(err) << "\n";
        return err == cudaSuccess;
    };

    A = new int[matrixSize];
    B = new int[matrixSize];
    C = new int[matrixSize];

    initialize(A, N);
    initialize(B, N);
    std::cout << "Matrix A: \n";
    print(A, N);

    std::cout << "Matrix B: \n";
    print(B, N);

    int* X, * Y, * Z;
    // Device buffers for the two operands and the product.
    if (!ok(cudaMalloc(&X, matrixBytes), "cudaMalloc X")) return 1;
    if (!ok(cudaMalloc(&Y, matrixBytes), "cudaMalloc Y")) return 1;
    if (!ok(cudaMalloc(&Z, matrixBytes), "cudaMalloc Z")) return 1;

    // Copy the host inputs to the device.
    if (!ok(cudaMemcpy(X, A, matrixBytes, cudaMemcpyHostToDevice), "copy A->X")) return 1;
    if (!ok(cudaMemcpy(Y, B, matrixBytes, cudaMemcpyHostToDevice), "copy B->Y")) return 1;

    // Threads per block dimension.
    int THREADS = 2;

    // Ceiling division: enough blocks to cover N even when THREADS does
    // not divide it evenly.
    int BLOCKS = (N + THREADS - 1) / THREADS;

    // 2D block and grid dimensions.
    dim3 threads(THREADS, THREADS);
    dim3 blocks(BLOCKS, BLOCKS);

    multiply<<<blocks, threads>>>(X, Y, Z, N);
    // Launches are asynchronous: configuration errors only appear via
    // cudaGetLastError(); execution errors surface at the blocking
    // cudaMemcpy below.
    if (!ok(cudaGetLastError(), "kernel launch")) return 1;

    if (!ok(cudaMemcpy(C, Z, matrixBytes, cudaMemcpyDeviceToHost), "copy Z->C")) return 1;

    std::cout << "Multiplication of matrix A and B: \n";
    print(C, N);

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);
    return 0;
}

Output:
Matrix A:
3 6
7 5

Matrix B:
3 5
6 2

Multiplication of matrix A and B:


45 27
51 45

You might also like