HPC Printout 1
2) ./bfs
Output:
Enter data => 5
Do you want to insert one more node? (y/n) y
Enter data => 3
Do you want to insert one more node? (y/n) y
Enter data => 7
Do you want to insert one more node? (y/n) y
Enter data => 2
Do you want to insert one more node? (y/n) y
Enter data => 1
Do you want to insert one more node? (y/n) y
Enter data => 8
Do you want to insert one more node? (y/n) n
5 3 7 2 1 8
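The listing below begins at main; the headers, global containers, and the dfs routine it calls are not shown in the printout. A minimal sketch of those missing pieces (the MAX bound and the body of dfs are assumptions, not part of the original listing):

#include <iostream>
#include <vector>
#include <omp.h>
using namespace std;

const int MAX = 100005;   // assumed upper bound on the number of nodes
vector<int> graph[MAX];   // adjacency list, filled in main
bool visited[MAX];

void dfs(int node) {
    visited[node] = true;
    cout << node << " ";
    // visit every unvisited neighbour depth-first
    for (int i = 0; i < (int)graph[node].size(); i++) {
        int next = graph[node][i];
        if (!visited[next])
            dfs(next);
    }
}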
int main() {
    int n, m, start_node;
    cout << "Enter number of nodes, edges, and start node: ";
    cin >> n >> m >> start_node;
    // n: nodes, m: edges
    cout << "Enter pairs of edges: ";
    for (int i = 0; i < m; i++)
    {
        int u, v;
        cin >> u >> v;
        // u and v: endpoints of an undirected edge
        graph[u].push_back(v);
        graph[v].push_back(u);
    }
    // clear the visited flags in parallel before the traversal
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
    {
        visited[i] = false;
    }
    dfs(start_node);
    /*
    for (int i = 0; i < n; i++) {
        if (visited[i])
        {
            cout << i << " ";
        }
    }
    */
    return 0;
}
Assignment No.: 2
import numpy as np
import time
import random
from multiprocessing import Pool, cpu_count

def merge(left_sorted, right_sorted):
    # Merge two sorted arrays into one sorted array
    n1, n2 = len(left_sorted), len(right_sorted)
    merged_arr = np.zeros(n1 + n2, dtype=int)
    i = j = 0
    for k in range(n1 + n2):
        if i == n1:
            merged_arr[k:] = right_sorted[j:]
            break
        elif j == n2:
            merged_arr[k:] = left_sorted[i:]
            break
        elif left_sorted[i] <= right_sorted[j]:
            merged_arr[k] = left_sorted[i]
            i += 1
        else:
            merged_arr[k] = right_sorted[j]
            j += 1
    # This guarantees that the array is fully sorted before the loop ends
    return merged_arr

def merge_sort(arr):
    n = len(arr)
    # Base case
    if n <= 1:
        return arr
    # Split the array into two halves
    mid = n // 2
    return merge(merge_sort(arr[:mid]), merge_sort(arr[mid:]))

def parallel_merge_sort(arr):
    # Distribute the work among worker processes: each process sorts one
    # chunk of the array, then the sorted chunks are merged pairwise.
    # (A multiprocessing Pool stands in for the OpenMP-style 'omp' module,
    # which is not a standard Python package.)
    chunks = np.array_split(arr, cpu_count())
    with Pool() as pool:
        sorted_chunks = pool.map(merge_sort, chunks)
    result = sorted_chunks[0]
    for chunk in sorted_chunks[1:]:
        result = merge(result, chunk)
    return result

if __name__ == '__main__':
    # Generate a random array of 10,000 integers
    arr = np.array([random.randint(0, 100) for i in range(10000)])
    print(f"Original array: {arr}")
    start_time = time.time()
    sorted_arr = parallel_merge_sort(arr)
    end_time = time.time()
    print(f"Sorted array: {sorted_arr}")
    print(f"Execution time: {end_time - start_time} seconds")
Assignment No.: 3
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define CHUNK_SIZE 1000
struct ChunkStats {
    int min_val;
    int sum_val;
    int size;
};
struct ChunkStats get_chunk_stats(int* chunk, int chunk_size) {
    // Compute the minimum, sum, and size of one chunk
    struct ChunkStats stats;
    stats.min_val = chunk[0];
    stats.sum_val = 0;
    stats.size = chunk_size;
    for (int i = 0; i < chunk_size; i++) {
        stats.min_val = chunk[i] < stats.min_val ? chunk[i] : stats.min_val;
        stats.sum_val += chunk[i];
    }
    return stats;
}
void parallel_reduction_min_avg(int* data, int data_size, int* min_val_ptr, double* avg_val_ptr) {
    // Split the data into one chunk per thread
    int num_threads = omp_get_max_threads();
    int chunk_size = data_size / num_threads;
    if (chunk_size == 0) chunk_size = 1;
    // Round up so a trailing partial chunk is not lost
    int num_chunks = (data_size + chunk_size - 1) / chunk_size;
    struct ChunkStats* chunk_stats = malloc(num_chunks * sizeof(struct ChunkStats));
    int i, j;
    // Each thread computes the stats of its own chunk
    #pragma omp parallel for
    for (i = 0; i < num_chunks; i++) {
        int start = i * chunk_size;
        int len = (start + chunk_size <= data_size) ? chunk_size : data_size - start;
        chunk_stats[i] = get_chunk_stats(data + start, len);
    }
    // Combine the per-chunk results sequentially
    int min_val = chunk_stats[0].min_val;
    long long sum = 0;
    for (j = 0; j < num_chunks; j++) {
        if (chunk_stats[j].min_val < min_val) min_val = chunk_stats[j].min_val;
        sum += chunk_stats[j].sum_val;
    }
    *min_val_ptr = min_val;
    *avg_val_ptr = (double)sum / data_size;
    free(chunk_stats);
}
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
void parallel_reduction_max_sum(int* data, int size, int* max_val_ptr, int* sum_val_ptr) {
    // OpenMP reductions need plain variables, not dereferenced pointers,
    // so accumulate into locals and write the results out at the end
    int max_val = data[0];
    int sum_val = 0;
    // Compute maximum and sum in parallel; the reduction clauses combine
    // each thread's partial results automatically
    #pragma omp parallel for reduction(max: max_val) reduction(+: sum_val)
    for (int i = 0; i < size; i++) {
        if (data[i] > max_val) {
            max_val = data[i];
        }
        sum_val += data[i];
    }
    *max_val_ptr = max_val;
    *sum_val_ptr = sum_val;
}
int main() {
    int data_size = 1000000;
    int* data = malloc(data_size * sizeof(int));
    for (int i = 0; i < data_size; i++) {
        data[i] = rand() % 100;
    }
    int max_val, sum_val;
    parallel_reduction_max_sum(data, data_size, &max_val, &sum_val);
    printf("Maximum value: %d\n", max_val);
    printf("Sum value: %d\n", sum_val);
    free(data);
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// CUDA kernel for vector addition
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
int main() {
    int n = 1000000;            // Vector size
    int *a, *b, *c;             // Host vectors
    int *d_a, *d_b, *d_c;       // Device vectors
    int size = n * sizeof(int); // Size in bytes
    // Allocate memory for host vectors
    a = (int*) malloc(size);
    b = (int*) malloc(size);
    c = (int*) malloc(size);
    // Initialize host vectors
    for (int i = 0; i < n; i++) {
        a[i] = i;
        b[i] = i;
    }
    // Allocate memory for device vectors
    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);
    // Copy host vectors to device vectors
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    // Define block size and grid size
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;
    // Launch kernel
    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    // Copy device result vector to host result vector
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    // Verify the result
    for (int i = 0; i < n; i++) {
        if (c[i] != 2 * i) {
            printf("Error: c[%d] = %d\n", i, c[i]);
            break;
        }
    }
    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    // Free host memory
    free(a);
    free(b);
    free(c);
    return 0;
}
This program uses CUDA to add two large vectors of size 1000000. The vectors are initialized on the host, and
then copied to the device memory. A kernel function is defined to perform the vector addition, and then launched
on the device. The result is copied back to the host memory and verified. Finally, the device and host memories
are freed.
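None of the CUDA API calls above are checked for errors; a small checking macro is a common hardening step (a sketch of our own, not part of the original listing):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Wrap a CUDA runtime call and abort with file/line on failure
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Usage: CUDA_CHECK(cudaMalloc((void**) &d_a, size));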
This program multiplies two n × n matrices using CUDA. It first allocates host memory for the matrices and
initializes them. Then it allocates device memory and copies the matrices to the device. It sets the kernel launch
configuration and launches the kernel function matrix_multiply. The kernel function performs the matrix
multiplication and stores the result in matrix c. Finally, it copies the result back to the host and frees the device
and host memory.
The kernel function calculates the row and column indices of the output matrix using the block index and thread
index. It then uses a for loop to calculate the sum of the products of the corresponding elements in the input
matrices. The result is stored in the output matrix.
Note that in this program, we use CUDA events to measure the elapsed time of the kernel function. This is because
the kernel function runs asynchronously on the GPU, so we need to use events to synchronize the host and device
and measure the time accurately.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
__global__ void matrix_multiply(float *a, float *b, float *c, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0;
    if (row < n && col < n) {
        // Dot product of one row of a with one column of b
        for (int i = 0; i < n; ++i) {
            sum += a[row * n + i] * b[i * n + col];
        }
        c[row * n + col] = sum;
    }
}
int main()
{
    int n = 1024;
    size_t size = n * n * sizeof(float);
    float *a, *b, *c;
    float *d_a, *d_b, *d_c;
    cudaEvent_t start, stop;
    float elapsed_time;
    // Allocate host memory
    a = (float*) malloc(size);
    b = (float*) malloc(size);
    c = (float*) malloc(size);
    // Initialize matrices
    for (int i = 0; i < n * n; ++i) {
        a[i] = i % n;
        b[i] = i % n;
    }
    // Allocate device memory
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    // Copy input data to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
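    // --- The printout ends here; the rest of main below is a sketch
    // --- reconstructed from the description above: kernel launch, CUDA
    // --- event timing, copy-back, and cleanup. Names such as `grid` and
    // --- `block` are our own.
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // Launch one thread per output element
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
    matrix_multiply<<<grid, block>>>(d_a, d_b, d_c, n);
    // Record the stop time and wait for the kernel to finish
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);
    // Copy the result back to the host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    printf("Kernel time: %.3f ms\n", elapsed_time);
    // Free device and host memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(a);
    free(b);
    free(c);
    return 0;
}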
Code:
import tensorflow as tf
from mpi4py import MPI

# One MPI process per node; each rank trains on its own shard of the data
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
model = tf.keras.models.Sequential([
tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(28, 28, 1)),
tf.keras.layers.MaxPooling2D((2, 2)),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(10, activation='softmax')
])
# Compile the model (the optimizer is not shown in the printout; 'adam' is an assumption)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
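The printout never shows where the training data comes from; a plausible loading step, assuming MNIST from the 28x28x1 input shape and the 10-way softmax:

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Add a channel axis and scale pixel values to [0, 1]
x_train = x_train[..., None].astype('float32') / 255.0
x_test = x_test[..., None].astype('float32') / 255.0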
def train(model, x_train, y_train, rank, size):
    # Each rank trains on its own chunk of the data
    # (the slicing is assumed; the printout starts mid-function)
    chunk = len(x_train) // size
    x_train_chunk = x_train[rank * chunk:(rank + 1) * chunk]
    y_train_chunk = y_train[rank * chunk:(rank + 1) * chunk]
    # Train the model
    model.fit(x_train_chunk, y_train_chunk, epochs=1, batch_size=32)
    # Compute the accuracy on the training data
    train_loss, train_acc = model.evaluate(x_train_chunk, y_train_chunk, verbose=2)
    # Reduce the accuracy across all nodes and average over the ranks
    train_acc = comm.allreduce(train_acc, op=MPI.SUM)
    return train_acc / size
Run the training loop:
epochs = 5
for epoch in range(epochs):
    # Train the model
    train_acc = train(model, x_train, y_train, rank, size)
    # Compute the accuracy on the test data
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
    # Reduce the accuracy across all nodes
    test_acc = comm.allreduce(test_acc, op=MPI.SUM)
    # Print the results
    if rank == 0:
        print(f"Epoch {epoch + 1}: Train accuracy = {train_acc:.4f}, Test accuracy = {test_acc / size:.4f}")
Output:
Epoch 1: Train accuracy = 0.9773, Test accuracy = 0.9745