The value of π can be estimated mathematically by numerical integration of

    π = ∫₀¹ 4 / (1 + x²) dx

We can approximate the integral as a sum of N rectangles,

    π ≈ Σᵢ F(xᵢ) Δx    with F(x) = 4 / (1 + x²), Δx = 1/N and xᵢ = (i + 0.5) Δx,

where each rectangle has width Δx and height F(xᵢ) at the middle of interval i, as demonstrated in the following diagram:
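For a quick numerical check, with only N = 4 rectangles the midpoints are x = 0.125, 0.375, 0.625 and 0.875 and Δx = 0.25, so the sum evaluates to 0.25 × (3.9385 + 3.5068 + 2.8764 + 2.2655) ≈ 3.1468, already within about 0.005 of π. The programs below simply use a far larger number of rectangles to shrink this error.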
A serial program computes all the rectangles one after another on a single CPU core of a node. The node may have multiple CPU cores, but only one core is used, as shown below:
A shared-memory parallel program distributes the computation of the rectangles among the CPU cores of a single node using threads. Each CPU core then computes its share of the rectangles simultaneously, as shown below:
A distributed-memory parallel program distributes the computation of the rectangles among CPU cores on multiple nodes. Each CPU core computes its share of the rectangles simultaneously, and communication between the nodes is required once each node finishes its own computation, as shown below:
Implementations of the serial program of the toy application in various programming languages are provided below:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/* Reference: http://www.openmp.org/wp-content/uploads/Intro_To_OpenMP_Mattson.pdf */

#define TIME_START clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char ** argv)
{
    double step, x, pi, sum = 0.0;
    long i, number_of_steps = 960000000000; // Total number of rectangles to be computed (factor of 48)
    step = 1.0 / (double) number_of_steps;
    struct timespec t1, t2;

    TIME_START;
    for (i = 0; i < number_of_steps; i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }
    TIME_STOP;
    pi = step * sum;

    printf("The value of pi with %ld steps: %f and time: %f secs\n", number_of_steps, pi, TIME_ELAPSED_SEC);
    return 0;
}
The above code can be compiled in the following way:
$ gcc -O3 -Wall -lrt -o pi_serial pi_serial.c
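The resulting binary takes no arguments and can be run directly, for example:

$ ./pi_serial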
import time

if __name__ == '__main__':
    sum = 0.0
    number_of_steps = 9600000000  # Total number of rectangles to be computed (factor of 48)
    step = 1.0 / number_of_steps

    start = time.time()
    for i in range(number_of_steps):
        x = (i + 0.5) * step
        sum += 4.0 / (1.0 + x * x)
    end = time.time()

    pi = step * sum
    print(f"The value of pi with {number_of_steps} steps: {pi} and time: {end - start:.6f} secs")
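Assuming the script is saved as pi_serial.py (a file name chosen here only for illustration), it can be run with any Python 3 interpreter:

$ python3 pi_serial.py

Note that the pure-Python loop is far slower than the compiled C version, so reducing number_of_steps is advisable for a quick test.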
Implementations of the parallel program of the toy application in various programming languages and models are provided below:
Implementation in C of the shared-memory (OpenMP) parallel program of the toy application is provided below:
#ifdef _OPENMP
#include <omp.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/* Reference: http://www.openmp.org/wp-content/uploads/Intro_To_OpenMP_Mattson.pdf */

#define TIME_START clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char ** argv)
{
    int number_of_threads;
    long number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
         start_index, steps_per_thread;
    double step, pi = 0.0;
    struct timespec t1, t2;

    step = 1.0 / (double) number_of_steps;

    TIME_START;
    // start_index and steps_per_thread are private so that the remainder added by
    // the last thread does not change the loop bounds of the other threads.
    #pragma omp parallel private (start_index, steps_per_thread)
    {
        long i;
        int thread_id, omp_number_of_threads;
        double x, sum;

        thread_id = omp_get_thread_num();
        omp_number_of_threads = omp_get_num_threads();
        steps_per_thread = number_of_steps / omp_number_of_threads;

        if (thread_id == 0) {
            number_of_threads = omp_number_of_threads;
            if (number_of_steps < omp_number_of_threads) {
                printf("Number of steps should not be less than number of threads.\n");
                exit(1);
            }
        }

        start_index = steps_per_thread * thread_id;
        if (thread_id == omp_number_of_threads - 1) {
            steps_per_thread += number_of_steps % omp_number_of_threads;
        }

        for (i = start_index, sum = 0.0; i < (start_index + steps_per_thread); i++) {
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }

        #pragma omp critical
        pi += sum * step;
    }
    TIME_STOP;

    printf("The value of pi with %ld steps, %d omp threads: %f, took %f secs\n",
           number_of_steps, number_of_threads, pi, TIME_ELAPSED_SEC);
    return 0;
}
The above code can be compiled in the following way:
$ gcc -fopenmp -Wall -g -O3 -lrt -lgomp -o pi_omp pi_omp.c
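The number of threads is controlled at run time with the standard OMP_NUM_THREADS environment variable, for example:

$ OMP_NUM_THREADS=48 ./pi_omp

For comparison, the same computation can be written more compactly with OpenMP's reduction clause instead of the manual index decomposition used above. The following is a minimal sketch (the file name and the omission of timing are choices made here for illustration):

/* pi_omp_reduction.c -- same integral, but letting OpenMP split the loop
   iterations among the threads and combine the per-thread partial sums. */
#include <stdio.h>

int main(void)
{
    long i, number_of_steps = 960000000000; // Total number of rectangles to be computed
    double step = 1.0 / (double) number_of_steps, sum = 0.0, x;

    #pragma omp parallel for private(x) reduction(+:sum)
    for (i = 0; i < number_of_steps; i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }

    printf("The value of pi with %ld steps: %f\n", number_of_steps, step * sum);
    return 0;
}

It is compiled in the same way as pi_omp.c:

$ gcc -fopenmp -Wall -g -O3 -o pi_omp_reduction pi_omp_reduction.c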
Implementation in C of the distributed-memory (MPI) parallel program of the toy application is provided below:
#include <stdio.h>
#include <mpi.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/* Reference: http://www.mcs.anl.gov/research/projects/mpi/tutorial/mpiexmpl/src/pi/C/main.html */

#define TIME_START clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char **argv)
{
    long number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
         steps_per_rank, i, start_index;
    int rank, size;
    double PI25DT = 3.141592653589793238462643,
           mypi, pi, step, sum = 0.0, x, mytime, maxtime;
    struct timespec t1, t2;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    step = 1.0 / (double) number_of_steps;
    steps_per_rank = number_of_steps / size;

    if (rank == 0 && number_of_steps < size) {
        printf("Number of steps should not be less than number of ranks.\n");
        exit(1);
    }

    TIME_START;
    start_index = steps_per_rank * rank;
    if (rank == size - 1) {
        steps_per_rank += number_of_steps % size;
    }

    for (i = start_index; i < (start_index + steps_per_rank); i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }
    mypi = step * sum;

    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    TIME_STOP;

    mytime = TIME_ELAPSED_SEC;
    MPI_Reduce(&mytime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("pi is approximately %.16f, Error is %.16f, number_of_steps is %ld and took %f secs\n",
               pi, fabs(pi - PI25DT), number_of_steps, maxtime);
    }

    MPI_Finalize();
    return 0;
}
The above code can be compiled in the following way:
$ module load openmpi/4.1.5
$ mpicc -Wall -g -O3 -lm -lrt -o pi_mpi pi_mpi.c
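The number of MPI ranks is chosen when the program is launched; with Open MPI, for example:

$ mpirun -np 4 ./pi_mpi

On a cluster managed by a batch scheduler, the site's parallel launcher (e.g. srun under Slurm) may be used instead.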
Implementation in C of the hybrid distributed-memory (MPI) and shared-memory (OpenMP) parallel program of the toy application is provided below:
#ifdef _OPENMP
#include <omp.h>
#endif
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

#define TIME_START clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char ** argv)
{
    int number_of_threads, size, rank;
    long number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
         start_index, steps_per_thread, start_rank;
    double step, mypi = 0.0, pi, mytime, maxtime;
    struct timespec t1, t2;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    step = 1.0 / (double) number_of_steps;
    start_rank = (long)(number_of_steps / size) * rank;

    TIME_START;
    // start_index and steps_per_thread are private so that the remainder added by
    // the last thread does not change the loop bounds of the other threads.
    #pragma omp parallel private (start_index, steps_per_thread)
    {
        long i;
        int thread_id, omp_number_of_threads;
        double x, sum;

        thread_id = omp_get_thread_num();
        omp_number_of_threads = omp_get_num_threads();
        steps_per_thread = (long)(number_of_steps / size) / omp_number_of_threads;

        if (thread_id == 0) {
            number_of_threads = omp_number_of_threads;
            if (rank == 0 && number_of_steps < size * omp_number_of_threads) {
                printf("Number of steps should not be less than (ranks * threads).\n");
                exit(1);
            }
        }

        start_index = start_rank + steps_per_thread * thread_id;
        if (thread_id == omp_number_of_threads - 1) {
            steps_per_thread += (long)(number_of_steps / size) % omp_number_of_threads;
            if (rank == size - 1) {
                steps_per_thread += number_of_steps % size;
            }
        }

        for (i = start_index, sum = 0.0; i < (start_index + steps_per_thread); i++) {
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }

        #pragma omp critical
        mypi += sum * step;
    }

    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    TIME_STOP;

    mytime = TIME_ELAPSED_SEC;
    MPI_Reduce(&mytime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("The value of pi with %ld steps, %d omp threads: %f, took %f secs\n",
               number_of_steps, number_of_threads, pi, maxtime);
    }

    MPI_Finalize();
    return 0;
}
The above code can be compiled in the following way:
$ module load openmpi/4.1.5
$ mpicc -fopenmp -Wall -g -O3 -lrt -lgomp -o pi_hybrid_mpi_omp pi_hybrid_mpi_omp.c
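The number of MPI ranks and the number of OpenMP threads per rank are set independently, for example two ranks with 48 threads each (adjust the thread count to the node's core count):

$ export OMP_NUM_THREADS=48
$ mpirun -np 2 ./pi_hybrid_mpi_omp

Process placement and binding options depend on the site's MPI installation and scheduler.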
Implementation in CUDA C of the shared-memory parallel program of the toy application, offloading the computation of the rectangles to a GPU, is provided below:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define THREADS_PER_BLOCK 1024
#define BLOCKS_PER_GRID 16384

__global__ void _cuda_compute_thread_work(double * device_sum, double step, unsigned long number_of_steps)
{
    unsigned long number_of_threads = gridDim.x * blockDim.x,
                  steps_per_thread = number_of_steps / number_of_threads,
                  thread_id = (blockDim.x * blockIdx.x) + threadIdx.x,
                  start_index = thread_id * steps_per_thread;
    double x, thread_sum = 0.0;

    if (thread_id == number_of_threads - 1) {
        steps_per_thread += number_of_steps % number_of_threads;
    }

    for (unsigned long i = start_index; i < (start_index + steps_per_thread); i++) {
        x = (i + 0.5) * step;
        thread_sum += 4.0 / (1.0 + x * x);
    }
    // Accumulate locally and write once: memory from cudaMalloc is not zero-initialized.
    device_sum[thread_id] = thread_sum;
}

__global__ void _cuda_parallel_sum(double * input, double * output)
{
    extern __shared__ double shared_data[];
    unsigned long i = blockIdx.x * blockDim.x + threadIdx.x,
                  thread_id = threadIdx.x;

    // Each thread loads one element from global to shared memory
    shared_data[thread_id] = input[i];
    __syncthreads();

    // Do reduction in shared memory
    for (unsigned long s = blockDim.x / 2; s > 0; s >>= 1) {
        if (thread_id < s) {
            shared_data[thread_id] += shared_data[thread_id + s];
        }
        __syncthreads();
    }

    // Write result for this block to global memory
    if (thread_id == 0) {
        output[blockIdx.x] = shared_data[0];
    }
}

unsigned long two_pow_number(double n)
{
    return pow(2, (unsigned long)(log10(n)/log10(2)));
}

int main(int argc, char ** argv)
{
    double step, pi, *host_remaining_sum, *device_sum, *host_sum, *device_block_sum;
    unsigned long number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
                  number_of_threads = BLOCKS_PER_GRID * THREADS_PER_BLOCK,
                  two_pow_grid_size = two_pow_number(BLOCKS_PER_GRID),
                  two_pow_block_size = two_pow_number(THREADS_PER_BLOCK),
                  extra_number_of_threads = number_of_threads - two_pow_grid_size * two_pow_block_size;
    struct timeval t1, t2;

    if (number_of_steps < number_of_threads) {
        printf("Number of steps should not be less than number of threads.\n");
        exit(1);
    }

    step = 1.0 / (double) number_of_steps;

    // Allocate host memory
    host_remaining_sum = (double *)malloc(sizeof(double) * extra_number_of_threads);
    host_sum = (double *)malloc(two_pow_grid_size * sizeof(double));

    // Initialize host arrays
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_remaining_sum[i] = 0.0;
    }
    for (unsigned long i = 0; i < two_pow_grid_size; i++) {
        host_sum[i] = 0.0;
    }

    // Allocate device memory
    cudaMalloc((void**)&device_sum, sizeof(double) * number_of_threads);
    cudaMalloc((void **)&device_block_sum, two_pow_grid_size * sizeof(double));

    gettimeofday(&t1, 0);

    // Executing kernel
    _cuda_compute_thread_work<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(device_sum, step, number_of_steps);
    _cuda_parallel_sum<<<two_pow_grid_size, two_pow_block_size, two_pow_block_size*sizeof(double)>>>(device_sum, device_block_sum);

    // Transfer data back to host memory
    cudaMemcpy(host_remaining_sum, &device_sum[two_pow_grid_size * two_pow_block_size], sizeof(double) * extra_number_of_threads, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_sum, device_block_sum, two_pow_grid_size * sizeof(double), cudaMemcpyDeviceToHost);

    // Accumulates the sum in the first element
    for (unsigned long i = 1; i < two_pow_grid_size; i++) {
        host_sum[0] += host_sum[i];
    }
    // Accumulates the remaining sum in the first element
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_sum[0] += host_remaining_sum[i];
    }
    pi = step * host_sum[0];

    gettimeofday(&t2, 0);
    double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;

    printf("The value of pi with %lu steps: %f and time: %f secs\n", number_of_steps, pi, time);

    // Deallocate device memory
    cudaFree(device_sum);
    cudaFree(device_block_sum);

    // Deallocate host memory
    free(host_remaining_sum);
    free(host_sum);
    return 0;
}
The above code can be compiled in the following way:
$ module load cuda/12.0.0
$ nvcc -g -O3 -lm -lrt -o pi_cuda pi_cuda.cu
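The binary takes no arguments and must be run on a node with an NVIDIA GPU, for example:

$ ./pi_cuda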
Implementation in C and CUDA of the hybrid distributed-memory (MPI) and GPU (CUDA) parallel program of the toy application is provided below:
//--------------------------
// pi_hybrid_mpi_cuda_main.c
//--------------------------
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>

#define THREADS_PER_BLOCK 1024
#define BLOCKS_PER_GRID 16384

void launch_cuda_function(unsigned long number_of_steps, unsigned long number_of_threads,
                          unsigned long extra_number_of_threads, unsigned long two_pow_grid_size,
                          unsigned long two_pow_block_size, unsigned long grid_size,
                          unsigned long block_size, double step, int size, int rank,
                          unsigned long start_rank, double *host_sum, double *host_remaining_sum);

unsigned long two_pow_number(double n)
{
    return pow(2, (unsigned long)(log10(n)/log10(2)));
}

int main(int argc, char ** argv)
{
    int size, rank;
    double step, mypi = 0.0, pi, *host_remaining_sum, *host_sum;
    unsigned long number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
                  block_size = THREADS_PER_BLOCK,
                  grid_size = BLOCKS_PER_GRID,
                  number_of_threads = grid_size * block_size,
                  two_pow_grid_size = two_pow_number(grid_size),
                  two_pow_block_size = two_pow_number(block_size),
                  extra_number_of_threads = number_of_threads - two_pow_grid_size * two_pow_block_size,
                  start_rank;
    struct timeval t1, t2;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0 && number_of_steps < size * number_of_threads) {
        printf("Number of steps should not be less than (ranks * threads).\n");
        exit(1);
    }

    step = 1.0 / (double) number_of_steps;
    start_rank = (unsigned long)(number_of_steps / size) * rank;

    // Allocate host memory
    host_remaining_sum = (double *)malloc(sizeof(double) * extra_number_of_threads);
    host_sum = (double *)malloc(two_pow_grid_size * sizeof(double));

    // Initialize host arrays
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_remaining_sum[i] = 0.0;
    }
    for (unsigned long i = 0; i < two_pow_grid_size; i++) {
        host_sum[i] = 0.0;
    }

    gettimeofday(&t1, 0);

    launch_cuda_function(number_of_steps, number_of_threads, extra_number_of_threads,
                         two_pow_grid_size, two_pow_block_size, grid_size, block_size,
                         step, size, rank, start_rank, host_sum, host_remaining_sum);

    // Accumulates the sum in the first element
    for (unsigned long i = 1; i < two_pow_grid_size; i++) {
        host_sum[0] += host_sum[i];
    }
    // Accumulates the remaining sum in the first element
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_sum[0] += host_remaining_sum[i];
    }
    mypi = step * host_sum[0];

    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    gettimeofday(&t2, 0);
    double mytime = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0, maxtime;
    MPI_Reduce(&mytime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("The value of pi with %lu steps: %f and time: %f secs\n", number_of_steps, pi, maxtime);
    }

    // Deallocate host memory
    free(host_remaining_sum);
    free(host_sum);

    MPI_Finalize();
    return 0;
}

//----------------------
// pi_hybrid_mpi_cuda.cu
//----------------------
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void _cuda_compute_thread_work(double * device_sum, double step, int size, int rank,
                                          unsigned long start_rank, unsigned long number_of_steps)
{
    unsigned long number_of_threads = gridDim.x * blockDim.x,
                  steps_per_thread = (unsigned long)(number_of_steps / size) / number_of_threads,
                  thread_id = (blockDim.x * blockIdx.x) + threadIdx.x,
                  start_index = start_rank + thread_id * steps_per_thread;
    double x, thread_sum = 0.0;

    if (thread_id == number_of_threads - 1) {
        steps_per_thread += (unsigned long)(number_of_steps / size) % number_of_threads;
        if (rank == size - 1) {
            steps_per_thread += number_of_steps % size;
        }
    }

    for (unsigned long i = start_index; i < (start_index + steps_per_thread); i++) {
        x = (i + 0.5) * step;
        thread_sum += 4.0 / (1.0 + x * x);
    }
    // Accumulate locally and write once: memory from cudaMalloc is not zero-initialized.
    device_sum[thread_id] = thread_sum;
}

__global__ void _cuda_parallel_sum(double * input, double * output)
{
    extern __shared__ double shared_data[];
    unsigned long i = blockIdx.x * blockDim.x + threadIdx.x,
                  thread_id = threadIdx.x;

    // Each thread loads one element from global to shared memory
    shared_data[thread_id] = input[i];
    __syncthreads();

    // Do reduction in shared memory
    for (unsigned long s = blockDim.x / 2; s > 0; s >>= 1) {
        if (thread_id < s) {
            shared_data[thread_id] += shared_data[thread_id + s];
        }
        __syncthreads();
    }

    // Write result for this block to global memory
    if (thread_id == 0) {
        output[blockIdx.x] = shared_data[0];
    }
}

extern "C" void launch_cuda_function(unsigned long number_of_steps, unsigned long number_of_threads,
                                     unsigned long extra_number_of_threads, unsigned long two_pow_grid_size,
                                     unsigned long two_pow_block_size, unsigned long grid_size,
                                     unsigned long block_size, double step, int size, int rank,
                                     unsigned long start_rank, double *host_sum, double *host_remaining_sum)
{
    double *device_sum, *device_block_sum;

    // Allocate device memory
    cudaMalloc((void**)&device_sum, sizeof(double) * number_of_threads);
    cudaMalloc((void **)&device_block_sum, two_pow_grid_size * sizeof(double));

    // Executing kernel
    _cuda_compute_thread_work<<<grid_size, block_size>>>(device_sum, step, size, rank, start_rank, number_of_steps);
    _cuda_parallel_sum<<<two_pow_grid_size, two_pow_block_size, two_pow_block_size*sizeof(double)>>>(device_sum, device_block_sum);

    // Transfer data back to host memory
    cudaMemcpy(host_remaining_sum, &device_sum[two_pow_grid_size * two_pow_block_size], sizeof(double) * extra_number_of_threads, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_sum, device_block_sum, two_pow_grid_size * sizeof(double), cudaMemcpyDeviceToHost);

    // Deallocate device memory
    cudaFree(device_sum);
    cudaFree(device_block_sum);
}
The above source files can be compiled and linked in the following way:
$ module load openmpi/4.1.5
$ module load cuda/12.0.0
$ mpicc -c pi_hybrid_mpi_cuda_main.c -g -O3 -lm -lrt -lstdc++ -lcudart
$ nvcc -c pi_hybrid_mpi_cuda.cu -g -O3 -lm -lrt -lstdc++ -lcudart
$ mpicc -o pi_hybrid_mpi_cuda_main pi_hybrid_mpi_cuda_main.o pi_hybrid_mpi_cuda.o -g -O3 -lm -lrt -lstdc++ -lcudart
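The executable is launched like any other MPI program, for example:

$ mpirun -np 2 ./pi_hybrid_mpi_cuda_main

Note that the code does not call cudaSetDevice, so every rank uses the default GPU of its node; placing at most one rank per GPU (for example, one rank per node) avoids contention.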
Implementation in Python (mpi4py) of the distributed-memory parallel program of the toy application is provided below:
import numpy
import time
from mpi4py import MPI

if __name__ == '__main__':
    number_of_steps = 9600000000  # Total number of rectangles to be computed (factor of 48)
    PI25DT = 3.141592653589793238462643
    sum = 0.0
    mypi = numpy.zeros(1)
    pi = numpy.zeros(1)
    mytime = numpy.zeros(1)
    maxtime = numpy.zeros(1)

    comm = MPI.COMM_WORLD

    if comm.rank == 0 and number_of_steps < comm.size:
        print("Number of steps should not be less than number of ranks.")
        exit(1)

    step = 1.0 / number_of_steps
    steps_per_rank = int(number_of_steps / comm.size)

    start = time.time()
    start_index = steps_per_rank * comm.rank
    if comm.rank == comm.size - 1:
        steps_per_rank += number_of_steps % comm.size

    for i in range(start_index, start_index + steps_per_rank):
        x = (i + 0.5) * step
        sum += 4.0 / (1.0 + x * x)

    mypi[0] = step * sum
    comm.Reduce([mypi, MPI.DOUBLE], [pi, MPI.DOUBLE], op=MPI.SUM, root=0)
    end = time.time()

    mytime[0] = end - start
    comm.Reduce([mytime, MPI.DOUBLE], [maxtime, MPI.DOUBLE], op=MPI.MAX, root=0)

    if comm.rank == 0:
        print(f"pi is approximately {pi[0]:.16f}, Error is {abs(pi[0] - PI25DT):.16f}, "
              f"number_of_steps is {number_of_steps} and took {maxtime[0]:.6f} secs")
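Assuming the script is saved as pi_mpi.py (a file name chosen here only for illustration) and that numpy and mpi4py are available in the Python environment, it can be launched in the usual MPI way, for example:

$ mpirun -np 4 python3 pi_mpi.py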