Toy Application

Pi Calculation by Numerical Integration

The value of π can be estimated mathematically by numerical integration:

\pi = \int_0^1 \frac{4}{1 + x^2}\, dx

We can approximate the integral as a sum of N rectangles:

\pi \approx \sum_{i=0}^{N-1} F(x_i)\, \Delta x, \qquad F(x) = \frac{4}{1 + x^2}

where each rectangle has width Δx = 1/N and height F(x_i) evaluated at the midpoint x_i = (i + 0.5) Δx of interval i, as demonstrated in the following diagram:
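As a quick sanity check of the method, the midpoint rule with only N = 4 rectangles already gives a rough estimate:

\Delta x = \tfrac{1}{4}, \qquad
\pi \approx \tfrac{1}{4}\,\bigl(F(0.125) + F(0.375) + F(0.625) + F(0.875)\bigr)
    \approx \tfrac{1}{4}\,(3.9385 + 3.5068 + 2.8764 + 2.2655) \approx 3.1468

The programs below use many orders of magnitude more rectangles, which makes the estimate accurate and the computation expensive enough to be worth parallelising.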

A serial program computes all the rectangles one after another on a single CPU core of a node. The node may have multiple CPU cores, but only one of them is used, as shown below:

A shared-memory parallel program distributes the computation of the rectangles among the CPU cores of a single node using threads. The rectangles assigned to each CPU core are computed simultaneously, as shown below:

A distributed-memory parallel program distributes the computation of the rectangles among the CPU cores of multiple nodes. The rectangles assigned to each CPU core are computed simultaneously, and communication between the nodes is required to combine the partial results once each node has finished its own computation, as shown below:
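All of the parallel implementations below use the same block decomposition of the rectangle indices: each worker (OpenMP thread, MPI rank, or GPU thread) handles number_of_steps / number_of_workers consecutive rectangles, and the last worker additionally takes the remainder. A minimal sketch of this index arithmetic, with worker_id and number_of_workers standing in for the thread and rank identifiers used in the concrete implementations:

double partial_pi(long number_of_steps, int worker_id, int number_of_workers) {
    double step = 1.0 / (double) number_of_steps;
    long   steps_per_worker = number_of_steps / number_of_workers;
    long   start_index = steps_per_worker * worker_id;
    double sum = 0.0;

    /* The last worker also takes the remaining rectangles */
    if (worker_id == number_of_workers - 1) {
        steps_per_worker += number_of_steps % number_of_workers;
    }

    for (long i = start_index; i < start_index + steps_per_worker; i++) {
        double x = (i + 0.5) * step;        /* midpoint of rectangle i   */
        sum += 4.0 / (1.0 + x * x);         /* F(x) = 4 / (1 + x^2)      */
    }
    return sum * step;                      /* this worker's share of pi */
}

The partial results are then combined: with an omp critical section in OpenMP, with MPI_Reduce across ranks in MPI, and with a reduction kernel plus a final host-side sum in CUDA.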

Serial Implementation

Implementations of the serial version of the toy application in several programming languages are provided below:

C

Implementation in C (pi_serial.c)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/* Reference: http://www.openmp.org/wp-content/uploads/Intro_To_OpenMP_Mattson.pdf */

#define TIME_START             clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP              clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC       t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char ** argv) {
    double 	step,
			x,
			pi,
			sum = 0.0;
    long	i,
			number_of_steps = 960000000000; // Total number of rectangles to be computed (factor of 48)
    step = 1.0 / (double) number_of_steps;

    struct timespec t1, t2;
    TIME_START;

    for (i = 0; i < number_of_steps; i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }

    TIME_STOP;

    pi = step * sum;

    printf("The value of pi with %ld steps: %f and time: %f secs\n", number_of_steps, pi, TIME_ELAPSED_SEC);

    return 0;
}

The above code can be compiled in the following way:

$ gcc -O3 -Wall  -lrt -o pi_serial pi_serial.c
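The resulting executable is run directly on a single core; with 960 billion rectangles a single core needs a considerable amount of time:

$ ./pi_serial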

Python

Implementation in Python (pi_serial.py)
import time

if __name__ == '__main__':
    sum = 0.0
    number_of_steps = 9600000000  # Total number of rectangles to be computed (factor of 48)
    step = 1.0 / number_of_steps

    start = time.time()

    for i in range(number_of_steps):
        x = (i + 0.5) * step
        sum += 4.0 / (1.0 + x * x)

    end = time.time()

    pi = step * sum

    print(f"The value of pi with {number_of_steps} steps: {pi} and time: {end - start:.6f} secs")

Parallel Implementation

Implementations of the parallel version of the toy application in various programming languages and programming models are provided below:

OpenMP

Implementation in C of the shared-memory parallel program of the toy application is provided below:

Implementation in C (pi_omp.c)
#ifdef _OPENMP
#include <omp.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/* Reference: http://www.openmp.org/wp-content/uploads/Intro_To_OpenMP_Mattson.pdf */

#define TIME_START             	clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP              	clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC       	t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char ** argv) {
    int 	number_of_threads;
    long 	number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
			start_index,
			steps_per_thread;
    double 	step,
			pi = 0.0;
    struct  timespec t1, t2;

	step = 1.0 / (double) number_of_steps;

    TIME_START;

    #pragma omp parallel private (start_index, steps_per_thread)
    { 
        long 	i;
		int		thread_id,
				omp_number_of_threads;
        double 	x,
				sum;

        thread_id = omp_get_thread_num();
        omp_number_of_threads = omp_get_num_threads();
		steps_per_thread = number_of_steps / omp_number_of_threads;

        if (thread_id == 0) {
	    	number_of_threads = omp_number_of_threads;
            if (number_of_steps < omp_number_of_threads) {
                printf("Number of steps should not be less than number of threads.\n");
                exit(1);
            }
        }
        start_index = steps_per_thread * thread_id;

		if (thread_id == omp_number_of_threads - 1) {
        	steps_per_thread += number_of_steps % omp_number_of_threads;
    	}

        for (i = start_index, sum = 0.0; i < (start_index + steps_per_thread); i++) {
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
		#pragma omp critical
	    	pi += sum*step;
    }

    TIME_STOP;

    printf("The value of pi with %ld steps, %d omp threads: %f, took %f secs\n", number_of_steps, number_of_threads, pi, TIME_ELAPSED_SEC);

    return 0;
}

The above code can be compiled in the following way:

$ gcc -fopenmp -Wall -g  -O3 -lrt -lgomp -o pi_omp pi_omp.c
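The number of OpenMP threads is controlled by the standard OMP_NUM_THREADS environment variable; for example, to use 48 threads:

$ OMP_NUM_THREADS=48 ./pi_omp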

MPI

Implementation in C of the distributed-memory parallel program of the toy application is provided below:

Implementation in C (pi_mpi.c)
#include <stdio.h>
#include <mpi.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/* Reference: http://www.mcs.anl.gov/research/projects/mpi/tutorial/mpiexmpl/src/pi/C/main.html */

#define TIME_START             	clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP              	clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC       	t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char **argv) {
    long	number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
			steps_per_rank,
			i,
			start_index;

    int		rank,
			size;
    double 	PI25DT = 3.141592653589793238462643,
    	   	mypi,
			pi,
			step,
			sum = 0.0,
			x,
			mytime,
			maxtime;
    struct  timespec t1, t2;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    step   = 1.0 / (double) number_of_steps;
    steps_per_rank = number_of_steps / size;

    if (rank == 0 && number_of_steps < size) {
        printf("Number of steps should not be less than number of ranks.\n");
        exit(1);
    }

    TIME_START;

    start_index = steps_per_rank * rank;

    if (rank == size - 1) {
        steps_per_rank += number_of_steps % size;
    }

    for (i = start_index; i < (start_index + steps_per_rank); i++) {
        x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }

    mypi = step * sum;
    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    TIME_STOP;

    mytime = TIME_ELAPSED_SEC;
    MPI_Reduce(&mytime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	
    if (rank == 0) {
        printf("pi is approximately %.16f, Error is %.16f, number_of_steps is %ld and took %f secs\n", pi, fabs(pi - PI25DT), number_of_steps, maxtime);
    }

    MPI_Finalize();
    return 0;
}

The above code can be compiled in the following way:

$ module load openmpi/4.1.5

$ mpicc -Wall -g  -O3 -lm -lrt -o pi_mpi pi_mpi.c
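The program is started with one process per MPI rank through the MPI launcher; for example, with 48 ranks (on the cluster this would normally be done inside a batch job):

$ mpirun -np 48 ./pi_mpi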

Hybrid MPI and OpenMP

Implementation in C of the hybrid parallel program that combines distributed memory (MPI) with shared memory (OpenMP) is provided below:

Implementation in C (pi_hybrid_mpi_omp.c)
#ifdef _OPENMP
#include <omp.h>
#endif
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

#define TIME_START             	clock_gettime(CLOCK_MONOTONIC_RAW, &t1)
#define TIME_STOP              	clock_gettime(CLOCK_MONOTONIC_RAW, &t2)
#define TIME_ELAPSED_SEC		t2.tv_sec - t1.tv_sec + (t2.tv_nsec - t1.tv_nsec)/1.0e9

int main(int argc, char ** argv) {
    int 	number_of_threads,
			size,
			rank;
    long 	number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
			start_index,
			steps_per_thread,
			start_rank;
    double 	step,
			mypi = 0.0,
			pi,
			mytime,
        	maxtime;
    struct  timespec t1, t2;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    step = 1.0 / (double) number_of_steps;
    start_rank = (long)(number_of_steps / size) * rank;

    TIME_START;

    #pragma omp parallel private (start_index, steps_per_thread)
    { 
        long 	i;
		int		thread_id,
				omp_number_of_threads;
        double 	x,
				sum;

        thread_id = omp_get_thread_num();
        omp_number_of_threads = omp_get_num_threads();
		steps_per_thread = (long)(number_of_steps / size) / omp_number_of_threads;

        if (thread_id == 0) {
	    	number_of_threads = omp_number_of_threads;
            if (rank == 0 && number_of_steps < size * omp_number_of_threads) {
                printf("Number of steps should not be less than (ranks * threads).\n");
                exit(1);
            }
        }

        start_index = start_rank + steps_per_thread * thread_id;

		if (thread_id == omp_number_of_threads - 1) {
        	steps_per_thread += (long)(number_of_steps / size) % omp_number_of_threads;
            if (rank == size - 1) {
                steps_per_thread += number_of_steps % size;
            }
    	}

        for (i = start_index, sum = 0.0; i < (start_index + steps_per_thread); i++) {
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
        #pragma omp critical
	    	mypi += sum*step;
    }

    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    TIME_STOP;

    mytime = TIME_ELAPSED_SEC;

    MPI_Reduce(&mytime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("The value of pi with %ld steps, %d omp threads: %f, took %f secs\n", number_of_steps, number_of_threads, pi, maxtime);
    }
    MPI_Finalize();
    return 0;
}

The above code can be compiled in the following way:

$ module load openmpi/4.1.5

$ mpicc -fopenmp -Wall -g  -O3 -lrt -lgomp -o pi_hybrid_mpi_omp pi_hybrid_mpi_omp.c
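For a hybrid run, the number of MPI ranks and the number of OpenMP threads per rank are chosen independently; for example, 2 ranks with 24 threads each (2 × 24 = 48 cores in total). With Open MPI, the -x option forwards the environment variable to ranks on remote nodes:

$ export OMP_NUM_THREADS=24
$ mpirun -np 2 -x OMP_NUM_THREADS ./pi_hybrid_mpi_omp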

GPU CUDA

Implementation in CUDA C of the GPU parallel program of the toy application is provided below:

Implementation in C (pi_cuda.cu)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>
  
#define THREADS_PER_BLOCK  1024
#define BLOCKS_PER_GRID   16384
 
__global__ void _cuda_compute_thread_work(double * device_sum, double step, unsigned long number_of_steps) {
    unsigned long    number_of_threads = gridDim.x * blockDim.x,
            	     steps_per_thread = number_of_steps / number_of_threads,
                     thread_id = (blockDim.x * blockIdx.x) + threadIdx.x,
                     start_index = thread_id * steps_per_thread;
    double           x;

    if (thread_id == number_of_threads - 1) {
        steps_per_thread += number_of_steps % number_of_threads;
    }
  
    for (unsigned long i = start_index; i < (start_index + steps_per_thread); i++) {
        x = (i + 0.5) * step;
        device_sum[thread_id] += 4.0 / (1.0 + x * x);
    }
}
  
__global__ void _cuda_parallel_sum(double * input, double * output) {
    extern __shared__ double	shared_data[];
    unsigned long               i = blockIdx.x * blockDim.x + threadIdx.x,
                                thread_id = threadIdx.x;
 
    // Each thread loads one element from global to shared memory
    shared_data[thread_id] = input[i];
    __syncthreads();
 
    // Do reduction in shared memory
    for (unsigned long s = blockDim.x / 2; s > 0; s >>= 1) {
        if (thread_id < s) {
            shared_data[thread_id] += shared_data[thread_id + s];
        }
        __syncthreads();
    }
 
    // Write result for this block to global memory
    if (thread_id == 0) {
        output[blockIdx.x] = shared_data[0];
    }
}
  
unsigned long two_pow_number(double n) {
    return pow(2, (unsigned long)(log10(n)/log10(2)));
}
  
int main(int argc, char ** argv) {
    double  		step,
            		pi,
            		*host_remaining_sum,
            		*device_sum,
            		*host_sum,
            		*device_block_sum;
    unsigned long   number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
	    		    number_of_threads = BLOCKS_PER_GRID * THREADS_PER_BLOCK,
            		two_pow_grid_size = two_pow_number(BLOCKS_PER_GRID),
            		two_pow_block_size = two_pow_number(THREADS_PER_BLOCK),
            		extra_number_of_threads = number_of_threads - two_pow_grid_size * two_pow_block_size;
    struct timeval 	t1, t2;

    if (number_of_steps < number_of_threads) {
        printf("Number of steps should not be less than number of threads.\n");
        exit(1);
    }
  
    step = 1.0 / (double) number_of_steps;
  
    // Allocate host memory
    host_remaining_sum = (double *)malloc(sizeof(double) * extra_number_of_threads);
    host_sum = (double *)malloc(two_pow_grid_size * sizeof(double));
  
    // Initialize host arrays
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_remaining_sum[i] = 0.0;
    }
    for (unsigned long i = 0; i < two_pow_grid_size; i++) {
        host_sum[i] = 0.0;
    }
  
    // Allocate device memory
    cudaMalloc((void**)&device_sum, sizeof(double) * number_of_threads);
    cudaMalloc((void **)&device_block_sum, two_pow_grid_size * sizeof(double));

    // Zero the per-thread partial sums (cudaMalloc does not initialize memory)
    cudaMemset(device_sum, 0, sizeof(double) * number_of_threads);
  
    gettimeofday(&t1, 0);
 
    // Executing kernel
    _cuda_compute_thread_work<<<BLOCKS_PER_GRID, THREADS_PER_BLOCK>>>(device_sum, step, number_of_steps);
    _cuda_parallel_sum<<<two_pow_grid_size, two_pow_block_size, two_pow_block_size*sizeof(double)>>>(device_sum, device_block_sum);
 
    // Transfer data back to host memory
    cudaMemcpy(host_remaining_sum, &device_sum[two_pow_grid_size * two_pow_block_size], sizeof(double) * extra_number_of_threads, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_sum, device_block_sum, two_pow_grid_size * sizeof(double), cudaMemcpyDeviceToHost);
 
    // Accumulates the sum in the first element
    for (unsigned long i = 1; i < two_pow_grid_size; i++) {
        host_sum[0] += host_sum[i];
    }

    // Accumulates the remaining sum in the first element
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_sum[0] += host_remaining_sum[i];
    }
 
    pi = step * host_sum[0];
    gettimeofday(&t2, 0);
  
    double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;
  
    printf("The value of pi with %ld steps: %f and time: %f secs\n", number_of_steps, pi, time);
  
    // Deallocate device memory
    cudaFree(device_sum);
    cudaFree(device_block_sum);
  
    // Deallocate host memory
    free(host_remaining_sum);
    free(host_sum);
  
    return 0;
}

The above code can be compiled in the following way:

$ module load cuda/12.0.0

$ nvcc -g  -O3 -lm -lrt -o pi_cuda pi_cuda.cu
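The executable has to be run on a node that provides an NVIDIA GPU, with the cuda module loaded:

$ ./pi_cuda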

Hybrid MPI and GPU CUDA

Implementation in C of the hybrid parallel program that combines distributed memory (MPI) with GPU offloading (CUDA) is provided below:

Implementation in C (pi_hybrid_mpi_cuda_main.c and pi_hybrid_mpi_cuda.cu)
//--------------------------
// pi_hybrid_mpi_cuda_main.c
//--------------------------

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include <math.h>

#define THREADS_PER_BLOCK  1024
#define BLOCKS_PER_GRID   16384

void launch_cuda_function(unsigned long number_of_steps, unsigned long number_of_threads, unsigned long extra_number_of_threads, unsigned long two_pow_grid_size, unsigned long two_pow_block_size, unsigned long grid_size, unsigned long block_size, double step, int size, int rank, unsigned long start_rank, double *host_sum, double*host_remaining_sum);

unsigned long two_pow_number(double n) {
    return pow(2, (unsigned long)(log10(n)/log10(2)));
}
  
int main(int argc, char ** argv) {
    int					size,
						rank;
    double  			step,
						mypi = 0.0,
            			pi,
            			*host_remaining_sum,
            			*host_sum;
    unsigned long       number_of_steps = 960000000000, // Total number of rectangles to be computed (factor of 48)
						block_size = THREADS_PER_BLOCK,
						grid_size = BLOCKS_PER_GRID,
	    				number_of_threads = grid_size * block_size,
            			two_pow_grid_size = two_pow_number(grid_size),
            			two_pow_block_size = two_pow_number(block_size),
            			extra_number_of_threads = number_of_threads - two_pow_grid_size * two_pow_block_size,
						start_rank;
    struct timeval 		t1, t2;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0 && number_of_steps < size * number_of_threads) {
        printf("Number of steps should not be less than (ranks * threads).\n");
        exit(1);
    }
  
    step = 1.0 / (double) number_of_steps;
    start_rank = (unsigned long)(number_of_steps / size) * rank;
  
    // Allocate host memory
    host_remaining_sum = (double *)malloc(sizeof(double) * extra_number_of_threads);
    host_sum = (double *)malloc(two_pow_grid_size * sizeof(double));
  
    // Initialize host arrays
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_remaining_sum[i] = 0.0;
    }
    for (unsigned long i = 0; i < two_pow_grid_size; i++) {
        host_sum[i] = 0.0;
    }

    gettimeofday(&t1, 0);

    launch_cuda_function(number_of_steps, number_of_threads, extra_number_of_threads, two_pow_grid_size, two_pow_block_size, grid_size, block_size, step, size, rank, start_rank, host_sum, host_remaining_sum);

    // Accumulates the sum in the first element
    for (unsigned long i = 1; i < two_pow_grid_size; i++) {
        host_sum[0] += host_sum[i];
    }

    // Accumulates the remaining sum in the first element
    for (unsigned long i = 0; i < extra_number_of_threads; i++) {
        host_sum[0] += host_remaining_sum[i];
    }

    mypi = step * host_sum[0];
    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    gettimeofday(&t2, 0);

    double    mytime = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0,
              maxtime;

    MPI_Reduce(&mytime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("The value of pi with %ld steps: %f and time: %f secs\n", number_of_steps, pi, maxtime);
    }

    // Deallocate host memory
    free(host_remaining_sum);
    free(host_sum);

    MPI_Finalize();  
    return 0;
}

//----------------------
// pi_hybrid_mpi_cuda.cu
//----------------------

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void _cuda_compute_thread_work(double * device_sum, double step, int size, int rank, unsigned long start_rank, unsigned long number_of_steps) {
    unsigned long    number_of_threads = gridDim.x * blockDim.x,
                     steps_per_thread = (unsigned long)(number_of_steps / size) / number_of_threads,
                     thread_id = (blockDim.x * blockIdx.x) + threadIdx.x,
                     start_index = start_rank + thread_id * steps_per_thread;
    double           x;

    if (thread_id == number_of_threads - 1) {
        steps_per_thread += (unsigned long)(number_of_steps / size) % number_of_threads;
        if (rank == size - 1) {
            steps_per_thread += number_of_steps % size;
        }
    }

    for (unsigned long i = start_index; i < (start_index + steps_per_thread); i++) {
        x = (i + 0.5) * step;
        device_sum[thread_id] += 4.0 / (1.0 + x * x);
    }
}

__global__ void _cuda_parallel_sum(double * input, double * output) {
    extern __shared__ double    shared_data[];
    unsigned long               i = blockIdx.x * blockDim.x + threadIdx.x,
                                thread_id = threadIdx.x;

    // Each thread loads one element from global to shared memory
    shared_data[thread_id] = input[i];
    __syncthreads();

    // Do reduction in shared memory
    for (unsigned long s = blockDim.x / 2; s > 0; s >>= 1) {
        if (thread_id < s) {
            shared_data[thread_id] += shared_data[thread_id + s];
        }
        __syncthreads();
    }

    // Write result for this block to global memory
    if (thread_id == 0) {
        output[blockIdx.x] = shared_data[0];
    }
}

extern "C" void launch_cuda_function(unsigned long number_of_steps, unsigned long number_of_threads, unsigned long extra_number_of_threads, unsigned long two_pow_grid_size, unsigned long two_pow_block_size, unsigned long grid_size, unsigned long block_size, double step, int size, int rank, unsigned long start_rank, double *host_sum, double*host_remaining_sum) {
    double    *device_sum,
              *device_block_sum;

    // Allocate device memory
    cudaMalloc((void**)&device_sum, sizeof(double) * number_of_threads);
    cudaMalloc((void **)&device_block_sum, two_pow_grid_size * sizeof(double));

    // Zero the per-thread partial sums (cudaMalloc does not initialize memory)
    cudaMemset(device_sum, 0, sizeof(double) * number_of_threads);

    // Executing kernel
    _cuda_compute_thread_work<<<grid_size, block_size>>>(device_sum, step, size, rank, start_rank, number_of_steps);
    _cuda_parallel_sum<<<two_pow_grid_size, two_pow_block_size, two_pow_block_size*sizeof(double)>>>(device_sum, device_block_sum);

    // Transfer data back to host memory
    cudaMemcpy(host_remaining_sum, &device_sum[two_pow_grid_size * two_pow_block_size], sizeof(double) * extra_number_of_threads, cudaMemcpyDeviceToHost);
    cudaMemcpy(host_sum, device_block_sum, two_pow_grid_size * sizeof(double), cudaMemcpyDeviceToHost);

    // Deallocate device memory
    cudaFree(device_sum);
    cudaFree(device_block_sum);
}

The above codes can be compiled in the following way:

$ module load openmpi/4.1.5
$ module load cuda/12.0.0

$ mpicc -c pi_hybrid_mpi_cuda_main.c -g  -O3 -lm -lrt -lstdc++ -lcudart
$ nvcc -c pi_hybrid_mpi_cuda.cu -g  -O3 -lm -lrt -lstdc++ -lcudart
$ mpicc -o pi_hybrid_mpi_cuda_main pi_hybrid_mpi_cuda_main.o pi_hybrid_mpi_cuda.o -g  -O3 -lm -lrt -lstdc++ -lcudart
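A hybrid MPI + CUDA run starts one MPI rank per GPU; since the code does not call cudaSetDevice, every rank uses the default GPU of the node it is placed on, so the ranks should be distributed over different GPU nodes. For example, with 2 ranks:

$ mpirun -np 2 ./pi_hybrid_mpi_cuda_main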

Python MPI

Implementation in Python of the distributed-memory parallel program of the toy application is provided below:

Implementation in Python (pi_mpi.py)
import numpy
import time
from mpi4py import MPI

if __name__ == '__main__':
    number_of_steps = 9600000000  # Total number of rectangles to be computed (factor of 48)
    PI25DT = 3.141592653589793238462643

    sum = 0.0

    mypi = numpy.zeros(1)
    pi = numpy.zeros(1)
    mytime = numpy.zeros(1)
    maxtime = numpy.zeros(1)

    comm = MPI.COMM_WORLD

    if comm.rank == 0 and number_of_steps < comm.size:
        print("Number of steps should not be less than number of ranks.")
        exit(1)

    step = 1.0 / number_of_steps
    steps_per_rank = number_of_steps // comm.size

    start = time.time()

    start_index = steps_per_rank * comm.rank

    if comm.rank == comm.size - 1:
        steps_per_rank += number_of_steps % comm.size

    for i in range(start_index, start_index + steps_per_rank):
        x = (i + 0.5) * step
        sum += 4.0 / (1.0 + x * x)

    mypi[0] = step * sum

    comm.Reduce([mypi, MPI.DOUBLE], [pi, MPI.DOUBLE], op = MPI.SUM, root = 0)

    end = time.time()

    mytime[0] = end - start
    comm.Reduce([mytime, MPI.DOUBLE], [maxtime, MPI.DOUBLE], op = MPI.MAX, root = 0)

    if comm.rank == 0:
        print(f"pi is approximately {pi[0]:.16f}, Error is {pi[0] - PI25DT:.16f}, number_of_steps is {number_of_steps} and took {maxtime[0]:.6f} secs")