Exercise 2 solution


For C programmers, two different solutions are presented. What are the main differences? (answer)
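A hint at the answer, as can be read directly from the two listings: solution 1 opens a single #pragma omp parallel region in main and calls routines containing orphaned worksharing directives (#pragma omp for, #pragma omp single), so one thread team survives the whole time loop; solution 2 puts combined #pragma omp parallel for directives inside each routine, so a team is forked and joined at every loop. A minimal sketch of the two patterns, with hypothetical names (not the routines used below):

/* sketch only: the two OpenMP structures used by the C solutions below */
#define N 1000

/* solution-1 style: orphaned worksharing; binds to the caller's parallel region */
void step_orphaned(float *a){
  #pragma omp for
  for(int i=0;i<N;++i) a[i] += 1.0f;
}

/* solution-2 style: combined directive; forks and joins a team at every call */
void step_combined(float *a){
  #pragma omp parallel for
  for(int i=0;i<N;++i) a[i] += 1.0f;
}

void time_loop(float *a){
  /* solution 1: one team is created here and reused for all iterations;
     every thread executes the time loop and shares the work inside the call */
  #pragma omp parallel
  for(int it=0;it<500;++it)
    step_orphaned(a);

  /* solution 2: the serial loop below creates a new team inside every call */
  for(int it=0;it<500;++it)
    step_combined(a);
}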
Other solutions are also possible, depending on the thread-support level requested in MPI_Init_thread (a sketch follows this list):
MPI_THREAD_SINGLE
MPI_THREAD_FUNNELED
MPI_THREAD_MULTIPLE
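As a minimal sketch (an assumption for illustration, not part of the original solutions, which request MPI_THREAD_SERIALIZED), a variant could request a different level, e.g. MPI_THREAD_FUNNELED, and verify what the library actually provides before relying on it:

/* sketch only: request a thread-support level and check what is granted */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]){
  int requested = MPI_THREAD_FUNNELED; /* only the master thread will call MPI */
  int provided;

  MPI_Init_thread(&argc, &argv, requested, &provided);

  /* the MPI_THREAD_* levels are ordered, so a simple comparison is enough */
  if (provided < requested) {
    fprintf(stderr, "thread support too low: requested %d, provided %d\n",
            requested, provided);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  /* ... hybrid MPI+OpenMP work goes here ... */

  MPI_Finalize();
  return 0;
}

With MPI_THREAD_SINGLE the process promises to stay single-threaded; with MPI_THREAD_FUNNELED the communication in update_boundaries_FLAT would have to be issued by the master thread only (e.g. #pragma omp master plus explicit barriers) instead of by an arbitrary thread inside #pragma omp single; with MPI_THREAD_MULTIPLE any thread may call MPI concurrently.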


C solution 1

/*
* It evolves the equation:
* u,t + u,x + u,y = 0
* Using a Lax scheme.
* The initial data is a cruddy gaussian.
* Boundaries are flat: copying the value of the neighbour
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>

#include <math.h>
#include <mpi.h>
#include <omp.h>

#define NX 100
#define NY 100
#define LX 2.0
#define LY 2.0

#define sigma 0.1
#define tmax 100

/* conversions from discrete to real coordinates */
float ix2x(int ix){
return ((ix - 1) - (NX-1) / 2.0)*LX/(NX-1);
}

/*
* THE FOLLOWING CHANGES: every processor has a different offset.
* I also pass proc_y as an argument instead of proc_me since it will
* become useful when saving the output files.
*/

float iy2y(int iy, int proc_y, int nprocs){
return (iy-1 - (NY-1) / 2.0 + proc_y*(NY / nprocs) + (proc_y < NY % nprocs? proc_y:NY % nprocs)) * LY/(NY-1);
}

/* utility function for checking the results: it computes the sum of the local array values */
float summa(int nx, int ny, float* val){
float summma=0.0;
int ix,iy;
for(iy=1;iy<=ny;++iy)
for(ix=1;ix<=nx;++ix){
summma+=val[((nx+2)*iy)+ix];
}
return(summma);
}

/*
* initialize the system with a gaussian temperature distribution
*/
int init_transport(float *temp, int NLY, int proc_me, int nprocs){
int ix,iy;
float x,y;

for(iy=1;iy<NLY+1;++iy)
for(ix=1;ix<NX+1;++ix){
x=ix2x(ix);
y=iy2y(iy,proc_me, nprocs);
temp[((NX+2)*iy)+ix] = tmax*exp((-((x*x)+(y*y)))/(2.0*(sigma*sigma)));
}
return 0;
}

/*
* save the temperature distribution
* the ascii format is suitable for the gnuplot splot function
*/
int save_gnuplot(char *filename, float *temp, int NLY, int proc_me, int nprocs){
float *buf;
int ix, iy, iproc=0;
FILE *fp;

MPI_Status status;

if(proc_me == 0){

buf = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
fp = fopen(filename, "w");

for(iy=1;iy<=NLY;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, proc_me, nprocs), temp[((NX+2)*iy)+ix]);
}
fprintf(fp, "\n");
}

for(iproc=1; iproc<nprocs; iproc++){
int nly, count;
MPI_Recv(buf, (NX+2)*(NLY+2), MPI_FLOAT, iproc, 0, MPI_COMM_WORLD, &status);
MPI_Get_count( &status, MPI_FLOAT, &count );

nly = count / (NX + 2) - 2;
for(iy=1;iy<=nly;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, iproc, nprocs), buf[((NX+2)*iy)+ix]);
}

fprintf(fp, "\n");
}
}
free(buf);
fclose(fp);
}

else{
MPI_Send(temp, (NX+2)*(NLY+2), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}

return 0;
}

int update_boundaries_FLAT(float *temp, int NLY, int nprocs, int proc_me, int proc_up, int proc_down){
MPI_Status status, status1;
int iy=0, ix=0;

#pragma omp for
for(iy=1;iy<=NLY;++iy){
temp[(NX+2)*iy] = temp[((NX+2)*iy)+1];
temp[((NX+2)*iy)+(NX+1)] = temp[((NX+2)*iy)+NX];
}

/*
* only the lowest has the lower boundary condition
*/
if (proc_me==0)
#pragma omp for
for(ix=0;ix<=NX+1;++ix) temp[ix] = temp[(NX+2)+ix];

/*
* only the highest has the upper boundary condition
*/
if (proc_me==nprocs-1)
#pragma omp for
for(ix=0;ix<=NX+1;++ix) temp[((NX+2)*(NLY+1))+ix] = temp[((NX+2)*(NLY))+ix];

// the omp for constructs above end with an implicit barrier, so no extra #pragma omp barrier is needed here
#pragma omp single
{
/*
*communicate the ghost-cells
*
*
* lower-down
*/
MPI_Sendrecv(&temp[(NX+2)+1], NX, MPI_FLOAT, proc_down, 0, &temp[((NX+2)*(NLY+1))+1], NX, MPI_FLOAT, proc_up,
0, MPI_COMM_WORLD, &status);

/*
* higher-up
*/
MPI_Sendrecv(&temp[((NX+2)*(NLY))+1], NX, MPI_FLOAT, proc_up, 0, &temp[1], NX, MPI_FLOAT, proc_down,
0, MPI_COMM_WORLD, &status1);
}
return 0;
}

int evolve(float dtfact, float *temp, float *temp_new, int NLY) {
float dx, dt;
int ix,iy;
float temp0;

dx = 2*LX/NX;
dt = dtfact*dx/sqrt(3.0);

#pragma omp for
for(iy=1;iy<=NLY;++iy)
for(ix=1;ix<=NX;++ix){
temp0 = temp[((NX+2)*iy)+ix];
temp_new[((NX+2)*iy)+ix] = temp0-0.5*dt*(temp[((NX+2)*(iy+1))+ix]-temp[((NX+2)*(iy-1))+ix]
+temp[((NX+2)*iy)+(ix+1)]-temp[((NX+2)*iy)+(ix-1)])/dx;
}

#pragma omp for
for(iy=0;iy<NLY+2;++iy)
for(ix=0;ix<NX+2;++ix)
temp[((NX+2)*iy)+ix] = temp_new[((NX+2)*iy)+ix];

return 0;
}

int main(int argc, char* argv[]){
int nprocs, proc_me, proc_up, proc_down;

int NLY;
float *temp, *temp_new,before,after;
int requested,provided;

requested=MPI_THREAD_SERIALIZED;
MPI_Init_thread( &argc, &argv, requested, &provided);

MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &proc_me);

/* Check against a possible run with nprocs > NY */
if ( NY < nprocs ) {
if (proc_me==0) {
printf("\nTrying to run with %d processes, while the maximum allowed size is %d (NY)\n",nprocs,NY);
}

MPI_Finalize();
return 1;
}

/*
* all the communications from/to MPI_PROC_NULL do nothing
*/
proc_down = proc_me-1;
proc_up = proc_me+1;

if(proc_down < 0)
proc_down = MPI_PROC_NULL;

if(proc_up == nprocs) proc_up = MPI_PROC_NULL;

NLY = NY/nprocs;
if (proc_me < NY % nprocs)
NLY++;

temp = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
temp_new = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));

init_transport(temp, NLY, proc_me, nprocs);
#pragma omp parallel
{
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);

#pragma omp master
{
float tbefore =summa(NX, NLY, temp);
MPI_Reduce(&tbefore, &before, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp before: %f\n",before);
}
}
}

save_gnuplot("transport.dat", temp, NLY, proc_me, nprocs);

#pragma omp parallel
{
printf("process %d thread %d\n", proc_me, omp_get_thread_num());
int i;
for(i=1;i<=500;++i){
evolve(0.1, temp, temp_new, NLY);
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);
}
}

save_gnuplot("transport_end.dat", temp, NLY, proc_me, nprocs);

float tafter =summa(NX, NLY, temp);
MPI_Reduce(&tafter, &after, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp after: %f\n",after);
}

free(temp);
free(temp_new);
MPI_Finalize();

return 0;
}

 

C solution 2

/*
* It evolves the equation:
* u,t + u,x + u,y = 0
* Using a Lax scheme.
* The initial data is a cruddy gaussian.
* Boundaries are flat: copying the value of the neighbour
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>

#include <math.h>
#include <mpi.h>
#include "omp.h"

#define NX 100
#define NY 100
#define LX 2.0
#define LY 2.0

#define sigma 0.1
#define tmax 100

/* conversions from discrete to real coordinates */
float ix2x(int ix){
return ((ix - 1) - (NX-1) / 2.0)*LX/(NX-1);
}

/*
* THE FOLLOWING CHANGES: every processor has a different offset.
* I also pass proc_y as an argument instead of proc_me since it will
* become useful when saving the output files.
*/

float iy2y(int iy, int proc_y, int nprocs){
return (iy-1 - (NY-1) / 2.0 + proc_y*(NY / nprocs) + (proc_y < NY % nprocs? proc_y:NY % nprocs)) * LY/(NY-1);
}

/* utility function for checking the results: it computes the sum of the local array values */
float summa(int nx, int ny, float* val){
float summma=0.0;
int ix,iy;
for(iy=1;iy<=ny;++iy)
for(ix=1;ix<=nx;++ix){
summma+=val[((nx+2)*iy)+ix];
}
return(summma);
}

/*
* initialize the system with a gaussian temperature distribution
*/
int init_transport(float *temp, int NLY, int proc_me, int nprocs){
int ix,iy;
float x,y;

for(iy=1;iy<NLY+1;++iy)
for(ix=1;ix<NX+1;++ix){
x=ix2x(ix);
y=iy2y(iy,proc_me, nprocs);
temp[((NX+2)*iy)+ix] = tmax*exp((-((x*x)+(y*y)))/(2.0*(sigma*sigma)));
}
return 0;
}

/*
* save the temperature distribution
* the ascii format is suitable for the gnuplot splot function
*/
int save_gnuplot(char *filename, float *temp, int NLY, int proc_me, int nprocs){
float *buf;
int ix, iy, iproc=0;
FILE *fp;

MPI_Status status;

if(proc_me == 0){

buf = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
fp = fopen(filename, "w");

for(iy=1;iy<=NLY;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, proc_me, nprocs), temp[((NX+2)*iy)+ix]);
}
fprintf(fp, "\n");
}

for(iproc=1; iproc<nprocs; iproc++){
int nly, count;
MPI_Recv(buf, (NX+2)*(NLY+2), MPI_FLOAT, iproc, 0, MPI_COMM_WORLD, &status);
MPI_Get_count( &status, MPI_FLOAT, &count );

nly = count / (NX + 2) - 2;
for(iy=1;iy<=nly;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, iproc, nprocs), buf[((NX+2)*iy)+ix]);
}

fprintf(fp, "\n");
}
}
free(buf);
fclose(fp);
}

else{
MPI_Send(temp, (NX+2)*(NLY+2), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}

return 0;
}

int update_boundaries_FLAT(float *temp, int NLY, int nprocs, int proc_me, int proc_up, int proc_down){
MPI_Status status, status1;
int iy=0, ix=0;

#pragma omp parallel for
for(iy=1;iy<=NLY;++iy){
temp[(NX+2)*iy] = temp[((NX+2)*iy)+1];
temp[((NX+2)*iy)+(NX+1)] = temp[((NX+2)*iy)+NX];
}

/*
* only the lowest has the lower boundary condition
*/
if (proc_me==0)
#pragma omp parallel for
for(ix=0;ix<=NX+1;++ix) temp[ix] = temp[(NX+2)+ix];

/*
* only the highest has the upper boundary condition
*/
if (proc_me==nprocs-1)
#pragma omp parallel for
for(ix=0;ix<=NX+1;++ix) temp[((NX+2)*(NLY+1))+ix] = temp[((NX+2)*(NLY))+ix];


// no extra #pragma omp barrier is needed here: the parallel for constructs above already end with an implicit barrier
#pragma omp single
{
/*
*communicate the ghost-cells
*
* lower-down
*/
MPI_Sendrecv(&temp[(NX+2)+1], NX, MPI_FLOAT, proc_down, 0, &temp[((NX+2)*(NLY+1))+1], NX, MPI_FLOAT, proc_up,
0, MPI_COMM_WORLD, &status);

/*
* higher-up
*/
MPI_Sendrecv(&temp[((NX+2)*(NLY))+1], NX, MPI_FLOAT, proc_up, 0, &temp[1], NX, MPI_FLOAT, proc_down,
0, MPI_COMM_WORLD, &status1);
}
return 0;
}

int evolve(float dtfact, float *temp, float *temp_new, int NLY){
float dx, dt;

dx = 2*LX/NX;
dt = dtfact*dx/sqrt(3.0);

#pragma omp parallel for
for(int iy=1;iy<=NLY;++iy)
for(int ix=1;ix<=NX;++ix){
float temp0 = temp[((NX+2)*iy)+ix];
temp_new[((NX+2)*iy)+ix] = temp0-0.5*dt*(temp[((NX+2)*(iy+1))+ix]-temp[((NX+2)*(iy-1))+ix]
+temp[((NX+2)*iy)+(ix+1)]-temp[((NX+2)*iy)+(ix-1)])/dx;
}

#pragma omp parallel for
for(int iy=0;iy<NLY+2;++iy)
for(int ix=0;ix<NX+2;++ix)
temp[((NX+2)*iy)+ix] = temp_new[((NX+2)*iy)+ix];

return 0;
}


int main(int argc, char* argv[]){

int nprocs, proc_me, proc_up, proc_down;

int NLY;
float *temp, *temp_new,before,after;
int requested,provided;

requested=MPI_THREAD_SERIALIZED;
MPI_Init_thread( &argc, &argv, requested, &provided);


MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &proc_me);

/* Check against a possible run with nprocs > NY */
if ( NY < nprocs ) {
if (proc_me==0) {
printf("\nTrying to run with %d processes, while the maximum allowed size is %d (NY)\n",nprocs,NY);
}

MPI_Finalize();
return 1;
}

/*
* all the communications from/to MPI_PROC_NULL do nothing
*/

proc_down = proc_me-1;
proc_up = proc_me+1;

if(proc_down < 0)
proc_down = MPI_PROC_NULL;

if(proc_up == nprocs) proc_up = MPI_PROC_NULL;

NLY = NY/nprocs;
if (proc_me < NY % nprocs)
NLY++;

temp = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
temp_new = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));

init_transport(temp, NLY, proc_me, nprocs);
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);

#pragma omp master
{
float tbefore =summa(NX, NLY, temp);
MPI_Reduce(&tbefore, &before, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp before: %f\n",before);
}
}

save_gnuplot("transport.dat", temp, NLY, proc_me, nprocs);

printf("process %d thread %d\n", proc_me, omp_get_thread_num());
int i;
for(i=1;i<=500;++i){
evolve(0.1, temp, temp_new, NLY);
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);
}

save_gnuplot("transport_end.dat", temp, NLY, proc_me, nprocs);

float tafter =summa(NX, NLY, temp);
MPI_Reduce(&tafter, &after, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp after: %f\n",after);
}

free(temp);
free(temp_new);
MPI_Finalize();

return 0;
}
 


Fortran 

!
! It evolves the equation:
! u,t + u,x + u,y = 0
! Using a Lax scheme.
! The initial data is a cruddy gaussian.
! Boundaries are flat: copying the value of the neighbour
!
! parallel WITHOUT TOPOLOGIES: distributed by columns
! dynamic allocation to decide the size to allocate at run time
! no remainder handling, for teaching purposes

module comms
implicit none
save
include 'mpif.h'

integer :: nprocs
integer :: proc_me,proc_up,proc_down

CONTAINS

subroutine INIT_COMMS
integer ierr

call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,proc_me,ierr)

proc_up = proc_me + 1
proc_down = proc_me - 1

! all the communications from/to MPI_PROC_NULL do nothing
if (proc_down < 0) proc_down=MPI_PROC_NULL
if (proc_up >= nprocs) proc_up=MPI_PROC_NULL

end subroutine INIT_COMMS
end module comms

module transport
use comms
implicit none
save

integer, parameter :: NX=100
integer, parameter :: NY=100
real, parameter :: LX=2.0
real, parameter :: LY=2.0

integer :: NLY

real,allocatable :: temp(:,:)
real,allocatable :: temp_new(:,:)

CONTAINS
!conversions from discrete to real coordinates
real function ix2x(ix)
integer ix
ix2x = ((ix-1)-(NX-1)/2.0)*LX/(NX-1)
end function ix2x

! THE FOLLOWING CHANGES: every processor has a different offset.
! I also pass proc_y as an argument instead of proc_me since it will
! become useful when saving the output files.
real function iy2y(iy,proc_y)

integer proc_y
integer iy

if (proc_y < mod(NY,nprocs)) then
iy2y = (iy-1 - (NY-1)/2.0 + proc_y*(NY/nprocs) + proc_y) * LY/(NY-1)
else
iy2y = (iy-1 - (NY-1)/2.0 + proc_y*(NY/nprocs) + mod(NY,nprocs)) * LY/(NY-1)
endif

end function iy2y

! initialize the system with a gaussian temperature distribution
! in parallel, allocate the system too
subroutine init_transport
use comms

integer ix,iy
real x,y
real,parameter :: sigma = 0.1
real,parameter :: tmax = 100

NLY = NY/nprocs
if(proc_me < mod(NY,nprocs)) NLY=NLY+1

allocate ( temp(0:NX+1 , 0:NLY+1) )
allocate ( temp_new(0:NX+1 , 0:NLY+1) )

!! DO loops on local indices only
do iy=1,NLY
do ix=1,NX
x=ix2x(ix)
y=iy2y(iy,proc_me)
temp(ix,iy) = tmax*exp(-(x**2+y**2)/(2.0*sigma**2))
enddo
enddo

end subroutine init_transport

! save the temperature distribution
! the ascii format is suitable for the gnuplot splot function
! collective, use temp_new as temporary buffer
subroutine save_gnuplot(filename)
character(len=*) filename
integer ix,iy
integer iproc
integer ierr
integer status(MPI_STATUS_SIZE)

if (proc_me == 0) then
open(unit=20,file=filename,form='formatted')

do iy=1,NLY
do ix=1,NX
write(20,*) ix2x(ix),iy2y(iy,0),temp(ix,iy)
enddo
write(20,*)
enddo

do iproc=1,nprocs-1
call MPI_RECV(temp_new,(NX+2)*(NLY+2),MPI_REAL,iproc,0, &
MPI_COMM_WORLD,status,ierr)

do iy=1,NLY
do ix=1,NX
write(20,*) ix2x(ix),iy2y(iy,iproc),temp_new(ix,iy)
enddo
write(20,*)
enddo
enddo
close(20)

else
call MPI_SEND(temp,(NX+2)*(NLY+2),MPI_REAL,0,0,MPI_COMM_WORLD,ierr)
endif

end subroutine save_gnuplot

!! NY => NLY
subroutine update_boundaries_FLAT
integer status(MPI_STATUS_SIZE)
integer ierr

!$OMP WORKSHARE
temp(0 , 1:NLY) = temp(1 , 1:NLY)
temp(NX+1, 1:NLY) = temp(NX , 1:NLY)
!$OMP END WORKSHARE

!! only the lowest has the lower boundary condition
if (proc_me==0) then
!$OMP WORKSHARE
temp(1:NX , 0) = temp(1:NX , 1)
!$OMP END WORKSHARE
endif

!! only the highest has the upper boundary condition
if (proc_me==nprocs-1) then
!$OMP WORKSHARE
temp(1:NX , NLY+1) = temp(1:NX , NLY)
!$OMP END WORKSHARE
endif

!$OMP SINGLE
!! communicate the ghost-cells
!! lower-down
call MPI_SENDRECV(temp(1,1),NX,MPI_REAL,proc_down,0, &
temp(1,NLY+1),NX,MPI_REAL,proc_up,0,MPI_COMM_WORLD,status,ierr)
!! higher-up
call MPI_SENDRECV(temp(1,NLY),NX,MPI_REAL,proc_up,0, &
temp(1,0),NX,MPI_REAL,proc_down,0,MPI_COMM_WORLD,status,ierr)
!$OMP END SINGLE

end subroutine update_boundaries_FLAT

subroutine evolve(dtfact)
real dtfact
real dx,dt
integer ix,iy
real temp0

dx = 2*LX/NX
dt = dtfact*dx/sqrt(3.0)

!$OMP DO
do iy=1,NLY
do ix=1,NX

temp0 = temp(ix,iy)
temp_new(ix,iy) = temp0 - 0.5 * dt * &
(temp(ix+1,iy)-temp(ix-1,iy)+temp(ix,iy+1)-temp(ix,iy-1)) / dx

enddo
enddo
!$OMP END DO

!$OMP WORKSHARE
temp = temp_new
!$OMP END WORKSHARE

end subroutine evolve
end module transport


program prova
use transport
use comms
use omp_lib
implicit none

integer i
integer ierr
integer provided
real before,tbefore,after,tafter

call MPI_INIT_THREAD( MPI_THREAD_SERIALIZED, provided, ierr)
call init_comms

call init_transport
!$OMP PARALLEL
call update_boundaries_FLAT
!$OMP END PARALLEL

tbefore=sum(temp(1:NX,1:NLY))
call MPI_Reduce(tbefore,before,1,MPI_REAL,MPI_SUM,0,MPI_COMM_WORLD,ierr)
if (proc_me==0) write(*,*) 'sum temp before',before

call save_gnuplot('transport.dat')

!$OMP PARALLEL
write (*,*) "process", proc_me, " thread ", omp_get_thread_num()
do i=1,500
call evolve(0.1)
call update_boundaries_FLAT
enddo
!$OMP END PARALLEL

call save_gnuplot('transport_end.dat')

tafter=sum(temp(1:NX,1:NLY))
call MPI_Reduce(tafter,after,1,MPI_REAL,MPI_SUM,0,MPI_COMM_WORLD,ierr)
if (proc_me==0) write(*,*) 'sum temp after',after

call MPI_FINALIZE(ierr)

end program prova

 

Jobscript template (FERMI)

#!/bin/bash
# @ job_type=bluegene
# @ bg_size=64
# @ wall_clock_limit = 00:10:00
# @ account_no =
# @ queue
module load bgq-gnu zlib
export OMP_NUM_THREADS=64

runjob --np 25 --ranks-per-node 1 --env-all : ./transport-hybrid.x

 

Jobscript template (PLX)

#!/bin/bash
#PBS -N transhyb
#PBS -l select=2:ncpus=8:mpiprocs=4
#PBS -l walltime=0:10:00
#PBS -A
#PBS -q debug

cd $PBS_O_WORKDIR
module load autoload openmpi/1.4.5-mt--intel--11.1--binary

mpirun -n 8 ./transport-hybrid.x