Exercise 2 solution


For C programmers, two different solutions are presented. What are the main differences? (answer)
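A hint at the answer, as can be read directly from the two listings: solution 1 opens a single #pragma omp parallel region in main and calls routines containing orphaned worksharing directives (#pragma omp for, #pragma omp single), so one thread team survives the whole time loop; solution 2 puts combined #pragma omp parallel for directives inside each routine, so a team is forked and joined at every loop. A minimal sketch of the two patterns, with hypothetical names (not the routines used below):

/* sketch only: the two OpenMP structures used by the C solutions below */
#define N 1000

/* solution-1 style: orphaned worksharing; binds to the caller's parallel region */
void step_orphaned(float *a){
  #pragma omp for
  for(int i=0;i<N;++i) a[i] += 1.0f;
}

/* solution-2 style: combined directive; forks and joins a team at every call */
void step_combined(float *a){
  #pragma omp parallel for
  for(int i=0;i<N;++i) a[i] += 1.0f;
}

void time_loop(float *a){
  /* solution 1: one team is created here and reused for all iterations;
     every thread executes the time loop and shares the work inside the call */
  #pragma omp parallel
  for(int it=0;it<500;++it)
    step_orphaned(a);

  /* solution 2: the serial loop below creates a new team inside every call */
  for(int it=0;it<500;++it)
    step_combined(a);
}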
Other solutions are also possible, depending on the thread-support level requested in MPI_Init_thread (a sketch follows this list):
MPI_THREAD_SINGLE
MPI_THREAD_FUNNELED
MPI_THREAD_MULTIPLE
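As a minimal sketch (an assumption for illustration, not part of the original solutions, which request MPI_THREAD_SERIALIZED), a variant could request a different level, e.g. MPI_THREAD_FUNNELED, and verify what the library actually provides before relying on it:

/* sketch only: request a thread-support level and check what is granted */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]){
  int requested = MPI_THREAD_FUNNELED; /* only the master thread will call MPI */
  int provided;

  MPI_Init_thread(&argc, &argv, requested, &provided);

  /* the MPI_THREAD_* levels are ordered, so a simple comparison is enough */
  if (provided < requested) {
    fprintf(stderr, "thread support too low: requested %d, provided %d\n",
            requested, provided);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  /* ... hybrid MPI+OpenMP work goes here ... */

  MPI_Finalize();
  return 0;
}

With MPI_THREAD_SINGLE the process promises to stay single-threaded; with MPI_THREAD_FUNNELED the communication in update_boundaries_FLAT would have to be issued by the master thread only (e.g. #pragma omp master plus explicit barriers) instead of by an arbitrary thread inside #pragma omp single; with MPI_THREAD_MULTIPLE any thread may call MPI concurrently.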


C solution 1

/*
* It evolves the equation:
* u,t + u,x + u,y = 0
* Using a Lax scheme.
* The initial data is a cruddy gaussian.
* Boundaries are flat: copying the value of the neighbour
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>

#include <math.h>
#include <mpi.h>
#include <omp.h>

#define NX 100
#define NY 100
#define LX 2.0
#define LY 2.0

#define sigma 0.1
#define tmax 100

/* conversions from discrete to real coordinates */
float ix2x(int ix){
return ((ix - 1) - (NX-1) / 2.0)*LX/(NX-1);
}

/*
* THE FOLLOWING CHANGES: every processor has a different offset.
* I also pass proc_y as an argument instead of proc_me since it will
* become useful when saving the output files.
*/

float iy2y(int iy, int proc_y, int nprocs){
return (iy-1 - (NY-1) / 2.0 + proc_y*(NY / nprocs) + (proc_y < NY % nprocs? proc_y:NY % nprocs)) * LY/(NY-1);
}

/* utility function for checking the results: it computes the sum of the local array values */
float summa(int nx, int ny, float* val){
float summma=0.0;
int ix,iy;
for(iy=1;iy<=ny;++iy)
for(ix=1;ix<=nx;++ix){
summma+=val[((nx+2)*iy)+ix];
}
return(summma);
}

/*
* initialize the system with a gaussian temperature distribution
*/
int init_transport(float *temp, int NLY, int proc_me, int nprocs){
int ix,iy;
float x,y;

for(iy=1;iy<NLY+1;++iy)
for(ix=1;ix<NX+1;++ix){
x=ix2x(ix);
y=iy2y(iy,proc_me, nprocs);
temp[((NX+2)*iy)+ix] = tmax*exp((-((x*x)+(y*y)))/(2.0*(sigma*sigma)));
}
return 0;
}

/*
* save the temperature distribution
* the ascii format is suitable for the gnuplot splot function
*/
int save_gnuplot(char *filename, float *temp, int NLY, int proc_me, int nprocs){
float *buf;
int ix, iy, iproc=0;
FILE *fp;

MPI_Status status;

if(proc_me == 0){

buf = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
fp = fopen(filename, "w");

for(iy=1;iy<=NLY;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, proc_me, nprocs), temp[((NX+2)*iy)+ix]);
}
fprintf(fp, "\n");
}

for(iproc=1; iproc<nprocs; iproc++){
int nly, count;
MPI_Recv(buf, (NX+2)*(NLY+2), MPI_FLOAT, iproc, 0, MPI_COMM_WORLD, &status);
MPI_Get_count( &status, MPI_FLOAT, &count );

nly = count / (NX + 2) - 2;
for(iy=1;iy<=nly;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, iproc, nprocs), buf[((NX+2)*iy)+ix]);
}

fprintf(fp, "\n");
}
}
free(buf);
fclose(fp);
}

else{
MPI_Send(temp, (NX+2)*(NLY+2), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}

return 0;
}

int update_boundaries_FLAT(float *temp, int NLY, int nprocs, int proc_me, int proc_up, int proc_down){
MPI_Status status, status1;
int iy=0, ix=0;

#pragma omp for
for(iy=1;iy<=NLY;++iy){
temp[(NX+2)*iy] = temp[((NX+2)*iy)+1];
temp[((NX+2)*iy)+(NX+1)] = temp[((NX+2)*iy)+NX];
}

/*
* only the lowest has the lower boundary condition
*/
if (proc_me==0)
#pragma omp for
for(ix=0;ix<=NX+1;++ix) temp[ix] = temp[(NX+2)+ix];

/*
* only the highest has the upper boundary condition
*/
if (proc_me==nprocs-1)
#pragma omp for
for(ix=0;ix<=NX+1;++ix) temp[((NX+2)*(NLY+1))+ix] = temp[((NX+2)*(NLY))+ix];

// the omp for constructs above end with an implicit barrier, so no extra #pragma omp barrier is needed here
#pragma omp single
{
/*
*communicate the ghost-cells
*
*
* lower-down
*/
MPI_Sendrecv(&temp[(NX+2)+1], NX, MPI_FLOAT, proc_down, 0, &temp[((NX+2)*(NLY+1))+1], NX, MPI_FLOAT, proc_up,
0, MPI_COMM_WORLD, &status);

/*
* higher-up
*/
MPI_Sendrecv(&temp[((NX+2)*(NLY))+1], NX, MPI_FLOAT, proc_up, 0, &temp[1], NX, MPI_FLOAT, proc_down,
0, MPI_COMM_WORLD, &status1);
}
return 0;
}

int evolve(float dtfact, float *temp, float *temp_new, int NLY) {
float dx, dt;
int ix,iy;
float temp0;

dx = 2*LX/NX;
dt = dtfact*dx/sqrt(3.0);

#pragma omp for
for(iy=1;iy<=NLY;++iy)
for(ix=1;ix<=NX;++ix){
temp0 = temp[((NX+2)*iy)+ix];
temp_new[((NX+2)*iy)+ix] = temp0-0.5*dt*(temp[((NX+2)*(iy+1))+ix]-temp[((NX+2)*(iy-1))+ix]
+temp[((NX+2)*iy)+(ix+1)]-temp[((NX+2)*iy)+(ix-1)])/dx;
}

#pragma omp for
for(iy=0;iy<NLY+2;++iy)
for(ix=0;ix<NX+2;++ix)
temp[((NX+2)*iy)+ix] = temp_new[((NX+2)*iy)+ix];

return 0;
}

int main(int argc, char* argv[]){
int nprocs, proc_me, proc_up, proc_down;

int NLY;
float *temp, *temp_new,before,after;
int requested,provided;

requested=MPI_THREAD_SERIALIZED;
MPI_Init_thread( &argc, &argv, requested, &provided);

MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &proc_me);

/* Check against a possible run with nprocs > NY */
if ( NY < nprocs ) {
if (proc_me==0) {
printf("\nTrying to run with %d processes, while the maximum allowed size is %d (NY)\n",nprocs,NY);
}

MPI_Finalize();
return 1;
}

/*
* all the communications from/to MPI_PROC_NULL do nothing
*/
proc_down = proc_me-1;
proc_up = proc_me+1;

if(proc_down < 0)
proc_down = MPI_PROC_NULL;

if(proc_up == nprocs) proc_up = MPI_PROC_NULL;

NLY = NY/nprocs;
if (proc_me < NY % nprocs)
NLY++;

temp = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
temp_new = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));

init_transport(temp, NLY, proc_me, nprocs);
#pragma omp parallel
{
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);

#pragma omp master
{
float tbefore =summa(NX, NLY, temp);
MPI_Reduce(&tbefore, &before, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp before: %f\n",before);
}
}
}

save_gnuplot("transport.dat", temp, NLY, proc_me, nprocs);

#pragma omp parallel
{
printf("process %d thread %d\n", proc_me, omp_get_thread_num());
int i;
for(i=1;i<=500;++i){
evolve(0.1, temp, temp_new, NLY);
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);
}
}

save_gnuplot("transport_end.dat", temp, NLY, proc_me, nprocs);

float tafter =summa(NX, NLY, temp);
MPI_Reduce(&tafter, &after, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp after: %f\n",after);
}

free(temp);
free(temp_new);
MPI_Finalize();

return 0;
}

 

C solution 2

/*
* It evolves the equation:
* u,t + u,x + u,y = 0
* Using a Lax scheme.
* The initial data is a cruddy gaussian.
* Boundaries are flat: copying the value of the neighbour
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>

#include <math.h>
#include <mpi.h>
#include "omp.h"

#define NX 100
#define NY 100
#define LX 2.0
#define LY 2.0

#define sigma 0.1
#define tmax 100

/* conversions from discrete to real coordinates */
float ix2x(int ix){
return ((ix - 1) - (NX-1) / 2.0)*LX/(NX-1);
}

/*
* THE FOLLOWING CHANGES: every processor has a different offset.
* I also pass proc_y as an argument instead of proc_me since it will
* become useful when saving the output files.
*/

float iy2y(int iy, int proc_y, int nprocs){
return (iy-1 - (NY-1) / 2.0 + proc_y*(NY / nprocs) + (proc_y < NY % nprocs? proc_y:NY % nprocs)) * LY/(NY-1);
}

/* utility function for checking the results: it computes the sum of the local array values */
float summa(int nx, int ny, float* val){
float summma=0.0;
int ix,iy;
for(iy=1;iy<=ny;++iy)
for(ix=1;ix<=nx;++ix){
summma+=val[((nx+2)*iy)+ix];
}
return(summma);
}

/*
* initialize the system with a gaussian temperature distribution
*/
int init_transport(float *temp, int NLY, int proc_me, int nprocs){
int ix,iy;
float x,y;

for(iy=1;iy<NLY+1;++iy)
for(ix=1;ix<NX+1;++ix){
x=ix2x(ix);
y=iy2y(iy,proc_me, nprocs);
temp[((NX+2)*iy)+ix] = tmax*exp((-((x*x)+(y*y)))/(2.0*(sigma*sigma)));
}
return 0;
}

/*
* save the temperature distribution
* the ascii format is suitable for the gnuplot splot function
*/
int save_gnuplot(char *filename, float *temp, int NLY, int proc_me, int nprocs){
float *buf;
int ix, iy, iproc=0;
FILE *fp;

MPI_Status status;

if(proc_me == 0){

buf = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
fp = fopen(filename, "w");

for(iy=1;iy<=NLY;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, proc_me, nprocs), temp[((NX+2)*iy)+ix]);
}
fprintf(fp, "\n");
}

for(iproc=1; iproc<nprocs; iproc++){
int nly, count;
MPI_Recv(buf, (NX+2)*(NLY+2), MPI_FLOAT, iproc, 0, MPI_COMM_WORLD, &status);
MPI_Get_count( &status, MPI_FLOAT, &count );

nly = count / (NX + 2) - 2;
for(iy=1;iy<=nly;++iy){
for(ix=1;ix<=NX;++ix){
fprintf(fp, "\t%f\t%f\t%g\n", ix2x(ix), iy2y(iy, iproc, nprocs), buf[((NX+2)*iy)+ix]);
}

fprintf(fp, "\n");
}
}
free(buf);
fclose(fp);
}

else{
MPI_Send(temp, (NX+2)*(NLY+2), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}

return 0;
}

int update_boundaries_FLAT(float *temp, int NLY, int nprocs, int proc_me, int proc_up, int proc_down){
MPI_Status status, status1;
int iy=0, ix=0;

#pragma omp parallel for
for(iy=1;iy<=NLY;++iy){
temp[(NX+2)*iy] = temp[((NX+2)*iy)+1];
temp[((NX+2)*iy)+(NX+1)] = temp[((NX+2)*iy)+NX];
}

/*
* only the lowest has the lower boundary condition
*/
if (proc_me==0)
#pragma omp parallel for
for(ix=0;ix<=NX+1;++ix) temp[ix] = temp[(NX+2)+ix];

/*
* only the highest has the upper boundary condition
*/
if (proc_me==nprocs-1)
#pragma omp parallel for
for(ix=0;ix<=NX+1;++ix) temp[((NX+2)*(NLY+1))+ix] = temp[((NX+2)*(NLY))+ix];


// no extra #pragma omp barrier is needed here: the parallel for constructs above already end with an implicit barrier
#pragma omp single
{
/*
*communicate the ghost-cells
*
* lower-down
*/
MPI_Sendrecv(&temp[(NX+2)+1], NX, MPI_FLOAT, proc_down, 0, &temp[((NX+2)*(NLY+1))+1], NX, MPI_FLOAT, proc_up,
0, MPI_COMM_WORLD, &status);

/*
* higher-up
*/
MPI_Sendrecv(&temp[((NX+2)*(NLY))+1], NX, MPI_FLOAT, proc_up, 0, &temp[1], NX, MPI_FLOAT, proc_down,
0, MPI_COMM_WORLD, &status1);
}
return 0;
}

int evolve(float dtfact, float *temp, float *temp_new, int NLY){
float dx, dt;

dx = 2*LX/NX;
dt = dtfact*dx/sqrt(3.0);

#pragma omp parallel for
for(int iy=1;iy<=NLY;++iy)
for(int ix=1;ix<=NX;++ix){
float temp0 = temp[((NX+2)*iy)+ix];
temp_new[((NX+2)*iy)+ix] = temp0-0.5*dt*(temp[((NX+2)*(iy+1))+ix]-temp[((NX+2)*(iy-1))+ix]
+temp[((NX+2)*iy)+(ix+1)]-temp[((NX+2)*iy)+(ix-1)])/dx;
}

#pragma omp parallel for
for(int iy=0;iy<NLY+2;++iy)
for(int ix=0;ix<NX+2;++ix)
temp[((NX+2)*iy)+ix] = temp_new[((NX+2)*iy)+ix];

return 0;
}


int main(int argc, char* argv[]){

int nprocs, proc_me, proc_up, proc_down;

int NLY;
float *temp, *temp_new,before,after;
int requested,provided;

requested=MPI_THREAD_SERIALIZED;
MPI_Init_thread( &argc, &argv, requested, &provided);


MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &proc_me);

/* Check against a possible run with nprocs > NY */
if ( NY < nprocs ) {
if (proc_me==0) {
printf("\nTrying to run with %d processes, while the maximum allowed size is %d (NY)\n",nprocs,NY);
}

MPI_Finalize();
return 1;
}

/*
* all the communications from/to MPI_PROC_NULL do nothing
*/

proc_down = proc_me-1;
proc_up = proc_me+1;

if(proc_down < 0)
proc_down = MPI_PROC_NULL;

if(proc_up == nprocs) proc_up = MPI_PROC_NULL;

NLY = NY/nprocs;
if (proc_me < NY % nprocs)
NLY++;

temp = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));
temp_new = (float *) malloc ((NX+2)*(NLY+2)*sizeof(float));

init_transport(temp, NLY, proc_me, nprocs);
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);

#pragma omp master
{
float tbefore =summa(NX, NLY, temp);
MPI_Reduce(&tbefore, &before, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp before: %f\n",before);
}
}

save_gnuplot("transport.dat", temp, NLY, proc_me, nprocs);

printf("process %d thread %d\n", proc_me, omp_get_thread_num());
int i;
for(i=1;i<=500;++i){
evolve(0.1, temp, temp_new, NLY);
update_boundaries_FLAT(temp, NLY, nprocs, proc_me, proc_up, proc_down);
}

save_gnuplot("transport_end.dat", temp, NLY, proc_me, nprocs);

float tafter =summa(NX, NLY, temp);
MPI_Reduce(&tafter, &after, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
if(proc_me==0) {
printf(" sum temp after: %f\n",after);
}

free(temp);
free(temp_new);
MPI_Finalize();

return 0;
}
 


Fortran 

!
! It evolves the equation:
! u,t + u,x + u,y = 0
! Using a Lax scheme.
! The initial data is a cruddy gaussian.
! Boundaries are flat: copying the value of the neighbour
!
! parallel WITHOUT TOPOLOGIES: distributed by columns
! dynamic allocation to decide the size to allocate at run time
! no remainder handling, for teaching purposes

module comms
implicit none
save
include 'mpif.h'

integer :: nprocs
integer :: proc_me,proc_up,proc_down

CONTAINS

subroutine INIT_COMMS
integer ierr

call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,proc_me,ierr)

proc_up = proc_me + 1
proc_down = proc_me - 1

! all the communications from/to MPI_PROC_NULL do nothing
if (proc_down < 0) proc_down=MPI_PROC_NULL
if (proc_up >= nprocs) proc_up=MPI_PROC_NULL

end subroutine INIT_COMMS
end module comms

module transport
use comms
implicit none
save

integer, parameter :: NX=100
integer, parameter :: NY=100
real, parameter :: LX=2.0
real, parameter :: LY=2.0

integer :: NLY

real,allocatable :: temp(:,:)
real,allocatable :: temp_new(:,:)

CONTAINS
!conversions from discrete to real coordinates
real function ix2x(ix)
integer ix
ix2x = ((ix-1)-(NX-1)/2.0)*LX/(NX-1)
end function ix2x

! THE FOLLOWING CHANGES: every processor has a different offset.
! I also pass proc_y as an argument instead of proc_me since it will
! become useful when saving the output files.
real function iy2y(iy,proc_y)

integer proc_y
integer iy

if (proc_y < mod(NY,nprocs)) then
iy2y = (iy-1 - (NY-1)/2.0 + proc_y*(NY/nprocs) + proc_y) * LY/(NY-1)
else
iy2y = (iy-1 - (NY-1)/2.0 + proc_y*(NY/nprocs) + mod(NY,nprocs)) * LY/(NY-1)
endif

end function iy2y

! initialize the system with a gaussian temperature distribution
! in parallel, allocate the system too
subroutine init_transport
use comms

integer ix,iy
real x,y
real,parameter :: sigma = 0.1
real,parameter :: tmax = 100

NLY = NY/nprocs
if(proc_me < mod(NY,nprocs)) NLY=NLY+1

allocate ( temp(0:NX+1 , 0:NLY+1) )
allocate ( temp_new(0:NX+1 , 0:NLY+1) )

!! DO loops on local indices only
do iy=1,NLY
do ix=1,NX
x=ix2x(ix)
y=iy2y(iy,proc_me)
temp(ix,iy) = tmax*exp(-(x**2+y**2)/(2.0*sigma**2))
enddo
enddo

end subroutine init_transport

! save the temperature distribution
! the ascii format is suitable for the gnuplot splot function
! collective, use temp_new as temporary buffer
subroutine save_gnuplot(filename)
character(len=*) filename
integer ix,iy
integer iproc
integer ierr
integer status(MPI_STATUS_SIZE)

if (proc_me == 0) then
open(unit=20,file=filename,form='formatted')

do iy=1,NLY
do ix=1,NX
write(20,*) ix2x(ix),iy2y(iy,0),temp(ix,iy)
enddo
write(20,*)
enddo

do iproc=1,nprocs-1
call MPI_RECV(temp_new,(NX+2)*(NLY+2),MPI_REAL,iproc,0, &
MPI_COMM_WORLD,status,ierr)

do iy=1,NLY
do ix=1,NX
write(20,*) ix2x(ix),iy2y(iy,iproc),temp_new(ix,iy)
enddo
write(20,*)
enddo
enddo
close(20)

else
call MPI_SEND(temp,(NX+2)*(NLY+2),MPI_REAL,0,0,MPI_COMM_WORLD,ierr)
endif

end subroutine save_gnuplot

!! NY => NLY
subroutine update_boundaries_FLAT
integer status(MPI_STATUS_SIZE)
integer ierr

!$OMP WORKSHARE
temp(0 , 1:NLY) = temp(1 , 1:NLY)
temp(NX+1, 1:NLY) = temp(NX , 1:NLY)
!$OMP END WORKSHARE

!! only the lowest has the lower boundary condition
if (proc_me==0) then
!$OMP WORKSHARE
temp(1:NX , 0) = temp(1:NX , 1)
!$OMP END WORKSHARE
endif

!! only the highest has the upper boundary condition
if (proc_me==nprocs-1) then
!$OMP WORKSHARE
temp(1:NX , NLY+1) = temp(1:NX , NLY)
!$OMP END WORKSHARE
endif

!$OMP SINGLE
!! communicate the ghost-cells
!! lower-down
call MPI_SENDRECV(temp(1,1),NX,MPI_REAL,proc_down,0, &
temp(1,NLY+1),NX,MPI_REAL,proc_up,0,MPI_COMM_WORLD,status,ierr)
!! higher-up
call MPI_SENDRECV(temp(1,NLY),NX,MPI_REAL,proc_up,0, &
temp(1,0),NX,MPI_REAL,proc_down,0,MPI_COMM_WORLD,status,ierr)
!$OMP END SINGLE

end subroutine update_boundaries_FLAT

subroutine evolve(dtfact)
real dtfact
real dx,dt
integer ix,iy
real temp0

dx = 2*LX/NX
dt = dtfact*dx/sqrt(3.0)

!$OMP DO
do iy=1,NLY
do ix=1,NX

temp0 = temp(ix,iy)
temp_new(ix,iy) = temp0 - 0.5 * dt * &
(temp(ix+1,iy)-temp(ix-1,iy)+temp(ix,iy+1)-temp(ix,iy-1)) / dx

enddo
enddo
!$OMP END DO

!$OMP WORKSHARE
temp = temp_new
!$OMP END WORKSHARE

end subroutine evolve
end module transport


program prova
use transport
use comms
use omp_lib
implicit none

integer i
integer ierr
integer provided
real before,tbefore,after,tafter

call MPI_INIT_THREAD( MPI_THREAD_SERIALIZED, provided, ierr)
call init_comms

call init_transport
!$OMP PARALLEL
call update_boundaries_FLAT
!$OMP END PARALLEL

tbefore=sum(temp(1:NX,1:NLY))
call MPI_Reduce(tbefore,before,1,MPI_REAL,MPI_SUM,0,MPI_COMM_WORLD,ierr)
if (proc_me==0) write(*,*) 'sum temp before',before

call save_gnuplot('transport.dat')

!$OMP PARALLEL
write (*,*) "process", proc_me, " thread ", omp_get_thread_num()
do i=1,500
call evolve(0.1)
call update_boundaries_FLAT
enddo
!$OMP END PARALLEL

call save_gnuplot('transport_end.dat')

tafter=sum(temp(1:NX,1:NLY))
call MPI_Reduce(tafter,after,1,MPI_REAL,MPI_SUM,0,MPI_COMM_WORLD,ierr)
if (proc_me==0) write(*,*) 'sum temp after',after

call MPI_FINALIZE(ierr)

end program prova

 

Jobscript template (FERMI)

#!/bin/bash
# @ job_type=bluegene
# @ bg_size=64
# @ wall_clock_limit = 00:10:00
# @ account_no =
# @ queue
module load bgq-gnu zlib
export OMP_NUM_THREADS=64

runjob --np 25 --ranks-per-node 1 --env-all : ./transport-hybrid.x

 

Jobscript template (PLX)

#!/bin/bash
#PBS -N transhyb
#PBS -l select=2:ncpus=8:mpiprocs=4
#PBS -l walltime=0:10:00
#PBS -A
#PBS -q debug

cd $PBS_O_WORKDIR
module load autoload openmpi/1.4.5-mt--intel--11.1--binary

mpirun -n 8 ./transport-hybrid.x