Exercise 3 solution

 

C 

/*
 * This program demonstrates the use of MPI_Alltoall when
 * transposing a square matrix.
 * For simplicity, the number of processes is fixed to 4 and the
 * dimension of the matrix is fixed to NROW.
 */
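
/*
 * Overview of the layout used below (as can be read off the code):
 *  - every process stores a NROW x NBLK column slab of the global
 *    NROW x NROW matrix; process r owns columns r*NBLK .. (r+1)*NBLK-1;
 *  - the slab is a stack of NP contiguous NBLK x NBLK blocks, and
 *    MPI_Alltoall sends block s to process s;
 *  - after the exchange every process holds, one behind the other, the
 *    NP blocks of its slab of the transposed matrix, each block still
 *    untransposed, so the blocks are finally transposed locally with trans().
 */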

#include <stdio.h>
#include <mpi.h>
#include <omp.h>

#define NROW 16384
#define NP 4
#define NBLK (NROW/NP)
#define ONEML 99999999999LL                    /* large factor used to make every element unique */
#define SIZE ((double) NROW * NROW * 2 * 8)    /* bytes moved: NROW*NROW doubles, read and written */
void
trans (double *a, int n)
/* transpose square matrix a, dimension nxn
 * Consider this as a black box for the MPI course
 */
{
  int i, j;
  int ij, ji, l;
  double tmp;

  ij = 0;
  l = -1;
  for (i = 0; i < n; i++)
    {
      l += n + 1;
      ji = l;
      ij += i + 1;
      for (j = i + 1; j < n; j++)
        {
          tmp = a[ij];
          a[ij] = a[ji];
          a[ji] = tmp;
          ij++;
          ji += n;
        }
    }
}

int
main (int argc, char *argv[])
{
  /* static: each matrix is NROW*NBLK doubles (512 MiB), too large for a default stack */
  static double a[NROW][NBLK];
  static double b[NROW][NBLK];

  int i, j, nprocs, rank, provided;
  double r0, r1;

  MPI_Init_thread (&argc, &argv, MPI_THREAD_FUNNELED, &provided);
  //MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);

  if (rank == 0)
    {
      printf ("MPI thread support required and provided: %d %d\n",
              MPI_THREAD_FUNNELED, provided);
      printf ("Transposing a %d x %d matrix, divided among %d processors\n",
              NROW, NROW, NP);
#pragma omp parallel
#pragma omp master
      printf ("Using %d threads\n", omp_get_num_threads ());
    }
  if (nprocs != NP)
    {
      if (rank == 0)
        printf ("Error, number of processes must be %d\n", NP);
      MPI_Finalize ();
      return 1;
    }
#pragma omp parallel
  {
#pragma omp master
    r0 = MPI_Wtime ();
#pragma omp for private(i,j)
    for (i = 0; i < NROW; i++)
      for (j = 0; j < NBLK; j++)
        a[i][j] = ONEML * i + j + NBLK * rank;  /* give every element a unique value */

#pragma omp master
    {
      r0 = MPI_Wtime () - r0;
      if (rank == 0)
        printf ("Building matrix time (sec) %f\n", r0);
      /* do the MPI part of the transpose */
      /* Tricky here is the number of items to send and to receive:
       * not NROW x NBLK as one might guess, but the amount to send to one
       * process and the amount to receive from any one process. */
      /* MPI_Alltoall does not transpose the data it delivers; we have to
       * do that ourselves afterwards: */
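      /* Worked out for the sizes above: NROW = 16384 and NP = 4 give
       * NBLK = 4096, so each process sends NBLK*NBLK = 16777216 doubles
       * (128 MiB) to every process, including one such block to itself. */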
      r0 = MPI_Wtime ();
      MPI_Alltoall (&a[0][0],      /* address of data to send */
                    NBLK * NBLK,   /* number of items to send to one process */
                    MPI_DOUBLE,    /* type of data */
                    &b[0][0],      /* address for receiving the data */
                                   /* NOTE: send data and receive data may NOT overlap */
                    NBLK * NBLK,   /* number of items to receive from any process */
                    MPI_DOUBLE,    /* type of receive data */
                    MPI_COMM_WORLD);
      r0 = MPI_Wtime () - r0;
      if (rank == 0)
        printf ("MPI_Alltoall time (sec) %f\n", r0);
    }                              /* end of omp master block */
#pragma omp barrier
    /* transpose NP square matrices, order NBLK x NBLK: */
#pragma omp master
    r1 = MPI_Wtime ();
#pragma omp for
    for (i = 0; i < NP; i++)
      trans (&b[i * NBLK][0], NBLK);
#pragma omp master
    r1 = MPI_Wtime () - r1;
  }                                /* end of omp parallel region */
  if (rank == 0)
    {
      printf ("Transpose block matrices time (sec) %f\n", r1);
      printf ("Bandwidth (GB/sec) %f\n", SIZE * 1.e-9 / (r1 + r0));
    }

  /* now check the result */
  for (i = 0; i < NROW; i++)
    for (j = 0; j < NBLK; j++)
      {
        if (b[i][j] != ONEML * (j + NBLK * rank) + i)
          {
            printf ("process %d found b[%d][%d] = %f, but %f was expected\n",
                    rank, i, j, b[i][j],
                    (double) (ONEML * (j + NBLK * rank) + i));
            MPI_Abort (MPI_COMM_WORLD, 1);
            return 1;
          }
      }
  if (rank == 0)
    printf ("Transpose seems ok\n");
  MPI_Finalize ();
  return 0;
}
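
Note on the "send data and receive data may NOT overlap" comment above: since MPI-2.2 the standard also offers an in-place form of MPI_Alltoall, in which the send buffer argument is replaced by MPI_IN_PLACE and the data is taken from (and written back to) the receive buffer. It is not used in this exercise; the small, self-contained C sketch below only illustrates the call, with buffer contents and check values made up for the illustration.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int
main (int argc, char *argv[])
{
  int nprocs, rank, i;

  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);

  /* one integer destined for each process: 100*rank + destination */
  int *buf = (int *) malloc (nprocs * sizeof (int));
  for (i = 0; i < nprocs; i++)
    buf[i] = 100 * rank + i;

  /* in-place all-to-all: the send count and type are ignored and the data
   * is taken from, then overwritten in, the receive buffer */
  MPI_Alltoall (MPI_IN_PLACE, 0, MPI_INT,
                buf, 1, MPI_INT, MPI_COMM_WORLD);

  /* slot i now holds the value that process i put into slot 'rank' */
  for (i = 0; i < nprocs; i++)
    if (buf[i] != 100 * i + rank)
      printf ("process %d: unexpected value in slot %d\n", rank, i);

  free (buf);
  MPI_Finalize ();
  return 0;
}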


 

Fortran


subroutine trans (a, n)
  ! transpose square matrix a, dimension nxn
  ! Consider this as a black box for the MPI course
  integer :: i, j, ij, ji, l, n
  double precision :: tmp
  double precision :: a(*)

  ij = 0
  l = -1
  do i = 0, n-1
    l = l + n + 1
    ji = l
    ij = ij + i + 1
    do j = i+1, n-1
      tmp = a(ij+1)
      a(ij+1) = a(ji+1)
      a(ji+1) = tmp
      ij = ij + 1
      ji = ji + n
    enddo
  enddo
  return
end subroutine trans

! This program demonstrates the use of MPI_Alltoall when
! transposing a square matrix.
! For simplicity, the number of processes is fixed to 4 and the
! dimension of the matrix is fixed to NROW.
program main
  implicit none
  include 'mpif.h'
  integer :: nprocs, omp_get_num_threads
  integer :: proc_me, provided
  integer :: ierr
  integer, parameter :: NROW = 8192
  integer, parameter :: NP = 4
  integer, parameter :: NBLK = NROW/NP
  ! note the d0: without it the constant is converted from single precision
  double precision, parameter :: ONEML = 99999999999.0d0
  double precision, allocatable :: a(:,:)
  double precision, allocatable :: b(:,:)

  integer :: i, j
  double precision :: r0

  call MPI_Init_thread (MPI_THREAD_FUNNELED, provided, ierr)
  call MPI_COMM_SIZE (MPI_COMM_WORLD, nprocs, ierr)
  call MPI_COMM_RANK (MPI_COMM_WORLD, proc_me, ierr)
  allocate (a(NBLK,NROW), b(NBLK,NROW))

  if (proc_me .eq. 0) then
    write(*,*) 'MPI thread support required and provided', 'required:', MPI_THREAD_FUNNELED, 'provided:', provided
    !$omp parallel
    !$omp master
    write(*,*) 'using:', omp_get_num_threads(), 'threads'
    !$omp end master
    !$omp end parallel
  endif
  if (proc_me .eq. 0) write(*,*) 'Transposing a', NROW, 'x', NROW, ' matrix, divided among', NP, 'processors'
  if (nprocs .ne. NP) then
    if (proc_me .eq. 0) then
      write(*,*) 'Error, number of processes must be', NP
    end if
    call MPI_Finalize (ierr)
    stop
  endif
  !$omp parallel
  !$omp master
  r0 = MPI_Wtime()
  !$omp end master
  !$omp do private(i,j)
  do j = 1, NROW
    do i = 1, NBLK
      a(i,j) = ONEML * j + i + NBLK * proc_me   ! give every element a unique value
    enddo
  enddo
  !$omp end do
  !$omp master
  r0 = MPI_Wtime() - r0
  if (proc_me .eq. 0) write(*,*) 'Building matrix time (sec)', r0

  ! do the MPI part of the transpose.
  ! Tricky here is the number of items to send and to receive:
  ! not NROWxNBLK as one might guess, but the amount to send to one
  ! process and the amount to receive from any one process.

  r0 = MPI_Wtime()
  ! MPI_Alltoall does not transpose the data it delivers; we have to
  ! do that ourselves afterwards:
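  ! Worked out for the sizes above: NROW = 8192 and NP = 4 give NBLK = 2048,
  ! so each process sends NBLK*NBLK = 4194304 double precision numbers
  ! (32 MiB) to every process, including one such block to itself.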
  call MPI_Alltoall (a(1,1), NBLK*NBLK, MPI_DOUBLE_PRECISION, &
                     b(1,1), NBLK*NBLK, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD, ierr)
  r0 = MPI_Wtime() - r0
  if (proc_me .eq. 0) write(*,*) 'MPI_Alltoall time (sec)', r0

  ! transpose NP square matrices, order NBLKxNBLK:
  r0 = MPI_Wtime()
  !$omp end master
  !$omp barrier
  !$omp do
  do i = 1, NP
    call trans (b(1,(i-1)*NBLK+1), NBLK)
  enddo
  !$omp end do
  !$omp master
  r0 = MPI_Wtime() - r0
  !$omp end master
  !$omp end parallel
  if (proc_me .eq. 0) write(*,*) 'Transpose block matrices time (sec)', r0

  ! now check the result
  do j = 1, NROW
    do i = 1, NBLK
      if (b(i,j) .ne. ONEML * (i + NBLK*proc_me) + j) then
        write(*,*) 'process', proc_me, 'b', b(i,j), 'expected', ONEML * (i + NBLK*proc_me) + j
        call MPI_Abort (MPI_COMM_WORLD, 1, ierr)
      endif
    enddo
  enddo
  if (proc_me .eq. 0) write(*,*) 'Transpose seems ok!'
  call MPI_Finalize (ierr)
end program main