Logo Cineca Logo SCAI

You are here

FFT solution 4

C

#include "p3dfft.h"
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(int argc,char **argv)
{
double *A,*B,*p,*C;
int i, j, k, x, y, z, nx, ny, nz;
int proc_id, nproc, dims[2];
int memsize[3];
int istart[3], isize[3], iend[3];
int fstart[3], fsize[3], fend[3];
int conf, iproc, jproc, ng[3];
long int Nglob, Ntot;
double pi, twopi, sinyz;
double cdiff, ccdiff, ans, prec;
double *sinx, *siny, *sinz, factor;
unsigned char op_f[3]="fft", op_b[3]="tff";

/* MPI Initializzation */
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&nproc);
MPI_Comm_rank(MPI_COMM_WORLD,&proc_id);

/* Dimension definition */
nx=ny=nz=128;

/* Dims create */
dims[0]=dims[1]=0;
MPI_Dims_create(nproc,2,dims);
if(dims[0] > dims[1]) {
dims[0] = dims[1];
dims[1] = nproc/dims[0];
}
/* Other initializzations */
pi = atan(1.0)*4.0;
twopi = 2.0*pi;
/* Initialize P3DFFT */
p3dfft_setup(dims,nx,ny,nz,1,memsize);

MPI_Barrier(MPI_COMM_WORLD);

if(proc_id == 0) printf("Calling get_dims 1\n");
conf = 1;
/* Get dimensions for input array - real numbers, X-pencil shape.
Note that we are following the Fortran ordering, i.e.
the dimension with stride-1 is X. */
p3dfft_get_dims(istart,iend,isize,conf);

if(proc_id == 0) printf("Calling get_dims 2\n");
conf = 2;
/* Get dimensions for output array - complex numbers, Z-pencil shape.
Stride-1 dimension could be X or Z, depending on how the library
was compiled (stride1 option) */
p3dfft_get_dims(fstart,fend,fsize,conf);

if(proc_id == 0) printf("nx = %d; ny = %d; nz = %d\n", nx, ny, nz);

/* Allocatring */
if(proc_id == 0) printf("Allocating A\n");
A = (double *) malloc(sizeof(double) * isize[0]*isize[1]*isize[2]);
if(proc_id == 0) printf("Allocating B\n");
B = (double *) malloc(sizeof(double) * fsize[0]*fsize[1]*fsize[2]*2);
if(proc_id == 0) printf("Allocating C\n");
C = (double *) malloc(sizeof(double) * isize[0]*isize[1]*isize[2]);

if(A == NULL)
printf("%d: Error allocating array A (%d)\n",proc_id,isize[0]*isize[1]*isize[2]);
if(B == NULL)
printf("%d: Error allocating array B (%d)\n",proc_id,fsize[0]*fsize[1]*fsize[2]*2);
if(C == NULL)
printf("%d: Error allocating array C (%d)\n",proc_id,isize[0]*isize[1]*isize[2]);

sinx = malloc(sizeof(double)*nx);
siny = malloc(sizeof(double)*ny);
sinz = malloc(sizeof(double)*nz);
/* Initial Solution */
if(proc_id == 0) printf("Initial solution Step 1 of 2\n");
for(z=0;z < isize[2];z++){
sinz[z] = sin((z+istart[2]-1)*twopi/nz);
}
for(y=0;y < isize[1];y++){
siny[y] = sin((y+istart[1]-1)*twopi/ny);
}
for(x=0;x < isize[0];x++){
sinx[x] = sin((x+istart[0]-1)*twopi/nx);
}
if(proc_id == 0) printf("Initial solution Step 2 of 2\n");
p = A;
for(z=0;z < isize[2];z++){
for(y=0;y < isize[1];y++) {
sinyz = siny[y]*sinz[z];
for(x=0;x < isize[0];x++)
*p++ = sinx[x]*sinyz;
}
}
if(proc_id == 0) printf("Create Normalizzation Factor\n");
Ntot = fsize[0]*fsize[1];
Ntot *= fsize[2]*2;
Nglob = nx * ny;
Nglob *= nz;
factor = 1.0/Nglob;

MPI_Barrier(MPI_COMM_WORLD);
if(proc_id == 0) printf("Real to complex DFT upward\n");
p3dfft_ftran_r2c(A,B,op_f);

MPI_Barrier(MPI_COMM_WORLD);
if(proc_id == 0) printf("Normalizzation\n");
/* Normalizzation */
for(i=0;i < Ntot;i++){
B[i] *= factor;
}

MPI_Barrier(MPI_COMM_WORLD);
if(proc_id == 0) printf("Real to complex DFT backward\n");
p3dfft_btran_c2r(B,C,op_b);

MPI_Barrier(MPI_COMM_WORLD);
if(proc_id == 0) printf("P3DFFT cleanup\n");
p3dfft_clean();

if(proc_id == 0) printf("Check Results\n");
/* Check results */
cdiff = 0.0; p = C;
for(z=0;z < isize[2];z++){
for(y=0;y < isize[1];y++) {
sinyz =siny[y]*sinz[z];
for(x=0;x < isize[0];x++) {
ans = sinx[x]*sinyz;
if(cdiff < fabs(*p - ans))
cdiff = fabs(*p - ans);
p++;
}
}
}
MPI_Reduce(&cdiff,&ccdiff,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);

if(proc_id == 0) {
prec = 1.0e-7;

if(ccdiff > prec * Nglob*0.25)
printf("Results are incorrect\n");
else
printf("Results are correct\n");

printf("max diff =%g\n",ccdiff);
}

if(proc_id == 0) printf("MPI Finalize\n");
MPI_Finalize();

}

 

   FORTRAN

 PROGRAM FFT_3D_2Decomp_MPI
use mpi
use, intrinsic :: iso_c_binding
use decomp_2d
use decomp_2d_fft
implicit none
integer, parameter :: L = 128
integer, parameter :: M = 128
integer, parameter :: N = 128
real, parameter :: prec = 1.0e-7
integer :: p_row, p_col
integer :: nx, ny, nz
integer :: nglob, nlocin, nlocout
integer, dimension(2) :: dims
integer, dimension(3) :: fft_start, fft_end, fft_size
real :: diff, cdiff, ccdiff, pi, twopi
real(mytype), allocatable, dimension(:) :: sinx, siny, sinz
real(mytype), allocatable, dimension(:,:,:) :: in, out2
complex(mytype), allocatable, dimension(:,:,:) :: out
real(mytype) :: dummy
integer :: ierror, i,j,k, numproc, mype
integer, dimension(3) :: sizex, sizez

! ===== Initialize
call MPI_INIT(ierror)
call MPI_Comm_size(MPI_COMM_WORLD, numproc, ierror)
call MPI_COMM_RANK(MPI_COMM_WORLD, mype, ierror)

dims = 0
call MPI_Dims_create(numproc, 2, dims, ierror)
if (dims(1) .gt. dims(2)) then
dims(1) = dims(2)
dims(2) = numproc/dims(1)
endif
p_row = dims(1)
p_col = dims(2)

if (mype .eq. 0) write(*,*) 'Initializing 2Decomp&FFT...'

call decomp_2d_init(L,M,N,p_row,p_col)
call decomp_2d_fft_init
call MPI_Barrier(MPI_COMM_WORLD, ierror)

if (mype .eq. 0) write(*,*) 'Initialize variables...'

pi = atan(1.0)*4.0;
twopi = 2.0*pi;

do i =1, 3
sizex(i) = xend(i) - xstart(i) + 1
sizez(i) = zend(i) - zstart(i) + 1
end do

nglob = L * M * N
nlocin = sizex(1) * sizex(2) * sizex(3)
nlocout = sizez(1) * sizez(2) * sizez(3)

if (mype .eq. 0) write(*,*) 'Allocating array...'

call decomp_2d_fft_get_size(fft_start,fft_end,fft_size)
allocate (in(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
allocate (out(fft_start(1):fft_end(1), fft_start(2):fft_end(2), fft_start(3):fft_end(3)))
allocate (out2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))

allocate (sinx(L))
allocate (siny(M))
allocate (sinz(N))

! ===== each processor gets its local portion of global data =====
if (mype .eq. 0) write(*,*) 'Initializing array...'
do i =1, L
sinx(i) = sin(twopi*real(i)/L)
end do
do j =1, M
siny(j) = sin(twopi*real(j)/M)
end do
do k =1, N
sinz(k) = sin(twopi*real(k)/N)
end do
do k=xstart(3),xend(3)
do j=xstart(2),xend(2)
dummy = sinz(k) * siny(j)
do i=xstart(1),xend(1)
in(i,j,k) = dummy * sinx(i)
end do
end do
end do

! ===== 3D forward FFT =====
if (mype .eq. 0) write(*,*) 'Forward FFT...'
call decomp_2d_fft_3d(in, out)
call MPI_Barrier(MPI_COMM_WORLD, ierror)

! ===== Normalizzation =====
if (mype .eq. 0) write(*,*) 'Normalizzation...'
do k=fft_start(3),fft_end(3)
do j=fft_start(2),fft_end(2)
do i=fft_start(1),fft_end(1)
out(i,j,k) = out(i,j,k)/nglob
end do
end do
end do
call MPI_Barrier(MPI_COMM_WORLD, ierror)

! ===== 3D backward =====
if (mype .eq. 0) write(*,*) 'Backward FFT...'
call decomp_2d_fft_3d(out, out2)

! ===== Compare =====
call MPI_Barrier(MPI_COMM_WORLD, ierror)
if (mype .eq. 0) write(*,*) 'Comparing...'
cdiff =0.
do k=xstart(3),xend(3)
do j=xstart(2),xend(2)
do i=xstart(1),xend(1)
diff = abs(in(i,j,k) - out2(i,j,k))
if(cdiff .lt. diff) then
cdiff = diff
endif
end do
end do
end do
call MPI_Reduce(cdiff, ccdiff, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD, ierror)
if (mype .eq. 0) then
write(*,*) 'Cdiff_max =', ccdiff
if(cdiff .gt. (prec*nglob*0.25)) then
write(*,*) 'Results are incorrect:'
else
write(*,*) 'Results are correct!'
endif
endif

! =========== Finalize ===============
if (mype .eq. 0) write(*,*) 'Finalizzing...'
call decomp_2d_fft_finalize
call decomp_2d_finalize
deallocate(in, out, out2)
call MPI_FINALIZE(ierror)

end

 

For compilation (on FERMI):

module purge
module load autoload fftw/3.3.2--bgq-xl--1.0 bgq-xl/1.0

 # fortran
mpixlf2003_r -O3 -qstrict -qarch=qp -qtune=qp -I$FFTW3_INCLUDE Es_3.f90 -L$FFTW3_LIB -lfftw3_mpi -lfftw3f -lfftw3 -lm -o Es_3

# C
mpixlc_r -O3 -qstrict -qarch=qp -qtune=qp -I$FFTW3_INCLUDE Es_3.c -L$FFTW3_LIB -lfftw3_mpi -lfftw3f -lfftw3 -lm -o Es_3


For compilation (on EURORA):

module purge
module load profile/advanced
module load autoload 2Decomp_fft/1.5.847--openmpi--1.6.4--intel--cs-xe-2013--binary

# fortran
mpif90 -O3 -fpp -I. -I$DECOMP_2D_FFT_INCLUDE -I$FFTW_INC es4.f90 -L$DECOMP_2D_FFT_LIB  -L$FFTW_LIB -l2decomp_fft -lfftw3_mpi -lfftw3 -o Es_4

# C
The P3DFFT library is not currently available on EURORA cluster due to some bugs on the library itself. When a new and more stable version of the library will be released we will install it also on EURORA and GALILEO.