p3dfft/BGQ 3d FFT tests

Machine

ANL BGQ Vesta

Example Program

test_inverse_f.x does a 3d distributed fft

Input
We are doing a 512^3 FFT
[tkaiser@vestalac1 p3dfft]$ cat stdin
512 512 512 2 1

[tkaiser@vestalac1 p3dfft]$ cat dims
4 4

Build comments

Build4, build6, build3, build7, and build9 used fftw for their 1d transforms. Build4 and build6 used the compile option export CFLAGS="-qarch=qp" when creating the fftw library. No other builds used this option. Build9 added export FCFLAGS="-O3 -qhot -qsimd=auto" and export CFLAGS="-O3 -qhot -qsimd=auto".

Build5 and build8 used ESSL for the 1d transforms. Build8 added the option export FCFLAGS="-O3 -qhot -qsimd=auto" and also added the same flags, -O3 -qhot -qsimd=auto, to its CFLAGS.

Build xxxxxx was preinstalled on the ANL system. It also uses ESSL for 1d transforms.

Results Comments

The runs using 1024 MPI tasks were not optimal because the grid size 512^3 was too small to take advantage of all of the tasks.

The flag export CFLAGS="-qarch=qp" for build4 and build6 produces the runs with the slowest results. We note that this is a recommended option from compiler documentation (ouch).

The flags export FCFLAGS="-O3 -qhot -qsimd=auto" and export CFLAGS="-O3 -qhot -qsimd=auto" produced the best results, both for the builds that used fftw and for the builds that used ESSL. The ESSL build with these flags (build8) ran at about the same speed as the xxxxxx ANL build: a bit faster for some tests and a bit slower for others. The fftw build with these flags (build9) was slightly slower.

Times

run         build   nodes  tasks per node  total tasks  time
out16_128   build4      8              16          128  1.0059
out16_128   build6      8              16          128  1.0053
out16_128   build3      8              16          128  0.4073
out16_128   build7      8              16          128  0.3977
out16_128   build9      8              16          128  0.2939
out16_128   build5      8              16          128  0.3532
out16_128   build8      8              16          128  0.2479
out16_128   xxxxxx      8              16          128  0.2715

out32_256   build4      8              32          256  0.6071
out32_256   build6      8              32          256  0.6066
out32_256   build3      8              32          256  0.2907
out32_256   build7      8              32          256  0.2832
out32_256   build9      8              32          256  0.2419
out32_256   build5      8              32          256  0.2573
out32_256   build8      8              32          256  0.2145
out32_256   xxxxxx      8              32          256  0.2009

out64_512   build4      8              64          512  0.4295
out64_512   build6      8              64          512  0.4360
out64_512   build3      8              64          512  0.2442
out64_512   build7      8              64          512  0.2354
out64_512   build9      8              64          512  0.2203
out64_512   build5      8              64          512  0.2243
out64_512   build8      8              64          512  0.2068
out64_512   xxxxxx      8              64          512  0.1842

out16_256   build4     16              16          256  0.5026
out16_256   build6     16              16          256  0.5024
out16_256   build3     16              16          256  0.2045
out16_256   build7     16              16          256  0.1982
out16_256   build9     16              16          256  0.1472
out16_256   build5     16              16          256  0.1769
out16_256   build8     16              16          256  0.1223
out16_256   xxxxxx     16              16          256  0.1298

out32_512   build4     16              32          512  0.3011
out32_512   build6     16              32          512  0.3054
out32_512   build3     16              32          512  0.1481
out32_512   build7     16              32          512  0.1420
out32_512   build9     16              32          512  0.1189
out32_512   build5     16              32          512  0.1295
out32_512   build8     16              32          512  0.1075
out32_512   xxxxxx     16              32          512  0.1056

out64_1024  build4     16              64         1024  0.2190
out64_1024  build6     16              64         1024  0.2167
out64_1024  build3     16              64         1024  0.1298
out64_1024  build7     16              64         1024  0.1275
out64_1024  build9     16              64         1024  0.1197
out64_1024  build5     16              64         1024  0.1139
out64_1024  build8     16              64         1024  0.1057
out64_1024  xxxxxx     16              64         1024  0.1033



Build Summaries

Build | build4 | build6
fftw Directory | build2 | build4
fftw Flags | export CC=bgxlc_r export F77=bgxlf90_r export FC=bgxlf90_r export CFLAGS="-qarch=qp" | export CC=xlc_r export F77=xlf90_r export FC=xlf90_r export CFLAGS="-qarch=qp"
ESSL Flags | - | -
CFLAGS | export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r" | export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
FCFLAGS | - | -
LDFLAGS/LIBS | export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm" | export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
configure | ./configure --prefix=/home/tkaiser/p3dfft/build4 \ --enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build2/fftw/build2 | ./configure --prefix=/home/tkaiser/p3dfft/build6 \ --enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build2/fftw/build4
Note | -qarch=qp “ouch” slow | -qarch=qp “ouch” slow



Build | build3 | build7 | build9
fftw Directory | build1 | build5 | build6
fftw Flags | export CC=xlc_r export F77=xlf90_r export FC=xlf90_r | export CC=bgxlc_r export F77=bgxlf90_r export FC=bgxlf90_r | export CC=bgxlc_r export F77=bgxlf90_r export FC=bgxlf90_r export CFLAGS="-O3 -qhot -qsimd=auto" export FCFLAGS="-O3 -qhot -qsimd=auto"
ESSL Flags | - | - | -
CFLAGS | export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r" | export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r" | export CFLAGS="-O3 -qhot -qsimd=auto -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
FCFLAGS | - | - | export FCFLAGS="-O3 -qhot -qsimd=auto"
LDFLAGS/LIBS | export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm" | export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm" | export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
configure | ./configure --prefix=/home/tkaiser/p3dfft/build3 \ --enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build1/fftw/build1 | ./configure --prefix=/home/tkaiser/p3dfft/build7 \ --enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build5/fftw/build5 | ./configure --prefix=/home/tkaiser/p3dfft/build9 \ --enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build6/fftw/build6
Note | - | - | export FCFLAGS="-O3 -qhot -qsimd=auto" best fftw build



Build | build5 | build8 | xxxxxx
fftw Directory | - | - | -
fftw Flags | - | - | -
ESSL Flags | --enable-essl | --enable-essl | --enable-essl
CFLAGS | export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r" | export CFLAGS="-O3 -qhot -qsimd=auto -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r" | -
FCFLAGS | - | export FCFLAGS="-O3 -qhot -qsimd=auto" | -
LDFLAGS/LIBS | export LIBS="-L/home/tkaiser/lib/essl -lessl -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm" | export LIBS="-L/home/tkaiser/lib/essl -lessl -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm" | -
configure | ./configure --prefix=/home/tkaiser/p3dfft/build5 \ --enable-ibm --enable-essl | ./configure --prefix=/home/tkaiser/p3dfft/build8 \ --enable-ibm --enable-essl | -
Note | - | export FCFLAGS="-O3 -qhot -qsimd=auto" matched ANL build for some cases | ANL build



*** build3 details ****

export FC=mpixlf90_r
export CC=mpixlc_r
export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
./configure --prefix=/home/tkaiser/p3dfft/build3 \
--enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build1/fftw/build1

[tkaiser@vestalac1 p3dfft]$ cat /home/tkaiser/lib/build1/fftw/build1//script1
export CC=xlc_r
export F77=xlf90_r
export FC=xlf90_r
tar -xzf fftw-3.3.3.tar.gz
cd /home/tkaiser/lib/fftw-3.3.3
./configure --prefix=/home/tkaiser/lib/build1/fftw/build1
make
make install

*** build4 details ****

export FC=mpixlf90_r
export CC=mpixlc_r
export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
./configure --prefix=/home/tkaiser/p3dfft/build4 \
--enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build2/fftw/build2

[tkaiser@vestalac1 p3dfft]$ cat /home/tkaiser/lib/build2/fftw/build2/script2 
export CC=bgxlc_r
export F77=bgxlf90_r
export FC=bgxlf90_r
export CFLAGS="-qarch=qp"
rm -rf fftw-3.3.3.tar
tar -xzf fftw-3.3.3.tar.gz
cd /home/tkaiser/lib/fftw-3.3.3
./configure --prefix=/home/tkaiser/lib/build2/fftw/build2
make
make install


export FC=xlf90_r
tar -xzf fftw-3.3.3.tar.gz
cd /home/tkaiser/lib/fftw-3.3.3
./configure --prefix=/home/tkaiser/lib/build1/fftw/build1
make
make install

*** build5 details ****

[tkaiser@vestalac1 p3dfft]$ cat doit5
export FC=mpixlf90_r
export CC=mpixlc_r
export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
export LIBS="-L/home/tkaiser/lib/essl -lessl -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
./configure --prefix=/home/tkaiser/p3dfft/build5 \
--enable-ibm --enable-essl

*** build6 details ****

[tkaiser@vestalac1 p3dfft.2.5.1]$ cat doit6
export FC=mpixlf90_r
export CC=mpixlc_r
export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
./configure --prefix=/home/tkaiser/p3dfft/build6 \
--enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build2/fftw/build4
[tkaiser@vestalac1 p3dfft.2.5.1]$ cat /home/tkaiser/lib/script4
export CC=xlc_r
export F77=xlf90_r
export FC=xlf90_r
export CFLAGS="-qarch=qp"
rm -rf fftw-3.3.3.tar
tar -xzf fftw-3.3.3.tar.gz
cd /home/tkaiser/lib/fftw-3.3.3
./configure --prefix=/home/tkaiser/lib/build2/fftw/build4
make
make install

*** build7 details ****

[tkaiser@vestalac1 p3dfft]$ cat doit7
export FC=mpixlf90_r
export CC=mpixlc_r
export CFLAGS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
./configure --prefix=/home/tkaiser/p3dfft/build7 \
--enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build5/fftw/build5

[tkaiser@vestalac1 p3dfft]$ cat /home/tkaiser/lib/build5/fftw/build5/script5 
export CC=bgxlc_r
export F77=bgxlf90_r
export FC=bgxlf90_r
#export CFLAGS="-qarch=qp"
rm -rf fftw-3.3.3.tar
tar -xzf fftw-3.3.3.tar.gz
cd /home/tkaiser/lib/fftw-3.3.3
./configure --prefix=/home/tkaiser/lib/build5/fftw/build5
make
make install

*** build8 details ****

[tkaiser@vestalac1 p3dfft]$ cat doit8
export FC=mpixlf90_r
export CC=mpixlc_r
export CFLAGS="-O3 -qhot -qsimd=auto -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
export FCFLAGS="-O3 -qhot -qsimd=auto"
export LIBS="-L/home/tkaiser/lib/essl -lessl -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
./configure --prefix=/home/tkaiser/p3dfft/build8 \
--enable-ibm --enable-essl
make
make install
[tkaiser@vestalac1 p3dfft]$ 

*** build9 details ****

[tkaiser@vestalac1 p3dfft]$ cat doit9
export FC=mpixlf90_r
export CC=mpixlc_r
export CFLAGS="-O3 -qhot -qsimd=auto -L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r"
export FCFLAGS="-O3 -qhot -qsimd=auto"
export LIBS="-L/soft/compilers/ibmcmp-feb2013/xlf/bg/14.1/lib64 -lxlf90_r -lxlfmath -lm"
make clean
./configure --prefix=/home/tkaiser/p3dfft/build9 \
--enable-ibm --enable-fftw --with-fftw=/home/tkaiser/lib/build6/fftw/build6
make
make install

[tkaiser@vestalac1 p3dfft]$ cat /home/tkaiser/lib/build6/fftw/build6/script6 
export CC=bgxlc_r
export F77=bgxlf90_r
export FC=bgxlf90_r
export CFLAGS="-O3 -qhot -qsimd=auto"
export FCFLAGS="-O3 -qhot -qsimd=auto"
rm -rf fftw-3.3.3
tar -xzf fftw-3.3.3.tar.gz
cd /home/tkaiser/lib/fftw-3.3.3
./configure --prefix=/home/tkaiser/lib/build6/fftw/build6
make
make install

Run Script

[tkaiser@vestalac1 p3dfft]$ cat mscript 
#!/bin/bash
# Benchmark driver: runs test_inverse_f.x from each p3dfft build for every
# combination of P (16, 32, 64) and ND (8, 16), appending the output of each
# run to out<P>_<NT> (NT = P * ND), then condenses the timings into "results".

  echo "Starting Cobalt job script"
  # Create a directory named after the job id, move into it, and keep a copy
  # of this script and of the environment there for later reference.
  mkdir $COBALT_JOBID
  cd $COBALT_JOBID
  cat $0 > script
  printenv  > env
  # Input files are taken from the parent directory.
  cp ../stdin .
  cp ../dims .
export OMP_NUM_THREADS=1
# APP0 is the copy installed under /soft; APP3..APP9 are the same sample
# program from the correspondingly numbered build prefixes under
# /home/tkaiser/p3dfft.
export APP0=/soft/libraries/3rdparty/p3dfft-2.4-patched/share/p3dfft-samples/test_inverse_f.x
export APP3=/home/tkaiser/p3dfft/build3/share/p3dfft-samples/test_inverse_f.x
export APP4=/home/tkaiser/p3dfft/build4/share/p3dfft-samples/test_inverse_f.x
export APP5=/home/tkaiser/p3dfft/build5/share/p3dfft-samples/test_inverse_f.x
export APP6=/home/tkaiser/p3dfft/build6/share/p3dfft-samples/test_inverse_f.x
export APP7=/home/tkaiser/p3dfft/build7/share/p3dfft-samples/test_inverse_f.x
export APP8=/home/tkaiser/p3dfft/build8/share/p3dfft-samples/test_inverse_f.x
export APP9=/home/tkaiser/p3dfft/build9/share/p3dfft-samples/test_inverse_f.x
#for P in 1 2 4
# P is handed to "runjob -p"; presumably the number of tasks per node.
for P in 16 32 64
do
#for NT in 1 4 8 16 32 
#for NT in 16 32 64
# ND scales P to the total task count NT; presumably the number of nodes.
for ND in 8 16
do
#for APP in   $APP0 $APP3 $APP4 $APP5 $APP6 $APP7 $APP8 $APP9
 for APP in   $APP0 $APP5 $APP8 $APP4 $APP6 $APP3 $APP7 $APP9
do
# Total task count for this run, passed to "runjob -n".
NT=`expr $P \* $ND`
# Write the executable path first so each block of output can be attributed
# to its build, then append the program's output to the same file.
# NOTE(review): the output path is absolute while the directory above was
# created relative to the submission directory; this assumes the job was
# submitted from /home/tkaiser/p3dfft.
echo $APP >> /home/tkaiser/p3dfft/$COBALT_JOBID/out"$P"_$NT
runjob  -p $P -n $NT --block $COBALT_PARTNAME  --envs OMP_NUM_THREADS=1 : $APP  >> /home/tkaiser/p3dfft/$COBALT_JOBID/out"$P"_$NT
done
done
done
#runjob  -p $P -n $NT --block $COBALT_PARTNAME  --envs OMP_NUM_THREADS=1 : /home/tkaiser/hello/hello.omp > /home/tkaiser/p3dfft/$COBALT_JOBID/map"$P"_$NT
#### run command ####
#### qsub -q default -n 4 -t 02:00:00 --mode script lscript

# Pull the executable-path lines and the "cpu time per loop" lines out of
# every output file and sort them into "results".
# NOTE(review): jlines is not a standard utility; presumably it joins each
# pair of consecutive lines into one -- confirm.
egrep "test_inverse|cpu time per loop" out* | jlines 2 | sort > results
# For each output file, append its name and the wc counts of the distinct
# strings left after stripping everything up to the last ")" on lines
# containing "Task".
for o in out16_128  out16_256  out32_256  out32_512  out64_1024  out64_512 ; do echo $o ; grep Task $o | sed "s/.*)//"  | sort -u | wc ; done >> results

[tkaiser@vestalac1 p3dfft]$