# A test of parallel computing on multiple processors, for the case where files must be copied from /scr or another arbitrary location outside AFS.

# "defq" is the default, route-only queue; its
# targets are the common and ib queues
#PBS -q defq
# Ask for 20 minutes of wall-clock time for the whole job
#PBS -l walltime=00:20:00
# Ask for 5 minutes of CPU time for the whole job
#PBS -l cput=00:05:00
# Ask for 10 CPUs in total
#PBS -l nodes=10
## Ask for 10 hosts x 4 cpus
##PBS -l nodes=wn001.jinr.ru:ppn=4+wn002.jinr.ru:ppn=4+wn003.jinr.ru:ppn=4+wn004.jinr.ru:ppn=4+wn005.jinr.ru:ppn=4+wn006.jinr.ru:ppn=4+wn007.jinr.ru:ppn=4+wn008.jinr.ru:ppn=4+wn009.jinr.ru:ppn=4+wn010.jinr.ru:ppn=4
# Job is not restartable
#PBS -r n
# No checkpoint for the job (not implemented on linux)
#PBS -c n
#
# start of the real jobs from here
#
# ---------------------------------------------------------------------------
# Diagnostics: log who runs the job, where it runs, which hosts were
# allocated, and which Kerberos/AFS credentials are visible.
# Every step is best-effort: a failing command only leaves its error text in
# the log. stderr is merged into stdout (2>&1) so messages stay in order.
# ---------------------------------------------------------------------------
echo "=================================================="
# Print user name. The redirection sits inside the substitution so that it
# applies to whoami itself rather than to the variable assignment.
myname=$(whoami 2>&1)
echo "whoami=$myname"
# Print execution host (mother superior)
echo "Mother Superior host: $(hostname -f)"
# $PBS_NODEFILE is read through quoted expansions: if the variable is unset
# the commands fail with an error instead of blocking on stdin, which is
# what an argument-less `cat` would do.
echo "Number of allocated CPU: $(wc -l < "$PBS_NODEFILE")"
echo "All allocated hosts:"
sort -u "$PBS_NODEFILE"
# Print current directory
echo "pwd=$(pwd 2>&1)"
# Check for PAG (NOTE(review): presumably the AFS PAG is expected to show up
# in the group list printed by id - confirm)
echo "id=$(id 2>&1)"
# Check for my procs and environment; blank lines are dropped from the
# credential listings to keep the log compact.
echo "==================== klist ======================"
klist -5 2>&1 | grep -Ev '^[[:space:]]*$'
echo "==================== tokens ======================"
tokens 2>&1 | grep -Ev '^[[:space:]]*$'
echo "============== my processes ======================"
# -F: match the user name literally; the two -v filters hide this pipeline's
# own grep and ps processes.
ps -ef | grep -F -- "$myname" | grep -v 'grep ' | grep -v 'ps -ef'
echo "============ /tmp/tkt* /tmp/krb5cc* ============"
# Newest five credential-cache entries in /tmp that mention the user name.
/bin/ls -lt /tmp/tkt* /tmp/krb5cc* 2>&1 | grep -F -- "$myname" | head -5
echo "=================================================="
# The program source is in lxpub05:/scr/u/vmi/myprog.c
# Go to $TMPDIR and copy myprog.c there.
# Exit status: 1 if $TMPDIR cannot be entered, 2 if the copy fails.
# Error messages are written to stdout, like the rest of the job log.
echo "cd \$TMPDIR"
# An unset or empty TMPDIR must be treated as an error explicitly: an
# unquoted `cd $TMPDIR` would silently change to $HOME and "succeed".
if test -z "$TMPDIR" || ! cd -- "$TMPDIR" 2>&1 ; then
  echo "ERROR: can not cd to \"$TMPDIR\""
  exit 1
fi
# Print current directory
echo "pwd=$(pwd 2>&1)"
# -p preserves times and modes of the source file; -2 is the original
# script's protocol-version switch and is kept as is.
if ! scp -p2 lxpub05:/scr/u/vmi/myprog.c . ; then
  echo "ERROR: can not scp lxpub05:/scr/u/vmi/myprog.c"
  exit 2
fi
echo "Got myprog.c from lxpub05:/scr/u/vmi/myprog.c"
# Build the executable myprog from myprog.c in the current directory.
# Exit status: 4 if the compiler reports a failure, so that later steps do
# not run with a missing or stale binary.
# `command -v` is the portable way to report which mpicc will be used.
echo "mpicc: $(command -v mpicc 2>&1 || echo 'not found')"
echo "mpicc -i-dynamic -o myprog myprog.c"
# NOTE(review): -i-dynamic looks like an Intel-compiler specific flag passed
# through the mpicc wrapper - confirm it is still accepted by the installed
# compiler.
if ! mpicc -i-dynamic -o myprog myprog.c 2>&1 ; then
  echo "ERROR: can not build myprog from myprog.c"
  exit 4
fi
# Distribute the program to every allocated node except this one, placing it
# in the same directory path as the current working directory.
# Exit status: 3 as soon as a copy to any node fails.
# The local host name and working directory do not change inside the loop,
# so they are looked up once.
this_host=$(hostname -f)
work_dir=$(pwd)
# Host names contain no whitespace, so word-splitting the de-duplicated node
# list is safe here.
for node in $(sort -u "$PBS_NODEFILE") ; do
  test "X$node" = "X$this_host" && continue
  # Create the target directory on the remote node. -n keeps ssh from
  # reading (and swallowing) the job's standard input. A failure here is
  # caught by the scp check just below.
  ssh -n -2x "$node" "mkdir -p '$work_dir'"
  if ! scp -p2 myprog "$node:$work_dir" ; then
    echo "ERROR: can not scp myprog to $node:$work_dir"
    exit 3
  fi
  echo "Copy done to $node:$work_dir/myprog"
done
# And run it on all allocated nodes.
# If ./myprog is missing or not executable, only a message is logged.
# The script always finishes by printing "done".
echo "mpiexec: $(command -v mpiexec 2>&1 || echo 'not found')"
if test -x myprog ; then
  # Is the InfiniBand core kernel module loaded on this node? The trailing
  # space in the pattern makes it match the module name ib_core exactly.
  if /sbin/lsmod | grep -q '^ib_core ' ; then
    # With InfiniBand - default transport selection.
    # Raise the locked-memory limit first (NOTE(review): presumably needed
    # for the InfiniBand transport's registered memory - confirm).
    echo "ulimit -l 262144"
    ulimit -l 262144 2>&1
    echo "================= run program ===================="
    echo "mpiexec ./myprog"
    mpiexec ./myprog 2>&1
  else
    # No InfiniBand - exclude the openib BTL component. The caret is quoted
    # so that no shell treats it specially.
    echo "================= run program ===================="
    echo "mpiexec --mca btl ^openib ./myprog"
    mpiexec --mca btl '^openib' ./myprog 2>&1
  fi
else
  echo "myprog (executable) not found in $(pwd)"
fi
echo "=================================================="
# That's all
echo "done"