#! /bin/bash

#
# Copyright (c) 2015 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder.  You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Copyright (c) 2015 University of Illinois Urbana Champaign
# All rights reserved
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Gabor Dozsa
#          Mohammad Alian


# This is a wrapper script to run dist-gem5 simulations.
# See the usage_func() below for hints on how to use it. Also,
# there are some examples in the util/dist directory (e.g.
# see util/dist/test-2nodes-AArch64.sh)
#
#
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
# environment variable (which is what LSF does by default).
# E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots
# allocated to launch the gem5 processes, 2 of them are on host hname1
# and 4 of them are on host hname2.
# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
# processes on the localhost.
#
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
# boot params. The total number of gem5 processes is also passed in.
# These values can be used in the boot script to configure the MAC/IP
# addresses - among other things (see util/dist/bootscript.rcS).
#
# Each gem5 process will create an m5out.$GEM5_RANK directory for
# the usual output files. Furthermore, there will be a separate log file
# for each ssh session (we use ssh to start gem5 processes) and one for
# the server. These are called log.$GEM5_RANK and log.switch.
#


# Print the command line help to stdout.
# Arguments: none
# Outputs:   the synopsis line followed by one line per option value
usage_func ()
{
    # The option letters shown in the synopsis have to match the labels of
    # the command line parser: the switch config is selected with -s
    # (or -switch) and the full system config with -f (or -fullsystem).
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch]  [--sw-args sw_args] [-f fullsystem]  [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo "     -debug    : debug mode (start gem5 in gdb)"
    echo "     nnodes    : number of gem5 processes"
    echo "     rundir    : run simulation under this path. If not specified, current dir will be used"
    echo "     ckptdir   : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo "     fullsystem: fullsystem config file"
    echo "     fs_args   : fullsystem config specific argument list: arg1 arg2 ..."
    echo "     port      : switch listen port"
    echo "     switch    : switch config file"
    echo "     sw_args   : switch config specific argument list: arg1 arg2 ..."
    echo "     conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo "     gem5_exe  : gem5 executable (full path required)"
    echo "     m5_args   : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}

# Process (optional) command line options.
#
# The wrapper's own options are stored in the global variables GEM5_DEBUG,
# NNODES, RUN_DIR, CKPT_DIR, SW_PORT, SW_CONFIG, FS_CONFIG and GEM5_EXE.
# --sw-args, --fs-args, --cf-args and --m5-args select the list (SW_ARGS,
# FS_ARGS, CF_ARGS or M5_ARGS) that every following option not known to the
# wrapper is appended to, together with its value if it has one.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Report an option that lacks its mandatory value and terminate.
#   $1 - the offending option
missing_value_func ()
{
    echo "Missing value for option: $1"
    usage_func
    exit 1
}

# Parse the command line words given as arguments (normally "$@").
# Exits with status 1 on an unknown word or a missing option value and
# with -1 on a pass-through option that is not preceded by one of the
# --xx-args selectors.
parse_cmdline_func ()
{
    # name of the list that currently collects the pass-through options
    local CUR_ARGS=""
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                # "shift 2" fails without shifting anything when the value
                # is missing, which would make this loop spin forever
                (($# >= 2)) || missing_value_func "$1"
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                (($# >= 2)) || missing_value_func "$1"
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                (($# >= 2)) || missing_value_func "$1"
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                (($# >= 2)) || missing_value_func "$1"
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                (($# >= 2)) || missing_value_func "$1"
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                (($# >= 2)) || missing_value_func "$1"
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                (($# >= 2)) || missing_value_func "$1"
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
                # printf -v stores the text verbatim; unlike eval it never
                # re-interprets quotes, '$' or backquotes found in the words
                case "x$2" in
                    x-*|x)
                        # followed by another option (or by nothing):
                        # this option has no value
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                        shift 1
                        ;;
                    *)
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_cmdline_func "$@"

# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/dist/sw.py
DEFAULT_SW_PORT=2200

# ${VAR:=value} assigns the value only when VAR is unset or empty
: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Check that the config files and the gem5 executable we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ]   || { echo "Executable ${GEM5_EXE} not found"; exit 1; }

# Make sure that RUN_DIR exists. All the log files are written below it, so
# stop right here with a clear message if it cannot be created.
if ! mkdir -p -- "$RUN_DIR" > /dev/null 2>&1
then
    echo "Cannot create run directory ${RUN_DIR}"
    exit 1
fi

declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    LSB_MCPU_HOSTS, NNODES
# Globals written: HOSTS (host names), NCORES (slots of the host with the
#                  same index), NH (total number of slots) and
#                  LSB_MCPU_HOSTS (only when it was empty)
# Exits with -1 if the number of slots differs from NNODES.
collect_hosts_func ()
{
    # host is non-empty while the slot count of that host is still to come
    local host="" hc
    NH=0
    HOSTS=()
    NCORES=()
    [ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
    # unquoted on purpose: the list is split into its words
    for hc in $LSB_MCPU_HOSTS
    do
        if [ "x$host" == "x" ]
        then
            host=$hc
            HOSTS+=("$hc")
        else
            NCORES+=("$hc")
            ((NH+=hc))
            host=""
        fi
    done
    # the number of requested gem5 instances is kept in NNODES
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}

collect_hosts_func

# Clean up handler: tear down everything that was launched and terminate
# with exit status -1.
# Globals read: SW_PID, GEM5_EXE, HOSTS, WATCHDOG_PID
abort_func ()
{
    local gem5_name node

    echo
    echo "KILLED $(date)"
    # The switch goes first. Its exit should make all the connected gem5
    # processes exit as well.
    if [ -n "$SW_PID" ]
    then
        kill $SW_PID 2>/dev/null
    fi
    sleep 20
    # Just in case the switch triggered exit did not work: (try to) kill
    # the gem5 processes by name, locally and on every allocated host.
    gem5_name=$(basename $GEM5_EXE)
    killall -q -s SIGKILL $gem5_name
    for node in ${HOSTS[@]}
    do
        ssh $node killall -q -s SIGKILL $gem5_name
    done
    sleep 5
    # the watchdog is the last one to go
    if [ -n "$WATCHDOG_PID" ]
    then
        kill $WATCHDOG_PID 2>/dev/null
    fi
    exit -1
}

# Watchdog: poll the launched processes every 30 seconds and trigger the
# full clean up (abort_func) once any of them is gone.
# Globals read: SSH_PIDS, SW_PID      Globals written: NDEAD
watchdog_func ()
{
    local pid

    while :
    do
        sleep 30
        NDEAD=0
        for pid in ${SSH_PIDS[*]}
        do
            # kill -0 sends no signal, it only tests that the PID exists
            if ! kill -0 $pid 2>/dev/null
            then
                ((NDEAD+=1))
            fi
        done
        if ! kill -0 $SW_PID
        then
            ((NDEAD+=1))
        fi
        # everybody is still alive: go back to sleep
        if ((NDEAD==0))
        then
            continue
        fi
        # we may be in the middle of an orderly termination,
        # give it some time to complete before reporting abort
        sleep 60
        echo -n "(I) (some) gem5 process(es) exited"
        abort_func
    done
}

# Start one gem5 process in the background. Having a function for this
# makes it possible to start gem5 under gdb control for debugging.
#   $1    - rank of the process, used as suffix of its log file
#   $2    - host to start the process on
#   $3    - environment assignments put in front of the remote command
#   $4... - the gem5 command line
# The caller finds the PID of the background job in $!.
start_func ()
{
    local rank=$1 target=$2 remote_env=$3
    shift 3

    if [ -z "$GEM5_DEBUG" ]
    then
        # Regular run: start gem5 through ssh and collect all its output in
        # the log file of the rank. remote_env is left unquoted so that it
        # expands into one word per assignment.
        ssh $target $remote_env "$@" &> $RUN_DIR/log.$rank &
        return
    fi

    # Debug run: gdb in a terminal of its own
    echo "DEBUG starting terminal..."
    MY_ARGS="$*"
    xterm -e "gdb --args $MY_ARGS" &
}

# Block until a launched process reports in its log file that it is up.
#   $1 - log file to watch
#   $2 - text whose appearance in the log file signals the start
#   $3 - name of the process (used in the messages only)
#   $4 - PID that has to stay alive while we are waiting
# Exits with -1 if the PID is gone before the text shows up.
connected ()
{
    local logfile=$1 marker=$2

    echo -n "waiting for $3 to start "
    while true
    do
        # give up as soon as the process we are waiting for has died
        if ! kill -0 $4
        then
            echo "Failed to start $3"
            exit -1
        fi
        if [[ -f "$logfile" ]] && grep -q "$marker" "$logfile"
        then
            echo -e "\nnode #$3 started" && return 0
        fi

        # not there yet: show progress and poll again in a moment
        sleep 2
        echo -n "."
    done
}

# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

#cleanup log files before starting gem5 processes
rm $RUN_DIR/log.switch > /dev/null 2>&1

# make sure that the checkpoint directory of the switch exists
mkdir -p $CKPT_DIR/m5out.switch > /dev/null 2>&1
# launch switch gem5 on the first allocated host
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.switch   \
          $M5_ARGS                                                            \
          $SW_CONFIG                                                          \
          $SW_ARGS                                                            \
          $CF_ARGS                                                            \
          --checkpoint-dir=$CKPT_DIR/m5out.switch                             \
          --is-switch                                                         \
          --dist-size=$NNODES                                                 \
          --dist-server-port=$SW_PORT
# start_func leaves the PID of the background job in $!
SW_PID=$!

# block here till switch process starts
# NOTE(review): in -debug mode start_func does not redirect any output into
# log.switch, so this wait looks like it can only end when the xterm
# exits - confirm.
connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID
LINE=$(grep "tcp_iface listening on port" $RUN_DIR/log.switch)

# The actual port that the switch is listening on may be different
# from what we specified if the port was busy.
# Take the first number that follows the announcement text instead of a
# fixed word position, so that a different prefix in front of the message
# cannot make us pick the wrong word. If no number is found, the
# configured port is kept rather than being replaced by an empty value.
SW_PORT_RE='tcp_iface listening on port[^0-9]*([0-9]+)'
if [[ "$LINE" =~ $SW_PORT_RE ]]
then
    SW_PORT=${BASH_REMATCH[1]}
else
    echo "(W) Cannot find the switch port in $RUN_DIR/log.switch, using port $SW_PORT"
fi

# Now launch all the gem5 processes with ssh: NCORES[i] processes go to
# HOSTS[i]; n is the rank given to the next process.
echo "START $(date)"
n=0
for ((i=0; i < ${#HOSTS[@]}; i++))
do
    h=${HOSTS[$i]}
    for ((j=0; j < ${NCORES[i]}; j++))
    do
        # remove the stale log file of this rank before starting gem5
        rm $RUN_DIR/log.$n > /dev/null 2>&1
        # the checkpoint directory of this rank has to exist
        mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # The expansions are left unquoted so that the argument lists
        # (M5_ARGS, FS_ARGS, CF_ARGS) are split into separate words.
        node_cmd=($GEM5_EXE -d $RUN_DIR/m5out.$n
                  $M5_ARGS
                  $FS_CONFIG
                  $FS_ARGS
                  $CF_ARGS
                  --checkpoint-dir=$CKPT_DIR/m5out.$n
                  --dist
                  --dist-rank=$n
                  --dist-size=$NNODES
                  --dist-server-name=${HOSTS[0]}
                  --dist-server-port=$SW_PORT)
        start_func $n $h "$ENV_ARGS" "${node_cmd[@]}"
        # start_func leaves the PID of the background job in $!
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done

# Wait here if it is a debug session
[ "x$GEM5_DEBUG" == "x" ] || {  echo "DEBUG session"; wait $SW_PID; exit -1; }

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# Wait for every PID given as argument and count in the global NFAIL how
# many of them ended with a non-zero exit status.
#   $1... - PIDs of background jobs of this shell
wait_all_func ()
{
    local pid
    NFAIL=0
    for pid in "$@"
    do
        wait "$pid" || ((NFAIL+=1))
    done
}

# wait for exit statuses of the gem5 processes and of the switch
# (unquoted on purpose: an empty value must not become an empty argument)
wait_all_func ${SSH_PIDS[*]} $SW_PID

# all done, let's terminate the watchdog
kill $WATCHDOG_PID 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    # let the caller see the failure in the exit status as well
    exit 1
fi