From 64ca31976fe91eedd91b2d703c6e3e62328f8e1d Mon Sep 17 00:00:00 2001 From: Gabor Dozsa Date: Thu, 7 Jan 2016 16:33:47 -0600 Subject: config: Updates for distributed gem5 simulations --- util/dist/gem5-dist.sh | 385 ++++++++++++++++++++++++++++++++++ util/dist/test/simple_bootscript.rcS | 107 ++++++++++ util/dist/test/test-2nodes-AArch64.sh | 82 ++++++++ 3 files changed, 574 insertions(+) create mode 100755 util/dist/gem5-dist.sh create mode 100644 util/dist/test/simple_bootscript.rcS create mode 100644 util/dist/test/test-2nodes-AArch64.sh (limited to 'util/dist') diff --git a/util/dist/gem5-dist.sh b/util/dist/gem5-dist.sh new file mode 100755 index 000000000..8fa799acc --- /dev/null +++ b/util/dist/gem5-dist.sh @@ -0,0 +1,385 @@ +#! /bin/bash + +# +# Copyright (c) 2015 ARM Limited +# All rights reserved +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2015 University of Illinois Urbana Champaign +# All rights reserved +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Gabor Dozsa +# Mohammad Alian + + +# This is a wrapper script to run a dist gem5 simulations. +# See the usage_func() below for hints on how to use it. Also, +# there are some examples in the util/dist directory (e.g. +# see util/dist/test-2nodes-AArch64.sh) +# +# +# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS +# environment variable (which is what LSF does by default). +# E.g. LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots +# allocated to launch the gem5 processes, 2 of them are on host hname1 +# and 4 of them are on host hname2. +# If LSB_MCPU_HOSTS environment variable is not defined then we launch all +# processes on the localhost. +# +# Each gem5 process are passed in a unique rank ID [0..N-1] via the kernel +# boot params. The total number of gem5 processes is also passed in. +# These values can be used in the boot script to configure the MAC/IP +# addresses - among other things (see util/dist/bootscript.rcS). +# +# Each gem5 process will create an m5out.$GEM5_RANK directory for +# the usual output files. Furthermore, there will be a separate log file +# for each ssh session (we use ssh to start gem5 processes) and one for +# the server. These are called log.$GEM5_RANK and log.switch. +# + + +# print help +usage_func () +{ + echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-sw switch] [--sw-args sw_args] [-fs fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe " + echo " -debug : debug mode (start gem5 in gdb)" + echo " nnodes : number of gem5 processes" + echo " rundir : run simulation under this path. If not specified, current dir will be used" + echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used" + + echo " fullsystem: fullsystem config file" + echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..." + echo " port : switch listen port" + echo " switch : switch config file" + echo " sw_args : switch config specific argument list: arg1 arg2 ..." + echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..." + echo " gem5_exe : gem5 executable (full path required)" + echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..." + echo "Note: if no LSF slots allocation is found all proceses are launched on the localhost." +} + +# Process (optional) command line options +FS_ARGS=" " +SW_ARGS=" " +CF_ARGS=" " +M5_ARGS=" " +while (($# > 0)) +do + case "x$1" in + x-debug) + GEM5_DEBUG="-debug" + shift 1 + ;; + x-n|x-nodes) + NNODES=$2 + shift 2 + ;; + x-r|x-rundir) + RUN_DIR=$2 + shift 2 + ;; + x-c|x-ckptdir) + CKPT_DIR=$2 + shift 2 + ;; + x-p|x-port) + SW_PORT=$2 + shift 2 + ;; + x-s|x-switch) + SW_CONFIG=$2 + shift 2 + ;; + x--sw-args) + CUR_ARGS="SW_ARGS" + shift 1 + ;; + x-f|x-fullsystem) + FS_CONFIG=$2 + shift 2 + ;; + x--fs-args) + CUR_ARGS="FS_ARGS" + shift 1 + ;; + x--cf-args) + CUR_ARGS="CF_ARGS" + shift 1 + ;; + x--m5-args) + CUR_ARGS="M5_ARGS" + shift 1 + ;; + x-x) + GEM5_EXE=$2 + shift 2 + ;; + x-*) + [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; } + case "x$2" in + x-*|x) + eval $CUR_ARGS=\"${!CUR_ARGS} $1\" + shift 1 + ;; + *) + eval $CUR_ARGS=\"${!CUR_ARGS} $1 $2\" + shift 2 + ;; + esac + ;; + *) + echo "Unknown arg: $1" + usage_func + exit 1 + ;; + esac +done + +# Default values to use (in case they are not defined as command line options) +DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py +DEFAULT_SW_CONFIG=$M5_PATH/configs/example/sw.py +DEFAULT_SW_PORT=2200 + +[ -z "$FS_CONFIG" ] && FS_CONFIG=$DEFAULT_FS_CONFIG +[ -z "$SW_CONFIG" ] && SW_CONFIG=$DEFAULT_SW_CONFIG +[ -z "$SW_PORT" ] && SW_PORT=$DEFAULT_SW_PORT +[ -z "$NNODES" ] && NNODES=2 +[ -z "$RUN_DIR" ] && RUN_DIR=$(pwd) +[ -z "$CKPT_DIR" ] && CKPT_DIR=$(pwd) + +# Check if all the executables we need exist +[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; } +[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; } +[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; } +# make sure that RUN_DIR exists +mkdir -p $RUN_DIR > /dev/null 2>&1 + +declare -a SSH_PIDS +declare -a HOSTS +declare -a NCORES + +# Find out which cluster hosts/slots are allocated or +# use localhost if there is no LSF allocation. +# We assume that allocated slots are listed in the LSB_MCPU_HOSTS +# environment variable in the form: +# host1 nslots1 host2 nslots2 ... +# (This is what LSF does by default.) +NH=0 +[ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES" +host="" +for hc in $LSB_MCPU_HOSTS +do + if [ "x$host" == "x" ] + then + host=$hc + HOSTS+=($hc) + else + NCORES+=($hc) + ((NH+=hc)) + host="" + fi +done +((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($N) differ"; exit -1; } + +# function to clean up and abort if something goes wrong +abort_func () +{ + echo + echo "KILLED $(date)" + # Try to Kill the server first. That should trigger an exit for all connected + # gem5 processes. + [ "x$SW_PID" != "x" ] && kill $SW_PID 2>/dev/null + sleep 20 + # (try to) kill gem5 processes - just in case something went wrong with the + # server triggered exit + bname=$(basename $GEM5_EXE) + killall -q -s SIGKILL $bname + for h in ${HOSTS[@]} + do + ssh $h killall -q -s SIGKILL $bname + done + sleep 5 + # kill the watchdog + [ "x$WATCHDOG_PID" != "x" ] && kill $WATCHDOG_PID 2>/dev/null + exit -1 +} + +# We need a watchdog to trigger full clean up if a gem5 process dies +watchdog_func () +{ + while true + do + sleep 30 + ((NDEAD=0)) + for p in ${SSH_PIDS[*]} + do + kill -0 $p 2>/dev/null || ((NDEAD+=1)) + done + kill -0 $SW_PID || ((NDEAD+=1)) + if ((NDEAD>0)) + then + # we may be in the middle of an orderly termination, + # give it some time to complete before reporting abort + sleep 60 + echo -n "(I) (some) gem5 process(es) exited" + abort_func + fi + done +} + +# This function launches the gem5 processes. The only purpose is to enable +# launching gem5 processes under gdb control for debugging +start_func () +{ + local N=$1 + local HOST=$2 + local ENV_ARGS=$3 + shift 3 + if [ "x$GEM5_DEBUG" != "x" ] + then + echo "DEBUG starting terminal..." + MY_ARGS="$@" + xterm -e "gdb --args $MY_ARGS" & + else + ssh $HOST $ENV_ARGS "$@" &> $RUN_DIR/log.$N & + fi +} + +# block till the gem5 process starts +connected () +{ + FILE=$1 + STRING=$2 + echo -n "waiting for $3 to start " + while : ; + do + kill -0 $4 || { echo "Failed to start $3"; exit -1; } + [[ -f "$FILE" ]] && \ + grep -q "$STRING" "$FILE" && \ + echo -e "\nnode #$3 started" && \ + break + + sleep 2 + echo -n "." + done +} + +# Trigger full clean up in case we are being killed by external signal +trap 'abort_func' INT TERM + +# env args to be passed explicitly to gem5 processes started via ssh +ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH" + +#cleanup log files before starting gem5 processes +rm $RUN_DIR/log.switch > /dev/null 2>&1 + +# make sure that CKPT_DIR exists +mkdir -p $CKPT_DIR/m5out.switch > /dev/null 2>&1 +# launch switch gem5 +SW_HOST=${HOSTS[0]} +echo "launch switch gem5 process on $SW_HOST ..." +start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.switch \ + $M5_ARGS \ + $SW_CONFIG \ + $SW_ARGS \ + $CF_ARGS \ + --checkpoint-dir=$CKPT_DIR/m5out.switch \ + --is-switch \ + --dist-size=$NNODES \ + --dist-server-port=$SW_PORT +SW_PID=$! + +# block here till switch process starts +connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID +LINE=$(grep -r "tcp_iface listening on port" $RUN_DIR/log.switch) + +IFS=' ' read -ra ADDR <<< "$LINE" +# actual port that switch is listening on may be different +# from what we specified if the port was busy +SW_PORT=${ADDR[5]} + +# Now launch all the gem5 processes with ssh. +echo "START $(date)" +n=0 +for ((i=0; i < ${#HOSTS[@]}; i++)) +do + h=${HOSTS[$i]} + for ((j=0; j < ${NCORES[i]}; j++)) + do + #cleanup log files before starting gem5 processes + rm $RUN_DIR/log.$n > /dev/null 2>&1 + # make sure that CKPT_DIR exists + mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1 + echo "starting gem5 on $h ..." + start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.$n \ + $M5_ARGS \ + $FS_CONFIG \ + $FS_ARGS \ + $CF_ARGS \ + --checkpoint-dir=$CKPT_DIR/m5out.$n \ + --dist \ + --dist-rank=$n \ + --dist-size=$NNODES \ + --dist-server-name=${HOSTS[0]} \ + --dist-server-port=$SW_PORT + SSH_PIDS[$n]=$! + ((n+=1)) + done +done + +# Wait here if it is a debug session +[ "x$GEM5_DEBUG" == "x" ] || { echo "DEBUG session"; wait $SW_PID; exit -1; } + +# start watchdog to trigger complete abort (after a grace period) if any +# gem5 process dies +watchdog_func & +WATCHDOG_PID=$! + +# wait for exit statuses +((NFAIL=0)) +for p in ${SSH_PIDS[*]} +do + wait $p || ((NFAIL+=1)) +done +wait $SW_PID || ((NFAIL+=1)) + +# all done, let's terminate the watchdog +kill $WATCHDOG_PID 2>/dev/null + +if ((NFAIL==0)) +then + echo "EXIT $(date)" +else + echo "ABORT $(date)" +fi diff --git a/util/dist/test/simple_bootscript.rcS b/util/dist/test/simple_bootscript.rcS new file mode 100644 index 000000000..7c9b75538 --- /dev/null +++ b/util/dist/test/simple_bootscript.rcS @@ -0,0 +1,107 @@ +#!/bin/bash + + +# +# Copyright (c) 2015 ARM Limited +# All rights reserved +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Gabor Dozsa +# +# +# This is an example boot script to use for dist-gem5 runs. The important +# task here is to extract the rank and size information through the m5 +# initparam utility and use those to configure MAC/IP addresses and hostname. +# +# You are expected to customize this scipt for your needs (e.g. change +# the command at the end of the scipt to run your tests/workloads. + +source /root/.bashrc +echo "bootscript.rcS is running" + +# Retrieve dist-gem5 rank and size parameters using the 'm5' utility +MY_RANK=$(/sbin/m5 initparam dist-rank) +[ $? = 0 ] || { echo "m5 initparam failed"; exit -1; } +MY_SIZE=$(/sbin/m5 initparam dist-size) +[ $? = 0 ] || { echo "m5 initparam failed"; exit -1; } + +/bin/hostname node${MY_RANK} + +# Keep MAC address assignment simple for now ... +(($MY_RANK > 97)) && { echo "(E) Rank must be less than 98"; /sbin/m5 abort; } +((MY_ADDR = MY_RANK + 2)) +if (($MY_ADDR < 10)) +then + MY_ADDR_PADDED=0${MY_ADDR} +else + MY_ADDR_PADDED=${MY_ADDR} +fi + +/sbin/ifconfig eth0 hw ether 00:90:00:00:00:${MY_ADDR_PADDED} +/sbin/ifconfig eth0 192.168.0.${MY_ADDR} netmask 255.255.255.0 up + +/sbin/ifconfig -a + +echo "Hello from $MY_RANK of $MY_SIZE" + +# Now that our network interface is configured we can use the usual commands to +# contact the other systems, e.g. let's try to ping a "neighbour" system +if ((MY_RANK < MY_SIZE - 1)) +then + ping -c 1 192.168.0.$((MY_ADDR + 1)) +else + ping -c 1 192.168.0.2 +fi + + +if [ "$MY_RANK" == "0" ] +then + # Trigger an immediate checkpoint at the next sync (by passing a non-zero + # delay param to m5 ckpt) + /sbin/m5 checkpoint 1 + echo "A real multi node workload might start here ..." + # Trigger an immediate exit at the next sync (by passing a non-zero delay + # param to m5 exit) + /sbin/m5 exit 1 +else + # This is to avoid other (rank!=0) gem5 processes exiting + # before the test (started by rank 0) completes. When rank 0 completes the + # test it will exit and that will trigger a notification to all the peer + # gem5 processes to stop the simulation. + echo "sleep forever..." + while /bin/true + do + sleep 5 + done +fi diff --git a/util/dist/test/test-2nodes-AArch64.sh b/util/dist/test/test-2nodes-AArch64.sh new file mode 100644 index 000000000..ebdf07110 --- /dev/null +++ b/util/dist/test/test-2nodes-AArch64.sh @@ -0,0 +1,82 @@ +#! /bin/bash + +# +# Copyright (c) 2015 ARM Limited +# All rights reserved +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Gabor Dozsa +# +# +# This is an example script to start a dist gem5 simulations using +# two AArch64 systems. It is also uses the example +# dist gem5 bootscript util/dist/test/simple_bootscript.rcS that will +# run the linux ping command to check if we can see the peer system +# connected via the simulated Ethernet link. + +GEM5_DIR=$(pwd)/$(dirname $0)/../../.. + +IMG=$M5_PATH/disks/aarch64-ubuntu-trusty-headless.img +VMLINUX=$M5_PATH/binaries/vmlinux.aarch64.20140821 +DTB=$M5_PATH/binaries/vexpress.aarch64.20140821.dtb + +FS_CONFIG=$GEM5_DIR/configs/example/fs.py +SW_CONFIG=$GEM5_DIR/configs/example/sw.py +GEM5_EXE=$GEM5_DIR/build/ARM/gem5.opt + +BOOT_SCRIPT=$GEM5_DIR/util/dist/test/simple_bootscript.rcS +GEM5_DIST_SH=$GEM5_DIR/util/dist/gem5-dist.sh + +DEBUG_FLAGS="--debug-flags=DistEthernet" +#CHKPT_RESTORE="-r1" + +NNODES=2 + +$GEM5_DIST_SH -n $NNODES \ + -x $GEM5_EXE \ + -s $SW_CONFIG \ + -f $FS_CONFIG \ + --m5-args \ + $DEBUG_FLAGS \ + --fs-args \ + --cpu-type=atomic \ + --num-cpus=1 \ + --machine-type=VExpress_EMM64 \ + --disk-image=$IMG \ + --kernel=$VMLINUX \ + --dtb-filename=$DTB \ + --script=$BOOT_SCRIPT \ + --cf-args \ + $CHKPT_RESTORE + -- cgit v1.2.3