summaryrefslogtreecommitdiff
path: root/util/dist/gem5-dist.sh
blob: c0b49128a8d51fb0aac5c0fac6717007d389902a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
#! /bin/bash

#
# Copyright (c) 2015 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder.  You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Copyright (c) 2015 University of Illinois Urbana Champaign
# All rights reserved
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Gabor Dozsa
#          Mohammad Alian


# This is a wrapper script to run dist-gem5 simulations.
# See the usage_func() below for hints on how to use it. Also,
# there are some examples in the util/dist directory (e.g.
# see util/dist/test-2nodes-AArch64.sh)
#
#
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
# environment variable (which is what LSF does by default).
# E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots
# allocated to launch the gem5 processes, 2 of them are on host hname1
# and 4 of them are on host hname2.
# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
# processes on the localhost.
#
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
# boot params. The total number of gem5 processes is also passed in.
# These values can be used in the boot script to configure the MAC/IP
# addresses - among other things (see util/dist/bootscript.rcS).
#
# Each gem5 process will create an m5out.$GEM5_RANK directory for
# the usual output files. Furthermore, there will be a separate log file
# for each ssh session (we use ssh to start gem5 processes) and one for
# the server. These are called log.$GEM5_RANK and log.switch.
#


# Print the command line help.
# Arguments: none
# Outputs:   the usage text on stdout
# Note: the option names shown here must match the ones accepted by the
# command line parser (-s/-switch and -f/-fullsystem).
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch]  [--sw-args sw_args] [-f fullsystem]  [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo "     -debug    : debug mode (start gem5 in gdb)"
    echo "     nnodes    : number of gem5 processes"
    echo "     rundir    : run simulation under this path. If not specified, current dir will be used"
    echo "     ckptdir   : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo "     fullsystem: fullsystem config file"
    echo "     fs_args   : fullsystem config specific argument list: arg1 arg2 ..."
    echo "     port      : switch listen port"
    echo "     switch    : switch config file"
    echo "     sw_args   : switch config specific argument list: arg1 arg2 ..."
    echo "     conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo "     gem5_exe  : gem5 executable (full path required)"
    echo "     m5_args   : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}

# Process (optional) command line options.
#
# Besides the fixed options, the four "argument list" switches (--sw-args,
# --fs-args, --cf-args, --m5-args) select one of the list variables below.
# Every following option that is not recognised (anything starting with '-')
# is appended to the most recently selected list, together with the next
# word if that word does not start with '-'.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Abort with the usage text unless the option in $1 is followed by a value.
# Without this check "shift 2" fails when the value is missing, nothing is
# shifted and the parsing loop never terminates.
# Arguments: the remaining command line words ("$@" of the parser)
need_value_func ()
{
    (($# >= 2)) && return 0
    echo "Missing value for option: $1"
    usage_func
    exit 1
}

# Parse the command line words given as arguments and set the matching
# global variables.
# Globals written: GEM5_DEBUG NNODES RUN_DIR CKPT_DIR SW_PORT SW_CONFIG
#                  FS_CONFIG GEM5_EXE CUR_ARGS FS_ARGS SW_ARGS CF_ARGS M5_ARGS
# Exits (after printing the usage) on unknown or malformed options.
parse_args_func ()
{
    # name of the list variable that currently collects pass-through options;
    # reset it so that a value inherited from the environment is never used
    CUR_ARGS=""
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                need_value_func "$@"
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                need_value_func "$@"
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                need_value_func "$@"
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                need_value_func "$@"
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                need_value_func "$@"
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                need_value_func "$@"
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                need_value_func "$@"
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
                # printf -v assigns through the variable name without
                # re-evaluating the user supplied text (unlike eval)
                case "x$2" in
                    x-*|x)
                        # flag without a value
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                        shift 1
                        ;;
                    *)
                        # flag followed by its value
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_args_func "$@"

# Built-in defaults; each one is applied only if the corresponding variable
# is still empty after command line processing.
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/dist/sw.py
DEFAULT_SW_PORT=2200

: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Refuse to continue unless both config scripts are regular files and the
# gem5 binary is executable.
if [ ! -f "$FS_CONFIG" ]
then
    echo "FS config ${FS_CONFIG} not found"
    exit 1
fi
if [ ! -f "$SW_CONFIG" ]
then
    echo "Switch config ${SW_CONFIG} not found"
    exit 1
fi
if [ ! -x "$GEM5_EXE" ]
then
    echo "Executable ${GEM5_EXE} not found"
    exit 1
fi
# create RUN_DIR if needed (any mkdir output is discarded)
mkdir -p $RUN_DIR &> /dev/null

# Declared at top level on purpose: "declare" inside a function would make
# these arrays local to that function.
declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    NNODES, LSB_MCPU_HOSTS
# Globals written: HOSTS (host names), NCORES (slots per host, same index
#                  as HOSTS), NH (total number of slots), LSB_MCPU_HOSTS
#                  (set to "127.0.0.1 $NNODES" if it was empty)
# Exits with an error message if the total slot count differs from NNODES.
find_hosts_func ()
{
    local host=""
    local hc
    HOSTS=()
    NCORES=()
    NH=0
    [ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
    # the list alternates between host name and slot count; $host is
    # non-empty while we are expecting the count that belongs to it
    for hc in $LSB_MCPU_HOSTS
    do
        if [ "x$host" == "x" ]
        then
            host=$hc
            HOSTS+=($hc)
        else
            NCORES+=($hc)
            ((NH+=hc))
            host=""
        fi
    done
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}

find_hosts_func

# Clean up everything and terminate the script with a failure status.
# Globals read: SW_PID, GEM5_EXE, HOSTS, WATCHDOG_PID
abort_func ()
{
    echo
    echo "KILLED $(date)"
    # Stop the server (switch) first: that should make every connected
    # gem5 process exit on its own.
    if [ -n "$SW_PID" ]
    then
        kill $SW_PID 2>/dev/null
    fi
    sleep 20
    # In case the server triggered exit did not work, SIGKILL whatever is
    # left of the gem5 processes, locally and on every allocated host.
    bname=$(basename $GEM5_EXE)
    killall -q -s SIGKILL $bname
    for h in ${HOSTS[@]}
    do
        ssh $h killall -q -s SIGKILL $bname
    done
    sleep 5
    # finally get rid of the watchdog (if it was started)
    if [ -n "$WATCHDOG_PID" ]
    then
        kill $WATCHDOG_PID 2>/dev/null
    fi
    exit -1
}

# Watchdog: poll the ssh sessions and the switch process every 30 seconds
# and trigger the full clean up as soon as any of them is gone.
# Globals read: SSH_PIDS, SW_PID
watchdog_func ()
{
    while :
    do
        sleep 30
        NDEAD=0
        # "kill -0" only probes whether the process still exists
        for p in ${SSH_PIDS[*]}
        do
            if ! kill -0 $p 2>/dev/null
            then
                ((NDEAD+=1))
            fi
        done
        if ! kill -0 $SW_PID
        then
            ((NDEAD+=1))
        fi
        # everything is still alive: go back to sleep
        ((NDEAD>0)) || continue
        # We may be in the middle of an orderly termination, so allow it
        # some time to complete before reporting the abort.
        sleep 60
        echo -n "(I) (some) gem5 process(es) exited"
        abort_func
    done
}

# Launch one gem5 process in the background. Having a separate function
# for this only serves to allow starting gem5 under gdb control for debugging.
# Arguments: $1  - log file suffix (output goes to $RUN_DIR/log.$1)
#            $2  - host to start the process on (via ssh)
#            $3  - environment assignments put in front of the remote command
#            $4+ - the command to run
# Globals read: GEM5_DEBUG, RUN_DIR
# The caller picks up the PID of the background job from $!.
start_func ()
{
    local N=$1 HOST=$2 ENV_ARGS=$3
    shift 3
    if [ -z "$GEM5_DEBUG" ]
    then
        # normal mode: run remotely, stdout and stderr both go to the log
        ssh $HOST $ENV_ARGS "$@" > $RUN_DIR/log.$N 2>&1 &
        return
    fi
    # debug mode: open a terminal running the command under gdb
    echo "DEBUG starting terminal..."
    MY_ARGS="$*"
    xterm -e "gdb --args $MY_ARGS" &
}

# Block until a freshly started gem5 process reports that it is up.
# Arguments: $1 - log file to watch
#            $2 - text (grep pattern) that signals a successful start
#            $3 - name of the process, used in the progress messages
#            $4 - PID that must stay alive while we are waiting
# Polls every 2 seconds; exits the script if the PID disappears.
connected ()
{
    FILE=$1
    STRING=$2
    echo -n "waiting for $3 to start "
    while true
    do
        if ! kill -0 $4
        then
            echo "Failed to start $3"
            exit -1
        fi
        if [[ -f "$FILE" ]] && grep -q "$STRING" "$FILE"
        then
            echo -e "\nnode #$3 started"
            break
        fi

        sleep 2
        echo -n "."
    done
}

# An external INT/TERM signal must trigger the full clean up as well
trap 'abort_func' INT TERM

# Environment settings handed over explicitly to the gem5 processes
# that are started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

# drop a stale switch log so that the start-up check below only sees
# output from this run
rm $RUN_DIR/log.switch &> /dev/null

# create the checkpoint directory of the switch
mkdir -p $CKPT_DIR/m5out.switch &> /dev/null

# The switch gem5 process runs on the first allocated host.
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
start_func "switch" $SW_HOST "$ENV_ARGS" \
    $GEM5_EXE -d $RUN_DIR/m5out.switch \
    $M5_ARGS \
    $SW_CONFIG \
    $SW_ARGS \
    $CF_ARGS \
    --checkpoint-dir=$CKPT_DIR/m5out.switch \
    --is-switch \
    --dist-size=$NNODES \
    --dist-server-port=$SW_PORT
# must directly follow start_func: $! is the job it put in the background
SW_PID=$!

# do not go on before the switch announces its listen port in the log
connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID
LINE=$(grep -r "tcp_iface listening on port" $RUN_DIR/log.switch)

# The switch may listen on a different port than the requested one (if
# that was busy), so take the port from the log line: it is the sixth
# blank separated word.
IFS=' ' read -ra ADDR <<< "$LINE"
SW_PORT=${ADDR[5]}

# Start the gem5 node processes via ssh: NCORES[i] processes on HOSTS[i].
# n is the rank of the process being started; it runs from 0 upwards
# across all hosts.
echo "START $(date)"
n=0
i=0
while ((i < ${#HOSTS[@]}))
do
    h=${HOSTS[$i]}
    j=0
    while ((j < ${NCORES[i]}))
    do
        # remove the stale log file of this rank
        rm $RUN_DIR/log.$n &> /dev/null
        # create the checkpoint directory of this rank
        mkdir -p $CKPT_DIR/m5out.$n &> /dev/null
        echo "starting gem5 on $h ..."
        start_func $n $h "$ENV_ARGS" \
            $GEM5_EXE -d $RUN_DIR/m5out.$n \
            $M5_ARGS \
            $FS_CONFIG \
            $FS_ARGS \
            $CF_ARGS \
            --checkpoint-dir=$CKPT_DIR/m5out.$n \
            --dist \
            --dist-rank=$n \
            --dist-size=$NNODES \
            --dist-server-name=${HOSTS[0]} \
            --dist-server-port=$SW_PORT
        # $! is the background job start_func has just created
        SSH_PIDS[$n]=$!
        ((n+=1))
        ((j++))
    done
    ((i++))
done

# Wait here if it is a debug session
[ "x$GEM5_DEBUG" == "x" ] || {  echo "DEBUG session"; wait $SW_PID; exit -1; }

# Start the watchdog, wait for every launched process and report the result.
# Globals read:    SSH_PIDS, SW_PID
# Globals written: WATCHDOG_PID, NFAIL (number of processes that returned a
#                  non-zero status)
# Outputs: "EXIT <date>" if all processes succeeded, "ABORT <date>" otherwise
# Returns: 0 if all processes succeeded, 1 otherwise
wait_all_func ()
{
    # start watchdog to trigger complete abort (after a grace period) if any
    # gem5 process dies
    watchdog_func &
    WATCHDOG_PID=$!

    # collect the exit statuses
    NFAIL=0
    for p in ${SSH_PIDS[*]}
    do
        wait $p || ((NFAIL+=1))
    done
    wait $SW_PID || ((NFAIL+=1))

    # all done, let's terminate the watchdog
    kill $WATCHDOG_PID 2>/dev/null

    if ((NFAIL==0))
    then
        echo "EXIT $(date)"
        return 0
    fi
    echo "ABORT $(date)"
    return 1
}

# Propagate a failed run to the caller: previously the script reported
# "ABORT" but still finished with exit status 0.
wait_all_func || exit 1