summaryrefslogtreecommitdiff
path: root/util/multi/gem5-multi.sh
blob: 4b4937c90a63bfdb431c435a821047bfe3b34b6f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#! /bin/bash

#
# Copyright (c) 2015 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder.  You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Gabor Dozsa


# This is a wrapper script to run multi gem5 simulations.
# See the usage_func() below for hints on how to use it. Also,
# there are some examples in the util/multi directory (e.g.
# see util/multi/test-2nodes-AArch64.sh)
#
#
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
# environment variable (which is what LSF does by default).
# E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots
# allocated to launch the gem5 processes, 2 of them are on host hname1
# and 4 of them are on host hname2.
# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
# processes on the localhost.
#
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
# boot params. The total number of gem5 processes is also passed in.
# These values can be used in the boot script to configure the MAC/IP
# addresses - among other things (see util/multi/bootscript.rcS).
#
# Each gem5 process will create an m5out.$GEM5_RANK directory for
# the usual output files. Furthermore, there will be a separate log file
# for each ssh session (we use ssh to start gem5 processes) and one for
# the server. These are called log.$GEM5_RANK and log.server.
#


# Print a short summary of the command line syntax to stdout
usage_func ()
{
    # $0 is expanded inside the here-document, everything else is literal
    cat <<EOF
Usage:$0 [-debug] [-n nnodes] [-s server] [-p port] gem5_exe gem5_args
     -debug   : debug mode (start gem5 in gdb)
     nnodes   : number of gem5 processes
     server   : message server executable
     port     : message server listen port
     gem5_exe : gem5 executable (full path required)
     gem5_args: usual gem5 arguments ( m5 options, config script options)
Note: if no LSF slots allocation is found all proceses are launched on the localhost.
EOF
}


# Process (optional) command line options
#
# parse_opts_func consumes the leading wrapper options from its argument
# list and records them in the globals NNODES, TCP_SERVER, SERVER_PORT and
# GEM5_DEBUG. Parsing stops at the first word that is not a known option.
# Arguments: the command line words of the script ("$@")
# Globals:   NOPT_WORDS is set to the number of words consumed so that the
#            caller can shift them away
# Returns:   0 on success, 1 if an option is missing its value
parse_opts_func ()
{
    NOPT_WORDS=0
    while (($# > 0))
    do
        case "$1" in
            -n|-nodes|-s|-server|-p|-port)
                # 'shift 2' fails without shifting anything if the value is
                # missing, which would make the loop spin on the same
                # option forever - so check for the value explicitly
                if (($# < 2))
                then
                    echo "(E) Option $1 requires a value" >&2
                    return 1
                fi
                case "$1" in
                    -n|-nodes)  NNODES=$2 ;;
                    -s|-server) TCP_SERVER=$2 ;;
                    -p|-port)   SERVER_PORT=$2 ;;
                esac
                shift 2
                ((NOPT_WORDS+=2))
                ;;
            -debug)
                GEM5_DEBUG="-debug"
                shift 1
                ((NOPT_WORDS+=1))
                ;;
            *)
                break
                ;;
        esac
    done
    return 0
}

parse_opts_func "$@" || exit 1
# drop the option words so that only the gem5 command is left
shift $NOPT_WORDS

# Whatever is left on the command line has to be the plain gem5 command:
# the executable followed by at least one argument
if (($# < 2))
then
    usage_func
    exit -1
fi
GEM5_EXE=$1
shift
# all gem5 arguments are kept in one space separated string
GEM5_ARGS="$*"

# Default values to use (in case they are not defined as command line options).
# The default message server binary is located relative to this script, so
# $0 is quoted to keep a script path containing whitespace in one piece.
DEFAULT_TCP_SERVER="$(dirname "$0")/../../util/multi/tcp_server"
DEFAULT_SERVER_PORT=2200

# ${VAR:-default} falls back to the default if VAR is unset or empty
TCP_SERVER=${TCP_SERVER:-$DEFAULT_TCP_SERVER}
SERVER_PORT=${SERVER_PORT:-$DEFAULT_SERVER_PORT}
NNODES=${NNODES:-2}


# Make sure that both the message server and the gem5 binary are executable
if [ ! -x "$TCP_SERVER" ]
then
    echo "Executable ${TCP_SERVER} not found"
    exit 1
fi
if [ ! -x "$GEM5_EXE" ]
then
    echo "Executable ${GEM5_EXE} not found"
    exit 1
fi


declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# find_slots_func fills HOSTS (host names) and NCORES (number of slots of
# the host with the same index) and sets NH to the total number of slots.
# Globals: reads NNODES and LSB_MCPU_HOSTS; writes HOSTS, NCORES, NH and
#          - only if it is empty - LSB_MCPU_HOSTS
# Exits with an error message if the slot list is malformed or if the total
# number of slots differs from NNODES.
find_slots_func ()
{
    local host="" hc
    HOSTS=()
    NCORES=()
    NH=0
    [ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="localhost ${NNODES:-0}"
    # the list alternates between host names and slot counts, so it is
    # split into words on purpose
    for hc in $LSB_MCPU_HOSTS
    do
        if [ "x$host" == "x" ]
        then
            host=$hc
            HOSTS+=("$hc")
        else
            # only plain decimal numbers may reach the arithmetic below
            [[ "$hc" =~ ^[0-9]+$ ]] || { echo "(E) Invalid slot count '$hc' for host $host"; exit -1; }
            NCORES+=("$hc")
            # 10# prevents counts with leading zeros being read as octal
            ((NH+=10#$hc))
            host=""
        fi
    done
    # a pending host name means that the list had an odd number of words
    [ "x$host" == "x" ] || { echo "(E) No slot count given for host $host"; exit -1; }
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}

find_slots_func
#echo "hosts: ${HOSTS[@]}"
#echo "hosts: ${NCORES[@]}"
#echo ${#HOSTS[@]}


# Clean up and abort if something goes wrong.
# Kills every process that is named like the gem5 binary on the local host
# and (via ssh) on every host listed in HOSTS, then stops the message server
# and the watchdog. Always terminates the calling shell with 'exit -1'.
# Globals: reads GEM5_EXE, HOSTS, SERVER_PID and WATCHDOG_PID
abort_func ()
{
    local bname h
    echo
    echo "KILLED $(date)"
    # (try to) kill all gem5 processes on all hosts
    # (quoted so that a path with whitespace still yields the right name)
    bname=$(basename "$GEM5_EXE")
    killall -q "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q "$bname"
    done
    # leave the gem5 processes some time to go away
    sleep 3
    # kill the message server and the watchdog
    if [ -n "$SERVER_PID" ]
    then
        kill "$SERVER_PID" 2>/dev/null
    fi
    if [ -n "$WATCHDOG_PID" ]
    then
        kill "$WATCHDOG_PID" 2>/dev/null
    fi
    exit -1
}


# Watchdog: every 30 seconds check that all ssh sessions and the message
# server are still alive and trigger the full clean up if any of them is gone
watchdog_func ()
{
    for ((;;))
    do
        sleep 30
        NDEAD=0
        for p in ${SSH_PIDS[*]}
        do
            # 'kill -0' only probes whether the process exists
            if ! kill -0 $p 2>/dev/null
            then
                ((NDEAD+=1))
            fi
        done
        if ! kill -0 $SERVER_PID
        then
            ((NDEAD+=1))
        fi
        # everything is still running, go back to sleep
        ((NDEAD>0)) || continue
        # an orderly termination may be in progress, so leave it a minute
        # to complete before the run is reported as aborted
        sleep 60
        echo -n "(I) (some) gem5 process(es) exited"
        abort_func
    done
}

# Launch a single gem5 process.
# Arguments: $1   - rank of the process (selects the log.<rank> file)
#            $2   - host to start the process on
#            $3   - environment assignments to put in front of the command
#            $4.. - the gem5 command line
# In debug mode (GEM5_DEBUG is set) the command is run under gdb control in
# the foreground; otherwise it is started via ssh in the background with all
# of its output collected in log.<rank>.
start_func ()
{
    local rank=$1 host=$2 env_args=$3
    shift 3
    if [ -n "$GEM5_DEBUG" ]
    then
        gdb --args "$@"
        return
    fi
    # env_args is expanded unquoted on purpose: every VAR=value pair has to
    # become a separate word
    ssh $host $env_args "$@" &>log.$rank &
}


# Run the full clean up if an external signal asks us to terminate
trap 'abort_func' INT TERM

# Environment settings that have to be handed over explicitly to the gem5
# processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

# Start the message server in the background and give it two seconds
# before checking that it is still alive
$TCP_SERVER $GEM5_DEBUG $NNODES $SERVER_PORT &>log.server &
SERVER_PID=$!
sleep 2
if ! kill -0 $SERVER_PID
then
    echo "Failed to start message server"
    exit -1
fi

# Start one gem5 process for every allocated slot. Ranks are handed out
# consecutively in host order and the PID left in $! after each launch is
# recorded in SSH_PIDS under the rank of the process.
echo "START $(date)"
n=0
i=0
while ((i < ${#HOSTS[@]}))
do
    h=${HOSTS[$i]}
    j=0
    while ((j < ${NCORES[i]}))
    do
        echo "starting gem5 on $h ..."
        # GEM5_ARGS is expanded unquoted so that it splits into words again
        start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $(pwd)/m5out.$n $GEM5_ARGS \
            --multi \
            --multi-rank=$n \
            --multi-server-name=${HOSTS[0]} \
            --multi-server-port=$SERVER_PORT \
            --testsys-toplevel-LinuxArmSystem.boot_osflags="\"GEM5_RANK=$n GEM5_SIZE=$NNODES\""
        SSH_PIDS[$n]=$!
        ((n+=1))
        ((j++))
    done
    ((i++))
done

# In debug mode there is nothing left to supervise at this point: stop the
# message server and leave
if [ -n "$GEM5_DEBUG" ]
then
    kill $SERVER_PID
    echo "DEBUG exit"
    exit -1
fi

# Start the watchdog in the background; it triggers a complete abort
# (after a grace period) if any gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# Wait for all ssh sessions and for the message server to finish and count
# how many of them ended with a non-zero exit status
NFAIL=0
for p in "${SSH_PIDS[@]}"
do
    wait "$p" || ((NFAIL+=1))
done
wait $SERVER_PID || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill $WATCHDOG_PID 2>/dev/null

# Report the outcome. A failed run must be visible in the exit status of
# this script as well, otherwise callers cannot tell it from a clean run.
if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    exit 1
fi