summaryrefslogtreecommitdiff
path: root/util/qdo
blob: 8008a40c4748e029e891e0a70691c95835f3cd48 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#! /usr/bin/env python2.7

# Copyright (c) 2004-2005, 2007 The Regents of The University of Michigan
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Steve Reinhardt
#          Ali Saidi

# Important!
# This script expects a simple "$ " prompt.  If you are using a shell other
# than sh (which defaults to this prompt), you'll need to add something like
# the following to your bashrc/bash_profile script:
#if [ "$OAR_USER" = "xxxx" ]; then
#   PS1='$ '
#fi


import sys
import os
import re
import time
import optparse

import pexpect

# Name this script was invoked under; used as a prefix in diagnostics.
progname = os.path.basename(sys.argv[0])

# Command-line interface.  Everything after the first positional
# argument belongs to the command being run, so option parsing must
# stop there instead of swallowing the command's own flags.
usage = "%prog [options] command [command arguments]"
optparser = optparse.OptionParser(usage=usage)
optparser.disable_interspersed_args()

# Redirection of the command's output streams.
optparser.add_option(
    '-e', dest='stderr_file',
    help='command stderr output file')
optparser.add_option(
    '-o', dest='stdout_file',
    help='command stdout output file')

# Session logging and oarsub job parameters.
optparser.add_option(
    '-l', dest='save_log', action='store_true',
    help='save oarsub output log file')
optparser.add_option(
    '-N', dest='job_name',
    help='oarsub job name')
optparser.add_option(
    '-q', dest='dest_queue',
    help='oarsub destination queue')

# Timeouts, in seconds: 30 minutes waiting in the queue, 10 hours of
# command execution by default.
optparser.add_option(
    '--qwait', dest='oarsub_timeout', type='int', default=30 * 60,
    help='oarsub queue wait timeout')
optparser.add_option(
    '-t', dest='cmd_timeout', type='int', default=600 * 60,
    help='command execution timeout')

options, cmd = optparser.parse_args()

# There is nothing to do without a command to run.
if not cmd:
    print >>sys.stderr, "%s: missing command" % progname
    sys.exit(1)

# If we want to do this, need to add check here to make sure cmd[0] is
# a valid PBS job name, else oarsub will die on us.
#
#if not options.job_name:
#    options.job_name = cmd[0]

cwd = os.getcwd()

# Deal with systems where /n is a symlink to /.automount: rewrite the
# leading automount prefix back to the /n/ form.
if cwd.startswith('/.automount/'):
    cwd = '/n/' + cwd[len('/.automount/'):]

# The job is started in this directory, so it has to live on the pool
# file system.
if not cwd.startswith('/n/poolfs/'):
    print >>sys.stderr, "Error: current directory must be under /n/poolfs."
    sys.exit(1)

# The Shell class wraps pexpect.spawn with some handy functions that
# assume the thing on the other end is a Bourne/bash shell.
class Shell(pexpect.spawn):
    # Regexp to match the shell prompt.  We change the prompt to
    # something fixed and distinctive to make it easier to match
    # reliably.
    prompt_re = re.compile('qdo\$ ')

    def __init__(self, cmd):
        # initialize base pexpect.spawn object
        try:
            pexpect.spawn.__init__(self, cmd)
        except pexpect.ExceptionPexpect, exc:
            print "%s:" % progname, exc
            sys.exit(1)
        # full_output accumulates the full output of the session
        self.full_output = ""
        self.quick_timeout = 15
        # wait for a prompt, then change it
        try:
            self.expect('\$ ', options.oarsub_timeout)
        except pexpect.TIMEOUT:
            print >>sys.stderr, "%s: oarsub timed out." % progname
            self.kill(9)
            self.safe_close()
            sys.exit(1)
        self.do_command('unset PROMPT_COMMAND; PS1="qdo$ "')

    # version of expect that updates full_output too
    def expect(self, regexp, timeout = -1):
        pexpect.spawn.expect(self, regexp, timeout)
        self.full_output += self.before + self.after

    # Just issue a command and wait for the next prompt.
    # Returns a string containing the output of the command.
    def do_bare_command(self, cmd, timeout = -1):
        global full_output
        self.sendline(cmd)
        # read back the echo of the command
        self.readline()
        # wait for the next prompt
        self.expect(self.prompt_re, timeout)
        output = self.before.rstrip()
        return output

    # Issue a command, then query its exit status.
    # Returns a (string, int) tuple with the command output and the status.
    def do_command(self, cmd, timeout = -1):
        # do the command itself
        output = self.do_bare_command(cmd, timeout)
        # collect status
        status = int(self.do_bare_command("echo $?", self.quick_timeout))
        return (output, status)

    # Check to see if the given directory exists.
    def dir_exists(self, dirname):
        (output, status) = shell.do_command('[ -d %s ]' % dirname,
                                            self.quick_timeout)
        return status == 0

    # Don't actually try to close it.. just wait until it closes by itself
    # We can't actually kill the pid which is what it's trying to do, and if
    # we call wait we could be in an unfortunate situation of it printing input
    # right as we call wait, so the input is never read and the process never ends
    def safe_close(self):
        count = 0
        while self.isalive() and count < 10:
            time.sleep(1)
        self.close(force=False)

# Spawn the interactive pool job.

# Hack to do link on poolfs... disabled for now since
# compiler/linker/library versioning problems between poolfs and
# nodes.  May never work since poolfs is x86-64 and nodes are 32-bit.
if False and len(cmd) > 50:
    shell_cmd = 'ssh -t poolfs /bin/sh -l'
    print "%s: running %s on poolfs" % (progname, cmd[0])
else:
    shell_cmd = 'oarsub -I'
    if options.job_name:
        shell_cmd += ' -n "%s"' % options.job_name
    if options.dest_queue:
        shell_cmd += ' -q ' + options.dest_queue
    shell_cmd += ' -d %s' % cwd

shell = Shell(shell_cmd)

try:
    # chdir to cwd
    (output, status) = shell.do_command('cd ' + cwd)

    if status != 0:
        raise OSError, "Can't chdir to %s" % cwd

    # wacky hack: sometimes scons will create an output directory then
    # fork a job to generate files in that directory, and the job will
    # get run before the directory creation propagates through NFS.
    # This hack looks for a '-o' option indicating an output file and
    # waits for the corresponding directory to appear if necessary.
    try:
        if 'cc' in cmd[0] or 'g++' in cmd[0]:
            output_dir = os.path.dirname(cmd[cmd.index('-o')+1])
        elif 'm5' in cmd[0]:
            output_dir = cmd[cmd.index('-d')+1]
        else:
            output_dir = None
    except (ValueError, IndexError):
        # no big deal if there's no '-o'/'-d' or if it's the final argument
        output_dir = None

    if output_dir:
        secs_waited = 0
        while not shell.dir_exists(output_dir) and secs_waited < 90:
            time.sleep(5)
            secs_waited += 5
        if secs_waited > 30:
            print "waited", secs_waited, "seconds for", output_dir

    # run command
    if options.stdout_file:
        cmd += ['>', options.stdout_file]
    if options.stderr_file:
        cmd += ['2>', options.stderr_file]
    try:
        (output, status) = shell.do_command(' '.join(cmd), options.cmd_timeout)
    except pexpect.TIMEOUT:
            print >>sys.stderr, "%s: command timed out after %d seconds." \
                  % (progname, options.cmd_timeout)
            shell.sendline('~.') # oarsub/ssh termination escape sequence
            shell.safe_close()
            status = 3
    if output:
        print output
finally:
    # end job
    if shell.isalive():
        shell.sendline('exit')
        shell.expect('Disconnected from OAR job .*')
        shell.safe_close()

    # if there was an error, log the output even if not requested
    if status != 0 or options.save_log:
        log = file('qdo-log.' + str(os.getpid()), 'w')
        log.write(shell.full_output)
        log.close()
del shell

sys.exit(status)