summaryrefslogtreecommitdiff
path: root/test/benchpdfs.py
blob: 1e037aaaa8129c12087ae4001d02096467ea8ed1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/python
import sys, os, os.path, urllib2, gzip, bz2, traceback

# Written by Krzysztof Kowalczyk (http://blog.kowalczyk.info)
# This code is in public domain.
#
# A regression testing script
#
# Given a list of urls to PDF files, it downloads them and runs
# pdftool draw -m $file-name
# on each file. This allows catching crashes e.g. on linux:
# python test/benchpdfs.py | grep Segmentation
# will produce an output if pdftool crashed on any of the pdfs
#
# Regression PDFs can be put anywhere. They can be gzipp'ed or bzip2'ed
# to save the bandwidth (in which case url must end in .gz or .bz2)
# 
# The script doesn't redownload the file if it has been downloaded before.
#
# Missing files are ignored
#
# By convention, names of PDF files are the sha1 hash of the uncompressed content.
# They have a nice property of being unique for each file.
# To generate the name run sha1sum on an (uncompressed) pdf, rename
# the file to a result of that + .pdf and optionally compress with gzip or bzip2

# URLs of the PDF files to exercise. By convention each file name is the
# sha1 of the uncompressed content; a .gz/.bz2 suffix means the download
# is compressed and gets uncompressed after fetching.
pdfs_to_test = [
    "http://darcs.kowalczyk.info/testpdfs/293bcd6b00e006d66fdc62ea436508f3ebb30219.pdf.gz"
]

# Local cache directory for downloaded test PDFs (files are kept between runs).
local_pdfs_dir = os.path.expanduser("~/testpdfs")

def dir_exists(path):
    """Return True if path exists and is a directory."""
    # os.path.isdir() already returns False for non-existent paths, so the
    # separate os.path.exists() check was redundant (and a needless extra stat).
    return os.path.isdir(path)

def file_exists(path):
    """Return True if path exists and is a regular file."""
    # os.path.isfile() already returns False for non-existent paths, so the
    # separate os.path.exists() check was redundant (and a needless extra stat).
    return os.path.isfile(path)

# make a directory if doesn't exist yet.
def make_dir(path):
    """Create directory path (and any missing parents) if it doesn't exist.

    Uses EAFP rather than check-then-create, so a concurrent creator can't
    race us between the existence test and os.makedirs().
    """
    try:
        os.makedirs(path)
    except OSError:
        # Already exists is fine; anything else (permissions, bad path) is not.
        if not os.path.isdir(path):
            raise

def write_to_file(path, data):
    """Write data (bytes) to the file at path, replacing any existing content."""
    # 'with' guarantees the handle is closed even if write() raises;
    # the original open/write/close leaked the handle on error.
    with open(path, "wb") as fo:
        fo.write(data)

# Does HTTP GET or POST (if data != None). Returns body of the response or
# None if there was an error.
def do_http(url, data=None, dump_exception=False):
    """Fetch url via urllib2 and return the response body, or None on failure.

    A POST is issued when data is not None, otherwise a GET. All network
    errors are swallowed (best-effort); set dump_exception to print a
    traceback for debugging.
    """
    body = None
    try:
        req = urllib2.Request(url, data)
        resp = urllib2.urlopen(req)
        body = resp.read()
    # 'except Exception' instead of the original bare 'except:' so that
    # KeyboardInterrupt/SystemExit still propagate and can abort the run.
    except Exception:
        if dump_exception:
            print("do_http failed %s" % url)
            print('-' * 60)
            traceback.print_exc(file=sys.stdout)
            print('-' * 60)
    return body

# Locate the repository root: walk upward from the current working
# directory until a directory containing "mupdf" is found, or give up
# once the filesystem root is reached.
def find_repo_root():
    """Return the first ancestor of cwd holding a "mupdf" subdir, else None."""
    cur = os.getcwd()
    last = None
    # os.path.dirname() of the root returns the root itself, so cur == last
    # signals we can't climb any higher.
    while cur != last:
        if os.path.isdir(os.path.join(cur, "mupdf")):
            return cur
        last, cur = cur, os.path.dirname(cur)
    return None

def find_pdftool():
    """Locate the built pdftool binary under the repository root.

    Checks the Jam output dirs (obj-rel, obj-dbg) and the Makefile output
    dirs (build/release, build/debug). Exits the process with status 1 if
    the repo root or the binary cannot be found.
    """
    root = find_repo_root()
    if root is None:
        print("Didn't find the root directory")
        print("Current directory: '%s'" % os.getcwd())
        sys.exit(1)
    print(root)
    # check build results for Jam and Makefile
    candidates = [
        os.path.join("obj-rel", "pdftool"),
        os.path.join("obj-dbg", "pdftool"),
        # BUG FIX: was "relase" -- the typo meant a Makefile release build
        # could never be found.
        os.path.join("build", "release", "pdftool"),
        os.path.join("build", "debug", "pdftool"),
    ]
    for f in candidates:
        path = os.path.join(root, f)
        if file_exists(path):
            return path
    print("Didn't find pdftool. Did you build it?")
    print("Root dir: '%s'" % root)
    sys.exit(1)

def is_gzipped(filename):
    """True when filename carries a .gz extension."""
    return filename[-3:] == ".gz"
def is_bzip2ed(filename):
    """True when filename carries a .bz2 extension."""
    return filename[-4:] == ".bz2"

def _uncompress(filepath, suffix, opener):
    # Decompress filepath (which ends with suffix) into the same name minus
    # the suffix, then delete the compressed original.
    finalpath = filepath[:-len(suffix)]
    print("Uncompressing '%s' to '%s'" % (filepath, finalpath))
    fin = opener(filepath)
    try:
        data = fin.read()
    finally:
        fin.close()  # close even if read() fails (e.g. a corrupt archive)
    fout = open(finalpath, "wb")
    try:
        fout.write(data)
    finally:
        fout.close()
    os.remove(filepath)

def uncompress_if_needed(filepath):
    """If filepath is a .gz or .bz2 archive, uncompress it in place.

    The uncompressed data is written next to the original (same name without
    the compression suffix) and the compressed file is removed. Files with
    any other suffix are left untouched. The two compression branches were
    previously duplicated and leaked file handles on error.
    """
    if filepath.endswith(".gz"):
        _uncompress(filepath, ".gz", lambda p: gzip.open(p, "rb"))
    elif filepath.endswith(".bz2"):
        _uncompress(filepath, ".bz2", lambda p: bz2.BZ2File(p, "r"))

def pdfname_from_url(url):
    """Return the final path component of url (the file name)."""
    return url.rpartition("/")[2]

def final_pdfname_from_url(url):
    """Return the file name from url, minus any .gz/.bz2 compression suffix."""
    # Inline of pdfname_from_url: the last path component of the url.
    name = url.split("/")[-1]
    if name.endswith(".gz"):
        return name[:-len(".gz")]
    if name.endswith(".bz2"):
        return name[:-len(".bz2")]
    return name

def main():
    """Download each test PDF (if not cached) and run "pdftool draw -m" on it."""
    print("Starting the test")
    pdftool = find_pdftool()  # make sure to abort early if pdftool doesn't exist
    make_dir(local_pdfs_dir)
    for pdfurl in pdfs_to_test:
        pdfname = pdfname_from_url(pdfurl)
        local_pdf_path = os.path.join(local_pdfs_dir, pdfname)
        final_pdfname = final_pdfname_from_url(pdfurl)
        local_final_pdf_path = os.path.join(local_pdfs_dir, final_pdfname)
        # Download the file if not already downloaded
        if not os.path.exists(local_final_pdf_path):
            print("Downloading pdf file '%s' as '%s'" % (pdfurl, local_pdf_path))
            pdf_file_data = do_http(pdfurl)
            # 'is None', not 'None == x': identity test is the Python idiom.
            if pdf_file_data is None:
                print("Failed to download '%s'" % pdfurl)
                continue  # don't stop the test just because of that
            write_to_file(local_pdf_path, pdf_file_data)
            uncompress_if_needed(local_pdf_path)
        # NOTE: the path is spliced into a shell command; acceptable here
        # because the names are sha1-derived and locally controlled.
        cmd = pdftool + " draw -m " + local_final_pdf_path
        print("Running '%s'" % cmd)
        os.system(cmd)

# Script entry point: run the regression pass only when executed directly.
if __name__ == "__main__":
    main()