#!/usr/bin/python
import sys, os, os.path, urllib2, gzip, bz2, traceback
# Written by Krzysztof Kowalczyk (http://blog.kowalczyk.info)
# This code is in public domain.
#
# A regression testing script
#
# Given a list of urls to PDF files, it downloads them and runs
#   pdftool draw -m $file-name
# on each file. This allows catching crashes, e.g. on linux:
#   python test/benchpdfs.py | grep Segmentation
# will produce output if pdftool crashed on any of the pdfs.
#
# Regression PDFs can be put anywhere. They can be gzip'ed or bzip2'ed
# to save bandwidth (in which case the url must end in .gz or .bz2).
#
# The script doesn't redownload a file if it has been downloaded before.
#
# Missing files are ignored.
#
# By convention a PDF file is named after the sha1 hash of its uncompressed
# content; such names have the nice property of being unique for each file.
# To generate the name, run sha1sum on the (uncompressed) pdf, rename the
# file to that hash + .pdf and optionally compress it with gzip or bzip2.
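
# A minimal sketch of how such a name could also be computed in Python
# instead of via sha1sum. sha1_pdf_name is an illustrative helper, not
# something the test itself uses; it assumes the pdf is already uncompressed.
import hashlib
def sha1_pdf_name(path):
    fo = open(path, "rb")
    name = hashlib.sha1(fo.read()).hexdigest() + ".pdf"
    fo.close()
    return name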

pdfs_to_test = [
    "http://darcs.kowalczyk.info/testpdfs/293bcd6b00e006d66fdc62ea436508f3ebb30219.pdf.gz"
]

local_pdfs_dir = os.path.expanduser("~/testpdfs")

def dir_exists(path):
    if os.path.exists(path):
        return os.path.isdir(path)
    return False

def file_exists(path):
    if os.path.exists(path):
        return os.path.isfile(path)
    return False

# Make a directory if it doesn't exist yet.
def make_dir(path):
    if not dir_exists(path): os.makedirs(path)

def write_to_file(path, data):
    fo = open(path, "wb")
    fo.write(data)
    fo.close()

# Does HTTP GET or POST (if data != None). Returns the body of the response
# or None if there was an error.
def do_http(url, data=None, dump_exception=False):
    body = None
    try:
        req = urllib2.Request(url, data)
        resp = urllib2.urlopen(req)
        body = resp.read()
    except:
        if dump_exception:
            print "do_http failed", url
            print '-'*60
            traceback.print_exc(file=sys.stdout)
            print '-'*60
    return body
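
# The test only needs anonymous GETs, but if a regression pdf ever sits
# behind basic authentication, a variant like this could be used. This is
# a sketch; do_http_basic_auth, username and pwd are illustrative names
# not used elsewhere in the script.
import base64
def do_http_basic_auth(url, username, pwd, data=None):
    req = urllib2.Request(url, data)
    auth = base64.b64encode("%s:%s" % (username, pwd))
    req.add_header("Authorization", "Basic " + auth)
    return urllib2.urlopen(req).read()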

# Tries to find the root of the repository: starts at the current directory
# and goes up until it either finds a "mupdf" directory or can't go up
# any further.
def find_repo_root():
    curdir = os.getcwd()
    prevdir = None
    while curdir != prevdir:
        if dir_exists(os.path.join(curdir, "mupdf")):
            return curdir
        prevdir = curdir
        curdir = os.path.dirname(curdir)
    return None

def find_pdftool():
    root = find_repo_root()
    if root == None:
        print "Didn't find the root directory"
        print "Current directory: '%s'" % os.getcwd()
        sys.exit(1)
    #print root
    # check build results for both Jam and Makefile builds
    for f in [os.path.join("obj-rel", "pdftool"), os.path.join("obj-dbg", "pdftool"),
              os.path.join("build", "release", "pdftool"), os.path.join("build", "debug", "pdftool")]:
        path = os.path.join(root, f)
        if file_exists(path):
            return path
    print "Didn't find pdftool. Did you build it?"
    print "Root dir: '%s'" % root
    sys.exit(1)

def is_gzipped(filename): return filename.endswith(".gz")
def is_bzip2ed(filename): return filename.endswith(".bz2")

# If the file is gzip'ed or bzip2'ed, uncompress it in place and remove
# the compressed original.
def uncompress_if_needed(filepath):
    if is_gzipped(filepath):
        finalpath = filepath[:-len(".gz")]
        print "Uncompressing '%s' to '%s'" % (filepath, finalpath)
        fin = gzip.open(filepath, "rb")
        fout = open(finalpath, "wb")
        data = fin.read()
        fout.write(data)
        fin.close()
        fout.close()
        os.remove(filepath)
    elif is_bzip2ed(filepath):
        finalpath = filepath[:-len(".bz2")]
        print "Uncompressing '%s' to '%s'" % (filepath, finalpath)
        fin = bz2.BZ2File(filepath, "r")
        fout = open(finalpath, "wb")
        data = fin.read()
        fout.write(data)
        fin.close()
        fout.close()
        os.remove(filepath)

def pdfname_from_url(url): return url.split("/")[-1]

# Name of the pdf after uncompressing, e.g. "<sha1>.pdf.gz" => "<sha1>.pdf"
def final_pdfname_from_url(url):
    potentially_compressed = pdfname_from_url(url)
    for suffix in [".gz", ".bz2"]:
        if potentially_compressed.endswith(suffix):
            return potentially_compressed[:-len(suffix)]
    return potentially_compressed

def main():
    print "Starting the test"
    pdftool = find_pdftool() # make sure to abort early if pdftool doesn't exist
    #print "pdftool: '%s'" % pdftool
    make_dir(local_pdfs_dir)
    for pdfurl in pdfs_to_test:
        pdfname = pdfname_from_url(pdfurl)
        local_pdf_path = os.path.join(local_pdfs_dir, pdfname)
        final_pdfname = final_pdfname_from_url(pdfurl)
        local_final_pdf_path = os.path.join(local_pdfs_dir, final_pdfname)
        # Download the file if not already downloaded
        if not os.path.exists(local_final_pdf_path):
            print "Downloading pdf file '%s' as '%s'" % (pdfurl, local_pdf_path)
            pdf_file_data = do_http(pdfurl)
            if None == pdf_file_data:
                print "Failed to download '%s'" % pdfurl
                continue # don't stop the test just because of that
            write_to_file(local_pdf_path, pdf_file_data)
            uncompress_if_needed(local_pdf_path)
        cmd = pdftool + " draw -m " + local_final_pdf_path
        print "Running '%s'" % cmd
        os.system(cmd)

if __name__ == "__main__":
    main()