#!/usr/bin/python
import sys, os, os.path, urllib2, gzip, bz2, traceback

# Written by Krzysztof Kowalczyk (http://blog.kowalczyk.info)
# This code is in the public domain.
#
# A regression testing script.
#
# Given a list of URLs to PDF files, it downloads them and runs
#   pdftool draw -m $file-name
# on each file. This allows catching crashes, e.g. on linux:
#   python test/benchpdfs.py | grep Segmentation
# will produce output if pdftool crashed on any of the PDFs.
#
# Regression PDFs can be put anywhere. They can be gzip'ed or bzip2'ed
# to save bandwidth (in which case the url must end in .gz or .bz2).
#
# The script doesn't redownload a file if it has been downloaded before.
#
# Missing files are ignored.
#
# By convention, the name of a PDF file is the sha1 hash of its uncompressed
# content, which has the nice property of being unique for each file.
# To generate the name, run sha1sum on the (uncompressed) pdf, rename the
# file to that hash plus .pdf, and optionally compress it with gzip or bzip2
# (an illustrative sketch of this naming step is at the end of this file).

pdfs_to_test = [
    "http://darcs.kowalczyk.info/testpdfs/293bcd6b00e006d66fdc62ea436508f3ebb30219.pdf.gz"
]

local_pdfs_dir = os.path.expanduser("~/testpdfs")

def dir_exists(path):
    if os.path.exists(path):
        return os.path.isdir(path)
    return False

def file_exists(path):
    if os.path.exists(path):
        return os.path.isfile(path)
    return False

# Make a directory if it doesn't exist yet.
def make_dir(path):
    if not dir_exists(path):
        os.makedirs(path)

def write_to_file(path, data):
    fo = open(path, "wb")
    fo.write(data)
    fo.close()

# Does HTTP GET or POST (if data != None). Returns the body of the response
# or None if there was an error.
def do_http(url, data=None, dump_exception=False):
    body = None
    try:
        req = urllib2.Request(url, data)
        resp = urllib2.urlopen(req)
        body = resp.read()
    except:
        if dump_exception:
            print "do_http failed", url
            print '-' * 60
            traceback.print_exc(file=sys.stdout)
            print '-' * 60
    return body

# Tries to find the root of the repository. Starts at the current directory
# and goes up until it finds a "mupdf" directory or can't go up anymore.
def find_repo_root():
    curdir = os.getcwd()
    prevdir = None
    while curdir != prevdir:
        if dir_exists(os.path.join(curdir, "mupdf")):
            return curdir
        prevdir = curdir
        curdir = os.path.dirname(curdir)
    return None

def find_pdftool():
    root = find_repo_root()
    if root is None:
        print "Didn't find the root directory"
        print "Current directory: '%s'" % os.getcwd()
        sys.exit(1)
    print root
    # Check build results for both Jam and Makefile builds.
    for f in [os.path.join("obj-rel", "pdftool"),
              os.path.join("obj-dbg", "pdftool"),
              os.path.join("build", "release", "pdftool"),
              os.path.join("build", "debug", "pdftool")]:
        path = os.path.join(root, f)
        if file_exists(path):
            return path
    print "Didn't find pdftool. Did you build it?"
print "Root dir: '%s'" % root sys.exit(1) def is_gzipped(filename): return filename.endswith(".gz") def is_bzip2ed(filename): return filename.endswith(".bz2") def uncompress_if_needed(filepath): if is_gzipped(filepath): finalpath = filepath[:-len(".gz")] print "Uncompressing '%s' to '%s'" % (filepath, finalpath) fin = gzip.open(filepath, "rb") fout = open(finalpath, "wb") data = fin.read() fout.write(data) fin.close() fout.close() os.remove(filepath) elif is_bzip2ed(filepath): finalpath = filepath[:-len(".bz2")] print "Uncompressing '%s' to '%s'" % (filepath, finalpath) fin = bz2.BZ2File(filepath, "r") fout = open(finalpath, "wb") data = fin.read() fout.write(data) fin.close() fout.close() os.remove(filepath) def pdfname_from_url(url): return url.split("/")[-1] def final_pdfname_from_url(url): potentially_compressed = pdfname_from_url(url) for suffix in [".gz", ".bz2"]: if potentially_compressed.endswith(suffix): return potentially_compressed[:-len(suffix)] return potentially_compressed def main(): print "Starting the test" pdftool = find_pdftool() # make sure to abort early if pdftool doesn't exist #print "pdftool: '%s'" % pdftool make_dir(local_pdfs_dir) for pdfurl in pdfs_to_test: pdfname = pdfname_from_url(pdfurl) local_pdf_path = os.path.join(local_pdfs_dir, pdfname) final_pdfname = final_pdfname_from_url(pdfurl) local_final_pdf_path = os.path.join(local_pdfs_dir, final_pdfname) # Download the file if not already downloaded if not os.path.exists(local_final_pdf_path): print "Downloading pdf file '%s' as '%s'" % (pdfurl, local_pdf_path) pdf_file_data = do_http(pdfurl) if None == pdf_file_data: print "Failed to download '%s'" % pdfurl continue # don't stop the test just because of that write_to_file(local_pdf_path, pdf_file_data) uncompress_if_needed(local_pdf_path) cmd = pdftool + " draw -m " + local_final_pdf_path print "Running '%s'" % cmd os.system(cmd) if __name__ == "__main__": main()