"""HTML batch validator Usage: python validate.py config.txt adapted from Perl version at http://atelier89.de/users/dirk/checkhtml.html You need httplib_multipart.py and if you want to validate files like .shtml or .php you need the specially modified version which uses a default content-type mimetype of "text/html" (UPLOAD should be 0 in this case). specify the following values in config: VALIDATORURL = w3 FILESERVERROOT = 'e:\\files' absolute path to your local fileroot, Unix: use /, Win use \\ separator! LOCALSERVERURL = 'http://example.org' GET from this URL if UPLOAD=0 or UPLOADFROMURL=1 VALIDATEPATH = 'home' validate all files starting from this path SKIPPATHS = ['WEB-INF'] skip all files and subdirectories in these paths UPLOAD = 1 POST (upload from FILESERVERROOT) files (=1) or GET (from LOCALSERVERURL) results (=0) UPLOADFROMURL = 1 If UPLOAD=1 (POST) HTML to validate will be fetched from LOCALSERVERURL, else from FILESERVERROOT\VALIDATEPATH. Filenames to be fetched will always be the filenames in FILESERVERROOT\VALIDATEPATH. EXTS = ['html', 'htm'] validate files with these extensions REPORTDIR = '__validator' reports are saved in this directory OPENREPORTS = 0 If =1 automatically open report pages in default HTML viewer (normally a webbrowser). """ __version__ = '1.7' __author__ = 'Christof Hoeke 090620' import fnmatch import httplib # httplib.HTTPConnection.debuglevel = 1 import os import sys import time import urllib import urlparse import webbrowser import httplib_multipart # validator URLs w3 = 'validator.w3.org:80' # use VALIDATORURL = w3 PATH = '/check' PROTOCOL = 'http' UPLOAD = 1 UPLOADFROMURL = 1 FILESERVERROOT = 'e:\\files' LOCALSERVERURL = 'http://example.org' VALIDATEPATH = 'home' SKIPPATHS = ['include', 'WEB-INF'] EXTS = ['html', 'htm'] REPORTDIR = '__validator' OPENREPORTS = 0 HTML = { 'valid': '[Valid]', 'invalid': '[Invalid]' } def saveresult(relpath, result): """ save error result to fn + ERR.html """ fpath, fname = os.path.split(relpath) if os.path.isabs(fpath): fpath = fpath[1:] resultpath = os.path.join(REPORTDIR, fpath) if not os.path.isdir(resultpath): try: os.makedirs(resultpath) except OSError, e: print e rfname = '%s.ERR.html' % (os.path.join(resultpath, fname)) open(rfname, 'w').write(result) print rfname if OPENREPORTS: webbrowser.open(rfname) def main(): """ build file list, validate and print result """ print 'Using config:' print '\tValidator URL:\n\t\t%s://%s%s' % (PROTOCOL, VALIDATORURL, PATH) print '\tLocal files:\n\t\t%s' % FILESERVERROOT print '\tValidate files in:\n\t\t"%s"' % VALIDATEPATH print '\tSkip directories:\n\t\t%s' % SKIPPATHS if not UPLOAD: print '\tURL to validate files:\n\t\t%s' % LOCALSERVERURL else: if UPLOADFROMURL: print '\tUploaded files are fetched from \n\t\t%s' % LOCALSERVERURL print '\tError Reports are saved to\n\t\t%s' % REPORTDIR print '-' * 40 start = time.time() # find files to validate files = [] for dir, dirs, fs in os.walk( os.path.join(FILESERVERROOT, VALIDATEPATH) ): skip = False for sp in SKIPPATHS: if dir.startswith(os.path.join(FILESERVERROOT, sp)): skip = True break if not skip: for ext in EXTS: names = fnmatch.filter(fs, '*.%s' % ext) names.sort() dirfile = [(dir, n) for n in names if not n.endswith('.ERR.html')] files.extend(dirfile) ok = 0 num = 0 errors = 0 unknown = 0 # validate for fpath, fname in files: abspath = os.path.join(fpath, fname) relpath = abspath.replace(FILESERVERROOT, '') relurl = urllib.pathname2url(relpath) # via UPLOAD if UPLOAD: if UPLOADFROMURL: content = urllib.urlopen(urlparse.urljoin(LOCALSERVERURL, relurl)).read() else: content = open(abspath, 'r').read() fields = [('uri', relurl)] files = [('uploaded_file', relurl, content)] errcode, errmsg, result = httplib_multipart.post_multipart( VALIDATORURL, PATH, fields, files) # via URL REQUEST else: url = urlparse.urlunsplit( (PROTOCOL, VALIDATORURL, PATH, 'uri=%s' % urlparse.urljoin(LOCALSERVERURL, relurl), None) ) result = urllib.urlopen(url).read() num += 1 # print result if HTML['valid'] in result: ok += 1 print '%s\n\t%s' % (relurl, HTML['valid']) elif HTML['invalid'] in result: errors += 1 print '%s\n\t%s see' % (relurl, HTML['invalid']), saveresult(relpath, result) else: unknown += 1 print '%s\n\tUNKNOWN ERROR' % relurl, if UPLOAD and errcode <> '200': print '[', errcode, errmsg, ']', print ', see', saveresult(relpath, result) print print '-' * 40 print 'Finished %s files in %s sec.' % (num, time.time() - start) print '%d invalid, %d valid, %d unknown results' % (errors, ok, unknown) if (errors > 0 or unknown > 0) and OPENREPORTS != 1: print '\nNotes:' print '* open reports automatically if OPENREPORTS = 1.' if not UPLOAD: print '* The Validator may not be able to request local files, you may need to use option UPLOAD = 1.\n' if __name__ == '__main__': if len(sys.argv) == 1: print 'Usage: python validate.py config.txt\n' else: config = open(sys.argv[1]).read() exec(config) main()