"""HTML batch validator
Usage: python validate.py config.txt
adapted from Perl version at http://atelier89.de/users/dirk/checkhtml.html
You need httplib_multipart.py and if you want to validate files like
.shtml or .php you need the specially modified version which uses a
default content-type mimetype of "text/html" (UPLOAD should be 0 in this
case).
specify the following values in config:
VALIDATORURL = w3
FILESERVERROOT = 'e:\\files'
absolute path to your local fileroot, Unix: use /, Win use \\ separator!
LOCALSERVERURL = 'http://example.org'
GET from this URL if UPLOAD=0 or UPLOADFROMURL=1
VALIDATEPATH = 'home'
validate all files starting from this path
SKIPPATHS = ['WEB-INF']
skip all files and subdirectories in these paths
UPLOAD = 1
POST (upload from FILESERVERROOT) files (=1) or GET (from LOCALSERVERURL) results (=0)
UPLOADFROMURL = 1
Only used if UPLOAD=1 (POST): if UPLOADFROMURL=1 the HTML to validate is
fetched from LOCALSERVERURL, else it is read from the local files below
FILESERVERROOT. The names of the files to validate are always taken from
the VALIDATEPATH directory below FILESERVERROOT.
EXTS = ['html', 'htm']
validate files with these extensions
REPORTDIR = '__validator'
reports are saved in this directory
OPENREPORTS = 0
If =1 automatically open report pages in default HTML viewer (normally a webbrowser).
"""
# Module metadata.
__version__ = '1.7'
# NOTE(review): the trailing digits look like a YYMMDD date stamp -- confirm.
__author__ = 'Christof Hoeke 090620'
import fnmatch
import httplib
# httplib.HTTPConnection.debuglevel = 1
import os
import sys
import time
import urllib
import urlparse
import webbrowser
import httplib_multipart
# Default configuration.
#
# The config file named on the command line is exec'd at the bottom of this
# module, so any assignment it makes replaces the matching default below.

# known validator hosts ("host:port"); a config may refer to them by name
w3 = 'validator.w3.org:80'

# validator endpoint: PROTOCOL://VALIDATORURL + PATH
PROTOCOL = 'http'
VALIDATORURL = w3
PATH = '/check'

# what to validate
FILESERVERROOT = 'e:\\files'           # absolute path of the local file root
LOCALSERVERURL = 'http://example.org'  # URL serving the same files
VALIDATEPATH = 'home'                  # start directory below FILESERVERROOT
SKIPPATHS = ['include', 'WEB-INF']     # directories below FILESERVERROOT to skip
EXTS = ['html', 'htm']                 # file extensions to validate

# how to hand the files to the validator
UPLOAD = 1         # 1: POST the content, 0: let the validator GET the URL
UPLOADFROMURL = 1  # with UPLOAD=1: 1 fetches content from LOCALSERVERURL,
                   # 0 reads the local file

# reporting
REPORTDIR = '__validator'  # reports are saved below this directory
OPENREPORTS = 0            # 1: open each saved report via webbrowser

# marker strings looked for in the validator's response to classify it
HTML = dict(valid='[Valid]', invalid='[Invalid]')
def saveresult(relpath, result):
    """
    save a validator report below REPORTDIR as <relpath>.ERR.html

    relpath
        path of the validated file relative to the file root; a leading
        drive and leading path separators are dropped so the report always
        ends up below REPORTDIR
    result
        report text (the validator's response) to write

    The name of the written report is printed and, if OPENREPORTS is set,
    the report is opened with ``webbrowser.open``. If the report directory
    cannot be created the error is printed and nothing is written.
    """
    fpath, fname = os.path.split(relpath)
    # os.path.join() throws REPORTDIR away if the second part is rooted or
    # carries a drive, so reduce fpath to a purely relative path first
    separators = os.sep + (os.altsep or '')
    fpath = os.path.splitdrive(fpath)[1].lstrip(separators)
    resultpath = os.path.join(REPORTDIR, fpath)
    if not os.path.isdir(resultpath):
        try:
            os.makedirs(resultpath)
        except OSError as e:
            # the directory may have been created in the meantime; only a
            # still missing directory is a real failure
            if not os.path.isdir(resultpath):
                print(e)
                return
    rfname = '%s.ERR.html' % (os.path.join(resultpath, fname))
    # close the report before it is announced or handed to the browser
    with open(rfname, 'w') as report:
        report.write(result)
    print(rfname)
    if OPENREPORTS:
        webbrowser.open(rfname)
def main():
"""
build file list, validate and print result
"""
print 'Using config:'
print '\tValidator URL:\n\t\t%s://%s%s' % (PROTOCOL, VALIDATORURL, PATH)
print '\tLocal files:\n\t\t%s' % FILESERVERROOT
print '\tValidate files in:\n\t\t"%s"' % VALIDATEPATH
print '\tSkip directories:\n\t\t%s' % SKIPPATHS
if not UPLOAD:
print '\tURL to validate files:\n\t\t%s' % LOCALSERVERURL
else:
if UPLOADFROMURL:
print '\tUploaded files are fetched from \n\t\t%s' % LOCALSERVERURL
print '\tError Reports are saved to\n\t\t%s' % REPORTDIR
print '-' * 40
start = time.time()
# find files to validate
files = []
for dir, dirs, fs in os.walk(
os.path.join(FILESERVERROOT, VALIDATEPATH)
):
skip = False
for sp in SKIPPATHS:
if dir.startswith(os.path.join(FILESERVERROOT, sp)):
skip = True
break
if not skip:
for ext in EXTS:
names = fnmatch.filter(fs, '*.%s' % ext)
names.sort()
dirfile = [(dir, n) for n in names if not n.endswith('.ERR.html')]
files.extend(dirfile)
ok = 0
num = 0
errors = 0
unknown = 0
# validate
for fpath, fname in files:
abspath = os.path.join(fpath, fname)
relpath = abspath.replace(FILESERVERROOT, '')
relurl = urllib.pathname2url(relpath)
# via UPLOAD
if UPLOAD:
if UPLOADFROMURL:
content = urllib.urlopen(urlparse.urljoin(LOCALSERVERURL, relurl)).read()
else:
content = open(abspath, 'r').read()
fields = [('uri', relurl)]
files = [('uploaded_file', relurl, content)]
errcode, errmsg, result = httplib_multipart.post_multipart(
VALIDATORURL, PATH, fields, files)
# via URL REQUEST
else:
url = urlparse.urlunsplit(
(PROTOCOL, VALIDATORURL, PATH, 'uri=%s' % urlparse.urljoin(LOCALSERVERURL, relurl), None)
)
result = urllib.urlopen(url).read()
num += 1
# print result
if HTML['valid'] in result:
ok += 1
print '%s\n\t%s' % (relurl, HTML['valid'])
elif HTML['invalid'] in result:
errors += 1
print '%s\n\t%s see' % (relurl, HTML['invalid']),
saveresult(relpath, result)
else:
unknown += 1
print '%s\n\tUNKNOWN ERROR' % relurl,
if UPLOAD and errcode <> '200':
print '[', errcode, errmsg, ']',
print ', see',
saveresult(relpath, result)
print
print '-' * 40
print 'Finished %s files in %s sec.' % (num, time.time() - start)
print '%d invalid, %d valid, %d unknown results' % (errors, ok, unknown)
if (errors > 0 or unknown > 0) and OPENREPORTS != 1:
print '\nNotes:'
print '* open reports automatically if OPENREPORTS = 1.'
if not UPLOAD:
print '* The Validator may not be able to request local files, you may need to use option UPLOAD = 1.\n'
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('Usage: python validate.py config.txt\n')
    else:
        # The config file is plain Python. It is exec'd in this module's
        # namespace so that its assignments replace the default settings
        # which main() and saveresult() read.
        # NOTE: exec runs arbitrary code -- only use config files you trust.
        with open(sys.argv[1]) as configfile:
            config = configfile.read()
        exec(config)
        main()