Annotation of html5/spec/parse.py, revision 1.1
1.1 ! mike 1: #!/usr/bin/env python
! 2: """usage: %prog [options] filename
! 3:
! 4: Parse a document to a simpletree tree, with optional profiling
! 5: """
! 6: #RELEASE move ./examples/
! 7:
! 8: import sys
! 9: import os
! 10: from optparse import OptionParser
! 11:
! 12: #RELEASE remove
! 13: sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
! 14: #END RELEASE
! 15: from html5lib import html5parser, liberalxmlparser
! 16: from html5lib import treebuilders, serializer, treewalkers
! 17: from html5lib import constants
! 18:
def _openInput(source):
    """Turn the command-line *source* argument into something parseable.

    Returns a ``(stream, encoding)`` pair:

    * ``http://`` URLs are fetched with ``urllib``; the encoding is taken
      from the ``charset`` parameter of the Content-Type header (``None``
      when the header carries no charset).
    * ``-`` selects standard input.
    * Anything else is opened as a local file in binary mode.

    When the URL cannot be fetched or the file cannot be opened, *source*
    itself is returned unchanged as the stream, exactly as the original
    code did.  NOTE(review): presumably this lets literal markup be passed
    on the command line -- confirm that this fallback is intended.
    """
    encoding = "utf-8"

    if source.startswith('http://'):
        stream = source
        try:
            import urllib
            import cgi
            stream = urllib.urlopen(source)
            contentType = stream.headers.get('content-type')
            if contentType:
                (mediaType, params) = cgi.parse_header(contentType)
                encoding = params.get('charset')
        except Exception:
            # Still best-effort, but no longer silent, and no longer
            # swallowing KeyboardInterrupt/SystemExit like a bare except.
            sys.stderr.write("Warning: problem reading %s: %s\n"
                             % (source, sys.exc_info()[1]))
        return stream, encoding

    if source == '-':
        return sys.stdin, encoding

    try:
        # Binary mode: the parser receives the raw bytes together with an
        # explicit encoding, so no newline translation should happen first.
        return open(source, "rb"), encoding
    except IOError:
        return source, encoding


def parse():
    """Command-line entry point: parse one document and report on it.

    Reads the options defined by getOptParser(), resolves the last
    positional argument to an input (URL, ``-`` for stdin, or file name),
    builds an HTML or liberal-XHTML parser with the requested tree builder
    and then either profiles the parse (``--profile``), times it
    (``--time``) or simply runs it.  Except when profiling, the result is
    handed to printOutput().

    Exits with status 1 when no positional argument is given.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    f, encoding = _openInput(args[-1])

    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.xml:
            p = liberalxmlparser.XHTMLParser(tree=treebuilder)
        else:
            p = html5parser.HTMLParser(tree=treebuilder)

        if opts.fragment:
            parseMethod = p.parseFragment
        else:
            parseMethod = p.parse

        if opts.profile:
            import hotshot
            import hotshot.stats
            import tempfile
            # Keep the profiler log in a temporary file rather than
            # leaving a 'stats.prof' behind in the working directory.
            fd, statsPath = tempfile.mkstemp(suffix='.prof')
            os.close(fd)
            try:
                prof = hotshot.Profile(statsPath)
                prof.runcall(parseMethod, f, encoding=encoding)
                prof.close()
                stats = hotshot.stats.load(statsPath)
                stats.strip_dirs()
                stats.sort_stats('time')
                stats.print_stats()
            finally:
                os.remove(statsPath)
        elif opts.time:
            import time
            t0 = time.time()
            document = parseMethod(f, encoding=encoding)
            t1 = time.time()
            printOutput(p, document, opts)
            t2 = time.time()
            sys.stdout.write(
                "\n\nRun took: %fs (plus %fs to print the output)\n"
                % (t1 - t0, t2 - t1))
        else:
            document = parseMethod(f, encoding=encoding)
            printOutput(p, document, opts)
    finally:
        # Release a file/URL handle we opened ourselves; never close
        # stdin, and a plain string fallback has nothing to close.
        if f is not sys.stdin and hasattr(f, 'close'):
            f.close()
def printOutput(parser, document, opts):
    """Write the requested views of a parsed *document* to standard output.

    parser   -- the parser that produced *document*; its tokenizer stream,
                tree builder and error list are consulted.
    document -- the parse result (a tree, or a sequence of fragments).
    opts     -- the option values produced by getOptParser().

    ``opts.encoding`` prints the character encoding first.  Then exactly
    one of the xml / tree / hilite / html renderings is written (checked
    in that order), and finally ``opts.error`` appends the list of parse
    errors.
    """
    write = sys.stdout.write

    if opts.encoding:
        # The 1-tuple keeps '%' formatting correct even if charEncoding
        # is itself a tuple.
        write("Encoding: %s\n" % (parser.tokenizer.stream.charEncoding,))

    if opts.xml:
        write(document.toxml("utf-8"))
    elif opts.tree:
        # A single tree is wrapped so one loop handles both cases.
        if not hasattr(document, '__getitem__'):
            document = [document]
        for fragment in document:
            write(parser.tree.testSerializer(fragment).encode("utf-8") + "\n")
    elif opts.hilite:
        write(document.hilite("utf-8"))
    elif opts.html:
        # Forward only the serializer options that have a matching
        # command-line flag, so an option without a flag is skipped
        # instead of raising AttributeError.
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            if hasattr(opts, opt):
                kwargs[opt] = getattr(opts, opt)
        # An unset quote character must not override the serializer's own.
        if not kwargs.get('quote_char'):
            kwargs.pop('quote_char', None)
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        htmlSerializer = serializer.HTMLSerializer(**kwargs)
        for text in htmlSerializer.serialize(tokens, encoding='utf-8'):
            write(text)
            # NOTE(review): indentation was lost in the listing this was
            # recovered from; the newline check is assumed to apply to
            # every serialized chunk -- confirm.
            if not text.endswith('\n'):
                write('\n')

    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            template = constants.E.get(errorcode,
                                       'Unknown error "%s"' % errorcode)
            errList.append("Line %i Col %i" % pos + " " + template % datavars)
        write("\nParse errors:\n" + "\n".join(errList) + "\n")
def getOptParser():
    """Build the command-line option parser for this script.

    The module docstring is used as the usage text.  Options fall into
    three groups: run control (profile, time, treebuilder, fragment),
    output selection (error, tree, xml, no-html, hilite, encoding) and
    switches whose ``dest`` names are later looked up by printOutput()
    against the HTML serializer's option names.

    Returns the configured optparse.OptionParser.
    """
    parser = OptionParser(usage=__doc__)

    def addSwitch(optStrings, dest, helpText, enabledByDefault=False):
        # Boolean flag: normally off and switched on by the flag; with
        # enabledByDefault the flag switches the value off instead.
        if enabledByDefault:
            action = "store_false"
        else:
            action = "store_true"
        parser.add_option(*optStrings.split(), **{
            "action": action,
            "default": enabledByDefault,
            "dest": dest,
            "help": helpText,
        })

    addSwitch("-p --profile", "profile",
              "Use the hotshot profiler to "
              "produce a detailed log of the run")
    addSwitch("-t --time", "time",
              "Time the run using time.time (may not be accurate on all "
              "platforms, especially for short runs)")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="simpleTree",
                      help="Tree builder to use (default: simpleTree)")

    addSwitch("-e --error", "error", "Print a list of parse errors")
    addSwitch("-f --fragment", "fragment", "Parse as a fragment")
    addSwitch("--tree", "tree", "Output as debug tree")
    addSwitch("-x --xml", "xml", "Output as xml")
    addSwitch("--no-html", "html", "Don't output html",
              enabledByDefault=True)
    addSwitch("--hilite", "hilite",
              "Output as formatted highlighted code.")
    addSwitch("-c --encoding", "encoding", "Print character encoding used")

    # Serializer switches.
    addSwitch("--inject-meta-charset", "inject_meta_charset",
              "inject <meta charset>")
    addSwitch("--strip-whitespace", "strip_whitespace", "strip whitespace")
    addSwitch("--omit-optional-tags", "omit_optional_tags",
              "omit optional tags")
    addSwitch("--quote-attr-values", "quote_attr_values",
              "quote attribute values")
    addSwitch("--use-best-quote-char", "use_best_quote_char",
              "use best quote character")

    parser.add_option("--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")

    # This flag turns minimization OFF; the help text now says so.
    addSwitch("--no-minimize-boolean-attributes",
              "minimize_boolean_attributes",
              "don't minimize boolean attributes",
              enabledByDefault=True)
    addSwitch("--use-trailing-solidus", "use_trailing_solidus",
              "use trailing solidus")
    addSwitch("--space-before-trailing-solidus",
              "space_before_trailing_solidus",
              "add space before trailing solidus")
    addSwitch("--escape-lt-in-attrs", "escape_lt_in_attrs",
              "escape less than signs in attribute values")
    addSwitch("--escape-rcdata", "escape_rcdata",
              "escape rcdata element values")
    addSwitch("--sanitize", "sanitize", "sanitize")

    return parser
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    parse()
Webmaster