HTML Tidy

#!/usr/bin/env python
import sys
import getopt
 
def isHexDigit(c):
    """Return True when *c* is a hexadecimal digit (0-9, A-F or a-f)."""
    if c.isdigit():
        return True
    return 'A' <= c <= 'F' or 'a' <= c <= 'f'
 
def isEntityStartCharacter(c):
    """Return True when *c* may begin an entity name: a letter, '_' or ':'."""
    if c.isalpha():
        return True
    return c in ('_', ':')
 
def isEntityCharacter(c):
    """Return True when *c* may appear inside an entity name.

    That is any entity start character, plus digits, '.' and '-'.
    """
    if isEntityStartCharacter(c):
        return True
    if c.isdigit():
        return True
    return c in ('.', '-')
 
class ParseError(Exception):
    """Raised when the HTML input cannot be parsed."""

    def __init__(self, msg):
        # Forward the message to Exception so str(error) yields it.
        super(ParseError, self).__init__(msg)
 
class TidyHtml:
    """Re-indent HTML and repair some common markup problems.

    The document is scanned once, character by character, and the tidied
    result is written to standard output as it is produced.  Tag and
    attribute names are lower-cased, attribute values are double-quoted,
    bare ampersands become ``&amp;``, missing closing tags are inserted and
    closing tags that match no open element are dropped.

    Call process() with a string or processFile() with a file name.
    """

    # Elements that never take a closing tag, so they are never pushed on
    # the open-element stack.
    voidElements = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])
    spacesPerIndent = 2
    # The largest code point, U+10FFFF (1114111), needs 6 hex or 7 decimal
    # digits; anything longer cannot be a valid numeric reference.
    maxHexReferenceDigits = 6
    maxDecimalReferenceDigits = 7

    def __init__(self, html=''):
        # Start from a valid (empty) state so that helper methods can be
        # called before process() without raising AttributeError.
        self.init(html)

    def init(self, html):
        """Reset all parser state so that *html* is read from its start."""
        self.input = html
        self.pos = 0
        self.tagStack = []      # Names of the currently open elements
        self.beginText = False  # Indicates at beginning of text node
        self.lineNo = 1
        self.columnNo = 1
        self.firstLine = True   # No line has been written yet

    def output(self, str):
        """Write to the output."""
        sys.stdout.write(str)

    def getInputCharacter(self):
        """Get the next character from the input.

        Line and column counters are kept up to date for error messages.
        At end of input a newline is returned and nothing is consumed.
        """
        if self.pos >= len(self.input):
            return '\n'
        char = self.input[self.pos]
        self.pos += 1
        self.columnNo += 1
        if char == '\n':
            self.lineNo += 1
            self.columnNo = 1
        return char

    def previous(self):
        """Return the most recently consumed character.

        Returns '' when nothing has been consumed yet; indexing with -1
        would otherwise wrap around to the last character of the input.
        """
        if self.pos == 0:
            return ''
        return self.input[self.pos - 1]

    def look(self):
        """Look ahead in the input.

        Returns the next character without consuming it, or False at end
        of input.
        """
        if self.pos >= len(self.input):
            return False
        return self.input[self.pos]

    def lookMatches(self, match):
        """Return True when the unread input starts with *match*.

        Always False at end of input.
        """
        if not self.look():
            return False
        return self.input.startswith(match, self.pos)

    def _getReferenceDigits(self, html, isDigit, maxDigits):
        """Append up to *maxDigits* digits of a numeric reference to *html*.

        *isDigit* decides which characters count as digits.  Returns True
        when at least one digit was read but no ';' follows, i.e. when the
        terminating semicolon is missing and has to be supplied.
        """
        count = 0
        while count < maxDigits and self.look() and isDigit(self.look()):
            html.append(self.getInputCharacter())
            count += 1
        return count > 0 and self.look() != ';'

    def getEntity(self):
        """Consume a character reference starting at '&' and return it.

        A numeric reference whose ';' is missing gets one appended.  Text
        that does not end in ';' is not treated as a reference: its
        ampersand is escaped as ``&amp;`` and the consumed characters are
        returned after it.
        """
        html = [self.getCharacter('&')]
        if self.lookMatches("#x"):
            html.append(self.getCharacter('#'))
            html.append(self.getCharacter('x'))
            if self._getReferenceDigits(html, isHexDigit,
                                        self.maxHexReferenceDigits):
                html.append(';')
                return ''.join(html)
        elif self.look() == '#':
            html.append(self.getCharacter('#'))
            if self._getReferenceDigits(html, lambda c: c.isdigit(),
                                        self.maxDecimalReferenceDigits):
                html.append(';')
                return ''.join(html)
        elif self.look() and isEntityStartCharacter(self.look()):
            # look() is False at end of input, so it must be tested before
            # it is handed to the character-class helper.
            while self.look() and isEntityCharacter(self.look()):
                html.append(self.getInputCharacter())

        if self.look() == ';':
            html.append(self.getCharacter(';'))
            return ''.join(html)
        return "&amp;" + ''.join(html[1:])

    def _getQuotedString(self):
        """Consume a quoted string verbatim, both quote characters included.

        The input must be positioned on the opening quote.  A backslash
        escapes the character after it, so an escaped quote does not end
        the string while a quote following an escaped backslash does.
        Raises ParseError when the closing quote is missing.
        """
        quoteChar = self.look()
        chars = [self.getCharacter(quoteChar)]
        escaped = False
        while self.look():
            char = self.look()
            if char == quoteChar and not escaped:
                break
            escaped = (char == '\\') and not escaped
            chars.append(self.getInputCharacter())
        chars.append(self.getCharacter(quoteChar))
        return ''.join(chars)

    def getCodeBlock(self):
        """Return code block.

        Consumes a ``<? ... ?>`` block unchanged.  A ``?>`` inside a quoted
        string does not end the block.  Raises ParseError when the block or
        one of its strings is not terminated.
        """
        code = [self.getCharacter('<'), self.getCharacter('?')]
        while self.look() and not (self.previous() == '?' and self.look() == '>'):
            if self.look() in ('"', "'"):
                code.append(self._getQuotedString())
            else:
                code.append(self.getInputCharacter())
        code.append(self.getCharacter('>'))
        return ''.join(code)

    def getScriptBlock(self):
        """Return script block.

        Consumes everything up to, but not including, the next ``</`` that
        is outside a quoted string.  The text is returned unchanged.
        """
        code = []
        while self.look() and not self.lookMatches("</"):
            if self.look() in ('"', "'"):
                code.append(self._getQuotedString())
            else:
                code.append(self.getInputCharacter())
        return ''.join(code)

    def codeBlock(self):
        """Copy a ``<? ... ?>`` block to the output."""
        self.output(self.getCodeBlock())

    def scriptBlock(self):
        """Copy the body of a script element to the output."""
        self.output(self.getScriptBlock())

    def getCharacter(self, match=None):
        """Consume and return the next unit of input.

        With *match*, the next character must equal it and is returned
        as-is; otherwise ParseError is raised.  Without *match*, a whole
        code block or character reference is returned as one unit and a
        double quote is returned as ``&quot;``.
        """
        if match:
            if self.look() != match:
                self.notfound(match)
            return self.getInputCharacter()
        if self.lookMatches('<?'):
            return self.getCodeBlock()
        if self.look() == '&':
            return self.getEntity()
        char = self.getInputCharacter()
        if char == '"':
            return '&quot;'
        return char

    def _location(self):
        """Describe the current input position for error messages."""
        return "on line " + str(self.lineNo) + " at column " + str(self.columnNo)

    def notfound(self, char):
        """Raise ParseError reporting that *char* was expected here."""
        raise ParseError("Error " + self._location() + ": expecting " + char)

    def ignore(self, char):
        """Ignore a character.

        The next character must be *char*; it is consumed without being
        written.  Raises ParseError otherwise.
        """
        if self.look() != char:
            self.notfound(char)
        self.getInputCharacter()

    def matchAny(self):
        """Match any character."""
        self.output(self.getCharacter())

    def match(self, char):
        """Match a character.

        The next character must be *char*; it is consumed and written
        unchanged.  Raises ParseError otherwise.
        """
        if self.look() != char:
            self.notfound(char)
        self.output(self.getInputCharacter())

    def skipWhitespace(self):
        """skip whitespace characters"""
        while self.look() and self.look().isspace():
            self.getInputCharacter()

    def indent(self):
        """Write the indentation for the current nesting depth."""
        self.output(' ' * (len(self.tagStack) * self.spacesPerIndent))

    def newline(self):
        """Start a new, indented output line.

        The very first call writes no line break, so the output does not
        begin with an empty line.
        """
        if self.firstLine:
            self.firstLine = False
        else:
            self.output('\n')
        self.indent()

    def attributeValue(self):
        """Consume an attribute value and return it in double quotes.

        A quoted value runs to its matching quote (ParseError when that is
        missing); an unquoted value runs to whitespace or '>'.
        """
        value = []
        if self.look() in ('"', "'"):
            quoteChar = self.look()
            self.ignore(quoteChar)
            while self.look() and self.look() != quoteChar:
                value.append(self.getCharacter())
            self.ignore(quoteChar)
        else:
            while self.look() and not self.look().isspace() and self.look() != '>':
                value.append(self.getCharacter())
        return '"' + ''.join(value) + '"'

    def attribute(self):
        """Consume one attribute and return it with a leading space.

        The name is lower-cased unless it is a code block.  A stray '='
        with no name in front of it is dropped together with its value and
        '' is returned; consuming it guarantees that the caller's loop
        makes progress.
        """
        nameChars = []
        while (self.look() and not self.look().isspace()
               and self.look() not in ('=', '>')):
            nameChars.append(self.getCharacter())
        attributeName = ''.join(nameChars)
        if not attributeName:
            if self.look() == '=':
                self.getInputCharacter()
                self.skipWhitespace()
                self.attributeValue()  # A value without a name is discarded
            return ''
        if attributeName[:2] != "<?":  # Don't convert code blocks to lowercase
            attributeName = attributeName.lower()
        html = [" ", attributeName]
        self.skipWhitespace()
        if self.look() == '=':
            html.append(self.getCharacter('='))
            self.skipWhitespace()
            html.append(self.attributeValue())
        return ''.join(html)

    def tag(self):
        """Consume one tag and write it on a line of its own.

        An opening tag is pushed on the open-element stack unless it is a
        void element.  A closing tag first closes every element opened
        after its match; a closing tag with no open match is dropped.  The
        body of a script element is copied through unchanged.  Raises
        ParseError when the tag has no name or no closing '>'.
        """
        html = [self.getCharacter('<')]
        self.skipWhitespace()
        nameChars = []
        while self.look() and not self.look().isspace() and self.look() != '>':
            if self.look() == '/' and nameChars:
                # "<br/>": the slash is not part of the name.  It is left
                # for the attribute loop, which emits it as " /".
                break
            nameChars.append(self.getCharacter())
        tagName = ''.join(nameChars).lower()
        if not tagName:
            raise ParseError('Expecting tag name ' + self._location())
        html.append(tagName)
        self.skipWhitespace()
        while self.look() and self.look() != '>':
            html.append(self.attribute())
            self.skipWhitespace()
        html.append(self.getCharacter('>'))

        isCloseTag = tagName[0] == '/'
        if isCloseTag:
            tagName = tagName[1:]
            if tagName not in self.tagStack:
                return  # Ignore closing tag
            while self.tagStack:
                popTag = self.tagStack.pop()
                if popTag == tagName:
                    break
                # Insert missing closing tag
                self.newline()
                self.output("</" + popTag + ">")
            self.newline()
        else:
            # The line is started before pushing so that the tag itself is
            # indented at its parent's depth.
            self.newline()
            if tagName not in self.voidElements:
                self.tagStack.append(tagName)
        self.output(''.join(html))
        if tagName == "script" and not isCloseTag:
            self.scriptBlock()

    def _matchRaw(self):
        """Copy one character, or one whole code block, to the output as-is."""
        if self.lookMatches('<?'):
            self.codeBlock()
        else:
            self.output(self.getInputCharacter())

    def _comment(self):
        """Copy a ``-- ... --`` comment body, delimiters included."""
        self.match('-')
        self.match('-')
        while self.look() and not self.lookMatches('--'):
            self._matchRaw()
        self.match('-')
        self.match('-')

    def commentBlock(self):
        """Copy a ``<! ... >`` declaration or comment to the output.

        The content is written verbatim: quotes and ampersands inside a
        comment or DOCTYPE are not escaped.  Raises ParseError when the
        block is not terminated.
        """
        self.match('<')
        self.match('!')
        while self.look() and self.look() != '>':
            if self.lookMatches('--'):
                self._comment()
            else:
                self._matchRaw()
        self.match('>')

    def process(self, html):
        """Tidy the document *html* and write the result to the output.

        Runs of whitespace in text collapse to a single space, and text
        that follows a tag starts on a new indented line.  Elements still
        open at end of input are closed.  Raises ParseError on markup that
        cannot be parsed.
        """
        self.init(html)
        while self.look():
            if self.lookMatches('<?'):
                self.codeBlock()
            elif self.lookMatches('<!'):
                self.newline()
                self.commentBlock()
                self.beginText = True
            elif self.look() == '<':
                self.tag()
                self.beginText = True
            elif self.look().isspace():
                self.skipWhitespace()
            else:
                if self.beginText:
                    self.newline()
                    self.beginText = False
                elif self.previous().isspace():
                    # Whitespace was skipped just before this character
                    self.output(' ')
                self.matchAny()
        # Close any remaining tags
        while self.tagStack:
            tagName = self.tagStack.pop()
            self.newline()
            self.output("</" + tagName + ">")
        self.output('\n')

    def processFile(self, fileName):
        """Read the file *fileName* and tidy its contents."""
        # The with-statement closes the file even when reading fails.
        with open(fileName, 'r') as f:
            html = f.read()
        self.process(html)
 
def usage():
    """Print the command-line synopsis to standard output."""
    # A parenthesised single argument prints the same text on Python 2 and
    # Python 3; the bare print statement is a syntax error on Python 3.
    print("Usage: tidy.py [options] [file...]")
 
def version():
    """Print the program name, version and copyright to standard output."""
    # The two literals are concatenated into one argument, so this prints
    # the same text on Python 2 and Python 3.
    print("Clean HTML v0.1\n"
          "Copyright (c) 2008 by Cameron Zemek")
 
def main(argv):
    """Run the command-line tool.

    *argv* is the argument list without the program name.  ``-h``/``--help``
    and ``-v``/``--version`` print their text and exit with status 0.  Every
    remaining argument is treated as a file name and tidied in order.  The
    process exits with status 1 when no arguments are given, when an option
    is not recognised, or when a file cannot be parsed.
    """
    if not argv:
        usage()
        sys.exit(1)
    try:
        opts, args = getopt.getopt(argv, "hv", ["help", "version"])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    for opt, optarg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
    tidy = TidyHtml()
    try:
        for arg in args:
            tidy.processFile(arg)
    except ParseError as e:
        # "except ... as" and the parenthesised print are valid on both
        # Python 2.6+ and Python 3.  The leading newline ends whatever
        # partial output line was being written when parsing failed.
        print('\n' + str(e))
        sys.exit(1)
 
# Script entry point: pass the command-line arguments, without the program
# name, to main().
if __name__ == "__main__":
    main(sys.argv[1:])