#!/usr/bin/env python
"""Re-indent HTML read from files and write the result to standard output.

The parser is a small hand-written scanner: it walks the input one character
at a time, keeps a stack of open tags to compute the indentation, inserts
closing tags that are missing, lower-cases tag and attribute names and
always writes attribute values in double quotes.
"""

import sys
import getopt


def isHexDigit(c):
    """Return True if the character `c` is a hexadecimal digit."""
    return c.isdigit() or 'A' <= c <= 'F' or 'a' <= c <= 'f'


def isEntityStartCharacter(c):
    """Return True if `c` may start a named character reference."""
    return c.isalpha() or c == '_' or c == ':'


def isEntityCharacter(c):
    """Return True if `c` may appear inside a named character reference."""
    return isEntityStartCharacter(c) or c.isdigit() or c == '.' or c == '-'


class ParseError(Exception):
    """Raised when the input cannot be parsed (message includes location)."""

    def __init__(self, msg):
        Exception.__init__(self, msg)


class TidyHtml:
    """Streaming HTML re-indenter; call `process` or `processFile`."""

    # Elements that never have content or a closing tag, so they must not be
    # pushed on the tag stack (list taken from the HTML Standard).
    VOID_ELEMENTS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])
    SPACES_PER_INDENT = 2
    # U+10FFFF is the largest code point: 6 hex digits / 7 decimal digits.
    MAX_HEX_DIGITS = 6
    MAX_DECIMAL_DIGITS = 7

    def init(self, html):
        """Reset the parser state so that `html` is scanned from the start."""
        self.input = html
        self.pos = 0
        self.tagStack = []
        self.beginText = False  # Indicates at beginning of text node
        self.lineNo = 1
        self.columnNo = 1
        self.firstLine = True

    def output(self, str):
        """Write to the output."""
        # The parameter name shadows the builtin but is kept for callers.
        sys.stdout.write(str)

    def getInputCharacter(self):
        """Get the next character from the input.

        At end of input a newline is returned and the position is left
        unchanged.
        """
        if self.pos >= len(self.input):
            return '\n'
        char = self.input[self.pos]
        self.pos += 1
        self.columnNo += 1
        if char == '\n':
            self.lineNo += 1
            self.columnNo = 1
        return char

    def previous(self):
        """Return the last consumed character, or '' if none was consumed."""
        if self.pos == 0:
            # Without this guard the index -1 would wrap around to the last
            # character of the input.
            return ''
        return self.input[self.pos - 1]

    def look(self):
        """Look ahead in the input; returns False at end of input."""
        if self.pos >= len(self.input):
            return False
        return self.input[self.pos]

    def lookMatches(self, match):
        """Return True if the upcoming input starts with `match`."""
        if not self.look():
            return False
        return self.input[self.pos:self.pos + len(match)] == match

    def _getDigits(self, isDigit, limit):
        """Consume up to `limit` characters accepted by `isDigit`."""
        digits = []
        while len(digits) < limit and self.look() and isDigit(self.look()):
            digits.append(self.getInputCharacter())
        return digits

    def getEntity(self):
        """Consume a character reference starting at '&' and return its text.

        For numeric references a terminating ';' is added when digits were
        read and the input does not supply one; an existing ';' is left in
        the input for the caller to consume.  A '&' that does not start a
        reference is returned unchanged.
        """
        html = [self.getCharacter('&')]
        if self.lookMatches("#x"):
            html.append(self.getCharacter('#'))
            html.append(self.getCharacter('x'))
            digits = self._getDigits(isHexDigit, self.MAX_HEX_DIGITS)
            html.extend(digits)
            if digits and self.look() != ';':
                html.append(';')
        elif self.look() == '#':
            html.append(self.getCharacter('#'))
            digits = self._getDigits(lambda c: c.isdigit(),
                                     self.MAX_DECIMAL_DIGITS)
            html.extend(digits)
            if digits and self.look() != ';':
                html.append(';')
        elif self.look() and isEntityStartCharacter(self.look()):
            # look() is False at end of input, hence the truthiness guard.
            while (self.look() and self.look() != ';'
                   and isEntityCharacter(self.look())):
                html.append(self.getInputCharacter())
            if self.look() == ';':
                html.append(self.getCharacter(';'))
        return ''.join(html)

    def _getQuotedString(self):
        """Consume a quoted string (quotes included) honouring backslashes.

        Raises ParseError if the closing quote is missing.
        """
        quoteChar = self.look()
        text = [self.getCharacter(quoteChar)]
        while self.look() and not (self.previous() != '\\'
                                   and self.look() == quoteChar):
            text.append(self.getInputCharacter())
        text.append(self.getCharacter(quoteChar))
        return ''.join(text)

    def getCodeBlock(self):
        """Return code block ('<?' ... '?>') verbatim."""
        code = [self.getCharacter('<'), self.getCharacter('?')]
        while self.look() and not (self.previous() == '?'
                                   and self.look() == '>'):
            if self.look() == '"' or self.look() == "'":
                # A '?>' inside a string literal must not end the block.
                code.append(self._getQuotedString())
            else:
                code.append(self.getInputCharacter())
        code.append(self.getCharacter('>'))
        return ''.join(code)

    def getScriptBlock(self):
        """Return script block: everything up to the next '</'."""
        code = []
        while self.look() and not self.lookMatches("</"):
            if self.look() == '"' or self.look() == "'":
                # A '</' inside a string literal must not end the script.
                code.append(self._getQuotedString())
            else:
                code.append(self.getInputCharacter())
        return ''.join(code)

    def codeBlock(self):
        """Copy a code block to the output."""
        self.output(self.getCodeBlock())

    def scriptBlock(self):
        """Copy the body of a script element to the output."""
        self.output(self.getScriptBlock())

    def getCharacter(self, match=None):
        """Consume and return the next logical character.

        With `match`, the next character must equal it, otherwise ParseError
        is raised.  Without `match`, a whole code block or character
        reference is returned as one unit.
        """
        if match:
            if self.look() == match:
                return self.getInputCharacter()
            self.notfound(match)
        elif self.lookMatches('<?'):
            return self.getCodeBlock()
        elif self.look() == '&':
            return self.getEntity()
        else:
            return self.getInputCharacter()

    def _location(self):
        """Describe the current input position for error messages."""
        return "on line " + str(self.lineNo) + " at column " + str(self.columnNo)

    def notfound(self, char):
        """Raise ParseError saying that `char` was expected here."""
        raise ParseError("Error " + self._location() + ": expecting " + char)

    def ignore(self, char):
        """Ignore a character."""
        if self.look() == char:
            self.getInputCharacter()
        else:
            self.notfound(char)

    def matchAny(self):
        """Match any character."""
        self.output(self.getCharacter())

    def match(self, char):
        """Match a character."""
        if self.look() == char:
            self.output(self.getInputCharacter())
        else:
            self.notfound(char)

    def skipWhitespace(self):
        """skip whitespace characters"""
        while self.look() and self.look().isspace():
            self.getInputCharacter()

    def indent(self):
        """Write the indentation for the current nesting depth."""
        self.output(' ' * (len(self.tagStack) * self.SPACES_PER_INDENT))

    def newline(self):
        """Start a new indented output line (no line break before the first)."""
        if not self.firstLine:
            self.output('\n')
        else:
            self.firstLine = False
        self.indent()

    def attributeValue(self):
        """Consume an attribute value and return it wrapped in double quotes.

        The value may be double-quoted, single-quoted or unquoted in the
        input.  Raises ParseError if a closing quote is missing.
        """
        value = []

        def takeCharacter():
            if self.look() == '"':
                # The output is always double-quoted, so a literal double
                # quote from a single-quoted/unquoted value must be escaped.
                self.getInputCharacter()
                value.append('&quot;')
            else:
                value.append(self.getCharacter())

        if self.look() == '"' or self.look() == "'":
            quoteChar = self.look()
            self.ignore(quoteChar)
            while self.look() and self.look() != quoteChar:
                takeCharacter()
            self.ignore(quoteChar)
        else:
            while (self.look() and not self.look().isspace()
                   and self.look() != '>'):
                takeCharacter()
        return '"' + ''.join(value) + '"'

    def attribute(self):
        """Consume one attribute and return it with a leading space.

        Returns '' (consuming nothing) when no attribute name is present.
        """
        nameChars = []
        while (self.look() and not self.look().isspace()
               and self.look() != '=' and self.look() != '>'):
            nameChars.append(self.getCharacter())
        attributeName = ''.join(nameChars)
        if attributeName[:2] != "<?":
            # Don't convert code blocks to lowercase
            attributeName = attributeName.lower()
        if not attributeName:
            return ''
        html = [" " + attributeName]
        self.skipWhitespace()
        if self.look() == '=':
            html.append(self.getCharacter('='))
            self.skipWhitespace()
            html.append(self.attributeValue())
        return ''.join(html)

    def tag(self):
        """Consume an opening or closing tag and write it on its own line.

        Opening tags of non-void elements are pushed on the tag stack.  A
        closing tag first closes every element opened after its match; a
        closing tag with no open match is dropped.  Raises ParseError on a
        missing tag name, a malformed attribute or a missing '>'.
        """
        html = [self.getCharacter('<')]
        self.skipWhitespace()
        nameChars = []
        while (self.look() and not self.look().isspace()
               and self.look() != '>'):
            nameChars.append(self.getCharacter())
        tagName = ''.join(nameChars).lower()
        if not tagName:
            raise ParseError('Expecting tag name ' + self._location())
        html.append(tagName)
        self.skipWhitespace()
        while self.look() and self.look() != '>':
            start = self.pos
            html.append(self.attribute())
            if self.pos == start:
                # attribute() consumed nothing (e.g. a stray '='); without
                # this check the loop would never terminate.
                raise ParseError("Error " + self._location() +
                                 ": expecting attribute name")
            self.skipWhitespace()
        html.append(self.getCharacter('>'))

        isCloseTag = tagName[0] == '/'
        if isCloseTag:
            tagName = tagName[1:]
            if tagName not in self.tagStack:
                return  # Ignore closing tag
            while self.tagStack:
                popTag = self.tagStack.pop()
                if popTag == tagName:
                    break
                # Insert missing closing tag
                self.newline()
                self.output("</" + popTag + ">")
            self.newline()
        else:
            self.newline()
            # "<br/>" is scanned as the name "br/"; drop the slash so the
            # element is recognised and tracked under its real name.
            elementName = tagName.rstrip('/')
            if elementName not in self.VOID_ELEMENTS:
                self.tagStack.append(elementName)
        self.output(''.join(html))
        if tagName == "script" and not isCloseTag:
            self.scriptBlock()

    def commentBlock(self):
        """Copy a '<!' ... '>' declaration, including '--' comments."""
        def comment():
            self.match('-')
            self.match('-')
            while self.look() and not self.lookMatches('--'):
                self.matchAny()
            self.match('-')
            self.match('-')

        self.match('<')
        self.match('!')
        while self.look() and self.look() != '>':
            if self.lookMatches('--'):
                comment()
            else:
                self.matchAny()
        self.match('>')

    def process(self, html):
        """Tidy the string `html` and write the result to the output.

        Raises ParseError when the input is malformed.
        """
        self.init(html)
        while self.look():
            if self.lookMatches('<?'):
                self.codeBlock()
            elif self.lookMatches('<!'):
                self.newline()
                self.commentBlock()
                self.beginText = True
            elif self.look() == '<':
                self.tag()
                self.beginText = True
            elif self.look().isspace():
                self.skipWhitespace()
            else:
                if self.beginText:
                    self.newline()
                    self.beginText = False
                elif self.previous().isspace():
                    # Collapse the skipped run of whitespace to one space.
                    self.output(' ')
                self.matchAny()
        # Close any remaining tags
        while self.tagStack:
            tagName = self.tagStack.pop()
            self.newline()
            self.output("</" + tagName + ">")
        self.output('\n')

    def processFile(self, fileName):
        """Read the file `fileName` and tidy its contents."""
        with open(fileName, 'r') as f:
            html = f.read()
        self.process(html)


def usage():
    """Print the command line synopsis."""
    print("Usage: tidy.py [options] [file...]")


def version():
    """Print the program name, version and copyright."""
    print("Clean HTML v0.1\n"
          "Copyright (c) 2008 by Cameron Zemek")


def main(argv):
    """Run the command line tool with the arguments `argv` (no program name).

    Exits with status 1 on bad usage or when a file cannot be parsed.
    """
    if len(argv) == 0:
        usage()
        sys.exit(1)
    try:
        opts, args = getopt.getopt(argv, "hv", ["help", "version"])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    for opt, optarg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
    tidy = TidyHtml()
    try:
        for arg in args:
            tidy.processFile(arg)
    except ParseError as e:
        print('\n' + str(e))
        sys.exit(1)


if __name__ == '__main__':
    main(sys.argv[1:])