parser

HTML Sanitizer

#!/usr/bin/php
<?php
/**
 * Script for sanitizing HTML input to only allow what is in the whitelist.
 * Tested against the majority of the hacks listed at http://ha.ckers.org/xss.html
 *
 * @author Cameron Zemek <[email protected]>
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */

HTML Tidy

#!/usr/bin/env python
import sys
import getopt
 
def isHexDigit(c):
    """Return True if ``c`` is a single ASCII hexadecimal digit.

    Args:
        c: The character to test (a string of length one).

    Returns:
        True when ``c`` is exactly one of ``0-9``, ``a-f`` or ``A-F``;
        False otherwise, including for the empty string.
    """
    # str.isdigit() is deliberately avoided: on Unicode text it also accepts
    # non-ASCII digits (superscripts, Arabic-Indic digits, ...) which are not
    # valid hexadecimal digits. The explicit length check keeps '' from
    # matching, since the empty string is a substring of every string.
    return len(c) == 1 and c in '0123456789abcdefABCDEF'
 
def isEntityStartCharacter(c):
    """Tell whether ``c`` may appear as the first character of an entity name.

    Returns True when ``c`` is an underscore, a colon, or alphabetic
    according to ``str.isalpha``; False otherwise.
    """
    if c == '_' or c == ':':
        return True
    return c.isalpha()
 
def isEntityCharacter(c):
    """Tell whether ``c`` may appear inside an entity name.

    Everything accepted by ``isEntityStartCharacter`` qualifies, and so do
    digits (per ``str.isdigit``), the full stop and the hyphen.
    """
    if isEntityStartCharacter(c):
        return True
    if c.isdigit():
        return True
    return c in ('.', '-')
 
class ParseError(Exception):
    """Exception type defined by this script; carries a single message."""

    def __init__(self, msg):
        # Hand the message to the base Exception so str(error) yields it.
        super(ParseError, self).__init__(msg)
 
class TidyHtml:
    def init(self, html):
        self.input = html
        self.pos = 0
        self.tagStack = []
        self.beginText = False # Indic