#!/usr/bin/php <?php /** * Script for sanitizing HTML input to only allow what is in the whitelist. * Tested against the majority of the hacks listed at http://ha.ckers.org/xss.html * * @author Cameron Zemek <[email protected]> * @license http://opensource.org/licenses/mit-license.php MIT License */
html
HTML Sanitizer
Google Search Screen Scraper
#!/usr/bin/env python import urllib2 from urllib import urlencode from urlparse import urlparse from BeautifulSoup import BeautifulSoup def doGoogleSearch(query, limit=10): def _googleSearch(query, start, limit): urlParams = {'q' : query} if start > 0: urlParams['start'] = start url = "http://www.google.com.au/search?hl=en&" + urlencode(urlParams) request = urllib2.Request(url) # Google blocks queries based on User Agent.
HTML Tidy
#!/usr/bin/env python
import sys
import getopt


def isHexDigit(c):
    """Return True if `c` is an ASCII hexadecimal digit (0-9, A-F, a-f).

    Explicit range comparisons are used instead of `c.isdigit()` because
    `isdigit()` also accepts non-ASCII digits (for example superscript two
    or Arabic-Indic digits), which are not hexadecimal digits. An empty
    string yields False.
    """
    return ('0' <= c <= '9') or ('A' <= c <= 'F') or ('a' <= c <= 'f')


def isEntityStartCharacter(c):
    """Return True if `c` is alphabetic, an underscore, or a colon."""
    return c.isalpha() or c == '_' or c == ':'


def isEntityCharacter(c):
    """Return True if `c` is an entity start character, a digit, '.' or '-'."""
    return isEntityStartCharacter(c) or c.isdigit() or c == '.' or c == '-'


class ParseError(Exception):
    """Exception carrying a message string.

    Presumably raised when the input cannot be parsed; the raising code is
    not visible in this excerpt.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)


class TidyHtml:
    # NOTE(review): this method is named `init`, not `__init__`, so Python will
    # not call it automatically on construction -- confirm this is intended.
    def init(self, html):
        self.input = html
        self.pos = 0
        self.tagStack = []
        self.beginText = False # Indic