html

HTML Sanitizer

#!/usr/bin/php
<?php
/**
 * Script for sanitizing HTML input so that only whitelisted markup is allowed.
 * Tested against the majority of the hacks listed at http://ha.ckers.org/xss.html
 *
 * @author Cameron Zemek <[email protected]>
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */

Google Search Screen Scraper

#!/usr/bin/env python
import urllib2
from urllib import urlencode
from urlparse import urlparse
 
from BeautifulSoup import BeautifulSoup
 
def doGoogleSearch(query, limit=10):
    def _googleSearch(query, start, limit):
        urlParams = {'q' : query}
        if start > 0:
            urlParams['start'] = start
        url = "http://www.google.com.au/search?hl=en&" + urlencode(urlParams)
        request = urllib2.Request(url)
        # Google blocks queries based on User Agent.

HTML Tidy

#!/usr/bin/env python
import sys
import getopt
 
def isHexDigit(c):
    """Return True if `c` is a single ASCII hexadecimal digit (0-9, A-F, a-f).

    The check is deliberately limited to one ASCII character:
    `str.isdigit()` also accepts non-ASCII digits (for example superscript
    two), and string range comparisons such as 'A' <= c <= 'F' accept
    multi-character strings like 'Az'. Neither is valid in a hexadecimal
    number.

    :param c: the character to test; an empty string yields False.
    :return: True for exactly one character in 0-9, A-F or a-f, else False.
    """
    # The explicit length check matters: '' is "in" every string.
    return len(c) == 1 and c in '0123456789ABCDEFabcdef'
 
def isEntityStartCharacter(c):
    """Report whether `c` is alphabetic, an underscore, or a colon.

    Alphabetic is decided by `c.isalpha()`. Judging by the function name,
    these are presumably the characters allowed to begin an entity name.

    :param c: the string (normally one character) to classify.
    :return: True if `c` matches one of the three cases, otherwise False.
    """
    if c.isalpha():
        return True
    return c in ('_', ':')
 
def isEntityCharacter(c):
    """Report whether `c` is accepted by `isEntityStartCharacter`, or is a
    digit, a full stop, or a hyphen.

    Judging by the function name, these are presumably the characters
    allowed after the first character of an entity name.

    :param c: the string (normally one character) to classify.
    :return: True if `c` matches any of the cases above, otherwise False.
    """
    if isEntityStartCharacter(c):
        return True
    return c.isdigit() or c in ('.', '-')
 
class ParseError(Exception):
    """Exception type carrying a single message.

    The constructor requires exactly one argument, `msg`, which is forwarded
    unchanged to `Exception`.
    """

    def __init__(self, msg):
        super(ParseError, self).__init__(msg)
 
class TidyHtml:
    def init(self, html):
        self.input = html
        self.pos = 0
        self.tagStack = []
        self.beginText = False # Indic