script

HTML Sanitizer

#!/usr/bin/php
<?php
/**
 * Script for sanitizing HTML input to only allow what is in the whitelist.
 * Tested against the majority of the hacks listed at http://ha.ckers.org/xss.html
 *
 * @author Cameron Zemek <[email protected]>
 * @license http://opensource.org/licenses/mit-license.php MIT License
 */

Fix whitespace

bash
script

#!/bin/sh
# Normalize whitespace in every source file below the current directory:
#   1. make sure the file ends with a newline,
#   2. strip trailing spaces/tabs from every line,
#   3. expand each tab into 4 spaces.
# Files are selected by extension; any path containing ".svn" is skipped.
# Requires ed and a sed that supports -i / -r (e.g. GNU sed).
EXTENSIONS="sh|txt|php|py|rb|css|js|html|xsl|xml|sql"

# - "find ." names the start directory explicitly (omitting it is a GNU
#   extension) and makes every path start with "./", so a file name can
#   never be mistaken for an option by ed or sed.
# - The "\." in front of the extension group anchors the match to a real
#   file extension; without it names such as "flash" or "happy" would match
#   "sh" / "py" and be rewritten.
# - Reading paths line by line keeps file names containing spaces or glob
#   characters intact (names containing newlines are still unsupported).
find . -type f | grep -v "\.svn" | grep -E "\.($EXTENSIONS)$" |
while IFS= read -r file
do
    # Send the commands H and w to ed
    # ed will append newline if the file does not end in one
    printf "%s\n" H w | ed -s "$file"

    # Strip trailing whitespace
    sed -i 's/[ \t]*$//g' "$file"

    # Convert tabs to 4 spaces
    sed -i -r "s/\t/    /g" "$file"
done

Google Search Screen Scraper

#!/usr/bin/env python
import urllib2
from urllib import urlencode
from urlparse import urlparse
 
from BeautifulSoup import BeautifulSoup
 
def doGoogleSearch(query, limit=10):
    def _googleSearch(query, start, limit):
        urlParams = {'q' : query}
        if start > 0:
            urlParams['start'] = start
        url = "http://www.google.com.au/search?hl=en&" + urlencode(urlParams)
        request = urllib2.Request(url)
        # Google blocks queries based on User Agent.

HTML Tidy

#!/usr/bin/env python
import sys
import getopt
 
def isHexDigit(c):
    """Return True if ``c`` is an ASCII hexadecimal digit (0-9, A-F, a-f).

    The decimal digits are tested with an explicit ``'0'..'9'`` range
    instead of ``str.isdigit()``: ``isdigit()`` also accepts non-ASCII
    digit characters (for example superscript two or Arabic-Indic digits),
    none of which are valid hexadecimal digits.

    An empty string yields False.
    """
    return ('0' <= c <= '9') or ('A' <= c <= 'F') or ('a' <= c <= 'f')
 
def isEntityStartCharacter(c):
    """Return True if ``c`` is alphabetic, an underscore, or a colon."""
    if c.isalpha():
        return True
    return c in ('_', ':')
 
def isEntityCharacter(c):
    """Return True if ``c`` satisfies isEntityStartCharacter, or is a digit,
    a period, or a hyphen."""
    if isEntityStartCharacter(c):
        return True
    if c.isdigit():
        return True
    return c in ('.', '-')
 
class ParseError(Exception):
    """Exception type carrying a single message string."""

    def __init__(self, msg):
        # Hand the message to the base Exception so str(err) and err.args
        # expose it.
        super(ParseError, self).__init__(msg)
 
class TidyHtml:
    def init(self, html):
        self.input = html
        self.pos = 0
        self.tagStack = []
        self.beginText = False # Indic

Cameron Zemek

HTML Sanitizer

Fix whitespace

Google Search Screen Scraper

HTML Tidy

Navigation