#!/usr/bin/php <?php /** * Script for sanitizing HTML input to only allow what is in the whitelist. * Tested against the majority of the hacks listed at http://ha.ckers.org/xss.html * * @author Cameron Zemek <[email protected]> * @license http://opensource.org/licenses/mit-license.php MIT License */
script
HTML Sanitizer
Fix whitespace
#!/bin/sh
# Normalize whitespace in every matching source file below the current
# directory:
#   1. make sure the file ends with a newline,
#   2. strip trailing spaces and tabs,
#   3. expand each tab to four spaces.
# Files are selected by extension (see EXTENSIONS); any path containing
# ".svn" is skipped. Requires ed and GNU sed (-i, -r, and the \t escape).

EXTENSIONS="sh|txt|php|py|rb|css|js|html|xsl|xml|sql"

# The pattern is anchored on a literal dot so that only real extensions
# match (otherwise a file named "trash" would match "sh").
# Names are read line by line and quoted below so that paths containing
# spaces are handled as a single file instead of being word-split.
find . -type f | grep -v "\.svn" | grep -E "\.($EXTENSIONS)$" |
while IFS= read -r file
do
    # Send the commands H and w to ed;
    # ed will append a newline if the file does not end in one.
    printf "%s\n" H w | ed -s "$file"

    # Strip trailing whitespace
    sed -i 's/[ \t]*$//g' "$file"

    # Convert tabs to 4 spaces
    sed -i -r "s/\t/    /g" "$file"
done
Google Search Screen Scraper
#!/usr/bin/env python import urllib2 from urllib import urlencode from urlparse import urlparse from BeautifulSoup import BeautifulSoup def doGoogleSearch(query, limit=10): def _googleSearch(query, start, limit): urlParams = {'q' : query} if start > 0: urlParams['start'] = start url = "http://www.google.com.au/search?hl=en&" + urlencode(urlParams) request = urllib2.Request(url) # Google blocks queries based on User Agent.
HTML Tidy
#!/usr/bin/env python import sys import getopt def isHexDigit(c): return c.isdigit() or (c >= 'A' and c <= 'F') or (c >= 'a' and c <= 'f') def isEntityStartCharacter(c): return c.isalpha() or c == '_' or c == ':' def isEntityCharacter(c): return isEntityStartCharacter(c) or c.isdigit() or c == '.' or c == '-' class ParseError(Exception): def __init__(self, msg): Exception.__init__(self, msg) class TidyHtml: def init(self, html): self.input = html self.pos = 0 self.tagStack = [] self.beginText = False # Indic