#!/usr/bin/env python
import urllib2
from urllib import urlencode
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup


def doGoogleSearch(query, limit=10):
    def _googleSearch(query, start, limit):
        urlParams = {'q': query}
        if start > 0:
            urlParams['start'] = start
        url = "http://www.google.com.au/search?hl=en&" + urlencode(urlParams)
        request = urllib2.Request(url)
        # Google blocks queries based on the User-Agent header,
        # so pretend we are IE 7.
        request.add_header('User-agent',
                           'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT)')
        opener = urllib2.build_opener()
        htmlResults = opener.open(request).read()
        soup = BeautifulSoup(htmlResults)

        # Collect result links (Google marks them with class="l"),
        # stopping once we have gathered `limit` links in total.
        results = []
        howManyRemaining = limit - start
        for link in soup.findAll('a', {'class': 'l'}):
            if len(results) == howManyRemaining:
                break
            results.append(link['href'])

        # The div with id="nn" is the "Next" navigation element; if it is
        # present there are more result pages, so recurse to fetch the next
        # page (Google serves 10 results per page).
        if soup.find('div', {'id': 'nn'}):
            start = start + 10
            if start < limit:
                results.extend(_googleSearch(query, start, limit))
        return results

    return _googleSearch(query, 0, limit)


def main():
    results = doGoogleSearch('a_search_term', 20)
    for rank, link in enumerate(results):
        host = urlparse(link)[1]  # element 1 of the parse tuple is the host
        if host.endswith('mydomain.com'):
            # Search ranks are conventionally 1-based, hence rank + 1.
            print str(rank + 1) + ':' + link


if __name__ == '__main__':
    main()