Thursday, 7 March 2013

Google Search API in Python

How to use Google Ajax Web Search API in Python?
In the previous post, I tried to sort my Kindle books using Python regular expressions; in the end, the books were sorted by their authors' names. Can we go a step further and put these books into different categories?
In the following, I will show how we can use the Google Ajax Web Search API with Python to do that.

import json
import urllib
import sys

def googleSearch(searchFor):
    """Return the URL of the second Google web-search hit for a query.

    The string ' wiki' is appended to ``searchFor`` before the query is
    URL-encoded and sent to the Google Ajax Web Search endpoint.

    Args:
        searchFor: the search phrase (for example a book title).

    Returns:
        The 'url' field of the second entry in the result list.

    Raises:
        ValueError: if the reply carries no 'responseData' object.
        IndexError: if the reply contains fewer than two hits.
    """
    searchFor = searchFor + ' wiki'
    query = urllib.urlencode({'q': searchFor})
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&{0}'.format(query)

    search_response = urllib.urlopen(url)
    try:
        search_results = search_response.read()
    finally:
        # Always release the connection, even if reading fails.
        search_response.close()

    results = json.loads(search_results)
    data = results.get('responseData')
    if data is None:
        # NOTE(review): the service presumably sends a null responseData
        # when it rejects a request; fail with a readable message instead
        # of an opaque TypeError on the subscript below.
        raise ValueError('Google search returned no data: {0}'.format(
            results.get('responseDetails')))

    hits = data['results']
    if len(hits) < 2:
        raise IndexError(
            'expected at least 2 search results, got {0}'.format(len(hits)))
    # The second hit (index 1) is used, as in the original script.
    return hits[1]['url']

def getInfoFromWeb(url):
    """Download a page and guess a book category from keywords in it.

    The checks run in a fixed order and the first match wins:
    science fiction / fantasy, detective, romance, then programming /
    computer science.

    Args:
        url: address of the page to download.

    Returns:
        One of 'fantacy', 'detective', 'romance', 'CS' or 'other'.
    """
    search_response = urllib.urlopen(url)
    try:
        search_results = search_response.read()
    finally:
        # Always release the connection, even if reading fails.
        search_response.close()

    # The original only looked for the misspelling 'fantacy', which a page
    # about fantasy does not contain; the correct spelling is checked too.
    fantasy_keywords = ('Science fiction', 'science fiction', 'fantasy', 'fantacy')
    if any(keyword in search_results for keyword in fantasy_keywords):
        # The label keeps its original spelling so existing callers that
        # compare against 'fantacy' keep working.
        return 'fantacy'
    if 'detective' in search_results:
        return 'detective'
    if 'romance' in search_results:
        return 'romance'
    if 'programming' in search_results or 'computer science' in search_results:
        return 'CS'
    return 'other'

if __name__ == '__main__':
    searchFor = sys.argv[1]
    search_url = googleSearch(searchFor)
    search_result = getInfoFromWeb(search_url)
    print 'search from ', search_url
    print 'search result: ', search_result

Unfortunately, this script does not work. For some reason, when you use a bot to retrieve content in bulk, the server will block you; in this case, Wikipedia does so. Use the urllib2 package to redefine the getInfoFromWeb function so that the request carries a User-Agent header:
def getInfoFromWeb(url):
    """Download a page with a User-Agent header and guess a book category.

    The request is sent through urllib2 with an explicit 'User-Agent'
    header. The keyword checks run in a fixed order and the first match
    wins: science fiction / fantasy, detective, romance, then
    programming / computer science.

    Args:
        url: address of the page to download.

    Returns:
        One of 'fantacy', 'detective', 'romance', 'CS' or 'other'.
    """
    # Imported here because the script's import list does not include
    # urllib2.
    import urllib2

    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    con = urllib2.urlopen(req)
    try:
        search_results = con.read()
    finally:
        # Always release the connection, even if reading fails.
        con.close()

    # The original only looked for the misspelling 'fantacy', which a page
    # about fantasy does not contain; the correct spelling is checked too.
    fantasy_keywords = ('Science fiction', 'science fiction', 'fantasy', 'fantacy')
    if any(keyword in search_results for keyword in fantasy_keywords):
        # The label keeps its original spelling so existing callers that
        # compare against 'fantacy' keep working.
        return 'fantacy'
    if 'detective' in search_results:
        return 'detective'
    if 'romance' in search_results:
        return 'romance'
    if 'programming' in search_results or 'computer science' in search_results:
        return 'CS'
    return 'other'

Results:

No comments:

Post a Comment