Path: blob/master/venv/Lib/site-packages/googlesearch/googlesearch.py
811 views
"""
Created on May 5, 2017

@author: anthony
"""
import math
import re
from pprint import pprint
from threading import Thread
from collections import deque
from time import sleep

try:  # Python 2
    import urllib2
    from urllib import quote
except ImportError:  # Python 3: the same opener API lives in urllib.request
    import urllib.request as urllib2
    from urllib.parse import quote

from bs4 import BeautifulSoup


class GoogleSearch:
    """Scrape Google's HTML result pages for a query.

    The CSS selectors below are tied to one specific Google page layout;
    when the markup does not match them, a search yields no results and
    a total of ``None``.
    """

    USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ 58.0.3029.81 Safari/537.36"
    SEARCH_URL = "https://google.com/search"
    RESULT_SELECTOR = ".srg h3.r a"
    TOTAL_SELECTOR = "#resultStats"
    RESULTS_PER_PAGE = 10
    DEFAULT_HEADERS = [
        ('User-Agent', USER_AGENT),
        ("Accept-Language", "en-US,en;q=0.5"),
    ]

    # First run of digits, optionally grouped by ', . or space separators.
    _TOTAL_PATTERN = re.compile(r"(([0-9]+[',\. ])*[0-9]+)")
    # Grouping separators stripped before the digits are converted.
    _SEPARATOR_PATTERN = re.compile(r"[',\. ]")

    def search(self, query, num_results=10, prefetch_pages=True, prefetch_threads=10, language="en"):
        """Run a search and return a SearchResponse.

        query            -- the search terms.
        num_results      -- maximum number of results to return.
        prefetch_pages   -- when true, download the text of every result
                            page in background threads before returning.
        prefetch_threads -- maximum number of downloads running at once.
        language         -- value sent as the ``hl`` query parameter.

        The response's ``total`` is the number parsed from the first
        page's statistics element, or None when that element is missing
        or contains no number. Errors raised while opening the search
        URL propagate to the caller.
        """
        searchResults = []
        pages = int(math.ceil(num_results / float(GoogleSearch.RESULTS_PER_PAGE)))
        fetcher_threads = deque()
        total = None
        # One opener is enough; it carries the same headers for every page.
        opener = urllib2.build_opener()
        opener.addheaders = GoogleSearch.DEFAULT_HEADERS
        for page in range(pages):
            start = page * GoogleSearch.RESULTS_PER_PAGE
            url = GoogleSearch.SEARCH_URL + "?q=" + quote(query) + "&hl=" + quote(language)
            if start != 0:
                url += "&start=" + str(start)
            response = opener.open(url)
            try:
                soup = BeautifulSoup(response.read(), "lxml")
            finally:
                # Release the connection even if reading or parsing fails.
                response.close()
            if page == 0:
                total = self._parseTotal(soup)
            results = self.parseResults(soup.select(GoogleSearch.RESULT_SELECTOR))
            if not results:
                # Nothing matched on this page; requesting further pages
                # would only add network round trips.
                break
            # Trim the last page so that at most num_results are returned.
            results = results[:num_results - len(searchResults)]
            searchResults += results
            if prefetch_pages:
                for result in results:
                    # Poll until fewer than prefetch_threads downloads are
                    # still running, then start the next one.
                    while sum(1 for thread in fetcher_threads if thread.is_alive()) >= prefetch_threads:
                        sleep(1)
                    fetcher_thread = Thread(target=result.getText)
                    fetcher_thread.start()
                    fetcher_threads.append(fetcher_thread)
        for thread in fetcher_threads:
            thread.join()
        return SearchResponse(searchResults, total)

    @staticmethod
    def _parseTotal(soup):
        """Return the result count shown in the statistics element, or None."""
        stats = soup.select(GoogleSearch.TOTAL_SELECTOR)
        if not stats:
            return None
        match = GoogleSearch._TOTAL_PATTERN.search(stats[0].get_text())
        if match is None:
            return None
        return int(GoogleSearch._SEPARATOR_PATTERN.sub("", match.group(1)))

    def parseResults(self, results):
        """Convert the selected anchor elements into SearchResult objects."""
        return [SearchResult(result.text, result["href"]) for result in results]


class SearchResponse:
    """Outcome of a search: the result list and the reported total."""

    def __init__(self, results, total):
        self.results = results
        self.total = total


class SearchResult:
    """One search hit; the target page is downloaded lazily and cached."""

    def __init__(self, title, url):
        self.title = title
        self.url = url
        self.__text = None
        self.__markup = None

    def getText(self):
        """Return the page's text with script and style elements removed.

        The page is downloaded on first use and the extracted text is
        cached for later calls.
        """
        if self.__text is None:
            soup = BeautifulSoup(self.getMarkup(), "lxml")
            for junk in soup(["script", "style"]):
                junk.extract()
            self.__text = soup.get_text()
        return self.__text

    def getMarkup(self):
        """Return the raw body of the result page, downloading it once."""
        if self.__markup is None:
            opener = urllib2.build_opener()
            opener.addheaders = GoogleSearch.DEFAULT_HEADERS
            response = opener.open(self.url)
            try:
                self.__markup = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()
        return self.__markup

    def __str__(self):
        return str(self.__dict__)

    def __unicode__(self):
        # Python 2 protocol method; ``unicode`` does not exist on Python 3,
        # where this method is never invoked implicitly.
        return unicode(self.__str__())

    def __repr__(self):
        return self.__str__()


if __name__ == "__main__":
    import sys
    search = GoogleSearch()
    query = " ".join(sys.argv[1:])
    if not query:
        query = "python"
    count = 10
    print("Fetching first " + str(count) + " results for \"" + query + "\"...")
    response = search.search(query, count)
    print("TOTAL: " + str(response.total) + " RESULTS")
    for i, result in enumerate(response.results, 1):
        # Read the cached text directly so that a failed prefetch prints a
        # placeholder instead of triggering a new download here.
        text = result._SearchResult__text
        print("RESULT #" + str(i) + ": " + (text if text is not None else "[None]") + "\n\n")