Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/googlesearch/googlesearch.py
811 views
1
'''
2
Created on May 5, 2017
3
4
@author: anthony
5
'''
6
import urllib2
7
import math
8
import re
9
from bs4 import BeautifulSoup
10
from pprint import pprint
11
from threading import Thread
12
from collections import deque
13
from time import sleep
14
15
class GoogleSearch:
    '''
    Scrapes Google's HTML result pages and returns the hits.

    Pages are downloaded with ``urllib2`` and parsed with BeautifulSoup
    (``lxml`` parser); result links and the reported total are located with
    the CSS selectors defined below.
    '''

    USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ 58.0.3029.81 Safari/537.36"
    SEARCH_URL = "https://google.com/search"
    RESULT_SELECTOR = ".srg h3.r a"
    TOTAL_SELECTOR = "#resultStats"
    RESULTS_PER_PAGE = 10
    DEFAULT_HEADERS = [
        ('User-Agent', USER_AGENT),
        ("Accept-Language", "en-US,en;q=0.5"),
    ]

    # First run of digits in the stats text, optionally grouped by
    # apostrophe, comma, dot or space (locale-dependent thousands separators).
    _TOTAL_PATTERN = re.compile(r"(([0-9]+[',\. ])*[0-9]+)")
    _SEPARATOR_PATTERN = re.compile(r"[',\. ]")

    def search(self, query, num_results=10, prefetch_pages=True, prefetch_threads=10, language="en"):
        '''
        Run a search and return a ``SearchResponse``.

        query            -- search terms; URL-quoted before being sent.
        num_results      -- maximum number of results to collect; enough
                            pages of RESULTS_PER_PAGE are requested to cover it.
        prefetch_pages   -- when true, every result's ``getText`` is invoked
                            on a background thread so the target page is
                            downloaded before this method returns.
        prefetch_threads -- maximum number of prefetch threads alive at once
                            (values below 1 are treated as 1, since the
                            throttle could otherwise never make progress).
        language         -- value of the ``hl`` query parameter.

        The response's ``total`` is the result count reported by the page,
        or None when it could not be located or parsed.
        '''
        collected = []
        page_count = int(math.ceil(num_results / float(GoogleSearch.RESULTS_PER_PAGE)))
        fetcher_threads = deque()
        max_running = max(1, prefetch_threads)
        total = None
        for page in range(page_count):
            soup = self._fetch_page(query, page * GoogleSearch.RESULTS_PER_PAGE, language)
            if total is None:
                total = self._extract_total(soup)
            results = self.parseResults(soup.select(GoogleSearch.RESULT_SELECTOR))
            # Trim the last page so no more than num_results are returned.
            if len(collected) + len(results) > num_results:
                del results[num_results - len(collected):]
            collected += results
            if prefetch_pages:
                for result in results:
                    self._wait_for_free_slot(fetcher_threads, max_running)
                    fetcher_thread = Thread(target=result.getText)
                    fetcher_thread.start()
                    fetcher_threads.append(fetcher_thread)
        # Block until every prefetch has finished (no-op when none started).
        for thread in fetcher_threads:
            thread.join()
        return SearchResponse(collected, total)

    def _fetch_page(self, query, start, language):
        '''Download one result page and return it as a BeautifulSoup tree.'''
        url = GoogleSearch.SEARCH_URL + "?q=" + urllib2.quote(query) + "&hl=" + urllib2.quote(language)
        if start != 0:
            url += "&start=" + str(start)
        opener = urllib2.build_opener()
        # Copy so the opener never aliases the shared class-level list.
        opener.addheaders = list(GoogleSearch.DEFAULT_HEADERS)
        response = opener.open(url)
        try:
            return BeautifulSoup(response.read(), "lxml")
        finally:
            # Release the connection even if reading or parsing fails.
            response.close()

    def _extract_total(self, soup):
        '''Return the total reported in the stats element, or None if absent.'''
        stats = soup.select(GoogleSearch.TOTAL_SELECTOR)
        if not stats:
            return None
        first_child = next(iter(stats[0].children), None)
        if first_child is None:
            return None
        return GoogleSearch._parse_total(first_child.encode('utf-8'))

    @staticmethod
    def _parse_total(text):
        '''
        Extract the first (possibly separator-grouped) integer from ``text``.

        Accepts text or UTF-8 encoded bytes; returns None when no digits
        are present.
        '''
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        match = GoogleSearch._TOTAL_PATTERN.search(text)
        if match is None:
            return None
        return int(GoogleSearch._SEPARATOR_PATTERN.sub("", match.group(1)))

    @staticmethod
    def _wait_for_free_slot(threads, limit):
        '''Poll once per second until fewer than ``limit`` threads are alive.'''
        while sum(1 for thread in threads if thread.is_alive()) >= limit:
            sleep(1)

    def parseResults(self, results):
        '''
        Convert matched anchor elements into ``SearchResult`` objects.

        Each element must support ``element["href"]`` and ``element.text``.
        '''
        searchResults = []
        for result in results:
            url = result["href"]
            title = result.text
            searchResults.append(SearchResult(title, url))
        return searchResults
69
70
class SearchResponse:
    '''
    Plain container for the outcome of a search.

    results -- the result objects collected for the query.
    total   -- the total passed in by the caller, stored unchanged.
    '''

    def __init__(self, results, total):
        self.results = results
        self.total = total
74
75
class SearchResult:
    '''
    One search hit: its title, its URL, and lazily fetched page content.

    The page markup and the text extracted from it are downloaded on first
    request and cached on the instance afterwards.
    '''

    def __init__(self, title, url):
        self.title = title
        self.url = url
        # Caches filled by getText() / getMarkup(); None means "not fetched".
        self.__text = None
        self.__markup = None

    def getText(self):
        '''
        Return the page's text with <script> and <style> content removed.

        The markup is obtained through getMarkup() and parsed with
        BeautifulSoup (``lxml``); the result is cached.
        '''
        if self.__text is None:
            soup = BeautifulSoup(self.getMarkup(), "lxml")
            for junk in soup(["script", "style"]):
                junk.extract()
            self.__text = soup.get_text()
        return self.__text

    def getMarkup(self):
        '''
        Return the raw body downloaded from ``self.url`` (cached).

        The request is sent with GoogleSearch.DEFAULT_HEADERS.
        '''
        if self.__markup is None:
            opener = urllib2.build_opener()
            # Copy so the opener never aliases the shared class-level list.
            opener.addheaders = list(GoogleSearch.DEFAULT_HEADERS)
            response = opener.open(self.url)
            try:
                self.__markup = response.read()
            finally:
                # Always release the connection, even if the read fails.
                response.close()
        return self.__markup

    def __str__(self):
        return str(self.__dict__)

    def __unicode__(self):
        # Relies on the Python 2 ``unicode`` builtin.
        return unicode(self.__str__())

    def __repr__(self):
        return self.__str__()
104
105
if __name__ == "__main__":
    # Command-line demo: search for the terms given as arguments (or
    # "python" when none are given) and print the text of each result.
    import sys

    search = GoogleSearch()
    query = " ".join(sys.argv[1:])
    if not query:
        query = "python"
    count = 10
    print ("Fetching first " + str(count) + " results for \"" + query + "\"...")
    response = search.search(query, count)
    print ("TOTAL: " + str(response.total) + " RESULTS")
    for i, result in enumerate(response.results, 1):
        # Read the cache directly (name-mangled attribute) so that a result
        # whose text was not fetched prints "[None]" instead of triggering
        # a new download here.
        text = result._SearchResult__text
        print("RESULT #" + str(i) + ": " + (text if text is not None else "[None]") + "\n\n")
119
120