CoCalc -- googlesearch.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/googlesearch/googlesearch.py
811 views
'''
Created on May 5, 2017

@author: anthony
'''
6
import urllib2
7
import math
8
import re
9
from bs4 import BeautifulSoup
10
from pprint import pprint
11
from threading import Thread
12
from collections import deque
13
from time import sleep
14
        
15
class GoogleSearch:
    """Scrape Google's HTML result pages for a query.

    Result links are located with ``RESULT_SELECTOR`` and the total hit count
    is read from the element matched by ``TOTAL_SELECTOR``.
    """

    USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ 58.0.3029.81 Safari/537.36"
    SEARCH_URL = "https://google.com/search"
    RESULT_SELECTOR = ".srg h3.r a"
    TOTAL_SELECTOR = "#resultStats"
    RESULTS_PER_PAGE = 10
    DEFAULT_HEADERS = [
            ('User-Agent', USER_AGENT),
            ("Accept-Language", "en-US,en;q=0.5"),
        ]

    def search(self, query, num_results=10, prefetch_pages=True, prefetch_threads=10, language="en"):
        """Run ``query`` and return a ``SearchResponse``.

        Args:
            query: Search terms; URL-quoted before being sent.
            num_results: Maximum number of results to collect.
            prefetch_pages: When true, each result's ``getText`` is started on
                a background thread so the page text is cached on return.
            prefetch_threads: Upper bound on concurrently running prefetch
                threads. Values below 1 are treated as 1.
            language: Value of the ``hl`` query parameter.

        Returns:
            ``SearchResponse(results, total)``. ``total`` is ``None`` when no
            result page exposed a parsable total count.
        """
        search_results = []
        pages = int(math.ceil(num_results / float(GoogleSearch.RESULTS_PER_PAGE)))
        # A non-positive limit could never be satisfied by the throttle loop
        # below and would make it sleep forever.
        max_running = max(1, prefetch_threads)
        fetcher_threads = deque()
        total = None
        for page in range(pages):
            soup = self._fetch_page(query, page * GoogleSearch.RESULTS_PER_PAGE, language)
            if total is None:
                total = self._extract_total(soup)
            results = self.parseResults(soup.select(GoogleSearch.RESULT_SELECTOR))
            # Drop whatever would push the collected count past num_results.
            if len(search_results) + len(results) > num_results:
                del results[num_results - len(search_results):]
            search_results += results
            if prefetch_pages:
                for result in results:
                    # Poll until a prefetch slot is free.
                    while sum(1 for t in fetcher_threads if t.is_alive()) >= max_running:
                        sleep(1)
                    fetcher_thread = Thread(target=result.getText)
                    fetcher_thread.start()
                    fetcher_threads.append(fetcher_thread)
        # Wait for every prefetch to finish before handing results back.
        for thread in fetcher_threads:
            thread.join()
        return SearchResponse(search_results, total)

    def parseResults(self, results):
        """Turn matched link elements into a list of ``SearchResult``.

        Each element must support ``element["href"]`` and expose ``.text``.
        """
        return [SearchResult(result.text, result["href"]) for result in results]

    def _fetch_page(self, query, start, language):
        """Download one result page and return it parsed with BeautifulSoup."""
        url = (GoogleSearch.SEARCH_URL
               + "?q=" + urllib2.quote(query)
               + "&hl=" + urllib2.quote(language))
        if start != 0:
            url += "&start=" + str(start)
        opener = urllib2.build_opener()
        # Copy so the opener never shares (or mutates) the class-level list.
        opener.addheaders = list(GoogleSearch.DEFAULT_HEADERS)
        response = opener.open(url)
        try:
            return BeautifulSoup(response.read(), "lxml")
        finally:
            # Close the connection even when reading or parsing fails.
            response.close()

    def _extract_total(self, soup):
        """Return the total hit count shown on the page, or ``None`` if absent."""
        nodes = soup.select(GoogleSearch.TOTAL_SELECTOR)
        if not nodes:
            return None
        first_child = next(iter(nodes[0].children), None)
        if first_child is None:
            return None
        # Encode/decode round trip yields plain text for the regex.
        return GoogleSearch._parse_total(first_child.encode('utf-8').decode('utf-8'))

    @staticmethod
    def _parse_total(text):
        """Parse the first digit group in ``text`` (e.g. ``1,234,567``) to an int.

        Apostrophes, commas, dots and spaces between digits are treated as
        thousands separators. Returns ``None`` when ``text`` has no digits.
        """
        match = re.search(r"(([0-9]+[',\. ])*[0-9]+)", text)
        if match is None:
            return None
        return int(re.sub(r"[',\. ]", "", match.group(1)))
69

70
class SearchResponse:
    """Container for the outcome of a search.

    Attributes are stored exactly as passed to the constructor.
    """

    def __init__(self, results, total):
        # results: the collected result objects, in the order they were found.
        self.results = results
        # total: the overall hit count reported for the query.
        self.total = total
74

75
class SearchResult:
    """A single search hit whose page markup and text are fetched lazily.

    ``title`` and ``url`` are set by the constructor. The downloaded markup
    and the extracted text are cached in private attributes after the first
    call to ``getMarkup`` / ``getText``.
    """

    def __init__(self, title, url):
        self.title = title
        self.url = url
        self.__text = None
        self.__markup = None

    def getText(self):
        """Return the page text with ``script`` and ``style`` elements removed.

        The markup is obtained through ``getMarkup`` and the result is cached,
        so repeated calls do not parse again.
        """
        if self.__text is None:
            soup = BeautifulSoup(self.getMarkup(), "lxml")
            for junk in soup(["script", "style"]):
                junk.extract()
            # Extract once, after the loop: doing it inside the loop left the
            # text unset for pages without any script/style element.
            self.__text = soup.get_text()
        return self.__text

    def getMarkup(self):
        """Return the raw body fetched from ``self.url``, downloading it once."""
        if self.__markup is None:
            opener = urllib2.build_opener()
            # Copy so the opener never shares the class-level header list.
            opener.addheaders = list(GoogleSearch.DEFAULT_HEADERS)
            response = opener.open(self.url)
            try:
                self.__markup = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()
        return self.__markup

    def __str__(self):
        return str(self.__dict__)

    def __unicode__(self):
        return unicode(self.__str__())

    def __repr__(self):
        return self.__str__()
104

105
# Command-line demo: search for the words given as arguments (default
# "python") and print the prefetched text of the first results.
if __name__ == "__main__":
    import sys
    search = GoogleSearch()
    query = " ".join(sys.argv[1:])
    if not query:
        query = "python"
    count = 10
    print ("Fetching first " + str(count) + " results for \"" + query + "\"...")
    response = search.search(query, count)
    print ("TOTAL: " + str(response.total) + " RESULTS")
    for i, result in enumerate(response.results, 1):
        # Read the cached text directly so printing never triggers a new
        # download; it is None when the text was never cached.
        text = result._SearchResult__text
        print("RESULT #" + str(i) + ": " + (text if text is not None else "[None]") + "\n\n")
119

120
Product

Resources

Company