Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sherlock-project
GitHub Repository: sherlock-project/sherlock
Path: blob/master/sherlock_project/sites.py
761 views
1
"""Sherlock Sites Information Module
2
3
This module supports storing information about websites.
4
This is the raw data that will be used to search for usernames.
5
"""
6
import json
7
import requests
8
import secrets
9
10
11
MANIFEST_URL = "https://data.sherlockproject.xyz"
12
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
13
14
class SiteInformation:
    def __init__(self, name, url_home, url_username_format, username_claimed,
                 information, is_nsfw, username_unclaimed=None):
        """Create Site Information Object.

        Contains information about a specific website.

        Keyword Arguments:
        self                   -- This object.
        name                   -- String which identifies site.
        url_home               -- String containing URL for home of site.
        url_username_format    -- String containing URL for Username format
                                  on site.
                                  NOTE:  The string should contain the
                                         token "{}" where the username should
                                         be substituted.  For example, a string
                                         of "https://somesite.com/users/{}"
                                         indicates that the individual
                                         usernames would show up under the
                                         "https://somesite.com/users/" area of
                                         the website.
        username_claimed       -- String containing username which is known
                                  to be claimed on website.
        username_unclaimed     -- Optional string containing username which is
                                  known to be unclaimed on website.  If not
                                  given, a random URL-safe token is generated
                                  per instance, which is overwhelmingly
                                  unlikely to collide with a real username.
        information            -- Dictionary containing all known information
                                  about website.
                                  NOTE:  Custom information about how to
                                         actually detect the existence of the
                                         username will be included in this
                                         dictionary.  This information will
                                         be needed by the detection method,
                                         but it is only recorded in this
                                         object for future use.
        is_nsfw                -- Boolean indicating if site is Not Safe For Work.

        Return Value:
        Nothing.
        """

        self.name = name
        self.url_home = url_home
        self.url_username_format = url_username_format

        self.username_claimed = username_claimed
        # Bug fix: the old default (secrets.token_urlsafe(10) in the
        # signature) was evaluated only once, at function definition time,
        # and was then ignored anyway because this attribute was
        # unconditionally overwritten.  Use a None sentinel so an explicit
        # caller-supplied value is honored, and generate a fresh random
        # token per instance otherwise.
        self.username_unclaimed = (
            username_unclaimed
            if username_unclaimed is not None
            else secrets.token_urlsafe(32)
        )
        self.information = information
        self.is_nsfw = is_nsfw

    def __str__(self):
        """Convert Object To String.

        Keyword Arguments:
        self -- This object.

        Return Value:
        Nicely formatted string to get information about this object.
        """
        return f"{self.name} ({self.url_home})"
76
77
78
class SitesInformation:
79
def __init__(
80
self,
81
data_file_path: str|None = None,
82
honor_exclusions: bool = True,
83
do_not_exclude: list[str] = [],
84
):
85
"""Create Sites Information Object.
86
87
Contains information about all supported websites.
88
89
Keyword Arguments:
90
self -- This object.
91
data_file_path -- String which indicates path to data file.
92
The file name must end in ".json".
93
94
There are 3 possible formats:
95
* Absolute File Format
96
For example, "c:/stuff/data.json".
97
* Relative File Format
98
The current working directory is used
99
as the context.
100
For example, "data.json".
101
* URL Format
102
For example,
103
"https://example.com/data.json", or
104
"http://example.com/data.json".
105
106
An exception will be thrown if the path
107
to the data file is not in the expected
108
format, or if there was any problem loading
109
the file.
110
111
If this option is not specified, then a
112
default site list will be used.
113
114
Return Value:
115
Nothing.
116
"""
117
118
if not data_file_path:
119
# The default data file is the live data.json which is in the GitHub repo. The reason why we are using
120
# this instead of the local one is so that the user has the most up-to-date data. This prevents
121
# users from creating issue about false positives which has already been fixed or having outdated data
122
data_file_path = MANIFEST_URL
123
124
if data_file_path.lower().startswith("http"):
125
# Reference is to a URL.
126
try:
127
response = requests.get(url=data_file_path, timeout=30)
128
except Exception as error:
129
raise FileNotFoundError(
130
f"Problem while attempting to access data file URL '{data_file_path}': {error}"
131
)
132
133
if response.status_code != 200:
134
raise FileNotFoundError(f"Bad response while accessing "
135
f"data file URL '{data_file_path}'."
136
)
137
try:
138
site_data = response.json()
139
except Exception as error:
140
raise ValueError(
141
f"Problem parsing json contents at '{data_file_path}': {error}."
142
)
143
144
else:
145
# Reference is to a file.
146
try:
147
with open(data_file_path, "r", encoding="utf-8") as file:
148
try:
149
site_data = json.load(file)
150
except Exception as error:
151
raise ValueError(
152
f"Problem parsing json contents at '{data_file_path}': {error}."
153
)
154
155
except FileNotFoundError:
156
raise FileNotFoundError(f"Problem while attempting to access "
157
f"data file '{data_file_path}'."
158
)
159
160
site_data.pop('$schema', None)
161
162
if honor_exclusions:
163
try:
164
response = requests.get(url=EXCLUSIONS_URL, timeout=10)
165
if response.status_code == 200:
166
exclusions = response.text.splitlines()
167
exclusions = [exclusion.strip() for exclusion in exclusions]
168
169
for site in do_not_exclude:
170
if site in exclusions:
171
exclusions.remove(site)
172
173
for exclusion in exclusions:
174
try:
175
site_data.pop(exclusion, None)
176
except KeyError:
177
pass
178
179
except Exception:
180
# If there was any problem loading the exclusions, just continue without them
181
print("Warning: Could not load exclusions, continuing without them.")
182
honor_exclusions = False
183
184
self.sites = {}
185
186
# Add all site information from the json file to internal site list.
187
for site_name in site_data:
188
try:
189
190
self.sites[site_name] = \
191
SiteInformation(site_name,
192
site_data[site_name]["urlMain"],
193
site_data[site_name]["url"],
194
site_data[site_name]["username_claimed"],
195
site_data[site_name],
196
site_data[site_name].get("isNSFW",False)
197
198
)
199
except KeyError as error:
200
raise ValueError(
201
f"Problem parsing json contents at '{data_file_path}': Missing attribute {error}."
202
)
203
except TypeError:
204
print(f"Encountered TypeError parsing json contents for target '{site_name}' at {data_file_path}\nSkipping target.\n")
205
206
return
207
208
def remove_nsfw_sites(self, do_not_remove: list = []):
209
"""
210
Remove NSFW sites from the sites, if isNSFW flag is true for site
211
212
Keyword Arguments:
213
self -- This object.
214
215
Return Value:
216
None
217
"""
218
sites = {}
219
do_not_remove = [site.casefold() for site in do_not_remove]
220
for site in self.sites:
221
if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
222
continue
223
sites[site] = self.sites[site]
224
self.sites = sites
225
226
def site_name_list(self):
227
"""Get Site Name List.
228
229
Keyword Arguments:
230
self -- This object.
231
232
Return Value:
233
List of strings containing names of sites.
234
"""
235
236
return sorted([site.name for site in self], key=str.lower)
237
238
def __iter__(self):
239
"""Iterator For Object.
240
241
Keyword Arguments:
242
self -- This object.
243
244
Return Value:
245
Iterator for sites object.
246
"""
247
248
for site_name in self.sites:
249
yield self.sites[site_name]
250
251
def __len__(self):
252
"""Length For Object.
253
254
Keyword Arguments:
255
self -- This object.
256
257
Return Value:
258
Length of sites object.
259
"""
260
return len(self.sites)
261
262