Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sherlock-project
GitHub Repository: sherlock-project/sherlock
Path: blob/master/sherlock_project/sites.py
761 views
1
"""Sherlock Sites Information Module
2
3
This module supports storing information about websites.
4
This is the raw data that will be used to search for usernames.
5
"""
6
import json
7
import requests
8
import secrets
9
10
11
MANIFEST_URL = "https://data.sherlockproject.xyz"
12
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
13
14
class SiteInformation:
    def __init__(self, name, url_home, url_username_format, username_claimed,
                 information, is_nsfw, username_unclaimed=None):
        """Create Site Information Object.

        Contains information about a specific website.

        Keyword Arguments:
        self                   -- This object.
        name                   -- String which identifies site.
        url_home               -- String containing URL for home of site.
        url_username_format    -- String containing URL for Username format
                                  on site.
                                  NOTE:  The string should contain the
                                         token "{}" where the username should
                                         be substituted.  For example, a string
                                         of "https://somesite.com/users/{}"
                                         indicates that the individual
                                         usernames would show up under the
                                         "https://somesite.com/users/" area of
                                         the website.
        username_claimed       -- String containing username which is known
                                  to be claimed on website.
        username_unclaimed     -- Optional string containing username which is
                                  known to be unclaimed on website.  If not
                                  given, a random URL-safe token is generated
                                  per instance, which is overwhelmingly
                                  unlikely to collide with a real username.
        information            -- Dictionary containing all known information
                                  about website.
                                  NOTE:  Custom information about how to
                                         actually detect the existence of the
                                         username will be included in this
                                         dictionary.  This information will
                                         be needed by the detection method,
                                         but it is only recorded in this
                                         object for future use.
        is_nsfw                -- Boolean indicating if site is Not Safe For Work.

        Return Value:
        Nothing.
        """

        self.name = name
        self.url_home = url_home
        self.url_username_format = url_username_format

        self.username_claimed = username_claimed
        # Bug fix: the old default (secrets.token_urlsafe(10) in the
        # signature) was evaluated only once, at function definition time,
        # and was then ignored anyway because this attribute was
        # unconditionally overwritten.  Use a None sentinel so an explicit
        # caller-supplied value is honored, and generate a fresh random
        # token per instance otherwise.
        self.username_unclaimed = (
            username_unclaimed
            if username_unclaimed is not None
            else secrets.token_urlsafe(32)
        )
        self.information = information
        self.is_nsfw = is_nsfw

    def __str__(self):
        """Convert Object To String.

        Keyword Arguments:
        self -- This object.

        Return Value:
        Nicely formatted string to get information about this object.
        """
        return f"{self.name} ({self.url_home})"
76
77
78
class SitesInformation:
79
def __init__(
80
self,
81
data_file_path: str|None = None,
82
honor_exclusions: bool = True,
83
do_not_exclude: list[str] = [],
84
):
85
"""Create Sites Information Object.
86
87
Contains information about all supported websites.
88
89
Keyword Arguments:
90
self -- This object.
91
data_file_path -- String which indicates path to data file.
92
The file name must end in ".json".
93
94
There are 3 possible formats:
95
* Absolute File Format
96
For example, "c:/stuff/data.json".
97
* Relative File Format
98
The current working directory is used
99
as the context.
100
For example, "data.json".
101
* URL Format
102
For example,
103
"https://example.com/data.json", or
104
"http://example.com/data.json".
105
106
An exception will be thrown if the path
107
to the data file is not in the expected
108
format, or if there was any problem loading
109
the file.
110
111
If this option is not specified, then a
112
default site list will be used.
113
114
Return Value:
115
Nothing.
116
"""
117
118
if not data_file_path:
119
# The default data file is the live data.json which is in the GitHub repo. The reason why we are using
120
# this instead of the local one is so that the user has the most up-to-date data. This prevents
121
# users from creating issue about false positives which has already been fixed or having outdated data
122
data_file_path = MANIFEST_URL
123
124
if data_file_path.lower().startswith("http"):
125
# Reference is to a URL.
126
try:
127
response = requests.get(url=data_file_path, timeout=30)
128
except Exception as error:
129
raise FileNotFoundError(
130
f"Problem while attempting to access data file URL '{data_file_path}': {error}"
131
)
132
133
if response.status_code != 200:
134
raise FileNotFoundError(f"Bad response while accessing "
135
f"data file URL '{data_file_path}'."
136
)
137
try:
138
site_data = response.json()
139
except Exception as error:
140
raise ValueError(
141
f"Problem parsing json contents at '{data_file_path}': {error}."
142
)
143
144
else:
145
# Reference is to a file.
146
try:
147
with open(data_file_path, "r", encoding="utf-8") as file:
148
try:
149
site_data = json.load(file)
150
except Exception as error:
151
raise ValueError(
152
f"Problem parsing json contents at '{data_file_path}': {error}."
153
)
154
155
except FileNotFoundError:
156
raise FileNotFoundError(f"Problem while attempting to access "
157
f"data file '{data_file_path}'."
158
)
159
160
site_data.pop('$schema', None)
161
162
if honor_exclusions:
163
try:
164
response = requests.get(url=EXCLUSIONS_URL, timeout=10)
165
if response.status_code == 200:
166
exclusions = response.text.splitlines()
167
exclusions = [exclusion.strip() for exclusion in exclusions]
168
169
for site in do_not_exclude:
170
if site in exclusions:
171
exclusions.remove(site)
172
173
for exclusion in exclusions:
174
try:
175
site_data.pop(exclusion, None)
176
except KeyError:
177
pass
178
179
except Exception:
180
# If there was any problem loading the exclusions, just continue without them
181
print("Warning: Could not load exclusions, continuing without them.")
182
honor_exclusions = False
183
184
self.sites = {}
185
186
# Add all site information from the json file to internal site list.
187
for site_name in site_data:
188
try:
189
190
self.sites[site_name] = \
191
SiteInformation(site_name,
192
site_data[site_name]["urlMain"],
193
site_data[site_name]["url"],
194
site_data[site_name]["username_claimed"],
195
site_data[site_name],
196
site_data[site_name].get("isNSFW",False)
197
198
)
199
except KeyError as error:
200
raise ValueError(
201
f"Problem parsing json contents at '{data_file_path}': Missing attribute {error}."
202
)
203
except TypeError:
204
print(f"Encountered TypeError parsing json contents for target '{site_name}' at {data_file_path}\nSkipping target.\n")
205
206
return
207
208
def remove_nsfw_sites(self, do_not_remove: list = []):
209
"""
210
Remove NSFW sites from the sites, if isNSFW flag is true for site
211
212
Keyword Arguments:
213
self -- This object.
214
215
Return Value:
216
None
217
"""
218
sites = {}
219
do_not_remove = [site.casefold() for site in do_not_remove]
220
for site in self.sites:
221
if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
222
continue
223
sites[site] = self.sites[site]
224
self.sites = sites
225
226
def site_name_list(self):
227
"""Get Site Name List.
228
229
Keyword Arguments:
230
self -- This object.
231
232
Return Value:
233
List of strings containing names of sites.
234
"""
235
236
return sorted([site.name for site in self], key=str.lower)
237
238
def __iter__(self):
239
"""Iterator For Object.
240
241
Keyword Arguments:
242
self -- This object.
243
244
Return Value:
245
Iterator for sites object.
246
"""
247
248
for site_name in self.sites:
249
yield self.sites[site_name]
250
251
def __len__(self):
252
"""Length For Object.
253
254
Keyword Arguments:
255
self -- This object.
256
257
Return Value:
258
Length of sites object.
259
"""
260
return len(self.sites)
261
262