Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/discord.py
8745 views
1
# -*- coding: utf-8 -*-
2
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as
5
# published by the Free Software Foundation.
6
7
"""Extractors for https://discord.com/"""
8
9
from .common import Extractor, Message
10
from .. import text, exception
11
12
# URL prefix shared by the extractor patterns in this module;
# the "http(s)://" scheme part is optional.
BASE_PATTERN = r"(?:https?://)?discord\.com"
13
14
15
class DiscordExtractor(Extractor):
    """Base class for Discord extractors

    Provides the server/channel metadata caches and the generators that
    turn API message objects into Directory/Url messages.
    """
    category = "discord"
    root = "https://discord.com"
    directory_fmt = ("{category}", "{server_id}_{server}",
                     "{channel_id}_{channel}")
    filename_fmt = "{message_id}_{num:>02}_{filename[:220]}.{extension}"
    archive_fmt = "{message_id}_{num}"

    # Class-level fallbacks only. _init() rebinds both names on the
    # instance so that cached channels are never shared between
    # different extractor instances.
    server_metadata = {}
    server_channels_metadata = {}

    def _init(self):
        """Read config options and set up per-instance state"""
        self.token = self.config("token")
        self.enabled_embeds = self.config("embeds", ["image", "gifv", "video"])
        self.enabled_threads = self.config("threads", True)
        # Fresh dicts per instance: parse_channel() mutates the channel
        # cache in place, which would otherwise modify the class attribute
        # and leak channels into every other Discord extractor.
        self.server_metadata = {}
        self.server_channels_metadata = {}
        self.api = DiscordAPI(self)

    def extract_message_text(self, message):
        """Return all textual parts of 'message' joined by newlines

        Collects the message content, the text fields of "rich" embeds
        and, if present, the poll question and answers.
        Empty parts are dropped.
        """
        text_content = [message["content"]]

        for embed in message["embeds"]:
            if embed["type"] == "rich":
                # 'author' and 'footer' are optional; skip them when the
                # key is missing or its value is not subscriptable
                try:
                    text_content.append(embed["author"]["name"])
                except (KeyError, TypeError):
                    pass
                text_content.append(embed.get("title", ""))
                text_content.append(embed.get("description", ""))

                for field in embed.get("fields", []):
                    text_content.append(field.get("name", ""))
                    text_content.append(field.get("value", ""))

                try:
                    text_content.append(embed["footer"]["text"])
                except (KeyError, TypeError):
                    pass

        if message.get("poll"):
            text_content.append(message["poll"]["question"]["text"])
            for answer in message["poll"]["answers"]:
                text_content.append(answer["poll_media"]["text"])

        return "\n".join(t for t in text_content if t)

    def extract_message(self, message):
        """Yield a Directory message and one Url message per file

        Messages whose 'type' is not 0, 19 or 21 produce nothing.
        The channel of 'message' must already be present in
        'self.server_channels_metadata'.
        """
        # https://discord.com/developers/docs/resources/message#message-object-message-types
        if message["type"] not in (0, 19, 21):
            return

        message_metadata = {}
        message_metadata.update(self.server_metadata)
        message_metadata.update(
            self.server_channels_metadata[message["channel_id"]])
        message_metadata.update({
            "author": message["author"]["username"],
            "author_id": message["author"]["id"],
            "author_files": [],
            "message": self.extract_message_text(message),
            "message_id": message["id"],
            "date": self.parse_datetime_iso(message["timestamp"]),
            "files": []
        })

        # author avatar/banner are exposed as metadata only
        for icon_type, icon_path in (
            ("avatar", "avatars"),
            ("banner", "banners")
        ):
            if message["author"].get(icon_type):
                message_metadata["author_files"].append({
                    "url": (f"https://cdn.discordapp.com/{icon_path}/"
                            f"{message_metadata['author_id']}/"
                            f"{message['author'][icon_type]}.png"
                            f"?size=4096"),
                    "filename": icon_type,
                    "extension": "png",
                })

        # the message itself plus any snapshots of an accepted type
        message_snapshots = [message]
        message_snapshots.extend(
            msg["message"] for msg in message.get("message_snapshots", [])
            if msg["message"]["type"] in (0, 19, 21)
        )

        for snapshot in message_snapshots:
            for attachment in snapshot["attachments"]:
                message_metadata["files"].append({
                    "url": attachment["url"],
                    "type": "attachment",
                })

            for embed in snapshot["embeds"]:
                if embed["type"] in self.enabled_embeds:
                    # take at most one file per embed: the first of
                    # video/image/thumbnail that has a 'proxy_url'
                    for field in ("video", "image", "thumbnail"):
                        if field not in embed:
                            continue
                        url = embed[field].get("proxy_url")
                        if url is not None:
                            message_metadata["files"].append({
                                "url": url,
                                "type": "embed",
                            })
                            break

        for num, file in enumerate(message_metadata["files"], start=1):
            text.nameext_from_url(file["url"], file)
            file["num"] = num

        yield Message.Directory, "", message_metadata

        for file in message_metadata["files"]:
            message_metadata_file = message_metadata.copy()
            message_metadata_file.update(file)
            yield Message.Url, file["url"], message_metadata_file

    def extract_search(self, server_id, params):
        """Yield the results of extract_message() for all search hits"""
        for messages in self.api.get_search_messages(server_id, params):
            for message in messages:
                # search results can come from channels not cached yet
                if message["channel_id"] not in self.server_channels_metadata:
                    self.parse_channel(self.api.get_channel(
                        message["channel_id"]))
                yield from self.extract_message(message)

    def extract_channel_text(self, channel_id):
        """Yield the results of extract_message() for a channel's messages"""
        for message in self.api.get_channel_messages(channel_id):
            yield from self.extract_message(message)

    def extract_channel_threads(self, channel_id):
        """Register every thread of a channel and extract its messages"""
        for thread in self.api.get_channel_threads(channel_id):
            thread_id = self.parse_channel(thread)["channel_id"]
            yield from self.extract_channel_text(thread_id)

    def extract_channel(self, channel_id, safe=False):
        """Extract a channel according to its channel type

        With 'safe' enabled, unsupported channel types are skipped
        silently and HTTP 403 errors are ignored instead of raised.
        """
        try:
            if channel_id not in self.server_channels_metadata:
                self.parse_channel(self.api.get_channel(channel_id))

            channel_type = (
                self.server_channels_metadata[channel_id]["channel_type"]
            )

            # https://discord.com/developers/docs/resources/channel#channel-object-channel-types
            if channel_type in (0, 5):
                # messages, plus threads unless disabled by config
                yield from self.extract_channel_text(channel_id)
                if self.enabled_threads:
                    yield from self.extract_channel_threads(channel_id)
            elif channel_type in (1, 3, 10, 11, 12):
                # messages only
                yield from self.extract_channel_text(channel_id)
            elif channel_type in (15, 16):
                # threads only
                yield from self.extract_channel_threads(channel_id)
            elif channel_type in (4,):
                # recurse into every cached channel that names this one
                # as its parent; iterate a copy since the cache can grow
                for channel in self.server_channels_metadata.copy().values():
                    if channel["parent_id"] == channel_id:
                        yield from self.extract_channel(
                            channel["channel_id"], safe=True)
            elif not safe:
                raise exception.AbortExtraction(
                    "This channel type is not supported."
                )
        except exception.HttpError as exc:
            if not (exc.status == 403 and safe):
                raise

    def parse_channel(self, channel):
        """Build, cache and return the metadata dict for 'channel'"""
        parent_id = channel.get("parent_id")
        channel_metadata = {
            "channel": channel.get("name", ""),
            "channel_id": channel.get("id"),
            "channel_type": channel.get("type"),
            "channel_topic": channel.get("topic", ""),
            "parent_id": parent_id,
            "is_thread": "thread_metadata" in channel
        }

        # parent name/type are only available for already cached parents
        if parent_id in self.server_channels_metadata:
            parent_metadata = self.server_channels_metadata[parent_id]
            channel_metadata.update({
                "parent": parent_metadata["channel"],
                "parent_type": parent_metadata["channel_type"]
            })

        if channel_metadata["channel_type"] in (1, 3):
            recipients = channel["recipients"]
            channel_metadata.update({
                "channel": "DMs",
                "recipients": (
                    [user["username"] for user in recipients]
                ),
                "recipients_id": (
                    [user["id"] for user in recipients]
                )
            })

        channel_id = channel_metadata["channel_id"]

        self.server_channels_metadata[channel_id] = channel_metadata
        return channel_metadata

    def parse_server(self, server):
        """Build, store and return the metadata dict for 'server'"""
        self.server_metadata = {
            "server"   : server["name"],
            "server_id": server["id"],
            "owner_id" : server["owner_id"],
            "server_files": self.collect_server_assets(server),
        }

        return self.server_metadata

    def collect_server_assets(self, server, asset_type=None):
        """Return a list of asset dicts for 'server'

        A falsy 'asset_type' or "general" selects the server's icon,
        banner, splash and discovery splash (whichever are set).
        Any other value is used as a key into 'server' whose entries
        are turned into one asset dict each.
        """
        if asset_type and asset_type != "general":
            assets = server.get(asset_type)
            if not assets:
                # always hand back a list, same as every other branch
                return []
            return [
                {
                    **asset,
                    "url": (f"https://cdn.discordapp.com/{asset_type}/"
                            f"{asset['id']}.png?size=4096"),
                    "label"    : asset_type,
                    "filename" : f"{asset['name']} ({asset['id']})",
                    "extension": "png",
                }
                for asset in assets
            ]

        return [
            {
                "url": (f"https://cdn.discordapp.com/{asset_path}/"
                        f"{server['id']}/{asset_id}.png?size=4096"),
                "id"       : f"{server['id']}/{asset_id}",
                "label"    : "",
                "name"     : asset_name,
                "filename" : asset_name,
                "extension": "png",
            }
            for asset_name, asset_path in (
                ("icon"  , "icons"),
                ("banner", "banners"),
                ("splash", "splashes"),
                ("discovery_splash", "discovery-splashes")
            )
            if (asset_id := server.get(asset_name))
        ]

    def build_server_and_channels(self, server_id):
        """Fetch and cache metadata of a server and all of its channels"""
        self.parse_server(self.api.get_server(server_id))

        # False sorts before True: type-4 channels get parsed first, so
        # they are already cached when channels naming them as parent
        # are parsed
        for channel in sorted(
            self.api.get_server_channels(server_id),
            key=lambda ch: ch["type"] != 4
        ):
            self.parse_channel(channel)
262
263
264
class DiscordChannelExtractor(DiscordExtractor):
    """Extractor for a single channel or thread of a Discord server"""
    subcategory = "channel"
    pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$"
    example = "https://discord.com/channels/1234567890/9876543210"

    def items(self):
        """Cache server and channel metadata, then extract the channel"""
        guild_id, target_id = self.groups
        self.build_server_and_channels(guild_id)
        results = self.extract_channel(target_id)
        return results
275
276
277
class DiscordMessageExtractor(DiscordExtractor):
    """Extractor for a single message inside a server channel"""
    subcategory = "message"
    pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$"
    example = "https://discord.com/channels/1234567890/9876543210/2468013579"

    def items(self):
        """Cache the required metadata, then extract the one message"""
        guild_id, chan_id, msg_id = self.groups
        self.build_server_and_channels(guild_id)

        # the channel list of the server does not necessarily contain
        # the requested channel; fetch it individually in that case
        known_channels = self.server_channels_metadata
        if chan_id not in known_channels:
            channel = self.api.get_channel(chan_id)
            self.parse_channel(channel)

        message = self.api.get_message(chan_id, msg_id)
        return self.extract_message(message)
292
293
294
class DiscordServerAssetsExtractor(DiscordExtractor):
    """Extractor for the assets of a Discord server"""
    subcategory = "server-assets"
    filename_fmt = "{name} ({id}).{extension}"
    directory_fmt = ["{category}", "{server_id}_{server}", "Assets", "{label}"]
    archive_fmt = "asset_{server_id}_{id}"
    pattern = (BASE_PATTERN +
               r"/channels/(\d+)/(?:assets?|files)(?:/([\w-]+))?/?$")
    example = "https://discord.com/channels/1234567890/assets"

    def items(self):
        """Yield one Directory message per asset type and its asset URLs"""
        guild_id, requested = self.groups
        guild = self.api.get_server(guild_id)
        metadata = self.parse_server(guild)

        # without an explicit type in the URL, go through all of them
        labels = (
            ("", "emojis", "stickers") if requested is None else
            requested.split(",")
        )

        for label in labels:
            found = self.collect_server_assets(guild, label)
            metadata.update({"count": len(found), "label": label})
            yield Message.Directory, "", metadata

            for entry in found:
                entry.update(metadata)
                yield Message.Url, entry["url"], entry
321
322
323
class DiscordServerSearchExtractor(DiscordExtractor):
    """Extractor for message search results of a Discord server"""
    subcategory = "server-search"
    pattern = BASE_PATTERN + r"/channels/(\d+)/search/?\?([^#]+)"
    example = "https://discord.com/channels/1234567890/search?QUERY"

    def items(self):
        """Translate the URL query into search parameters and run it"""
        guild_id, query_string = self.groups
        guild = self.api.get_server(guild_id)
        self.kwdict.update(self.parse_server(guild))

        params = dict(text.parse_query_list(query_string, {
            "from", "in", "has", "mentions", "author_id", "channel_id"}))
        # results are always requested newest first
        params["sort_by"] = "timestamp"
        params["sort_order"] = "desc"

        # rename the short query keys to the parameter names sent out
        for alias, name in (("from", "author_id"), ("in", "channel_id")):
            if alias in params:
                params[name] = params.pop(alias)

        return self.extract_search(guild_id, params)
345
346
347
class DiscordServerExtractor(DiscordExtractor):
    """Extractor for all supported channels of a Discord server"""
    subcategory = "server"
    pattern = BASE_PATTERN + r"/channels/(\d+)/?$"
    example = "https://discord.com/channels/1234567890"

    def items(self):
        """Extract every cached channel of type 0, 5, 15 or 16"""
        guild_id = self.groups[0]
        self.build_server_and_channels(guild_id)

        # iterate over a snapshot, as extraction can add cache entries
        snapshot = list(self.server_channels_metadata.values())
        for info in snapshot:
            if info["channel_type"] not in (0, 5, 15, 16):
                continue
            yield from self.extract_channel(info["channel_id"], safe=True)
361
362
363
class DiscordDirectMessagesExtractor(DiscordExtractor):
    """Extractor for all messages of a direct-message channel"""
    subcategory = "direct-messages"
    directory_fmt = ("{category}", "Direct Messages",
                     "{channel_id}_{recipients:J,}")
    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$"
    example = "https://discord.com/channels/@me/1234567890"

    def items(self):
        """Extract the channel identified by the URL"""
        dm_channel_id = self.groups[0]
        return self.extract_channel(dm_channel_id)
372
373
374
class DiscordDirectMessageExtractor(DiscordExtractor):
    """Extractor for a single message of a direct-message channel"""
    subcategory = "direct-message"
    directory_fmt = ("{category}", "Direct Messages",
                     "{channel_id}_{recipients:J,}")
    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$"
    example = "https://discord.com/channels/@me/1234567890/9876543210"

    def items(self):
        """Cache the channel metadata, then extract the one message"""
        dm_channel_id, msg_id = self.groups

        # extract_message() looks the channel up in the metadata cache
        channel = self.api.get_channel(dm_channel_id)
        self.parse_channel(channel)

        message = self.api.get_message(dm_channel_id, msg_id)
        return self.extract_message(message)
388
389
390
class DiscordAPI:
    """Interface for the Discord API v10

    https://discord.com/developers/docs/reference

    All requests go through 'extractor.request()' and carry the
    extractor's token in the "Authorization" header.
    """

    def __init__(self, extractor):
        self.extractor = extractor
        self.root = extractor.root + "/api/v10"
        self.headers = {"Authorization": extractor.token}

    def get_server(self, server_id):
        """Get server information"""
        return self._call("/guilds/" + server_id)

    def get_server_channels(self, server_id):
        """Get server channels"""
        return self._call("/guilds/" + server_id + "/channels")

    def get_channel(self, channel_id):
        """Get channel information"""
        return self._call("/channels/" + channel_id)

    def get_channel_threads(self, channel_id):
        """Get channel threads

        Returns a generator over thread objects, fetched in
        offset-based batches.
        """
        THREADS_BATCH = 25

        def _method(offset):
            return self._call("/channels/" + channel_id + "/threads/search", {
                "sort_by": "last_message_time",
                "sort_order": "desc",
                "limit": THREADS_BATCH,
                "offset": offset,
            }).get("threads", [])

        return self._pagination(_method, THREADS_BATCH)

    def get_channel_messages(self, channel_id):
        """Get channel messages

        Returns a generator over message objects. Each batch is requested
        "before" the ID of the last message of the previous batch.
        """
        MESSAGES_BATCH = 100

        before = None

        def _method(_):
            # the offset from _pagination() is unused; paging is
            # driven by the 'before' cursor
            nonlocal before
            messages = self._call("/channels/" + channel_id + "/messages", {
                "limit": MESSAGES_BATCH,
                "before": before
            })
            if messages:
                before = messages[-1]["id"]
            return messages

        return self._pagination(_method, MESSAGES_BATCH)

    def get_search_messages(self, server_id, params):
        """Get search messages

        Returns a generator over the entries of the response's
        "messages" list; each entry is itself a list of message objects.
        The 'params' dict passed in is not modified.
        """
        MESSAGES_BATCH = 25

        url = f"/guilds/{server_id}/messages/search"
        # Private copy: the "max_id" cursor written while paging must
        # not end up in the dict owned by the caller.
        params = dict(params)

        def _method(_):
            messages = self._call(url, params)["messages"]

            # the smallest message ID of this page becomes the
            # "max_id" parameter of the next request
            lowest = 0
            for group in messages:
                for msg in group:
                    mid = int(msg["id"])
                    if not lowest or mid < lowest:
                        lowest = mid
            params["max_id"] = lowest

            return messages

        return self._pagination(_method, MESSAGES_BATCH)

    def get_message(self, channel_id, message_id):
        """Get message information"""
        # requests a single message "around" the given ID and returns
        # the first element of the response
        return self._call("/channels/" + channel_id + "/messages", {
            "limit": 1,
            "around": message_id
        })[0]

    def _call(self, endpoint, params=None):
        """Request 'endpoint' and return the decoded JSON response

        HTTP 401 is turned into an AuthenticationError with instructions;
        every other HttpError is re-raised unchanged.
        """
        url = self.root + endpoint
        try:
            response = self.extractor.request(
                url, params=params, headers=self.headers)
        except exception.HttpError as exc:
            if exc.status == 401:
                self._raise_invalid_token()
            raise
        return response.json()

    def _pagination(self, method, batch):
        """Yield the items of successive 'method(offset)' results

        Stops after the first result with fewer than 'batch' items.
        """
        offset = 0
        while True:
            data = method(offset)
            yield from data
            if len(data) < batch:
                return
            offset += len(data)

    def _raise_invalid_token(self):
        """Raise an AuthenticationError explaining how to get a token"""
        raise exception.AuthenticationError("""Invalid or missing token.
Please provide a valid token following these instructions:

1) Open Discord in your browser (https://discord.com/app);
2) Open your browser's Developer Tools (F12) and switch to the Network panel;
3) Reload the page and select any request going to https://discord.com/api/...;
4) In the "Headers" tab, look for an entry beginning with "Authorization: ";
5) Right-click the entry and click "Copy Value";
6) Paste the token in your configuration file under "extractor.discord.token",
or run this command with the -o "token=[your token]" argument.""")
503
504