Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/discord.py
5399 views
1
# -*- coding: utf-8 -*-
2
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as
5
# published by the Free Software Foundation.
6
7
"""Extractors for https://discord.com/"""
8
9
from .common import Extractor, Message
10
from .. import text, exception
11
12
13
# Regex prefix shared by the extractor URL patterns in this module;
# the "http://" / "https://" scheme is optional.
BASE_PATTERN = r"(?:https?://)?discord\.com"
14
15
16
class DiscordExtractor(Extractor):
    """Base class for Discord extractors

    Provides the server/channel metadata caches and the helpers that
    convert Discord API objects into Directory and Url messages.
    """
    category = "discord"
    root = "https://discord.com"
    directory_fmt = ("{category}", "{server_id}_{server}",
                     "{channel_id}_{channel}")
    filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}"
    archive_fmt = "{message_id}_{num}"

    # Class-level fallbacks only. _init() rebinds both names to fresh
    # per-instance dicts so one extractor never sees another's channels.
    server_metadata = {}
    server_channels_metadata = {}

    def _init(self):
        """Read configuration options and set up per-instance state"""
        self.token = self.config("token")
        self.enabled_embeds = self.config("embeds", ["image", "gifv", "video"])
        self.enabled_threads = self.config("threads", True)
        # parse_channel() writes into server_channels_metadata in place;
        # without this rebinding it would mutate the single dict shared
        # by every DiscordExtractor instance and subclass.
        self.server_metadata = {}
        self.server_channels_metadata = {}
        self.api = DiscordAPI(self)

    def extract_message_text(self, message):
        """Return all text of a message as one newline-joined string

        Collects the message content, the textual parts of its "rich"
        embeds (author name, title, description, fields, footer) and,
        if present, the poll question and answers.
        Empty parts are left out.
        """
        text_content = [message["content"]]

        for embed in message["embeds"]:
            if embed["type"] != "rich":
                continue

            # 'author' is optional: skip it when missing or not a mapping
            try:
                text_content.append(embed["author"]["name"])
            except (KeyError, TypeError):
                pass
            text_content.append(embed.get("title", ""))
            text_content.append(embed.get("description", ""))

            for field in embed.get("fields", []):
                text_content.append(field.get("name", ""))
                text_content.append(field.get("value", ""))

            # 'footer' is optional as well
            try:
                text_content.append(embed["footer"]["text"])
            except (KeyError, TypeError):
                pass

        poll = message.get("poll")
        if poll:
            text_content.append(poll["question"]["text"])
            for answer in poll["answers"]:
                text_content.append(answer["poll_media"]["text"])

        return "\n".join(t for t in text_content if t)

    def extract_message(self, message):
        """Yield the Directory and Url messages for one Discord message

        Only message types 0 (DEFAULT), 19 (REPLY) and
        21 (THREAD_STARTER_MESSAGE) are processed; any other type
        yields nothing. The message's channel must already be present
        in 'server_channels_metadata' (see parse_channel()).
        """
        # https://discord.com/developers/docs/resources/message#message-object-message-types
        supported_types = (0, 19, 21)
        if message["type"] not in supported_types:
            return

        message_metadata = {}
        message_metadata.update(self.server_metadata)
        message_metadata.update(
            self.server_channels_metadata[message["channel_id"]])
        message_metadata.update({
            "author": message["author"]["username"],
            "author_id": message["author"]["id"],
            "author_files": [],
            "message": self.extract_message_text(message),
            "message_id": message["id"],
            "date": text.parse_datetime(
                message["timestamp"], "%Y-%m-%dT%H:%M:%S.%f%z"
            ),
            "files": []
        })

        # CDN URLs for the author's avatar and banner, when set
        for icon_type, icon_path in (
            ("avatar", "avatars"),
            ("banner", "banners")
        ):
            if message["author"].get(icon_type):
                message_metadata["author_files"].append({
                    "url": (f"https://cdn.discordapp.com/{icon_path}/"
                            f"{message_metadata['author_id']}/"
                            f"{message['author'][icon_type]}.png"
                            f"?size=4096"),
                    "filename": icon_type,
                    "extension": "png",
                })

        # The message itself plus every snapshot of a supported type
        # listed in its 'message_snapshots'
        message_snapshots = [message]
        message_snapshots.extend(
            msg["message"] for msg in message.get("message_snapshots", [])
            if msg["message"]["type"] in supported_types
        )

        for snapshot in message_snapshots:
            for attachment in snapshot["attachments"]:
                message_metadata["files"].append({
                    "url": attachment["url"],
                    "type": "attachment",
                })

            for embed in snapshot["embeds"]:
                if embed["type"] not in self.enabled_embeds:
                    continue
                # Use the first of video/image/thumbnail that has a
                # proxy URL; at most one file per embed
                for field in ("video", "image", "thumbnail"):
                    if field not in embed:
                        continue
                    url = embed[field].get("proxy_url")
                    if url is not None:
                        message_metadata["files"].append({
                            "url": url,
                            "type": "embed",
                        })
                        break

        for num, file in enumerate(message_metadata["files"], start=1):
            text.nameext_from_url(file["url"], file)
            file["num"] = num

        yield Message.Directory, message_metadata

        for file in message_metadata["files"]:
            message_metadata_file = message_metadata.copy()
            message_metadata_file.update(file)
            yield Message.Url, file["url"], message_metadata_file

    def extract_channel_text(self, channel_id):
        """Yield the results of extract_message() for a channel's messages"""
        for message in self.api.get_channel_messages(channel_id):
            yield from self.extract_message(message)

    def extract_channel_threads(self, channel_id):
        """Register every thread of a channel and extract its messages"""
        for thread in self.api.get_channel_threads(channel_id):
            thread_id = self.parse_channel(thread)["channel_id"]
            yield from self.extract_channel_text(thread_id)

    def extract_channel(self, channel_id, safe=False):
        """Extract a channel according to its channel type

        With 'safe' enabled, unsupported channel types are skipped
        instead of raising AbortExtraction, and HTTP 403 errors are
        suppressed instead of propagated.
        """
        try:
            if channel_id not in self.server_channels_metadata:
                self.parse_channel(self.api.get_channel(channel_id))

            channel_type = (
                self.server_channels_metadata[channel_id]["channel_type"]
            )

            # https://discord.com/developers/docs/resources/channel#channel-object-channel-types
            if channel_type in (0, 5):
                # GUILD_TEXT, GUILD_ANNOUNCEMENT
                yield from self.extract_channel_text(channel_id)
                if self.enabled_threads:
                    yield from self.extract_channel_threads(channel_id)
            elif channel_type in (1, 3, 10, 11, 12):
                # DM, GROUP_DM and the three thread types
                yield from self.extract_channel_text(channel_id)
            elif channel_type in (15, 16):
                # GUILD_FORUM, GUILD_MEDIA: content lives in threads only
                yield from self.extract_channel_threads(channel_id)
            elif channel_type in (4,):
                # GUILD_CATEGORY: recurse into its child channels.
                # Iterate over a copy since extraction may add threads.
                for channel in self.server_channels_metadata.copy().values():
                    if channel["parent_id"] == channel_id:
                        yield from self.extract_channel(
                            channel["channel_id"], safe=True)
            elif not safe:
                raise exception.AbortExtraction(
                    "This channel type is not supported."
                )
        except exception.HttpError as exc:
            if not (exc.status == 403 and safe):
                raise

    def parse_channel(self, channel):
        """Build, cache and return the metadata dict for a channel object

        The result is stored in 'server_channels_metadata' under the
        channel's ID. Parent name and type are only included when the
        parent channel has been parsed before.
        """
        parent_id = channel.get("parent_id")
        channel_metadata = {
            "channel": channel.get("name", ""),
            "channel_id": channel.get("id"),
            "channel_type": channel.get("type"),
            "channel_topic": channel.get("topic", ""),
            "parent_id": parent_id,
            "is_thread": "thread_metadata" in channel
        }

        if parent_id in self.server_channels_metadata:
            parent_metadata = self.server_channels_metadata[parent_id]
            channel_metadata.update({
                "parent": parent_metadata["channel"],
                "parent_type": parent_metadata["channel_type"]
            })

        # DM (1) and GROUP_DM (3): fixed name plus recipient lists
        if channel_metadata["channel_type"] in (1, 3):
            channel_metadata.update({
                "channel": "DMs",
                "recipients": (
                    [user["username"] for user in channel["recipients"]]
                ),
                "recipients_id": (
                    [user["id"] for user in channel["recipients"]]
                )
            })

        channel_id = channel_metadata["channel_id"]

        self.server_channels_metadata[channel_id] = channel_metadata
        return channel_metadata

    def parse_server(self, server):
        """Build, store and return the metadata dict for a server object"""
        self.server_metadata = {
            "server": server["name"],
            "server_id": server["id"],
            "server_files": [],
            "owner_id": server["owner_id"]
        }

        # CDN URLs for each server image that is set
        for icon_type, icon_path in (
            ("icon", "icons"),
            ("banner", "banners"),
            ("splash", "splashes"),
            ("discovery_splash", "discovery-splashes")
        ):
            if server.get(icon_type):
                self.server_metadata["server_files"].append({
                    "url": (f"https://cdn.discordapp.com/{icon_path}/"
                            f"{self.server_metadata['server_id']}/"
                            f"{server[icon_type]}.png?size=4096"),
                    "filename": icon_type,
                    "extension": "png",
                })

        return self.server_metadata

    def build_server_and_channels(self, server_id):
        """Fetch and parse a server and all of its channels"""
        self.parse_server(self.api.get_server(server_id))

        # Type 4 (category) channels sort first, so a category is
        # already cached when parse_channel() looks up a child's parent.
        for channel in sorted(
            self.api.get_server_channels(server_id),
            key=lambda ch: ch["type"] != 4
        ):
            self.parse_channel(channel)
239
240
241
class DiscordChannelExtractor(DiscordExtractor):
    """Extractor for a single channel (or thread) of a Discord server"""
    subcategory = "channel"
    pattern = BASE_PATTERN + r"/channels/(\d+)/(?:\d+/threads/)?(\d+)/?$"
    example = "https://discord.com/channels/1234567890/9876543210"

    def items(self):
        """Load the server's metadata, then extract the requested channel"""
        guild_id, target_id = self.groups
        self.build_server_and_channels(guild_id)
        channel_results = self.extract_channel(target_id)
        return channel_results
252
253
254
class DiscordMessageExtractor(DiscordExtractor):
    """Extractor for a single message in a Discord server channel"""
    subcategory = "message"
    pattern = BASE_PATTERN + r"/channels/(\d+)/(\d+)/(\d+)/?$"
    example = "https://discord.com/channels/1234567890/9876543210/2468013579"

    def items(self):
        """Load server and channel metadata, then extract one message"""
        guild_id, chan_id, msg_id = self.groups
        self.build_server_and_channels(guild_id)

        # Channels missing from the server's channel listing
        # are fetched individually.
        known_channels = self.server_channels_metadata
        if chan_id not in known_channels:
            channel_info = self.api.get_channel(chan_id)
            self.parse_channel(channel_info)

        target = self.api.get_message(chan_id, msg_id)
        return self.extract_message(target)
269
270
271
class DiscordServerExtractor(DiscordExtractor):
    """Extractor for all supported channels of a Discord server"""
    subcategory = "server"
    pattern = BASE_PATTERN + r"/channels/(\d+)/?$"
    example = "https://discord.com/channels/1234567890"

    def items(self):
        """Extract every channel of type 0, 5, 15 or 16 of the server"""
        self.build_server_and_channels(self.groups[0])

        # Work on a snapshot: the metadata mapping can grow
        # while channels are being extracted.
        snapshot = tuple(self.server_channels_metadata.values())
        for info in snapshot:
            if info["channel_type"] not in (0, 5, 15, 16):
                continue
            yield from self.extract_channel(info["channel_id"], safe=True)
285
286
287
class DiscordDirectMessagesExtractor(DiscordExtractor):
    """Extractor for all messages of a direct-message channel"""
    subcategory = "direct-messages"
    directory_fmt = ("{category}", "Direct Messages",
                     "{channel_id}_{recipients:J,}")
    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/?$"
    example = "https://discord.com/channels/@me/1234567890"

    def items(self):
        """Extract the channel whose ID was captured from the URL"""
        dm_channel_id = self.groups[0]
        return self.extract_channel(dm_channel_id)
296
297
298
class DiscordDirectMessageExtractor(DiscordExtractor):
    """Extractor for a single message of a direct-message channel"""
    subcategory = "direct-message"
    directory_fmt = ("{category}", "Direct Messages",
                     "{channel_id}_{recipients:J,}")
    pattern = BASE_PATTERN + r"/channels/@me/(\d+)/(\d+)/?$"
    example = "https://discord.com/channels/@me/1234567890/9876543210"

    def items(self):
        """Register the DM channel, then extract the requested message"""
        dm_channel_id, target_id = self.groups

        channel_info = self.api.get_channel(dm_channel_id)
        self.parse_channel(channel_info)

        target = self.api.get_message(dm_channel_id, target_id)
        return self.extract_message(target)
312
313
314
class DiscordAPI:
    """Thin client for version 10 of the Discord HTTP API

    All requests are sent through the owning extractor's request()
    method with the extractor's token as "Authorization" header.

    https://discord.com/developers/docs/reference
    """

    def __init__(self, extractor):
        self.extractor = extractor
        self.root = extractor.root + "/api/v10"
        self.headers = {"Authorization": extractor.token}

    def get_server(self, server_id):
        """Return the API object of a server (guild)"""
        endpoint = "/guilds/" + server_id
        return self._call(endpoint)

    def get_server_channels(self, server_id):
        """Return the channel list of a server (guild)"""
        endpoint = "/guilds/" + server_id + "/channels"
        return self._call(endpoint)

    def get_channel(self, channel_id):
        """Return the API object of a channel"""
        endpoint = "/channels/" + channel_id
        return self._call(endpoint)

    def get_channel_threads(self, channel_id):
        """Iterate over a channel's threads, 25 per request

        Results are requested sorted by last message time, descending.
        """
        page_size = 25

        def fetch_page(offset):
            query = {
                "sort_by": "last_message_time",
                "sort_order": "desc",
                "limit": page_size,
                "offset": offset,
            }
            result = self._call(
                "/channels/" + channel_id + "/threads/search", query)
            return result.get("threads", [])

        return self._pagination(fetch_page, page_size)

    def get_channel_messages(self, channel_id):
        """Iterate over a channel's messages, 100 per request

        Each request asks for messages before the last message ID
        of the previous page; the offset from _pagination() is unused.
        """
        page_size = 100
        cursor = None

        def fetch_page(_):
            nonlocal cursor
            page = self._call("/channels/" + channel_id + "/messages", {
                "limit": page_size,
                "before": cursor,
            })
            if page:
                cursor = page[-1]["id"]
            return page

        return self._pagination(fetch_page, page_size)

    def get_message(self, channel_id, message_id):
        """Return the first result of an 'around' query for a message ID"""
        endpoint = "/channels/" + channel_id + "/messages"
        query = {"limit": 1, "around": message_id}
        found = self._call(endpoint, query)
        return found[0]

    def _call(self, endpoint, params=None):
        """Send a request to 'endpoint' and return the decoded JSON body

        An HTTP 401 error is turned into an AuthenticationError;
        every other HttpError is re-raised unchanged.
        """
        try:
            response = self.extractor.request(
                self.root + endpoint, params=params, headers=self.headers)
        except exception.HttpError as exc:
            if exc.status == 401:
                self._raise_invalid_token()
            raise
        else:
            return response.json()

    def _pagination(self, method, batch):
        """Yield the items of successive pages returned by method(offset)

        Stops after the first page holding fewer than 'batch' items.
        """
        position = 0
        received = batch
        while received >= batch:
            page = method(position)
            yield from page
            received = len(page)
            position += received

    def _raise_invalid_token(self):
        """Raise an AuthenticationError explaining how to obtain a token"""
        raise exception.AuthenticationError("""Invalid or missing token.
Please provide a valid token following these instructions:

1) Open Discord in your browser (https://discord.com/app);
2) Open your browser's Developer Tools (F12) and switch to the Network panel;
3) Reload the page and select any request going to https://discord.com/api/...;
4) In the "Headers" tab, look for an entry beginning with "Authorization: ";
5) Right-click the entry and click "Copy Value";
6) Paste the token in your configuration file under "extractor.discord.token",
or run this command with the -o "token=[your token]" argument.""")
407
408