Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/ao3.py
5399 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2024-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://archiveofourown.org/"""
10
11
from .common import Extractor, Message, Dispatch
12
from .. import text, util, exception
13
from ..cache import cache
14
15
# URL prefix shared by every extractor pattern in this module:
# optional scheme, optional "www.", then "archiveofourown" or "ao3"
# under one of the .org / .com / .net TLDs.
BASE_PATTERN = (
    r"(?:https?://)?(?:www\.)?"
    r"a(?:rchiveofourown|o3)\.(?:org|com|net)"
)
17
18
19
class Ao3Extractor(Extractor):
    """Base class for ao3 extractors"""
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    cookies_domain = ".archiveofourown.org"
    cookies_names = ("remember_user_token",)
    request_interval = (0.5, 1.5)

    def items(self):
        """Queue every work ID returned by works() as a work URL."""
        self.login()

        base = self.root + "/works/"
        data = {"_extractor": Ao3WorkExtractor, "type": "work"}

        for work_id in self.works():
            yield Message.Queue, base + work_id, data

    def items_list(self, type, needle, part=True):
        """Queue works, series, and users found on a paginated listing.

        'type' is only used in the warning for unsupported entries.
        'needle' is the marker passed to _pagination(); the text
        following it is expected to start with "works/", "series/",
        or "users/".
        With 'part' enabled, everything from the last "/" of a match
        onwards is dropped before building the URL.
        """
        self.login()

        base = self.root + "/"
        data_work = {"_extractor": Ao3WorkExtractor, "type": "work"}
        data_series = {"_extractor": Ao3SeriesExtractor, "type": "series"}
        data_user = {"_extractor": Ao3UserExtractor, "type": "user"}

        for item in self._pagination(self.groups[0], needle):
            path = item.rpartition("/")[0] if part else item
            url = base + path
            if item.startswith("works/"):
                yield Message.Queue, url, data_work
            elif item.startswith("series/"):
                yield Message.Queue, url, data_series
            elif item.startswith("users/"):
                yield Message.Queue, url, data_user
            else:
                self.log.warning("Unsupported %s type '%s'", type, path)

    def works(self):
        """Return an iterator of work IDs from the listing at groups[0]."""
        return self._pagination(self.groups[0])

    def login(self):
        """Log in with configured credentials unless cookies are present.

        Does nothing when cookies_check() succeeds for 'cookies_names'
        or when no username is configured.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            return self.cookies_update(self._login_impl(username, password))

    # maxage is 90*86400 (90 days if the unit is seconds);
    # keyarg=1 presumably selects 'username' as cache key -- confirm
    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies.

        Raises exception.AuthenticationError when the form's
        'authenticity_token' cannot be extracted, when the login POST
        is not redirected, or when no 'remember_user_token' cookie is
        set by the first response in the redirect chain.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/users/login"
        page = self.request(url).text

        pos = page.find('id="loginform"')
        token = text.extract(
            page, ' name="authenticity_token" value="', '"', pos)[0]
        if not token:
            # Abort here instead of continuing with a missing token,
            # which cannot be unescaped or produce a valid login request.
            self.log.error("Unable to extract 'authenticity_token'")
            raise exception.AuthenticationError()

        data = {
            "authenticity_token": text.unescape(token),
            "user[login]"       : username,
            "user[password]"    : password,
            "user[remember_me]" : "1",
            "commit"            : "Log In",
        }

        response = self.request(url, method="POST", data=data)
        if not response.history:
            # no redirect after the POST is treated as a failed login
            raise exception.AuthenticationError()

        remember = response.history[0].cookies.get("remember_user_token")
        if not remember:
            raise exception.AuthenticationError()

        return {
            "remember_user_token": remember,
            "user_credentials"   : "1",
        }

    def _pagination(self, path, needle='<li id="work_'):
        """Yield the text between 'needle' and '"' across all pages.

        Starts at root + path and follows each page's
        '<a rel="next" href="...">' link until none is found.
        """
        while True:
            page = self.request(self.root + path).text
            yield from text.extract_iter(page, needle, '"')
            path = text.extr(page, '<a rel="next" href="', '"')
            if not path:
                return
            path = text.unescape(path)
110
111
112
class Ao3WorkExtractor(Ao3Extractor):
    """Extractor for an AO3 work"""
    subcategory = "work"
    pattern = BASE_PATTERN + r"/works/(\d+)"
    example = "https://archiveofourown.org/works/12345"
    directory_fmt = ("{category}", "{author}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}.{extension}"

    def _init(self):
        """Normalize the 'formats' option and set the 'view_adult' cookie."""
        formats = self.config("formats")
        if formats is None:
            # option not set: fall back to PDF only
            formats = ("pdf",)
        elif not formats:
            # any other falsy value: no downloads at all
            formats = ()
        elif isinstance(formats, str):
            # comma-separated string, e.g. "pdf, epub"
            formats = formats.lower().replace(" ", "").split(",")
        self.formats = formats

        self.cookies.set("view_adult", "true", domain="archiveofourown.org")

    def items(self):
        """Yield the work's metadata and one URL per requested format.

        Raises exception.AuthorizationError when the request ends up on
        the restricted-login page, and exception.AbortExtraction when
        a short page with the adult content warning is returned.
        """
        self.login()

        work_id = self.groups[0]
        response = self.request(
            f"{self.root}/works/{work_id}", notfound="work")

        if response.url.endswith("/users/login?restricted=true"):
            raise exception.AuthorizationError(
                "Login required to access member-only works")

        page = response.text
        is_short = len(page) < 20000
        if is_short and \
                '<h2 class="landmark heading">Adult Content Warning</' in page:
            raise exception.AbortExtraction("Adult Content")

        # 'extr' is used for successive extractions from 'page';
        # the calls below are kept in their original order
        extr = text.extract_from(page)

        def tag_list(begin):
            return text.split_html(extr(begin, "</dd>"))

        def count(begin):
            return text.parse_int(extr(begin, "<").replace(",", ""))

        def day(begin):
            return text.parse_datetime(extr(begin, "<"), "%Y-%m-%d")

        # chapter selector: maps each <option> value to its label
        cindex = extr(' id="chapter_index"', "</ul>")
        chapters = {
            cid: text.unescape(cname)
            for cid, _, cname in (
                option.partition('">')
                for option in text.extract_iter(
                    cindex, ' value="', "</option>"))
        }

        # download menu: maps lowercased link label to its href
        download = extr(' class="download"', "</ul>")
        links = [
            link.rpartition('">')
            for link in text.extract_iter(download, ' href="', "</")
        ]
        fmts = {label.lower(): href for href, _, label in links}
        # 'date_updated' is read from the last download link's
        # 'updated_at=' value
        last_href = links[-1][0] if links else ""

        data = {
            "id"            : text.parse_int(work_id),
            "rating"        : tag_list('<dd class="rating tags">'),
            "warnings"      : tag_list('<dd class="warning tags">'),
            "categories"    : tag_list('<dd class="category tags">'),
            "fandom"        : tag_list('<dd class="fandom tags">'),
            "relationships" : tag_list('<dd class="relationship tags">'),
            "characters"    : tag_list('<dd class="character tags">'),
            "tags"          : tag_list('<dd class="freeform tags">'),
            "lang"          : extr('<dd class="language" lang="', '"'),
            "series"        : extr('<dd class="series">', "</dd>"),
            "date"          : day('<dd class="published">'),
            "date_completed": day('>Completed:</dt><dd class="status">'),
            "date_updated"  : text.parse_timestamp(
                last_href.rpartition("updated_at=")[2]),
            "words"         : count('<dd class="words">'),
            "chapters"      : chapters,
            "comments"      : count('<dd class="comments">'),
            "likes"         : count('<dd class="kudos">'),
            "bookmarks"     : text.parse_int(text.remove_html(
                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
            "views"         : count('<dd class="hits">'),
            "title"         : text.unescape(text.remove_html(
                extr(' class="title heading">', "</h2>")).strip()),
            "author"        : text.unescape(text.remove_html(
                extr(' class="byline heading">', "</h3>"))),
            "summary"       : text.split_html(
                extr(' class="heading">Summary:</h3>', "</div>")),
        }
        data["language"] = util.code_to_language(data["lang"])

        # replace the raw series HTML with structured data (or None)
        series = data["series"]
        if series:
            sextr = text.extract_from(series)
            data["series"] = {
                "prev" : sextr(' class="previous" href="/works/', '"'),
                "index": sextr(' class="position">Part ', " "),
                "id"   : sextr(' href="/series/', '"'),
                "name" : text.unescape(sextr(">", "<")),
                "next" : sextr(' class="next" href="/works/', '"'),
            }
        else:
            data["series"] = None

        yield Message.Directory, data
        for fmt in self.formats:
            href = fmts.get(fmt)
            if href is None:
                self.log.warning(
                    "%s: Format '%s' not available", work_id, fmt)
                continue
            url = text.urljoin(self.root, href)
            yield Message.Url, url, text.nameext_from_url(url, data)
228
229
230
class Ao3SeriesExtractor(Ao3Extractor):
    """Extractor for AO3 works of a series"""
    # groups[0] captures the whole "/series/<id>" path, which the
    # inherited works() passes to _pagination()
    example = "https://archiveofourown.org/series/12345"
    pattern = BASE_PATTERN + r"(/series/(\d+))"
    subcategory = "series"
235
236
237
class Ao3TagExtractor(Ao3Extractor):
    """Extractor for AO3 works by tag"""
    # groups[0] captures the "/tags/<tag>/works" path including an
    # optional query string
    example = "https://archiveofourown.org/tags/TAG/works"
    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
    subcategory = "tag"
242
243
244
class Ao3SearchExtractor(Ao3Extractor):
    """Extractor for AO3 search results"""
    # groups[0] captures "/works/search" together with its
    # (mandatory) query string
    example = "https://archiveofourown.org/works/search?work_search[query]=air"
    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
    subcategory = "search"
249
250
251
class Ao3UserExtractor(Dispatch, Ao3Extractor):
    """Extractor for an AO3 user profile"""
    example = "https://archiveofourown.org/users/USER"
    # groups[0] captures "USER" or "USER/pseuds/PSEUD"
    pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
               r"(?:/profile)?/?(?:$|\?|#)")

    def items(self):
        """Hand the user's works, series, and bookmarks to Dispatch."""
        user_root = f"{self.root}/users/{self.groups[0]}/"
        targets = (
            (Ao3UserWorksExtractor   , user_root + "works"),
            (Ao3UserSeriesExtractor  , user_root + "series"),
            (Ao3UserBookmarkExtractor, user_root + "bookmarks"),
        )
        # presumably the subcategories selected by default --
        # confirm against Dispatch._dispatch_extractors
        selection = ("user-works", "user-series")
        return self._dispatch_extractors(targets, selection)
264
265
266
class Ao3UserWorksExtractor(Ao3Extractor):
    """Extractor for works of an AO3 user"""
    # groups[0] captures the full path (optional pseud and query
    # string included)
    example = "https://archiveofourown.org/users/USER/works"
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"works(?:/?\?.+)?)")
    subcategory = "user-works"
272
273
274
class Ao3UserSeriesExtractor(Ao3Extractor):
    """Extractor for series of an AO3 user"""
    example = "https://archiveofourown.org/users/USER/series"
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"series(?:/?\?.+)?)")
    subcategory = "user-series"

    def items(self):
        """Queue every series ID returned by series() as a series URL."""
        self.login()

        data = {"_extractor": Ao3SeriesExtractor}
        prefix = self.root + "/series/"

        for sid in self.series():
            yield Message.Queue, prefix + sid, data

    def series(self):
        """Return an iterator of series IDs from the listing at groups[0]."""
        listing = self.groups[0]
        return self._pagination(listing, needle='<li id="series_')
292
293
294
class Ao3UserBookmarkExtractor(Ao3Extractor):
    """Extractor for bookmarked works of an AO3 user"""
    example = "https://archiveofourown.org/users/USER/bookmarks"
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"bookmarks(?:/?\?.+)?)")
    subcategory = "user-bookmark"

    def items(self):
        """Queue the entries linked from the bookmark listing."""
        # each matched link has its last path component stripped
        # by items_list() (part defaults to True)
        return self.items_list(
            type="bookmark",
            needle='<span class="count"><a href="/',
        )
303
304
305
class Ao3SubscriptionsExtractor(Ao3Extractor):
    """Extractor for your AO3 account's subscriptions"""
    example = "https://archiveofourown.org/users/USER/subscriptions"
    pattern = BASE_PATTERN + r"(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)"
    subcategory = "subscriptions"

    def items(self):
        """Queue the entries linked from the subscriptions listing."""
        # part=False: matched link paths are used as-is
        return self.items_list(
            type="subscription",
            needle='<dt>\n<a href="/',
            part=False,
        )
313
314