CoCalc -- ao3.py

GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/ao3.py
8901 views
1
# -*- coding: utf-8 -*-
2

3
# Copyright 2024-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8

9
"""Extractors for https://archiveofourown.org/"""
10

11
from .common import Extractor, Message, Dispatch
12
from .. import text, util, exception
13
from ..cache import cache
14

15
# URL prefix shared by all AO3 extractor patterns: optional scheme and
# "www.", followed by "archiveofourown" or "ao3" under .org, .com, or .net
BASE_PATTERN = (
    r"(?:https?://)?(?:www\.)?"
    r"a(?:rchiveofourown|o3)\.(?:org|com|net)"
)
17

18

19
class Ao3Extractor(Extractor):
    """Base class for ao3 extractors

    Bundles login handling, pagination over listing pages, and default
    'items()' implementations that queue the discovered URLs for the
    more specific extractor classes.
    """
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    cookies_domain = ".archiveofourown.org"
    cookies_names = ("remember_user_token",)
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield a Queue message for every work ID returned by works()"""
        self.login()

        base = self.root + "/works/"
        data = {"_extractor": Ao3WorkExtractor, "type": "work"}

        for work_id in self.works():
            yield Message.Queue, base + work_id, data

    def items_list(self, type, needle, part=True):
        """Yield Queue messages for the mixed entries of a listing page

        'type' only labels the warning emitted for unrecognized entries.
        'needle' is the markup directly preceding each entry's
        site-relative path ("works/...", "series/...", "users/...").
        With 'part' enabled, everything from the last "/" onward is cut
        from an entry before its URL is built.
        """
        self.login()

        base = self.root + "/"
        data_work = {"_extractor": Ao3WorkExtractor, "type": "work"}
        data_series = {"_extractor": Ao3SeriesExtractor, "type": "series"}
        data_user = {"_extractor": Ao3UserExtractor, "type": "user"}

        for item in self._pagination(self.groups[0], needle):
            path = item.rpartition("/")[0] if part else item
            url = base + path
            if item.startswith("works/"):
                yield Message.Queue, url, data_work
            elif item.startswith("series/"):
                yield Message.Queue, url, data_series
            elif item.startswith("users/"):
                yield Message.Queue, url, data_user
            else:
                self.log.warning("Unsupported %s type '%s'", type, path)

    def works(self):
        """Return an iterator over the work IDs of all result pages"""
        return self._pagination(self.groups[0])

    def login(self):
        """Ensure login cookies are set when credentials are available

        Does nothing if the cookies named in 'cookies_names' already
        pass cookies_check() or if no username is configured.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            return self.cookies_update(self._login_impl(username, password))

    # maxage is 90 days; keyarg=1 presumably keys the cache on 'username'
    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies

        Raises AuthenticationError when the form's 'authenticity_token'
        cannot be found, when the login POST is not redirected, or when
        the redirect response carries no 'remember_user_token' cookie.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/users/login"
        page = self.request(url).text

        # only accept a token that appears after the login form's opening tag
        pos = page.find('id="loginform"')
        token = text.extract(
            page, ' name="authenticity_token" value="', '"', pos)[0]
        if not token:
            self.log.error("Unable to extract 'authenticity_token'")
            # Stop here instead of handing the missing token to
            # text.unescape() and to a login request that cannot succeed.
            raise exception.AuthenticationError()

        data = {
            "authenticity_token": text.unescape(token),
            "user[login]"       : username,
            "user[password]"    : password,
            "user[remember_me]" : "1",
            "commit"            : "Log In",
        }

        response = self.request(url, method="POST", data=data)
        # no redirect after the POST is treated as a rejected login
        if not response.history:
            raise exception.AuthenticationError()

        remember = response.history[0].cookies.get("remember_user_token")
        if not remember:
            raise exception.AuthenticationError()

        return {
            "remember_user_token": remember,
            "user_credentials"   : "1",
        }

    def _pagination(self, path, needle='<li id="work_'):
        """Yield the text between 'needle' and the next '"' on every page

        Starts at 'path' (relative to 'root') and follows the page's
        "next" link until there is none.
        """
        while True:
            page = self.request(self.root + path).text

            yield from text.extract_iter(page, needle, '"')

            # two markup variants for the link to the following page
            path = (text.extr(page, '<a rel="next" href="', '"') or
                    text.extr(page, '<li class="next"><a href="', '"'))
            if not path:
                return
            path = text.unescape(path)
113

114

115
class Ao3WorkExtractor(Ao3Extractor):
    """Extractor for a single AO3 work and its download files"""
    subcategory = "work"
    directory_fmt = ("{category}", "{author}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}.{extension}"
    pattern = BASE_PATTERN + r"/works/(\d+)"
    example = "https://archiveofourown.org/works/12345"

    def _init(self):
        """Resolve the 'formats' option and set the 'view_adult' cookie"""
        formats = self.config("formats")
        if formats is None:
            # no value: download PDF only
            formats = ("pdf",)
        elif not formats:
            # any other falsy value: no download formats at all
            formats = ()
        elif isinstance(formats, str):
            # "PDF, epub" -> ["pdf", "epub"]
            formats = formats.lower().replace(" ", "").split(",")
        self.formats = formats

        self.cookies.set("view_adult", "true", domain="archiveofourown.org")

    def items(self):
        """Yield the work's metadata and one URL per requested format

        Raises AuthorizationError when the request ends up on the
        restricted-login page, and AbortExtraction when a short page
        with the adult content warning is returned.
        """
        self.login()

        work_id = self.groups[0]
        url = f"{self.root}/works/{work_id}"
        response = self.request(url, notfound=True)

        if response.url.endswith("/users/login?restricted=true"):
            raise exception.AuthorizationError(
                "Login required to access member-only works")

        page = response.text
        warning = '<h2 class="landmark heading">Adult Content Warning</'
        if len(page) < 20000 and warning in page:
            raise exception.AbortExtraction("Adult Content")

        # 'extr' consumes the page front to back, so every call below
        # has to stay in the order the markers appear in the markup
        extr = text.extract_from(page)

        def tag_list(begin):
            """Split the <dd> block starting at 'begin' into its entries"""
            return text.split_html(extr(begin, "</dd>"))

        def counter(begin):
            """Parse the number following 'begin', ignoring separators"""
            return text.parse_int(extr(begin, "<").replace(",", ""))

        # chapter ID -> chapter name, from the chapter index <option>s
        cindex = extr(' id="chapter_index"', "</ul>")
        chapters = {
            cid: text.unescape(cname)
            for cid, _, cname in (
                option.partition('">')
                for option in text.extract_iter(
                    cindex, ' value="', "</option>"))
        }

        # lowercase format label -> download path
        fmts = {}
        last_path = ""
        download = extr(' class="download"', "</ul>")
        for link in text.extract_iter(download, ' href="', "</"):
            last_path, _, label = link.rpartition('">')
            fmts[label.lower()] = last_path

        data = {
            "id"           : text.parse_int(work_id),
            "rating"       : tag_list('<dd class="rating tags">'),
            "warnings"     : tag_list('<dd class="warning tags">'),
            "categories"   : tag_list('<dd class="category tags">'),
            "fandom"       : tag_list('<dd class="fandom tags">'),
            "relationships": tag_list('<dd class="relationship tags">'),
            "characters"   : tag_list('<dd class="character tags">'),
            "tags"         : tag_list('<dd class="freeform tags">'),
            "lang"         : extr('<dd class="language" lang="', '"'),
            "series"       : extr('<dd class="series">', "</dd>"),
            "date"         : self.parse_datetime_iso(
                extr('<dd class="published">', "<")),
            "date_completed": self.parse_datetime_iso(
                extr('>Completed:</dt><dd class="status">', "<")),
            # taken from the 'updated_at=' part of the last download link
            "date_updated" : self.parse_timestamp(
                last_path.rpartition("updated_at=")[2]),
            "words"        : counter('<dd class="words">'),
            "chapters"     : chapters,
            "comments"     : counter('<dd class="comments">'),
            "likes"        : counter('<dd class="kudos">'),
            "bookmarks"    : text.parse_int(text.remove_html(
                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
            "views"        : counter('<dd class="hits">'),
            "title"        : text.unescape(text.remove_html(
                extr(' class="title heading">', "</h2>")).strip()),
            "author"       : text.unescape(text.remove_html(
                extr(' class="byline heading">', "</h3>"))),
            "summary"      : text.split_html(
                extr(' class="heading">Summary:</h3>', "</div>")),
        }
        data["language"] = util.code_to_language(data["lang"])

        # replace the raw series markup with structured values (or None)
        series = data["series"]
        if not series:
            data["series"] = None
        else:
            sextr = text.extract_from(series)
            data["series"] = {
                "prev" : sextr(' class="previous" href="/works/', '"'),
                "index": sextr(' class="position">Part ', " "),
                "id"   : sextr(' href="/series/', '"'),
                "name" : text.unescape(sextr(">", "<")),
                "next" : sextr(' class="next" href="/works/', '"'),
            }

        yield Message.Directory, "", data
        for fmt in self.formats:
            if fmt not in fmts:
                self.log.warning("%s: Format '%s' not available", work_id, fmt)
                continue
            url = text.urljoin(self.root, fmts[fmt])
            yield Message.Url, url, text.nameext_from_url(url, data)
231

232

233
class Ao3SeriesExtractor(Ao3Extractor):
    """Extractor for the works belonging to an AO3 series"""
    # outer group: "/series/<id>" path, inner group: numeric series ID
    pattern = BASE_PATTERN + r"(/series/(\d+))"
    example = "https://archiveofourown.org/series/12345"
    subcategory = "series"
238

239

240
class Ao3TagExtractor(Ao3Extractor):
    """Extractor for the AO3 works listed under a tag"""
    # outer group: "/tags/<tag>/works" path plus an optional query string
    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
    example = "https://archiveofourown.org/tags/TAG/works"
    subcategory = "tag"
245

246

247
class Ao3SearchExtractor(Ao3Extractor):
    """Extractor for the works of an AO3 search result"""
    # the single group holds "/works/search" and its mandatory query string
    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
    example = "https://archiveofourown.org/works/search?work_search[query]=air"
    subcategory = "search"
252

253

254
class Ao3UserExtractor(Dispatch, Ao3Extractor):
    """Extractor for the profile page of an AO3 user"""
    # group: user name, optionally followed by "/pseuds/<pseud>"
    pattern = (
        BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
        r"(?:/profile)?/?(?:$|\?|#)"
    )
    example = "https://archiveofourown.org/users/USER"

    def items(self):
        """Hand the user's works, series, and bookmarks pages to Dispatch

        The second argument passed to _dispatch_extractors() names
        "user-works" and "user-series".
        """
        base = f"{self.root}/users/{self.groups[0]}/"
        extractors = (
            (Ao3UserWorksExtractor   , base + "works"),
            (Ao3UserSeriesExtractor  , base + "series"),
            (Ao3UserBookmarkExtractor, base + "bookmarks"),
        )
        return self._dispatch_extractors(
            extractors, ("user-works", "user-series"))
267

268

269
class Ao3UserWorksExtractor(Ao3Extractor):
    """Extractor for the works published by an AO3 user"""
    # outer group: full "/users/<user>[/pseuds/<pseud>]/works" path
    # including an optional query string
    pattern = (
        BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
        r"works(?:/?\?.+)?)"
    )
    example = "https://archiveofourown.org/users/USER/works"
    subcategory = "user-works"
275

276

277
class Ao3UserSeriesExtractor(Ao3Extractor):
    """Extractor for series of an AO3 user"""
    subcategory = "user-series"
    # outer group: full "/users/<user>[/pseuds/<pseud>]/series" path
    # including an optional query string
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"series(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/series"

    def items(self):
        """Yield a Queue message for every series ID returned by series()"""
        self.login()

        base = self.root + "/series/"
        # carry the same "type" marker that Ao3Extractor.items_list()
        # attaches to the series URLs it queues
        data = {"_extractor": Ao3SeriesExtractor, "type": "series"}

        for series_id in self.series():
            yield Message.Queue, base + series_id, data

    def series(self):
        """Return an iterator over the series IDs of all result pages"""
        return self._pagination(self.groups[0], '<li id="series_')
295

296

297
class Ao3UserBookmarkExtractor(Ao3Extractor):
    """Extractor for the bookmarks of an AO3 user"""
    # outer group: full "/users/<user>[/pseuds/<pseud>]/bookmarks" path
    # including an optional query string
    pattern = (
        BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
        r"bookmarks(?:/?\?.+)?)"
    )
    example = "https://archiveofourown.org/users/USER/bookmarks"
    subcategory = "user-bookmark"

    def items(self):
        """Queue every bookmarked entry found via its count link"""
        needle = '<span class="count"><a href="/'
        return self.items_list("bookmark", needle)
306

307

308
class Ao3SubscriptionsExtractor(Ao3Extractor):
    """Extractor for the subscriptions of your AO3 account"""
    # outer group: "/users/<user>/subscriptions" path plus optional query
    pattern = BASE_PATTERN + r"(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)"
    example = "https://archiveofourown.org/users/USER/subscriptions"
    subcategory = "subscriptions"

    def items(self):
        """Queue every subscribed entry, using its link path unchanged"""
        needle = '<dt>\n<a href="/'
        return self.items_list("subscription", needle, part=False)
316

317
Product

Resources

Company