CoCalc -- 8chan.py

GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/8chan.py
8900 views
1
# -*- coding: utf-8 -*-
2

3
# Copyright 2022-2026 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8

9
"""Extractors for https://8chan.moe/"""
10

11
from .common import Extractor, Message
12
from .. import text, dt
13
from ..cache import memcache
14
import itertools
15

16
BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
17

18

19
class _8chanExtractor(Extractor):
    """Base class for 8chan extractors"""
    category = "8chan"
    root = "https://8chan.moe"

    def __init__(self, match):
        # BASE_PATTERN captures the TLD (moe, se, or cc) as its first group;
        # use it so that requests go to the same domain as the input URL
        self.root = "https://8chan." + match[1]
        Extractor.__init__(self, match)

    @memcache()
    def cookies_tos_name(self):
        """Return the name of the site's terms-of-service cookie

        Lookup order:
        1. an already present cookie for this domain
           whose name starts with "tos" (case-insensitive)
        2. a cookie with such a name set by the 'confirmed.html' page
        3. a hard-coded fallback name

        NOTE(review): wrapped in 'memcache()', so the result is presumably
        cached in memory; confirm the cache key semantics in '..cache'.
        """
        domain = "8chan." + self.groups[0]
        for cookie in self.cookies:
            if cookie.domain == domain and \
                    cookie.name.lower().startswith("tos"):
                self.log.debug("TOS cookie name: %s", cookie.name)
                return cookie.name

        # request the confirmation page as if coming from the disclaimer
        # and inspect the cookies of this response only (no redirects)
        url = self.root + "/.static/pages/confirmed.html"
        headers = {"Referer": self.root + "/.static/pages/disclaimer.html"}
        response = self.request(url, headers=headers, allow_redirects=False)

        for cookie in response.cookies:
            if cookie.name.lower().startswith("tos"):
                self.log.debug("TOS cookie name: %s", cookie.name)
                return cookie.name

        # neither source provided a TOS cookie; use a fixed fallback name
        self.log.error("Unable to determine TOS cookie name")
        return "TOS20250418"

    @memcache()
    def cookies_prepare(self):
        """Fetch captcha cookies and return the adjusted cookie jar

        Requests '/captcha.js' and then modifies every cookie whose domain
        ends with this extractor's host name. Returns 'self.cookies'.
        """
        # fetch captcha cookies
        # (necessary to download without getting interrupted)
        now = dt.now()
        url = self.root + "/captcha.js"
        params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
        # access '.content' so that the response body gets read
        self.request(url, params=params).content

        # adjust cookies
        # - remove 'expires' timestamp
        # - move 'captchaexpiration' value forward by 1 month
        domain = self.root.rpartition("/")[2]
        for cookie in self.cookies:
            if cookie.domain.endswith(domain):
                cookie.expires = None
                if cookie.name == "captchaexpiration":
                    cookie.value = (now + dt.timedelta(30, 300)).strftime(
                        "%a, %d %b %Y %H:%M:%S GMT")

        return self.cookies
70

71

72
class _8chanThreadExtractor(_8chanExtractor):
    """Extractor for 8chan threads"""
    subcategory = "thread"
    directory_fmt = ("{category}", "{boardUri}",
                     "{threadId} {subject[:50]}")
    filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
    archive_fmt = "{boardUri}_{postId}_{num}"
    pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)"
    example = "https://8chan.moe/a/res/12345.html"

    def items(self):
        """Yield one Directory message and a Url message for each file

        Files are collected from the opening post (the thread object
        itself) followed by every entry of its 'posts' list.
        """
        _, board, thread_id = self.groups
        self.cookies.set(
            self.cookies_tos_name(), "1", domain=self.root[8:])

        # thread JSON and thread HTML page share everything but the suffix
        base = f"{self.root}/{board}/res/{thread_id}."
        referer = base + "html"
        self.session.headers["Referer"] = referer

        thread = self.request_json(base + "json")
        thread["postId"] = thread["threadId"]
        thread["_http_headers"] = {"Referer": referer}

        # captcha cookies are optional; failing to get them is not fatal
        try:
            self.cookies = self.cookies_prepare()
        except Exception as exc:
            self.log.debug("Failed to fetch captcha cookies:  %s: %s",
                           type(exc).__name__, exc, exc_info=exc)

        replies = thread.pop("posts", ())
        yield Message.Directory, "", thread

        for post in itertools.chain((thread,), replies):
            attachments = post.pop("files", ())
            if attachments:
                # merge post fields into the shared thread metadata
                thread.update(post)
                for index, file in enumerate(attachments):
                    file.update(
                        thread, num=index, _http_validate=_validate)
                    text.nameext_from_url(file["originalName"], file)
                    yield Message.Url, self.root + file["path"], file
113

114

115
class _8chanBoardExtractor(_8chanExtractor):
    """Extractor for 8chan boards"""
    subcategory = "board"
    pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
    example = "https://8chan.moe/a/"

    def items(self):
        """Yield a Queue message for each thread of each board page

        Starts at the page number from the URL and continues
        until the 'pageCount' value of the first page is exceeded.
        """
        _, board, start = self.groups
        self.cookies.set(
            self.cookies_tos_name(), "1", domain=self.root[8:])

        first = text.parse_int(start, 1)
        data = self.request_json(f"{self.root}/{board}/{first}.json")
        threads = data["threads"]

        # 'upcoming' is the number of the page after the one being emitted
        for upcoming in itertools.count(first + 1):
            for thread in threads:
                thread["_extractor"] = _8chanThreadExtractor
                thread_url = (
                    f"{self.root}/{board}/res/{thread['threadId']}.html")
                yield Message.Queue, thread_url, thread

            if upcoming > data["pageCount"]:
                return
            page = self.request_json(
                f"{self.root}/{board}/{upcoming}.json")
            threads = page["threads"]
141

142

143
def _validate(response):
    """Return False for the 166-byte PNG placeholder response

    A response is rejected only when its 'expires', 'content-length',
    and 'content-type' headers are "0", "166", and "image/png";
    any other combination is accepted.
    """
    header = response.headers.get
    return (
        header("expires") != "0" or
        header("content-length") != "166" or
        header("content-type") != "image/png"
    )
150

151
Product

Resources

Company