CoCalc -- 4chanarchives.py

GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/4chanarchives.py
5399 views
1
# -*- coding: utf-8 -*-
2

3
# Copyright 2023-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8

9
"""Extractors for https://4chanarchives.com/"""
10

11
from .common import Extractor, Message
12
from .. import text
13

14

15
class _4chanarchivesThreadExtractor(Extractor):
    """Extractor for threads on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "thread"
    root = "https://4chanarchives.com"
    directory_fmt = ("{category}", "{board}", "{thread} - {title}")
    filename_fmt = "{no}-{filename}.{extension}"
    archive_fmt = "{board}_{thread}_{no}"
    referer = False
    # Capture groups: (1) board name, (2) numeric thread ID
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
    example = "https://4chanarchives.com/board/a/thread/12345/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        """Yield messages for every post of the thread

        Each post produces a Message.Directory entry; posts that carry
        a file additionally produce a Message.Url entry for that file.
        """
        url = f"{self.root}/board/{self.board}/thread/{self.thread}"
        page = self.request(url).text
        data = self.metadata(page)
        posts = self.posts(page)

        if not posts:
            # No post markup was found on the page. Stop here instead of
            # raising IndexError in the title fallback below.
            return

        if not data["title"]:
            # No og:title available: fall back to the first 50 characters
            # of the opening post's comment with HTML stripped
            data["title"] = text.unescape(text.remove_html(
                posts[0]["com"]))[:50]

        for post in posts:
            # Merge thread-level metadata (board, thread, title) into
            # each post so format strings can reference it
            post.update(data)
            yield Message.Directory, post
            if "url" in post:
                yield Message.Url, post["url"], post

    def metadata(self, page):
        """Collect thread-level metadata from the thread page

        'title' is taken from the og:title meta property of 'page'.
        """
        return {
            "board"     : self.board,
            "thread"    : self.thread,
            "title"     : text.unescape(text.extr(
                page, 'property="og:title" content="', '"')),
        }

    def posts(self, page):
        """Build a list of all post objects"""
        # Every chunk starts right after 'id="pc' and ends before the
        # closing '</blockquote>' of that post
        return [self.parse(html) for html in text.extract_iter(
            page, 'id="pc', '</blockquote>')]

    def parse(self, html):
        """Build post object by extracting data from an HTML post"""
        post = self._extract_post(html)
        if ">File: <" in html:
            # Only posts with a file block get url/filename/size fields
            self._extract_file(html, post)
            post["extension"] = post["url"].rpartition(".")[2]
        return post

    def _extract_post(self, html):
        """Return the basic fields (no, name, time, com) of a post"""
        extr = text.extract_from(html)
        return {
            # 'html' begins directly after 'id="pc' (see posts()), so
            # everything up to the first quote is the post number
            "no"  : text.parse_int(extr('', '"')),
            "name": extr('class="name">', '<'),
            "time": extr('class="dateTime postNum" >', '<').rstrip(),
            # Comment body: everything after the end of the opening
            # <blockquote ...> tag; empty if there is no such tag
            "com" : text.unescape(
                html.partition('<blockquote')[2].partition(">")[2]),
        }

    def _extract_file(self, html, post):
        """Add file information found after '>File: <' to 'post'"""
        # Start scanning at the file block. Callers must make sure the
        # marker exists, otherwise str.index() raises ValueError.
        extr = text.extract_from(html, html.index(">File: <"))
        post["url"] = extr('href="', '"')
        # Link text is the file name; drop its extension
        post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0])
        # The remaining info is read as "(<size>, <width>x<height>)"
        post["fsize"] = extr("(", ", ")
        post["w"] = text.parse_int(extr("", "x"))
        post["h"] = text.parse_int(extr("", ")"))
85

86

87
class _4chanarchivesBoardExtractor(Extractor):
    """Extractor for boards on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "board"
    root = "https://4chanarchives.com"
    # Capture groups: (1) board name, (2) optional page number
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)(?:/(\d+))?/?$"
    example = "https://4chanarchives.com/board/a/"

    def __init__(self, match):
        super().__init__(match)
        board, page = match.groups()
        self.board = board
        self.page = page

    def items(self):
        """Queue every thread link of the board, page after page

        Starts at the page number from the URL (1 if none was given)
        and stops at the first page without any thread links.
        """
        # Markup that directly precedes each thread link; the line
        # break and indentation are part of the search string
        marker = '''<span class="postNum desktop">
                        <span><a href="'''
        queue_data = {"_extractor": _4chanarchivesThreadExtractor}
        number = text.parse_int(self.page, 1)

        found = True
        while found:
            url = f"{self.root}/board/{self.board}/{number}"
            html = self.request(url).text

            found = False
            for href in text.extract_iter(html, marker, '"'):
                found = True
                yield Message.Queue, href, queue_data

            number += 1
116

117
Product

Resources

Company