CoCalc -- 2ch.py

GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/2ch.py
8857 views
1
# -*- coding: utf-8 -*-
2

3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as
5
# published by the Free Software Foundation.
6

7
"""Extractors for https://2ch.org/"""
8

9
from .common import Extractor, Message
10
from .. import text, util
11

12
# URL prefix shared by all extractor patterns in this module:
# an optional http(s) scheme followed by the '2ch' host name.
# Capture group 1 is the top-level domain (org, su, life, or hk).
BASE_PATTERN = r"(?:https?://)?2ch\.(org|su|life|hk)"
13

14

15
class _2chThreadExtractor(Extractor):
    """Extractor for the files attached to the posts of a 2ch thread"""
    category = "2ch"
    subcategory = "thread"
    root = "https://2ch.org"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{tim}{filename:? //}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
    example = "https://2ch.org/a/res/12345.html"

    def __init__(self, match):
        """Derive the root URL from the matched TLD, then run base init"""
        domain = match[1]
        # the 'hk' TLD is replaced by 'org'; any other matched TLD is kept
        if domain == "hk":
            domain = "org"
        self.root = "https://2ch." + domain
        Extractor.__init__(self, match)

    def items(self):
        """Yield one Directory message, then a Url message per file"""
        _tld, board, thread_id = self.groups

        # the thread's JSON document holds its posts under threads[0]
        data = self.request_json(
            f"{self.root}/{board}/res/{thread_id}.json")
        posts = data["threads"][0]["posts"]

        # title: subject of the first post, or else its comment
        # with HTML removed
        first = posts[0]
        title = first.get("subject")
        if not title:
            title = text.remove_html(first["comment"])

        metadata = {
            "board" : board,
            "thread": thread_id,
            "title" : text.unescape(title)[:50],
        }

        yield Message.Directory, "", metadata

        for post in posts:
            files = post.get("files")
            if not files:
                # nothing to download from this post
                continue

            # store the post's 'name' as 'post_name', so that merging the
            # post into a file does not overwrite the file's own 'name'
            post["post_name"] = post.pop("name")
            post["date"] = self.parse_timestamp(post["timestamp"])
            del post["files"]

            for file in files:
                # thread metadata first, post fields second
                file.update(metadata)
                file.update(post)

                # 'fullname' without its suffix becomes 'filename';
                # 'name' is split at its last '.' into 'tim' and 'extension'
                file["filename"] = file["fullname"].rpartition(".")[0]
                stem, _, suffix = file["name"].rpartition(".")
                file["tim"] = stem
                file["extension"] = suffix

                yield Message.Url, self.root + file["path"], file
62

63

64
class _2chBoardExtractor(Extractor):
    """Extractor that queues every thread listed on a 2ch board"""
    category = "2ch"
    subcategory = "board"
    root = "https://2ch.org"
    pattern = BASE_PATTERN + r"/([^/?#]+)/?$"
    example = "https://2ch.org/a/"

    def __init__(self, match):
        """Derive the root URL from the matched TLD, then run base init"""
        domain = match[1]
        # the 'hk' TLD is replaced by 'org'; any other matched TLD is kept
        if domain == "hk":
            domain = "org"
        self.root = "https://2ch." + domain
        Extractor.__init__(self, match)

    def items(self):
        """Yield a Queue message for each thread of each board page"""
        base = f"{self.root}/{self.groups[1]}"

        def queue_threads(listing):
            # the page's JSON document itself is passed along as the
            # message data, tagged with the extractor class for threads
            listing["_extractor"] = _2chThreadExtractor
            for entry in listing["threads"]:
                thread_url = f"{base}/res/{entry['thread_num']}.html"
                yield Message.Queue, thread_url, listing

        # index page
        index = self.request_json(base + "/index.json")
        yield from queue_threads(index)

        # pages 1..n
        for number in util.advance(index["pages"], 1):
            yield from queue_threads(
                self.request_json(f"{base}/{number}.json"))
96

97
Product

Resources

Company