Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/2ch.py
8857 views
1
# -*- coding: utf-8 -*-
2
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as
5
# published by the Free Software Foundation.
6
7
"""Extractors for https://2ch.org/"""
8
9
from .common import Extractor, Message
10
from .. import text, util
11
12
# Shared URL prefix for all 2ch extractors: an optional http(s) scheme
# followed by the "2ch." host. Capture group 1 holds the TLD
# (one of "org", "su", "life", "hk").
BASE_PATTERN = r"(?:https?://)?2ch\.(org|su|life|hk)"
13
14
15
class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads

    Fetches a thread's JSON document and yields one URL message
    for every file attached to any of its posts.
    """
    category = "2ch"
    subcategory = "thread"
    root = "https://2ch.org"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{tim}{filename:? //}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
    example = "https://2ch.org/a/res/12345.html"

    def __init__(self, match):
        # Build the root URL from the matched TLD;
        # "hk" is mapped to "org", every other TLD is used unchanged.
        domain = match[1]
        if domain == "hk":
            domain = "org"
        self.root = "https://2ch." + domain
        Extractor.__init__(self, match)

    def items(self):
        """Yield a directory message, then one URL message per file"""
        _tld, board, thread_id = self.groups
        api_url = f"{self.root}/{board}/res/{thread_id}.json"
        response = self.request_json(api_url)
        posts = response["threads"][0]["posts"]

        # The thread title is the first post's subject; when that is
        # missing or empty, fall back to its comment without HTML.
        first_post = posts[0]
        title = first_post.get("subject")
        if not title:
            title = text.remove_html(first_post["comment"])

        metadata = {
            "board": board,
            "thread": thread_id,
            "title": text.unescape(title)[:50],
        }

        yield Message.Directory, "", metadata

        for post in posts:
            attachments = post.get("files")
            if not attachments:
                # nothing to download from this post
                continue

            # Move the post's "name" to "post_name" and drop "files",
            # so that merging the post into a file dict below neither
            # overwrites the file's own "name" nor nests the file list.
            post["post_name"] = post.pop("name")
            post["date"] = self.parse_timestamp(post["timestamp"])
            del post["files"]

            for file in attachments:
                file.update(metadata)
                file.update(post)

                # "fullname" without its extension -> filename
                stem, _sep, _ext = file["fullname"].rpartition(".")
                file["filename"] = stem

                # "name" split at its last dot -> tim / extension
                tim, _sep, extension = file["name"].rpartition(".")
                file["tim"] = tim
                file["extension"] = extension

                yield Message.Url, self.root + file["path"], file
62
63
64
class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards

    Queues every thread listed on a board's index page and on its
    numbered follow-up pages for _2chThreadExtractor.
    """
    category = "2ch"
    subcategory = "board"
    root = "https://2ch.org"
    pattern = BASE_PATTERN + r"/([^/?#]+)/?$"
    example = "https://2ch.org/a/"

    def __init__(self, match):
        # Build the root URL from the matched TLD;
        # "hk" is mapped to "org", every other TLD is used unchanged.
        domain = match[1]
        if domain == "hk":
            domain = "org"
        self.root = "https://2ch." + domain
        Extractor.__init__(self, match)

    def items(self):
        """Yield one queue message per thread found on the board"""
        board_url = f"{self.root}/{self.groups[1]}"

        # Index page: its JSON document doubles as the metadata
        # passed along with each queued thread URL.
        index = self.request_json(board_url + "/index.json")
        index["_extractor"] = _2chThreadExtractor
        for entry in index["threads"]:
            thread_url = f"{board_url}/res/{entry['thread_num']}.html"
            yield Message.Queue, thread_url, index

        # Pages 1..n: util.advance() skips the first entry of
        # index["pages"] (presumably the index page handled above).
        for page_num in util.advance(index["pages"], 1):
            page = self.request_json(f"{board_url}/{page_num}.json")
            page["_extractor"] = _2chThreadExtractor
            for entry in page["threads"]:
                thread_url = f"{board_url}/res/{entry['thread_num']}.html"
                yield Message.Queue, thread_url, page
96
97