Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/4chanarchives.py
5399 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2023-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://4chanarchives.com/"""
10
11
from .common import Extractor, Message
12
from .. import text
13
14
15
class _4chanarchivesThreadExtractor(Extractor):
    """Extractor for threads on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "thread"
    root = "https://4chanarchives.com"
    directory_fmt = ("{category}", "{board}", "{thread} - {title}")
    filename_fmt = "{no}-{filename}.{extension}"
    archive_fmt = "{board}_{thread}_{no}"
    referer = False
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
    example = "https://4chanarchives.com/board/a/thread/12345/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # group 1: board name, group 2: numeric thread ID
        self.board, self.thread = match.groups()

    def items(self):
        """Yield a Directory message per post and a Url message per file"""
        url = f"{self.root}/board/{self.board}/thread/{self.thread}"
        page = self.request(url).text
        data = self.metadata(page)
        posts = self.posts(page)

        # Fall back to the first 50 characters of the opening post's
        # comment when the page has no og:title. The 'posts' check keeps
        # a page without any parseable post from raising IndexError;
        # such a page simply yields nothing.
        if posts and not data["title"]:
            data["title"] = text.unescape(text.remove_html(
                posts[0]["com"]))[:50]

        for post in posts:
            # thread-level metadata is merged into every post
            post.update(data)
            yield Message.Directory, post
            # only posts with an attached file carry a "url" key
            if "url" in post:
                yield Message.Url, post["url"], post

    def metadata(self, page):
        """Collect thread-level metadata (board, thread, title)"""
        title = text.extr(page, 'property="og:title" content="', '"')
        return {
            "board" : self.board,
            "thread": self.thread,
            "title" : text.unescape(title),
        }

    def posts(self, page):
        """Build a list of all post objects"""
        # each post's markup spans from its 'id="pc' attribute
        # up to the end of its comment blockquote
        return [self.parse(html) for html in text.extract_iter(
            page, 'id="pc', '</blockquote>')]

    def parse(self, html):
        """Build post object by extracting data from an HTML post"""
        post = self._extract_post(html)
        # the ">File: <" marker is only present for posts with an attachment
        if ">File: <" in html:
            self._extract_file(html, post)
            post["extension"] = post["url"].rpartition(".")[2]
        return post

    def _extract_post(self, html):
        """Extract number, author name, timestamp and comment of a post"""
        extr = text.extract_from(html)
        # 'com' is everything after the opening <blockquote ...> tag;
        # it is an empty string when the tag is missing.
        comment = html.partition("<blockquote")[2].partition(">")[2]
        # The extr() calls consume 'html' sequentially, so the order of
        # the entries below must not be changed.
        return {
            # 'html' starts right after 'id="pc', i.e. at the rest
            # of the id attribute's value
            "no"  : text.parse_int(extr('', '"')),
            "name": extr('class="name">', '<'),
            "time": extr('class="dateTime postNum" >', '<').rstrip(),
            "com" : text.unescape(comment),
        }

    def _extract_file(self, html, post):
        """Add file URL, name, size and dimensions to 'post' in-place"""
        # start reading at the file-info marker; the calls below are
        # order-dependent as each continues where the previous one stopped
        extr = text.extract_from(html, html.index(">File: <"))
        post["url"] = extr('href="', '"')
        # link text is the file name; drop its extension
        post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0])
        # remaining info has the form "(<size>, <width>x<height>)"
        post["fsize"] = extr("(", ", ")
        post["w"] = text.parse_int(extr("", "x"))
        post["h"] = text.parse_int(extr("", ")"))
85
86
87
class _4chanarchivesBoardExtractor(Extractor):
    """Extractor for boards on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "board"
    root = "https://4chanarchives.com"
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)(?:/(\d+))?/?$"
    example = "https://4chanarchives.com/board/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # group 1: board name, group 2: optional page number
        self.board, self.page = match.groups()

    def items(self):
        """Yield a Queue message for every thread link of a board

        Starts at the page number given in the URL (text.parse_int() is
        passed 1 as fallback value) and requests consecutive pages until
        one of them contains no thread link.
        """
        import re

        data = {"_extractor": _4chanarchivesThreadExtractor}
        pnum = text.parse_int(self.page, 1)

        # A thread link is the first anchor inside a post-number span.
        # Accept any whitespace between the two tags instead of one exact
        # newline-plus-indentation sequence, so that a change in the
        # site's HTML formatting does not silently produce zero results.
        find_threads = re.compile(
            r'<span class="postNum desktop">\s*<span><a href="([^"]*)"'
        ).finditer

        while True:
            url = f"{self.root}/board/{self.board}/{pnum}"
            page = self.request(url).text

            # stays None if this page did not contain any thread link
            thread = None
            for match in find_threads(page):
                thread = match[1]
                yield Message.Queue, thread, data

            if thread is None:
                return
            pnum += 1
116
117