Book a Demo!
CoCalc Logo Icon
Store Features Docs Share Support News About Policies Sign Up Sign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/4chan.py
5399 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2015-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://www.4chan.org/"""
10
11
from .common import Extractor, Message
12
from .. import text
13
14
15
class _4chanThreadExtractor(Extractor):
    """Extractor for 4chan threads

    Fetches a thread's JSON from a.4cdn.org and yields one URL message
    for every post that carries a file attachment.
    """
    category = "4chan"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{tim} {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    # group 1: board name, group 2: numeric thread ID
    pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org"
               r"/([^/]+)/thread/(\d+)")
    example = "https://boards.4channel.org/a/thread/12345/"

    def __init__(self, match):
        """Store the board and thread ID captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        """Yield a directory message, then one URL message per file post"""
        url = f"https://a.4cdn.org/{self.board}/thread/{self.thread}.json"
        posts = self.request_json(url)["posts"]

        # The first post is the thread opener. Prefer its subject and fall
        # back to its comment text. Both fields may be absent (a post with
        # only an image), so do not index "com" directly; an empty title
        # is used in that case instead of raising KeyError.
        op = posts[0]
        title = op.get("sub") or text.remove_html(op.get("com") or "")

        data = {
            "board" : self.board,
            "thread": self.thread,
            "title" : text.unescape(title)[:50],
        }

        yield Message.Directory, data
        for post in posts:
            # only posts with an attachment have a "filename" field
            if "filename" in post:
                post.update(data)
                # "ext" includes the leading dot (e.g. ".jpg")
                post["extension"] = post["ext"][1:]
                post["filename"] = text.unescape(post["filename"])
                # NOTE(review): presumably consumed by the downloader to
                # validate the first bytes of the file - confirm there
                post["_http_signature"] = _detect_null_byte
                url = (f"https://i.4cdn.org"
                       f"/{post['board']}/{post['tim']}{post['ext']}")
                yield Message.Url, url, post
51
52
53
def _detect_null_byte(signature):
54
"""Return False if all file signature bytes are null"""
55
if signature:
56
if signature[0]:
57
return True
58
for byte in signature:
59
if byte:
60
return True
61
return "File data consists of null bytes"
62
63
64
class _4chanBoardExtractor(Extractor):
    """Extractor for 4chan boards

    Reads a board's thread listing and queues each listed thread for
    _4chanThreadExtractor.
    """
    category = "4chan"
    subcategory = "board"
    # group 1: board name; an optional trailing page number is allowed
    pattern = r"(?:https?://)?boards\.4chan(?:nel)?\.org/([^/?#]+)/\d*$"
    example = "https://boards.4channel.org/a/"

    def __init__(self, match):
        """Store the board name captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board = match[1]

    def items(self):
        """Yield a queue message for every thread listed on the board"""
        listing_url = f"https://a.4cdn.org/{self.board}/threads.json"

        for page in self.request_json(listing_url):
            for entry in page["threads"]:
                thread_url = (f"https://boards.4chan.org"
                              f"/{self.board}/thread/{entry['no']}/")
                # attach the page number and the extractor class that
                # should handle the queued URL to the thread's metadata
                entry.update({
                    "page": page["page"],
                    "_extractor": _4chanThreadExtractor,
                })
                yield Message.Queue, thread_url, entry
86
87