Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/2chen.py
5399 views
1
# -*- coding: utf-8 -*-
2
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as
5
# published by the Free Software Foundation.
6
7
"""Extractors for https://sturdychan.help/"""
8
9
from .common import Extractor, Message
10
from .. import text
11
12
# Scheme-optional prefix shared by all extractor patterns in this module.
# Every literal dot in a hostname is escaped so that it only matches ".",
# not an arbitrary character.
BASE_PATTERN = r"(?:https?://)?(?:sturdychan\.help|2chen\.(?:moe|club))"
13
14
15
class _2chenThreadExtractor(Extractor):
    """Extractor for a single 2chen thread and the files linked in it"""
    category = "2chen"
    subcategory = "thread"
    root = "https://sturdychan.help"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{time} {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{hash}_{time}"
    pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)"
    example = "https://sturdychan.help/a/12345/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # 'pattern' captures the board name and the numeric thread ID
        board, thread = match.groups()
        self.board = board
        self.thread = thread

    def items(self):
        """Yield one Directory message, then a Url message per linked file"""
        thread_url = f"{self.root}/{self.board}/{self.thread}"
        response = self.request(
            thread_url, encoding="utf-8", notfound="thread")
        page = response.text

        thread_data = self.metadata(page)
        yield Message.Directory, thread_data

        for post in self.posts(page):
            file_url = post["url"]
            if not file_url:
                # nothing to download for this post
                continue

            # turn site-relative links into absolute ones
            if file_url[0] == "/":
                file_url = self.root + file_url
            # drop the query string
            file_url = file_url.partition("?")[0]
            post["url"] = file_url

            post.update(thread_data)
            post["time"] = text.parse_int(post["date"].timestamp())

            file_data = text.nameext_from_url(post["filename"], post)
            yield Message.Url, file_url, file_data

    def metadata(self, page):
        """Collect board, thread ID, and title from the thread page"""
        board, offset = text.extract(page, 'class="board">/', '/<')
        # search for the title starting at the position returned above
        title, _ = text.extract(page, "<h3>", "</h3>", offset)

        info = {}
        info["board"] = board
        info["thread"] = self.thread
        info["title"] = text.unescape(title)
        return info

    def posts(self, page):
        """Return an iterable of parsed posts found in 'page'"""
        fragments = text.extract_iter(
            page, 'class="glass media', '</article>')
        return map(self.parse, fragments)

    def parse(self, post):
        """Build a metadata dict from the HTML fragment of one post"""
        extr = text.extract_from(post)

        # NOTE(review): the lookups below are kept in their original order;
        # presumably 'extr' advances through the fragment on each call,
        # so confirm before reordering them.
        name = text.unescape(extr("<span>", "</span>"))

        # keep only what follows the first '>' of the matched <time ...> span
        time_markup = extr("<time", "<")
        date_string = time_markup.partition(">")[2]
        date = text.parse_datetime(date_string, "%d %b %Y (%a) %H:%M:%S")

        number = extr('href="#p', '"')
        file_url = extr('</a><a href="', '"')
        filename = text.unescape(extr('download="', '"'))
        file_hash = extr('data-hash="', '"')

        return {
            "name"    : name,
            "date"    : date,
            "no"      : number,
            "url"     : file_url,
            "filename": filename,
            "hash"    : file_hash,
        }
77
78
79
class _2chenBoardExtractor(Extractor):
    """Extractor that queues every thread listed in a 2chen board catalog"""
    category = "2chen"
    subcategory = "board"
    root = "https://sturdychan.help"
    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/catalog|/?$)"
    example = "https://sturdychan.help/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first capture group of 'pattern': the board name
        board = match[1]
        self.board = board

    def items(self):
        """Yield a Queue message for each thread link on the catalog page"""
        catalog_url = f"{self.root}/{self.board}/catalog"
        response = self.request(catalog_url, notfound="board")
        page = response.text

        # the same dict is passed along with every queued thread URL
        queue_data = {"_extractor": _2chenThreadExtractor}

        thread_paths = text.extract_iter(page, '<figure><a href="', '"')
        for path in thread_paths:
            # extracted hrefs are prefixed with the site root
            yield Message.Queue, self.root + path, queue_data
98
99