Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/bellazon.py
8838 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2025-2026 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://www.bellazon.com/"""
10
11
from .common import Extractor, Message
12
from .. import text, exception
13
14
# Matches bellazon.com/main URLs, with optional scheme and 'www.' subdomain
BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main"
15
16
17
class BellazonExtractor(Extractor):
    """Base class for bellazon extractors"""
    category = "bellazon"
    root = "https://www.bellazon.com/main"
    directory_fmt = ("{category}", "{thread[section]}",
                     "{thread[title]} ({thread[id]})")
    # NOTE(review): the scraped copy showed the literal '(unknown)' here,
    # which is not a format-string field; restored the '{filename}'
    # placeholder that items() populates via nameext_from_url() /
    # nameext_from_name() / the 'alt' attribute below.
    filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
    archive_fmt = "{post[id]}/{id}_{filename}"

    def items(self):
        """Generate Directory/Url/Queue messages for all media in posts()"""
        # URL prefixes considered site-internal:
        # full URL and its scheme-relative form ('//www.bellazon.com/main/')
        native = (self.root + "/", self.root[6:] + "/")
        # Matches <video>…<source src=…>, <a href=…>…</a>, or <img src=…>;
        # group 1 = full tag text, group 2 = video/anchor URL, group 3 = img URL
        extract_urls = text.re(
            r'(?s)<('
            r'(?:video .*?<source [^>]*?src|a [^>]*?href)="([^"]+).*?</a>'
            r'|img [^>]*?src="([^"]+)"[^>]*>'
            r')'
        ).findall

        if self.config("quoted", False):
            strip_quoted = None
        else:
            # remove quoted posts so their media is not extracted again
            strip_quoted = text.re(r"(?s)<blockquote .*?</blockquote>").sub

        for post in self.posts():
            if strip_quoted is None:
                urls = extract_urls(post["content"])
            else:
                urls = extract_urls(strip_quoted("", post["content"]))

            data = {"post": post}
            post["count"] = data["count"] = len(urls)

            yield Message.Directory, "", data
            data["num"] = data["num_internal"] = data["num_external"] = 0
            for info, url, url_img in urls:
                if url_img:
                    # prefer the full-size image over the embedded thumbnail
                    url = text.unescape(
                        text.extr(info, 'data-full-image="', '"') or url_img)
                else:
                    url = text.unescape(url)

                if url.startswith(native):
                    # skip emoticons and links to profiles/other topics
                    if (
                        "/uploads/emoticons/" in url or
                        "/profile/" in url or
                        "/topic/" in url
                    ):
                        continue
                    data["num"] += 1
                    data["num_internal"] += 1
                    # use the 'alt' attribute as filename unless it is
                    # missing or an auto-generated thumbnail name
                    if not (alt := text.extr(info, ' alt="', '"')) or (
                            alt.startswith("post-") and "_thumb." in alt):
                        dc = text.nameext_from_url(url, data.copy())
                    else:
                        dc = data.copy()
                        dc["name"] = name = text.unescape(alt)
                        dc["filename"] = name.partition(".")[0]

                    dc["id"] = text.extr(info, 'data-fileid="', '"')
                    if ext := text.extr(info, 'data-fileext="', '"'):
                        dc["extension"] = ext
                    elif "/core/interface/file/attachment.php" in url:
                        # attachment URLs carry their file ID as query arg
                        if not dc["id"]:
                            dc["id"] = \
                                url.rpartition("?id=")[2].partition("&")[0]
                        # link text is the original filename, if present
                        if name := text.extr(info, ">", "<").strip():
                            dc["name"] = name = text.unescape(name)
                            text.nameext_from_name(name, dc)
                    else:
                        dc["extension"] = text.ext_from_url(url)

                    # make scheme-relative URLs absolute
                    if url[0] == "/":
                        url = "https:" + url
                    yield Message.Url, url, dc

                else:
                    # external URL; hand off to another extractor
                    data["num"] += 1
                    data["num_external"] += 1
                    yield Message.Queue, url, data

    def _pagination(self, base, pnum=None):
        """Yield thread/forum pages in ascending order, starting at 'pnum'

        If 'pnum' is given, only that single page is yielded.
        """
        base = self.root + base

        if pnum is None:
            url = base + "/"
            pnum = 1
        else:
            url = f"{base}/page/{pnum}/"
            pnum = None  # single-page mode: stop after one page

        while True:
            page = self.request(url).text

            yield page

            # stop when there is no 'next' link or it points back
            # to the current page number
            if pnum is None or ' rel="next" ' not in page or text.extr(
                    page, " rel=\"next\" data-page='", "'") == str(pnum):
                return
            pnum += 1
            url = f"{base}/page/{pnum}/"

    def _pagination_reverse(self, base, pnum=None):
        """Yield thread/forum pages in descending order, starting at 'pnum'

        If 'pnum' is None, start from the last page; requesting page 9999
        makes the site redirect to the actual last page.
        """
        base = self.root + base

        url = f"{base}/page/{'9999' if pnum is None else pnum}/"
        with self.request(url) as response:
            # determine real page number from the redirected URL
            parts = response.url.rsplit("/", 3)
            pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1
            page = response.text

        while True:
            yield page

            pnum -= 1
            if pnum > 1:
                url = f"{base}/page/{pnum}/"
            elif pnum == 1:
                # page 1 has no '/page/1/' suffix
                url = base + "/"
            else:
                return

            page = self.request(url).text

    def _parse_thread(self, page):
        """Extract thread metadata from a thread page's JSON-LD schema"""
        schema = self._extract_jsonld(page)
        author = schema["author"]
        stats = schema["interactionStatistic"]
        url_t = schema["url"]
        url_a = author.get("url") or ""

        # breadcrumb path, minus site root and thread title
        path = text.split_html(text.extr(
            page, '<nav class="ipsBreadcrumb', "</nav>"))[2:-1]

        thread = {
            "url" : url_t,
            "path" : path,
            "title": schema["headline"],
            "views": stats[0]["userInteractionCount"],
            "posts": stats[1]["userInteractionCount"],
            "date" : self.parse_datetime_iso(schema["datePublished"]),
            "date_updated": self.parse_datetime_iso(schema["dateModified"]),
            "description" : text.unescape(schema["text"]).strip(),
            "section" : path[-2],
            "author" : author["name"],
            "author_url" : url_a,
        }

        # thread URLs end in '/<id>-<slug>/'
        thread["id"], _, slug = \
            url_t.rsplit("/", 2)[1].partition("-")
        thread["slug"] = text.unquote(slug)

        if url_a:
            thread["author_id"], _, thread["author_slug"] = \
                url_a.rsplit("/", 2)[1].partition("-")
        else:
            thread["author_id"] = thread["author_slug"] = ""

        return thread

    def _parse_post(self, html):
        """Extract post metadata from an '<article>' HTML fragment"""
        extr = text.extract_from(html)

        post = {
            "id": extr('id="elComment_', '"'),
            "author_url": extr(" href='", "'"),
            "date": self.parse_datetime_iso(extr("datetime='", "'")),
            "content": extr("<!-- Post content -->", "\n\t\t</div>"),
        }

        # drop the remainder of the opening tag after the marker comment
        if (pos := post["content"].find(">")) >= 0:
            post["content"] = post["content"][pos+1:].strip()

        if url_a := post["author_url"]:
            post["author_id"], _, post["author_slug"] = \
                url_a.rsplit("/", 2)[1].partition("-")
        else:
            post["author_id"] = post["author_slug"] = ""

        return post
196
197
198
class BellazonPostExtractor(BellazonExtractor):
    subcategory = "post"
    pattern = (BASE_PATTERN + r"(/topic/\d+-[^/?#]+(?:/page/\d+)?)"
               r"/?#(?:findC|c)omment-(\d+)")
    example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345"

    def posts(self):
        """Return the single post targeted by the URL's comment anchor"""
        path, comment_id = self.groups
        page = self.request(f"{self.root}{path}").text

        index = page.find(f'id="elComment_{comment_id}')
        if index < 0:
            raise exception.NotFoundError("post")
        # back up slightly so the enclosing '<article ' tag is included
        article = text.extract(
            page, "<article ", "</article>", index - 100)[0]

        self.kwdict["thread"] = self._parse_thread(page)
        return (self._parse_post(article),)
215
216
217
class BellazonThreadExtractor(BellazonExtractor):
    subcategory = "thread"
    pattern = BASE_PATTERN + r"(/topic/\d+-[^/?#]+)(?:/page/(\d+))?"
    example = "https://www.bellazon.com/main/topic/123-SLUG/"

    def posts(self):
        """Yield all posts of a thread, honoring the 'order-posts' option

        By default (or for values starting with 'd'/'r'), posts are
        yielded newest-first by walking pages backwards.
        """
        order = self.config("order-posts")
        descending = not order or order[0] in ("d", "r")

        if descending:
            pages = self._pagination_reverse(*self.groups)
        else:
            pages = self._pagination(*self.groups)

        for page in pages:
            # thread metadata only needs to be parsed once
            if "thread" not in self.kwdict:
                self.kwdict["thread"] = self._parse_thread(page)

            articles = text.extract_iter(page, "<article ", "</article>")
            if descending:
                articles = reversed(list(articles))
            for article in articles:
                yield self._parse_post(article)
240
241
242
class BellazonForumExtractor(BellazonExtractor):
    subcategory = "forum"
    pattern = BASE_PATTERN + r"(/forum/\d+-[^/?#]+)(?:/page/(\d+))?"
    example = "https://www.bellazon.com/main/forum/123-SLUG/"

    def items(self):
        """Queue every thread URL listed on the forum's pages"""
        thread_data = {"_extractor": BellazonThreadExtractor}
        for page in self._pagination(*self.groups):
            rows = text.extract_iter(
                page, '<li data-ips-hook="topicRow"', "</")
            for row in rows:
                url = text.extr(row, 'href="', '"')
                yield Message.Queue, url, thread_data
253
254