Path: blob/master/gallery_dl/extractor/4chanarchives.py
5399 views
# -*- coding: utf-8 -*-

# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://4chanarchives.com/"""

from .common import Extractor, Message
from .. import text


class _4chanarchivesThreadExtractor(Extractor):
    """Extractor for threads on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "thread"
    root = "https://4chanarchives.com"
    directory_fmt = ("{category}", "{board}", "{thread} - {title}")
    filename_fmt = "{no}-{filename}.{extension}"
    archive_fmt = "{board}_{thread}_{no}"
    referer = False
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
    example = "https://4chanarchives.com/board/a/thread/12345/"

    def __init__(self, match):
        """Store the board name and thread ID captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        """Fetch the thread page and yield messages for each of its posts

        Every post is merged with the thread-level metadata and emitted as
        a Directory message; posts that carry a file additionally emit a
        Url message for that file.
        """
        url = f"{self.root}/board/{self.board}/thread/{self.thread}"
        page = self.request(url).text
        data = self.metadata(page)
        posts = self.posts(page)

        # Fall back to the first 50 characters of the opening post's
        # comment when the page has no og:title. The 'posts' check avoids
        # an IndexError on pages from which no post could be parsed.
        if not data["title"] and posts:
            data["title"] = text.unescape(text.remove_html(
                posts[0]["com"]))[:50]

        for post in posts:
            post.update(data)
            yield Message.Directory, post
            if "url" in post:
                yield Message.Url, post["url"], post

    def metadata(self, page):
        """Return thread-level metadata: board, thread ID, and og:title"""
        return {
            "board" : self.board,
            "thread": self.thread,
            "title" : text.unescape(text.extr(
                page, 'property="og:title" content="', '"')),
        }

    def posts(self, page):
        """Build a list of all post objects"""
        # Each post fragment runs from just after 'id="pc' up to the end
        # of its comment, marked by '</blockquote>'.
        return [self.parse(html) for html in text.extract_iter(
            page, 'id="pc', '</blockquote>')]

    def parse(self, html):
        """Build post object by extracting data from an HTML post"""
        post = self._extract_post(html)
        # Only posts containing a '>File: <' marker have an attachment
        if ">File: <" in html:
            self._extract_file(html, post)
            # Extension is whatever follows the last '.' in the file URL
            post["extension"] = post["url"].rpartition(".")[2]
        return post

    def _extract_post(self, html):
        """Return the basic fields (no, name, time, com) of a post"""
        extr = text.extract_from(html)
        return {
            # Fragment starts right after 'id="pc', so the text up to the
            # first '"' is presumably the post number -- confirm with markup
            "no"  : text.parse_int(extr('', '"')),
            "name": extr('class="name">', '<'),
            "time": extr('class="dateTime postNum" >', '<').rstrip(),
            # Everything after the opening <blockquote ...> tag's '>'
            "com" : text.unescape(
                html[html.find('<blockquote'):].partition(">")[2]),
        }

    def _extract_file(self, html, post):
        """Add file fields (url, filename, fsize, w, h) to 'post' in place

        Expects 'html' to contain '>File: <'; str.index() raises
        ValueError otherwise.
        """
        # Start extracting at the file marker
        extr = text.extract_from(html, html.index(">File: <"))
        post["url"] = extr('href="', '"')
        # Link text is the file name; drop its extension and URL-unquote it
        post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0])
        # Remaining info has the form '(<size>, <width>x<height>)'
        post["fsize"] = extr("(", ", ")
        post["w"] = text.parse_int(extr("", "x"))
        post["h"] = text.parse_int(extr("", ")"))


class _4chanarchivesBoardExtractor(Extractor):
    """Extractor for boards on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "board"
    root = "https://4chanarchives.com"
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)(?:/(\d+))?/?$"
    example = "https://4chanarchives.com/board/a/"

    def __init__(self, match):
        """Store the board name and optional page number from 'pattern'"""
        Extractor.__init__(self, match)
        self.board, self.page = match.groups()

    def items(self):
        """Yield a Queue message for every thread URL on each board page

        Starts at the page number given in the URL (default 1) and keeps
        requesting the next page until one yields no thread links.
        """
        data = {"_extractor": _4chanarchivesThreadExtractor}
        pnum = text.parse_int(self.page, 1)
        # NOTE(review): the whitespace between these two tags must match
        # the site's markup exactly; verify the literal against a live page.
        needle = '''<span class="postNum desktop">
<span><a href="'''

        while True:
            url = f"{self.root}/board/{self.board}/{pnum}"
            page = self.request(url).text

            # 'thread' stays None only if the page had no thread links
            thread = None
            for thread in text.extract_iter(page, needle, '"'):
                yield Message.Queue, thread, data

            if thread is None:
                return
            pnum += 1