Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/bbc.py
8901 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2021-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://bbc.co.uk/"""
10
11
from .common import GalleryExtractor, Extractor, Message
12
from .. import text, util
13
14
# Shared URL prefix for bbc.co.uk programme pages (scheme and "www." are
# optional). NOTE: the capture group opened at "(/programmes/" is left
# unclosed on purpose; each extractor's `pattern` appends the remainder of
# the path and the closing ")".
BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"
15
16
17
class BbcGalleryExtractor(GalleryExtractor):
    """Extractor for a programme gallery on bbc.co.uk"""
    category = "bbc"
    root = "https://www.bbc.co.uk"
    directory_fmt = ("{category}", "{path:I}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{programme}_{num}"
    # Completes (and closes) the capture group opened in BASE_PATTERN.
    # The negative lookahead is meant to keep ".../galleries" listing URLs
    # away from this extractor.
    pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"
    example = "https://www.bbc.co.uk/programmes/PATH"

    def metadata(self, page):
        """Return gallery-level metadata parsed from the gallery page"""
        # presumably provided by the base class; expected to return a
        # mapping with an "itemListElement" list -- TODO confirm
        jsonld = self._extract_jsonld(page)

        # The title is whatever follows the last "</span>" inside <h1>.
        heading = text.extr(page, "<h1>", "</h1>")
        title = text.unescape(heading.rpartition("</span>")[2])

        description = text.unescape(text.extr(
            page, 'property="og:description" content="', '"'))

        # Fifth "/"-separated component of the page URL.
        programme = self.page_url.split("/")[4]

        names = (
            entry["name"]
            for entry in jsonld["itemListElement"]
        )

        return {
            "title": title,
            "description": description,
            "programme": programme,
            "path": list(util.unique_sequence(names)),
        }

    def images(self, page):
        """Return a list of (url, metadata) tuples for all gallery images"""
        # Requested width is rounded down to a multiple of 16;
        # a missing or falsy setting selects 1920.
        max_width = self.config("width")
        if max_width:
            max_width = max_width - max_width % 16
        else:
            max_width = 1920
        size_segment = f"/{max_width}xn/"

        def build(tag):
            # 'tag' is the attribute text of one gallery thumbnail element.
            thumb = text.extr(tag, 'data-image-src="', '"')
            return (
                # Swap the thumbnail size segment for the requested one.
                thumb.replace("/320x180_b/", size_segment),
                {
                    "title_image": text.unescape(text.extr(
                        tag, 'data-gallery-title="', '"')),
                    "synopsis": text.unescape(text.extr(
                        tag, 'data-gallery-synopsis="', '"')),
                    # Lazy generator of URLs at smaller widths.
                    "_fallback": self._fallback_urls(thumb, max_width),
                },
            )

        return [
            build(tag)
            for tag in text.extract_iter(
                page, 'class="gallery__thumbnail', ">")
        ]

    def _fallback_urls(self, src, max_width):
        """Yield URLs for 'src' at each known width below 'max_width'"""
        prefix, _, suffix = src.partition("/320x180_b/")
        for candidate in (1920, 1600, 1280, 976):
            if candidate >= max_width:
                continue
            yield f"{prefix}/{candidate}xn/{suffix}"
67
68
69
class BbcProgrammeExtractor(Extractor):
    """Extractor for all galleries of a bbc programme"""
    category = "bbc"
    subcategory = "programme"
    root = "https://www.bbc.co.uk"
    # Group 1 (opened in BASE_PATTERN) is the "/programmes/<id>/galleries"
    # path; group 2 is the optional "?page=N" number.
    pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
    example = "https://www.bbc.co.uk/programmes/ID/galleries"

    def items(self):
        """Queue every gallery linked from the programme's listing pages"""
        galleries_path, start_page = self.groups
        listing_url = self.root + galleries_path
        # Start at the page given in the URL, or page 1 if there is none.
        query = {"page": text.parse_int(start_page, 1)}
        # Queued URLs are to be handled by the gallery extractor.
        queue_data = {"_extractor": BbcGalleryExtractor}

        while True:
            html = self.request(listing_url, params=query).text

            for tail in text.extract_iter(
                    html, '<a href="https://www.bbc.co.uk/programmes/', '"'):
                yield (
                    Message.Queue,
                    f"https://www.bbc.co.uk/programmes/{tail}",
                    queue_data,
                )

            # Keep going only while the page advertises a "next" link.
            if 'rel="next"' in html:
                query["page"] += 1
                continue
            return
92
93