GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/common.py
# -*- coding: utf-8 -*-

# Copyright 2014-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Common classes and constants used by extractor modules."""

import os
import re
import ssl
import time
import netrc
import queue
import random
import getpass
import logging
import requests
import threading
from datetime import datetime
from xml.etree import ElementTree
from requests.adapters import HTTPAdapter
from .message import Message
from .. import config, output, text, util, cache, exception

urllib3 = requests.packages.urllib3


class Extractor():

    category = ""
    subcategory = ""
    basecategory = ""
    categorytransfer = False
    directory_fmt = ("{category}",)
    filename_fmt = "{filename}.{extension}"
    archive_fmt = ""
    status = 0
    root = ""
    cookies_domain = ""
    cookies_index = 0
    referer = True
    ciphers = None
    tls12 = True
    browser = None
    useragent = util.USERAGENT_FIREFOX
    request_interval = 0.0
    request_interval_min = 0.0
    request_interval_429 = 60.0
    request_timestamp = 0.0

    def __init__(self, match):
        self.log = logging.getLogger(self.category)
        self.url = match.string
        self.match = match
        self.groups = match.groups()
        self.kwdict = {}

        if self.category in CATEGORY_MAP:
            catsub = f"{self.category}:{self.subcategory}"
            if catsub in CATEGORY_MAP:
                self.category, self.subcategory = CATEGORY_MAP[catsub]
            else:
                self.category = CATEGORY_MAP[self.category]

        self._cfgpath = ("extractor", self.category, self.subcategory)
        self._parentdir = ""

    @classmethod
    def from_url(cls, url):
        if isinstance(cls.pattern, str):
            cls.pattern = util.re_compile(cls.pattern)
        match = cls.pattern.match(url)
        return cls(match) if match else None

    def __iter__(self):
        self.initialize()
        return self.items()

    def initialize(self):
        self._init_options()
        self._init_session()
        self._init_cookies()
        self._init()
        self.initialize = util.noop

    def finalize(self):
        pass

    def items(self):
        yield Message.Version, 1

    def skip(self, num):
        return 0

    def config(self, key, default=None):
        return config.interpolate(self._cfgpath, key, default)

    def config2(self, key, key2, default=None, sentinel=util.SENTINEL):
        value = self.config(key, sentinel)
        if value is not sentinel:
            return value
        return self.config(key2, default)
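
    # Illustrative sketch (not part of the original module): 'config2'
    # checks a primary option name before falling back to a secondary one.
    # Assuming a hypothetical setup where only "sleep" is configured:
    #
    #     extr.config2("sleep-request", "sleep", 1.0)
    #     # -> looks up "sleep-request" first; since it is unset, returns
    #     #    the value of "sleep", or 1.0 if both are unset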

    def config_deprecated(self, key, deprecated, default=None,
                          sentinel=util.SENTINEL, history=set()):
        value = self.config(deprecated, sentinel)
        if value is not sentinel:
            if deprecated not in history:
                history.add(deprecated)
                self.log.warning("'%s' is deprecated. Use '%s' instead.",
                                 deprecated, key)
            default = value

        value = self.config(key, sentinel)
        if value is not sentinel:
            return value
        return default

    def config_accumulate(self, key):
        return config.accumulate(self._cfgpath, key)

    def config_instance(self, key, default=None):
        return default

    def _config_shared(self, key, default=None):
        return config.interpolate_common(
            ("extractor",), self._cfgpath, key, default)

    def _config_shared_accumulate(self, key):
        first = True
        extr = ("extractor",)

        for path in self._cfgpath:
            if first:
                first = False
                values = config.accumulate(extr + path, key)
            elif conf := config.get(extr, path[0]):
                values[:0] = config.accumulate(
                    (self.subcategory,), key, conf=conf)

        return values

    def request(self, url, method="GET", session=None, fatal=True,
                retries=None, retry_codes=None, expected=(), interval=True,
                encoding=None, notfound=None, **kwargs):
        if session is None:
            session = self.session
        if retries is None:
            retries = self._retries
        if retry_codes is None:
            retry_codes = self._retry_codes
        if "proxies" not in kwargs:
            kwargs["proxies"] = self._proxies
        if "timeout" not in kwargs:
            kwargs["timeout"] = self._timeout
        if "verify" not in kwargs:
            kwargs["verify"] = self._verify

        if "json" in kwargs:
            if (json := kwargs["json"]) is not None:
                kwargs["data"] = util.json_dumps(json).encode()
                del kwargs["json"]
                if headers := kwargs.get("headers"):
                    headers["Content-Type"] = "application/json"
                else:
                    kwargs["headers"] = {"Content-Type": "application/json"}

        response = challenge = None
        tries = 1

        if self._interval and interval:
            seconds = (self._interval() -
                       (time.time() - Extractor.request_timestamp))
            if seconds > 0.0:
                self.sleep(seconds, "request")

        while True:
            try:
                response = session.request(method, url, **kwargs)
            except requests.exceptions.ConnectionError as exc:
                try:
                    reason = exc.args[0].reason
                    cls = reason.__class__.__name__
                    pre, _, err = str(reason.args[-1]).partition(":")
                    msg = f" {cls}: {(err or pre).lstrip()}"
                except Exception:
                    msg = exc
                code = 0
            except (requests.exceptions.Timeout,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ContentDecodingError) as exc:
                msg = exc
                code = 0
            except (requests.exceptions.RequestException) as exc:
                msg = exc
                break
            else:
                code = response.status_code
                if self._write_pages:
                    self._dump_response(response)
                if (
                    code < 400 or
                    code in expected or
                    code < 500 and (
                        not fatal and code != 429 or fatal is None) or
                    fatal is ...
                ):
                    if encoding:
                        response.encoding = encoding
                    return response
                if notfound and code == 404:
                    self.status |= exception.NotFoundError.code
                    raise exception.NotFoundError(notfound)

                msg = f"'{code} {response.reason}' for '{response.url}'"

                challenge = util.detect_challenge(response)
                if challenge is not None:
                    self.log.warning(challenge)

                if code == 429 and self._handle_429(response):
                    continue
                elif code == 429 and self._interval_429:
                    pass
                elif code not in retry_codes and code < 500:
                    break

            finally:
                Extractor.request_timestamp = time.time()

            self.log.debug("%s (%s/%s)", msg, tries, retries+1)
            if tries > retries:
                break

            seconds = tries
            if self._interval:
                s = self._interval()
                if seconds < s:
                    seconds = s
            if code == 429 and self._interval_429:
                s = self._interval_429()
                if seconds < s:
                    seconds = s
                self.wait(seconds=seconds, reason="429 Too Many Requests")
            else:
                self.sleep(seconds, "retry")
            tries += 1

        if not fatal or fatal is ...:
            self.log.warning(msg)
            return util.NullResponse(url, msg)

        if challenge is None:
            exc = exception.HttpError(msg, response)
        else:
            exc = exception.ChallengeError(challenge, response)
        self.status |= exc.code
        raise exc
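
    # Illustrative sketch (not part of the original module) of how the
    # keyword arguments above interact; 'post_url' is a hypothetical URL:
    #
    #     # raise NotFoundError("post") on a 404 instead of HttpError
    #     response = self.request(post_url, notfound="post")
    #
    #     # treat 403 as an expected status and return the response as-is
    #     response = self.request(post_url, expected=(403,))
    #
    #     # log failures instead of raising; after exhausting retries,
    #     # a util.NullResponse is returned
    #     response = self.request(post_url, fatal=False)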

    def request_location(self, url, **kwargs):
        kwargs.setdefault("method", "HEAD")
        kwargs.setdefault("allow_redirects", False)
        return self.request(url, **kwargs).headers.get("location", "")

    def request_json(self, url, **kwargs):
        response = self.request(url, **kwargs)

        try:
            return util.json_loads(response.text)
        except Exception as exc:
            fatal = kwargs.get("fatal", True)
            if not fatal or fatal is ...:
                if challenge := util.detect_challenge(response):
                    self.log.warning(challenge)
                else:
                    self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return {}
            raise

    def request_xml(self, url, xmlns=True, **kwargs):
        response = self.request(url, **kwargs)

        if xmlns:
            text = response.text
        else:
            text = response.text.replace(" xmlns=", " ns=")

        parser = ElementTree.XMLParser()
        try:
            parser.feed(text)
            return parser.close()
        except Exception as exc:
            fatal = kwargs.get("fatal", True)
            if not fatal or fatal is ...:
                if challenge := util.detect_challenge(response):
                    self.log.warning(challenge)
                else:
                    self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return ElementTree.Element("")
            raise
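
    # Illustrative sketch (not part of the original module): both helpers
    # degrade gracefully when 'fatal' is falsy, returning an empty object
    # instead of raising. 'api_url' is a hypothetical endpoint:
    #
    #     data = self.request_json(api_url, fatal=False)
    #     # -> {} if the response is not valid JSON
    #
    #     root = self.request_xml(api_url, xmlns=False, fatal=False)
    #     # -> empty Element if the response is not well-formed XML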

    _handle_429 = util.false

    def wait(self, seconds=None, until=None, adjust=1.0,
             reason="rate limit"):
        now = time.time()

        if seconds:
            seconds = float(seconds)
            until = now + seconds
        elif until:
            if isinstance(until, datetime):
                # convert to UTC timestamp
                until = util.datetime_to_timestamp(until)
            else:
                until = float(until)
            seconds = until - now
        else:
            raise ValueError("Either 'seconds' or 'until' is required")

        seconds += adjust
        if seconds <= 0.0:
            return

        if reason:
            t = datetime.fromtimestamp(until).time()
            isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
            self.log.info("Waiting until %s (%s)", isotime, reason)
        time.sleep(seconds)
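
    # Illustrative sketch (not part of the original module), assuming a
    # hypothetical 'X-RateLimit-Reset' header carrying a Unix timestamp:
    #
    #     reset = response.headers.get("X-RateLimit-Reset")
    #     if reset:
    #         self.wait(until=float(reset), reason="rate limit reset")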

    def sleep(self, seconds, reason):
        self.log.debug("Sleeping %.2f seconds (%s)",
                       seconds, reason)
        time.sleep(seconds)

    def input(self, prompt, echo=True):
        self._check_input_allowed(prompt)

        if echo:
            try:
                return input(prompt)
            except (EOFError, OSError):
                return None
        else:
            return getpass.getpass(prompt)

    def _check_input_allowed(self, prompt=""):
        input = self.config("input")
        if input is None:
            input = output.TTY_STDIN
        if not input:
            raise exception.AbortExtraction(
                f"User input required ({prompt.strip(' :')})")

    def _get_auth_info(self):
        """Return authentication information as (username, password) tuple"""
        username = self.config("username")
        password = None

        if username:
            password = self.config("password")
            if not password:
                self._check_input_allowed("password")
                password = util.LazyPrompt()

        elif self.config("netrc", False):
            try:
                info = netrc.netrc().authenticators(self.category)
                username, _, password = info
            except (OSError, netrc.NetrcParseError) as exc:
                self.log.error("netrc: %s", exc)
            except TypeError:
                self.log.warning("netrc: No authentication info")

        return username, password
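
    # Illustrative configuration sketch (not part of the original module):
    # with the following in a user's config file, _get_auth_info() returns
    # ("someuser", <LazyPrompt>) and prompts for the password on first use:
    #
    #     {"extractor": {"<category>": {"username": "someuser"}}}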

    def _init(self):
        pass

    def _init_options(self):
        self._write_pages = self.config("write-pages", False)
        self._retry_codes = self.config("retry-codes")
        self._retries = self.config("retries", 4)
        self._timeout = self.config("timeout", 30)
        self._verify = self.config("verify", True)
        self._proxies = util.build_proxy_map(self.config("proxy"), self.log)
        self._interval = util.build_duration_func(
            self.config("sleep-request", self.request_interval),
            self.request_interval_min,
        )
        self._interval_429 = util.build_duration_func(
            self.config("sleep-429", self.request_interval_429),
        )

        if self._retries < 0:
            self._retries = float("inf")
        if not self._retry_codes:
            self._retry_codes = ()

    def _init_session(self):
        self.session = session = requests.Session()
        headers = session.headers
        headers.clear()
        ssl_options = ssl_ciphers = 0

        # .netrc Authorization headers are always disabled
        session.trust_env = True if self.config("proxy-env", True) else False

        browser = self.config("browser")
        if browser is None:
            browser = self.browser
        if browser and isinstance(browser, str):
            browser, _, platform = browser.lower().partition(":")

            if not platform or platform == "auto":
                platform = ("Windows NT 10.0; Win64; x64"
                            if util.WINDOWS else "X11; Linux x86_64")
            elif platform == "windows":
                platform = "Windows NT 10.0; Win64; x64"
            elif platform == "linux":
                platform = "X11; Linux x86_64"
            elif platform == "macos":
                platform = "Macintosh; Intel Mac OS X 15.5"

            if browser == "chrome":
                if platform.startswith("Macintosh"):
                    platform = platform.replace(".", "_")
            else:
                browser = "firefox"

            for key, value in HEADERS[browser]:
                if value and "{}" in value:
                    headers[key] = value.replace("{}", platform)
                else:
                    headers[key] = value

            ssl_options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
            ssl_ciphers = CIPHERS[browser]
        else:
            headers["User-Agent"] = self.useragent
            headers["Accept"] = "*/*"
            headers["Accept-Language"] = "en-US,en;q=0.5"

            ssl_ciphers = self.ciphers
            if ssl_ciphers is not None and ssl_ciphers in CIPHERS:
                ssl_ciphers = CIPHERS[ssl_ciphers]

        if BROTLI:
            headers["Accept-Encoding"] = "gzip, deflate, br"
        else:
            headers["Accept-Encoding"] = "gzip, deflate"
        if ZSTD:
            headers["Accept-Encoding"] += ", zstd"

        if referer := self.config("referer", self.referer):
            if isinstance(referer, str):
                headers["Referer"] = referer
            elif self.root:
                headers["Referer"] = self.root + "/"

        custom_ua = self.config("user-agent")
        if not custom_ua or custom_ua == "auto":
            pass
        elif custom_ua == "browser":
            headers["User-Agent"] = _browser_useragent(None)
        elif custom_ua[0] == "@":
            headers["User-Agent"] = _browser_useragent(custom_ua[1:])
        elif self.useragent is Extractor.useragent and not self.browser or \
                custom_ua is not config.get(("extractor",), "user-agent"):
            headers["User-Agent"] = custom_ua

        if custom_headers := self.config("headers"):
            if isinstance(custom_headers, str):
                if custom_headers in HEADERS:
                    custom_headers = HEADERS[custom_headers]
                else:
                    self.log.error("Invalid 'headers' value '%s'",
                                   custom_headers)
                    custom_headers = ()
            headers.update(custom_headers)

        if custom_ciphers := self.config("ciphers"):
            if isinstance(custom_ciphers, list):
                ssl_ciphers = ":".join(custom_ciphers)
            elif custom_ciphers in CIPHERS:
                ssl_ciphers = CIPHERS[custom_ciphers]
            else:
                ssl_ciphers = custom_ciphers

        if source_address := self.config("source-address"):
            if isinstance(source_address, str):
                source_address = (source_address, 0)
            else:
                source_address = (source_address[0], source_address[1])

        tls12 = self.config("tls12")
        if tls12 is None:
            tls12 = self.tls12
        if not tls12:
            ssl_options |= ssl.OP_NO_TLSv1_2
            self.log.debug("TLS 1.2 disabled.")

        if self.config("truststore"):
            try:
                from truststore import SSLContext as ssl_ctx
            except ImportError as exc:
                self.log.error("%s: %s", exc.__class__.__name__, exc)
                ssl_ctx = None
        else:
            ssl_ctx = None

        adapter = _build_requests_adapter(
            ssl_options, ssl_ciphers, ssl_ctx, source_address)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
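
    # Illustrative sketch (not part of the original module): the "browser"
    # option above selects a header/cipher profile plus an optional
    # platform, in "browser[:platform]" form, e.g.
    #
    #     {"extractor": {"browser": "chrome:windows"}}
    #
    # sends the HEADERS_CHROMIUM_138 set with a "Windows NT 10.0; Win64;
    # x64" platform string; unknown browser names fall back to "firefox".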

    def _init_cookies(self):
        """Populate the session's cookiejar"""
        self.cookies = self.session.cookies
        self.cookies_file = None
        if self.cookies_domain is None:
            return

        if cookies := self.config("cookies"):
            if select := self.config("cookies-select"):
                if select == "rotate":
                    cookies = cookies[self.cookies_index % len(cookies)]
                    Extractor.cookies_index += 1
                else:
                    cookies = random.choice(cookies)
            self.cookies_load(cookies)

    def cookies_load(self, cookies_source):
        if isinstance(cookies_source, dict):
            self.cookies_update_dict(cookies_source, self.cookies_domain)

        elif isinstance(cookies_source, str):
            path = util.expand_path(cookies_source)
            try:
                with open(path) as fp:
                    cookies = util.cookiestxt_load(fp)
            except ValueError as exc:
                self.log.warning("cookies: Invalid Netscape cookies.txt file "
                                 "'%s' (%s: %s)",
                                 cookies_source, exc.__class__.__name__, exc)
            except Exception as exc:
                self.log.warning("cookies: Failed to load '%s' (%s: %s)",
                                 cookies_source, exc.__class__.__name__, exc)
            else:
                self.log.debug("cookies: Loading cookies from '%s'",
                               cookies_source)
                set_cookie = self.cookies.set_cookie
                for cookie in cookies:
                    set_cookie(cookie)
                self.cookies_file = path

        elif isinstance(cookies_source, (list, tuple)):
            key = tuple(cookies_source)
            cookies = CACHE_COOKIES.get(key)

            if cookies is None:
                from ..cookies import load_cookies
                try:
                    cookies = load_cookies(cookies_source)
                except Exception as exc:
                    self.log.warning("cookies: %s", exc)
                    cookies = ()
                else:
                    CACHE_COOKIES[key] = cookies
            else:
                self.log.debug("cookies: Using cached cookies from %s", key)

            set_cookie = self.cookies.set_cookie
            for cookie in cookies:
                set_cookie(cookie)

        else:
            self.log.error(
                "cookies: Expected 'dict', 'list', or 'str' value for "
                "'cookies' option, got '%s' instead (%r)",
                cookies_source.__class__.__name__, cookies_source)

    def cookies_store(self):
        """Store the session's cookies in a cookies.txt file"""
        export = self.config("cookies-update", True)
        if not export:
            return

        if isinstance(export, str):
            path = util.expand_path(export)
        else:
            path = self.cookies_file
            if not path:
                return

        path_tmp = path + ".tmp"
        try:
            with open(path_tmp, "w") as fp:
                util.cookiestxt_store(fp, self.cookies)
            os.replace(path_tmp, path)
        except OSError as exc:
            self.log.error("cookies: Failed to write to '%s' "
                           "(%s: %s)", path, exc.__class__.__name__, exc)

    def cookies_update(self, cookies, domain=""):
        """Update the session's cookiejar with 'cookies'"""
        if isinstance(cookies, dict):
            self.cookies_update_dict(cookies, domain or self.cookies_domain)
        else:
            set_cookie = self.cookies.set_cookie
            try:
                cookies = iter(cookies)
            except TypeError:
                set_cookie(cookies)
            else:
                for cookie in cookies:
                    set_cookie(cookie)

    def cookies_update_dict(self, cookiedict, domain):
        """Update cookiejar with name-value pairs from a dict"""
        set_cookie = self.cookies.set
        for name, value in cookiedict.items():
            set_cookie(name, value, domain=domain)

    def cookies_check(self, cookies_names, domain=None, subdomains=False):
        """Check if all 'cookies_names' are in the session's cookiejar"""
        if not self.cookies:
            return False

        if domain is None:
            domain = self.cookies_domain
        names = set(cookies_names)
        now = time.time()

        for cookie in self.cookies:
            if cookie.name not in names:
                continue

            if not domain or cookie.domain == domain:
                pass
            elif not subdomains or not cookie.domain.endswith(domain):
                continue

            if cookie.expires:
                diff = int(cookie.expires - now)

                if diff <= 0:
                    self.log.warning(
                        "cookies: %s/%s expired at %s",
                        cookie.domain.lstrip("."), cookie.name,
                        datetime.fromtimestamp(cookie.expires))
                    continue

                elif diff <= 86400:
                    hours = diff // 3600
                    self.log.warning(
                        "cookies: %s/%s will expire in less than %s hour%s",
                        cookie.domain.lstrip("."), cookie.name,
                        hours + 1, "s" if hours else "")

            names.discard(cookie.name)
            if not names:
                return True
        return False
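
    # Illustrative sketch (not part of the original module), assuming a
    # hypothetical extractor with cookies_domain = ".example.org":
    #
    #     if not self.cookies_check(("sessionid",)):
    #         self.log.warning("no valid 'sessionid' cookie set")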

    def _extract_jsonld(self, page):
        return util.json_loads(text.extr(
            page, '<script type="application/ld+json">', "</script>"))

    def _extract_nextdata(self, page):
        return util.json_loads(text.extr(
            page, ' id="__NEXT_DATA__" type="application/json">', "</script>"))

    def _cache(self, func, maxage, keyarg=None):
        # return cache.DatabaseCacheDecorator(func, maxage, keyarg)
        return cache.DatabaseCacheDecorator(func, keyarg, maxage)

    def _cache_memory(self, func, maxage=None, keyarg=None):
        return cache.Memcache()

    def _get_date_min_max(self, dmin=None, dmax=None):
        """Retrieve and parse 'date-min' and 'date-max' config values"""
        def get(key, default):
            ts = self.config(key, default)
            if isinstance(ts, str):
                try:
                    ts = int(datetime.strptime(ts, fmt).timestamp())
                except ValueError as exc:
                    self.log.warning("Unable to parse '%s': %s", key, exc)
                    ts = default
            return ts
        fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S")
        return get("date-min", dmin), get("date-max", dmax)

    @classmethod
    def _dump(cls, obj):
        util.dump_json(obj, ensure_ascii=False, indent=2)

    def _dump_response(self, response, history=True):
        """Write the response content to a .txt file in the current directory.

        The file name is derived from the response url,
        replacing special characters with "_"
        """
        if history:
            for resp in response.history:
                self._dump_response(resp, False)

        if hasattr(Extractor, "_dump_index"):
            Extractor._dump_index += 1
        else:
            Extractor._dump_index = 1
            Extractor._dump_sanitize = util.re_compile(
                r"[\\\\|/<>:\"?*&=#]+").sub

        fname = (f"{Extractor._dump_index:>02}_"
                 f"{Extractor._dump_sanitize('_', response.url)}")

        if util.WINDOWS:
            path = os.path.abspath(fname)[:255]
        else:
            path = fname[:251]

        try:
            with open(path + ".txt", 'wb') as fp:
                util.dump_response(
                    response, fp,
                    headers=(self._write_pages in ("all", "ALL")),
                    hide_auth=(self._write_pages != "ALL")
                )
            self.log.info("Writing '%s' response to '%s'",
                          response.url, path + ".txt")
        except Exception as e:
            self.log.warning("Failed to dump HTTP request (%s: %s)",
                             e.__class__.__name__, e)


class GalleryExtractor(Extractor):

    subcategory = "gallery"
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
    directory_fmt = ("{category}", "{gallery_id} {title}")
    archive_fmt = "{gallery_id}_{num}"
    enum = "num"

    def __init__(self, match, url=None):
        Extractor.__init__(self, match)

        if url is None and (path := self.groups[0]) and path[0] == "/":
            self.page_url = f"{self.root}{path}"
        else:
            self.page_url = url

    def items(self):
        self.login()

        if self.page_url:
            page = self.request(
                self.page_url, notfound=self.subcategory).text
        else:
            page = None

        data = self.metadata(page)
        imgs = self.images(page)
        assets = self.assets(page)

        if "count" in data:
            if self.config("page-reverse"):
                images = util.enumerate_reversed(imgs, 1, data["count"])
            else:
                images = zip(
                    range(1, data["count"]+1),
                    imgs,
                )
        else:
            enum = enumerate
            try:
                data["count"] = len(imgs)
            except TypeError:
                pass
            else:
                if self.config("page-reverse"):
                    enum = util.enumerate_reversed
            images = enum(imgs, 1)

        yield Message.Directory, data
        enum_key = self.enum

        if assets:
            for asset in assets:
                url = asset["url"]
                asset.update(data)
                asset[enum_key] = 0
                if "extension" not in asset:
                    text.nameext_from_url(url, asset)
                yield Message.Url, url, asset

        for data[enum_key], (url, imgdata) in images:
            if imgdata:
                data.update(imgdata)
                if "extension" not in imgdata:
                    text.nameext_from_url(url, data)
            else:
                text.nameext_from_url(url, data)
            yield Message.Url, url, data

    def login(self):
        """Login and set necessary cookies"""

    def metadata(self, page):
        """Return a dict with general metadata"""

    def images(self, page):
        """Return a list or iterable of all (image-url, metadata)-tuples"""

    def assets(self, page):
        """Return an iterable of additional gallery assets

        Each asset must be a 'dict' containing at least 'url' and 'type'
        """


class ChapterExtractor(GalleryExtractor):

    subcategory = "chapter"
    directory_fmt = (
        "{category}", "{manga}",
        "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
    filename_fmt = (
        "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
    archive_fmt = (
        "{manga}_{chapter}{chapter_minor}_{page}")
    enum = "page"


class MangaExtractor(Extractor):

    subcategory = "manga"
    categorytransfer = True
    chapterclass = None
    reverse = True

    def __init__(self, match, url=None):
        Extractor.__init__(self, match)

        if url is None and (path := self.groups[0]) and path[0] == "/":
            self.page_url = f"{self.root}{path}"
        else:
            self.page_url = url

        if self.config("chapter-reverse", False):
            self.reverse = not self.reverse

    def items(self):
        self.login()

        if self.page_url:
            page = self.request(self.page_url, notfound=self.subcategory).text
        else:
            page = None

        chapters = self.chapters(page)
        if self.reverse:
            chapters.reverse()

        for chapter, data in chapters:
            data["_extractor"] = self.chapterclass
            yield Message.Queue, chapter, data

    def login(self):
        """Login and set necessary cookies"""

    def chapters(self, page):
        """Return a list of all (chapter-url, metadata)-tuples"""


class Dispatch():
    subcategory = "user"
    cookies_domain = None
    finalize = Extractor.finalize
    skip = Extractor.skip

    def __iter__(self):
        return self.items()

    def initialize(self):
        pass

    def _dispatch_extractors(self, extractor_data, default=(), alt=None):
        extractors = {
            data[0].subcategory: data
            for data in extractor_data
        }

        if alt is not None:
            for sub, sub_alt in alt:
                extractors[sub_alt] = extractors[sub]

        include = self.config("include", default) or ()
        if include == "all":
            include = extractors
        elif isinstance(include, str):
            include = include.replace(" ", "").split(",")

        results = [(Message.Version, 1)]
        for category in include:
            try:
                extr, url = extractors[category]
            except KeyError:
                self.log.warning("Invalid include '%s'", category)
            else:
                results.append((Message.Queue, url, {"_extractor": extr}))
        return iter(results)
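
    # Illustrative sketch (not part of the original module): a Dispatch
    # subclass typically forwards to sibling extractors; the extractor
    # classes and URL layout here are hypothetical:
    #
    #     def items(self):
    #         base = f"{self.root}/user/{self.groups[0]}/"
    #         return self._dispatch_extractors((
    #             (FooGalleryExtractor, base + "gallery"),
    #             (FooPostsExtractor,   base + "posts"),
    #         ), ("posts",))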


class AsynchronousMixin():
    """Run info extraction in a separate thread"""

    def __iter__(self):
        self.initialize()

        messages = queue.Queue(5)
        thread = threading.Thread(
            target=self.async_items,
            args=(messages,),
            daemon=True,
        )

        thread.start()
        while True:
            msg = messages.get()
            if msg is None:
                thread.join()
                return
            if isinstance(msg, Exception):
                thread.join()
                raise msg
            yield msg
            messages.task_done()

    def async_items(self, messages):
        try:
            for msg in self.items():
                messages.put(msg)
        except Exception as exc:
            messages.put(exc)
        messages.put(None)
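
# Illustrative sketch (not part of the original module): the mixin is
# combined with a concrete extractor via multiple inheritance so that
# items() runs in a worker thread ('FooExtractor' is hypothetical):
#
#     class FooAsyncExtractor(AsynchronousMixin, FooExtractor):
#         pass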


class BaseExtractor(Extractor):
    instances = ()

    def __init__(self, match):
        if not self.category:
            self.groups = match.groups()
            self.match = match
            self._init_category()
        Extractor.__init__(self, match)

    def _init_category(self):
        for index, group in enumerate(self.groups):
            if group is not None:
                if index:
                    self.category, self.root, info = self.instances[index-1]
                    if not self.root:
                        self.root = text.root_from_url(self.match[0])
                    self.config_instance = info.get
                else:
                    self.root = group
                    self.category = group.partition("://")[2]
                break

    @classmethod
    def update(cls, instances):
        if extra_instances := config.get(("extractor",), cls.basecategory):
            for category, info in extra_instances.items():
                if isinstance(info, dict) and "root" in info:
                    instances[category] = info

        pattern_list = []
        instance_list = cls.instances = []
        for category, info in instances.items():
            if root := info["root"]:
                root = root.rstrip("/")
            instance_list.append((category, root, info))

            pattern = info.get("pattern")
            if not pattern:
                pattern = re.escape(root[root.index(":") + 3:])
            pattern_list.append(pattern + "()")

        return (
            r"(?:" + cls.basecategory + r":(https?://[^/?#]+)|"
            r"(?:https?://)?(?:" + "|".join(pattern_list) + r"))"
        )
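
    # Illustrative sketch (not part of the original module): for two
    # hypothetical instances
    #
    #     {"foo": {"root": "https://foo.example"},
    #      "bar": {"root": "https://bar.example"}}
    #
    # update() would produce a pattern roughly like
    #
    #     (?:<basecategory>:(https?://[^/?#]+)|
    #        (?:https?://)?(?:foo\.example()|bar\.example()))
    #
    # where the empty groups mark which instance matched
    # (see _init_category above).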


class RequestsAdapter(HTTPAdapter):

    def __init__(self, ssl_context=None, source_address=None):
        self.ssl_context = ssl_context
        self.source_address = source_address
        HTTPAdapter.__init__(self)

    def init_poolmanager(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        kwargs["source_address"] = self.source_address
        return HTTPAdapter.init_poolmanager(self, *args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        kwargs["source_address"] = self.source_address
        return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)


def _build_requests_adapter(
        ssl_options, ssl_ciphers, ssl_ctx, source_address):

    key = (ssl_options, ssl_ciphers, ssl_ctx, source_address)
    try:
        return CACHE_ADAPTERS[key]
    except KeyError:
        pass

    if ssl_options or ssl_ciphers or ssl_ctx:
        if ssl_ctx is None:
            ssl_context = urllib3.connection.create_urllib3_context(
                options=ssl_options or None, ciphers=ssl_ciphers)
            if not requests.__version__ < "2.32":
                # https://github.com/psf/requests/pull/6731
                ssl_context.load_verify_locations(requests.certs.where())
        else:
            ssl_ctx_orig = urllib3.util.ssl_.SSLContext
            try:
                urllib3.util.ssl_.SSLContext = ssl_ctx
                ssl_context = urllib3.connection.create_urllib3_context(
                    options=ssl_options or None, ciphers=ssl_ciphers)
            finally:
                urllib3.util.ssl_.SSLContext = ssl_ctx_orig
        ssl_context.check_hostname = False
    else:
        ssl_context = None

    adapter = CACHE_ADAPTERS[key] = RequestsAdapter(
        ssl_context, source_address)
    return adapter
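
# Note (not part of the original module): adapters are cached by their
# full (ssl_options, ssl_ciphers, ssl_ctx, source_address) key, so
# extractors sharing the same TLS profile reuse one adapter and its
# connection-pool configuration instead of rebuilding an SSLContext
# per session.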


@cache.cache(maxage=86400, keyarg=0)
def _browser_useragent(browser):
    """Get User-Agent header from default browser"""
    import webbrowser
    try:
        open = webbrowser.get(browser).open
    except webbrowser.Error:
        if not browser:
            raise
        import shutil
        if not (browser := shutil.which(browser)):
            raise

        def open(url):
            util.Popen((browser, url),
                       start_new_session=False if util.WINDOWS else True)

    import socket
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(("127.0.0.1", 0))
    server.listen(1)

    host, port = server.getsockname()
    open(f"http://{host}:{port}/user-agent")

    client = server.accept()[0]
    server.close()

    for line in client.recv(1024).split(b"\r\n"):
        key, _, value = line.partition(b":")
        if key.strip().lower() == b"user-agent":
            useragent = value.strip()
            break
    else:
        useragent = b""

    client.send(b"HTTP/1.1 200 OK\r\n\r\n" + useragent)
    client.close()

    return useragent.decode()
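
# Note (not part of the original module): the function above determines
# the browser's User-Agent by listening on an ephemeral loopback port,
# opening http://127.0.0.1:<port>/user-agent in the browser, and reading
# the User-Agent header from the incoming request; results are cached
# for one day via @cache.cache(maxage=86400).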


CACHE_ADAPTERS = {}
CACHE_COOKIES = {}
CATEGORY_MAP = ()


HEADERS_FIREFOX_140 = (
    ("User-Agent", "Mozilla/5.0 ({}; rv:140.0) Gecko/20100101 Firefox/140.0"),
    ("Accept", "text/html,application/xhtml+xml,"
               "application/xml;q=0.9,*/*;q=0.8"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("Accept-Encoding", None),
    ("Connection", "keep-alive"),
    ("Content-Type", None),
    ("Content-Length", None),
    ("Referer", None),
    ("Origin", None),
    ("Cookie", None),
    ("Sec-Fetch-Dest", "empty"),
    ("Sec-Fetch-Mode", "cors"),
    ("Sec-Fetch-Site", "same-origin"),
    ("TE", "trailers"),
)
HEADERS_FIREFOX_128 = (
    ("User-Agent", "Mozilla/5.0 ({}; rv:128.0) Gecko/20100101 Firefox/128.0"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("Accept-Encoding", None),
    ("Referer", None),
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
    ("Cookie", None),
    ("Sec-Fetch-Dest", "empty"),
    ("Sec-Fetch-Mode", "no-cors"),
    ("Sec-Fetch-Site", "same-origin"),
    ("TE", "trailers"),
)
HEADERS_CHROMIUM_138 = (
    ("Connection", "keep-alive"),
    ("sec-ch-ua", '"Not)A;Brand";v="8", "Chromium";v="138"'),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", '"Linux"'),
    ("Upgrade-Insecure-Requests", "1"),
    ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
                   "like Gecko) Chrome/138.0.0.0 Safari/537.36"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/avif,image/webp,image/apng,*/*;q=0.8,"
               "application/signed-exchange;v=b3;q=0.7"),
    ("Referer", None),
    ("Sec-Fetch-Site", "same-origin"),
    ("Sec-Fetch-Mode", "no-cors"),
    # ("Sec-Fetch-User", "?1"),
    ("Sec-Fetch-Dest", "empty"),
    ("Accept-Encoding", None),
    ("Accept-Language", "en-US,en;q=0.9"),
)
HEADERS_CHROMIUM_111 = (
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
    ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
                   "like Gecko) Chrome/111.0.0.0 Safari/537.36"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/avif,image/webp,image/apng,*/*;q=0.8,"
               "application/signed-exchange;v=b3;q=0.7"),
    ("Referer", None),
    ("Sec-Fetch-Site", "same-origin"),
    ("Sec-Fetch-Mode", "no-cors"),
    ("Sec-Fetch-Dest", "empty"),
    ("Accept-Encoding", None),
    ("Accept-Language", "en-US,en;q=0.9"),
    ("cookie", None),
    ("content-length", None),
)
HEADERS = {
    "firefox"    : HEADERS_FIREFOX_140,
    "firefox/140": HEADERS_FIREFOX_140,
    "firefox/128": HEADERS_FIREFOX_128,
    "chrome"     : HEADERS_CHROMIUM_138,
    "chrome/138" : HEADERS_CHROMIUM_138,
    "chrome/111" : HEADERS_CHROMIUM_111,
}

CIPHERS_FIREFOX = (
    "TLS_AES_128_GCM_SHA256:"
    "TLS_CHACHA20_POLY1305_SHA256:"
    "TLS_AES_256_GCM_SHA384:"
    "ECDHE-ECDSA-AES128-GCM-SHA256:"
    "ECDHE-RSA-AES128-GCM-SHA256:"
    "ECDHE-ECDSA-CHACHA20-POLY1305:"
    "ECDHE-RSA-CHACHA20-POLY1305:"
    "ECDHE-ECDSA-AES256-GCM-SHA384:"
    "ECDHE-RSA-AES256-GCM-SHA384:"
    "ECDHE-ECDSA-AES256-SHA:"
    "ECDHE-ECDSA-AES128-SHA:"
    "ECDHE-RSA-AES128-SHA:"
    "ECDHE-RSA-AES256-SHA:"
    "AES128-GCM-SHA256:"
    "AES256-GCM-SHA384:"
    "AES128-SHA:"
    "AES256-SHA"
)
CIPHERS_CHROMIUM = (
    "TLS_AES_128_GCM_SHA256:"
    "TLS_AES_256_GCM_SHA384:"
    "TLS_CHACHA20_POLY1305_SHA256:"
    "ECDHE-ECDSA-AES128-GCM-SHA256:"
    "ECDHE-RSA-AES128-GCM-SHA256:"
    "ECDHE-ECDSA-AES256-GCM-SHA384:"
    "ECDHE-RSA-AES256-GCM-SHA384:"
    "ECDHE-ECDSA-CHACHA20-POLY1305:"
    "ECDHE-RSA-CHACHA20-POLY1305:"
    "ECDHE-RSA-AES128-SHA:"
    "ECDHE-RSA-AES256-SHA:"
    "AES128-GCM-SHA256:"
    "AES256-GCM-SHA384:"
    "AES128-SHA:"
    "AES256-SHA"
)
CIPHERS = {
    "firefox"    : CIPHERS_FIREFOX,
    "firefox/140": CIPHERS_FIREFOX,
    "firefox/128": CIPHERS_FIREFOX,
    "chrome"     : CIPHERS_CHROMIUM,
    "chrome/138" : CIPHERS_CHROMIUM,
    "chrome/111" : CIPHERS_CHROMIUM,
}


# disable Basic Authorization header injection from .netrc data
try:
    requests.sessions.get_netrc_auth = lambda _: None
except Exception:
    pass

# detect brotli support
try:
    BROTLI = urllib3.response.brotli is not None
except AttributeError:
    BROTLI = False

# detect zstandard support
try:
    ZSTD = urllib3.response.HAS_ZSTD
except AttributeError:
    ZSTD = False

# set (urllib3) warnings filter
action = config.get((), "warnings", "default")
if action:
    try:
        import warnings
        warnings.simplefilter(action, urllib3.exceptions.HTTPWarning)
    except Exception:
        pass
del action