# -*- coding: utf-8 -*-

# Copyright 2014-2026 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Common classes and constants used by extractor modules."""

import os
import re
import ssl
import time
import netrc
import queue
import random
import getpass
import logging
import requests
import threading
from xml.etree import ElementTree
from requests.adapters import HTTPAdapter
from .message import Message
from .. import config, output, text, util, dt, cache, exception

urllib3 = requests.packages.urllib3


class Extractor():

    category = ""
    subcategory = ""
    basecategory = ""
    basesubcategory = ""
    categorytransfer = False
    parent = False
    directory_fmt = ("{category}",)
    filename_fmt = "{filename}.{extension}"
    archive_fmt = ""
    status = 0
    root = ""
    cookies_file = ""
    cookies_index = 0
    cookies_domain = ""
    session = None
    referer = True
    ciphers = None
    tls12 = True
    browser = None
    useragent = util.USERAGENT_FIREFOX
    geobypass = None
    request_interval = 0.0
    request_interval_min = 0.0
    request_interval_429 = 60.0
    request_timestamp = 0.0
    finalize = skip = None

    def __init__(self, match):
        self.log = logging.getLogger(self.category)
        self.url = match.string
        self.match = match
        self.groups = match.groups()
        self.kwdict = {}

        if self.category in CATEGORY_MAP:
            catsub = f"{self.category}:{self.subcategory}"
            if catsub in CATEGORY_MAP:
                self.category, self.subcategory = CATEGORY_MAP[catsub]
            else:
                self.category = CATEGORY_MAP[self.category]

        self.parse_datetime = dt.parse
        self.parse_datetime_iso = dt.parse_iso
        self.parse_timestamp = dt.parse_ts

        self._cfgpath = ("extractor", self.category, self.subcategory)
        self._parentdir = ""

    def __str__(self):
        return f"{self.__class__.__name__} <{self.url}>"

    @classmethod
    def from_url(cls, url):
        if isinstance(cls.pattern, str):
            cls.pattern = util.re_compile(cls.pattern)
        match = cls.pattern.match(url)
        return cls(match) if match else None

    def __iter__(self):
        self.initialize()
        return self.items()

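    # Usage sketch (illustrative): 'from_url()' compiles the subclass'
    # 'pattern' on first use and returns an instance only when the URL
    # matches; iterating the instance runs 'initialize()' + 'items()'.
    # 'SampleExtractor' is a hypothetical subclass, not part of this module:
    #
    #   extr = SampleExtractor.from_url("https://example.org/gallery/123")
    #   if extr is not None:
    #       for msg in extr:
    #           ...  # (Message.Directory / Message.Url / Message.Queue, ...)
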
    def initialize(self):
        self._init_options()

        if self.session is None:
            self._init_session()
            self.cookies = self.session.cookies
            if self.cookies_domain is not None:
                self._init_cookies()
        else:
            self.cookies = self.session.cookies

        self._init()
        self.initialize = util.noop

    def items(self):
        return
        yield

    def config(self, key, default=None):
        return config.interpolate(self._cfgpath, key, default)

    def config2(self, key, key2, default=None, sentinel=util.SENTINEL):
        value = self.config(key, sentinel)
        if value is not sentinel:
            return value
        return self.config(key2, default)

    def config_deprecated(self, key, deprecated, default=None,
                          sentinel=util.SENTINEL, history=set()):
        value = self.config(deprecated, sentinel)
        if value is not sentinel:
            if deprecated not in history:
                history.add(deprecated)
                self.log.warning("'%s' is deprecated. Use '%s' instead.",
                                 deprecated, key)
            default = value

        value = self.config(key, sentinel)
        if value is not sentinel:
            return value
        return default

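    # Example (illustrative): option lookup order. With a config file like
    #
    #   {"extractor": {"mysite": {"gallery": {"sleep-request": 2.0}}}}
    #
    # an extractor whose '_cfgpath' is ("extractor", "mysite", "gallery")
    # resolves self.config("sleep-request") to 2.0 via config.interpolate(),
    # falling back to the "mysite" and "extractor" levels when the key is
    # absent at the subcategory level. config2() tries 'key' first and only
    # then 'key2'; config_deprecated() does the same but warns once per run
    # while the deprecated name is still set. ("mysite" is a placeholder
    # category, not a real module.)
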
    def config_accumulate(self, key):
        return config.accumulate(self._cfgpath, key)

    def config_instance(self, key, default=None):
        return default

    def _config_shared(self, key, default=None):
        return config.interpolate_common(
            ("extractor",), self._cfgpath, key, default)

    def _config_shared_accumulate(self, key):
        first = True
        extr = ("extractor",)

        for path in self._cfgpath:
            if first:
                first = False
                values = config.accumulate(extr + path, key)
            elif conf := config.get(extr, path[0]):
                values[:0] = config.accumulate(
                    (self.subcategory,), key, conf=conf)

        return values

    def request(self, url, method="GET", session=None, fatal=True,
                retries=None, retry_codes=None, expected=(), interval=True,
                encoding=None, notfound=None, **kwargs):
        if session is None:
            session = self.session
        if retries is None:
            retries = self._retries
        if retry_codes is None:
            retry_codes = self._retry_codes
        if "proxies" not in kwargs:
            kwargs["proxies"] = self._proxies
        if "timeout" not in kwargs:
            kwargs["timeout"] = self._timeout
        if "verify" not in kwargs:
            kwargs["verify"] = self._verify

        if "json" in kwargs:
            if (json := kwargs["json"]) is not None:
                kwargs["data"] = util.json_dumps(json).encode()
                del kwargs["json"]
                if headers := kwargs.get("headers"):
                    headers["Content-Type"] = "application/json"
                else:
                    kwargs["headers"] = {"Content-Type": "application/json"}

        response = challenge = None
        tries = 1

        if self._interval and interval:
            seconds = (self._interval() -
                       (time.time() - Extractor.request_timestamp))
            if seconds > 0.0:
                self.sleep(seconds, "request")

        while True:
            try:
                response = session.request(method, url, **kwargs)
            except requests.exceptions.ConnectionError as exc:
                try:
                    reason = exc.args[0].reason
                    cls = reason.__class__.__name__
                    pre, _, err = str(reason.args[-1]).partition(":")
                    msg = f" {cls}: {(err or pre).lstrip()}"
                except Exception:
                    msg = exc
                code = 0
            except (requests.exceptions.Timeout,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ContentDecodingError) as exc:
                msg = exc
                code = 0
            except (requests.exceptions.RequestException) as exc:
                msg = exc
                break
            else:
                code = response.status_code
                if self._write_pages:
                    self._dump_response(response)
                if (
                    code < 400 or
                    code in expected or
                    code < 500 and (
                        not fatal and code != 429 or fatal is None) or
                    fatal is ...
                ):
                    if encoding:
                        response.encoding = encoding
                    return response
                if notfound is not None and code == 404:
                    if notfound is True:
                        notfound = self.__class__.subcategory
                    self.status |= exception.NotFoundError.code
                    raise exception.NotFoundError(notfound)

                msg = f"'{code} {response.reason}' for '{response.url}'"

                challenge = util.detect_challenge(response)
                if challenge is not None:
                    self.log.warning(challenge)

                if code == 429 and self._handle_429(response):
                    continue
                elif code == 429 and self._interval_429:
                    pass
                elif code not in retry_codes and code < 500:
                    break

            finally:
                if interval:
                    Extractor.request_timestamp = time.time()

            self.log.debug("%s (%s/%s)", msg, tries, retries+1)
            if tries > retries:
                break

            seconds = tries
            if self._interval:
                s = self._interval()
                if seconds < s:
                    seconds = s
            if code == 429 and self._interval_429:
                s = self._interval_429()
                if seconds < s:
                    seconds = s
                self.wait(seconds=seconds, reason="429 Too Many Requests")
            else:
                self.sleep(seconds, "retry")
            tries += 1

        if not fatal or fatal is ...:
            self.log.warning(msg)
            return util.NullResponse(url, msg)

        if challenge is None:
            exc = exception.HttpError(msg, response)
        else:
            exc = exception.ChallengeError(challenge, response)
        self.status |= exc.code
        raise exc

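    # Usage sketch (illustrative, from inside an extractor subclass):
    #
    #   # treat a 404 as "user not found" instead of a generic HttpError
    #   response = self.request(url, notfound="user")
    #
    #   # accept a 403 as a valid, expected response
    #   response = self.request(url, expected=(403,))
    #
    #   # never raise: log a warning and return util.NullResponse
    #   # once all retries are exhausted (429 is still retried)
    #   response = self.request(url, fatal=False)
    #
    # 'fatal=None' additionally returns 4xx responses (including 429) as
    # success, and 'fatal=...' returns any response, even 5xx, without
    # retrying.
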
    def request_location(self, url, **kwargs):
        kwargs.setdefault("method", "HEAD")
        kwargs.setdefault("allow_redirects", False)
        kwargs.setdefault("interval", False)
        return self.request(url, **kwargs).headers.get("location", "")

    def request_json(self, url, **kwargs):
        response = self.request(url, **kwargs)

        try:
            return util.json_loads(response.text)
        except Exception as exc:
            fatal = kwargs.get("fatal", True)
            if not fatal or fatal is ...:
                if challenge := util.detect_challenge(response):
                    self.log.warning(challenge)
                else:
                    self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return {}
            raise

    def request_xml(self, url, xmlns=True, **kwargs):
        response = self.request(url, **kwargs)

        if xmlns:
            text = response.text
        else:
            text = response.text.replace(" xmlns=", " ns=")

        parser = ElementTree.XMLParser()
        try:
            parser.feed(text)
            return parser.close()
        except Exception as exc:
            fatal = kwargs.get("fatal", True)
            if not fatal or fatal is ...:
                if challenge := util.detect_challenge(response):
                    self.log.warning(challenge)
                else:
                    self.log.warning("%s: %s", exc.__class__.__name__, exc)
                return ElementTree.Element("")
            raise

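    # Usage sketch (illustrative): the convenience wrappers parse the
    # response body and degrade gracefully when 'fatal' is falsy:
    #
    #   data = self.request_json(api_url, fatal=False)  # {} on bad JSON
    #   root = self.request_xml(feed_url, xmlns=False)  # strip xmlns=
    #   target = self.request_location(short_url)       # 'Location' header
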
    _handle_429 = util.false

    def wait(self, seconds=None, until=None, adjust=1.0,
             reason="rate limit"):
        now = time.time()

        if seconds:
            seconds = float(seconds)
            until = now + seconds
        elif until:
            if isinstance(until, dt.datetime):
                # convert to UTC timestamp
                until = dt.to_ts(until)
            else:
                until = float(until)
            seconds = until - now
        else:
            raise ValueError("Either 'seconds' or 'until' is required")

        seconds += adjust
        if seconds <= 0.0:
            return

        if reason:
            t = dt.datetime.fromtimestamp(until).time()
            isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}"
            self.log.info("Waiting until %s (%s)", isotime, reason)
        time.sleep(seconds)

    def sleep(self, seconds, reason):
        self.log.debug("Sleeping %.2f seconds (%s)",
                       seconds, reason)
        time.sleep(seconds)

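    # Example (illustrative): waiting out an API rate limit with an
    # epoch timestamp taken from a hypothetical response header:
    #
    #   reset = float(response.headers["x-ratelimit-reset"])
    #   self.wait(until=reset, reason="rate limit reset")
    #
    # 'adjust' pads the sleep by one second by default, so a clock that
    # runs slightly behind the server does not wake up too early.
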
    def utils(self, module="", name=None):
        module = (self.__class__.category if not module else
                  module[1:] if module[0] == "/" else
                  f"{self.__class__.category}_{module}")
        if module in CACHE_UTILS:
            res = CACHE_UTILS[module]
        else:
            res = CACHE_UTILS[module] = __import__(
                "utils." + module, globals(), None, module, 1)
        return res if name is None else getattr(res, name, None)

    def input(self, prompt, echo=True):
        self._check_input_allowed(prompt)

        if echo:
            try:
                return input(prompt)
            except (EOFError, OSError):
                return None
        else:
            return getpass.getpass(prompt)

    def _check_input_allowed(self, prompt=""):
        input = self.config("input")
        if input is None:
            input = output.TTY_STDIN
        if not input:
            raise exception.AbortExtraction(
                f"User input required ({prompt.strip(' :')})")

    def _get_auth_info(self, password=None):
        """Return authentication information as (username, password) tuple"""
        username = self.config("username")

        if username or password:
            password = self.config("password")
            if not password:
                self._check_input_allowed("password")
                password = util.LazyPrompt()

        elif self.config("netrc", False):
            try:
                info = netrc.netrc().authenticators(self.category)
                username, _, password = info
            except (OSError, netrc.NetrcParseError) as exc:
                self.log.error("netrc: %s", exc)
            except TypeError:
                self.log.warning("netrc: No authentication info")

        return username, password

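    # Example (illustrative): with a config of
    #
    #   {"extractor": {"mysite": {"username": "user", "password": null}}}
    #
    # _get_auth_info() returns ("user", util.LazyPrompt()), deferring the
    # password prompt until login actually needs it; with "netrc": true
    # and a matching ~/.netrc machine entry for this category, the
    # credentials come from there instead. ("mysite" is a placeholder.)
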
    def _init(self):
        pass

    def _init_options(self):
        self._write_pages = self.config("write-pages", False)
        self._retry_codes = self.config("retry-codes")
        self._retries = self.config("retries", 4)
        self._timeout = self.config("timeout", 30)
        self._verify = self.config("verify", True)
        self._proxies = util.build_proxy_map(self.config("proxy"), self.log)
        self._interval = util.build_duration_func(
            self.config("sleep-request", self.request_interval),
            self.request_interval_min,
        )
        self._interval_429 = util.build_duration_func(
            self.config("sleep-429", self.request_interval_429),
        )

        if self._retries < 0:
            self._retries = float("inf")
        if not self._retry_codes:
            self._retry_codes = ()

    def _init_session(self):
        self.session = session = requests.Session()
        headers = session.headers
        headers.clear()
        ssl_options = ssl_ciphers = 0

        # .netrc Authorization headers are always disabled
        session.trust_env = True if self.config("proxy-env", True) else False

        browser = self.config("browser")
        if browser is None:
            browser = self.browser
        if browser and isinstance(browser, str):
            browser, _, platform = browser.lower().partition(":")

            if not platform or platform == "auto":
                platform = ("Windows NT 10.0; Win64; x64"
                            if util.WINDOWS else "X11; Linux x86_64")
            elif platform == "windows":
                platform = "Windows NT 10.0; Win64; x64"
            elif platform == "linux":
                platform = "X11; Linux x86_64"
            elif platform == "macos":
                platform = "Macintosh; Intel Mac OS X 15.5"

            if browser == "chrome":
                if platform.startswith("Macintosh"):
                    platform = platform.replace(".", "_")
            else:
                browser = "firefox"

            for key, value in HEADERS[browser]:
                if value and "{}" in value:
                    headers[key] = value.replace("{}", platform)
                else:
                    headers[key] = value

            ssl_options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
            ssl_ciphers = CIPHERS[browser]
        else:
            headers["User-Agent"] = self.useragent
            headers["Accept"] = "*/*"
            headers["Accept-Language"] = "en-US,en;q=0.5"

            ssl_ciphers = self.ciphers
            if ssl_ciphers is not None and ssl_ciphers in CIPHERS:
                ssl_ciphers = CIPHERS[ssl_ciphers]

        if BROTLI:
            headers["Accept-Encoding"] = "gzip, deflate, br"
        else:
            headers["Accept-Encoding"] = "gzip, deflate"
        if ZSTD:
            headers["Accept-Encoding"] += ", zstd"

        if referer := self.config("referer", self.referer):
            if isinstance(referer, str):
                headers["Referer"] = referer
            elif self.root:
                headers["Referer"] = self.root + "/"

        custom_ua = self.config("user-agent")
        if not custom_ua or custom_ua == "auto":
            pass
        elif custom_ua == "browser":
            headers["User-Agent"] = _browser_useragent(None)
        elif custom_ua[0] == "@":
            headers["User-Agent"] = _browser_useragent(custom_ua[1:])
        elif custom_ua[0] == "+":
            custom_ua = custom_ua[1:].lower()
            if custom_ua in {"firefox", "ff"}:
                headers["User-Agent"] = util.USERAGENT_FIREFOX
            elif custom_ua in {"chrome", "cr"}:
                headers["User-Agent"] = util.USERAGENT_CHROME
            elif custom_ua in {"gallery-dl", "gallerydl", "gdl"}:
                headers["User-Agent"] = util.USERAGENT_GALLERYDL
            elif custom_ua in {"google-bot", "googlebot", "bot"}:
                headers["User-Agent"] = "Googlebot-Image/1.0"
            else:
                self.log.warning(
                    "Unsupported User-Agent preset '%s'", custom_ua)
        elif self.useragent is Extractor.useragent and not self.browser or \
                custom_ua is not config.get(("extractor",), "user-agent"):
            headers["User-Agent"] = custom_ua

        custom_xff = self.config("geo-bypass")
        if custom_xff is None or custom_xff == "auto":
            custom_xff = self.geobypass
        if custom_xff is not None:
            if ip := self.utils("/geo").random_ipv4(custom_xff):
                headers["X-Forwarded-For"] = ip
                self.log.debug("Using fake IP %s as 'X-Forwarded-For'", ip)
            else:
                self.log.warning("xff: Invalid ISO 3166 country code '%s'",
                                 custom_xff)

        if custom_headers := self.config("headers"):
            if isinstance(custom_headers, str):
                if custom_headers in HEADERS:
                    custom_headers = HEADERS[custom_headers]
                else:
                    self.log.error("Invalid 'headers' value '%s'",
                                   custom_headers)
                    custom_headers = ()
            headers.update(custom_headers)

        if custom_ciphers := self.config("ciphers"):
            if isinstance(custom_ciphers, list):
                ssl_ciphers = ":".join(custom_ciphers)
            elif custom_ciphers in CIPHERS:
                ssl_ciphers = CIPHERS[custom_ciphers]
            else:
                ssl_ciphers = custom_ciphers

        if source_address := self.config("source-address"):
            if isinstance(source_address, str):
                source_address = (source_address, 0)
            else:
                source_address = (source_address[0], source_address[1])

        tls12 = self.config("tls12")
        if tls12 is None:
            tls12 = self.tls12
        if not tls12:
            ssl_options |= ssl.OP_NO_TLSv1_2
            self.log.debug("TLS 1.2 disabled.")

        if self.config("truststore"):
            try:
                from truststore import SSLContext as ssl_ctx
            except ImportError as exc:
                self.log.error("%s: %s", exc.__class__.__name__, exc)
                ssl_ctx = None
        else:
            ssl_ctx = None

        adapter = _build_requests_adapter(
            ssl_options, ssl_ciphers, ssl_ctx, source_address)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

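    # Example (illustrative): the "browser" option selects one of the
    # header/cipher profiles defined in HEADERS/CIPHERS below. With
    #
    #   {"extractor": {"browser": "chrome:macos"}}
    #
    # the User-Agent platform placeholder expands to
    # "Macintosh; Intel Mac OS X 15_5" and CIPHERS_CHROMIUM is applied;
    # any browser name other than "chrome" falls back to the firefox
    # profile.
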
    def _init_cookies(self):
        """Populate the session's cookiejar"""
        if cookies := self.config("cookies"):
            if select := self.config("cookies-select"):
                if select == "rotate":
                    cookies = cookies[self.cookies_index % len(cookies)]
                    Extractor.cookies_index += 1
                else:
                    cookies = random.choice(cookies)
            self.cookies_load(cookies)

    def cookies_load(self, cookies_source):
        if isinstance(cookies_source, dict):
            self.cookies_update_dict(cookies_source, self.cookies_domain)

        elif isinstance(cookies_source, str):
            path = util.expand_path(cookies_source)
            try:
                with open(path, encoding="utf-8") as fp:
                    cookies = util.cookiestxt_load(fp)
            except ValueError as exc:
                self.log.warning("cookies: Invalid Netscape cookies.txt file "
                                 "'%s' (%s: %s)",
                                 cookies_source, exc.__class__.__name__, exc)
            except Exception as exc:
                self.log.warning("cookies: Failed to load '%s' (%s: %s)",
                                 cookies_source, exc.__class__.__name__, exc)
            else:
                self.log.debug("cookies: Loading cookies from '%s'",
                               cookies_source)
                set_cookie = self.cookies.set_cookie
                for cookie in cookies:
                    set_cookie(cookie)
                self.cookies_file = path

        elif isinstance(cookies_source, (list, tuple)):
            key = tuple(cookies_source)
            cookies = CACHE_COOKIES.get(key)

            if cookies is None:
                from ..cookies import load_cookies
                try:
                    cookies = load_cookies(cookies_source)
                except Exception as exc:
                    self.log.warning("cookies: %s", exc)
                    cookies = ()
                else:
                    CACHE_COOKIES[key] = cookies
            else:
                self.log.debug("cookies: Using cached cookies from %s", key)

            set_cookie = self.cookies.set_cookie
            for cookie in cookies:
                set_cookie(cookie)

        else:
            self.log.error(
                "cookies: Expected 'dict', 'list', or 'str' value for "
                "'cookies' option, got '%s' instead (%r)",
                cookies_source.__class__.__name__, cookies_source)

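    # Example (illustrative): each supported "cookies" config shape maps
    # to one branch above:
    #
    #   "cookies": {"session": "abc123"}    -> dict of name/value pairs
    #   "cookies": "~/cookies.txt"          -> Netscape cookies.txt file
    #   "cookies": ["firefox"]              -> import from a browser
    #
    # Browser imports are cached in CACHE_COOKIES, keyed by the argument
    # tuple, so repeated extractor runs reuse the first import.
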
    def cookies_store(self):
        """Store the session's cookies in a cookies.txt file"""
        export = self.config("cookies-update", True)
        if not export:
            return

        if isinstance(export, str):
            path = util.expand_path(export)
        else:
            path = self.cookies_file
            if not path:
                return

        path_tmp = path + ".tmp"
        try:
            with open(path_tmp, "w", encoding="utf-8") as fp:
                util.cookiestxt_store(fp, self.cookies)
            os.replace(path_tmp, path)
        except OSError as exc:
            self.log.error("cookies: Failed to write to '%s' "
                           "(%s: %s)", path, exc.__class__.__name__, exc)

    def cookies_update(self, cookies, domain=""):
        """Update the session's cookiejar with 'cookies'"""
        if isinstance(cookies, dict):
            self.cookies_update_dict(cookies, domain or self.cookies_domain)
        else:
            set_cookie = self.cookies.set_cookie
            try:
                cookies = iter(cookies)
            except TypeError:
                set_cookie(cookies)
            else:
                for cookie in cookies:
                    set_cookie(cookie)

    def cookies_update_dict(self, cookiedict, domain):
        """Update cookiejar with name-value pairs from a dict"""
        set_cookie = self.cookies.set
        for name, value in cookiedict.items():
            set_cookie(name, value, domain=domain)

    def cookies_check(self, cookies_names, domain=None, subdomains=False):
        """Check if all 'cookies_names' are in the session's cookiejar"""
        if not self.cookies:
            return False

        if domain is None:
            domain = self.cookies_domain
        names = set(cookies_names)
        now = time.time()

        for cookie in self.cookies:
            if cookie.name not in names:
                continue

            if not domain or cookie.domain == domain:
                pass
            elif not subdomains or not cookie.domain.endswith(domain):
                continue

            if cookie.expires:
                diff = int(cookie.expires - now)

                if diff <= 0:
                    self.log.warning(
                        "cookies: %s/%s expired at %s",
                        cookie.domain.lstrip("."), cookie.name,
                        dt.datetime.fromtimestamp(cookie.expires))
                    continue

                elif diff <= 86400:
                    hours = diff // 3600
                    self.log.warning(
                        "cookies: %s/%s will expire in less than %s hour%s",
                        cookie.domain.lstrip("."), cookie.name,
                        hours + 1, "s" if hours else "")

            names.discard(cookie.name)
            if not names:
                return True
        return False

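    # Usage sketch (illustrative, from a subclass' login()):
    #
    #   def login(self):
    #       if self.cookies_check(("sessionid",)):
    #           return  # still logged in; skip the login request
    #       username, password = self._get_auth_info()
    #       ...
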
    def _extract_jsonld(self, page):
        return util.json_loads(
            text.extr(page, '<script type="application/ld+json">',
                      "</script>") or
            text.extr(page, "<script type='application/ld+json'>",
                      "</script>"))

    def _extract_nextdata(self, page):
        return util.json_loads(
            text.extr(page, ' id="__NEXT_DATA__" type="application/json">',
                      "</script>") or
            text.extr(page, " id='__NEXT_DATA__' type='application/json'>",
                      "</script>"))

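    # Usage sketch (illustrative): pulling embedded page state from a
    # Next.js site (hypothetical URL and key names):
    #
    #   page = self.request("https://example.org/post/1").text
    #   props = self._extract_nextdata(page)["props"]["pageProps"]
    #
    # Both helpers accept single- or double-quoted <script> attributes.
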
    def _cache(self, func, maxage, keyarg=None):
        # return cache.DatabaseCacheDecorator(func, maxage, keyarg)
        return cache.DatabaseCacheDecorator(func, keyarg, maxage)

    def _cache_memory(self, func, maxage=None, keyarg=None):
        return cache.Memcache()

    def _get_date_min_max(self, dmin=None, dmax=None):
        """Retrieve and parse 'date-min' and 'date-max' config values"""
        def get(key, default):
            ts = self.config(key, default)
            if isinstance(ts, str):
                dt_obj = dt.parse_iso(ts) if fmt is None else dt.parse(ts, fmt)
                if dt_obj is dt.NONE:
                    self.log.warning(
                        "Unable to parse '%s': Invalid %s string '%s'",
                        key, "isoformat" if fmt is None else "date", ts)
                    ts = default
                else:
                    ts = int(dt.to_ts(dt_obj))
            return ts
        fmt = self.config("date-format")
        return get("date-min", dmin), get("date-max", dmax)

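    # Example (illustrative): with
    #
    #   {"date-min": "2024-01-01T00:00:00", "date-max": 1735689600}
    #
    # this returns integer UTC timestamps for both bounds (non-string
    # values pass through unchanged); adding "date-format": "%Y-%m-%d"
    # switches parsing from isoformat to dt.parse() with that format.
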
    @classmethod
    def _dump(cls, obj):
        util.dump_json(obj, ensure_ascii=False, indent=2)

    def _dump_response(self, response, history=True):
        """Write the response content to a .txt file in the current directory.

        The file name is derived from the response url,
        replacing special characters with "_"
        """
        if history:
            for resp in response.history:
                self._dump_response(resp, False)

        if hasattr(Extractor, "_dump_index"):
            Extractor._dump_index += 1
        else:
            Extractor._dump_index = 1
            Extractor._dump_sanitize = util.re_compile(
                r"[\\\\|/<>:\"?*&=#]+").sub

        fname = (f"{Extractor._dump_index:>02}_"
                 f"{Extractor._dump_sanitize('_', response.url)}")

        if util.WINDOWS:
            path = os.path.abspath(fname)[:255]
        else:
            path = fname[:251]

        try:
            with open(path + ".txt", 'wb') as fp:
                util.dump_response(
                    response, fp,
                    headers=(self._write_pages in ("all", "ALL")),
                    hide_auth=(self._write_pages != "ALL")
                )
            self.log.info("Writing '%s' response to '%s'",
                          response.url, path + ".txt")
        except Exception as e:
            self.log.warning("Failed to dump HTTP request (%s: %s)",
                             e.__class__.__name__, e)


class GalleryExtractor(Extractor):

    subcategory = "gallery"
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
    directory_fmt = ("{category}", "{gallery_id} {title}")
    archive_fmt = "{gallery_id}_{num}"
    enum = "num"

    def __init__(self, match, url=None):
        Extractor.__init__(self, match)

        if url is None and (path := self.groups[0]) and path[0] == "/":
            self.page_url = self.root + path
        else:
            self.page_url = url

    def items(self):
        self.login()

        if self.page_url:
            page = self.request(
                self.page_url, notfound=self.subcategory).text
        else:
            page = None

        data = self.metadata(page)
        imgs = self.images(page)
        assets = self.assets(page)

        if "count" in data:
            if self.config("page-reverse"):
                images = util.enumerate_reversed(imgs, 1, data["count"])
            else:
                images = zip(
                    range(1, data["count"]+1),
                    imgs,
                )
        else:
            enum = enumerate
            try:
                data["count"] = len(imgs)
            except TypeError:
                pass
            else:
                if self.config("page-reverse"):
                    enum = util.enumerate_reversed
            images = enum(imgs, 1)

        yield Message.Directory, "", data
        enum_key = self.enum

        if assets:
            for asset in assets:
                url = asset["url"]
                asset.update(data)
                asset[enum_key] = 0
                if "extension" not in asset:
                    text.nameext_from_url(url, asset)
                yield Message.Url, url, asset

        for data[enum_key], (url, imgdata) in images:
            if imgdata:
                data.update(imgdata)
                if "extension" not in imgdata:
                    text.nameext_from_url(url, data)
            else:
                text.nameext_from_url(url, data)
            yield Message.Url, url, data

    def login(self):
        """Login and set necessary cookies"""

    def metadata(self, page):
        """Return a dict with general metadata"""

    def images(self, page):
        """Return a list or iterable of all (image-url, metadata)-tuples"""

    def assets(self, page):
        """Return an iterable of additional gallery assets

        Each asset must be a 'dict' containing at least 'url' and 'type'
        """


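# Usage sketch (illustrative): a minimal GalleryExtractor subclass only
# needs a URL pattern plus 'metadata()' and 'images()'; 'items()' above
# handles numbering, 'count', and filename metadata. Hypothetical site:
#
#   class ExampleGalleryExtractor(GalleryExtractor):
#       category = "example"
#       root = "https://example.org"
#       pattern = r"(?:https?://)?example\.org(/gallery/(\d+))"
#
#       def metadata(self, page):
#           return {"gallery_id": text.parse_int(self.groups[1]),
#                   "title": text.extr(page, "<title>", "</title>")}
#
#       def images(self, page):
#           return [(url, None) for url in text.extract_iter(
#               page, '<img data-src="', '"')]

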
class ChapterExtractor(GalleryExtractor):

    subcategory = "chapter"
    directory_fmt = (
        "{category}", "{manga}",
        "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
    filename_fmt = (
        "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
    archive_fmt = (
        "{manga}_{chapter}{chapter_minor}_{page}")
    enum = "page"


class MangaExtractor(Extractor):

    subcategory = "manga"
    categorytransfer = True
    chapterclass = None
    reverse = True

    def __init__(self, match, url=None):
        Extractor.__init__(self, match)

        if url is None and (path := self.groups[0]) and path[0] == "/":
            self.page_url = self.root + path
        else:
            self.page_url = url

        if self.config("chapter-reverse", False):
            self.reverse = not self.reverse

    def items(self):
        self.login()

        if self.page_url:
            page = self.request(self.page_url, notfound=self.subcategory).text
        else:
            page = None

        chapters = self.chapters(page)
        if self.reverse:
            chapters.reverse()

        for chapter, data in chapters:
            data["_extractor"] = self.chapterclass
            yield Message.Queue, chapter, data

    def login(self):
        """Login and set necessary cookies"""

    def chapters(self, page):
        """Return a list of all (chapter-url, metadata)-tuples"""


class Dispatch():
    subcategory = "user"
    cookies_domain = None
    finalize = Extractor.finalize
    skip = Extractor.skip

    def __iter__(self):
        return self.items()

    def initialize(self):
        pass

    def _dispatch_extractors(self, extractor_data, default=(), alt=None):
        extractors = {
            data[0].subcategory: data
            for data in extractor_data
        }

        if alt is not None:
            for sub, sub_alt, url in alt:
                if url is None:
                    extractors[sub_alt] = extractors[sub]
                else:
                    extractors[sub_alt] = (extractors[sub][0], url)

        include = self.config("include", default) or ()
        if include == "all":
            include = extractors
        elif isinstance(include, str):
            include = include.replace(" ", "").split(",")

        results = []
        for category in include:
            try:
                extr, url = extractors[category]
            except KeyError:
                self.log.warning("Invalid include '%s'", category)
            else:
                results.append((Message.Queue, url, {"_extractor": extr}))
        return iter(results)


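# Example (illustrative): a Dispatch-based "user" extractor queues one
# sub-extractor per profile section. With
#
#   {"extractor": {"mysite": {"include": "avatar,posts"}}}
#
# _dispatch_extractors() emits one Message.Queue item per listed
# subcategory and warns about unknown names; "include": "all" queues
# every registered sub-extractor. ("mysite" is a placeholder category.)

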
class AsynchronousMixin():
    """Run info extraction in a separate thread"""

    def __iter__(self):
        self.initialize()

        messages = queue.Queue(5)
        thread = threading.Thread(
            target=self.async_items,
            args=(messages,),
            daemon=True,
        )

        thread.start()
        while True:
            msg = messages.get()
            if msg is None:
                thread.join()
                return
            if isinstance(msg, Exception):
                thread.join()
                raise msg
            yield msg
            messages.task_done()

    def async_items(self, messages):
        try:
            for msg in self.items():
                messages.put(msg)
        except Exception as exc:
            messages.put(exc)
        messages.put(None)


class BaseExtractor(Extractor):
    instances = ()

    def __init__(self, match):
        if not self.category:
            self._init_category(match)
        Extractor.__init__(self, match)

    def _init_category(self, match):
        for index, group in enumerate(match.groups()):
            if group is not None:
                if index:
                    self.category, self.root, info = self.instances[index-1]
                    if not self.root:
                        self.root = text.root_from_url(match[0])
                    self.config_instance = info.get
                else:
                    self.root = group
                    self.category = group.partition("://")[2]
                break

    @classmethod
    def update(cls, instances):
        if extra_instances := config.get(("extractor",), cls.basecategory):
            for category, info in extra_instances.items():
                if isinstance(info, dict) and "root" in info:
                    instances[category] = info

        pattern_list = []
        instance_list = cls.instances = []
        for category, info in instances.items():
            if root := info["root"]:
                root = root.rstrip("/")
            instance_list.append((category, root, info))

            pattern = info.get("pattern")
            if not pattern:
                pattern = re.escape(root[root.index(":") + 3:])
            pattern_list.append(pattern + "()")

        return (f"(?:{cls.basecategory}:(https?://[^/?#]+)|"
                f"(?:https?://)?(?:{'|'.join(pattern_list)}))")


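# Example (illustrative): BaseExtractor subclasses serve multiple site
# instances from one code base. A hypothetical multi-instance module:
#
#   BASE_PATTERN = ExampleBooruExtractor.update({
#       "siteA": {"root": "https://a.example.org"},
#       "siteB": {"root": "https://b.example.org",
#                 "pattern": r"(?:www\.)?b\.example\.org"},
#   })
#
# update() also merges user-defined instances from
# config(("extractor",), basecategory) and returns an alternation
# pattern with one empty capture group per instance, which
# _init_category() uses to pick the matching (category, root, info).

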
class RequestsAdapter(HTTPAdapter):

    def __init__(self, ssl_context=None, source_address=None):
        self.ssl_context = ssl_context
        self.source_address = source_address
        HTTPAdapter.__init__(self)

    def init_poolmanager(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        kwargs["source_address"] = self.source_address
        return HTTPAdapter.init_poolmanager(self, *args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        kwargs["source_address"] = self.source_address
        return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)


def _build_requests_adapter(
        ssl_options, ssl_ciphers, ssl_ctx, source_address):

    key = (ssl_options, ssl_ciphers, ssl_ctx, source_address)
    try:
        return CACHE_ADAPTERS[key]
    except KeyError:
        pass

    if ssl_options or ssl_ciphers or ssl_ctx:
        if ssl_ctx is None:
            ssl_context = urllib3.connection.create_urllib3_context(
                options=ssl_options or None, ciphers=ssl_ciphers)
            if not requests.__version__ < "2.32":
                # https://github.com/psf/requests/pull/6731
                ssl_context.load_verify_locations(requests.certs.where())
        else:
            ssl_ctx_orig = urllib3.util.ssl_.SSLContext
            try:
                urllib3.util.ssl_.SSLContext = ssl_ctx
                ssl_context = urllib3.connection.create_urllib3_context(
                    options=ssl_options or None, ciphers=ssl_ciphers)
            finally:
                urllib3.util.ssl_.SSLContext = ssl_ctx_orig
        ssl_context.check_hostname = False
    else:
        ssl_context = None

    adapter = CACHE_ADAPTERS[key] = RequestsAdapter(
        ssl_context, source_address)
    return adapter


@cache.cache(maxage=86400, keyarg=0)
def _browser_useragent(browser):
    """Get User-Agent header from default browser"""
    import webbrowser
    try:
        open = webbrowser.get(browser).open
    except webbrowser.Error:
        if not browser:
            raise
        import shutil
        if not (browser := shutil.which(browser)):
            raise

        def open(url):
            util.Popen((browser, url),
                       start_new_session=False if util.WINDOWS else True)

    import socket
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(("127.0.0.1", 0))
    server.listen(1)

    host, port = server.getsockname()
    open(f"http://{host}:{port}/user-agent")

    client = server.accept()[0]
    server.close()

    for line in client.recv(1024).split(b"\r\n"):
        key, _, value = line.partition(b":")
        if key.strip().lower() == b"user-agent":
            useragent = value.strip()
            break
    else:
        useragent = b""

    client.send(b"HTTP/1.1 200 OK\r\n\r\n" + useragent)
    client.close()

    return useragent.decode()


CACHE_ADAPTERS = {}
CACHE_COOKIES = {}
CACHE_UTILS = {}
CATEGORY_MAP = ()


HEADERS_FIREFOX_140 = (
    ("User-Agent", "Mozilla/5.0 ({}; rv:140.0) Gecko/20100101 Firefox/140.0"),
    ("Accept", "text/html,application/xhtml+xml,"
               "application/xml;q=0.9,*/*;q=0.8"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("Accept-Encoding", None),
    ("Connection", "keep-alive"),
    ("Content-Type", None),
    ("Content-Length", None),
    ("Referer", None),
    ("Origin", None),
    ("Cookie", None),
    ("Sec-Fetch-Dest", "empty"),
    ("Sec-Fetch-Mode", "cors"),
    ("Sec-Fetch-Site", "same-origin"),
    ("TE", "trailers"),
)
HEADERS_FIREFOX_128 = (
    ("User-Agent", "Mozilla/5.0 ({}; rv:128.0) Gecko/20100101 Firefox/128.0"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("Accept-Encoding", None),
    ("Referer", None),
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
    ("Cookie", None),
    ("Sec-Fetch-Dest", "empty"),
    ("Sec-Fetch-Mode", "no-cors"),
    ("Sec-Fetch-Site", "same-origin"),
    ("TE", "trailers"),
)
HEADERS_CHROMIUM_138 = (
    ("Connection", "keep-alive"),
    ("sec-ch-ua", '"Not)A;Brand";v="8", "Chromium";v="138"'),
    ("sec-ch-ua-mobile", "?0"),
    ("sec-ch-ua-platform", '"Linux"'),
    ("Upgrade-Insecure-Requests", "1"),
    ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
                   "like Gecko) Chrome/138.0.0.0 Safari/537.36"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/avif,image/webp,image/apng,*/*;q=0.8,"
               "application/signed-exchange;v=b3;q=0.7"),
    ("Referer", None),
    ("Sec-Fetch-Site", "same-origin"),
    ("Sec-Fetch-Mode", "no-cors"),
    # ("Sec-Fetch-User", "?1"),
    ("Sec-Fetch-Dest", "empty"),
    ("Accept-Encoding", None),
    ("Accept-Language", "en-US,en;q=0.9"),
)
HEADERS_CHROMIUM_111 = (
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
    ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
                   "like Gecko) Chrome/111.0.0.0 Safari/537.36"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/avif,image/webp,image/apng,*/*;q=0.8,"
               "application/signed-exchange;v=b3;q=0.7"),
    ("Referer", None),
    ("Sec-Fetch-Site", "same-origin"),
    ("Sec-Fetch-Mode", "no-cors"),
    ("Sec-Fetch-Dest", "empty"),
    ("Accept-Encoding", None),
    ("Accept-Language", "en-US,en;q=0.9"),
    ("cookie", None),
    ("content-length", None),
)
HEADERS = {
    "firefox"    : HEADERS_FIREFOX_140,
    "firefox/140": HEADERS_FIREFOX_140,
    "firefox/128": HEADERS_FIREFOX_128,
    "chrome"     : HEADERS_CHROMIUM_138,
    "chrome/138" : HEADERS_CHROMIUM_138,
    "chrome/111" : HEADERS_CHROMIUM_111,
}

CIPHERS_FIREFOX = (
    "TLS_AES_128_GCM_SHA256:"
    "TLS_CHACHA20_POLY1305_SHA256:"
    "TLS_AES_256_GCM_SHA384:"
    "ECDHE-ECDSA-AES128-GCM-SHA256:"
    "ECDHE-RSA-AES128-GCM-SHA256:"
    "ECDHE-ECDSA-CHACHA20-POLY1305:"
    "ECDHE-RSA-CHACHA20-POLY1305:"
    "ECDHE-ECDSA-AES256-GCM-SHA384:"
    "ECDHE-RSA-AES256-GCM-SHA384:"
    "ECDHE-ECDSA-AES256-SHA:"
    "ECDHE-ECDSA-AES128-SHA:"
    "ECDHE-RSA-AES128-SHA:"
    "ECDHE-RSA-AES256-SHA:"
    "AES128-GCM-SHA256:"
    "AES256-GCM-SHA384:"
    "AES128-SHA:"
    "AES256-SHA"
)
CIPHERS_CHROMIUM = (
    "TLS_AES_128_GCM_SHA256:"
    "TLS_AES_256_GCM_SHA384:"
    "TLS_CHACHA20_POLY1305_SHA256:"
    "ECDHE-ECDSA-AES128-GCM-SHA256:"
    "ECDHE-RSA-AES128-GCM-SHA256:"
    "ECDHE-ECDSA-AES256-GCM-SHA384:"
    "ECDHE-RSA-AES256-GCM-SHA384:"
    "ECDHE-ECDSA-CHACHA20-POLY1305:"
    "ECDHE-RSA-CHACHA20-POLY1305:"
    "ECDHE-RSA-AES128-SHA:"
    "ECDHE-RSA-AES256-SHA:"
    "AES128-GCM-SHA256:"
    "AES256-GCM-SHA384:"
    "AES128-SHA:"
    "AES256-SHA"
)
CIPHERS = {
    "firefox"    : CIPHERS_FIREFOX,
    "firefox/140": CIPHERS_FIREFOX,
    "firefox/128": CIPHERS_FIREFOX,
    "chrome"     : CIPHERS_CHROMIUM,
    "chrome/138" : CIPHERS_CHROMIUM,
    "chrome/111" : CIPHERS_CHROMIUM,
}


# disable Basic Authorization header injection from .netrc data
try:
    requests.sessions.get_netrc_auth = lambda _: None
except Exception:
    pass

# detect brotli support
try:
    BROTLI = urllib3.response.brotli is not None
except AttributeError:
    BROTLI = False

# detect zstandard support
try:
    ZSTD = urllib3.response.HAS_ZSTD
except AttributeError:
    ZSTD = False

# set (urllib3) warnings filter
action = config.get((), "warnings", "default")
if action:
    try:
        import warnings
        warnings.simplefilter(action, urllib3.exceptions.HTTPWarning)
    except Exception:
        pass
del action