# Source: GitHub repository mikf/gallery-dl
# Path: blob/master/gallery_dl/downloader/http.py
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Downloader module for http:// and https:// URLs"""
10
11
import time
12
import mimetypes
13
from requests.exceptions import RequestException, ConnectionError, Timeout
14
from .common import DownloaderBase
15
from .. import text, util, output, exception
16
from ssl import SSLError
17
# Module-level alias for util.FLAGS; the receive loops below check
# FLAGS.DOWNLOAD on every chunk and call FLAGS.process("DOWNLOAD") when set.
FLAGS = util.FLAGS
18
19
20
class HttpDownloader(DownloaderBase):
    """Downloader for http:// and https:// URLs

    Options are read through self.config() and from the job's extractor.
    Attributes such as 'session', 'proxies', 'part', 'partdir', 'out'
    and 'log' are used but not set here; presumably they are provided
    by DownloaderBase -- confirm against .common.
    """
    scheme = "http"

    def __init__(self, job):
        """Read downloader options and normalize them

        job -- the running job; its 'extractor' supplies the default
               values for retries, retry codes, timeout, verify and
               the HTTP 429 sleep interval
        """
        DownloaderBase.__init__(self, job)
        extractor = job.extractor
        # True only while file content is being written (see download())
        self.downloading = False

        self.adjust_extension = self.config("adjust-extensions", True)
        self.chunk_size = self.config("chunk-size", 32768)
        self.metadata = extractor.config("http-metadata")
        self.progress = self.config("progress", 3.0)
        self.validate = self.config("validate", True)
        self.validate_html = self.config("validate-html", True)
        self.headers = self.config("headers")
        self.minsize = self.config("filesize-min")
        self.maxsize = self.config("filesize-max")
        self.retries = self.config("retries", extractor._retries)
        self.retry_codes = self.config("retry-codes", extractor._retry_codes)
        self.timeout = self.config("timeout", extractor._timeout)
        self.verify = self.config("verify", extractor._verify)
        self.mtime = self.config("mtime", True)
        self.rate = self.config("rate")
        interval_429 = self.config("sleep-429")

        if not self.config("consume-content", False):
            # this resets the underlying TCP connection, and therefore
            # if the program makes another request to the same domain,
            # a new connection (either TLS or plain TCP) must be made
            self.release_conn = lambda resp: resp.close()

        # a negative retry count means "retry forever"
        if self.retries < 0:
            self.retries = float("inf")
        if self.minsize:
            minsize = text.parse_bytes(self.minsize)
            if not minsize:
                self.log.warning(
                    "Invalid minimum file size (%r)", self.minsize)
            self.minsize = minsize
        if self.maxsize:
            maxsize = text.parse_bytes(self.maxsize)
            if not maxsize:
                self.log.warning(
                    "Invalid maximum file size (%r)", self.maxsize)
            self.maxsize = maxsize
        if isinstance(self.chunk_size, str):
            chunk_size = text.parse_bytes(self.chunk_size)
            if not chunk_size:
                self.log.warning(
                    "Invalid chunk size (%r)", self.chunk_size)
                chunk_size = 32768
            self.chunk_size = chunk_size
        if self.rate:
            func = util.build_selection_func(self.rate, 0, text.parse_bytes)
            # 'func' either carries its bounds in '.args' or is called
            # once to obtain a value; a falsy result marks the
            # configured rate as invalid
            if rmax := func.args[1] if hasattr(func, "args") else func():
                if rmax < self.chunk_size:
                    # reduce chunk_size to allow for one iteration each second
                    self.chunk_size = rmax
                self.rate = func
                self.receive = self._receive_rate
            else:
                self.log.warning("Invalid rate limit (%r)", self.rate)
                self.rate = False
        if self.progress is not None:
            # progress reporting is implemented by _receive_rate()
            self.receive = self._receive_rate
            if self.progress < 0.0:
                self.progress = 0.0
        if interval_429 is None:
            self.interval_429 = extractor._interval_429
        else:
            try:
                self.interval_429 = util.build_duration_func_ex(interval_429)
            except Exception as exc:
                self.log.error("Invalid 'sleep-429' value '%s' (%s: %s)",
                               interval_429, exc.__class__.__name__, exc)
                self.interval_429 = extractor._interval_429

    def download(self, url, pathfmt):
        """Download 'url' to the location described by 'pathfmt'

        Delegates to _download_impl() and returns its result.
        Any exception is logged with its traceback and re-raised.
        In every case, the temporary file is removed if a download
        was still in progress and '.part' files are disabled.
        """
        try:
            return self._download_impl(url, pathfmt)
        except Exception as exc:
            if self.downloading:
                # terminate the current progress/output line
                output.stderr_write("\n")
            self.log.traceback(exc)
            raise
        finally:
            # remove file from incomplete downloads
            if self.downloading and not self.part:
                util.remove_file(pathfmt.temppath)

    def _download_impl(self, url, pathfmt):
        """Request 'url' and write the response body to 'pathfmt'

        Runs a retry loop around the request and the content transfer.

        Returns True when the file was downloaded, or when it was
        skipped on purpose (file size outside the configured limits,
        or target path already exists; 'pathfmt.temppath' is set to ""
        in these cases). Returns False when the download failed or
        the response was rejected.

        Per-file behavior is controlled by '_http_*' keys in
        'pathfmt.kwdict'; '_mtime_http' is written to it on success.
        """
        response = None
        tries = code = 0
        msg = ""

        metadata = self.metadata
        kwdict = pathfmt.kwdict
        expected_status = kwdict.get(
            "_http_expected_status", ())
        adjust_extension = kwdict.get(
            "_http_adjust_extension", self.adjust_extension)

        if self.part and not metadata:
            pathfmt.part_enable(self.partdir)

        while True:
            if tries:
                # this is a retry: clean up the previous attempt,
                # report why it failed, and wait before trying again
                if response:
                    self.release_conn(response)
                    response = None

                self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
                if tries > self.retries:
                    return False

                if code == 429 and self.interval_429:
                    s = self.interval_429(tries)
                    time.sleep(s if s > tries else tries)
                else:
                    time.sleep(tries)
                code = 0

            tries += 1
            file_header = None

            # collect HTTP headers
            headers = {"Accept": "*/*"}
            #   file-specific headers
            if extra := kwdict.get("_http_headers"):
                headers.update(extra)
            #   general headers
            if self.headers:
                headers.update(self.headers)
            #   partial content
            if file_size := pathfmt.part_size():
                headers["Range"] = f"bytes={file_size}-"

            # connect to (remote) source
            try:
                response = self.session.request(
                    kwdict.get("_http_method", "GET"), url,
                    stream=True,
                    headers=headers,
                    data=kwdict.get("_http_data"),
                    timeout=self.timeout,
                    proxies=self.proxies,
                    verify=self.verify,
                )
            except ConnectionError as exc:
                # try to build a short "<ErrorClass>: <message>" text
                # from the wrapped exception; fall back to str(exc)
                try:
                    reason = exc.args[0].reason
                    cls = reason.__class__.__name__
                    pre, _, err = str(reason.args[-1]).partition(":")
                    msg = f"{cls}: {(err or pre).lstrip()}"
                except Exception:
                    msg = str(exc)
                continue
            except Timeout as exc:
                msg = str(exc)
                continue
            except Exception as exc:
                self.log.warning(exc)
                return False

            # check response
            code = response.status_code
            if code == 200 or code in expected_status:  # OK
                offset = 0
                size = response.headers.get("Content-Length")
            elif code == 206:  # Partial Content
                offset = file_size
                # total size is the part after "/" in Content-Range;
                # a missing header yields "" and is treated as
                # "size unknown" instead of raising KeyError
                size = response.headers.get(
                    "Content-Range", "").rpartition("/")[2]
            elif code == 416 and file_size:  # Requested Range Not Satisfiable
                break
            else:
                msg = f"'{code} {response.reason}' for '{url}'"

                challenge = util.detect_challenge(response)
                if challenge is not None:
                    self.log.warning(challenge)

                if code in self.retry_codes or 500 <= code < 600:
                    continue
                retry = kwdict.get("_http_retry")
                if retry and retry(response):
                    continue
                self.release_conn(response)
                self.log.warning(msg)
                return False

            # check for invalid responses
            if self.validate and \
                    (validate := kwdict.get("_http_validate")) is not None:
                try:
                    result = validate(response)
                except Exception:
                    self.release_conn(response)
                    raise
                if isinstance(result, str):
                    # a string result is used as replacement URL;
                    # this attempt does not count as a try
                    url = result
                    tries -= 1
                    continue
                if not result:
                    self.release_conn(response)
                    self.log.warning("Invalid response")
                    return False
            if self.validate_html and response.headers.get(
                    "content-type", "").startswith("text/html") and \
                    pathfmt.extension not in ("html", "htm"):
                # release the connection like every other rejection path
                self.release_conn(response)
                if response.history:
                    self.log.warning("HTTP redirect to '%s'", response.url)
                else:
                    self.log.warning("HTML response")
                return False

            # check file size
            size = text.parse_int(size, None)
            if size is not None:
                if not size:
                    self.release_conn(response)
                    self.log.warning("Empty file")
                    return False
                if self.minsize and size < self.minsize:
                    self.release_conn(response)
                    self.log.warning(
                        "File size smaller than allowed minimum (%s < %s)",
                        size, self.minsize)
                    pathfmt.temppath = ""
                    return True
                if self.maxsize and size > self.maxsize:
                    self.release_conn(response)
                    self.log.warning(
                        "File size larger than allowed maximum (%s > %s)",
                        size, self.maxsize)
                    pathfmt.temppath = ""
                    return True

            build_path = False

            # set missing filename extension from MIME type
            if not pathfmt.extension:
                pathfmt.set_extension(self._find_extension(response))
                build_path = True

            # set metadata from HTTP headers
            if metadata:
                kwdict[metadata] = util.extract_headers(response)
                build_path = True

            # build and check file path
            if build_path:
                pathfmt.build_path()
                if pathfmt.exists():
                    pathfmt.temppath = ""
                    # release the connection back to pool by explicitly
                    # calling .close()
                    # see https://requests.readthedocs.io/en/latest/user
                    # /advanced/#body-content-workflow
                    # when the image size is on the order of megabytes,
                    # re-establishing a TLS connection will typically be faster
                    # than consuming the whole response
                    response.close()
                    return True
                if self.part and metadata:
                    pathfmt.part_enable(self.partdir)
                # headers are only extracted once, not on every retry
                metadata = False

            content = response.iter_content(self.chunk_size)

            validate_sig = kwdict.get("_http_signature")
            validate_ext = (adjust_extension and
                            pathfmt.extension in SIGNATURE_CHECKS)

            # check filename extension against file header
            if not offset and (validate_ext or validate_sig):
                try:
                    file_header = next(
                        content if response.raw.chunked
                        else response.iter_content(16), b"")
                except (RequestException, SSLError) as exc:
                    msg = str(exc)
                    continue
                if validate_sig:
                    result = validate_sig(file_header)
                    if result is not True:
                        self.release_conn(response)
                        self.log.warning(
                            result or "Invalid file signature bytes")
                        return False
                if validate_ext and self._adjust_extension(
                        pathfmt, file_header) and pathfmt.exists():
                    pathfmt.temppath = ""
                    response.close()
                    return True

            # set open mode
            if not offset:
                mode = "w+b"
                if file_size:
                    self.log.debug("Unable to resume partial download")
            else:
                mode = "r+b"
                self.log.debug("Resuming download at byte %d", offset)

            # download content
            self.downloading = True
            with pathfmt.open(mode) as fp:
                if fp is None:
                    # '.part' file no longer exists
                    break
                if file_header:
                    # the header bytes were already taken off the stream
                    fp.write(file_header)
                    offset += len(file_header)
                elif offset:
                    # resuming: check the extension against the bytes
                    # at the start of the existing partial file
                    if adjust_extension and \
                            pathfmt.extension in SIGNATURE_CHECKS:
                        self._adjust_extension(pathfmt, fp.read(16))
                    fp.seek(offset)

                self.out.start(pathfmt.path)
                try:
                    self.receive(fp, content, size, offset)
                except (RequestException, SSLError) as exc:
                    msg = str(exc)
                    output.stderr_write("\n")
                    continue
                except exception.StopExtraction:
                    response.close()
                    return False
                except exception.ControlException:
                    response.close()
                    raise

                # check file size
                if size and (fsize := fp.tell()) < size:
                    # evaluates as: (segmented and segmented is True)
                    #               or segmented == fsize
                    if (segmented := kwdict.get("_http_segmented")) and \
                            segmented is True or segmented == fsize:
                        tries -= 1
                        msg = "Resuming segmented download"
                        output.stdout_write("\r")
                    else:
                        msg = f"file size mismatch ({fsize} < {size})"
                        output.stderr_write("\n")
                    continue

            break

        self.downloading = False
        if self.mtime:
            if "_http_lastmodified" in kwdict:
                kwdict["_mtime_http"] = kwdict["_http_lastmodified"]
            else:
                kwdict["_mtime_http"] = response.headers.get("Last-Modified")
        else:
            kwdict["_mtime_http"] = None

        return True

    def release_conn(self, response):
        """Release connection back to pool by consuming response body"""
        try:
            for _ in response.iter_content(self.chunk_size):
                pass
        except (RequestException, SSLError) as exc:
            output.stderr_write("\n")
            self.log.debug(
                "Unable to consume response body (%s: %s); "
                "closing the connection anyway", exc.__class__.__name__, exc)
            response.close()

    def receive(self, fp, content, bytes_total, bytes_start):
        """Write all chunks from 'content' to 'fp'

        'bytes_total' and 'bytes_start' are unused here; they keep the
        signature interchangeable with _receive_rate().
        Stops early and returns FLAGS.process("DOWNLOAD") when
        FLAGS.DOWNLOAD is set.
        """
        write = fp.write
        for data in content:
            if FLAGS.DOWNLOAD is not None:
                return FLAGS.process("DOWNLOAD")
            write(data)

    def _receive_rate(self, fp, content, bytes_total, bytes_start):
        """Write 'content' to 'fp' with progress output and rate limit

        fp          -- writable binary file object
        content     -- iterable of byte chunks
        bytes_total -- expected total size, passed to self.out.progress()
        bytes_start -- number of bytes already present before this call

        Stops early and returns FLAGS.process("DOWNLOAD") when
        FLAGS.DOWNLOAD is set.
        """
        rate = self.rate() if self.rate else None
        write = fp.write
        progress = self.progress

        bytes_downloaded = 0
        time_start = time.monotonic()

        for data in content:
            if FLAGS.DOWNLOAD is not None:
                return FLAGS.process("DOWNLOAD")
            time_elapsed = time.monotonic() - time_start
            bytes_downloaded += len(data)

            write(data)

            if progress is not None:
                # only report once 'progress' seconds have elapsed
                if time_elapsed > progress:
                    self.out.progress(
                        bytes_total,
                        bytes_start + bytes_downloaded,
                        int(bytes_downloaded / time_elapsed),
                    )

            if rate is not None:
                # sleep for as long as we are ahead of the target rate
                time_expected = bytes_downloaded / rate
                if time_expected > time_elapsed:
                    time.sleep(time_expected - time_elapsed)

    def _find_extension(self, response):
        """Get filename extension from MIME type"""
        mtype = response.headers.get("Content-Type", "image/jpeg")
        # drop parameters such as "; charset=..." and any whitespace
        # around the type itself
        mtype = mtype.partition(";")[0].strip().lower()

        if "/" not in mtype:
            mtype = "image/" + mtype

        if mtype in MIME_TYPES:
            return MIME_TYPES[mtype]

        if ext := mimetypes.guess_extension(mtype, strict=False):
            # strip the leading "."
            return ext[1:]

        self.log.warning("Unknown MIME type '%s'", mtype)
        return "bin"

    def _adjust_extension(self, pathfmt, file_header):
        """Check filename extension against file header

        When the header does not match the current extension, the
        first matching entry of SIGNATURE_CHECKS is set as the new
        extension and the path is rebuilt.
        Returns True if the extension was changed, False otherwise.
        """
        if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
            for ext, check in SIGNATURE_CHECKS.items():
                if check(file_header):
                    self.log.debug(
                        "Adjusting filename extension of '%s' to '%s'",
                        pathfmt.filename, ext)
                    pathfmt.set_extension(ext)
                    pathfmt.build_path()
                    return True
        return False
455
456
457
# Maps lowercase MIME types to filename extensions (without leading dot).
# Consulted by HttpDownloader._find_extension() before falling back to
# the standard 'mimetypes' module.
MIME_TYPES = {
    "image/jpeg"    : "jpg",
    "image/jpg"     : "jpg",
    "image/png"     : "png",
    "image/gif"     : "gif",
    "image/bmp"     : "bmp",
    "image/x-bmp"   : "bmp",
    "image/x-ms-bmp": "bmp",
    "image/webp"    : "webp",
    "image/avif"    : "avif",
    "image/heic"    : "heic",
    "image/heif"    : "heif",
    "image/svg+xml" : "svg",
    "image/ico"     : "ico",
    "image/icon"    : "ico",
    "image/x-icon"  : "ico",
    "image/vnd.microsoft.icon" : "ico",
    "image/x-photoshop"        : "psd",
    "application/x-photoshop"  : "psd",
    "image/vnd.adobe.photoshop": "psd",

    "video/webm": "webm",
    "video/ogg" : "ogg",
    "video/mp4" : "mp4",
    "video/m4v" : "m4v",
    "video/x-m4v": "m4v",
    "video/quicktime": "mov",

    "audio/wav"  : "wav",
    "audio/x-wav": "wav",
    "audio/webm" : "webm",
    "audio/ogg"  : "ogg",
    "audio/mpeg" : "mp3",
    "audio/aac"  : "aac",
    "audio/x-aac": "aac",

    "application/vnd.apple.mpegurl": "m3u8",
    "application/x-mpegurl"        : "m3u8",
    "application/dash+xml"         : "mpd",

    "application/zip"  : "zip",
    "application/x-zip": "zip",
    "application/x-zip-compressed": "zip",
    "application/rar"  : "rar",
    "application/x-rar": "rar",
    "application/x-rar-compressed": "rar",
    "application/x-7z-compressed" : "7z",

    "application/pdf"  : "pdf",
    "application/x-pdf": "pdf",
    "application/x-shockwave-flash": "swf",

    "text/html": "html",

    "application/ogg": "ogg",
    # https://www.iana.org/assignments/media-types/model/obj
    "model/obj": "obj",
    "application/octet-stream": "bin",
}
516
517
518
def _signature_html(s):
519
s = s[:14].lstrip()
520
return s and b"<!doctype html".startswith(s.lower())
521
522
523
# https://en.wikipedia.org/wiki/List_of_file_signatures
524
SIGNATURE_CHECKS = {
525
"jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
526
"png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n",
527
"gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"),
528
"bmp" : lambda s: s[0:2] == b"BM",
529
"webp": lambda s: (s[0:4] == b"RIFF" and
530
s[8:12] == b"WEBP"),
531
"avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs",
532
"heic": lambda s: (s[4:10] == b"ftyphe" and s[10:12] in (
533
b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")),
534
"svg" : lambda s: s[0:5] == b"<?xml",
535
"ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00",
536
"cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
537
"psd" : lambda s: s[0:4] == b"8BPS",
538
"mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in (
539
b"mp4", b"avc", b"iso")),
540
"m4v" : lambda s: s[4:11] == b"ftypM4V",
541
"mov" : lambda s: s[4:12] == b"ftypqt ",
542
"webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3",
543
"ogg" : lambda s: s[0:4] == b"OggS",
544
"wav" : lambda s: (s[0:4] == b"RIFF" and
545
s[8:12] == b"WAVE"),
546
"mp3" : lambda s: (s[0:3] == b"ID3" or
547
s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")),
548
"aac" : lambda s: s[0:2] in (b"\xFF\xF9", b"\xFF\xF1"),
549
"m3u8": lambda s: s[0:7] == b"#EXTM3U",
550
"mpd" : lambda s: b"<MPD" in s,
551
"zip" : lambda s: s[0:4] in (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"),
552
"rar" : lambda s: s[0:6] == b"Rar!\x1A\x07",
553
"7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
554
"pdf" : lambda s: s[0:5] == b"%PDF-",
555
"swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
556
"html": _signature_html,
557
"htm" : _signature_html,
558
"blend": lambda s: s[0:7] == b"BLENDER",
559
# unfortunately the Wavefront .obj format doesn't have a signature,
560
# so we check for the existence of Blender's comment
561
"obj" : lambda s: s[0:11] == b"# Blender v",
562
# Celsys Clip Studio Paint format
563
# https://github.com/rasensuihei/cliputils/blob/master/README.md
564
"clip": lambda s: s[0:8] == b"CSFCHUNK",
565
# check 'bin' files against all other file signatures
566
"bin" : lambda s: False,
567
}
568
569
# Module-level name bound to this module's downloader class
# (presumably looked up by the package's downloader loader -- confirm).
__downloader__ = HttpDownloader
570
571