CoCalc -- http.py

GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/downloader/http.py
8947 views
1
# -*- coding: utf-8 -*-
2

3
# Copyright 2014-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8

9
"""Downloader module for http:// and https:// URLs"""
10

11
import time
12
import mimetypes
13
from requests.exceptions import RequestException, ConnectionError, Timeout
14
from .common import DownloaderBase
15
from .. import text, util, output, exception
16
from ssl import SSLError
17
FLAGS = util.FLAGS
18

19

20
class HttpDownloader(DownloaderBase):
21
    scheme = "http"
22

23
    def __init__(self, job):
        """Initialize downloader settings from config and extractor values

        Retries, retry codes, timeout, TLS verification and the HTTP 429
        sleep interval default to the values stored on 'job.extractor'.
        """
        DownloaderBase.__init__(self, job)
        extr = job.extractor
        cfg = self.config
        self.downloading = False

        self.adjust_extension = cfg("adjust-extensions", True)
        self.chunk_size = cfg("chunk-size", 32768)
        self.metadata = extr.config("http-metadata")
        self.progress = cfg("progress", 3.0)
        self.validate = cfg("validate", True)
        self.validate_html = cfg("validate-html", True)
        self.headers = cfg("headers")
        self.minsize = cfg("filesize-min")
        self.maxsize = cfg("filesize-max")
        self.retries = cfg("retries", extr._retries)
        self.retry_codes = cfg("retry-codes", extr._retry_codes)
        self.timeout = cfg("timeout", extr._timeout)
        self.verify = cfg("verify", extr._verify)
        self.mtime = cfg("mtime", True)
        self.rate = cfg("rate")
        sleep_429 = cfg("sleep-429")

        if not cfg("consume-content", False):
            # closing the response resets the underlying TCP connection,
            # so another request to the same domain has to establish
            # a new connection (either TLS or plain TCP)
            def close_response(resp):
                return resp.close()
            self.release_conn = close_response

        # a negative retry count means "retry forever"
        if self.retries < 0:
            self.retries = float("inf")

        # convert the file size limits into byte counts
        for attr, warning in (
            ("minsize", "Invalid minimum file size (%r)"),
            ("maxsize", "Invalid maximum file size (%r)"),
        ):
            raw = getattr(self, attr)
            if raw:
                parsed = text.parse_bytes(raw)
                if not parsed:
                    self.log.warning(warning, raw)
                setattr(self, attr, parsed)

        # chunk sizes given as string are parsed as byte counts as well
        if isinstance(self.chunk_size, str):
            parsed = text.parse_bytes(self.chunk_size)
            if not parsed:
                self.log.warning(
                    "Invalid chunk size (%r)", self.chunk_size)
                parsed = 32768
            self.chunk_size = parsed

        if self.rate:
            rate_func = util.build_selection_func(
                self.rate, 0, text.parse_bytes)
            if hasattr(rate_func, "args"):
                rate_max = rate_func.args[1]
            else:
                rate_max = rate_func()

            if rate_max:
                # reduce chunk_size to allow for one iteration each second
                if rate_max < self.chunk_size:
                    self.chunk_size = rate_max
                self.rate = rate_func
                self.receive = self._receive_rate
            else:
                self.log.warning("Invalid rate limit (%r)", self.rate)
                self.rate = False

        if self.progress is not None:
            self.receive = self._receive_rate
            if self.progress < 0.0:
                self.progress = 0.0

        if interval_429_given := (sleep_429 is not None):
            try:
                self.interval_429 = util.build_duration_func_ex(sleep_429)
            except Exception as exc:
                self.log.error("Invalid 'sleep-429' value '%s' (%s: %s)",
                               sleep_429, exc.__class__.__name__, exc)
                interval_429_given = False
        if not interval_429_given:
            self.interval_429 = extr._interval_429
96

97
    def download(self, url, pathfmt):
        """Download 'url' to 'pathfmt' and clean up after failed transfers

        Returns the result of _download_impl(). Any exception raised by
        it gets logged with its traceback and is then re-raised.
        """
        try:
            return self._download_impl(url, pathfmt)
        except Exception as exc:
            if self.downloading:
                output.stderr_write("\n")
            self.log.traceback(exc)
            raise
        finally:
            # 'downloading' is still set here only when a transfer
            # was started but did not finish
            if self.downloading:
                # remove file from incomplete downloads
                if not self.part:
                    util.remove_file(pathfmt.temppath)
                # reset the flag so that it cannot leak into the next
                # call and trigger the removal of an unrelated file there
                self.downloading = False
109

110
    def _download_impl(self, url, pathfmt):
        """Download 'url' into the file described by 'pathfmt'

        Requests are repeated while connection errors, timeouts, retriable
        status codes, errors during data transfer or incomplete files
        occur, until more than 'self.retries' retries would be needed.

        Returns True when the download finished or the file was skipped
        on purpose ('pathfmt.temppath' is set to "" in that case) and
        False when the download failed.
        """
        response = None
        tries = code = 0
        msg = ""

        metadata = self.metadata
        kwdict = pathfmt.kwdict
        expected_status = kwdict.get(
            "_http_expected_status", ())
        adjust_extension = kwdict.get(
            "_http_adjust_extension", self.adjust_extension)

        # with 'metadata' enabled, the final path is only known after
        # the response headers arrived; part files get enabled later then
        if self.part and not metadata:
            pathfmt.part_enable(self.partdir)

        while True:
            if tries:
                # 'requests.Response' objects evaluate to False for 4xx and
                # 5xx status codes, so compare against None explicitly to
                # also release the connections of error responses
                if response is not None:
                    self.release_conn(response)
                    response = None

                self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
                if tries > self.retries:
                    return False

                # wait at least 'tries' seconds before the next attempt
                if code == 429 and self.interval_429:
                    s = self.interval_429(tries)
                    time.sleep(s if s > tries else tries)
                else:
                    time.sleep(tries)
                code = 0

            tries += 1
            file_header = None

            # collect HTTP headers
            headers = {"Accept": "*/*"}
            #   file-specific headers
            if extra := kwdict.get("_http_headers"):
                headers.update(extra)
            #   general headers
            if self.headers:
                headers.update(self.headers)
            #   partial content
            if file_size := pathfmt.part_size():
                headers["Range"] = f"bytes={file_size}-"

            # connect to (remote) source
            try:
                response = self.session.request(
                    kwdict.get("_http_method", "GET"), url,
                    stream=True,
                    headers=headers,
                    data=kwdict.get("_http_data"),
                    timeout=self.timeout,
                    proxies=self.proxies,
                    verify=self.verify,
                )
            except ConnectionError as exc:
                # try to build a short message from the wrapped reason
                # and fall back to the full exception text otherwise
                try:
                    reason = exc.args[0].reason
                    cls = reason.__class__.__name__
                    pre, _, err = str(reason.args[-1]).partition(":")
                    msg = f"{cls}: {(err or pre).lstrip()}"
                except Exception:
                    msg = str(exc)
                continue
            except Timeout as exc:
                msg = str(exc)
                continue
            except Exception as exc:
                self.log.warning(exc)
                return False

            # check response
            code = response.status_code
            if code == 200 or code in expected_status:  # OK
                offset = 0
                size = response.headers.get("Content-Length")
            elif code == 206:  # Partial Content
                offset = file_size
                size = response.headers["Content-Range"].rpartition("/")[2]
            elif code == 416 and file_size:  # Requested Range Not Satisfiable
                break
            else:
                msg = f"'{code} {response.reason}' for '{url}'"

                challenge = util.detect_challenge(response)
                if challenge is not None:
                    self.log.warning(challenge)

                if code in self.retry_codes or 500 <= code < 600:
                    continue
                retry = kwdict.get("_http_retry")
                if retry and retry(response):
                    continue
                self.release_conn(response)
                self.log.warning(msg)
                return False

            # check for invalid responses
            if self.validate and \
                    (validate := kwdict.get("_http_validate")) is not None:
                try:
                    result = validate(response)
                except Exception:
                    self.release_conn(response)
                    raise
                if isinstance(result, str):
                    # a string result is used as replacement URL;
                    # release the current response before requesting it
                    self.release_conn(response)
                    response = None
                    url = result
                    tries -= 1
                    continue
                if not result:
                    self.release_conn(response)
                    self.log.warning("Invalid response")
                    return False
            if self.validate_html and response.headers.get(
                    "content-type", "").startswith("text/html") and \
                    pathfmt.extension not in ("html", "htm"):
                if response.history:
                    self.log.warning("HTTP redirect to '%s'", response.url)
                else:
                    self.log.warning("HTML response")
                # release the connection like all other rejection paths do
                self.release_conn(response)
                return False

            # check file size
            size = text.parse_int(size, None)
            if size is not None:
                if not size:
                    self.release_conn(response)
                    self.log.warning("Empty file")
                    return False
                if self.minsize and size < self.minsize:
                    self.release_conn(response)
                    self.log.warning(
                        "File size smaller than allowed minimum (%s < %s)",
                        size, self.minsize)
                    pathfmt.temppath = ""
                    return True
                if self.maxsize and size > self.maxsize:
                    self.release_conn(response)
                    self.log.warning(
                        "File size larger than allowed maximum (%s > %s)",
                        size, self.maxsize)
                    pathfmt.temppath = ""
                    return True

            build_path = False

            # set missing filename extension from MIME type
            if not pathfmt.extension:
                pathfmt.set_extension(self._find_extension(response))
                build_path = True

            # set metadata from HTTP headers
            if metadata:
                kwdict[metadata] = util.extract_headers(response)
                build_path = True

            # build and check file path
            if build_path:
                pathfmt.build_path()
                if pathfmt.exists():
                    pathfmt.temppath = ""
                    # release the connection back to pool by explicitly
                    # calling .close()
                    # see https://requests.readthedocs.io/en/latest/user
                    # /advanced/#body-content-workflow
                    # when the image size is on the order of megabytes,
                    # re-establishing a TLS connection will typically be faster
                    # than consuming the whole response
                    response.close()
                    return True
                if self.part and metadata:
                    pathfmt.part_enable(self.partdir)
                # header metadata only needs to be collected once
                metadata = False

            content = response.iter_content(self.chunk_size)

            validate_sig = kwdict.get("_http_signature")
            validate_ext = (adjust_extension and
                            pathfmt.extension in SIGNATURE_CHECKS)

            # check filename extension against file header
            if not offset and (validate_ext or validate_sig):
                try:
                    file_header = next(
                        content if response.raw.chunked
                        else response.iter_content(16), b"")
                except (RequestException, SSLError) as exc:
                    msg = str(exc)
                    continue
                if validate_sig:
                    result = validate_sig(file_header)
                    if result is not True:
                        self.release_conn(response)
                        self.log.warning(
                            result or "Invalid file signature bytes")
                        return False
                if validate_ext and self._adjust_extension(
                        pathfmt, file_header) and pathfmt.exists():
                    pathfmt.temppath = ""
                    response.close()
                    return True

            # set open mode
            if not offset:
                mode = "w+b"
                if file_size:
                    self.log.debug("Unable to resume partial download")
            else:
                mode = "r+b"
                self.log.debug("Resuming download at byte %d", offset)

            # download content
            self.downloading = True
            with pathfmt.open(mode) as fp:
                if fp is None:
                    # '.part' file no longer exists
                    break
                if file_header:
                    # the bytes read for the signature check
                    # are part of the file content
                    fp.write(file_header)
                    offset += len(file_header)
                elif offset:
                    # resumed download: check the extension against
                    # the header of the existing partial file
                    if adjust_extension and \
                            pathfmt.extension in SIGNATURE_CHECKS:
                        self._adjust_extension(pathfmt, fp.read(16))
                    fp.seek(offset)

                self.out.start(pathfmt.path)
                try:
                    self.receive(fp, content, size, offset)
                except (RequestException, SSLError) as exc:
                    msg = str(exc)
                    output.stderr_write("\n")
                    continue
                except exception.StopExtraction:
                    response.close()
                    return False
                except exception.ControlException:
                    response.close()
                    raise

                # check file size
                if size and (fsize := fp.tell()) < size:
                    if (segmented := kwdict.get("_http_segmented")) and \
                            segmented is True or segmented == fsize:
                        # do not count an expected segment boundary
                        # as failed attempt
                        tries -= 1
                        msg = "Resuming segmented download"
                        output.stdout_write("\r")
                    else:
                        msg = f"file size mismatch ({fsize} < {size})"
                        output.stderr_write("\n")
                    continue

            break

        self.downloading = False
        if self.mtime:
            if "_http_lastmodified" in kwdict:
                kwdict["_mtime_http"] = kwdict["_http_lastmodified"]
            else:
                kwdict["_mtime_http"] = response.headers.get("Last-Modified")
        else:
            kwdict["_mtime_http"] = None

        return True
377

378
    def release_conn(self, response):
        """Read and discard the response body to free up its connection

        When reading fails, the response gets closed instead.
        """
        try:
            chunks = response.iter_content(self.chunk_size)
            for _chunk in chunks:
                continue
        except (RequestException, SSLError) as exc:
            output.stderr_write("\n")
            self.log.debug(
                "Unable to consume response body (%s: %s); "
                "closing the connection anyway", exc.__class__.__name__, exc)
            response.close()
389

390
    def receive(self, fp, content, bytes_total, bytes_start):
        """Write all chunks from 'content' to 'fp'

        Stops early and returns the result of FLAGS.process("DOWNLOAD")
        as soon as FLAGS.DOWNLOAD is set. 'bytes_total' and 'bytes_start'
        are unused here.
        """
        emit = fp.write
        for chunk in content:
            if FLAGS.DOWNLOAD is None:
                emit(chunk)
                continue
            return FLAGS.process("DOWNLOAD")
396

397
    def _receive_rate(self, fp, content, bytes_total, bytes_start):
        """Write 'content' to 'fp' with progress output and rate limiting

        Progress is reported through self.out.progress() once more than
        'self.progress' seconds have passed. When 'self.rate' is set,
        the loop sleeps whenever data arrives faster than that rate.
        Stops early when FLAGS.DOWNLOAD is set.
        """
        limit = self.rate() if self.rate else None
        emit = fp.write
        interval = self.progress

        received = 0
        started = time.monotonic()

        for chunk in content:
            if FLAGS.DOWNLOAD is not None:
                return FLAGS.process("DOWNLOAD")
            elapsed = time.monotonic() - started
            received += len(chunk)

            emit(chunk)

            if interval is not None and elapsed > interval:
                self.out.progress(
                    bytes_total,
                    bytes_start + received,
                    int(received / elapsed),
                )

            if limit is None:
                continue
            # time the data received so far should have taken
            expected = received / limit
            if expected > elapsed:
                time.sleep(expected - elapsed)
425

426
    def _find_extension(self, response):
        """Get filename extension from MIME type

        Uses the response's Content-Type header, "image/jpeg" if there
        is none, without its parameters. Types missing from MIME_TYPES
        are looked up with the 'mimetypes' module. Unknown types cause
        a warning and result in "bin".
        """
        mtype = response.headers.get("Content-Type", "image/jpeg")
        # drop parameters ("; charset=...") and the optional whitespace
        # that is allowed between the media type and the ';'
        mtype = mtype.partition(";")[0].strip().lower()

        # treat a bare subtype as image type
        if "/" not in mtype:
            mtype = "image/" + mtype

        if mtype in MIME_TYPES:
            return MIME_TYPES[mtype]

        if ext := mimetypes.guess_extension(mtype, strict=False):
            # remove the leading '.'
            return ext[1:]

        self.log.warning("Unknown MIME type '%s'", mtype)
        return "bin"
442

443
    def _adjust_extension(self, pathfmt, file_header):
        """Replace a filename extension that contradicts the file header

        Returns True if the extension was changed and the path rebuilt,
        False if the current extension matches or no signature applies.
        """
        if SIGNATURE_CHECKS[pathfmt.extension](file_header):
            return False

        # first extension whose signature check accepts the header
        detected = next(
            (name for name, matches in SIGNATURE_CHECKS.items()
             if matches(file_header)),
            None)
        if detected is None:
            return False

        self.log.debug(
            "Adjusting filename extension of '%s' to '%s'",
            pathfmt.filename, detected)
        pathfmt.set_extension(detected)
        pathfmt.build_path()
        return True
455

456

457
# Maps lowercase MIME types to filename extensions (without leading dot)
MIME_TYPES = {
    # images
    "image/jpeg": "jpg",
    "image/jpg": "jpg",
    "image/png": "png",
    "image/gif": "gif",
    "image/bmp": "bmp",
    "image/x-bmp": "bmp",
    "image/x-ms-bmp": "bmp",
    "image/webp": "webp",
    "image/avif": "avif",
    "image/heic": "heic",
    "image/heif": "heif",
    "image/svg+xml": "svg",
    "image/ico": "ico",
    "image/icon": "ico",
    "image/x-icon": "ico",
    "image/vnd.microsoft.icon": "ico",
    "image/x-photoshop": "psd",
    "application/x-photoshop": "psd",
    "image/vnd.adobe.photoshop": "psd",

    # video
    "video/webm": "webm",
    "video/ogg": "ogg",
    "video/mp4": "mp4",
    "video/m4v": "m4v",
    "video/x-m4v": "m4v",
    "video/quicktime": "mov",

    # audio
    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/webm": "webm",
    "audio/ogg": "ogg",
    "audio/mpeg": "mp3",
    "audio/aac": "aac",
    "audio/x-aac": "aac",

    # streaming playlists and manifests
    "application/vnd.apple.mpegurl": "m3u8",
    "application/x-mpegurl": "m3u8",
    "application/dash+xml": "mpd",

    # archives
    "application/zip": "zip",
    "application/x-zip": "zip",
    "application/x-zip-compressed": "zip",
    "application/rar": "rar",
    "application/x-rar": "rar",
    "application/x-rar-compressed": "rar",
    "application/x-7z-compressed": "7z",

    # documents
    "application/pdf": "pdf",
    "application/x-pdf": "pdf",
    "application/x-shockwave-flash": "swf",

    "text/html": "html",

    "application/ogg": "ogg",
    # https://www.iana.org/assignments/media-types/model/obj
    "model/obj": "obj",
    "application/octet-stream": "bin",
}
516

517

518
def _signature_html(s):
    """Return True if 's' looks like the beginning of an HTML doctype

    The first 14 bytes of 's', stripped of leading whitespace, must be
    a non-empty, case-insensitive prefix of '<!doctype html'.
    """
    s = s[:14].lstrip()
    # bool() makes empty input yield False instead of b"",
    # so that this check returns a boolean like all others
    return bool(s) and b"<!doctype html".startswith(s.lower())


# Maps filename extensions to functions that take the first bytes
# of a file and tell whether they match that file type.
# https://en.wikipedia.org/wiki/List_of_file_signatures
SIGNATURE_CHECKS = {
    "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
    "png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n",
    "gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"),
    "bmp" : lambda s: s[0:2] == b"BM",
    "webp": lambda s: (s[0:4] == b"RIFF" and
                       s[8:12] == b"WEBP"),
    # slice instead of index, since s[11] raises an IndexError
    # for headers that end right after 'ftypavi'
    "avif": lambda s: (s[4:11] == b"ftypavi" and
                       s[11:12] in (b"f", b"s")),
    "heic": lambda s: (s[4:10] == b"ftyphe" and s[10:12] in (
                       b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")),
    "svg" : lambda s: s[0:5] == b"<?xml",
    "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00",
    "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
    "psd" : lambda s: s[0:4] == b"8BPS",
    "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in (
                       b"mp4", b"avc", b"iso")),
    "m4v" : lambda s: s[4:11] == b"ftypM4V",
    "mov" : lambda s: s[4:12] == b"ftypqt  ",
    "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3",
    "ogg" : lambda s: s[0:4] == b"OggS",
    "wav" : lambda s: (s[0:4] == b"RIFF" and
                       s[8:12] == b"WAVE"),
    "mp3" : lambda s: (s[0:3] == b"ID3" or
                       s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")),
    "aac" : lambda s: s[0:2] in (b"\xFF\xF9", b"\xFF\xF1"),
    "m3u8": lambda s: s[0:7] == b"#EXTM3U",
    "mpd" : lambda s: b"<MPD" in s,
    "zip" : lambda s: s[0:4] in (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"),
    "rar" : lambda s: s[0:6] == b"Rar!\x1A\x07",
    "7z"  : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
    "pdf" : lambda s: s[0:5] == b"%PDF-",
    "swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
    "html": _signature_html,
    "htm" : _signature_html,
    "blend": lambda s: s[0:7] == b"BLENDER",
    # unfortunately the Wavefront .obj format doesn't have a signature,
    # so we check for the existence of Blender's comment
    "obj" : lambda s: s[0:11] == b"# Blender v",
    # Celsys Clip Studio Paint format
    # https://github.com/rasensuihei/cliputils/blob/master/README.md
    "clip": lambda s: s[0:8] == b"CSFCHUNK",
    # check 'bin' files against all other file signatures
    "bin" : lambda s: False,
}
568

569
__downloader__ = HttpDownloader
570

571
Product

Resources

Company