"""Downloader module for http:// and https:// URLs"""
import time
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
from .. import text, util, output, exception
from ssl import SSLError
# Module-level alias for 'util.FLAGS'; the receive loops of the downloader
# class in this module check 'FLAGS.DOWNLOAD' before writing each chunk
FLAGS = util.FLAGS
class HttpDownloader(DownloaderBase):
    """Downloader for http:// and https:// URLs

    Options are read once in __init__(). download() then fetches a single
    URL into the location described by a path-format object, retrying
    failed attempts and resuming partial files with HTTP Range requests.

    NOTE(review): 'config', 'log', 'out', 'part', 'partdir', 'session' and
    'proxies' are not defined in this class; presumably DownloaderBase
    provides them - confirm against the base class.
    """
    scheme = "http"

    def __init__(self, job):
        """Read and normalize all downloader options for 'job'"""
        DownloaderBase.__init__(self, job)
        extractor = job.extractor
        # True while a file is being written; download() checks it to
        # decide whether an incomplete file has to be cleaned up
        self.downloading = False

        self.adjust_extension = self.config("adjust-extensions", True)
        self.chunk_size = self.config("chunk-size", 32768)
        self.metadata = extractor.config("http-metadata")
        self.progress = self.config("progress", 3.0)
        self.validate = self.config("validate", True)
        self.validate_html = self.config("validate-html", True)
        self.headers = self.config("headers")
        self.minsize = self.config("filesize-min")
        self.maxsize = self.config("filesize-max")
        self.retries = self.config("retries", extractor._retries)
        self.retry_codes = self.config("retry-codes", extractor._retry_codes)
        self.timeout = self.config("timeout", extractor._timeout)
        self.verify = self.config("verify", extractor._verify)
        self.mtime = self.config("mtime", True)
        self.rate = self.config("rate")
        interval_429 = self.config("sleep-429")

        if not self.config("consume-content", False):
            # shadow the release_conn() method on this instance with a
            # plain close() that does not read the remaining body
            self.release_conn = lambda resp: resp.close()

        # a negative retry count means "retry forever"
        if self.retries < 0:
            self.retries = float("inf")

        # size limits may be given as strings; a value that parses to
        # something falsy is reported and effectively disables the limit
        if self.minsize:
            minsize = text.parse_bytes(self.minsize)
            if not minsize:
                self.log.warning(
                    "Invalid minimum file size (%r)", self.minsize)
            self.minsize = minsize
        if self.maxsize:
            maxsize = text.parse_bytes(self.maxsize)
            if not maxsize:
                self.log.warning(
                    "Invalid maximum file size (%r)", self.maxsize)
            self.maxsize = maxsize

        if isinstance(self.chunk_size, str):
            chunk_size = text.parse_bytes(self.chunk_size)
            if not chunk_size:
                self.log.warning(
                    "Invalid chunk size (%r)", self.chunk_size)
                chunk_size = 32768
            self.chunk_size = chunk_size

        if self.rate:
            func = util.build_selection_func(self.rate, 0, text.parse_bytes)
            # NOTE(review): 'func.args[1]' looks like the upper bound of a
            # configured range and 'func()' like a single fixed value -
            # confirm against util.build_selection_func
            if rmax := func.args[1] if hasattr(func, "args") else func():
                if rmax < self.chunk_size:
                    # never read more per chunk than the rate limit
                    self.chunk_size = rmax
                self.rate = func
                self.receive = self._receive_rate
            else:
                self.log.warning("Invalid rate limit (%r)", self.rate)
                self.rate = False

        if self.progress is not None:
            # progress output is only implemented by _receive_rate()
            self.receive = self._receive_rate
            if self.progress < 0.0:
                self.progress = 0.0

        if interval_429 is None:
            self.interval_429 = extractor._interval_429
        else:
            try:
                self.interval_429 = util.build_duration_func_ex(interval_429)
            except Exception as exc:
                self.log.error("Invalid 'sleep-429' value '%s' (%s: %s)",
                               interval_429, exc.__class__.__name__, exc)
                # fall back to the extractor's setting
                self.interval_429 = extractor._interval_429

    def download(self, url, pathfmt):
        """Download 'url' to the location described by 'pathfmt'

        Returns the result of _download_impl(). Any exception is logged
        through 'self.log.traceback' and re-raised.
        """
        try:
            return self._download_impl(url, pathfmt)
        except Exception as exc:
            if self.downloading:
                output.stderr_write("\n")
            self.log.traceback(exc)
            raise
        finally:
            # 'downloading' is only reset after a completed transfer, so
            # it being set here means the file is incomplete; remove it
            # unless '.part' files are in use
            if self.downloading and not self.part:
                util.remove_file(pathfmt.temppath)

    def _download_impl(self, url, pathfmt):
        """Run the request/retry loop for a single file

        Per-file behavior is controlled by '_http_*' entries in
        'pathfmt.kwdict' (method, headers, data, expected status codes,
        validation and retry callbacks, signature check, ...).

        Returns True when the file was written or deliberately skipped
        (it already exists, or its size is outside the configured limits;
        'pathfmt.temppath' is cleared in those cases) and False when the
        download failed.
        """
        response = None
        tries = code = 0
        msg = ""

        metadata = self.metadata
        kwdict = pathfmt.kwdict
        expected_status = kwdict.get("_http_expected_status", ())
        adjust_extension = kwdict.get(
            "_http_adjust_extension", self.adjust_extension)

        # when header metadata is requested, '.part' handling is postponed
        # until after the path has been rebuilt further below
        if self.part and not metadata:
            pathfmt.part_enable(self.partdir)

        while True:
            if tries:
                # Compare against None explicitly: a requests Response is
                # falsy for status codes >= 400, so a plain truth test
                # would skip releasing exactly the error responses that
                # are being retried.
                if response is not None:
                    self.release_conn(response)
                    response = None

                self.log.warning("%s (%s/%s)", msg, tries, self.retries + 1)
                if tries > self.retries:
                    return False

                # wait at least 'tries' seconds before the next attempt
                if code == 429 and self.interval_429:
                    s = self.interval_429(tries)
                    time.sleep(s if s > tries else tries)
                else:
                    time.sleep(tries)
                code = 0

            tries += 1
            file_header = None

            # collect HTTP headers; later updates override earlier ones
            headers = {"Accept": "*/*"}
            #   file-specific headers
            if extra := kwdict.get("_http_headers"):
                headers.update(extra)
            #   configured headers
            if self.headers:
                headers.update(self.headers)
            #   resume a partial download
            if file_size := pathfmt.part_size():
                headers["Range"] = f"bytes={file_size}-"

            # send the request
            try:
                response = self.session.request(
                    kwdict.get("_http_method", "GET"), url,
                    stream=True,
                    headers=headers,
                    data=kwdict.get("_http_data"),
                    timeout=self.timeout,
                    proxies=self.proxies,
                    verify=self.verify,
                )
            except ConnectionError as exc:
                # try to shorten the message to '<class>: <reason>'
                try:
                    reason = exc.args[0].reason
                    cls = reason.__class__.__name__
                    pre, _, err = str(reason.args[-1]).partition(":")
                    msg = f"{cls}: {(err or pre).lstrip()}"
                except Exception:
                    msg = str(exc)
                continue
            except Timeout as exc:
                msg = str(exc)
                continue
            except Exception as exc:
                # any other error is not retried
                self.log.warning(exc)
                return False

            # check the status code
            code = response.status_code
            if code == 200 or code in expected_status:
                offset = 0
                size = response.headers.get("Content-Length")
            elif code == 206:
                # partial content: the total size follows the '/' in
                # the Content-Range header
                offset = file_size
                size = response.headers["Content-Range"].rpartition("/")[2]
            elif code == 416 and file_size:
                # the requested range starts at or past the end of the
                # file; treat the existing partial file as complete
                break
            else:
                msg = f"'{code} {response.reason}' for '{url}'"

                challenge = util.detect_challenge(response)
                if challenge is not None:
                    self.log.warning(challenge)

                if code in self.retry_codes or 500 <= code < 600:
                    continue
                retry = kwdict.get("_http_retry")
                if retry and retry(response):
                    continue
                self.release_conn(response)
                self.log.warning(msg)
                return False

            # run the file-specific validation callback
            if self.validate and \
                    (validate := kwdict.get("_http_validate")) is not None:
                try:
                    result = validate(response)
                except Exception:
                    self.release_conn(response)
                    raise
                if isinstance(result, str):
                    # the callback returned a replacement URL; request
                    # it without counting this attempt
                    url = result
                    tries -= 1
                    continue
                if not result:
                    self.release_conn(response)
                    self.log.warning("Invalid response")
                    return False

            # reject HTML when a non-HTML file was expected
            if self.validate_html and response.headers.get(
                    "content-type", "").startswith("text/html") and \
                    pathfmt.extension not in ("html", "htm"):
                if response.history:
                    self.log.warning("HTTP redirect to '%s'", response.url)
                else:
                    self.log.warning("HTML response")
                # release the connection like every other failure path
                self.release_conn(response)
                return False

            # check the announced file size
            size = text.parse_int(size, None)
            if size is not None:
                if not size:
                    self.release_conn(response)
                    self.log.warning("Empty file")
                    return False
                if self.minsize and size < self.minsize:
                    self.release_conn(response)
                    self.log.warning(
                        "File size smaller than allowed minimum (%s < %s)",
                        size, self.minsize)
                    pathfmt.temppath = ""
                    return True
                if self.maxsize and size > self.maxsize:
                    self.release_conn(response)
                    self.log.warning(
                        "File size larger than allowed maximum (%s > %s)",
                        size, self.maxsize)
                    pathfmt.temppath = ""
                    return True

            build_path = False

            # set a missing filename extension from the MIME type
            if not pathfmt.extension:
                pathfmt.set_extension(self._find_extension(response))
                build_path = True

            # store response header metadata under the configured key
            if metadata:
                kwdict[metadata] = util.extract_headers(response)
                build_path = True

            # rebuild the path and skip files that already exist
            if build_path:
                pathfmt.build_path()
                if pathfmt.exists():
                    pathfmt.temppath = ""
                    response.close()
                    return True
                if self.part and metadata:
                    pathfmt.part_enable(self.partdir)
                # only handle metadata once per file
                metadata = False

            content = response.iter_content(self.chunk_size)

            validate_sig = kwdict.get("_http_signature")
            validate_ext = (adjust_extension and
                            pathfmt.extension in SIGNATURE_CHECKS)

            # inspect the first bytes of a fresh (non-resumed) download
            if not offset and (validate_ext or validate_sig):
                try:
                    file_header = next(
                        content if response.raw.chunked
                        else response.iter_content(16), b"")
                except (RequestException, SSLError) as exc:
                    msg = str(exc)
                    continue
                if validate_sig:
                    result = validate_sig(file_header)
                    if result is not True:
                        self.release_conn(response)
                        self.log.warning(
                            result or "Invalid file signature bytes")
                        return False
                if validate_ext and self._adjust_extension(
                        pathfmt, file_header) and pathfmt.exists():
                    # a file with the corrected extension already exists
                    pathfmt.temppath = ""
                    response.close()
                    return True

            # choose the file open mode
            if not offset:
                mode = "w+b"
                if file_size:
                    self.log.debug("Unable to resume partial download")
            else:
                mode = "r+b"
                self.log.debug("Resuming download at byte %d", offset)

            # download content
            self.downloading = True
            with pathfmt.open(mode) as fp:
                if fp is None:
                    # nothing to write to; stop retrying
                    break
                if file_header:
                    # write the bytes already consumed for the checks
                    fp.write(file_header)
                    offset += len(file_header)
                elif offset:
                    # resuming: check the extension against the bytes
                    # already on disk, then continue at the old end
                    if adjust_extension and \
                            pathfmt.extension in SIGNATURE_CHECKS:
                        self._adjust_extension(pathfmt, fp.read(16))
                    fp.seek(offset)

                self.out.start(pathfmt.path)
                try:
                    self.receive(fp, content, size, offset)
                except (RequestException, SSLError) as exc:
                    msg = str(exc)
                    output.stderr_write("\n")
                    continue
                except exception.StopExtraction:
                    response.close()
                    return False
                except exception.ControlException:
                    response.close()
                    raise

                # compare the bytes on disk with the announced size
                if size and (fsize := fp.tell()) < size:
                    segmented = kwdict.get("_http_segmented")
                    # Parenthesized on purpose: without grouping,
                    # 'a and b or c' evaluates 'c' for a falsy 'a', so a
                    # value of False/0 would compare equal to fsize == 0.
                    if segmented and (
                            segmented is True or segmented == fsize):
                        # expected for segmented files; fetch the next
                        # segment without counting this as a failure
                        tries -= 1
                        msg = "Resuming segmented download"
                        output.stdout_write("\r")
                    else:
                        msg = f"file size mismatch ({fsize} < {size})"
                        output.stderr_write("\n")
                    continue

            break

        self.downloading = False
        if self.mtime:
            # an explicit '_http_lastmodified' value takes precedence
            # over the response's Last-Modified header
            if "_http_lastmodified" in kwdict:
                kwdict["_mtime_http"] = kwdict["_http_lastmodified"]
            else:
                kwdict["_mtime_http"] = response.headers.get("Last-Modified")
        else:
            kwdict["_mtime_http"] = None

        return True

    def release_conn(self, response):
        """Release connection back to pool by consuming response body

        Errors while reading the remaining body are logged at debug
        level. The response is closed afterwards in either case.
        """
        try:
            for _ in response.iter_content(self.chunk_size):
                pass
        except (RequestException, SSLError) as exc:
            output.stderr_write("\n")
            self.log.debug(
                "Unable to consume response body (%s: %s); "
                "closing the connection anyway", exc.__class__.__name__, exc)
        response.close()

    def receive(self, fp, content, bytes_total, bytes_start):
        """Write all chunks from 'content' to 'fp'

        'bytes_total' and 'bytes_start' are not used here; they keep the
        signature identical to _receive_rate(). Stops early and returns
        the result of FLAGS.process("DOWNLOAD") once FLAGS.DOWNLOAD is set.
        """
        write = fp.write
        for data in content:
            if FLAGS.DOWNLOAD is not None:
                return FLAGS.process("DOWNLOAD")
            write(data)

    def _receive_rate(self, fp, content, bytes_total, bytes_start):
        """Write 'content' to 'fp' with progress output and rate limit

        Once more than 'self.progress' seconds have elapsed, progress is
        reported through 'self.out.progress' after each chunk. With a rate
        limit, the loop sleeps whenever it is ahead of the expected time.
        """
        rate = self.rate() if self.rate else None
        write = fp.write
        progress = self.progress

        bytes_downloaded = 0
        time_start = time.monotonic()

        for data in content:
            if FLAGS.DOWNLOAD is not None:
                return FLAGS.process("DOWNLOAD")

            time_elapsed = time.monotonic() - time_start
            bytes_downloaded += len(data)

            write(data)

            if progress is not None:
                # progress is >= 0.0 (see __init__), so time_elapsed is
                # positive whenever it is used as divisor here
                if time_elapsed > progress:
                    self.out.progress(
                        bytes_total,
                        bytes_start + bytes_downloaded,
                        int(bytes_downloaded / time_elapsed),
                    )

            if rate is not None:
                # time the transfer should have taken at 'rate' bytes/s
                time_expected = bytes_downloaded / rate
                if time_expected > time_elapsed:
                    time.sleep(time_expected - time_elapsed)

    def _find_extension(self, response):
        """Get filename extension from MIME type

        Falls back to "image/jpeg" when the response has no Content-Type
        header and returns "bin" for MIME types that cannot be resolved.
        """
        mtype = response.headers.get("Content-Type", "image/jpeg")
        # drop parameters such as '; charset=...'; whitespace is allowed
        # around the ';', so strip it before looking the type up
        mtype = mtype.partition(";")[0].strip().lower()

        # a bare subtype like 'png' is treated as an image type
        if "/" not in mtype:
            mtype = "image/" + mtype

        if mtype in MIME_TYPES:
            return MIME_TYPES[mtype]

        if ext := mimetypes.guess_extension(mtype, strict=False):
            # guess_extension() includes the leading dot
            return ext[1:]

        self.log.warning("Unknown MIME type '%s'", mtype)
        return "bin"

    def _adjust_extension(self, pathfmt, file_header):
        """Check filename extension against file header

        When the header does not match the current extension, switch to
        the first extension in SIGNATURE_CHECKS whose check accepts it and
        rebuild the path. Returns True if the extension was changed.
        """
        if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
            for ext, check in SIGNATURE_CHECKS.items():
                if check(file_header):
                    self.log.debug(
                        "Adjusting filename extension of '%s' to '%s'",
                        pathfmt.filename, ext)
                    pathfmt.set_extension(ext)
                    pathfmt.build_path()
                    return True
        return False
# Maps MIME types to filename extensions. The table is written as
# (extension, aliases) groups and flattened in exactly that order; an
# extension may appear in more than one group.
MIME_TYPES = {
    mime: ext
    for ext, aliases in (
        ("jpg", ("image/jpeg", "image/jpg")),
        ("png", ("image/png",)),
        ("gif", ("image/gif",)),
        ("bmp", ("image/bmp", "image/x-bmp", "image/x-ms-bmp")),
        ("webp", ("image/webp",)),
        ("avif", ("image/avif",)),
        ("heic", ("image/heic",)),
        ("heif", ("image/heif",)),
        ("svg", ("image/svg+xml",)),
        ("ico", (
            "image/ico",
            "image/icon",
            "image/x-icon",
            "image/vnd.microsoft.icon",
        )),
        ("psd", (
            "image/x-photoshop",
            "application/x-photoshop",
            "image/vnd.adobe.photoshop",
        )),

        ("webm", ("video/webm",)),
        ("ogg", ("video/ogg",)),
        ("mp4", ("video/mp4",)),
        ("m4v", ("video/m4v", "video/x-m4v")),
        ("mov", ("video/quicktime",)),

        ("wav", ("audio/wav", "audio/x-wav")),
        ("webm", ("audio/webm",)),
        ("ogg", ("audio/ogg",)),
        ("mp3", ("audio/mpeg",)),
        ("aac", ("audio/aac", "audio/x-aac")),

        ("m3u8", (
            "application/vnd.apple.mpegurl",
            "application/x-mpegurl",
        )),
        ("mpd", ("application/dash+xml",)),

        ("zip", (
            "application/zip",
            "application/x-zip",
            "application/x-zip-compressed",
        )),
        ("rar", (
            "application/rar",
            "application/x-rar",
            "application/x-rar-compressed",
        )),
        ("7z", ("application/x-7z-compressed",)),

        ("pdf", ("application/pdf", "application/x-pdf")),
        ("swf", ("application/x-shockwave-flash",)),

        ("html", ("text/html",)),
        ("ogg", ("application/ogg",)),
        ("obj", ("model/obj",)),
        ("bin", ("application/octet-stream",)),
    )
    for mime in aliases
}
def _signature_html(s):
    """Return True if 's' starts like an HTML doctype declaration

    Only the first 14 bytes are considered. Leading whitespace within them
    is ignored and the comparison is case-insensitive. A header shorter
    than the full declaration matches as long as it is a prefix of it;
    empty or all-whitespace input does not match.
    """
    s = s[:14].lstrip()
    return bool(s) and b"<!doctype html".startswith(s.lower())


# Maps filename extensions to a check function that takes the first bytes
# of a file and returns a truthy value if they match that file type.
# Iteration order matters to callers that pick the first matching entry.
# All checks use slices instead of single-byte indexing, so that headers
# shorter than a signature yield a non-match instead of an IndexError.
SIGNATURE_CHECKS = {
    "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF",
    "png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n",
    "gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"),
    "bmp" : lambda s: s[0:2] == b"BM",
    "webp": lambda s: (s[0:4] == b"RIFF" and
                       s[8:12] == b"WEBP"),
    # brands 'avif' and 'avis'
    "avif": lambda s: s[4:12] in (b"ftypavif", b"ftypavis"),
    "heic": lambda s: (s[4:10] == b"ftyphe" and s[10:12] in (
        b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")),
    "svg" : lambda s: s[0:5] == b"<?xml",
    "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00",
    "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
    "psd" : lambda s: s[0:4] == b"8BPS",
    "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in (
        b"mp4", b"avc", b"iso")),
    "m4v" : lambda s: s[4:11] == b"ftypM4V",
    # the QuickTime brand is 'qt' padded with two spaces to four bytes;
    # the spaces are written as escapes so they cannot get lost
    "mov" : lambda s: s[4:12] == b"ftypqt\x20\x20",
    "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3",
    "ogg" : lambda s: s[0:4] == b"OggS",
    "wav" : lambda s: (s[0:4] == b"RIFF" and
                       s[8:12] == b"WAVE"),
    "mp3" : lambda s: (s[0:3] == b"ID3" or
                       s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")),
    "aac" : lambda s: s[0:2] in (b"\xFF\xF9", b"\xFF\xF1"),
    "m3u8": lambda s: s[0:7] == b"#EXTM3U",
    "mpd" : lambda s: b"<MPD" in s,
    "zip" : lambda s: s[0:4] in (
        b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"),
    "rar" : lambda s: s[0:6] == b"Rar!\x1A\x07",
    "7z"  : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C",
    "pdf" : lambda s: s[0:5] == b"%PDF-",
    "swf" : lambda s: s[0:3] in (b"CWS", b"FWS"),
    "html": _signature_html,
    "htm" : _signature_html,
    "blend": lambda s: s[0:7] == b"BLENDER",
    "obj" : lambda s: s[0:11] == b"# Blender v",
    "clip": lambda s: s[0:8] == b"CSFCHUNK",
    # never matches: 'bin' is the fallback for unknown content, so a
    # check against it always fails
    "bin" : lambda s: False,
}
# Module-level reference to this module's downloader class
# NOTE(review): presumably the name the downloader package looks up when
# loading this module - confirm against the loader
__downloader__ = HttpDownloader