# GitHub Repository: corpnewt/gibMacOS
# Path: blob/master/Scripts/downloader.py

import sys, os, time, ssl, gzip, multiprocessing
from io import BytesIO
# Python-aware urllib stuff
try:
    from urllib.request import urlopen, Request
    import queue as q
except ImportError:
    # Import urllib2 to catch errors
    import urllib2
    from urllib2 import urlopen, Request
    import Queue as q

TERMINAL_WIDTH = 120 if os.name=="nt" else 80

def get_size(size, suffix=None, use_1024=False, round_to=2, strip_zeroes=False):
    # size is the number of bytes
    # suffix is the target suffix to locate (B, KB, MB, etc) - if found
    # use_1024 denotes whether or not we display in MiB vs MB
    # round_to is the number of decimal points to round our result to (0-15)
    # strip_zeroes denotes whether we strip out zeroes

    # Failsafe in case our size is unknown
    if size == -1:
        return "Unknown"
    # Get our suffixes based on use_1024
    ext = ["B","KiB","MiB","GiB","TiB","PiB"] if use_1024 else ["B","KB","MB","GB","TB","PB"]
    div = 1024 if use_1024 else 1000
    s = float(size)
    s_dict = {} # Initialize our dict
    # Iterate the ext list, and divide by 1000 or 1024 each time to setup the dict {ext:val}
    for e in ext:
        s_dict[e] = s
        s /= div
    # Get our suffix if provided - will be set to None if not found, or if started as None
    suffix = next((x for x in ext if x.lower() == suffix.lower()),None) if suffix else suffix
    # Get the largest value that's still over 1
    biggest = suffix if suffix else next((x for x in ext[::-1] if s_dict[x] >= 1), "B")
    # Determine our rounding approach - first make sure it's an int; default to 2 on error
    try: round_to = int(round_to)
    except: round_to = 2
    round_to = 0 if round_to < 0 else 15 if round_to > 15 else round_to # Ensure it's between 0 and 15
    bval = round(s_dict[biggest], round_to)
    # Split our number based on decimal points
    a,b = str(bval).split(".")
    # Check if we need to strip or pad zeroes
    b = b.rstrip("0") if strip_zeroes else b.ljust(round_to,"0") if round_to > 0 else ""
    return "{:,}{} {}".format(int(a),"" if not b else "."+b,biggest)

def _process_hook(queue, total_size, bytes_so_far=0, update_interval=1.0, max_packets=0):
    packets = []
    speed = remaining = ""
    last_update = time.time()
    while True:
        # Write our info first so we have *some* status while
        # waiting for packets
        if total_size > 0:
            percent = float(bytes_so_far) / total_size
            percent = round(percent*100, 2)
            t_s = get_size(total_size)
            try:
                b_s = get_size(bytes_so_far, t_s.split(" ")[1])
            except:
                b_s = get_size(bytes_so_far)
            perc_str = " {:.2f}%".format(percent)
            bar_width = (TERMINAL_WIDTH // 3)-len(perc_str)
            progress = "=" * int(bar_width * (percent/100))
            sys.stdout.write("\r\033[K{}/{} | {}{}{}{}{}".format(
                b_s,
                t_s,
                progress,
                " " * (bar_width-len(progress)),
                perc_str,
                speed,
                remaining
            ))
        else:
            b_s = get_size(bytes_so_far)
            sys.stdout.write("\r\033[K{}{}".format(b_s, speed))
        sys.stdout.flush()
        # Now we gather the next packet
        try:
            packet = queue.get(timeout=update_interval)
            # Packets should be formatted as a tuple of
            # (timestamp, len(bytes_downloaded))
            # If "DONE" is passed, we assume the download
            # finished - and bail
            if packet == "DONE":
                print("") # Jump to the next line
                return
            # Append our packet to the list and ensure we're not
            # beyond our max.
            # Only check max if it's > 0
            packets.append(packet)
            if max_packets > 0:
                packets = packets[-max_packets:]
            # Increment our bytes so far as well
            bytes_so_far += packet[1]
        except q.Empty:
            # Didn't get anything - reset the speed
            # and packets
            packets = []
            speed = " | 0 B/s"
            remaining = " | ?? left" if total_size > 0 else ""
        except KeyboardInterrupt:
            print("") # Jump to the next line
            return
        # If we have packets and it's time for an update, process
        # the info.
        update_check = time.time()
        if packets and update_check - last_update >= update_interval:
            last_update = update_check # Refresh our update timestamp
            speed = " | ?? B/s"
            if len(packets) > 1:
                # Let's calculate the amount downloaded over how long
                try:
                    first,last = packets[0][0],packets[-1][0]
                    chunks = sum([float(x[1]) for x in packets])
                    t = last-first
                    assert t >= 0
                    bytes_speed = 1. / t * chunks
                    speed = " | {}/s".format(get_size(bytes_speed,round_to=1))
                    # Get our remaining time
                    if total_size > 0:
                        seconds_left = (total_size-bytes_so_far) / bytes_speed
                        days = seconds_left // 86400
                        hours = (seconds_left - (days*86400)) // 3600
                        mins = (seconds_left - (days*86400) - (hours*3600)) // 60
                        secs = seconds_left - (days*86400) - (hours*3600) - (mins*60)
                        if days > 99 or bytes_speed == 0:
                            remaining = " | ?? left"
                        else:
                            remaining = " | {}{:02d}:{:02d}:{:02d} left".format(
                                "{}:".format(int(days)) if days else "",
                                int(hours),
                                int(mins),
                                int(round(secs))
                            )
                except:
                    pass
            # Clear the packets so we don't reuse the same ones
            packets = []
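
# A hypothetical sketch of driving _process_hook by hand (illustrative only;
# the Downloader class below feeds it packets in exactly this shape):
#   queue = multiprocessing.Queue()
#   proc = multiprocessing.Process(target=_process_hook, args=(queue, 1000000))
#   proc.daemon = True
#   proc.start()
#   queue.put((time.time(), 65536))  # one (timestamp, byte_count) packet
#   queue.put("DONE")                # sentinel - prints a newline and exits
#   proc.join()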

class Downloader:

    def __init__(self,**kwargs):
        self.ua = kwargs.get("useragent",{"User-Agent":"Mozilla"})
        self.chunk = None # Auto-assign if None, otherwise explicit
        self.min_chunk = 1024 # 1 KiB min chunk size
        self.max_chunk = 1024 * 1024 * 4 # 4 MiB max chunk size
        self.chunk_rate = 0.1 # Update every 0.1 seconds
        self.chunk_growth = 1.5 # Max multiplier for chunk growth
        if os.name=="nt": os.system("color") # Initialize cmd for ANSI escapes
        # Provide reasonable default logic to work around macOS CA file handling
        cafile = ssl.get_default_verify_paths().openssl_cafile
        try:
            # If the default OpenSSL CA file does not exist, use the one from certifi
            if not os.path.exists(cafile):
                import certifi
                cafile = certifi.where()
            self.ssl_context = ssl.create_default_context(cafile=cafile)
        except:
            # None of the above worked - disable certificate verification for now
            self.ssl_context = ssl._create_unverified_context()
        return
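
    # Construction sketch (the useragent value here is illustrative):
    #   dl = Downloader(useragent={"User-Agent":"MyAgent/1.0"})
    # Any other kwargs are ignored; the chunk sizing attributes set above
    # can be tuned on the instance after construction if needed.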

    def _decode(self, value, encoding="utf-8", errors="ignore"):
        # Helper method to only decode if bytes type
        if sys.version_info >= (3,0) and isinstance(value, bytes):
            return value.decode(encoding,errors)
        return value

    def _update_main_name(self):
        # Windows running python 2 seems to have issues with multiprocessing
        # if the case of the main script's name is incorrect:
        # e.g. Downloader.py vs downloader.py
        #
        # To work around this, we try to scrape for the correct case if
        # possible.
        try:
            path = os.path.abspath(sys.modules["__main__"].__file__)
        except AttributeError as e:
            # This likely means we're running from the interpreter
            # directly
            return None
        if not os.path.isfile(path):
            return None
        # Get the file name and folder path
        name = os.path.basename(path).lower()
        fldr = os.path.dirname(path)
        # Walk the files in the folder until we find our
        # name - then steal its case and update that path
        for f in os.listdir(fldr):
            if f.lower() == name:
                # Got it
                new_path = os.path.join(fldr,f)
                sys.modules["__main__"].__file__ = new_path
                return new_path
        # If we got here, it wasn't found
        return None

    def _get_headers(self, headers = None):
        # Fall back on the default ua if none provided
        target = headers if isinstance(headers,dict) else self.ua
        new_headers = {}
        # Shallow copy to prevent changes to the headers
        # overriding the original
        for k in target:
            new_headers[k] = target[k]
        return new_headers

    def open_url(self, url, headers = None):
        headers = self._get_headers(headers)
        # Wrap up the try/except block so we don't have to do this for each function
        try:
            response = urlopen(Request(url, headers=headers), context=self.ssl_context)
        except Exception as e:
            # No fixing this - bail
            return None
        return response
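
    # Note: open_url deliberately returns None on any failure instead of
    # raising, so every caller below checks for None before proceeding.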

    def get_size(self, *args, **kwargs):
        return get_size(*args,**kwargs)

    def get_string(self, url, progress = True, headers = None, expand_gzip = True):
        response = self.get_bytes(url,progress,headers,expand_gzip)
        if response is None: return None
        return self._decode(response)
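
    # Minimal usage sketch (the URL is a placeholder, not from the source):
    #   dl = Downloader()
    #   page = dl.get_string("https://example.com", progress=False)
    #   if page is None: print("Request failed")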

    def get_bytes(self, url, progress = True, headers = None, expand_gzip = True):
        response = self.open_url(url, headers)
        if response is None: return None
        try: total_size = int(response.headers['Content-Length'])
        except: total_size = -1
        chunk_so_far = b""
        packets = queue = process = None
        if progress:
            # Make sure our vars are initialized
            packets = [] if progress else None
            queue = multiprocessing.Queue()
            # Create the multiprocess and start it
            process = multiprocessing.Process(
                target=_process_hook,
                args=(queue,total_size)
            )
            process.daemon = True
            # Filthy hack for earlier python versions on Windows
            if os.name == "nt" and hasattr(multiprocessing,"forking"):
                self._update_main_name()
            process.start()
        try:
            chunk_size = self.chunk or 1024
            auto_chunk_size = not self.chunk
            while True:
                t = time.perf_counter()
                chunk = response.read(chunk_size)
                chunk_time = time.perf_counter()-t
                if progress:
                    # Add our items to the queue
                    queue.put((time.time(),len(chunk)))
                if not chunk: break
                chunk_so_far += chunk
                if auto_chunk_size:
                    # Adjust our chunk size based on the internet speed at our defined rate
                    chunk_rate = int(len(chunk) / chunk_time * self.chunk_rate)
                    chunk_change_max = round(chunk_size * self.chunk_growth)
                    chunk_rate_clamped = min(max(self.min_chunk, chunk_rate), chunk_change_max)
                    chunk_size = min(chunk_rate_clamped, self.max_chunk)
        finally:
            # Close the response whenever we're done
            response.close()
        if progress:
            # Finalize the queue and wait first, so the progress process
            # always exits - even when we return early after gzip expansion
            queue.put("DONE")
            process.join()
        if expand_gzip and response.headers.get("Content-Encoding","unknown").lower() == "gzip":
            fileobj = BytesIO(chunk_so_far)
            gfile = gzip.GzipFile(fileobj=fileobj)
            return gfile.read()
        return chunk_so_far
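
    # Worked example of the adaptive chunk sizing above (hand-computed;
    # the numbers are illustrative): with the defaults min_chunk=1024,
    # max_chunk=4 MiB, chunk_rate=0.1 and chunk_growth=1.5, a 1024-byte
    # read that took 0.001s implies ~1 MB/s, giving a target of
    # int(1024/0.001*0.1) = 102,400 bytes - but it is clamped first to
    # 1.5x the current chunk (1536), then to max_chunk. The chunk size
    # therefore grows by at most 50% per read, while it can shrink all
    # the way to min_chunk immediately when throughput drops.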

    def stream_to_file(self, url, file_path, progress = True, headers = None, ensure_size_if_present = True, allow_resume = False):
        response = self.open_url(url, headers)
        if response is None: return None
        bytes_so_far = 0
        try: total_size = int(response.headers['Content-Length'])
        except: total_size = -1
        packets = queue = process = None
        mode = "wb"
        if allow_resume and os.path.isfile(file_path) and total_size != -1:
            # File exists, we're resuming and have a target size. Check the
            # local file size.
            current_size = os.stat(file_path).st_size
            if current_size == total_size:
                # File is already complete - return the path
                return file_path
            elif current_size < total_size:
                response.close()
                # File is not complete - seek to our current size
                bytes_so_far = current_size
                mode = "ab" # Append
                # We also need to try creating a new request
                # in order to pass our range header
                new_headers = self._get_headers(headers)
                # Get the start byte, 0-indexed
                byte_string = "bytes={}-".format(current_size)
                new_headers["Range"] = byte_string
                response = self.open_url(url, new_headers)
                if response is None: return None
        if progress:
            # Make sure our vars are initialized
            packets = [] if progress else None
            queue = multiprocessing.Queue()
            # Create the multiprocess and start it
            process = multiprocessing.Process(
                target=_process_hook,
                args=(queue,total_size,bytes_so_far)
            )
            process.daemon = True
            # Filthy hack for earlier python versions on Windows
            if os.name == "nt" and hasattr(multiprocessing,"forking"):
                self._update_main_name()
            process.start()
        with open(file_path,mode) as f:
            chunk_size = self.chunk or 1024
            auto_chunk_size = not self.chunk
            try:
                while True:
                    t = time.perf_counter()
                    chunk = response.read(chunk_size)
                    chunk_time = time.perf_counter()-t
                    bytes_so_far += len(chunk)
                    if progress:
                        # Add our items to the queue
                        queue.put((time.time(),len(chunk)))
                    if not chunk: break
                    f.write(chunk)
                    if auto_chunk_size:
                        # Adjust our chunk size based on the internet speed at our defined rate
                        chunk_rate = int(len(chunk) / chunk_time * self.chunk_rate)
                        chunk_change_max = round(chunk_size * self.chunk_growth)
                        chunk_rate_clamped = min(max(self.min_chunk, chunk_rate), chunk_change_max)
                        chunk_size = min(chunk_rate_clamped, self.max_chunk)
            finally:
                # Close the response whenever we're done
                response.close()
        if progress:
            # Finalize the queue and wait
            queue.put("DONE")
            process.join()
        if ensure_size_if_present and total_size != -1:
            # We're verifying size - make sure we got what we asked for
            if bytes_so_far != total_size:
                return None # We didn't - imply it failed
        return file_path if os.path.exists(file_path) else None
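
# A minimal, hypothetical usage sketch when running this module directly.
# The URL and file name are placeholders, not part of gibMacOS:
if __name__ == "__main__":
    dl = Downloader()
    # Stream to disk with a progress bar, resuming a partial file if present
    result = dl.stream_to_file(
        "https://example.com/some/file.bin",  # placeholder URL
        "file.bin",
        progress=True,
        allow_resume=True
    )
    print("Saved to: {}".format(result) if result else "Download failed")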