CoCalc -- downloader.py

GitHub Repository: corpnewt/gibMacOS
Path: blob/master/Scripts/downloader.py
¹⁷⁵ views
1
import sys, os, time, ssl, gzip, multiprocessing
2
from io import BytesIO
3
# Python-aware urllib stuff
4
try:
5
    from urllib.request import urlopen, Request
6
    import queue as q
7
except ImportError:
8
    # Import urllib2 to catch errors
9
    import urllib2
10
    from urllib2 import urlopen, Request
11
    import Queue as q
12

13
# Assumed console width in columns: 120 on Windows (os.name == "nt"), 80
# everywhere else.  _process_hook sizes its progress bar from a third of this.
TERMINAL_WIDTH = 120 if os.name=="nt" else 80
14

15
def get_size(size, suffix=None, use_1024=False, round_to=2, strip_zeroes=False):
16
    # size is the number of bytes
17
    # suffix is the target suffix to locate (B, KB, MB, etc) - if found
18
    # use_2014 denotes whether or not we display in MiB vs MB
19
    # round_to is the number of dedimal points to round our result to (0-15)
20
    # strip_zeroes denotes whether we strip out zeroes 
21

22
    # Failsafe in case our size is unknown
23
    if size == -1:
24
        return "Unknown"
25
    # Get our suffixes based on use_1024
26
    ext = ["B","KiB","MiB","GiB","TiB","PiB"] if use_1024 else ["B","KB","MB","GB","TB","PB"]
27
    div = 1024 if use_1024 else 1000
28
    s = float(size)
29
    s_dict = {} # Initialize our dict
30
    # Iterate the ext list, and divide by 1000 or 1024 each time to setup the dict {ext:val}
31
    for e in ext:
32
        s_dict[e] = s
33
        s /= div
34
    # Get our suffix if provided - will be set to None if not found, or if started as None
35
    suffix = next((x for x in ext if x.lower() == suffix.lower()),None) if suffix else suffix
36
    # Get the largest value that's still over 1
37
    biggest = suffix if suffix else next((x for x in ext[::-1] if s_dict[x] >= 1), "B")
38
    # Determine our rounding approach - first make sure it's an int; default to 2 on error
39
    try:round_to=int(round_to)
40
    except:round_to=2
41
    round_to = 0 if round_to < 0 else 15 if round_to > 15 else round_to # Ensure it's between 0 and 15
42
    bval = round(s_dict[biggest], round_to)
43
    # Split our number based on decimal points
44
    a,b = str(bval).split(".")
45
    # Check if we need to strip or pad zeroes
46
    b = b.rstrip("0") if strip_zeroes else b.ljust(round_to,"0") if round_to > 0 else ""
47
    return "{:,}{} {}".format(int(a),"" if not b else "."+b,biggest)
48

49
def _format_remaining(seconds_left):
    # Helper for _process_hook: render an ETA given in seconds as the
    # " | [D:]HH:MM:SS left" status suffix, or " | ?? left" when the
    # estimate is longer than 99 days.
    #
    # Round once up front so the seconds field can never show as "60",
    # and clamp at zero so an overshoot never renders negative fields.
    total = max(0, int(round(seconds_left)))
    days, rem  = divmod(total, 86400)
    hours, rem = divmod(rem, 3600)
    mins, secs = divmod(rem, 60)
    if days > 99:
        return " | ?? left"
    return " | {}{:02d}:{:02d}:{:02d} left".format(
        "{}:".format(days) if days else "",
        hours,
        mins,
        secs
    )

def _process_hook(queue, total_size, bytes_so_far=0, update_interval=1.0, max_packets=0):
    # Progress reporter loop - used as the target of a separate process.
    #
    # queue is read for (timestamp, byte_count) tuples; the string "DONE"
    #   ends the loop
    # total_size is the expected byte total - values <= 0 mean unknown, in
    #   which case only the running byte count and speed are shown
    # bytes_so_far is the starting byte count (non-zero when resuming)
    # update_interval is both the queue wait timeout and the minimum number
    #   of seconds between speed/ETA recalculations
    # max_packets caps how many packets are kept for the speed estimate;
    #   0 keeps them all
    #
    # Writes a single, self-overwriting status line to stdout and returns
    # None when "DONE" arrives or on KeyboardInterrupt.
    packets = []
    speed = remaining = ""
    last_update = time.time()
    try:
        while True:
            # Write our info first so we have *some* status while
            # waiting for packets
            if total_size > 0:
                percent = round(float(bytes_so_far) / total_size * 100, 2)
                t_s = get_size(total_size)
                try:
                    # Show the progress in the same unit as the total
                    b_s = get_size(bytes_so_far, t_s.split(" ")[1])
                except (IndexError, ValueError):
                    b_s = get_size(bytes_so_far)
                perc_str = " {:.2f}%".format(percent)
                bar_width = (TERMINAL_WIDTH // 3)-len(perc_str)
                progress = "=" * int(bar_width * (percent/100))
                # \r\033[K returns to column 0 and clears the line
                sys.stdout.write("\r\033[K{}/{} | {}{}{}{}{}".format(
                    b_s,
                    t_s,
                    progress,
                    " " * (bar_width-len(progress)),
                    perc_str,
                    speed,
                    remaining
                ))
            else:
                b_s = get_size(bytes_so_far)
                sys.stdout.write("\r\033[K{}{}".format(b_s, speed))
            sys.stdout.flush()
            # Now we gather the next packet
            try:
                packet = queue.get(timeout=update_interval)
            except q.Empty:
                # Didn't get anything - reset the speed
                # and packets
                packets = []
                speed = " | 0 B/s"
                remaining = " | ?? left" if total_size > 0 else ""
            else:
                # Packets should be formatted as a tuple of
                # (timestamp, len(bytes_downloaded))
                # If "DONE" is passed, we assume the download
                # finished - and bail
                if packet == "DONE":
                    print("") # Jump to the next line
                    return
                # Append our packet to the list and ensure we're not
                # beyond our max.
                # Only check max if it's > 0
                packets.append(packet)
                if max_packets > 0:
                    packets = packets[-max_packets:]
                # Increment our bytes so far as well
                bytes_so_far += packet[1]
            # If we have packets and it's time for an update, process
            # the info.
            update_check = time.time()
            if packets and update_check - last_update >= update_interval:
                last_update = update_check # Refresh our update timestamp
                speed = " | ?? B/s"
                if len(packets) > 1:
                    # Let's calculate the amount downloaded over how long
                    elapsed = packets[-1][0] - packets[0][0]
                    # A zero or negative span (identical timestamps, clock
                    # adjustments) gives no usable rate - keep "?? B/s"
                    if elapsed > 0:
                        bytes_speed = sum(float(x[1]) for x in packets) / elapsed
                        speed = " | {}/s".format(get_size(bytes_speed,round_to=1))
                        # Get our remaining time
                        if total_size > 0:
                            if bytes_speed > 0:
                                remaining = _format_remaining(
                                    (total_size-bytes_so_far) / bytes_speed
                                )
                            else:
                                # Nothing moved, so there is no estimate
                                remaining = " | ?? left"
                    # Clear the packets so we don't reuse the same ones
                    packets = []
    except KeyboardInterrupt:
        # Ctrl-C anywhere in the loop: leave the status line and bail
        print("") # Jump to the next line
        return
142

143
class Downloader:
    # Small urllib-based download helper that can fetch a URL into memory
    # or stream it to a file, optionally showing a progress line rendered
    # by _process_hook in a separate process.

    def __init__(self,**kwargs):
        # useragent (optional kwarg) is the dict of default request headers
        self.ua = kwargs.get("useragent",{"User-Agent":"Mozilla"})
        self.chunk = 1048576 # 1024 x 1024 i.e. 1MiB
        if os.name=="nt": os.system("color") # Initialize cmd for ANSI escapes
        # Provide reasonable default logic to workaround macOS CA file handling
        cafile = ssl.get_default_verify_paths().openssl_cafile
        try:
            # If default OpenSSL CA file does not exist, use that from certifi
            if not os.path.exists(cafile):
                import certifi
                cafile = certifi.where()
            self.ssl_context = ssl.create_default_context(cafile=cafile)
        except Exception:
            # None of the above worked, disable certificate verification for now
            # NOTE(security): this silently turns off TLS certificate checks,
            # so downloads are then open to man-in-the-middle tampering.
            self.ssl_context = ssl._create_unverified_context()

    def _decode(self, value, encoding="utf-8", errors="ignore"):
        # Helper method to only decode if bytes type (python 3 only);
        # anything else is returned untouched
        if sys.version_info >= (3,0) and isinstance(value, bytes):
            return value.decode(encoding,errors)
        return value

    def _update_main_name(self):
        # Windows running python 2 seems to have issues with multiprocessing
        # if the case of the main script's name is incorrect:
        # e.g. Downloader.py vs downloader.py
        #
        # To work around this, we try to scrape for the correct case if
        # possible.  Returns the corrected path, or None if it could not be
        # determined.
        try:
            path = os.path.abspath(sys.modules["__main__"].__file__)
        except AttributeError:
            # This likely means we're running from the interpreter
            # directly
            return None
        if not os.path.isfile(path):
            return None
        # Get the file name and folder path
        name = os.path.basename(path).lower()
        fldr = os.path.dirname(path)
        # Walk the files in the folder until we find our
        # name - then steal its case and update that path
        for f in os.listdir(fldr):
            if f.lower() == name:
                # Got it
                new_path = os.path.join(fldr,f)
                sys.modules["__main__"].__file__ = new_path
                return new_path
        # If we got here, it wasn't found
        return None

    def _get_headers(self, headers = None):
        # Fall back on the default ua if no dict was provided.  A shallow
        # copy is returned so changes to it never touch the original.
        return dict(headers if isinstance(headers,dict) else self.ua)

    def _content_length(self, response):
        # Returns the response's Content-Length as an int, or -1 when the
        # header is missing or not a number
        try:
            return int(response.headers["Content-Length"])
        except (AttributeError, KeyError, TypeError, ValueError):
            return -1

    def _status_code(self, response):
        # Returns the response status code, or None when there is none
        # (e.g. non-HTTP responses)
        try:
            return response.getcode()
        except AttributeError:
            return getattr(response, "status", None)

    def _start_progress(self, total_size, bytes_so_far = 0):
        # Starts _process_hook in a daemon process and returns the
        # (queue, process) pair used to feed and finalize it
        queue = multiprocessing.Queue()
        process = multiprocessing.Process(
            target=_process_hook,
            args=(queue,total_size,bytes_so_far)
        )
        process.daemon = True
        # Filthy hack for earlier python versions on Windows
        if os.name == "nt" and hasattr(multiprocessing,"forking"):
            self._update_main_name()
        process.start()
        return queue, process

    def _finish_progress(self, queue, process):
        # Tells the progress process we're done and waits for it to exit
        queue.put("DONE")
        process.join()

    def open_url(self, url, headers = None):
        # Opens url with the passed headers (or the default ua) and returns
        # the response object, or None if the request failed for any reason
        headers = self._get_headers(headers)
        # Wrap up the try/except block so we don't have to do this for each function
        try:
            return urlopen(Request(url, headers=headers), context=self.ssl_context)
        except Exception:
            # No fixing this - bail
            return None

    def get_size(self, *args, **kwargs):
        # Convenience passthrough to the module-level get_size()
        return get_size(*args,**kwargs)

    def get_string(self, url, progress = True, headers = None, expand_gzip = True):
        # Same as get_bytes(), but decodes the result as utf-8 on python 3.
        # Returns None on failure.
        response = self.get_bytes(url,progress,headers,expand_gzip)
        if response is None: return None
        return self._decode(response)

    def get_bytes(self, url, progress = True, headers = None, expand_gzip = True):
        # Downloads url into memory and returns the bytes, or None if the
        # url could not be opened.
        #
        # progress denotes whether we show the progress line
        # headers optionally replaces the default request headers
        # expand_gzip denotes whether a "Content-Encoding: gzip" body is
        #   decompressed before being returned
        response = self.open_url(url, headers)
        if response is None: return None
        total_size = self._content_length(response)
        chunks = []
        queue = process = None
        if progress:
            queue, process = self._start_progress(total_size)
        try:
            while True:
                chunk = response.read(self.chunk)
                if progress:
                    # Add our items to the queue
                    queue.put((time.time(),len(chunk)))
                if not chunk: break
                chunks.append(chunk)
        finally:
            # Close the response whenever we're done
            response.close()
            if progress:
                # Always finalize the progress process - including on
                # errors and before any gzip expansion below
                self._finish_progress(queue, process)
        # Join once at the end instead of growing a bytes object per chunk
        data = b"".join(chunks)
        if expand_gzip and response.headers.get("Content-Encoding","unknown").lower() == "gzip":
            return gzip.GzipFile(fileobj=BytesIO(data)).read()
        return data

    def stream_to_file(self, url, file_path, progress = True, headers = None, ensure_size_if_present = True, allow_resume = False):
        # Downloads url to file_path and returns file_path on success, or
        # None on failure.
        #
        # progress denotes whether we show the progress line
        # headers optionally replaces the default request headers
        # ensure_size_if_present denotes whether a known Content-Length must
        #   match the final byte count for the download to count as a success
        # allow_resume denotes whether an existing, smaller file_path is
        #   continued with a Range request instead of being overwritten
        response = self.open_url(url, headers)
        if response is None: return None
        bytes_so_far = 0
        total_size = self._content_length(response)
        queue = process = None
        mode = "wb"
        if allow_resume and os.path.isfile(file_path) and total_size != -1:
            # File exists, we're resuming and have a target size.  Check the
            # local file size.
            current_size = os.stat(file_path).st_size
            if current_size == total_size:
                # File is already complete - return the path
                response.close()
                return file_path
            elif 0 < current_size < total_size:
                response.close()
                # File is not complete - we need to try creating a new
                # request in order to pass our range header
                new_headers = self._get_headers(headers)
                # Get the start byte, 0-indexed
                new_headers["Range"] = "bytes={}-".format(current_size)
                response = self.open_url(url, new_headers)
                if response is None: return None
                if self._status_code(response) == 206:
                    # Partial content - append from where we left off
                    bytes_so_far = current_size
                    mode = "ab" # Append
                # Anything else means the Range header was ignored and the
                # whole body is coming - keep "wb" and start over so we
                # don't append a full copy to the partial file
        if progress:
            queue, process = self._start_progress(total_size, bytes_so_far)
        try:
            with open(file_path,mode) as f:
                while True:
                    chunk = response.read(self.chunk)
                    bytes_so_far += len(chunk)
                    if progress:
                        # Add our items to the queue
                        queue.put((time.time(),len(chunk)))
                    if not chunk: break
                    f.write(chunk)
        finally:
            # Close the response whenever we're done
            response.close()
            if progress:
                # Finalize the queue and wait - even if something failed
                self._finish_progress(queue, process)
        if ensure_size_if_present and total_size != -1:
            # We're verifying size - make sure we got what we asked for
            if bytes_so_far != total_size:
                return None # We didn't - imply it failed
        return file_path if os.path.exists(file_path) else None
331

332
Product

Resources

Company