import boto3
from pathlib import Path
import mimetypes
import hashlib
import os
import json
from multiprocessing.pool import ThreadPool
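
# Deploy the generated keras.io site to S3: hash each local file, upload only
# files whose contents changed since the last deploy, publish redirect stubs,
# and prune pages from the bucket that no longer exist locally.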
AKEY = os.environ["AWS_S3_ACCESS_KEY"]
SKEY = os.environ["AWS_S3_SECRET_KEY"]
BUCKET = "keras.io"
USE_THREADING = True
HASH_CACHE = "contents_hashes.json"
s3 = boto3.client("s3", aws_access_key_id=AKEY, aws_secret_access_key=SKEY)
def hash_file(fpath):
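    """Return the first 8 hex digits of the file's SHA-256, read in 128 KB chunks."""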
h = hashlib.sha256()
b = bytearray(128 * 1024)
mv = memoryview(b)
with open(fpath, "rb", buffering=0) as f:
while n := f.readinto(mv):
h.update(mv[:n])
return h.hexdigest()[:8]
def upload_file(bucket, fpath, key_name, redirect=None):
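    """Upload a single file to S3 as a publicly readable object,
    optionally with a website redirect location."""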
print(f"...Upload to {bucket}:{key_name}")
    # mimetypes may return None for unknown extensions; fall back to a generic
    # binary type so boto3 does not reject a ContentType of None.
    mime = mimetypes.guess_type(fpath)[0] or "application/octet-stream"
    extra_args = {"ContentType": mime, "ACL": "public-read"}
if redirect:
extra_args["WebsiteRedirectLocation"] = redirect
    # Pass the accumulated extra_args so WebsiteRedirectLocation is applied when set.
    s3.upload_file(fpath, bucket, key_name, ExtraArgs=extra_args)
def load_hash_cache():
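    """Fetch the content-hash manifest from the bucket; empty dict if unavailable."""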
try:
s3.download_file(BUCKET, HASH_CACHE, HASH_CACHE)
    except Exception:
        # Missing cache (e.g. first deploy) or failed download: re-upload everything.
        print(f"[ERROR] Could not download hash cache {HASH_CACHE}")
        return {}
    with open(HASH_CACHE) as f:
        return json.load(f)
def save_hash_cache(hash_cache):
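    """Write the content-hash manifest locally and push it back to the bucket."""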
    with open(HASH_CACHE, "w") as f:
        json.dump(hash_cache, f)
upload_file(BUCKET, HASH_CACHE, HASH_CACHE)
def wrapped_upload_file(args):
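    """Unpack a (bucket, fpath, key_name) tuple for use with ThreadPool.map."""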
bucket, fpath, key_name = args
upload_file(bucket, fpath, key_name)
def cleanup(site_directory, redirect_directory):
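    """Delete .html objects that no longer exist in the site or redirects trees."""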
paginator = s3.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=BUCKET)
for page in page_iterator:
        # A page for an empty bucket has no "Contents" key.
        for obj in page.get("Contents", []):
key = obj["Key"]
if key.endswith(".html"):
site_fpath = os.path.join(site_directory, key)
redirect_fpath = os.path.join(redirect_directory, key)
if not os.path.exists(site_fpath) and not os.path.exists(
redirect_fpath
):
print(f"[DELETE] {key}")
s3.delete_object(Bucket=BUCKET, Key=key)
def upload_dir(directory, include_img=True, hash_cache=None):
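    """Upload all non-hidden files under `directory` to the bucket.

    When a hash cache is given, skip files whose content hash is unchanged
    since the last deploy and return the updated cache.
    """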
print(f"Uploading files from '{directory}'...")
all_targets = []
for dp, _, fn in os.walk(directory):
if fn:
for f in fn:
fpath = os.path.join(dp, f)
if f.startswith("."):
continue
if not include_img and "/img/" in fpath:
continue
key_name = fpath[len(directory) :]
key_name = key_name.removeprefix("/")
print(f"...{key_name}")
all_targets.append((BUCKET, fpath, key_name))
if hash_cache is not None:
filtered_targets = []
new_hash_cache = {}
for bucket, fpath, key_name in all_targets:
new_hash = hash_file(fpath)
old_hash = hash_cache.get(key_name)
if new_hash != old_hash:
filtered_targets.append((bucket, fpath, key_name))
new_hash_cache[key_name] = new_hash
all_targets = filtered_targets
    if USE_THREADING:
        # Uploads are I/O-bound, so a small thread pool gives a good speedup.
        with ThreadPool(processes=8) as pool:
            pool.map(wrapped_upload_file, all_targets)
else:
for args in all_targets:
wrapped_upload_file(args)
if hash_cache is not None:
return new_hash_cache
def upload_redirects(directory):
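    """Upload each redirect stub under `directory` with its target URL set
    as the S3 website redirect location."""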
print("Uploading redirects...")
for dp, _, fn in os.walk(directory):
if fn:
for f in fn:
fpath = os.path.join(dp, f)
                if f != "index.html":
                    continue
                # Each redirect page is a meta-refresh stub containing a tag
                # like content="0; URL='https://...'"; slice out the target
                # between the quotes ("URL='" is 5 characters long).
                with open(fpath) as page:
                    content = page.read()
                url = content[content.find("URL=") + 5 :]
                url = url[: url.find("'")]
                key_name = fpath[len(directory) :]
                key_name = key_name.removeprefix("/")
                print(f"...Redirect {key_name} -> {url}")
upload_file(BUCKET, fpath, key_name, redirect=url)
if __name__ == "__main__":
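    # Full deploy: load the previous hash manifest, upload changed site files,
    # refresh redirects, prune stale pages, then persist the new manifest.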
root = Path(__file__).parent.parent.resolve()
hash_cache = load_hash_cache()
site_directory = os.path.join(root, "site")
redirect_directory = os.path.join(root, "redirects")
hash_cache = upload_dir(site_directory, hash_cache=hash_cache)
upload_redirects(redirect_directory)
cleanup(site_directory, redirect_directory)
save_hash_cache(hash_cache)