Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/artstation.py
8924 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018-2026 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://www.artstation.com/"""
10
11
from .common import Extractor, Message
12
from .. import text, util, exception
13
import itertools
14
15
16
class ArtstationExtractor(Extractor):
    """Base class for artstation extractors"""
    category = "artstation"
    filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}"
    directory_fmt = ("{category}", "{userinfo[username]}")
    archive_fmt = "{asset[id]}"
    browser = "firefox"
    tls12 = False
    root = "https://www.artstation.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first non-empty value of pattern groups 1 and 2; subclasses whose
        # patterns capture something else there store their own attributes
        self.user = match[1] or match[2]

    def _init(self):
        """Set the 'Cache-Control' header and read the extractor options"""
        self.session.headers["Cache-Control"] = "max-age=0"
        self.mviews = self.config("mviews", True)
        self.videos = self.config("videos", True)
        self.external = self.config("external", False)
        self.previews = self.config("previews", False)
        self.max_posts = self.config("max-posts")

    def items(self):
        """Yield Directory and Url messages for the assets of all projects

        Every asset produces one Directory message, followed by a Url
        message for its embedded player (if one could be extracted) and/or
        a Url message for its image.
        """
        data = self.metadata()
        projects = self.projects()

        if self.max_posts:
            projects = itertools.islice(projects, self.max_posts)
        for project in projects:
            for num, asset in enumerate(
                    self.get_project_assets(project["hash_id"]), 1):
                asset.update(data)
                adict = asset["asset"]
                asset["num"] = num
                yield Message.Directory, "", asset

                if adict["has_embedded_player"]:
                    if url := self._extract_embed(asset):
                        text.nameext_from_url(url, asset)
                        yield Message.Url, url, asset
                    # only continue with the image of an embed asset
                    # when the 'previews' option is enabled
                    if not self.previews:
                        continue

                if adict["has_image"]:
                    url = adict["image_url"]
                    text.nameext_from_url(url, asset)

                    url = self._no_cache(url)
                    if "/images/images/" in url:
                        lhs, _, rhs = url.partition("/large/")
                        if rhs:
                            # try the '8k' variant first and fall back
                            # to the smaller ones from _image_fallback()
                            url = f"{lhs}/8k/{rhs}"
                            asset["_fallback"] = self._image_fallback(lhs, rhs)

                    yield Message.Url, url, asset

    def _extract_embed(self, asset):
        """Return the download URL for the embedded player of 'asset'

        Returns None (or an empty string) when the matching option
        ('mviews', 'videos', 'external') is disabled or when no URL
        could be extracted.
        """
        adict = asset["asset"]
        player = adict["player_embedded"]
        url = (text.extr(player, 'src="', '"') or
               text.extr(player, "src='", "'"))

        if url.startswith(self.root):
            # embed or video clip hosted on artstation
            kind = text.extr(adict.get("image_url", ""), "/assets/", "/")
            if kind == "marmosets":
                if not self.mviews:
                    return
                page = self.request(url).text
                return text.extr(page, "marmoset.embed(", '",').strip("\"' ")

            elif kind:
                if not self.videos:
                    return
                page = self.request(url).text
                return text.extract(
                    page, ' src="', '"', page.find('id="video"')+1)[0]

        if url:
            # external URL
            if not self.external:
                return
            asset["extension"] = "mp4"
            return "ytdl:" + url

        self.log.debug(player)
        self.log.warning("Failed to extract embedded player URL (%s)",
                         adict.get("id"))

    def _image_fallback(self, lhs, rhs):
        """Yield alternative image URLs from largest to smallest variant"""
        yield f"{lhs}/4k/{rhs}"
        yield f"{lhs}/large/{rhs}"
        yield f"{lhs}/medium/{rhs}"
        yield f"{lhs}/small/{rhs}"

    def metadata(self):
        """Return general metadata"""
        return {"userinfo": self.get_user_info(self.user)}

    def projects(self):
        """Return an iterable containing all relevant project IDs"""

    def get_project_assets(self, project_id):
        """Return all assets associated with 'project_id'

        Generator yielding one dict of project data per asset, with the
        asset itself stored under the "asset" key. An HTTP error is
        logged as a warning and results in no yielded items.
        """
        url = f"{self.root}/projects/{project_id}.json"

        try:
            data = self.request_json(url)
        except exception.HttpError as exc:
            self.log.warning(exc)
            return

        data["title"] = text.unescape(data["title"])
        data["description"] = text.unescape(text.remove_html(
            data["description"]))
        data["date"] = self.parse_datetime_iso(data["created_at"])

        assets = data["assets"]
        del data["assets"]

        data["count"] = len(assets)
        if len(assets) == 1:
            data["asset"] = assets[0]
            yield data
        else:
            # each yielded dict needs its own "asset" entry
            for asset in assets:
                data["asset"] = asset
                yield data.copy()

    def get_user_info(self, username):
        """Return metadata for a specific user"""
        url = f"{self.root}/users/{username.lower()}/quick.json"
        response = self.request(url, notfound="user")
        return response.json()

    def _pagination(self, url, params=None, json=None):
        """Yield all entries of a paginated JSON endpoint

        A truthy 'json' is sent as body of a POST request together with a
        public CSRF token; otherwise 'params' are sent as query parameters.
        The "page" entry is managed on a private copy, so the dict passed
        in by the caller is never modified.
        """
        headers = {
            "Accept" : "application/json, text/plain, */*",
            "Origin" : self.root,
        }

        if json:
            params = dict(json)
            headers["PUBLIC-CSRF-TOKEN"] = self._init_csrf_token()
            kwargs = {"method": "POST", "headers": headers, "json": params}
        else:
            params = dict(params) if params else {}
            kwargs = {"params": params, "headers": headers}

        total = 0
        params["page"] = 1

        while True:
            data = self.request_json(url, **kwargs)
            entries = data["data"]
            yield from entries

            total += len(entries)
            # an empty page can never advance 'total'; stop there instead
            # of requesting ever higher page numbers without end
            if not entries or total >= data["total_count"]:
                return

            params["page"] += 1

    def _init_csrf_token(self):
        """Request and return a new public CSRF token"""
        url = self.root + "/api/v2/csrf_protection/token.json"
        headers = {
            "Accept" : "*/*",
            "Origin" : self.root,
        }
        return self.request_json(
            url, method="POST", headers=headers, json={})["public_csrf_token"]

    def _no_cache(self, url):
        """Cause a cache miss to prevent Cloudflare 'optimizations'

        Cloudflare's 'Polish' optimization strips image metadata and may even
        recompress an image as lossy JPEG. This can be prevented by causing
        a cache miss when requesting an image by adding a random dummy query
        parameter.

        Ref:
        https://github.com/r888888888/danbooru/issues/3528
        https://danbooru.donmai.us/forum_topics/14952
        """
        sep = "&" if "?" in url else "?"
        token = util.generate_token(8)
        return url + sep + token[:4] + "=" + token[4:]
203
204
205
class ArtstationUserExtractor(ArtstationExtractor):
    """Extractor for all projects of an artstation user"""
    subcategory = "user"
    # the user name is captured either from the URL path (group 1)
    # or from a non-'www' subdomain (group 2)
    pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)(?:/albums/all)?"
               r"|((?!www)[\w-]+)\.artstation\.com(?:/projects)?)/?$")
    example = "https://www.artstation.com/USER"

    def projects(self):
        """Return the paginated project listing for album ID "all" """
        return self._pagination(
            f"{self.root}/users/{self.user}/projects.json",
            {"album_id": "all"},
        )
217
218
219
class ArtstationAlbumExtractor(ArtstationExtractor):
    """Extractor for all projects in an artstation album"""
    subcategory = "album"
    directory_fmt = ("{category}", "{userinfo[username]}", "Albums",
                     "{album[id]} - {album[title]}")
    archive_fmt = "a_{album[id]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)"
               r"|((?!www)[\w-]+)\.artstation\.com)/albums/(\d+)")
    example = "https://www.artstation.com/USER/albums/12345"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        # group 3 holds the numeric album ID
        self.album_id = text.parse_int(match[3])

    def metadata(self):
        """Return user info plus the entry of the requested album

        Raises exception.NotFoundError when the user's album list
        has no entry with a matching ID.
        """
        userinfo = self.get_user_info(self.user)

        wanted = self.album_id
        album = next(
            (entry for entry in userinfo["albums_with_community_projects"]
             if entry["id"] == wanted),
            None)
        if album is None:
            raise exception.NotFoundError("album")

        return {"userinfo": userinfo, "album": album}

    def projects(self):
        """Return the paginated project listing of this album"""
        return self._pagination(
            f"{self.root}/users/{self.user}/projects.json",
            {"album_id": self.album_id},
        )
253
254
255
class ArtstationLikesExtractor(ArtstationExtractor):
    """Extractor for liked projects of an artstation user"""
    subcategory = "likes"
    directory_fmt = ("{category}", "{userinfo[username]}", "Likes")
    archive_fmt = "f_{userinfo[id]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/likes")
    example = "https://www.artstation.com/USER/likes"

    def projects(self):
        """Return the paginated listing from the user's 'likes.json'"""
        likes_url = f"{self.root}/users/{self.user}/likes.json"
        return self._pagination(likes_url)
267
268
269
class ArtstationCollectionExtractor(ArtstationExtractor):
    """Extractor for an artstation collection"""
    subcategory = "collection"
    directory_fmt = ("{category}", "{user}",
                     "{collection[id]} {collection[name]}")
    archive_fmt = "c_{collection[id]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/collections/(\d+)")
    example = "https://www.artstation.com/USER/collections/12345"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        # group 2 holds the collection ID (kept as string)
        self.collection_id = match[2]

    def metadata(self):
        """Return the collection's JSON data together with the user name"""
        collection = self.request_json(
            f"{self.root}/collections/{self.collection_id}.json",
            params={"username": self.user},
            notfound=True,
        )
        return {"collection": collection, "user": self.user}

    def projects(self):
        """Return the paginated project listing of this collection"""
        return self._pagination(
            f"{self.root}/collections/{self.collection_id}/projects.json",
            {"collection_id": self.collection_id},
        )
294
295
296
class ArtstationCollectionsExtractor(ArtstationExtractor):
    """Extractor for an artstation user's collections"""
    subcategory = "collections"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/collections/?$")
    example = "https://www.artstation.com/USER/collections"

    def items(self):
        """Yield one Queue message per collection of the user"""
        listing = self.request_json(
            self.root + "/collections.json",
            params={"username": self.user},
            notfound=True,
        )

        for collection in listing:
            target = f"{self.root}/{self.user}/collections/{collection['id']}"
            # hand each collection URL to the single-collection extractor
            collection["_extractor"] = ArtstationCollectionExtractor
            yield Message.Queue, target, collection
312
313
314
class ArtstationChallengeExtractor(ArtstationExtractor):
    """Extractor for submissions of artstation challenges"""
    subcategory = "challenge"
    filename_fmt = "{submission_id}_{asset_id}_{filename}.{extension}"
    directory_fmt = ("{category}", "Challenges",
                     "{challenge[id]} - {challenge[title]}")
    archive_fmt = "c_{challenge[id]}_{asset_id}"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/c(?:hallenges|ontests)/[^/?#]+/c(?:ategori|halleng)es/(\d+)"
               r"/?(?:\?sorting=([a-z]+))?")
    example = "https://www.artstation.com/challenges/NAME/categories/12345"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.challenge_id = match[1]
        # the 'sorting' query parameter is optional
        self.sorting = match[2] or "popular"

    def items(self):
        """Yield a Directory message and a Url message per linked file

        Walks all submissions of the challenge, then all updates of each
        submission, and emits every unique 'href' target found in an
        update's body.
        """
        api = self.root + "/api/v2/competition/"

        challenge = self.request_json(
            f"{api}challenges/{self.challenge_id}.json")
        yield Message.Directory, "", {"challenge": challenge}

        submissions = self._pagination(api + "submissions.json", {
            "page"        : 1,
            "per_page"    : 50,
            "challenge_id": self.challenge_id,
            "sort_by"     : self.sorting,
        })

        for submission in submissions:
            updates = self._pagination(
                f"{api}submissions/{submission['id']}"
                f"/submission_updates.json",
                params={"page": 1, "per_page": 50})

            for update in updates:
                update["challenge"] = challenge
                links = text.extract_iter(update["body"], ' href="', '"')
                for link in util.unique_sequence(links):
                    update["asset_id"] = self._id_from_url(link)
                    text.nameext_from_url(link, update)
                    yield Message.Url, self._no_cache(link), update

    def _id_from_url(self, url):
        """Get an image's submission ID from its URL"""
        # the ID is the concatenation of the '/'-separated parts 7 to 9
        segments = url.split("/")[7:10]
        return text.parse_int("".join(segments))
362
363
364
class ArtstationSearchExtractor(ArtstationExtractor):
    """Extractor for artstation search results"""
    subcategory = "search"
    directory_fmt = ("{category}", "Searches", "{search[query]}")
    archive_fmt = "s_{search[query]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
               r"/search/?\?([^#]+)")
    example = "https://www.artstation.com/search?query=QUERY"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        params = text.parse_query(match[1])
        self.params = params

        # the search term may be given as 'query' or as 'q'
        term = params.get("query") or params.get("q", "")
        self.query = text.unquote(term)
        self.sorting = params.get("sort_by", "relevance").lower()
        self.tags = params.get("tags", "").split(",")

    def metadata(self):
        """Return the search term, sorting and tags as 'search' metadata"""
        search = {
            "query"  : self.query,
            "sorting": self.sorting,
            "tags"   : self.tags,
        }
        return {"search": search}

    def projects(self):
        """Return the paginated results of a POST search request"""
        # every '*_ids' parameter and 'tags' becomes an "include" filter
        # with its comma-separated values as list
        filters = [
            {
                "field" : key,
                "method": "include",
                "value" : value.split(","),
            }
            for key, value in self.params.items()
            if key.endswith("_ids") or key == "tags"
        ]

        pro_first = "1" if self.config("pro-first", True) else "0"
        payload = {
            "query"            : self.query,
            "page"             : None,
            "per_page"         : 50,
            "sorting"          : self.sorting,
            "pro_first"        : pro_first,
            "filters"          : filters,
            "additional_fields": (),
        }
        return self._pagination(
            self.root + "/api/v2/search/projects.json", json=payload)
409
410
411
class ArtstationArtworkExtractor(ArtstationExtractor):
    """Extractor for projects on artstation's artwork page"""
    subcategory = "artwork"
    directory_fmt = ("{category}", "Artworks", "{artwork[sorting]!c}")
    archive_fmt = "A_{asset[id]}"
    pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
               r"/artwork/?\?([^#]+)")
    example = "https://www.artstation.com/artwork?sorting=SORT"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        # parsed query string of the artwork page URL
        self.query = text.parse_query(match[1])

    def metadata(self):
        """Return the parsed URL query as 'artwork' metadata"""
        return {"artwork": self.query}

    def projects(self):
        """Return the paginated '/projects.json' listing for the query"""
        # pass a copy so that 'self.query' itself is left as parsed
        query_params = self.query.copy()
        return self._pagination(self.root + "/projects.json", query_params)
430
431
432
class ArtstationImageExtractor(ArtstationExtractor):
    """Extractor for images from a single artstation project"""
    subcategory = "image"
    pattern = (r"(?:https?://)?(?:"
               r"(?:[\w-]+\.)?artstation\.com/(?:artwork|projects|search)"
               r"|artstn\.co/p)/(\w+)")
    example = "https://www.artstation.com/artwork/abcde"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.project_id = match[1]
        # filled in by metadata()
        self.assets = None

    def metadata(self):
        """Load the project's assets and return its owner's metadata"""
        assets = list(ArtstationExtractor.get_project_assets(
            self, self.project_id))
        self.assets = assets

        # the user name comes from the first asset; without any
        # assets it is set to an empty string
        if assets:
            self.user = assets[0]["user"]["username"]
        else:
            self.user = ""
        return ArtstationExtractor.metadata(self)

    def projects(self):
        """Return a one-element tuple describing this project"""
        return ({"hash_id": self.project_id},)

    def get_project_assets(self, project_id):
        """Return the assets already loaded by metadata()"""
        return self.assets
459
460
461
class ArtstationFollowingExtractor(ArtstationExtractor):
    """Extractor for a user's followed users"""
    subcategory = "following"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/following")
    example = "https://www.artstation.com/USER/following"

    def items(self):
        """Yield one Queue message per user listed in 'following.json'"""
        listing = self._pagination(
            f"{self.root}/users/{self.user}/following.json")

        for followed in listing:
            profile_url = f"{self.root}/{followed['username']}"
            # hand each profile URL to the user extractor
            followed["_extractor"] = ArtstationUserExtractor
            yield Message.Queue, profile_url, followed
474
475