Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/extractor/artstation.py
5399 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Extractors for https://www.artstation.com/"""
10
11
from .common import Extractor, Message
12
from .. import text, util, exception
13
import itertools
14
15
16
class ArtstationExtractor(Extractor):
    """Base class for artstation extractors

    Subclasses supply projects() (and optionally metadata());
    items() then fetches the assets of every project and emits
    directory and URL messages for them.
    """
    category = "artstation"
    filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}"
    directory_fmt = ("{category}", "{userinfo[username]}")
    archive_fmt = "{asset[id]}"
    browser = "firefox"
    tls12 = False
    root = "https://www.artstation.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # first non-empty of the two leading pattern groups; for user and
        # album URLs these are the path and subdomain forms of the username
        self.user = match[1] or match[2]

    def _init(self):
        """Set request headers and read config options"""
        self.session.headers["Cache-Control"] = "max-age=0"
        self.mviews = self.config("mviews", True)
        self.videos = self.config("videos", True)
        self.external = self.config("external", False)
        self.previews = self.config("previews", False)
        self.max_posts = self.config("max-posts")

    def items(self):
        """Yield directory and URL messages for all assets of all projects"""
        data = self.metadata()
        projects = self.projects()

        if self.max_posts:
            projects = itertools.islice(projects, self.max_posts)
        for project in projects:
            for num, asset in enumerate(
                    self.get_project_assets(project["hash_id"]), 1):
                asset.update(data)
                adict = asset["asset"]
                asset["num"] = num
                yield Message.Directory, asset

                if adict["has_embedded_player"]:
                    if url := self._extract_embed(asset):
                        text.nameext_from_url(url, asset)
                        yield Message.Url, url, asset
                    # the image of an embed asset is only downloaded
                    # when 'previews' is enabled
                    if not self.previews:
                        continue

                if adict["has_image"]:
                    url = adict["image_url"]
                    text.nameext_from_url(url, asset)

                    url = self._no_cache(url)
                    if "/images/images/" in url:
                        lhs, _, rhs = url.partition("/large/")
                        if rhs:
                            # request the '4k' variant first and keep the
                            # smaller variants as fallback URLs
                            url = f"{lhs}/4k/{rhs}"
                            asset["_fallback"] = self._image_fallback(lhs, rhs)

                    yield Message.Url, url, asset

    def _extract_embed(self, asset):
        """Return the download URL of an asset's embedded player

        Returns None when the matching config option disables this kind
        of embed or when no URL could be found; the latter case is logged.
        """
        adict = asset["asset"]
        player = adict["player_embedded"]
        url = (text.extr(player, 'src="', '"') or
               text.extr(player, "src='", "'"))

        if url.startswith(self.root):
            # embed or video clip hosted on artstation
            asset_type = text.extr(
                adict.get("image_url", ""), "/assets/", "/")
            if asset_type == "marmosets":
                if not self.mviews:
                    return
                page = self.request(url).text
                return text.extr(page, "marmoset.embed(", '",').strip("\"' ")

            elif asset_type:
                if not self.videos:
                    return
                page = self.request(url).text
                return text.extr(page, ' src="', '"')

        if url:
            # external URL
            if not self.external:
                return
            asset["extension"] = "mp4"
            return f"ytdl:{url}"

        self.log.debug(player)
        self.log.warning("Failed to extract embedded player URL (%s)",
                         adict.get("id"))

    def _image_fallback(self, lhs, rhs):
        """Yield the 'large', 'medium', and 'small' variants of an image URL"""
        yield f"{lhs}/large/{rhs}"
        yield f"{lhs}/medium/{rhs}"
        yield f"{lhs}/small/{rhs}"

    def metadata(self):
        """Return general metadata"""
        return {"userinfo": self.get_user_info(self.user)}

    def projects(self):
        """Return an iterable containing all relevant project IDs"""

    def get_project_assets(self, project_id):
        """Return all assets associated with 'project_id'

        Yields one project metadata dict per asset, with the asset itself
        stored under the 'asset' key. On an HTTP error a warning is
        logged and nothing is yielded.
        """
        url = f"{self.root}/projects/{project_id}.json"

        try:
            data = self.request_json(url)
        except exception.HttpError as exc:
            self.log.warning(exc)
            return

        data["title"] = text.unescape(data["title"])
        data["description"] = text.unescape(text.remove_html(
            data["description"]))
        data["date"] = text.parse_datetime(
            data["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")

        assets = data["assets"]
        del data["assets"]

        data["count"] = len(assets)
        if len(assets) == 1:
            # a single asset can use 'data' directly without copying
            data["asset"] = assets[0]
            yield data
        else:
            # each yielded dict needs its own 'asset' entry
            for asset in assets:
                data["asset"] = asset
                yield data.copy()

    def get_user_info(self, username):
        """Return metadata for a specific user"""
        url = f"{self.root}/users/{username.lower()}/quick.json"
        response = self.request(url, notfound="user")
        return response.json()

    def _pagination(self, url, params=None, json=None):
        """Yield all entries of a paginated API endpoint

        When 'json' is given, it is sent as the body of a POST request
        together with a CSRF token header; otherwise 'params' is sent as
        query parameters. In both cases the dict in use receives a
        'page' key that is incremented for every request.
        """
        headers = {
            "Accept" : "application/json, text/plain, */*",
            "Origin" : self.root,
        }

        if json:
            # 'params' aliases the JSON body, so that updating
            # params["page"] below changes the payload being sent
            params = json
            headers["PUBLIC-CSRF-TOKEN"] = self._init_csrf_token()
            kwargs = {"method": "POST", "headers": headers, "json": json}
        else:
            if not params:
                params = {}
            kwargs = {"params": params, "headers": headers}

        total = 0
        params["page"] = 1

        while True:
            data = self.request_json(url, **kwargs)
            results = data["data"]
            yield from results

            total += len(results)
            # stop on an empty page as well: requesting further pages
            # would otherwise never end if 'total_count' is not reached
            if not results or total >= data["total_count"]:
                return

            params["page"] += 1

    def _init_csrf_token(self):
        """Request and return a public CSRF token"""
        url = self.root + "/api/v2/csrf_protection/token.json"
        headers = {
            "Accept" : "*/*",
            "Origin" : self.root,
        }
        return self.request_json(
            url, method="POST", headers=headers, json={})["public_csrf_token"]

    def _no_cache(self, url):
        """Cause a cache miss to prevent Cloudflare 'optimizations'

        Cloudflare's 'Polish' optimization strips image metadata and may even
        recompress an image as lossy JPEG. This can be prevented by causing
        a cache miss when requesting an image by adding a random dummy query
        parameter.

        Ref:
        https://github.com/r888888888/danbooru/issues/3528
        https://danbooru.donmai.us/forum_topics/14952
        """
        sep = "&" if "?" in url else "?"
        token = util.generate_token(8)
        return url + sep + token[:4] + "=" + token[4:]
202
203
204
class ArtstationUserExtractor(ArtstationExtractor):
    """Extractor for all projects of an artstation user"""
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)(?:/albums/all)?"
               r"|((?!www)[\w-]+)\.artstation\.com(?:/projects)?)/?$")
    example = "https://www.artstation.com/USER"

    def projects(self):
        """Return the user's projects from all albums"""
        return self._pagination(
            f"{self.root}/users/{self.user}/projects.json",
            {"album_id": "all"},
        )
216
217
218
class ArtstationAlbumExtractor(ArtstationExtractor):
    """Extractor for all projects in an artstation album"""
    subcategory = "album"
    directory_fmt = ("{category}", "{userinfo[username]}", "Albums",
                     "{album[id]} - {album[title]}")
    archive_fmt = "a_{album[id]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)"
               r"|((?!www)[\w-]+)\.artstation\.com)/albums/(\d+)")
    example = "https://www.artstation.com/USER/albums/12345"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.album_id = text.parse_int(match[3])

    def metadata(self):
        """Return user info together with the entry of the requested album"""
        userinfo = self.get_user_info(self.user)

        # pick the album whose ID matches the one from the URL
        for album in userinfo["albums_with_community_projects"]:
            if album["id"] == self.album_id:
                return {
                    "userinfo": userinfo,
                    "album": album
                }

        raise exception.NotFoundError("album")

    def projects(self):
        """Return the projects belonging to this album"""
        return self._pagination(
            f"{self.root}/users/{self.user}/projects.json",
            {"album_id": self.album_id},
        )
252
253
254
class ArtstationLikesExtractor(ArtstationExtractor):
    """Extractor for liked projects of an artstation user"""
    subcategory = "likes"
    directory_fmt = ("{category}", "{userinfo[username]}", "Likes")
    archive_fmt = "f_{userinfo[id]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/likes")
    example = "https://www.artstation.com/USER/likes"

    def projects(self):
        """Return the projects liked by the user"""
        return self._pagination(
            f"{self.root}/users/{self.user}/likes.json")
266
267
268
class ArtstationCollectionExtractor(ArtstationExtractor):
    """Extractor for an artstation collection"""
    subcategory = "collection"
    directory_fmt = ("{category}", "{user}",
                     "{collection[id]} {collection[name]}")
    archive_fmt = "c_{collection[id]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/collections/(\d+)")
    example = "https://www.artstation.com/USER/collections/12345"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.collection_id = match[2]

    def metadata(self):
        """Return the collection's metadata and the name of its user"""
        collection = self.request_json(
            f"{self.root}/collections/{self.collection_id}.json",
            params={"username": self.user},
            notfound="collection",
        )
        return {"collection": collection, "user": self.user}

    def projects(self):
        """Return the projects contained in this collection"""
        return self._pagination(
            f"{self.root}/collections/{self.collection_id}/projects.json",
            {"collection_id": self.collection_id},
        )
293
294
295
class ArtstationCollectionsExtractor(ArtstationExtractor):
    """Extractor for an artstation user's collections"""
    subcategory = "collections"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/collections/?$")
    example = "https://www.artstation.com/USER/collections"

    def items(self):
        """Queue every collection of the user for the collection extractor"""
        collections = self.request_json(
            self.root + "/collections.json",
            params={"username": self.user},
            notfound="collections",
        )

        for collection in collections:
            collection["_extractor"] = ArtstationCollectionExtractor
            url = f"{self.root}/{self.user}/collections/{collection['id']}"
            yield Message.Queue, url, collection
311
312
313
class ArtstationChallengeExtractor(ArtstationExtractor):
    """Extractor for submissions of artstation challenges"""
    subcategory = "challenge"
    filename_fmt = "{submission_id}_{asset_id}_{filename}.{extension}"
    directory_fmt = ("{category}", "Challenges",
                     "{challenge[id]} - {challenge[title]}")
    archive_fmt = "c_{challenge[id]}_{asset_id}"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/contests/[^/?#]+/challenges/(\d+)"
               r"/?(?:\?sorting=([a-z]+))?")
    example = "https://www.artstation.com/contests/NAME/challenges/12345"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.challenge_id = match[1]
        self.sorting = match[2] or "popular"

    def items(self):
        """Yield all files linked in the updates of all submissions"""
        base = f"{self.root}/contests/_/challenges/{self.challenge_id}"

        challenge = self.request_json(f"{base}.json")
        yield Message.Directory, {"challenge": challenge}

        update_url = f"{self.root}/contests/submission_updates.json"
        submissions = self._pagination(
            f"{base}/submissions.json", {"sorting": self.sorting})

        for submission in submissions:
            updates = self._pagination(
                update_url, params={"submission_id": submission["id"]})

            for update in updates:
                del update["replies"]
                update["challenge"] = challenge

                # every 'href' target in the update's HTML body is a file
                links = text.extract_iter(
                    update["body_presentation_html"], ' href="', '"')
                for url in links:
                    update["asset_id"] = self._id_from_url(url)
                    text.nameext_from_url(url, update)
                    yield Message.Url, self._no_cache(url), update

    def _id_from_url(self, url):
        """Get an image's submission ID from its URL"""
        # the ID is spread over path segments 7 to 9
        segments = url.split("/")[7:10]
        return text.parse_int("".join(segments))
357
358
359
class ArtstationSearchExtractor(ArtstationExtractor):
    """Extractor for artstation search results"""
    subcategory = "search"
    directory_fmt = ("{category}", "Searches", "{search[query]}")
    archive_fmt = "s_{search[query]}_{asset[id]}"
    pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
               r"/search/?\?([^#]+)")
    example = "https://www.artstation.com/search?query=QUERY"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        query = text.parse_query(match[1])
        self.params = query
        # the search term may be given as either 'query' or 'q'
        self.query = text.unquote(query.get("query") or query.get("q", ""))
        self.sorting = query.get("sort_by", "relevance").lower()
        self.tags = query.get("tags", "").split(",")

    def metadata(self):
        """Return the search parameters as 'search' metadata"""
        search = {
            "query" : self.query,
            "sorting": self.sorting,
            "tags" : self.tags,
        }
        return {"search": search}

    def projects(self):
        """Return the projects matching the search query and filters"""
        # every '*_ids' parameter and 'tags' becomes an 'include' filter
        filters = [
            {
                "field" : key,
                "method": "include",
                "value" : value.split(","),
            }
            for key, value in self.params.items()
            if key.endswith("_ids") or key == "tags"
        ]

        pro_first = "1" if self.config("pro-first", True) else "0"
        payload = {
            "query" : self.query,
            "page" : None,
            "per_page" : 50,
            "sorting" : self.sorting,
            "pro_first" : pro_first,
            "filters" : filters,
            "additional_fields": (),
        }
        return self._pagination(
            f"{self.root}/api/v2/search/projects.json", json=payload)
404
405
406
class ArtstationArtworkExtractor(ArtstationExtractor):
    """Extractor for projects on artstation's artwork page"""
    subcategory = "artwork"
    directory_fmt = ("{category}", "Artworks", "{artwork[sorting]!c}")
    archive_fmt = "A_{asset[id]}"
    pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
               r"/artwork/?\?([^#]+)")
    example = "https://www.artstation.com/artwork?sorting=SORT"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.query = text.parse_query(match[1])

    def metadata(self):
        """Return the parsed URL query as 'artwork' metadata"""
        return {"artwork": self.query}

    def projects(self):
        """Return the projects listed for the given query parameters"""
        # pass a copy, so that pagination does not modify 'self.query'
        params = self.query.copy()
        return self._pagination(f"{self.root}/projects.json", params)
425
426
427
class ArtstationImageExtractor(ArtstationExtractor):
    """Extractor for images from a single artstation project"""
    subcategory = "image"
    pattern = (r"(?:https?://)?(?:"
               r"(?:[\w-]+\.)?artstation\.com/(?:artwork|projects|search)"
               r"|artstn\.co/p)/(\w+)")
    example = "https://www.artstation.com/artwork/abcde"

    def __init__(self, match):
        ArtstationExtractor.__init__(self, match)
        self.project_id = match[1]
        self.assets = None

    def metadata(self):
        """Fetch the project's assets and return its user's metadata"""
        fetched = ArtstationExtractor.get_project_assets(
            self, self.project_id)
        self.assets = list(fetched)

        # the username is taken from the first asset, if there is one
        try:
            first = self.assets[0]
            self.user = first["user"]["username"]
        except IndexError:
            self.user = ""

        return ArtstationExtractor.metadata(self)

    def projects(self):
        """Return the single project given by the URL"""
        project = {"hash_id": self.project_id}
        return (project,)

    def get_project_assets(self, project_id):
        """Return the assets already fetched by metadata()"""
        return self.assets
454
455
456
class ArtstationFollowingExtractor(ArtstationExtractor):
    """Extractor for a user's followed users"""
    subcategory = "following"
    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
               r"/(?!artwork|projects|search)([^/?#]+)/following")
    example = "https://www.artstation.com/USER/following"

    def items(self):
        """Queue the profile of every followed user for the user extractor"""
        following = self._pagination(
            f"{self.root}/users/{self.user}/following.json")

        for user in following:
            user["_extractor"] = ArtstationUserExtractor
            profile_url = f"{self.root}/{user['username']}"
            yield Message.Queue, profile_url, user
469
470