Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/scripts/init.py
8754 views
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
# Copyright 2025 Mike Fährmann
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License version 2 as
8
# published by the Free Software Foundation.
9
10
"""Initialize extractor modules"""
11
12
import re
13
import logging
14
import argparse
15
import datetime as dt
16
import util # noqa
17
18
from gallery_dl import text
19
20
# Logger used by every step of this script.
LOG = logging.getLogger("init")
# Empty mapping; not referenced by any other code visible in this file.
NONE = dict()
# Encoding declaration written at the top of each generated file.
ENCODING = "# -*- coding: utf-8 -*-\n"
# GPLv2 notice written into the header of each generated file.
LICENSE = (
    "# This program is free software; you can redistribute it and/or modify\n"
    "# it under the terms of the GNU General Public License version 2 as\n"
    "# published by the Free Software Foundation.\n"
)
30
31
32
def init_extractor(args):
    """Create or update every file that belongs to a new extractor.

    Depending on the parsed options this writes the extractor module,
    registers it in the extractor package's module list, writes an empty
    test-results file, and adds the site name to 'supportedsites.py'.
    Every touched path is logged and, when ``args.git`` is set, handed to
    ``util.git("add", ...)`` afterwards, whether or not the step succeeded.

    args: namespace as returned by parse_args()
    """
    category = args.category
    extractor_dir = ("gallery_dl", "extractor")

    # Each job: (target path, callback, edit-existing-file flag)
    jobs = []
    if args.init_module:
        jobs.append((util.path(*extractor_dir, f"{category}.py"),
                     generate_module, False))
        jobs.append((util.path(*extractor_dir, "__init__.py"),
                     insert_into_modules_list, True))
    if args.init_test:
        jobs.append((util.path("test", "results", f"{category}.py"),
                     generate_test, False))
    if args.site_name:
        jobs.append((util.path("scripts", "supportedsites.py"),
                     insert_into_supportedsites, True))

    for path, func, edit_existing in jobs:
        LOG.info(util.trim(path))

        if edit_existing:
            # presumably util.lines() writes the modified list back when
            # the context exits -- confirm in scripts/util.py
            with util.lines(path) as content:
                inserted = func(args, content)
                if not inserted:
                    LOG.warning("'%s' already present", category)
        else:
            try:
                # args.open_mode is "x" (fail if the file exists) unless
                # --force switched it to "w"
                with util.open(path, args.open_mode) as fp:
                    fp.write(func(args))
            except FileExistsError:
                LOG.warning("File already present")
            except Exception as exc:
                LOG.error("%s: %s", exc.__class__.__name__, exc, exc_info=exc)

        if args.git:
            util.git("add", path)
66
67
68
###############################################################################
69
# Extractor ###################################################################
70
71
def generate_module(args):
    """Return the full source text of a new extractor module.

    The text is made up of the encoding line, an optional copyright line
    (only when ``args.copyright`` is non-empty), the license notice, a
    module docstring naming ``args.root``, and the extractor skeleton
    selected by ``args.type``: "manga", "user", or the basic skeleton for
    any other value.
    """
    skeletons = {
        "manga": generate_extractors_manga,
        "user": generate_extractors_user,
    }
    build_skeleton = skeletons.get(args.type, generate_extractors_basic)

    holder = args.copyright
    if holder:
        notice = f"\n# Copyright {dt.date.today().year} {holder}\n#"
    else:
        # falsy value is interpolated as-is (normally the empty string)
        notice = holder

    return f'''\
{ENCODING}{notice}
{LICENSE}
"""Extractors for {args.root}/"""

{build_skeleton(args)}\
'''
90
91
92
def generate_extractors_basic(args):
    """Return skeleton code for a base extractor plus its subcategories.

    The result contains the imports, the BASE_PATTERN definition, a
    '<Category>Extractor' base class, and one stub subclass for every
    entry in ``args.subcategories`` (lower-cased).
    """
    name = args.category
    cls = name.capitalize()

    parts = [f'''\
from .common import Extractor, Message
from .. import text

{build_base_pattern(args)}

class {cls}Extractor(Extractor):
    """Base class for {name} extractors"""
    category = "{name}"
    root = "{args.root}"
''']

    for entry in args.subcategories:
        sub = entry.lower()
        # every stub is preceded by two blank lines
        parts.append(f'''

class {cls}{sub.capitalize()}Extractor({cls}Extractor):
    subcategory = "{sub}"
    pattern = BASE_PATTERN + r"/PATH"
    example = "{args.root}/PATH"

    def items(self):
        pass
''')

    return "".join(parts)
122
123
124
def generate_extractors_manga(args):
    """Return skeleton code for a manga site.

    The result contains a shared '<Category>Base' class, a chapter
    extractor, a manga extractor, and a memcache-decorated '_manga_info'
    helper. The method bodies are placeholders that refer to names the
    template does not define ('chapter', 'manga_id', 'url', ...); they
    must be filled in by hand.
    """
    name = args.category
    cls = name.capitalize()
    root = args.root
    base_pattern = build_base_pattern(args)

    # doubled braces ({{ }}) are literal braces in the generated code
    return f'''\
from .common import ChapterExtractor, MangaExtractor
from .. import text
from ..cache import memcache

{base_pattern}

class {cls}Base():
    """Base class for {name} extractors"""
    category = "{name}"
    root = "{root}"


class {cls}ChapterExtractor({cls}Base, ChapterExtractor):
    """Extractor for {name} manga chapters"""
    pattern = BASE_PATTERN + r"/PATH"
    example = "{root}/PATH"

    def __init__(self, match):
        url = f"{{self.root}}/PATH"
        ChapterExtractor.__init__(self, match, url)

    def metadata(self, page):
        chapter, sep, minor = chapter.partition(".")

        return {{
            **_manga_info(self, manga_id),
            "manga"   : text.unescape(manga),
            "manga_id": text.parse_int(manga_id),
            "title"   : "",
            "volume"  : text.parse_int(volume),
            "chapter" : text.parse_int(chapter),
            "chapter_minor": sep + minor,
            "chapter_id"   : text.parse_int(chapter_id),
            "lang"    : "en",
            "language": "English",
        }}

    def images(self, page):
        return [
            (url, None)
            for url in text.extract_iter(page, "", "")
        ]


class {cls}MangaExtractor({cls}Base, MangaExtractor):
    """Extractor for {name} manga"""
    chapterclass = {cls}ChapterExtractor
    pattern = BASE_PATTERN + r"/PATH"
    example = "{root}/PATH"

    def __init__(self, match):
        url = f"{{self.root}}/PATH"
        MangaExtractor.__init__(self, match, url)

    def chapters(self, page):
        results = []

        while True:
            results.append((url, None))

        return results


@memcache(keyarg=1)
def _manga_info(self, slug):
    return {{}}
'''
196
197
198
def generate_extractors_user(args):
    """Return skeleton code for a site organized around user profiles.

    The result contains the imports, BASE_PATTERN and USER_PATTERN, a
    '<Category>Extractor' base class, and a '<Category>UserExtractor'
    dispatcher class.

    The dispatcher refers to a '<Category>InfoExtractor' class that this
    template does not define; it has to be added by hand.
    """
    cat = args.category
    ccat = cat.capitalize()

    # The class statement of the user extractor must end with ':',
    # otherwise the generated module cannot even be imported.
    # Top-level classes are separated by two blank lines, matching the
    # output of the other skeleton generators.
    return f'''\
from .common import Extractor, Message, Dispatch
from .. import text

{build_base_pattern(args)}
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)"


class {ccat}Extractor(Extractor):
    """Base class for {cat} extractors"""
    category = "{cat}"
    root = "{args.root}"


class {ccat}UserExtractor(Dispatch, {ccat}Extractor):
    """Extractor for {cat} user profiles"""
    pattern = USER_PATTERN + r"/?(?:$|\\?|#)"
    example = "{args.root}/USER/"

    def items(self):
        base = self.root + "/"
        return self._dispatch_extractors((
            ({ccat}InfoExtractor, base + "info"),
        ), ("posts",))
'''
226
227
228
def build_base_pattern(args):
    """Return the 'BASE_PATTERN = r"..."' source line for ``args.domain``.

    A domain with at most one dot, or one whose subdomain is exactly
    "www", gets an optional 'www.' prefix in the pattern. Any other
    subdomain (everything before the last two labels) is required and
    regex-escaped. The returned text ends with a newline.
    """
    host = args.domain
    prefix = "(?:www\\.)?"

    if host.count(".") >= 2:
        # keep only the last two labels as the domain proper
        sub, name, tld = host.rsplit(".", 2)
        host = ".".join((name, tld))
        if sub != "www":
            prefix = re.escape(f"{sub}.")

    pattern = f"(?:https?://)?{prefix}{re.escape(host)}"
    return f'BASE_PATTERN = r"{pattern}"\n'
243
244
245
###############################################################################
246
# Test Results ################################################################
247
248
def generate_test(args):
    """Return the source text of an empty test-results file.

    The file holds the encoding line, the license notice, an import of
    the extractor module for ``args.category``, and an empty '__tests__'
    tuple. A category starting with a decimal digit cannot be named in a
    'from ... import' statement, so it is loaded through __import__()
    and bound to '_<category>' instead.
    """
    category = args.category

    if not category[0].isdecimal():
        import_stmt = f"from gallery_dl.extractor import {category}\n"
    else:
        import_stmt = (
            f'gallery_dl = __import__("gallery_dl.extractor.{category}")\n'
            f'_{category} = getattr(gallery_dl.extractor, "{category}")\n'
        )

    # Joining with "\n" puts one empty line after each multi-line part
    # and two before the '__tests__' tuple.
    return "\n".join((
        ENCODING,
        LICENSE,
        import_stmt,
        "",
        "__tests__ = (",
        ")",
        "",
    ))
269
270
271
###############################################################################
272
# Modules List ################################################################
273
274
def insert_into_modules_list(args, lines):
    """Insert ``args.category`` into the 'modules = ' list in ``lines``.

    ``lines`` is modified in place. Lines after the one starting with
    'modules = ' are scanned; the new entry goes in front of the first
    line whose quoted name sorts after the category or equals "booru".

    Returns False when the category is already listed, True after
    inserting it.
    """
    category = args.category

    # NOTE(review): four-space entry indentation assumed -- confirm
    # against the layout of the list in extractor/__init__.py
    entry = f'    "{category}",\n'
    if entry in lines:
        return False

    in_list = False
    for idx, line in enumerate(lines):
        if not in_list:
            in_list = line.startswith("modules = ")
            continue
        # presumably yields the text between the first pair of double
        # quotes -- see gallery_dl.text.extr
        name = text.extr(line, '"', '"')
        if name == category:
            return False
        if name > category or name == "booru":
            break

    # Without an earlier 'break', idx is the index of the last line.
    lines.insert(idx, entry)
    return True
294
295
296
###############################################################################
297
# Supported Sites #############################################################
298
299
def insert_into_supportedsites(args, lines):
    """Insert a 'category: site name' entry into CATEGORY_MAP in ``lines``.

    ``lines`` is modified in place. Lines after the one starting with
    'CATEGORY_MAP = ' are scanned; the new entry goes in front of the
    first line whose quoted key sorts after the category.

    Returns False when the category already has an entry, True after
    inserting one.
    """
    category = args.category

    in_map = False
    for idx, line in enumerate(lines):
        if not in_map:
            in_map = line.startswith("CATEGORY_MAP = ")
            continue
        # presumably yields the text between the first pair of double
        # quotes -- see gallery_dl.text.extr
        key = text.extr(line, '"', '"')
        if key == category:
            return False
        if key > category:
            break

    # Pad short category names so the colons of the entries line up.
    padding = " " * max(15 - len(category), 0)
    # NOTE(review): four-space entry indentation assumed -- confirm
    # against the layout of CATEGORY_MAP in scripts/supportedsites.py
    entry = f'''    "{category}"{padding}: "{args.site_name}",\n'''
    lines.insert(idx, entry)
    return True
317
318
319
###############################################################################
320
# Command-Line Options ########################################################
321
322
def parse_args(args=None):
    """Parse command-line options and derive category, root and domain.

    args: list of argument strings to parse; None (the default) makes
          argparse read sys.argv[1:]

    Returns the argparse namespace with these values normalized:
      category  lower-cased; when a URL is given instead of a name, the
                second-to-last label of its host with non-word
                characters removed
      root      root URL without trailing slash; "https://" is prepended
                when no scheme was given
      domain    host part of 'root' without a leading "www."; empty
                string when no root is available

    Exits through parser.error() when no root URL can be determined and
    module generation has not been disabled with -M/--no-module.
    """
    # The argument list belongs to parse_args() below; the first
    # positional parameter of ArgumentParser is the program name.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-s", "--subcategory",
        dest="subcategories", metavar="SUBCAT", action="append", default=[])
    parser.add_argument(
        "-n", "--name",
        dest="site_name", metavar="TITLE")
    parser.add_argument(
        "-c", "--copyright",
        dest="copyright", metavar="NAME", default="")
    parser.add_argument(
        "-C",
        dest="copyright", action="store_const", const="Mike Fährmann")
    parser.add_argument(
        "-F", "--force",
        dest="open_mode", action="store_const", const="w", default="x")
    parser.add_argument(
        "-g", "--git",
        dest="git", action="store_true")
    parser.add_argument(
        "-M", "--no-module",
        dest="init_module", action="store_false")
    parser.add_argument(
        "-T", "--no-test",
        dest="init_test", action="store_false")
    parser.add_argument(
        "-t", "--type",
        dest="type", metavar="TYPE")
    parser.add_argument(
        "--manga",
        dest="type", action="store_const", const="manga")
    parser.add_argument(
        "--base",
        dest="type", action="store_const", const="base")
    parser.add_argument(
        "--user",
        dest="type", action="store_const", const="user")

    parser.add_argument("category")
    parser.add_argument("root", nargs="?")

    args = parser.parse_args(args)
    args.category = args.category.lower()

    if "://" in args.category:
        # A URL was given in place of a category name:
        # "scheme://host/path" splits into ["scheme:", "", "host", "path"]
        base = args.category.split("/", 3)
        if not args.root:
            args.root = "/".join(base[:3])
        args.category = re.sub(r"\W+", "", base[2].split(".")[-2])

    if root := args.root:
        if "://" in root:
            root = root.rstrip("/")
            domain = root[root.find("://")+3:]
        else:
            root = root.strip(":/")
            domain = root
            root = f"https://{root}"

        if domain.startswith("www."):
            domain = domain[4:]

        args.root = root
        args.domain = domain
    elif args.init_module:
        parser.error("'root' URL required")
    else:
        args.domain = ""

    return args
394
395
396
def main():
    """Parse the command line and initialize the requested files."""
    init_extractor(parse_args())
399
400
401
if __name__ == "__main__":
    # Log everything down to DEBUG, prefixed with the level name.
    logging.basicConfig(format="[%(levelname)s] %(message)s",
                        level=logging.DEBUG)
    main()
407
408