Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/test/test_text.py
8783 views
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
# Copyright 2015-2026 Mike Fährmann
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License version 2 as
8
# published by the Free Software Foundation.
9
10
import os
11
import sys
12
import unittest
13
14
# Put the parent of this file's directory at the front of the module search
# path, so the 'gallery_dl' import below is looked up there first.
sys.path.insert(
    0,
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
)
from gallery_dl import text, util  # noqa E402
16
17
18
# Arguments of an unsuitable type. The tests below feed each of them to
# the string helpers and expect the helper's "empty" result in return.
INVALID = (
    (),
    [],
    {},
    None,
    1,
    2.3,
)

# Variant used for helpers that accept numbers as regular input
# (e.g. the numeric parsers): the numbers are left out and the
# empty string is included instead.
INVALID_ALT = (
    (),
    [],
    {},
    None,
    "",
)
20
21
22
class TestText(unittest.TestCase):
    """Unit tests for the helper functions of the 'gallery_dl.text' module.

    Most test methods bind the function under test to the default
    argument 'f', which keeps the individual assertions short.
    Loops over several inputs run inside 'subTest' blocks, so that a
    failure names the offending input and the remaining inputs still run.
    """

    # re_compile() must return a distinct (but equal) pattern object,
    # while repeated re() calls must return the very same object
    def test_re(self):
        p1 = text.re_compile("foo")
        p2 = text.re("foo")
        p3 = text.re("foo")

        Pattern = text.re_module.Pattern
        self.assertIsInstance(p1, Pattern)
        self.assertIsInstance(p2, Pattern)
        self.assertIsInstance(p3, Pattern)

        self.assertEqual(p1, p2)
        self.assertIsNot(p1, p2)
        self.assertIs(p2, p3)

    # remove_html(): strip tags and return the remaining text
    def test_remove_html(self, f=text.remove_html):
        result = "Hello World."

        # standard usage
        self.assertEqual(f(""), "")
        self.assertEqual(f("Hello World."), result)
        self.assertEqual(f(" Hello World. "), result)
        self.assertEqual(f("Hello<br/>World."), result)
        self.assertEqual(
            f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)

        # empty HTML
        self.assertEqual(f("<div></div>"), "")
        self.assertEqual(f(" <div> </div> "), "")

        # malformed HTML
        self.assertEqual(f("<div</div>"), "")
        self.assertEqual(f("<div<Hello World.</div>"), "")

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), "")

    # split_html(): return the text fragments between tags as a list
    def test_split_html(self, f=text.split_html):
        result = ["Hello", "World."]
        empty = []

        # standard usage
        self.assertEqual(f(""), empty)
        self.assertEqual(f("Hello World."), ["Hello World."])
        self.assertEqual(f(" Hello World. "), ["Hello World."])
        self.assertEqual(f("Hello<br/>World."), result)
        self.assertEqual(f(" Hello <br/> World. "), result)
        self.assertEqual(
            f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)

        # escaped HTML entities
        self.assertEqual(
            f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])

        # empty HTML
        self.assertEqual(f("<div></div>"), empty)
        self.assertEqual(f(" <div> </div> "), empty)

        # malformed HTML
        self.assertEqual(f("<div</div>"), empty)
        self.assertEqual(f("<div<Hello World.</div>"), empty)

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), empty)

    # slugify(): lowercase result with '-' as separator
    def test_slugify(self, f=text.slugify):
        self.assertEqual(f("Hello World"), "hello-world")
        self.assertEqual(f("-HeLLo---World-"), "hello-world")
        self.assertEqual(f("_-H#e:l#l:o+\t+W?o!rl=d-_"), "hello-world")
        self.assertEqual(f("_Hello_World_"), "hello_world")

        # nothing left after slugifying
        self.assertEqual(f(""), "")
        self.assertEqual(f("-"), "")
        self.assertEqual(f("--"), "")

        # non-string arguments
        self.assertEqual(f(()), "")
        self.assertEqual(f([]), "")
        self.assertEqual(f({}), "")
        self.assertEqual(f(None), "none")
        self.assertEqual(f(1), "1")
        self.assertEqual(f(2.3), "23")

    # sanitize_whitespace(): trim and normalize whitespace
    def test_sanitize_whitespace(self, f=text.sanitize_whitespace):
        self.assertEqual(f("Hello World"), "Hello World")
        self.assertEqual(f("Hello\tWorld"), "Hello World")
        self.assertEqual(f(" Hello World "), "Hello World")
        self.assertEqual(f("\tHello \n\tWorld "), "Hello World")

        # whitespace-only input (one space, two spaces, mixed)
        self.assertEqual(f(""), "")
        self.assertEqual(f(" "), "")
        self.assertEqual(f("  "), "")
        self.assertEqual(f(" \t\n "), "")

    # ensure_http_scheme(): prepend 'https://' when no http(s) scheme exists
    def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
        result = "https://example.org/filename.ext"

        # standard usage
        self.assertEqual(f(""), "")
        self.assertEqual(f("example.org/filename.ext"), result)
        self.assertEqual(f("/example.org/filename.ext"), result)
        self.assertEqual(f("//example.org/filename.ext"), result)
        self.assertEqual(f("://example.org/filename.ext"), result)

        # no change
        self.assertEqual(f(result), result)
        self.assertEqual(
            f("http://example.org/filename.ext"),
            "http://example.org/filename.ext",
        )

        # a scheme other than http(s) is kept
        # and gets 'https://' put in front of it
        self.assertEqual(
            f("htp://example.org/filename.ext"),
            "https://htp://example.org/filename.ext",
        )

        # invalid arguments
        for value in INVALID_ALT:
            with self.subTest(value=value):
                self.assertEqual(f(value), value)

    # root_from_url(): scheme + host, without any path
    def test_root_from_url(self, f=text.root_from_url):
        result = "https://example.org"
        self.assertEqual(f("https://example.org"), result)
        self.assertEqual(f("https://example.org/"), result)
        self.assertEqual(f("https://example.org/path"), result)
        self.assertEqual(f("example.org/"), result)
        self.assertEqual(f("example.org/path/"), result)

        # explicit 'http://', either in the URL or as second argument
        result = "http://example.org"
        self.assertEqual(f("http://example.org"), result)
        self.assertEqual(f("http://example.org/"), result)
        self.assertEqual(f("http://example.org/path/"), result)
        self.assertEqual(f("example.org/", "http://"), result)

    # filename_from_url(): last path segment without query and fragment
    def test_filename_from_url(self, f=text.filename_from_url):
        result = "filename.ext"

        # standard usage
        self.assertEqual(f(""), "")
        self.assertEqual(f("filename.ext"), result)
        self.assertEqual(f("/filename.ext"), result)
        self.assertEqual(f("example.org/filename.ext"), result)
        self.assertEqual(f("http://example.org/v2/filename.ext"), result)
        self.assertEqual(
            f("http://example.org/v2/filename.ext?param=value#frag"), result)

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), "")

    # ext_from_url(): lowercase filename extension of a URL
    def test_ext_from_url(self, f=text.ext_from_url):
        result = "ext"

        # standard usage
        self.assertEqual(f(""), "")
        self.assertEqual(f("filename"), "")
        self.assertEqual(f("filename.ext"), result)
        self.assertEqual(f("/filename.ExT"), result)
        self.assertEqual(f("example.org/filename.ext"), result)
        self.assertEqual(f("http://example.org/v2/filename.ext"), result)
        self.assertEqual(
            f("http://example.org/v2/filename.ext?param=value#frag"), result)

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), "")

    # nameext_from_url(): 'filename' / 'extension' dict from a URL
    def test_nameext_from_url(self, f=text.nameext_from_url):
        empty = {"filename": "", "extension": ""}
        result = {"filename": "filename", "extension": "ext"}

        # standard usage
        self.assertEqual(f(""), empty)
        self.assertEqual(f("filename.ext"), result)
        self.assertEqual(f("/filename.ExT"), result)
        self.assertEqual(f("example.org/filename.ext"), result)
        self.assertEqual(f("http://example.org/v2/filename.ext"), result)
        self.assertEqual(
            f("http://example.org/v2/filename.ext?param=value#frag"), result)
        self.assertEqual(
            f("http://example.org/v2/foo%202?bar&<>.ext?param=value#frag"),
            {"filename": "foo 2", "extension": ""},
        )

        # long "extension"
        fn = "httpswww.example.orgpath-path-path-path-path-path-path-path"
        self.assertEqual(f(fn), {"filename": fn, "extension": ""})

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), empty)

    # nameext_from_name(): 'filename' / 'extension' dict from a plain name
    def test_nameext_from_name(self, f=text.nameext_from_name):
        self.assertEqual(
            f(""),
            {"filename": "", "extension": ""},
        )
        self.assertEqual(
            f("filename.ext"),
            {"filename": "filename", "extension": "ext"},
        )
        # unlike with URLs, '%20' and '?' are left untouched
        self.assertEqual(
            f("foo%202?bar&<>.ext"),
            {"filename": "foo%202?bar&<>", "extension": "ext"},
        )

        # long "extension"
        fn = "httpswww.example.orgpath-path-path-path-path-path-path-path"
        self.assertEqual(f(fn), {"filename": fn, "extension": ""})

    # extract(): (value, position) tuple for the first match
    def test_extract(self, f=text.extract):
        txt = "<a><b>"
        self.assertEqual(f(txt, "<", ">"), ("a", 3))
        self.assertEqual(f(txt, "X", ">"), (None, 0))
        self.assertEqual(f(txt, "<", "X"), (None, 0))

        # 'pos' argument
        for i in range(1, 4):
            with self.subTest(pos=i):
                self.assertEqual(f(txt, "<", ">", i), ("b", 6))
        for i in range(4, 10):
            with self.subTest(pos=i):
                self.assertEqual(f(txt, "<", ">", i), (None, i))

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value, "<", ">"), (None, 0))
                self.assertEqual(f(txt, value, ">"), (None, 0))
                self.assertEqual(f(txt, "<", value), (None, 0))

    # extr(): value of the first match or a default
    def test_extr(self, f=text.extr):
        txt = "<a><b>"
        self.assertEqual(f(txt, "X", ">"), "")
        self.assertEqual(f(txt, "<", "X"), "")
        self.assertEqual(f(txt, "<", ">"), "a")
        self.assertEqual(f(txt, "><", ">"), "b")

        # 'default' argument
        self.assertEqual(f(txt, "<", "X", None), None)
        self.assertEqual(f(txt, "<", "X", default=None), None)
        self.assertEqual(f(txt, "<", "X", default=()), ())

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value, "<", ">"), "")
                self.assertEqual(f(txt, value, ">"), "")
                self.assertEqual(f(txt, "<", value), "")

    # rextract(): like extract(), but searching from the end
    def test_rextract(self, f=text.rextract):
        txt = "<a><b>"
        self.assertEqual(f(txt, "<", ">"), ("b", 3))
        self.assertEqual(f(txt, "X", ">"), (None, -1))
        self.assertEqual(f(txt, "<", "X"), (None, -1))

        # 'pos' argument
        for i in range(10, 3, -1):
            with self.subTest(pos=i):
                self.assertEqual(f(txt, "<", ">", i), ("b", 3))
        for i in range(3, 0, -1):
            with self.subTest(pos=i):
                self.assertEqual(f(txt, "<", ">", i), ("a", 0))

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value, "<", ">"), (None, -1))
                self.assertEqual(f(txt, value, ">"), (None, -1))
                self.assertEqual(f(txt, "<", value), (None, -1))

    # rextr(): like extr(), but searching from the end
    def test_rextr(self, f=text.rextr):
        txt = "<a><b>"
        self.assertEqual(f(txt, "<", ">"), "b")
        self.assertEqual(f(txt, "X", ">"), "")
        self.assertEqual(f(txt, "<", "X"), "")

        # 'pos' argument
        for i in range(10, 3, -1):
            with self.subTest(pos=i):
                self.assertEqual(f(txt, "<", ">", i), "b")
        for i in range(3, 0, -1):
            with self.subTest(pos=i):
                self.assertEqual(f(txt, "<", ">", i), "a")

        # 'default' argument
        self.assertEqual(f(txt, "[", "]", -1, "none"), "none")
        self.assertEqual(f(txt, "[", "]", None, "none"), "none")
        self.assertEqual(f(txt, "[", "]", default="none"), "none")

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value, "<", ">"), "")
                self.assertEqual(f(txt, value, ">"), "")
                self.assertEqual(f(txt, "<", value), "")

    # extract_all(): apply several (name, begin, end) rules in sequence
    def test_extract_all(self, f=text.extract_all):
        txt = "[c][b][a]: xyz! [d][e"

        self.assertEqual(
            f(txt, ()), ({}, 0))
        self.assertEqual(
            f(txt, (("C", "[", "]"), ("B", "[", "]"), ("A", "[", "]"))),
            ({"A": "a", "B": "b", "C": "c"}, 9),
        )

        # 'None' as field name
        self.assertEqual(
            f(txt, ((None, "[", "]"), (None, "[", "]"), ("A", "[", "]"))),
            ({"A": "a"}, 9),
        )
        self.assertEqual(
            f(txt, ((None, "[", "]"), (None, "[", "]"), (None, "[", "]"))),
            ({}, 9),
        )

        # failed matches
        self.assertEqual(
            f(txt, (("C", "[", "]"), ("X", "X", "X"), ("B", "[", "]"))),
            ({"B": "b", "C": "c", "X": None}, 6),
        )

        # 'pos' argument
        self.assertEqual(
            f(txt, (("B", "[", "]"), ("A", "[", "]")), pos=1),
            ({"A": "a", "B": "b"}, 9),
        )

        # 'values' argument
        self.assertEqual(
            f(txt, (("C", "[", "]"),), values={"A": "a", "B": "b"}),
            ({"A": "a", "B": "b", "C": "c"}, 3),
        )

        # the dict passed as 'values' is itself returned, not a copy
        vdict = {}
        rdict, _ = f(txt, (), values=vdict)
        self.assertIs(vdict, rdict)

    # extract_iter(): iterate over all matches
    def test_extract_iter(self, f=text.extract_iter):
        txt = "[c][b][a]: xyz! [d][e"

        def g(*args):
            # collect everything the iterator yields, for comparison
            return list(f(*args))

        self.assertEqual(
            g("", "[", "]"), [])
        self.assertEqual(
            g("[a]", "[", "]"), ["a"])
        self.assertEqual(
            g(txt, "[", "]"), ["c", "b", "a", "d"])
        self.assertEqual(
            g(txt, "X", "X"), [])
        self.assertEqual(
            g(txt, "[", "]", 6), ["a", "d"])

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(g(value, "<", ">"), [])
                self.assertEqual(g(txt, value, ">"), [])
                self.assertEqual(g(txt, "<", value), [])

    # extract_from(): stateful extractor that continues after each match
    def test_extract_from(self, f=text.extract_from):
        txt = "[c][b][a]: xyz! [d][e"

        e = f(txt)
        self.assertEqual(e("[", "]"), "c")
        self.assertEqual(e("[", "]"), "b")
        self.assertEqual(e("[", "]"), "a")
        self.assertEqual(e("[", "]"), "d")
        self.assertEqual(e("[", "]"), "")
        self.assertEqual(e("[", "]"), "")

        # 'pos' and 'default' arguments
        e = f(txt, pos=6, default="END")
        self.assertEqual(e("[", "]"), "a")
        self.assertEqual(e("[", "]"), "d")
        self.assertEqual(e("[", "]"), "END")
        self.assertEqual(e("[", "]"), "END")

    # extract_urls(): list all URLs found in a piece of text
    def test_extract_urls(self, f=text.extract_urls):
        txt = ""
        self.assertEqual(f(txt), [])

        txt = "<p>foo </p> &amp; bar <p> </p>"
        self.assertEqual(f(txt), [])

        txt = """<p>
<a href="http://www.example.com">Lorem ipsum dolor sit amet</a>.
Duis aute irure <a href="http://blog.example.org/lorem?foo=bar">
http://blog.example.org</a>.
</p>"""
        self.assertEqual(f(txt), ["http://www.example.com",
                                  "http://blog.example.org/lorem?foo=bar",
                                  "http://blog.example.org"])

    # parse_unicode_escapes(): replace '\uXXXX' sequences with characters
    def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
        self.assertEqual(f(""), "")
        self.assertEqual(f("foobar"), "foobar")
        self.assertEqual(f("foo’bar"), "foo’bar")
        self.assertEqual(f("foo\\u2019bar"), "foo’bar")
        self.assertEqual(f("foo\\u201bar"), "foo‛ar")
        # 'z' is no hex digit, so this is left as it is
        self.assertEqual(f("foo\\u201zar"), "foo\\u201zar")
        self.assertEqual(
            f("\\u2018foo\\u2019\\u2020bar\\u00ff"),
            "‘foo’†barÿ",
        )

    # parse_bytes(): byte count from numbers with an optional unit suffix
    def test_parse_bytes(self, f=text.parse_bytes):
        self.assertEqual(f(0), 0)
        self.assertEqual(f(50), 50)
        self.assertEqual(f("0"), 0)
        self.assertEqual(f("50"), 50)
        self.assertEqual(f("50k"), 50 * 1024**1)
        self.assertEqual(f("50m"), 50 * 1024**2)
        self.assertEqual(f("50g"), 50 * 1024**3)
        self.assertEqual(f("50t"), 50 * 1024**4)
        self.assertEqual(f("50p"), 50 * 1024**5)
        self.assertEqual(f(" 50p "), 50 * 1024**5)

        # fractions
        self.assertEqual(f(123.456), 123)
        self.assertEqual(f("123.456"), 123)
        self.assertEqual(f("123.567"), 124)
        self.assertEqual(f(" 123.89 "), 124)
        self.assertEqual(f("0.5M"), round(0.5 * 1024**2))

        # invalid arguments
        for value in INVALID_ALT:
            with self.subTest(value=value):
                self.assertEqual(f(value), 0)
        self.assertEqual(f("NaN"), 0)
        self.assertEqual(f("invalid"), 0)
        self.assertEqual(f(" 123 kb "), 0)

    # parse_int(): integer conversion with a fallback value
    def test_parse_int(self, f=text.parse_int):
        self.assertEqual(f(0), 0)
        self.assertEqual(f("0"), 0)
        self.assertEqual(f(123), 123)
        self.assertEqual(f("123"), 123)

        # invalid arguments
        for value in INVALID_ALT:
            with self.subTest(value=value):
                self.assertEqual(f(value), 0)
        self.assertEqual(f("123.456"), 0)
        self.assertEqual(f("zzz"), 0)
        self.assertEqual(f([1, 2, 3]), 0)
        self.assertEqual(f({1: 2, 3: 4}), 0)

        # 'default' argument
        default = "default"
        for value in INVALID_ALT:
            with self.subTest(value=value, default=default):
                self.assertEqual(f(value, default), default)
        self.assertEqual(f("zzz", default), default)

    # parse_float(): float conversion with a fallback value
    def test_parse_float(self, f=text.parse_float):
        self.assertEqual(f(0), 0.0)
        self.assertEqual(f("0"), 0.0)
        self.assertEqual(f(123), 123.0)
        self.assertEqual(f("123"), 123.0)
        self.assertEqual(f(123.456), 123.456)
        self.assertEqual(f("123.456"), 123.456)

        # invalid arguments
        for value in INVALID_ALT:
            with self.subTest(value=value):
                self.assertEqual(f(value), 0.0)
        self.assertEqual(f("zzz"), 0.0)
        self.assertEqual(f([1, 2, 3]), 0.0)
        self.assertEqual(f({1: 2, 3: 4}), 0.0)

        # 'default' argument
        default = "default"
        for value in INVALID_ALT:
            with self.subTest(value=value, default=default):
                self.assertEqual(f(value, default), default)
        self.assertEqual(f("zzz", default), default)

    # parse_query(): query string to dict, first value per key wins
    def test_parse_query(self, f=text.parse_query):
        # standard usage
        self.assertEqual(f(""), {})
        self.assertEqual(f("foo=1"), {"foo": "1"})
        self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"})

        # missing value
        self.assertEqual(f("bar"), {})
        self.assertEqual(f("bar="), {"bar": ""})
        self.assertEqual(f("bar", empty=True), {"bar": ""})
        self.assertEqual(f("foo=1&bar"), {"foo": "1"})
        self.assertEqual(f("foo=1&bar="), {"foo": "1", "bar": ""})
        self.assertEqual(f("foo=1&bar", True), {"foo": "1", "bar": ""})
        self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"})
        self.assertEqual(f("foo=1&bar=&baz=3"),
                         {"foo": "1", "bar": "", "baz": "3"})
        self.assertEqual(f("foo=1&bar&baz=3", True),
                         {"foo": "1", "bar": "", "baz": "3"})

        # keys with identical names
        self.assertEqual(f("foo=1&foo=2"), {"foo": "1"})
        self.assertEqual(
            f("foo=1&bar=2&foo=3&bar=4"),
            {"foo": "1", "bar": "2"},
        )

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), {})

    # parse_query_list(): like parse_query(), but the keys named in the
    # second argument collect all of their values in a list
    def test_parse_query_list(self, f=text.parse_query_list):
        # standard usage
        self.assertEqual(f(""), {})
        self.assertEqual(f("foo=1"), {"foo": "1"})
        self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"})
        self.assertEqual(f("%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F"),
                         {"ä&": "あと", "#": "?"})

        # missing value
        self.assertEqual(f("bar"), {})
        self.assertEqual(f("foo=1&bar"), {"foo": "1"})
        self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"})

        # keys with identical names
        self.assertEqual(f("foo=1&foo=2", ("foo",)), {"foo": ["1", "2"]})
        self.assertEqual(
            f("foo=1&bar=2&foo=3&bar=4&foo=5", {"foo", "baz"}),
            {"foo": ["1", "3", "5"], "bar": "2"},
        )

        # invalid arguments
        for value in INVALID:
            with self.subTest(value=value):
                self.assertEqual(f(value), {})

    # build_query(): dict to percent-encoded query string
    def test_build_query(self, f=text.build_query):
        # standard usage
        self.assertEqual(f({}), "")
        self.assertEqual(f({"foo": "1"}), "foo=1")
        self.assertEqual(f({"foo": "1", "bar": "2"}), "foo=1&bar=2")

        # missing value
        self.assertEqual(f({"bar": ""}), "bar=")
        self.assertEqual(f({"foo": "1", "bar": ""}), "foo=1&bar=")
        self.assertEqual(f({"foo": "1", "bar": "", "baz": "3"}),
                         "foo=1&bar=&baz=3")

        # non-ASCII and reserved characters
        self.assertEqual(f({"ä&": "あと", "#": "?"}),
                         "%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F")
557
558
559
# Run the whole test suite when this file is executed as a script;
# importing it as a module has no such side effect.
if __name__ == "__main__":
    unittest.main()
561
562