Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/test/test_text.py
5457 views
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
# Copyright 2015-2025 Mike Fährmann
5
#
6
# This program is free software; you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License version 2 as
8
# published by the Free Software Foundation.
9
10
import os
11
import sys
12
import unittest
13
14
import datetime
15
16
# Put the parent of this file's directory at the front of the import path,
# presumably so that the 'gallery_dl' import below picks up the in-tree
# package rather than an installed copy.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17
from gallery_dl import text, util # noqa E402
18
19
20
# Empty containers, None and numbers: the tests pass these to the text helpers
# as "invalid arguments" and expect an empty / "nothing found" result.
INVALID = ((), [], {}, None, 1, 2.3)
21
# Alternative set of invalid arguments without the numeric entries but with
# the empty string; used by the ensure_http_scheme and parse_* tests.
INVALID_ALT = ((), [], {}, None, "")
22
23
24
class TestText(unittest.TestCase):
    """Unit tests for the helper functions in gallery_dl.text."""

    def test_re(self):
        # A pattern from re_compile() must equal, yet be a different object
        # than, the one from re(); repeated re() calls for the same pattern
        # string must return the very same object.
        compiled = text.re_compile("foo")
        first = text.re("foo")
        second = text.re("foo")

        pattern_type = text.re_module.Pattern
        for candidate in (compiled, first, second):
            self.assertIsInstance(candidate, pattern_type)

        self.assertEqual(compiled, first)
        self.assertIsNot(compiled, first)
        self.assertIs(first, second)

    def test_remove_html(self, f=text.remove_html):
        # Tags are expected to be dropped and the remaining text returned.
        expected = "Hello World."
        cases = (
            # standard usage
            ("", ""),
            ("Hello World.", expected),
            (" Hello World. ", expected),
            ("Hello<br/>World.", expected),
            ("<div><b class='a'>Hello</b><i>World.</i></div>", expected),
            # empty HTML
            ("<div></div>", ""),
            (" <div> </div> ", ""),
            # malformed HTML
            ("<div</div>", ""),
            ("<div<Hello World.</div>", ""),
        )
        for markup, plain in cases:
            self.assertEqual(f(markup), plain)

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), "")

    def test_split_html(self, f=text.split_html):
        # The text between tags is expected to come back as a list of parts.
        parts = ["Hello", "World."]
        nothing = []
        cases = (
            # standard usage
            ("", nothing),
            ("Hello World.", ["Hello World."]),
            (" Hello World. ", ["Hello World."]),
            ("Hello<br/>World.", parts),
            (" Hello <br/> World. ", parts),
            ("<div><b class='a'>Hello</b><i>World.</i></div>", parts),
            # escaped HTML entities
            ("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>", ["<foo>", "<bar>"]),
            # empty HTML
            ("<div></div>", nothing),
            (" <div> </div> ", nothing),
            # malformed HTML
            ("<div</div>", nothing),
            ("<div<Hello World.</div>", nothing),
        )
        for markup, pieces in cases:
            self.assertEqual(f(markup), pieces)

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), nothing)

    def test_slugify(self, f=text.slugify):
        cases = (
            ("Hello World", "hello-world"),
            ("-HeLLo---World-", "hello-world"),
            ("_-H#e:l#l:o+\t+W?o!rl=d-_", "hello-world"),
            ("_Hello_World_", "hello_world"),
            # nothing left to slugify
            ("", ""),
            ("-", ""),
            ("--", ""),
            # non-string input
            ((), ""),
            ([], ""),
            ({}, ""),
            (None, "none"),
            (1, "1"),
            (2.3, "23"),
        )
        for raw, slug in cases:
            self.assertEqual(f(raw), slug)

    def test_sanitize_whitespace(self, f=text.sanitize_whitespace):
        cases = (
            ("Hello World", "Hello World"),
            ("Hello\tWorld", "Hello World"),
            (" Hello World ", "Hello World"),
            ("\tHello \n\tWorld ", "Hello World"),
            # whitespace-only input
            ("", ""),
            (" ", ""),
            (" ", ""),
            (" \t\n ", ""),
        )
        for raw, cleaned in cases:
            self.assertEqual(f(raw), cleaned)

    def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
        secure = "https://example.org/filename.ext"
        insecure = "http://example.org/filename.ext"
        cases = (
            # standard usage
            ("", ""),
            ("example.org/filename.ext", secure),
            ("/example.org/filename.ext", secure),
            ("//example.org/filename.ext", secure),
            ("://example.org/filename.ext", secure),
            # no change
            (secure, secure),
            (insecure, insecure),
            # anything not starting with http(s) gets the prefix as well
            ("htp://example.org/filename.ext",
             "https://htp://example.org/filename.ext"),
        )
        for url, expected in cases:
            self.assertEqual(f(url), expected)

        # invalid arguments are expected to be handed back as they are
        for bad in INVALID_ALT:
            self.assertEqual(f(bad), bad)

    def test_root_from_url(self, f=text.root_from_url):
        secure_root = "https://example.org"
        secure_urls = (
            "https://example.org",
            "https://example.org/",
            "https://example.org/path",
            "example.org/",
            "example.org/path/",
        )
        for url in secure_urls:
            self.assertEqual(f(url), secure_root)

        insecure_root = "http://example.org"
        insecure_urls = (
            "http://example.org",
            "http://example.org/",
            "http://example.org/path/",
        )
        for url in insecure_urls:
            self.assertEqual(f(url), insecure_root)

        # explicit scheme as second argument
        self.assertEqual(f("example.org/", "http://"), insecure_root)

    def test_filename_from_url(self, f=text.filename_from_url):
        name = "filename.ext"
        cases = (
            # standard usage
            ("", ""),
            ("filename.ext", name),
            ("/filename.ext", name),
            ("example.org/filename.ext", name),
            ("http://example.org/v2/filename.ext", name),
            ("http://example.org/v2/filename.ext?param=value#frag", name),
        )
        for url, expected in cases:
            self.assertEqual(f(url), expected)

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), "")

    def test_ext_from_url(self, f=text.ext_from_url):
        ext = "ext"
        cases = (
            # standard usage
            ("", ""),
            ("filename", ""),
            ("filename.ext", ext),
            ("/filename.ExT", ext),
            ("example.org/filename.ext", ext),
            ("http://example.org/v2/filename.ext", ext),
            ("http://example.org/v2/filename.ext?param=value#frag", ext),
        )
        for url, expected in cases:
            self.assertEqual(f(url), expected)

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), "")

    def test_nameext_from_url(self, f=text.nameext_from_url):
        blank = {"filename": "", "extension": ""}
        split = {"filename": "filename", "extension": "ext"}

        # standard usage
        self.assertEqual(f(""), blank)
        urls = (
            "filename.ext",
            "/filename.ExT",
            "example.org/filename.ext",
            "http://example.org/v2/filename.ext",
            "http://example.org/v2/filename.ext?param=value#frag",
        )
        for url in urls:
            self.assertEqual(f(url), split)

        # long "extension"
        fn = "httpswww.example.orgpath-path-path-path-path-path-path-path"
        self.assertEqual(f(fn), {"filename": fn, "extension": ""})

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), blank)

    def test_extract(self, f=text.extract):
        txt = "<a><b>"
        miss = (None, 0)
        cases = (
            ("<", ">", ("a", 3)),
            ("X", ">", miss),
            ("<", "X", miss),
        )
        for begin, end, expected in cases:
            self.assertEqual(f(txt, begin, end), expected)

        # 'pos' argument
        for start in range(1, 10):
            expected = ("b", 6) if start < 4 else (None, start)
            self.assertEqual(f(txt, "<", ">", start), expected)

        # invalid arguments
        for bad in INVALID:
            for args in ((bad, "<", ">"), (txt, bad, ">"), (txt, "<", bad)):
                self.assertEqual(f(*args), miss)

    def test_extr(self, f=text.extr):
        txt = "<a><b>"
        cases = (
            ("X", ">", ""),
            ("<", "X", ""),
            ("<", ">", "a"),
            ("><", ">", "b"),
        )
        for begin, end, expected in cases:
            self.assertEqual(f(txt, begin, end), expected)

        # 'default' argument
        self.assertEqual(f(txt, "<", "X", None), None)
        self.assertEqual(f(txt, "<", "X", default=None), None)
        self.assertEqual(f(txt, "<", "X", default=()), ())

        # invalid arguments
        for bad in INVALID:
            for args in ((bad, "<", ">"), (txt, bad, ">"), (txt, "<", bad)):
                self.assertEqual(f(*args), "")

    def test_rextract(self, f=text.rextract):
        txt = "<a><b>"
        miss = (None, -1)
        cases = (
            ("<", ">", ("b", 3)),
            ("X", ">", miss),
            ("<", "X", miss),
        )
        for begin, end, expected in cases:
            self.assertEqual(f(txt, begin, end), expected)

        # 'pos' argument
        for start in range(10, 0, -1):
            expected = ("b", 3) if start > 3 else ("a", 0)
            self.assertEqual(f(txt, "<", ">", start), expected)

        # invalid arguments
        for bad in INVALID:
            for args in ((bad, "<", ">"), (txt, bad, ">"), (txt, "<", bad)):
                self.assertEqual(f(*args), miss)

    def test_rextr(self, f=text.rextr):
        txt = "<a><b>"
        cases = (
            ("<", ">", "b"),
            ("X", ">", ""),
            ("<", "X", ""),
        )
        for begin, end, expected in cases:
            self.assertEqual(f(txt, begin, end), expected)

        # 'pos' argument
        for start in range(10, 0, -1):
            expected = "b" if start > 3 else "a"
            self.assertEqual(f(txt, "<", ">", start), expected)

        # 'default' argument
        self.assertEqual(f(txt, "[", "]", -1, "none"), "none")
        self.assertEqual(f(txt, "[", "]", None, "none"), "none")
        self.assertEqual(f(txt, "[", "]", default="none"), "none")

        # invalid arguments
        for bad in INVALID:
            for args in ((bad, "<", ">"), (txt, bad, ">"), (txt, "<", bad)):
                self.assertEqual(f(*args), "")

    def test_extract_all(self, f=text.extract_all):
        # Rules are (field name, begin, end) triples; the expected result is
        # a (fields dict, position) pair.
        txt = "[c][b][a]: xyz! [d][e"

        def bracketed(name):
            return (name, "[", "]")

        self.assertEqual(f(txt, ()), ({}, 0))
        self.assertEqual(
            f(txt, (bracketed("C"), bracketed("B"), bracketed("A"))),
            ({"A": "a", "B": "b", "C": "c"}, 9),
        )

        # 'None' as field name
        self.assertEqual(
            f(txt, (bracketed(None), bracketed(None), bracketed("A"))),
            ({"A": "a"}, 9),
        )
        self.assertEqual(
            f(txt, (bracketed(None), bracketed(None), bracketed(None))),
            ({}, 9),
        )

        # failed matches
        self.assertEqual(
            f(txt, (bracketed("C"), ("X", "X", "X"), bracketed("B"))),
            ({"B": "b", "C": "c", "X": None}, 6),
        )

        # 'pos' argument
        self.assertEqual(
            f(txt, (bracketed("B"), bracketed("A")), pos=1),
            ({"A": "a", "B": "b"}, 9),
        )

        # 'values' argument
        self.assertEqual(
            f(txt, (bracketed("C"),), values={"A": "a", "B": "b"}),
            ({"A": "a", "B": "b", "C": "c"}, 3),
        )

        # the dict passed as 'values' must be the one that is returned
        seed = {}
        returned, _ = f(txt, (), values=seed)
        self.assertIs(seed, returned)

    def test_extract_iter(self, f=text.extract_iter):
        txt = "[c][b][a]: xyz! [d][e"

        def collect(*args):
            return list(f(*args))

        cases = (
            (("", "[", "]"), []),
            (("[a]", "[", "]"), ["a"]),
            ((txt, "[", "]"), ["c", "b", "a", "d"]),
            ((txt, "X", "X"), []),
            ((txt, "[", "]", 6), ["a", "d"]),
        )
        for args, expected in cases:
            self.assertEqual(collect(*args), expected)

        # invalid arguments
        for bad in INVALID:
            for args in ((bad, "<", ">"), (txt, bad, ">"), (txt, "<", bad)):
                self.assertEqual(collect(*args), [])

    def test_extract_from(self, f=text.extract_from):
        # The returned callable is expected to continue after the previous
        # match on every call and to yield the default once nothing is left.
        txt = "[c][b][a]: xyz! [d][e"

        extract = f(txt)
        for expected in ("c", "b", "a", "d", "", ""):
            self.assertEqual(extract("[", "]"), expected)

        extract = f(txt, pos=6, default="END")
        for expected in ("a", "d", "END", "END"):
            self.assertEqual(extract("[", "]"), expected)

    def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
        cases = (
            ("", ""),
            ("foobar", "foobar"),
            ("foo’bar", "foo’bar"),
            ("foo\\u2019bar", "foo’bar"),
            ("foo\\u201bar", "foo‛ar"),
            ("foo\\u201zar", "foo\\u201zar"),
            ("\\u2018foo\\u2019\\u2020bar\\u00ff", "‘foo’†barÿ"),
        )
        for escaped, decoded in cases:
            self.assertEqual(f(escaped), decoded)

    def test_parse_bytes(self, f=text.parse_bytes):
        cases = (
            (0, 0),
            (50, 50),
            ("0", 0),
            ("50", 50),
            ("50k", 50 * 1024**1),
            ("50m", 50 * 1024**2),
            ("50g", 50 * 1024**3),
            ("50t", 50 * 1024**4),
            ("50p", 50 * 1024**5),
            (" 50p ", 50 * 1024**5),
            # fractions
            (123.456, 123),
            ("123.456", 123),
            ("123.567", 124),
            (" 123.89 ", 124),
            ("0.5M", round(0.5 * 1024**2)),
        )
        for raw, size in cases:
            self.assertEqual(f(raw), size)

        # invalid arguments
        for bad in INVALID_ALT + ("NaN", "invalid", " 123 kb "):
            self.assertEqual(f(bad), 0)

    def test_parse_int(self, f=text.parse_int):
        for raw, number in ((0, 0), ("0", 0), (123, 123), ("123", 123)):
            self.assertEqual(f(raw), number)

        # invalid arguments
        rejected = INVALID_ALT + ("123.456", "zzz", [1, 2, 3], {1: 2, 3: 4})
        for bad in rejected:
            self.assertEqual(f(bad), 0)

        # 'default' argument
        fallback = "default"
        for bad in INVALID_ALT + ("zzz",):
            self.assertEqual(f(bad, fallback), fallback)

    def test_parse_float(self, f=text.parse_float):
        cases = (
            (0, 0.0),
            ("0", 0.0),
            (123, 123.0),
            ("123", 123.0),
            (123.456, 123.456),
            ("123.456", 123.456),
        )
        for raw, number in cases:
            self.assertEqual(f(raw), number)

        # invalid arguments
        for bad in INVALID_ALT + ("zzz", [1, 2, 3], {1: 2, 3: 4}):
            self.assertEqual(f(bad), 0.0)

        # 'default' argument
        fallback = "default"
        for bad in INVALID_ALT + ("zzz",):
            self.assertEqual(f(bad, fallback), fallback)

    def test_parse_query(self, f=text.parse_query):
        cases = (
            # standard usage
            ("", {}),
            ("foo=1", {"foo": "1"}),
            ("foo=1&bar=2", {"foo": "1", "bar": "2"}),
            # missing value
            ("bar", {}),
            ("bar=", {"bar": ""}),
            ("foo=1&bar", {"foo": "1"}),
            ("foo=1&bar=", {"foo": "1", "bar": ""}),
            ("foo=1&bar&baz=3", {"foo": "1", "baz": "3"}),
            ("foo=1&bar=&baz=3", {"foo": "1", "bar": "", "baz": "3"}),
            # keys with identical names
            ("foo=1&foo=2", {"foo": "1"}),
            ("foo=1&bar=2&foo=3&bar=4", {"foo": "1", "bar": "2"}),
        )
        for query, params in cases:
            self.assertEqual(f(query), params)

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), {})

    def test_parse_query_list(self, f=text.parse_query_list):
        cases = (
            # standard usage
            ("", {}),
            ("foo=1", {"foo": "1"}),
            ("foo=1&bar=2", {"foo": "1", "bar": "2"}),
            ("%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F",
             {"ä&": "あと", "#": "?"}),
            # missing value
            ("bar", {}),
            ("foo=1&bar", {"foo": "1"}),
            ("foo=1&bar&baz=3", {"foo": "1", "baz": "3"}),
        )
        for query, params in cases:
            self.assertEqual(f(query), params)

        # keys with identical names: only the keys named in the second
        # argument are expected to be collected into lists
        self.assertEqual(f("foo=1&foo=2", ("foo",)), {"foo": ["1", "2"]})
        self.assertEqual(
            f("foo=1&bar=2&foo=3&bar=4&foo=5", {"foo", "baz"}),
            {"foo": ["1", "3", "5"], "bar": "2"},
        )

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), {})

    def test_build_query(self, f=text.build_query):
        cases = (
            # standard usage
            ({}, ""),
            ({"foo": "1"}, "foo=1"),
            ({"foo": "1", "bar": "2"}, "foo=1&bar=2"),
            # missing value
            ({"bar": ""}, "bar="),
            ({"foo": "1", "bar": ""}, "foo=1&bar="),
            ({"foo": "1", "bar": "", "baz": "3"}, "foo=1&bar=&baz=3"),
            # characters that need percent-encoding
            ({"ä&": "あと", "#": "?"},
             "%C3%A4%26=%E3%81%82%E3%81%A8&%23=%3F"),
        )
        for params, query in cases:
            self.assertEqual(f(params), query)

    def test_parse_timestamp(self, f=text.parse_timestamp):
        epoch = util.datetime_utcfromtimestamp(0)
        moment = util.datetime_utcfromtimestamp(1555816235)
        cases = (
            (0, epoch),
            ("0", epoch),
            (1555816235, moment),
            ("1555816235", moment),
        )
        for raw, expected in cases:
            self.assertEqual(f(raw), expected)

        # invalid arguments give None, or the supplied default
        for bad in INVALID_ALT:
            self.assertEqual(f(bad), None)
            self.assertEqual(f(bad, "foo"), "foo")

    def test_parse_datetime(self, f=text.parse_datetime):
        # Offsets are expected to be folded into the result, which is compared
        # against naive datetime objects; a string that does not fit the
        # format is expected back unchanged.
        epoch = util.datetime_utcfromtimestamp(0)
        for args in (
            ("1970-01-01T00:00:00+00:00",),
            ("1970-01-01T00:00:00+0000",),
            ("1970.01.01", "%Y.%m.%d"),
        ):
            self.assertEqual(f(*args), epoch)

        moment = datetime.datetime(2019, 5, 7, 12, 25, 2)
        self.assertEqual(f("2019-05-07T21:25:02+09:00"), moment)
        self.assertEqual(f("2019-05-07T21:25:02+0900"), moment)
        self.assertEqual(
            f("2019-05-07T21:25:02.753+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
            moment,
        )
        self.assertEqual(
            f("2019-05-07T21:25:02", "%Y-%m-%dT%H:%M:%S", utcoffset=9),
            moment,
        )
        self.assertEqual(f("2019-05-07 21:25:02"), "2019-05-07 21:25:02")

        # invalid arguments
        for bad in INVALID:
            self.assertEqual(f(bad), None)
        self.assertEqual(f("1970.01.01"), "1970.01.01")
562
563
564
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
566
567