Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/text.py
5457 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2015-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Collection of functions that work on strings/text"""
10
11
import sys
12
import html
13
import time
14
import datetime
15
import urllib.parse
16
import re as re_module
17
18
# Bind the low-level sre pattern compiler directly, bypassing the public
# re.compile() wrapper (and its cache) for speed.
try:
    # Python 3.11+: the compiler lives in the private re._compiler module
    re_compile = re_module._compiler.compile
except AttributeError:
    # older versions expose it as re.sre_compile
    re_compile = re_module.sre_compile.compile
22
23
HTML_RE = re_compile(r"<[^>]+>")
24
PATTERN_CACHE = {}
25
26
27
def re(pattern):
    """Compile a regular expression pattern"""
    compiled = PATTERN_CACHE.get(pattern)
    if compiled is None:
        # first use of this pattern: compile and memoize it
        compiled = PATTERN_CACHE[pattern] = re_compile(pattern)
    return compiled
34
35
36
def remove_html(txt, repl=" ", sep=" "):
    """Remove html-tags from a string"""
    try:
        stripped = HTML_RE.sub(repl, txt)
    except TypeError:
        # non-string input (e.g. None)
        return ""
    if not sep:
        return stripped.strip()
    # collapse whitespace runs and rejoin with 'sep'
    return sep.join(stripped.split())
45
46
47
def split_html(txt):
    """Split input string by HTML tags"""
    try:
        pieces = []
        for fragment in HTML_RE.split(txt):
            # skip empty and whitespace-only fragments
            if fragment and not fragment.isspace():
                pieces.append(unescape(fragment).strip())
        return pieces
    except TypeError:
        # non-string input (e.g. None)
        return []
57
58
59
def slugify(value):
    """Convert a string to a URL slug

    Adapted from:
    https://github.com/django/django/blob/master/django/utils/text.py
    """
    lowered = str(value).lower()
    # drop everything except word characters, whitespace, and hyphens
    cleaned = re(r"[^\w\s-]").sub("", lowered)
    # collapse whitespace/hyphen runs into single hyphens
    return re(r"[-\s]+").sub("-", cleaned).strip("-_")
67
68
69
def sanitize_whitespace(value):
    """Replace all whitespace characters with a single space"""
    trimmed = value.strip()
    return re(r"\s+").sub(" ", trimmed)
72
73
74
def ensure_http_scheme(url, scheme="https://"):
    """Prepend 'scheme' to 'url' if it doesn't have one"""
    if not url or url.startswith(("https://", "http://")):
        return url
    # strip leading slashes/colons (e.g. "//host" or "://host")
    return scheme + url.lstrip("/:")
79
80
81
def root_from_url(url, scheme="https://"):
    """Extract scheme and domain from a URL"""
    if url.startswith(("https://", "http://")):
        # position 8 skips past "https://" / lands inside "http://host"
        slash = url.find("/", 8)
        return url if slash < 0 else url[:slash]
    # no scheme present: prepend one and keep only the domain part
    slash = url.find("/")
    return scheme + (url if slash < 0 else url[:slash])
92
93
94
def filename_from_url(url):
    """Extract the last part of an URL to use as a filename"""
    try:
        # drop the query string, then keep everything after the last "/"
        path = url.partition("?")[0]
        return path[path.rfind("/") + 1:]
    except Exception:
        # non-string input (e.g. None)
        return ""
100
101
102
def ext_from_url(url):
    """Extract the filename extension of an URL"""
    stem, _, extension = filename_from_url(url).rpartition(".")
    if not stem:
        # no dot at all, or a dotfile like ".bashrc" -> no extension
        return ""
    return extension.lower()
106
107
108
def nameext_from_url(url, data=None):
    """Extract the last part of an URL and fill 'data' accordingly"""
    if data is None:
        data = {}

    filename = unquote(filename_from_url(url))
    name, _, ext = filename.rpartition(".")
    if not name or len(ext) > 16:
        # no usable extension: keep the whole filename as-is
        data["filename"] = filename
        data["extension"] = ""
    else:
        data["filename"] = name
        data["extension"] = ext.lower()

    return data
121
122
123
def extract(txt, begin, end, pos=None):
    """Extract the text between 'begin' and 'end' from 'txt'

    Args:
        txt: String to search in
        begin: First string to be searched for
        end: Second string to be searched for after 'begin'
        pos: Starting position for searches in 'txt'

    Returns:
        A 2-tuple of the substring found between 'begin' and 'end'
        (searching from position 'pos') and the position just past 'end'.

        If either search string is missing, returns None together with
        the original 'pos' (or 0 when 'pos' was None).

    Examples:
        extract("abcde", "b", "d")    -> "c" , 4
        extract("abcde", "b", "d", 3) -> None, 3
    """
    try:
        start = txt.index(begin, pos) + len(begin)
        stop = txt.index(end, start)
    except Exception:
        # search string not found, or 'txt' is not a string
        return None, 0 if pos is None else pos
    return txt[start:stop], stop + len(end)
149
150
151
def extr(txt, begin, end, default=""):
    """Stripped-down version of 'extract()'"""
    try:
        start = txt.index(begin) + len(begin)
        stop = txt.index(end, start)
    except Exception:
        return default
    return txt[start:stop]
158
159
160
def rextract(txt, begin, end, pos=None):
    """Like extract(), but search for the last 'begin' before 'pos'

    Returns the substring and the index of 'begin',
    or (None, -1) / (None, pos) on failure.
    """
    try:
        start = txt.rindex(begin, None, pos) + len(begin)
        stop = txt.index(end, start)
        return txt[start:stop], start - len(begin)
    except Exception:
        return None, -1 if pos is None else pos
168
169
170
def rextr(txt, begin, end, pos=None, default=""):
    """Stripped-down version of 'rextract()'"""
    try:
        start = txt.rindex(begin, None, pos) + len(begin)
        stop = txt.index(end, start)
    except Exception:
        return default
    return txt[start:stop]
177
178
179
def extract_all(txt, rules, pos=None, values=None):
    """Calls extract for each rule and returns the result in a dict"""
    if values is None:
        values = {}
    for key, begin, end in rules:
        # each extraction continues where the previous one stopped
        found, pos = extract(txt, begin, end, pos)
        if key:
            values[key] = found
    return values, 0 if pos is None else pos
188
189
190
def extract_iter(txt, begin, end, pos=None):
    """Yield values that would be returned by repeated calls of extract()"""
    try:
        # hoist lookups out of the loop; a failed index() ends iteration
        index = txt.index
        lbeg, lend = len(begin), len(end)
        while True:
            start = index(begin, pos) + lbeg
            stop = index(end, start)
            pos = stop + lend
            yield txt[start:stop]
    except Exception:
        return
203
204
205
def extract_from(txt, pos=None, default=""):
    """Returns a function object that extracts from 'txt'"""
    def extr(begin, end, index=txt.index, txt=txt):
        # 'pos' persists between calls, so extractions advance through 'txt'
        nonlocal pos
        try:
            start = index(begin, pos) + len(begin)
            stop = index(end, start)
        except Exception:
            # leave 'pos' untouched on failure
            return default
        pos = stop + len(end)
        return txt[start:stop]
    return extr
217
218
219
def parse_unicode_escapes(txt):
    """Convert JSON Unicode escapes in 'txt' into actual characters"""
    if "\\u" not in txt:
        # fast path: nothing to substitute
        return txt
    return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
224
225
226
def _hex_to_char(match):
    # interpret the captured 4-digit hex code as a Unicode codepoint
    return chr(int(match.group(1), 16))
228
229
230
def parse_bytes(value, default=0, suffixes="bkmgtp"):
    """Convert a bytes-amount ("500k", "2.5M", ...) to int

    Args:
        value: Amount as str or number, optionally ending in one of the
            characters in 'suffixes'
        default: Result for empty or unparsable input
        suffixes: Recognized suffix characters in ascending order of
            magnitude; index N means a multiplier of 1024**N

    Returns:
        The amount in bytes as int, or 'default' on failure
    """
    if not value:
        return default

    value = str(value).strip()
    if not value:
        # whitespace-only input; 'value[-1]' below would raise IndexError
        return default
    last = value[-1].lower()

    if last in suffixes:
        mul = 1024 ** suffixes.index(last)
        value = value[:-1]
    else:
        mul = 1

    try:
        return round(float(value) * mul)
    except ValueError:
        return default
248
249
250
def parse_int(value, default=0):
    """Convert 'value' to int"""
    if value:
        try:
            return int(value)
        except Exception:
            pass
    return default
258
259
260
def parse_float(value, default=0.0):
    """Convert 'value' to float"""
    if value:
        try:
            return float(value)
        except Exception:
            pass
    return default
268
269
270
def parse_query(qs):
    """Parse a query string into name-value pairs

    Ignore values whose name has been seen before
    """
    result = {}
    if qs:
        try:
            for field in qs.split("&"):
                name, eq, value = field.partition("=")
                if not eq:
                    # skip fields without "=" entirely
                    continue
                name = unquote(name.replace("+", " "))
                if name not in result:
                    result[name] = unquote(value.replace("+", " "))
        except Exception:
            # best-effort: return whatever was parsed so far
            pass
    return result
289
290
291
def parse_query_list(qs, as_list=()):
    """Parse a query string into name-value pairs

    Combine values of names in 'as_list' into lists
    """
    result = {}
    if not qs:
        return result
    try:
        for field in qs.split("&"):
            name, eq, value = field.partition("=")
            if not eq:
                # skip fields without "="
                continue
            name = unquote(name.replace("+", " "))
            value = unquote(value.replace("+", " "))
            if name in as_list:
                # collect every occurrence into a list
                result.setdefault(name, []).append(value)
            elif name not in result:
                # otherwise keep only the first occurrence
                result[name] = value
    except Exception:
        # best-effort: return whatever was parsed so far
        pass
    return result
316
317
318
def build_query(params):
    """Build a percent-encoded query string from a mapping"""
    pairs = [f"{quote(name)}={quote(value)}"
             for name, value in params.items()]
    return "&".join(pairs)
323
324
325
if sys.hexversion < 0x30c0000:
    # Python <= 3.11
    def parse_timestamp(ts, default=None):
        """Create a datetime object from a Unix timestamp"""
        try:
            return datetime.datetime.utcfromtimestamp(int(ts))
        except Exception:
            return default
else:
    # Python >= 3.12: utcfromtimestamp() is deprecated,
    # so build the naive UTC datetime from time.gmtime() instead
    def parse_timestamp(ts, default=None):
        """Create a datetime object from a Unix timestamp"""
        try:
            return datetime.datetime(*time.gmtime(int(ts))[:6])
        except Exception:
            return default
342
343
344
def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
    """Create a datetime object by parsing 'date_string'

    Returns a naive UTC datetime on success, None for type/lookup
    errors, or the unmodified 'date_string' when it cannot be parsed.
    """
    try:
        parsed = datetime.datetime.strptime(date_string, format)
        offset = parsed.utcoffset()
        if offset is None:
            if parsed.microsecond:
                parsed = parsed.replace(microsecond=0)
            if utcoffset:
                # apply manual UTC offset (hours)
                parsed += datetime.timedelta(0, utcoffset * -3600)
        else:
            # timezone-aware: normalize to naive UTC
            parsed = parsed.replace(tzinfo=None, microsecond=0) - offset
        return parsed
    except (TypeError, IndexError, KeyError):
        return None
    except (ValueError, OverflowError):
        # deliberately return the input unchanged on parse failure
        return date_string
363
364
365
urljoin = urllib.parse.urljoin
366
367
quote = urllib.parse.quote
368
unquote = urllib.parse.unquote
369
370
escape = html.escape
371
unescape = html.unescape
372
373