GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/text.py
# -*- coding: utf-8 -*-

# Copyright 2015-2026 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Collection of functions that work on strings/text"""

import html
import urllib.parse
import re as re_module

try:
    re_compile = re_module._compiler.compile
except AttributeError:
    re_compile = re_module.sre_compile.compile

HTML_RE = re_compile(r"<[^>]+>")
PATTERN_CACHE = {}


def re(pattern):
    """Compile a regular expression pattern"""
    try:
        return PATTERN_CACHE[pattern]
    except KeyError:
        p = PATTERN_CACHE[pattern] = re_compile(pattern)
        return p


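# Example usage (illustrative): compiled patterns are cached, so repeated
# calls with the same pattern string return the same pattern object.
#
#   re(r"\d+").findall("a1b22c")  -> ["1", "22"]
#   re(r"\d+") is re(r"\d+")      -> True

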
def remove_html(txt, repl=" ", sep=" "):
    """Remove html-tags from a string"""
    try:
        txt = HTML_RE.sub(repl, txt)
    except TypeError:
        return ""
    if sep:
        return sep.join(txt.split())
    return txt.strip()


def split_html(txt):
    """Split input string by HTML tags"""
    try:
        return [
            unescape(x).strip()
            for x in HTML_RE.split(txt)
            if x and not x.isspace()
        ]
    except TypeError:
        return []


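# Example usage (illustrative):
#
#   remove_html("<p>Hello <b>World</b></p>")  -> "Hello World"
#   split_html("<p>Hello <b>World</b></p>")   -> ["Hello", "World"]
#   remove_html(None)                          -> ""

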
def slugify(value):
    """Convert a string to a URL slug

    Adapted from:
    https://github.com/django/django/blob/master/django/utils/text.py
    """
    value = re(r"[^\w\s-]").sub("", str(value).lower())
    return re(r"[-\s]+").sub("-", value).strip("-_")


def sanitize_whitespace(value):
    """Replace all whitespace characters with a single space"""
    return re(r"\s+").sub(" ", value.strip())


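# Example usage (illustrative):
#
#   slugify("Hello, World!")              -> "hello-world"
#   sanitize_whitespace("  foo\n\t bar ") -> "foo bar"

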
def ensure_http_scheme(url, scheme="https://"):
    """Prepend 'scheme' to 'url' if it doesn't have one"""
    if url and not url.startswith(("https://", "http://")):
        return scheme + url.lstrip("/:")
    return url


def root_from_url(url, scheme="https://"):
    """Extract scheme and domain from a URL"""
    if not url.startswith(("https://", "http://")):
        try:
            return scheme + url[:url.index("/")]
        except ValueError:
            return scheme + url
    try:
        return url[:url.index("/", 8)]
    except ValueError:
        return url


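# Example usage (illustrative):
#
#   ensure_http_scheme("example.org/path")           -> "https://example.org/path"
#   ensure_http_scheme("https://example.org/path")   -> "https://example.org/path"
#   root_from_url("https://example.org/path/to/file") -> "https://example.org"
#   root_from_url("example.org/path")                 -> "https://example.org"

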
def filename_from_url(url):
    """Extract the last part of a URL to use as a filename"""
    try:
        return url.partition("?")[0].rpartition("/")[2]
    except Exception:
        return ""


def ext_from_url(url):
    """Extract the filename extension of a URL"""
    name, _, ext = filename_from_url(url).rpartition(".")
    return ext.lower() if name else ""


def nameext_from_url(url, data=None):
    """Extract the last part of a URL and fill 'data' accordingly"""
    if data is None:
        data = {}

    filename = unquote(filename_from_url(url))
    name, _, ext = filename.rpartition(".")
    if name and len(ext) <= 16:
        data["filename"] = name
        data["extension"] = ext.lower()
    else:
        data["filename"] = filename
        data["extension"] = ""

    return data


def nameext_from_name(filename, data=None):
    """Split 'filename' into name and extension and fill 'data' accordingly"""
    if data is None:
        data = {}

    name, _, ext = filename.rpartition(".")
    if name and len(ext) <= 16:
        data["filename"] = name
        data["extension"] = ext.lower()
    else:
        data["filename"] = filename
        data["extension"] = ""

    return data


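# Example usage (illustrative):
#
#   filename_from_url("https://example.org/path/image.jpg?q=1")
#       -> "image.jpg"
#   ext_from_url("https://example.org/path/image.JPG?q=1")
#       -> "jpg"
#   nameext_from_url("https://example.org/path/image.JPG?q=1")
#       -> {"filename": "image", "extension": "jpg"}
#   nameext_from_name("archive.tar.gz")
#       -> {"filename": "archive.tar", "extension": "gz"}

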
def extract(txt, begin, end, pos=None):
    """Extract the text between 'begin' and 'end' from 'txt'

    Args:
        txt: String to search in
        begin: First string to be searched for
        end: Second string to be searched for after 'begin'
        pos: Starting position for searches in 'txt'

    Returns:
        The string between the two search-strings 'begin' and 'end' beginning
        with position 'pos' in 'txt' as well as the position after 'end'.

        If at least one of 'begin' or 'end' is not found, None and the
        original value of 'pos' are returned

    Examples:
        extract("abcde", "b", "d")    -> "c" , 4
        extract("abcde", "b", "d", 3) -> None, 3
    """
    try:
        first = txt.index(begin, pos) + len(begin)
        last = txt.index(end, first)
        return txt[first:last], last+len(end)
    except Exception:
        return None, 0 if pos is None else pos


def extr(txt, begin, end, default=""):
    """Stripped-down version of 'extract()'"""
    try:
        first = txt.index(begin) + len(begin)
        return txt[first:txt.index(end, first)]
    except Exception:
        return default


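# Example usage (illustrative):
#
#   extract("<p>foo</p>", "<p>", "</p>")       -> ("foo", 10)
#   extr("<title>Page</title>", "<title>", "</title>")  -> "Page"

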
def rextract(txt, begin, end, pos=None):
    """Like extract(), but search for the last 'begin' before 'pos'

    Returns the text between 'begin' and 'end' as well as the position
    of 'begin'; (None, -1) or (None, 'pos') if nothing is found.
    """
    try:
        lbeg = len(begin)
        first = txt.rindex(begin, None, pos)
        last = txt.index(end, first + lbeg)
        return txt[first + lbeg:last], first
    except Exception:
        return None, -1 if pos is None else pos


def rextr(txt, begin, end, pos=None, default=""):
    """Stripped-down version of 'rextract()'"""
    try:
        first = txt.rindex(begin, None, pos) + len(begin)
        return txt[first:txt.index(end, first)]
    except Exception:
        return default


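# Example usage (illustrative): both search backwards for 'begin'.
#
#   rextract("<i>a</i> <i>b</i>", "<i>", "</i>")  -> ("b", 9)
#   rextr("<i>a</i> <i>b</i>", "<i>", "</i>")     -> "b"

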
def extract_all(txt, rules, pos=None, values=None):
    """Calls extract for each rule and returns the result in a dict"""
    if values is None:
        values = {}
    for key, begin, end in rules:
        result, pos = extract(txt, begin, end, pos)
        if key:
            values[key] = result
    return values, 0 if pos is None else pos


def extract_iter(txt, begin, end, pos=None):
    """Yield values that would be returned by repeated calls of extract()"""
    try:
        index = txt.index
        lbeg = len(begin)
        lend = len(end)
        while True:
            first = index(begin, pos) + lbeg
            last = index(end, first)
            pos = last + lend
            yield txt[first:last]
    except Exception:
        return


def extract_from(txt, pos=None, default=""):
    """Returns a function object that extracts from 'txt'"""
    def extr(begin, end, index=txt.index, txt=txt):
        nonlocal pos
        try:
            first = index(begin, pos) + len(begin)
            last = index(end, first)
            pos = last + len(end)
            return txt[first:last]
        except Exception:
            return default
    return extr


extract_urls = re(r"https?://[^\s\"'<>\\]+").findall


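# Example usage (illustrative):
#
#   list(extract_iter("<li>a</li><li>b</li>", "<li>", "</li>"))
#       -> ["a", "b"]
#
#   e = extract_from("<h1>Title</h1><p>Body</p>")
#   e("<h1>", "</h1>")  -> "Title"
#   e("<p>", "</p>")    -> "Body"
#
#   extract_urls("see https://example.org/a and http://example.org/b.html")
#       -> ["https://example.org/a", "http://example.org/b.html"]

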
def parse_unicode_escapes(txt):
    """Convert JSON Unicode escapes in 'txt' into actual characters"""
    if "\\u" in txt:
        return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
    return txt


def _hex_to_char(match):
    return chr(int(match[1], 16))


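# Example usage (illustrative):
#
#   parse_unicode_escapes(r"caf\u00e9 \u2013 bar")  -> "café – bar"

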
def parse_bytes(value, default=0, suffixes="bkmgtp"):
    """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
    if not value:
        return default

    value = str(value).strip()
    last = value[-1].lower()

    if last in suffixes:
        mul = 1024 ** suffixes.index(last)
        value = value[:-1]
    else:
        mul = 1

    try:
        return round(float(value) * mul)
    except ValueError:
        return default


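# Example usage (illustrative): suffixes map to powers of 1024.
#
#   parse_bytes("500k")                  -> 512000
#   parse_bytes("2.5M")                  -> 2621440
#   parse_bytes("invalid", default=-1)   -> -1

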
def parse_int(value, default=0):
    """Convert 'value' to int"""
    if not value:
        return default
    try:
        return int(value)
    except Exception:
        return default


def parse_float(value, default=0.0):
    """Convert 'value' to float"""
    if not value:
        return default
    try:
        return float(value)
    except Exception:
        return default


def parse_query(qs, empty=False):
    """Parse a query string into name-value pairs

    Ignore values whose name has been seen before
    """
    if not qs:
        return {}

    result = {}
    try:
        for name_value in qs.split("&"):
            name, eq, value = name_value.partition("=")
            if eq or empty:
                name = unquote(name.replace("+", " "))
                if name not in result:
                    result[name] = unquote(value.replace("+", " "))
    except Exception:
        pass
    return result


def parse_query_list(qs, as_list=()):
    """Parse a query string into name-value pairs

    Combine values of names in 'as_list' into lists
    """
    if not qs:
        return {}

    result = {}
    try:
        for name_value in qs.split("&"):
            name, eq, value = name_value.partition("=")
            if eq:
                name = unquote(name.replace("+", " "))
                value = unquote(value.replace("+", " "))
                if name in as_list:
                    if name in result:
                        result[name].append(value)
                    else:
                        result[name] = [value]
                elif name not in result:
                    result[name] = value
    except Exception:
        pass
    return result


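# Example usage (illustrative):
#
#   parse_query("a=1&b=2&a=3")
#       -> {"a": "1", "b": "2"}
#   parse_query_list("t=x&t=y&id=1", as_list=("t",))
#       -> {"t": ["x", "y"], "id": "1"}

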
def build_query(params):
    """Build a query string from a dict of name-value pairs"""
    return "&".join([
        f"{quote(name)}={quote(value)}"
        for name, value in params.items()
    ])


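# Example usage (illustrative): names and values are percent-encoded
# with quote().
#
#   build_query({"q": "hello world", "page": "2"})
#       -> "q=hello%20world&page=2"

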
urljoin = urllib.parse.urljoin

quote = urllib.parse.quote
unquote = urllib.parse.unquote

escape = html.escape
unescape = html.unescape