Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
mikf
GitHub Repository: mikf/gallery-dl
Path: blob/master/gallery_dl/text.py
5457 views
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2015-2025 Mike Fährmann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License version 2 as
7
# published by the Free Software Foundation.
8
9
"""Collection of functions that work on strings/text"""
10
11
import sys
12
import html
13
import time
14
import datetime
15
import urllib.parse
16
import re as re_module
17
18
# Bind the low-level sre pattern compiler directly, bypassing the public
# re.compile() wrapper (and its cache) for speed.
try:
    # Python 3.11+: the compiler lives in the private re._compiler module
    re_compile = re_module._compiler.compile
except AttributeError:
    # older versions expose it as re.sre_compile
    re_compile = re_module.sre_compile.compile
22
23
HTML_RE = re_compile(r"<[^>]+>")
24
PATTERN_CACHE = {}
25
26
27
def re(pattern):
    """Compile a regular expression pattern"""
    compiled = PATTERN_CACHE.get(pattern)
    if compiled is None:
        # first use of this pattern: compile and memoize it
        compiled = PATTERN_CACHE[pattern] = re_compile(pattern)
    return compiled
34
35
36
def remove_html(txt, repl=" ", sep=" "):
    """Remove html-tags from a string"""
    try:
        stripped = HTML_RE.sub(repl, txt)
    except TypeError:
        # non-string input (e.g. None)
        return ""
    if not sep:
        return stripped.strip()
    # collapse whitespace runs and rejoin with 'sep'
    return sep.join(stripped.split())
45
46
47
def split_html(txt):
    """Split input string by HTML tags"""
    try:
        pieces = []
        for fragment in HTML_RE.split(txt):
            # skip empty and whitespace-only fragments
            if fragment and not fragment.isspace():
                pieces.append(unescape(fragment).strip())
        return pieces
    except TypeError:
        # non-string input (e.g. None)
        return []
57
58
59
def slugify(value):
    """Convert a string to a URL slug

    Adapted from:
    https://github.com/django/django/blob/master/django/utils/text.py
    """
    lowered = str(value).lower()
    # drop everything except word characters, whitespace, and hyphens
    cleaned = re(r"[^\w\s-]").sub("", lowered)
    # collapse whitespace/hyphen runs into single hyphens
    return re(r"[-\s]+").sub("-", cleaned).strip("-_")
67
68
69
def sanitize_whitespace(value):
    """Replace all whitespace characters with a single space"""
    trimmed = value.strip()
    return re(r"\s+").sub(" ", trimmed)
72
73
74
def ensure_http_scheme(url, scheme="https://"):
    """Prepend 'scheme' to 'url' if it doesn't have one"""
    if not url or url.startswith(("https://", "http://")):
        return url
    # strip leading slashes/colons (e.g. "//host" or "://host")
    return scheme + url.lstrip("/:")
79
80
81
def root_from_url(url, scheme="https://"):
    """Extract scheme and domain from a URL"""
    if url.startswith(("https://", "http://")):
        # position 8 skips past "https://" / lands inside "http://host"
        slash = url.find("/", 8)
        return url if slash < 0 else url[:slash]
    # no scheme present: prepend one and keep only the domain part
    slash = url.find("/")
    return scheme + (url if slash < 0 else url[:slash])
92
93
94
def filename_from_url(url):
    """Extract the last part of an URL to use as a filename"""
    try:
        # drop the query string, then keep everything after the last "/"
        path = url.partition("?")[0]
        return path[path.rfind("/") + 1:]
    except Exception:
        # non-string input (e.g. None)
        return ""
100
101
102
def ext_from_url(url):
    """Extract the filename extension of an URL"""
    stem, _, extension = filename_from_url(url).rpartition(".")
    if not stem:
        # no dot at all, or a dotfile like ".bashrc" -> no extension
        return ""
    return extension.lower()
106
107
108
def nameext_from_url(url, data=None):
    """Extract the last part of an URL and fill 'data' accordingly"""
    if data is None:
        data = {}

    filename = unquote(filename_from_url(url))
    name, _, ext = filename.rpartition(".")
    if not name or len(ext) > 16:
        # no usable extension: keep the whole filename as-is
        data["filename"] = filename
        data["extension"] = ""
    else:
        data["filename"] = name
        data["extension"] = ext.lower()

    return data
121
122
123
def extract(txt, begin, end, pos=None):
    """Extract the text between 'begin' and 'end' from 'txt'

    Args:
        txt: String to search in
        begin: First string to be searched for
        end: Second string to be searched for after 'begin'
        pos: Starting position for searches in 'txt'

    Returns:
        A 2-tuple of the substring found between 'begin' and 'end'
        (searching from position 'pos') and the position just past 'end'.

        If either search string is missing, returns None together with
        the original 'pos' (or 0 when 'pos' was None).

    Examples:
        extract("abcde", "b", "d")    -> "c" , 4
        extract("abcde", "b", "d", 3) -> None, 3
    """
    try:
        start = txt.index(begin, pos) + len(begin)
        stop = txt.index(end, start)
    except Exception:
        # search string not found, or 'txt' is not a string
        return None, 0 if pos is None else pos
    return txt[start:stop], stop + len(end)
149
150
151
def extr(txt, begin, end, default=""):
    """Stripped-down version of 'extract()'"""
    try:
        start = txt.index(begin) + len(begin)
        stop = txt.index(end, start)
    except Exception:
        return default
    return txt[start:stop]
158
159
160
def rextract(txt, begin, end, pos=None):
    """Like extract(), but search for the last 'begin' before 'pos'

    Returns the substring and the index of 'begin',
    or (None, -1) / (None, pos) on failure.
    """
    try:
        start = txt.rindex(begin, None, pos) + len(begin)
        stop = txt.index(end, start)
        return txt[start:stop], start - len(begin)
    except Exception:
        return None, -1 if pos is None else pos
168
169
170
def rextr(txt, begin, end, pos=None, default=""):
    """Stripped-down version of 'rextract()'"""
    try:
        start = txt.rindex(begin, None, pos) + len(begin)
        stop = txt.index(end, start)
    except Exception:
        return default
    return txt[start:stop]
177
178
179
def extract_all(txt, rules, pos=None, values=None):
    """Calls extract for each rule and returns the result in a dict"""
    if values is None:
        values = {}
    for key, begin, end in rules:
        # each extraction continues where the previous one stopped
        found, pos = extract(txt, begin, end, pos)
        if key:
            values[key] = found
    return values, 0 if pos is None else pos
188
189
190
def extract_iter(txt, begin, end, pos=None):
    """Yield values that would be returned by repeated calls of extract()"""
    try:
        # hoist lookups out of the loop; a failed index() ends iteration
        index = txt.index
        lbeg, lend = len(begin), len(end)
        while True:
            start = index(begin, pos) + lbeg
            stop = index(end, start)
            pos = stop + lend
            yield txt[start:stop]
    except Exception:
        return
203
204
205
def extract_from(txt, pos=None, default=""):
    """Returns a function object that extracts from 'txt'"""
    def extr(begin, end, index=txt.index, txt=txt):
        # 'pos' persists between calls, so extractions advance through 'txt'
        nonlocal pos
        try:
            start = index(begin, pos) + len(begin)
            stop = index(end, start)
        except Exception:
            # leave 'pos' untouched on failure
            return default
        pos = stop + len(end)
        return txt[start:stop]
    return extr
217
218
219
def parse_unicode_escapes(txt):
    """Convert JSON Unicode escapes in 'txt' into actual characters"""
    if "\\u" not in txt:
        # fast path: nothing to substitute
        return txt
    return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
224
225
226
def _hex_to_char(match):
    # interpret the captured 4-digit hex code as a Unicode codepoint
    return chr(int(match.group(1), 16))
228
229
230
def parse_bytes(value, default=0, suffixes="bkmgtp"):
    """Convert a bytes-amount ("500k", "2.5M", ...) to int

    Args:
        value: Amount as str or number, optionally ending in one of the
            characters in 'suffixes'
        default: Result for empty or unparsable input
        suffixes: Recognized suffix characters in ascending order of
            magnitude; index N means a multiplier of 1024**N

    Returns:
        The amount in bytes as int, or 'default' on failure
    """
    if not value:
        return default

    value = str(value).strip()
    if not value:
        # whitespace-only input; 'value[-1]' below would raise IndexError
        return default
    last = value[-1].lower()

    if last in suffixes:
        mul = 1024 ** suffixes.index(last)
        value = value[:-1]
    else:
        mul = 1

    try:
        return round(float(value) * mul)
    except ValueError:
        return default
248
249
250
def parse_int(value, default=0):
    """Convert 'value' to int"""
    if value:
        try:
            return int(value)
        except Exception:
            pass
    return default
258
259
260
def parse_float(value, default=0.0):
    """Convert 'value' to float"""
    if value:
        try:
            return float(value)
        except Exception:
            pass
    return default
268
269
270
def parse_query(qs):
    """Parse a query string into name-value pairs

    Ignore values whose name has been seen before
    """
    result = {}
    if qs:
        try:
            for field in qs.split("&"):
                name, eq, value = field.partition("=")
                if not eq:
                    # skip fields without "=" entirely
                    continue
                name = unquote(name.replace("+", " "))
                if name not in result:
                    result[name] = unquote(value.replace("+", " "))
        except Exception:
            # best-effort: return whatever was parsed so far
            pass
    return result
289
290
291
def parse_query_list(qs, as_list=()):
    """Parse a query string into name-value pairs

    Combine values of names in 'as_list' into lists
    """
    result = {}
    if not qs:
        return result
    try:
        for field in qs.split("&"):
            name, eq, value = field.partition("=")
            if not eq:
                # skip fields without "="
                continue
            name = unquote(name.replace("+", " "))
            value = unquote(value.replace("+", " "))
            if name in as_list:
                # collect every occurrence into a list
                result.setdefault(name, []).append(value)
            elif name not in result:
                # otherwise keep only the first occurrence
                result[name] = value
    except Exception:
        # best-effort: return whatever was parsed so far
        pass
    return result
316
317
318
def build_query(params):
    """Build a percent-encoded query string from a mapping"""
    pairs = [f"{quote(name)}={quote(value)}"
             for name, value in params.items()]
    return "&".join(pairs)
323
324
325
if sys.hexversion < 0x30c0000:
    # Python <= 3.11
    def parse_timestamp(ts, default=None):
        """Create a datetime object from a Unix timestamp"""
        try:
            return datetime.datetime.utcfromtimestamp(int(ts))
        except Exception:
            return default
else:
    # Python >= 3.12: utcfromtimestamp() is deprecated,
    # so build the naive UTC datetime from time.gmtime() instead
    def parse_timestamp(ts, default=None):
        """Create a datetime object from a Unix timestamp"""
        try:
            return datetime.datetime(*time.gmtime(int(ts))[:6])
        except Exception:
            return default
342
343
344
def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
    """Create a datetime object by parsing 'date_string'

    Returns a naive UTC datetime on success, None for type/lookup
    errors, or the unmodified 'date_string' when it cannot be parsed.
    """
    try:
        parsed = datetime.datetime.strptime(date_string, format)
        offset = parsed.utcoffset()
        if offset is None:
            if parsed.microsecond:
                parsed = parsed.replace(microsecond=0)
            if utcoffset:
                # apply manual UTC offset (hours)
                parsed += datetime.timedelta(0, utcoffset * -3600)
        else:
            # timezone-aware: normalize to naive UTC
            parsed = parsed.replace(tzinfo=None, microsecond=0) - offset
        return parsed
    except (TypeError, IndexError, KeyError):
        return None
    except (ValueError, OverflowError):
        # deliberately return the input unchanged on parse failure
        return date_string
363
364
365
urljoin = urllib.parse.urljoin
366
367
quote = urllib.parse.quote
368
unquote = urllib.parse.unquote
369
370
escape = html.escape
371
unescape = html.unescape
372
373