Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/soupsieve/css_match.py
811 views
1
"""CSS matcher."""
2
from datetime import datetime
3
from . import util
4
import re
5
from .import css_types as ct
6
import unicodedata
7
8
# Finds a single non-whitespace character; used to decide whether text makes
# an element non-empty (whitespace alone is okay).
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Finds runs of non-whitespace characters; used to split a class string.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationship (combinator) markers that look backwards from an element.
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Forward looking relationship markers used by `:has()`.
# Each one is the backwards marker prefixed with `:`.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs that receive special treatment.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined selector flag masks.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercased `dir` attribute value to a direction flag (`auto` maps to 0).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for the textual values of typed inputs.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Collapses wildcard subtags in a language range down to a single `-`
# before the range is split and compared.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar facts used when validating date-like input values.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
54
55
56
class _FakeParent(object):
    """
    Stand-in parent for an element that has none.

    A fragment with no `BeautifulSoup` document object has a root element
    without a parent, so `nth` selectors can't be evaluated properly.
    Wrapping the root in this temporary parent lets it be traversed as a child.
    """

    def __init__(self, element):
        """Hold `element` as the one and only child."""

        self.contents = [element]

    def __len__(self):
        """Return how many children are held."""

        return len(self.contents)
74
75
76
class _DocumentNav(object):
    """Helpers for inspecting and walking the nodes of a Beautiful Soup document."""

    @classmethod
    def assert_valid_input(cls, tag):
        """Raise `TypeError` unless the input is a valid tag or document."""

        if cls.is_tag(tag):
            return
        # Fail on unexpected types.
        raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag)))

    @staticmethod
    def is_doc(obj):
        """Return whether `obj` is a `BeautifulSoup` object."""

        import bs4
        doc_type = bs4.BeautifulSoup
        return isinstance(obj, doc_type)

    @staticmethod
    def is_tag(obj):
        """Return whether `obj` is a tag."""

        import bs4
        tag_type = bs4.Tag
        return isinstance(obj, tag_type)

    @staticmethod
    def is_declaration(obj):  # pragma: no cover
        """Return whether `obj` is a declaration."""

        import bs4
        declaration_type = bs4.Declaration
        return isinstance(obj, declaration_type)

    @staticmethod
    def is_cdata(obj):
        """Return whether `obj` is CDATA."""

        import bs4
        cdata_type = bs4.CData
        return isinstance(obj, cdata_type)

    @staticmethod
    def is_processing_instruction(obj):  # pragma: no cover
        """Return whether `obj` is a processing instruction."""

        import bs4
        pi_type = bs4.ProcessingInstruction
        return isinstance(obj, pi_type)

    @staticmethod
    def is_navigable_string(obj):
        """Return whether `obj` is a navigable string."""

        import bs4
        string_type = bs4.NavigableString
        return isinstance(obj, string_type)

    @staticmethod
    def is_special_string(obj):
        """Return whether `obj` is a comment, declaration, CDATA, processing instruction or doctype."""

        import bs4
        special_types = (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)
        return isinstance(obj, special_types)

    @classmethod
    def is_content_string(cls, obj):
        """Return whether the node is a navigable string that is not a special string."""

        if not cls.is_navigable_string(obj):
            return False
        return not cls.is_special_string(obj)

    @staticmethod
    def create_fake_parent(el):
        """Wrap the given element in a fake parent."""

        return _FakeParent(el)

    @staticmethod
    def is_xml_tree(el):
        """Return the `_is_xml` marker of the element (or document)."""

        return el._is_xml

    def is_iframe(self, el):
        """Check if element is an `iframe` in the HTML namespace."""

        # Names are compared case sensitively only for XML trees.
        name = el.name if self.is_xml_tree(el) else util.lower(el.name)
        return (name == 'iframe') and self.is_html_tag(el)

    def is_root(self, el):
        """
        Return whether element is a root element.

        The element is a root when it is the pre-calculated root of the tree,
        or when its parent is an `iframe` in an HTML document.
        """

        if self.root and self.root is el:
            return True
        parent = self.get_parent(el)
        return parent is not None and self.is_html and self.is_iframe(parent)

    def get_contents(self, el, no_iframe=False):
        """Yield the direct contents of `el`; nothing is yielded for an `iframe` when `no_iframe` is set."""

        if no_iframe and self.is_iframe(el):
            return
        yield from el.contents

    def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
        """
        Yield the children of `el`.

        `start` is the index in `el.contents` to begin at (default: the first
        child, or the last when `reverse` is set).  Only tags are yielded unless
        `tags` is false.  Nothing is yielded for an `iframe` when `no_iframe` is set.
        """

        if no_iframe and self.is_iframe(el):
            return

        last = len(el.contents) - 1
        step = -1 if reverse else 1
        stop = -1 if reverse else last + 1
        if start is not None:
            index = start
        else:
            index = last if reverse else 0

        # An out of range starting point yields nothing.
        if not 0 <= index <= last:
            return

        while index != stop:
            node = el.contents[index]
            index += step
            if not tags or self.is_tag(node):
                yield node

    def get_descendants(self, el, tags=True, no_iframe=False):
        """
        Yield the descendants of `el` in document order.

        Only tags are yielded unless `tags` is false.  When `no_iframe` is set,
        an `iframe` descendant is still yielded, but its content is skipped.
        """

        if no_iframe and self.is_iframe(el):
            return

        resume_at = None
        for node in el.descendants:

            # While skipping `iframe` content, ignore nodes until the resume point.
            if resume_at is not None:
                if node is not resume_at:
                    continue
                resume_at = None

            node_is_tag = self.is_tag(node)

            if no_iframe and node_is_tag and self.is_iframe(node):
                if node.next_sibling is not None:
                    resume_at = node.next_sibling
                else:
                    # No sibling: resume after the deepest, last node in the `iframe`.
                    deepest = node
                    while self.is_tag(deepest) and deepest.contents:
                        deepest = deepest.contents[-1]
                    resume_at = deepest.next_element
                yield node
                if resume_at is None:
                    break
                continue

            if not tags or node_is_tag:
                yield node

    def get_parent(self, el, no_iframe=False):
        """Return the parent of `el`, or `None` when `no_iframe` is set and the parent is an `iframe`."""

        parent = el.parent
        if no_iframe and parent is not None and self.is_iframe(parent):
            return None
        return parent

    @staticmethod
    def get_tag_name(el):
        """Return the element's `name`."""

        return el.name

    @staticmethod
    def get_prefix_name(el):
        """Return the element's `prefix`."""

        return el.prefix

    @staticmethod
    def get_uri(el):
        """Return the element's namespace `URI`."""

        return el.namespace

    @classmethod
    def get_next(cls, el, tags=True):
        """Return the next sibling; non-tag siblings are skipped when `tags` is set."""

        sibling = el.next_sibling
        if tags:
            while sibling is not None and not cls.is_tag(sibling):
                sibling = sibling.next_sibling
        return sibling

    @classmethod
    def get_previous(cls, el, tags=True):
        """Return the previous sibling; non-tag siblings are skipped when `tags` is set."""

        sibling = el.previous_sibling
        if tags:
            while sibling is not None and not cls.is_tag(sibling):
                sibling = sibling.previous_sibling
        return sibling

    @staticmethod
    def has_html_ns(el):
        """
        Check if the element's own namespace is the XHTML namespace.

        This is a bit different than whether an element is treated as having
        an HTML namespace, like we do in the case of `is_html_tag`.
        The result is truthy or falsy, not necessarily a `bool`.
        """

        if not el:
            return None
        ns = el.namespace
        return ns and ns == NS_XHTML

    @staticmethod
    def split_namespace(el, attr_name):
        """Return the `namespace` and `name` attributes of `attr_name` (each `None` when absent)."""

        namespace = getattr(attr_name, 'namespace', None)
        local_name = getattr(attr_name, 'name', None)
        return namespace, local_name

    @staticmethod
    def get_attribute_by_name(el, name, default=None):
        """
        Return the value of the attribute `name`, or `default` when missing.

        XML elements are looked up by the exact name; otherwise the first
        attribute whose lowercased name equals `name` is used.
        """

        if el._is_xml:
            return el.attrs.get(name, default)

        for key, candidate in el.attrs.items():
            if util.lower(key) == name:
                return candidate
        return default

    @staticmethod
    def iter_attributes(el):
        """Yield `(name, value)` pairs for the element's attributes."""

        yield from el.attrs.items()

    @classmethod
    def get_classes(cls, el):
        """Return the element's classes; a string value is split on whitespace."""

        raw = cls.get_attribute_by_name(el, 'class', [])
        return RE_NOT_WS.findall(raw) if isinstance(raw, str) else raw

    def get_text(self, el, no_iframe=False):
        """Return the concatenated content strings found among the descendants of `el`."""

        nodes = self.get_descendants(el, tags=False, no_iframe=no_iframe)
        return ''.join(node for node in nodes if self.is_content_string(node))
330
331
332
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Return whether `day` exists in the given `month` of `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Return whether `week` is a valid ISO week number for `year`.

        December 28th always falls in the last ISO week of its year, so its
        week number is the number of weeks in the year (52 or 53).
        """

        # The Gregorian calendar repeats every 400 years, a whole number of weeks.
        # Folding the year into 1..400 keeps the week layout and stays inside the
        # range `datetime` supports, so short or very large years cannot raise.
        equivalent_year = (year - 1) % 400 + 1
        max_week = datetime(equivalent_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Return whether `month` is in the range 1-12."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Return whether `year` is at least 1."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Return whether `hour` is in the range 0-23."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Return whether `minutes` is in the range 0-59."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value for the given input type.

        Returns a tuple of integers for `date` (year, month, day), `month`
        (year, month), `week` (year, week), `time` (hour, minutes) and
        `datetime-local` (year, month, day, hour, minutes), or a float for
        `number` and `range`.  Returns `None` when the value does not match the
        expected format, fails validation, or the type is not one of the above.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
431
432
433
class _Match(object):
434
"""Perform CSS matching."""
435
436
def __init__(self, selectors, scope, namespaces, flags):
    """
    Store the matching inputs and locate the root of the tree containing `scope`.

    `scope` is validated first; `namespaces` may be `None`, in which case an
    empty mapping is used.
    """

    self.assert_valid_input(scope)
    self.tag = scope
    self.cached_meta_lang = []
    self.cached_default_forms = []
    self.cached_indeterminate_forms = []
    self.selectors = selectors
    self.namespaces = namespaces if namespaces is not None else {}
    self.flags = flags
    self.iframe_restrict = False

    # Climb to the top-most ancestor of the scope.
    doc = scope
    while True:
        parent = self.get_parent(doc)
        if not parent:
            break
        doc = parent

    # When the top is a `BeautifulSoup` object the root is its first child tag
    # (if any); otherwise the top-most node is the root itself.
    if self.is_doc(doc):
        root = next(iter(self.get_children(doc)), None)
    else:
        root = doc

    self.root = root
    self.scope = root if scope is doc else scope
    self.has_html_namespace = self.has_html_ns(root)

    # A document can be both XML and HTML (XHTML)
    self.is_xml = self.is_xml_tree(doc)
    self.is_html = not self.is_xml or self.has_html_namespace
470
471
def supports_namespaces(self):
    """Return whether namespaces apply: the tree is XML or its root has the HTML namespace."""

    return self.is_xml or self.has_html_namespace
475
476
def get_tag_ns(self, el):
    """
    Return the namespace of the tag.

    Without namespace support every tag is treated as XHTML; otherwise the
    element's own namespace is returned, or an empty string when it has none.
    """

    if not self.supports_namespaces():
        return NS_XHTML
    return self.get_uri(el) or ''
487
488
def is_html_tag(self, el):
    """Return whether the tag's effective namespace is the XHTML namespace."""

    namespace = self.get_tag_ns(el)
    return namespace == NS_XHTML
492
493
def get_tag(self, el):
    """Return the tag name, lowercased unless the tree is XML."""

    name = self.get_tag_name(el)
    if name is None or self.is_xml:
        return name
    return util.lower(name)
498
499
def get_prefix(self, el):
    """Return the tag prefix, lowercased unless the tree is XML."""

    prefix = self.get_prefix_name(el)
    if prefix is None or self.is_xml:
        return prefix
    return util.lower(prefix)
504
505
def find_bidi(self, el):
    """
    Get directionality from element text.

    Children are scanned in order, descending into eligible child tags, and the
    direction of the first character with bidirectional class `L`, `R` or `AL`
    is returned.  Returns `None` when no such character is found.
    """

    skipped_tags = ('bdi', 'script', 'style', 'textarea', 'iframe')

    for node in self.get_children(el, tags=False):

        if self.is_tag(node):
            # Avoid analyzing certain elements specified in the specification:
            # the listed tags, non-HTML tags, and tags with a recognized `dir` value.
            direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
            excluded = (
                self.get_tag(node) in skipped_tags or
                not self.is_html_tag(node) or
                direction is not None
            )
            if not excluded:
                # Check directionality of this node's text
                nested = self.find_bidi(node)
                if nested is not None:
                    return nested
            # Direction could not be determined from this tag
            continue

        # Skip `doctype` comments, etc.
        if self.is_special_string(node):
            continue

        # Analyze text nodes for directionality.
        for char in node:
            bidi = unicodedata.bidirectional(char)
            if bidi == 'L':
                return ct.SEL_DIR_LTR
            if bidi in ('AL', 'R'):
                return ct.SEL_DIR_RTL
    return None
540
541
def extended_language_filter(self, lang_range, lang_tag):
    """
    Return whether the language tag is matched by the language range.

    Both values are lowercased and split on `-`.  The primary subtags must be
    equal unless the range starts with `*`.  Every remaining range subtag must
    then be found, in order, among the tag's subtags.
    """

    ranges = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    subtags = lang_tag.lower().split('-')

    # Primary tag needs to match
    if ranges[0] != '*' and ranges[0] != subtags[0]:
        return False

    range_count = len(ranges)
    subtag_count = len(subtags)
    rindex = 1
    sindex = 1

    # Match until we run out of ranges
    while rindex < range_count:
        # Ran out of subtags, but we still have ranges
        if sindex >= subtag_count:
            return False

        wanted = ranges[rindex]
        current = subtags[sindex]

        # Empty range
        if not wanted:
            return False

        if current == wanted:
            # Matched range
            rindex += 1
        elif len(current) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched (explicitly or implicitly), so grab next subtag
        sindex += 1

    return True
591
592
def match_attribute_name(self, el, attr, prefix):
    """
    Match attribute name and return value if it exists.

    Returns the value of the first matching attribute, or `None` when there is
    no match (including when `prefix` is not a known namespace prefix or `*`).
    Names are compared case sensitively only for XML trees.
    """

    if not self.supports_namespaces():
        for key, candidate in self.iter_attributes(el):
            if util.lower(attr) == util.lower(key):
                return candidate
        return None

    # If we have not defined namespaces, we can't very well find them, so don't bother trying.
    ns = None
    if prefix:
        ns = self.namespaces.get(prefix)
        if ns is None and prefix != '*':
            return None

    for key, candidate in self.iter_attributes(el):

        # Get attribute parts
        namespace, name = self.split_namespace(el, key)

        # Can't match a prefix attribute as we haven't specified one to match
        # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
        if ns is None:
            if self.is_xml:
                same = attr == key
            else:
                same = util.lower(attr) == util.lower(key)
            if same:
                return candidate
            continue

        # We can't match our desired prefix attribute as the attribute doesn't have a prefix
        if namespace is None or (ns != namespace and prefix != '*'):
            continue

        # The attribute doesn't match.
        if self.is_xml:
            differs = attr != name
        else:
            differs = util.lower(attr) != util.lower(name)
        if differs:
            continue

        return candidate

    return None
639
640
def match_namespace(self, el, tag):
    """
    Match the namespace of the element against the selector tag's prefix.

    - No prefix: the element must be in the default namespace, if one is defined.
    - Empty prefix (`|tag`): the element must not have a namespace.
    - `*`: any namespace matches.
    - Any other prefix: it must be defined and equal the element's namespace.
    """

    namespace = self.get_tag_ns(el)
    prefix = tag.prefix

    # We must match the default namespace if one is not provided
    if prefix is None:
        default_namespace = self.namespaces.get('')
        return default_namespace is None or namespace == default_namespace

    # If we specified `|tag`, we must not have a namespace.
    if prefix == '':
        return not namespace

    if prefix == '*':
        return True

    # Verify prefix matches
    tag_ns = self.namespaces.get(prefix, None)
    return tag_ns is not None and namespace == tag_ns
660
661
def match_attributes(self, el, attributes):
    """
    Return whether the element satisfies every attribute selector.

    An attribute must exist on the element; when the selector carries a
    pattern, the value (list values are joined with spaces) must match it.
    """

    if not attributes:
        return True

    for selector in attributes:
        value = self.match_attribute_name(el, selector.attribute, selector.prefix)
        # XML trees use the XML specific pattern when one is provided.
        if self.is_xml and selector.xml_type_pattern:
            pattern = selector.xml_type_pattern
        else:
            pattern = selector.pattern
        if isinstance(value, list):
            value = ' '.join(value)

        if value is None:
            return False
        if pattern is not None and pattern.match(value) is None:
            return False
    return True
680
681
def match_tagname(self, el, tag):
    """
    Return whether the element's tag name satisfies the selector tag.

    A selector without a name, or with the name `*`, matches any element.
    The selector name is lowercased first unless the tree is XML.
    """

    name = tag.name
    if name is None:
        return True
    if not self.is_xml:
        name = util.lower(name)
    return name in (self.get_tag(el), '*')
689
690
def match_tag(self, el, tag):
    """Return whether the element matches the tag's namespace and name (always true without a tag)."""

    if tag is None:
        return True

    # Verify namespace, then the name; both checks are always performed.
    namespace_ok = self.match_namespace(el, tag)
    name_ok = self.match_tagname(el, tag)
    return bool(namespace_ok and name_ok)
701
702
def match_past_relations(self, el, relation):
    """
    Match a backwards looking relationship.

    Depending on the relation type, `relation` is matched against any ancestor,
    the direct parent, any previous sibling tag, or the closest previous
    sibling tag of `el`.
    """

    rel_type = relation[0].rel_type
    found = False

    if rel_type == REL_PARENT:
        ancestor = self.get_parent(el, no_iframe=self.iframe_restrict)
        while not found and ancestor:
            found = self.match_selectors(ancestor, relation)
            ancestor = self.get_parent(ancestor, no_iframe=self.iframe_restrict)

    elif rel_type == REL_CLOSE_PARENT:
        direct_parent = self.get_parent(el, no_iframe=self.iframe_restrict)
        if direct_parent:
            found = self.match_selectors(direct_parent, relation)

    elif rel_type == REL_SIBLING:
        earlier = self.get_previous(el)
        while not found and earlier:
            found = self.match_selectors(earlier, relation)
            earlier = self.get_previous(earlier)

    elif rel_type == REL_CLOSE_SIBLING:
        adjacent = self.get_previous(el)
        if adjacent and self.is_tag(adjacent):
            found = self.match_selectors(adjacent, relation)

    return found
725
726
def match_future_child(self, parent, relation, recursive=False):
    """
    Return whether a child of `parent` matches `relation`.

    All descendants are considered when `recursive` is set, otherwise only
    direct children.  The search stops at the first match.
    """

    if recursive:
        candidates = self.get_descendants(parent, no_iframe=self.iframe_restrict)
    else:
        candidates = self.get_children(parent, no_iframe=self.iframe_restrict)

    outcome = False
    for candidate in candidates:
        outcome = self.match_selectors(candidate, relation)
        if outcome:
            break
    return outcome
736
737
def match_future_relations(self, el, relation):
    """
    Match a forward looking (`:has()`) relationship.

    Depending on the relation type, `relation` is matched against any
    descendant, any direct child, any following sibling tag, or the closest
    following sibling tag of `el`.
    """

    rel_type = relation[0].rel_type
    found = False

    if rel_type == REL_HAS_PARENT:
        found = self.match_future_child(el, relation, True)

    elif rel_type == REL_HAS_CLOSE_PARENT:
        found = self.match_future_child(el, relation)

    elif rel_type == REL_HAS_SIBLING:
        later = self.get_next(el)
        while not found and later:
            found = self.match_selectors(later, relation)
            later = self.get_next(later)

    elif rel_type == REL_HAS_CLOSE_SIBLING:
        adjacent = self.get_next(el)
        if adjacent and self.is_tag(adjacent):
            found = self.match_selectors(adjacent, relation)

    return found
755
756
def match_relations(self, el, relation):
    """Match relationship to other elements, looking forward when the relation type starts with `:`."""

    if relation[0].rel_type.startswith(':'):
        return self.match_future_relations(el, relation)
    return self.match_past_relations(el, relation)
767
768
def match_id(self, el, ids):
    """Return whether every requested ID equals the element's `id` attribute (missing counts as empty)."""

    return not any(wanted != self.get_attribute_by_name(el, 'id', '') for wanted in ids)
777
778
def match_classes(self, el, classes):
    """Return whether the element carries every one of the requested classes."""

    present = self.get_classes(el)
    return all(wanted in present for wanted in classes)
788
789
def match_root(self, el):
    """
    Match element as root.

    The element must be a root element, and none of its siblings (in either
    direction) may be a tag, a content string with non-whitespace text, or CDATA.
    """

    def disqualifies(node):
        """Return whether a sibling node prevents the element from being the root."""

        return (
            self.is_tag(node) or (self.is_content_string(node) and node.strip()) or
            self.is_cdata(node)
        )

    is_root = self.is_root(el)
    if not is_root:
        return is_root

    # Look through the previous siblings first, then the following ones.
    for step in (self.get_previous, self.get_next):
        sibling = step(el, tags=False)
        while sibling is not None:
            if disqualifies(sibling):
                return False
            sibling = step(sibling, tags=False)
    return is_root
814
815
def match_scope(self, el):
    """Return whether the element is the very object stored as the scope."""

    scope = self.scope
    return scope is el
819
820
def match_nth_tag_type(self, el, child):
    """Return whether `child` has the same tag name and namespace as `el` (used for `of-type`)."""

    if self.get_tag(child) != self.get_tag(el):
        return False
    return self.get_tag_ns(child) == self.get_tag_ns(el)
827
828
def match_nth(self, el, nth):
    """
    Match `nth` elements.

    `nth` is an iterable of objects providing `a`, `b`, `n`, `last`, `of_type`
    and `selectors`.  Every entry must match for the result to be true; an
    empty `nth` matches.

    For each entry the target position is `a * count + b` when `n` is set, or
    just `a` otherwise.  Siblings are walked from the first child (or from the
    last one when `last` is set) and counted from one, skipping non-tags,
    tags failing the `of S` selectors, and, for `of_type`, tags of another type.
    The entry matches when `el` is the sibling found at a target position.
    """

    matched = True

    for n in nth:
        matched = False
        # With `of S`, the element itself has to match `S` to be counted at all.
        if n.selectors and not self.match_selectors(el, n.selectors):
            break
        parent = self.get_parent(el)
        if parent is None:
            # No parent: wrap the element so it can be traversed as a child.
            parent = self.create_fake_parent(el)
        last = n.last
        last_index = len(parent) - 1
        # Position in the parent's contents where the sibling walk starts.
        index = last_index if last else 0
        # Number of qualifying siblings counted so far (one based).
        relative_index = 0
        a = n.a
        b = n.b
        var = n.n
        count = 0
        count_incr = 1
        # Walk direction: backwards when counting from the last child.
        factor = -1 if last else 1
        idx = last_idx = a * count + b if var else a

        # We can only adjust bounds within a variable index
        if var:
            # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
            # Otherwise, increment to try to get in bounds.
            adjust = None
            while idx < 1 or idx > last_index:
                if idx < 0:
                    diff_low = 0 - idx
                    # Previously too high, now too low: there is no in bounds value.
                    if adjust is not None and adjust == 1:
                        break
                    adjust = -1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = 0 - idx
                    # Not getting any closer to the lower bound.
                    if diff >= diff_low:
                        break
                else:
                    diff_high = idx - last_index
                    # Previously too low, now too high: there is no in bounds value.
                    if adjust is not None and adjust == -1:
                        break
                    adjust = 1
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                    diff = idx - last_index
                    # Not getting any closer to the upper bound.
                    if diff >= diff_high:
                        break
                    diff_high = diff

            # If a < 0, our count is working backwards, so floor the index by increasing the count.
            # Find the count that yields the lowest, in bound value and use that.
            # Lastly reverse count increment so that we'll increase our index.
            lowest = count
            if a < 0:
                while idx >= 1:
                    lowest = count
                    count += count_incr
                    idx = last_idx = a * count + b if var else a
                count_incr = -1
            count = lowest
            idx = last_idx = a * count + b if var else a

        # Evaluate elements while our calculated nth index is still in range
        while 1 <= idx <= last_index + 1:
            child = None
            # Evaluate while our child index is still in range.
            for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                # Remember where the next pass over the siblings has to resume.
                index += factor
                if not self.is_tag(child):
                    continue
                # Handle `of S` in `nth-child`
                if n.selectors and not self.match_selectors(child, n.selectors):
                    continue
                # Handle `of-type`
                if n.of_type and not self.match_nth_tag_type(el, child):
                    continue
                relative_index += 1
                if relative_index == idx:
                    if child is el:
                        matched = True
                    else:
                        # Another sibling occupies this position; move on to the next index.
                        break
                # Reached the element itself, so there is nothing further to check.
                if child is el:
                    break
            if child is el:
                break
            last_idx = idx
            count += count_incr
            if count < 0:
                # Count is counting down and has now ventured into invalid territory.
                break
            idx = a * count + b if var else a
            # The index no longer changes, so no other position can be produced.
            if last_idx == idx:
                break
        if not matched:
            break
    return matched
928
929
def match_empty(self, el):
    """
    Check if element is empty (if requested).

    An element is empty when it has no child tags and none of its content
    strings contain a non-whitespace character.
    """

    for child in self.get_children(el, tags=False):
        if self.is_tag(child):
            return False
        if self.is_content_string(child) and RE_NOT_EMPTY.search(child):
            return False
    return True
941
942
def match_subselectors(self, el, selectors):
    """
    Return whether the element matches every selector list in `selectors`.

    Evaluation stops at the first selector list that does not match, as the
    result can no longer change after that.  An empty `selectors` matches.
    """

    for sel in selectors:
        if not self.match_selectors(el, sel):
            return False
    return True
950
951
def match_contains(self, el, contains):
    """
    Match element if it contains text.

    Each item of `contains` provides `text`, an iterable of strings; at least
    one string of every item must occur in the element's text.  Checking stops
    as soon as an item has no string present.  An empty `contains` matches.
    """

    content = None
    for contain_list in contains:
        # The text is only collected once, and only if there is something to check.
        if content is None:
            content = self.get_text(el, no_iframe=self.is_html)
        if not any(text in content for text in contain_list.text):
            return False
    return True
967
968
def match_default(self, el):
    """
    Return whether the element is the default (first) submit button of its form.

    The closest HTML `form` ancestor is located without crossing an `iframe`.
    The first `input` or `button` descendant of that form whose `type` is
    `submit` is the default button; it is cached per form.  Returns `False`
    when the element has no enclosing form.
    """

    # Find this input's form
    form = None
    parent = self.get_parent(el, no_iframe=True)
    while parent and form is None:
        if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
            form = parent
        else:
            parent = self.get_parent(parent, no_iframe=True)

    # Without a form there is nothing to search (and nothing that could be searched).
    if form is None:
        return False

    # Look in form cache to see if we've already located its default button
    for cached_form, cached_button in self.cached_default_forms:
        if cached_form is form:
            return cached_button is el

    # We didn't have the form cached, so look for its default button
    for child in self.get_descendants(form, no_iframe=True):
        name = self.get_tag(child)
        # Can't do nested forms (haven't figured out why we never hit this)
        if name == 'form':  # pragma: no cover
            break
        if name in ('input', 'button'):
            v = self.get_attribute_by_name(child, 'type', '')
            if v and util.lower(v) == 'submit':
                self.cached_default_forms.append([form, child])
                return el is child
    return False
1006
1007
def match_indeterminate(self, el):
    """
    Return whether the radio button's group is indeterminate.

    The group is made up of the `input` elements of type `radio` that share
    the element's `name` and belong to the same form.  It is indeterminate
    when no other member carries a `checked` attribute.  Results are cached
    per form and name.
    """

    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(el):
        """Find this input's form, falling back to the top-most ancestor reached."""

        parent = self.get_parent(el, no_iframe=True)
        while True:
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                return parent
            last_parent = parent
            parent = self.get_parent(parent, no_iframe=True)
            if parent is None:
                return last_parent

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    for cached_form, cached_name, cached_result in self.cached_indeterminate_forms:
        if cached_form is form and cached_name == name:
            return cached_result is True

    # We didn't have the form cached, so validate that the radio button is indeterminate
    checked = False
    for child in self.get_descendants(form, no_iframe=True):
        if child is el:
            continue
        if self.get_tag(child) != 'input':
            continue

        is_radio = False
        check = False
        has_name = False
        for k, v in self.iter_attributes(child):
            key = util.lower(k)
            if key == 'type' and util.lower(v) == 'radio':
                is_radio = True
            elif key == 'name' and v == name:
                has_name = True
            elif key == 'checked':
                check = True
            if is_radio and check and has_name and get_parent_form(child) is form:
                checked = True
                break
        if checked:
            break

    match = not checked
    self.cached_indeterminate_forms.append([form, name, match])
    return match
1067
1068
def match_lang(self, el, langs):
    """
    Match languages.

    The language of `el` is taken from the closest `lang` attribute (or the
    `lang` attribute in the XML namespace for namespaced, non-HTML elements)
    found on the element or one of its ancestors.  If none is found and the
    document is HTML, a `meta` element with `http-equiv="content-language"` in
    the document head is used, and the outcome of that search is cached per root.

    `langs` is an iterable of pattern groups; the language has to satisfy at
    least one pattern of every group.  Returns `False` when no language could
    be determined.
    """

    match = False
    has_ns = self.supports_namespaces()
    root = self.root
    has_html_namespace = self.has_html_namespace

    # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
    parent = el
    found_lang = None
    last = None
    while not found_lang:
        has_html_ns = self.has_html_ns(parent)
        for k, v in self.iter_attributes(parent):
            attr_ns, attr = self.split_namespace(parent, k)
            if (
                ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                (
                    has_ns and not has_html_ns and attr_ns == NS_XML and
                    (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                )
            ):
                found_lang = v
                break
        last = parent
        parent = self.get_parent(parent, no_iframe=self.is_html)

        # Ran out of ancestors: the last node visited becomes the root for the steps below.
        if parent is None:
            root = last
            has_html_namespace = self.has_html_ns(root)
            parent = last
            break

    # Use cached meta language.
    # A cached `False` records that an earlier search found nothing; as it is
    # not `None`, it also prevents the search below from being repeated.
    if not found_lang and self.cached_meta_lang:
        for cache in self.cached_meta_lang:
            if root is cache[0]:
                found_lang = cache[1]

    # If we couldn't find a language, and the document is HTML, look to meta to determine language.
    if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
        # Find head
        # `parent` is moved down to `html` and then to `head` as each is found.
        found = False
        for tag in ('html', 'head'):
            found = False
            for child in self.get_children(parent, no_iframe=self.is_html):
                if self.get_tag(child) == tag and self.is_html_tag(child):
                    found = True
                    parent = child
                    break
            if not found:  # pragma: no cover
                break

        # Search meta tags
        if found:
            for child in parent:
                if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                    c_lang = False
                    content = None
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                            c_lang = True
                        if util.lower(k) == 'content':
                            content = v
                        # Both attributes seen: the content is the language.
                        if c_lang and content:
                            found_lang = content
                            self.cached_meta_lang.append((root, found_lang))
                            break
                if found_lang:
                    break
            # Remember that this root has no meta language.
            if not found_lang:
                self.cached_meta_lang.append((root, False))

    # If we determined a language, compare.
    if found_lang:
        for patterns in langs:
            match = False
            for pattern in patterns:
                if self.extended_language_filter(pattern, found_lang):
                    match = True
            # Every group needs at least one matching pattern.
            if not match:
                break

    return match
1153
1154
def match_dir(self, el, directionality):
    """
    Decide whether the element's text direction equals `directionality`.

    Resolution order, as implemented below:

    - Asking for both LTR and RTL at once, a missing element, or a non HTML tag never matches.
    - An explicit `dir` of `ltr` or `rtl` is compared directly.
    - The root element without `dir`, and `input[type=tel]` without `dir`, are treated as left to right.
    - `dir=auto` on a text-like `input` or a `textarea` uses the first character of the value (or text
      content) whose bidirectional class is `L`, `R` or `AL`; a non-empty value without such a
      character is left to right.
    - `bdi` without `dir`, and any other `dir=auto` element, defer to `self.find_bidi`.
    - When nothing above yields an answer, the root is left to right and any other element takes
      the answer computed for its parent.
    """

    ltr = ct.SEL_DIR_LTR
    rtl = ct.SEL_DIR_RTL

    # Both directions requested at the same time can never be satisfied.
    if directionality & ltr and directionality & rtl:
        return False

    if el is None or not self.is_html_tag(el):
        return False

    # `DIR_MAP` yields a direction flag for `ltr`/`rtl`, `0` for `auto`, and nothing for anything else.
    dir_attr = util.lower(self.get_attribute_by_name(el, 'dir', ''))
    direction = DIR_MAP.get(dir_attr, None)
    if direction not in (None, 0):
        return direction == directionality

    # The document element with no direction assigned is assumed to be left to right.
    is_root = self.is_root(el)
    if is_root and direction is None:
        return ltr == directionality

    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''

    # A telephone input with no direction assigned is assumed to be left to right.
    if is_input and itype == 'tel' and direction is None:
        return ltr == directionality

    text_like = (is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea
    if text_like and direction == 0:
        # `auto` on a text control: inspect what the control holds.
        if is_textarea:
            value = ''.join(
                node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)
            )
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            for char in value:
                bidi = unicodedata.bidirectional(char)
                if bidi in ('AL', 'R', 'L'):
                    detected = ltr if bidi == 'L' else rtl
                    return detected == directionality
            # Content is present but has no strongly directional character: assume left to right.
            return ltr == directionality
        if is_root:
            return ltr == directionality
    elif (is_bdi and direction is None) or direction == 0:
        # `bdi` and every other `auto` element: let `find_bidi` inspect the element.
        detected = self.find_bidi(el)
        if detected is not None:
            return detected == directionality
        if is_root:
            return ltr == directionality

    # Nothing decided locally: the answer is whatever the parent resolves to.
    return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
1216
1217
def match_range(self, el, condition):
    """
    Evaluate `:in-range` / `:out-of-range` for an element.

    `min`, `max` and `value` are each converted with `Inputs.parse_value` using the element's
    lowercased `type`. If neither `min` nor `max` ends up with a usable value the result is
    always `False`, for both conditions.

    Only "out of range" is ever detected; anything not detected as out of range counts as in
    range. A missing or unusable `value` is therefore in range. This mirrors what browsers
    appear to do.

    For `time`, a `min` greater than `max` describes a range that wraps around, so the value is
    out of range only when it lies strictly between `max` and `min`.

    Returns the in-range answer when `condition` has `ct.SEL_IN_RANGE` set, otherwise the
    out-of-range answer.
    """

    itype = util.lower(self.get_attribute_by_name(el, 'type'))

    lower = self.get_attribute_by_name(el, 'min', None)
    if lower is not None:
        lower = Inputs.parse_value(itype, lower)
    upper = self.get_attribute_by_name(el, 'max', None)
    if upper is not None:
        upper = Inputs.parse_value(itype, upper)

    # Without a usable bound on either side there is no range to evaluate.
    if lower is None and upper is None:
        return False

    current = self.get_attribute_by_name(el, 'value', None)
    if current is not None:
        current = Inputs.parse_value(itype, current)

    out_of_range = False
    if current is not None:
        is_time = itype == "time"
        if is_time and lower is not None and upper is not None and lower > upper:
            # Time is periodic, so this is a reversed/discontinuous range.
            out_of_range = bool(current < lower and current > upper)
        elif is_time or itype in ("date", "datetime-local", "month", "week", "number", "range"):
            out_of_range = bool(
                (lower is not None and current < lower) or
                (upper is not None and current > upper)
            )

    if condition & ct.SEL_IN_RANGE:
        return not out_of_range
    return out_of_range
1262
1263
def match_defined(self, el):
    """
    Match `:defined`.

    `:defined` relates to custom elements in a browser. Here an element is reported as defined when
    any of the following holds:

    - its tag name contains no hyphen (it is not a custom element name),
    - its tag name contains a colon,
    - `self.get_prefix` reports a prefix for it.

    This relies on the parser supplying usable name and prefix information; if it does not, there
    is nothing more that can be done here.
    """

    tag_name = self.get_tag(el)

    # No hyphen: not a custom element name.
    if tag_name.find('-') == -1:
        return True

    # A colon embedded in the name.
    if tag_name.find(':') != -1:
        return True

    return self.get_prefix(el) is not None
1283
1284
def match_placeholder_shown(self, el):
    """
    Match `:placeholder-shown`.

    The element matches when the text returned by `self.get_text` is empty or is exactly one
    newline; a single newline does not count as content.
    """

    return self.get_text(el) in ('', '\n')
1298
1299
def match_selectors(self, el, selectors):
    """
    Check if element matches one of the selectors.

    Each selector in `selectors` is tried in turn. A selector passes only if every individual
    check below succeeds; the first selector that passes ends the search.

    The result is `not selectors.is_not` when a selector passes, `selectors.is_not` when selectors
    were tried and none passed, and `False` when nothing was tried (empty list, or an HTML-only
    list evaluated against a document that is not HTML).

    While an HTML-only list (`selectors.is_html`) is being evaluated, `self.namespaces` and
    `self.iframe_restrict` are temporarily overridden. They are restored in a `finally` clause so
    that an exception raised by any of the checks cannot leave the matcher in the overridden state.
    """

    match = False
    is_not = selectors.is_not
    is_html = selectors.is_html

    # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
    if is_html:
        namespaces = self.namespaces
        iframe_restrict = self.iframe_restrict
        self.namespaces = {'html': NS_XHTML}
        self.iframe_restrict = True

    try:
        if not is_html or self.is_html:
            for selector in selectors:
                # Until this selector passes every check, the answer is the "no match" value.
                match = is_not
                # We have an un-matchable situation (like `:focus`, as you can't focus an element
                # in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit
                # button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in
                # a form that are also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if not self.match_contains(el, selector.contains):
                    continue
                match = not is_not
                break
    finally:
        # Restore actual namespaces being used for external selector lists, even when a check raised.
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

    return match
1382
1383
def select(self, limit=0):
    """
    Yield every matching descendant of the targeted tag.

    Descendants come from `self.get_descendants(self.tag)` and are yielded in that order.
    A `limit` below 1 means "no limit"; otherwise iteration stops once `limit` matches
    have been yielded.
    """

    remaining = None if limit < 1 else limit

    for node in self.get_descendants(self.tag):
        if not self.match(node):
            continue
        yield node
        if remaining is None:
            continue
        remaining -= 1
        if remaining < 1:
            return
1396
1397
def closest(self):
    """
    Return the nearest matching ancestor, starting with the targeted tag itself.

    Walks upward via `self.get_parent` and returns the first node that matches, or `None`
    when the chain of parents runs out without a match.
    """

    node = self.tag
    while node is not None:
        if self.match(node):
            return node
        node = self.get_parent(node)
    return None
1408
1409
def filter(self):  # noqa A001
    """
    Filter the targeted tag's contents.

    Returns a list of the items from `self.get_contents(self.tag)` that are not navigable
    strings and that match, in their original order.
    """

    kept = []
    for node in self.get_contents(self.tag):
        if self.is_navigable_string(node):
            continue
        if self.match(node):
            kept.append(node)
    return kept
1413
1414
def match(self, el):
    """
    Match an element against this matcher's selectors.

    The document object itself never matches, and neither does anything that is not a tag;
    everything else is decided by `match_selectors` using `self.selectors`.
    """

    if self.is_doc(el):
        return False
    return self.is_tag(el) and self.match_selectors(el, self.selectors)
1418
1419
1420
class CSSMatch(_DocumentNav, _Match):
    """
    CSS matcher for Beautiful Soup.

    Adds nothing of its own: it simply combines `_DocumentNav` with `_Match`.
    """
1422
1423
1424
class SoupSieve(ct.Immutable):
    """
    Compiled Soup Sieve selector object.

    Holds the original pattern together with its compiled selectors, namespaces, custom selectors
    and flags, and exposes the matching operations. Every operation builds a `CSSMatch` for the
    tag it is given.
    """

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Store the compiled selector data by handing it to the base class."""

        # NOTE(review): keyword order kept exactly as before in case the base class is order sensitive.
        super(SoupSieve, self).__init__(
            pattern=pattern,
            selectors=selectors,
            namespaces=namespaces,
            custom=custom,
            flags=flags
        )

    def _matcher(self, tag):
        """Build a `CSSMatch` bound to `tag` using this object's selectors, namespaces and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag):
        """Check whether `tag` itself matches."""

        return self._matcher(tag).match(tag)

    def closest(self, tag):
        """Find the closest matching ancestor of `tag` (the tag itself included)."""

        return self._matcher(tag).closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        When handed a single tag, its contents all belong to one document, so one `CSSMatch`
        is shared for all of them and whatever it caches per document can be reused.

        Any other iterable may mix tags from different documents or detached tags, so each
        item is matched on its own through `self.match`, which uses a fresh `CSSMatch` per item.
        Navigable strings are skipped.
        """

        if CSSMatch.is_tag(iterable):
            return self._matcher(iterable).filter()

        kept = []
        for node in iterable:
            if CSSMatch.is_navigable_string(node):
                continue
            if self.match(node):
                kept.append(node)
        return kept

    def select_one(self, tag):
        """Return the first matching tag under `tag`, or `None` when there is none."""

        found = self.select(tag, limit=1)
        if not found:
            return None
        return found[0]

    def select(self, tag, limit=0):
        """Return the matching tags under `tag` as a list."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Lazily yield the matching tags under `tag`."""

        for found in self._matcher(tag).select(limit):
            yield found

    def __repr__(self):  # pragma: no cover
        """Representation."""

        shown = (self.pattern, self.namespaces, self.custom, self.flags)
        return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(*shown)

    __str__ = __repr__
1495
1496
1497
# Hand `SoupSieve` to the `pickle_register` helper from `css_types`.
# NOTE(review): presumably this makes compiled selectors picklable - confirm in `css_types`.
ct.pickle_register(SoupSieve)
1498
1499