Path: blob/master/venv/Lib/site-packages/soupsieve/css_match.py
811 views
"""CSS matcher."""1from datetime import datetime2from . import util3import re4from .import css_types as ct5import unicodedata67# Empty tag pattern (whitespace okay)8RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')910RE_NOT_WS = re.compile('[^ \t\r\n\f]+')1112# Relationships13REL_PARENT = ' '14REL_CLOSE_PARENT = '>'15REL_SIBLING = '~'16REL_CLOSE_SIBLING = '+'1718# Relationships for :has() (forward looking)19REL_HAS_PARENT = ': '20REL_HAS_CLOSE_PARENT = ':>'21REL_HAS_SIBLING = ':~'22REL_HAS_CLOSE_SIBLING = ':+'2324NS_XHTML = 'http://www.w3.org/1999/xhtml'25NS_XML = 'http://www.w3.org/XML/1998/namespace'2627DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL28RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE2930DIR_MAP = {31'ltr': ct.SEL_DIR_LTR,32'rtl': ct.SEL_DIR_RTL,33'auto': 034}3536RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")37RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')38RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')39RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')40RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')41RE_DATETIME = re.compile(42r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'43)44RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')4546MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November47FEB = 248SHORT_MONTH = 3049LONG_MONTH = 3150FEB_MONTH = 2851FEB_LEAP_MONTH = 2952DAYS_IN_WEEK = 7535455class _FakeParent(object):56"""57Fake parent class.5859When we have a fragment with no `BeautifulSoup` document object,60we can't evaluate `nth` selectors properly. Create a temporary61fake parent so we can traverse the root element as a child.62"""6364def __init__(self, element):65"""Initialize."""6667self.contents = [element]6869def __len__(self):70"""Length."""7172return len(self.contents)737475class _DocumentNav(object):76"""Navigate a Beautiful Soup document."""7778@classmethod79def assert_valid_input(cls, tag):80"""Check if valid input tag or document."""8182# Fail on unexpected types.83if not cls.is_tag(tag):84raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag)))8586@staticmethod87def is_doc(obj):88"""Is `BeautifulSoup` object."""8990import bs491return isinstance(obj, bs4.BeautifulSoup)9293@staticmethod94def is_tag(obj):95"""Is tag."""9697import bs498return isinstance(obj, bs4.Tag)99100@staticmethod101def is_declaration(obj): # pragma: no cover102"""Is declaration."""103104import bs4105return isinstance(obj, bs4.Declaration)106107@staticmethod108def is_cdata(obj):109"""Is CDATA."""110111import bs4112return isinstance(obj, bs4.CData)113114@staticmethod115def is_processing_instruction(obj): # pragma: no cover116"""Is processing instruction."""117118import bs4119return isinstance(obj, bs4.ProcessingInstruction)120121@staticmethod122def is_navigable_string(obj):123"""Is navigable string."""124125import bs4126return isinstance(obj, bs4.NavigableString)127128@staticmethod129def is_special_string(obj):130"""Is special string."""131132import bs4133return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))134135@classmethod136def is_content_string(cls, obj):137"""Check if node is content string."""138139return cls.is_navigable_string(obj) and not cls.is_special_string(obj)140141@staticmethod142def create_fake_parent(el):143"""Create fake parent for a given element."""144145return _FakeParent(el)146147@staticmethod148def is_xml_tree(el):149"""Check if element (or document) is from a XML tree."""150151return el._is_xml152153def is_iframe(self, el):154"""Check if element is an `iframe`."""155156return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)157158def is_root(self, el):159"""160Return whether element is a root element.161162We check that the element is the root of the tree (which we have already pre-calculated),163and we check if it is the root element under an `iframe`.164"""165166root = self.root and self.root is el167if not root:168parent = self.get_parent(el)169root = parent is not None and self.is_html and self.is_iframe(parent)170return root171172def get_contents(self, el, no_iframe=False):173"""Get contents or contents in reverse."""174if not no_iframe or not self.is_iframe(el):175for content in el.contents:176yield content177178def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):179"""Get children."""180181if not no_iframe or not self.is_iframe(el):182last = len(el.contents) - 1183if start is None:184index = last if reverse else 0185else:186index = start187end = -1 if reverse else last + 1188incr = -1 if reverse else 1189190if 0 <= index <= last:191while index != end:192node = el.contents[index]193index += incr194if not tags or self.is_tag(node):195yield node196197def get_descendants(self, el, tags=True, no_iframe=False):198"""Get descendants."""199200if not no_iframe or not self.is_iframe(el):201next_good = None202for child in el.descendants:203204if next_good is not None:205if child is not next_good:206continue207next_good = None208209is_tag = self.is_tag(child)210211if no_iframe and is_tag and self.is_iframe(child):212if child.next_sibling is not None:213next_good = child.next_sibling214else:215last_child = child216while self.is_tag(last_child) and last_child.contents:217last_child = last_child.contents[-1]218next_good = last_child.next_element219yield child220if next_good is None:221break222# Coverage isn't seeing this even though it's executed223continue # pragma: no cover224225if not tags or is_tag:226yield child227228def get_parent(self, el, no_iframe=False):229"""Get parent."""230231parent = el.parent232if no_iframe and parent is not None and self.is_iframe(parent):233parent = None234return parent235236@staticmethod237def get_tag_name(el):238"""Get tag."""239240return el.name241242@staticmethod243def get_prefix_name(el):244"""Get prefix."""245246return el.prefix247248@staticmethod249def get_uri(el):250"""Get namespace `URI`."""251252return el.namespace253254@classmethod255def get_next(cls, el, tags=True):256"""Get next sibling tag."""257258sibling = el.next_sibling259while tags and not cls.is_tag(sibling) and sibling is not None:260sibling = sibling.next_sibling261return sibling262263@classmethod264def get_previous(cls, el, tags=True):265"""Get previous sibling tag."""266267sibling = el.previous_sibling268while tags and not cls.is_tag(sibling) and sibling is not None:269sibling = sibling.previous_sibling270return sibling271272@staticmethod273def has_html_ns(el):274"""275Check if element has an HTML namespace.276277This is a bit different than whether a element is treated as having an HTML namespace,278like we do in the case of `is_html_tag`.279"""280281ns = getattr(el, 'namespace') if el else None282return ns and ns == NS_XHTML283284@staticmethod285def split_namespace(el, attr_name):286"""Return namespace and attribute name without the prefix."""287288return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)289290@staticmethod291def get_attribute_by_name(el, name, default=None):292"""Get attribute by name."""293294value = default295if el._is_xml:296try:297value = el.attrs[name]298except KeyError:299pass300else:301for k, v in el.attrs.items():302if util.lower(k) == name:303value = v304break305return value306307@staticmethod308def iter_attributes(el):309"""Iterate attributes."""310311for k, v in el.attrs.items():312yield k, v313314@classmethod315def get_classes(cls, el):316"""Get classes."""317318classes = cls.get_attribute_by_name(el, 'class', [])319if isinstance(classes, str):320classes = RE_NOT_WS.findall(classes)321return classes322323def get_text(self, el, no_iframe=False):324"""Get text."""325326return ''.join(327[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]328)329330331class Inputs(object):332"""Class for parsing and validating input items."""333334@staticmethod335def validate_day(year, month, day):336"""Validate day."""337338max_days = LONG_MONTH339if month == FEB:340max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH341elif month in MONTHS_30:342max_days = SHORT_MONTH343return 1 <= day <= max_days344345@staticmethod346def validate_week(year, week):347"""Validate week."""348349max_week = datetime.strptime("{}-{}-{}".format(12, 31, year), "%m-%d-%Y").isocalendar()[1]350if max_week == 1:351max_week = 53352return 1 <= week <= max_week353354@staticmethod355def validate_month(month):356"""Validate month."""357358return 1 <= month <= 12359360@staticmethod361def validate_year(year):362"""Validate year."""363364return 1 <= year365366@staticmethod367def validate_hour(hour):368"""Validate hour."""369370return 0 <= hour <= 23371372@staticmethod373def validate_minutes(minutes):374"""Validate minutes."""375376return 0 <= minutes <= 59377378@classmethod379def parse_value(cls, itype, value):380"""Parse the input value."""381382parsed = None383if itype == "date":384m = RE_DATE.match(value)385if m:386year = int(m.group('year'), 10)387month = int(m.group('month'), 10)388day = int(m.group('day'), 10)389if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):390parsed = (year, month, day)391elif itype == "month":392m = RE_MONTH.match(value)393if m:394year = int(m.group('year'), 10)395month = int(m.group('month'), 10)396if cls.validate_year(year) and cls.validate_month(month):397parsed = (year, month)398elif itype == "week":399m = RE_WEEK.match(value)400if m:401year = int(m.group('year'), 10)402week = int(m.group('week'), 10)403if cls.validate_year(year) and cls.validate_week(year, week):404parsed = (year, week)405elif itype == "time":406m = RE_TIME.match(value)407if m:408hour = int(m.group('hour'), 10)409minutes = int(m.group('minutes'), 10)410if cls.validate_hour(hour) and cls.validate_minutes(minutes):411parsed = (hour, minutes)412elif itype == "datetime-local":413m = RE_DATETIME.match(value)414if m:415year = int(m.group('year'), 10)416month = int(m.group('month'), 10)417day = int(m.group('day'), 10)418hour = int(m.group('hour'), 10)419minutes = int(m.group('minutes'), 10)420if (421cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and422cls.validate_hour(hour) and cls.validate_minutes(minutes)423):424parsed = (year, month, day, hour, minutes)425elif itype in ("number", "range"):426m = RE_NUM.match(value)427if m:428parsed = float(m.group('value'))429return parsed430431432class _Match(object):433"""Perform CSS matching."""434435def __init__(self, selectors, scope, namespaces, flags):436"""Initialize."""437438self.assert_valid_input(scope)439self.tag = scope440self.cached_meta_lang = []441self.cached_default_forms = []442self.cached_indeterminate_forms = []443self.selectors = selectors444self.namespaces = {} if namespaces is None else namespaces445self.flags = flags446self.iframe_restrict = False447448# Find the root element for the whole tree449doc = scope450parent = self.get_parent(doc)451while parent:452doc = parent453parent = self.get_parent(doc)454root = None455if not self.is_doc(doc):456root = doc457else:458for child in self.get_children(doc):459root = child460break461462self.root = root463self.scope = scope if scope is not doc else root464self.has_html_namespace = self.has_html_ns(root)465466# A document can be both XML and HTML (XHTML)467self.is_xml = self.is_xml_tree(doc)468self.is_html = not self.is_xml or self.has_html_namespace469470def supports_namespaces(self):471"""Check if namespaces are supported in the HTML type."""472473return self.is_xml or self.has_html_namespace474475def get_tag_ns(self, el):476"""Get tag namespace."""477478if self.supports_namespaces():479namespace = ''480ns = self.get_uri(el)481if ns:482namespace = ns483else:484namespace = NS_XHTML485return namespace486487def is_html_tag(self, el):488"""Check if tag is in HTML namespace."""489490return self.get_tag_ns(el) == NS_XHTML491492def get_tag(self, el):493"""Get tag."""494495name = self.get_tag_name(el)496return util.lower(name) if name is not None and not self.is_xml else name497498def get_prefix(self, el):499"""Get prefix."""500501prefix = self.get_prefix_name(el)502return util.lower(prefix) if prefix is not None and not self.is_xml else prefix503504def find_bidi(self, el):505"""Get directionality from element text."""506507for node in self.get_children(el, tags=False):508509# Analyze child text nodes510if self.is_tag(node):511512# Avoid analyzing certain elements specified in the specification.513direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)514if (515self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or516not self.is_html_tag(node) or517direction is not None518):519continue # pragma: no cover520521# Check directionality of this node's text522value = self.find_bidi(node)523if value is not None:524return value525526# Direction could not be determined527continue # pragma: no cover528529# Skip `doctype` comments, etc.530if self.is_special_string(node):531continue532533# Analyze text nodes for directionality.534for c in node:535bidi = unicodedata.bidirectional(c)536if bidi in ('AL', 'R', 'L'):537return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL538return None539540def extended_language_filter(self, lang_range, lang_tag):541"""Filter the language tags."""542543match = True544lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()545ranges = lang_range.split('-')546subtags = lang_tag.lower().split('-')547length = len(ranges)548rindex = 0549sindex = 0550r = ranges[rindex]551s = subtags[sindex]552553# Primary tag needs to match554if r != '*' and r != s:555match = False556557rindex += 1558sindex += 1559560# Match until we run out of ranges561while match and rindex < length:562r = ranges[rindex]563try:564s = subtags[sindex]565except IndexError:566# Ran out of subtags,567# but we still have ranges568match = False569continue570571# Empty range572if not r:573match = False574continue575576# Matched range577elif s == r:578rindex += 1579580# Implicit wildcard cannot match581# singletons582elif len(s) == 1:583match = False584continue585586# Implicitly matched, so grab next subtag587sindex += 1588589return match590591def match_attribute_name(self, el, attr, prefix):592"""Match attribute name and return value if it exists."""593594value = None595if self.supports_namespaces():596value = None597# If we have not defined namespaces, we can't very well find them, so don't bother trying.598if prefix:599ns = self.namespaces.get(prefix)600if ns is None and prefix != '*':601return None602else:603ns = None604605for k, v in self.iter_attributes(el):606607# Get attribute parts608namespace, name = self.split_namespace(el, k)609610# Can't match a prefix attribute as we haven't specified one to match611# Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.612if ns is None:613if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):614value = v615break616# Coverage is not finding this even though it is executed.617# Adding a print statement before this (and erasing coverage) causes coverage to find the line.618# Ignore the false positive message.619continue # pragma: no cover620621# We can't match our desired prefix attribute as the attribute doesn't have a prefix622if namespace is None or ns != namespace and prefix != '*':623continue624625# The attribute doesn't match.626if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):627continue628629value = v630break631else:632for k, v in self.iter_attributes(el):633if util.lower(attr) != util.lower(k):634continue635value = v636break637return value638639def match_namespace(self, el, tag):640"""Match the namespace of the element."""641642match = True643namespace = self.get_tag_ns(el)644default_namespace = self.namespaces.get('')645tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix, None)646# We must match the default namespace if one is not provided647if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):648match = False649# If we specified `|tag`, we must not have a namespace.650elif (tag.prefix is not None and tag.prefix == '' and namespace):651match = False652# Verify prefix matches653elif (654tag.prefix and655tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)656):657match = False658return match659660def match_attributes(self, el, attributes):661"""Match attributes."""662663match = True664if attributes:665for a in attributes:666value = self.match_attribute_name(el, a.attribute, a.prefix)667pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern668if isinstance(value, list):669value = ' '.join(value)670if value is None:671match = False672break673elif pattern is None:674continue675elif pattern.match(value) is None:676match = False677break678return match679680def match_tagname(self, el, tag):681"""Match tag name."""682683name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)684return not (685name is not None and686name not in (self.get_tag(el), '*')687)688689def match_tag(self, el, tag):690"""Match the tag."""691692match = True693if tag is not None:694# Verify namespace695if not self.match_namespace(el, tag):696match = False697if not self.match_tagname(el, tag):698match = False699return match700701def match_past_relations(self, el, relation):702"""Match past relationship."""703704found = False705if relation[0].rel_type == REL_PARENT:706parent = self.get_parent(el, no_iframe=self.iframe_restrict)707while not found and parent:708found = self.match_selectors(parent, relation)709parent = self.get_parent(parent, no_iframe=self.iframe_restrict)710elif relation[0].rel_type == REL_CLOSE_PARENT:711parent = self.get_parent(el, no_iframe=self.iframe_restrict)712if parent:713found = self.match_selectors(parent, relation)714elif relation[0].rel_type == REL_SIBLING:715sibling = self.get_previous(el)716while not found and sibling:717found = self.match_selectors(sibling, relation)718sibling = self.get_previous(sibling)719elif relation[0].rel_type == REL_CLOSE_SIBLING:720sibling = self.get_previous(el)721if sibling and self.is_tag(sibling):722found = self.match_selectors(sibling, relation)723return found724725def match_future_child(self, parent, relation, recursive=False):726"""Match future child."""727728match = False729children = self.get_descendants if recursive else self.get_children730for child in children(parent, no_iframe=self.iframe_restrict):731match = self.match_selectors(child, relation)732if match:733break734return match735736def match_future_relations(self, el, relation):737"""Match future relationship."""738739found = False740if relation[0].rel_type == REL_HAS_PARENT:741found = self.match_future_child(el, relation, True)742elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:743found = self.match_future_child(el, relation)744elif relation[0].rel_type == REL_HAS_SIBLING:745sibling = self.get_next(el)746while not found and sibling:747found = self.match_selectors(sibling, relation)748sibling = self.get_next(sibling)749elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:750sibling = self.get_next(el)751if sibling and self.is_tag(sibling):752found = self.match_selectors(sibling, relation)753return found754755def match_relations(self, el, relation):756"""Match relationship to other elements."""757758found = False759760if relation[0].rel_type.startswith(':'):761found = self.match_future_relations(el, relation)762else:763found = self.match_past_relations(el, relation)764765return found766767def match_id(self, el, ids):768"""Match element's ID."""769770found = True771for i in ids:772if i != self.get_attribute_by_name(el, 'id', ''):773found = False774break775return found776777def match_classes(self, el, classes):778"""Match element's classes."""779780current_classes = self.get_classes(el)781found = True782for c in classes:783if c not in current_classes:784found = False785break786return found787788def match_root(self, el):789"""Match element as root."""790791is_root = self.is_root(el)792if is_root:793sibling = self.get_previous(el, tags=False)794while is_root and sibling is not None:795if (796self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or797self.is_cdata(sibling)798):799is_root = False800else:801sibling = self.get_previous(sibling, tags=False)802if is_root:803sibling = self.get_next(el, tags=False)804while is_root and sibling is not None:805if (806self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or807self.is_cdata(sibling)808):809is_root = False810else:811sibling = self.get_next(sibling, tags=False)812return is_root813814def match_scope(self, el):815"""Match element as scope."""816817return self.scope is el818819def match_nth_tag_type(self, el, child):820"""Match tag type for `nth` matches."""821822return(823(self.get_tag(child) == self.get_tag(el)) and824(self.get_tag_ns(child) == self.get_tag_ns(el))825)826827def match_nth(self, el, nth):828"""Match `nth` elements."""829830matched = True831832for n in nth:833matched = False834if n.selectors and not self.match_selectors(el, n.selectors):835break836parent = self.get_parent(el)837if parent is None:838parent = self.create_fake_parent(el)839last = n.last840last_index = len(parent) - 1841index = last_index if last else 0842relative_index = 0843a = n.a844b = n.b845var = n.n846count = 0847count_incr = 1848factor = -1 if last else 1849idx = last_idx = a * count + b if var else a850851# We can only adjust bounds within a variable index852if var:853# Abort if our nth index is out of bounds and only getting further out of bounds as we increment.854# Otherwise, increment to try to get in bounds.855adjust = None856while idx < 1 or idx > last_index:857if idx < 0:858diff_low = 0 - idx859if adjust is not None and adjust == 1:860break861adjust = -1862count += count_incr863idx = last_idx = a * count + b if var else a864diff = 0 - idx865if diff >= diff_low:866break867else:868diff_high = idx - last_index869if adjust is not None and adjust == -1:870break871adjust = 1872count += count_incr873idx = last_idx = a * count + b if var else a874diff = idx - last_index875if diff >= diff_high:876break877diff_high = diff878879# If a < 0, our count is working backwards, so floor the index by increasing the count.880# Find the count that yields the lowest, in bound value and use that.881# Lastly reverse count increment so that we'll increase our index.882lowest = count883if a < 0:884while idx >= 1:885lowest = count886count += count_incr887idx = last_idx = a * count + b if var else a888count_incr = -1889count = lowest890idx = last_idx = a * count + b if var else a891892# Evaluate elements while our calculated nth index is still in range893while 1 <= idx <= last_index + 1:894child = None895# Evaluate while our child index is still in range.896for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):897index += factor898if not self.is_tag(child):899continue900# Handle `of S` in `nth-child`901if n.selectors and not self.match_selectors(child, n.selectors):902continue903# Handle `of-type`904if n.of_type and not self.match_nth_tag_type(el, child):905continue906relative_index += 1907if relative_index == idx:908if child is el:909matched = True910else:911break912if child is el:913break914if child is el:915break916last_idx = idx917count += count_incr918if count < 0:919# Count is counting down and has now ventured into invalid territory.920break921idx = a * count + b if var else a922if last_idx == idx:923break924if not matched:925break926return matched927928def match_empty(self, el):929"""Check if element is empty (if requested)."""930931is_empty = True932for child in self.get_children(el, tags=False):933if self.is_tag(child):934is_empty = False935break936elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):937is_empty = False938break939return is_empty940941def match_subselectors(self, el, selectors):942"""Match selectors."""943944match = True945for sel in selectors:946if not self.match_selectors(el, sel):947match = False948return match949950def match_contains(self, el, contains):951"""Match element if it contains text."""952953match = True954content = None955for contain_list in contains:956if content is None:957content = self.get_text(el, no_iframe=self.is_html)958found = False959for text in contain_list.text:960if text in content:961found = True962break963if not found:964match = False965return match966967def match_default(self, el):968"""Match default."""969970match = False971972# Find this input's form973form = None974parent = self.get_parent(el, no_iframe=True)975while parent and form is None:976if self.get_tag(parent) == 'form' and self.is_html_tag(parent):977form = parent978else:979parent = self.get_parent(parent, no_iframe=True)980981# Look in form cache to see if we've already located its default button982found_form = False983for f, t in self.cached_default_forms:984if f is form:985found_form = True986if t is el:987match = True988break989990# We didn't have the form cached, so look for its default button991if not found_form:992for child in self.get_descendants(form, no_iframe=True):993name = self.get_tag(child)994# Can't do nested forms (haven't figured out why we never hit this)995if name == 'form': # pragma: no cover996break997if name in ('input', 'button'):998v = self.get_attribute_by_name(child, 'type', '')999if v and util.lower(v) == 'submit':1000self.cached_default_forms.append([form, child])1001if el is child:1002match = True1003break1004return match10051006def match_indeterminate(self, el):1007"""Match default."""10081009match = False1010name = self.get_attribute_by_name(el, 'name')10111012def get_parent_form(el):1013"""Find this input's form."""1014form = None1015parent = self.get_parent(el, no_iframe=True)1016while form is None:1017if self.get_tag(parent) == 'form' and self.is_html_tag(parent):1018form = parent1019break1020last_parent = parent1021parent = self.get_parent(parent, no_iframe=True)1022if parent is None:1023form = last_parent1024break1025return form10261027form = get_parent_form(el)10281029# Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate1030found_form = False1031for f, n, i in self.cached_indeterminate_forms:1032if f is form and n == name:1033found_form = True1034if i is True:1035match = True1036break10371038# We didn't have the form cached, so validate that the radio button is indeterminate1039if not found_form:1040checked = False1041for child in self.get_descendants(form, no_iframe=True):1042if child is el:1043continue1044tag_name = self.get_tag(child)1045if tag_name == 'input':1046is_radio = False1047check = False1048has_name = False1049for k, v in self.iter_attributes(child):1050if util.lower(k) == 'type' and util.lower(v) == 'radio':1051is_radio = True1052elif util.lower(k) == 'name' and v == name:1053has_name = True1054elif util.lower(k) == 'checked':1055check = True1056if is_radio and check and has_name and get_parent_form(child) is form:1057checked = True1058break1059if checked:1060break1061if not checked:1062match = True1063self.cached_indeterminate_forms.append([form, name, match])10641065return match10661067def match_lang(self, el, langs):1068"""Match languages."""10691070match = False1071has_ns = self.supports_namespaces()1072root = self.root1073has_html_namespace = self.has_html_namespace10741075# Walk parents looking for `lang` (HTML) or `xml:lang` XML property.1076parent = el1077found_lang = None1078last = None1079while not found_lang:1080has_html_ns = self.has_html_ns(parent)1081for k, v in self.iter_attributes(parent):1082attr_ns, attr = self.split_namespace(parent, k)1083if (1084((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or1085(1086has_ns and not has_html_ns and attr_ns == NS_XML and1087(util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'1088)1089):1090found_lang = v1091break1092last = parent1093parent = self.get_parent(parent, no_iframe=self.is_html)10941095if parent is None:1096root = last1097has_html_namespace = self.has_html_ns(root)1098parent = last1099break11001101# Use cached meta language.1102if not found_lang and self.cached_meta_lang:1103for cache in self.cached_meta_lang:1104if root is cache[0]:1105found_lang = cache[1]11061107# If we couldn't find a language, and the document is HTML, look to meta to determine language.1108if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):1109# Find head1110found = False1111for tag in ('html', 'head'):1112found = False1113for child in self.get_children(parent, no_iframe=self.is_html):1114if self.get_tag(child) == tag and self.is_html_tag(child):1115found = True1116parent = child1117break1118if not found: # pragma: no cover1119break11201121# Search meta tags1122if found:1123for child in parent:1124if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):1125c_lang = False1126content = None1127for k, v in self.iter_attributes(child):1128if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':1129c_lang = True1130if util.lower(k) == 'content':1131content = v1132if c_lang and content:1133found_lang = content1134self.cached_meta_lang.append((root, found_lang))1135break1136if found_lang:1137break1138if not found_lang:1139self.cached_meta_lang.append((root, False))11401141# If we determined a language, compare.1142if found_lang:1143for patterns in langs:1144match = False1145for pattern in patterns:1146if self.extended_language_filter(pattern, found_lang):1147match = True1148if not match:1149break11501151return match11521153def match_dir(self, el, directionality):1154"""Check directionality."""11551156# If we have to match both left and right, we can't match either.1157if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:1158return False11591160if el is None or not self.is_html_tag(el):1161return False11621163# Element has defined direction of left to right or right to left1164direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)1165if direction not in (None, 0):1166return direction == directionality11671168# Element is the document element (the root) and no direction assigned, assume left to right.1169is_root = self.is_root(el)1170if is_root and direction is None:1171return ct.SEL_DIR_LTR == directionality11721173# If `input[type=telephone]` and no direction is assigned, assume left to right.1174name = self.get_tag(el)1175is_input = name == 'input'1176is_textarea = name == 'textarea'1177is_bdi = name == 'bdi'1178itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''1179if is_input and itype == 'tel' and direction is None:1180return ct.SEL_DIR_LTR == directionality11811182# Auto handling for text inputs1183if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:1184if is_textarea:1185value = []1186for node in self.get_contents(el, no_iframe=True):1187if self.is_content_string(node):1188value.append(node)1189value = ''.join(value)1190else:1191value = self.get_attribute_by_name(el, 'value', '')1192if value:1193for c in value:1194bidi = unicodedata.bidirectional(c)1195if bidi in ('AL', 'R', 'L'):1196direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL1197return direction == directionality1198# Assume left to right1199return ct.SEL_DIR_LTR == directionality1200elif is_root:1201return ct.SEL_DIR_LTR == directionality1202return self.match_dir(self.get_parent(el, no_iframe=True), directionality)12031204# Auto handling for `bdi` and other non text inputs.1205if (is_bdi and direction is None) or direction == 0:1206direction = self.find_bidi(el)1207if direction is not None:1208return direction == directionality1209elif is_root:1210return ct.SEL_DIR_LTR == directionality1211return self.match_dir(self.get_parent(el, no_iframe=True), directionality)12121213# Match parents direction1214return self.match_dir(self.get_parent(el, no_iframe=True), directionality)12151216def match_range(self, el, condition):1217"""1218Match range.12191220Behavior is modeled after what we see in browsers. Browsers seem to evaluate1221if the value is out of range, and if not, it is in range. So a missing value1222will not evaluate out of range; therefore, value is in range. Personally, I1223feel like this should evaluate as neither in or out of range.1224"""12251226out_of_range = False12271228itype = util.lower(self.get_attribute_by_name(el, 'type'))1229mn = self.get_attribute_by_name(el, 'min', None)1230if mn is not None:1231mn = Inputs.parse_value(itype, mn)1232mx = self.get_attribute_by_name(el, 'max', None)1233if mx is not None:1234mx = Inputs.parse_value(itype, mx)12351236# There is no valid min or max, so we cannot evaluate a range1237if mn is None and mx is None:1238return False12391240value = self.get_attribute_by_name(el, 'value', None)1241if value is not None:1242value = Inputs.parse_value(itype, value)1243if value is not None:1244if itype in ("date", "datetime-local", "month", "week", "number", "range"):1245if mn is not None and value < mn:1246out_of_range = True1247if not out_of_range and mx is not None and value > mx:1248out_of_range = True1249elif itype == "time":1250if mn is not None and mx is not None and mn > mx:1251# Time is periodic, so this is a reversed/discontinuous range1252if value < mn and value > mx:1253out_of_range = True1254else:1255if mn is not None and value < mn:1256out_of_range = True1257if not out_of_range and mx is not None and value > mx:1258out_of_range = True12591260return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range12611262def match_defined(self, el):1263"""1264Match defined.12651266`:defined` is related to custom elements in a browser.12671268- If the document is XML (not XHTML), all tags will match.1269- Tags that are not custom (don't have a hyphen) are marked defined.1270- If the tag has a prefix (without or without a namespace), it will not match.12711272This is of course requires the parser to provide us with the proper prefix and namespace info,1273if it doesn't, there is nothing we can do.1274"""12751276name = self.get_tag(el)1277return (1278name.find('-') == -1 or1279name.find(':') != -1 or1280self.get_prefix(el) is not None1281)12821283def match_placeholder_shown(self, el):1284"""1285Match placeholder shown according to HTML spec.12861287- text area should be checked if they have content. A single newline does not count as content.12881289"""12901291match = False1292content = self.get_text(el)1293if content in ('', '\n'):1294match = True12951296return match12971298def match_selectors(self, el, selectors):1299"""Check if element matches one of the selectors."""13001301match = False1302is_not = selectors.is_not1303is_html = selectors.is_html13041305# Internal selector lists that use the HTML flag, will automatically get the `html` namespace.1306if is_html:1307namespaces = self.namespaces1308iframe_restrict = self.iframe_restrict1309self.namespaces = {'html': NS_XHTML}1310self.iframe_restrict = True13111312if not is_html or self.is_html:1313for selector in selectors:1314match = is_not1315# We have a un-matchable situation (like `:focus` as you can focus an element in this environment)1316if isinstance(selector, ct.SelectorNull):1317continue1318# Verify tag matches1319if not self.match_tag(el, selector.tag):1320continue1321# Verify tag is defined1322if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):1323continue1324# Verify element is root1325if selector.flags & ct.SEL_ROOT and not self.match_root(el):1326continue1327# Verify element is scope1328if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):1329continue1330# Verify element has placeholder shown1331if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):1332continue1333# Verify `nth` matches1334if not self.match_nth(el, selector.nth):1335continue1336if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):1337continue1338# Verify id matches1339if selector.ids and not self.match_id(el, selector.ids):1340continue1341# Verify classes match1342if selector.classes and not self.match_classes(el, selector.classes):1343continue1344# Verify attribute(s) match1345if not self.match_attributes(el, selector.attributes):1346continue1347# Verify ranges1348if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):1349continue1350# Verify language patterns1351if selector.lang and not self.match_lang(el, selector.lang):1352continue1353# Verify pseudo selector patterns1354if selector.selectors and not self.match_subselectors(el, selector.selectors):1355continue1356# Verify relationship selectors1357if selector.relation and not self.match_relations(el, selector.relation):1358continue1359# Validate that the current default selector match corresponds to the first submit button in the form1360if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):1361continue1362# Validate that the unset radio button is among radio buttons with the same name in a form that are1363# also not set.1364if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):1365continue1366# Validate element directionality1367if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):1368continue1369# Validate that the tag contains the specified text.1370if not self.match_contains(el, selector.contains):1371continue1372match = not is_not1373break13741375# Restore actual namespaces being used for external selector lists1376if is_html:1377self.namespaces = namespaces1378self.iframe_restrict = iframe_restrict13791380return match13811382def select(self, limit=0):1383"""Match all tags under the targeted tag."""13841385if limit < 1:1386limit = None13871388for child in self.get_descendants(self.tag):1389if self.match(child):1390yield child1391if limit is not None:1392limit -= 11393if limit < 1:1394break13951396def closest(self):1397"""Match closest ancestor."""13981399current = self.tag1400closest = None1401while closest is None and current is not None:1402if self.match(current):1403closest = current1404else:1405current = self.get_parent(current)1406return closest14071408def filter(self): # noqa A0011409"""Filter tag's children."""14101411return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]14121413def match(self, el):1414"""Match."""14151416return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)141714181419class CSSMatch(_DocumentNav, _Match):1420"""The Beautiful Soup CSS match class."""142114221423class SoupSieve(ct.Immutable):1424"""Compiled Soup Sieve selector matching object."""14251426__slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")14271428def __init__(self, pattern, selectors, namespaces, custom, flags):1429"""Initialize."""14301431super(SoupSieve, self).__init__(1432pattern=pattern,1433selectors=selectors,1434namespaces=namespaces,1435custom=custom,1436flags=flags1437)14381439def match(self, tag):1440"""Match."""14411442return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)14431444def closest(self, tag):1445"""Match closest ancestor."""14461447return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()14481449def filter(self, iterable): # noqa A0011450"""1451Filter.14521453`CSSMatch` can cache certain searches for tags of the same document,1454so if we are given a tag, all tags are from the same document,1455and we can take advantage of the optimization.14561457Any other kind of iterable could have tags from different documents or detached tags,1458so for those, we use a new `CSSMatch` for each item in the iterable.1459"""14601461if CSSMatch.is_tag(iterable):1462return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()1463else:1464return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]14651466def select_one(self, tag):1467"""Select a single tag."""14681469tags = self.select(tag, limit=1)1470return tags[0] if tags else None14711472def select(self, tag, limit=0):1473"""Select the specified tags."""14741475return list(self.iselect(tag, limit))14761477def iselect(self, tag, limit=0):1478"""Iterate the specified tags."""14791480for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):1481yield el14821483def __repr__(self): # pragma: no cover1484"""Representation."""14851486return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(1487self.pattern,1488self.namespaces,1489self.custom,1490self.flags1491)14921493__str__ = __repr__149414951496ct.pickle_register(SoupSieve)149714981499