# Path: blob/master/venv/Lib/site-packages/lxml/_elementpath.py
# 811 views
# cython: language_level=2

#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl   created
# 2003-05-28 fl   added support for // etc
# 2003-08-27 fl   fixed parsing of periods in element names
# 2007-09-10 fl   new selection engine
# 2007-09-12 fl   fixed parent selector
# 2007-09-13 fl   added iterfind; changed findall to return a list
# 2007-11-30 fl   added namespaces support
# 2009-10-30 fl   added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

##
# Implementation module for XPath support.  There's usually no reason
# to import this module directly; the <b>ElementTree</b> does this for
# you, if needed.
##

from __future__ import absolute_import

import re

# Each match produces a 2-tuple ``(operator, tag)``:
#   * group 1 - quoted strings and operator/punctuation tokens,
#   * group 2 - names, optionally prefixed with a ``{namespace-uri}``.
# A run of whitespace matches neither group and shows up as ``('', '')``.
xpath_tokenizer_re = re.compile(
    "("
    "'[^']*'|\"[^\"]*\"|"
    "::|"
    "//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )


def xpath_tokenizer(pattern, namespaces=None):
    """Yield ``(operator, tag)`` tokens for the path expression *pattern*.

    Names of the form ``prefix:local`` are rewritten to ``{uri}local``
    using the *namespaces* mapping.  Unprefixed names that do not follow
    an ``@`` token are put into the default namespace when one is given
    (looked up under the key ``None`` first, then ``''``).  Names that
    already start with ``{`` are passed through unchanged.

    Raises SyntaxError if a prefix is used that is not in *namespaces*.
    """
    # ElementTree uses '', lxml used None originally.
    default_namespace = (
        (namespaces.get(None) or namespaces.get('')) if namespaces else None)
    parsing_attribute = False
    for token in xpath_tokenizer_re.findall(pattern):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, uri = tag.split(":", 1)
                # Guard only the lookup itself, not the yield.
                try:
                    if not namespaces:
                        raise KeyError
                    namespace = namespaces[prefix]
                except KeyError:
                    raise SyntaxError(
                        "prefix %r not found in prefix map" % prefix)
                yield ttype, "{%s}%s" % (namespace, uri)
            elif default_namespace and not parsing_attribute:
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # Remember a preceding '@' so that the attribute name that
            # follows is not put into the default namespace.
            parsing_attribute = ttype == '@'


def prepare_child(next, token):
    """Return a selector yielding the children named by ``token[1]``."""
    tag = token[1]

    def select(result):
        for elem in result:
            for e in elem.iterchildren(tag):
                yield e
    return select


def prepare_star(next, token):
    """Return a selector yielding ``iterchildren('*')`` of each element."""
    def select(result):
        for elem in result:
            for e in elem.iterchildren('*'):
                yield e
    return select


def prepare_self(next, token):
    """Return a selector for ``.`` that passes its input through as is."""
    def select(result):
        return result
    return select


def prepare_descendant(next, token):
    """Return a selector for ``//`` followed by ``*`` or a tag name.

    The token after ``//`` is read from *next*.  Raises SyntaxError if it
    is an operator other than ``*``.
    """
    token = next()
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")

    def select(result):
        for elem in result:
            for e in elem.iterdescendants(tag):
                yield e
    return select


def prepare_parent(next, token):
    """Return a selector for ``..`` yielding each element's parent.

    Elements whose ``getparent()`` returns None are skipped.
    """
    def select(result):
        for elem in result:
            parent = elem.getparent()
            if parent is not None:
                yield parent
    return select


def prepare_predicate(next, token):
    """Return a filtering selector for a ``[...]`` predicate.

    Tokens are read from *next* up to the closing ``]``.  The supported
    forms, as recognised below, are ``[@attribute]``,
    ``[@attribute='value']``, ``[tag]``, ``[.='value']``, ``[tag='value']``,
    ``[index]``, ``[last()]`` and ``[last()-index]``.

    Raises SyntaxError for anything else, for an index below 1 and for a
    function other than ``last()``.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    #
    # The predicate is classified by a "signature" string built from its
    # tokens: operators stand for themselves, names become "-" and quoted
    # strings become "'" (with the unquoted text kept in ``predicate``).
    signature = ''
    predicate = []
    while True:
        token = next()
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            token = "'", token[0][1:-1]
        signature += token[0] or "-"
        predicate.append(token[1])

    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]

        def select(result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]

        def select(result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match(r"-?\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]

        def select(result):
            for elem in result:
                # Yield the element once if it has any such child.
                for _ in elem.iterchildren(tag):
                    yield elem
                    break
        return select
    if signature == ".='" or (
            signature == "-='" and not re.match(r"-?\d+$", predicate[0])):
        # [.='value'] or [tag='value']
        tag = predicate[0]  # empty for the "." form
        value = predicate[-1]
        if tag:
            def select(result):
                for elem in result:
                    for e in elem.iterchildren(tag):
                        if "".join(e.itertext()) == value:
                            yield elem
                            break
        else:
            def select(result):
                for elem in result:
                    if "".join(elem.itertext()) == value:
                        yield elem
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]
            index = int(predicate[0]) - 1
            if index < 0:
                if index == -1:
                    raise SyntaxError(
                        "indices in path predicates are 1-based, not 0-based")
                else:
                    raise SyntaxError("path index >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
            else:
                index = -1

        def select(result):
            for elem in result:
                parent = elem.getparent()
                if parent is None:
                    continue
                try:
                    # The position is taken among the parent's children
                    # that share this element's tag.
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.iterchildren(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except IndexError:
                    pass
        return select
    raise SyntaxError("invalid predicate")


# Maps the operator part of the first token of a path step to the function
# that compiles that step.  Plain names have an empty operator part.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
}


# --------------------------------------------------------------------

# Compiled selector lists keyed by (path, namespace items...).
_cache = {}


def _build_path_iterator(path, namespaces):
    """Compile *path* into a list of selector functions.

    Each selector takes an iterable of elements and returns an iterable
    of elements.  Results are memoised in ``_cache`` per path and
    namespace mapping; the cache is emptied once it holds more than 100
    entries.

    Raises SyntaxError for an empty, absolute, truncated or otherwise
    unsupported path, and ValueError if *namespaces* maps both ``None``
    and ``''`` to different values.
    """
    if path[-1:] == "/":
        path += "*"  # implicit all (FIXME: keep this?)

    cache_key = (path,)
    if namespaces:
        # lxml originally used None for the default namespace but
        # ElementTree uses the more convenient (all-strings-dict) empty
        # string, so we support both here, preferring the more
        # convenient '', as long as they aren't ambiguous.
        if None in namespaces:
            if '' in namespaces and namespaces[None] != namespaces['']:
                raise ValueError(
                    "Ambiguous default namespace provided: %r versus %r" % (
                        namespaces[None], namespaces['']))
            # The None key is kept out of sorted() and contributed
            # separately as a leading value.
            cache_key += (namespaces[None],) + tuple(sorted(
                item for item in namespaces.items() if item[0] is not None))
        else:
            cache_key += tuple(sorted(namespaces.items()))

    try:
        return _cache[cache_key]
    except KeyError:
        pass
    if len(_cache) > 100:
        _cache.clear()

    if path[:1] == "/":
        raise SyntaxError("cannot use absolute path on element")
    stream = iter(xpath_tokenizer(path, namespaces))
    try:
        _next = stream.next
    except AttributeError:
        # Python 3
        _next = stream.__next__
    try:
        token = _next()
    except StopIteration:
        raise SyntaxError("empty path expression")
    selector = []
    while True:
        # A step starting with a token that has no entry in ``ops``
        # (e.g. "@", "(" or "=") is a malformed path like any other, so
        # report it as SyntaxError instead of leaking a KeyError.
        try:
            prepare = ops[token[0]]
        except KeyError:
            raise SyntaxError(
                "invalid path: unsupported token %r" % (token[0],))
        try:
            selector.append(prepare(_next, token))
        except StopIteration:
            # The step needed more tokens than the path provides.
            raise SyntaxError("invalid path")
        try:
            token = _next()
            if token[0] == "/":
                token = _next()
        except StopIteration:
            break
    _cache[cache_key] = selector
    return selector


##
# Iterate over the matching nodes

def iterfind(elem, path, namespaces=None):
    """Return an iterator over the elements matching *path* below *elem*.

    The path is compiled when this function is called, so a malformed
    path raises SyntaxError here rather than during iteration.
    """
    selector = _build_path_iterator(path, namespaces)
    result = iter((elem,))
    for select in selector:
        result = select(result)
    return result


##
# Find first matching object.

def find(elem, path, namespaces=None):
    """Return the first element matching *path*, or None if there is none."""
    it = iterfind(elem, path, namespaces)
    try:
        return next(it)
    except StopIteration:
        return None


##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    """Return a list of all elements matching *path*."""
    return list(iterfind(elem, path, namespaces))


##
# Find text for first matching object.

def findtext(elem, path, default=None, namespaces=None):
    """Return the ``text`` of the first element matching *path*.

    Returns *default* if nothing matches, and ``''`` if the matching
    element's ``text`` is empty or None.
    """
    el = find(elem, path, namespaces)
    if el is None:
        return default
    else:
        return el.text or ''