Source code for soupsavvy.selectors.relative

"""
Module with relative selectors and utility components.
They are used for selecting elements based on their relation to the anchor element.

Classes
-------
- `RelativeChild` - matches direct children of the anchor element
- `RelativeDescendant` - matches descendants of the anchor element
- `RelativeNextSibling` - matches next sibling of the anchor element
- `RelativeSubsequentSibling` - matches subsequent siblings of the anchor element
- `RelativeParent` - matches parent of the anchor element
- `RelativeAncestor` - matches ancestors of the anchor element
- `HasSelector` - selects elements based on matching reference elements
- `Anchor` - Anchor object for easily creating relative selectors
"""

from abc import abstractmethod
from typing import Optional

from soupsavvy.base import CompositeSoupSelector, SoupSelector, check_selector
from soupsavvy.interfaces import IElement
from soupsavvy.utils.selector_utils import TagIterator, TagResultSet


class RelativeSelector(SoupSelector):
    """
    Common base for selectors that search relative to an anchor element.

    The element passed into the find methods plays the role of the anchor,
    and the concrete subclass supplies the combinator logic that decides
    which elements around the anchor are considered (children, siblings,
    ancestors, ...). This mirrors relative selectors in CSS, which represent
    an element relative to one or more anchor elements and are preceded
    by a combinator.

    Example
    -------
    >>> selector = Anchor > TypeSelector("div")
    ... selector.find_all(tag)

    Here `RelativeChild` is used to find every `div` that is a direct child
    of `tag`, the element passed as an argument.

    In CSS such selectors appear for example inside the `:has` pseudo-class,
    where the selector is anchored to the tested element:

    Example
    -------
    >>> :has(> div)

    Selects any element that has a direct child `div` tag.

    Notes
    -------
    The `recursive` parameter is ignored by relative selectors,
    as each of them defines its own way of traversing the document.
    """

    def __init__(self, selector: SoupSelector) -> None:
        """
        Creates relative selector wrapping the provided selector.

        Parameters
        ----------
        selector : SoupSelector
            Selector applied to the elements related to the anchor element.
            It is passed through `check_selector` before being stored.
        """
        self._selector = check_selector(selector)

    @property
    def selector(self) -> SoupSelector:
        """
        Selector wrapped by this relative selector.

        Returns
        -------
        SoupSelector
            Selector used for matching elements relative to the anchor.
        """
        return self._selector

    def __eq__(self, other: object) -> bool:
        # The isinstance check keeps type checkers happy, the exact type check
        # rejects subclasses: two relative selectors are only comparable when
        # they are of exactly the same class. Returning NotImplemented lets
        # python fall back to the reflected comparison.
        if not isinstance(other, RelativeSelector) or type(self) is not type(other):
            return NotImplemented

        return self.selector == other.selector

    def __str__(self) -> str:
        return f"{self.__class__.__name__}({self.selector})"

    def __repr__(self) -> str:
        # repr and str are intentionally the same text
        return str(self)
class BaseRelativeSibling(RelativeSelector):
    """
    Base class with implementation for relative sibling selectors,
    searches among the siblings that follow the anchor element.

    Child class needs to implement:
    - `_func` - method that receives the anchor element and returns the list
      of its following siblings that are candidates for matching
      (for example only the first one, or all of them).
    """

    @abstractmethod
    def _func(self, tag: IElement) -> list[IElement]:
        """
        Returns sibling elements of the anchor that should be considered.

        Parameters
        ----------
        tag : IElement
            Anchor element.

        Returns
        -------
        list[IElement]
            Candidate sibling elements.
        """
        raise NotImplementedError(
            "Method '_func' needs to be implemented in child class."
        )

    def find_all(
        self,
        tag: IElement,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[IElement]:
        """
        Finds siblings of the anchor element that match the wrapped selector.

        Parameters
        ----------
        tag : IElement
            Anchor element, whose siblings are searched.
        recursive : bool, optional
            Ignored, only accepted for interface compatibility.
        limit : int, optional
            Passed to `TagResultSet.fetch` to cap the number of results.

        Returns
        -------
        list[IElement]
            Elements that are both returned by `_func` for the anchor
            and matched by the selector among children of anchor's parent.
            Empty list if anchor has no parent.
        """
        parent = tag.parent

        if parent is None:
            # without a parent there is nothing to search siblings in
            return []

        # siblings are direct children of the parent, hence recursive=False
        matching = TagResultSet(self.selector.find_all(parent, recursive=False))
        siblings = TagResultSet(self._func(tag))
        # keep only elements that are matched AND are candidate siblings
        matches = matching & siblings
        return matches.fetch(limit)
class BaseAncestorSelector(RelativeSelector):
    """
    Shared implementation for selectors matching ancestor(s)
    of the anchor element.

    Child class needs to define:
    - `_limit` - class attribute with maximal number of ancestors
      that are taken into account, `None` for no restriction.
    """

    _limit: Optional[int]

    def find_all(
        self,
        tag: IElement,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[IElement]:
        """
        Finds ancestors of the anchor element that match the wrapped selector.

        Parameters
        ----------
        tag : IElement
            Anchor element, whose ancestors are searched.
        recursive : bool, optional
            Ignored, only accepted for interface compatibility.
        limit : int, optional
            Maximal number of returned elements, when falsy the class level
            `_limit` is used instead.

        Returns
        -------
        list[IElement]
            Ancestors of the anchor that were also found by the selector.
        """
        # falsy limit (None or 0) falls back to the class level restriction
        effective_limit = limit or self._limit
        # these are all the elements that can possibly be returned
        candidates = tag.find_ancestors(limit=self._limit)

        if not candidates:
            # no ancestors - nothing to intersect with
            return []

        last = candidates[-1]
        # The selector is run from the parent of the last ancestor, so that
        # the ancestors themselves are within the searched elements.
        # NOTE(review): when the last ancestor has no parent, the search starts
        # at the ancestor itself - presumably that ancestor can then never be
        # part of the result, confirm against `find_all` semantics.
        search_root = last.parent or last
        found = TagResultSet(self.selector.find_all(search_root))
        return (TagResultSet(candidates) & found).fetch(effective_limit)
class RelativeChild(RelativeSelector):
    """
    Relative selector matching direct children of the anchor element.

    Example
    -------
    >>> RelativeChild(TypeSelector("p"))

    with 'div' element passed into find methods:

    Example
    -------
    >>> <div><p></p></div> ✔️
    >>> <div><a><p></p></a></div> ❌
    >>> <div><a></a></div> ❌

    The same selector can be built from the `Anchor` instance
    with the `gt` operator `>`:

    Example
    -------
    >>> Anchor > TypeSelector("p")

    Notes
    -------
    `RelativeChild` behaves like calling find methods of the wrapped selector
    with `recursive=False`. It exists to support 'HasSelector'
    and `ChildCombinator` selectors.
    """

    def find_all(
        self,
        tag: IElement,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[IElement]:
        """
        Finds direct children of the anchor matching the wrapped selector.

        The `recursive` argument is ignored, search is always non-recursive.
        """
        wrapped = self.selector
        return wrapped.find_all(tag, limit=limit, recursive=False)
class RelativeDescendant(RelativeSelector):
    """
    Relative selector matching descendants of the anchor element.

    Example
    -------
    >>> RelativeDescendant(TypeSelector("p"))

    with 'div' element passed into find methods:

    Example
    -------
    >>> <div><p></p></div> ✔️
    >>> <div><a><p></p></a></div> ✔️
    >>> <div><a></a></div> ❌
    >>> <div></div><p></p> ❌

    The same selector can be built from the `Anchor` instance
    with the `right shift` operator `>>`:

    Example
    -------
    >>> Anchor >> TypeSelector("p")

    Notes
    -------
    `RelativeDescendant` behaves like calling find methods of the wrapped
    selector with default `recursive=True`. It exists to support 'HasSelector'
    and `DescendantCombinator` selectors.
    """

    def find_all(
        self,
        tag: IElement,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[IElement]:
        """
        Finds descendants of the anchor matching the wrapped selector.

        The `recursive` argument is ignored, search is always recursive.
        """
        wrapped = self.selector
        return wrapped.find_all(tag, limit=limit, recursive=True)
class RelativeNextSibling(BaseRelativeSibling):
    """
    Relative selector matching the next sibling of the anchor element.

    Example
    -------
    >>> RelativeNextSibling(TypeSelector("p"))

    with 'div' element passed into find methods:

    Example
    -------
    >>> <div></div><p></p> ✔️
    >>> <div></div><a></a><p></p> ❌
    >>> <p></p><div></div> ❌

    The same selector can be built from the `Anchor` instance
    with the `plus` operator `+`:

    Example
    -------
    >>> Anchor + TypeSelector("p")
    """

    def _func(self, tag: IElement) -> list[IElement]:
        # only the sibling directly following the anchor is a candidate
        next_only = tag.find_subsequent_siblings(limit=1)
        return next_only
class RelativeSubsequentSibling(BaseRelativeSibling):
    """
    Relative selector matching subsequent siblings of the anchor element.

    Example
    -------
    >>> RelativeSubsequentSibling(TypeSelector("p"))

    with 'div' element passed into find methods:

    Example
    -------
    >>> <div></div><p></p> ✔️
    >>> <div></div><a></a><p></p> ✔️
    >>> <p></p><div></div> ❌
    >>> <div></div><span><p></p></span> ❌

    The same selector can be built from the `Anchor` instance
    with the `multiplication` operator `*`:

    Example
    -------
    >>> Anchor * TypeSelector("p")
    """

    def _func(self, tag: IElement) -> list[IElement]:
        # every sibling following the anchor is a candidate
        all_following = tag.find_subsequent_siblings(limit=None)
        return all_following
class RelativeParent(BaseAncestorSelector):
    """
    Relative selector matching the parent of the anchor element.

    Example
    -------
    >>> RelativeParent(TypeSelector("div"))

    with 'p' element passed into find methods:

    Example
    -------
    >>> <div><p></p></div> ✔️
    >>> <div><a><p></p></a></div> ❌
    >>> <span><p></p></span> ❌

    There is no direct CSS counterpart of this combinator, but it can be
    expressed with has selector and explicit child combinator:

    Example
    -------
    >>> div:has(> p)

    The same selector can be built from the `Anchor` instance
    with the `lt` operator `<`:

    Example
    -------
    >>> Anchor < TypeSelector("div")

    Notes
    -------
    `recursive` parameter is ignored by `RelativeParent`, as only the parent
    of the anchor element is ever considered. In consequence `find_all`
    returns at most one element (the parent).
    """

    # only the closest ancestor (parent) is taken into account
    _limit = 1
class RelativeAncestor(BaseAncestorSelector):
    """
    Relative selector matching ancestors of the anchor element.

    Example
    -------
    >>> RelativeAncestor(TypeSelector("div"))

    with 'p' element passed into find methods:

    Example
    -------
    >>> <div><p></p></div> ✔️
    >>> <div><a><p></p></a></div> ✔️
    >>> <span><p></p></span> ❌
    >>> <p></p><div></div> ❌

    There is no direct CSS counterpart of this combinator, but it can be
    expressed with has selector and implied descendant combinator:

    Example
    -------
    >>> div:has(p)

    The same selector can be built from the `Anchor` instance
    with the `left shift` operator `<<`:

    Example
    -------
    >>> Anchor << TypeSelector("div")

    Notes
    -------
    `recursive` parameter is ignored by `RelativeAncestor`, as all ancestors
    of the anchor element are always considered.
    """

    # no restriction on the number of ancestors taken into account
    _limit = None
class Anchor_:
    """
    Shortcut component used to create relative selectors
    in a more readable way with use of operators.

    `Anchor_` is an internal class and can be considered a singleton.
    It's advisable to use the `Anchor` instance instead of creating new objects.

    `Anchor` supports the following operators:

    - `>`: `RelativeChild`

    Example
    -------
    >>> Anchor > TypeSelector("div")

    Creates `RelativeChild` selector, that selects any div tag that is a direct
    child of the tag that is being searched.

    - `>>`: `RelativeDescendant`

    Example
    -------
    >>> Anchor >> TypeSelector("div")

    Creates `RelativeDescendant` selector, that selects any div tag
    that is a descendant of the tag that is being searched.
    This is default behavior of selectors, and is equivalent to using
    the `TypeSelector` directly, but is implemented for the sake of consistency.

    - `+`: `RelativeNextSibling`

    Example
    -------
    >>> Anchor + TypeSelector("div")

    Creates `RelativeNextSibling` selector, that selects any div tag
    that is next sibling of the tag that is being searched,
    it can logically return at most one tag.

    - `*`: `RelativeSubsequentSibling`

    Example
    -------
    >>> Anchor * TypeSelector("div")

    Creates `RelativeSubsequentSibling` selector, that selects any div tag
    that is a subsequent sibling of the tag that is being searched.

    - `<`: `RelativeParent`

    Example
    -------
    >>> Anchor < TypeSelector("div")

    Creates `RelativeParent` selector, that selects any div tag
    that is a parent of the tag that is being searched.

    - `<<`: `RelativeAncestor`

    Example
    -------
    >>> Anchor << TypeSelector("div")

    Creates `RelativeAncestor` selector, that selects any div tag
    that is an ancestor of the tag that is being searched.

    This imitates css relative selectors that are used for example
    in `:has` pseudo-class, that accepts relative selector list as an argument.

    Example
    -------
    >>> :has(> div, + a)

    This translated to `soupsavvy` would be:

    Example
    -------
    >>> HasSelector(Anchor > TypeSelector("div"), Anchor + TypeSelector("a"))

    Which would match any tag that has a direct child 'div' or a next sibling
    'a' tag - it is enough that one of the relative selectors matches.
    Selected tag is the anchor tag that is being searched.

    Notes
    -------
    For more information on relative selectors, see:
    https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors/Selector_structure#relative_selector
    """

    # Every operator validates its operand with `check_selector` and wraps it
    # in the relative selector corresponding to the combinator. Return types
    # name the concrete class, so type checkers know exactly what is created.

    def __gt__(self, x: SoupSelector) -> RelativeChild:
        return RelativeChild(check_selector(x))

    def __rshift__(self, x: SoupSelector) -> RelativeDescendant:
        return RelativeDescendant(check_selector(x))

    def __lt__(self, x: SoupSelector) -> RelativeParent:
        return RelativeParent(check_selector(x))

    def __lshift__(self, x: SoupSelector) -> RelativeAncestor:
        return RelativeAncestor(check_selector(x))

    def __add__(self, x: SoupSelector) -> RelativeNextSibling:
        return RelativeNextSibling(check_selector(x))

    def __mul__(self, x: SoupSelector) -> RelativeSubsequentSibling:
        return RelativeSubsequentSibling(check_selector(x))
# instance of Anchor class Anchor = Anchor_()
class HasSelector(CompositeSoupSelector):
    """
    Selector for finding elements based on matching reference elements.

    Example
    -------
    >>> HasSelector(TypeSelector("div"))

    matches all elements that have any descendant with "div" tag name.
    It uses default combinator of relative selector, which is descendant combinator.

    Example
    -------
    >>> <span><div>Hello World</div></span> ✔️
    >>> <span><a>Hello World</a></span> ❌

    Other relative selectors can be used with `Anchor` element.

    Example
    -------
    >>> HasSelector(Anchor > TypeSelector("div"))
    ... HasSelector(Anchor + TypeSelector("div"))

    or by using `RelativeSelector` components directly:

    Example
    -------
    >>> HasSelector(RelativeChild(TypeSelector("div")))
    ... HasSelector(RelativeNextSibling(TypeSelector("div")))

    For the child variant:

    Example
    -------
    >>> <span><div>Hello World</div></span> ✔️
    >>> <span><a><div>Hello World</div></a></span> ❌

    In this case, HasSelector is anchored against any element, and matches
    only elements that have "div" tag name as a child.

    This is an equivalent of CSS :has() pseudo-class,
    that matches element if any of the relative selectors that are passed
    as an argument match element when anchored against it.

    Example
    -------
    >>> :has(div, a)
    >>> :has(+ div, > a)

    These examples translated to `soupsavvy` would be:

    Example
    -------
    >>> HasSelector(TypeSelector("div"), TypeSelector("a"))
    ... HasSelector(Anchor + TypeSelector("div"), Anchor > TypeSelector("a"))

    Notes
    -----
    Passing `RelativeDescendant` selector into HasSelector is equivalent
    to using its selector directly, as descendant combinator is a default option.

    Example
    -------
    >>> HasSelector(RelativeDescendant(TypeSelector("div")))
    ... HasSelector(Anchor >> TypeSelector("div"))
    ... HasSelector(TypeSelector("div"))

    Three of the above examples are equivalent.

    For more information on :has() pseudo-class, see:
    https://developer.mozilla.org/en-US/docs/Web/CSS/:has
    """

    def __init__(
        self,
        selector: SoupSelector,
        /,
        *selectors: SoupSelector,
    ) -> None:
        """
        Initializes `HasSelector` object with provided
        positional arguments as selectors.

        Parameters
        ----------
        selector : SoupSelector
            First, required selector (positional-only), which guarantees
            that at least one selector is always provided.
        selectors : SoupSelector
            Optional additional `SoupSelector` objects to match,
            accepted as positional arguments.

        Raises
        ------
        NotSoupSelectorException
            If any of provided parameters is not an instance of `SoupSelector`.
        """
        super().__init__([selector, *selectors])

    def find_all(
        self,
        tag: IElement,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[IElement]:
        """
        Finds elements, for which at least one of the selectors
        finds a match when anchored against the element.

        Parameters
        ----------
        tag : IElement
            Element whose content is iterated with `TagIterator`.
        recursive : bool, optional
            Passed to `TagIterator` to control which elements are tested.
        limit : int, optional
            Maximal number of returned elements, by default no limit.

        Returns
        -------
        list[IElement]
            Tested elements, for which any selector found something.
        """
        elements = TagIterator(tag, recursive=recursive)
        matching: list[IElement] = []

        for element in elements:
            # we only care if anything matching was found by any selector,
            # `any` stops at the first selector that finds an element
            if any(step.find(element) for step in self.selectors):
                matching.append(element)

                # the number of results changes only here, so this is the only
                # place where reaching the limit needs to be checked
                if len(matching) == limit:
                    break

        return matching