Source code for soupsavvy.selectors.general

"""
Module with miscellaneous selectors.

Classes
-------
- `TypeSelector` - combines type and attribute selectors
- `PatternSelector` - matches elements based on text content and selector
- `UniversalSelector` - universal selector (*)
- `SelfSelector` - matches the element itself
- `ExpressionSelector` - matches elements based on user-defined function
"""

import itertools
from collections.abc import Callable
from dataclasses import dataclass
from typing import Optional, Pattern

from typing_extensions import deprecated

import soupsavvy.selectors.namespace as ns
from soupsavvy.base import SelectableCSS, SoupSelector
from soupsavvy.interfaces import IElement
from soupsavvy.utils.selector_utils import TagIterator


[docs] @dataclass class TypeSelector(SoupSelector, SelectableCSS): """ Selector for finding elements based on tag name (type). Counterpart of css type selectors. Example ------- >>> TypeSelector("div") matches all elements that have "div" tag name. Example ------- >>> <div class="widget">Hello World</div> ✔️ >>> <a href="/shop">Hello World</a> ❌ CSS counterpart can be represented as: Example ------- >>> div And can be retrieved with `css` property. Example ------- >>> TypeSelector("div").css "div" Parameters ---------- name : str Tag name of the element ex. "a", "div". Notes ----- For more information about type selectors, see: https://developer.mozilla.org/en-US/docs/Web/CSS/Type_selectors """ name: str
[docs] def find_all( self, tag: IElement, recursive: bool = True, limit: Optional[int] = None, ) -> list[IElement]: return tag.find_all(name=self.name, recursive=recursive, limit=limit)
@property def css(self) -> str: # css selector for tag name is just the tag name ex. "div" return self.name def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return NotImplemented # TypeSelectors produce the same results if names of the tag are the same return self.name == other.name
[docs] @dataclass class PatternSelector(SoupSelector): """ Selector for finding elements based on text content pattern. Example ------- >>> PatternSelector("Hello World") matches all element with exact text content "Hello World". Example ------- >>> <div>Hello World</div> ✔️ >>> <div>Hello Python</div> ❌ >>> <div>Hello World 3</div> ❌ In case of using regex pattern, `re.search` is used to match the attribute value. Example ------- >>> PatternSelector(re.compile(r"[0-9]+")) matches all elements with text content containing at least one digit. Example ------- >>> <div>Hello World 123</div> ✔️ >>> <div>Hello World</div> ❌ Parameters ---------- pattern: str | Pattern Pattern to match text of the element. Can be a string for exact match or `Pattern` for any more complex regular expressions. Notes ----- Element does not match the pattern if it has any children. Only leaf nodes can be returned by `PatternSelector` find methods. """ pattern: ns.PatternType def __post_init__(self) -> None: """Sets up compiled regex pattern used for find methods.""" self.pattern = ( str(self.pattern) if not isinstance(self.pattern, Pattern) else self.pattern )
[docs] def find_all( self, tag: IElement, recursive: bool = True, limit: Optional[int] = None, ) -> list[IElement]: iterator = TagIterator(tag, recursive=recursive) def _has_children(x: IElement) -> bool: #! As text of the element is concatenated string of all child text nodes, #! it does not make sense to include elements with children in the result. try: next(iter(x.children)) except StopIteration: return False return True filter_ = filter( lambda x: not _has_children(x) and ( self.pattern.search(x.text) if isinstance(self.pattern, Pattern) else x.text == self.pattern ), iterator, ) return list(itertools.islice(filter_, limit))
def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return NotImplemented return self.pattern == other.pattern
[docs] @dataclass class UniversalSelector(SoupSelector, SelectableCSS): """ Selector representing a wildcard pattern, that matches all elements in the html page. Example ------- >>> UniversalSelector() CSS counterpart can be represented as: Example ------- >>> * And can be retrieved with `css` property. Example ------- >>> UniversalSelector().css "*" Notes ----- For more information on universal selector, see: https://developer.mozilla.org/en-US/docs/Web/CSS/Universal_selectors """
[docs] def find_all( self, tag: IElement, recursive: bool = True, limit: Optional[int] = None, ) -> list[IElement]: return tag.find_all(recursive=recursive, limit=limit)
@property def css(self) -> str: """Returns wildcard css selector matching all elements in the markup.""" return ns.CSS_SELECTOR_WILDCARD def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return NotImplemented return True
[docs] @deprecated(f"'AnyTagSelector' is deprecated, use 'UniversalSelector' class instead.") class AnyTagSelector(UniversalSelector): """Alias for `UniversalSelector` class. Deprecated component."""
[docs] class SelfSelector(SoupSelector): """ Selector matching only the element itself. Convenience component that can be used for compatibility. Example ------- >>> SelfSelector() always matches the tag that is passed to the find methods. Notes ----- Can be used in user-defined model for scope if element itself is the scope. """
[docs] def find_all( self, tag: IElement, recursive: bool = True, limit: Optional[int] = None, ) -> list[IElement]: return [tag]
def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return NotImplemented return True
[docs] @dataclass class ExpressionSelector(SoupSelector): """ Selector that matches elements based on a user-defined function (predicate), that is used as filter for element object. Applies predicate to each element and returns those that satisfy the condition. Parameters ---------- f : Callable[[IElement], bool] A user-defined function (predicate) that determines whether the element should be selected. Examples -------- >>> selector = ExpressionSelector(lambda x: x.name not in {"a", "div"}) ... selector.find(soup) To perform operations on underlying node, use `IElement.get()` method or `IElement.node` attribute. Example ------- >>> selector = ExpressionSelector(lambda x: 'widget' in x.node['class']) For `SoupElement` object, that wraps `bs4.Tag`. Notes ----- Any exceptions should be handled inside provided function. If raised, it will be propagated to the caller. """ f: Callable[[IElement], bool]
[docs] def find_all( self, tag: IElement, recursive: bool = True, limit: Optional[int] = None, ) -> list[IElement]: iterator = TagIterator(tag, recursive=recursive) filter_ = filter(self.f, iterator) return list(itertools.islice(filter_, limit))
def __eq__(self, other) -> bool: if not isinstance(other, self.__class__): return NotImplemented return self.f is other.f