Source code for soupsavvy.utils.selector_utils

"""
Module with utility classes for selectors, used internally across the package
to ensure consistent and reliable results.

Classes
-------
- `TagIterator` - Wrapper class for iterating over `IElement`.
- `ElementWrapper` - Wrapper class for `IElement` instances.
- `TagResultSet` - Collection that stores and manages results of selection.
"""

from __future__ import annotations

from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from itertools import chain
from typing import Optional

from soupsavvy.interfaces import IElement


@dataclass
class TagIterator:
    """
    Wrapper class for iterating over `IElement` instances.

    The instance is its own iterator: every call to `iter()` restarts
    the iteration from the beginning, and `next()` may be called directly,
    in which case the iteration is started on first use.

    Parameters
    ----------
    tag : IElement
        `IElement` to iterate over.
    recursive : bool, optional
        If True, iterates over all descendants, otherwise only over
        direct children. Default is True.
    include_self : bool, optional
        If True, includes the element itself in iteration (as the first item),
        default is False.
    """

    tag: IElement
    recursive: bool = True
    include_self: bool = False

    def _get_iterator(self) -> Iterator:
        """
        Returns iterator over `IElement` descendants or children
        based on recursive parameter value.
        """
        source = self.tag.descendants if self.recursive else self.tag.children
        return iter(source)

    def __iter__(self) -> TagIterator:
        """
        Resets the iteration to the beginning and returns the instance itself.

        Because the state is stored on the instance, starting a second
        iteration over the same `TagIterator` restarts the first one as well.
        """
        nodes = self._get_iterator()
        # The element itself, when requested, is yielded before its nodes.
        self._iter = chain([self.tag], nodes) if self.include_self else nodes
        return self

    def __next__(self) -> IElement:
        """
        Returns the next `IElement` node.
        If recursive is set to True, iterates over all descendants,
        otherwise only over direct children.

        Raises
        ------
        StopIteration
            When all nodes have been consumed.
        """
        # `_iter` exists only after `__iter__` has run; start the iteration
        # lazily so that a bare `next(TagIterator(tag))` works instead of
        # failing with AttributeError.
        if not hasattr(self, "_iter"):
            self.__iter__()
        return next(self._iter)
@dataclass
class ElementWrapper:
    """
    Thin wrapper around a single `IElement`, used by `TagResultSet`.

    Helper attributes needed by set operations are attached to the wrapper,
    so the wrapped `IElement` instance itself is never modified.
    Two wrappers are considered equal when their hash values are equal,
    and the hash of a wrapper is the hash of the wrapped element.
    """

    element: IElement

    def __hash__(self):
        """Returns hash value of the wrapped `IElement` instance."""
        wrapped = self.element
        return hash(wrapped)

    def __eq__(self, other):
        """
        Compares two wrappers by their hash values.
        Returns `NotImplemented` for objects of any other type.
        """
        if isinstance(other, self.__class__):
            return hash(self) == hash(other)
        return NotImplemented
class TagResultSet:
    """
    `TagResultSet` class is a collection that stores and manages results
    of the find_all method of selectors.

    Prerequisites for returned results are:
    - `IElement` instances are unique
    - the order of results == order of their appearance in html

    This component consumes an optional list of `IElement` instances and
    provides methods for fetching unique results with preserved order.
    It provides operations on sets of results: intersection (`&`),
    union (`|`), difference (`-`) and symmetric difference.
    """

    # Names of helper attributes set on `ElementWrapper` instances:
    # position of the element in its source list ...
    _ORDER_ATTR = "_order"
    # ... and flag (1 or 0) telling whether it comes from the base operand.
    _IS_BASE = "_base"

    def __init__(self, elements: Optional[list[IElement]] = None) -> None:
        """
        Initializes `TagResultSet` instance.

        Parameters
        ----------
        elements : list[IElement], optional
            List of `IElement` instances to store in the collection.
            Default is None, which initializes empty collection.
            The list is stored as passed (not copied) and may contain duplicates.
        """
        self._elements = elements or []

    def fetch(self, n: Optional[int] = None) -> list[IElement]:
        """
        Fetches n first unique `IElement` instances from collection.
        Ensures that the order of the initial list is preserved.

        Parameters
        ----------
        n : int, optional
            Number of `IElement` instances to fetch.
            If default None, fetches all unique instances.

        Returns
        -------
        list[IElement]
            List of `IElement` instances fetched from collection.
        """
        unique = self._to_set(base=True)
        ordered = self._sort(unique)
        return ordered[:n]

    def _to_set(self, base: bool) -> set[ElementWrapper]:
        """
        Converts list of `IElement` from collection to set
        of `ElementWrapper` instances.

        Parameters
        ----------
        base : bool
            If True, sets the element as base, otherwise as non-base.
            If TagResultSet is used in set operations as a base, it should be True.

        Returns
        -------
        set[ElementWrapper]
            Set of ElementWrapper instances with set helper attributes.
        """
        unique: set[ElementWrapper] = set()

        for position, element in enumerate(self._elements):
            wrapper = ElementWrapper(element)

            if wrapper in unique:
                # Duplicate: the first occurrence (and its position) is the one
                # that is kept, independently of how `set` resolves equal items.
                continue

            # setting attributes used for restoring order
            setattr(wrapper, self._ORDER_ATTR, position)
            setattr(wrapper, self._IS_BASE, int(base))
            unique.add(wrapper)

        return unique

    def _sort(self, it: Iterable[ElementWrapper]) -> list[IElement]:
        """
        Sorts an iterable of `ElementWrapper` instances by order and base attributes.

        Parameters
        ----------
        it : Iterable[ElementWrapper]
            Iterable of `ElementWrapper` instances to sort.

        Returns
        -------
        list[IElement]
            List of `IElement` instances sorted by order and base attributes.
        """
        ordered = sorted(
            it,
            key=lambda x: (
                # Sorting by base descending - base goes first
                -getattr(x, self._IS_BASE),
                # Sorting by order ascending
                getattr(x, self._ORDER_ATTR),
            ),
        )
        return [unique.element for unique in ordered]

    def __and__(self, other: TagResultSet) -> TagResultSet:
        """
        Performs an intersection operation on two `TagResultSet` instances
        with current instance as a base, preserving the order of tags
        from the base instance.

        Parameters
        ----------
        other : TagResultSet
            `TagResultSet` instance to perform intersection with.

        Example
        -------
        >>> base = TagResultSet([x, y, b])
        ... other = TagResultSet([c, y, x])
        ... base & other
        TagResultSet([x, y])

        Returns
        -------
        TagResultSet
            New `TagResultSet` instance with results of intersection operation.
            `NotImplemented` is returned if `other` is not a `TagResultSet`,
            which makes the `&` operator raise `TypeError`.
        """
        if not isinstance(other, TagResultSet):
            return NotImplemented

        base = self._to_set(base=True)
        right = other._to_set(base=False)
        # Filtering the base set by hand instead of using set intersection
        # guarantees that the kept wrappers are the base ones; objects taken
        # from the right operand would mess up the order.
        intersection = [obj for obj in base if obj in right]
        ordered = self._sort(intersection)
        return TagResultSet(ordered)

    def __or__(self, other: TagResultSet) -> TagResultSet:
        """
        Performs a union operation on two `TagResultSet` instances
        with current instance list of tags as a base, appending new tags
        from other instance at the end of the list.

        Parameters
        ----------
        other : TagResultSet
            `TagResultSet` instance to perform union with.

        Example
        -------
        >>> base = TagResultSet([x, y, b])
        ... other = TagResultSet([c, y, x])
        ... base | other
        TagResultSet([x, y, b, c])

        Returns
        -------
        TagResultSet
            New `TagResultSet` instance with results of union operation.
            `NotImplemented` is returned if `other` is not a `TagResultSet`,
            which makes the `|` operator raise `TypeError`.
        """
        if not isinstance(other, TagResultSet):
            return NotImplemented

        base = self._to_set(base=True)
        right = other._to_set(base=False)
        # Only wrappers missing from base are added, so elements present
        # in both operands always keep the base wrapper and the base position.
        updated = base | (right - base)
        ordered = self._sort(updated)
        return TagResultSet(ordered)

    def __sub__(self, other: TagResultSet) -> TagResultSet:
        """
        Performs a difference operation on two `TagResultSet` instances
        with current instance as a base, preserving the order of tags
        from the base instance.

        Parameters
        ----------
        other : TagResultSet
            `TagResultSet` instance to perform difference with.

        Example
        -------
        >>> base = TagResultSet([x, y, b])
        ... other = TagResultSet([c, y, x])
        ... base - other
        TagResultSet([b])

        Returns
        -------
        TagResultSet
            New TagResultSet instance with results of difference operation.
            `NotImplemented` is returned if `other` is not a `TagResultSet`,
            which makes the `-` operator raise `TypeError`.
        """
        if not isinstance(other, TagResultSet):
            return NotImplemented

        base = self._to_set(base=True)
        right = other._to_set(base=False)
        difference = base - right
        ordered = self._sort(difference)
        return TagResultSet(ordered)

    def symmetric_difference(self, other: TagResultSet) -> TagResultSet:
        """
        Performs a symmetric difference operation on two `TagResultSet` instances
        with current instance as a base, preserving the order of tags
        from the base instance. Tags found only in the other instance
        are placed after tags found only in the base instance.

        Parameters
        ----------
        other : TagResultSet
            `TagResultSet` instance to perform symmetric difference with.

        Example
        -------
        >>> base = TagResultSet([x, y, b])
        ... other = TagResultSet([c, y, x])
        ... base.symmetric_difference(other)
        TagResultSet([b, c])

        Returns
        -------
        TagResultSet
            New `TagResultSet` instance with results of symmetric
            difference operation.
        """
        base = self._to_set(base=True)
        right = other._to_set(base=False)
        symmetric_diff = base.symmetric_difference(right)
        ordered = self._sort(symmetric_diff)
        return TagResultSet(ordered)

    def __len__(self) -> int:
        """
        Returns the number of `IElement` instances in the collection.
        Counts stored elements as they are, so duplicates are included.
        """
        return len(self._elements)

    def __bool__(self) -> bool:
        """Returns True if collection is not empty, otherwise False."""
        return len(self) > 0