Source code for soupsavvy.operations.element

"""
Module for operations specific to `IElement` interface.

Classes
-------
- `Text` - Extracts text from element - most common operation.
- `Href` - Extracts href attribute from element.
- `Parent` - Extracts parent of element.

These components are design to be used for processing `IElement` and extracting
desired information. They can be used in combination with selectors.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional, Union, overload

from soupsavvy.base import BaseOperation, OperationSearcherMixin, SoupSelector
from soupsavvy.interfaces import IElement

if TYPE_CHECKING:
    from soupsavvy.operations.general import OperationPipeline
    from soupsavvy.selectors.logical import SelectorList


[docs] class Text(OperationSearcherMixin): """ Operation to extract text from `IElement`. Wrapper of most common operation used in web scraping. Example ------- >>> from soupsavvy.operations import Text ... operation = Text() ... operation.execute(tag) "Extracted text from the tag" Implements `TagSearcher` interface for convenience. It has find methods that can be used to extract text from provided element. Example ------- >>> from soupsavvy.operations import Text ... operation = Text() ... operation.find(tag) "Text" Notes ----- Results of this operation may vary between implementations of `IElement`, as each of them extracts text differently. """ def _execute(self, arg: IElement) -> str: """Extracts text from `IElement`.""" return arg.text def __eq__(self, x: Any) -> bool: if not isinstance(x, self.__class__): return NotImplemented return True def __repr__(self) -> str: return f"{self.__class__.__name__}()"
[docs] class Href(OperationSearcherMixin): """ Operation to extract href attribute from `IElement`. Wrapper of one of the common operation used in web scraping. If href attribute is not present, returns None. Example ------- >>> from soupsavvy.operations import Href ... operation = Href() ... operation.execute(tag) "www.example.com" Implements `TagSearcher` interface for convenience. It has find methods that can be used to extract href from provided element. Example ------- >>> from soupsavvy.operations import Href ... operation = Href() ... operation.find(tag) "www.example.com" """ _ATTRIBUTE_NAME = "href" def _execute(self, arg: IElement) -> Optional[str]: """ Extracts href attribute from `IElement` object. If attribute is not present, returns None. """ return arg.get_attribute(self._ATTRIBUTE_NAME) def __eq__(self, x: Any) -> bool: if not isinstance(x, self.__class__): return NotImplemented return True def __repr__(self) -> str: return f"{self.__class__.__name__}()"
[docs] class Parent(BaseOperation, SoupSelector): """ Operation to extract parent of `IElement`. Example ------- >>> from soupsavvy.operations import Parent ... operation = Parent() ... operation.execute(tag) "<div>...</div>" Implements `SoupSelector` interface for convenience and can be used to extract parent of a provided tag without any conditions. Example ------- >>> from soupsavvy.operations import Parent ... operation = Parent() ... operation.find(tag) "<div>--tag--</div>" `Parent` has `BaseOperation` higher in MRO than `SoupSelector`, so, using pipe operator `|` on `Parent` object will result in `OperationPipeline` instance. Example ------- >>> from soupsavvy.operations import Parent ... operation = Parent() | Parent() ... operation.execute(tag) "<div><div>--tag--</div></div>" Notes ----- If element does not have parent, returns None. """ def _execute(self, arg: IElement) -> Optional[IElement]: """Extracts parent of `IElement`.""" return arg.parent
[docs] def find_all( self, tag: IElement, recursive: bool = True, limit: Optional[int] = None, ) -> list[IElement]: return [self.execute(tag)]
def __eq__(self, x: Any) -> bool: if not isinstance(x, self.__class__): return NotImplemented return True def __repr__(self) -> str: return f"{self.__class__.__name__}()" @overload # type: ignore def __or__(self, other: BaseOperation) -> OperationPipeline: ... @overload def __or__(self, other: SoupSelector) -> SelectorList: ... def __or__( self, other: Union[BaseOperation, SoupSelector] ) -> Union[OperationPipeline, SelectorList]: if isinstance(other, SoupSelector): return SoupSelector.__or__(self, other) return BaseOperation.__or__(self, other)