# Source code for soupsavvy.operations.selection_pipeline

"""
Module with selection pipeline class.
Pipeline for chaining selector and operation together, used as a bridge between
selecting html elements and processing the data.
"""

from __future__ import annotations

from typing import Any, Optional

from soupsavvy.base import BaseOperation, check_operation, check_tag_searcher
from soupsavvy.interfaces import Comparable, IElement, TagSearcher, TagSearcherType


class SelectionPipeline(TagSearcher, Comparable):
    """
    Class for chaining searcher and operation together.
    Uses searcher to find information in element
    and operation to process the data.

    Example
    -------
    >>> from soupsavvy import TypeSelector
    ... from soupsavvy.operations import Operation, Text
    ... pipeline = TypeSelector("span") | Text()
    ... pipeline.find(soup)
    'information'

    Most common way of creating a pipeline is using the `|` operator
    on selector and operation.
    """

    def __init__(self, selector: TagSearcherType, operation: BaseOperation) -> None:
        """
        Initializes `SelectionPipeline` with selector and operation.

        Both arguments are passed through their respective validation helpers
        and the returned values are what the pipeline stores.

        Parameters
        ----------
        selector : TagSearcher
            Selector used for finding target information in the element.
        operation : BaseOperation
            Operation used for processing the data.

        Raises
        ------
        NotTagSearcherException
            If provided selector is not a valid `TagSearcher` instance.
        NotOperationException
            If provided operation is not a valid `BaseOperation` instance.
        """
        # Keep the values returned by the checkers, not the raw arguments.
        self._selector = check_tag_searcher(selector)
        self._operation = check_operation(operation)

    @property
    def selector(self) -> TagSearcher:
        """
        Returns `TagSearcher` object of this pipeline used for finding
        target information in the element.

        Returns
        -------
        TagSearcher
            TagSearcher object used in this pipeline.
        """
        return self._selector

    @property
    def operation(self) -> BaseOperation:
        """
        Returns `BaseOperation` object of this pipeline used for processing
        the data.

        Returns
        -------
        BaseOperation
            BaseOperation object used in this pipeline.
        """
        return self._operation

    def find(
        self,
        tag: IElement,
        strict: bool = False,
        recursive: bool = True,
    ) -> Any:
        """
        Finds a first element matching selector and processes it with operation.

        Parameters
        ----------
        tag : IElement
            Any `IElement` object to process.
        strict : bool, optional
            If True, enforces results to be found in the element,
            by default False.
        recursive : bool, optional
            Specifies if search should be recursive.
            If set to `False`, only direct children of the element
            will be searched. By default `True`.

        Returns
        -------
        Any
            Result of the operation applied to the found element.

        Raises
        ------
        TagNotFoundException
            If strict parameter is set to `True` and none matching element
            was found.
        FailedOperationExecution
            If operation execution failed on the found element.
        """
        # The selector's result is forwarded to the operation as-is; this
        # method does not inspect it before executing the operation.
        return self.operation.execute(
            self.selector.find(tag, strict=strict, recursive=recursive)
        )

    def find_all(
        self,
        tag: IElement,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[Any]:
        """
        Finds all elements matching selector and processes them with operation.

        Parameters
        ----------
        tag : IElement
            Any `IElement` object to process.
        recursive : bool, optional
            Specifies if search should be recursive.
            If set to `False`, only direct children of the element
            will be searched. By default `True`.
        limit : int, optional
            Specifies maximum number of results to return in a list.
            By default `None`, everything is returned.

        Returns
        -------
        list[Any]
            A list of results, if none found, the list is empty.

        Raises
        ------
        FailedOperationExecution
            If operation execution failed on any of the found elements.
        """
        # `limit` is applied by the selector; the operation then runs once
        # per element the selector returned, in the same order.
        return [
            self.operation.execute(element)
            for element in self.selector.find_all(
                tag, recursive=recursive, limit=limit
            )
        ]

    def __or__(self, x: Any) -> SelectionPipeline:
        """
        Overrides `__or__` method called also by pipe operator '|'.
        Creates new `SelectionPipeline` by extending operations with provided one.

        The current pipeline is left untouched; the new pipeline reuses this
        pipeline's selector and the result of `self.operation | x`.

        Parameters
        ----------
        x : BaseOperation
            `BaseOperation` object used to extend the pipeline.

        Returns
        -------
        SelectionPipeline
            New `SelectionPipeline` with extended operations.

        Raises
        ------
        NotOperationException
            If provided object is not an instance of `BaseOperation`.
        """
        # Validate first so an invalid operand is rejected before combining.
        x = check_operation(x)
        operation = self.operation | x
        return SelectionPipeline(selector=self.selector, operation=operation)

    def __eq__(self, x) -> bool:
        """
        Compares this pipeline with another object.

        Returns `NotImplemented` when `x` is not an instance of this object's
        class; otherwise pipelines are equal only if both selector and
        operation are equal.
        """
        # NOTE(review): no `__hash__` is defined next to `__eq__` in this
        # class, which per the Python data model leaves instances
        # unhashable - confirm this is intended.
        if not isinstance(x, self.__class__):
            return NotImplemented
        return self.selector == x.selector and self.operation == x.operation

    def __repr__(self) -> str:
        """
        Returns the class name followed by the selector and operation
        (formatted with their default f-string conversion) in parentheses.
        """
        return f"{self.__class__.__name__}({self.selector}, {self.operation})"