Source code for soupsavvy.implementation.playwright

from __future__ import annotations

import re
from itertools import islice
from typing import Iterable, Optional, Pattern, Union

from playwright.sync_api import ElementHandle, Page
from typing_extensions import Self

import soupsavvy.exceptions as exc
import soupsavvy.implementation.snippets.js.playwright as js
from soupsavvy.implementation.snippets import css, xpath
from soupsavvy.interfaces import IBrowser, IElement
from soupsavvy.selectors.css.api import PlaywrightCSSApi
from soupsavvy.selectors.xpath.api import PlaywrightXPathApi

# Matches a `_uid="..."` attribute together with any whitespace preceding it.
# `PlaywrightElement.__str__` uses it to strip that attribute from the
# HTML string it returns.
# NOTE(review): presumably `_uid` is the attribute written by
# `js.ADD_IDENTIFIER_SCRIPT` in `PlaywrightElement.__init__` - confirm.
_UID_REGEX = re.compile(r'\s*_uid="[^"]*"')


class PlaywrightElement(IElement[ElementHandle]):
    """
    `IElement` adapter around a `playwright` `ElementHandle`.

    Wraps `playwright` handles so that they can be used across the library.

    Example
    -------
    >>> from soupsavvy.implementation.playwright import PlaywrightElement
    >>> from playwright.sync_api import sync_playwright

    >>> with sync_playwright() as p:
    ...     browser = p.chromium.launch()
    ...     page = browser.new_page()
    ...     page.goto("https://example.com")
    ...     element = page.query_selector("h1")
    ...     playwright_element = PlaywrightElement(element)
    """

    _NODE_TYPE = ElementHandle

    def __init__(self, node: ElementHandle, *args, **kwargs):
        """
        Wrap `node` and store the identifier used for equality and hashing.

        Parameters
        ----------
        node : ElementHandle
            Handle to wrap; forwarded to the base class initializer
            together with any extra positional and keyword arguments.
        """
        super().__init__(node, *args, **kwargs)
        # Handles returned by separate queries are not guaranteed to share
        # identity in playwright, so the value returned by the identifier
        # script is kept and compared instead.
        self._id = self.node.evaluate(js.ADD_IDENTIFIER_SCRIPT)

    def find_all(
        self,
        name: Optional[str] = None,
        attrs: Optional[dict[str, Union[str, Pattern[str]]]] = None,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[Self]:
        """
        Return wrapped elements selected by `js.FILTER_NODES_SCRIPT`
        that also satisfy every regex attribute criterion.

        Parameters
        ----------
        name : str, optional
            Passed to the filter script as the first argument.
        attrs : dict[str, str | Pattern[str]], optional
            Attribute criteria. String values are passed to the filter script
            unchanged; compiled patterns are passed as `None` and checked
            in Python with `Pattern.search`.
        recursive : bool
            Passed to the filter script as the third argument.
        limit : int, optional
            Maximum number of results returned; `None` means no cap.

        Returns
        -------
        list[Self]
            At most `limit` matching elements.
        """
        criteria = attrs or {}

        # Compiled patterns cannot be handed to the script, so they are
        # replaced with None there and evaluated on the Python side below.
        script_attrs = {
            key: (value if not isinstance(value, Pattern) else None)
            for key, value in criteria.items()
        }
        regex_criteria = [
            (key, value)
            for key, value in criteria.items()
            if isinstance(value, Pattern)
        ]

        result_handle = self.node.evaluate_handle(
            js.FILTER_NODES_SCRIPT,
            [name, script_attrs, recursive],
        )
        as_elements = (
            prop.as_element() for prop in result_handle.get_properties().values()
        )
        candidates = [handle for handle in as_elements if handle is not None]

        def satisfies_patterns(handle: ElementHandle) -> bool:
            # A missing attribute is searched as an empty string.
            return all(
                pattern.search(handle.get_attribute(key) or "")
                for key, pattern in regex_criteria
            )

        accepted = (handle for handle in candidates if satisfies_patterns(handle))
        return list(islice(self._map(accepted), limit))

    def find_subsequent_siblings(self, limit: Optional[int] = None) -> list[Self]:
        """
        Return wrapped elements matched by the subsequent-siblings xpath
        selector, capped at `limit` items (`None` means no cap).
        """
        selector = f"xpath={xpath.FIND_SUBSEQUENT_SIBLINGS_SELECTOR}"
        siblings = self.node.query_selector_all(selector)
        return list(islice(self._map(siblings), limit))

    def find_ancestors(self, limit: Optional[int] = None) -> list[Self]:
        """
        Return wrapped elements produced by `js.FIND_ANCESTORS_SCRIPT`.

        `limit` is forwarded to the script as its argument; the result
        is not truncated again in Python.
        """
        result_handle = self.node.evaluate_handle(
            js.FIND_ANCESTORS_SCRIPT,
            limit,
        )
        as_elements = (
            prop.as_element() for prop in result_handle.get_properties().values()
        )
        found = [handle for handle in as_elements if handle is not None]
        return list(self._map(found))

    @property
    def children(self) -> Iterable[Self]:
        """Wrapped elements matched by the all-children xpath selector."""
        selector = f"xpath={xpath.FIND_ALL_CHILDREN_SELECTOR}"
        return self._map(self.node.query_selector_all(selector))

    @property
    def descendants(self) -> Iterable[Self]:
        """Wrapped elements matched by the all-descendants css selector."""
        matched = self.node.query_selector_all(css.FIND_ALL_DESCENDANTS_SELECTOR)
        return self._map(matched)

    @property
    def parent(self) -> Optional[Self]:
        """
        Wrapped result of `js.PARENT_ELEMENT_SCRIPT`, or `None` when
        the handle it returns is not an element.
        """
        parent_handle = self.node.evaluate_handle(js.PARENT_ELEMENT_SCRIPT)
        parent_element = parent_handle.as_element()
        if parent_element is not None:
            return self.from_node(parent_element)
        return None

    def get_attribute(self, name: str) -> Optional[str]:
        """
        Return the value for `name`, preferring the live JS property.

        The result of `js.GET_ATTRIBUTE_SCRIPT` is returned when it is not
        `None`; otherwise the handle's html attribute is returned.
        """
        live_value = self.node.evaluate(js.GET_ATTRIBUTE_SCRIPT, name)
        if live_value is None:
            # No live property value, fall back to the html attribute.
            return self.node.get_attribute(name)
        return live_value

    @property
    def name(self) -> str:
        """Lower-cased result of `js.TAG_NAME_SCRIPT`."""
        tag_name = self.node.evaluate(js.TAG_NAME_SCRIPT)
        return tag_name.lower()

    def __str__(self) -> str:
        """
        Return the result of `js.OUTER_HTML_SCRIPT` with every `_UID_REGEX`
        match removed.
        """
        outer_html = self.node.evaluate(js.OUTER_HTML_SCRIPT)
        return _UID_REGEX.sub("", outer_html)

    @property
    def text(self) -> str:
        """Text content of the node, or an empty string when it is falsy."""
        content = self.node.text_content()
        return content or ""

    def css(self, selector: str):
        """Return a `PlaywrightCSSApi` built from `selector`."""
        return PlaywrightCSSApi(selector)

    def xpath(self, selector: str):
        """Return a `PlaywrightXPathApi` built from `selector`."""
        return PlaywrightXPathApi(selector)

    def __hash__(self) -> int:
        """Hash of the stored identifier paired with the concrete class."""
        return hash((self._id, self.__class__))

    def __eq__(self, other):
        """
        Compare stored identifiers; `NotImplemented` is returned when `other`
        is not an instance of this object's class.
        """
        if isinstance(other, self.__class__):
            return self._id == other._id
        return NotImplemented
class PlaywrightBrowser(IBrowser[Page, PlaywrightElement]):
    """
    `IBrowser` adapter around a `playwright` `Page`.

    Wraps Playwright's `Page` object so that it can be used
    in a unified way across soupsavvy.

    Example
    -------
    >>> from playwright.sync_api import sync_playwright
    >>> from soupsavvy.implementation.playwright import PlaywrightBrowser
    ...
    >>> with sync_playwright() as p:
    ...     browser = p.chromium.launch()
    ...     page = browser.new_page()
    ...     pw_browser = PlaywrightBrowser(page)
    ...     pw_browser.navigate("https://example.com")
    """

    def navigate(self, url: str) -> None:
        """Call `goto` on the wrapped page with `url`."""
        page = self.browser
        page.goto(url)

    def click(self, element: PlaywrightElement) -> None:
        """Evaluate `js.CLICK_ELEMENT_SCRIPT` on the page with the element's node."""
        target = element.node
        self.browser.evaluate(js.CLICK_ELEMENT_SCRIPT, target)

    def send_keys(
        self, element: PlaywrightElement, value: str, clear: bool = True
    ) -> None:
        """
        Type `value` into the element's node.

        Parameters
        ----------
        element : PlaywrightElement
            Element whose node receives the input.
        value : str
            Text passed to the node's `type` method.
        clear : bool
            When true, the node is filled with an empty string before typing.
        """
        target = element.node
        if clear:
            target.fill("")
        target.type(value)

    def get_document(self) -> PlaywrightElement:
        """
        Return the page's `html` element wrapped in `PlaywrightElement`.

        Raises
        ------
        TagNotFoundException
            If querying the page for `html` yields `None`.
        """
        root = self.browser.query_selector("html")
        if root is not None:
            return PlaywrightElement(root)
        raise exc.TagNotFoundException("Could not find <html> element on the page.")

    def close(self) -> None:
        """Call `close` on the wrapped page."""
        page = self.browser
        page.close()

    def get_current_url(self) -> str:
        """Return the `url` attribute of the wrapped page."""
        page = self.browser
        return page.url