Source code for soupsavvy.implementation.playwright

from __future__ import annotations

import re
from itertools import islice
from typing import Iterable, Optional, Pattern, Union

from playwright.sync_api import ElementHandle, Page
from typing_extensions import Self

import soupsavvy.exceptions as exc
import soupsavvy.implementation.snippets.js.playwright as js
from soupsavvy.implementation.snippets import css, xpath
from soupsavvy.interfaces import IBrowser, IElement
from soupsavvy.selectors.css.api import PlaywrightCSSApi
from soupsavvy.selectors.xpath.api import PlaywrightXPathApi

_UID_REGEX = re.compile(r'\s*_uid="[^"]*"')



[docs]
class PlaywrightElement(IElement[ElementHandle]):
    """
    Implementation of `IElement` for `playwright` tree.
    Adapter for `playwright` handles, that makes them usable across the library.

    On construction an identifier is obtained for the wrapped handle by
    evaluating `js.ADD_IDENTIFIER_SCRIPT` on it. Equality and hashing of
    `PlaywrightElement` instances are based on that identifier, because
    `playwright` does not guarantee the same identity for handles
    returned from different queries.

    Example
    -------
    >>> from soupsavvy.implementation.playwright import PlaywrightElement
    >>> from playwright.sync_api import sync_playwright
    >>> with sync_playwright() as p:
    ...     browser = p.chromium.launch()
    ...     page = browser.new_page()
    ...     page.goto("https://example.com")
    ...     element = page.query_selector("h1")
    ...     playwright_element = PlaywrightElement(element)
    """

    # Type of the node wrapped by this adapter.
    # NOTE(review): presumably consumed by the `IElement` base class
    # (e.g. for node type validation) - confirm in `soupsavvy.interfaces`.
    _NODE_TYPE = ElementHandle


[docs]
    def __init__(self, node: ElementHandle, *args, **kwargs):
        """
        Wrap a `playwright` handle and assign it an identifier.

        Parameters
        ----------
        node : ElementHandle
            Handle to wrap, forwarded to the base class initializer.
        *args, **kwargs
            Forwarded unchanged to the base class initializer.
        """
        super().__init__(node, *args, **kwargs)

        # Handles coming from separate queries are not guaranteed by playwright
        # to share identity, so the value returned by the identifier script
        # is stored and used for comparing and hashing elements instead.
        identifier = self.node.evaluate(js.ADD_IDENTIFIER_SCRIPT)
        self._id = identifier



[docs]
    def find_all(
        self,
        name: Optional[str] = None,
        attrs: Optional[dict[str, Union[str, Pattern[str]]]] = None,
        recursive: bool = True,
        limit: Optional[int] = None,
    ) -> list[Self]:
        """
        Find elements below this one that match a tag name and attributes.

        String attribute values are passed to `js.FILTER_NODES_SCRIPT`
        as they are. Attributes whose value is a compiled pattern are sent
        to the script as `None` and checked afterwards on the Python side
        with `Pattern.search` against the attribute value of each candidate
        (a missing attribute is treated as an empty string).

        Parameters
        ----------
        name : str, optional
            Tag name forwarded to the filtering script.
        attrs : dict[str, str | Pattern[str]], optional
            Attribute filters, by default no attribute filtering.
        recursive : bool, optional
            Flag forwarded to the filtering script, by default True.
        limit : int, optional
            Maximum number of elements to return, by default no limit.

        Returns
        -------
        list[Self]
            Matching elements, at most `limit` of them.
        """
        attrs = attrs or {}
        patterns = {k: v for k, v in attrs.items() if isinstance(v, Pattern)}
        js_attrs = {k: None if k in patterns else v for k, v in attrs.items()}

        found = self.node.evaluate_handle(
            js.FILTER_NODES_SCRIPT,
            [name, js_attrs, recursive],
        )
        properties = found.get_properties()
        # Each property has its own independent handle, so the handle of the
        # container can be released right away. Without disposing it, the
        # collection would stay referenced in the page until navigation.
        found.dispose()

        candidates = (prop.as_element() for prop in properties.values())
        matched_elements = [e for e in candidates if e is not None]

        def match(element: ElementHandle) -> bool:
            # every pattern must be found in the respective attribute value
            return all(
                pattern.search(element.get_attribute(attr) or "")
                for attr, pattern in patterns.items()
            )

        # filtering and mapping stay lazy, islice stops once limit is reached
        return list(islice(self._map(filter(match, matched_elements)), limit))



[docs]
    def find_subsequent_siblings(self, limit: Optional[int] = None) -> list[Self]:
        """
        Find elements selected by `xpath.FIND_SUBSEQUENT_SIBLINGS_SELECTOR`
        relative to this element.

        Parameters
        ----------
        limit : int, optional
            Maximum number of elements to return, by default no limit.

        Returns
        -------
        list[Self]
            Results of the query passed through `self._map`,
            at most `limit` of them.
        """
        selector = f"xpath={xpath.FIND_SUBSEQUENT_SIBLINGS_SELECTOR}"
        siblings = self.node.query_selector_all(selector)
        wrapped = self._map(siblings)
        return list(islice(wrapped, limit))



[docs]
    def find_ancestors(self, limit: Optional[int] = None) -> list[Self]:
        """
        Find ancestors of this element.

        The lookup is done by `js.FIND_ANCESTORS_SCRIPT`, which receives
        `limit` as its argument. Properties of the returned handle
        that are elements are wrapped and returned.

        Parameters
        ----------
        limit : int, optional
            Value forwarded to the script, by default None.
            NOTE(review): presumably caps the number of ancestors collected
            by the script - confirm against the script source.

        Returns
        -------
        list[Self]
            Element properties of the script result passed through `self._map`.
        """
        js_handle = self.node.evaluate_handle(
            js.FIND_ANCESTORS_SCRIPT,
            limit,
        )
        properties = js_handle.get_properties()
        # Each property has its own independent handle, so the handle of the
        # container can be released right away. Without disposing it, the
        # collection would stay referenced in the page until navigation.
        js_handle.dispose()

        candidates = (prop.as_element() for prop in properties.values())
        ancestors = [element for element in candidates if element is not None]
        return list(self._map(ancestors))


    @property
    def children(self) -> Iterable[Self]:
        """
        Elements selected by `xpath.FIND_ALL_CHILDREN_SELECTOR` relative
        to this element, passed through `self._map`.
        """
        selector = f"xpath={xpath.FIND_ALL_CHILDREN_SELECTOR}"
        return self._map(self.node.query_selector_all(selector))

    @property
    def descendants(self) -> Iterable[Self]:
        """
        Elements selected by `css.FIND_ALL_DESCENDANTS_SELECTOR` relative
        to this element, passed through `self._map`.
        """
        handles = self.node.query_selector_all(css.FIND_ALL_DESCENDANTS_SELECTOR)
        wrapped = self._map(handles)
        return wrapped

    @property
    def parent(self) -> Optional[Self]:
        """
        Element returned by `js.PARENT_ELEMENT_SCRIPT` for this node,
        or None if the script result is not an element.
        """
        result = self.node.evaluate_handle(js.PARENT_ELEMENT_SCRIPT)
        parent_handle = result.as_element()
        return None if parent_handle is None else self.from_node(parent_handle)


[docs]
    def get_attribute(self, name: str) -> Optional[str]:
        """
        Get value of the attribute with given name.

        The value returned by `js.GET_ATTRIBUTE_SCRIPT` (live JS property)
        takes precedence; only when it is None, the html attribute
        of the node is returned instead.

        Parameters
        ----------
        name : str
            Name of the attribute.

        Returns
        -------
        str, optional
            Value of the attribute, None if neither lookup yields a value.
        """
        live_value = self.node.evaluate(js.GET_ATTRIBUTE_SCRIPT, name)

        if live_value is None:
            # no live property, fall back to attribute from the markup
            return self.node.get_attribute(name)

        return live_value


    @property
    def name(self) -> str:
        """Lower-cased result of `js.TAG_NAME_SCRIPT` evaluated on the node."""
        tag_name = self.node.evaluate(js.TAG_NAME_SCRIPT)
        return tag_name.lower()

    def __str__(self) -> str:
        """
        Return the result of `js.OUTER_HTML_SCRIPT` for the node
        with every `_uid="..."` attribute (and whitespace before it) removed.
        """
        return _UID_REGEX.sub("", self.node.evaluate(js.OUTER_HTML_SCRIPT))

    @property
    def text(self) -> str:
        """Text content of the node, empty string if there is none."""
        content = self.node.text_content()
        return content if content else ""


[docs]
    def css(self, selector: str):
        """
        Create `PlaywrightCSSApi` for given css selector.

        Parameters
        ----------
        selector : str
            CSS selector passed to `PlaywrightCSSApi`.
        """
        api = PlaywrightCSSApi(selector)
        return api



[docs]
    def xpath(self, selector: str):
        """
        Create `PlaywrightXPathApi` for given xpath selector.

        Parameters
        ----------
        selector : str
            XPath selector passed to `PlaywrightXPathApi`.
        """
        api = PlaywrightXPathApi(selector)
        return api


    def __hash__(self) -> int:
        """Hash of the element identifier combined with the element class."""
        key = (self._id, self.__class__)
        return hash(key)

    def __eq__(self, other):
        """
        Compare elements by their identifiers.
        Returns `NotImplemented` if `other` is not an instance
        of the class of this element.
        """
        if isinstance(other, self.__class__):
            return self._id == other._id

        return NotImplemented




[docs]
class PlaywrightBrowser(IBrowser[Page, PlaywrightElement]):
    """
    Implementation of `IBrowser` for `playwright` Page.
    Adapter for Playwright's `Page` object, allowing unified use across soupsavvy.

    Navigation, closing and reading current url are delegated to the wrapped
    page, available as `self.browser`. Elements passed to and returned from
    its methods are `PlaywrightElement` instances.

    Example
    -------
    >>> from playwright.sync_api import sync_playwright
    >>> from soupsavvy.implementation.playwright import PlaywrightBrowser
    ...
    >>> with sync_playwright() as p:
    ...     browser = p.chromium.launch()
    ...     page = browser.new_page()
    ...     pw_browser = PlaywrightBrowser(page)
    ...     pw_browser.navigate("https://example.com")
    """


[docs]
    def navigate(self, url: str) -> None:
        """
        Navigate the wrapped page to given url.

        Parameters
        ----------
        url : str
            Address passed to `goto` of the page.
        """
        page = self.browser
        page.goto(url)



[docs]
    def click(self, element: PlaywrightElement) -> None:
        """
        Click the element by evaluating `js.CLICK_ELEMENT_SCRIPT` in the page
        with the handle of the element as its argument.

        Parameters
        ----------
        element : PlaywrightElement
            Element to click.
        """
        target = element.node
        self.browser.evaluate(js.CLICK_ELEMENT_SCRIPT, target)



[docs]
    def send_keys(
        self, element: PlaywrightElement, value: str, clear: bool = True
    ) -> None:
        """
        Type text into the element.

        Parameters
        ----------
        element : PlaywrightElement
            Element that receives the text.
        value : str
            Text to type into the element.
        clear : bool, optional
            If True, the element is filled with an empty string
            before typing, by default True.
        """
        handle = element.node

        if clear:
            # start from empty input
            handle.fill("")

        handle.type(value)



[docs]
    def get_document(self) -> PlaywrightElement:
        """
        Get the `html` element of the current page.

        Returns
        -------
        PlaywrightElement
            Wrapped result of querying the page for `html` selector.

        Raises
        ------
        TagNotFoundException
            If the query for `html` selector returns None.
        """
        root = self.browser.query_selector("html")

        if root is not None:
            return PlaywrightElement(root)

        raise exc.TagNotFoundException("Could not find <html> element on the page.")



[docs]
    def close(self) -> None:
        """Close the wrapped page by calling its `close` method."""
        page = self.browser
        page.close()



[docs]
    def get_current_url(self) -> str:
        """Return value of the `url` attribute of the wrapped page."""
        current_url = self.browser.url
        return current_url