Why soupsavvy?

You might wonder: if I can achieve everything I need with BeautifulSoup, why should I bother with soupsavvy on top of it?
Here are some reasons to consider giving it a try!

Encapsulated logic

Instead of selecting from a variety of search methods in BeautifulSoup, soupsavvy offers a streamlined, consistent interface.
The logic is encapsulated in declared selectors, so there’s no need to write nested loops or complex conditionals.

BeautifulSoup

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    """
        <div>
            <span class="event">Event</span>
            <span>party</span>
        </div>
    """,
    features="lxml",
)

# Start from None so `party` is defined even when nothing matches.
party = None

for div in soup.find_all("div"):
    for event in div.find_all(class_="event", recursive=False):
        party = event.find_next_sibling("span", string="party")
        if party is not None:
            break
    # The inner `break` only leaves the inner loop; stop the outer loop as
    # well, otherwise a later <div> could overwrite an already found match.
    if party is not None:
        break
party
<span>party</span>

soupsavvy

from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, PatternSelector, TypeSelector, to_soupsavvy

soup = BeautifulSoup(
    """
        <div>
            <span class="event">Event</span>
            <span>party</span>
        </div>
    """,
    features="lxml",
)
element = to_soupsavvy(soup)
selector = TypeSelector("div") > ClassSelector("event") + (
    TypeSelector("span") & PatternSelector("party")
)
selector.find(element)
SoupElement(<span>party</span>)

Missing elements

In BeautifulSoup, you often have to deal with missing elements before interacting with them, which clutters your code. soupsavvy selectors handle this for you automatically. If you need stricter control, the strict mode raises an exception if the required element isn’t found.

BeautifulSoup

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    """
        <div>
            <span>No event here</span>
            <span>No party</span>
        </div>
    """,
    features="lxml",
)

event = soup.find(class_="event")

if event is not None:
    party = event.find_next_sibling(string="party")
else:
    print("This needs to be handled explicitly every time.")
This needs to be handled explicitly every time.

soupsavvy

from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, PatternSelector, to_soupsavvy
from soupsavvy.exceptions import TagNotFoundException

soup = BeautifulSoup(
    """
        <div>
            <span>No event here</span>
            <span>No party</span>
        </div>
    """,
    features="lxml",
)
element = to_soupsavvy(soup)

selector = ClassSelector("event") + PatternSelector("party")
assert selector.find(element) is None

try:
    selector.find(element, strict=True)
except TagNotFoundException as e:
    print(e)
Tag was not found in markup.

Combining selectors

Combining selectors in BeautifulSoup can be cumbersome, especially when trying to use different methods or perform set operations like unions or intersections. With soupsavvy, logical operators allow you to easily combine selectors without worrying about hash collisions or element order.

BeautifulSoup

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    """
        <p class="special">Festival</p>
        <div>
            <span>Event</span>
            <span>Menu</span>
        </div>
        <div>
            <span>Menu</span>
        </div>
        <div>
            <span>Event</span>
        </div>
        <span>Event</span>
    """,
    features="lxml",
)

result1 = soup.find_all("span", string="Event")
result2 = soup.select(":last-child")
result3 = soup.find_all(class_="special")

# elements with the same text representation have the same hash
# <span>Event</span> is included only once!
# There is no guarantee that the order of the elements will be preserved

(set(result1) & set(result2)) | set(result3)
{<p class="special">Festival</p>, <span>Event</span>}

soupsavvy

from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, PatternSelector, TypeSelector, to_soupsavvy
from soupsavvy.selectors.css import LastChild

soup = BeautifulSoup(
    """
        <p class="special">Festival</p>
        <div>
            <span>Event</span>
            <span>Menu</span>
        </div>
        <div>
            <span>Menu</span>
        </div>
        <div>
            <span>Event</span>
        </div>
        <span>Event</span>
    """,
    features="lxml",
)
element = to_soupsavvy(soup)

selector = (
    PatternSelector("Event") & TypeSelector("span") & LastChild()
) | ClassSelector("special")
selector.find_all(element)
[SoupElement(<p class="special">Festival</p>),
 SoupElement(<span>Event</span>),
 SoupElement(<span>Event</span>)]

Data Pipelines

Often, selecting an element is just the first step; you still need to extract and transform the data afterward. soupsavvy lets you pipe operations directly into selectors, enabling you to transform and extract data seamlessly without additional code.

BeautifulSoup

from datetime import datetime

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    """
        <p>Event</p>
        <span class="date">2023-10-30</span>
        <span class="date">2023-08-31</span>
    """,
    features="lxml",
)

date_elements = soup.find_all(class_="date")
dates = [
    datetime.strptime(element.get_text(strip=True), "%Y-%m-%d")
    for element in date_elements
]
dates
[datetime.datetime(2023, 10, 30, 0, 0), datetime.datetime(2023, 8, 31, 0, 0)]

soupsavvy

from datetime import datetime

from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, to_soupsavvy
from soupsavvy.operations import Operation, Text

soup = BeautifulSoup(
    """
        <p>Event</p>
        <span class="date">2023-10-30</span>
        <span class="date">2023-08-31</span>
    """,
    features="lxml",
)
element = to_soupsavvy(soup)

selector = ClassSelector("date") | Text() | Operation(datetime.strptime, "%Y-%m-%d")
selector.find_all(element)
[datetime.datetime(2023, 10, 30, 0, 0), datetime.datetime(2023, 8, 31, 0, 0)]

Structured information

Extracting structured information in BeautifulSoup often requires repetitive boilerplate code.
With soupsavvy, you can define flexible, reusable data extraction schemas.

BeautifulSoup

from dataclasses import dataclass

from bs4 import BeautifulSoup


@dataclass
class Book:
    """Title and price of a single book scraped from the page."""

    title: str
    # The price is parsed with int(), so the annotation is int rather than float.
    price: int


text = """
    <div class="book">
        <p class="title">Animal Farm</p>
        <p class="price">100$</p>
    </div>
    <div class="book">
        <p class="title">Brave New World  </p>
        <p class="price">80$</p>
    </div>
"""
soup = BeautifulSoup(text, features="lxml")

books = []
book_elements = soup.find_all("div", class_="book")

for book_element in book_elements:
    title = book_element.find(class_="title")

    if title is None:
        raise ValueError("Title not found")

    title = title.get_text(strip=True)

    price = book_element.find(class_="price")

    if price is None:
        raise ValueError("Price not found")

    price = int(price.get_text(strip=True).replace("$", ""))
    book = Book(title, price)
    books.append(book)

books
[Book(title='Animal Farm', price=100), Book(title='Brave New World', price=80)]

soupsavvy

from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, TypeSelector, to_soupsavvy
from soupsavvy.models import BaseModel
from soupsavvy.operations import Operation, Text


class Book(BaseModel):
    # Declarative counterpart of the manual BeautifulSoup loop: every
    # attribute is a selector piped into operations that produce the value.

    # Presumably limits each Book instance to one <div class="book">
    # element -- confirm against the soupsavvy models documentation.
    __scope__ = TypeSelector("div") & ClassSelector("book")

    # Text() is used as-is here; the extracted title keeps the whitespace
    # found in the markup (e.g. 'Brave New World  ').
    title = ClassSelector("title") | Text()
    # Take the price text, drop the "$" sign ("100$" -> "100"), then
    # convert the remaining digits to int.
    price = (
        ClassSelector("price")
        | Text()
        | Operation(lambda x: x.strip("$"))
        | Operation(int)
    )


text = """
    <div class="book">
        <p class="title">Animal Farm</p>
        <p class="price">100$</p>
    </div>
    <div class="book">
        <p class="title">Brave New World  </p>
        <p class="price">80$</p>
    </div>
"""
soup = BeautifulSoup(text, features="lxml")
element = to_soupsavvy(soup)

Book.find_all(element)
[Book(title='Animal Farm', price=100),
 Book(title='Brave New World  ', price=80)]

Conclusion

By using soupsavvy, you not only simplify your code but also gain powerful tools to handle complex selection and extraction tasks with ease.
It’s a great way to keep your web scraping modules clean, concise and less error-prone.

Enjoy soupsavvy and leave us feedback!
Happy scraping!