Python playwright unable to access elements

Question:

I want to scrape the words that reside in the <li> elements, but the result is an empty list. Do they reside within a frame? As far as I can see, they are not inside any <iframe></iframe> elements. If they are, how do you access the frame, or find the frame id in this case? Here is the site and the code:

from playwright.sync_api import sync_playwright, expect


def test_fetch_paperrater():
    path = r"https://www.paperrater.com/page/lists-of-adjectives"
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch()
        page = browser.new_page()
        page.goto(path)
        texts = page.locator("div#header-container article.page ul li").all_inner_texts()
        print(texts)
        browser.close()
Asked By: Andrea


Answers:

There are no iframes on the page: document.querySelector("iframe") returns null when run in the browser console. But that’s often a good line of thought for debugging failing locators.
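
If you want to rule frames in or out with Playwright itself rather than the browser console, here's a minimal sketch (on this page only the main frame should show up):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://www.paperrater.com/page/lists-of-adjectives")
    # page.frames lists the main frame plus any child frames
    for frame in page.frames:
        print(frame.name, frame.url)
    browser.close()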

There are many different lists on the page, but none of them are inside the header. article.page is also not in the header. I’m not sure which list you want, but this gets all of them along with their accompanying header:

from playwright.sync_api import sync_playwright # 1.23.1


url = "<Your URL>"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context(java_script_enabled=False)
    page = context.new_page()  # create the page from the context so JS is actually disabled

    def handle_route(route):
        # allow only the main document; abort images, scripts, styles, etc.
        if route.request.url == url:
            route.continue_()
        else:
            route.abort()

    page.route("**/*", handle_route)
    page.goto(url, wait_until="domcontentloaded")
    lists = (
        page.locator(".content h2.doc-h2")
            .evaluate_all("""
              els => 
                els.map(e => ({
                  header: e.textContent,
                  items: [...e.nextElementSibling.querySelectorAll("li")]
                    .map(e => e.textContent)
                }))
            """)
    )

    for lst in lists:
        print(lst)

Note that the data is available statically in the HTML, so I’ve disabled JS and blocked every request except the page itself (images, scripts, stylesheets and so on).
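
As an aside, if you'd rather keep JS enabled and block only heavyweight resources instead of whitelisting the one document URL, you can branch on the request's resource_type; the set of blocked types below is just an example:

BLOCKED_TYPES = {"image", "media", "font", "stylesheet"}

def handle_route(route):
    # abort by resource type rather than by URL
    if route.request.resource_type in BLOCKED_TYPES:
        route.abort()
    else:
        route.continue_()

page.route("**/*", handle_route)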

Once you get to that point, you may not even need Playwright. We could speed things up by using requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup

url = "<Your URL>"
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
lists = []

for x in soup.select(".content h2.doc-h2"):
    lists.append({
        "header": x.text.strip(),
        "items": [x.text.strip() for x in x.find_next("ul").select("li")],
    })

for lst in lists:
    print(lst)

On the other hand, if you just want one list:

(Playwright)

# ...
    page.goto(url, wait_until="domcontentloaded")
    lst = (
        page.locator(".content h2.doc-h2", has_text="Appearance adjectives list")
            .evaluate("""
              el => [...el.nextElementSibling.querySelectorAll("li")]
                .map(e => e.textContent)
            """)
    )
    print(lst)

(BeautifulSoup)

import re
import requests
from bs4 import BeautifulSoup

url = "<Your URL>"
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
h = soup.find(string=re.compile(r"^\s*Appearance adjectives list\s*$"))
lst = [x.text.strip() for x in h.find_next("ul").select("li")]
print(lst)

If you want just the lists of adjectives, the shortest change for either the Playwright or the BeautifulSoup version is a slice that grabs every other list and trims the "trends" lists at the tail:

for lst in lists[0:-10:2]:
    print(lst)

If this seems too hard-coded for this particular page, you could filter for headers that end with " list":

(Playwright)

lists = (
    page.locator(".content h2.doc-h2")
        .evaluate_all("""
          els => els
            .filter(e => e.textContent.trim().endsWith(" list"))
            .map(e => ({
              header: e.textContent,
              items: [...e.nextElementSibling.querySelectorAll("li")]
                .map(e => e.textContent)
            }))
        """)
)

(BeautifulSoup)

for x in soup.select(".content h2.doc-h2"):
    if x.text.strip().endswith(" list"):
        lists.append({
            "header": x.text.strip(),
            "items": [x.text.strip() for x in x.find_next("ul").select("li")]
        })

Or filter for lists that are nonempty, and where all items are exactly one word:

(Playwright)

lists = (
    page.locator(".content h2.doc-h2")
        .evaluate_all("""
          els => els
            .map(e => ({
              header: e.textContent.trim(),
              items: [...e.nextElementSibling.querySelectorAll("li")]
                .map(e => e.textContent.trim())
            }))
            .filter(e =>
              e.items.length &&
              e.items.every(w => w.split(/\s+/).length === 1)
            )
        """)
)

(BeautifulSoup)

for x in soup.select(".content h2.doc-h2"):
    items = [li.text.strip() for li in x.find_next("ul").select("li")]

    if items and all(len(item.split()) == 1 for item in items):
        lists.append({
            "header": x.text.strip(),
            "items": items
        })
Answered By: ggorlen

The elements were not in div#header-container but in div#wrapper. There were multiple ul elements, and the best way to access these was with nth(), as follows:

from playwright.sync_api import sync_playwright

path = "https://www.paperrater.com/page/lists-of-adjectives"

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto(path)
    words = []
    # the adjective lists are the odd-indexed <ul> elements
    for i in range(1, 22, 2):
        all_texts = page.locator("div#wrapper article.page ul").nth(i).all_inner_texts()
        texts = all_texts[0].split("\n")
        for text in texts:
            words.append(text)
    browser.close()
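
For what it's worth, if you don't need to skip the even-indexed lists, calling all_inner_texts() on the li elements directly avoids the nth() loop and the newline splitting; a one-line sketch (note it collects every list on the page):

words = page.locator("div#wrapper article.page ul li").all_inner_texts()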
Answered By: Andrea