Python playwright unable to access elements
Question:
I want to scrape the words which reside in the <li>
elements. The results return an empty list. Do they reside within a frame? As far as I can see they are not within any <iframe></iframe>
elements. If they do, how do you access the frame or find the frame id in this case? Here is the site and the code:
from playwright.sync_api import sync_playwright, expect


def test_fetch_paperrater():
    """Attempt to scrape the <li> words (this selector matches nothing)."""
    path = r"https://www.paperrater.com/page/lists-of-adjectives"
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch()
        page = browser.new_page()
        page.goto(path)
        # NOTE: the lists are not inside div#header-container, which is why
        # this returns an empty list -- that is the bug the question is about.
        texts = page.locator("div#header-container article.page ul li").all_inner_texts()
        print(texts)
        browser.close()
Answers:
There are no iframes on the page: document.querySelector("iframe")
returns null when run in the browser console. But that’s often a good line of thought for debugging failing locators.
There are many different lists on the page, but none of them are inside the header. article.page
is also not in the header. I’m not sure which list you want, but this gets all of them along with their accompanying header:
from playwright.sync_api import sync_playwright  # 1.23.1

url = "<Your URL>"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    # The data is static HTML, so JS can be disabled for speed.
    context = browser.new_context(java_script_enabled=False)
    # BUG FIX: the page must be created from the JS-disabled context;
    # `browser.new_page()` would silently ignore the context above.
    page = context.new_page()

    def handle_route(route):
        # Allow only the document request itself; abort images, CSS, etc.
        route.continue_() if route.request.url == url else route.abort()

    page.route("**/*", handle_route)
    page.goto(url, wait_until="domcontentloaded")
    # Each h2.doc-h2 heading is immediately followed by its <ul> of items.
    lists = (
        page.locator(".content h2.doc-h2")
        .evaluate_all("""
          els =>
            els.map(e => ({
              header: e.textContent,
              items: [...e.nextElementSibling.querySelectorAll("li")]
                .map(e => e.textContent)
            }))
        """)
    )
    for lst in lists:
        print(lst)
Note that the data is available statically in the HTML, so I’ve blocked images and disabled JS.
Once you get to that point, you may not even need Playwright. We could speed things up by using requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup

url = "<Your URL>"
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")

# Pair each h2.doc-h2 heading with the items of the <ul> that follows it.
lists = []
for heading in soup.select(".content h2.doc-h2"):
    lists.append({
        "header": heading.text.strip(),
        "items": [li.text.strip() for li in heading.find_next("ul").select("li")],
    })

for lst in lists:
    print(lst)
On the other hand, if you just want one list:
(Playwright)
# ...
page.goto(url, wait_until="domcontentloaded")
# Target a single list by its <h2> heading text, then collect the items
# of the <ul> that immediately follows the heading.
lst = (
    page.locator(".content h2.doc-h2", has_text="Appearance adjectives list")
    .evaluate("""
      el => [...el.nextElementSibling.querySelectorAll("li")]
        .map(e => e.textContent)
    """)
)
print(lst)
(BeautifulSoup)
import re

import requests
from bs4 import BeautifulSoup

url = "<Your URL>"
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")

# BUG FIX: the backslashes were lost in the scraped original
# (`^s*Appearance adjectives lists*$` would match a literal "s");
# `\s*` tolerates surrounding whitespace in the heading text.
h = soup.find(text=re.compile(r"^\s*Appearance adjectives list\s*$"))
lst = [li.text.strip() for li in h.find_next("ul").select("li")]
print(lst)
If you want just the lists of adjectives, the shortest change for either Playwright or BeautifulSoup is to use a slice to grab odd indexes and trim the "trends" lists at the tail:
# Odd positions hold the adjective lists; the last 10 entries are the
# "trends" lists at the tail of the page.
for lst in lists[0:-10:2]:
    print(lst)
If this seems too hardcoded to the particular page, you could filter by headers that end with " list"
:
(Playwright)
# Keep only the lists whose heading ends with " list".
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all("""
      els => els
        .filter(e => e.textContent.trim().endsWith(" list"))
        .map(e => ({
          header: e.textContent,
          items: [...e.nextElementSibling.querySelectorAll("li")]
            .map(e => e.textContent)
        }))
    """)
)
(BeautifulSoup)
# Keep only the lists whose heading ends with " list".
for heading in soup.select(".content h2.doc-h2"):
    if heading.text.strip().endswith(" list"):
        lists.append({
            "header": heading.text.strip(),
            "items": [li.text.strip() for li in heading.find_next("ul").select("li")],
        })
Or filter for lists that are nonempty, and where all items are exactly one word:
(Playwright)
# Keep only nonempty lists where every item is a single word.
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all("""
      els => els
        .map(e => ({
          header: e.textContent.trim(),
          items: [...e.nextElementSibling.querySelectorAll("li")]
            .map(e => e.textContent.trim())
        }))
        .filter(e =>
          e.items.length &&
          e.items.every(e => e.split(/\\s+/).length === 1)
        )
    """)
)
# BUG FIX: the JS regex lost its backslash in the scraped original
# (`/s+/` splits on the letter "s"); `\s+` splits on whitespace, which is
# what the one-word check requires.
(BeautifulSoup)
# Keep only nonempty lists where every item is a single word.
for heading in soup.select(".content h2.doc-h2"):
    items = [li.text.strip() for li in heading.find_next("ul").select("li")]
    if items and all(len(item.split()) == 1 for item in items):
        lists.append({
            "header": heading.text.strip(),
            "items": items,
        })
The elements were not in div#header-container
but div#wrapper
. There were multiple ul
elements and the best way to access these was with nth()
as follows
with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto(path)
    words = []
    # Odd-indexed <ul> elements under div#wrapper hold the adjective lists.
    for i in range(1, 22, 2):
        all_texts = page.locator("div#wrapper article.page ul").nth(i).all_inner_texts()
        # BUG FIX: split on a newline ("\n"), not the letter "n" -- the
        # backslash was lost in the scraped original. Also, list.append
        # returns None, so the original `append = words.append(text)`
        # assignment was pointless; extend() adds all items directly.
        words.extend(all_texts[0].split("\n"))
    browser.close()
I want to scrape the words which reside in the <li>
elements. The results return an empty list. Do they reside within a frame? As far as I can see they are not within any <iframe></iframe>
elements. If they do, how do you access the frame or find the frame id in this case? Here is the site and the code:
from playwright.sync_api import sync_playwright, expect


def test_fetch_paperrater():
    """Attempt to scrape the <li> words (this selector matches nothing)."""
    path = r"https://www.paperrater.com/page/lists-of-adjectives"
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch()
        page = browser.new_page()
        page.goto(path)
        # NOTE: the lists are not inside div#header-container, which is why
        # this returns an empty list -- that is the bug the question is about.
        texts = page.locator("div#header-container article.page ul li").all_inner_texts()
        print(texts)
        browser.close()
There are no iframes on the page: document.querySelector("iframe")
returns null when run in the browser console. But that’s often a good line of thought for debugging failing locators.
There are many different lists on the page, but none of them are inside the header. article.page
is also not in the header. I’m not sure which list you want, but this gets all of them along with their accompanying header:
from playwright.sync_api import sync_playwright  # 1.23.1

url = "<Your URL>"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    # The data is static HTML, so JS can be disabled for speed.
    context = browser.new_context(java_script_enabled=False)
    # BUG FIX: the page must be created from the JS-disabled context;
    # `browser.new_page()` would silently ignore the context above.
    page = context.new_page()

    def handle_route(route):
        # Allow only the document request itself; abort images, CSS, etc.
        route.continue_() if route.request.url == url else route.abort()

    page.route("**/*", handle_route)
    page.goto(url, wait_until="domcontentloaded")
    # Each h2.doc-h2 heading is immediately followed by its <ul> of items.
    lists = (
        page.locator(".content h2.doc-h2")
        .evaluate_all("""
          els =>
            els.map(e => ({
              header: e.textContent,
              items: [...e.nextElementSibling.querySelectorAll("li")]
                .map(e => e.textContent)
            }))
        """)
    )
    for lst in lists:
        print(lst)
Note that the data is available statically in the HTML, so I’ve blocked images and disabled JS.
Once you get to that point, you may not even need Playwright. We could speed things up by using requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup

url = "<Your URL>"
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")

# Pair each h2.doc-h2 heading with the items of the <ul> that follows it.
lists = []
for heading in soup.select(".content h2.doc-h2"):
    lists.append({
        "header": heading.text.strip(),
        "items": [li.text.strip() for li in heading.find_next("ul").select("li")],
    })

for lst in lists:
    print(lst)
On the other hand, if you just want one list:
(Playwright)
# ...
page.goto(url, wait_until="domcontentloaded")
# Target a single list by its <h2> heading text, then collect the items
# of the <ul> that immediately follows the heading.
lst = (
    page.locator(".content h2.doc-h2", has_text="Appearance adjectives list")
    .evaluate("""
      el => [...el.nextElementSibling.querySelectorAll("li")]
        .map(e => e.textContent)
    """)
)
print(lst)
(BeautifulSoup)
import re

import requests
from bs4 import BeautifulSoup

url = "<Your URL>"
res = requests.get(url)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")

# BUG FIX: the backslashes were lost in the scraped original
# (`^s*Appearance adjectives lists*$` would match a literal "s");
# `\s*` tolerates surrounding whitespace in the heading text.
h = soup.find(text=re.compile(r"^\s*Appearance adjectives list\s*$"))
lst = [li.text.strip() for li in h.find_next("ul").select("li")]
print(lst)
If you want just the lists of adjectives, the shortest change for either Playwright or BeautifulSoup is to use a slice to grab odd indexes and trim the "trends" lists at the tail:
# Odd positions hold the adjective lists; the last 10 entries are the
# "trends" lists at the tail of the page.
for lst in lists[0:-10:2]:
    print(lst)
If this seems too hardcoded to the particular page, you could filter by headers that end with " list"
:
(Playwright)
# Keep only the lists whose heading ends with " list".
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all("""
      els => els
        .filter(e => e.textContent.trim().endsWith(" list"))
        .map(e => ({
          header: e.textContent,
          items: [...e.nextElementSibling.querySelectorAll("li")]
            .map(e => e.textContent)
        }))
    """)
)
(BeautifulSoup)
# Keep only the lists whose heading ends with " list".
for heading in soup.select(".content h2.doc-h2"):
    if heading.text.strip().endswith(" list"):
        lists.append({
            "header": heading.text.strip(),
            "items": [li.text.strip() for li in heading.find_next("ul").select("li")],
        })
Or filter for lists that are nonempty, and where all items are exactly one word:
(Playwright)
# Keep only nonempty lists where every item is a single word.
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all("""
      els => els
        .map(e => ({
          header: e.textContent.trim(),
          items: [...e.nextElementSibling.querySelectorAll("li")]
            .map(e => e.textContent.trim())
        }))
        .filter(e =>
          e.items.length &&
          e.items.every(e => e.split(/\\s+/).length === 1)
        )
    """)
)
# BUG FIX: the JS regex lost its backslash in the scraped original
# (`/s+/` splits on the letter "s"); `\s+` splits on whitespace, which is
# what the one-word check requires.
(BeautifulSoup)
# Keep only nonempty lists where every item is a single word.
for heading in soup.select(".content h2.doc-h2"):
    items = [li.text.strip() for li in heading.find_next("ul").select("li")]
    if items and all(len(item.split()) == 1 for item in items):
        lists.append({
            "header": heading.text.strip(),
            "items": items,
        })
The elements were not in div#header-container
but div#wrapper
. There were multiple ul
elements and the best way to access these was with nth()
as follows
with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto(path)
    words = []
    # Odd-indexed <ul> elements under div#wrapper hold the adjective lists.
    for i in range(1, 22, 2):
        all_texts = page.locator("div#wrapper article.page ul").nth(i).all_inner_texts()
        # BUG FIX: split on a newline ("\n"), not the letter "n" -- the
        # backslash was lost in the scraped original. Also, list.append
        # returns None, so the original `append = words.append(text)`
        # assignment was pointless; extend() adds all items directly.
        words.extend(all_texts[0].split("\n"))
    browser.close()