How to get all data when a "show more" button has to be clicked with scrapy-playwright

Question:

I'm currently having trouble getting all of the data on this page: https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2

If I scrape it right now, I only get the first 50 rows, which is not what I want. I want to scrape all of the data, and to show the whole table the "show more" button has to be clicked repeatedly until there is no "show more" button left.

enter image description here

enter image description here

I’m using scrapy+playwright as a scraping tool

def start_requests(self):
        """Yield the single request for the ESPN NBA player-stats page.

        The request sets the ``playwright`` and ``playwright_include_page``
        meta flags, registers two page methods (a click on the "load more"
        link and a wait for the table rows) and is handled by ``self.parse``.
        """
        yield scrapy.Request(
            url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
            meta=dict(
                playwright=True,
                # parse() reads the page object back from meta["playwright_page"].
                playwright_include_page=True,
                # NOTE(review): each PageMethod is presumably executed once, in
                # order, before the response reaches the callback, so the link
                # would be clicked a single time rather than until it
                # disappears -- confirm against the scrapy-playwright docs.
                playwright_page_coroutines=[
                    PageMethod('click',
                               '//a[@class="AnchorLink loadMore__link"]'),
                    PageMethod('wait_for_selector',
                                "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr"),
                ]
            ),
            callback=self.parse,
        )

    async def parse(self, response):
        """Yield one dict per player row found in the HTML of *response*.

        Rows of the fixed-left table (player name and team) are paired
        positionally with rows of the scrolling stats table (position and
        stat cells ``td[2]`` to ``td[8]``).
        """
        page = response.meta["playwright_page"]
        # Locator for the "load more" link; it is created here but never used
        # in the rest of this method.
        button = page.locator("xpath=//a[@class='AnchorLink loadMore__link']")

        # The selector is built from the body delivered with the response, not
        # from the live page object.
        resp = response.body
        sel = Selector(text=resp)

        # Rows of the fixed-left table: player link and team abbreviation.
        player_list = sel.xpath(
            "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
        # Rows of the scrolling table: position and numeric stat cells.
        stats_list = sel.xpath(
            "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")

        # zip pairs the two row lists by index and stops at the shorter one.
        for player, stat in zip(player_list, stats_list):
            player_name = player.xpath(".//a/text()").get()
            position = stat.xpath(".//td/div/text()").get()
            team_name = player.xpath(".//span/text()").get()
            game_played = stat.xpath(".//td[2]/text()").get()
            minutes_per_minute = stat.xpath(".//td[3]/text()").get()
            points_per_game = stat.xpath(".//td[4]/text()").get()
            fields_goal_made = stat.xpath(".//td[5]/text()").get()
            fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
            field_goal_percentage = stat.xpath(".//td[7]/text()").get()
            three_point_goal_made = stat.xpath(".//td[8]/text()").get()

            yield {
                "player_name": player_name,
                "player_position": position,
                "team_name": team_name,
                "game_played": game_played,
                "minutes_per_minute": minutes_per_minute,
                "points_per_game": points_per_game,
                "fields_goal_made": fields_goal_made,
                "fields_goal_attempted": fields_goal_attempted,
                "field_goal_percentage": field_goal_percentage,
                "three_point_goal_made": three_point_goal_made,
            }

I have already defined button = page.locator("xpath=//a[@class='AnchorLink loadMore__link']"), and what I want is for this button to be clicked until no "show more" button is available.

# Excerpt of the page-method list passed in the request meta in start_requests.
playwright_page_coroutines=[
                    # First entry: click the "load more" link.
                    PageMethod('click',
                               '//a[@class="AnchorLink loadMore__link"]'),
                    # Second entry: wait for the selector matching the table's body rows.
                    PageMethod('wait_for_selector',
                                "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr"),
                ]

I believe I'm wrong here. I thought the first PageMethod would handle the clicking, so that it keeps clicking until there is no "show more" button left, and that the second PageMethod would then wait until all of the data has loaded, but the result is the same: I only get 50 rows of data.

So how can I achieve all of that? I'm kind of stuck here.

Also, I don't know if this is related to the issue, but if the page is reloaded it also goes back to showing only 50 rows of data.

Answers:

Since your goal is to continuously find the same element until it no longer exists, you could handle all of the logic in the parse method itself. There could be better ways to handle this, but this does provide the desired full table of results in the output.

    def start_requests(self):
        """Issue the single request for the ESPN NBA player-stats page.

        The request carries the ``playwright`` and ``playwright_include_page``
        meta flags and is routed to ``self.parse``, which reads the page
        object back from ``response.meta["playwright_page"]``.
        """
        stats_url = 'https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2'
        playwright_meta = {
            "playwright": True,
            "playwright_include_page": True,
        }
        yield scrapy.Request(
            url=stats_url,
            meta=playwright_meta,
            callback=self.parse,
        )

    async def parse(self, response):
        """Expand the full stats table in the browser, then yield one item per player.

        The "load more" link is clicked repeatedly on the live Playwright page
        until it can no longer be clicked. The rendered HTML is then captured,
        the page is closed, and the table rows are parsed.

        Args:
            response: Scrapy response whose ``meta["playwright_page"]`` holds
                the Playwright page for this request.

        Yields:
            dict: One record per row pair with the player's name, position,
            team and the stat cells ``td[2]`` to ``td[8]``.
        """
        page = response.meta["playwright_page"]
        # Short timeout on purpose: once the link is gone, the next action
        # fails after 1 s, and that failure is what ends the click loop. The
        # same second also gives a freshly requested batch time to render.
        page.set_default_timeout(1000)
        # A Locator object is always truthy, so it cannot serve as the loop
        # condition; it is re-resolved against the DOM on every action.
        load_more = page.locator("//div[contains(@class,'loadMore')]/a")
        try:
            while True:
                try:
                    await load_more.scroll_into_view_if_needed()
                    await load_more.click()
                except Exception as exc:
                    # Expected exit: the link no longer exists and the action
                    # timed out. Other Playwright errors also stop the loop
                    # (best effort) but are logged rather than hidden.
                    # `Exception` deliberately excludes task cancellation and
                    # KeyboardInterrupt, which must propagate.
                    self.logger.debug("Stopped clicking 'load more': %r", exc)
                    break
            # Snapshot the DOM after all rows have been loaded.
            content = await page.content()
        finally:
            # The callback owns the page when playwright_include_page is set;
            # closing it here prevents leaking browser pages.
            await page.close()

        sel = Selector(text=content)

        # Rows of the fixed-left table: player link and team abbreviation.
        player_list = sel.xpath(
            "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
        # Rows of the scrolling table: position and numeric stat cells.
        stats_list = sel.xpath(
            "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")

        # The two tables are rendered side by side, so rows are paired by index.
        for player, stat in zip(player_list, stats_list):
            player_name = player.xpath(".//a/text()").get()
            position = stat.xpath(".//td/div/text()").get()
            team_name = player.xpath(".//span/text()").get()
            game_played = stat.xpath(".//td[2]/text()").get()
            minutes_per_minute = stat.xpath(".//td[3]/text()").get()
            points_per_game = stat.xpath(".//td[4]/text()").get()
            fields_goal_made = stat.xpath(".//td[5]/text()").get()
            fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
            field_goal_percentage = stat.xpath(".//td[7]/text()").get()
            three_point_goal_made = stat.xpath(".//td[8]/text()").get()

            yield {
                "player_name": player_name,
                "player_position": position,
                "team_name": team_name,
                "game_played": game_played,
                "minutes_per_minute": minutes_per_minute,
                "points_per_game": points_per_game,
                "fields_goal_made": fields_goal_made,
                "fields_goal_attempted": fields_goal_attempted,
                "field_goal_percentage": field_goal_percentage,
                "three_point_goal_made": three_point_goal_made,
            }
Answered By: Alexander