pb avec web scraping

Question:

import requests
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Scrape film titles and ratings from 30 listing pages into a DataFrame.
# `df` is a plain list of row dicts (despite its name); the DataFrame is
# built from it at the very end.
df = []
# Visit listing pages 1 through 30.
for x in range(1,31):
    url_allocine= 'https://www.allocine.fr/film/meilleurs/?page='

    # Download page number x and parse the returned HTML.
    page = requests.get(url_allocine + str(x))
    soup = BeautifulSoup(page.content, 'html.parser')


    # All film cards found on this page (presumably one card per film).
    films_all = soup.findAll('div',{'class':'card entity-card entity-card-list cf'})
    #print(len(films_all))

    # Exploratory lookups on the FIRST card of the page only.
    film = films_all[0]
    #print(film)

    titre = film.find("div",{'class':'meta'}).find('a').text
    #print(titre)

    # `note` and `note_1` are the first and second 'rating-item' blocks of
    # that first card. Going by the variable names they are the press and
    # spectator ratings -- TODO confirm against the page markup.
    note = film.findAll("div",{'class':'rating-item'})[0]
    note_presse = note.find('span',{'class':'stareval-note'}).text
    #print(note_presse)
    note_1 = film.findAll("div",{'class':'rating-item'})[1]
    note_spectateur = note_1.find('span',{'class':'stareval-note'}).text
    #print(note_spectateur)



    # Build one row per film card on this page.
    for film in films_all:
        # The title is looked up on the current `film`, so it differs per row.
        titre = film.find("div",{'class':'meta'}).find('a').text

        # NOTE(review): `note` and `note_1` are NOT recomputed from the
        # current `film`; they still point at the rating blocks of
        # films_all[0]. Every row of a page therefore gets the first film's
        # ratings, and the values only change when the outer loop moves to
        # the next page.
        note_presse= (note.find('span',{'class':'stareval-note'}).text)

        note_spectateur = (note_1.find('span',{'class':'stareval-note'}).text)

        # Row for the current film; appended to the shared `df` list.
        property_info = {
            'titre': titre,
            'note_presse': note_presse,
            'note_spectateur': note_spectateur,
         }
        df.append(property_info)
    #print(len(df))
# Turn the collected rows into a DataFrame and show the first 20 of them.
df_allocine = pd.DataFrame(df)
print(df_allocine[0:20])

In the code above, I could not find a way to select note_presse and note_spectateur separately for each film, since they share the same tags. So I tried indexing, hoping that would solve the problem. But after creating the DataFrame, I found that the first 10 rows all have the same notes; the values change for the next 10 rows (because of pagination), but they are again identical within that group, and so on.
I hope to find a solution using urllib or requests rather than another method such as Selenium. Thanks in advance for your efforts.

Asked By: Raouf Yahiaoui

||

Answers:

To get "Note Presse" and "Note Spectateurs", you can use the following example:

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape title, press rating and spectator rating for each film on the
# listing pages and collect them into a DataFrame.

URL_TEMPLATE = "https://www.allocine.fr/film/meilleurs/?page={page}"


def _rating(movie, label):
    """Return the rating text of the rating item containing *label*.

    Returns None when the film has no such rating, instead of raising
    AttributeError on the missing element.
    """
    note = movie.select_one(
        f".rating-item:-soup-contains({label}) .stareval-note"
    )
    return note.text.strip() if note is not None else None


data = []
for page in range(1, 3):  # <-- increase number of pages here
    url = URL_TEMPLATE.format(page=page)
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status() stops us from silently parsing an
    # HTTP error page as if it were a listing.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for movie in soup.select("li.mdl"):
        # Skip list items that carry no title heading.
        if movie.h2 is None:
            continue
        data.append(
            {
                "Title": movie.h2.text.strip(),
                "Note Presse": _rating(movie, "Presse"),
                "Note Spectateurs": _rating(movie, "Spectateurs"),
            }
        )

df = pd.DataFrame(data)
print(df)

Prints:

                                                  Title Note Presse Note Spectateurs
0                                          Forrest Gump         2,6              4,6
1                                 La Liste de Schindler         4,2              4,6
2                                        La Ligne verte         2,8              4,6
3                                   12 hommes en colère         5,0              4,6
4                                            Le Parrain         4,6              4,5
5                                            Les Evadés         3,2              4,5
6            Le Seigneur des anneaux : le retour du roi         3,8              4,5
7                                           Le Roi Lion         3,4              4,5
8                      Vol au-dessus d'un nid de coucou         5,0              4,5
9                    The Dark Knight, Le Chevalier Noir         4,0              4,5
10                                         Pulp Fiction         4,4              4,5
11                       Il était une fois dans l'Ouest         4,0              4,5
12                        Le Bon, la brute et le truand         4,1              4,5
13                        Il était une fois en Amérique         4,9              4,5
14                                     Django Unchained         4,6              4,5
15  Le Seigneur des anneaux : la communauté de l'anneau         3,7              4,5
16                                            Gladiator         4,3              4,5
17                                          Gran Torino         4,7              4,5
18             Le Seigneur des anneaux : les deux tours         4,0              4,5
19                                         Interstellar         3,8              4,5
Answered By: Andrej Kesely

Andrej Kesely, this is the code I wrote. I know it works, but it's quite heavy:

import requests
import pandas as pd
from bs4 import BeautifulSoup
# Scrape title, press rating and spectator rating from 30 listing pages
# into a DataFrame.


def remove_word(string):
    """Strip the "Presse" / "Spectateurs" labels, leaving only the rating text."""
    return string.replace("Presse", "").replace("Spectateurs", "")


df = []  # one dict per film, across all pages
url_allocine = 'https://www.allocine.fr/film/meilleurs/?page='

for x in range(1, 31):
    # A timeout keeps the script from hanging on a stalled connection;
    # raise_for_status() avoids parsing an HTTP error page as a listing.
    page = requests.get(url_allocine + str(x), timeout=10)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    films_all = soup.find_all('div', {'class': 'card entity-card entity-card-list cf'})

    # The film loop must run once per page, so it lives inside the page loop.
    for film in films_all:
        title = film.find('h2').get_text(strip=True)

        # Reset per film so that a film without a matching rating holder
        # does not inherit the previous film's ratings.
        note_presse = None
        note_spectateur = None

        rates = film.find_all('div', class_='rating-holder rating-holder-3')
        for rate in rates:
            items = rate.find_all("div", {'class': 'rating-item'})
            if len(items) < 2:
                continue
            # Each item's text is label + value; drop the label.
            note_presse = remove_word(items[0].get_text(strip=True))
            note_spectateur = remove_word(items[1].get_text(strip=True))

        property_info = {
            'title': title,
            'note_presse': note_presse,
            'note_spectateur': note_spectateur,
        }
        df.append(property_info)
    # print(len(df))

df_allocine = pd.DataFrame(df)
print(df_allocine[0:10])

Answered By: Raouf Yahiaoui
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.