PyMySQL database, Issues with filling table
Question:
This is my Python main.py program:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as wait
import time
from bs4 import BeautifulSoup
import requests
import mysql.connector
db = mysql.connector.connect(
host="localhost",
user="root",
passwd="root",
database="testdatabase"
)
mycursor = db.cursor()
try:
#mycursor.execute("CREATE DATABASE animus")
mycursor.execute("CREATE TABLE animus_list (name VARCHAR(255), voti VARCHAR(10), ranks VARCHAR(50), popolarità VARCHAR(50), membri VARCHAR(50) )")
except:
#delete_table = "DROP TABLE animus_list"
#mycursor.execute(delete_table)
print("Table already exists")
class Datascraping:
def __init__(self):
print("Let's start!")
def MyAnimeList(self):
'''GIVE SELENIUM THE GECKODRIVER LOCATION AND THE WEBSITE TO WORK ON'''
#---------------------------------------------------------------------------------------------------------------------
PATH = "C:Program Files (x86)chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://myanimelist.net/anime.php")
print(driver.title)
'''INPUT TO PYTHON THE ANIME YOU ARE SEARCHING. CAREFUL!!! THE SEARCHBAR OF THAT WEBSITE DOESN'T WORK WITH LESS THAN 3 LETTERS IN IT!!!'''
#-----------------------------------------------------------------------------------------------------------------------
selected_anime = str(input("Insert the anime you are searching (MIN 3 CHARACTERS): "))
while len(selected_anime) < 3:
print("The search won't work with less than 3 characters!!")
selected_anime = input("Insert the anime you are searching (MIN 3 CHARACTERS): ")
'''COOKIE BUTTON THAT APPEARS AS SOON AS I OPEN THE WEBSITE. IF I DON'T CLOSE IT, THE HTML OF THE PAGE WON'T BE REACHABLE AND I WILL GET AN EXCEPTION'''
#-----------------------------------------------------------------------------------------------------------------------
try:
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "css-47sehv")))
puls_cookie = driver.find_element(By.CLASS_NAME, "css-47sehv")
puls_cookie.click()
except:
print("The cookie window didn't appear this time!")
#-----------------------------------------------------------------------------------------------------------------------
'''WRITE IN THE WEBSITE SEARCHBAR'''
#-----------------------------------------------------------------------------------------------------------------------
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "q"))) #searchbar has ID = "q"
search = driver.find_element(By.ID, "q")
search.click()
search.send_keys(anime_selezionato) #anime_selezionato means "selected anime"
search.send_keys(Keys.RETURN)
#-----------------------------------------------------------------------------------------------------------------------
'''TRY TO OBTAIN THE ANIME WITH THE EXACT NAME THAT WE INSERTED IN anime_selezionato'''
#-----------------------------------------------------------------------------------------------------------------------
try:
driver.get(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, f"//table/tbody/tr/td[2]/div/a[strong='{anime_selezionato}']"))).get_attribute("href"))
#-----------------------------------------------------------------------------------------------------------------------
'''IF WE DON'T FIND THE ANIME WITH THE EXACT NAME WE INPUT, WE PRINT THE ANIME(s) THAT THE PAGE FOUND WITH THE KEY WE INSERTED IN THE SEARCHBAR. IS ONE OF THEM WHAT YOU WERE SEARCHING FOR?'''
except:
#-----------------------------------------------------------------------------------------------------------------------
print("No anime with the same name found! Is it one of this, maybe?")
listaanimu = [] #create an empty list to fill later with the list of animes found
numero_di_anime = 1 #iterable element that we will need in the for loop. numero di anime means "number of animes"
search2 = driver.find_elements(By.TAG_NAME, "strong") #All anime names are stored in strong tags
for element in search2:
nome = element.text #nome means "name"
print("-" + str(numero_di_anime) + " " + element.text) #the result will be like: -1 Demon Slayer
listaanimu.append(nome)
numero_di_anime += 1
print("If one of this animes is what you are looking for, write the respective index number")
anime = input("The anime I want is number: ")
try:
anime2 = int(anime) #this try/except is needed in case of a wrong input, like a letter, that would create an exception
except:
anime2 = anime
while anime2 not in range(1, 51): #the animes that appear are 50 at maximum. A number too big would not be found and give an exception
print("Anime not found! Try again!!")
anime = input("The anime I want is number: ")
try:
anime2 = int(anime)
except:
anime2 = anime
#-----------------------------------------------------------------------------------------------------------------------
'''CLICK THE NAME OF THE ANIME ON THE SITE, TO OPEN ITS PAGE'''
#-----------------------------------------------------------------------------------------------------------------------
def anime_alternativo():
try:
anime_selezionato = listaanimu[int(anime) - 1] #The list of animes starts at 1, but lists in python start at 0
print(anime_selezionato)
print("Anime Found!")
try:
driver.get(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, f"//table/tbody/tr/td[2]/div/a[strong='{anime_selezionato}']"))).get_attribute("href"))
anime_BeautifulSoup = driver.find_element(By.LINK_TEXT, "%s" % anime_selezionato).get_attribute("href") #BeautifulSoup works with links, let's give it the link of the page of the selected anime
print(anime_BeautifulSoup)
except:
print("Anime not found! Are you sure it is part of the list?")
except:
print("It is not a valid answer!!! ") #these excepts should never be activated, shouldn't even be necessary
BeautifulSoup_Scraping(anime_BeautifulSoup) #give the link to beautifulSoup
#-----------------------------------------------------------------------------------------------------------------------
'''NOW THAT THE PAGE OF THE SELECTED ANIME IS OPENED, WE DATASCRAPE IT WITH BEAUTIFULSOUP'''
#-----------------------------------------------------------------------------------------------------------------------
def BeautifulSoup_Scraping(link):
html_text = requests.get(link).text
soup = BeautifulSoup(html_text, 'lxml')
rangevoti = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'na'] #the range of votes of the website. if an anime is still unreleased, its vote will be na
for numeri in rangevoti:
try:
voto = soup.find('div', class_='score-label score-%s' % (numeri)).text #get the vote of the anime from the webpage
except:
print("Error! They may have invented another 'type' of vote!")
try:
nome = soup.find('h1', class_='title-name h1_bold_none').text #get the name of the anime. this try/except is needed because some anime names have weird fonts that are not supported by BeautifulSoup
except:
print("The anime could have a non supported font")
rank = soup.find('span', class_='numbers ranked').text #get the anime rank, popularity and number of users
popularity = soup.find('span', class_='numbers popularity').text
users = soup.find('span', class_='numbers members').text
rank1 = rank.replace('Ranked #', '', 1) #remove unnecessary characters, just give me the numbers
popularity1 = popularity.replace('Popularity #', '', 1)
users1 = users.replace('Members', '', 1)
if rank == 'Ranked N/A':
rank1 = rank.replace('Ranked', '', 1)
if rank == 'Popularity N/A':
popularity1 = popularity.replace('Popularity', '', 1)
print(nome + " " + voto + " " + rank1 + " " + popularity1 + " " + users1) #Print a preview of what the row on the database will look like
name = str(nome)
vote = str(voto)
rank2 = str(rank1)
popularity2 = str(popularity1)
users2 = str(users1)
'''ERROR ZONE! MYSQL SAYS THAT I HAVE AN ERROR IN THE SYNTAX BUT THIS COMMAND WORKS IN ANOTHER PROGRAM, WHERE IT IS WRITTEN IN THE SAME EXACT WAY!'''
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
data = "INSERT INTO animus_list (nome, voto, rank1, popularity1, users1) VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE" #insert all the scraped data in the table
values = (name, vote, rank2, popularity2, users2)
mycursor.execute(data, values)
db.commit()
for x in mycursor:
print(x)
#except:
#print("Cannot insert in database")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
'''CALL THE FUNCTION'''
#-----------------------------------------------------------------------------------------------------------------------
anime_alternativo()
#-----------------------------------------------------------------------------------------------------------------------
'''FINE PROGRAMMA'''
#-----------------------------------------------------------------------------------------------------------------------
time.sleep(5) #close the browser window and end the program
driver.quit()
#-----------------------------------------------------------------------------------------------------------------------
scraping = Datascraping()
scraping.MyAnimeList()
This whole program is inside a class because end result of this project will be: a Tkinter (or Kivi or PyQt, still not sure what to choose) app that scrapes websites
The part of the code that gives an error works perfectly in another program (which is just the beautifoulSoup of this program here).
The exception I get says that I made a mistake on the MySQL syntax, but again. In other programs it works and has the same syntax.
data = "INSERT INTO animus_list (nome, voto, rank1, popularity1, users1) VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE" #insert all the scraped data in the table
values = (name, vote, rank2, popularity2, users2)
mycursor.execute(data, values)
db.commit()
for x in mycursor:
print(x)
#except:
#print("Cannot insert in database")
This is the part that is causing me issues.
What I tried is:
-DELETING the table and creating a new one
-Adding/removing variables to add to the table
-Changing the size of the space reserved for each variable in MySQL (VARCHAR (1), VARCHAR (100)….)
Answers:
The error was caused by the "ON DUPLICATE KEY UPDATE", and when I removed it the error disappeared.
I thought I needed it to not have duplicate rows but I found another way.
Thank you for the help Markus.
I wish you a great day
This is my Python main.py program:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as wait
import time
from bs4 import BeautifulSoup
import requests
import mysql.connector
db = mysql.connector.connect(
host="localhost",
user="root",
passwd="root",
database="testdatabase"
)
mycursor = db.cursor()
try:
#mycursor.execute("CREATE DATABASE animus")
mycursor.execute("CREATE TABLE animus_list (name VARCHAR(255), voti VARCHAR(10), ranks VARCHAR(50), popolarità VARCHAR(50), membri VARCHAR(50) )")
except:
#delete_table = "DROP TABLE animus_list"
#mycursor.execute(delete_table)
print("Table already exists")
class Datascraping:
def __init__(self):
print("Let's start!")
def MyAnimeList(self):
'''GIVE SELENIUM THE GECKODRIVER LOCATION AND THE WEBSITE TO WORK ON'''
#---------------------------------------------------------------------------------------------------------------------
PATH = "C:Program Files (x86)chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://myanimelist.net/anime.php")
print(driver.title)
'''INPUT TO PYTHON THE ANIME YOU ARE SEARCHING. CAREFUL!!! THE SEARCHBAR OF THAT WEBSITE DOESN'T WORK WITH LESS THAN 3 LETTERS IN IT!!!'''
#-----------------------------------------------------------------------------------------------------------------------
selected_anime = str(input("Insert the anime you are searching (MIN 3 CHARACTERS): "))
while len(selected_anime) < 3:
print("The search won't work with less than 3 characters!!")
selected_anime = input("Insert the anime you are searching (MIN 3 CHARACTERS): ")
'''COOKIE BUTTON THAT APPEARS AS SOON AS I OPEN THE WEBSITE. IF I DON'T CLOSE IT, THE HTML OF THE PAGE WON'T BE REACHABLE AND I WILL GET AN EXCEPTION'''
#-----------------------------------------------------------------------------------------------------------------------
try:
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "css-47sehv")))
puls_cookie = driver.find_element(By.CLASS_NAME, "css-47sehv")
puls_cookie.click()
except:
print("The cookie window didn't appear this time!")
#-----------------------------------------------------------------------------------------------------------------------
'''WRITE IN THE WEBSITE SEARCHBAR'''
#-----------------------------------------------------------------------------------------------------------------------
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "q"))) #searchbar has ID = "q"
search = driver.find_element(By.ID, "q")
search.click()
search.send_keys(anime_selezionato) #anime_selezionato means "selected anime"
search.send_keys(Keys.RETURN)
#-----------------------------------------------------------------------------------------------------------------------
'''TRY TO OBTAIN THE ANIME WITH THE EXACT NAME THAT WE INSERTED IN anime_selezionato'''
#-----------------------------------------------------------------------------------------------------------------------
try:
driver.get(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, f"//table/tbody/tr/td[2]/div/a[strong='{anime_selezionato}']"))).get_attribute("href"))
#-----------------------------------------------------------------------------------------------------------------------
'''IF WE DON'T FIND THE ANIME WITH THE EXACT NAME WE INPUT, WE PRINT THE ANIME(s) THAT THE PAGE FOUND WITH THE KEY WE INSERTED IN THE SEARCHBAR. IS ONE OF THEM WHAT YOU WERE SEARCHING FOR?'''
except:
#-----------------------------------------------------------------------------------------------------------------------
print("No anime with the same name found! Is it one of this, maybe?")
listaanimu = [] #create an empty list to fill later with the list of animes found
numero_di_anime = 1 #iterable element that we will need in the for loop. numero di anime means "number of animes"
search2 = driver.find_elements(By.TAG_NAME, "strong") #All anime names are stored in strong tags
for element in search2:
nome = element.text #nome means "name"
print("-" + str(numero_di_anime) + " " + element.text) #the result will be like: -1 Demon Slayer
listaanimu.append(nome)
numero_di_anime += 1
print("If one of this animes is what you are looking for, write the respective index number")
anime = input("The anime I want is number: ")
try:
anime2 = int(anime) #this try/except is needed in case of a wrong input, like a letter, that would create an exception
except:
anime2 = anime
while anime2 not in range(1, 51): #the animes that appear are 50 at maximum. A number too big would not be found and give an exception
print("Anime not found! Try again!!")
anime = input("The anime I want is number: ")
try:
anime2 = int(anime)
except:
anime2 = anime
#-----------------------------------------------------------------------------------------------------------------------
'''CLICK THE NAME OF THE ANIME ON THE SITE, TO OPEN ITS PAGE'''
#-----------------------------------------------------------------------------------------------------------------------
def anime_alternativo():
try:
anime_selezionato = listaanimu[int(anime) - 1] #The list of animes starts at 1, but lists in python start at 0
print(anime_selezionato)
print("Anime Found!")
try:
driver.get(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, f"//table/tbody/tr/td[2]/div/a[strong='{anime_selezionato}']"))).get_attribute("href"))
anime_BeautifulSoup = driver.find_element(By.LINK_TEXT, "%s" % anime_selezionato).get_attribute("href") #BeautifulSoup works with links, let's give it the link of the page of the selected anime
print(anime_BeautifulSoup)
except:
print("Anime not found! Are you sure it is part of the list?")
except:
print("It is not a valid answer!!! ") #these excepts should never be activated, shouldn't even be necessary
BeautifulSoup_Scraping(anime_BeautifulSoup) #give the link to beautifulSoup
#-----------------------------------------------------------------------------------------------------------------------
'''NOW THAT THE PAGE OF THE SELECTED ANIME IS OPENED, WE DATASCRAPE IT WITH BEAUTIFULSOUP'''
#-----------------------------------------------------------------------------------------------------------------------
def BeautifulSoup_Scraping(link):
html_text = requests.get(link).text
soup = BeautifulSoup(html_text, 'lxml')
rangevoti = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'na'] #the range of votes of the website. if an anime is still unreleased, its vote will be na
for numeri in rangevoti:
try:
voto = soup.find('div', class_='score-label score-%s' % (numeri)).text #get the vote of the anime from the webpage
except:
print("Error! They may have invented another 'type' of vote!")
try:
nome = soup.find('h1', class_='title-name h1_bold_none').text #get the name of the anime. this try/except is needed because some anime names have weird fonts that are not supported by BeautifulSoup
except:
print("The anime could have a non supported font")
rank = soup.find('span', class_='numbers ranked').text #get the anime rank, popularity and number of users
popularity = soup.find('span', class_='numbers popularity').text
users = soup.find('span', class_='numbers members').text
rank1 = rank.replace('Ranked #', '', 1) #remove unnecessary characters, just give me the numbers
popularity1 = popularity.replace('Popularity #', '', 1)
users1 = users.replace('Members', '', 1)
if rank == 'Ranked N/A':
rank1 = rank.replace('Ranked', '', 1)
if rank == 'Popularity N/A':
popularity1 = popularity.replace('Popularity', '', 1)
print(nome + " " + voto + " " + rank1 + " " + popularity1 + " " + users1) #Print a preview of what the row on the database will look like
name = str(nome)
vote = str(voto)
rank2 = str(rank1)
popularity2 = str(popularity1)
users2 = str(users1)
'''ERROR ZONE! MYSQL SAYS THAT I HAVE AN ERROR IN THE SYNTAX BUT THIS COMMAND WORKS IN ANOTHER PROGRAM, WHERE IT IS WRITTEN IN THE SAME EXACT WAY!'''
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
data = "INSERT INTO animus_list (nome, voto, rank1, popularity1, users1) VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE" #insert all the scraped data in the table
values = (name, vote, rank2, popularity2, users2)
mycursor.execute(data, values)
db.commit()
for x in mycursor:
print(x)
#except:
#print("Cannot insert in database")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
'''CALL THE FUNCTION'''
#-----------------------------------------------------------------------------------------------------------------------
anime_alternativo()
#-----------------------------------------------------------------------------------------------------------------------
'''FINE PROGRAMMA'''
#-----------------------------------------------------------------------------------------------------------------------
time.sleep(5) #close the browser window and end the program
driver.quit()
#-----------------------------------------------------------------------------------------------------------------------
scraping = Datascraping()
scraping.MyAnimeList()
This whole program is inside a class because end result of this project will be: a Tkinter (or Kivi or PyQt, still not sure what to choose) app that scrapes websites
The part of the code that gives an error works perfectly in another program (which is just the beautifoulSoup of this program here).
The exception I get says that I made a mistake on the MySQL syntax, but again. In other programs it works and has the same syntax.
data = "INSERT INTO animus_list (nome, voto, rank1, popularity1, users1) VALUES (%s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE" #insert all the scraped data in the table
values = (name, vote, rank2, popularity2, users2)
mycursor.execute(data, values)
db.commit()
for x in mycursor:
print(x)
#except:
#print("Cannot insert in database")
This is the part that is causing me issues.
What I tried is:
-DELETING the table and creating a new one
-Adding/removing variables to add to the table
-Changing the size of the space reserved for each variable in MySQL (VARCHAR (1), VARCHAR (100)….)
The error was caused by the "ON DUPLICATE KEY UPDATE", and when I removed it the error disappeared.
I thought I needed it to not have duplicate rows but I found another way.
Thank you for the help Markus.
I wish you a great day