Following Links in HTML using BeautifulSoup
Question:
I am doing a course which requires me to parse this using BeautifulSoup: http://python-data.dr-chuck.net/known_by_Fikret.html
The instructions are: Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.
This is the code I have so far:
# Question's original script, with the indentation (lost in the paste) restored.
# Bug -- the subject of the question: the page is fetched only ONCE, before the
# loop, so every iteration re-scans the same anchor list and resolves to the
# same link, producing the repeated "Montgomery" output.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re

url = input('Enter - ')
html = urllib.request.urlopen(url).read()  # fetched once only -- never refreshed
soup = BeautifulSoup(html, 'html.parser')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1   # 1-based position -> 0-based index

urllist = list()
taglist = list()
tags = soup('a')  # anchors of the FIRST page only
for i in range(count):
    for tag in tags:
        taglist.append(tag)  # taglist grows every pass, but taglist[pos] never changes
    url = taglist[pos].get('href', None)
    # NOTE(review): the quoted output starts with the input URL, which suggests
    # this print preceded the assignment in the author's actual file -- the
    # pasted statement order is kept here; confirm against the original.
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])
This is my output:
Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Last URL: http://python-data.dr-chuck.net/known_by_Montgomery.html
This is the output that I am supposed to get:
Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://python-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://python-data.dr-chuck.net/known_by_Anayah.html
Last URL: http://python-data.dr-chuck.net/known_by_Anayah.html
I’ve been working on this for a while but I still have not been able to get the code to loop correctly. I am new to coding and I’m just looking for some help to point me in the right direction. Thanks.
Answers:
You are getting the link at the same `pos` position multiple times. Use the `i` loop counter as an offset; replace:
url = taglist[pos].get('href', None)
with:
url = taglist[pos + i].get('href', None)
The reason you do not get the proper answer is the following: You do not open the link.
After finding the right url in the first page you have to open the url you found with urllib.request.urlopen(URL).read(), and look for the new link there. You have to repeat this three times. I’d recommend a while loop for this.
this code does the trick:
# Answer: re-fetch the page on every hop (five connections), indentation restored.
# NOTE(review): the inner `for i in range(count)` loop is redundant -- it rebuilds
# taglist `count` times per page, but taglist[pos] is unaffected because taglist
# is cleared at the top of each while pass.
url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
count = 5
pos = 2
urllist = []
taglist = []
connections = 0
while connections < 5:  # you need to connect five times
    taglist = []
    print('Retrieving: ', url)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for i in range(count):
        for tag in tags:
            taglist.append(tag)
        url = taglist[pos].get('href', None)  # hop to the link at `pos`
    urllist.append(url)
    connections = connections + 1
print("last url:", url)
def get_html(url):
    """Fetch *url* and return its parsed BeautifulSoup document."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index
urllist = list()
for i in range(count):
    taglist = list()
    for tag in get_html(url)('a'):  # Needed to update your variable to new url html
        taglist.append(tag)
    url = taglist[pos].get('href', None)  # You grabbed url but never updated your tags variable.
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])
Try this way:
# Answer: re-open the page each pass; collect hrefs, then follow taglist[pos].
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1  # 1-based position -> 0-based index
urllist = list()
for i in range(count):
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    print('Retrieveing:', url)  # typo in the original output string kept as-is
    taglist = list()
    for tag in tags:
        y = tag.get('href', None)
        taglist.append(y)
    url = taglist[pos]  # hop to the next page
    urllist.append(url)
# NOTE(review): [-2] prints the second-to-last URL collected; urllist[-1] is the
# final hop -- confirm which one the assignment actually expects.
print("Last Url:", urllist[-2])
Try this one:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def parse(url):
    """Follow the 18th link (index 17) on each page for 7 hops, printing each URL.

    Returns None -- which is why the caller's print() shows 'None' at the end.
    """
    count = 0
    while count < 7:
        html = urllib.request.urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        list1 = list()
        tags = soup('a')
        for tag in tags:
            list1.append(tag.get('href', None))
        url = list1[17]  # hard-coded position (18th anchor) -- not parameterized
        count += 1
        print('Retreiving:', url)  # typo in the original output string kept as-is

print(parse('http://py4e-data.dr-chuck.net/known_by_Lorenz.html'))
That’s my output:
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cadyn.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Phebe.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cullen.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Alessandro.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Gurveer.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Anureet.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Sandie.html
None
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urllist = list()
taglist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index
urllist = list()  # NOTE(review): redundant -- urllist was already initialized above
for i in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for tag in tags:
        # the most important part is keep updating the variable of tags by putting in front of this loop
        taglist.append(tag)
    print('Retrieving: ', url)  # prints the page about to be followed
    url = taglist[pos].get('href', None)
    # NOTE(review): taglist is never cleared between pages, so taglist[pos]
    # always indexes the FIRST page's anchors -- looks like a latent bug;
    # verify against the assignment's expected output.
    urllist.append(url)
print('Retrieving: ', urllist[-1])
# Answer: the minimal correct loop -- fetch, parse, take tags[pos], repeat.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# NOTE(review): ctx is created but never passed to urlopen() below.

urllist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index
for i in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    url = tags[pos].get('href', None)  # no intermediate taglist needed
    print('Retrieving: ', url)
    urllist.append(url)
print('Retrieving: ', urllist[-1])
# Answer: same approach as the earlier "Try this way" snippet, but printing the
# true last element of urllist via its length.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1  # 1-based position -> 0-based index
urllist = list()
for i in range(count):
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    print('Retrieveing:', url)  # typo in the original output string kept as-is
    taglist = list()
    for tag in tags:
        y = tag.get('href', None)
        taglist.append(y)
    url = taglist[pos]  # hop to the next page
    urllist.append(url)
x = len(urllist)
print("Last Url:", urllist[x - 1])  # equivalent to urllist[-1]
# assignment2
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

count = 7
position = 18  # 1-based: the 18th link on each page

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# run1
# `idea` accumulates the chain of URLs; its last element is the next page to open.
idea = ['https://py4e-data.dr-chuck.net/known_by_Lynn.html']
empty = []  # per-page scratch list of hrefs
for i in range(count + 1):
    url = idea[len(idea) - 1]  # most recent URL in the chain
    print("retrieving:", url)
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for tag in tags:
        empty.append(tag.get('href', None))
    idea.append(empty[position - 1])  # follow the 18th link
    empty.clear()  # reset the scratch list for the next page
For me it was necessary to make a change because of an error caused by a change in the standard library in Python 3.10+. This is the link where I found the solution.
Error "AttributeError 'collections' has no attribute 'Callable' " using Beautiful Soup
With this solution it’s not necessary to build a new list holding all the URLs and then select the one you need according to the "position" parameter. Imagine a page with 1 million URLs: building a list of 1 million URLs just to select, for example, the tenth one is completely unnecessary. So I create a counter (actpos) and, once I reach the defined position, I exit the loop immediately; I store the new URL to be opened from "newurl" in the variable "myurl" and then restart the loop with the updated "myurl" variable. This is repeated the number of times defined by the "count" parameter.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# collections.Callable has been moved to collections.abc.Callable in python 3.10+.
# Added the reference back to collections before importing the problem library.
import collections
collections.Callable = collections.abc.Callable

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def getsoup(url):
    """Fetch *url* (certificate checks disabled) and return its parsed soup."""
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

myurl = input('Enter - ')
count = int(input('Enter count: '))
position = int(input('Enter position: '))  # 1-based; the counter below matches it directly
print('Retrieving: ', myurl)
for tag in range(count):  # `tag` is just a hop counter here
    actpos = 0  # actual position
    for newurl in getsoup(myurl)('a'):
        actpos = actpos + 1
        if actpos < position:
            continue
        break  # stop at the wanted anchor without scanning the rest of the page
    # `newurl` holds the last anchor visited by the inner loop (the one at `position`)
    myurl = newurl.get('href', None)  # update url
    print('Retrieving: ', myurl)
print('Last url: ', myurl)
I am doing a course which requires me to parse this using BeautifulSoup: http://python-data.dr-chuck.net/known_by_Fikret.html
The instructions are: Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve.
This is the code I have so far:
# Question's original script, with the indentation (lost in the paste) restored.
# Bug -- the subject of the question: the page is fetched only ONCE, before the
# loop, so every iteration re-scans the same anchor list and resolves to the
# same link, producing the repeated "Montgomery" output.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re

url = input('Enter - ')
html = urllib.request.urlopen(url).read()  # fetched once only -- never refreshed
soup = BeautifulSoup(html, 'html.parser')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1   # 1-based position -> 0-based index

urllist = list()
taglist = list()
tags = soup('a')  # anchors of the FIRST page only
for i in range(count):
    for tag in tags:
        taglist.append(tag)  # taglist grows every pass, but taglist[pos] never changes
    url = taglist[pos].get('href', None)
    # NOTE(review): the quoted output starts with the input URL, which suggests
    # this print preceded the assignment in the author's actual file -- the
    # pasted statement order is kept here; confirm against the original.
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])
This is my output:
Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Last URL: http://python-data.dr-chuck.net/known_by_Montgomery.html
This is the output that I am supposed to get:
Retrieving: http://python-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://python-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://python-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://python-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://python-data.dr-chuck.net/known_by_Anayah.html
Last URL: http://python-data.dr-chuck.net/known_by_Anayah.html
I’ve been working on this for a while but I still have not been able to get the code to loop correctly. I am new to coding and I’m just looking for some help to point me in the right direction. Thanks.
You are getting the link at the same `pos` position multiple times. Use the `i` loop counter as an offset; replace:
url = taglist[pos].get('href', None)
with:
url = taglist[pos + i].get('href', None)
The reason you do not get the proper answer is the following: You do not open the link.
After finding the right url in the first page you have to open the url you found with urllib.request.urlopen(URL).read(), and look for the new link there. You have to repeat this three times. I’d recommend a while loop for this.
this code does the trick:
# Answer: re-fetch the page on every hop (five connections), indentation restored.
# NOTE(review): the inner `for i in range(count)` loop is redundant -- it rebuilds
# taglist `count` times per page, but taglist[pos] is unaffected because taglist
# is cleared at the top of each while pass.
url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
count = 5
pos = 2
urllist = []
taglist = []
connections = 0
while connections < 5:  # you need to connect five times
    taglist = []
    print('Retrieving: ', url)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for i in range(count):
        for tag in tags:
            taglist.append(tag)
        url = taglist[pos].get('href', None)  # hop to the link at `pos`
    urllist.append(url)
    connections = connections + 1
print("last url:", url)
def get_html(url):
    """Fetch *url* and return its parsed BeautifulSoup document."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index
urllist = list()
for i in range(count):
    taglist = list()
    for tag in get_html(url)('a'):  # Needed to update your variable to new url html
        taglist.append(tag)
    url = taglist[pos].get('href', None)  # You grabbed url but never updated your tags variable.
    print('Retrieving: ', url)
    urllist.append(url)
print('Last URL: ', urllist[-1])
Try this way:
# Answer: re-open the page each pass; collect hrefs, then follow taglist[pos].
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1  # 1-based position -> 0-based index
urllist = list()
for i in range(count):
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    print('Retrieveing:', url)  # typo in the original output string kept as-is
    taglist = list()
    for tag in tags:
        y = tag.get('href', None)
        taglist.append(y)
    url = taglist[pos]  # hop to the next page
    urllist.append(url)
# NOTE(review): [-2] prints the second-to-last URL collected; urllist[-1] is the
# final hop -- confirm which one the assignment actually expects.
print("Last Url:", urllist[-2])
Try this one:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def parse(url):
    """Follow the 18th link (index 17) on each page for 7 hops, printing each URL.

    Returns None -- which is why the caller's print() shows 'None' at the end.
    """
    count = 0
    while count < 7:
        html = urllib.request.urlopen(url, context=ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        list1 = list()
        tags = soup('a')
        for tag in tags:
            list1.append(tag.get('href', None))
        url = list1[17]  # hard-coded position (18th anchor) -- not parameterized
        count += 1
        print('Retreiving:', url)  # typo in the original output string kept as-is

print(parse('http://py4e-data.dr-chuck.net/known_by_Lorenz.html'))
That’s my output:
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cadyn.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Phebe.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Cullen.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Alessandro.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Gurveer.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Anureet.html
Retreiving: http://py4e-data.dr-chuck.net/known_by_Sandie.html
None
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

urllist = list()
taglist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index
urllist = list()  # NOTE(review): redundant -- urllist was already initialized above
for i in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for tag in tags:
        # the most important part is keep updating the variable of tags by putting in front of this loop
        taglist.append(tag)
    print('Retrieving: ', url)  # prints the page about to be followed
    url = taglist[pos].get('href', None)
    # NOTE(review): taglist is never cleared between pages, so taglist[pos]
    # always indexes the FIRST page's anchors -- looks like a latent bug;
    # verify against the assignment's expected output.
    urllist.append(url)
print('Retrieving: ', urllist[-1])
# Answer: the minimal correct loop -- fetch, parse, take tags[pos], repeat.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# NOTE(review): ctx is created but never passed to urlopen() below.

urllist = list()
url = input('Enter - ')
count = int(input('Enter count: '))
pos = int(input('Enter position: ')) - 1  # 1-based position -> 0-based index
for i in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    url = tags[pos].get('href', None)  # no intermediate taglist needed
    print('Retrieving: ', url)
    urllist.append(url)
print('Retrieving: ', urllist[-1])
# Answer: same approach as the earlier "Try this way" snippet, but printing the
# true last element of urllist via its length.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input("Enter url:")
count = int(input('Enter count:'))
pos = int(input('Enter position:')) - 1  # 1-based position -> 0-based index
urllist = list()
for i in range(count):
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    print('Retrieveing:', url)  # typo in the original output string kept as-is
    taglist = list()
    for tag in tags:
        y = tag.get('href', None)
        taglist.append(y)
    url = taglist[pos]  # hop to the next page
    urllist.append(url)
x = len(urllist)
print("Last Url:", urllist[x - 1])  # equivalent to urllist[-1]
# assignment2
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

count = 7
position = 18  # 1-based: the 18th link on each page

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# run1
# `idea` accumulates the chain of URLs; its last element is the next page to open.
idea = ['https://py4e-data.dr-chuck.net/known_by_Lynn.html']
empty = []  # per-page scratch list of hrefs
for i in range(count + 1):
    url = idea[len(idea) - 1]  # most recent URL in the chain
    print("retrieving:", url)
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    for tag in tags:
        empty.append(tag.get('href', None))
    idea.append(empty[position - 1])  # follow the 18th link
    empty.clear()  # reset the scratch list for the next page
For me it was necessary to make a change because of an error caused by a change in the standard library in Python 3.10+. This is the link where I found the solution.
Error "AttributeError 'collections' has no attribute 'Callable' " using Beautiful Soup
With this solution it’s not necessary to build a new list holding all the URLs and then select the one you need according to the "position" parameter. Imagine a page with 1 million URLs: building a list of 1 million URLs just to select, for example, the tenth one is completely unnecessary. So I create a counter (actpos) and, once I reach the defined position, I exit the loop immediately; I store the new URL to be opened from "newurl" in the variable "myurl" and then restart the loop with the updated "myurl" variable. This is repeated the number of times defined by the "count" parameter.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# collections.Callable has been moved to collections.abc.Callable in python 3.10+.
# Added the reference back to collections before importing the problem library.
import collections
collections.Callable = collections.abc.Callable

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def getsoup(url):
    """Fetch *url* (certificate checks disabled) and return its parsed soup."""
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

myurl = input('Enter - ')
count = int(input('Enter count: '))
position = int(input('Enter position: '))  # 1-based; the counter below matches it directly
print('Retrieving: ', myurl)
for tag in range(count):  # `tag` is just a hop counter here
    actpos = 0  # actual position
    for newurl in getsoup(myurl)('a'):
        actpos = actpos + 1
        if actpos < position:
            continue
        break  # stop at the wanted anchor without scanning the rest of the page
    # `newurl` holds the last anchor visited by the inner loop (the one at `position`)
    myurl = newurl.get('href', None)  # update url
    print('Retrieving: ', myurl)
print('Last url: ', myurl)