How to select only the link (href) when web scraping an HTML element that also contains text?

Question:

As part of a bigger web scraping project, I want to extract links from an HTML page. I do not want every link on the page, only the ones in the second column of the big table.

Here is an example of what the HTML for these links looks like:

<a href="exibir?proc=18955/989/20&amp;offset=0">18955/989/20</a>

I would like to have a list of the href values, e.g. "exibir?proc=18955/989/20&offset=0", and NOT the link text "18955/989/20".

So far, I could only get them both together (code below). How can I get rid of the link text? Is there another solution? In the end, I would like to have only a list of the links, in the order in which they already appear.

from requests_html import HTMLSession
import csv

# Shared session used for every page request.
s = HTMLSession()


def get_product_links(page):
    """Return the href of every link matched by 'td.small a' on one results page.

    Args:
        page: Value substituted into the ``offset`` query parameter of the
            search URL (0 for the first page).

    Returns:
        A list of ``href`` attribute values exactly as written in the page,
        in the order the links appear.

    Raises:
        KeyError: If a matched ``<a>`` element has no ``href`` attribute.
    """
    url = f"https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={page}"
    r = s.get(url)
    # Every match of 'td.small a' is already an <a> element, so its href can be
    # read straight from ``attrs``; looking up 'a' again inside it is redundant.
    return [item.attrs['href'] for item in r.html.find('td.small a')]


# Collect the links from the first results page (offset 0).
page1 = get_product_links(0)

print(page1)

Answers:

Here is one way to get those links from the second column. You’re welcome to functionalize it if you want.

from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm ## if using Jupyter: from tqdm.notebook import tqdm
import pandas as pd

# Show every column and the full cell contents when the DataFrame is printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Send a browser-like User-Agent with every request made through the session.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

big_list = []
# Walk the paginated results: offsets 0, 10, ..., 400.
for offset in tqdm(range(0, 410, 10)):
    url = f'https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={offset}'
    r = s.get(url)
    soup = bs(r.text, 'html.parser')
    # Anchors inside the second cell of each "borda-superior" table row.
    for anchor in soup.select('tr[class="borda-superior"] td:nth-of-type(2) a'):
        title = anchor.text.strip()
        # The page gives relative hrefs; prefix the site path to make them absolute.
        link = 'https://www.tce.sp.gov.br/jurisprudencia/' + anchor.get('href')
        big_list.append((title, link))
df = pd.DataFrame(big_list, columns=['title', 'url'])
print(df)

Result in terminal:

100%
41/41 [00:30<00:00, 1.42it/s]
title   url
0   18955/989/20    https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=18955/989/20&offset=0
1   13614/989/18    https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=13614/989/18&offset=0
2   6269/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=6269/989/19&offset=0
3   14011/989/19    https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=14011/989/19&offset=0
4   14082/989/19    https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=14082/989/19&offset=0
... ... ...
399 4023/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4023/989/18&offset=390
400 4024/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4024/989/18&offset=400
401 4025/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4025/989/18&offset=400
402 4026/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4026/989/18&offset=400
403 4027/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4027/989/18&offset=400
404 rows × 2 columns

Edit: If you want only the (partial) url as a list, all you have to do is:

from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm ## if Jupyter: from tqdm.notebook import tqdm
import pandas as pd

# pandas display options; they have no effect on the plain list printed below.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Defined here so this snippet runs on its own: without it,
# ``s.headers.update(headers)`` raises NameError in a fresh interpreter.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

big_list = []
# Walk the paginated results: offsets 0, 10, ..., 400.
for offset in tqdm(range(0, 410, 10)):
    url = f'https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={offset}'
    r = s.get(url)
    # Anchors inside the second cell of each "borda-superior" table row.
    anchors = bs(r.text, 'html.parser').select('tr[class="borda-superior"] td:nth-of-type(2) a')
    # Keep only the href attribute, in page order.
    big_list.extend(anchor.get('href') for anchor in anchors)
print(big_list)

Result in terminal:

100%
41/41 [00:29<00:00, 1.83it/s]
['exibir?proc=18955/989/20&offset=0',
 'exibir?proc=13614/989/18&offset=0',
 'exibir?proc=6269/989/19&offset=0',
 'exibir?proc=14011/989/19&offset=0',
 'exibir?proc=14082/989/19&offset=0',
 'exibir?proc=14238/989/19&offset=0',
 'exibir?proc=14141/989/20&offset=0',
 'exibir?proc=15371/989/19&offset=0',
 'exibir?proc=15388/989/20&offset=0',
 'exibir?proc=12911/989/16&offset=0',
 'exibir?proc=1735/002/11&offset=10',
 'exibir?proc=23494/989/18&offset=10',
 'exibir?proc=24496/989/19&offset=10',
 'exibir?proc=17110/989/18&offset=10',
 'exibir?proc=24043/989/19&offset=10',
 'exibir?proc=2515/989/20&offset=10',
 'exibir?proc=1891/989/20&offset=10',
 'exibir?proc=15026/989/20&offset=10',
 'exibir?proc=9070/989/20&offset=10',
 'exibir?proc=21543/989/20&offset=10',
 'exibir?proc=19654/989/20&offset=20',
 'exibir?proc=19678/989/20&offset=20',
 'exibir?proc=5714/989/19&offset=20',
 'exibir?proc=20493/989/20&offset=20',
 'exibir?proc=4671/989/19&offset=20',
 'exibir?proc=14200/989/20&offset=20',
 'exibir?proc=15277/989/20&offset=20',
 'exibir?proc=1363/007/12&offset=20',
 'exibir?proc=4908/989/19&offset=20',
 'exibir?proc=15164/989/20&offset=20',
 'exibir?proc=4418/989/19&offset=30',
 'exibir?proc=4890/989/19&offset=30',
 'exibir?proc=17924/989/20&offset=30',
 'exibir?proc=4742/989/19&offset=30',
 'exibir?proc=800226/465/09&offset=30',
 'exibir?proc=23880/989/20&offset=30',
 'exibir?proc=4561/989/19&offset=30',
 'exibir?proc=4540/989/19&offset=30',
 'exibir?proc=4471/989/19&offset=30',
 'exibir?proc=4982/989/19&offset=30',
 'exibir?proc=4519/989/19&offset=40',
 'exibir?proc=4632/989/19&offset=40',
 'exibir?proc=4536/989/19&offset=40',
 'exibir?proc=4622/989/19&offset=40',
 'exibir?proc=14734/989/16&offset=40',
 'exibir?proc=4678/989/19&offset=40',
 'exibir?proc=5501/989/16&offset=40',
 'exibir?proc=13988/989/17&offset=40',
 'exibir?proc=4854/989/19&offset=40',
 'exibir?proc=4609/989/19&offset=40',
 'exibir?proc=4717/989/19&offset=50',
 'exibir?proc=4673/989/19&offset=50',
 'exibir?proc=20988/989/20&offset=50',
 'exibir?proc=4481/989/19&offset=50',
 'exibir?proc=4675/989/19&offset=50',
 'exibir?proc=4451/989/19&offset=50',
 'exibir?proc=12943/989/19&offset=50',
 'exibir?proc=23644/989/18&offset=50',
 'exibir?proc=23875/989/18&offset=50',
 'exibir?proc=4679/989/19&offset=50',
 'exibir?proc=4425/989/19&offset=60',
 'exibir?proc=2726/989/18&offset=60',
 'exibir?proc=17172/989/20&offset=60',
 'exibir?proc=2901/989/18&offset=60',
 'exibir?proc=4469/989/19&offset=60',
 'exibir?proc=299/012/13&offset=60',
 'exibir?proc=4915/989/19&offset=60',
 'exibir?proc=22649/989/20&offset=60',
 'exibir?proc=22887/989/20&offset=60',
 'exibir?proc=4721/989/19&offset=60',
 'exibir?proc=4378/989/19&offset=70',
 'exibir?proc=4935/989/19&offset=70',
 'exibir?proc=4714/989/19&offset=70',
 'exibir?proc=1230/989/21&offset=70',
 'exibir?proc=1847/989/21&offset=70',
 'exibir?proc=15606/989/21&offset=70',
 'exibir?proc=11267/989/18&offset=70',
 'exibir?proc=1232/004/12&offset=70',
 'exibir?proc=4421/989/19&offset=70',
 'exibir?proc=4931/989/19&offset=70',
 'exibir?proc=4885/989/19&offset=80',
 'exibir?proc=5002/989/19&offset=80',
 'exibir?proc=21592/989/20&offset=80',
 'exibir?proc=4839/989/19&offset=80',
 'exibir?proc=4783/989/19&offset=80',
 'exibir?proc=4599/989/19&offset=80',
 'exibir?proc=4702/989/19&offset=80',
 'exibir?proc=4617/989/19&offset=80',
 'exibir?proc=4970/989/16&offset=80',
 'exibir?proc=4492/989/19&offset=80',
 'exibir?proc=2582/989/17&offset=90',
 'exibir?proc=4993/989/19&offset=90',
 'exibir?proc=4658/989/19&offset=90',
 'exibir?proc=4606/989/19&offset=90',
 'exibir?proc=4387/989/19&offset=90',
 'exibir?proc=14549/989/19&offset=90',
 'exibir?proc=4525/989/18&offset=90',
 'exibir?proc=4713/989/19&offset=90',
 'exibir?proc=838/001/14&offset=90',
 'exibir?proc=4971/989/19&offset=90',
 'exibir?proc=17505/989/18&offset=100',
 'exibir?proc=5096/989/18&offset=100',
 'exibir?proc=4413/989/19&offset=100',
 'exibir?proc=4392/989/19&offset=100',
 'exibir?proc=15132/989/20&offset=100',
 'exibir?proc=4517/989/19&offset=100',
 'exibir?proc=4760/989/19&offset=100',
 'exibir?proc=18509/989/19&offset=100',
 'exibir?proc=4952/989/19&offset=100',
 'exibir?proc=5013/989/19&offset=100',
 'exibir?proc=12922/989/19&offset=110',
 'exibir?proc=6194/989/16&offset=110',
 'exibir?proc=19323/989/20&offset=110',
 'exibir?proc=4732/989/19&offset=110',
...]
Answered By: Barry the Platipus