Extracting data from a list of extracted URLs

Question:

I have written Python code that extracts a list of URLs. However, each URL leads to a page with further data; please guide me on how to extract that data as well.
Thanks in advance.

Below is the code:


import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page):
    """Download one listing page and return it as a parsed HTML tree.

    Args:
        page: Value substituted into the ``page_size`` query parameter of the
            listing URL.

    Returns:
        A ``BeautifulSoup`` object built from the response body using the
        ``html.parser`` backend.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'}
    url = f'https://www.legalentityidentifier.in/leicert/?page_size={page}'
    # The second positional parameter of requests.get() is ``params``, so the
    # dict has to be passed by keyword; otherwise it is appended to the query
    # string instead of being sent as HTTP headers.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):
    """Record the name and link of every entity found in *soup*.

    Looks at each ``div`` with class ``table-cell legal-name``, reads the
    text and ``href`` of its first ``<a>`` element, and appends a
    ``{'Name': ..., 'Link': ...}`` dict to the module-level ``joblist``.

    Returns:
        None; results are accumulated in ``joblist``.
    """
    for cell in soup.find_all('div', class_='table-cell legal-name'):
        anchor = cell.find('a')
        joblist.append({'Name': anchor.text, 'Link': anchor['href']})

# Accumulator that transform() appends one {'Name', 'Link'} dict to per entity.
joblist = []

c = extract(10)
transform(c)

df = pd.DataFrame(joblist)
print(df.head())
# to_csv() called without a path only returns the CSV text, which was being
# discarded; give it a file name so the table is actually written to disk.
df.to_csv('lei_list.csv', index=False)
Asked By: Kadam Jain

||

Answers:

Of course, in terms of elegance this code is admittedly ugly, but it should work — try it. It is meant to solve your whole problem: it takes all the data from each item's detail page and adds it to the corresponding joblist item.

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract(page_size=None, url=None):
    """Download a page and return it as a parsed HTML tree.

    Args:
        page_size: If truthy, the listing URL is built with this value as
            its ``page_size`` query parameter and any *url* argument is
            ignored.
        url: Address to fetch when *page_size* is not given.

    Returns:
        A ``BeautifulSoup`` object built from the response body using the
        ``html.parser`` backend.

    Raises:
        ValueError: If neither *page_size* nor *url* is supplied.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0'}
    if page_size:
        url = f'https://www.legalentityidentifier.in/leicert/?page_size={page_size}'
    if url is None:
        # Fail with a clear message instead of handing None to requests.
        raise ValueError('extract() requires either page_size or url')
    # The second positional parameter of requests.get() is ``params``, so the
    # dict has to be passed by keyword; otherwise it is appended to the query
    # string instead of being sent as HTTP headers.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup, outer_one=False):
    """Pull data out of a parsed listing page or a parsed detail page.

    Args:
        soup: Parsed page exposing ``find_all``.
        outer_one: When true, *soup* is treated as the listing page: the text
            and ``href`` of the first ``<a>`` inside every
            ``div.table-cell.legal-name`` are appended to the module-level
            ``joblist`` as ``{'Name': ..., 'Link': ...}``. When false, *soup*
            is treated as a detail page.

    Returns:
        For a detail page, a dict mapping the text of each ``title-block``
        div to the text of the ``value-block`` div paired with it inside the
        same ``details-table-row``. For the listing page, None.
    """
    if not outer_one:
        details = {}
        for row in soup.find_all('div', class_='details-table-row'):
            titles = row.find_all('div', class_='title-block')
            values = row.find_all('div', class_='value-block')
            # Titles and values are matched purely by position in the row.
            for title_cell, value_cell in zip(titles, values):
                details[title_cell.text] = value_cell.text
        return details

    for cell in soup.find_all('div', class_='table-cell legal-name'):
        anchor = cell.find('a')
        joblist.append({'Name': anchor.text, 'Link': anchor['href']})




# Accumulator that transform(..., outer_one=True) fills with one
# {'Name', 'Link'} dict per entity on the listing page.
joblist = []

c = extract(10)
transform(c, outer_one=True)


# Fetch each entity's own page and merge its title/value pairs into the
# dict that already holds the entity's name and link.
for job in joblist:
    job.update(transform(extract(url=job['Link'])))



df = pd.DataFrame(joblist)
print(df.head())
# to_csv() called without a path only returns the CSV text, which was being
# discarded; give it a file name so the table is actually written to disk.
df.to_csv('lei_details.csv', index=False)

I’ve tried it, and it works. This is what joblist contains at the end:

[{'Name': 'AXITA COTTON LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/9845004DEIU88DD76D71/', 'Legal Name': 'AXITA COTTON LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, IndiaRA000394', 'Registration Authority Entity ID': 'L17200GJ2013PLC076059', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'Public Limited CompanyDDKQ', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2019-11-17', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-20', 'Automatic Renewal Until': '2025-11-20', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'DARLING PETS PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/984500DO38CD12DAC590/', 'Legal Name': 'DARLING PETS PRIVATE LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, IndiaRA000394', 'Registration Authority Entity ID': 'U01110DL2021PTC380185', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'YSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2022-10-05', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-10-05', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'K S ENTERPRISES PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/984500D1DB7AE38F4E76/', 'Legal Name': 'K S ENTERPRISES PRIVATE LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, IndiaRA000394', 'Registration Authority Entity ID': 'U74899DL1989PTC037374', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'YSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2020-10-29', 'LEI Last Update Date': '2022-10-05', 'LEI 
Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-20', 'Automatic Renewal Until': '2026-11-20', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'SHITANSHU GARG', 'Link': 'https://www.legalentityidentifier.in/leicert/984500AF71539BCFWE72/', 'Legal Name': 'SHITANSHU GARG', 'Other Entity Names': 'SHIBU ALUMINIUM INDUSTRIES', 'Registered At': 'GST Portal (Goods and Services Tax Network (Ministry of Finance))India, IndiaRA000754', 'Registration Authority Entity ID': '06AAPPG0373F1ZW', 'Legal Jurisdiction': 'IN', 'Entity Category': 'SOLE_PROPRIETOR', 'Entity Legal Form Code': '4QIE', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2021-10-06', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-10-06', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'PARTIALLY_CORROBORATED'}, {'Name': 'MANDEEP SINGH ARORA', 'Link': 'https://www.legalentityidentifier.in/leicert/9845009EE0BE3DE3F545/', 'Legal Name': 'MANDEEP SINGH ARORA', 'Other Entity Names': 'ARORA FOOD PRODUCT', 'Registered At': 'GST Portal (Goods and Services Tax Network (Ministry of Finance))India, IndiaRA000754', 'Registration Authority Entity ID': '22AKWPA4770D1ZB', 'Legal Jurisdiction': 'IN', 'Entity Category': 'SOLE_PROPRIETOR', 'Entity Legal Form Code': 'Sole Proprietorship4QIE', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2021-10-08', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-10-08', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'PARTIALLY_CORROBORATED'}, {'Name': 'PRAVIN ORNAMENTS PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/984500D146ACC07A7F71/', 'Legal Name': 'PRAVIN ORNAMENTS PRIVATE LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, 
IndiaRA000394', 'Registration Authority Entity ID': 'U27320PN2019PTC182917', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'YSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2020-11-19', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-19', 'Automatic Renewal Until': '2025-11-19', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'AACHI SPICES AND FOODS PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/9845001C9AD6E02CE931/', 'Legal Name': 'AACHI SPICES AND FOODS PRIVATE LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, IndiaRA000394', 'Registration Authority Entity ID': 'U15400TN2010PTC074954', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'Private Limited CompanyYSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2020-11-19', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-19', 'Automatic Renewal Until': '2025-11-19', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'BIOASPIRE PROCESS SOLUTIONS PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/984500AAB804C6001W55/', 'Legal Name': 'BIOASPIRE PROCESS SOLUTIONS PRIVATE LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, IndiaRA000394', 'Registration Authority Entity ID': 'U29300MH2017PTC295107', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'Private Limited CompanyYSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2021-11-19', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-19', 'Automatic Renewal Until': '2026-11-19', 'Managing 
LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'UPGRAD EDUCATION PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/984500868E745C3A7742/', 'Legal Name': 'UPGRAD EDUCATION PRIVATE LIMITED', 'Registered At': 'Companies Register (Ministry of Corporate Affairs)India, IndiaRA000394', 'Registration Authority Entity ID': 'U80902MH2012PTC258559', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'Private Limited CompanyYSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2020-11-19', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-19', 'Automatic Renewal Until': '2024-11-19', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}, {'Name': 'JINDBA PROCESSORS PRIVATE LIMITED', 'Link': 'https://www.legalentityidentifier.in/leicert/9845006FTAEC0D4F2F75/', 'Legal Name': 'JINDBA PROCESSORS PRIVATE LIMITED', 'Registered At': 'RA000394', 'Registration Authority Entity ID': 'U17120PB2016PTC040181', 'Legal Jurisdiction': 'IN', 'Entity Category': 'GENERAL', 'Entity Legal Form Code': 'Private Limited CompanyYSP9', 'Entity Status': 'ACTIVE', 'LEI Initial Registration Date': '2019-11-19', 'LEI Last Update Date': '2022-10-05', 'LEI Registration Status': 'ISSUED', 'LEI Next Renewal Date': '2023-11-19', 'Automatic Renewal Until': '2024-11-19', 'Managing LOU': 'Ubisecure Oy (RapidLEI)529900T8BM49AURSDO55', 'Validation Sources': 'FULLY_CORROBORATED'}]
Answered By: Dmitriy Neledva
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.