Python: scrape useful information from a webpage that requires login

Question:

By referring https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d I have started scraping data from a website with login information.
My website is a bit different and I got result. But it is in a different format.
code:

from pprint import pprint
import datetime
import requests
import pandas as pd  # FIX: pandas was used below (pd.DataFrame) but never imported
from bs4 import BeautifulSoup

# Session cookies copied from the browser's logged-in session; these expire,
# so they must be refreshed from the browser dev tools when the script stops working.
cookies = {
    'CFID': '180615757',
    'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
    'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
    '_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
    '_ga': 'GA1.2.147261521.1662080801',
    '_gid': 'GA1.2.1149490171.1662080801',
    'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani+7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8+nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE+X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj+39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY+y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu+pb0mSp5n+iKotUEn9h+sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG+3Qe3zAfpdrs=',
    '__atuvc': '65%7C35%2C2%7C36',
    'COOKIESTATUS': 'ON',
    'HIDECOOKIEBANNER': 'TRUE',
    'nlbi_2388351': 'jGGxMFazFBqnU+x+okRrFAAAAAC/AJ/k+R2U+vs5Q4LIRTS7',
    'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
    'incap_ses_989_2388351': 'mWy+Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
    'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx+BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
    '__atuvs': '6314ec0cdbe92a78001',
    '_gat_gtag_UA_12825325_1': '1',
}

# Browser-like headers so the request is not rejected as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.higheredjobs.com/admin/',
    'Connection': 'keep-alive',
    # The Cookie header is supplied via the cookies= argument instead.
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

# JobCat 141 = "Academic Advising" on higheredjobs.com.
params = {
    'JobCat': '141',
    'CatName': 'Academic Advising',
}

response = requests.get('https://www.higheredjobs.com/admin/search.cfm', params=params, cookies=cookies, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')
# Each job row is a <div class="row record">; the right-hand column
# (category / posted date) is a <div class="col-sm-5 text-sm-right">.
job_title = soup.find_all('div', class_=["row record", "col-sm-5 text-sm-right"])
jobs_list = []
for i in job_title:
    # FIX: get_text(strip=True) removes the surrounding \n/\t whitespace
    # that .text leaves in (the "nn...rnttt" noise in the original output).
    name = i.get_text(strip=True)
    jobs_list.append(name)
df = pd.DataFrame({'Jobs title': jobs_list})

Present output:

df = 
Jobs title
0   \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70...
1   \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2   \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...

Expected output:

df = 
     Jobs title                     Company name                   location         Posted
0   Assistant Professor/Associate  University of Southern Indiana  Evansville, IN   09/02/22
    Professor of Engineering, 
    Pott College of Science, 
    Engineering, and Education - F22057F1
Asked By: Mainland

||

Answers:

To remove the newline and tab characters (`\n`, `\t`), you can invoke the get_text() method instead of the .text property:

name = i.get_text(strip=True)
Answered By: F.Hoque

Main issue is that you try to create your DataFrame from unstructured data, that is collected in your list.

So try to structure it first e.g. as dict, append it to your list and then create your DataFrame:

# Build one dict per job record so the DataFrame gets proper columns.
field_names = ['title', 'university', 'location', 'study', 'date']

jobs_list = []
for record in soup.select('.row.record'):
    row = dict(zip(field_names, record.stripped_strings))
    jobs_list.append(row)

pd.DataFrame(jobs_list)

Note: If you would like to change the column headers, change this list -> ['title', 'university', 'location', 'study', 'date']

Example

from bs4 import BeautifulSoup
import pandas as pd  # FIX: pd.DataFrame was called below but pandas was never imported

# Two sample job records copied from the live search results page.
html ='''
<div class="row record">
<div class="col-sm-7"><a href="details.cfm?JobCode=178085874&amp;Title=Assistant%20Professor%2FAssociate%20Professor%20of%20Engineering%2C%20Pott%20College%20of%20Science%2C%20Engineering%2C%20and%20Education%20%2D%20F22057F1">
                                            Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
                                        University of Southern Indiana <br/>
                                            Evansville, IN 
                                    </div>
<div class="col-sm-5 text-sm-right">
                                        
                                        Electrical Engineering 
                                            <br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div class="col-sm-7">
<a href="details.cfm?JobCode=178085843&amp;Title=Assistant%20Professor%20of%20Engineering%20F99507">
                                            Assistant Professor of Engineering F99507</a>
<br/>
                                        McNeese State University <br/>
                                            Lake Charles, LA 
                                    </div>
<div class="col-sm-5 text-sm-right">
                                        
                                        Electrical Engineering 
                                            <br/> Posted 09/02/22<br/>
</div>
</div>
'''
# FIX: pass an explicit parser; omitting it emits GuessedAtParserWarning and
# can produce different trees depending on which parsers are installed.
soup = BeautifulSoup(html, 'html.parser')

jobs_list = []
for i in soup.select('.row.record'):
    # stripped_strings yields each text fragment with the \n/\t noise removed,
    # in document order: title, university, location, study area, posted date.
    jobs_list.append(dict(zip(['title','university','location','study','date'],i.stripped_strings)))

print(pd.DataFrame(jobs_list))

Output

title university location study date
0 Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1 University of Southern Indiana Evansville, IN Electrical Engineering Posted 09/02/22
1 Assistant Professor of Engineering F99507 McNeese State University Lake Charles, LA Electrical Engineering Posted 09/02/22
Answered By: HedgeHog

The following is a complete example of how you can extract the jobs under ‘Academic Advising’ from that website:

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

# Present a real browser UA so the site does not block the request.
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}

# Reuse one session for all page fetches (keeps cookies and the connection alive).
session = requests.Session()
session.headers.update(headers)

collected = []
# Page through all ~1337 results, 100 jobs per request.
for start_row in tqdm(range(1, 1337, 100)):
    x = start_row
    page = session.get(f'https://www.higheredjobs.com/admin/search.cfm?JobCat=141&StartRow={x}&SortBy=4&NumJobs=100')
    page_soup = bs(page.text, 'html.parser')
    records = page_soup.select_one('div#js-results').select('div[class="row record"]')
    for record in records:
        link = record.select_one('a')
        collected.append((link.get_text(strip=True), link.get('href')))

# set() drops duplicate (title, url) pairs that appear on page boundaries.
df = pd.DataFrame(list(set(collected)), columns = ['Job', 'Url'])
print(df)

Result is a dataframe with all those jobs (1337):

Job Url
0   Director, Usha Kundu, MD College of Health Adv...   details.cfm?JobCode=178071028&Title=Director%2...
1   Academic Advisor, College of Natural, Behavior...   details.cfm?JobCode=178061977&Title=Academic%2...
2   Part-Time Academic Advisor for EAP & Foreign L...   details.cfm?JobCode=177870235&Title=Part%2DTim...
3   Student Service Assistant ll (Temp) details.cfm?JobCode=178044985&Title=Student%20...
4   On-Call Academic Advisor (Applicant Pool)   details.cfm?JobCode=177522145&Title=On%2DCall%...
... ... ...
1332    Part-Time Academic Support Coach    details.cfm?JobCode=178060131&Title=Part%2DTim...
1333    Academic Advisor    details.cfm?JobCode=178005430&Title=Academic%2...
1334    Retention Coordinator/Academic Advisor  details.cfm?JobCode=178077784&Title=Retention%...
1335    P220178 - Academic Advisor, School of Public H...   details.cfm?JobCode=177930648&Title=P220178%20...
1336    Director of Academic Advising - Georgetown Uni...   details.cfm?JobCode=178021588&Title=Director%2...
Answered By: platipus_on_fire