Parsing a CSV file from a button press using Python
Question:
I have the following url https://pubmed.ncbi.nlm.nih.gov/?term=IBD which I want to parse data from
(I found nothing against scraping in their terms), the site is public, there’s an ‘export’ button to download some historic data in csv, I want to automate downloading the file content with python.
I tried many options in the past day, this is my recent one
def parse_history():
    """Scrape the PubMed search page and POST its export form to download
    the per-year publication counts as CSV.

    NOTE(review): this is the failing attempt from the question.  The POST
    is issued via a bare ``requests.post`` with no cookies from the initial
    GET, so the submitted CSRF token is not tied to any session — presumably
    the root cause of the 403 "invalid security token" response; confirm
    against the server's CSRF scheme.
    """
    url = "https://pubmed.ncbi.nlm.nih.gov/?term=IBD"
    web_page = requests.get(url)
    soup = BeautifulSoup(web_page.content, "html.parser")
    # The sidebar "export by year" download button lives in this form.
    form = soup.find('form', id='side-export-search-by-year-form')
    download_url = form.get('action')
    form_data = {}
    # Replay every <input> field the form would submit.
    for input_field in form.find_all('input'):
        form_data[input_field.get('name')] = input_field.get('value')
    # get_csrf_token is defined elsewhere in the asker's code (not shown).
    csrf_token = get_csrf_token(url)
    form_data["csrfmiddlewaretoken"] = csrf_token
    response = requests.post(f"{url}{download_url}", data=form_data)
    # Save the downloaded file
    with open('history.csv', 'wb') as f:
        f.write(response.content)
and I get an HTTP 403 error with an "invalid security token" message in the returned HTML.
Any ideas? I prefer not to use selenium.
Answers:
import requests
from bs4 import BeautifulSoup
def main():
    """Download PubMed's per-year result counts for the query ``IBD``.

    Fetches the search page, collects the hidden inputs of the sidebar
    "export search by year" form (CSRF token included), and replays the
    form POST with browser-like headers, saving the response body to
    ``history.csv``.

    Raises:
        requests.HTTPError: if either the page fetch or the export POST
            returns an error status.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    url = f"{base_url}/?term=IBD"
    with requests.Session() as s:
        # BUG FIX: the original called requests.get / requests.post at module
        # level, so the Session `s` (and its cookie jar) was never actually
        # used.  Routing both requests through the session keeps the CSRF
        # cookie and the posted token consistent.
        web_page = s.get(url, timeout=5)
        web_page.raise_for_status()
        soup = BeautifulSoup(web_page.content, "html.parser")
        form = soup.find("form", id="side-export-search-by-year-form")
        action_url = form.get("action")
        download_url = f"{base_url}{action_url}"
        # Replay every hidden input of the export form (CSRF token included).
        form_data = {}
        for input_field in form.find_all("input"):
            form_data[input_field.get("name")] = input_field.get("value")
        form_data["term"] = "IBD"
        cookies = web_page.cookies
        cookies.update({"pm-iosp": ""})
        # Browser-like headers: the server rejects bare python-requests posts.
        headers = {"Content-Type": "application/x-www-form-urlencoded",
                   "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                   "Accept-Encoding": "gzip, deflate, br",
                   "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8",
                   "origin": "https://pubmed.ncbi.nlm.nih.gov",
                   "referer": "https://pubmed.ncbi.nlm.nih.gov/?term=IBD",
                   "sec-ch-ua": "'Chromium';v='110', 'Not A(Brand';v='24', 'Microsoft Edge';v='110'",
                   "sec-fetch-dest": "document",
                   "sec-fetch-mode": "navigate",
                   "sec-fetch-site": "same-origin",
                   "sec-fetch-user": "?1",
                   "sec-gpc": "1",
                   "upgrade-insecure-requests": "1",
                   "dnt": "1",
                   "Cache-Control": "max-age=0"}
        response = s.post(url=download_url, data=form_data, cookies=cookies, headers=headers, timeout=5)
        # Fail loudly instead of silently writing an HTML error page as CSV.
        response.raise_for_status()
        with open("history.csv", "wb") as f:
            f.write(response.content)


if __name__ == '__main__':
    main()
import subprocess
import re
BASE_NCBI_URL = "https://pubmed.ncbi.nlm.nih.gov"
# Matches the inline-JS assignment  yearCounts: "[...]"  embedded in the
# PubMed search page; group 1 captures the bracketed payload.  FIX: the
# original pattern had lost its backslashes (`r'yearCounts:s*"[(.*?)]"'`)
# and defined no capture group, so `.group(1)` could never succeed.
YEAR_COUNT_DATA_REGEX = r'yearCounts:\s*"\[(.*?)\]"'


def get_term_history_amount_stat(term: str):
    """Return the per-year publication counts PubMed embeds in its search
    page for *term*.

    Shells out to ``curl`` (which, unlike plain ``requests`` here, is not
    rejected with 403) and extracts the ``yearCounts`` blob from the HTML.

    Raises:
        subprocess.CalledProcessError: if curl exits non-zero.
        Exception: if the yearCounts blob cannot be found in the page.
    """
    import ast

    url = f"{BASE_NCBI_URL}/?term={term}"
    # argv list, shell=False: `term` never passes through a shell.
    output = subprocess.check_output(['curl', url])
    # Reuse the module-level constant instead of redefining it locally.
    year_counts_match = re.search(YEAR_COUNT_DATA_REGEX, output.decode(), re.DOTALL)
    if year_counts_match:
        # SECURITY FIX: literal_eval instead of eval — never execute
        # scraped page content as Python code.
        return ast.literal_eval(year_counts_match.group(1))
    raise Exception(f"Cant parse historical data for term {term}")
Implemented it with subprocess and curl; for some reason curl does not get any permission errors. It's just less convenient to parse, but it works.
I have the following url https://pubmed.ncbi.nlm.nih.gov/?term=IBD which I want to parse data from
(I found nothing against scraping in their terms), the site is public, there’s an ‘export’ button to download some historic data in csv, I want to automate downloading the file content with python.
I tried many options in the past day, this is my recent one
def parse_history():
    """Scrape the PubMed search page and POST its export form to download
    the per-year publication counts as CSV.

    NOTE(review): this is the failing attempt from the question.  The POST
    is issued via a bare ``requests.post`` with no cookies from the initial
    GET, so the submitted CSRF token is not tied to any session — presumably
    the root cause of the 403 "invalid security token" response; confirm
    against the server's CSRF scheme.
    """
    url = "https://pubmed.ncbi.nlm.nih.gov/?term=IBD"
    web_page = requests.get(url)
    soup = BeautifulSoup(web_page.content, "html.parser")
    # The sidebar "export by year" download button lives in this form.
    form = soup.find('form', id='side-export-search-by-year-form')
    download_url = form.get('action')
    form_data = {}
    # Replay every <input> field the form would submit.
    for input_field in form.find_all('input'):
        form_data[input_field.get('name')] = input_field.get('value')
    # get_csrf_token is defined elsewhere in the asker's code (not shown).
    csrf_token = get_csrf_token(url)
    form_data["csrfmiddlewaretoken"] = csrf_token
    response = requests.post(f"{url}{download_url}", data=form_data)
    # Save the downloaded file
    with open('history.csv', 'wb') as f:
        f.write(response.content)
and I get an HTTP 403 error with an "invalid security token" message in the returned HTML.
Any ideas? I prefer not to use selenium.
import requests
from bs4 import BeautifulSoup
def main():
    """Download PubMed's per-year result counts for the query ``IBD``.

    Fetches the search page, collects the hidden inputs of the sidebar
    "export search by year" form (CSRF token included), and replays the
    form POST with browser-like headers, saving the response body to
    ``history.csv``.

    Raises:
        requests.HTTPError: if either the page fetch or the export POST
            returns an error status.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    url = f"{base_url}/?term=IBD"
    with requests.Session() as s:
        # BUG FIX: the original called requests.get / requests.post at module
        # level, so the Session `s` (and its cookie jar) was never actually
        # used.  Routing both requests through the session keeps the CSRF
        # cookie and the posted token consistent.
        web_page = s.get(url, timeout=5)
        web_page.raise_for_status()
        soup = BeautifulSoup(web_page.content, "html.parser")
        form = soup.find("form", id="side-export-search-by-year-form")
        action_url = form.get("action")
        download_url = f"{base_url}{action_url}"
        # Replay every hidden input of the export form (CSRF token included).
        form_data = {}
        for input_field in form.find_all("input"):
            form_data[input_field.get("name")] = input_field.get("value")
        form_data["term"] = "IBD"
        cookies = web_page.cookies
        cookies.update({"pm-iosp": ""})
        # Browser-like headers: the server rejects bare python-requests posts.
        headers = {"Content-Type": "application/x-www-form-urlencoded",
                   "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63",
                   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                   "Accept-Encoding": "gzip, deflate, br",
                   "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8",
                   "origin": "https://pubmed.ncbi.nlm.nih.gov",
                   "referer": "https://pubmed.ncbi.nlm.nih.gov/?term=IBD",
                   "sec-ch-ua": "'Chromium';v='110', 'Not A(Brand';v='24', 'Microsoft Edge';v='110'",
                   "sec-fetch-dest": "document",
                   "sec-fetch-mode": "navigate",
                   "sec-fetch-site": "same-origin",
                   "sec-fetch-user": "?1",
                   "sec-gpc": "1",
                   "upgrade-insecure-requests": "1",
                   "dnt": "1",
                   "Cache-Control": "max-age=0"}
        response = s.post(url=download_url, data=form_data, cookies=cookies, headers=headers, timeout=5)
        # Fail loudly instead of silently writing an HTML error page as CSV.
        response.raise_for_status()
        with open("history.csv", "wb") as f:
            f.write(response.content)


if __name__ == '__main__':
    main()
import subprocess
import re
BASE_NCBI_URL = "https://pubmed.ncbi.nlm.nih.gov"
# Matches the inline-JS assignment  yearCounts: "[...]"  embedded in the
# PubMed search page; group 1 captures the bracketed payload.  FIX: the
# original pattern had lost its backslashes (`r'yearCounts:s*"[(.*?)]"'`)
# and defined no capture group, so `.group(1)` could never succeed.
YEAR_COUNT_DATA_REGEX = r'yearCounts:\s*"\[(.*?)\]"'


def get_term_history_amount_stat(term: str):
    """Return the per-year publication counts PubMed embeds in its search
    page for *term*.

    Shells out to ``curl`` (which, unlike plain ``requests`` here, is not
    rejected with 403) and extracts the ``yearCounts`` blob from the HTML.

    Raises:
        subprocess.CalledProcessError: if curl exits non-zero.
        Exception: if the yearCounts blob cannot be found in the page.
    """
    import ast

    url = f"{BASE_NCBI_URL}/?term={term}"
    # argv list, shell=False: `term` never passes through a shell.
    output = subprocess.check_output(['curl', url])
    # Reuse the module-level constant instead of redefining it locally.
    year_counts_match = re.search(YEAR_COUNT_DATA_REGEX, output.decode(), re.DOTALL)
    if year_counts_match:
        # SECURITY FIX: literal_eval instead of eval — never execute
        # scraped page content as Python code.
        return ast.literal_eval(year_counts_match.group(1))
    raise Exception(f"Cant parse historical data for term {term}")
Implemented it with subprocess and curl; for some reason curl does not get any permission errors. It's just less convenient to parse, but it works.