How to programmatically get the link to a CSV behind a JavaScript page?
Question:
I’m using Python and I’m trying to get the link that the CSV comes from when I click on the DATA V CSV button at the bottom of this page.
I tried beautifulsoup:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ceps.cz/en/all-data#AktualniSystemovaOdchylkaCR'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the link to the CSV file.
# NOTE: the "DATA V CSV" control is injected by JavaScript after page load,
# so it is not present in the static HTML that requests downloads. In that
# case soup.find() returns None, and calling .get('href') on it raises
# AttributeError — guard against the missing element instead of crashing.
csv_anchor = soup.find('a', string='DATA V CSV')
csv_link = csv_anchor.get('href') if csv_anchor is not None else None
if csv_link is None:
    print('No "DATA V CSV" link found in the static HTML.')
I also tried:
# Second attempt: look the element up as a <button> by id. This also fails,
# because "DATA V CSV" is the button's visible label, not its id attribute,
# and the element is added by JavaScript after page load anyway.
soup.find("button", {"id":"DATA V CSV"})
but it doesn’t find the link behind DATA V CSV.
Answers:
The site dynamically generates the link, so you can’t pull it out with requests and BeautifulSoup alone. Using the browser debugger, you can easily find it manually by clicking on the button and looking at which calls are being made.
import requests

# Endpoint discovered via the browser's network debugger: clicking the
# "DATA V CSV" button fires a request to this URL, which returns the CSV
# payload directly.
url = 'https://www.ceps.cz/download-data/?format=csv'
print(requests.get(url).text)
--------------------------
Data version;From;To;Agregation function;Agregation;Type of power plant;
real data;28.03.2023 09:00:00;28.03.2023 09:59:59;agregation average;minute;ALL;
Date;WPP [MW];PVPP [MW];
28.03.2023 09:00;189.8;342.3;
28.03.2023 09:01;194.1;342.3;
28.03.2023 09:02;200.1;354.4;
28.03.2023 09:03;200.1;366.3;
28.03.2023 09:04;200.1;383.4;
28.03.2023 09:05;197.5;387.4;
28.03.2023 09:06;195.4;391.4;
28.03.2023 09:07;198.2;398.3;
28.03.2023 09:08;198.2;388.9;
28.03.2023 09:09;199.6;397.2;
28.03.2023 09:10;205.1;400.8;
28.03.2023 09:11;205.1;410.8;
28.03.2023 09:12;203.1;413.1;
28.03.2023 09:13;198.6;413.1;
28.03.2023 09:14;198.6;413.1;
In order to get all the data you need to fully mimic the requests that are sent to the server.
Here’s how to do it:
from shutil import copyfileobj
from urllib.parse import urlencode

import requests

# Browser-like headers; the cookie values were copied from the browser's
# debugger and may expire — refresh them if the request starts failing.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "referer": "https://www.ceps.cz/en/all-data",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "cookie": "nette-samesite=1; ARRAffinity=3ee2404f26d0149d946e50cb3d4c22661f9f3b6510837fa538c67990a81979de; ARRAffinitySameSite=3ee2404f26d0149d946e50cb3d4c22661f9f3b6510837fa538c67990a81979de"
}

# Query parameters reverse-engineered from the XHR the page fires when the
# "DATA V CSV" button is clicked.
payload = {
    "do": "loadGraphData",
    "method": "AktualniSystemovaOdchylkaCR",
    "graph_id": "1026",
    "move_graph": "day",
    "download": "csv",
    "date_to": "2023-03-28T23:59:59",
    "date_from": "2023-03-28T00:00:00",
    "agregation": "MI",
    "date_type": "day",
    "interval": "false",
    "version": "bla",
    "function": "AVG",
}

all_data = "https://www.ceps.cz/en/all-data"
download_url = "https://www.ceps.cz/download-data/?format=csv"

with requests.Session() as s:
    # The data-preparation call is an AJAX request, so it must carry the
    # x-requested-with header; the follow-up CSV download must NOT.
    headers.update({"x-requested-with": "XMLHttpRequest"})
    r = s.get(f"{all_data}?{urlencode(payload)}", headers=headers)
    r.raise_for_status()  # fail loudly instead of trying to parse an error page
    print(r.json()["result"])
    headers.pop("x-requested-with")
    # BUG FIX: the original split the two context managers across lines with
    # a trailing comma but no parentheses or backslash, which is a
    # SyntaxError. Put both managers on one line (parenthesized multi-line
    # `with` only works on Python 3.10+).
    with s.get(download_url, headers=headers, stream=True) as r, open("data.csv", "wb") as f:
        copyfileobj(r.raw, f)
You should get a semicolon-separated file that looks like this:
I’m using Python and I’m trying to get the link that the CSV comes from when I click on the DATA V CSV button at the bottom of this page.
I tried beautifulsoup:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ceps.cz/en/all-data#AktualniSystemovaOdchylkaCR'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find the link to the CSV file.
# NOTE: the "DATA V CSV" control is injected by JavaScript after page load,
# so it is not present in the static HTML that requests downloads. In that
# case soup.find() returns None, and calling .get('href') on it raises
# AttributeError — guard against the missing element instead of crashing.
csv_anchor = soup.find('a', string='DATA V CSV')
csv_link = csv_anchor.get('href') if csv_anchor is not None else None
if csv_link is None:
    print('No "DATA V CSV" link found in the static HTML.')
I also tried:
# Second attempt: look the element up as a <button> by id. This also fails,
# because "DATA V CSV" is the button's visible label, not its id attribute,
# and the element is added by JavaScript after page load anyway.
soup.find("button", {"id":"DATA V CSV"})
but it doesn’t find the link behind DATA V CSV.
The site dynamically generates the link, so you can’t pull it out with requests and BeautifulSoup alone. Using the browser debugger, you can easily find it manually by clicking on the button and looking at which calls are being made.
import requests

# Endpoint discovered via the browser's network debugger: clicking the
# "DATA V CSV" button fires a request to this URL, which returns the CSV
# payload directly.
url = 'https://www.ceps.cz/download-data/?format=csv'
print(requests.get(url).text)
--------------------------
Data version;From;To;Agregation function;Agregation;Type of power plant;
real data;28.03.2023 09:00:00;28.03.2023 09:59:59;agregation average;minute;ALL;
Date;WPP [MW];PVPP [MW];
28.03.2023 09:00;189.8;342.3;
28.03.2023 09:01;194.1;342.3;
28.03.2023 09:02;200.1;354.4;
28.03.2023 09:03;200.1;366.3;
28.03.2023 09:04;200.1;383.4;
28.03.2023 09:05;197.5;387.4;
28.03.2023 09:06;195.4;391.4;
28.03.2023 09:07;198.2;398.3;
28.03.2023 09:08;198.2;388.9;
28.03.2023 09:09;199.6;397.2;
28.03.2023 09:10;205.1;400.8;
28.03.2023 09:11;205.1;410.8;
28.03.2023 09:12;203.1;413.1;
28.03.2023 09:13;198.6;413.1;
28.03.2023 09:14;198.6;413.1;
In order to get all the data you need to fully mimic the requests that are sent to the server.
Here’s how to do it:
from shutil import copyfileobj
from urllib.parse import urlencode

import requests

# Browser-like headers; the cookie values were copied from the browser's
# debugger and may expire — refresh them if the request starts failing.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "referer": "https://www.ceps.cz/en/all-data",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "cookie": "nette-samesite=1; ARRAffinity=3ee2404f26d0149d946e50cb3d4c22661f9f3b6510837fa538c67990a81979de; ARRAffinitySameSite=3ee2404f26d0149d946e50cb3d4c22661f9f3b6510837fa538c67990a81979de"
}

# Query parameters reverse-engineered from the XHR the page fires when the
# "DATA V CSV" button is clicked.
payload = {
    "do": "loadGraphData",
    "method": "AktualniSystemovaOdchylkaCR",
    "graph_id": "1026",
    "move_graph": "day",
    "download": "csv",
    "date_to": "2023-03-28T23:59:59",
    "date_from": "2023-03-28T00:00:00",
    "agregation": "MI",
    "date_type": "day",
    "interval": "false",
    "version": "bla",
    "function": "AVG",
}

all_data = "https://www.ceps.cz/en/all-data"
download_url = "https://www.ceps.cz/download-data/?format=csv"

with requests.Session() as s:
    # The data-preparation call is an AJAX request, so it must carry the
    # x-requested-with header; the follow-up CSV download must NOT.
    headers.update({"x-requested-with": "XMLHttpRequest"})
    r = s.get(f"{all_data}?{urlencode(payload)}", headers=headers)
    r.raise_for_status()  # fail loudly instead of trying to parse an error page
    print(r.json()["result"])
    headers.pop("x-requested-with")
    # BUG FIX: the original split the two context managers across lines with
    # a trailing comma but no parentheses or backslash, which is a
    # SyntaxError. Put both managers on one line (parenthesized multi-line
    # `with` only works on Python 3.10+).
    with s.get(download_url, headers=headers, stream=True) as r, open("data.csv", "wb") as f:
        copyfileobj(r.raw, f)
You should get a semicolon-separated file that looks like this: