How do I ping a list of URLs (around 80k) using Python
Question:
How do i ping list of urls (around 80k) using python. The url is given in the format "https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"
. So i need to remove the numbers after comma from url(,”99000002″) and ping the rest to url to find which one of them shows 404 error code.I was able to remove the the last character using rsplit library.
# One raw CSV row: the quoted URL followed by a trailing id column.
df = '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"'
# Split once from the right on the comma and keep only the URL portion.
url_part, _ = df.rsplit(',', 1)
print(url_part)
I have the URLs in a CSV file, but how do I ping such a huge list of URLs?
update
I tried a solution, but after some time I get an error.
MY code:
import csv
import requests

# Windows paths as raw strings so the backslashes are not read as escapes.
INPUT_PATH = r'C:\Users\kanchan.jha\Desktop\pm\performer_metros.csv'
OUTPUT_PATH = r'C:\Users\kanchan.jha\Desktop\pm\pm_quotes.csv'

# Read every row, request its URL, and record the rows that answer 404.
with open(INPUT_PATH, 'r', newline='') as csvfile, \
        open(OUTPUT_PATH, 'w', newline='') as outfile:
    reader = csv.reader(csvfile)
    output = csv.writer(outfile)
    for row in reader:
        if not row:
            continue  # skip blank lines in the input file
        # csv.reader already strips the surrounding quotes, so the URL is
        # simply the first column -- no string surgery on str(row) needed.
        url = row[0]
        try:
            # Timeout is essential for 80k rows: one dead host must not
            # stall the whole run.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            # Unreachable host / connection error: skip this URL instead
            # of aborting the loop (urllib2 raised here and killed the run).
            continue
        # requests does not raise on HTTP error statuses, so a 404 is just
        # a status code to inspect -- no need to grep the body for '404'.
        if response.status_code == 404:
            output.writerow(row)
The code runs for a while and then I get an error (pasted below). An output file is created, but it contains only 10–15 URLs with a 404 error. It seems only a few URLs are checked for errors, not all of them.
Traceback (most recent call last):
File "c:Userskanchan.jhaDesktopfile.py", line 27, in <module>
content = urllib2.urlopen(urls, timeout =1000).read()
File "C:Python27liburllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:Python27liburllib2.py", line 435, in open
response = meth(req, response)
File "C:Python27liburllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "C:Python27liburllib2.py", line 473, in error
return self._call_chain(*args)
File "C:Python27liburllib2.py", line 407, in _call_chain
result = func(*args)
File "C:Python27liburllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: Not Found
Answers:
You can ping a url using Python Requests.
import requests

url = "https://stackoverflow.com/questions/49634166/how-do-i-have-a-list-of-url-around-80k-using-python"
# requests.get() returns a Response object; its status_code attribute
# holds the HTTP status (200, 404, ...).
response = requests.get(url)
# print is a function in Python 3 -- the bare `print x` statement form
# is Python-2-only syntax.
print(response.status_code)
# 200
Once you have your urls, you can easily iterate through the list and send a get request, saving or printing the result per URL as per your requirements. Not sure if it’s going to work seamlessly with such a big list though, and also please note we are assuming that every URL will be available with no authentication and that every URL will be valid, which I am not sure it is the case.
You can use requests library and ping all the URLs one by one and collect the data on which one returned a 404. You can probably keep writing this data to disk instead of keeping it in memory if you want to preserve it.
import requests

# raw_string_urls is your list of 80k urls with string attached
raw_string_urls = ['"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"', '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"', '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"', '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"']

# URLs that answered with HTTP 404 are collected here.
not_found_urls = list()

# Iterate here on the raw_string_urls
for raw_string_url in raw_string_urls:
    # Keep the part before the comma and drop the surrounding quotes.
    url = raw_string_url.split(',')[0].strip('"')
    try:
        # A timeout is essential for 80k requests: without one, a single
        # unresponsive host blocks the loop indefinitely.
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        continue  # unreachable host: not a confirmed 404, move on
    print(url)
    print(r.status_code)
    if r.status_code == 404:
        not_found_urls.append(url)
You can then dump the not_found_urls
list as a JSON file or whatever you want.
This is a snippet of infrastructure code that pings the URLs using multi-threading.
It is a simple worker-queue model: there is a queue of tasks, and every worker (thread) spawned listens to this queue and takes tasks from it.
By using multiple threads you can process 80k requests in a reasonable time.
import threading, Queue, requests

# Shared work queue: the main thread fills it with URLs and the worker
# threads drain it. (`Queue` is the Python 2 module name; on Python 3
# this module is called `queue`.)
pool = Queue.Queue()
# Number of concurrent worker threads to spawn.
num_worker_threads = 10
def ping(url, timeout=10):
    """Fetch *url* and report whether it answered with HTTP 200.

    Returns True for a 200 response, False for any other status code.
    The timeout (new keyword, default 10s) keeps one dead host from
    blocking a worker thread forever.
    """
    response = requests.get(url, timeout=timeout)
    return response.status_code == 200
def worker():
    """Worker-thread loop: pull URLs from the shared pool queue forever."""
    while True:
        url = pool.get()
        try:
            response = ping(url)
            # check if response is ok and do stuff (printing to log or smt)
        except Exception:
            # Best-effort by design: a failing URL must not kill the
            # worker thread.
            pass
        finally:
            # Always mark the task done, even on failure -- otherwise
            # pool.join() in the main thread would block forever.
            pool.task_done()
# Spawn the worker pool.
for i in range(num_worker_threads):
    t = threading.Thread(target=worker, args=())
    # Daemon threads do not block interpreter shutdown, so the infinite
    # worker loops die with the main thread.
    t.setDaemon(True)
    t.start()

urls = [...] #list of urls to check
# Enqueue every URL; workers start processing immediately.
for url in urls:
    pool.put(url)
# Block until every queued URL has been marked done by a worker.
pool.join()
How do I ping a list of URLs (around 80k) using Python? Each URL is given in the format `"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"`, so I need to remove the number after the comma (`,"99000002"`) and ping the remaining URL to find which ones return a 404 error code. I was able to remove the last field using the `rsplit` method.
# One raw CSV row: the quoted URL followed by a trailing id column.
df = '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"'
# Split once from the right on the comma and keep only the URL portion.
url_part, _ = df.rsplit(',', 1)
print(url_part)
I have the URLs in a CSV file, but how do I ping such a huge list of URLs?
update
I tried a solution, but after some time I get an error.
MY code:
import csv
import requests

# Windows paths as raw strings so the backslashes are not read as escapes.
INPUT_PATH = r'C:\Users\kanchan.jha\Desktop\pm\performer_metros.csv'
OUTPUT_PATH = r'C:\Users\kanchan.jha\Desktop\pm\pm_quotes.csv'

# Read every row, request its URL, and record the rows that answer 404.
with open(INPUT_PATH, 'r', newline='') as csvfile, \
        open(OUTPUT_PATH, 'w', newline='') as outfile:
    reader = csv.reader(csvfile)
    output = csv.writer(outfile)
    for row in reader:
        if not row:
            continue  # skip blank lines in the input file
        # csv.reader already strips the surrounding quotes, so the URL is
        # simply the first column -- no string surgery on str(row) needed.
        url = row[0]
        try:
            # Timeout is essential for 80k rows: one dead host must not
            # stall the whole run.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            # Unreachable host / connection error: skip this URL instead
            # of aborting the loop (urllib2 raised here and killed the run).
            continue
        # requests does not raise on HTTP error statuses, so a 404 is just
        # a status code to inspect -- no need to grep the body for '404'.
        if response.status_code == 404:
            output.writerow(row)
The code runs for a while and then I get an error (pasted below). An output file is created, but it contains only 10–15 URLs with a 404 error. It seems only a few URLs are checked for errors, not all of them.
Traceback (most recent call last):
File "c:Userskanchan.jhaDesktopfile.py", line 27, in <module>
content = urllib2.urlopen(urls, timeout =1000).read()
File "C:Python27liburllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:Python27liburllib2.py", line 435, in open
response = meth(req, response)
File "C:Python27liburllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "C:Python27liburllib2.py", line 473, in error
return self._call_chain(*args)
File "C:Python27liburllib2.py", line 407, in _call_chain
result = func(*args)
File "C:Python27liburllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: Not Found
You can ping a url using Python Requests.
import requests

url = "https://stackoverflow.com/questions/49634166/how-do-i-have-a-list-of-url-around-80k-using-python"
# requests.get() returns a Response object; its status_code attribute
# holds the HTTP status (200, 404, ...).
response = requests.get(url)
# print is a function in Python 3 -- the bare `print x` statement form
# is Python-2-only syntax.
print(response.status_code)
# 200
Once you have your urls, you can easily iterate through the list and send a get request, saving or printing the result per URL as per your requirements. Not sure if it’s going to work seamlessly with such a big list though, and also please note we are assuming that every URL will be available with no authentication and that every URL will be valid, which I am not sure it is the case.
You can use requests library and ping all the URLs one by one and collect the data on which one returned a 404. You can probably keep writing this data to disk instead of keeping it in memory if you want to preserve it.
import requests

# raw_string_urls is your list of 80k urls with string attached
raw_string_urls = ['"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"', '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"', '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"', '"https://www.test.com/en/Doors-Down/Buffalo/pm/99000002/3991","99000002"']

# URLs that answered with HTTP 404 are collected here.
not_found_urls = list()

# Iterate here on the raw_string_urls
for raw_string_url in raw_string_urls:
    # Keep the part before the comma and drop the surrounding quotes.
    url = raw_string_url.split(',')[0].strip('"')
    try:
        # A timeout is essential for 80k requests: without one, a single
        # unresponsive host blocks the loop indefinitely.
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        continue  # unreachable host: not a confirmed 404, move on
    print(url)
    print(r.status_code)
    if r.status_code == 404:
        not_found_urls.append(url)
You can then dump the not_found_urls
list as a JSON file or whatever you want.
This is a snippet of infrastructure code that pings the URLs using multi-threading.
It is a simple worker-queue model: there is a queue of tasks, and every worker (thread) spawned listens to this queue and takes tasks from it.
By using multiple threads you can process 80k requests in a reasonable time.
import threading, Queue, requests

# Shared work queue: the main thread fills it with URLs and the worker
# threads drain it. (`Queue` is the Python 2 module name; on Python 3
# this module is called `queue`.)
pool = Queue.Queue()
# Number of concurrent worker threads to spawn.
num_worker_threads = 10
def ping(url, timeout=10):
    """Fetch *url* and report whether it answered with HTTP 200.

    Returns True for a 200 response, False for any other status code.
    The timeout (new keyword, default 10s) keeps one dead host from
    blocking a worker thread forever.
    """
    response = requests.get(url, timeout=timeout)
    return response.status_code == 200
def worker():
    """Worker-thread loop: pull URLs from the shared pool queue forever."""
    while True:
        url = pool.get()
        try:
            response = ping(url)
            # check if response is ok and do stuff (printing to log or smt)
        except Exception:
            # Best-effort by design: a failing URL must not kill the
            # worker thread.
            pass
        finally:
            # Always mark the task done, even on failure -- otherwise
            # pool.join() in the main thread would block forever.
            pool.task_done()
# Spawn the worker pool.
for i in range(num_worker_threads):
    t = threading.Thread(target=worker, args=())
    # Daemon threads do not block interpreter shutdown, so the infinite
    # worker loops die with the main thread.
    t.setDaemon(True)
    t.start()

urls = [...] #list of urls to check
# Enqueue every URL; workers start processing immediately.
for url in urls:
    pool.put(url)
# Block until every queued URL has been marked done by a worker.
pool.join()