How to speed up extracting data from raster file for points using GDAL python
Question:
I have 366 raster image files (MODIS satellite daily data) in TIF format containing snow data, and a CSV file containing 19,000 locations (latitudes and longitudes). I need to collect the snow data from the raster files for these locations. I have tried collecting the data using the GDAL Python library. However, the program takes approximately 30 minutes to collect the data from each single file, which means I would have to run the code for around 180 hours. Following is the code I am using. Please suggest any way I can improve the speed at which the program executes, or any better way to implement the same thing.
import gdal
import pandas
import numpy as np
import os,subprocess
def runCmdAndGetOutput(cmd):
    """Run *cmd* and return the first line of its standard output as a float.

    Args:
        cmd: Argument list passed unchanged to ``subprocess.Popen``.

    Returns:
        The first output line, stripped of trailing whitespace, decoded as
        UTF-8 and converted with ``float``.

    Raises:
        ValueError: If the command printed nothing, or if its first output
            line is not a valid float literal.
    """
    # Using Popen as a context manager closes the stdout pipe and waits for
    # the child on exit, so no process or file handle is leaked even when
    # this function is called once per point.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
        # the real code does filtering here
        outList = [line.rstrip() for line in proc.stdout]
    print(outList)
    if not outList:
        raise ValueError("command produced no output: {!r}".format(cmd))
    # value = float(outList[2].decode("utf-8").replace("<Value>","").replace("</Value>",""))
    return float(outList[0].decode("utf-8"))
# Driver script: for every raster file in ndsFileLoc, query the pixel value at
# every location listed in the CSV (one gdallocationinfo call per point) and
# store the results in outDf, which is finally written to test.csv.
locs = "hkkhlocations.csv"
# NOTE(review): this path contains no separators after "D:" -- they look lost
# in transcription; confirm the real directory before running.
ndsFileLoc = r"D:SrinivasaRao_DocsMODIS_NDSI_V6_20165000000499560out"

latLnginfo = pandas.read_csv(locs)
print(latLnginfo.columns)
print(latLnginfo.shape)

# One row per location and 370 columns, pre-filled with NaN so that cells
# that are never written remain recognisable as missing.
outDf = pandas.DataFrame(np.full([len(latLnginfo), 370], np.nan))

# List the directory once and reuse the result instead of re-reading it for
# each print.
datasetsList = os.listdir(ndsFileLoc)
print(datasetsList)
print(type(datasetsList))

for eFile in datasetsList:
    # Characters 4..6 of the file name are parsed as the output column index
    # (for a name such as "2016001.tif" this yields 1).
    cCount = int(eFile[4:7])
    rasterPath = os.path.join(ndsFileLoc, eFile)
    # Iterate over the rows of latLnginfo itself; the row index can therefore
    # never run past the end of the DataFrame.
    for rCount in range(len(latLnginfo)):
        # Column 4 is passed as the first coordinate and column 3 as the
        # second -- presumably longitude then latitude; verify against the CSV.
        cmdToRun = [
            "gdallocationinfo", "-valonly", "-wgs84", rasterPath,
            str(latLnginfo.iloc[rCount, 4]), str(latLnginfo.iloc[rCount, 3]),
        ]
        v = runCmdAndGetOutput(cmdToRun)
        outDf.iloc[rCount, cCount] = float(v)
        print("rowno: ", rCount + 1, "Dayno :", cCount, "SCF value: ", v)

outDf.to_csv('test.csv')
'''
Answers:
import multiprocessing


def run_cmd_processor(efile):
    """Process a single raster file; meant to hold the body of the original
    ``for eFile in datasetsList:`` loop.

    Args:
        efile: File name of one raster; characters 4..6 are parsed as an
            integer column index.
    """
    r_count = 0
    c_count = int(efile[4:7])
    with open("output.csv") as f:
        for line in f:
            loc_data = line.split(",")
            # ~ (the rest of the original inner-loop body goes here)


# The guard keeps worker processes from re-running the pool creation when
# they import this module, which is required with the "spawn" start method.
if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2)  # You can add more processes
    pool.map(run_cmd_processor, datasetsList)
    pool.close()
    pool.join()
It seems that the only place that can be parallelized with multiprocessing is the loop “for eFile in datasetsList:”. It could be changed as shown above.
My suggestion is that you don’t need to call GDAL via subprocess. Just read the HDF file via GDAL and then get the pixel values at your long/lat coordinates:
from osgeo import gdal

# Placeholder: replace with the path to your HDF file.
src = "<location_to_your_hdf>"
ds = gdal.Open(src, gdal.GA_ReadOnly)
## get your subdataset, to find out which one -> ds.GetSubDatasets()
# NOTE(review): a plain GeoTIFF (as in the question) presumably has no
# subdatasets, in which case `ds` would be used directly -- confirm.
subdata = gdal.Open(ds.GetSubDatasets()[0][0], gdal.GA_ReadOnly)
## get the geotransform
gt = subdata.GetGeoTransform()
## now locate the pixel by transforming the coordinates to column/row
# gt[0] and gt[1] are the x origin and pixel width, so the column is derived
# from the longitude (x); gt[3] and gt[5] are the y origin and pixel height,
# so the row is derived from the latitude (y).
# Assumes a north-up raster whose coordinate system matches lat/long --
# TODO confirm for your data.
px = int((long - gt[0]) / gt[1])
py = int((lat - gt[3]) / gt[5])
pixelval = subdata.ReadAsArray(px, py, 1, 1)
This should be much quicker than your subprocess call, because you only need to open the HDF file once and then loop through the list of coordinates, instead of calling gdallocationinfo for every single coordinate.
Cheers
I have 366 raster image files (MODIS satellite daily data) in TIF format containing snow data, and a CSV file containing 19,000 locations (latitudes and longitudes). I need to collect the snow data from the raster files for these locations. I have tried collecting the data using the GDAL Python library. However, the program takes approximately 30 minutes to collect the data from each single file, which means I would have to run the code for around 180 hours. Following is the code I am using. Please suggest any way I can improve the speed at which the program executes, or any better way to implement the same thing.
import gdal
import pandas
import numpy as np
import os,subprocess
def runCmdAndGetOutput(cmd):
    """Run *cmd* and return the first line of its standard output as a float.

    Args:
        cmd: Argument list passed unchanged to ``subprocess.Popen``.

    Returns:
        The first output line, stripped of trailing whitespace, decoded as
        UTF-8 and converted with ``float``.

    Raises:
        ValueError: If the command printed nothing, or if its first output
            line is not a valid float literal.
    """
    # Using Popen as a context manager closes the stdout pipe and waits for
    # the child on exit, so no process or file handle is leaked even when
    # this function is called once per point.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
        # the real code does filtering here
        outList = [line.rstrip() for line in proc.stdout]
    print(outList)
    if not outList:
        raise ValueError("command produced no output: {!r}".format(cmd))
    # value = float(outList[2].decode("utf-8").replace("<Value>","").replace("</Value>",""))
    return float(outList[0].decode("utf-8"))
# Driver script: for every raster file in ndsFileLoc, query the pixel value at
# every location listed in the CSV (one gdallocationinfo call per point) and
# store the results in outDf, which is finally written to test.csv.
locs = "hkkhlocations.csv"
# NOTE(review): this path contains no separators after "D:" -- they look lost
# in transcription; confirm the real directory before running.
ndsFileLoc = r"D:SrinivasaRao_DocsMODIS_NDSI_V6_20165000000499560out"

latLnginfo = pandas.read_csv(locs)
print(latLnginfo.columns)
print(latLnginfo.shape)

# One row per location and 370 columns, pre-filled with NaN so that cells
# that are never written remain recognisable as missing.
outDf = pandas.DataFrame(np.full([len(latLnginfo), 370], np.nan))

# List the directory once and reuse the result instead of re-reading it for
# each print.
datasetsList = os.listdir(ndsFileLoc)
print(datasetsList)
print(type(datasetsList))

for eFile in datasetsList:
    # Characters 4..6 of the file name are parsed as the output column index
    # (for a name such as "2016001.tif" this yields 1).
    cCount = int(eFile[4:7])
    rasterPath = os.path.join(ndsFileLoc, eFile)
    # Iterate over the rows of latLnginfo itself; the row index can therefore
    # never run past the end of the DataFrame.
    for rCount in range(len(latLnginfo)):
        # Column 4 is passed as the first coordinate and column 3 as the
        # second -- presumably longitude then latitude; verify against the CSV.
        cmdToRun = [
            "gdallocationinfo", "-valonly", "-wgs84", rasterPath,
            str(latLnginfo.iloc[rCount, 4]), str(latLnginfo.iloc[rCount, 3]),
        ]
        v = runCmdAndGetOutput(cmdToRun)
        outDf.iloc[rCount, cCount] = float(v)
        print("rowno: ", rCount + 1, "Dayno :", cCount, "SCF value: ", v)

outDf.to_csv('test.csv')
'''
import multiprocessing


def run_cmd_processor(efile):
    """Process a single raster file; meant to hold the body of the original
    ``for eFile in datasetsList:`` loop.

    Args:
        efile: File name of one raster; characters 4..6 are parsed as an
            integer column index.
    """
    r_count = 0
    c_count = int(efile[4:7])
    with open("output.csv") as f:
        for line in f:
            loc_data = line.split(",")
            # ~ (the rest of the original inner-loop body goes here)


# The guard keeps worker processes from re-running the pool creation when
# they import this module, which is required with the "spawn" start method.
if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2)  # You can add more processes
    pool.map(run_cmd_processor, datasetsList)
    pool.close()
    pool.join()
It seems that the only place that can be parallelized with multiprocessing is the loop “for eFile in datasetsList:”. It could be changed as shown above.
My suggestion is that you don’t need to call GDAL via subprocess. Just read the HDF file via GDAL and then get the pixel values at your long/lat coordinates:
from osgeo import gdal

# Placeholder: replace with the path to your HDF file.
src = "<location_to_your_hdf>"
ds = gdal.Open(src, gdal.GA_ReadOnly)
## get your subdataset, to find out which one -> ds.GetSubDatasets()
# NOTE(review): a plain GeoTIFF (as in the question) presumably has no
# subdatasets, in which case `ds` would be used directly -- confirm.
subdata = gdal.Open(ds.GetSubDatasets()[0][0], gdal.GA_ReadOnly)
## get the geotransform
gt = subdata.GetGeoTransform()
## now locate the pixel by transforming the coordinates to column/row
# gt[0] and gt[1] are the x origin and pixel width, so the column is derived
# from the longitude (x); gt[3] and gt[5] are the y origin and pixel height,
# so the row is derived from the latitude (y).
# Assumes a north-up raster whose coordinate system matches lat/long --
# TODO confirm for your data.
px = int((long - gt[0]) / gt[1])
py = int((lat - gt[3]) / gt[5])
pixelval = subdata.ReadAsArray(px, py, 1, 1)
This should be much quicker than your subprocess call, because you only need to open the HDF file once and then loop through the list of coordinates, instead of calling gdallocationinfo for every single coordinate.
Cheers