How to speed up extracting data from raster file for points using GDAL python

Question:

I have 366 raster image files (MODIS satellite daily data) in TIFF format containing snow data, and a CSV file containing 19,000 locations (latitudes and longitudes). I need to collect the snow data from the raster files for these locations. I have tried collecting the data using the GDAL Python library. However, the program takes approximately 30 minutes to collect the data from a single file, which means I would have to run the code for around 180 hours. Following is the code I am using. Please suggest any way I can improve the execution speed of the program, or a better way to implement the same thing.

import gdal
import pandas
import numpy as np
import os,subprocess
def runCmdAndGetOutput(cmd) :
    """Run ``cmd`` and return the first line of its standard output as a float.

    Args:
        cmd: Command handed unchanged to ``subprocess.Popen`` (the caller in
            this file passes an argument list).

    Returns:
        The first output line, stripped of trailing whitespace, decoded as
        UTF-8 and converted with ``float``.

    Raises:
        IndexError: If the command wrote nothing to standard output.
        ValueError: If the first output line cannot be parsed as a float.
    """
    outList = []
    # The context manager closes the stdout pipe and waits for the child on
    # exit, so repeated calls do not accumulate open handles or unreaped
    # processes.
    with subprocess.Popen(cmd,stdout=subprocess.PIPE) as proc:
        for line in proc.stdout:
            #the real code does filtering here
            outList.append(line.rstrip())
            # Trace of everything collected so far (kept from the original).
            print(outList)
    if not outList:
        # Same exception type as indexing the empty list, but with context.
        raise IndexError("no output from command: {!r}".format(cmd))
    return float(outList[0].decode("utf-8"))

# CSV file with the point locations to sample.
locs = "hkkhlocations.csv"
# Directory holding the raster files.
# NOTE(review): this path has no separators after "D:" -- it looks as if the
# backslashes were lost somewhere; confirm before running.
ndsFileLoc = r"D:SrinivasaRao_DocsMODIS_NDSI_V6_20165000000499560out"
latLnginfo = pandas.read_csv(locs)
print(latLnginfo.columns)
print(latLnginfo.shape)

# One row per location and 370 columns, pre-filled with NaN so that cells that
# are never written stay recognisable in the output.
outDf = pandas.DataFrame(np.full((len(latLnginfo), 370), np.nan))

# List the directory once and reuse the result for both the diagnostics and
# the loop.
datasetsList = os.listdir(ndsFileLoc)
print(datasetsList)
print(type(datasetsList))
for eFile in datasetsList:
    # Characters 4-6 of the file name select the output column (presumably the
    # day of year, e.g. "001" in "2016001.tif" -- confirm the naming scheme).
    cCount = int(eFile[4:7])
    # The raster path does not change per row, so build it once per file.
    rasterPath = os.path.join(ndsFileLoc, eFile)
    with open("output.csv") as f :
        # output.csv is only used to decide how many rows are processed; the
        # coordinates themselves are read from latLnginfo.
        for rCount, _ in enumerate(f):
            # Columns 4 and 3 are passed as the two location arguments, in
            # that order (presumably longitude then latitude -- confirm).
            cmdToRun = ["gdallocationinfo" ,"-valonly", "-wgs84", rasterPath,
                        str(latLnginfo.iloc[rCount,4]), str(latLnginfo.iloc[rCount,3])]
            v = runCmdAndGetOutput(cmdToRun)
            outDf.iloc[rCount,cCount] = v
            # Row numbers are reported 1-based.
            print("rowno: ", rCount + 1, "Dayno :", cCount, "SCF value: ", v)

outDf.to_csv('test.csv')

'''
Asked By: srinivas

||

Answers:

def run_cmd_processor(efile):
    """Process a single raster file name; used as the worker for ``pool.map``.

    This is an abbreviated sketch: it sets up per-file state and stops at the
    ``# ~`` placeholder, so as written it computes nothing and returns ``None``.

    Args:
        efile: File name; characters 4-6 are parsed as an integer.
    """
    r_count = 0  # row counter, reset for every file
    c_count = int(efile[4:7])  # integer parsed from characters 4-6 of the name
    with open("output.csv") as f :
        for line in f :
            loc_data = line.split(",")  # comma-separated fields of this line
            # ~  (remainder of the loop body elided by the answer's author;
            #     presumably the per-row work from the question goes here)

import multiprocessing

# The guard is required on platforms that start workers by spawning a fresh
# interpreter (e.g. Windows): each worker re-imports this module, and without
# the guard it would try to create its own pool.
if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2) # You can add more processes
    try:
        # One task per raster file; map() blocks until all of them finish.
        pool.map(run_cmd_processor, datasetsList)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

It seems that the only place that can be parallelized with multiprocessing is the “for eFile in datasetsList:” loop. It could be changed as shown above.

Answered By: Minu

My suggestion is that you don’t need to call GDAL via subprocess. Just open the HDF file with GDAL and then read the pixel values at your long/lat coordinates:

from osgeo import gdal

## placeholder: replace with the path to your HDF file
src = "<location_to_your_hdf>"
ds = gdal.Open(src,gdal.GA_ReadOnly)

## get your subdataset, to find out which one -> ds.GetSubDatasets()
subdata = gdal.Open(ds.GetSubDatasets()[0][0], gdal.GA_ReadOnly)

## get Geometadata
gt = subdata.GetGeoTransform()

## now locate the pixel by transforming them from coordinates to width/height
## gt[0]/gt[3] are the x/y of the raster's upper-left corner and gt[1]/gt[5]
## the pixel width/height, so the x offset is computed from the longitude and
## the y offset from the latitude.
## NOTE(review): assumes a north-up raster (gt[2] == gt[4] == 0) and that
## lat/long are expressed in the raster's own coordinate system -- confirm,
## and reproject the points first if they are not.
px = int((long - gt[0]) / gt[1])
py = int((lat - gt[3]) / gt[5])
## read a 1x1 window at that pixel
pixelval = subdata.ReadAsArray(px, py, 1, 1)

This should be much quicker than your subprocess call, because you only need to open the HDF file once and then loop through the list of coordinates, instead of calling gdallocationinfo for every single coordinate.

Cheers

Answered By: HnsStnl