Is there a way to process point-in-area checks faster in Python (shapely implementation)?
Question:
I have attempted to try in Python the same thing I achieve in a GIS Software.
I have 51 centroids to compare to 90,000+ parcels and find the overlap.
It seems that my script is working (which I am happy to have been able to write), but I was curious whether there is a way to speed it up — it currently takes around 5 minutes. I appreciate any help I can get: the final scope would be to process far more centroids and far more parcels, so I am concerned about the time that would take.
I added the code below as well as a few lines of how the csv and json files look.
import json, csv, time
from shapely.geometry import Point, Polygon
start = time.time()
with open('M035TaxPar_CY20_FY20.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
if line_count == 0:
#print(f'Column names are {", ".join(row)}')
line_count += 1
else:
my_coord = tuple([float(row[10]), float(row[11])])
my_coord = Point(my_coord)
line_count += 1
with open('boston_parcels_all.json', 'r') as f:
array = json.load(f)
dict_size = len(array.get("features"))
for i in range(0, dict_size -1):
sub_dict = array.get("features")[i]
geometry_dict = sub_dict.get("geometry")
current_Pol = geometry_dict.get("coordinates")
attribute_data = sub_dict.get("properties")
parcel_id = attribute_data.get("MAP_PAR_ID")
try:
final_Pol = Polygon(current_Pol[0])
except AttributeError:
continue
except ValueError:
continue
if not my_coord.within(final_Pol):
continue
else:
if parcel_id == None:
continue
else:
print(f"My pol: {parcel_id}, my point: {row[0]}")
print(f'Processed {line_count} lines.')
end = time.time()
total_time = end - start
print("n"+ str(total_time))```
CSV file:
MAP_PAR_ID,LOC_ID,POLY_TYPE,MAP_NO,SOURCE,PLAN_ID,LAST_EDIT,BND_CHK,NO_MATCH,TOWN_ID,X_coord,Y_coord
0301290001,F_772282_2959076,FEE,3,ASSESS,,20191107,,N,35,-71.07030535235137,42.36703589444949
0301286000,F_772818_2959719,FEE,3,ASSESS,,20191107,,N,35,-71.0682938996932,42.36876187375208
JSON file:
{
"type" : "FeatureCollection",
"name" : "M035TaxPar_CY20_FY20",
"features" : [
{
"type" : "Feature",
"geometry" : {
"type" : "Polygon",
"coordinates" : [
[
[ -71.074767681, 42.3489358757 ],
[ -71.0745071763, 42.3490067112 ],
[ -71.074852421, 42.3497214512 ],
[ -71.0757149372, 42.3494869825 ],
[ -71.0753495281, 42.3487373608 ],
[ -71.074767681, 42.3489358757 ]
]
]
},
"properties" : {
"SHAPE_Leng" : 324.191190544,
"SHAPE_Area" : 6483.24124923,
"MAP_PAR_ID" : "0401134000",
"LOC_ID" : "F_771022_2952578",
"POLY_TYPE" : "FEE",
"MAP_NO" : "4",
"SOURCE" : "ASSESS",
"LAST_EDIT" : 20191107,
"NO_MATCH" : "N",
"TOWN_ID" : 35
}
},
{
"type" : "Feature",
"geometry" : {
"type" : "Polygon",
"coordinates" : [
[
[ -71.0554563931, 42.3547109221 ],
[ -71.0550339868, 42.3548837812 ],
[ -71.0557334699, 42.3555313301 ],
[ -71.0564003565, 42.3554356917 ],
[ -71.0562325294, 42.3545663782 ],
[ -71.0554563931, 42.3547109221 ]
]
]
},
"properties" : {
"SHAPE_Leng" : 351.344209198,
"SHAPE_Area" : 7627.60108948,
"MAP_PAR_ID" : "0304410000",
"LOC_ID" : "F_776224_2954721",
"POLY_TYPE" : "FEE",
"MAP_NO" : "3",
"SOURCE" : "ASSESS",
"LAST_EDIT" : 20191107,
"NO_MATCH" : "N",
"TOWN_ID" : 35
}
}
]
}
Answers:
Parse your GeoJSON file only once, not for each point you read from the CSV.
You can also use csv.DictReader
to make your code a bit shorter.
import json, csv, time
from shapely.geometry import Point, Polygon
def parse_parcel(feature: dict):
geometry_dict = feature["geometry"]
coordinates = geometry_dict["coordinates"]
attribute_data = feature["properties"]
parcel_id = attribute_data.get("MAP_PAR_ID")
if not parcel_id:
raise ValueError("Parcel ID is missing")
pol = Polygon(coordinates[0])
return (parcel_id, pol)
def read_parcels():
with open("boston_parcels_all.json", "r") as f:
geojson = json.load(f)
assert geojson["type"] == "FeatureCollection"
for sub_dict in geojson["features"]:
try:
yield parse_parcel(sub_dict)
except Exception as exc:
print(f"Error parsing geometry {sub_dict}: {exc}")
def main():
start = time.time()
parcels = dict(read_parcels())
print(f"Read {len(parcels)} parcels.")
with open("M035TaxPar_CY20_FY20.csv") as csv_file:
for line_count, row in enumerate(csv.DictReader(csv_file, delimiter=","), 1):
my_coord = Point((float(row["X_coord"]), float(row["Y_coord"])))
for parcel_id, parcel in parcels.items():
if my_coord.within(parcel):
print(f"Parcel ID: {parcel_id} contains {row[0]}")
print(f"Processed {line_count} records.")
end = time.time()
total_time = end - start
print(f"Total time: {total_time:.3f} seconds")
if __name__ == "__main__":
main()
Doing this in pandas+geopandas should be as simple as:
import pandas as pd, geopandas as gpd
df = pd.read_csv('M035TaxPar_CY20_FY20.csv')
shp = gpd.read_file(
'boston_parcels_all.json', engine='GeoJSON'
)
points = gpd.GeoDataFrame(
gpd.points_from_xy(df.X_coord, df.Y_coord)
)
joined = points.sjoin(shp, predicate='within')
See the geopandas docs on spatial joins for more info.
I have attempted to try in Python the same thing I achieve in a GIS Software.
I have 51 centroids to compare to 90,000+ parcels and find the overlap.
It seems that my script is working (which I am happy to have been able to write), but I was curious whether there is a way to speed it up — it currently takes around 5 minutes. I appreciate any help I can get: the final scope would be to process far more centroids and far more parcels, so I am concerned about the time that would take.
I added the code below as well as a few lines of how the csv and json files look.
import json, csv, time
from shapely.geometry import Point, Polygon
start = time.time()
with open('M035TaxPar_CY20_FY20.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
if line_count == 0:
#print(f'Column names are {", ".join(row)}')
line_count += 1
else:
my_coord = tuple([float(row[10]), float(row[11])])
my_coord = Point(my_coord)
line_count += 1
with open('boston_parcels_all.json', 'r') as f:
array = json.load(f)
dict_size = len(array.get("features"))
for i in range(0, dict_size -1):
sub_dict = array.get("features")[i]
geometry_dict = sub_dict.get("geometry")
current_Pol = geometry_dict.get("coordinates")
attribute_data = sub_dict.get("properties")
parcel_id = attribute_data.get("MAP_PAR_ID")
try:
final_Pol = Polygon(current_Pol[0])
except AttributeError:
continue
except ValueError:
continue
if not my_coord.within(final_Pol):
continue
else:
if parcel_id == None:
continue
else:
print(f"My pol: {parcel_id}, my point: {row[0]}")
print(f'Processed {line_count} lines.')
end = time.time()
total_time = end - start
print("n"+ str(total_time))```
CSV file:
MAP_PAR_ID,LOC_ID,POLY_TYPE,MAP_NO,SOURCE,PLAN_ID,LAST_EDIT,BND_CHK,NO_MATCH,TOWN_ID,X_coord,Y_coord
0301290001,F_772282_2959076,FEE,3,ASSESS,,20191107,,N,35,-71.07030535235137,42.36703589444949
0301286000,F_772818_2959719,FEE,3,ASSESS,,20191107,,N,35,-71.0682938996932,42.36876187375208
JSON file:
{
"type" : "FeatureCollection",
"name" : "M035TaxPar_CY20_FY20",
"features" : [
{
"type" : "Feature",
"geometry" : {
"type" : "Polygon",
"coordinates" : [
[
[ -71.074767681, 42.3489358757 ],
[ -71.0745071763, 42.3490067112 ],
[ -71.074852421, 42.3497214512 ],
[ -71.0757149372, 42.3494869825 ],
[ -71.0753495281, 42.3487373608 ],
[ -71.074767681, 42.3489358757 ]
]
]
},
"properties" : {
"SHAPE_Leng" : 324.191190544,
"SHAPE_Area" : 6483.24124923,
"MAP_PAR_ID" : "0401134000",
"LOC_ID" : "F_771022_2952578",
"POLY_TYPE" : "FEE",
"MAP_NO" : "4",
"SOURCE" : "ASSESS",
"LAST_EDIT" : 20191107,
"NO_MATCH" : "N",
"TOWN_ID" : 35
}
},
{
"type" : "Feature",
"geometry" : {
"type" : "Polygon",
"coordinates" : [
[
[ -71.0554563931, 42.3547109221 ],
[ -71.0550339868, 42.3548837812 ],
[ -71.0557334699, 42.3555313301 ],
[ -71.0564003565, 42.3554356917 ],
[ -71.0562325294, 42.3545663782 ],
[ -71.0554563931, 42.3547109221 ]
]
]
},
"properties" : {
"SHAPE_Leng" : 351.344209198,
"SHAPE_Area" : 7627.60108948,
"MAP_PAR_ID" : "0304410000",
"LOC_ID" : "F_776224_2954721",
"POLY_TYPE" : "FEE",
"MAP_NO" : "3",
"SOURCE" : "ASSESS",
"LAST_EDIT" : 20191107,
"NO_MATCH" : "N",
"TOWN_ID" : 35
}
}
]
}
Parse your GeoJSON file only once, not for each point you read from the CSV.
You can also use csv.DictReader
to make your code a bit shorter.
import json, csv, time
from shapely.geometry import Point, Polygon
def parse_parcel(feature: dict):
geometry_dict = feature["geometry"]
coordinates = geometry_dict["coordinates"]
attribute_data = feature["properties"]
parcel_id = attribute_data.get("MAP_PAR_ID")
if not parcel_id:
raise ValueError("Parcel ID is missing")
pol = Polygon(coordinates[0])
return (parcel_id, pol)
def read_parcels():
with open("boston_parcels_all.json", "r") as f:
geojson = json.load(f)
assert geojson["type"] == "FeatureCollection"
for sub_dict in geojson["features"]:
try:
yield parse_parcel(sub_dict)
except Exception as exc:
print(f"Error parsing geometry {sub_dict}: {exc}")
def main():
start = time.time()
parcels = dict(read_parcels())
print(f"Read {len(parcels)} parcels.")
with open("M035TaxPar_CY20_FY20.csv") as csv_file:
for line_count, row in enumerate(csv.DictReader(csv_file, delimiter=","), 1):
my_coord = Point((float(row["X_coord"]), float(row["Y_coord"])))
for parcel_id, parcel in parcels.items():
if my_coord.within(parcel):
print(f"Parcel ID: {parcel_id} contains {row[0]}")
print(f"Processed {line_count} records.")
end = time.time()
total_time = end - start
print(f"Total time: {total_time:.3f} seconds")
if __name__ == "__main__":
main()
Doing this in pandas+geopandas should be as simple as:
import pandas as pd, geopandas as gpd
df = pd.read_csv('M035TaxPar_CY20_FY20.csv')
shp = gpd.read_file(
'boston_parcels_all.json', engine='GeoJSON'
)
points = gpd.GeoDataFrame(
gpd.points_from_xy(df.X_coord, df.Y_coord)
)
joined = points.sjoin(shp, predicate='within')
See the geopandas docs on spatial joins for more info.