Optimize operation for pandas dataframe, current solution is extremely slow

Question:

I have a pandas dataframe with geohashes as the index and a column called neighbours which stores a list of neighbours for each geohash, along with a few other columns holding metadata for each geohash.
The dataframe looks like this:

Geohash (index)  Wave Height  Normalized Wave Height  Speed Factor  Neighbours
u4sj9hz          0.962316     0.361604                0.757725      ['u4sj9hy', 'u4sj9kb', 'u4sj9hx', 'u4sj9hw', 'u4sj9k8', …]
u4ezqxn          0.570723     0.214457                0.856314      ['u4ezqxj', 'u4ezqxp', 'u4ezqwy', 'u4ezqwv', 'u4ezqwz', …]

I need to create an edge_list used for graph creation. At first I did the following:

def create_edge_list(geohash, speed_factor, neighbours):
    edge_list = []
    for n in neighbours:
        distance = haversine_distance(geohash, n)
        # distance is in km, speed is in m/s:
        # speed * 3.6 is km/h, distance / (speed * 3.6) is hours, * 60 is minutes
        speed = 14 * speed_factor
        time = round((distance / (speed * 3.6)) * 60, 1)
        edge_list.append((geohash, n, {"distance": distance, "time": time}))
    return edge_list


elist = []
for geohash, row in tqdm(df.iterrows(), desc="Creating edge list", total=len(df.index), colour="green"):
    edge_list = create_edge_list(geohash, row.speed_factor, row.neighbours)
    elist.extend(edge_list)

But this is extremely slow considering I have over 7 million rows. I then tried multiprocessing and multithreading, trying out both ProcessPoolExecutor and ThreadPoolExecutor, but these did not help much. Any suggestions?

Edit:
It turns out I had some errors in the ProcessPoolExecutor version; once I fixed those it worked and did speed things up (80 minutes to run, down from several hours of just looping through). I also made a slightly edited minimal reproducible example (notebook):

# Using Python 3.11.2, but works fine for most other newer Python versions

!pip install geopandas
!pip install geohash
!pip install polygeohasher
!pip install shapely
!pip install pandas
!pip install tqdm

import os
import random
from math import cos, sin, asin, sqrt, radians

import geohash as gh
from polygeohasher import polygeohasher
from shapely.wkt import loads
import pandas as pd
import geopandas as gpd
from tqdm import tqdm


def haversine_distance(geohash1, geohash2):
    # geohash2 might be a list of neighbours
    if isinstance(geohash2, list):
        # note: don't name the loop variable "gh", which would shadow the geohash module
        return [round(haversine_distance(geohash1, neighbour), 3) for neighbour in geohash2]

    lat1, lon1 = gh.decode(geohash1)
    lat2, lon2 = gh.decode(geohash2)

    lat1, lon1 = (float(lat1), float(lon1))
    lat2, lon2 = (float(lat2), float(lon2))

    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r


def create_edge_list(geohash, speed_factor, neighbours):
    speed_multiplier = 60 / (3.6 * 14 * speed_factor)
    neighbours = list(neighbours)
    distances = haversine_distance(geohash, neighbours)
    times = [round(d * speed_multiplier, 2) for d in distances]
    edge_list = [(geohash, neighbours[i], {"distance": distances[i], "time": times[i]}) for i in range(len(times))]
    return edge_list


if __name__ == "__main__":
    GEOHASH_PRECISION = 6
    # Create polygons using: https://clydedacruz.github.io/openstreetmap-wkt-playground/
    polygon_wkt = "POLYGON((9.07196044921875 53.91728101547625,8.25897216796875 52.99495027026802,5.88043212890625 53.20603255157843,5.072937011718749 53.497849543967675,5.913391113281249 53.74221377343122,6.05621337890625 54.004540438503625,8.73687744140625 54.072282655603885,9.07196044921875 53.91728101547625))"
    polygon_gdf = gpd.GeoDataFrame(index=[0], crs="EPSG:4326", geometry=[loads(polygon_wkt)])
    print("Creating geohash list...")
    temp_df = polygeohasher.create_geohash_list(polygon_gdf, GEOHASH_PRECISION, inner=True)
    df = pd.DataFrame(temp_df.geohash_list.values.tolist()[0], columns=["geohash"])
    df.set_index("geohash", inplace=True)

    # just simulate some speed factor for now
    df["speed_factor"] =  [random.uniform(0.4, 1.0) for i in range(len(df.index))]

    neighbours = {geohash: gh.neighbors(geohash) for geohash in df.index}
    df["neighbours"] = df.index.map(neighbours)

    elist = []
    MT = False
    print("Creating edge list...")
    if MT:
        from concurrent.futures import ProcessPoolExecutor
        geohash_list = list(df.index)
        speed_factor_list = list(df.speed_factor)
        neighbours_list = list(df.neighbours)
        with tqdm(desc="Creating edge list", total=len(df.index), colour="green") as pbar:
            with ProcessPoolExecutor(os.cpu_count()) as executor:
                result = executor.map(create_edge_list, geohash_list, speed_factor_list, neighbours_list, chunksize=len(df.index)//(os.cpu_count()))
                for edge_list in result:
                    elist.extend(edge_list)
                    pbar.update(1)
    else:
        for geohash, row in tqdm(df.iterrows(), desc="Creating edge list", total=len(df.index), colour="green"):
            edge_list = create_edge_list(geohash, row.speed_factor, row.neighbours)
            elist.extend(edge_list)
Asked By: Laende


Answers:

A couple of suggestions for create_edge_list: calculate your multiplier only once, remove the for loop by processing arrays, and use a list comprehension to generate your output.

import numpy as np

def create_edge_list(geohash, speed_factor, neighbours):
    speed_multiplier = 60 / (3.6 * 14 * speed_factor)  # distance is in km, speed is in m/s
    distance = np.asarray(haversine_distance(geohash, neighbours))
    time = np.round(distance * speed_multiplier, 1)
    edge_list = [(geohash, neighbours[i], {"distance": distance[i], "time": time[i]}) for i in range(len(time))]
    return edge_list

I could not test this. You might have to use distance[0, i] if haversine_distance outputs a 2D array.

As for your main loop, you could pre-initialize your list and fill it on the fly:

elist = [None for _ in range(len(df.index))]
for i, (geohash, row) in tqdm(enumerate(df.iterrows()), desc="Creating edge list", total=len(df.index), colour="green"):
    elist[i] = create_edge_list(geohash, row.speed_factor, row.neighbours)
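
Note that elist is now a list of per-row edge lists rather than the flat list the extend-based loop produced. If you need the flat structure, flatten once at the end, for example with itertools.chain:

from itertools import chain

# elist is a list of per-row edge lists; flatten it into one flat edge list
flat_elist = list(chain.from_iterable(elist))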
Answered By: Nyps

Here is an approach for speedup (about 9x for your example):

  1. Transform the geohashes into 3-D (x, y, z) coordinates.
  2. Compute the Euclidean distances.
  3. Transform these distances (chord) into "great-circle" distances.

We also prefer to convert each distinct geohash only once and then map from that, rather than converting duplicate geohashes multiple times.

As a side note, it would be very convenient to find neighbors in 3D space, for example using the excellent scipy.spatial.KDTree: you could set a maximum distance that is close to your desired maximum haversine distance (for small distances they are almost the same, but the Euclidean distance is always a bit smaller, of course), then filter the nearest neighbors found.
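
For instance, here is a minimal sketch of that idea (it assumes the geohash_to_xyz helper defined further down, and a hypothetical max_km cutoff of your choosing):

import numpy as np
from scipy.spatial import KDTree

def pairs_within(geohashes, max_km, R=6371):
    # convert the geohashes to 3-D points on the sphere
    xyz = geohash_to_xyz(geohashes, R)
    # translate the desired great-circle cutoff into the corresponding
    # (slightly smaller) chord length, so the Euclidean query matches it
    chord = 2 * R * np.sin(max_km / (2 * R))
    tree = KDTree(xyz)
    return tree.query_pairs(r=chord)  # set of (i, j) index pairs, i < j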

In any case, just using the neighbours that you provide:

First, conversion of geohashes:

R_earth = 6371


def geohash_to_xyz(geoh, R=R_earth):
    lat, lon = np.deg2rad(np.array(pd.Series(geoh).apply(gh.decode).to_list())).T
    
    # conversion (latitude, longitude, altitude) to (x, y, z)
    # see https://stackoverflow.com/a/10788250/758174
    alt = 0  # use altitude = 0
    f = 0  # use flattening = 0
    coslat = np.cos(lat)
    sinlat = np.sin(lat)
    FF     = (1.0 - f)**2
    C      = 1 / np.sqrt(coslat**2 + FF * sinlat**2)
    S      = C * FF

    x = (R * C + alt) * coslat * np.cos(lon)
    y = (R * C + alt) * coslat * np.sin(lon)
    z = (R * S + alt) * sinlat
    
    return np.c_[x, y, z]

Example:

>>> geohash_to_xyz(['u4sj9hz', 'u4sj9hy'])
array([[3164.48891562,  314.70029181, 5520.56289062],
       [3164.49645759,  314.62444382, 5520.56289062]])

Then, we convert chord distance (Euclidean) to great-circle or "haversine":

def chord_to_haversine(d, R=R_earth):
    # say we have two 3-D points on an R-radius sphere,
    # separated by a Euclidean distance d:
    # d = R * crd(alpha)   (the chord times the radius)
    #
    # crd(alpha) = 2 sin(alpha / 2)
    # thus: alpha = 2 arcsin(1/2 d/R)
    # the great-circle ("haversine") distance is simply alpha * R
    return R * 2 * np.arcsin(d / (2 * R))

Example:

ha, hb = 'u4sj9hz', 'u4ezqxn'
a, b = geohash_to_xyz([ha, hb])
d = np.linalg.norm(a - b)

>>> chord_to_haversine(d)
36.106372367799615

>>> haversine_distance(ha, hb)
36.106372367799615

Now, we have all the elements to implement everything in a vectorized way:

def new_edge_list(df, R=R_earth):
    # one row per (geohash, neighbour) pair
    z = df.explode('neighbours')
    # distinct geohashes (sources and neighbours), each converted to xyz only once
    geoh = pd.concat([
        df.reset_index()['geohash'],
        z['neighbours'],
    ], ignore_index=True).drop_duplicates()
    pmap = pd.DataFrame(
        geohash_to_xyz(geoh, R),
        columns=list('xyz'),
        index=geoh).rename_axis('geohash', axis=0)
    # look up the 3-D coordinates of each edge's endpoints
    a = pmap.loc[z.index].to_numpy()
    b = pmap.loc[z['neighbours']].to_numpy()
    d = chord_to_haversine(np.linalg.norm(a - b, axis=1), R)
    t = d * 60 / (3.6 * 14 * z['speed_factor'])
    z = z.assign(distance=d, time=t)

    return z

Example on the MRE you provided:

z = new_edge_list(df)

>>> z
         speed_factor neighbours  distance      time
geohash                                             
u1kz2d       0.474744     u1kz26  0.729742  1.829913
u1kz2d       0.474744     u1kz2f  0.729742  1.829913
u1kz2d       0.474744     u1kz29  0.610812  1.531683
u1kz2d       0.474744     u1kz23  0.951674  2.386433
u1kz2d       0.474744     u1kz2c  0.951674  2.386433
...               ...        ...       ...       ...
u1mrp4       0.761640     u1mrnc  0.952250  1.488407
u1mrp4       0.761640     u1mrp3  0.952250  1.488407
u1mrp4       0.761640     u1mrp5  0.610812  0.954725
u1mrp4       0.761640     u1mrng  0.952178  1.488295
u1mrp4       0.761640     u1mrp7  0.952178  1.488295

[375808 rows x 4 columns]

Compare to your result:

def make_edges(df, show_progress=True):
    elist = []
    it = df.iterrows()
    if show_progress:
        it = tqdm(it, desc="Creating edge list", total=len(df.index), colour="green")
    for geohash, row in it:
        edge_list = create_edge_list(geohash, row['speed_factor'], row['neighbours'])
        elist += edge_list
    return elist

elist = make_edges(df)
>>> elist
[('u1kz2d', 'u1kz26', {'distance': 0.73, 'time': 1.83}),
 ('u1kz2d', 'u1kz2f', {'distance': 0.73, 'time': 1.83}),
 ('u1kz2d', 'u1kz29', {'distance': 0.611, 'time': 1.53}),
 ('u1kz2d', 'u1kz23', {'distance': 0.952, 'time': 2.39}),
 ('u1kz2d', 'u1kz2c', {'distance': 0.952, 'time': 2.39}),
 ...
 ('u1mrp4', 'u1mrnc', {'distance': 0.952, 'time': 1.49}),
 ('u1mrp4', 'u1mrp3', {'distance': 0.952, 'time': 1.49}),
 ('u1mrp4', 'u1mrp5', {'distance': 0.611, 'time': 0.96}),
 ('u1mrp4', 'u1mrng', {'distance': 0.952, 'time': 1.49}),
 ('u1mrp4', 'u1mrp7', {'distance': 0.952, 'time': 1.49})]

Addendum: transform result to list of tuples

To transform the result above into the exact format required, we can do the following:

def to_tuples(z):
    return [
        (ha, hb, {'distance': d, 'time': t})
        for ha, hb, d, t in zip(
            z.index, z['neighbours'],
            np.round(z['distance'].to_numpy(), 3),
            np.round(z['time'].to_numpy(), 2))
    ]

Example:

new_elist = to_tuples(z)
>>> new_elist[:5]
[('u1kz2d', 'u1kz26', {'distance': 0.73, 'time': 1.83}),
 ('u1kz2d', 'u1kz2f', {'distance': 0.73, 'time': 1.83}),
 ('u1kz2d', 'u1kz29', {'distance': 0.611, 'time': 1.53}),
 ('u1kz2d', 'u1kz23', {'distance': 0.952, 'time': 2.39}),
 ('u1kz2d', 'u1kz2c', {'distance': 0.952, 'time': 2.39})]

Note however that this should probably be the very last step. As a DataFrame, the result is much easier to manipulate. For example, you can filter rows that are under a certain distance in a one-liner, e.g.: z.loc[z['distance'] < .9]. Or you can look at the distribution (e.g. z['distance'].hist(bins=50), or z['distance'].quantile([.1,.5,.9])), etc.
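
For instance, filtering first and only then converting the kept edges (using z and to_tuples as defined above):

z_short = z.loc[z['distance'] < 0.9]  # keep only edges under 0.9 km
short_elist = to_tuples(z_short)      # convert just those to the tuple format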

Speed

On the MRE given:

t_orig = %timeit -o make_edges(df, show_progress=False)
# 3.83 s ± 8.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

t_new = %timeit -o new_edge_list(df)
# 408 ms ± 1.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

>>> t_orig.best / t_new.best
9.40

The conversion to the desired output format takes some time too (although this is 3x faster than my initial version, which was more "pandas-esque"):

%timeit to_tuples(z)
# 243 ms ± 857 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Answered By: Pierre D