Picking/filtering element from pandas table where data is between column header values
Question:
I have some 2D data that has boundaries (bins
) like this:
import numpy as npy
import pandas as pd  # pd.DataFrame is used below, so pandas must be imported

# These are the boundaries (bin edges) of our wind speeds and directions
speed_bins = npy.array([0.0, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0])
dir_bins = npy.linspace(0, 360, 9)
# Random LPF values from 0–0.5, one per (direction bin, speed bin) cell
data = npy.random.rand(len(dir_bins) - 1, len(speed_bins) - 1) * 0.5
# Rows and columns are labelled with the upper edge of each bin. The table is
# built from `data`, the array created on the line above.
dataTable = pd.DataFrame(data, index=dir_bins[1:], columns=speed_bins[1:])
I have some meteorological data like this:
# Assume all arrays are the same length ([...] is a placeholder for real data)
# numpy was imported under the alias `npy`, so that is the name used here.
speeds = npy.array([...])
directions = npy.array([...])
X = npy.array([...])
# One row per observation: wind speed, wind direction and the extra variable X
metData = pd.DataFrame({'speeds': speeds, 'directions': directions, 'X': X})
What I need to do is find, for each combination of speed and direction, the element of dataTable whose bins bound that pair.
I’m sure there is some pandas #magic that will do the following:
- For each (speed, direction) pair:
  - Find the column in dataTable whose header bounds the speed
  - Find the row in dataTable whose index value bounds the direction
  - Return the data value stored at that row and column.
For example: if my speed
is 0.75
and my direction
is 350
then I want to get the value at element dataTable.iat[7,1]
. 0.75
is between 0.5
and 1.0
(the column header values) and similarly for the direction, but with the index values.
I hope I’ve described this well.
Answers:
Input data
The DataFrame for the boundaries:
import pandas as pd
import numpy as np
# For reproducibility: fixing the seed makes the random table below repeatable
np.random.seed(0)
# These are the boundaries (bin edges) of our wind speeds and directions
speed_bins = np.array([0.0, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0])
# 9 evenly spaced edges from 0 to 360, i.e. 8 direction bins
dir_bins = np.linspace(0,360,9)
# Random LPF values from 0–0.5, one per (direction bin, speed bin) cell
data = np.random.rand(len(dir_bins)-1, len(speed_bins)-1)*0.5
# Label rows (directions) and columns (speeds) with the upper edge of each bin;
# the leading 0 edge of both axes is dropped by the [1:] slices.
df = pd.DataFrame(data, index=dir_bins[1:],columns=speed_bins[1:])
print(df)
0.5 1.0 2.0 5.0 10.0 100.0
45.0 0.274407 0.357595 0.301382 0.272442 0.211827 0.322947
90.0 0.218794 0.445887 0.481831 0.191721 0.395863 0.264447
135.0 0.284022 0.462798 0.035518 0.043565 0.010109 0.416310
180.0 0.389078 0.435006 0.489309 0.399579 0.230740 0.390265
225.0 0.059137 0.319961 0.071677 0.472334 0.260924 0.207331
270.0 0.132278 0.387117 0.228075 0.284217 0.009395 0.308818
315.0 0.306048 0.308467 0.471874 0.340910 0.179754 0.218516
360.0 0.348816 0.030113 0.333383 0.335319 0.105191 0.064463
The DataFrame for the meteorological data:
# Randomly generate n values each for speeds and directions, to be looked up
# in the boundary table.
n = 1000000
# rand() is scaled by 4 and by 360, so speeds stay below 4 and directions below 360
speeds = np.random.rand(n) * 4
directions = np.random.rand(n) * 360
# One row per (speed, direction) observation
met_data = pd.DataFrame({'speeds': speeds,
'directions': directions})
print(met_data.head(10))
speeds directions
0 1.261713 144.163960
1 1.454843 236.032725
2 2.280787 119.731554
3 1.754406 173.319830
4 3.953495 80.739232
5 0.408179 146.199108
6 0.835507 255.461460
7 0.645238 13.960304
8 2.612433 64.738510
9 1.013166 147.860518
Solution
The pd.cut()
function is especially useful to solve your problem. It can be used to assign the speed and direction values to the bin intervals you have defined.
# Get the bin intervals that each of the speeds and directions lie in.
# df's column and index labels are the upper bin edges, so prepending 0
# rebuilds the full list of edges. As the printed result shows, each interval
# is open on the left and closed on the right.
# NOTE(review): a value of exactly 0 therefore appears to fall outside the
# first interval — confirm against pd.cut's include_lowest option.
speed_intervals = pd.cut(met_data['speeds'], bins=[0] + df.columns.tolist())
direction_intervals = pd.cut(met_data['directions'], bins=[0] + df.index.tolist())
# Create a new DataFrame with these values as columns.
intervals = pd.concat([speed_intervals, direction_intervals], axis=1)
print(intervals.head(10))
speeds directions
0 (1.0, 2.0] (135.0, 180.0]
1 (1.0, 2.0] (225.0, 270.0]
2 (2.0, 5.0] (90.0, 135.0]
3 (1.0, 2.0] (135.0, 180.0]
4 (2.0, 5.0] (45.0, 90.0]
5 (0.0, 0.5] (135.0, 180.0]
6 (0.5, 1.0] (225.0, 270.0]
7 (0.5, 1.0] (0.0, 45.0]
8 (2.0, 5.0] (45.0, 90.0]
9 (1.0, 2.0] (135.0, 180.0]
From there, use .groupby()
to group this DataFrame by the (speeds
, directions
) combinations. Then for each group find the data value from the boundaries table, corresponding to the right limits of these combinations.
# Work on a copy so that the original met_data frame is left untouched
met_data2 = met_data.copy()
# Create the result column on the copy, under the name the loop below fills.
# Rows that fall outside every bin keep NaN.
met_data2['values'] = np.nan
# Grouping by the (speed, direction) interval combinations, iterate over each
# group. observed=True restricts the grouping to combinations that actually
# occur in the data.
for iv, g in intervals.groupby(['speeds', 'directions'], observed=True):
    # Extract the right (upper) boundaries of the speed and direction intervals
    speed_limit = iv[0].right
    dir_limit = iv[1].right
    # df is labelled by upper bin edges (rows = directions, columns = speeds),
    # so these two limits address the matching cell of the data grid. Set that
    # value on every met_data2 row belonging to this interval group.
    met_data2.loc[g.index, 'values'] = df.at[dir_limit, speed_limit]
print(met_data2.head(10))
Final output:
speeds directions values
0 1.261713 144.163960 0.489309
1 1.454843 236.032725 0.228075
2 2.280787 119.731554 0.043565
3 1.754406 173.319830 0.489309
4 3.953495 80.739232 0.191721
5 0.408179 146.199108 0.389078
6 0.835507 255.461460 0.387117
7 0.645238 13.960304 0.357595
8 2.612433 64.738510 0.191721
9 1.013166 147.860518 0.489309
Full code (as a generalised function):
def get_data_values(data_table, mt_data, col_x='speeds', col_y='directions'):
    """
    Look up, for every row of ``mt_data``, the ``data_table`` cell whose bins
    contain that row's (``col_x``, ``col_y``) pair.

    ``data_table`` is read as a grid labelled by the upper edge of each bin:
    its columns hold the upper edges along the x axis and its index the upper
    edges along the y axis. A lower edge of 0 is prepended to both axes, so
    the bins are ``[0, e1], (e1, e2], ...``. The edges are handed to
    ``pd.cut``, which expects them to be increasing and unique.

    Parameters
    ----------
    data_table : pandas.DataFrame
        Boundary table; columns are x-axis upper edges, index is y-axis upper
        edges.
    mt_data : pandas.DataFrame
        Observations containing the columns ``col_x`` and ``col_y``. Any index
        is accepted; results are returned in row order.
    col_x, col_y : str
        Names of the ``mt_data`` columns matched against the table's columns
        and index respectively.

    Returns
    -------
    numpy.ndarray
        One float per row of ``mt_data``. Rows whose x or y value is missing
        or lies outside every bin are left at 0.0.
    """
    x_edges = [0] + data_table.columns.tolist()
    y_edges = [0] + data_table.index.tolist()

    # labels=False yields each row's 0-based bin number (NaN when the value is
    # in no bin); include_lowest=True lets a value of exactly 0 fall into the
    # first bin instead of being discarded.
    x_pos = np.asarray(
        pd.cut(mt_data[col_x], bins=x_edges, labels=False, include_lowest=True)
    )
    y_pos = np.asarray(
        pd.cut(mt_data[col_y], bins=y_edges, labels=False, include_lowest=True)
    )

    data = np.zeros(len(mt_data))
    inside = ~(pd.isna(x_pos) | pd.isna(y_pos))

    # Bin numbers are positions into the table's rows/columns, so the lookup
    # is purely positional and does not depend on mt_data's index labels.
    grid = np.asarray(data_table)
    data[inside] = grid[y_pos[inside].astype(int), x_pos[inside].astype(int)]
    return data
Usage:
# Find the values from the boundary table where each of the (speed, direction) combinations fit in.
met_data['data'] = get_data_values(df, met_data, col_x='speeds', col_y='directions')
I have some 2D data that has boundaries (bins
) like this:
import numpy as npy
import pandas as pd  # pd.DataFrame is used below, so pandas must be imported

# These are the boundaries (bin edges) of our wind speeds and directions
speed_bins = npy.array([0.0, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0])
dir_bins = npy.linspace(0, 360, 9)
# Random LPF values from 0–0.5, one per (direction bin, speed bin) cell
data = npy.random.rand(len(dir_bins) - 1, len(speed_bins) - 1) * 0.5
# Rows and columns are labelled with the upper edge of each bin. The table is
# built from `data`, the array created on the line above.
dataTable = pd.DataFrame(data, index=dir_bins[1:], columns=speed_bins[1:])
I have some meteorological data like this:
# Assume all arrays are the same length ([...] is a placeholder for real data)
# numpy was imported under the alias `npy`, so that is the name used here.
speeds = npy.array([...])
directions = npy.array([...])
X = npy.array([...])
# One row per observation: wind speed, wind direction and the extra variable X
metData = pd.DataFrame({'speeds': speeds, 'directions': directions, 'X': X})
What I need to do is find, for each combination of speed and direction, the element of dataTable whose bins bound that pair.
I’m sure there is some pandas #magic that will do the following:
- For each (speed, direction) pair:
  - Find the column in dataTable whose header bounds the speed
  - Find the row in dataTable whose index value bounds the direction
  - Return the data value stored at that row and column.
For example: if my speed
is 0.75
and my direction
is 350
then I want to get the value at element dataTable.iat[7,1]
. 0.75
is between 0.5
and 1.0
(the column header values) and similarly for the direction, but with the index values.
I hope I’ve described this well.
Input data
The DataFrame for the boundaries:
import pandas as pd
import numpy as np
# For reproducibility: fixing the seed makes the random table below repeatable
np.random.seed(0)
# These are the boundaries (bin edges) of our wind speeds and directions
speed_bins = np.array([0.0, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0])
# 9 evenly spaced edges from 0 to 360, i.e. 8 direction bins
dir_bins = np.linspace(0,360,9)
# Random LPF values from 0–0.5, one per (direction bin, speed bin) cell
data = np.random.rand(len(dir_bins)-1, len(speed_bins)-1)*0.5
# Label rows (directions) and columns (speeds) with the upper edge of each bin;
# the leading 0 edge of both axes is dropped by the [1:] slices.
df = pd.DataFrame(data, index=dir_bins[1:],columns=speed_bins[1:])
print(df)
0.5 1.0 2.0 5.0 10.0 100.0
45.0 0.274407 0.357595 0.301382 0.272442 0.211827 0.322947
90.0 0.218794 0.445887 0.481831 0.191721 0.395863 0.264447
135.0 0.284022 0.462798 0.035518 0.043565 0.010109 0.416310
180.0 0.389078 0.435006 0.489309 0.399579 0.230740 0.390265
225.0 0.059137 0.319961 0.071677 0.472334 0.260924 0.207331
270.0 0.132278 0.387117 0.228075 0.284217 0.009395 0.308818
315.0 0.306048 0.308467 0.471874 0.340910 0.179754 0.218516
360.0 0.348816 0.030113 0.333383 0.335319 0.105191 0.064463
The DataFrame for the meteorological data:
# Randomly generate n values each for speeds and directions, to be looked up
# in the boundary table.
n = 1000000
# rand() is scaled by 4 and by 360, so speeds stay below 4 and directions below 360
speeds = np.random.rand(n) * 4
directions = np.random.rand(n) * 360
# One row per (speed, direction) observation
met_data = pd.DataFrame({'speeds': speeds,
'directions': directions})
print(met_data.head(10))
speeds directions
0 1.261713 144.163960
1 1.454843 236.032725
2 2.280787 119.731554
3 1.754406 173.319830
4 3.953495 80.739232
5 0.408179 146.199108
6 0.835507 255.461460
7 0.645238 13.960304
8 2.612433 64.738510
9 1.013166 147.860518
Solution
The pd.cut()
function is especially useful to solve your problem. It can be used to assign the speed and direction values to the bin intervals you have defined.
# Get the bin intervals that each of the speeds and directions lie in.
# df's column and index labels are the upper bin edges, so prepending 0
# rebuilds the full list of edges. As the printed result shows, each interval
# is open on the left and closed on the right.
# NOTE(review): a value of exactly 0 therefore appears to fall outside the
# first interval — confirm against pd.cut's include_lowest option.
speed_intervals = pd.cut(met_data['speeds'], bins=[0] + df.columns.tolist())
direction_intervals = pd.cut(met_data['directions'], bins=[0] + df.index.tolist())
# Create a new DataFrame with these values as columns.
intervals = pd.concat([speed_intervals, direction_intervals], axis=1)
print(intervals.head(10))
speeds directions
0 (1.0, 2.0] (135.0, 180.0]
1 (1.0, 2.0] (225.0, 270.0]
2 (2.0, 5.0] (90.0, 135.0]
3 (1.0, 2.0] (135.0, 180.0]
4 (2.0, 5.0] (45.0, 90.0]
5 (0.0, 0.5] (135.0, 180.0]
6 (0.5, 1.0] (225.0, 270.0]
7 (0.5, 1.0] (0.0, 45.0]
8 (2.0, 5.0] (45.0, 90.0]
9 (1.0, 2.0] (135.0, 180.0]
From there, use .groupby()
to group this DataFrame by the (speeds
, directions
) combinations. Then for each group find the data value from the boundaries table, corresponding to the right limits of these combinations.
# Work on a copy so that the original met_data frame is left untouched
met_data2 = met_data.copy()
# Create the result column on the copy, under the name the loop below fills.
# Rows that fall outside every bin keep NaN.
met_data2['values'] = np.nan
# Grouping by the (speed, direction) interval combinations, iterate over each
# group. observed=True restricts the grouping to combinations that actually
# occur in the data.
for iv, g in intervals.groupby(['speeds', 'directions'], observed=True):
    # Extract the right (upper) boundaries of the speed and direction intervals
    speed_limit = iv[0].right
    dir_limit = iv[1].right
    # df is labelled by upper bin edges (rows = directions, columns = speeds),
    # so these two limits address the matching cell of the data grid. Set that
    # value on every met_data2 row belonging to this interval group.
    met_data2.loc[g.index, 'values'] = df.at[dir_limit, speed_limit]
print(met_data2.head(10))
Final output:
speeds directions values
0 1.261713 144.163960 0.489309
1 1.454843 236.032725 0.228075
2 2.280787 119.731554 0.043565
3 1.754406 173.319830 0.489309
4 3.953495 80.739232 0.191721
5 0.408179 146.199108 0.389078
6 0.835507 255.461460 0.387117
7 0.645238 13.960304 0.357595
8 2.612433 64.738510 0.191721
9 1.013166 147.860518 0.489309
Full code (as a generalised function):
def get_data_values(data_table, mt_data, col_x='speeds', col_y='directions'):
    """
    Look up, for every row of ``mt_data``, the ``data_table`` cell whose bins
    contain that row's (``col_x``, ``col_y``) pair.

    ``data_table`` is read as a grid labelled by the upper edge of each bin:
    its columns hold the upper edges along the x axis and its index the upper
    edges along the y axis. A lower edge of 0 is prepended to both axes, so
    the bins are ``[0, e1], (e1, e2], ...``. The edges are handed to
    ``pd.cut``, which expects them to be increasing and unique.

    Parameters
    ----------
    data_table : pandas.DataFrame
        Boundary table; columns are x-axis upper edges, index is y-axis upper
        edges.
    mt_data : pandas.DataFrame
        Observations containing the columns ``col_x`` and ``col_y``. Any index
        is accepted; results are returned in row order.
    col_x, col_y : str
        Names of the ``mt_data`` columns matched against the table's columns
        and index respectively.

    Returns
    -------
    numpy.ndarray
        One float per row of ``mt_data``. Rows whose x or y value is missing
        or lies outside every bin are left at 0.0.
    """
    x_edges = [0] + data_table.columns.tolist()
    y_edges = [0] + data_table.index.tolist()

    # labels=False yields each row's 0-based bin number (NaN when the value is
    # in no bin); include_lowest=True lets a value of exactly 0 fall into the
    # first bin instead of being discarded.
    x_pos = np.asarray(
        pd.cut(mt_data[col_x], bins=x_edges, labels=False, include_lowest=True)
    )
    y_pos = np.asarray(
        pd.cut(mt_data[col_y], bins=y_edges, labels=False, include_lowest=True)
    )

    data = np.zeros(len(mt_data))
    inside = ~(pd.isna(x_pos) | pd.isna(y_pos))

    # Bin numbers are positions into the table's rows/columns, so the lookup
    # is purely positional and does not depend on mt_data's index labels.
    grid = np.asarray(data_table)
    data[inside] = grid[y_pos[inside].astype(int), x_pos[inside].astype(int)]
    return data
Usage:
# Find the values from the boundary table where each of the (speed, direction) combinations fit in.
met_data['data'] = get_data_values(df, met_data, col_x='speeds', col_y='directions')