pandas make dummys on small datasets
Question:
Can someone help me with this function to make dummies:
def make_dummies(df):
# Create dummies for all hours of the day
hours = pd.get_dummies(df.index.hour, prefix='hour')
# Create columns for hour, day of week, weekend, and month
df['hour'] = df.index.strftime('%H')
df['day_of_week'] = df.index.dayofweek
df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
df['month'] = df.index.month
# Create dummies for hours of the day
hour_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Create dummies for all days of the week
day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
day_dummies = pd.get_dummies(all_days)
# Create dummies for all months of the year
month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
month_dummies = pd.get_dummies(all_months)
# Merge all dummies with original DataFrame
df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)
# Drop redundant columns
df = df.drop(['hour', 'day_of_week', 'month'], axis=1)
return df
On a small dataset like this:
import pandas as pd
import numpy as np
data = {"temp":[53.13,52.93,52.56,51.58,47.57],
"Date":["2023-04-07 15:00:00-05:00","2023-04-07 16:00:00-05:00","2023-04-07 17:00:00-05:00","2023-04-07 18:00:00-05:00","2023-04-07 19:00:00-05:00"]
}
df = pd.DataFrame(data).set_index("Date")
# Converting the index as date
df.index = pd.to_datetime(df.index)
df = make_dummies(df)
print(df)
This wont merge the data correctly. I apologize for the screenshot but the function is just stacking dummy variables beneath where what I was hoping for is ALL dummy variables would be added to the df and not stacked beneath. Hopefully this makes sense, was hoping to make a function that creates all dummy variables for each hour, month, and day type.
Answers:
You just missed some set_index
to align indexes on pd.concat
:
def make_dummies(df):
# Create dummies for all hours of the day
hours = pd.get_dummies(df.index.hour, prefix='hour').set_index(df.index) # HERE
# Create columns for hour, day of week, weekend, and month
df['hour'] = df.index.strftime('%H')
df['day_of_week'] = df.index.dayofweek
df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
df['month'] = df.index.month
# Create dummies for hours of the day
hour_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Create dummies for all days of the week
day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
day_dummies = pd.get_dummies(all_days).set_index(df.index) # HERE
# Create dummies for all months of the year
month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
month_dummies = pd.get_dummies(all_months).set_index(df.index) # HERE
# Merge all dummies with original DataFrame
df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)
# Drop redundant columns
df = df.drop(['hour', 'day_of_week', 'month'], axis=1)
return df
Note: I think hours
and hour_dummies
are redundant.
Output:
>>> make_dummies(df)
temp weekend hour_15 hour_16 hour_17 hour_18 hour_19 hour_15 hour_16 ... apr may jun jul aug sep oct nov dec
Date ...
2023-04-07 15:00:00-05:00 53.13 0 1 0 0 0 0 1 0 ... 1 0 0 0 0 0 0 0 0
2023-04-07 16:00:00-05:00 52.93 0 0 1 0 0 0 0 1 ... 1 0 0 0 0 0 0 0 0
2023-04-07 17:00:00-05:00 52.56 0 0 0 1 0 0 0 0 ... 1 0 0 0 0 0 0 0 0
2023-04-07 18:00:00-05:00 51.58 0 0 0 0 1 0 0 0 ... 1 0 0 0 0 0 0 0 0
2023-04-07 19:00:00-05:00 47.57 0 0 0 0 0 1 0 0 ... 1 0 0 0 0 0 0 0 0
[5 rows x 31 columns]
For what its worth here’s the sci-kit learn version looks a bit daunting but also appears to work:
from sklearn.preprocessing import OneHotEncoder
def make_dummies(df):
# Create a new DataFrame to hold the encoded data
encoded_df = df.copy()
# Create a OneHotEncoder object for hours of the day
hour_encoder = OneHotEncoder(categories=[range(24)], sparse=False)
# Encode the hour column
hour_encoded = hour_encoder.fit_transform(encoded_df.index.hour.values.reshape(-1, 1))
hour_columns = [f'hour_{i}' for i in range(24)]
hour_df = pd.DataFrame(hour_encoded, columns=hour_columns, index=encoded_df.index)
# Create a OneHotEncoder object for days of the week
day_encoder = OneHotEncoder(categories=[range(7)], sparse=False)
# Encode the day_of_week column
day_encoded = day_encoder.fit_transform(encoded_df.index.dayofweek.values.reshape(-1, 1))
day_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
day_df = pd.DataFrame(day_encoded, columns=day_columns, index=encoded_df.index)
# Create a OneHotEncoder object for months of the year
month_encoder = OneHotEncoder(categories=[range(1, 13)], sparse=False)
# Encode the month column
month_encoded = month_encoder.fit_transform(encoded_df.index.month.values.reshape(-1, 1))
month_columns = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_df = pd.DataFrame(month_encoded, columns=month_columns, index=encoded_df.index)
# Merge all dummies with original DataFrame
encoded_df = pd.concat([encoded_df, hour_df, day_df, month_df], axis=1)
return encoded_df
Can someone help me with this function to make dummies:
def make_dummies(df):
# Create dummies for all hours of the day
hours = pd.get_dummies(df.index.hour, prefix='hour')
# Create columns for hour, day of week, weekend, and month
df['hour'] = df.index.strftime('%H')
df['day_of_week'] = df.index.dayofweek
df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
df['month'] = df.index.month
# Create dummies for hours of the day
hour_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Create dummies for all days of the week
day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
day_dummies = pd.get_dummies(all_days)
# Create dummies for all months of the year
month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
month_dummies = pd.get_dummies(all_months)
# Merge all dummies with original DataFrame
df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)
# Drop redundant columns
df = df.drop(['hour', 'day_of_week', 'month'], axis=1)
return df
On a small dataset like this:
import pandas as pd
import numpy as np
data = {"temp":[53.13,52.93,52.56,51.58,47.57],
"Date":["2023-04-07 15:00:00-05:00","2023-04-07 16:00:00-05:00","2023-04-07 17:00:00-05:00","2023-04-07 18:00:00-05:00","2023-04-07 19:00:00-05:00"]
}
df = pd.DataFrame(data).set_index("Date")
# Converting the index as date
df.index = pd.to_datetime(df.index)
df = make_dummies(df)
print(df)
This wont merge the data correctly. I apologize for the screenshot but the function is just stacking dummy variables beneath where what I was hoping for is ALL dummy variables would be added to the df and not stacked beneath. Hopefully this makes sense, was hoping to make a function that creates all dummy variables for each hour, month, and day type.
You just missed some set_index
to align indexes on pd.concat
:
def make_dummies(df):
# Create dummies for all hours of the day
hours = pd.get_dummies(df.index.hour, prefix='hour').set_index(df.index) # HERE
# Create columns for hour, day of week, weekend, and month
df['hour'] = df.index.strftime('%H')
df['day_of_week'] = df.index.dayofweek
df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
df['month'] = df.index.month
# Create dummies for hours of the day
hour_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Create dummies for all days of the week
day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
day_dummies = pd.get_dummies(all_days).set_index(df.index) # HERE
# Create dummies for all months of the year
month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
month_dummies = pd.get_dummies(all_months).set_index(df.index) # HERE
# Merge all dummies with original DataFrame
df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)
# Drop redundant columns
df = df.drop(['hour', 'day_of_week', 'month'], axis=1)
return df
Note: I think hours
and hour_dummies
are redundant.
Output:
>>> make_dummies(df)
temp weekend hour_15 hour_16 hour_17 hour_18 hour_19 hour_15 hour_16 ... apr may jun jul aug sep oct nov dec
Date ...
2023-04-07 15:00:00-05:00 53.13 0 1 0 0 0 0 1 0 ... 1 0 0 0 0 0 0 0 0
2023-04-07 16:00:00-05:00 52.93 0 0 1 0 0 0 0 1 ... 1 0 0 0 0 0 0 0 0
2023-04-07 17:00:00-05:00 52.56 0 0 0 1 0 0 0 0 ... 1 0 0 0 0 0 0 0 0
2023-04-07 18:00:00-05:00 51.58 0 0 0 0 1 0 0 0 ... 1 0 0 0 0 0 0 0 0
2023-04-07 19:00:00-05:00 47.57 0 0 0 0 0 1 0 0 ... 1 0 0 0 0 0 0 0 0
[5 rows x 31 columns]
For what its worth here’s the sci-kit learn version looks a bit daunting but also appears to work:
from sklearn.preprocessing import OneHotEncoder
def make_dummies(df):
# Create a new DataFrame to hold the encoded data
encoded_df = df.copy()
# Create a OneHotEncoder object for hours of the day
hour_encoder = OneHotEncoder(categories=[range(24)], sparse=False)
# Encode the hour column
hour_encoded = hour_encoder.fit_transform(encoded_df.index.hour.values.reshape(-1, 1))
hour_columns = [f'hour_{i}' for i in range(24)]
hour_df = pd.DataFrame(hour_encoded, columns=hour_columns, index=encoded_df.index)
# Create a OneHotEncoder object for days of the week
day_encoder = OneHotEncoder(categories=[range(7)], sparse=False)
# Encode the day_of_week column
day_encoded = day_encoder.fit_transform(encoded_df.index.dayofweek.values.reshape(-1, 1))
day_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
day_df = pd.DataFrame(day_encoded, columns=day_columns, index=encoded_df.index)
# Create a OneHotEncoder object for months of the year
month_encoder = OneHotEncoder(categories=[range(1, 13)], sparse=False)
# Encode the month column
month_encoded = month_encoder.fit_transform(encoded_df.index.month.values.reshape(-1, 1))
month_columns = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_df = pd.DataFrame(month_encoded, columns=month_columns, index=encoded_df.index)
# Merge all dummies with original DataFrame
encoded_df = pd.concat([encoded_df, hour_df, day_df, month_df], axis=1)
return encoded_df