Apache beam – look back x mins from each element
Question:
I am trying to calculate the total number of transactions done by each customer in last x min.
Let’s say there are a total of 3 elements, I would like to look back last 5 minutes and find the sum for each customer.
- {"event_time": 12:01, "customer": "A", "amount": 10.00} -> look up the last 5 minutes (from event time 11:55 to 12:00) and sum (result: 0)
- {"event_time": 12:02, "customer": "A", "amount": 20.00} -> look up the last 5 minutes (from event time 11:56 to 12:01) and sum (result: 10)
- {"event_time": 12:03, "customer": "A", "amount": 30.00} -> look up the last 5 minutes (from event time 11:57 to 12:02) and sum (result: 10 + 20)
The expected output is
- {"event_time": 12:01, "customer": "A", "amount": 10.00, "sum": 0.00}
- {"event_time": 12:02, "customer": "A", "amount": 20.00, "sum": 10.00}
- {"event_time": 12:03, "customer": "A", "amount": 30.00, "sum": 20.00}
It’s similar to the sliding window, but for each element instead of a timed window. Can it be done with Apache Beam?
Here is the code to create dummy data
import time
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.combiners import MeanCombineFn, CountCombineFn
def to_unix_time(time_str: str, time_format='%Y-%m-%d %H:%M:%S') -> int:
    """Convert a local-time string such as '2022-08-04 08:36:00' into
    Unix epoch seconds, truncated to an int."""
    parsed = time.strptime(time_str, time_format)
    epoch_seconds = time.mktime(parsed)
    return int(epoch_seconds)
@beam.ptransform_fn
@beam.typehints.with_input_types(beam.pvalue.PBegin)
@beam.typehints.with_output_types(beam.window.TimestampedValue)
def CreateData(pipeline):
    """Emit 20 dummy transactions as timestamped elements.

    Customers 'A' and 'B' alternate, one transaction every 30 seconds
    starting at 2022-08-04 08:36:00, each for an amount of 10.00.
    Every element is stamped with its TX_TS as the Beam event time.
    """
    first_minute = 36
    dummy_rows = []
    for i in range(20):
        minute = first_minute + i // 2   # advance one minute per customer pair
        second = 30 * (i % 2)            # 'B' trails 'A' by 30 seconds
        dummy_rows.append({
            'TX_ID': f'{i + 1:02d}',
            'TX_TS': f'2022-08-04 08:{minute:02d}:{second:02d}',
            'CUSTOMER_ID': 'A' if i % 2 == 0 else 'B',
            'TX_AMOUNT': 10.00,
        })
    return (
        pipeline
        | 'Create dummy data' >> beam.Create(dummy_rows)
        | 'With timestamps' >> beam.Map(
            lambda x: beam.window.TimestampedValue(x, to_unix_time(x['TX_TS']))
        )
    )
Things that I have tried so far:
- Used Fixed Windows
window_size = 60 * 5  # fixed window length: 5 minutes

beam_options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=beam_options) as p:
    # Aggregate each customer's transactions inside non-overlapping
    # 5-minute windows, then print one summary row per (customer, window).
    per_customer = (
        p
        | 'Create dummy data' >> CreateData()
        | 'Fixed windows' >> beam.WindowInto(beam.window.FixedWindows(window_size))
        | 'Change to namedtuple' >> beam.Map(lambda row: beam.Row(**row))
        | 'Group By' >> beam.GroupBy('CUSTOMER_ID')
            .aggregate_field('TX_AMOUNT', sum, 'total_amount_5min')
            .aggregate_field('TX_AMOUNT', CountCombineFn(), 'tx_count_5min')
            .aggregate_field('TX_AMOUNT', MeanCombineFn(), 'average_amount_5min')
    )
    per_customer | 'Print element info' >> beam.Map(print)
- Used Sliding Windows
window_size = 60 * 5  # sliding window length: 5 minutes
window_period = 60    # a new window starts every 1 minute

beam_options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=beam_options) as pipeline:
    # Aggregate each customer's transactions inside overlapping 5-minute
    # windows (one per minute); each element lands in 5 windows.
    (
        pipeline
        | 'Create dummy data' >> CreateData()
        # FIX: the step label used to say 'Fixed windows' (copy-pasted from
        # the previous attempt) even though this applies SlidingWindows.
        | 'Sliding windows' >> beam.WindowInto(beam.window.SlidingWindows(window_size, window_period))
        | 'Change to namedtuple' >> beam.Map(lambda x: beam.Row(**x))
        | 'Group By' >> beam.GroupBy('CUSTOMER_ID')
            .aggregate_field('TX_AMOUNT', sum, 'total_amount_5min')
            .aggregate_field('TX_AMOUNT', CountCombineFn(), 'tx_count_5min')
            .aggregate_field('TX_AMOUNT', MeanCombineFn(), 'average_amount_5min')
        | 'Print element info' >> beam.Map(print)
    )
Both options didn’t give what I want. Any help is appreciated. Thank you.
Answers:
I don’t think window functions will really help in your case, since you need a 5-minute window for each element.
You can try a more Python-oriented approach inside the pipeline.
EG:
import datetime
def convert(date_time):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a naive datetime.

    FIX: the original bound the pattern to a local named ``format``,
    shadowing the builtin, and stored the parsed ``datetime`` in a
    variable misleadingly named ``datetime_str``.
    """
    return datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S')
def process(rows):
    """Attach a rolling 5-minute look-back sum to one customer's transactions.

    Args:
        rows: A ``(customer_id, transactions)`` pair as produced by
            GroupByKey, where ``transactions`` is an iterable of dicts
            with at least 'TX_TS' ('YYYY-MM-DD HH:MM:SS' string) and
            'TX_AMOUNT' (float) keys.

    Returns:
        The list of transaction dicts, each mutated in place with a 'sum'
        key: the total TX_AMOUNT of this customer's transactions whose
        timestamp lies in [t - 5min, t - 1min], both bounds inclusive,
        where t is the transaction's own timestamp.

    Fixes over the original: removed the unused ``result`` variable,
    removed the comprehension variable that shadowed the outer loop's
    ``row``, and parsed each timestamp once up front instead of
    re-parsing every TX_TS on every window check (O(n) parses, not O(n^2)).
    """
    _, transactions = rows
    # Parse every timestamp exactly once.
    stamped = [
        (datetime.datetime.strptime(tx['TX_TS'], '%Y-%m-%d %H:%M:%S'), tx)
        for tx in transactions
    ]
    for ts, tx in stamped:
        window_start = ts - datetime.timedelta(minutes=5)
        window_end = ts - datetime.timedelta(minutes=1)
        tx['sum'] = sum(
            other['TX_AMOUNT']
            for other_ts, other in stamped
            if window_start <= other_ts <= window_end
        )
    return transactions
with beam.Pipeline(options=beam_options) as p:
    # Key each transaction by customer, collect per-customer lists, then
    # compute the per-element look-back sums in plain Python.
    (
        p
        | 'Create dummy data' >> CreateData()
        | beam.Map(lambda tx: (tx['CUSTOMER_ID'], tx))
        | beam.GroupByKey()
        | beam.Map(process)
        | 'Print element info' >> beam.Map(print)
    )
Also, you might need to do some post-processing of the result, as it will be grouped by customer ID.
Sorry for the delay. I managed to solve this using lambda functions and window end time.
The sample code can be found here (https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/03_feature_engineering_streaming.ipynb)
I am trying to calculate the total number of transactions done by each customer in last x min.
Let’s say there are a total of 3 elements, I would like to look back last 5 minutes and find the sum for each customer.
- {"event_time": 12:01, "customer": "A", "amount": 10.00} -> look up the last 5 minutes (from event time 11:55 to 12:00) and sum (result: 0)
- {"event_time": 12:02, "customer": "A", "amount": 20.00} -> look up the last 5 minutes (from event time 11:56 to 12:01) and sum (result: 10)
- {"event_time": 12:03, "customer": "A", "amount": 30.00} -> look up the last 5 minutes (from event time 11:57 to 12:02) and sum (result: 10 + 20)
The expected output is
- {"event_time": 12:01, "customer": "A", "amount": 10.00, "sum": 0.00}
- {"event_time": 12:02, "customer": "A", "amount": 20.00, "sum": 10.00}
- {"event_time": 12:03, "customer": "A", "amount": 30.00, "sum": 20.00}
It’s similar to the sliding window, but for each element instead of a timed window. Can it be done with Apache Beam?
Here is the code to create dummy data
import time
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.combiners import MeanCombineFn, CountCombineFn
def to_unix_time(time_str: str, time_format='%Y-%m-%d %H:%M:%S') -> int:
    """Convert a local-time string such as '2022-08-04 08:36:00' into
    Unix epoch seconds, truncated to an int."""
    parsed = time.strptime(time_str, time_format)
    epoch_seconds = time.mktime(parsed)
    return int(epoch_seconds)
@beam.ptransform_fn
@beam.typehints.with_input_types(beam.pvalue.PBegin)
@beam.typehints.with_output_types(beam.window.TimestampedValue)
def CreateData(pipeline):
    """Emit 20 dummy transactions as timestamped elements.

    Customers 'A' and 'B' alternate, one transaction every 30 seconds
    starting at 2022-08-04 08:36:00, each for an amount of 10.00.
    Every element is stamped with its TX_TS as the Beam event time.
    """
    first_minute = 36
    dummy_rows = []
    for i in range(20):
        minute = first_minute + i // 2   # advance one minute per customer pair
        second = 30 * (i % 2)            # 'B' trails 'A' by 30 seconds
        dummy_rows.append({
            'TX_ID': f'{i + 1:02d}',
            'TX_TS': f'2022-08-04 08:{minute:02d}:{second:02d}',
            'CUSTOMER_ID': 'A' if i % 2 == 0 else 'B',
            'TX_AMOUNT': 10.00,
        })
    return (
        pipeline
        | 'Create dummy data' >> beam.Create(dummy_rows)
        | 'With timestamps' >> beam.Map(
            lambda x: beam.window.TimestampedValue(x, to_unix_time(x['TX_TS']))
        )
    )
Things that I have tried so far:
- Used Fixed Windows
window_size = 60 * 5  # fixed window length: 5 minutes

beam_options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=beam_options) as p:
    # Aggregate each customer's transactions inside non-overlapping
    # 5-minute windows, then print one summary row per (customer, window).
    per_customer = (
        p
        | 'Create dummy data' >> CreateData()
        | 'Fixed windows' >> beam.WindowInto(beam.window.FixedWindows(window_size))
        | 'Change to namedtuple' >> beam.Map(lambda row: beam.Row(**row))
        | 'Group By' >> beam.GroupBy('CUSTOMER_ID')
            .aggregate_field('TX_AMOUNT', sum, 'total_amount_5min')
            .aggregate_field('TX_AMOUNT', CountCombineFn(), 'tx_count_5min')
            .aggregate_field('TX_AMOUNT', MeanCombineFn(), 'average_amount_5min')
    )
    per_customer | 'Print element info' >> beam.Map(print)
- Used Sliding Windows
window_size = 60 * 5  # sliding window length: 5 minutes
window_period = 60    # a new window starts every 1 minute

beam_options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=beam_options) as pipeline:
    # Aggregate each customer's transactions inside overlapping 5-minute
    # windows (one per minute); each element lands in 5 windows.
    (
        pipeline
        | 'Create dummy data' >> CreateData()
        # FIX: the step label used to say 'Fixed windows' (copy-pasted from
        # the previous attempt) even though this applies SlidingWindows.
        | 'Sliding windows' >> beam.WindowInto(beam.window.SlidingWindows(window_size, window_period))
        | 'Change to namedtuple' >> beam.Map(lambda x: beam.Row(**x))
        | 'Group By' >> beam.GroupBy('CUSTOMER_ID')
            .aggregate_field('TX_AMOUNT', sum, 'total_amount_5min')
            .aggregate_field('TX_AMOUNT', CountCombineFn(), 'tx_count_5min')
            .aggregate_field('TX_AMOUNT', MeanCombineFn(), 'average_amount_5min')
        | 'Print element info' >> beam.Map(print)
    )
Both options didn’t give what I want. Any help is appreciated. Thank you.
I don’t think window functions will really help in your case, since you need a 5-minute window for each element.
You can try a more Python-oriented approach inside the pipeline.
EG:
import datetime
def convert(date_time):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a naive datetime.

    FIX: the original bound the pattern to a local named ``format``,
    shadowing the builtin, and stored the parsed ``datetime`` in a
    variable misleadingly named ``datetime_str``.
    """
    return datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S')
def process(rows):
    """Attach a rolling 5-minute look-back sum to one customer's transactions.

    Args:
        rows: A ``(customer_id, transactions)`` pair as produced by
            GroupByKey, where ``transactions`` is an iterable of dicts
            with at least 'TX_TS' ('YYYY-MM-DD HH:MM:SS' string) and
            'TX_AMOUNT' (float) keys.

    Returns:
        The list of transaction dicts, each mutated in place with a 'sum'
        key: the total TX_AMOUNT of this customer's transactions whose
        timestamp lies in [t - 5min, t - 1min], both bounds inclusive,
        where t is the transaction's own timestamp.

    Fixes over the original: removed the unused ``result`` variable,
    removed the comprehension variable that shadowed the outer loop's
    ``row``, and parsed each timestamp once up front instead of
    re-parsing every TX_TS on every window check (O(n) parses, not O(n^2)).
    """
    _, transactions = rows
    # Parse every timestamp exactly once.
    stamped = [
        (datetime.datetime.strptime(tx['TX_TS'], '%Y-%m-%d %H:%M:%S'), tx)
        for tx in transactions
    ]
    for ts, tx in stamped:
        window_start = ts - datetime.timedelta(minutes=5)
        window_end = ts - datetime.timedelta(minutes=1)
        tx['sum'] = sum(
            other['TX_AMOUNT']
            for other_ts, other in stamped
            if window_start <= other_ts <= window_end
        )
    return transactions
with beam.Pipeline(options=beam_options) as p:
    # Key each transaction by customer, collect per-customer lists, then
    # compute the per-element look-back sums in plain Python.
    (
        p
        | 'Create dummy data' >> CreateData()
        | beam.Map(lambda tx: (tx['CUSTOMER_ID'], tx))
        | beam.GroupByKey()
        | beam.Map(process)
        | 'Print element info' >> beam.Map(print)
    )
Also, you might need to do some post-processing of the result, as it will be grouped by customer ID.
Sorry for the delay. I managed to solve this using lambda functions and window end time.
The sample code can be found here (https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/03_feature_engineering_streaming.ipynb)