Replacing integers in a string with corresponding weekdays
Question:
I want to find a more elegant way to replace integers in a string with Weekdays using pyspark. Maybe using a mapping or something
A string might look like '0,1,2,3,4,5,6'
and should be converted into SU, MO, TU, WE, TH, FR, SA
My current solution is to call regexp_replace seven times:
# Seven manually nested regexp_replace calls: each one maps a single digit
# (0-6) in the "by_day" column to its weekday abbreviation (SU..SA).
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(f.regexp_replace(f.col("by_day"), "0", "SU"), "1", "MO"),
"2",
"TU",
),
"3",
"WE",
),
"4",
"TH",
),
"5",
"FR",
),
"6",
"SA",
)
Answers:
How about using a UDF together with a num_map dictionary, like below:
# Lookup table: day digit -> two-letter weekday abbreviation.
num_map = {'0': 'SU', '1': 'MO', '2': 'TU', '3': 'WE', '4': 'TH', '5': 'FR', '6': 'SA'}

@udf(returnType=StringType())
def string_mapping(ori_date):
    """Translate a comma-separated string of day digits into weekday codes.

    A null (None) input yields an empty string; otherwise each digit is
    looked up in num_map and the results are re-joined with commas.
    """
    if ori_date is None:
        return ''
    return ','.join(num_map[digit] for digit in ori_date.split(','))
# Example DataFrame: comma-separated day-number strings plus a null row
# to exercise the UDF's None handling.
data = spark.createDataFrame([
('0,1,2,3,4,5,6',),
('0,3,6',),
('2,5',),
(None,),
], ['col1'])
# Show the raw input without truncation.
data.show(10, False)
# +-------------+
# |col1 |
# +-------------+
# |0,1,2,3,4,5,6|
# |0,3,6 |
# |2,5 |
# | null|
# +-------------+
# Apply the UDF: digits become weekday abbreviations; null becomes ''.
data.withColumn('weeks', string_mapping(col('col1'))).show(10)
# +-------------+--------------------+
# | col1| weeks|
# +-------------+--------------------+
# |0,1,2,3,4,5,6|SU,MO,TU,WE,TH,FR,SA|
# | 0,3,6| SU,WE,SA|
# | 2,5| TU,FR|
# | null| |
# +-------------+--------------------+
You can simplify it by using functools.reduce
to dynamically generate the regexp_replace
expressions from a mapping dict like this:
from functools import reduce
from pyspark.sql import functions as F

# Sample DataFrame with one comma-separated day-number string.
df = spark.createDataFrame([("0,1,2,3,4,5,6",)], ["by_day"])
# digit -> weekday abbreviation
mapping = {'0': 'SU', '1': 'MO', '2': 'TU', '3': 'WE', '4': 'TH', '5': 'FR', '6': 'SA'}
# Fold the mapping into a chain of regexp_replace expressions —
# equivalent to the seven manually nested calls, but generated dynamically.
df.withColumn(
"by_day",
reduce(lambda a, b: F.regexp_replace(a, b[0], b[1]), mapping.items(), F.col("by_day"))
).show()
# +--------------------+
# | by_day|
# +--------------------+
# |SU,MO,TU,WE,TH,FR,SA|
# +--------------------+
One could also split the string, then use the transform function to replace each value via a map literal expression, and finally join the array back into a string column.
I want to find a more elegant way to replace integers in a string with Weekdays using pyspark. Maybe using a mapping or something
A string might look like '0,1,2,3,4,5,6'
and should be converted into SU, MO, TU, WE, TH, FR, SA
My current solution is to call regexp_replace seven times:
# Seven manually nested regexp_replace calls: each one maps a single digit
# (0-6) in the "by_day" column to its weekday abbreviation (SU..SA).
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(
f.regexp_replace(f.regexp_replace(f.col("by_day"), "0", "SU"), "1", "MO"),
"2",
"TU",
),
"3",
"WE",
),
"4",
"TH",
),
"5",
"FR",
),
"6",
"SA",
)
How about using a UDF together with a num_map dictionary, like below:
# Lookup table: day digit -> two-letter weekday abbreviation.
num_map = {'0': 'SU', '1': 'MO', '2': 'TU', '3': 'WE', '4': 'TH', '5': 'FR', '6': 'SA'}

@udf(returnType=StringType())
def string_mapping(ori_date):
    """Translate a comma-separated string of day digits into weekday codes.

    A null (None) input yields an empty string; otherwise each digit is
    looked up in num_map and the results are re-joined with commas.
    """
    if ori_date is None:
        return ''
    return ','.join(num_map[digit] for digit in ori_date.split(','))
# Example DataFrame: comma-separated day-number strings plus a null row
# to exercise the UDF's None handling.
data = spark.createDataFrame([
('0,1,2,3,4,5,6',),
('0,3,6',),
('2,5',),
(None,),
], ['col1'])
# Show the raw input without truncation.
data.show(10, False)
# +-------------+
# |col1 |
# +-------------+
# |0,1,2,3,4,5,6|
# |0,3,6 |
# |2,5 |
# | null|
# +-------------+
# Apply the UDF: digits become weekday abbreviations; null becomes ''.
data.withColumn('weeks', string_mapping(col('col1'))).show(10)
# +-------------+--------------------+
# | col1| weeks|
# +-------------+--------------------+
# |0,1,2,3,4,5,6|SU,MO,TU,WE,TH,FR,SA|
# | 0,3,6| SU,WE,SA|
# | 2,5| TU,FR|
# | null| |
# +-------------+--------------------+
You can simplify it by using functools.reduce
to dynamically generate the regexp_replace
expressions from a mapping dict like this:
from functools import reduce
from pyspark.sql import functions as F

# Sample DataFrame with one comma-separated day-number string.
df = spark.createDataFrame([("0,1,2,3,4,5,6",)], ["by_day"])
# digit -> weekday abbreviation
mapping = {'0': 'SU', '1': 'MO', '2': 'TU', '3': 'WE', '4': 'TH', '5': 'FR', '6': 'SA'}
# Fold the mapping into a chain of regexp_replace expressions —
# equivalent to the seven manually nested calls, but generated dynamically.
df.withColumn(
"by_day",
reduce(lambda a, b: F.regexp_replace(a, b[0], b[1]), mapping.items(), F.col("by_day"))
).show()
# +--------------------+
# | by_day|
# +--------------------+
# |SU,MO,TU,WE,TH,FR,SA|
# +--------------------+
One could also split the string, then use the transform function to replace each value via a map literal expression, and finally join the array back into a string column.