how to groupby rows and create new columns on pyspark
Question:
original dataframe
id
email
name
1
[email protected]
john
2
[email protected]
Maike
2
id2@second
Maike
1
[email protected]
john
I want to convert to this
id
email
email1
name
1
[email protected]
[email protected]
john
2
[email protected]
id2@second
Maike
it’s only an example, I have very large file and more than 60 columns
im using
# Read the CSV with its first line treated as the header row.
# The chained call is wrapped in parentheses so it may span two lines;
# without them the statement ends after `.option(...)` and the following
# line beginning with `.csv` is a SyntaxError.
# The file name is spelled "contacts.csv" (was "contatcs.csv") to match the
# pandas-on-Spark read of the same data.
df = (
    spark.read.option("header", True)
    .csv("contacts.csv", sep=",")
)
but it also works with the pyspark.pandas API
import pyspark.pandas as ps
# Load 'contacts.csv' (comma-separated) through the pandas-on-Spark module `ps`.
df = ps.read_csv('contacts.csv', sep=',')
# Preview the first rows of the frame.
df.head()
but I prefer spark.read because it is lazily evaluated
and the pandas API is not
Answers:
In order to do it deterministically in Spark, you must have some rule to determine which email is first and which is second. The row order in the CSV file (without a dedicated row-number column) is a bad rule when you work with Spark, because every row may go to a different node, and then you can no longer tell which row was first and which was second.
In the following example, I assume that the rule is alphabetical order, so I collect all the emails into one array using collect_set and then sort them using array_sort.
Input:
from pyspark.sql import functions as F
# Sample input: one row per (id, email, name); ids are given as strings.
df = spark.createDataFrame(
    data=[
        ('1', '[email protected]', 'john'),
        ('2', '[email protected]', 'Maike'),
        ('2', 'id2@second', 'Maike'),
        ('1', '[email protected]', 'john'),
    ],
    schema=['id', 'email', 'name'],
)
Script:
# Distinct emails of a group, gathered into an array and sorted.
emails = F.array_sort(F.collect_set(F.col('email')))
# One output row per (id, name); the first two array entries become columns.
df = (
    df.groupBy('id', 'name')
    .agg(
        emails.getItem(0).alias('email0'),
        emails.getItem(1).alias('email1'),
    )
)
df.show()
# +---+-----+-------------+--------------+
# | id| name| email0| email1|
# +---+-----+-------------+--------------+
# | 2|Maike|[email protected]| id2@second|
# | 1| john|[email protected]|[email protected]|
# +---+-----+-------------+--------------+
If you had a row number, something like…
from pyspark.sql import functions as F
# Sample input carrying an explicit row_number column (all values are strings).
df = spark.createDataFrame(
    data=[
        ('1', '1', '[email protected]', 'john'),
        ('2', '2', '[email protected]', 'Maike'),
        ('3', '2', 'id2@second', 'Maike'),
        ('4', '1', '[email protected]', 'john'),
    ],
    schema=['row_number', 'id', 'email', 'name'],
)
You could use something like below options:
# Pair every email with its row_number cast to long, collect the distinct
# pairs per group and sort the array; the row number is the first struct
# field, so it drives the ordering.
emails = F.array_sort(
    F.collect_set(
        F.struct(F.col('row_number').cast('long'), F.col('email'))
    )
)
# Keep only the email field of the first two sorted pairs.
df = (
    df.groupBy('id', 'name')
    .agg(
        emails.getItem(0).getField('email').alias('email0'),
        emails.getItem(1).getField('email').alias('email1'),
    )
)
df.show()
# +---+-----+-------------+--------------+
# | id| name| email0| email1|
# +---+-----+-------------+--------------+
# | 2|Maike|[email protected]| id2@second|
# | 1| john|[email protected]|[email protected]|
# +---+-----+-------------+--------------+
from pyspark.sql import Window as W
# Alternative based on a window and a pivot.
# NOTE(review): expects `df` to be the input frame that still has the
# `row_number` and `email` columns, not the result of another aggregation.
#
# row_number is stored as a string in the sample data, so it is cast to long
# before ordering; ordering the raw strings would place '10' before '2'.
w = W.partitionBy('id', 'name').orderBy(F.col('row_number').cast('long'))
df = (
    df
    # Position of each row inside its (id, name) group: 1, 2, ...
    .withColumn('_rn', F.row_number().over(w))
    # Only the first two emails of every group are kept.
    .filter('_rn <= 2')
    # Turn the position into the target column name: 'email1', 'email2'.
    .withColumn('_rn', F.concat(F.lit('email'), '_rn'))
    .groupBy('id', 'name')
    .pivot('_rn')
    .agg(F.first('email'))
)
df.show()
# +---+-----+-------------+--------------+
# | id| name| email1| email2|
# +---+-----+-------------+--------------+
# | 1| john|[email protected]|[email protected]|
# | 2|Maike|[email protected]| id2@second|
# +---+-----+-------------+--------------+
pyspark
I have included a corner case where the groups have an uneven number of email ids. For that, find the maximum array length across all groups and iterate to fetch the email at each index:
from pyspark.sql import functions as F
# Sample input including a group ('amy') that has a single email.
df = spark.createDataFrame(
    [
        (1, '[email protected]', 'john'),
        (2, '[email protected]', 'Maike'),
        (2, 'id2@second', 'Maike'),
        (1, '[email protected]', 'john'),
        (3, '[email protected]', 'amy'),
    ],
    ['id', 'email', 'name'],
)
# Gather every email of an (id, name) group into one array column.
df = df.groupby("id", "name").agg(F.collect_list("email").alias("email"))
# Largest number of emails held by any group. The maximum is aggregated
# explicitly: reading the size of whichever row is returned first only works
# when that row happens to be the longest one. `or 0` covers an empty frame,
# where the aggregate is null.
max_len = df.select(F.max(F.size("email")).alias("size")).collect()[0]["size"] or 0
# Add email1..emailN; groups with fewer emails get an empty string.
for i in range(1, max_len + 1):
    df = df.withColumn(
        f"email{i}",
        F.when(F.size("email") >= i, F.element_at("email", i)).otherwise(F.lit("")),
    )
# The intermediate array column is no longer needed.
df = df.drop("email")
Output:
+---+-----+-------------+--------------+
|id |name |email1 |email2 |
+---+-----+-------------+--------------+
|2 |Maike|[email protected]|id2@second |
|3 |amy |[email protected]| |
|1 |john |[email protected]|[email protected]|
+---+-----+-------------+--------------+
pandas
Since you have mentioned pandas in the tags, following is the solution in pandas:
import pandas as pd

# Sample input including an id (3) that has a single email.
df = pd.DataFrame(
    data=[
        (1, '[email protected]', 'john'),
        (2, '[email protected]', 'Maike'),
        (2, 'id2@second', 'Maike'),
        (1, '[email protected]', 'john'),
        (3, '[email protected]', 'amy'),
    ],
    columns=["id", "email", "name"],
)
# One row per id: all emails as a list plus the name.
# "first" is used for the name so every cell holds a plain scalar; an
# aggregator that returns an array (such as pd.unique) leaves the cell
# content dependent on the pandas version.
df = df.groupby("id").agg(email=("email", list), name=("name", "first"))
# Spread each email list into email1..emailN columns; ids with fewer emails
# get a missing value in the remaining columns.
df2 = df.apply(
    lambda row: pd.Series(
        data={f"email{i}": v for i, v in enumerate(row["email"], start=1)},
        dtype="object",
    ),
    axis=1,
)
# Both frames are indexed by id, so an index join lines them up.
df = df.drop(columns="email").join(df2)
Output:
name email1 email2
id
1 john [email protected] [email protected]
2 Maike [email protected] id2@second
3 amy [email protected] NaN
If you want to make it dynamic, so that it creates as many email columns as the maximum number of emails in any group, you can try the logic and code below
from pyspark.sql import functions as F
# Sample input in which id '2' has three emails and id '1' has two.
df = spark.createDataFrame(
    data=[
        ('1', '[email protected]', 'john'),
        ('2', '[email protected]', 'Maike'),
        ('2', '[email protected]', 'Maike'),
        ('2', 'id2@second', 'Maike'),
        ('1', '[email protected]', 'john'),
    ],
    schema=['id', 'email', 'name'],
)
# Print the input frame.
df.show()
+---+---------------+-----+
| id| email| name|
+---+---------------+-----+
| 1| [email protected]| john|
| 2| [email protected]|Maike|
| 2|[email protected]|Maike|
| 2| id2@second|Maike|
| 1| [email protected]| john|
Solution
from pyspark.sql import Window
from pyspark.sql import functions as F

# Collect the unique emails of every (id, name) group into an array, then
# store the largest array size in column 'x' on every row.
# All Spark functions are qualified with F: a bare `max` would resolve to
# Python's builtin, and `collect_set` / `size` / `Window` would be undefined.
# NOTE: the empty partitionBy() window moves all grouped rows into a single
# partition in order to compute the global maximum.
new = (
    df.groupBy('id', 'name')
    .agg(F.collect_set('email').alias('email'))
    .withColumn('x', F.max(F.size('email')).over(Window.partitionBy()))
)
# Number of email columns to create; 'x' is identical on every row, so one
# row is enough.
n_emails = new.select('x').first()[0]
new = (
    # Replace the array by a struct with fields email1..emailN; indexes past
    # the end of a shorter array yield null.
    new.withColumn(
        'email',
        F.struct(*[F.col('email')[i].alias(f'email{i + 1}') for i in range(n_emails)]),
    )
    # Flatten the struct fields into top-level columns.
    .selectExpr('x', 'id', 'name', 'email.*')
)
new.show(truncate=False)
Outcome
+---+---+-----+-------------+--------------+---------------+
|x |id |name |email1 |email2 |email3 |
+---+---+-----+-------------+--------------+---------------+
|3 |1 |john |[email protected]|[email protected]|null |
|3 |2 |Maike|id2@second |[email protected] |[email protected]|
+---+---+-----+-------------+--------------+---------------+
original dataframe
id | email | name |
---|---|---|
1 | [email protected] | john |
2 | [email protected] | Maike |
2 | id2@second | Maike |
1 | [email protected] | john |
I want to convert to this
id | email | email1 | name |
---|---|---|---|
1 | [email protected] | [email protected] | john |
2 | [email protected] | id2@second | Maike |
it’s only an example, I have very large file and more than 60 columns
im using
# Read the CSV with its first line treated as the header row.
# The chained call is wrapped in parentheses so it may span two lines;
# without them the statement ends after `.option(...)` and the following
# line beginning with `.csv` is a SyntaxError.
# The file name is spelled "contacts.csv" (was "contatcs.csv") to match the
# pandas-on-Spark read of the same data.
df = (
    spark.read.option("header", True)
    .csv("contacts.csv", sep=",")
)
but it also works with the pyspark.pandas API
import pyspark.pandas as ps
# Load 'contacts.csv' (comma-separated) through the pandas-on-Spark module `ps`.
df = ps.read_csv('contacts.csv', sep=',')
# Preview the first rows of the frame.
df.head()
but I prefer spark.read because it is lazily evaluated
and the pandas API is not
In order to do it deterministically in Spark, you must have some rule to determine which email is first and which is second. The row order in the CSV file (without a dedicated row-number column) is a bad rule when you work with Spark, because every row may go to a different node, and then you can no longer tell which row was first and which was second.
In the following example, I assume that the rule is alphabetical order, so I collect all the emails into one array using collect_set and then sort them using array_sort.
Input:
from pyspark.sql import functions as F
# Sample input: one row per (id, email, name); ids are given as strings.
df = spark.createDataFrame(
    data=[
        ('1', '[email protected]', 'john'),
        ('2', '[email protected]', 'Maike'),
        ('2', 'id2@second', 'Maike'),
        ('1', '[email protected]', 'john'),
    ],
    schema=['id', 'email', 'name'],
)
Script:
# Distinct emails of a group, gathered into an array and sorted.
emails = F.array_sort(F.collect_set(F.col('email')))
# One output row per (id, name); the first two array entries become columns.
df = (
    df.groupBy('id', 'name')
    .agg(
        emails.getItem(0).alias('email0'),
        emails.getItem(1).alias('email1'),
    )
)
df.show()
# +---+-----+-------------+--------------+
# | id| name| email0| email1|
# +---+-----+-------------+--------------+
# | 2|Maike|[email protected]| id2@second|
# | 1| john|[email protected]|[email protected]|
# +---+-----+-------------+--------------+
If you had a row number, something like…
from pyspark.sql import functions as F
# Sample input carrying an explicit row_number column (all values are strings).
df = spark.createDataFrame(
    data=[
        ('1', '1', '[email protected]', 'john'),
        ('2', '2', '[email protected]', 'Maike'),
        ('3', '2', 'id2@second', 'Maike'),
        ('4', '1', '[email protected]', 'john'),
    ],
    schema=['row_number', 'id', 'email', 'name'],
)
You could use something like below options:
# Pair every email with its row_number cast to long, collect the distinct
# pairs per group and sort the array; the row number is the first struct
# field, so it drives the ordering.
emails = F.array_sort(
    F.collect_set(
        F.struct(F.col('row_number').cast('long'), F.col('email'))
    )
)
# Keep only the email field of the first two sorted pairs.
df = (
    df.groupBy('id', 'name')
    .agg(
        emails.getItem(0).getField('email').alias('email0'),
        emails.getItem(1).getField('email').alias('email1'),
    )
)
df.show()
# +---+-----+-------------+--------------+
# | id| name| email0| email1|
# +---+-----+-------------+--------------+
# | 2|Maike|[email protected]| id2@second|
# | 1| john|[email protected]|[email protected]|
# +---+-----+-------------+--------------+
from pyspark.sql import Window as W
# Alternative based on a window and a pivot.
# NOTE(review): expects `df` to be the input frame that still has the
# `row_number` and `email` columns, not the result of another aggregation.
#
# row_number is stored as a string in the sample data, so it is cast to long
# before ordering; ordering the raw strings would place '10' before '2'.
w = W.partitionBy('id', 'name').orderBy(F.col('row_number').cast('long'))
df = (
    df
    # Position of each row inside its (id, name) group: 1, 2, ...
    .withColumn('_rn', F.row_number().over(w))
    # Only the first two emails of every group are kept.
    .filter('_rn <= 2')
    # Turn the position into the target column name: 'email1', 'email2'.
    .withColumn('_rn', F.concat(F.lit('email'), '_rn'))
    .groupBy('id', 'name')
    .pivot('_rn')
    .agg(F.first('email'))
)
df.show()
# +---+-----+-------------+--------------+
# | id| name| email1| email2|
# +---+-----+-------------+--------------+
# | 1| john|[email protected]|[email protected]|
# | 2|Maike|[email protected]| id2@second|
# +---+-----+-------------+--------------+
pyspark
I have included a corner case where the groups have an uneven number of email ids. For that, find the maximum array length across all groups and iterate to fetch the email at each index:
from pyspark.sql import functions as F
# Sample input including a group ('amy') that has a single email.
df = spark.createDataFrame(
    [
        (1, '[email protected]', 'john'),
        (2, '[email protected]', 'Maike'),
        (2, 'id2@second', 'Maike'),
        (1, '[email protected]', 'john'),
        (3, '[email protected]', 'amy'),
    ],
    ['id', 'email', 'name'],
)
# Gather every email of an (id, name) group into one array column.
df = df.groupby("id", "name").agg(F.collect_list("email").alias("email"))
# Largest number of emails held by any group. The maximum is aggregated
# explicitly: reading the size of whichever row is returned first only works
# when that row happens to be the longest one. `or 0` covers an empty frame,
# where the aggregate is null.
max_len = df.select(F.max(F.size("email")).alias("size")).collect()[0]["size"] or 0
# Add email1..emailN; groups with fewer emails get an empty string.
for i in range(1, max_len + 1):
    df = df.withColumn(
        f"email{i}",
        F.when(F.size("email") >= i, F.element_at("email", i)).otherwise(F.lit("")),
    )
# The intermediate array column is no longer needed.
df = df.drop("email")
Output:
+---+-----+-------------+--------------+
|id |name |email1 |email2 |
+---+-----+-------------+--------------+
|2 |Maike|[email protected]|id2@second |
|3 |amy |[email protected]| |
|1 |john |[email protected]|[email protected]|
+---+-----+-------------+--------------+
pandas
Since you have mentioned pandas in the tags, following is the solution in pandas:
import pandas as pd

# Sample input including an id (3) that has a single email.
df = pd.DataFrame(
    data=[
        (1, '[email protected]', 'john'),
        (2, '[email protected]', 'Maike'),
        (2, 'id2@second', 'Maike'),
        (1, '[email protected]', 'john'),
        (3, '[email protected]', 'amy'),
    ],
    columns=["id", "email", "name"],
)
# One row per id: all emails as a list plus the name.
# "first" is used for the name so every cell holds a plain scalar; an
# aggregator that returns an array (such as pd.unique) leaves the cell
# content dependent on the pandas version.
df = df.groupby("id").agg(email=("email", list), name=("name", "first"))
# Spread each email list into email1..emailN columns; ids with fewer emails
# get a missing value in the remaining columns.
df2 = df.apply(
    lambda row: pd.Series(
        data={f"email{i}": v for i, v in enumerate(row["email"], start=1)},
        dtype="object",
    ),
    axis=1,
)
# Both frames are indexed by id, so an index join lines them up.
df = df.drop(columns="email").join(df2)
Output:
name email1 email2
id
1 john [email protected] [email protected]
2 Maike [email protected] id2@second
3 amy [email protected] NaN
If you want to make it dynamic, so that it creates as many email columns as the maximum number of emails in any group, you can try the logic and code below
from pyspark.sql import functions as F
# Sample input in which id '2' has three emails and id '1' has two.
df = spark.createDataFrame(
    data=[
        ('1', '[email protected]', 'john'),
        ('2', '[email protected]', 'Maike'),
        ('2', '[email protected]', 'Maike'),
        ('2', 'id2@second', 'Maike'),
        ('1', '[email protected]', 'john'),
    ],
    schema=['id', 'email', 'name'],
)
# Print the input frame.
df.show()
+---+---------------+-----+
| id| email| name|
+---+---------------+-----+
| 1| [email protected]| john|
| 2| [email protected]|Maike|
| 2|[email protected]|Maike|
| 2| id2@second|Maike|
| 1| [email protected]| john|
Solution
from pyspark.sql import Window
from pyspark.sql import functions as F

# Collect the unique emails of every (id, name) group into an array, then
# store the largest array size in column 'x' on every row.
# All Spark functions are qualified with F: a bare `max` would resolve to
# Python's builtin, and `collect_set` / `size` / `Window` would be undefined.
# NOTE: the empty partitionBy() window moves all grouped rows into a single
# partition in order to compute the global maximum.
new = (
    df.groupBy('id', 'name')
    .agg(F.collect_set('email').alias('email'))
    .withColumn('x', F.max(F.size('email')).over(Window.partitionBy()))
)
# Number of email columns to create; 'x' is identical on every row, so one
# row is enough.
n_emails = new.select('x').first()[0]
new = (
    # Replace the array by a struct with fields email1..emailN; indexes past
    # the end of a shorter array yield null.
    new.withColumn(
        'email',
        F.struct(*[F.col('email')[i].alias(f'email{i + 1}') for i in range(n_emails)]),
    )
    # Flatten the struct fields into top-level columns.
    .selectExpr('x', 'id', 'name', 'email.*')
)
new.show(truncate=False)
Outcome
+---+---+-----+-------------+--------------+---------------+
|x |id |name |email1 |email2 |email3 |
+---+---+-----+-------------+--------------+---------------+
|3 |1 |john |[email protected]|[email protected]|null |
|3 |2 |Maike|id2@second |[email protected] |[email protected]|
+---+---+-----+-------------+--------------+---------------+