Masking the email and phone number in PySpark
Question:
Answers:
You can use a UDF for that:
from pyspark.sql.functions import udf
def mask_email(email):
    """Mask the local part of an email, keeping its first and last character.

    Everything from the first '@' onwards is left untouched.

    Args:
        email: The address to mask, or None.

    Returns:
        The masked address. None is returned for None. The input is returned
        unchanged when it has no '@' or when the local part has two characters
        or fewer (there is nothing between the first and last character to
        mask).
    """
    if email is None:
        return None
    local, at, domain = email.partition('@')
    # Short or missing local parts previously produced duplicated characters
    # (e.g. "a@x.com" -> "aa@x.com") or raised; leave such values as they are.
    if not at or len(local) <= 2:
        return email
    return local[0] + "*" * (len(local) - 2) + local[-1] + at + domain
def mask_mobile(mobile):
    """Mask a mobile number, keeping only its first and last character.

    Args:
        mobile: The number as a string, or None.

    Returns:
        The masked string. None is returned for None, and inputs of two
        characters or fewer are returned unchanged because there is nothing
        between the first and last character to mask.
    """
    if mobile is None:
        return None
    # A one-character input used to be doubled ("5" -> "55") and an empty
    # string raised IndexError; short values are now passed through.
    if len(mobile) <= 2:
        return mobile
    return mobile[0] + "*" * (len(mobile) - 2) + mobile[-1]
# Wrap the plain Python masking functions so they can be applied to columns.
mask_email_udf = udf(mask_email)
mask_mobile_udf = udf(mask_mobile)

# The chain is parenthesized so that the leading-dot continuation lines parse;
# without the parentheses the statement ends after the first withColumn().
(
    df.withColumn("Masked_Email", mask_email_udf("Email"))
    .withColumn("Masked_Mobile", mask_mobile_udf("Mobile"))
    .show()
)
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
# |Customer_Number|Customer_Name|Customer_Age| Email| Mobile| Masked_Email|Masked_Mobile|
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
# | 1| Aman| 27| [email protected]|9923150074| a*****[email protected]| 9********4|
# | 2| Prateek| 28| [email protected]|8756325412| p*****[email protected]| 8********2|
# | 3| Rajat| 27|goyal.rajat@gmail...|8784654186|g*********t@gmail...| 8********6|
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
It might be possible to do it directly with Spark functions but I’m not sure how.
Use regexp_replace:
Input:
from pyspark.sql import functions as F
# Build a three-row sample DataFrame with the columns Customer_Number,
# Customer_Name, Customer_Age, Email and Mobile.
# NOTE(review): the email literals read "[email protected]", which looks like a
# scraping placeholder rather than the real sample addresses - confirm the
# intended values before running.
df = spark.createDataFrame(
[(1, 'Aman', 27, '[email protected]', '9923150074'),
(2, 'Prateek', 28, '[email protected]', '8727451936'),
(3, 'Rajat', 27, '[email protected]', '9871288442')],
['Customer_Number', 'Customer_Name', 'Customer_Age', 'Email', 'Mobile']
)
Script:
# Email: overwrite the column, replacing with '*' every character that is not
# at the start of the string and that is followed by at least one more
# character and then an '@'.
df = df.withColumn('Email', F.regexp_replace('Email', '(?<!^).(?=.+@)', '*'))
# Mobile: overwrite the column, replacing with '*' every character that is
# neither the first nor the last character of the string.
df = df.withColumn('Mobile', F.regexp_replace('Mobile', '(?<!^).(?!$)', '*'))
# Print the masked DataFrame.
df.show()
# +---------------+-------------+------------+--------------------+----------+
# |Customer_Number|Customer_Name|Customer_Age| Email| Mobile|
# +---------------+-------------+------------+--------------------+----------+
# | 1| Aman| 27| a*****[email protected]|9********4|
# | 2| Prateek| 28|p************i@re...|8********6|
# | 3| Rajat| 27|g*********t@gmail...|9********2|
# +---------------+-------------+------------+--------------------+----------+
It’s enabled by regex lookarounds.
For Email, you replace every character with * when 2 conditions are satisfied:
(?<!^) means that right before this character you must not have the start of the string;
(?=.+@) means that after this character you must have at least one character followed by the @ symbol.
For Mobile, you replace every character with * when 2 conditions are satisfied:
(?<!^) – same as above – means that right before this character you must not have the start of the string;
(?!$) means that right after this character you must not have the end of the string.
Here is an easy way to mask an email. I used the Databricks platform.
Question 1: After email validation, if the email is valid, mask the username and domain parts; if it is not valid, keep it as it is. Also split the username and domain and store them in new columns named username and domain.
# Create a DataFrame by reading the Parquet data located at salesPath.
# NOTE(review): salesPath is not defined in this snippet - presumably it is set
# earlier in the notebook; confirm.
df = spark.read.parquet(salesPath)
#create function to verify email
def verify_email(email):
    """Return True when *email* looks like a valid address, else False.

    The accepted shape is: word characters optionally separated by single
    '.' or '-' characters, an '@', a host written the same way, and one or
    more dot-separated suffixes of two or three word characters.

    Args:
        email: The string to check, or None.

    Returns:
        True if the pattern matches, False otherwise (including for None).
    """
    import re
    if email is None:
        return False
    # The backslashes and the two '*' quantifiers are required; without them
    # the pattern matches literal 'w' characters and rejects real addresses.
    regex = r'^\w+([.-]?\w+)*@\w+([.-]?\w+)*(\.\w{2,3})+$'
    return re.search(regex, email) is not None
# Wrap the Python function as a UDF. This rebinds the name verify_email, so
# from this point on it refers to the UDF rather than the plain function.
verify_email = udf(verify_email)
# col() is used to pass the "email" column to the UDF.
from pyspark.sql.functions import col
# Store each email next to its validation result in a new DataFrame, df1.
# The result column is not aliased; the later select refers to it by the
# name "verify_email(email)".
df1 = df.select("email",verify_email(col("email"))
)
# define a udf for split username
def split_username(email):
    """Return the part of *email* before the first '@'.

    Args:
        email: The address to split, or None.

    Returns:
        The text before the first '@', the whole input when it contains no
        '@', or None when the input is None.
    """
    # Null column values arrive as None; pass them through instead of raising.
    if email is None:
        return None
    # partition() yields the whole string as the head when '@' is absent.
    return email.partition('@')[0]
# define a udf for split domain
def split_domain(email):
    """Return the part of *email* between the first and second '@'.

    For an ordinary address with a single '@' this is the domain.

    Args:
        email: The address to split, or None.

    Returns:
        The segment after the first '@', the whole input when it contains no
        '@', or None when the input is None.
    """
    # Null column values arrive as None; pass them through instead of raising.
    if email is None:
        return None
    if '@' not in email:
        return email
    return email.split('@')[1]
# Wrap the two splitting functions as UDFs.
split_username_udf = udf(split_username)
split_domain_udf = udf(split_domain)
# Add "Username" and "Domain" columns derived from the "email" column.
df_result = (df1.withColumn('Username', split_username_udf("email"))
.withColumn("Domain",split_domain_udf("email"))
)
# Import the column helpers and mask conditionally on the validation result.
from pyspark.sql.functions import lit, col, when
# "verify_email(email)" is the un-aliased validation column produced by the
# earlier select. It is compared with the string "true": rows that match get
# "****" as masked_username, all other rows keep their Username.
# NOTE(review): only the username is masked here; Domain is selected as-is.
masked_df = df_result.select(
col("email"),
col("Username"),
col("Domain"),
col("verify_email(email)"),
when(col("verify_email(email)") == lit("true"), lit("****")).otherwise(col("Username")).alias("masked_username")
)
# Final output.
display(masked_df)
Question: Mobile/phone validation and masking – replace the 1st digit of the mobile number with 8.
Here I used the Databricks platform.
# Read the CSV file into a DataFrame named pdf: comma-separated, first line
# used as the header, column types inferred from the data.
pdf = (spark.read
.option("sep", ",")
.option("header", True)
.option("inferSchema", True)
.csv("/FileStore/tables/demo_data-1.csv")
)
#create function
import re
def verify_phone(phone):
    """Return True when *phone* looks like a phone number, else False.

    Accepted shape: an optional leading '+', 1-3 digits, an optional '-' or
    space, 3-4 digits, an optional '-' or space, and a final 4 digits.

    Args:
        phone: The value to check. Non-string values (for example a number
            read from a column whose type was inferred as numeric) are
            converted with str() first; None is treated as invalid.

    Returns:
        True if the pattern matches, False otherwise.
    """
    if phone is None:
        return False
    # The backslashes are required: an unescaped '+' right after '^' makes the
    # pattern fail to compile, and a bare 'd' matches the letter, not a digit.
    phone_regex = r'^\+?\d{1,3}[- ]?\d{3,4}[- ]?\d{4}$'
    return re.match(phone_regex, str(phone)) is not None
# Wrap the Python function as a UDF. This rebinds the name verify_phone, so
# from this point on it refers to the UDF rather than the plain function.
verify_phone = udf(verify_phone)
# Keep the phone column plus its validation result, then retain only the rows
# whose valid_phone value equals the string 'true'.
df1 = (pdf
.select("phone", verify_phone(col("phone")).alias("valid_phone"))
.filter("valid_phone == 'true'")
)
from pyspark.sql.functions import substring, concat
# substring('phone', 2, 100) takes up to 100 characters starting at position 2,
# i.e. everything after the first digit; concat() then puts the literal '8'
# in front, so the first digit is effectively replaced with 8.
df1 = df1.withColumn('Masked_column', concat(lit('8'),substring('phone', 2, 100)))
# Display the masked DataFrame
display(df1)
You can use a UDF for that:
from pyspark.sql.functions import udf
def mask_email(email):
    """Mask the local part of an email, keeping its first and last character.

    Everything from the first '@' onwards is left untouched.

    Args:
        email: The address to mask, or None.

    Returns:
        The masked address. None is returned for None. The input is returned
        unchanged when it has no '@' or when the local part has two characters
        or fewer (there is nothing between the first and last character to
        mask).
    """
    if email is None:
        return None
    local, at, domain = email.partition('@')
    # Short or missing local parts previously produced duplicated characters
    # (e.g. "a@x.com" -> "aa@x.com") or raised; leave such values as they are.
    if not at or len(local) <= 2:
        return email
    return local[0] + "*" * (len(local) - 2) + local[-1] + at + domain
def mask_mobile(mobile):
    """Mask a mobile number, keeping only its first and last character.

    Args:
        mobile: The number as a string, or None.

    Returns:
        The masked string. None is returned for None, and inputs of two
        characters or fewer are returned unchanged because there is nothing
        between the first and last character to mask.
    """
    if mobile is None:
        return None
    # A one-character input used to be doubled ("5" -> "55") and an empty
    # string raised IndexError; short values are now passed through.
    if len(mobile) <= 2:
        return mobile
    return mobile[0] + "*" * (len(mobile) - 2) + mobile[-1]
# Wrap the plain Python masking functions so they can be applied to columns.
mask_email_udf = udf(mask_email)
mask_mobile_udf = udf(mask_mobile)

# The chain is parenthesized so that the leading-dot continuation lines parse;
# without the parentheses the statement ends after the first withColumn().
(
    df.withColumn("Masked_Email", mask_email_udf("Email"))
    .withColumn("Masked_Mobile", mask_mobile_udf("Mobile"))
    .show()
)
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
# |Customer_Number|Customer_Name|Customer_Age| Email| Mobile| Masked_Email|Masked_Mobile|
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
# | 1| Aman| 27| [email protected]|9923150074| a*****[email protected]| 9********4|
# | 2| Prateek| 28| [email protected]|8756325412| p*****[email protected]| 8********2|
# | 3| Rajat| 27|goyal.rajat@gmail...|8784654186|g*********t@gmail...| 8********6|
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
It might be possible to do it directly with Spark functions but I’m not sure how.
Use regexp_replace:
Input:
from pyspark.sql import functions as F
# Build a three-row sample DataFrame with the columns Customer_Number,
# Customer_Name, Customer_Age, Email and Mobile.
# NOTE(review): the email literals read "[email protected]", which looks like a
# scraping placeholder rather than the real sample addresses - confirm the
# intended values before running.
df = spark.createDataFrame(
[(1, 'Aman', 27, '[email protected]', '9923150074'),
(2, 'Prateek', 28, '[email protected]', '8727451936'),
(3, 'Rajat', 27, '[email protected]', '9871288442')],
['Customer_Number', 'Customer_Name', 'Customer_Age', 'Email', 'Mobile']
)
Script:
# Email: overwrite the column, replacing with '*' every character that is not
# at the start of the string and that is followed by at least one more
# character and then an '@'.
df = df.withColumn('Email', F.regexp_replace('Email', '(?<!^).(?=.+@)', '*'))
# Mobile: overwrite the column, replacing with '*' every character that is
# neither the first nor the last character of the string.
df = df.withColumn('Mobile', F.regexp_replace('Mobile', '(?<!^).(?!$)', '*'))
# Print the masked DataFrame.
df.show()
# +---------------+-------------+------------+--------------------+----------+
# |Customer_Number|Customer_Name|Customer_Age| Email| Mobile|
# +---------------+-------------+------------+--------------------+----------+
# | 1| Aman| 27| a*****[email protected]|9********4|
# | 2| Prateek| 28|p************i@re...|8********6|
# | 3| Rajat| 27|g*********t@gmail...|9********2|
# +---------------+-------------+------------+--------------------+----------+
It’s enabled by regex lookarounds.
For Email, you replace every character with * when 2 conditions are satisfied:
(?<!^) means that right before this character you must not have the start of the string;
(?=.+@) means that after this character you must have at least one character followed by the @ symbol.
For Mobile, you replace every character with * when 2 conditions are satisfied:
(?<!^) – same as above – means that right before this character you must not have the start of the string;
(?!$) means that right after this character you must not have the end of the string.
Here is an easy way to mask an email. I used the Databricks platform.
Question 1: After email validation, if the email is valid, mask the username and domain parts; if it is not valid, keep it as it is. Also split the username and domain and store them in new columns named username and domain.
# Create a DataFrame by reading the Parquet data located at salesPath.
# NOTE(review): salesPath is not defined in this snippet - presumably it is set
# earlier in the notebook; confirm.
df = spark.read.parquet(salesPath)
#create function to verify email
def verify_email(email):
    """Return True when *email* looks like a valid address, else False.

    The accepted shape is: word characters optionally separated by single
    '.' or '-' characters, an '@', a host written the same way, and one or
    more dot-separated suffixes of two or three word characters.

    Args:
        email: The string to check, or None.

    Returns:
        True if the pattern matches, False otherwise (including for None).
    """
    import re
    if email is None:
        return False
    # The backslashes and the two '*' quantifiers are required; without them
    # the pattern matches literal 'w' characters and rejects real addresses.
    regex = r'^\w+([.-]?\w+)*@\w+([.-]?\w+)*(\.\w{2,3})+$'
    return re.search(regex, email) is not None
# Wrap the Python function as a UDF. This rebinds the name verify_email, so
# from this point on it refers to the UDF rather than the plain function.
verify_email = udf(verify_email)
# col() is used to pass the "email" column to the UDF.
from pyspark.sql.functions import col
# Store each email next to its validation result in a new DataFrame, df1.
# The result column is not aliased; the later select refers to it by the
# name "verify_email(email)".
df1 = df.select("email",verify_email(col("email"))
)
# define a udf for split username
def split_username(email):
    """Return the part of *email* before the first '@'.

    Args:
        email: The address to split, or None.

    Returns:
        The text before the first '@', the whole input when it contains no
        '@', or None when the input is None.
    """
    # Null column values arrive as None; pass them through instead of raising.
    if email is None:
        return None
    # partition() yields the whole string as the head when '@' is absent.
    return email.partition('@')[0]
# define a udf for split domain
def split_domain(email):
    """Return the part of *email* between the first and second '@'.

    For an ordinary address with a single '@' this is the domain.

    Args:
        email: The address to split, or None.

    Returns:
        The segment after the first '@', the whole input when it contains no
        '@', or None when the input is None.
    """
    # Null column values arrive as None; pass them through instead of raising.
    if email is None:
        return None
    if '@' not in email:
        return email
    return email.split('@')[1]
# Wrap the two splitting functions as UDFs.
split_username_udf = udf(split_username)
split_domain_udf = udf(split_domain)
# Add "Username" and "Domain" columns derived from the "email" column.
df_result = (df1.withColumn('Username', split_username_udf("email"))
.withColumn("Domain",split_domain_udf("email"))
)
# Import the column helpers and mask conditionally on the validation result.
from pyspark.sql.functions import lit, col, when
# "verify_email(email)" is the un-aliased validation column produced by the
# earlier select. It is compared with the string "true": rows that match get
# "****" as masked_username, all other rows keep their Username.
# NOTE(review): only the username is masked here; Domain is selected as-is.
masked_df = df_result.select(
col("email"),
col("Username"),
col("Domain"),
col("verify_email(email)"),
when(col("verify_email(email)") == lit("true"), lit("****")).otherwise(col("Username")).alias("masked_username")
)
# Final output.
display(masked_df)
Question: Mobile/phone validation and masking – replace the 1st digit of the mobile number with 8.
Here I used the Databricks platform.
# Read the CSV file into a DataFrame named pdf: comma-separated, first line
# used as the header, column types inferred from the data.
pdf = (spark.read
.option("sep", ",")
.option("header", True)
.option("inferSchema", True)
.csv("/FileStore/tables/demo_data-1.csv")
)
#create function
import re
def verify_phone(phone):
    """Return True when *phone* looks like a phone number, else False.

    Accepted shape: an optional leading '+', 1-3 digits, an optional '-' or
    space, 3-4 digits, an optional '-' or space, and a final 4 digits.

    Args:
        phone: The value to check. Non-string values (for example a number
            read from a column whose type was inferred as numeric) are
            converted with str() first; None is treated as invalid.

    Returns:
        True if the pattern matches, False otherwise.
    """
    if phone is None:
        return False
    # The backslashes are required: an unescaped '+' right after '^' makes the
    # pattern fail to compile, and a bare 'd' matches the letter, not a digit.
    phone_regex = r'^\+?\d{1,3}[- ]?\d{3,4}[- ]?\d{4}$'
    return re.match(phone_regex, str(phone)) is not None
# Wrap the Python function as a UDF. This rebinds the name verify_phone, so
# from this point on it refers to the UDF rather than the plain function.
verify_phone = udf(verify_phone)
# Keep the phone column plus its validation result, then retain only the rows
# whose valid_phone value equals the string 'true'.
df1 = (pdf
.select("phone", verify_phone(col("phone")).alias("valid_phone"))
.filter("valid_phone == 'true'")
)
from pyspark.sql.functions import substring, concat
# substring('phone', 2, 100) takes up to 100 characters starting at position 2,
# i.e. everything after the first digit; concat() then puts the literal '8'
# in front, so the first digit is effectively replaced with 8.
df1 = df1.withColumn('Masked_column', concat(lit('8'),substring('phone', 2, 100)))
# Display the masked DataFrame
display(df1)