Masking the email and phone number in PySpark


I want to mask the email – the first and last character before ‘@’ remain unmasked and the rest should be masked.

For the phone number, the first and last digits remain unmasked and the rest should be masked.

You can use a UDF for that:

from pyspark.sql.functions import udf

def mask_email(email):
    """Mask the local part of an e-mail address, keeping its first and last character.

    Everything before the first '@' except the first and last character is
    replaced with '*'; the '@' and everything after it are left untouched.

    Args:
        email: The address to mask, or None.

    Returns:
        The masked address. The input is returned unchanged when it is None,
        contains no '@', or has a local part of two characters or fewer
        (there is nothing between the first and last character to hide).
    """
    if email is None:
        return None
    local, at, domain = email.partition('@')
    # Guard short/missing local parts: slicing them used to duplicate or drop
    # characters (e.g. "a@x.com" became "aa@x.com").
    if not at or len(local) <= 2:
        return email
    return local[0] + "*" * (len(local) - 2) + local[-1] + at + domain

def mask_mobile(mobile):
    """Mask a phone number string, keeping only its first and last character.

    Args:
        mobile: The phone number as a string, or None.

    Returns:
        The masked string. Inputs that are None or have two characters or
        fewer are returned unchanged, since there is nothing to hide between
        the first and last character.
    """
    if mobile is None:
        return None
    # Without this guard a one-character input was doubled ("5" -> "55") and
    # an empty string raised IndexError.
    if len(mobile) <= 2:
        return mobile
    return mobile[0] + "*" * (len(mobile) - 2) + mobile[-1]

# Wrap the plain Python helpers so they can be applied to DataFrame columns.
mask_email_udf = udf(mask_email)
mask_mobile_udf = udf(mask_mobile)

# The chain is parenthesised so the continuation lines form a single valid
# statement; show() prints the result, matching the table reproduced below.
(
    df.withColumn("Masked_Email", mask_email_udf("Email"))
    .withColumn("Masked_Mobile", mask_mobile_udf("Mobile"))
    .show()
)

# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
# |Customer_Number|Customer_Name|Customer_Age|               Email|    Mobile|        Masked_Email|Masked_Mobile|
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+
# |              1|         Aman|          27|   [email protected]|9923150074|   a*****[email protected]|   9********4|
# |              2|      Prateek|          28|   [email protected]|8756325412|   p*****[email protected]|   8********2|
# |              3|        Rajat|          27|goyal.rajat@gmail...|8784654186|g*********t@gmail...|   8********6|
# +---------------+-------------+------------+--------------------+----------+--------------------+-------------+

It might be possible to do it directly with Spark functions but I’m not sure how.

Use regexp_replace:


from pyspark.sql import functions as F
# Sample customer rows used to demonstrate the regex-based masking.
df = spark.createDataFrame(
    [(1, 'Aman', 27, '[email protected]', '9923150074'),
     (2, 'Prateek', 28, '[email protected]', '8727451936'),
     (3, 'Rajat', 27, '[email protected]', '9871288442')],
    ['Customer_Number', 'Customer_Name', 'Customer_Age', 'Email', 'Mobile'],
)

# Email: replace every character that is not at the start of the string and
# that still has at least one more character before the '@'.
df = df.withColumn('Email', F.regexp_replace('Email', '(?<!^).(?=.+@)', '*'))
# Mobile: replace every character that is neither the first nor the last one.
df = df.withColumn('Mobile', F.regexp_replace('Mobile', '(?<!^).(?!$)', '*'))
# +---------------+-------------+------------+--------------------+----------+
# |Customer_Number|Customer_Name|Customer_Age|               Email|    Mobile|
# +---------------+-------------+------------+--------------------+----------+
# |              1|         Aman|          27|   a*****[email protected]|9********4|
# |              2|      Prateek|          28|p************i@re...|8********6|
# |              3|        Rajat|          27|g*********t@gmail...|9********2|
# +---------------+-------------+------------+--------------------+----------+

It’s enabled by regex lookarounds.

For Email, you replace every character with * when 2 conditions are satisfied:

  • (?<!^) means that right before this character you must not have the start of string
  • (?=.+@) means that after this character you must have at least one character followed by @ symbol

For Mobile, you replace every character with * when 2 conditions are satisfied:

  • (?<!^) – same as above – means that right before this character you must not have the start of string
  • (?!$) means that right after this character you must not have the end of string
Here you can easily mask email. I have used the DataBricks platform.

Question 1: After email validation, if the email is valid, mask the username and domain parts; if it is not valid, keep it as it is. Also split the email into username and domain and store them in new columns named username and domain.

#create a DF from salesPath

# NOTE(review): the right-hand side of this assignment is missing from the
# snippet. Reading a CSV with a header row from `salesPath` is an assumption
# based on the variable name mentioned alongside it -- confirm the real source
# and reader options before relying on this line.
df = spark.read.option("header", True).csv(salesPath)

#create function to verify email

def verify_email(email):
    """Return True when *email* has the shape of a valid e-mail address.

    Accepted shape: word-character runs separated by single '.' or '-' on
    both sides of one '@', ending in a '.' followed by a 2-3 character
    top-level label (so longer TLDs such as ".info" are rejected).

    Args:
        email: The value to check; anything that is not a string is invalid.

    Returns:
        True if the whole string matches the pattern, otherwise False.
    """
    import re

    if not isinstance(email, str):
        return False
    # Separators are mandatory inside the repeated groups, which keeps the
    # pattern free of nested ambiguous quantifiers (no runaway backtracking).
    regex = r'\w+(?:[.-]\w+)*@\w+(?:[.-]\w+)*\.\w{2,3}'
    return re.fullmatch(regex, email) is not None

#Convert python function to UDF function
# Wrap the Python validator as a UDF (this rebinds the name verify_email).
verify_email = udf(verify_email)

# col() is used to reference the email column.

from pyspark.sql.functions import col

# Store the email together with its validation result in a new DataFrame, df1.
# The result column is not aliased; later code refers to it by the name
# "verify_email(email)".

df1 = df.select("email", verify_email(col("email")))

# define a udf for split username

def split_username(email):
    """Return the part of *email* before the first '@'.

    Args:
        email: The address to split, or None.

    Returns:
        The text before the first '@'; the input unchanged when it contains
        no '@'; None when the input is None.
    """
    if email is None:
        return None
    if '@' in email:
        return email.split('@')[0]
    # Fallback for values without '@' (previously unreachable, yielding None).
    return email

# define a udf for split domain

def split_domain(email):
    """Return the part of *email* after the first '@'.

    Args:
        email: The address to split, or None.

    Returns:
        The segment between the first '@' and the next '@' (or the end of
        the string); the input unchanged when it contains no '@'; None when
        the input is None.
    """
    if email is None:
        return None
    if '@' in email:
        return email.split('@')[1]
    # Fallback for values without '@' (previously unreachable, yielding None).
    return email

# register the udf

# Register both splitters as UDFs.
split_username_udf = udf(split_username)
split_domain_udf = udf(split_domain)

# Apply the UDFs: add one column per e-mail part.
# NOTE(review): the original statement was cut off after the Username column;
# the 'Domain' column name is assumed by analogy -- confirm.
df_result = (
    df1.withColumn('Username', split_username_udf("email"))
    .withColumn('Domain', split_domain_udf("email"))
)

#import functions and perform masking based on condition

from pyspark.sql.functions import lit, col, when
# NOTE(review): the snippet lost the expression that consumes this column;
# selecting every existing column of df_result plus the masked username is an
# assumption -- confirm against the intended output.
masked_df = df_result.select(
    "*",
    # Valid e-mails get a fixed "****" mask; invalid ones keep their username.
    when(col("verify_email(email)") == lit("true"), lit("****"))
    .otherwise(col("Username"))
    .alias("masked_username"),
)


Question: Mobile/phone validation and masking – replace the 1st digit of the mobile number with 8.

Here I used DataBricks Platform

#read csv file and create df as pdf

# NOTE(review): the start and end of this reader chain are missing from the
# snippet. `spark.read ... .csv(phonePath)` is an assumed reconstruction and
# `phonePath` is a placeholder for the real file location -- confirm both.
pdf = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv(phonePath)
)

#create function
import re   
def verify_phone(phone):
    """Return True when *phone* has the shape of a valid phone number.

    Accepted shape: an optional leading '+', then 1-3 digits, 3-4 digits and
    4 digits, each group optionally separated by a single '-' or space.

    Args:
        phone: The value to check. Non-string values are converted with
            str() first, because a digits-only column read with schema
            inference may arrive as an integer.

    Returns:
        True if the whole value matches the pattern, otherwise False
        (including for None).
    """
    if phone is None:
        return False

    # Regular expression pattern for phone numbers (backslash escapes restored).
    phone_regex = r'\+?\d{1,3}[- ]?\d{3,4}[- ]?\d{4}'

    # fullmatch() requires the entire value to match, so no anchors are needed.
    return re.fullmatch(phone_regex, str(phone)) is not None

#Convert python function to UDF function
# Wrap the Python validator as a UDF (this rebinds the name verify_phone).
verify_phone = udf(verify_phone)

# Keep only the rows whose phone passed validation. The UDF is registered
# without an explicit return type, so the flag is compared as the string
# 'true' rather than as a boolean.
df1 = (
    pdf
    .select("phone", verify_phone(col("phone")).alias("valid_phone"))
    .filter("valid_phone == 'true'")
)

from pyspark.sql.functions import substring, concat

# Build the masked value: a literal '8' followed by the phone text from its
# second character onward (substring takes at most 100 characters).
df1 = df1.withColumn(
    'Masked_column',
    concat(lit('8'), substring('phone', 2, 100)),
)

# Display the masked DataFrame

