Reading vcf file data using python and re library
Question:
Here is content inside .vcf file data.
BEGIN:VCARD
VERSION:4.0
N:Muller;CCCIsabella;;;
FN:Muller
ORG:Bubba Gump Shrimp Co.
TITLE:Shrimp Man
PHOTO;MEDIATYPE=image/gif:http://www.example.com/dir_photos/my_photo.gif
TEL;TYPE=work,voice;VALUE=uri:tel:+16829185770
REV:20080424T195243Z
END:VCARD
BEGIN:VCARD
VERSION:4.0
N:Mraz;CCCEdwardo;;;
FN:Mraz
ORG:Bubba Gump Shrimp Co.
TITLE:Shrimp Man
PHOTO;MEDIATYPE=image/gif:http://www.example.com/dir_photos/my_photo.gif
TEL;TYPE=work,voice;VALUE=uri:tel:+18083155095
REV:20080424T195243Z
END:VCARD
BEGIN:VCARD
VERSION:4.0
N:Reynolds;CCCBrant;;;
FN:Reynolds
ORG:Bubba Gump Shrimp Co.
TITLE:Shrimp Man
PHOTO;MEDIATYPE=image/gif:http://www.example.com/dir_photos/my_photo.gif
TEL;TYPE=work,voice;VALUE=uri:tel:+15089473508
REV:20080424T195243Z
END:VCARD
I want my data in as below.
data = [{'name': 'Muller','phone': '+16829185770'}, {'name': 'Mraz', 'phone': '+18083155095'}, {'name': 'Reynolds','phone': '+15089473508'}]
But I am not getting the data as shown above. Please help me with this. I am using Python's re package to solve it.
import re

# Read the vCard file line by line and collect name/phone entries.
file = open('contacts.vcf', 'r')
contacts = []
for line in file:
    # Both patterns are applied to the same single line, so a given line
    # yields at most a name or a phone number, never both together.
    name = re.findall('FN:(.*)', line)
    tel = re.findall('tel:(.*)', line)
    nm = ''.join(name)
    tel = ''.join(tel)
    # Skip lines that contain neither a name nor a phone number.
    if len(nm) == 0 and len(tel) == 0:
        continue
    data = {'name' : nm, 'phone' : tel}
    contacts.append(data)
print(contacts)
I am getting the results below; the name and the phone are added to different dictionaries.
[{'name': 'Muller', 'phone': ''}, {'name': '', 'phone': '+16829185770'}, {'name': 'Mraz', 'phone': ''}, {'name': '', 'phone': '+18083155095'}, {'name': 'Reynolds', 'phone': ''}, {'name': '', 'phone': '+15089473508'}]
Answers:
Oftentimes when debugging it is useful to use print
at various points to figure out where the code is going awry. If you inserted print(">",nm,tel)
after tel = ''.join(tel)
, for example, you should get the following output:
>
>
>
> Muller
>
>
>
> +16829185770
>
>
>
>
>
>
> Mraz
>
[... continued...]
Obviously, this is because your for
loop is operating on each line in the file instead of each card (technically, you even acknowledge this: for line in file:
).
You may be interested in using a module for parsing this file (a quick google brought up the vobject
package), which would eliminate the need for re
. If you’re feeling ambitious you may be able to parse it manually (not really familiar with the format, so here’s an off-the-cuff example).
import re
from collections import defaultdict

# Matches one whole card: from a "BEGIN:VCARD" line through the next
# "END:VCARD" line, including every line in between.
CARDMATCHER = re.compile(r"""
    ^              ## Match at Start of Line (Multiline-flag)
    BEGIN:VCARD    ## Match the string "BEGIN:VCARD" exactly
    $              ## Match the End of Line (Multiline-flag)
    .*?            ## Match characters (.) any number of times(*),
                   ## as few times as possible(?), including new-line(Dotall-flag)
    ^              ## Match at Start of Line (Multiline-flag)
    END:VCARD      ## Match the string "END:VCARD" exactly
    $              ## Match the End of Line (Multiline-flag)
    """, re.MULTILINE|re.DOTALL|re.VERBOSE)

# Splits a single card line into its property name, the separator that
# follows the name, and everything after that separator.
VALUERE = re.compile(r"""
    ^(?P<type>[A-Z]+)   ## Match Capital Ascii Characters at Start of Line
    (?P<sep>:|;)        ## Match a colon or a semicolon
    (?P<value>.*)       ## Match all other characters remaining
    """, re.VERBOSE)


class MyVCard():
    """A single vCard parsed into the ``info`` mapping.

    ``info`` maps a property name (``FN``, ``TEL``, ...) to:

    * a string, for ``NAME:value`` lines whose value has no semicolon;
    * a list of strings, for ``NAME:a;b;c`` lines;
    * a list of dicts (one per line), for ``NAME;key=val;...`` lines.  The
      text after the final colon stays attached to the last parameter, e.g.
      ``{"TYPE": "work,voice", "VALUE": "uri:tel:+123"}``.
    """

    def __init__(self, cardstring):
        """Parse *cardstring*, the text of one card, line by line."""
        self.info = defaultdict(list)
        ## Iterate over the card line by line (any newline convention)
        for line in cardstring.splitlines():
            ## Split Key of line
            match = VALUERE.match(line)
            if not match:
                continue
            vtype = match.group("type")
            ## Line Values are separated by semicolons
            values = match.group("value").split(";")
            ## Lines with colons appear to be unique values
            if match.group("sep") == ":":
                ## If only a single value, we don't need the list
                self.info[vtype] = values[0] if len(values) == 1 else values
                continue
            ## Otherwise (Semicolon sep), the value may not be unique.
            ## Semicolon seps also appear to have multiple keys, so use a dict.
            out = {}
            for val in values:
                ## partition() tolerates parameters that have no "=" at all
                ## (they are stored with an empty value) instead of raising.
                k, _, v = val.partition("=")
                out[k] = v
            ## Make sure we have a list to append to, even if an earlier
            ## colon-separated line stored a plain string for this name.
            entries = self.info[vtype]
            if not isinstance(entries, list):
                entries = self.info[vtype] = [entries]
            entries.append(out)

    def get_a_number(self):
        """ Naive approach to getting the number.

        Returns the text following ``tel:`` in the first TEL entry that
        contains it, or None when the card has no such entry.
        """
        tel = self.info.get("TEL")
        if not tel:
            return None
        entries = tel if isinstance(tel, list) else [tel]
        for entry in entries:
            ## Parameterised lines are dicts; plain "TEL:..." lines are strings
            text = entry.get("VALUE", "") if isinstance(entry, dict) else str(entry)
            numbers = re.findall("tel:(.+)", text)
            if numbers:
                return numbers[0]
        return None


def get_vcards(file):
    """ Use regex to parse VCards into MyVCard objects.

    *file* is the path of the .vcf file to read; one MyVCard is returned for
    each BEGIN:VCARD ... END:VCARD section, in file order.
    """
    with open(file,'r') as f:
        finput = f.read()
    return [MyVCard(card) for card in CARDMATCHER.findall(finput)]
# get_vcards() expects the path of the .vcf file, not an open file object.
print([{"fn":card.info['FN'], "tel":card.get_a_number()} for card in get_vcards('contacts.vcf')])
Again, I make no guarantees on this code since I’m not going to look up all the specifications for the vcf
format, and would recommend using a module specifically designed for this instead.
You can try the below code.
import re

# A formatted-name line, and a telephone line with or without parameters
# and with or without a "tel:" URI prefix in front of the number.
NAME_RE = re.compile(r'^FN:(.*)')
TEL_RE = re.compile(r'^TEL[^:]*:(?:tel:)?(.*)')

contacts = []
data = None  # contact belonging to the card currently being read
with open('vcards-2.vcf', 'r') as file:
    for line in file:
        line = line.strip()
        if line == 'BEGIN:VCARD':
            # New card: never attach its phone to the previous contact.
            data = None
            continue
        name = NAME_RE.match(line)
        if name:
            data = {'name' : name.group(1).strip()}
            contacts.append(data)
            continue
        tel = TEL_RE.match(line)
        # Only the first phone number following the name is kept.
        if tel and data is not None and 'phone' not in data:
            # Drop spaces, dashes and brackets but keep the leading "+".
            data['phone'] = ''.join(e for e in tel.group(1) if e.isalnum() or e == '+')
print(contacts)
You will get the results below
[{'name': 'Muller','phone': '+16829185770'}, {'name': 'Mraz', 'phone': '+18083155095'}, {'name': 'Reynolds','phone': '+15089473508'}]
Try the below code
import re
import json
from typing import List, Any
def get_contacts_from_vcf(filename: str) -> List[Any]:
    """Return one JSON string per vCard in *filename*.

    Each element is ``json.dumps({"name": ..., "Ph No(s)": ...}) + ","``,
    where the phone numbers of the card are joined with ", ".  Cards that
    lack either an FN line or any TEL line are skipped.  Repeated numbers
    within a card are listed once, in order of first appearance.
    """
    with open(filename, 'r') as f:
        text = f.read()

    contacts = []
    # Everything before each END:VCARD marker is treated as one card.
    for card in text.split("END:VCARD"):
        # An optional "group." prefix (e.g. "item1.TEL") is tolerated.
        name_match = re.search(r"^(?:[\w-]+\.)?FN:(.*)$", card, re.MULTILINE)
        tel_lines = re.findall(r"^(?:[\w-]+\.)?TEL[;:](.*)$", card, re.MULTILINE)
        if name_match is None or not tel_lines:
            continue
        # The number is whatever follows the last colon on the TEL line.
        numbers = [tel_line.strip().split(':')[-1] for tel_line in tel_lines]
        # dict.fromkeys removes duplicates while preserving order.
        unique_numbers = list(dict.fromkeys(numbers))
        data = {
            "name": name_match.group(1).strip(),
            "Ph No(s)": ', '.join(unique_numbers)
        }
        contacts.append(json.dumps(data) + ",")
    return contacts
# Print one contact per line.
print('\n'.join(get_contacts_from_vcf("bin.vcf")))
Output:
{"name": "John", "Ph No(s)": "1234, 4321"},
{"name": "watt", "Ph No(s)": "121, 32444"},
{"name": "smith", "Ph No(s)": "55555"},
Here is content inside .vcf file data.
BEGIN:VCARD
VERSION:4.0
N:Muller;CCCIsabella;;;
FN:Muller
ORG:Bubba Gump Shrimp Co.
TITLE:Shrimp Man
PHOTO;MEDIATYPE=image/gif:http://www.example.com/dir_photos/my_photo.gif
TEL;TYPE=work,voice;VALUE=uri:tel:+16829185770
REV:20080424T195243Z
END:VCARD
BEGIN:VCARD
VERSION:4.0
N:Mraz;CCCEdwardo;;;
FN:Mraz
ORG:Bubba Gump Shrimp Co.
TITLE:Shrimp Man
PHOTO;MEDIATYPE=image/gif:http://www.example.com/dir_photos/my_photo.gif
TEL;TYPE=work,voice;VALUE=uri:tel:+18083155095
REV:20080424T195243Z
END:VCARD
BEGIN:VCARD
VERSION:4.0
N:Reynolds;CCCBrant;;;
FN:Reynolds
ORG:Bubba Gump Shrimp Co.
TITLE:Shrimp Man
PHOTO;MEDIATYPE=image/gif:http://www.example.com/dir_photos/my_photo.gif
TEL;TYPE=work,voice;VALUE=uri:tel:+15089473508
REV:20080424T195243Z
END:VCARD
I want my data in as below.
data = [{'name': 'Muller','phone': '+16829185770'}, {'name': 'Mraz', 'phone': '+18083155095'}, {'name': 'Reynolds','phone': '+15089473508'}]
But I am not getting the data as shown above. Please help me with this. I am using Python's re package to solve it.
import re

# Read the vCard file line by line and collect name/phone entries.
file = open('contacts.vcf', 'r')
contacts = []
for line in file:
    # Both patterns are applied to the same single line, so a given line
    # yields at most a name or a phone number, never both together.
    name = re.findall('FN:(.*)', line)
    tel = re.findall('tel:(.*)', line)
    nm = ''.join(name)
    tel = ''.join(tel)
    # Skip lines that contain neither a name nor a phone number.
    if len(nm) == 0 and len(tel) == 0:
        continue
    data = {'name' : nm, 'phone' : tel}
    contacts.append(data)
print(contacts)
I am getting the results below; the name and the phone are added to different dictionaries.
[{'name': 'Muller', 'phone': ''}, {'name': '', 'phone': '+16829185770'}, {'name': 'Mraz', 'phone': ''}, {'name': '', 'phone': '+18083155095'}, {'name': 'Reynolds', 'phone': ''}, {'name': '', 'phone': '+15089473508'}]
Oftentimes when debugging it is useful to use print
at various points to figure out where the code is going awry. If you inserted print(">",nm,tel)
after tel = ''.join(tel)
, for example, you should get the following output:
>
>
>
> Muller
>
>
>
> +16829185770
>
>
>
>
>
>
> Mraz
>
[... continued...]
Obviously, this is because your for
loop is operating on each line in the file instead of each card (technically, you even acknowledge this: for line in file:
).
You may be interested in using a module for parsing this file (a quick google brought up the vobject
package), which would eliminate the need for re
. If you’re feeling ambitious you may be able to parse it manually (not really familiar with the format, so here’s an off-the-cuff example).
import re
from collections import defaultdict

# Matches one whole card: from a "BEGIN:VCARD" line through the next
# "END:VCARD" line, including every line in between.
CARDMATCHER = re.compile(r"""
    ^              ## Match at Start of Line (Multiline-flag)
    BEGIN:VCARD    ## Match the string "BEGIN:VCARD" exactly
    $              ## Match the End of Line (Multiline-flag)
    .*?            ## Match characters (.) any number of times(*),
                   ## as few times as possible(?), including new-line(Dotall-flag)
    ^              ## Match at Start of Line (Multiline-flag)
    END:VCARD      ## Match the string "END:VCARD" exactly
    $              ## Match the End of Line (Multiline-flag)
    """, re.MULTILINE|re.DOTALL|re.VERBOSE)

# Splits a single card line into its property name, the separator that
# follows the name, and everything after that separator.
VALUERE = re.compile(r"""
    ^(?P<type>[A-Z]+)   ## Match Capital Ascii Characters at Start of Line
    (?P<sep>:|;)        ## Match a colon or a semicolon
    (?P<value>.*)       ## Match all other characters remaining
    """, re.VERBOSE)


class MyVCard():
    """A single vCard parsed into the ``info`` mapping.

    ``info`` maps a property name (``FN``, ``TEL``, ...) to:

    * a string, for ``NAME:value`` lines whose value has no semicolon;
    * a list of strings, for ``NAME:a;b;c`` lines;
    * a list of dicts (one per line), for ``NAME;key=val;...`` lines.  The
      text after the final colon stays attached to the last parameter, e.g.
      ``{"TYPE": "work,voice", "VALUE": "uri:tel:+123"}``.
    """

    def __init__(self, cardstring):
        """Parse *cardstring*, the text of one card, line by line."""
        self.info = defaultdict(list)
        ## Iterate over the card line by line (any newline convention)
        for line in cardstring.splitlines():
            ## Split Key of line
            match = VALUERE.match(line)
            if not match:
                continue
            vtype = match.group("type")
            ## Line Values are separated by semicolons
            values = match.group("value").split(";")
            ## Lines with colons appear to be unique values
            if match.group("sep") == ":":
                ## If only a single value, we don't need the list
                self.info[vtype] = values[0] if len(values) == 1 else values
                continue
            ## Otherwise (Semicolon sep), the value may not be unique.
            ## Semicolon seps also appear to have multiple keys, so use a dict.
            out = {}
            for val in values:
                ## partition() tolerates parameters that have no "=" at all
                ## (they are stored with an empty value) instead of raising.
                k, _, v = val.partition("=")
                out[k] = v
            ## Make sure we have a list to append to, even if an earlier
            ## colon-separated line stored a plain string for this name.
            entries = self.info[vtype]
            if not isinstance(entries, list):
                entries = self.info[vtype] = [entries]
            entries.append(out)

    def get_a_number(self):
        """ Naive approach to getting the number.

        Returns the text following ``tel:`` in the first TEL entry that
        contains it, or None when the card has no such entry.
        """
        tel = self.info.get("TEL")
        if not tel:
            return None
        entries = tel if isinstance(tel, list) else [tel]
        for entry in entries:
            ## Parameterised lines are dicts; plain "TEL:..." lines are strings
            text = entry.get("VALUE", "") if isinstance(entry, dict) else str(entry)
            numbers = re.findall("tel:(.+)", text)
            if numbers:
                return numbers[0]
        return None


def get_vcards(file):
    """ Use regex to parse VCards into MyVCard objects.

    *file* is the path of the .vcf file to read; one MyVCard is returned for
    each BEGIN:VCARD ... END:VCARD section, in file order.
    """
    with open(file,'r') as f:
        finput = f.read()
    return [MyVCard(card) for card in CARDMATCHER.findall(finput)]
# get_vcards() expects the path of the .vcf file, not an open file object.
print([{"fn":card.info['FN'], "tel":card.get_a_number()} for card in get_vcards('contacts.vcf')])
Again, I make no guarantees on this code since I’m not going to look up all the specifications for the vcf
format, and would recommend using a module specifically designed for this instead.
You can try the below code.
import re

# A formatted-name line, and a telephone line with or without parameters
# and with or without a "tel:" URI prefix in front of the number.
NAME_RE = re.compile(r'^FN:(.*)')
TEL_RE = re.compile(r'^TEL[^:]*:(?:tel:)?(.*)')

contacts = []
data = None  # contact belonging to the card currently being read
with open('vcards-2.vcf', 'r') as file:
    for line in file:
        line = line.strip()
        if line == 'BEGIN:VCARD':
            # New card: never attach its phone to the previous contact.
            data = None
            continue
        name = NAME_RE.match(line)
        if name:
            data = {'name' : name.group(1).strip()}
            contacts.append(data)
            continue
        tel = TEL_RE.match(line)
        # Only the first phone number following the name is kept.
        if tel and data is not None and 'phone' not in data:
            # Drop spaces, dashes and brackets but keep the leading "+".
            data['phone'] = ''.join(e for e in tel.group(1) if e.isalnum() or e == '+')
print(contacts)
You will get the results below
[{'name': 'Muller','phone': '+16829185770'}, {'name': 'Mraz', 'phone': '+18083155095'}, {'name': 'Reynolds','phone': '+15089473508'}]
Try the below code
import re
import json
from typing import List, Any
def get_contacts_from_vcf(filename: str) -> List[Any]:
    """Return one JSON string per vCard in *filename*.

    Each element is ``json.dumps({"name": ..., "Ph No(s)": ...}) + ","``,
    where the phone numbers of the card are joined with ", ".  Cards that
    lack either an FN line or any TEL line are skipped.  Repeated numbers
    within a card are listed once, in order of first appearance.
    """
    with open(filename, 'r') as f:
        text = f.read()

    contacts = []
    # Everything before each END:VCARD marker is treated as one card.
    for card in text.split("END:VCARD"):
        # An optional "group." prefix (e.g. "item1.TEL") is tolerated.
        name_match = re.search(r"^(?:[\w-]+\.)?FN:(.*)$", card, re.MULTILINE)
        tel_lines = re.findall(r"^(?:[\w-]+\.)?TEL[;:](.*)$", card, re.MULTILINE)
        if name_match is None or not tel_lines:
            continue
        # The number is whatever follows the last colon on the TEL line.
        numbers = [tel_line.strip().split(':')[-1] for tel_line in tel_lines]
        # dict.fromkeys removes duplicates while preserving order.
        unique_numbers = list(dict.fromkeys(numbers))
        data = {
            "name": name_match.group(1).strip(),
            "Ph No(s)": ', '.join(unique_numbers)
        }
        contacts.append(json.dumps(data) + ",")
    return contacts
# Print one contact per line.
print('\n'.join(get_contacts_from_vcf("bin.vcf")))
Output:
{"name": "John", "Ph No(s)": "1234, 4321"},
{"name": "watt", "Ph No(s)": "121, 32444"},
{"name": "smith", "Ph No(s)": "55555"},