Python – split string with multiple delimiters, return dictionary delimiter as key remaining item as value
Question:
So I wanted to know if there was something out there that took a string, split it over multiple delimiters, but instead of returning a list, it returned a dictionary of what delimiter was used to split the string, followed by the unsplit string up to the next delimiter. For example, consider this list:
Food to make:
1. Cake
a. eggs
b. flour
c. milk
d. etc
2. Salad
a. lettuce
b. spinach
c. cheese
d. ham
e. etc
Here is the list unformatted:
GroceryList = "1. Cake a. eggs b. flour c. milk d. etc 2. Salad a. lettuce b. spinach c. cheese d. ham e. etc"
When I run my script, I need it to split over alphanumeric values (and the period), then return it as a dictionary. Ideally, I would like to be able to set the delimiters using a list (my_str = "123test123" my_str.split(["1", "3"])
to split the string over values "1" and "3", returning a dict of {"1#1": "2", "3#1": "test", "1#2": "2", "3#2": ""}
). I understand any repeats would be overwritten in the dictionary so it’d have to have a unique ID associated with it like so:
{"#1": "Food to make:",
"1.#1": "Cake",
"a.#1": "eggs",
"b.#1": "flour",
"c.#1": "milk",
"d.#1": "etc",
"2.#2": "Salad",
"a.#2": "lettuce",
"b.#2": "spinach",
"c.#2": "cheese",
"d.#2": "ham",
"e.#2": "etc"}
I wouldn’t think that there would be a native function to do this, but seeing I am not real familiar with python (I’m running python 3.8), I figured I give the question a shot.
I’ve looked at mapping and lambda functions as an alternative to try and achieve this goal, but I wouldn’t know where to even begin tackling a problem like this, so if there is something native to accomplish this task, then that would be best.
Thank you!
—Edit From Some Time Later—
In retrospect, I can see how this might be confusing, so in an effort to clarify, here is example input that I will actually be dealing with:
M 10 315
L 110 215
A 30 50 0 0 1 162.55 162.45
L 172.55 152.45
A 30 50 -45 0 1 215.1 109.9
L 315 10
and because the data identifiers need to be unique, I don’t care so much as to how the data is stored, but rather that it stays unique. Lists are sorted in the order added by default, so I settled on that idea in my answer.
Answers:
Try this –
import re
import string
# Label lookup: index 0 is a blank label for the numbered heading itself,
# indices 1.. map to 'a', 'b', 'c', ... for the lettered items under it.
alp = ' ' + string.ascii_lowercase
# Split on the numbered headings ("1.", "2.", ...), then split each chunk on the
# lettered markers ("a.", "b.", ...). The patterns must be raw strings with
# escaped metacharacters: a bare 'd.' would split on the letter "d" followed by
# any character. [1:] drops the empty chunk that precedes the first heading.
items = [re.split(r'\w\.', i) for i in re.split(r'\d\.', GroceryList)][1:]
# Walk the list of lists, tracking the group number (outer enumerate, from 1)
# and the position inside the group (inner enumerate), and key every entry as
# "<letter>#<group number>" so repeated letters stay unique.
result = {
    alp[l] + '#' + str(i): m.strip()
    for i, j in enumerate(items, 1)
    for l, m in enumerate(j)
}
result
{' #1': 'Cake',
'a#1': 'eggs',
'b#1': 'flour',
'c#1': 'milk',
'd#1': 'etc',
' #2': 'Salad',
'a#2': 'lettuce',
'b#2': 'spinach',
'c#2': 'cheese',
'd#2': 'ham',
'e#2': 'etc'}
Usage:
-
Save the class as a file… I called mine StringSplitter.py.
-
import StringSplitter as SS
-
ss = SS.StringSplitter("123testing123", ["1", "3"])
-
ss.split()
computes and returns the output
-
ss.toFile("./split.txt")
to write to a file called "split.txt".
Which returns:
[{'delimiter': '1', 'start': 0, 'end': 1, 'content': '2'}, {'delimiter': '3', 'start': 2, 'end': 3, 'content': 'testing'}, {'delimiter': '1', 'start': 10, 'end': 11, 'content': '2'}, {'delimiter': '3', 'start': 12, 'end': 13, 'content': ''}]
When you reconstruct the string with the pattern delimiter + content, it yields:
123testing123
class StringSplitter:
    """Split a string on several delimiters, remembering which one matched.

    The result is a list of dicts, one per delimiter occurrence, each with
    the keys ``"delimiter"`` (the matched text as it appears in the string),
    ``"start"`` / ``"end"`` (the delimiter's slice bounds in the string) and
    ``"content"`` (the text between this delimiter and the next one).  Text
    that precedes the first delimiter is stored in a leading entry whose
    delimiter is ``""``.  Concatenating ``delimiter + content`` over all
    entries rebuilds the original string.

    Args:
        string: Text to split.  ``None`` is treated as an empty string.
        delimiter: Sequence of delimiter strings.  They are tried in the
            given order at every position and the first one that matches
            wins, so list longer delimiters before their own prefixes.
            ``None`` means "no delimiters"; empty delimiters are ignored.
        caseSensitive: When false, delimiters are matched after lowercasing
            both sides.

    The split is computed once on construction and stored in ``splitted``.
    """

    def __init__(self, string=None, delimiter=None, caseSensitive=True):
        self.string = string
        self.delimiter = delimiter
        self.caseSensitive = caseSensitive
        self.splitted = []
        self.rmDupeStr = ""
        self.split()

    def toFile(self, path):
        """Write the ``str()`` form of the current split result to *path*."""
        with open(path, "w") as file:
            # One write call; writelines() on a str would emit it char by char.
            file.write(str(self.splitted))

    def split(self):
        """Compute the split, store it in ``self.splitted`` and return it.

        Returns:
            list[dict]: entries as described on the class.  When no
            delimiter matches, a single entry with an empty delimiter and
            the whole string as content is returned.
        """
        text = "" if self.string is None else self.string
        case_sensitive = bool(self.caseSensitive)

        # Normalise the delimiters once instead of on every character:
        # drop empty ones (they can never match and used to raise
        # IndexError) and lowercase them for case-insensitive matching.
        needles = []
        for item in self.delimiter or ():
            if item:
                needles.append(item if case_sensitive else item.lower())

        pieces = []
        pos = 0
        text_len = len(text)
        while pos < text_len:
            matched = None
            for needle in needles:
                candidate = text[pos:pos + len(needle)]
                probe = candidate if case_sensitive else candidate.lower()
                if probe == needle:
                    # Record the delimiter with the casing found in the text.
                    matched = candidate
                    break

            if matched is None:
                pos += 1
                continue

            if pieces:
                # Close the previous entry: its content runs up to here.
                pieces[-1]["content"] = text[pieces[-1]["end"]:pos]
            elif pos > 0:
                # Text before the very first delimiter gets its own entry,
                # e.g. "string", ["i"] => "" -> "str", "i" -> "ng".
                pieces.append(
                    {"delimiter": "", "start": 0, "end": pos, "content": text[:pos]}
                )

            stop = pos + len(matched)
            pieces.append(
                {"delimiter": matched, "start": pos, "end": stop, "content": ""}
            )
            # Resume after the delimiter.  Stepping one character at a time
            # let a second delimiter match inside the first one, which
            # duplicated text when the string was reconstructed.
            pos = stop

        if pieces:
            pieces[-1]["content"] = text[pieces[-1]["end"]:]
        else:
            pieces.append(
                {"delimiter": "", "start": 0, "end": text_len, "content": text}
            )

        self.splitted = pieces
        return pieces

    def rmDupe(self):
        """Rebuild the string, skipping consecutive identical entries.

        An entry is dropped when both its delimiter and its content equal
        those of the entry immediately before it (so ``"ssssss"`` split on
        ``"s"`` collapses to ``"s"``).  The result is stored in
        ``self.rmDupeStr`` and returned.
        """
        kept = []
        previous = None
        for piece in self.splitted:
            current = (piece["delimiter"], piece["content"])
            if current != previous:
                kept.append(piece["delimiter"] + piece["content"])
            previous = current
        self.rmDupeStr = "".join(kept)
        return self.rmDupeStr
Sample data from post:
import StringSplitter as SS
# Path-style sample data from the question, flattened onto one line.
a = "M 10 315 L 110 215 A 30 50 0 0 1 162.55 162.45 L 172.55 152.45 A 30 50 -45 0 1 215.1 109.9 L 315 10"
# Split on the command letters and on spaces. The delimiter must be "L", not
# "L,": the string contains no "L," so the L commands would never be matched.
ss = SS.StringSplitter(a, ["M", "L", "A", " "])
ss.split()
Which outputs:
[{'delimiter': 'M', 'start': 0, 'end': 1, 'content': ''}
{'delimiter': ' ', 'start': 1, 'end': 2, 'content': '10'}
{'delimiter': ' ', 'start': 4, 'end': 5, 'content': '315'}
{'delimiter': ' ', 'start': 8, 'end': 9, 'content': ''}
{'delimiter': 'L', 'start': 9, 'end': 10, 'content': ''}
{'delimiter': ' ', 'start': 10, 'end': 11, 'content': '110'}
{'delimiter': ' ', 'start': 14, 'end': 15, 'content': '215'}
{'delimiter': ' ', 'start': 18, 'end': 19, 'content': ''}
{'delimiter': 'A', 'start': 19, 'end': 20, 'content': ''}
{'delimiter': ' ', 'start': 20, 'end': 21, 'content': '30'}
{'delimiter': ' ', 'start': 23, 'end': 24, 'content': '50'}
{'delimiter': ' ', 'start': 26, 'end': 27, 'content': '0'}
{'delimiter': ' ', 'start': 28, 'end': 29, 'content': '0'}
{'delimiter': ' ', 'start': 30, 'end': 31, 'content': '1'}
{'delimiter': ' ', 'start': 32, 'end': 33, 'content': '162.55'}
{'delimiter': ' ', 'start': 39, 'end': 40, 'content': '162.45'}
{'delimiter': ' ', 'start': 46, 'end': 47, 'content': ''}
{'delimiter': 'L', 'start': 47, 'end': 48, 'content': ''}
{'delimiter': ' ', 'start': 48, 'end': 49, 'content': '172.55'}
{'delimiter': ' ', 'start': 55, 'end': 56, 'content': '152.45'}
{'delimiter': ' ', 'start': 62, 'end': 63, 'content': ''}
{'delimiter': 'A', 'start': 63, 'end': 64, 'content': ''}
{'delimiter': ' ', 'start': 64, 'end': 65, 'content': '30'}
{'delimiter': ' ', 'start': 67, 'end': 68, 'content': '50'}
{'delimiter': ' ', 'start': 70, 'end': 71, 'content': '-45'}
{'delimiter': ' ', 'start': 74, 'end': 75, 'content': '0'}
{'delimiter': ' ', 'start': 76, 'end': 77, 'content': '1'}
{'delimiter': ' ', 'start': 78, 'end': 79, 'content': '215.1'}
{'delimiter': ' ', 'start': 84, 'end': 85, 'content': '109.9'}
{'delimiter': ' ', 'start': 90, 'end': 91, 'content': ''}
{'delimiter': 'L', 'start': 91, 'end': 92, 'content': ''}
{'delimiter': ' ', 'start': 92, 'end': 93, 'content': '315'}
{'delimiter': ' ', 'start': 96, 'end': 97, 'content': '10'}]
So I wanted to know if there was something out there that took a string, split it over multiple delimiters, but instead of returning a list, it returned a dictionary of what delimiter was used to split the string, followed by the unsplit string up to the next delimiter. For example, consider this list:
Food to make:
1. Cake
a. eggs
b. flour
c. milk
d. etc
2. Salad
a. lettuce
b. spinach
c. cheese
d. ham
e. etc
Here is the list unformatted:
GroceryList = "1. Cake a. eggs b. flour c. milk d. etc 2. Salad a. lettuce b. spinach c. cheese d. ham e. etc"
When I run my script, I need it to split over alphanumeric values (and the period), then return it as a dictionary. Ideally, I would like to be able to set the delimiters using a list (my_str = "123test123" my_str.split(["1", "3"])
to split the string over values "1" and "3", returning a dict of {"1#1": "2", "3#1": "test", "1#2": "2", "3#2": ""}
). I understand any repeats would be overwritten in the dictionary so it’d have to have a unique ID associated with it like so:
{"#1": "Food to make:",
"1.#1": "Cake",
"a.#1": "eggs",
"b.#1": "flour",
"c.#1": "milk",
"d.#1": "etc",
"2.#2": "Salad",
"a.#2": "lettuce",
"b.#2": "spinach",
"c.#2": "cheese",
"d.#2": "ham",
"e.#2": "etc"}
I wouldn’t think that there would be a native function to do this, but seeing I am not real familiar with python (I’m running python 3.8), I figured I give the question a shot.
I’ve looked at mapping and lambda functions as an alternative to try and achieve this goal, but I wouldn’t know where to even begin tackling a problem like this, so if there is something native to accomplish this task, then that would be best.
Thank you!
—Edit From Some Time Later—
In retrospect, I can see how this might be confusing, so in an effort to clarify, here is example input that I will actually be dealing with:
M 10 315
L 110 215
A 30 50 0 0 1 162.55 162.45
L 172.55 152.45
A 30 50 -45 0 1 215.1 109.9
L 315 10
and because the data identifiers need to be unique, I don’t care so much as to how the data is stored, but rather that it stays unique. Lists are sorted in the order added by default, so I settled on that idea in my answer.
Try this –
import re
import string
# Label lookup: index 0 is a blank label for the numbered heading itself,
# indices 1.. map to 'a', 'b', 'c', ... for the lettered items under it.
alp = ' ' + string.ascii_lowercase
# Split on the numbered headings ("1.", "2.", ...), then split each chunk on the
# lettered markers ("a.", "b.", ...). The patterns must be raw strings with
# escaped metacharacters: a bare 'd.' would split on the letter "d" followed by
# any character. [1:] drops the empty chunk that precedes the first heading.
items = [re.split(r'\w\.', i) for i in re.split(r'\d\.', GroceryList)][1:]
# Walk the list of lists, tracking the group number (outer enumerate, from 1)
# and the position inside the group (inner enumerate), and key every entry as
# "<letter>#<group number>" so repeated letters stay unique.
result = {
    alp[l] + '#' + str(i): m.strip()
    for i, j in enumerate(items, 1)
    for l, m in enumerate(j)
}
result
{' #1': 'Cake',
'a#1': 'eggs',
'b#1': 'flour',
'c#1': 'milk',
'd#1': 'etc',
' #2': 'Salad',
'a#2': 'lettuce',
'b#2': 'spinach',
'c#2': 'cheese',
'd#2': 'ham',
'e#2': 'etc'}
Usage:
-
Save the class as a file… I called mine StringSplitter.py.
-
import StringSplitter as SS
-
ss = SS.StringSplitter("123testing123", ["1", "3"])
-
ss.split()
computes and returns the output -
ss.toFile("./split.txt")
to write to a file called "split.txt".
Which returns:
[{'delimiter': '1', 'start': 0, 'end': 1, 'content': '2'}, {'delimiter': '3', 'start': 2, 'end': 3, 'content': 'testing'}, {'delimiter': '1', 'start': 10, 'end': 11, 'content': '2'}, {'delimiter': '3', 'start': 12, 'end': 13, 'content': ''}]
When you reconstruct the string with the pattern delimiter + content, it yields:
123testing123
class StringSplitter:
    """Split a string on several delimiters, remembering which one matched.

    The result is a list of dicts, one per delimiter occurrence, each with
    the keys ``"delimiter"`` (the matched text as it appears in the string),
    ``"start"`` / ``"end"`` (the delimiter's slice bounds in the string) and
    ``"content"`` (the text between this delimiter and the next one).  Text
    that precedes the first delimiter is stored in a leading entry whose
    delimiter is ``""``.  Concatenating ``delimiter + content`` over all
    entries rebuilds the original string.

    Args:
        string: Text to split.  ``None`` is treated as an empty string.
        delimiter: Sequence of delimiter strings.  They are tried in the
            given order at every position and the first one that matches
            wins, so list longer delimiters before their own prefixes.
            ``None`` means "no delimiters"; empty delimiters are ignored.
        caseSensitive: When false, delimiters are matched after lowercasing
            both sides.

    The split is computed once on construction and stored in ``splitted``.
    """

    def __init__(self, string=None, delimiter=None, caseSensitive=True):
        self.string = string
        self.delimiter = delimiter
        self.caseSensitive = caseSensitive
        self.splitted = []
        self.rmDupeStr = ""
        self.split()

    def toFile(self, path):
        """Write the ``str()`` form of the current split result to *path*."""
        with open(path, "w") as file:
            # One write call; writelines() on a str would emit it char by char.
            file.write(str(self.splitted))

    def split(self):
        """Compute the split, store it in ``self.splitted`` and return it.

        Returns:
            list[dict]: entries as described on the class.  When no
            delimiter matches, a single entry with an empty delimiter and
            the whole string as content is returned.
        """
        text = "" if self.string is None else self.string
        case_sensitive = bool(self.caseSensitive)

        # Normalise the delimiters once instead of on every character:
        # drop empty ones (they can never match and used to raise
        # IndexError) and lowercase them for case-insensitive matching.
        needles = []
        for item in self.delimiter or ():
            if item:
                needles.append(item if case_sensitive else item.lower())

        pieces = []
        pos = 0
        text_len = len(text)
        while pos < text_len:
            matched = None
            for needle in needles:
                candidate = text[pos:pos + len(needle)]
                probe = candidate if case_sensitive else candidate.lower()
                if probe == needle:
                    # Record the delimiter with the casing found in the text.
                    matched = candidate
                    break

            if matched is None:
                pos += 1
                continue

            if pieces:
                # Close the previous entry: its content runs up to here.
                pieces[-1]["content"] = text[pieces[-1]["end"]:pos]
            elif pos > 0:
                # Text before the very first delimiter gets its own entry,
                # e.g. "string", ["i"] => "" -> "str", "i" -> "ng".
                pieces.append(
                    {"delimiter": "", "start": 0, "end": pos, "content": text[:pos]}
                )

            stop = pos + len(matched)
            pieces.append(
                {"delimiter": matched, "start": pos, "end": stop, "content": ""}
            )
            # Resume after the delimiter.  Stepping one character at a time
            # let a second delimiter match inside the first one, which
            # duplicated text when the string was reconstructed.
            pos = stop

        if pieces:
            pieces[-1]["content"] = text[pieces[-1]["end"]:]
        else:
            pieces.append(
                {"delimiter": "", "start": 0, "end": text_len, "content": text}
            )

        self.splitted = pieces
        return pieces

    def rmDupe(self):
        """Rebuild the string, skipping consecutive identical entries.

        An entry is dropped when both its delimiter and its content equal
        those of the entry immediately before it (so ``"ssssss"`` split on
        ``"s"`` collapses to ``"s"``).  The result is stored in
        ``self.rmDupeStr`` and returned.
        """
        kept = []
        previous = None
        for piece in self.splitted:
            current = (piece["delimiter"], piece["content"])
            if current != previous:
                kept.append(piece["delimiter"] + piece["content"])
            previous = current
        self.rmDupeStr = "".join(kept)
        return self.rmDupeStr
Sample data from post:
import StringSplitter as SS
# Path-style sample data from the question, flattened onto one line.
a = "M 10 315 L 110 215 A 30 50 0 0 1 162.55 162.45 L 172.55 152.45 A 30 50 -45 0 1 215.1 109.9 L 315 10"
# Split on the command letters and on spaces. The delimiter must be "L", not
# "L,": the string contains no "L," so the L commands would never be matched.
ss = SS.StringSplitter(a, ["M", "L", "A", " "])
ss.split()
Which outputs:
[{'delimiter': 'M', 'start': 0, 'end': 1, 'content': ''}
{'delimiter': ' ', 'start': 1, 'end': 2, 'content': '10'}
{'delimiter': ' ', 'start': 4, 'end': 5, 'content': '315'}
{'delimiter': ' ', 'start': 8, 'end': 9, 'content': ''}
{'delimiter': 'L', 'start': 9, 'end': 10, 'content': ''}
{'delimiter': ' ', 'start': 10, 'end': 11, 'content': '110'}
{'delimiter': ' ', 'start': 14, 'end': 15, 'content': '215'}
{'delimiter': ' ', 'start': 18, 'end': 19, 'content': ''}
{'delimiter': 'A', 'start': 19, 'end': 20, 'content': ''}
{'delimiter': ' ', 'start': 20, 'end': 21, 'content': '30'}
{'delimiter': ' ', 'start': 23, 'end': 24, 'content': '50'}
{'delimiter': ' ', 'start': 26, 'end': 27, 'content': '0'}
{'delimiter': ' ', 'start': 28, 'end': 29, 'content': '0'}
{'delimiter': ' ', 'start': 30, 'end': 31, 'content': '1'}
{'delimiter': ' ', 'start': 32, 'end': 33, 'content': '162.55'}
{'delimiter': ' ', 'start': 39, 'end': 40, 'content': '162.45'}
{'delimiter': ' ', 'start': 46, 'end': 47, 'content': ''}
{'delimiter': 'L', 'start': 47, 'end': 48, 'content': ''}
{'delimiter': ' ', 'start': 48, 'end': 49, 'content': '172.55'}
{'delimiter': ' ', 'start': 55, 'end': 56, 'content': '152.45'}
{'delimiter': ' ', 'start': 62, 'end': 63, 'content': ''}
{'delimiter': 'A', 'start': 63, 'end': 64, 'content': ''}
{'delimiter': ' ', 'start': 64, 'end': 65, 'content': '30'}
{'delimiter': ' ', 'start': 67, 'end': 68, 'content': '50'}
{'delimiter': ' ', 'start': 70, 'end': 71, 'content': '-45'}
{'delimiter': ' ', 'start': 74, 'end': 75, 'content': '0'}
{'delimiter': ' ', 'start': 76, 'end': 77, 'content': '1'}
{'delimiter': ' ', 'start': 78, 'end': 79, 'content': '215.1'}
{'delimiter': ' ', 'start': 84, 'end': 85, 'content': '109.9'}
{'delimiter': ' ', 'start': 90, 'end': 91, 'content': ''}
{'delimiter': 'L', 'start': 91, 'end': 92, 'content': ''}
{'delimiter': ' ', 'start': 92, 'end': 93, 'content': '315'}
{'delimiter': ' ', 'start': 96, 'end': 97, 'content': '10'}]