parsing a text file into lists with python
Question:
So I have a generated text file that I’d like to parse into a couple of lists of dates. I had it figured out when there was one date per ‘group’, but I realized I may have to deal with multiple date values per group.
My .txt file looks like this:
DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27
Ideally, I would be able to parse this out into 5 lists that contain the dates for each group. I am so stumped.
Answers:
Here is an example that you could build on: every time it reads a line that starts with a non-digit character, it starts a new list (beginning with that group name), and every date line that follows is appended to that list.
import os
def group_lines(lines):
    """Split *lines* into groups, one list per header line.

    A line whose first character is not a digit is treated as a group header
    and starts a new list containing that header; every following line that
    starts with a digit is appended to the most recent list.

    Args:
        lines: Any iterable of strings (an open file works). Surrounding
            whitespace, including the trailing newline, is stripped and
            blank lines are ignored.

    Returns:
        A list of lists, each of the form ``[header, date, date, ...]``.

    Raises:
        ValueError: If a date line appears before the first header.
    """
    groups = []
    for raw in lines:
        entry = raw.strip()
        if not entry:
            # Blank lines carry no data and must not start a bogus group.
            continue
        if not entry[0].isdigit():
            # Header line: open a new group that starts with its name.
            groups.append([entry])
        elif groups:
            groups[-1].append(entry)
        else:
            raise ValueError(f"date {entry!r} appears before any group header")
    return groups


if __name__ == "__main__":
    # Read the file and build the groups.
    with open("test.txt") as f:
        lists = group_lines(f)
    # Print the lists.
    for group in lists:
        print(group)
Start by reading in your whole text file. Then you can count the number of occurrences of “DateGroup”, which seems to be the constant part of your date group separators. You can then parse the file by going through all the data between any two “DateGroup” identifiers, or between the last “DateGroup” identifier and the end of the file. Try to understand the following piece of code and build your application on top of it:
def split_date_groups(text, marker="DateGroup"):
    """Return one list of date strings for every *marker* section in *text*.

    The text is cut at each occurrence of *marker*; a section runs from one
    marker to the next (or to the end of the text). The first line of a
    section is its header and is dropped; the remaining non-blank lines are
    the dates.

    Args:
        text: Full contents of the input file.
        marker: Constant prefix that introduces every group header.

    Returns:
        A list with one inner list of stripped date strings per group, in
        file order. An empty text yields an empty list.
    """
    groups = []
    start = text.find(marker)
    for _ in range(text.count(marker)):
        # The section ends at the next marker, or at the end of the text.
        end = text.find(marker, start + len(marker))
        if end == -1:
            end = len(text)
        # splitlines() copes with a missing trailing newline, which made the
        # manual newline search loop forever.
        body = text[start:end].splitlines()[1:]
        groups.append([line.strip() for line in body if line.strip()])
        start = end
    return groups


if __name__ == "__main__":
    with open("dates.txt") as file:
        text = file.read()
    print(split_date_groups(text))
Just loop over each line, check for your key that will group data, remove newlines and store each new date.
DATE_GROUP_SEPARATOR = 'DateGroup'


def parse_date_groups(lines, separator=DATE_GROUP_SEPARATOR):
    """Group date lines under the header line that precedes them.

    Args:
        lines: Iterable of strings (an open file works). Surrounding
            whitespace is stripped and blank lines are skipped.
        separator: Substring that identifies a group header line.

    Returns:
        A dict mapping each header line to the list of lines that follow it.
        If the same header occurs more than once, its dates are merged
        instead of the earlier ones being discarded.

    Raises:
        ValueError: If a data line appears before the first header.
    """
    sorted_data = {}
    last_group = None
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if separator in line:
            last_group = line
            sorted_data.setdefault(line, [])
        elif last_group is None:
            raise ValueError(f"{line!r} appears before any {separator!r} header")
        else:
            sorted_data[last_group].append(line)
    return sorted_data


if __name__ == "__main__":
    with open('test.txt') as file:
        sorted_data = parse_date_groups(file)
    for date_group, dates in sorted_data.items():
        print(f"{date_group}: {dates}")
This first section just shows how to treat a string of data as if it came from a file. That helps if you don’t want to create the OP’s actual file but want the data to be visible in the editor.
import sys
from io import StringIO # allows treating some lines in editor as if they were from a file)
# In-memory stand-in for the OP's text file, so no real file is needed.
dat = StringIO("""DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27""")
# Iterating the buffer yields one line at a time; strip the line endings.
lines = list(map(str.strip, dat))
print(lines)
output:
['DateGroup1', '20191129', '20191127', '20191126', 'DateGroup2', '20191129', '20191127', '20191126', 'DateGroup3', '2019-12-02', 'DateGroup4', '2019-11-27', 'DateGroup5', '2019-11-27']
Now one possible way to generate your desired list of lists, while ensuring that both possible date formats are covered:
from datetime import datetime
# Accepted input layouts, tried in order; every date is emitted as YYYY-MM-DD.
_DATE_FORMATS = ('%Y%m%d', '%Y-%m-%d')


def _normalize_date(text):
    """Return *text* reformatted as ``YYYY-MM-DD``, or ``None`` if it is not a date."""
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return None


def group_dates(lines):
    """Collect the dates in *lines* into one list per group.

    Any line that matches neither date format is treated as a group header:
    it closes the group being built and opens a new, empty one.

    Args:
        lines: Iterable of already-stripped strings.

    Returns:
        A list of lists of ``YYYY-MM-DD`` strings, one inner list per group.
    """
    groups = []
    current = None  # no group is open until the first header (or date) is seen
    for line in lines:
        date = _normalize_date(line)
        if date is None:
            # Header line: store the finished group, then start a fresh one.
            if current is not None:
                groups.append(current)
            current = []
        else:
            if current is None:
                # Dates that precede any header still form a group of their own.
                current = []
            current.append(date)
    # Flush the final group; no later header will do it for us.
    if current is not None:
        groups.append(current)
    return groups


b = group_dates(lines)
b
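output (note: the first entry `['2019-11-27']` appears to be left over from an earlier run in the same interactive session; the expected result is just the five lists that follow it):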
[['2019-11-27'],
['2019-11-29', '2019-11-27', '2019-11-26'],
['2019-11-29', '2019-11-27', '2019-11-26'],
['2019-12-02'],
['2019-11-27'],
['2019-11-27']]
TTP can help to parse this text as well. Here is a sample template, together with the code to run it:
from ttp import ttp
# Sample input: the OP's file contents embedded as a string. The triple-quoted
# literal adds an empty first line and a trailing newline.
data_to_parse = """
DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27
"""
# TTP template. Judging by the output shown after this snippet, each
# "DateGroup<id>" header produces a "date_group<id>" entry under "date_groups",
# and the lines that follow it are collected into that entry's "dates" list.
# NOTE(review): the exact behaviour of to_list / joinmatches() is defined by
# TTP itself -- confirm against the TTP documentation.
ttp_template = """
<group name="date_groups.date_group{{ id }}">
DateGroup{{ id }}
{{ dates | to_list | joinmatches() }}
</group>
"""
# Build the parser from the sample data and the template, then run it.
parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()
# Request the results in JSON format and print the first element.
print(parser.result(format="json")[0])
above code would produce this output:
[
{
"date_groups": {
"date_group1": {
"dates": [
"20191129",
"20191127",
"20191126"
]
},
"date_group2": {
"dates": [
"20191129",
"20191127",
"20191126"
]
},
"date_group3": {
"dates": [
"2019-12-02"
]
},
"date_group4": {
"dates": [
"2019-11-27"
]
},
"date_group5": {
"dates": [
"2019-11-27"
]
}
}
}
]
This is my attempt to parse that text data. I deliberately chose parsec.py, a Haskell Parsec-like parser combinator library, because it reads more clearly than regular expressions, so it is easier to debug and test.
The second reason is that it gives much more flexibility in choosing the output data format.
import re
from parsec import *
# Consumes any run of whitespace, including the newlines between lines.
spaces = regex(r'\s*', re.MULTILINE)


@generate
def getHeader():
    """Parse a group header such as ``DateGroup1`` and return it as one string."""
    s1 = yield string("DateGroup")
    s2 = ''.join((yield many1(digit())))
    return s1 + s2


@generate
def getDataLine():
    """Parse one data line: a digit followed by the rest of the line.

    Stops at the first carriage return or newline, then consumes the
    whitespace that follows so the next parser starts on fresh input.
    """
    s1 = yield digit()
    s2 = ''.join((yield many1(none_of("\r\n"))))
    yield spaces
    return s1 + s2


@generate
def getChunk():
    """Parse one group: a header followed by one or more data lines.

    Returns a ``(header, data_lines)`` tuple.
    """
    yield spaces
    header = yield getHeader
    yield spaces
    dataList = yield many1(getDataLine)
    return (header, dataList)


@generate
def getData():
    """Parse the whole input as one or more groups and require end of input."""
    yield spaces
    parsedData = yield many1(getChunk)
    yield eof()
    return parsedData


inputText = """DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27"""
result = getData.parse(inputText)
for p in result:
    print(p)
Output:
('DateGroup1', ['20191129', '20191127', '20191126'])
('DateGroup2', ['20191129', '20191127', '20191126'])
('DateGroup3', ['2019-12-02'])
('DateGroup4', ['2019-11-27'])
('DateGroup5', ['2019-11-27'])
So I have a generated text file that I’d like to parse into a couple of lists of dates. I had it figured out when there was one date per ‘group’, but I realized I may have to deal with multiple date values per group.
My .txt file looks like this:
DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27
Ideally, I would be able to parse this out into 5 lists that contain the dates for each group. I am so stumped.
Here is an example that you could build on: every time it reads a line that starts with a non-digit character, it starts a new list (beginning with that group name), and every date line that follows is appended to that list.
import os
def group_lines(lines):
    """Split *lines* into groups, one list per header line.

    A line whose first character is not a digit is treated as a group header
    and starts a new list containing that header; every following line that
    starts with a digit is appended to the most recent list.

    Args:
        lines: Any iterable of strings (an open file works). Surrounding
            whitespace, including the trailing newline, is stripped and
            blank lines are ignored.

    Returns:
        A list of lists, each of the form ``[header, date, date, ...]``.

    Raises:
        ValueError: If a date line appears before the first header.
    """
    groups = []
    for raw in lines:
        entry = raw.strip()
        if not entry:
            # Blank lines carry no data and must not start a bogus group.
            continue
        if not entry[0].isdigit():
            # Header line: open a new group that starts with its name.
            groups.append([entry])
        elif groups:
            groups[-1].append(entry)
        else:
            raise ValueError(f"date {entry!r} appears before any group header")
    return groups


if __name__ == "__main__":
    # Read the file and build the groups.
    with open("test.txt") as f:
        lists = group_lines(f)
    # Print the lists.
    for group in lists:
        print(group)
Start by reading in your whole text file. Then you can count the number of occurrences of “DateGroup”, which seems to be the constant part of your date group separators. You can then parse the file by going through all the data between any two “DateGroup” identifiers, or between the last “DateGroup” identifier and the end of the file. Try to understand the following piece of code and build your application on top of it:
def split_date_groups(text, marker="DateGroup"):
    """Return one list of date strings for every *marker* section in *text*.

    The text is cut at each occurrence of *marker*; a section runs from one
    marker to the next (or to the end of the text). The first line of a
    section is its header and is dropped; the remaining non-blank lines are
    the dates.

    Args:
        text: Full contents of the input file.
        marker: Constant prefix that introduces every group header.

    Returns:
        A list with one inner list of stripped date strings per group, in
        file order. An empty text yields an empty list.
    """
    groups = []
    start = text.find(marker)
    for _ in range(text.count(marker)):
        # The section ends at the next marker, or at the end of the text.
        end = text.find(marker, start + len(marker))
        if end == -1:
            end = len(text)
        # splitlines() copes with a missing trailing newline, which made the
        # manual newline search loop forever.
        body = text[start:end].splitlines()[1:]
        groups.append([line.strip() for line in body if line.strip()])
        start = end
    return groups


if __name__ == "__main__":
    with open("dates.txt") as file:
        text = file.read()
    print(split_date_groups(text))
Just loop over each line, check for your key that will group data, remove newlines and store each new date.
DATE_GROUP_SEPARATOR = 'DateGroup'


def parse_date_groups(lines, separator=DATE_GROUP_SEPARATOR):
    """Group date lines under the header line that precedes them.

    Args:
        lines: Iterable of strings (an open file works). Surrounding
            whitespace is stripped and blank lines are skipped.
        separator: Substring that identifies a group header line.

    Returns:
        A dict mapping each header line to the list of lines that follow it.
        If the same header occurs more than once, its dates are merged
        instead of the earlier ones being discarded.

    Raises:
        ValueError: If a data line appears before the first header.
    """
    sorted_data = {}
    last_group = None
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        if separator in line:
            last_group = line
            sorted_data.setdefault(line, [])
        elif last_group is None:
            raise ValueError(f"{line!r} appears before any {separator!r} header")
        else:
            sorted_data[last_group].append(line)
    return sorted_data


if __name__ == "__main__":
    with open('test.txt') as file:
        sorted_data = parse_date_groups(file)
    for date_group, dates in sorted_data.items():
        print(f"{date_group}: {dates}")
This first section just shows how to treat a string of data as if it came from a file. That helps if you don’t want to create the OP’s actual file but want the data to be visible in the editor.
import sys
from io import StringIO # allows treating some lines in editor as if they were from a file)
# In-memory stand-in for the OP's text file, so no real file is needed.
dat = StringIO("""DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27""")
# Iterating the buffer yields one line at a time; strip the line endings.
lines = list(map(str.strip, dat))
print(lines)
output:
['DateGroup1', '20191129', '20191127', '20191126', 'DateGroup2', '20191129', '20191127', '20191126', 'DateGroup3', '2019-12-02', 'DateGroup4', '2019-11-27', 'DateGroup5', '2019-11-27']
Now one possible way to generate your desired list of lists, while ensuring that both possible date formats are covered:
from datetime import datetime
# Accepted input layouts, tried in order; every date is emitted as YYYY-MM-DD.
_DATE_FORMATS = ('%Y%m%d', '%Y-%m-%d')


def _normalize_date(text):
    """Return *text* reformatted as ``YYYY-MM-DD``, or ``None`` if it is not a date."""
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return None


def group_dates(lines):
    """Collect the dates in *lines* into one list per group.

    Any line that matches neither date format is treated as a group header:
    it closes the group being built and opens a new, empty one.

    Args:
        lines: Iterable of already-stripped strings.

    Returns:
        A list of lists of ``YYYY-MM-DD`` strings, one inner list per group.
    """
    groups = []
    current = None  # no group is open until the first header (or date) is seen
    for line in lines:
        date = _normalize_date(line)
        if date is None:
            # Header line: store the finished group, then start a fresh one.
            if current is not None:
                groups.append(current)
            current = []
        else:
            if current is None:
                # Dates that precede any header still form a group of their own.
                current = []
            current.append(date)
    # Flush the final group; no later header will do it for us.
    if current is not None:
        groups.append(current)
    return groups


b = group_dates(lines)
b
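output (note: the first entry `['2019-11-27']` appears to be left over from an earlier run in the same interactive session; the expected result is just the five lists that follow it):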
[['2019-11-27'],
['2019-11-29', '2019-11-27', '2019-11-26'],
['2019-11-29', '2019-11-27', '2019-11-26'],
['2019-12-02'],
['2019-11-27'],
['2019-11-27']]
TTP can help to parse this text as well. Here is a sample template, together with the code to run it:
from ttp import ttp
# Sample input: the OP's file contents embedded as a string. The triple-quoted
# literal adds an empty first line and a trailing newline.
data_to_parse = """
DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27
"""
# TTP template. Judging by the output shown after this snippet, each
# "DateGroup<id>" header produces a "date_group<id>" entry under "date_groups",
# and the lines that follow it are collected into that entry's "dates" list.
# NOTE(review): the exact behaviour of to_list / joinmatches() is defined by
# TTP itself -- confirm against the TTP documentation.
ttp_template = """
<group name="date_groups.date_group{{ id }}">
DateGroup{{ id }}
{{ dates | to_list | joinmatches() }}
</group>
"""
# Build the parser from the sample data and the template, then run it.
parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()
# Request the results in JSON format and print the first element.
print(parser.result(format="json")[0])
above code would produce this output:
[
{
"date_groups": {
"date_group1": {
"dates": [
"20191129",
"20191127",
"20191126"
]
},
"date_group2": {
"dates": [
"20191129",
"20191127",
"20191126"
]
},
"date_group3": {
"dates": [
"2019-12-02"
]
},
"date_group4": {
"dates": [
"2019-11-27"
]
},
"date_group5": {
"dates": [
"2019-11-27"
]
}
}
}
]
This is my attempt to parse that text data. I deliberately chose parsec.py, a Haskell Parsec-like parser combinator library, because it reads more clearly than regular expressions, so it is easier to debug and test.
The second reason is that it gives much more flexibility in choosing the output data format.
import re
from parsec import *
# Consumes any run of whitespace, including the newlines between lines.
spaces = regex(r'\s*', re.MULTILINE)


@generate
def getHeader():
    """Parse a group header such as ``DateGroup1`` and return it as one string."""
    s1 = yield string("DateGroup")
    s2 = ''.join((yield many1(digit())))
    return s1 + s2


@generate
def getDataLine():
    """Parse one data line: a digit followed by the rest of the line.

    Stops at the first carriage return or newline, then consumes the
    whitespace that follows so the next parser starts on fresh input.
    """
    s1 = yield digit()
    s2 = ''.join((yield many1(none_of("\r\n"))))
    yield spaces
    return s1 + s2


@generate
def getChunk():
    """Parse one group: a header followed by one or more data lines.

    Returns a ``(header, data_lines)`` tuple.
    """
    yield spaces
    header = yield getHeader
    yield spaces
    dataList = yield many1(getDataLine)
    return (header, dataList)


@generate
def getData():
    """Parse the whole input as one or more groups and require end of input."""
    yield spaces
    parsedData = yield many1(getChunk)
    yield eof()
    return parsedData


inputText = """DateGroup1
20191129
20191127
20191126
DateGroup2
20191129
20191127
20191126
DateGroup3
2019-12-02
DateGroup4
2019-11-27
DateGroup5
2019-11-27"""
result = getData.parse(inputText)
for p in result:
    print(p)
Output:
('DateGroup1', ['20191129', '20191127', '20191126'])
('DateGroup2', ['20191129', '20191127', '20191126'])
('DateGroup3', ['2019-12-02'])
('DateGroup4', ['2019-11-27'])
('DateGroup5', ['2019-11-27'])