How to read, format, sort, and save a csv file, without pandas
Question:
- Given the following sample data in a file
test.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
- How can this file be parsed without using
pandas
?
- Open the file
- Format the date column into a
datetime
date formatted string
- Sort all rows by the column 0, the date column
- Save back to the same file, with a header for the date column
- With
pandas
this can be accomplished with a single (long) line of code, not including the import.
- It should be noted, using
parse_date
can be very slow, if date_parser
isn’t used.
import pandas as pd
(pd.read_csv('test.csv', header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv('test.csv', index=False))
Expected Form
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
- This question and answer are being written to fill a knowledge content gap on Stack Overflow.
- It’s very easy to use
pandas
for this task.
- It was surprisingly difficult to come up with all the necessary pieces to create a complete solution, without
pandas
.
- This should be beneficial for anyone curious about this task, and for students prohibited from using
pandas
.
- I wouldn’t mind seeing a solution with
numpy
, but the primary point of the question, is to accomplish this task with only packages from the standard library.
The three answers are all acceptable solutions to the question.
Answers:
pandas
is the easier tool for parsing and cleaning files.
- What requires 1 line of
pandas
, took 11 lines of code, and requires a for-loop
.
- This requires the following packages and functions
csv
& datetime
- Methods of File Objects:
.seek
& .truncate
- Sorting: How To
- Initially,
list()
was used to unpack the csv.reader
object, but that was removed, to update the date value, while iterating through the reader
.
- A custom key function can be supplied to
sorted
to customize the sort order, but I do not see a way to return a value from the lambda
expression.
- Originally
key=lambda row: datetime.strptime(row[0], '%Y-%m-%d')
was used, but has been removed, since the updated date column doesn’t contain month names.
- If the date column contains month names, it won’t be properly sorted, without a custom sort key.
import csv
from datetime import datetime
# open the file for reading and writing
with open('test1.csv', mode='r+', newline='') as f:
# create a reader and writer opbject
reader, writer = csv.reader(f), csv.writer(f)
data = list()
# iterate through the reader and update column 0 to a datetime date string
for row in reader:
# update column 0 to a datetime date string
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
# append the row to data
data.append(row)
# sort all of the rows, based on date, with a lambda expression
data = sorted(data, key=lambda row: row[0])
# change the stream position to the given byte offset
f.seek(0)
# truncate the file size
f.truncate()
# add a header to data
data.insert(0, ['date', 1, 2, 3, 4, 5])
# write data to the file
writer.writerows(data)
Updated test.csv
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
%time
test
import pandas
import pandas_datareader as web
# test data with 1M rows
df = web.DataReader(ticker, data_source='yahoo', start='1980-01-01', end='2020-09-27').drop(columns=['Adj Close']).reset_index().sort_values('High', ascending=False)
df.Date = df.Date.dt.strftime('%d-%b-%y')
df = pd.concat([df]*100)
df.to_csv('test.csv', index=False, header=False)
Tests
# pandas test with date_parser
%time pandas_test('test.csv')
[out]:
Wall time: 17.9 s
# pandas test without the date_parser parameter
%time pandas_test('test.csv')
[out]:
Wall time: 1min 17s
# from Paddy Alton
%time paddy('test.csv')
[out]:
Wall time: 15.9 s
# from Trenton
%time trenton('test.csv')
[out]:
Wall time: 17.7 s
# from sammywemmy with functions updated to return the correct date format
%time sammy('test.csv')
[out]:
Wall time: 22.2 s
%time sammy2('test.csv')
[out]:
Wall time: 22.2 s
Test Functions
from operator import itemgetter
import csv
import pandas as pd
from datetime import datetime
def pandas_test(file):
(pd.read_csv(file, header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv(file, index=False))
def trenton(file):
with open(file, mode='r+', newline='') as f:
reader, writer = csv.reader(f), csv.writer(f)
data = list()
for row in reader:
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
data.append(row)
data = sorted(data, key=lambda row: row[0])
f.seek(0)
f.truncate()
data.insert(0, ['date', 1, 2, 3, 4, 5])
writer.writerows(data)
def paddy(file):
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
with open(file, "r") as f:
lines = f.readlines()
records = [[value for value in line.split(",")] for line in lines]
for record in records:
record[0] = format_date(record[0])
sorted_records = sorted(records, key = lambda r: r[0])
prepared_lines = [",".join(record).strip("n") for record in sorted_records]
field_names = "date,1,2,3,4,5"
prepared_lines.insert(0, field_names)
prepared_data = "n".join(prepared_lines)
with open(file, "w") as f:
f.write(prepared_data)
def sammy(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y")).date().isoformat()
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open(file, mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
def sammy2(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y").date().isoformat()
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open(file, mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
Using as few imports as I can:
from datetime import datetime
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
# read in the CSV
with open("test.csv", "r") as file:
lines = file.readlines()
records = [[value for value in line.split(",")] for line in lines]
# reformat the first field in each record
for record in records:
record[0] = format_date(record[0])
# having formatted the dates, sort records by first (date) field:
sorted_records = sorted(records, key = lambda r: r[0])
# join values with commas once more, removing newline characters
prepared_lines = [",".join(record).strip("n") for record in sorted_records]
# create a header row
field_names = "date,1,2,3,4,5"
# prepend the header row
prepared_lines.insert(0, field_names)
prepared_data = "n".join(prepared_lines)
# write out the CSV
with open("test.csv", "w") as file:
file.write(prepared_data)
As the OP states, Pandas makes this easy; an alternative is to use the DictReader and DictWriter options; it still is more verbose than using Pandas (beauty of abstraction here, Pandas does the heavy lifting for us).
import csv
from datetime import datetime
from operator import itemgetter
with open("test.csv") as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y"))
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open("test.csv", mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
Since the fieldnames are not known ahead of time, we could use the csvreader and csvwriter options:
with open("test.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y")
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open("test.csv", mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
This sorting/formatting task can be accomplished in mlr without any coding, kind of 😉
cat nopanda.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
brew install miller
mlr --csv --ofs ',' --implicit-csv-header label date,1,2,3,4,5 then put '$date=strftime(strptime($date, "%d-%b-%y"), "%Y-%m-%d");' then sort -f date nopanda.csv
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
See documentation:
- Given the following sample data in a file
test.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
- How can this file be parsed without using
pandas
?- Open the file
- Format the date column into a
datetime
date formatted string - Sort all rows by the column 0, the date column
- Save back to the same file, with a header for the date column
- With
pandas
this can be accomplished with a single (long) line of code, not including the import.- It should be noted, using
parse_date
can be very slow, ifdate_parser
isn’t used.
- It should be noted, using
import pandas as pd
(pd.read_csv('test.csv', header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv('test.csv', index=False))
Expected Form
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
- This question and answer are being written to fill a knowledge content gap on Stack Overflow.
- It’s very easy to use
pandas
for this task. - It was surprisingly difficult to come up with all the necessary pieces to create a complete solution, without
pandas
. - This should be beneficial for anyone curious about this task, and for students prohibited from using
pandas
. - I wouldn’t mind seeing a solution with
numpy
, but the primary point of the question, is to accomplish this task with only packages from the standard library.
The three answers are all acceptable solutions to the question.
pandas
is the easier tool for parsing and cleaning files.- What requires 1 line of
pandas
, took 11 lines of code, and requires afor-loop
.
- What requires 1 line of
- This requires the following packages and functions
csv
&datetime
- Methods of File Objects:
.seek
&.truncate
- Sorting: How To
- Initially,
list()
was used to unpack thecsv.reader
object, but that was removed, to update the date value, while iterating through thereader
. - A custom key function can be supplied to
sorted
to customize the sort order, but I do not see a way to return a value from thelambda
expression.- Originally
key=lambda row: datetime.strptime(row[0], '%Y-%m-%d')
was used, but has been removed, since the updated date column doesn’t contain month names. - If the date column contains month names, it won’t be properly sorted, without a custom sort key.
- Originally
import csv
from datetime import datetime
# open the file for reading and writing
with open('test1.csv', mode='r+', newline='') as f:
# create a reader and writer opbject
reader, writer = csv.reader(f), csv.writer(f)
data = list()
# iterate through the reader and update column 0 to a datetime date string
for row in reader:
# update column 0 to a datetime date string
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
# append the row to data
data.append(row)
# sort all of the rows, based on date, with a lambda expression
data = sorted(data, key=lambda row: row[0])
# change the stream position to the given byte offset
f.seek(0)
# truncate the file size
f.truncate()
# add a header to data
data.insert(0, ['date', 1, 2, 3, 4, 5])
# write data to the file
writer.writerows(data)
Updated test.csv
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
%time
test
import pandas
import pandas_datareader as web
# test data with 1M rows
df = web.DataReader(ticker, data_source='yahoo', start='1980-01-01', end='2020-09-27').drop(columns=['Adj Close']).reset_index().sort_values('High', ascending=False)
df.Date = df.Date.dt.strftime('%d-%b-%y')
df = pd.concat([df]*100)
df.to_csv('test.csv', index=False, header=False)
Tests
# pandas test with date_parser
%time pandas_test('test.csv')
[out]:
Wall time: 17.9 s
# pandas test without the date_parser parameter
%time pandas_test('test.csv')
[out]:
Wall time: 1min 17s
# from Paddy Alton
%time paddy('test.csv')
[out]:
Wall time: 15.9 s
# from Trenton
%time trenton('test.csv')
[out]:
Wall time: 17.7 s
# from sammywemmy with functions updated to return the correct date format
%time sammy('test.csv')
[out]:
Wall time: 22.2 s
%time sammy2('test.csv')
[out]:
Wall time: 22.2 s
Test Functions
from operator import itemgetter
import csv
import pandas as pd
from datetime import datetime
def pandas_test(file):
(pd.read_csv(file, header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv(file, index=False))
def trenton(file):
with open(file, mode='r+', newline='') as f:
reader, writer = csv.reader(f), csv.writer(f)
data = list()
for row in reader:
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
data.append(row)
data = sorted(data, key=lambda row: row[0])
f.seek(0)
f.truncate()
data.insert(0, ['date', 1, 2, 3, 4, 5])
writer.writerows(data)
def paddy(file):
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
with open(file, "r") as f:
lines = f.readlines()
records = [[value for value in line.split(",")] for line in lines]
for record in records:
record[0] = format_date(record[0])
sorted_records = sorted(records, key = lambda r: r[0])
prepared_lines = [",".join(record).strip("n") for record in sorted_records]
field_names = "date,1,2,3,4,5"
prepared_lines.insert(0, field_names)
prepared_data = "n".join(prepared_lines)
with open(file, "w") as f:
f.write(prepared_data)
def sammy(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y")).date().isoformat()
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open(file, mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
def sammy2(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y").date().isoformat()
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open(file, mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
Using as few imports as I can:
from datetime import datetime
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
# read in the CSV
with open("test.csv", "r") as file:
lines = file.readlines()
records = [[value for value in line.split(",")] for line in lines]
# reformat the first field in each record
for record in records:
record[0] = format_date(record[0])
# having formatted the dates, sort records by first (date) field:
sorted_records = sorted(records, key = lambda r: r[0])
# join values with commas once more, removing newline characters
prepared_lines = [",".join(record).strip("n") for record in sorted_records]
# create a header row
field_names = "date,1,2,3,4,5"
# prepend the header row
prepared_lines.insert(0, field_names)
prepared_data = "n".join(prepared_lines)
# write out the CSV
with open("test.csv", "w") as file:
file.write(prepared_data)
As the OP states, Pandas makes this easy; an alternative is to use the DictReader and DictWriter options; it still is more verbose than using Pandas (beauty of abstraction here, Pandas does the heavy lifting for us).
import csv
from datetime import datetime
from operator import itemgetter
with open("test.csv") as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y"))
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open("test.csv", mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
Since the fieldnames are not known ahead of time, we could use the csvreader and csvwriter options:
with open("test.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y")
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open("test.csv", mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
This sorting/formatting task can be accomplished in mlr without any coding, kind of 😉
cat nopanda.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
brew install miller
mlr --csv --ofs ',' --implicit-csv-header label date,1,2,3,4,5 then put '$date=strftime(strptime($date, "%d-%b-%y"), "%Y-%m-%d");' then sort -f date nopanda.csv
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
See documentation: