Python and scope
Question:
So I’ve got this program I’ve been working on and I’m stuck with the scope of it. Basically, I’m taking csv’s which are coming in from the field and scrubbing them of bad rows. One of the checks I need to write into the program is a date calculation. I have a function for this, but as my program is now, I have to call this function after I’ve already written the data to a new file. I am having a hard time wrapping my head around the scope of this problem. Here is my program:
import csv
import glob
import os
import stat
import shutil
from os import path
from datetime import datetime, timedelta
# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'
data = []
# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)
# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
file_att = os.stat(f)[0]
if not file_att & stat.S_IWRITE: # if the file is read only
os.chmod(f, stat.S_IWRITE) # it needs to be made writeable
# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
with open(f, newline='', encoding='utf-8') as g: # Open file as read
r = csv.reader(g) # Declare read variable for list
is_trend = next(r)[0] == 'TD' # Initialize isTrend variable to cell A1
if not is_trend: # If file is not a trend file
os.remove(f) # Erase the file
# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles():
directory_exists = os.path.exists(destination)
if not directory_exists: # If the directory does not exist
os.makedirs(destination) # Make the directory at the destination
files = os.listdir(source) # List all csv files in the source directory
for file in files: # FOR EACH TREND FILE IN ALL TREND FILES:
shutil.move(os.path.join(source, file),
os.path.join(destination, file)) # Move files from source to destination
def CheckDates(f):
with open(f, 'r', newline='') as src:
row_0 = src.readline()
tokens = row_0.strip().split(',')
orig_time = tokens[1] + ' ' + tokens[2]
base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')
src.readline()
for line in src:
tokens = line.strip().split(',')
row_time = datetime.strptime(tokens[0], '%m/%d/%Y %H:%M:%S')
td = row_time - base_time
if td < timedelta(0):
pass
# A function which cleans the data of null and truncated rows.
def CleanUpData(f):
IsTrendFile(f) # check to see if file is a Trend File
IsFileReadOnly(f) # check to see if file is read only
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = [line for line in r
if len(line) == len(trend_header_tuple)
or len(line) == len(machine_header_tuple)]
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
CheckDates(f)
# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
with open(f, 'w', newline='') as g: # open file as write
w = csv.writer(g) # declare write variable
w.writerow(trend_header_tuple) # write the trend header tuple to file
w.writerow(machine_header_tuple) # write the machine header tuple to file
w.writerows(data) # write rows to file
# A function which does all the work.
def DoWork():
for f in TD_files: # FOR ALL TREND FILES
CleanUpData(f) # Clean up the data
MoveFiles() # Move the files
How would I structure this program so that CheckDates() is part of the main with open()
loop in the CleanUpData()
function, or something which is equal to this? When I put CheckDates()
inside of the data
variable as part of the if... or
statements, only a certain amount of data was written to the files. And moving the call around the program proved to be useless as well.
Here is my main:
import DataCleanupScript
import getpass
# A function which prints a welcome statement to the user.
def welcome_screen():
print(f'Hello', get_name(), 'please wait while the script cleans the data. This may take a few moments depending'
' on network speed and the amount of files.')
# A function which gets the users name.
def get_name():
return getpass.getuser()
# Main
if __name__ != '__main__':
pass
else:
welcome_screen()
DataCleanupScript.DoWork()
print('Complete! Press Enter to continue.')
input()
This is an example csv file with messed up data. you can see row 11 is messed up. This is supposed to be like this. Some rows come in from the field like this for some reason.
TD,08/24/2021,14:14:08,21012,223,0,1098,0,031,810,12,01,092,048,0008,02
Date/Time,G120010,M129000,G110100,M119030,G112070,G112080,G111030,G127020,G127030,G120020,G120030,G121020,G111040,G112010,P102000,G112020,G112040,G112090,G110050,G110060,G110070,T111100
08/27/2021 00:00:00,75,249.75,0,0,12.61895,0,58.04886,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
08/27/2021 00:00:02,75,249.75,0,0,12.61895,0,58.04743,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
08/27/2021 00:00:04,75,249.75,0,0,12.61895,0,58.05036,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
08/31/2021 08:05:48,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:50,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:52,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:54,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:56,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:41,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:43,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:45,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:47,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:49,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:51,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:53,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:55,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:57,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:59,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:01,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:03,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:05,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:07,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:09,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:11,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:13,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:15,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN
EDIT:
I have incorporated suggestions from the community and moved some of the hardcoded things to main()
, and restructured my data
variable to be more dynamic. Here is the new main()
.
import DataCleanupScript
import getpass
import glob
import os
# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'
# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)
# A function which prints a welcome statement to the user.
def welcome_screen():
print(f'Hello', get_name(), 'please wait while the script cleans the data. This may take a few moments depending'
' on network speed and the amount of files.')
# A function which gets the users name.
def get_name():
return getpass.getuser()
# Main
if __name__ != '__main__':
pass
else:
welcome_screen()
DataCleanupScript.DoWork(TD_files, source, destination)
print('Complete! Press Enter to continue.')
input()
And the functions: sorry I haven’t had time to comment any of the new code, but the new CheckDate()
loops through each row of a csv, and compares the date in that row to the date in the trend_header_tuple
. If the date is before the date in the tuple, it needs to strip the row. Right now I have solved the problem of the data
variable running off the end of the array when the csv ended in a row of bunk information like in the example I have provided. It is not dropping the found indices though.
import csv
from datetime import datetime, timedelta
import os
import stat
import shutil
# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
file_att = os.stat(f)[0]
if not file_att & stat.S_IWRITE: # If the file is read only
os.chmod(f, stat.S_IWRITE) # It needs to be made writeable
# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
with open(f, newline='', encoding='utf-8') as g: # Open file as read
r = csv.reader(g) # Declare read variable for list
is_trend = next(r)[0] == 'TD' # Initialize is_trend variable
if not is_trend: # If file is not a trend file
os.remove(f) # Erase the file
# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles(source, destination):
directory_exists = os.path.exists(destination)
if not directory_exists: # If the directory does not exist
os.makedirs(destination) # Make the directory at the destination
files = os.listdir(source) # List all csv files in the source directory
for file in files: # FOR EACH TREND FILE IN ALL TREND FILES:
shutil.move(os.path.join(source, file), os.path.join(destination, file)) # Move files from source to destination
def CheckDates(f, line: str, trend_header_tuple):
row_time = datetime.strptime(line[0], '%m/%d/%Y %H:%M:%S')
orig_time = trend_header_tuple[1] + ' ' + trend_header_tuple[2]
base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')
# print(f'recovered this base time: {base_time}')
td = row_time - base_time
line = line[0].strip()
if td < timedelta(0):
print('this line is before the base time:')
print(f' {line}')
print(f' {f}')
return False
return True
# A function which cleans the data of null and truncated rows.
def CleanUpData(f, source, destination):
IsTrendFile(f) # check to see if file is a Trend File
IsFileReadOnly(f) # check to see if file is read only
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = []
for line in r:
if line:
valid = False
if len(line) == len(trend_header_tuple):
valid = True
elif len(line) == len(machine_header_tuple):
valid = True
if CheckDates(f, line, trend_header_tuple):
valid = True
if valid:
data.append(line)
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
# MoveFiles(source, destination)
# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
with open(f, 'w', newline='') as g: # open file as write
w = csv.writer(g) # declare write variable
w.writerow(trend_header_tuple) # write the trend header tuple to file
w.writerow(machine_header_tuple) # write the machine header tuple to file
w.writerows(data) # write rows to file
# A function which does all the work.
def DoWork(TD_files, source, destination):
for f in TD_files: # FOR ALL TREND FILES
CleanUpData(f, source, destination) # Clean up the data
MoveFiles(source, destination) # Move the files
Terminal output:
C:UsersklucasPycharmProjectsDataCleanupScriptvenvScriptspython.exe C:UsersklucasPycharmProjectsDataCleanupScriptmain.py
Hello klucas please wait while the script cleans the data. This may take a few moments depending on network speed and the amount of files.
this line is before the base time:
08/31/1521 00:00:00
C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files2022_09_06_08_00_10_554_IF1Rockwell-Trend_SN2231098_20210831.csv
Answers:
I don’t think I’m 100% clear on the problem, but I’ll try my best to help. For this section of your code:
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = [line for line in r
if len(line) == len(trend_header_tuple)
or len(line) == len(machine_header_tuple)]
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
CheckDates(f)
I would suggest that you instead declare data
as an empty list to begin, then iterate over r
and append line
to data
if it meets your criteria. This way, instead of using a list comprehension, you can specify individual conditions and add more advanced logic easily:
def CheckDate(line: str):
line = line.strip() # process line however you need
if LINE_VALIDATION_HERE:
return True
return False
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = []
for line in r:
valid = False
# Trend headers
if len(line) == len(trend_header_tuple):
valid = True
# Machine headers
elif len(line) == len(machine_header_tuple):
valid = True
# Dates (only if valid)
elif CheckDate(line):
valid = True
if valid:
data.append(line)
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
This restructuring might be a bit overboard, but hopefully it makes everything easier to understand and extend in the future.
Again, I’m not exactly sure what CheckDates
does, so this code assumes each line can be checked independent of the others. If this is not true, I’d suggest adding additional parameters to CheckDate
.
The answer proved to be restructuring the data
variable from list comprehension into a completely different function. This way I was able to get way more functionality out of it and in the future if I need to write more functionality into it, it will be easy to do so.
What I originally had:
data = [line for line in r
if len(line) == len(trend_header_tuple)
or len(line) == len(machine_header_tuple)] # set data for file
What I ended up with:
data = [] # initialize data list
IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple) # check to see if the data is valid
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
And the new function which takes the place of the old data
variable:
def IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple):
for line in r1: # for each line in the trend file
if line: # if it is a line
valid = False # initialize the variable to False
if len(line) == len(trend_header_tuple): # if the length of the line = length of the trend_header_tuple
valid = True # the data is valid
elif len(line) == len(machine_header_tuple): # elseif the length of the line = length of the machine_header_tuple
valid = True # the data is valid
valid = valid and CheckDates(f, line, trend_header_tuple) # Check the dates next
if valid: # if everything checks out as valid...
data.append(line) # append the current line to the data variable
And the full program:
import DataCleanupScript
import getpass
import glob
import os
# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'
# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)
# A function which prints a welcome statement to the user.
def welcome_screen():
print(f'Hello', get_name(), 'please wait while the script cleans the data.
This may take a few moments depending'
' on network speed and the amount of files.nn')
# A function which gets the users name.
def get_name():
return getpass.getuser()
# Main
if __name__ != '__main__':
pass
else:
welcome_screen() # welcome the user
DataCleanupScript.DoWork(TD_files, source, destination) # clean the data
print('Complete! Press Enter to continue.') # print an ending statement
input() # and wait for user input
And the functions:
import csv
from datetime import datetime, timedelta
import os
import stat
import shutil
# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
file_att = os.stat(f)[0] # get the current file attribute
if not file_att & stat.S_IWRITE: # if the file is read only
os.chmod(f, stat.S_IWRITE) # it needs to be made writeable
# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader(g) # declare read variable for list
is_trend = next(r)[0] == 'TD' # initialize is_trend variable
if not is_trend: # if file is not a trend file
os.remove(f) # erase the file
# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles(source, destination):
directory_exists = os.path.exists(destination)
if not directory_exists: # if the directory does not exist
os.makedirs(destination) # make the directory at the destination
files = os.listdir(source) # list all csv files in the source directory
for file in files: # FOR EACH TREND FILE IN ALL TREND FILES:
shutil.move(os.path.join(source, file), os.path.join(destination, file)) # move files from source to destination
# A function which compares the dates of each index of a csv to the date in the trend_header_tuple. If the row_time comes
# before base_time, the check fails.
def CheckDates(f, line: str, trend_header_tuple):
row_time = datetime.strptime(line[0], '%m/%d/%Y %H:%M:%S') # strip the time from each row to get a time object
orig_time = trend_header_tuple[1] + ' ' + trend_header_tuple[2] # concatenate the trend_header_tuple into a string
base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S') # get a time object from the trend_header_tuple
td = row_time - base_time # calculate the time difference (a negative number indicates a time before base_time)
line = line[0].strip() # strip the line
if td < timedelta(0): # if td is negative
print(f' This line is before the base time and has been stripped:') # print some stuff to the user:
print(f' {line}') # which line was stripped
print(f' {f}') # which file it was in
return False # the check fails
return True # the check passes
# A function which cleans the data.
def CleanUpData(f):
IsTrendFile(f) # check to see if file is a trend file
IsFileReadOnly(f) # check to see if file is read only
with open(f, newline='', encoding='utf-8') as g: # open file as read
r1 = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r1) # get trend header
machine_header_tuple = next(r1) # get machine header
data = [] # initialize data list
IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple) # check to see if the data is valid
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
# A function to check to see if data[] is valid.
def IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple):
for line in r1: # for each line in the trend file
if line: # if it is a line
valid = False # initialize the variable to False
if len(line) == len(trend_header_tuple): # if the length of the line = length of the trend_header_tuple
valid = True # the data is valid
elif len(line) == len(machine_header_tuple): # elseif the length of the line = length of the machine_header_tuple
valid = True # the data is valid
valid = valid and CheckDates(f, line, trend_header_tuple) # Check the dates next
if valid: # if everything checks out as valid...
data.append(line) # append the current line to the data variable
# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
with open(f, 'w', newline='') as g: # open file as write
w = csv.writer(g) # declare write variable
w.writerow(trend_header_tuple) # write the trend header to file
w.writerow(machine_header_tuple) # write the machine header to file
w.writerows(data) # write the new sanitized rows to file
# A function which does all the work.
def DoWork(TD_files, source, destination):
for f in TD_files: # FOR ALL TREND FILES
CleanUpData(f) # Clean up the data
MoveFiles(source, destination) # Move the files
This new function takes a few args, but returns either true or false to the valid
variable. Then for each line of each csv, if the line is not valid
it is simply stripped before it is appended to the variable and finally written to a new file.
So I’ve got this program I’ve been working on and I’m stuck with the scope of it. Basically, I’m taking csv’s which are coming in from the field and scrubbing them of bad rows. One of the checks I need to write into the program is a date calculation. I have a function for this, but as my program is now, I have to call this function after I’ve already written the data to a new file. I am having a hard time wrapping my head around the scope of this problem. Here is my program:
import csv
import glob
import os
import stat
import shutil
from os import path
from datetime import datetime, timedelta
# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'
data = []
# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)
# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
file_att = os.stat(f)[0]
if not file_att & stat.S_IWRITE: # if the file is read only
os.chmod(f, stat.S_IWRITE) # it needs to be made writeable
# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
with open(f, newline='', encoding='utf-8') as g: # Open file as read
r = csv.reader(g) # Declare read variable for list
is_trend = next(r)[0] == 'TD' # Initialize isTrend variable to cell A1
if not is_trend: # If file is not a trend file
os.remove(f) # Erase the file
# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles():
directory_exists = os.path.exists(destination)
if not directory_exists: # If the directory does not exist
os.makedirs(destination) # Make the directory at the destination
files = os.listdir(source) # List all csv files in the source directory
for file in files: # FOR EACH TREND FILE IN ALL TREND FILES:
shutil.move(os.path.join(source, file),
os.path.join(destination, file)) # Move files from source to destination
def CheckDates(f):
with open(f, 'r', newline='') as src:
row_0 = src.readline()
tokens = row_0.strip().split(',')
orig_time = tokens[1] + ' ' + tokens[2]
base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')
src.readline()
for line in src:
tokens = line.strip().split(',')
row_time = datetime.strptime(tokens[0], '%m/%d/%Y %H:%M:%S')
td = row_time - base_time
if td < timedelta(0):
pass
# A function which cleans the data of null and truncated rows.
def CleanUpData(f):
IsTrendFile(f) # check to see if file is a Trend File
IsFileReadOnly(f) # check to see if file is read only
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = [line for line in r
if len(line) == len(trend_header_tuple)
or len(line) == len(machine_header_tuple)]
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
CheckDates(f)
# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
with open(f, 'w', newline='') as g: # open file as write
w = csv.writer(g) # declare write variable
w.writerow(trend_header_tuple) # write the trend header tuple to file
w.writerow(machine_header_tuple) # write the machine header tuple to file
w.writerows(data) # write rows to file
# A function which does all the work.
def DoWork():
for f in TD_files: # FOR ALL TREND FILES
CleanUpData(f) # Clean up the data
MoveFiles() # Move the files
How would I structure this program so that CheckDates() is part of the main with open()
loop in the CleanUpData()
function, or something which is equal to this? When I put CheckDates()
inside of the data
variable as part of the if... or
statements, only a certain amount of data was written to the files. And moving the call around the program proved to be useless as well.
Here is my main:
import DataCleanupScript
import getpass
# A function which prints a welcome statement to the user.
def welcome_screen():
print(f'Hello', get_name(), 'please wait while the script cleans the data. This may take a few moments depending'
' on network speed and the amount of files.')
# A function which gets the users name.
def get_name():
return getpass.getuser()
# Main
if __name__ != '__main__':
pass
else:
welcome_screen()
DataCleanupScript.DoWork()
print('Complete! Press Enter to continue.')
input()
This is an example csv file with messed up data. you can see row 11 is messed up. This is supposed to be like this. Some rows come in from the field like this for some reason.
TD,08/24/2021,14:14:08,21012,223,0,1098,0,031,810,12,01,092,048,0008,02
Date/Time,G120010,M129000,G110100,M119030,G112070,G112080,G111030,G127020,G127030,G120020,G120030,G121020,G111040,G112010,P102000,G112020,G112040,G112090,G110050,G110060,G110070,T111100
08/27/2021 00:00:00,75,249.75,0,0,12.61895,0,58.04886,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
08/27/2021 00:00:02,75,249.75,0,0,12.61895,0,58.04743,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
08/27/2021 00:00:04,75,249.75,0,0,12.61895,0,58.05036,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
08/31/2021 08:05:48,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:50,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:52,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:54,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:05:56,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:41,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:43,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:45,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:47,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:49,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:51,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:53,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:55,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:57,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:21:59,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:01,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:03,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:05,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:07,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:09,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:11,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:13,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
08/31/2021 08:22:15,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN
EDIT:
I have incorporated suggestions from the community and moved some of the hardcoded things to main()
, and restructured my data
variable to be more dynamic. Here is the new main()
.
import DataCleanupScript
import getpass
import glob
import os
# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'
# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)
# A function which prints a welcome statement to the user.
def welcome_screen():
print(f'Hello', get_name(), 'please wait while the script cleans the data. This may take a few moments depending'
' on network speed and the amount of files.')
# A function which gets the users name.
def get_name():
return getpass.getuser()
# Main
if __name__ != '__main__':
pass
else:
welcome_screen()
DataCleanupScript.DoWork(TD_files, source, destination)
print('Complete! Press Enter to continue.')
input()
And the functions: sorry I haven’t had time to comment any of the new code, but the new CheckDate()
loops through each row of a csv, and compares the date in that row to the date in the trend_header_tuple
. If the date is before the date in the tuple, it needs to strip the row. Right now I have solved the problem of the data
variable running off the end of the array when the csv ended in a row of bunk information like in the example I have provided. It is not dropping the found indices though.
import csv
from datetime import datetime, timedelta
import os
import stat
import shutil
# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
file_att = os.stat(f)[0]
if not file_att & stat.S_IWRITE: # If the file is read only
os.chmod(f, stat.S_IWRITE) # It needs to be made writeable
# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
with open(f, newline='', encoding='utf-8') as g: # Open file as read
r = csv.reader(g) # Declare read variable for list
is_trend = next(r)[0] == 'TD' # Initialize is_trend variable
if not is_trend: # If file is not a trend file
os.remove(f) # Erase the file
# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles(source, destination):
directory_exists = os.path.exists(destination)
if not directory_exists: # If the directory does not exist
os.makedirs(destination) # Make the directory at the destination
files = os.listdir(source) # List all csv files in the source directory
for file in files: # FOR EACH TREND FILE IN ALL TREND FILES:
shutil.move(os.path.join(source, file), os.path.join(destination, file)) # Move files from source to destination
def CheckDates(f, line: str, trend_header_tuple):
row_time = datetime.strptime(line[0], '%m/%d/%Y %H:%M:%S')
orig_time = trend_header_tuple[1] + ' ' + trend_header_tuple[2]
base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')
# print(f'recovered this base time: {base_time}')
td = row_time - base_time
line = line[0].strip()
if td < timedelta(0):
print('this line is before the base time:')
print(f' {line}')
print(f' {f}')
return False
return True
# A function which cleans the data of null and truncated rows.
def CleanUpData(f, source, destination):
IsTrendFile(f) # check to see if file is a Trend File
IsFileReadOnly(f) # check to see if file is read only
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = []
for line in r:
if line:
valid = False
if len(line) == len(trend_header_tuple):
valid = True
elif len(line) == len(machine_header_tuple):
valid = True
if CheckDates(f, line, trend_header_tuple):
valid = True
if valid:
data.append(line)
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
# MoveFiles(source, destination)
# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
with open(f, 'w', newline='') as g: # open file as write
w = csv.writer(g) # declare write variable
w.writerow(trend_header_tuple) # write the trend header tuple to file
w.writerow(machine_header_tuple) # write the machine header tuple to file
w.writerows(data) # write rows to file
# A function which does all the work.
def DoWork(TD_files, source, destination):
for f in TD_files: # FOR ALL TREND FILES
CleanUpData(f, source, destination) # Clean up the data
MoveFiles(source, destination) # Move the files
Terminal output:
C:UsersklucasPycharmProjectsDataCleanupScriptvenvScriptspython.exe C:UsersklucasPycharmProjectsDataCleanupScriptmain.py
Hello klucas please wait while the script cleans the data. This may take a few moments depending on network speed and the amount of files.
this line is before the base time:
08/31/1521 00:00:00
C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files2022_09_06_08_00_10_554_IF1Rockwell-Trend_SN2231098_20210831.csv
I don’t think I’m 100% clear on the problem, but I’ll try my best to help. For this section of your code:
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = [line for line in r
if len(line) == len(trend_header_tuple)
or len(line) == len(machine_header_tuple)]
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
CheckDates(f)
I would suggest that you instead declare data
as an empty list to begin, then iterate over r
and append line
to data
if it meets your criteria. This way, instead of using a list comprehension, you can specify individual conditions and add more advanced logic easily:
def CheckDate(line: str):
line = line.strip() # process line however you need
if LINE_VALIDATION_HERE:
return True
return False
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r) # get trend header
machine_header_tuple = next(r) # get machine header
data = []
for line in r:
valid = False
# Trend headers
if len(line) == len(trend_header_tuple):
valid = True
# Machine headers
elif len(line) == len(machine_header_tuple):
valid = True
# Dates (only if valid)
elif CheckDate(line):
valid = True
if valid:
data.append(line)
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
This restructuring might be a bit overboard, but hopefully it makes everything easier to understand and extend in the future.
Again, I’m not exactly sure what CheckDates
does, so this code assumes each line can be checked independent of the others. If this is not true, I’d suggest adding additional parameters to CheckDate
.
The answer proved to be restructuring the data
variable from list comprehension into a completely different function. This way I was able to get way more functionality out of it and in the future if I need to write more functionality into it, it will be easy to do so.
What I originally had:
data = [line for line in r
if len(line) == len(trend_header_tuple)
or len(line) == len(machine_header_tuple)] # set data for file
What I ended up with:
data = [] # initialize data list
IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple) # check to see if the data is valid
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
And the new function which takes the place of the old data
variable:
def IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple):
for line in r1: # for each line in the trend file
if line: # if it is a line
valid = False # initialize the variable to False
if len(line) == len(trend_header_tuple): # if the length of the line = length of the trend_header_tuple
valid = True # the data is valid
elif len(line) == len(machine_header_tuple): # elseif the length of the line = length of the machine_header_tuple
valid = True # the data is valid
valid = valid and CheckDates(f, line, trend_header_tuple) # Check the dates next
if valid: # if everything checks out as valid...
data.append(line) # append the current line to the data variable
And the full program:
import DataCleanupScript
import getpass
import glob
import os
# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'
# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)
# A function which prints a welcome statement to the user.
def welcome_screen():
print(f'Hello', get_name(), 'please wait while the script cleans the data.
This may take a few moments depending'
' on network speed and the amount of files.nn')
# A function which gets the users name.
def get_name():
return getpass.getuser()
# Main
if __name__ != '__main__':
pass
else:
welcome_screen() # welcome the user
DataCleanupScript.DoWork(TD_files, source, destination) # clean the data
print('Complete! Press Enter to continue.') # print an ending statement
input() # and wait for user input
And the functions:
import csv
from datetime import datetime, timedelta
import os
import stat
import shutil
# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
file_att = os.stat(f)[0] # get the current file attribute
if not file_att & stat.S_IWRITE: # if the file is read only
os.chmod(f, stat.S_IWRITE) # it needs to be made writeable
# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
with open(f, newline='', encoding='utf-8') as g: # open file as read
r = csv.reader(g) # declare read variable for list
is_trend = next(r)[0] == 'TD' # initialize is_trend variable
if not is_trend: # if file is not a trend file
os.remove(f) # erase the file
# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles(source, destination):
directory_exists = os.path.exists(destination)
if not directory_exists: # if the directory does not exist
os.makedirs(destination) # make the directory at the destination
files = os.listdir(source) # list all csv files in the source directory
for file in files: # FOR EACH TREND FILE IN ALL TREND FILES:
shutil.move(os.path.join(source, file), os.path.join(destination, file)) # move files from source to destination
# A function which compares the dates of each index of a csv to the date in the trend_header_tuple. If the row_time comes
# before base_time, the check fails.
def CheckDates(f, line: str, trend_header_tuple):
row_time = datetime.strptime(line[0], '%m/%d/%Y %H:%M:%S') # strip the time from each row to get a time object
orig_time = trend_header_tuple[1] + ' ' + trend_header_tuple[2] # concatenate the trend_header_tuple into a string
base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S') # get a time object from the trend_header_tuple
td = row_time - base_time # calculate the time difference (a negative number indicates a time before base_time)
line = line[0].strip() # strip the line
if td < timedelta(0): # if td is negative
print(f' This line is before the base time and has been stripped:') # print some stuff to the user:
print(f' {line}') # which line was stripped
print(f' {f}') # which file it was in
return False # the check fails
return True # the check passes
# A function which cleans the data.
def CleanUpData(f):
IsTrendFile(f) # check to see if file is a trend file
IsFileReadOnly(f) # check to see if file is read only
with open(f, newline='', encoding='utf-8') as g: # open file as read
r1 = csv.reader((line.replace(' ', '') for line in g)) # declare read variable while stripping nulls
trend_header_tuple = next(r1) # get trend header
machine_header_tuple = next(r1) # get machine header
data = [] # initialize data list
IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple) # check to see if the data is valid
WriteData(f, data, trend_header_tuple, machine_header_tuple) # write the data to the file
# A function to check to see if data[] is valid.
def IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple):
for line in r1: # for each line in the trend file
if line: # if it is a line
valid = False # initialize the variable to False
if len(line) == len(trend_header_tuple): # if the length of the line = length of the trend_header_tuple
valid = True # the data is valid
elif len(line) == len(machine_header_tuple): # elseif the length of the line = length of the machine_header_tuple
valid = True # the data is valid
valid = valid and CheckDates(f, line, trend_header_tuple) # Check the dates next
if valid: # if everything checks out as valid...
data.append(line) # append the current line to the data variable
# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
with open(f, 'w', newline='') as g: # open file as write
w = csv.writer(g) # declare write variable
w.writerow(trend_header_tuple) # write the trend header to file
w.writerow(machine_header_tuple) # write the machine header to file
w.writerows(data) # write the new sanitized rows to file
# A function which does all the work.
def DoWork(TD_files, source, destination):
for f in TD_files: # FOR ALL TREND FILES
CleanUpData(f) # Clean up the data
MoveFiles(source, destination) # Move the files
This new function takes a few args, but returns either true or false to the valid
variable. Then for each line of each csv, if the line is not valid
it is simply stripped before it is appended to the variable and finally written to a new file.