Python and scope

Question:

So I’ve got this program I’ve been working on and I’m stuck with the scope of it. Basically, I’m taking csv’s which are coming in from the field and scrubbing them of bad rows. One of the checks I need to write into the program is a date calculation. I have a function for this, but as my program is now, I have to call this function after I’ve already written the data to a new file. I am having a hard time wrapping my head around the scope of this problem. Here is my program:

import csv
import glob
import os
import stat
import shutil
from os import path
from datetime import datetime, timedelta


# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'

data = []

# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)


# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
    file_att = os.stat(f)[0]

    if not file_att & stat.S_IWRITE:  # if the file is read only
        os.chmod(f, stat.S_IWRITE)  # it needs to be made writeable


# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
    with open(f, newline='', encoding='utf-8') as g:  # Open file as read
        r = csv.reader(g)  # Declare read variable for list
        is_trend = next(r)[0] == 'TD'  # Initialize isTrend variable to cell A1
    if not is_trend:  # If file is not a trend file
        os.remove(f)  # Erase the file


# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles():
    directory_exists = os.path.exists(destination)

    if not directory_exists:  # If the directory does not exist
        os.makedirs(destination)  # Make the directory at the destination
    files = os.listdir(source)  # List all csv files in the source directory
    for file in files:  # FOR EACH TREND FILE IN ALL TREND FILES:
        shutil.move(os.path.join(source, file),
                    os.path.join(destination, file))  # Move files from source to destination


def CheckDates(f):
    with open(f, 'r', newline='') as src:
        row_0 = src.readline()
        tokens = row_0.strip().split(',')
        orig_time = tokens[1] + ' ' + tokens[2]
        base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')

        src.readline()

        for line in src:
            tokens = line.strip().split(',')
            row_time = datetime.strptime(tokens[0], '%m/%d/%Y %H:%M:%S')

            td = row_time - base_time

            if td < timedelta(0):
                pass


# A function which cleans the data of null and truncated rows.
def CleanUpData(f):
    IsTrendFile(f)  # check to see if file is a Trend File
    IsFileReadOnly(f)  # check to see if file is read only
    with open(f, newline='', encoding='utf-8') as g:  # open file as read
        r = csv.reader((line.replace('', '') for line in g))  # declare read variable while stripping nulls
        trend_header_tuple = next(r)  # get trend header
        machine_header_tuple = next(r)  # get machine header
        data = [line for line in r
                if len(line) == len(trend_header_tuple)
                or len(line) == len(machine_header_tuple)]
    WriteData(f, data, trend_header_tuple, machine_header_tuple)  # write the data to the file
    CheckDates(f)


# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
    with open(f, 'w', newline='') as g:  # open file as write
        w = csv.writer(g)  # declare write variable
        w.writerow(trend_header_tuple)  # write the trend header tuple to file
        w.writerow(machine_header_tuple)  # write the machine header tuple to file
        w.writerows(data)  # write rows to file


# A function which does all the work.
def DoWork():
    for f in TD_files:  # FOR ALL TREND FILES
        CleanUpData(f)  # Clean up the data
    MoveFiles()  # Move the files

How would I structure this program so that CheckDates() is part of the main with open() loop in the CleanUpData() function, or something which is equal to this? When I put CheckDates() inside of the data variable as part of the if... or statements, only a certain amount of data was written to the files. And moving the call around the program proved to be useless as well.

Here is my main:

import DataCleanupScript
import getpass


# A function which prints a welcome statement to the user.
def welcome_screen():
    print(f'Hello', get_name(), 'please wait while the script cleans the data. This may take a few moments depending'
                                ' on network speed and the amount of files.')


# A function which gets the users name.
def get_name():
    return getpass.getuser()


# Main
if __name__ != '__main__':
    pass
else:
    welcome_screen()
    DataCleanupScript.DoWork()
    print('Complete! Press Enter to continue.')
    input()

This is an example csv file with messed up data. you can see row 11 is messed up. This is supposed to be like this. Some rows come in from the field like this for some reason.

    TD,08/24/2021,14:14:08,21012,223,0,1098,0,031,810,12,01,092,048,0008,02
        Date/Time,G120010,M129000,G110100,M119030,G112070,G112080,G111030,G127020,G127030,G120020,G120030,G121020,G111040,G112010,P102000,G112020,G112040,G112090,G110050,G110060,G110070,T111100
        08/27/2021 00:00:00,75,249.75,0,0,12.61895,0,58.04886,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
        08/27/2021 00:00:02,75,249.75,0,0,12.61895,0,58.04743,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
        08/27/2021 00:00:04,75,249.75,0,0,12.61895,0,58.05036,64,87.6,1,2,5,41.5,5,686,2,239,2700,0,154,0,5
        08/31/2021 08:05:48,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:05:50,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:05:52,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:05:54,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:05:56,100,333,0,0,12.9439,0,0,0,0,0,0,5,42.5,5,0,0,233,0,1,154,0,5
                                                                                          08/31/2021 08:21:41,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:43,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:45,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:47,100,333,0,0,12.9439,0,0,0,0,0,0,1,0,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:49,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:51,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:53,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:55,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:57,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:21:59,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:01,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:03,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:05,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:07,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:09,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:11,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:13,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
        08/31/2021 08:22:15,100,333,0,0,12.9439,0,0,0,0,0,0,1,42.5,5,0,0,233,0,1,154,0,5
NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN

EDIT:

I have incorporated suggestions from the community and moved some of the hardcoded things to main(), and restructured my data variable to be more dynamic. Here is the new main().

import DataCleanupScript
import getpass
import glob
import os

# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'

# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)


# A function which prints a welcome statement to the user.
def welcome_screen():
    print(f'Hello', get_name(), 'please wait while the script cleans the data. This may take a few moments depending'
                                ' on network speed and the amount of files.')


# A function which gets the users name.
def get_name():
    return getpass.getuser()


# Main
if __name__ != '__main__':
    pass
else:
    welcome_screen()
    DataCleanupScript.DoWork(TD_files, source, destination)
    print('Complete! Press Enter to continue.')
    input()

And the functions: sorry I haven’t had time to comment any of the new code, but the new CheckDate() loops through each row of a csv, and compares the date in that row to the date in the trend_header_tuple. If the date is before the date in the tuple, it needs to strip the row. Right now I have solved the problem of the data variable running off the end of the array when the csv ended in a row of bunk information like in the example I have provided. It is not dropping the found indices though.

import csv
from datetime import datetime, timedelta
import os
import stat
import shutil


# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
    file_att = os.stat(f)[0]

    if not file_att & stat.S_IWRITE:  # If the file is read only
        os.chmod(f, stat.S_IWRITE)  # It needs to be made writeable


# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
    with open(f, newline='', encoding='utf-8') as g:  # Open file as read
        r = csv.reader(g)  # Declare read variable for list
        is_trend = next(r)[0] == 'TD'  # Initialize is_trend variable
    if not is_trend:  # If file is not a trend file
        os.remove(f)  # Erase the file


# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles(source, destination):
    directory_exists = os.path.exists(destination)

    if not directory_exists:  # If the directory does not exist
        os.makedirs(destination)  # Make the directory at the destination

    files = os.listdir(source)  # List all csv files in the source directory

    for file in files:  # FOR EACH TREND FILE IN ALL TREND FILES:
        shutil.move(os.path.join(source, file), os.path.join(destination, file))  # Move files from source to destination


def CheckDates(f, line: str, trend_header_tuple):
    row_time = datetime.strptime(line[0], '%m/%d/%Y %H:%M:%S')
    orig_time = trend_header_tuple[1] + ' ' + trend_header_tuple[2]
    base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')
    # print(f'recovered this base time: {base_time}')

    td = row_time - base_time

    line = line[0].strip()

    if td < timedelta(0):
        print('this line is before the base time:')
        print(f'    {line}')
        print(f'    {f}')
        return False
    return True


# A function which cleans the data of null and truncated rows.
def CleanUpData(f, source, destination):
    IsTrendFile(f)  # check to see if file is a Trend File
    IsFileReadOnly(f)  # check to see if file is read only
    with open(f, newline='', encoding='utf-8') as g:  # open file as read
        r = csv.reader((line.replace('', '') for line in g))  # declare read variable while stripping nulls
        trend_header_tuple = next(r)  # get trend header
        machine_header_tuple = next(r)  # get machine header
        data = []

        for line in r:
            if line:
                valid = False

                if len(line) == len(trend_header_tuple):
                    valid = True
                elif len(line) == len(machine_header_tuple):
                    valid = True

                if CheckDates(f, line, trend_header_tuple):
                    valid = True

                if valid:
                    data.append(line)

    WriteData(f, data, trend_header_tuple, machine_header_tuple)  # write the data to the file
    # MoveFiles(source, destination)

# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
    with open(f, 'w', newline='') as g:  # open file as write
        w = csv.writer(g)  # declare write variable
        w.writerow(trend_header_tuple)  # write the trend header tuple to file
        w.writerow(machine_header_tuple)  # write the machine header tuple to file
        w.writerows(data)  # write rows to file


# A function which does all the work.
def DoWork(TD_files, source, destination):
    for f in TD_files:  # FOR ALL TREND FILES
        CleanUpData(f, source, destination)  # Clean up the data
    MoveFiles(source, destination)  # Move the files

Terminal output:

C:UsersklucasPycharmProjectsDataCleanupScriptvenvScriptspython.exe C:UsersklucasPycharmProjectsDataCleanupScriptmain.py 
Hello klucas please wait while the script cleans the data. This may take a few moments depending on network speed and the amount of files.
this line is before the base time:
    08/31/1521 00:00:00
    C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files2022_09_06_08_00_10_554_IF1Rockwell-Trend_SN2231098_20210831.csv
Asked By: Kyle Lucas

||

Answers:

I don’t think I’m 100% clear on the problem, but I’ll try my best to help. For this section of your code:

with open(f, newline='', encoding='utf-8') as g:  # open file as read
    r = csv.reader((line.replace('', '') for line in g))  # declare read variable while stripping nulls
    trend_header_tuple = next(r)  # get trend header
    machine_header_tuple = next(r)  # get machine header
    data = [line for line in r
            if len(line) == len(trend_header_tuple)
            or len(line) == len(machine_header_tuple)]
WriteData(f, data, trend_header_tuple, machine_header_tuple)  # write the data to the file
CheckDates(f)

I would suggest that you instead declare data as an empty list to begin, then iterate over r and append line to data if it meets your criteria. This way, instead of using a list comprehension, you can specify individual conditions and add more advanced logic easily:

def CheckDate(line: str):
    line = line.strip() # process line however you need
    if LINE_VALIDATION_HERE:
        return True
    return False

with open(f, newline='', encoding='utf-8') as g:  # open file as read
    r = csv.reader((line.replace('', '') for line in g))  # declare read variable while stripping nulls
    trend_header_tuple = next(r)  # get trend header
    machine_header_tuple = next(r)  # get machine header
    data = []
    for line in r:
        valid = False
        
        # Trend headers
        if len(line) == len(trend_header_tuple):
            valid = True
        # Machine headers
        elif len(line) == len(machine_header_tuple):
            valid = True
        # Dates (only if valid)
        elif CheckDate(line):
            valid = True

        if valid:
            data.append(line)
WriteData(f, data, trend_header_tuple, machine_header_tuple)  # write the data to the file

This restructuring might be a bit overboard, but hopefully it makes everything easier to understand and extend in the future.

Again, I’m not exactly sure what CheckDates does, so this code assumes each line can be checked independent of the others. If this is not true, I’d suggest adding additional parameters to CheckDate.

Answered By: Raj K

The answer proved to be restructuring the data variable from list comprehension into a completely different function. This way I was able to get way more functionality out of it and in the future if I need to write more functionality into it, it will be easy to do so.

What I originally had:

data = [line for line in r
            if len(line) == len(trend_header_tuple)
            or len(line) == len(machine_header_tuple)]  # set data for file

What I ended up with:

    data = []  # initialize data list
    IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple)  # check to see if the data is valid
WriteData(f, data, trend_header_tuple, machine_header_tuple)  # write the data to the file    

And the new function which takes the place of the old data variable:

def IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple):
for line in r1:  # for each line in the trend file
    if line:  # if it is a line
        valid = False  # initialize the variable to False

        if len(line) == len(trend_header_tuple):  # if the length of the line = length of the trend_header_tuple
            valid = True  # the data is valid
        elif len(line) == len(machine_header_tuple):  # elseif the length of the line = length of the machine_header_tuple
            valid = True  # the data is valid

        valid = valid and CheckDates(f, line, trend_header_tuple)  # Check the dates next

        if valid:  # if everything checks out as valid...
            data.append(line)  # append the current line to the data variable

And the full program:

import DataCleanupScript
import getpass
import glob
import os

# path to files -- parent folder is source.
source = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptRaw Data Files'
destination = r'C:UsersklucasDesktopCurrent ProjectData Cleanup ScriptCompleted Data Files'

# list of all csv files in path
TD_files = glob.glob(os.path.join(source, "*.csv"), recursive=True)


# A function which prints a welcome statement to the user.
def welcome_screen():
    print(f'Hello', get_name(), 'please wait while the script cleans the data. 
This may take a few moments depending'
                            ' on network speed and the amount of files.nn')


# A function which gets the users name.
def get_name():
    return getpass.getuser()


# Main
if __name__ != '__main__':
    pass
else:
    welcome_screen()  # welcome the user
    DataCleanupScript.DoWork(TD_files, source, destination)  # clean the data
    print('Complete! Press Enter to continue.')  # print an ending statement
    input()  # and wait for user input

And the functions:

import csv
from datetime import datetime, timedelta
import os
import stat
import shutil


# A function to test whether a file is read only. If it is this function will change the file's attribute to writeable.
def IsFileReadOnly(f):
    file_att = os.stat(f)[0]  # get the current file attribute

    if not file_att & stat.S_IWRITE:  # if the file is read only
        os.chmod(f, stat.S_IWRITE)  # it needs to be made writeable


# A function to decide if a file is a Trend file or not. If it is not a Trend File, it is simply erased.
def IsTrendFile(f):
    with open(f, newline='', encoding='utf-8') as g:  # open file as read
        r = csv.reader(g)  # declare read variable for list
        is_trend = next(r)[0] == 'TD'  # initialize is_trend variable
    if not is_trend:  # if file is not a trend file
        os.remove(f)  # erase the file


# A function which moves files from source to destination. This function will overwrite any existing file with new data.
def MoveFiles(source, destination):
    directory_exists = os.path.exists(destination)

    if not directory_exists:  # if the directory does not exist
        os.makedirs(destination)  # make the directory at the destination

    files = os.listdir(source)  # list all csv files in the source directory

    for file in files:  # FOR EACH TREND FILE IN ALL TREND FILES:
        shutil.move(os.path.join(source, file), os.path.join(destination, file))  # move files from source to destination


# A function which compares the dates of each index of a csv to the date in the trend_header_tuple. If the row_time comes
# before base_time, the check fails.
def CheckDates(f, line: str, trend_header_tuple):
    row_time = datetime.strptime(line[0], '%m/%d/%Y %H:%M:%S')  # strip the time from each row to get a time object
    orig_time = trend_header_tuple[1] + ' ' + trend_header_tuple[2]  # concatenate the trend_header_tuple into a string
    base_time = datetime.strptime(orig_time, '%m/%d/%Y %H:%M:%S')  # get a time object from the trend_header_tuple

    td = row_time - base_time  # calculate the time difference (a negative number indicates a time before base_time)

    line = line[0].strip()  # strip the line

    if td < timedelta(0):  # if td is negative
        print(f'    This line is before the base time and has been stripped:')  # print some stuff to the user:
        print(f'    {line}')  # which line was stripped
        print(f'    {f}')  # which file it was in
        return False  # the check fails
    return True  # the check passes


# A function which cleans the data.
def CleanUpData(f):
    IsTrendFile(f)  # check to see if file is a trend file
    IsFileReadOnly(f)  # check to see if file is read only
    with open(f, newline='', encoding='utf-8') as g:  # open file as read
        r1 = csv.reader((line.replace('', '') for line in g))  # declare read variable while stripping nulls
        trend_header_tuple = next(r1)  # get trend header
        machine_header_tuple = next(r1)  # get machine header
        data = []  # initialize data list
        IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple)  # check to see if the data is valid
    WriteData(f, data, trend_header_tuple, machine_header_tuple)  # write the data to the file


# A function to check to see if data[] is valid.
def IsDataValid(f, r1, data, trend_header_tuple, machine_header_tuple):
    for line in r1:  # for each line in the trend file
        if line:  # if it is a line
            valid = False  # initialize the variable to False

            if len(line) == len(trend_header_tuple):  # if the length of the line = length of the trend_header_tuple
                valid = True  # the data is valid
            elif len(line) == len(machine_header_tuple):  # elseif the length of the line = length of the machine_header_tuple
                valid = True  # the data is valid

            valid = valid and CheckDates(f, line, trend_header_tuple)  # Check the dates next

            if valid:  # if everything checks out as valid...
                data.append(line)  # append the current line to the data variable


# A function which writes data to .csv files.
def WriteData(f, data, trend_header_tuple, machine_header_tuple):
    with open(f, 'w', newline='') as g:  # open file as write
        w = csv.writer(g)  # declare write variable
        w.writerow(trend_header_tuple)  # write the trend header to file
        w.writerow(machine_header_tuple)  # write the machine header to file
        w.writerows(data)  # write the new sanitized rows to file


# A function which does all the work.
def DoWork(TD_files, source, destination):
    for f in TD_files:  # FOR ALL TREND FILES
        CleanUpData(f)  # Clean up the data
    MoveFiles(source, destination)  # Move the files

This new function takes a few args, but returns either true or false to the valid variable. Then for each line of each csv, if the line is not valid it is simply stripped before it is appended to the variable and finally written to a new file.

Answered By: Kyle Lucas
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.