How to generate a report in python containing the difference of last and first row of multiple csv files?

Question:

I have multiple csv files in the below format:

CSV File 1:

enter image description here

CSV File 2:

enter image description here

A report needs to be generated containing the difference between the last and first rows of each CSV file, as shown below:

enter image description here

The below code calculates the difference between the last and first row. How do we write the results into a separate file in the report format specified above?

def csv_subtract():
    """Walk ``csv_file_path`` and compute last row minus first row per CSV file.

    The difference is only computed here; it is not stored or written anywhere.
    """
    # only files whose extension is exactly this are processed
    extension = '.csv'
    for root, dirs_list, files_list in os.walk(csv_file_path):
        for f in files_list:
            _, suffix = os.path.splitext(f)
            if suffix != extension:
                continue
            file_name_path = os.path.join(root, f)
            df = pd.read_csv(file_name_path)
            last_row = df.iloc[len(df.index) - 1]
            first_row = df.iloc[0]
            diff_row = (last_row - first_row).to_frame()
Asked By: user2077648

||

Answers:

Using your code

def csv_subtract(csv_dir=None, report_path="path/to/report.csv"):
    """Write one report holding last-row minus first-row for every CSV in a tree.

    Args:
        csv_dir: Directory to walk recursively. When ``None`` the module-level
            ``csv_file_path`` is used, so existing zero-argument calls behave
            as before.
        report_path: Destination of the report CSV.

    Returns:
        None. The report is indexed by ("File Name", column name), where the
        file name is the CSV's path relative to the walked directory. Nothing
        is written when no CSV with at least one data row is found.
    """
    # this is the extension you want to detect
    extension = '.csv'
    root_dir = csv_file_path if csv_dir is None else csv_dir

    # Collect over the WHOLE walk. Building the dict and writing the report
    # inside the os.walk loop would overwrite the report once per directory
    # and make pd.concat raise on every directory that holds no CSV files.
    df_dict = {}
    for root, dirs_list, files_list in os.walk(root_dir):
        for f in sorted(files_list):
            if os.path.splitext(f)[-1] != extension:
                continue
            file_name_path = os.path.join(root, f)
            df = pd.read_csv(file_name_path)
            if len(df.index) == 0:
                # header-only file: there is no first/last row to subtract
                continue
            # iloc[-1] is the last row (simpler than iloc[len(df.index) - 1])
            diff_row = (df.iloc[-1] - df.iloc[0]).to_frame()
            # key on the relative path so equally named files in different
            # sub-directories do not overwrite each other
            df_dict[os.path.relpath(file_name_path, root_dir)] = diff_row

    if not df_dict:
        # pd.concat raises on an empty mapping; there is nothing to report
        return

    out = pd.concat(df_dict, names=["File Name"])
    out.to_csv(report_path)

Another Approach

Concatenate all the data upon read, groupby the file names, and calculate the differences within each group.

import numpy as np
import pandas as pd


if __name__ == "__main__":
    # reproducible stand-ins for two CSV files; each frame gets its own seed
    np.random.seed(1)
    df1 = pd.DataFrame(np.random.random(size=(5, 5)), columns=list("abcde"))
    np.random.seed(2)
    df2 = pd.DataFrame(np.random.random(size=(5, 5)), columns=list("abcde"))

    # stack both frames into one; the "file_name" index level records which
    # frame every row came from (use the real file names in your function)
    frames = [df1, df2]
    labels = ["df1", "df2"]
    df = pd.concat(frames, keys=labels, names=["file_name"])

    def last_minus_first(group):
        # difference between the final and the initial row of one group
        return group.iloc[-1] - group.iloc[0]

    # one result row per original frame
    out = df.groupby("file_name").apply(last_minus_first)

    print(out)

                  a         b        c         d         e
file_name                                                 
df1        0.383723  0.247937  0.31331  0.389990  0.729633
df2        0.069251  0.039360 -0.12154 -0.338791 -0.293208

The last step is to save this to a CSV file using pandas.DataFrame.to_csv:

# save the per-file differences held in `out` as a CSV file
out.to_csv("path/to/file.csv")

Concatenating Files Via Function

from glob import iglob

import numpy as np
import pandas as pd


def read_df_rec(path: str, fn_regex: str = ".csv"):
    # generate all *.csv paths in root_dir
    csv_paths = list(iglob(f"*{fn_regex}", root_dir=path, recursive=True))
    # generator that reads all csv files into pd.DataFrames
    df_gen = (pd.read_csv(f) for f in csv_paths)
    # concat the `df_gen` values and name each with it's respective `csv_path`
    df = pd.concat(objs=df_gen,
                   keys=csv_paths,  # assign file names here
                   names=["File Name"])  # name index here
    # group by file name and calculate difference between last and first row
    out = df.groupby("File Name").apply(lambda df: df.iloc[-1].sub(df.iloc[0]))
    return out

# build two reproducible frames of fake data (a separate seed for each)
np.random.seed(1)
df1 = pd.DataFrame(np.random.random(size=(5, 5)), columns=list("abcde"))

np.random.seed(2)
df2 = pd.DataFrame(np.random.random(size=(5, 5)), columns=list("abcde"))

# save both frames to the current directory, without the row index
for frame, target in ((df1, "df1.csv"), (df2, "df2.csv")):
    frame.to_csv(target, index=False)

# read all csv files from current directory (".")
out = read_df_rec(path=".")

print(out)
                  a         b        c         d         e
File Name                                                 
df1.csv    0.383723  0.247937  0.31331  0.389990  0.729633
df2.csv    0.069251  0.039360 -0.12154 -0.338791 -0.293208

Concatenating Files in Nested Directories using Recursion and Reducing File Names to their Stems

from glob import iglob
from pathlib import Path

import numpy as np
import pandas as pd


def read_df_rec(path: str, fn_regex: str = ".csv"):
    # recursively generate all *.csv paths starting in root 
    # directory and subdirectories
    csv_paths = list(iglob(f"**/*{fn_regex}", root_dir=path,
                           recursive=True))
    # generator that reads all csv files into pd.DataFrames
    df_gen = (pd.read_csv(f) for f in csv_paths)
    # extract path stems from csv_paths
    file_names = (Path(path).stem for path in csv_paths)
    # concat the `df_gen` values and name each with it's 
    # respective `csv_path`
    df = pd.concat(objs=df_gen,
                   keys=file_names,  # assign file names here
                   names=["File Name"])  # name index here
    # group by file name and calculate difference between 
    # last and first row
    out = df.groupby("File Name").apply(
        lambda df: df.iloc[-1].sub(df.iloc[0])
    )
    return out

# make some fake data and save to different directories
np.random.seed(1)
df1 = pd.DataFrame(
    data=np.random.random(size=(5, 5)),
    columns=list("abcde")
)

np.random.seed(2)
df2 = pd.DataFrame(
    data=np.random.random(size=(5, 5)),
    columns=list("abcde")
)

# different directory per file; create each folder first so that to_csv does
# not fail when the folder is missing
for frame, target in (
    (df1, Path("Untitled Folder/df1.csv")),
    (df2, Path("Untitled Folder 1/df2.csv")),
):
    target.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(target, index=False)

# read all csv files starting in current directory (".")
out = read_df_rec(path=".")

print(out)
                  a         b        c         d         e
File Name                                                 
df1        0.383723  0.247937  0.31331  0.389990  0.729633
df2        0.069251  0.039360 -0.12154 -0.338791 -0.293208
Answered By: Ian Thompson
Categories: questions Tags: , , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.