Calculate statistics

Question

1

If I have a number of files like this:

Inside each folder is 3 more like this:

Now inside each of these folders is a .txt file that looks like this:

For each of the .txt files I need to get the value from the 6th column in the file which I have circled in red and I am only interested in the lines that contain cope1, cope2, cope3, cope4 and cope5 at the start (highlighted in blue). Everything else can be ignored.

PROBLEM: I also need to write a script which can perform the following tasks:

Work out how many participants you have
Read in the relevant data from each of the files and store this in a
sensible data structure
Calculate the mean and standard deviation across subjects for each ROI and
stimulus class (you will therefore end up with fifteen means and standard
deviation)
Print out the mean, standard deviation and number of samples which were used
to calculate these to the screen as if it were a CSV file (with a header).

example of output:
see image

I managed to complete until the mean part but now I am not sure how to include the standard deviation and the number of samples.

this is the output I have so far:

{'ffa': {'cope1': 0.6525,
         'cope2': 0.4146,
         'cope3': 0.5896,
         'cope4': 0.1521,
         'cope5': 0.5317},
 'lingual': {'cope1': -0.08865060000000001,
             'cope2': -0.150985,
             'cope3': -0.162005,
             'cope4': -0.130845,
             'cope5': -0.126411},
 'ppa': {'cope1': 0.74836,
         'cope2': 0.9444,
         'cope3': 0.300482,
         'cope4': 1.12435,
         'cope5': 0.8332200000000001}}

and now i need to add next to the mean the st. deviation and number of samples

I also need to convert each cope type to it’s correspondent

cope1:  Bottle:
cope2: Chair:
cope3: Face: 
cope4: House: 
cope5: Shoe:

this is all the coding I have so far

import os
import csv
import pprint
import statistics

def main():
    values = {}
    ffaResults = {}
    lingualResults = {}
    ppaResults = {}

    dir = os.path.join("pin-assessment1-master", "roi_data")
    subdirs = os.listdir(dir)
    for subdir in subdirs:
        subdirpath = os.path.join(dir, subdir)
        subsubdirs = os.listdir(subdirpath)
        for subsubdir in subsubdirs:

            if subsubdir == "ffa":
                dirpath = os.path.join(subdirpath, subsubdir)
                files = os.listdir(dirpath)
                for filename in files:
                    path = os.path.join(dirpath, filename)
                    with open(path, "r") as f:
                        content = csv.reader(f, delimiter=" ")
                        for row in content:
                            if "cope" in row[1]:
                                name = row[1].split("/")[1]
                                if not name in values:
                                    ffaResults[name] = [float(row[6])]
                                else:
                                    ffaResults[name].append(float(row[6]))  

            if subsubdir == "lingual_gyrus":
                dirpath = os.path.join(subdirpath, subsubdir)
                files = os.listdir(dirpath)
                for filename in files:
                    path = os.path.join(dirpath, filename)
                    with open(path, "r") as f:
                        content = csv.reader(f, delimiter=" ")
                        for row in content:
                            if "cope" in row[1]:
                                name = row[1].split("/")[1]
                                if not name in lingualResults:
                                    lingualResults[name] = [float(row[6])]
                                else:
                                    lingualResults[name].append(float(row[6]))

            if subsubdir == "ppa":
                dirpath = os.path.join(subdirpath, subsubdir)
                files = os.listdir(dirpath)
                for filename in files:
                    path = os.path.join(dirpath, filename)
                    with open(path, "r") as f:
                        content = csv.reader(f, delimiter=" ")
                        for row in content:
                            if "cope" in row[1]:
                                name = row[1].split("/")[1]
                                if not name in ppaResults:
                                    ppaResults[name] = [float(row[6])]
                                else:
                                    ppaResults[name].append(float(row[6]))
    res = {"ffa": {}, "lingual": {}, "ppa": {}}
    for k in ffaResults:
        res["ffa"][k] = statistics.mean(ffaResults[k]) 
    for k in lingualResults:
        res["lingual"][k] = statistics.mean(lingualResults[k]) 
    for k in ppaResults:
        res["ppa"][k] = statistics.mean(ppaResults[k]) 

    pprint.pprint(res)

if __name__ == "__main__":
    main()

Asked By: Sophia

||

Source

Answer 1

Try use this code:

import os
import pandas as pd

li = []

# Traverses thru the root folder 'roi_data' tree and opens .txt files.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        # Opens .txt file as dataFrame and uses the second and the sixth columns.
        file_path = os.path.join(root, name)
        df = pd.read_csv(file_path, sep=' ', usecols=[1, 5], names=['cope', 'value'])
        
        # Filters out rows except those which contains 'cope'.
        # Adds ROI columns based on the file dir.
        df = df[df.iloc[:, 0].str.contains('cope')]
        df['roi'] = root.split("/")[-1]
        
        # Converts the column with the values into float.
        # Groups data by roi, cope and finds mean, std, count.
        df['value'] = df['value'].astype(float)
        
        # Adds files path to distinguish for which file the calculation is done.
        # Appends the dataframe to the list.
        df['file_path'] = file_path
        li.append(df)

# Concatenates all extracted dataframes.
combined_df = pd.concat(li)      

# Replaces values based on the dictionary.
di = {
    'stats/cope1': 'Bottle',
    'stats/cope2': 'Chair',
    'stats/cope3': 'Face',
    'stats/cope4': 'House',
    'stats/cope5': 'Shoe'
}
combined_df.replace(di, inplace=True)

# Aggregates the combined data and calculates the 'mean', 'std', 'count'.
# Saves the results into 'result.csv' and prints out the results.
result_df = combined_df.groupby(by=['roi', 'cope']).agg(['mean', 'std', 'count'])
result_df.to_csv('result.csv', sep='|')
print(result_df.to_string())

Output:

roi	cope	mean	std	count
ffa	Bottle	0.76427	0.36723498396046694	10
ffa	Chair	0.7036800000000001	0.40113803609231574	10
ffa	Face	1.0842100000000001	0.39293685511938314	10
ffa	House	0.511365	0.394306610851392	10
ffa	Shoe	0.92214	0.48974865707943616	10
lingual_gyrus	Bottle	-0.004992799999999994	0.26929961480881803	10
lingual_gyrus	Chair	0.0005299999999999909	0.35327198121434866	10
lingual_gyrus	Face	-0.004019999999999993	0.3000928154569664	10
lingual_gyrus	House	0.018081	0.3216293944709932	10
lingual_gyrus	Shoe	-0.002994999999999997	0.23756639419328654	10
ppa	Bottle	0.7706500000000001	0.49211155521667826	10
ppa	Chair	0.9696400000000001	0.44630525229065166	10
ppa	Face	0.327822	0.29263178903036335	10
ppa	House	1.14463	0.43584880673615856	10
ppa	Shoe	0.8539999999999999	0.5421760148799568	10

Numpy implementation:

import os
import numpy as np

li = []

# Traverses thru the root folder 'roi_data' tree and opens .txt files.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        # Opens .txt file as numpy array and uses the second and the sixth columns.
        file_path = os.path.join(root, name)
        arr = np.loadtxt(file_path, delimiter=' ', usecols=[1, 5], dtype=str)
        
        # Filters out rows except those which contains 'cope'.
        # Adds ROI columns based on the file dir.
        arr = arr[np.char.startswith(arr[:, 0], 'stats/cope')]
        roi = np.full(fill_value=root.split("/")[-1], shape=(5, 1))
        arr = np.concatenate((roi, arr), axis=1)
        
        # Adds files path to distinguish for which file the calculation is done.
        # Appends the array to the list.
        file_path = np.full(fill_value=file_path, shape=(5, 1))
        arr = np.concatenate((arr, file_path), axis=1)
        li.append(arr)

# Concatenates all extracted arrays.
# Calculates the requested metrics and builds the result_di
combined_arr = np.array(li).reshape((-1, 4))
groups = (np.char.array(combined_arr[:, 0])
          + '_' + np.char.array(combined_arr[:, 1])).reshape((-1, 1))
combined_arr = np.concatenate((groups, combined_arr), axis=1)
result_di = dict()
for group in set(combined_arr[:, 0]):
    group_slice = combined_arr[combined_arr[:, 0] == group]
    values = (group_slice[:, 3].astype(float).mean(), 
              group_slice[:, 3].astype(np.float64).std(ddof=1),
              group_slice[:, 3].astype(np.float64).shape[0])
    result_di[group] = values

result_di =  dict(sorted(result_di.items()))
result_di

{'ffa_stats/cope1': (0.76427, 0.36723498396046694, 10),
 'ffa_stats/cope2': (0.7036800000000001, 0.4011380360923157, 10),
 'ffa_stats/cope3': (1.0842100000000001, 0.39293685511938314, 10),
 'ffa_stats/cope4': (0.511365, 0.394306610851392, 10),
 'ffa_stats/cope5': (0.92214, 0.4897486570794361, 10),
 'lingual_gyrus_stats/cope1': (-0.004992799999999996, 0.26929961480881803, 10),
 'lingual_gyrus_stats/cope2': (0.0005299999999999909, 0.35327198121434866, 10),
 'lingual_gyrus_stats/cope3': (-0.004019999999999996, 0.3000928154569664, 10),
 'lingual_gyrus_stats/cope4': (0.018081000000000003, 0.3216293944709932, 10),
 'lingual_gyrus_stats/cope5': (-0.0029950000000000024,
  0.23756639419328654,
  10),
 'ppa_stats/cope1': (0.77065, 0.49211155521667826, 10),
 'ppa_stats/cope2': (0.9696400000000001, 0.44630525229065166, 10),
 'ppa_stats/cope3': (0.32782199999999995, 0.2926317890303634, 10),
 'ppa_stats/cope4': (1.1446299999999998, 0.43584880673615867, 10),
 'ppa_stats/cope5': (0.8539999999999999, 0.5421760148799568, 10)}

Answered By: Sergey Sakharovskiy

Calculate statistics

Question:

Answers:

Numpy implementation: