Calculate statistics

Question:

1

If I have a number of files like this:

see image

Inside each folder is 3 more like this:

see image

Now inside each of these folders is a .txt file that looks like this:

see image

For each of the .txt files I need to get the value from the 6th column in the file which I have circled in red and I am only interested in the lines that contain cope1, cope2, cope3, cope4 and cope5 at the start (highlighted in blue). Everything else can be ignored.

PROBLEM: I also need to write a script which can perform the following tasks:

  • Work out how many participants you have
  • Read in the relevant data from each of the files and store this in a
    sensible data structure
  • Calculate the mean and standard deviation across subjects for each ROI and
    stimulus class (you will therefore end up with fifteen means and fifteen
    standard deviations)
  • Print out the mean, standard deviation and number of samples which were used
    to calculate these to the screen as if it were a CSV file (with a header).

example of output:
see image

I managed to get as far as computing the means, but now I am not sure how to include the standard deviation and the number of samples.

this is the output I have so far:

{'ffa': {'cope1': 0.6525,
         'cope2': 0.4146,
         'cope3': 0.5896,
         'cope4': 0.1521,
         'cope5': 0.5317},
 'lingual': {'cope1': -0.08865060000000001,
             'cope2': -0.150985,
             'cope3': -0.162005,
             'cope4': -0.130845,
             'cope5': -0.126411},
 'ppa': {'cope1': 0.74836,
         'cope2': 0.9444,
         'cope3': 0.300482,
         'cope4': 1.12435,
         'cope5': 0.8332200000000001}}

and now I need to add the standard deviation and the number of samples next to each mean.

I also need to convert each cope label to its corresponding stimulus name:

cope1:  Bottle:
cope2: Chair:
cope3: Face: 
cope4: House: 
cope5: Shoe:

this is all the coding I have so far

import os
import csv
import pprint
import statistics

def main():
    """Collect the 6th-column value of every ``cope`` row from each
    participant's ROI files and pretty-print the per-ROI, per-cope means.

    Directory layout assumed: ``pin-assessment1-master/roi_data/<subject>/<roi>/*.txt``.
    """
    # Map on-disk ROI directory names to the keys used in the output dict.
    roi_keys = {"ffa": "ffa", "lingual_gyrus": "lingual", "ppa": "ppa"}
    # roi key -> cope name -> list of float samples (one per participant file).
    samples = {key: {} for key in roi_keys.values()}

    base = os.path.join("pin-assessment1-master", "roi_data")
    for subject in os.listdir(base):
        subject_path = os.path.join(base, subject)
        for roi_dir, roi_key in roi_keys.items():
            dirpath = os.path.join(subject_path, roi_dir)
            if not os.path.isdir(dirpath):
                continue
            for filename in os.listdir(dirpath):
                _collect_file(os.path.join(dirpath, filename), samples[roi_key])

    # Reduce each sample list to its mean; same output shape as before.
    res = {
        roi: {cope: statistics.mean(vals) for cope, vals in copes.items()}
        for roi, copes in samples.items()
    }
    pprint.pprint(res)


def _collect_file(path, results):
    """Append the 6th-column value of every 'cope' row in *path* to
    ``results[cope_name]``.

    BUG FIX: the original ffa branch tested membership in an unrelated,
    always-empty dict (``values``), so each file *overwrote* the sample
    list instead of appending to it. ``setdefault`` handles both the
    first-seen and subsequent cases correctly.
    """
    with open(path, "r") as f:
        for row in csv.reader(f, delimiter=" "):
            if "cope" in row[1]:
                name = row[1].split("/")[1]
                results.setdefault(name, []).append(float(row[6]))


if __name__ == "__main__":
    main()
Asked By: Sophia

||

Answers:

Try using this code:

import os
import pandas as pd

li = []

# Walk the 'roi_data' tree and load every participant result file.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        # Only the .txt result files are of interest; skip anything else
        # (hidden files, stray outputs) that would crash read_csv.
        if not name.endswith('.txt'):
            continue
        file_path = os.path.join(root, name)
        # Column 1 holds the 'stats/copeN' label, column 5 the value we need.
        df = pd.read_csv(file_path, sep=' ', usecols=[1, 5], names=['cope', 'value'])

        # Keep only the rows whose label contains 'cope'.
        df = df[df.iloc[:, 0].str.contains('cope')]
        # The ROI is the directory containing the file. os.path.basename is
        # portable — the original root.split("/")[-1] breaks on Windows paths.
        df['roi'] = os.path.basename(root)

        # Make sure the values are numeric.
        df['value'] = df['value'].astype(float)

        # Record which file each row came from, then collect the frame.
        df['file_path'] = file_path
        li.append(df)

# Concatenate all extracted dataframes.
combined_df = pd.concat(li)

# Translate the copeN labels into their stimulus-class names.
di = {
    'stats/cope1': 'Bottle',
    'stats/cope2': 'Chair',
    'stats/cope3': 'Face',
    'stats/cope4': 'House',
    'stats/cope5': 'Shoe'
}
combined_df.replace(di, inplace=True)

# Per (roi, stimulus) mean, standard deviation and sample count.
# Saves the results into 'result.csv' and prints them out.
result_df = combined_df.groupby(by=['roi', 'cope']).agg(['mean', 'std', 'count'])
result_df.to_csv('result.csv', sep='|')
print(result_df.to_string())

Output:

roi cope mean std count
ffa Bottle 0.76427 0.36723498396046694 10
ffa Chair 0.7036800000000001 0.40113803609231574 10
ffa Face 1.0842100000000001 0.39293685511938314 10
ffa House 0.511365 0.394306610851392 10
ffa Shoe 0.92214 0.48974865707943616 10
lingual_gyrus Bottle -0.004992799999999994 0.26929961480881803 10
lingual_gyrus Chair 0.0005299999999999909 0.35327198121434866 10
lingual_gyrus Face -0.004019999999999993 0.3000928154569664 10
lingual_gyrus House 0.018081 0.3216293944709932 10
lingual_gyrus Shoe -0.002994999999999997 0.23756639419328654 10
ppa Bottle 0.7706500000000001 0.49211155521667826 10
ppa Chair 0.9696400000000001 0.44630525229065166 10
ppa Face 0.327822 0.29263178903036335 10
ppa House 1.14463 0.43584880673615856 10
ppa Shoe 0.8539999999999999 0.5421760148799568 10

Numpy implementation:

import os
import numpy as np

li = []

# Walk the 'roi_data' tree and load every participant result file.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        # Only the .txt result files are of interest.
        if not name.endswith('.txt'):
            continue
        file_path = os.path.join(root, name)
        # Column 1 holds the 'stats/copeN' label, column 5 the value.
        arr = np.loadtxt(file_path, delimiter=' ', usecols=[1, 5], dtype=str)

        # Keep only the cope rows. Use the actual row count rather than a
        # hard-coded 5, so files with a different number of copes still work.
        arr = arr[np.char.startswith(arr[:, 0], 'stats/cope')]
        n_rows = arr.shape[0]

        # Prepend the ROI (directory name, portable via os.path.basename)
        # and append the source file path to each row.
        roi_col = np.full(fill_value=os.path.basename(root), shape=(n_rows, 1))
        path_col = np.full(fill_value=file_path, shape=(n_rows, 1))
        arr = np.concatenate((roi_col, arr, path_col), axis=1)
        li.append(arr)

# Stack all per-file arrays. vstack tolerates differing row counts per file,
# unlike np.array(li).reshape(...) which required identical shapes.
combined_arr = np.vstack(li)

# Build a 'roi_cope' group key and prepend it as column 0.
groups = (np.char.array(combined_arr[:, 0])
          + '_' + np.char.array(combined_arr[:, 1])).reshape((-1, 1))
combined_arr = np.concatenate((groups, combined_arr), axis=1)

# Per group: mean, sample standard deviation (ddof=1) and sample count.
result_di = dict()
for group in set(combined_arr[:, 0]):
    group_slice = combined_arr[combined_arr[:, 0] == group]
    vals = group_slice[:, 3].astype(np.float64)
    result_di[group] = (vals.mean(), vals.std(ddof=1), vals.shape[0])

result_di = dict(sorted(result_di.items()))
# A bare expression only echoes in a REPL; print so the script shows output.
print(result_di)

{'ffa_stats/cope1': (0.76427, 0.36723498396046694, 10),
 'ffa_stats/cope2': (0.7036800000000001, 0.4011380360923157, 10),
 'ffa_stats/cope3': (1.0842100000000001, 0.39293685511938314, 10),
 'ffa_stats/cope4': (0.511365, 0.394306610851392, 10),
 'ffa_stats/cope5': (0.92214, 0.4897486570794361, 10),
 'lingual_gyrus_stats/cope1': (-0.004992799999999996, 0.26929961480881803, 10),
 'lingual_gyrus_stats/cope2': (0.0005299999999999909, 0.35327198121434866, 10),
 'lingual_gyrus_stats/cope3': (-0.004019999999999996, 0.3000928154569664, 10),
 'lingual_gyrus_stats/cope4': (0.018081000000000003, 0.3216293944709932, 10),
 'lingual_gyrus_stats/cope5': (-0.0029950000000000024,
  0.23756639419328654,
  10),
 'ppa_stats/cope1': (0.77065, 0.49211155521667826, 10),
 'ppa_stats/cope2': (0.9696400000000001, 0.44630525229065166, 10),
 'ppa_stats/cope3': (0.32782199999999995, 0.2926317890303634, 10),
 'ppa_stats/cope4': (1.1446299999999998, 0.43584880673615867, 10),
 'ppa_stats/cope5': (0.8539999999999999, 0.5421760148799568, 10)}
Answered By: Sergey Sakharovskiy
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.