Calculate statistics
Question:
1
If I have a number of files like this:
Inside each folder is 3 more like this:
Now inside each of these folders is a .txt file that looks like this:
For each of the .txt files I need to get the value from the 6th column in the file which I have circled in red and I am only interested in the lines that contain cope1, cope2, cope3, cope4 and cope5 at the start (highlighted in blue). Everything else can be ignored.
PROBLEM: I also need to write a script which can perform the following tasks:
- Work out how many participants you have
- Read in the relevant data from each of the files and store this in a
sensible data structure
- Calculate the mean and standard deviation across subjects for each ROI and
stimulus class (you will therefore end up with fifteen means and standard
deviation)
- Print out the mean, standard deviation and number of samples which were used
to calculate these to the screen as if it were a CSV file (with a header).
example of output:
see image
I managed to get as far as computing the means, but now I am not sure how to include the standard deviation and the number of samples.
this is the output I have so far:
{'ffa': {'cope1': 0.6525,
'cope2': 0.4146,
'cope3': 0.5896,
'cope4': 0.1521,
'cope5': 0.5317},
'lingual': {'cope1': -0.08865060000000001,
'cope2': -0.150985,
'cope3': -0.162005,
'cope4': -0.130845,
'cope5': -0.126411},
'ppa': {'cope1': 0.74836,
'cope2': 0.9444,
'cope3': 0.300482,
'cope4': 1.12435,
'cope5': 0.8332200000000001}}
And now I need to add, next to each mean, the standard deviation and the number of samples.
I also need to convert each cope type to its corresponding label:
cope1: Bottle:
cope2: Chair:
cope3: Face:
cope4: House:
cope5: Shoe:
this is all the coding I have so far
import os
import csv
import pprint
import statistics
def main():
    """Walk pin-assessment1-master/roi_data and print per-ROI cope means.

    For every subject directory, reads the stats files inside the three
    ROI sub-directories (ffa, lingual_gyrus, ppa), collects the value in
    column 6 of every row whose second field names a cope (e.g.
    "stats/cope1"), and pretty-prints the mean for each ROI/cope pair.
    """
    # Map on-disk ROI directory names to the keys used in the report.
    roi_keys = {"ffa": "ffa", "lingual_gyrus": "lingual", "ppa": "ppa"}
    # report key -> cope name -> list of float samples (one per subject file).
    samples = {key: {} for key in roi_keys.values()}
    # NOTE: renamed from `dir`, which shadowed the builtin.
    base = os.path.join("pin-assessment1-master", "roi_data")
    for subdir in os.listdir(base):
        subdirpath = os.path.join(base, subdir)
        for subsubdir in os.listdir(subdirpath):
            key = roi_keys.get(subsubdir)
            if key is None:
                continue  # ignore anything that is not one of the three ROIs
            dirpath = os.path.join(subdirpath, subsubdir)
            for filename in os.listdir(dirpath):
                path = os.path.join(dirpath, filename)
                with open(path, "r") as f:
                    for row in csv.reader(f, delimiter=" "):
                        if "cope" in row[1]:
                            # "stats/cope1" -> "cope1"
                            name = row[1].split("/")[1]
                            # BUG FIX: the original ffa branch tested the
                            # unrelated, always-empty `values` dict, so every
                            # ffa row *replaced* the stored list instead of
                            # appending to it. setdefault handles both the
                            # first and subsequent samples correctly, and one
                            # loop replaces the three copy-pasted branches.
                            samples[key].setdefault(name, []).append(float(row[6]))
    res = {key: {name: statistics.mean(vals) for name, vals in copes.items()}
           for key, copes in samples.items()}
    pprint.pprint(res)


if __name__ == "__main__":
    main()
Answers:
Try using this code:
import os
import pandas as pd
li = []
# Walk the 'roi_data' tree and load every stats file found inside it.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        file_path = os.path.join(root, name)
        # Keep only the second (cope label) and sixth (value) columns.
        df = pd.read_csv(file_path, sep=' ', usecols=[1, 5],
                         names=['cope', 'value'])
        # Drop every row whose label does not mention a cope.
        df = df[df['cope'].str.contains('cope')]
        # FIX: os.path.basename instead of root.split("/") so the ROI label
        # is also correct on Windows, where the separator is "\\".
        df['roi'] = os.path.basename(root)
        df['value'] = df['value'].astype(float)
        # Record the source file so individual readings stay traceable.
        df['file_path'] = file_path
        li.append(df)
# Combine all per-file frames into one long table.
combined_df = pd.concat(li)
# Translate cope labels into their stimulus-class names.
di = {
    'stats/cope1': 'Bottle',
    'stats/cope2': 'Chair',
    'stats/cope3': 'Face',
    'stats/cope4': 'House',
    'stats/cope5': 'Shoe',
}
combined_df = combined_df.replace(di)
# Mean, sample standard deviation, and sample count per (roi, cope).
# FIX: aggregate the 'value' column explicitly — applying 'mean'/'std' to
# the string 'file_path' column raises a TypeError on modern pandas.
result_df = combined_df.groupby(by=['roi', 'cope'])['value'].agg(
    ['mean', 'std', 'count'])
# Save the results to 'result.csv' and print them to the screen.
result_df.to_csv('result.csv', sep='|')
print(result_df.to_string())
Output:
roi
cope
mean
std
count
ffa
Bottle
0.76427
0.36723498396046694
10
ffa
Chair
0.7036800000000001
0.40113803609231574
10
ffa
Face
1.0842100000000001
0.39293685511938314
10
ffa
House
0.511365
0.394306610851392
10
ffa
Shoe
0.92214
0.48974865707943616
10
lingual_gyrus
Bottle
-0.004992799999999994
0.26929961480881803
10
lingual_gyrus
Chair
0.0005299999999999909
0.35327198121434866
10
lingual_gyrus
Face
-0.004019999999999993
0.3000928154569664
10
lingual_gyrus
House
0.018081
0.3216293944709932
10
lingual_gyrus
Shoe
-0.002994999999999997
0.23756639419328654
10
ppa
Bottle
0.7706500000000001
0.49211155521667826
10
ppa
Chair
0.9696400000000001
0.44630525229065166
10
ppa
Face
0.327822
0.29263178903036335
10
ppa
House
1.14463
0.43584880673615856
10
ppa
Shoe
0.8539999999999999
0.5421760148799568
10
Numpy implementation:
import os
import numpy as np
li = []
# Walk the 'roi_data' tree and load every stats file found inside it.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        file_path = os.path.join(root, name)
        # Column 1 holds the cope label and column 5 the value; load both
        # as strings and convert the values later.
        arr = np.loadtxt(file_path, delimiter=' ', usecols=[1, 5], dtype=str)
        # Keep only the cope rows.
        arr = arr[np.char.startswith(arr[:, 0], 'stats/cope')]
        # FIX: use the actual number of kept rows instead of a hard-coded 5,
        # so files with a different number of cope lines still work.
        n_rows = arr.shape[0]
        # FIX: os.path.basename instead of root.split("/") — portable on
        # Windows, where the separator is "\\".
        roi = np.full(fill_value=os.path.basename(root), shape=(n_rows, 1))
        arr = np.concatenate((roi, arr), axis=1)
        # Record the source file so individual readings stay traceable.
        file_col = np.full(fill_value=file_path, shape=(n_rows, 1))
        arr = np.concatenate((arr, file_col), axis=1)
        li.append(arr)
# FIX: concatenate instead of np.array(li).reshape, which silently assumed
# every file contributed exactly five rows.
combined_arr = np.concatenate(li, axis=0)
# Build a "roi_cope" group key column and prepend it.
groups = (np.char.array(combined_arr[:, 0])
          + '_' + np.char.array(combined_arr[:, 1])).reshape((-1, 1))
combined_arr = np.concatenate((groups, combined_arr), axis=1)
result_di = {}
for group in set(combined_arr[:, 0]):
    vals = combined_arr[combined_arr[:, 0] == group][:, 3].astype(np.float64)
    # Mean, sample standard deviation (ddof=1) and sample count.
    result_di[group] = (vals.mean(), vals.std(ddof=1), vals.shape[0])
result_di = dict(sorted(result_di.items()))
# FIX: a bare `result_di` expression displays nothing outside a REPL.
print(result_di)
{'ffa_stats/cope1': (0.76427, 0.36723498396046694, 10),
'ffa_stats/cope2': (0.7036800000000001, 0.4011380360923157, 10),
'ffa_stats/cope3': (1.0842100000000001, 0.39293685511938314, 10),
'ffa_stats/cope4': (0.511365, 0.394306610851392, 10),
'ffa_stats/cope5': (0.92214, 0.4897486570794361, 10),
'lingual_gyrus_stats/cope1': (-0.004992799999999996, 0.26929961480881803, 10),
'lingual_gyrus_stats/cope2': (0.0005299999999999909, 0.35327198121434866, 10),
'lingual_gyrus_stats/cope3': (-0.004019999999999996, 0.3000928154569664, 10),
'lingual_gyrus_stats/cope4': (0.018081000000000003, 0.3216293944709932, 10),
'lingual_gyrus_stats/cope5': (-0.0029950000000000024,
0.23756639419328654,
10),
'ppa_stats/cope1': (0.77065, 0.49211155521667826, 10),
'ppa_stats/cope2': (0.9696400000000001, 0.44630525229065166, 10),
'ppa_stats/cope3': (0.32782199999999995, 0.2926317890303634, 10),
'ppa_stats/cope4': (1.1446299999999998, 0.43584880673615867, 10),
'ppa_stats/cope5': (0.8539999999999999, 0.5421760148799568, 10)}
1
If I have a number of files like this:
Inside each folder is 3 more like this:
Now inside each of these folders is a .txt file that looks like this:
For each of the .txt files I need to get the value from the 6th column in the file which I have circled in red and I am only interested in the lines that contain cope1, cope2, cope3, cope4 and cope5 at the start (highlighted in blue). Everything else can be ignored.
PROBLEM: I also need to write a script which can perform the following tasks:
- Work out how many participants you have
- Read in the relevant data from each of the files and store this in a
sensible data structure - Calculate the mean and standard deviation across subjects for each ROI and
stimulus class (you will therefore end up with fifteen means and standard
deviation) - Print out the mean, standard deviation and number of samples which were used
to calculate these to the screen as if it were a CSV file (with a header).
example of output:
see image
I managed to get as far as computing the means, but now I am not sure how to include the standard deviation and the number of samples.
this is the output I have so far:
{'ffa': {'cope1': 0.6525,
'cope2': 0.4146,
'cope3': 0.5896,
'cope4': 0.1521,
'cope5': 0.5317},
'lingual': {'cope1': -0.08865060000000001,
'cope2': -0.150985,
'cope3': -0.162005,
'cope4': -0.130845,
'cope5': -0.126411},
'ppa': {'cope1': 0.74836,
'cope2': 0.9444,
'cope3': 0.300482,
'cope4': 1.12435,
'cope5': 0.8332200000000001}}
And now I need to add, next to each mean, the standard deviation and the number of samples.
I also need to convert each cope type to its corresponding label:
cope1: Bottle:
cope2: Chair:
cope3: Face:
cope4: House:
cope5: Shoe:
this is all the coding I have so far
import os
import csv
import pprint
import statistics
def main():
    """Walk pin-assessment1-master/roi_data and print per-ROI cope means.

    For every subject directory, reads the stats files inside the three
    ROI sub-directories (ffa, lingual_gyrus, ppa), collects the value in
    column 6 of every row whose second field names a cope (e.g.
    "stats/cope1"), and pretty-prints the mean for each ROI/cope pair.
    """
    # Map on-disk ROI directory names to the keys used in the report.
    roi_keys = {"ffa": "ffa", "lingual_gyrus": "lingual", "ppa": "ppa"}
    # report key -> cope name -> list of float samples (one per subject file).
    samples = {key: {} for key in roi_keys.values()}
    # NOTE: renamed from `dir`, which shadowed the builtin.
    base = os.path.join("pin-assessment1-master", "roi_data")
    for subdir in os.listdir(base):
        subdirpath = os.path.join(base, subdir)
        for subsubdir in os.listdir(subdirpath):
            key = roi_keys.get(subsubdir)
            if key is None:
                continue  # ignore anything that is not one of the three ROIs
            dirpath = os.path.join(subdirpath, subsubdir)
            for filename in os.listdir(dirpath):
                path = os.path.join(dirpath, filename)
                with open(path, "r") as f:
                    for row in csv.reader(f, delimiter=" "):
                        if "cope" in row[1]:
                            # "stats/cope1" -> "cope1"
                            name = row[1].split("/")[1]
                            # BUG FIX: the original ffa branch tested the
                            # unrelated, always-empty `values` dict, so every
                            # ffa row *replaced* the stored list instead of
                            # appending to it. setdefault handles both the
                            # first and subsequent samples correctly, and one
                            # loop replaces the three copy-pasted branches.
                            samples[key].setdefault(name, []).append(float(row[6]))
    res = {key: {name: statistics.mean(vals) for name, vals in copes.items()}
           for key, copes in samples.items()}
    pprint.pprint(res)


if __name__ == "__main__":
    main()
Try using this code:
import os
import pandas as pd
li = []
# Walk the 'roi_data' tree and load every stats file found inside it.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        file_path = os.path.join(root, name)
        # Keep only the second (cope label) and sixth (value) columns.
        df = pd.read_csv(file_path, sep=' ', usecols=[1, 5],
                         names=['cope', 'value'])
        # Drop every row whose label does not mention a cope.
        df = df[df['cope'].str.contains('cope')]
        # FIX: os.path.basename instead of root.split("/") so the ROI label
        # is also correct on Windows, where the separator is "\\".
        df['roi'] = os.path.basename(root)
        df['value'] = df['value'].astype(float)
        # Record the source file so individual readings stay traceable.
        df['file_path'] = file_path
        li.append(df)
# Combine all per-file frames into one long table.
combined_df = pd.concat(li)
# Translate cope labels into their stimulus-class names.
di = {
    'stats/cope1': 'Bottle',
    'stats/cope2': 'Chair',
    'stats/cope3': 'Face',
    'stats/cope4': 'House',
    'stats/cope5': 'Shoe',
}
combined_df = combined_df.replace(di)
# Mean, sample standard deviation, and sample count per (roi, cope).
# FIX: aggregate the 'value' column explicitly — applying 'mean'/'std' to
# the string 'file_path' column raises a TypeError on modern pandas.
result_df = combined_df.groupby(by=['roi', 'cope'])['value'].agg(
    ['mean', 'std', 'count'])
# Save the results to 'result.csv' and print them to the screen.
result_df.to_csv('result.csv', sep='|')
print(result_df.to_string())
Output:
roi | cope | mean | std | count |
---|---|---|---|---|
ffa | Bottle | 0.76427 | 0.36723498396046694 | 10 |
ffa | Chair | 0.7036800000000001 | 0.40113803609231574 | 10 |
ffa | Face | 1.0842100000000001 | 0.39293685511938314 | 10 |
ffa | House | 0.511365 | 0.394306610851392 | 10 |
ffa | Shoe | 0.92214 | 0.48974865707943616 | 10 |
lingual_gyrus | Bottle | -0.004992799999999994 | 0.26929961480881803 | 10 |
lingual_gyrus | Chair | 0.0005299999999999909 | 0.35327198121434866 | 10 |
lingual_gyrus | Face | -0.004019999999999993 | 0.3000928154569664 | 10 |
lingual_gyrus | House | 0.018081 | 0.3216293944709932 | 10 |
lingual_gyrus | Shoe | -0.002994999999999997 | 0.23756639419328654 | 10 |
ppa | Bottle | 0.7706500000000001 | 0.49211155521667826 | 10 |
ppa | Chair | 0.9696400000000001 | 0.44630525229065166 | 10 |
ppa | Face | 0.327822 | 0.29263178903036335 | 10 |
ppa | House | 1.14463 | 0.43584880673615856 | 10 |
ppa | Shoe | 0.8539999999999999 | 0.5421760148799568 | 10 |
Numpy implementation:
import os
import numpy as np
li = []
# Walk the 'roi_data' tree and load every stats file found inside it.
for root, dirs, files in os.walk('roi_data'):
    for name in files:
        file_path = os.path.join(root, name)
        # Column 1 holds the cope label and column 5 the value; load both
        # as strings and convert the values later.
        arr = np.loadtxt(file_path, delimiter=' ', usecols=[1, 5], dtype=str)
        # Keep only the cope rows.
        arr = arr[np.char.startswith(arr[:, 0], 'stats/cope')]
        # FIX: use the actual number of kept rows instead of a hard-coded 5,
        # so files with a different number of cope lines still work.
        n_rows = arr.shape[0]
        # FIX: os.path.basename instead of root.split("/") — portable on
        # Windows, where the separator is "\\".
        roi = np.full(fill_value=os.path.basename(root), shape=(n_rows, 1))
        arr = np.concatenate((roi, arr), axis=1)
        # Record the source file so individual readings stay traceable.
        file_col = np.full(fill_value=file_path, shape=(n_rows, 1))
        arr = np.concatenate((arr, file_col), axis=1)
        li.append(arr)
# FIX: concatenate instead of np.array(li).reshape, which silently assumed
# every file contributed exactly five rows.
combined_arr = np.concatenate(li, axis=0)
# Build a "roi_cope" group key column and prepend it.
groups = (np.char.array(combined_arr[:, 0])
          + '_' + np.char.array(combined_arr[:, 1])).reshape((-1, 1))
combined_arr = np.concatenate((groups, combined_arr), axis=1)
result_di = {}
for group in set(combined_arr[:, 0]):
    vals = combined_arr[combined_arr[:, 0] == group][:, 3].astype(np.float64)
    # Mean, sample standard deviation (ddof=1) and sample count.
    result_di[group] = (vals.mean(), vals.std(ddof=1), vals.shape[0])
result_di = dict(sorted(result_di.items()))
# FIX: a bare `result_di` expression displays nothing outside a REPL.
print(result_di)
{'ffa_stats/cope1': (0.76427, 0.36723498396046694, 10),
'ffa_stats/cope2': (0.7036800000000001, 0.4011380360923157, 10),
'ffa_stats/cope3': (1.0842100000000001, 0.39293685511938314, 10),
'ffa_stats/cope4': (0.511365, 0.394306610851392, 10),
'ffa_stats/cope5': (0.92214, 0.4897486570794361, 10),
'lingual_gyrus_stats/cope1': (-0.004992799999999996, 0.26929961480881803, 10),
'lingual_gyrus_stats/cope2': (0.0005299999999999909, 0.35327198121434866, 10),
'lingual_gyrus_stats/cope3': (-0.004019999999999996, 0.3000928154569664, 10),
'lingual_gyrus_stats/cope4': (0.018081000000000003, 0.3216293944709932, 10),
'lingual_gyrus_stats/cope5': (-0.0029950000000000024,
0.23756639419328654,
10),
'ppa_stats/cope1': (0.77065, 0.49211155521667826, 10),
'ppa_stats/cope2': (0.9696400000000001, 0.44630525229065166, 10),
'ppa_stats/cope3': (0.32782199999999995, 0.2926317890303634, 10),
'ppa_stats/cope4': (1.1446299999999998, 0.43584880673615867, 10),
'ppa_stats/cope5': (0.8539999999999999, 0.5421760148799568, 10)}