How to refactor a double for loop to run faster in Python?

Question:

I have a folder with 20,000 stock files, and I need to calculate the correlation of each stock against a set of indexes.

dex is the list of indexes; let's say there are 10 of them. So for each index I need to calculate the rolling correlation with each of the 20K stocks. The current code takes about 3 days to fully run. Is there any way to make it faster?

A sample file looks like this:

Date Close
20060206 45.020
20060207 43.870
20060208 44.610
20060209 44.280
20060210 43.800
My current code is:

import os

import pandas as pd

dex = ['AAA','BBB','CCC','DDD']
roll = 21
base = '/some/base/path'      # folder holding the 20,000 stock CSVs
output = '/some/output/path'  # destination for the correlation CSVs
sd = []

for ndx in dex:
    dx = pd.read_csv(f'{base}/{ndx}.csv', usecols=['TimeStamp', 'Close'])
    dx.columns = ['Date', ndx]
    dx = dx.set_index('Date')

    for files in os.listdir(base):

        df = pd.read_csv(f'{base}/{files}', usecols=['TimeStamp', 'Close'])
        tckr = files[:-4]

        if tckr != ndx:
            df.columns = ['Date', tckr]
            df = df.set_index('Date')
            sd.append(df)
            print(files, df.shape, ndx)

            loo = pd.concat([dx, df], axis=1)
            poo = loo.pct_change()
            poo.dropna(axis=0, how='any', inplace=True)

            poo[f'{tckr}_{ndx}_CR'] = round(poo[ndx].rolling(roll).corr(poo[tckr]), 2)

            poo.pop(tckr)
            poo.pop(ndx)

            os.makedirs(f'{output}/{ndx}/', exist_ok=True)
            poo.reset_index().to_csv(f'{output}/{ndx}/{tckr}.csv', index=False)

Answers:

Your current code re-reads every one of the 20,000 stock files once per index, which is on the order of 200,000 CSV reads. I would start by inverting your loops so that each stock file is read only once; that also lets you cache the small set of index files in memory:

import pandas as pd
import os

dex = ['AAA','BBB','CCC','DDD']
roll = 21
base = "/some/base/path"
output = "/some/output/path"
sd = []

### -----------------------
### There are only a few index files, so read them once
### and keep them in memory
### -----------------------
index_lookup = {}
for ndx in dex:
    dx = pd.read_csv(f"{base}/{ndx}.csv", usecols=['TimeStamp', 'Close'])
    dx.columns = ['Date', ndx]
    index_lookup[ndx] = dx.set_index('Date')
    os.makedirs(f"{output}/{ndx}", exist_ok=True)  # create each output folder once, not 20,000 times
### -----------------------

### -----------------------
### Process each file (this could be done in parallel)
### -----------------------
for files in os.listdir(base):
    tckr = files[:-4]
    print(f"Processing {tckr} against all indexes.")

    ### -----------------------
    ### Read each stock file exactly once, outside the index loop
    ### -----------------------
    df = pd.read_csv(f"{base}/{files}", usecols=['TimeStamp', 'Close'])
    df.columns = ['Date', tckr]
    df = df.set_index('Date')

    ### -----------------------
    ### potentially not doing anything
    ### -----------------------
    sd.append(df)
    ### -----------------------

    for ndx, dx in index_lookup.items():
        if tckr == ndx:
            continue

        poo = pd.concat([dx, df], axis=1).pct_change()
        poo.dropna(axis=0, how='any', inplace=True)
        poo[f'{tckr}_{ndx}_CR'] = round(poo[ndx].rolling(roll).corr(poo[tckr]), 2)
        poo.pop(tckr)
        poo.pop(ndx)
        poo.reset_index().to_csv(f"{output}/{ndx}/{tckr}.csv", index=False)
### -----------------------
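
As the comment above says, the per-file loop could be done in parallel, since each stock file is processed independently of all the others. Here is a minimal sketch that replaces the outer for files loop with a process pool from the standard library's concurrent.futures; the process_one helper and the chunksize are my own illustration, and it assumes base, output, roll and index_lookup are defined at module level exactly as above:

from concurrent.futures import ProcessPoolExecutor
import os

import pandas as pd

def process_one(files):
    # body of the per-file loop above, for a single stock file
    tckr = files[:-4]
    df = pd.read_csv(f"{base}/{files}", usecols=['TimeStamp', 'Close'])
    df.columns = ['Date', tckr]
    df = df.set_index('Date')

    for ndx, dx in index_lookup.items():
        if tckr == ndx:
            continue
        poo = pd.concat([dx, df], axis=1).pct_change()
        poo.dropna(axis=0, how='any', inplace=True)
        poo[f'{tckr}_{ndx}_CR'] = round(poo[ndx].rolling(roll).corr(poo[tckr]), 2)
        poo.pop(tckr)
        poo.pop(ndx)
        poo.reset_index().to_csv(f"{output}/{ndx}/{tckr}.csv", index=False)
    return tckr

if __name__ == '__main__':
    # one worker per CPU core; chunksize batches the 20,000 tasks
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
        for tckr in pool.map(process_one, os.listdir(base), chunksize=64):
            print(f"Finished {tckr}.")

Since the work is dominated by reading 20,000 CSVs and writing the results back out, the gain is roughly bounded by your core count and disk throughput. On platforms that spawn rather than fork worker processes, each worker re-runs the module-level setup and rebuilds index_lookup, which costs only a few extra reads per worker.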
Answered By: JonSG