Create a treemap showing directory structure with plotly graph object

Question:

I want to create a treemap that shows the folders in a given directory, including all subfolders and files using plotly.graph_objects.Treemap. I understand simple examples like this one and this one.

Problem: I can’t figure out how to generate the ids column to make my figure render properly. I’m going to have duplicate labels, so I need to use ids. Right now, the figure renders blank.

Code:

Here’s some code to generate a sample directory structure to help you help me:

import os

# Root of the sample tree; it is created relative to the current working directory.
folder = 'Documents'

# Build 10 folders, each holding 100 sub-folders with 20 small text files.
for i in range(10):
    for j in range(100):
        path = os.path.join(folder, f'folder_{i}', f'sub-folder-{j}')
        # exist_ok=True lets the script be re-run without a separate isdir check.
        os.makedirs(path, exist_ok=True)
        for k in range(20):
            with open(os.path.join(path, f'file_{k + 1}.txt'), 'w') as file_out:
                # End the line with a real newline escape (the backslash had been lost).
                file_out.write(f'Hello from file {k + 1}!\n')

Here’s the code to calculate the file sizes and create the treemap:

import os
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go

# Top-level folder to chart; replace the bracketed placeholder with a real path.
directory = '[input your directory here]/Documents'

def calculate_size(folder):
    """Walk *folder* and return one record per directory and per file.

    Each record is a dict with the keys 'parents', 'labels', 'size' and
    'ids'.  Paths are expressed relative to the parent of *folder*, so the
    top-level record starts with the folder's own name.  In this version
    'parents' and 'ids' carry the same value for every record.

    A directory's 'size' is the sum of the sizes of the files located
    directly inside it (files in sub-directories are not included).
    """
    anchor = Path(folder).parent
    records = []

    for current, _subdirs, filenames in os.walk(folder):
        file_paths = [os.path.join(current, name) for name in filenames]

        # Record for the directory itself.
        dir_key = str(Path(current).relative_to(anchor))
        records.append({
            'parents': dir_key,
            'labels': str(Path(current).name),
            'size': sum(map(os.path.getsize, file_paths)),
            'ids': dir_key,
        })

        # One record for every file directly inside the directory.
        for file_path in file_paths:
            file_key = str(Path(file_path).relative_to(anchor))
            records.append({
                'parents': file_key,
                'labels': str(Path(file_path).name),
                'size': os.path.getsize(file_path),
                'ids': file_key,
            })

    return records

# Gather one record per directory and per file, then tabulate them.
result = calculate_size(directory)
df = pd.DataFrame(result)

# Set root: blank out the parent of the first row.
df.loc[df.index == 0, 'parents'] = ""

# Pull each treemap attribute out of the frame as a plain list.
labels, parents, ids, values = (
    df[column].tolist() for column in ('labels', 'parents', 'ids', 'size')
)

fig = go.Figure(
    go.Treemap(
        labels=labels,
        parents=parents,
        ids=ids,
        values=values,
        # maxdepth=3
    )
)

fig.update_traces(root_color="lightgrey")
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})

fig.show()
Asked By: Simon1

||

Answers:

You could use plotly.express to create your treemap. You need to create a new column for each level in the treemap (I used a regex to extract that information from the parents column in your dataframe).

import plotly.express as px  # `px` is used below but was never imported

# One capture group per path component.  The pattern expects the full
# four-component path (top folder / folder / sub-folder / file) and accepts
# both "\" and "/" as the separator.  It must be a raw string so the
# backslash reaches the regex engine intact.
level_pattern = r'^(.*?)[\\/](.*?)[\\/](.*?)[\\/](.*)'

# regex=True is explicit because pandas >= 2.0 treats `pat` as a literal
# string by default; each `repl` is a backreference to one capture group.
df['level1'] = df['parents'].str.replace(pat=level_pattern, repl=r'\1', regex=True)
df['level2'] = df['parents'].str.replace(pat=level_pattern, repl=r'\2', regex=True)
df['level3'] = df['parents'].str.replace(pat=level_pattern, repl=r'\3', regex=True)
df['level4'] = df['parents'].str.replace(pat=level_pattern, repl=r'\4', regex=True)

# Keep only the file rows; regex=False makes the "." in ".txt" a literal dot.
df = df[df['labels'].str.contains('.txt', regex=False)]

fig = px.treemap(
    df,
    title='treemap of folder structure',
    values='size',
    path=['level1', 'level2', 'level3', 'level4'],
    maxdepth=2,
)

enter image description here

Answered By: DougR

You’re really close. However, your ids and parents will be different when you have a multi-tiered treemap. Together they create the map for Plotly.

Here I’ve added another function.

def parPath(idpath):
    """Return the parent id for the treemap node identified by *idpath*.

    Parameters:
        idpath: a relative path given as ``str`` or ``os.PathLike``
            (for example a ``pathlib.Path``).

    Returns:
        The path with its last component removed, as a ``str``.  For a
        single-component path (the top-level folder) this is the empty
        string, the value the root node's parent is set to.
    """
    # A single-component path has an empty dirname, so the top-level folder
    # needs no special case.  This avoids comparing a Path against a str
    # (never equal) and drops the dependency on a module-level variable.
    return os.path.dirname(idpath)

This function, parPath, is called within your function, calculate_size.
There are four changes in this function (noted with comments).

def calculate_size(folder):
    """Walk *folder* and return one record per directory and per file.

    Each record is a dict with the keys 'parents', 'labels', 'size' and
    'ids'.  'ids' is the entry's path relative to the parent of *folder*;
    'parents' is whatever parPath returns for that path, converted to str.

    A directory's 'size' is the sum of the sizes of the files located
    directly inside it (files in sub-directories are not included).
    """
    anchor = Path(folder).parent
    records = []

    for current, _subdirs, filenames in os.walk(folder):
        dir_id = Path(current).relative_to(anchor)
        dir_parent = parPath(dir_id)  # parent id may differ from the node's own id
        file_paths = [os.path.join(current, name) for name in filenames]

        # Record for the directory itself.
        records.append({
            'parents': str(dir_parent),
            'labels': str(Path(current).name),
            'size': sum(map(os.path.getsize, file_paths)),
            'ids': str(dir_id),
        })

        # One record for every file directly inside the directory.
        for file_path in file_paths:
            file_id = Path(file_path).relative_to(anchor)
            file_parent = parPath(file_id)  # parent id may differ from the node's own id
            records.append({
                'parents': str(file_parent),
                'labels': str(Path(file_path).name),
                'size': os.path.getsize(file_path),
                'ids': str(file_id),
            })

    return records

There is another modification in addition to your call to change the first parent; you’ll also change the first id.

# Reduce the first row's id to its last path component (get first folder).
df.loc[df.index == 0, 'ids'] = os.path.basename(df.loc[0, 'ids'])

You’re ready to plot.

# Build the treemap trace from the prepared label, parent, id and value lists.
fig = go.Figure(
    go.Treemap(
        labels=labels,
        parents=parents,
        ids=ids,
        values=values,
    )
)

# Grey root tile and fixed margins around the plot.
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})

fig.show()

Here’s a drill down into one of my own folders that I used in testing.

enter image description here

Answered By: Kat
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.