Convert 2 column dataframe into multi-level hierarchical dataframe

Question:

I have a pandas dataframe

From To
A B
A C
D E
F F
B G
B H
B I
G J
G K
L L
M M
N N

I want to convert it into multi column hierarchy. The expected hierarchy will look like

Level_1 Level_2 Level_3 Level_4
A B G J
A B G K
A B H
A B I
A C
D E
F F
L L
M M
N N

Is there an in-built way in pandas to achieve this? I know i can use recursion, Is there any other simplified way?

Asked By: seb5ferrari

||

Answers:

You can easily get what you expect using networkx

# Python env: pip install networkx
# Anaconda env: conda install networkx

import networkx as nx
import pandas as pd

df = pd.DataFrame({'From': ['A', 'A', 'D', 'F', 'B', 'B', 'B', 'G', 'G', 'L', 'M', 'N'],
                   'To': ['B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']})

G = nx.from_pandas_edgelist(df, source='From', target='To', create_using=nx.DiGraph)
roots = [v for v, d in G.in_degree() if d == 0]
leaves = [v for v, d in G.out_degree() if d == 0]

all_paths = []
for root in roots:
    for leaf in leaves:
        paths = nx.all_simple_paths(G, root, leaf)
        all_paths.extend(paths)

for node in nx.nodes_with_selfloops(G):
    all_paths.append([node, node])

Output:

>>> pd.DataFrame(sorted(all_paths)).add_prefix('Level_').fillna('')
  Level_0 Level_1 Level_2 Level_3
0       A       B       G       J
1       A       B       G       K
2       A       B       H
3       A       B       I
4       A       C
5       D       E
6       F       F
7       L       L
8       M       M
9       N       N

Documentation: networkx.algorithms.simple_paths.all_simple_paths

Answered By: Corralien

Solution without networkx:

def path(df, parent, cur_path=None):
    if cur_path is None:
        cur_path = []

    x = df[df.From.eq(parent)]

    if len(x) == 0:
        yield cur_path
        return
    elif len(x) == 1:
        yield cur_path + x["To"].to_list()
        return

    for _, row in x.iterrows():
        yield from path(df, row["To"], cur_path + [row["To"]])


def is_sublist(l1, l2):
    # checks if l1 is sublist of l2

    if len(l1) > len(l2):
        return False

    for i in range(len(l2)):
        if l1 == l2[i : i + len(l1)]:
            return True

    return False


unique_paths = []
for v in df["From"].unique():
    for p in path(df, v, [v]):
        if not any(is_sublist(p, up) for up in unique_paths):
            unique_paths.append(p)


df = pd.DataFrame(
    [{f"level_{i}": v for i, v in enumerate(p, 1)} for p in unique_paths]
).fillna("")
print(df)

Prints:

  level_1 level_2 level_3 level_4
0       A       B       G       J
1       A       B       G       K
2       A       B       H        
3       A       B       I        
4       A       C                
5       D       E                
6       F       F                
7       L       L                
8       M       M                
9       N       N                
Answered By: Andrej Kesely

Here’s an alternative method that uses repeated pandas merges to get the result:

def create_multi_level(
    df: pd.DataFrame, from_col: str = "From", to_col: str = "To"
) -> pd.DataFrame:
    # Separate root nodes from non-root
    mask = df[from_col].isin(df[to_col].unique()) & (df[from_col] != df[to_col])
    root = df[~mask].copy()
    non_root = df[mask].copy()

    # Define starting columns
    root.columns = ["level_0", "level_1"]
    non_root.columns = ["level_1", "level_2"]
    i = 1
    while True:
        # merge
        root = root.merge(non_root, on=f"level_{i}", how="left")

        # Return if we're done
        non_missing = root[f"level_{i + 1}"].notna()
        if non_missing.mean() == 0:
            return root.drop(columns=f"level_{i + 1}")

        # Increment and rename columns
        i += 1
        non_root.columns = [f"level_{i}", f"level_{i + 1}"]

Output:

>>> create_multi_level(df)

  level_0 level_1 level_2 level_3
0       A       B       G       J
1       A       B       G       K
2       A       B       H     NaN
3       A       B       I     NaN
4       A       C     NaN     NaN
5       D       E     NaN     NaN
6       F       F     NaN     NaN
7       L       L     NaN     NaN
8       M       M     NaN     NaN
9       N       N     NaN     NaN
Answered By: Robert Robison
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.