Transpose pandas DF based on value data type
Question:
I have pandas DataFrame
A. I am struggling transforming this into my desired format, see DataFrame
B. I tried pivot
or melt
but I am not sure how I could make it conditional (string
values to FIELD_STR_VALUE
, numeric
values to FIELD_NUM_VALUE
). I was hoping you could point me in the right direction.
A: Input DataFrame
|FIELD_A |FIELD_B |FIELD_C |FIELD_D |
|--------|--------|--------|--------|
|123123 |8 |a |23423 |
|123124 |7 |c |6464 |
|123144 |99 |x |234 |
B: Desired output DataFrame
|ID |FIELD_A |FIELD_NAME |FIELD_STR_VALUE |FIELD_NUM_VALUE |
|---|--------|-----------|----------------|----------------|
|1 |123123 |B | |8 |
|2 |123123 |C |a | |
|3 |123123 |D | |23423 |
|4 |123124 |B | |7 |
|5 |123124 |C |c | |
|6 |123124 |D | |6464 |
|7 |123144 |B | |99 |
|8 |123144 |C |x | |
|9 |123144 |D | |234 |
Answers:
You can use:
# To get custom STR/NUM labels instead of raw dtypes, map them first:
# dic = {np.int64: 'NUM', object: 'STR'}
tmp = df.set_index('FIELD_A')
# Pair each column with its dtype in a 2-level header, then stack the
# name level into the index so one dtype-keyed column remains per value.
tmp = tmp.set_axis(
    pd.MultiIndex.from_arrays(
        [tmp.columns, tmp.dtypes],   # or [tmp.columns, tmp.dtypes.map(dic)] for custom names
        names=['FIELD_NAME', None]),
    axis=1)
tmp.stack(0).add_prefix('FIELD_').add_suffix('_VALUE').reset_index()
NB. if you really want STR/NUM, map
those strings from the dtypes (see comments in code).
Output:
FIELD_A FIELD_NAME FIELD_int64_VALUE FIELD_object_VALUE
0 123123 FIELD_B 8.0 NaN
1 123123 FIELD_C NaN a
2 123123 FIELD_D 23423.0 NaN
3 123124 FIELD_B 7.0 NaN
4 123124 FIELD_C NaN c
5 123124 FIELD_D 6464.0 NaN
6 123144 FIELD_B 99.0 NaN
7 123144 FIELD_C NaN x
8 123144 FIELD_D 234.0 NaN
df.melt(id_vars='FIELD_A', var_name='FIELD_NAME', value_name='FIELD_VALUE').sort_values(by='FIELD_A').reset_index(drop=True)
outputs
FIELD_A FIELD_NAME FIELD_VALUE
0 123123 FIELD_B 8
1 123123 FIELD_C a
2 123123 FIELD_D 23423
3 123124 FIELD_B 7
4 123124 FIELD_C c
5 123124 FIELD_D 6464
6 123144 FIELD_B 99
7 123144 FIELD_C x
8 123144 FIELD_D 234
I hope this helps to achieve your desired output!
We can stretch it further:
# Melt to long format, then split each value into a numeric or a string
# column depending on its Python type.
melted = df.melt(id_vars='FIELD_A', var_name='FIELD_NAME', value_name='FIELD_VALUE')
melted['FIELD_NAME'] = melted['FIELD_NAME'].str[-1]   # keep only the suffix letter
is_int = melted['FIELD_VALUE'].map(type) == int       # ints -> NUM, everything else -> STR
melted['FIELD_NUM_VALUE'] = melted['FIELD_VALUE'].where(is_int)
melted['FIELD_STR_VALUE'] = np.where(melted['FIELD_NUM_VALUE'].isna(),
                                     melted['FIELD_VALUE'], np.nan)
melted = melted.drop(columns='FIELD_VALUE').sort_values(by='FIELD_A', ignore_index=True)
melted
FIELD_A FIELD_NAME FIELD_NUM_VALUE FIELD_STR_VALUE
0 123123 B 8 NaN
1 123123 C NaN a
2 123123 D 23423 NaN
3 123124 B 7 NaN
4 123124 C NaN c
5 123124 D 6464 NaN
6 123144 B 99 NaN
7 123144 C NaN x
8 123144 D 234 NaN
Another option, with pd.Series.str.extract
:
# Melt to long format, then route each value into FIELD_STR_VALUE or
# FIELD_NUM_VALUE using named capture groups in a single regex.
temp = df.melt(id_vars='FIELD_A', var_name='FIELD_NAME', value_name='FIELD_VALUE')
temp = temp.assign(FIELD_NAME = lambda df: df.FIELD_NAME.str[-1])
# \D+ captures non-digit (string) values; \d+ captures digit-only (numeric)
# values. (Without the backslashes the pattern would match literal 'D'/'d'
# characters and every extraction would be NaN.)
regex = r"(?P<FIELD_STR_VALUE>\D+)|(?P<FIELD_NUM_VALUE>\d+)"
extract = temp.FIELD_VALUE.astype(str).str.extract(regex)
temp.drop(columns='FIELD_VALUE').assign(**extract)
FIELD_A FIELD_NAME FIELD_STR_VALUE FIELD_NUM_VALUE
0 123123 B NaN 8
1 123124 B NaN 7
2 123144 B NaN 99
3 123123 C a NaN
4 123124 C c NaN
5 123144 C x NaN
6 123123 D NaN 23423
7 123124 D NaN 6464
8 123144 D NaN 234
You can also try this:
# Melt, derive FIELD_NAME from the column-name suffix, then split each value
# into FIELD_STR_VALUE (non-digits) or FIELD_NUM_VALUE (digits) via
# str.extract with named groups. Raw strings with \w/\D/\d are required —
# without the backslashes the patterns match literal letters and fail.
(df.melt('FIELD_A')
 .pipe(lambda d: d[['FIELD_A', 'value']].join(d.variable.str.extract(r'\w+_(?P<FIELD_NAME>\w+)')))
 .pipe(lambda g: g[['FIELD_A', 'FIELD_NAME']].join(g.value.astype(str).str.extract(r'(?P<FIELD_STR_VALUE>\D+)|(?P<FIELD_NUM_VALUE>\d+)')))
 .sort_values('FIELD_A'))
FIELD_A FIELD_NAME FIELD_STR_VALUE FIELD_NUM_VALUE
0 123123 B NaN 8
3 123123 C a NaN
6 123123 D NaN 23423
1 123124 B NaN 7
4 123124 C c NaN
7 123124 D NaN 6464
2 123144 B NaN 99
5 123144 C x NaN
8 123144 D NaN 234
I have pandas DataFrame
A. I am struggling transforming this into my desired format, see DataFrame
B. I tried pivot
or melt
but I am not sure how I could make it conditional (string
values to FIELD_STR_VALUE
, numeric
values to FIELD_NUM_VALUE
). I was hoping you could point me in the right direction.
A: Input DataFrame
|FIELD_A |FIELD_B |FIELD_C |FIELD_D |
|--------|--------|--------|--------|
|123123 |8 |a |23423 |
|123124 |7 |c |6464 |
|123144 |99 |x |234 |
B: Desired output DataFrame
|ID |FIELD_A |FIELD_NAME |FIELD_STR_VALUE |FIELD_NUM_VALUE |
|---|--------|-----------|----------------|----------------|
|1 |123123 |B | |8 |
|2 |123123 |C |a | |
|3 |123123 |D | |23423 |
|4 |123124 |B | |7 |
|5 |123124 |C |c | |
|6 |123124 |D | |6464 |
|7 |123144 |B | |99 |
|8 |123144 |C |x | |
|9 |123144 |D | |234 |
You can use:
# To get custom STR/NUM labels instead of raw dtypes, map them first:
# dic = {np.int64: 'NUM', object: 'STR'}
tmp = df.set_index('FIELD_A')
# Pair each column with its dtype in a 2-level header, then stack the
# name level into the index so one dtype-keyed column remains per value.
tmp = tmp.set_axis(
    pd.MultiIndex.from_arrays(
        [tmp.columns, tmp.dtypes],   # or [tmp.columns, tmp.dtypes.map(dic)] for custom names
        names=['FIELD_NAME', None]),
    axis=1)
tmp.stack(0).add_prefix('FIELD_').add_suffix('_VALUE').reset_index()
NB. if you really want STR/NUM, map
those strings from the dtypes (see comments in code).
Output:
FIELD_A FIELD_NAME FIELD_int64_VALUE FIELD_object_VALUE
0 123123 FIELD_B 8.0 NaN
1 123123 FIELD_C NaN a
2 123123 FIELD_D 23423.0 NaN
3 123124 FIELD_B 7.0 NaN
4 123124 FIELD_C NaN c
5 123124 FIELD_D 6464.0 NaN
6 123144 FIELD_B 99.0 NaN
7 123144 FIELD_C NaN x
8 123144 FIELD_D 234.0 NaN
df.melt(id_vars='FIELD_A', var_name='FIELD_NAME', value_name='FIELD_VALUE').sort_values(by='FIELD_A').reset_index(drop=True)
outputs
FIELD_A FIELD_NAME FIELD_VALUE
0 123123 FIELD_B 8
1 123123 FIELD_C a
2 123123 FIELD_D 23423
3 123124 FIELD_B 7
4 123124 FIELD_C c
5 123124 FIELD_D 6464
6 123144 FIELD_B 99
7 123144 FIELD_C x
8 123144 FIELD_D 234
I hope this helps to achieve your desired output!
We can stretch it further:
# Melt to long format, then split each value into a numeric or a string
# column depending on its Python type.
melted = df.melt(id_vars='FIELD_A', var_name='FIELD_NAME', value_name='FIELD_VALUE')
melted['FIELD_NAME'] = melted['FIELD_NAME'].str[-1]   # keep only the suffix letter
is_int = melted['FIELD_VALUE'].map(type) == int       # ints -> NUM, everything else -> STR
melted['FIELD_NUM_VALUE'] = melted['FIELD_VALUE'].where(is_int)
melted['FIELD_STR_VALUE'] = np.where(melted['FIELD_NUM_VALUE'].isna(),
                                     melted['FIELD_VALUE'], np.nan)
melted = melted.drop(columns='FIELD_VALUE').sort_values(by='FIELD_A', ignore_index=True)
melted
FIELD_A FIELD_NAME FIELD_NUM_VALUE FIELD_STR_VALUE
0 123123 B 8 NaN
1 123123 C NaN a
2 123123 D 23423 NaN
3 123124 B 7 NaN
4 123124 C NaN c
5 123124 D 6464 NaN
6 123144 B 99 NaN
7 123144 C NaN x
8 123144 D 234 NaN
Another option, with pd.Series.str.extract
:
# Melt to long format, then route each value into FIELD_STR_VALUE or
# FIELD_NUM_VALUE using named capture groups in a single regex.
temp = df.melt(id_vars='FIELD_A', var_name='FIELD_NAME', value_name='FIELD_VALUE')
temp = temp.assign(FIELD_NAME = lambda df: df.FIELD_NAME.str[-1])
# \D+ captures non-digit (string) values; \d+ captures digit-only (numeric)
# values. (Without the backslashes the pattern would match literal 'D'/'d'
# characters and every extraction would be NaN.)
regex = r"(?P<FIELD_STR_VALUE>\D+)|(?P<FIELD_NUM_VALUE>\d+)"
extract = temp.FIELD_VALUE.astype(str).str.extract(regex)
temp.drop(columns='FIELD_VALUE').assign(**extract)
FIELD_A FIELD_NAME FIELD_STR_VALUE FIELD_NUM_VALUE
0 123123 B NaN 8
1 123124 B NaN 7
2 123144 B NaN 99
3 123123 C a NaN
4 123124 C c NaN
5 123144 C x NaN
6 123123 D NaN 23423
7 123124 D NaN 6464
8 123144 D NaN 234
You can also try this:
# Melt, derive FIELD_NAME from the column-name suffix, then split each value
# into FIELD_STR_VALUE (non-digits) or FIELD_NUM_VALUE (digits) via
# str.extract with named groups. Raw strings with \w/\D/\d are required —
# without the backslashes the patterns match literal letters and fail.
(df.melt('FIELD_A')
 .pipe(lambda d: d[['FIELD_A', 'value']].join(d.variable.str.extract(r'\w+_(?P<FIELD_NAME>\w+)')))
 .pipe(lambda g: g[['FIELD_A', 'FIELD_NAME']].join(g.value.astype(str).str.extract(r'(?P<FIELD_STR_VALUE>\D+)|(?P<FIELD_NUM_VALUE>\d+)')))
 .sort_values('FIELD_A'))
FIELD_A FIELD_NAME FIELD_STR_VALUE FIELD_NUM_VALUE
0 123123 B NaN 8
3 123123 C a NaN
6 123123 D NaN 23423
1 123124 B NaN 7
4 123124 C c NaN
7 123124 D NaN 6464
2 123144 B NaN 99
5 123144 C x NaN
8 123144 D NaN 234