How to identify records whose array column contains a given value in a DataFrame
Question:
my dataframe df looks like this
RepID Arr
0 3 [E83, J99]
1 5 [P81, G39, E34]
2 6 [R19]
...
...
1857 8885 [G39, F14, T22]
I want to find all records that have the value "G39" in the array
RepID Arr
1 5 [P81, G39, E34]
1857 8885 [G39, F14, T22]
I tried this but it did not work
np.where(df["Arr"]=="G39")
I get an empty result:
(array([], dtype=int64),)
How to get that list?
Answers:
You can use a list comprehension to iterate over the array column, build a boolean mask, and use that mask to select the matching DataFrame records:
import pandas as pd
# Each record pairs a RepID with its list of codes.
data = [
    (3, ["E83", "J99"]),
    (5, ["P81", "G39", "E34"]),
    (6, ["R19"]),
    (8885, ["G39", "F14", "T22"]),
]
df = pd.DataFrame(data, columns=["RepID", "Arr"])

# One boolean per row: True when that row's list contains "G39".
# Comparing the column with == would test each whole list against the
# string, which is why a per-row membership test is needed.
has_g39 = ["G39" in codes for codes in df["Arr"]]
df[has_g39]
>> RepID Arr
>> 1 5 [P81, G39, E34]
>> 3 8885 [G39, F14, T22]
The code below builds both a dictionary and a pandas DataFrame containing the requested records:
import pandas as pd
from pprint import pprint
# Sample table laid out the way pandas prints a DataFrame. Columns are
# separated by two or more spaces so that the single spaces inside the
# bracketed lists are not mistaken for column boundaries.
df_print = """
   RepID  Arr
0  3      [E83, J99]
1  5      [P81, G39, E34]
2  6      [R19]
3  8885   [G39, F14, T22]"""

# Write the table to disk; the context manager closes the file so the
# text is flushed before it is read back.
with open('df_print', 'w') as table_file:
    table_file.write(df_print)

# The separator is a regex matching two or more whitespace characters.
df = pd.read_table('df_print', sep=r'\s\s\s*', index_col=0)

# Collect the matching rows keyed by their index label.
dct_rows = {}
for index, row in df.iterrows():
    # "Arr" was read from text, so each value is a string such as
    # "[P81, G39, E34]" and this is a substring test.
    if "G39" in row["Arr"]:
        dct_rows[index] = {'RepID': row['RepID'], "Arr": row["Arr"]}

print("dct_rows = \n", end='')
pprint(dct_rows)
print("---")

# Rebuild a DataFrame from the collected rows, one row per dict key.
df_G39 = pd.DataFrame.from_dict(dct_rows, orient='index')
print("df_G39_print = \"\"\"")
print(df_G39, end='"""\n')
This prints:
dct_rows =
{1: {'Arr': '[P81, G39, E34]', 'RepID': 5},
3: {'Arr': '[G39, F14, T22]', 'RepID': 8885}}
---
df_G39_print = """
RepID Arr
1 5 [P81, G39, E34]
3 8885 [G39, F14, T22]"""
my dataframe df looks like this
RepID Arr
0 3 [E83, J99]
1 5 [P81, G39, E34]
2 6 [R19]
...
...
1857 8885 [G39, F14, T22]
I want to find all records that have the value "G39" in the array
RepID Arr
1 5 [P81, G39, E34]
1857 8885 [G39, F14, T22]
I tried this but it did not work
np.where(df["Arr"]=="G39")
I get an empty result:
(array([], dtype=int64),)
How to get that list?
You can use a list comprehension to iterate over the array column, build a boolean mask, and use that mask to select the matching DataFrame records:
import pandas as pd
# Each record pairs a RepID with its list of codes.
data = [
    (3, ["E83", "J99"]),
    (5, ["P81", "G39", "E34"]),
    (6, ["R19"]),
    (8885, ["G39", "F14", "T22"]),
]
df = pd.DataFrame(data, columns=["RepID", "Arr"])

# One boolean per row: True when that row's list contains "G39".
# Comparing the column with == would test each whole list against the
# string, which is why a per-row membership test is needed.
has_g39 = ["G39" in codes for codes in df["Arr"]]
df[has_g39]
>> RepID Arr
>> 1 5 [P81, G39, E34]
>> 3 8885 [G39, F14, T22]
The code below builds both a dictionary and a pandas DataFrame containing the requested records:
import pandas as pd
from pprint import pprint
# Sample table laid out the way pandas prints a DataFrame. Columns are
# separated by two or more spaces so that the single spaces inside the
# bracketed lists are not mistaken for column boundaries.
df_print = """
   RepID  Arr
0  3      [E83, J99]
1  5      [P81, G39, E34]
2  6      [R19]
3  8885   [G39, F14, T22]"""

# Write the table to disk; the context manager closes the file so the
# text is flushed before it is read back.
with open('df_print', 'w') as table_file:
    table_file.write(df_print)

# The separator is a regex matching two or more whitespace characters.
df = pd.read_table('df_print', sep=r'\s\s\s*', index_col=0)

# Collect the matching rows keyed by their index label.
dct_rows = {}
for index, row in df.iterrows():
    # "Arr" was read from text, so each value is a string such as
    # "[P81, G39, E34]" and this is a substring test.
    if "G39" in row["Arr"]:
        dct_rows[index] = {'RepID': row['RepID'], "Arr": row["Arr"]}

print("dct_rows = \n", end='')
pprint(dct_rows)
print("---")

# Rebuild a DataFrame from the collected rows, one row per dict key.
df_G39 = pd.DataFrame.from_dict(dct_rows, orient='index')
print("df_G39_print = \"\"\"")
print(df_G39, end='"""\n')
This prints:
dct_rows =
{1: {'Arr': '[P81, G39, E34]', 'RepID': 5},
3: {'Arr': '[G39, F14, T22]', 'RepID': 8885}}
---
df_G39_print = """
RepID Arr
1 5 [P81, G39, E34]
3 8885 [G39, F14, T22]"""