create dataframe in pandas using multilevel dict dynamic

Question

I am fetching data from an API and trying to write the response to a CSV file, but there is one catch: the response is a multilevel dict (JSON), so when I convert it to CSV most of the columns end up containing dicts or lists of dicts.
This is what I have tried:

def expand(data):
    """Flatten an arbitrarily nested dict/list into a flat ``pd.Series``.

    Every leaf value is stored under its dotted path, e.g.
    ``{"a": {"b": 1}, "c": [{"d": 2}]}`` becomes ``a.b -> 1`` and
    ``c.0.d -> 2``.  List positions are used as path components.  Empty
    dicts and empty lists contribute no entries.

    Keyword arguments:
    -----------------
    data -- a dict, list or tuple to flatten; any other value is wrapped
            with ``pd.Series(data)`` and returned as is

    Return: pd.Series
    object-dtype Series indexed by the dotted paths, in document order
    """
    if not isinstance(data, (dict, list, tuple)):
        return pd.Series(data)

    flat = {}
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and therefore emitted) in their original order.
    pending = [("", data)]
    while pending:
        prefix, value = pending.pop()
        if isinstance(value, dict):
            children = list(value.items())
        elif isinstance(value, (list, tuple)):
            children = list(enumerate(value))
        else:
            flat[prefix] = value
            continue
        for key, child in reversed(children):
            # Full paths keep labels unique: several nested objects reuse
            # the same key names (codeValue, shortName, ...).
            path = f"{prefix}.{key}" if prefix else str(key)
            pending.append((path, child))

    # dtype=object keeps mixed leaf types intact and gives an empty result a
    # well-defined dtype.
    return pd.Series(flat, dtype=object)
# Call expand on every value of the 'person' column; the result is not
# assigned to anything here.
df['person'].apply(expand)

But this solution is not working. If we look at the person column, it contains multiple dicts and lists of dicts, like this:

        "birthDate": "0000-00-00",
        "genderCode": {
            "codeValue": "M",
            "shortName": "Male",
            "longName": "Male"
        },
        "maritalStatusCode": {
            "codeValue": "M",
            "shortName": "Married"
        },
        "disabledIndicator": False,
        "preferredName": {},
        "ethnicityCode": {
            "codeValue": "4",
            "shortName": "4",
            "longName": "Not Hispanic or Latino"
        },
        "raceCode": {
            "identificationMethodCode": {},
            "codeValue": "1",
            "shortName": "White",
            "longName": "White"
        },
        "militaryClassificationCodes": [],
        "governmentIDs": [
            {
                "itemID": "9200037107708_4385",
                "idValue": "XXX-XX-XXXX",
                "nameCode": {
                    "codeValue": "SSN",
                    "longName": "Social Security Number"
                },
                "countryCode": "US"
            }
        ],
        "legalName": {
            "givenName": "Jack",
            "middleName": "C",
            "familyName1": "Abele",
            "formattedName": "Abele, Jack C"
        },
        "legalAddress": {
            "nameCode": {
                "codeValue": "Personal Address 1",
                "shortName": "Personal Address 1",
                "longName": "Personal Address 1"
            },
            "lineOne": "1932 Keswick Lane",
            "cityName": "Concord",
            "countrySubdivisionLevel1": {
                "subdivisionType": "StateTerritory",
                "codeValue": "CA",
                "shortName": "California"
            },
            "countryCode": "US",
            "postalCode": "94518"
        },
        "communication": {
            "mobiles": [
                {
                    "itemID": "9200037107708_4389",
                    "nameCode": {
                        "codeValue": "Personal Cell",
                        "shortName": "Personal Cell"
                    },
                    "countryDialing": "1",
                    "areaDialing": "925",
                    "dialNumber": "6860589",
                    "access": "1",
                    "formattedNumber": "(925) 686-0589"
                }
            ]
        }
    }

Your suggestions and advice would be very helpful.

Asked By: Noman

||

Source

Answer 1

I think we can handle the nested dicts by reading the response with pd.json_normalize, and the lists of dicts with the function below. First we find the columns that contain lists.

def _parse_list_cell(value):
    """Return one cell as a Python object ready for ``DataFrame.explode``."""
    if isinstance(value, str):
        text = value.strip()
        # A blank string stands for "no items"; anything else must be the
        # literal representation of a list/dict.
        return ast.literal_eval(text) if text else []
    if isinstance(value, (list, tuple, dict)):
        # Already parsed: no string round-trip needed.
        return value
    try:
        missing = bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-like cells have no single truth value; leave them untouched.
        missing = False
    return [] if missing else value


def df_list_and_dict_col(explode_df: pd.DataFrame, primary_key: str,
                         col_name: str, folder: str) -> pd.DataFrame:
    """Expand a column holding lists of dicts into a flat frame and save it.

    Each list item becomes its own row, the dict keys become columns (nested
    dicts are flattened by ``pd.json_normalize``), and the result is written
    to ``{folder}/{col_name}.csv``.  Any resulting column that still contains
    lists is handed to this function recursively (together with
    ``primary_key``), written to its own CSV and removed from this frame.

    Keyword arguments:
    -----------------
    explode_df  -- dataframe containing ``primary_key`` and ``col_name``;
                   it is copied, the caller's frame is not modified
    primary_key -- name of the key column carried into every output file
    col_name    -- name of the column to expand; also the CSV file name
    folder      -- output directory, created if it does not exist

    Return: pd.DataFrame
    the expanded dataframe that was written to ``{folder}/{col_name}.csv``
    """
    import os

    # Work on a copy so that neither the caller's frame nor a column slice
    # passed in by the recursive call is written to.
    explode_df = explode_df.copy()
    explode_df[col_name] = explode_df[col_name].apply(_parse_list_cell)
    explode_df = explode_df.explode(col_name).reset_index(drop=True)

    # Rows produced by empty lists (or holding non-dict items) have nothing
    # to normalize; an empty record keeps them aligned with the key column.
    records = [item if isinstance(item, dict) else {}
               for item in explode_df[col_name]]
    normalized_df = pd.json_normalize(records)

    # Drop the source column before joining so that a normalized key with
    # the same name cannot clash with it.
    explode_df = explode_df.drop(columns=col_name).join(
        other=normalized_df,
        lsuffix="_left",
        rsuffix="_right"
    )

    # Columns that still hold lists go to their own CSV files.
    col_list = [
        col for col in explode_df.columns
        if (explode_df[col].map(type) == list).any()
    ]
    for col in col_list:
        df_list_and_dict_col(explode_df[[primary_key, col]], primary_key,
                             col, folder)
        explode_df = explode_df.drop(columns=col)

    if folder:
        os.makedirs(folder, exist_ok=True)
    explode_df.to_csv(f'{folder}/{col_name}.csv')
    print(f'{col_name}.csv is done')
    return explode_df

First we find the list columns and pass them to the function one by one; the function then checks whether any list is nested inside that column, repeats the process for it if so, and saves each result to CSV.

# Build a frame holding the Python type of every cell. Series.map is applied
# per column because DataFrame.applymap is deprecated in recent pandas.
type_df = df.apply(lambda column: column.map(type))

# Columns in which at least one cell is a list.
col_list = [col for col in type_df.columns if (type_df[col] == list).any()]

for col in col_list:
    # Each list column is written to its own CSV under 'worker', keyed by
    # 'primary_key', and then removed from the main frame.
    df_list_and_dict_col(df[['primary_key', col]].copy(), 'primary_key', col, folder='worker')
    df.drop(columns=col, inplace=True)

Now you have multiple CSV files in normalized form.

Answered By: Noman

create dataframe in pandas using multilevel dict dynamic

Question:

Answers: