data etl process

Question:

I have the following data and want to validate whether each of its values is an integer, a float or a string
attribute::: id metric
attribute::: name points
attribute::: cake_name None
attribute::: time None
attribute::: time None
["key ‘id’ is not a string, got None", "key ‘metric’ is not a integer, got <class ‘str’>", "key ‘points’ is not a integer, got <class ‘NoneType’>"]

Asked By: Alexander

||

Answers:

My solution is a recursive solution that reads nested JSON data.

from functools import partial
from typing import Union, Callable
import json

def get_output(key, val, string_keys: list, int_keys: list, float_keys: list):
    """Check one key/value pair against the type expected for that key.

    The expected type is chosen by the first of ``string_keys``,
    ``int_keys`` and ``float_keys`` (in that order) that contains ``key``.

    Args:
        key: Name of the field being checked.
        val: Value found for that field.
        string_keys: Keys whose value must be a ``str``.
        int_keys: Keys whose value must be an ``int`` (booleans are rejected).
        float_keys: Keys whose value must be a ``float``.

    Returns:
        An error message if ``val`` has the wrong type, otherwise ``None``.
        Keys that appear in none of the three lists are never checked and
        always yield ``None``.
    """
    out = None
    if key in string_keys:
        if not isinstance(val, str):
            out = f"key '{key}' is not a string, got {type(val)}"
    elif key in int_keys:
        # bool is a subclass of int, so a plain isinstance(val, int) would
        # let True/False through as valid integers.
        if isinstance(val, bool) or not isinstance(val, int):
            out = f"key '{key}' is not a integer, got {type(val)}"
    elif key in float_keys:
        if not isinstance(val, float):
            out = f"key '{key}' is not a float, got {type(val)}"
    return out

def explore_json(json: Union[dict, list], validator: Callable):
    """Walk a nested dict/list structure and collect validator messages.

    Dict values that are themselves dicts or lists are descended into;
    every other dict value is passed to ``validator(key, value)``. List
    elements are descended into one by one, so scalars sitting directly in
    a list are never handed to the validator.

    Args:
        json: The decoded structure to inspect. Note that inside this
            function the name refers to the argument, not the ``json`` module.
        validator: Callable taking ``(key, value)`` and returning a message,
            or ``None`` when the value is acceptable.

    Returns:
        A list of all non-``None`` validator results, in traversal order.
    """
    if isinstance(json, list):
        return [
            message
            for element in json
            for message in explore_json(element, validator)
        ]
    if not isinstance(json, dict):
        # Anything that is neither a dict nor a list contributes nothing.
        return []

    messages = []
    for name, value in json.items():
        if isinstance(value, (dict, list)):
            messages += explore_json(value, validator)
            continue
        verdict = validator(name, value)
        if verdict is not None:
            messages.append(verdict)
    return messages

# Bind the key lists once so the resulting validator only needs (key, value).
# It has to exist before explore_json is called with it; calling
# explore_json first would raise NameError on `validator`.
validator = partial(get_output,
                    string_keys=["id", "name", "cake_name", "time"],
                    int_keys=['metric', 'points'],
                    float_keys=["LA:TB2342", "LA:TB2341", "LA:TB2344"])
# NOTE(review): json_data is not defined in this snippet; presumably it holds
# the JSON text to validate -- confirm against the question's data.
data = json.loads(json_data)
explore_json(data, validator)

The output of this is:

["key 'id' is not a string, got <class 'NoneType'>",
 "key 'metric' is not a integer, got <class 'str'>",
 "key 'LA:TB2342' is not a float, got <class 'str'>"]

The advantage of using partial is that we can build a dedicated validator for each specific JSON.

Moreover, note that only the keys listed in string_keys, int_keys or float_keys of our specific validator can appear in the output list; any key not in these lists is not verified.

Finally, I’m not sure if the lists are the same as yours, but just change them and check the output.

EDIT For tracking parent key:


def explore_json(json: Union[dict, list], validator: Callable, parent_key=" parent_key:"):
    """Walk a nested dict/list structure and collect validator messages with their path.

    Works like a plain recursive walk, but each recursion step extends
    ``parent_key`` with ``.<key>`` for dict values and ``.item<index>`` for
    list elements (indices start at 0). Messages produced below the top
    level get that accumulated path appended; top-level messages are left
    untouched.

    Args:
        json: The decoded structure to inspect. Inside this function the
            name refers to the argument, not the ``json`` module.
        validator: Callable taking ``(key, value)`` and returning a message,
            or ``None`` when the value is acceptable.
        parent_key: Path prefix accumulated so far; the default marks the
            top level.

    Returns:
        A list of all non-``None`` validator results, in traversal order.
    """
    # The default value doubles as the "still at the top level" marker.
    root_marker = " parent_key:"
    messages = []
    if isinstance(json, dict):
        for name, value in json.items():
            if isinstance(value, (dict, list)):
                messages += explore_json(value, validator, f"{parent_key}.{name}")
                continue
            verdict = validator(name, value)
            if verdict is None:
                continue
            if parent_key != root_marker:
                verdict += parent_key
            messages.append(verdict)
    elif isinstance(json, list):
        for position, element in enumerate(json):
            messages += explore_json(element, validator, f"{parent_key}.item{position}")
    return messages

output:

["key 'id' is not a string, got <class 'NoneType'>",
 "key 'metric' is not a integer, got <class 'str'>",
 "key 'LA:TB2342' is not a float, got <class 'str'> parent_key:.anticipations.item1.top_properties"]

item1 indicates that the error is in the element at index 1 (the second element, since numbering starts at 0) of the list under the key anticipations

Answered By: Lucas M. Uriarte