How to parse a dataframe column having dynamic records and write it into different columns by maintaining the relation with the table
Question:
# NOTE(review): this is the broken first attempt quoted in the question —
# the flattening logic below never reads the actual column values.
cur = snow_connection.cursor()
data = snow_connection.cursor().execute("SELECT x,y,z FROM abc.testing_view2").fetchall()
df = pd.DataFrame(data)
df.columns = ['x','y','z']
print(df)
# Third column holds the JSON-encoded portfolio list (as text).
temp_col=df.iloc[:,2]
print(temp_col)
# BUG: the three nested range loops iterate over row indices of each column
# but never use i/j to index the frame, so the body runs len**3 times.
for i in range (len(df.iloc[:,0])):
for j in range(len(df.iloc[:,1])):
for k in range(len(temp_col)):
# BUG: json.loads(str(k)) parses the loop counter (an int), not temp_col[k];
# wrapping the result in str() then makes json_dict a plain string.
json_dict = str(json.loads(str(k)))
print(json_dict)
# accessing values in the dictionary
# BUG: json_dict is a string here, so [0]/[1] pick single characters,
# not the 'name'/'portfolio_id' fields.
portfolio_name = str(json_dict[0])
portfolio_id = str(json_dict[1])
# BUG: these loops iterate over the characters of the two strings.
for l in range(len(portfolio_name)):
for m in range(len(portfolio_id)):
# created a new dictionary with all three columns
new_dict = {'name': portfolio_name, 'portfolio_id': portfolio_id,'x': ['X'], 'y': ['Y']}
print(new_dict)
# BUG: new_data is re-created on every iteration, so new_df ends up with at
# most the single dict from the last pass.
new_data = []
new_data.append(new_dict)
new_df = pd.DataFrame(new_data)
The problem is that column z (which holds a JSON record with name and id) has a dynamic number of records: it may contain a single record with name and id, or more than one, but each set is mapped to only one x and y. So I have to write this data into multiple rows: if column z contains one record, the output should have four columns (x, y, name, and id) in a single row, and if column z contains two records, it should produce two rows.
I am iterating over this column and writing it into a dictionary to maintain the relation; however, it is not reading the JSON dict I created for name and id.
Sample Data:
{
"document_portfolio": "[{"name": "MWP FT Income ICP", "portfolio_id": "31a01afd-1f69-e617-ade3-d7c1895c4461"}, {"name": "MWP Tactical ETP IMG", "portfolio_id": "13281ca6-a9a7-d361-1cf2-07e6fa3e283a"}]",
"document_uuid": "28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3",
"user_id": "00u9vj92B0ZPUMU9b5d5"
}
{
"document_portfolio": "[{"name": "tesying", "portfolio_id": "59d26651-3484-e7ef-9ece-f7194d7639e0"}]","document_uuid": "1cf1ca8e-f0e9-844b-11d6-0d05302fb777",
"user_id": "00u5flkeths3G668k5d7"
}
Expected output :
portfolio_id name portfolio_uuid user_id
31a01afd-1f69-e617-ade3-d7c1895c4461 MWP FT Income ICP 28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3 00u9vj92B0ZPUMU9b5d5
13281ca6-a9a7-d361-1cf2-07e6fa3e283a MWP Tactical ETP IMG 28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3 00u9vj92B0ZPUMU9b5d5
Sample Snowflake logic:
-- View that flattens the JSON portfolio array into one row per portfolio entry.
-- Fixes vs. the original sketch: LATERAL FLATTEN belongs in the FROM clause
-- (not the SELECT list), array elements are read through the flatten output's
-- `value` column, and the trailing comma before FROM was a syntax error.
CREATE OR REPLACE VIEW port_test_vw AS
SELECT
    flattened.value:name::string AS portfolio_name,
    flattened.value:portfolio_id::string AS portfolio_id
FROM XYZ,
    LATERAL FLATTEN(input => PARSE_JSON(data:document_portfolio)) flattened;
Answers:
# NOTE(review): intermediate attempt quoted for reference — see inline flags.
# BUG: the three outer range loops never use i/j/s in the body, so the whole
# traversal is repeated len**3 times.
for i in range (len(df.iloc[:,0])):
for j in range(len(df.iloc[:,1])):
for s in range(len(df.iloc[:,2])):
for index,row in df.iterrows():
json_dict = row['DOCUMENT_PORTFOLIO']
data = json.loads(json_dict)
# NOTE(review): rebinding i and row here shadows the outer loop variables.
for i,row in enumerate(data):
#for j in range(len(df['DOCUMENT_UUID'])):
# BUG: handles single-entry arrays only; multi-entry documents hit exit().
if len(data) == 1:
result_dic = {}
portfolio_name = row['name']
result_dic[i] = portfolio_name
print(portfolio_name)
portfolio_id = row['portfolio_id']
# BUG: same key i — this overwrites the name stored just above.
result_dic[i] = portfolio_id
print(portfolio_id)
# BUG: stores the whole DOCUMENT_UUID column (a Series), not this row's value.
DOCUMENT_UUID = df['DOCUMENT_UUID']
result_dic[j] = DOCUMENT_UUID
print(DOCUMENT_UUID)
else:
# BUG: exit() terminates the whole process on the first multi-entry array.
exit()
output:
This is the output when I am not putting condition on length
Ali Mahbod – $1M Valued Client-Copy
5efbd6c7-2abe-5e78-b035-1cfffc18b9cd
[{‘name’: ‘Falip Large Caps 1’, ‘portfolio_id’: ‘7c3b5cd5-b788-8667-23b2-d25fb110e525’}, {‘name’: ‘Falip Large Caps 2’, ‘portfolio_id’: ‘7c464968-91e8-a4d5-9756-07efb0d0c7b6’}]
Falip Large Caps 1
7c3b5cd5-b788-8667-23b2-d25fb110e525
Falip Large Caps 2
7c464968-91
However on adding condition on length I am getting this error:
Traceback (most recent call last):
File "Document_custom_field.py", line 102, in
for index,row in name.iterrows():
AttributeError: ‘str’ object has no attribute ‘iterrows’
I tried converting it into a DataFrame again and loading the JSON again, but it didn't work.
For now I am exiting the loop if the length is more than one; however, it is not writing the records into one single DataFrame according to the length condition.
I have fixed the above issue. If you are facing the same problem, you can reuse this logic:
# Flatten the Snowflake result set: one output row per portfolio entry found
# in the DOCUMENT_PORTFOLIO JSON array, repeating the document-level columns.
#
# Fixes vs. the original:
#   * DataFrame.append was deprecated and removed in pandas 2.0; rows are
#     collected in a plain list and turned into a DataFrame once.
#   * Flattened rows are no longer appended onto the raw 4-column frame,
#     which mixed two schemas (the raw rows kept DOCUMENT_PORTFOLIO and got
#     NaN in PORTFOLIO_NAME/PORTFOLIO_ID, and all of that went to the table).
flattened_rows = []
for document_id, document_uuid, user_id, portfolio_json in data:
    # DOCUMENT_PORTFOLIO holds a JSON-encoded list of
    # {"name": ..., "portfolio_id": ...} dicts — one output row per element.
    for item in json.loads(portfolio_json):
        flattened_rows.append({
            'DOCUMENT_ID': document_id,
            'DOCUMENT_UUID': document_uuid,
            'USER_ID': user_id,
            'PORTFOLIO_NAME': item['name'],
            'PORTFOLIO_ID': item['portfolio_id'],
        })
df = pd.DataFrame(
    flattened_rows,
    columns=['DOCUMENT_ID', 'DOCUMENT_UUID', 'USER_ID',
             'PORTFOLIO_NAME', 'PORTFOLIO_ID'],
)
table_name = 'XYZ'
with snow_connection.cursor() as cursor:
    cursor.execute(
        f"CREATE OR REPLACE TABLE {table_name}"
        "(DOCUMENT_ID INT,DOCUMENT_UUID TEXT,USER_ID TEXT,"
        "PORTFOLIO_NAME TEXT,PORTFOLIO_ID TEXT)"
    )
# NOTE(review): DataFrame.to_sql expects a SQLAlchemy engine/connection; with
# a raw snowflake-connector connection, snowflake's write_pandas is the usual
# route — verify which connection type snow_connection actually is.
df.to_sql(table_name, snow_connection, index=False, if_exists='append')
Here is a Snowflake SQL syntax for you:
-- Demo query: explode the document_portfolio array of one hard-coded JSON
-- document into one row per {name, portfolio_id} entry.
WITH sample_doc AS (
    SELECT PARSE_JSON('{
"document_portfolio": [{"name": "MWP FT Income ICP", "portfolio_id": "31a01afd-1f69-e617-ade3-d7c1895c4461"}, {"name": "MWP Tactical ETP IMG", "portfolio_id": "13281ca6-a9a7-d361-1cf2-07e6fa3e283a"}],
"document_uuid": "28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3",
"user_id": "00u9vj92B0ZPUMU9b5d5"
}')::VARIANT AS doc
)
SELECT entry.value:name::STRING AS portfolio_name,
       entry.value:portfolio_id::STRING AS portfolio_id
FROM sample_doc,
     LATERAL FLATTEN(input => doc:document_portfolio) entry;

-- Same flattening packaged as a reusable view over pcs.document.
CREATE OR REPLACE VIEW document_testing_view222
AS SELECT
    data:document_portfolio::STRING AS document_portfolio,
    pf.value:name::STRING AS portfolio_name,
    pf.value:portfolio_id::STRING AS portfolio_id
FROM pcs.document,
     LATERAL FLATTEN(input => PARSE_JSON(data:document_portfolio):document_portfolio) pf;
# NOTE(review): this is the broken first attempt quoted in the question —
# the flattening logic below never reads the actual column values.
cur = snow_connection.cursor()
data = snow_connection.cursor().execute("SELECT x,y,z FROM abc.testing_view2").fetchall()
df = pd.DataFrame(data)
df.columns = ['x','y','z']
print(df)
# Third column holds the JSON-encoded portfolio list (as text).
temp_col=df.iloc[:,2]
print(temp_col)
# BUG: the three nested range loops iterate over row indices of each column
# but never use i/j to index the frame, so the body runs len**3 times.
for i in range (len(df.iloc[:,0])):
for j in range(len(df.iloc[:,1])):
for k in range(len(temp_col)):
# BUG: json.loads(str(k)) parses the loop counter (an int), not temp_col[k];
# wrapping the result in str() then makes json_dict a plain string.
json_dict = str(json.loads(str(k)))
print(json_dict)
# accessing values in the dictionary
# BUG: json_dict is a string here, so [0]/[1] pick single characters,
# not the 'name'/'portfolio_id' fields.
portfolio_name = str(json_dict[0])
portfolio_id = str(json_dict[1])
# BUG: these loops iterate over the characters of the two strings.
for l in range(len(portfolio_name)):
for m in range(len(portfolio_id)):
# created a new dictionary with all three columns
new_dict = {'name': portfolio_name, 'portfolio_id': portfolio_id,'x': ['X'], 'y': ['Y']}
print(new_dict)
# BUG: new_data is re-created on every iteration, so new_df ends up with at
# most the single dict from the last pass.
new_data = []
new_data.append(new_dict)
new_df = pd.DataFrame(new_data)
The problem is that column z (which holds a JSON record with name and id) has a dynamic number of records: it may contain a single record with name and id, or more than one, but each set is mapped to only one x and y. So I have to write this data into multiple rows: if column z contains one record, the output should have four columns (x, y, name, and id) in a single row, and if column z contains two records, it should produce two rows.
I am iterating over this column and writing it into a dictionary to maintain the relation; however, it is not reading the JSON dict I created for name and id.
Sample Data:
{
"document_portfolio": "[{"name": "MWP FT Income ICP", "portfolio_id": "31a01afd-1f69-e617-ade3-d7c1895c4461"}, {"name": "MWP Tactical ETP IMG", "portfolio_id": "13281ca6-a9a7-d361-1cf2-07e6fa3e283a"}]",
"document_uuid": "28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3",
"user_id": "00u9vj92B0ZPUMU9b5d5"
}
{
"document_portfolio": "[{"name": "tesying", "portfolio_id": "59d26651-3484-e7ef-9ece-f7194d7639e0"}]","document_uuid": "1cf1ca8e-f0e9-844b-11d6-0d05302fb777",
"user_id": "00u5flkeths3G668k5d7"
}
Expected output :
portfolio_id name portfolio_uuid user_id
31a01afd-1f69-e617-ade3-d7c1895c4461 MWP FT Income ICP 28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3 00u9vj92B0ZPUMU9b5d5
13281ca6-a9a7-d361-1cf2-07e6fa3e283a MWP Tactical ETP IMG 28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3 00u9vj92B0ZPUMU9b5d5
Sample Snowflake logic:
-- View that flattens the JSON portfolio array into one row per portfolio entry.
-- Fixes vs. the original sketch: LATERAL FLATTEN belongs in the FROM clause
-- (not the SELECT list), array elements are read through the flatten output's
-- `value` column, and the trailing comma before FROM was a syntax error.
CREATE OR REPLACE VIEW port_test_vw AS
SELECT
    flattened.value:name::string AS portfolio_name,
    flattened.value:portfolio_id::string AS portfolio_id
FROM XYZ,
    LATERAL FLATTEN(input => PARSE_JSON(data:document_portfolio)) flattened;
# NOTE(review): intermediate attempt quoted for reference — see inline flags.
# BUG: the three outer range loops never use i/j/s in the body, so the whole
# traversal is repeated len**3 times.
for i in range (len(df.iloc[:,0])):
for j in range(len(df.iloc[:,1])):
for s in range(len(df.iloc[:,2])):
for index,row in df.iterrows():
json_dict = row['DOCUMENT_PORTFOLIO']
data = json.loads(json_dict)
# NOTE(review): rebinding i and row here shadows the outer loop variables.
for i,row in enumerate(data):
#for j in range(len(df['DOCUMENT_UUID'])):
# BUG: handles single-entry arrays only; multi-entry documents hit exit().
if len(data) == 1:
result_dic = {}
portfolio_name = row['name']
result_dic[i] = portfolio_name
print(portfolio_name)
portfolio_id = row['portfolio_id']
# BUG: same key i — this overwrites the name stored just above.
result_dic[i] = portfolio_id
print(portfolio_id)
# BUG: stores the whole DOCUMENT_UUID column (a Series), not this row's value.
DOCUMENT_UUID = df['DOCUMENT_UUID']
result_dic[j] = DOCUMENT_UUID
print(DOCUMENT_UUID)
else:
# BUG: exit() terminates the whole process on the first multi-entry array.
exit()
output:
This is the output when I am not putting condition on length
Ali Mahbod – $1M Valued Client-Copy
5efbd6c7-2abe-5e78-b035-1cfffc18b9cd
[{‘name’: ‘Falip Large Caps 1’, ‘portfolio_id’: ‘7c3b5cd5-b788-8667-23b2-d25fb110e525’}, {‘name’: ‘Falip Large Caps 2’, ‘portfolio_id’: ‘7c464968-91e8-a4d5-9756-07efb0d0c7b6’}]
Falip Large Caps 1
7c3b5cd5-b788-8667-23b2-d25fb110e525
Falip Large Caps 2
7c464968-91
However on adding condition on length I am getting this error:
Traceback (most recent call last):
File "Document_custom_field.py", line 102, in
for index,row in name.iterrows():
AttributeError: ‘str’ object has no attribute ‘iterrows’
I tried converting it into a DataFrame again and loading the JSON again, but it didn't work.
For now I am exiting the loop if the length is more than one; however, it is not writing the records into one single DataFrame according to the length condition.
I have fixed the above issue. If you are facing the same problem, you can reuse this logic:
# Flatten the Snowflake result set: one output row per portfolio entry found
# in the DOCUMENT_PORTFOLIO JSON array, repeating the document-level columns.
#
# Fixes vs. the original:
#   * DataFrame.append was deprecated and removed in pandas 2.0; rows are
#     collected in a plain list and turned into a DataFrame once.
#   * Flattened rows are no longer appended onto the raw 4-column frame,
#     which mixed two schemas (the raw rows kept DOCUMENT_PORTFOLIO and got
#     NaN in PORTFOLIO_NAME/PORTFOLIO_ID, and all of that went to the table).
flattened_rows = []
for document_id, document_uuid, user_id, portfolio_json in data:
    # DOCUMENT_PORTFOLIO holds a JSON-encoded list of
    # {"name": ..., "portfolio_id": ...} dicts — one output row per element.
    for item in json.loads(portfolio_json):
        flattened_rows.append({
            'DOCUMENT_ID': document_id,
            'DOCUMENT_UUID': document_uuid,
            'USER_ID': user_id,
            'PORTFOLIO_NAME': item['name'],
            'PORTFOLIO_ID': item['portfolio_id'],
        })
df = pd.DataFrame(
    flattened_rows,
    columns=['DOCUMENT_ID', 'DOCUMENT_UUID', 'USER_ID',
             'PORTFOLIO_NAME', 'PORTFOLIO_ID'],
)
table_name = 'XYZ'
with snow_connection.cursor() as cursor:
    cursor.execute(
        f"CREATE OR REPLACE TABLE {table_name}"
        "(DOCUMENT_ID INT,DOCUMENT_UUID TEXT,USER_ID TEXT,"
        "PORTFOLIO_NAME TEXT,PORTFOLIO_ID TEXT)"
    )
# NOTE(review): DataFrame.to_sql expects a SQLAlchemy engine/connection; with
# a raw snowflake-connector connection, snowflake's write_pandas is the usual
# route — verify which connection type snow_connection actually is.
df.to_sql(table_name, snow_connection, index=False, if_exists='append')
Here is a Snowflake SQL syntax for you:
-- Demo query: explode the document_portfolio array of one hard-coded JSON
-- document into one row per {name, portfolio_id} entry.
WITH sample_doc AS (
    SELECT PARSE_JSON('{
"document_portfolio": [{"name": "MWP FT Income ICP", "portfolio_id": "31a01afd-1f69-e617-ade3-d7c1895c4461"}, {"name": "MWP Tactical ETP IMG", "portfolio_id": "13281ca6-a9a7-d361-1cf2-07e6fa3e283a"}],
"document_uuid": "28d7ccb1-3f9f-fdd0-c757-6b134a74fdd3",
"user_id": "00u9vj92B0ZPUMU9b5d5"
}')::VARIANT AS doc
)
SELECT entry.value:name::STRING AS portfolio_name,
       entry.value:portfolio_id::STRING AS portfolio_id
FROM sample_doc,
     LATERAL FLATTEN(input => doc:document_portfolio) entry;

-- Same flattening packaged as a reusable view over pcs.document.
CREATE OR REPLACE VIEW document_testing_view222
AS SELECT
    data:document_portfolio::STRING AS document_portfolio,
    pf.value:name::STRING AS portfolio_name,
    pf.value:portfolio_id::STRING AS portfolio_id
FROM pcs.document,
     LATERAL FLATTEN(input => PARSE_JSON(data:document_portfolio):document_portfolio) pf;