Replace json key with a value conditionally, while keeping all the rest
Question:
I have the JSON below: it contains a filename, for each filename there are sentences where specific words are marked as "literal" or "metaphoric".
{
"journal.pbio.0050304.xml": {
"sentence": [
[
{"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
],
[
{"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
{"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
{"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
{"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
]
]
},
"journal.pbio.0050093.xml": {
"sentence": [
[
{"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
]
]
}
}
I would like all literal entity groups to be replaced by plain strings of their values, dumping ‘entity group’, ‘score’, ‘start’ and ‘end’ keys. If the entity group is ‘metaphoric’, I want that to stay the same, but still within a string. Is it possible?
EDIT: trial2:
# Build ``resu`` as {file name: {"sentence": [spans of paragraph 1, ...]}} and
# then replace every "literal" span with its plain text.
# NOTE(review): relies on ``os``, ``ET`` (xml.etree.ElementTree),
# ``nerpipeline``, ``resu`` and ``words_input_dir`` being defined earlier in
# the script -- none of them are visible in this snippet.
for filename in os.listdir(words_input_dir):
    if not filename.endswith(".xml"):
        continue
    # os.listdir() yields bare file names, so the directory has to be joined
    # back on; otherwise the file is only found when the directory happens to
    # be the current working directory.
    tree = ET.parse(os.path.join(words_input_dir, filename))
    root = tree.getroot()
    sentences = []
    for paragraph in root.findall("./body/sec/p"):
        text = paragraph.text
        if text is None:
            # No leading text in this <p>; str(None) would feed the literal
            # string "None" to the pipeline.
            continue
        # Collect one list of spans per paragraph instead of overwriting the
        # previous result, so "sentence" is a list of lists as the loop below
        # expects.
        # NOTE(review): assumes nerpipeline(text) returns a list of dicts with
        # "entity_group" and "word" keys -- confirm against the pipeline setup.
        sentences.append(nerpipeline(text))
    if sentences:
        resu[filename] = {"sentence": sentences}

for filed in resu.values():
    for idx1, sentence in enumerate(filed["sentence"]):
        # Work on a copy and swap it in once the sentence has been processed.
        new_sentence = list(sentence)
        for idx2, word in enumerate(sentence):
            if word["entity_group"] == "literal":
                new_sentence[idx2] = word["word"]
        filed["sentence"][idx1] = new_sentence

print(resu)
Answers:
When iterating through the sentences in each file, first copy the sentence. Then iterate through each word in the original sentence, replacing the literal words in the copy, and after the inner for loop replace the sentence with the new one. (Note: working on a copy of the sentence list is the safe approach, since mutating a list while iterating over it can cause unexpected bugs, particularly when the mutation changes the list's length.)
from pprint import pprint
# Sample input: each file name maps to {"sentence": [...]}, where every inner
# list is one sentence made of tagged spans (dicts).
data = {
    "journal.pbio.0050304.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299},
            ],
            [
                {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154},
            ],
        ],
    },
    "journal.pbio.0050093.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299},
            ],
        ],
    },
}

# Replace every "literal" span with its plain text; all other spans (here the
# "metaphoric" ones) are kept as the original dicts.
for file in data.values():
    for idx1, sentence in enumerate(file["sentence"]):
        # Work on a copy and swap it in once the sentence has been processed.
        new_sentence = list(sentence)
        for idx2, word in enumerate(sentence):
            if word["entity_group"] == "literal":
                new_sentence[idx2] = word["word"]
        file["sentence"][idx1] = new_sentence

pprint(data)
Result:
{'journal.pbio.0050093.xml': {'sentence': [['The anterior–posterior (A–P) '
'axis ']]},
'journal.pbio.0050304.xml': {'sentence': [['The anterior–posterior (A–P) '
'axis '],
['RA, Fgfs, and Wnts are all '
'produced at the posterior of the '
'embryo, and might therefore be '
'expected to form posterior-',
{'end': 120,
'entity_group': 'metaphoric',
'score': 0.874372,
'start': 118,
'word': 'to'},
'-anterior gradients (for Fgf8',
{'end': 154,
'entity_group': 'metaphoric',
'score': 0.9993481,
'start': 150,
'word': 'this'}]]}}
I have the JSON below: it contains a filename, for each filename there are sentences where specific words are marked as "literal" or "metaphoric".
{
"journal.pbio.0050304.xml": {
"sentence": [
[
{"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
],
[
{"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
{"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
{"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
{"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
]
]
},
"journal.pbio.0050093.xml": {
"sentence": [
[
{"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
]
]
}
}
I would like all literal entity groups to be replaced by plain strings of their values, dumping ‘entity group’, ‘score’, ‘start’ and ‘end’ keys. If the entity group is ‘metaphoric’, I want that to stay the same, but still within a string. Is it possible?
EDIT: trial2:
# Build ``resu`` as {file name: {"sentence": [spans of paragraph 1, ...]}} and
# then replace every "literal" span with its plain text.
# NOTE(review): relies on ``os``, ``ET`` (xml.etree.ElementTree),
# ``nerpipeline``, ``resu`` and ``words_input_dir`` being defined earlier in
# the script -- none of them are visible in this snippet.
for filename in os.listdir(words_input_dir):
    if not filename.endswith(".xml"):
        continue
    # os.listdir() yields bare file names, so the directory has to be joined
    # back on; otherwise the file is only found when the directory happens to
    # be the current working directory.
    tree = ET.parse(os.path.join(words_input_dir, filename))
    root = tree.getroot()
    sentences = []
    for paragraph in root.findall("./body/sec/p"):
        text = paragraph.text
        if text is None:
            # No leading text in this <p>; str(None) would feed the literal
            # string "None" to the pipeline.
            continue
        # Collect one list of spans per paragraph instead of overwriting the
        # previous result, so "sentence" is a list of lists as the loop below
        # expects.
        # NOTE(review): assumes nerpipeline(text) returns a list of dicts with
        # "entity_group" and "word" keys -- confirm against the pipeline setup.
        sentences.append(nerpipeline(text))
    if sentences:
        resu[filename] = {"sentence": sentences}

for filed in resu.values():
    for idx1, sentence in enumerate(filed["sentence"]):
        # Work on a copy and swap it in once the sentence has been processed.
        new_sentence = list(sentence)
        for idx2, word in enumerate(sentence):
            if word["entity_group"] == "literal":
                new_sentence[idx2] = word["word"]
        filed["sentence"][idx1] = new_sentence

print(resu)
When iterating through the sentences in each file, first copy the sentence. Then iterate through each word in the original sentence, replacing the literal words in the copy, and after the inner for loop replace the sentence with the new one. (Note: working on a copy of the sentence list is the safe approach, since mutating a list while iterating over it can cause unexpected bugs, particularly when the mutation changes the list's length.)
from pprint import pprint
# Sample input: each file name maps to {"sentence": [...]}, where every inner
# list is one sentence made of tagged spans (dicts).
data = {
    "journal.pbio.0050304.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299},
            ],
            [
                {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154},
            ],
        ],
    },
    "journal.pbio.0050093.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299},
            ],
        ],
    },
}

# Replace every "literal" span with its plain text; all other spans (here the
# "metaphoric" ones) are kept as the original dicts.
for file in data.values():
    for idx1, sentence in enumerate(file["sentence"]):
        # Work on a copy and swap it in once the sentence has been processed.
        new_sentence = list(sentence)
        for idx2, word in enumerate(sentence):
            if word["entity_group"] == "literal":
                new_sentence[idx2] = word["word"]
        file["sentence"][idx1] = new_sentence

pprint(data)
Result:
{'journal.pbio.0050093.xml': {'sentence': [['The anterior–posterior (A–P) '
'axis ']]},
'journal.pbio.0050304.xml': {'sentence': [['The anterior–posterior (A–P) '
'axis '],
['RA, Fgfs, and Wnts are all '
'produced at the posterior of the '
'embryo, and might therefore be '
'expected to form posterior-',
{'end': 120,
'entity_group': 'metaphoric',
'score': 0.874372,
'start': 118,
'word': 'to'},
'-anterior gradients (for Fgf8',
{'end': 154,
'entity_group': 'metaphoric',
'score': 0.9993481,
'start': 150,
'word': 'this'}]]}}