How to parse/read a YAML file into a Python object?
Question:
How to parse/read a YAML file into a Python object?
For example, this YAML:
Person:
name: XYZ
To this Python class:
class Person(yaml.YAMLObject):
    """Target class that the YAML ``Person`` mapping should be loaded into."""

    # NOTE(review): PyYAML application tags normally begin with "!"
    # (e.g. "!Person") -- confirm this is the tag you intend.
    yaml_tag = 'Person'

    def __init__(self, name):
        self.name = name
I am using PyYAML by the way.
Answers:
If your YAML file looks like this:
# tree format
treeroot:
branch1:
name: Node 1
branch1-1:
name: Node 1-1
branch2:
name: Node 2
branch2-1:
name: Node 2-1
And you’ve installed PyYAML
like this:
pip install PyYAML
And the Python code looks like this:
import yaml

# safe_load() builds only plain Python types (dict, list, str, ...), so it is
# safe on untrusted files; prefer it over yaml.load().
with open('tree.yaml', encoding='utf-8') as f:
    dataMap = yaml.safe_load(f)
The variable dataMap
now contains a dictionary with the tree data. If you print dataMap
using the pprint module (pretty-printing), you will get something like:
{
'treeroot': {
'branch1': {
'branch1-1': {
'name': 'Node 1-1'
},
'name': 'Node 1'
},
'branch2': {
'branch2-1': {
'name': 'Node 2-1'
},
'name': 'Node 2'
}
}
}
So, now we have seen how to get data into our Python program. Saving data is just as easy:
# Serialize the dictionary back to YAML; the "with" block closes the file.
with open('newtree.yaml', "w", encoding='utf-8') as f:
    yaml.dump(dataMap, f)
You have a dictionary, and now you have to convert it to a Python object:
class Struct:
    """Expose the entries of a dictionary as instance attributes.

    Only the top-level keys become attributes; nested dictionaries are
    stored unchanged as attribute values.
    """

    def __init__(self, **entries):
        # Copy every keyword argument straight into the instance namespace.
        self.__dict__.update(entries)
Then you can use:
>>> args = your YAML dictionary
>>> s = Struct(**args)
>>> s
<__main__.Struct instance at 0x01D6A738>
>>> s...
and follow "Convert Python dict to object".
For more information you can look at pyyaml.org and this.
From http://pyyaml.org/wiki/PyYAMLDocumentation:
add_path_resolver(tag, path, kind)
adds a path-based implicit tag resolver. A path is a list of keys that form a path to a node in the representation graph. Paths elements can be string values, integers, or None. The kind of a node can be str, list, dict, or None.
#!/usr/bin/env python
# Load a plain "Person:" mapping as a Person instance using a path resolver.
import yaml


class Person(yaml.YAMLObject):
    """Object built from the mapping stored under the top-level ``Person`` key."""

    yaml_tag = '!person'

    def __init__(self, name):
        self.name = name


# Implicitly tag the mapping found at path ['Person'] as '!person', so the
# document itself does not need an explicit tag.
yaml.add_path_resolver('!person', ['Person'], dict)

# Current PyYAML releases require an explicit Loader. yaml.Loader can build
# arbitrary Python objects, so only use it on trusted input such as this literal.
data = yaml.load("""
Person:
  name: XYZ
""", Loader=yaml.Loader)

print(data)
# {'Person': <__main__.Person object at 0x7f2b251ceb10>}

print(data['Person'].name)
# XYZ
Here is one way to test which YAML implementation the user has selected on the virtualenv (or the system) and then define load_yaml_file
appropriately:
import json
import shutil
import subprocess
import sys

load_yaml_file = None

# First choice: PyYAML, if it can be imported in this environment.
try:
    import yaml
except ImportError:
    pass
else:
    def load_yaml_file(fn):
        """Parse the YAML file at path *fn* with PyYAML and return the data."""
        # safe_load() only builds plain Python types; "with" closes the file.
        with open(fn, encoding="utf-8") as stream:
            return yaml.safe_load(stream)

# Fallback: let a Ruby interpreter convert the YAML to JSON for us.
if not load_yaml_file and shutil.which('ruby'):
    def load_yaml_file(fn):
        """Parse the YAML file at path *fn* via Ruby and return the decoded JSON."""
        # The path travels as ARGV[0] in an argument list (no shell), so quotes
        # or shell metacharacters in the file name cannot alter the command.
        result = subprocess.run(
            ['ruby', '-ryaml', '-rjson', '-e',
             'puts YAML.load_file(ARGV[0]).to_json', fn],
            capture_output=True, text=True, check=True,
        )
        return json.loads(result.stdout)

if not load_yaml_file:
    print("""
ERROR: %s requires ruby or python-yaml to be installed.
apt-get install ruby
OR
apt-get install python-yaml
OR
Demonstrate your mastery of Python by using pip.
Please research the latest pip-based install steps for python-yaml.
Usually something like this works:
apt-get install epel-release
apt-get install python-pip
apt-get install libyaml-cpp-dev
python2.7 /usr/bin/pip install pyyaml
Notes:
Non-base library (yaml) should never be installed outside a virtualenv.
"pip install" is permanent:
https://stackoverflow.com/questions/1550226/python-setup-py-uninstall
Beware when using pip within an aptitude or RPM script.
Pip might not play by all the rules.
Your installation may be permanent.
Ruby is 7X faster at loading large YAML files.
pip could ruin your life.
https://stackoverflow.com/questions/46326059/
https://stackoverflow.com/questions/36410756/
https://stackoverflow.com/questions/8022240/
Never use PyYaml in numerical applications.
https://stackoverflow.com/questions/30458977/
If you are working for a Fortune 500 company, your choices are
1. Ask for either the "ruby" package or the "python-yaml"
package. Asking for Ruby is more likely to get a fast answer.
2. Work in a VM. I highly recommend Vagrant for setting it up.
""" % sys.argv[0], file=sys.stderr)
    # sys.exit() flushes the message before terminating; os._exit() would not.
    sys.exit(4)

# test
print(load_yaml_file(sys.argv[1]))
I wrote an implementation using named tuples that I believe is neat because it is fairly readable. It also handles nested dictionaries. The parser code is as follows:
from collections import namedtuple
import keyword


class Dict2ObjParser:
    """Convert a (possibly nested) dictionary into nested namedtuples.

    Dictionaries become namedtuples whose fields are the dictionary keys,
    lists are converted element by element, and every other value is kept
    as-is. Keys that are not valid Python identifiers (for example
    ``"branch1-1"``) have the offending characters replaced by ``_``; keys
    that still cannot be used as field names (keywords, leading digit or
    underscore, duplicates) receive namedtuple's positional names ``_0``,
    ``_1``, ... and remain reachable by index.
    """

    def __init__(self, nested_dict):
        self.nested_dict = nested_dict

    def parse(self):
        """Return the stored dictionary as a namedtuple called ``root``.

        Raises:
            TypeError: if the stored object is not a dictionary.
        """
        nested_dict = self.nested_dict
        if not isinstance(nested_dict, dict):
            raise TypeError(f"Expected 'dict' but found '{type(nested_dict)}'")
        return self._transform_to_named_tuples("root", nested_dict)

    @staticmethod
    def _to_identifier(raw_name):
        """Return *raw_name* as text that is a valid, non-keyword identifier."""
        text = str(raw_name)
        if text.isidentifier() and not keyword.iskeyword(text):
            return text
        cleaned = "".join(
            ch if ch == "_" or (ch.isascii() and ch.isalnum()) else "_"
            for ch in text
        )
        if not cleaned or cleaned[0].isdigit() or keyword.iskeyword(cleaned):
            cleaned = f"_{cleaned}"
        return cleaned

    def _transform_to_named_tuples(self, tuple_name, possibly_nested_obj):
        """Recursively convert dicts to namedtuples and walk into lists."""
        if isinstance(possibly_nested_obj, dict):
            field_names = [self._to_identifier(key) for key in possibly_nested_obj]
            # rename=True lets namedtuple replace any name it still rejects
            # instead of raising ValueError.
            named_tuple_def = namedtuple(
                self._to_identifier(tuple_name), field_names, rename=True
            )
            return named_tuple_def(
                *(
                    self._transform_to_named_tuples(key, value)
                    for key, value in possibly_nested_obj.items()
                )
            )
        if isinstance(possibly_nested_obj, list):
            # List items get the parent's name plus their position, e.g. e_0.
            return [
                self._transform_to_named_tuples(f"{tuple_name}_{index}", item)
                for index, item in enumerate(possibly_nested_obj)
            ]
        return possibly_nested_obj
I tested basic cases with the following code:
# Exercise the parser on nested dicts, a list of scalars and a list of dicts.
x = Dict2ObjParser(
    {
        "a": {"b": 123, "c": "Hello, World!"},
        "d": [1, 2, 3],
        "e": [
            {"f": "", "g": None},
            {"f": "Foo", "g": "Bar"},
            {"h": "Hi!", "i": None},
        ],
        "j": 456,
        "k": None,
    }
).parse()
print(x)
It gives the following output: root(a=a(b=123, c='Hello, World!'), d=[1, 2, 3], e=[e_0(f='', g=None), e_1(f='Foo', g='Bar'), e_2(h='Hi!', i=None)], j=456, k=None)
Which when formatted a bit looks like:
root(
a=a(
b=123,
c='Hello, World!'
),
d=[1, 2, 3],
e=[
e_0(
f='',
g=None
),
e_1(
f='Foo',
g='Bar'
),
e_2(
h='Hi!',
i=None
)
],
j=456,
k=None
)
And I can access the nested fields like any other object:
print(x.a.b) # Prints: 123
In your case, the code would ultimately look as follows:
import yaml

# Load the YAML document into plain dicts/lists, then wrap it in namedtuples.
with open(file_path, "r", encoding="utf-8") as stream:
    nested_dict = yaml.safe_load(stream)

nested_objt = Dict2ObjParser(nested_dict).parse()
I hope this helps!
How to parse/read a YAML file into a Python object?
For example, this YAML:
Person:
name: XYZ
To this Python class:
class Person(yaml.YAMLObject):
    """Target class that the YAML ``Person`` mapping should be loaded into."""

    # NOTE(review): PyYAML application tags normally begin with "!"
    # (e.g. "!Person") -- confirm this is the tag you intend.
    yaml_tag = 'Person'

    def __init__(self, name):
        self.name = name
I am using PyYAML by the way.
If your YAML file looks like this:
# tree format
treeroot:
branch1:
name: Node 1
branch1-1:
name: Node 1-1
branch2:
name: Node 2
branch2-1:
name: Node 2-1
And you’ve installed PyYAML
like this:
pip install PyYAML
And the Python code looks like this:
import yaml

# safe_load() builds only plain Python types (dict, list, str, ...), so it is
# safe on untrusted files; prefer it over yaml.load().
with open('tree.yaml', encoding='utf-8') as f:
    dataMap = yaml.safe_load(f)
The variable dataMap
now contains a dictionary with the tree data. If you print dataMap
using the pprint module (pretty-printing), you will get something like:
{
'treeroot': {
'branch1': {
'branch1-1': {
'name': 'Node 1-1'
},
'name': 'Node 1'
},
'branch2': {
'branch2-1': {
'name': 'Node 2-1'
},
'name': 'Node 2'
}
}
}
So, now we have seen how to get data into our Python program. Saving data is just as easy:
# Serialize the dictionary back to YAML; the "with" block closes the file.
with open('newtree.yaml', "w", encoding='utf-8') as f:
    yaml.dump(dataMap, f)
You have a dictionary, and now you have to convert it to a Python object:
class Struct:
    """Expose the entries of a dictionary as instance attributes.

    Only the top-level keys become attributes; nested dictionaries are
    stored unchanged as attribute values.
    """

    def __init__(self, **entries):
        # Copy every keyword argument straight into the instance namespace.
        self.__dict__.update(entries)
Then you can use:
>>> args = your YAML dictionary
>>> s = Struct(**args)
>>> s
<__main__.Struct instance at 0x01D6A738>
>>> s...
and follow "Convert Python dict to object".
For more information you can look at pyyaml.org and this.
From http://pyyaml.org/wiki/PyYAMLDocumentation:
add_path_resolver(tag, path, kind)
adds a path-based implicit tag resolver. A path is a list of keys that form a path to a node in the representation graph. Paths elements can be string values, integers, or None. The kind of a node can be str, list, dict, or None.
#!/usr/bin/env python
# Load a plain "Person:" mapping as a Person instance using a path resolver.
import yaml


class Person(yaml.YAMLObject):
    """Object built from the mapping stored under the top-level ``Person`` key."""

    yaml_tag = '!person'

    def __init__(self, name):
        self.name = name


# Implicitly tag the mapping found at path ['Person'] as '!person', so the
# document itself does not need an explicit tag.
yaml.add_path_resolver('!person', ['Person'], dict)

# Current PyYAML releases require an explicit Loader. yaml.Loader can build
# arbitrary Python objects, so only use it on trusted input such as this literal.
data = yaml.load("""
Person:
  name: XYZ
""", Loader=yaml.Loader)

print(data)
# {'Person': <__main__.Person object at 0x7f2b251ceb10>}

print(data['Person'].name)
# XYZ
Here is one way to test which YAML implementation the user has selected on the virtualenv (or the system) and then define load_yaml_file
appropriately:
import json
import shutil
import subprocess
import sys

load_yaml_file = None

# First choice: PyYAML, if it can be imported in this environment.
try:
    import yaml
except ImportError:
    pass
else:
    def load_yaml_file(fn):
        """Parse the YAML file at path *fn* with PyYAML and return the data."""
        # safe_load() only builds plain Python types; "with" closes the file.
        with open(fn, encoding="utf-8") as stream:
            return yaml.safe_load(stream)

# Fallback: let a Ruby interpreter convert the YAML to JSON for us.
if not load_yaml_file and shutil.which('ruby'):
    def load_yaml_file(fn):
        """Parse the YAML file at path *fn* via Ruby and return the decoded JSON."""
        # The path travels as ARGV[0] in an argument list (no shell), so quotes
        # or shell metacharacters in the file name cannot alter the command.
        result = subprocess.run(
            ['ruby', '-ryaml', '-rjson', '-e',
             'puts YAML.load_file(ARGV[0]).to_json', fn],
            capture_output=True, text=True, check=True,
        )
        return json.loads(result.stdout)

if not load_yaml_file:
    print("""
ERROR: %s requires ruby or python-yaml to be installed.
apt-get install ruby
OR
apt-get install python-yaml
OR
Demonstrate your mastery of Python by using pip.
Please research the latest pip-based install steps for python-yaml.
Usually something like this works:
apt-get install epel-release
apt-get install python-pip
apt-get install libyaml-cpp-dev
python2.7 /usr/bin/pip install pyyaml
Notes:
Non-base library (yaml) should never be installed outside a virtualenv.
"pip install" is permanent:
https://stackoverflow.com/questions/1550226/python-setup-py-uninstall
Beware when using pip within an aptitude or RPM script.
Pip might not play by all the rules.
Your installation may be permanent.
Ruby is 7X faster at loading large YAML files.
pip could ruin your life.
https://stackoverflow.com/questions/46326059/
https://stackoverflow.com/questions/36410756/
https://stackoverflow.com/questions/8022240/
Never use PyYaml in numerical applications.
https://stackoverflow.com/questions/30458977/
If you are working for a Fortune 500 company, your choices are
1. Ask for either the "ruby" package or the "python-yaml"
package. Asking for Ruby is more likely to get a fast answer.
2. Work in a VM. I highly recommend Vagrant for setting it up.
""" % sys.argv[0], file=sys.stderr)
    # sys.exit() flushes the message before terminating; os._exit() would not.
    sys.exit(4)

# test
print(load_yaml_file(sys.argv[1]))
I wrote an implementation using named tuples that I believe is neat because it is fairly readable. It also handles nested dictionaries. The parser code is as follows:
from collections import namedtuple
import keyword


class Dict2ObjParser:
    """Convert a (possibly nested) dictionary into nested namedtuples.

    Dictionaries become namedtuples whose fields are the dictionary keys,
    lists are converted element by element, and every other value is kept
    as-is. Keys that are not valid Python identifiers (for example
    ``"branch1-1"``) have the offending characters replaced by ``_``; keys
    that still cannot be used as field names (keywords, leading digit or
    underscore, duplicates) receive namedtuple's positional names ``_0``,
    ``_1``, ... and remain reachable by index.
    """

    def __init__(self, nested_dict):
        self.nested_dict = nested_dict

    def parse(self):
        """Return the stored dictionary as a namedtuple called ``root``.

        Raises:
            TypeError: if the stored object is not a dictionary.
        """
        nested_dict = self.nested_dict
        if not isinstance(nested_dict, dict):
            raise TypeError(f"Expected 'dict' but found '{type(nested_dict)}'")
        return self._transform_to_named_tuples("root", nested_dict)

    @staticmethod
    def _to_identifier(raw_name):
        """Return *raw_name* as text that is a valid, non-keyword identifier."""
        text = str(raw_name)
        if text.isidentifier() and not keyword.iskeyword(text):
            return text
        cleaned = "".join(
            ch if ch == "_" or (ch.isascii() and ch.isalnum()) else "_"
            for ch in text
        )
        if not cleaned or cleaned[0].isdigit() or keyword.iskeyword(cleaned):
            cleaned = f"_{cleaned}"
        return cleaned

    def _transform_to_named_tuples(self, tuple_name, possibly_nested_obj):
        """Recursively convert dicts to namedtuples and walk into lists."""
        if isinstance(possibly_nested_obj, dict):
            field_names = [self._to_identifier(key) for key in possibly_nested_obj]
            # rename=True lets namedtuple replace any name it still rejects
            # instead of raising ValueError.
            named_tuple_def = namedtuple(
                self._to_identifier(tuple_name), field_names, rename=True
            )
            return named_tuple_def(
                *(
                    self._transform_to_named_tuples(key, value)
                    for key, value in possibly_nested_obj.items()
                )
            )
        if isinstance(possibly_nested_obj, list):
            # List items get the parent's name plus their position, e.g. e_0.
            return [
                self._transform_to_named_tuples(f"{tuple_name}_{index}", item)
                for index, item in enumerate(possibly_nested_obj)
            ]
        return possibly_nested_obj
I tested basic cases with the following code:
# Exercise the parser on nested dicts, a list of scalars and a list of dicts.
x = Dict2ObjParser(
    {
        "a": {"b": 123, "c": "Hello, World!"},
        "d": [1, 2, 3],
        "e": [
            {"f": "", "g": None},
            {"f": "Foo", "g": "Bar"},
            {"h": "Hi!", "i": None},
        ],
        "j": 456,
        "k": None,
    }
).parse()
print(x)
It gives the following output: root(a=a(b=123, c='Hello, World!'), d=[1, 2, 3], e=[e_0(f='', g=None), e_1(f='Foo', g='Bar'), e_2(h='Hi!', i=None)], j=456, k=None)
Which when formatted a bit looks like:
root(
a=a(
b=123,
c='Hello, World!'
),
d=[1, 2, 3],
e=[
e_0(
f='',
g=None
),
e_1(
f='Foo',
g='Bar'
),
e_2(
h='Hi!',
i=None
)
],
j=456,
k=None
)
And I can access the nested fields like any other object:
print(x.a.b) # Prints: 123
In your case, the code would ultimately look as follows:
import yaml

# Load the YAML document into plain dicts/lists, then wrap it in namedtuples.
with open(file_path, "r", encoding="utf-8") as stream:
    nested_dict = yaml.safe_load(stream)

nested_objt = Dict2ObjParser(nested_dict).parse()
I hope this helps!