Parse YAML with dots delimiter in keys
Question:
We use YAML configuration for services scaling. Usually it goes like this:
service:
  scalingPolicy:
    capacity:
      min: 1
      max: 1
So it’s easy to open with basic PyYAML and parse as a dict, so that config['service']['scalingPolicy']['capacity']['min'] gives the result 1. The problem is that some configs are built with a dot delimiter, e.g.:
service.scalingPolicy.capacity:
  min: 1
  max: 1
The basic consumer of these configs is Java’s Spring, and somehow it treats this form the same as the example above. But because we also need to parse these configs with Python, I get the whole dot-separated line as a single config['service.scalingPolicy.capacity'] key.
The question is: how can I make Python parse any combination of keys (both separated by dots and separated by indentation and ":")? I didn’t find related parameters in the Python YAML libraries (I’ve checked standard PyYAML and ruamel.yaml), and handling every possible combination manually seems like a crazy idea. The only idea I have is to write my own parser, but maybe there is something I’m missing so that I won’t have to reinvent the wheel.
Answers:
This is not trivial; it is much easier to split a lookup by a key with dots into recursive lookups in a nested data structure. Here you have a nested data structure, and different [key] lookups mean different things at different levels.
If you use ruamel.yaml in the default round-trip mode, you can add a class variable to the type that represents a mapping, which defines the separator on which the keys are split, and an instance variable that keeps track of the prefix already matched:
import sys
import ruamel.yaml
from ruamel.yaml.compat import ordereddict
from ruamel.yaml.comments import merge_attrib
# Sample document: one dotted top-level key whose value is a nested mapping.
# The two-space indentation is significant: without it, `min` and `max` would
# be parsed as top-level keys and the dotted key would map to null.
yaml_str = """
service.scalingPolicy.capacity:
  min: 1
  max: 1
"""
def mapgetitem(self, key):
    """Replacement ``__getitem__`` that resolves dotted keys one segment at a time.

    When the class attribute ``CommentedMap.sep`` is set (e.g. ``'.'``), chained
    lookups such as ``m['a']['b']['c']`` are joined into ``'a.b.c'``. While the
    joined prefix is only the leading part of an existing key, the map returns
    *itself* and remembers the prefix in the instance attribute ``splitprefix``;
    once the joined prefix is an actual key, the stored value is returned.

    Args:
        key: The key (or key segment) being looked up.

    Returns:
        The value stored under the (joined) key, or ``self`` when the joined
        prefix matches only the beginning of a longer dotted key.

    Raises:
        KeyError: If the (joined) key is neither a key nor a segment-aligned
            prefix of a key of this map.
    """
    # Default to None so a lookup made before `sep` is assigned does not fail
    # with AttributeError.
    sep = getattr(ruamel.yaml.comments.CommentedMap, 'sep', None)
    if sep is not None:
        pending = getattr(self, 'splitprefix', '')
        # Always clear the stored prefix first and re-set it only on a partial
        # match. This way a failed lookup (KeyError) cannot leave a stale
        # prefix behind that would corrupt every later lookup on this map.
        if hasattr(self, 'splitprefix'):
            delattr(self, 'splitprefix')
        # Only string keys can be segments of a dotted path; anything else is
        # looked up directly (any pending prefix is discarded).
        if isinstance(key, str):
            candidate = pending + sep + key if pending else key
            if candidate not in self:
                # Require the separator after the prefix so that 'serv' does
                # not match 'service.x'; skip non-string keys of the map.
                boundary = candidate + sep
                if any(isinstance(k, str) and k.startswith(boundary)
                       for k in self.keys()):
                    self.splitprefix = candidate
                    return self
                raise KeyError(candidate)
            key = candidate
    try:
        return ordereddict.__getitem__(self, key)
    except KeyError:
        # Fall back to the maps attached under the merge attribute.
        for merged in getattr(self, merge_attrib, []):
            if key in merged[1]:
                return merged[1][key]
        raise
# Demo: temporarily monkey-patch CommentedMap so chained lookups resolve
# dotted keys, read and modify a value, then restore the original method
# before dumping.
old_mapgetitem = ruamel.yaml.comments.CommentedMap.__getitem__ # save the original __getitem__
ruamel.yaml.comments.CommentedMap.__getitem__ = mapgetitem
# Class-level separator read by mapgetitem on every lookup.
ruamel.yaml.comments.CommentedMap.sep = '.'
yaml = ruamel.yaml.YAML()
# yaml.indent(mapping=4, sequence=4, offset=2)
# yaml.preserve_quotes = True
config = yaml.load(yaml_str)
# Each [..] below goes through mapgetitem; partial prefixes return the map itself.
print('min:', config['service']['scalingPolicy']['capacity']['min'])
print('max:', config['service']['scalingPolicy']['capacity']['max'])
print('---------')
# The first three lookups resolve to the nested map; the final ['max'] = 42
# is a plain item assignment on that nested map.
config['service']['scalingPolicy']['capacity']['max'] = 42
# and dump with the original routine, as it uses __getitem__
ruamel.yaml.comments.CommentedMap.__getitem__ = old_mapgetitem
yaml.dump(config, sys.stdout)
which gives:
min: 1
max: 1
---------
service.scalingPolicy.capacity:
  min: 1
  max: 42
I have found an alternative solution with PyYAML that recursively replaces dict keys containing dots with nested dicts. I hope it can be helpful.
import yaml
# Sample document: one dotted top-level key whose value is a nested mapping.
# The two-space indentation is significant: without it, `min` and `max` would
# be parsed as top-level keys and the dotted key would map to null.
my_yaml = """
service.scalingPolicy.capacity:
  min: 1
  max: 50
"""
def convert_to_dict(source_string, split_symbol='.', value=None):
    """Turn a delimited key path into a chain of single-key nested dicts.

    ``convert_to_dict('a.b.c', '.', 1)`` returns ``{'a': {'b': {'c': 1}}}``.

    Args:
        source_string: The delimited path, e.g. ``'service.capacity'``.
        split_symbol: The delimiter separating path segments.
        value: The object placed at the innermost level.

    Returns:
        The nested dict, or ``value`` itself when the path contains no
        non-empty segment.
    """
    nested = value
    # Wrap from the innermost segment outwards; empty segments (from leading,
    # trailing or doubled delimiters) are skipped.
    for segment in reversed(source_string.split(split_symbol)):
        if segment:
            nested = {segment: nested}
    return nested
def _merge_nested(target, incoming):
    """Recursively merge ``incoming`` into ``target`` in place (later values win)."""
    for name, item in incoming.items():
        if isinstance(item, dict) and isinstance(target.get(name), dict):
            _merge_nested(target[name], item)
        else:
            target[name] = item


def split_dots(source_dict):
    """Return a copy of ``source_dict`` with dotted string keys expanded.

    A key such as ``'a.b.c'`` becomes the nested path ``['a']['b']['c']``.
    Nested dict values are processed recursively. Keys that expand to the same
    path are merged instead of overwriting each other, so
    ``{'service.a': 1, 'service.b': 2}`` yields ``{'service': {'a': 1, 'b': 2}}``.
    On a real conflict (a scalar and a mapping at the same path) the later
    entry wins.

    Args:
        source_dict: Mapping as loaded from YAML; it is not modified.

    Returns:
        A new dict with every dotted key expanded into nested dicts.
    """
    result = {}
    for key, value in source_dict.items():
        if isinstance(value, dict):
            value = split_dots(value)
        # Only string keys can carry a dotted path; empty segments are dropped.
        # Non-string keys and keys made only of dots are kept unchanged.
        segments = [s for s in key.split('.') if s] if isinstance(key, str) else []
        if not segments:
            segments = [key]
        node = result
        for segment in segments[:-1]:
            child = node.get(segment)
            if not isinstance(child, dict):
                child = {}
                node[segment] = child
            node = child
        leaf = segments[-1]
        if isinstance(value, dict) and isinstance(node.get(leaf), dict):
            # Both sides are mappings: merge rather than replace.
            _merge_nested(node[leaf], value)
        else:
            node[leaf] = value
    return result
def main():
    """Load the sample YAML, expand its dotted keys and print the result.

    Prints the ``min`` and ``max`` values found under
    ``service -> scalingPolicy -> capacity`` and then the re-dumped YAML.
    A ``yaml.YAMLError`` raised along the way is printed instead of propagated.
    """
    try:
        yaml_dict = yaml.safe_load(my_yaml)
        processed_yaml_dict = split_dots(yaml_dict)
        capacity = processed_yaml_dict['service']['scalingPolicy']['capacity']
        print("Min:", capacity['min'])
        print("Max:", capacity['max'])
        # `indent` is a number of spaces, not a flag; the original passed True.
        # NOTE(review): PyYAML appears to ignore indent values <= 1 and fall
        # back to 2, so indent=2 should produce the same output -- confirm.
        data = yaml.dump(processed_yaml_dict, indent=2)
        print("New yaml:", data)
    except yaml.YAMLError as exc:
        print(exc)


if __name__ == "__main__":
    main()
We use YAML configuration for services scaling. Usually it goes like this:
service:
  scalingPolicy:
    capacity:
      min: 1
      max: 1
So it’s easy to open with basic PyYAML and parse as a dict, so that config['service']['scalingPolicy']['capacity']['min'] gives the result 1. The problem is that some configs are built with a dot delimiter, e.g.:
service.scalingPolicy.capacity:
  min: 1
  max: 1
The basic consumer of these configs is Java’s Spring, and somehow it treats this form the same as the example above. But because we also need to parse these configs with Python, I get the whole dot-separated line as a single config['service.scalingPolicy.capacity'] key.
The question is: how can I make Python parse any combination of keys (both separated by dots and separated by indentation and ":")? I didn’t find related parameters in the Python YAML libraries (I’ve checked standard PyYAML and ruamel.yaml), and handling every possible combination manually seems like a crazy idea. The only idea I have is to write my own parser, but maybe there is something I’m missing so that I won’t have to reinvent the wheel.
This is not trivial; it is much easier to split a lookup by a key with dots into recursive lookups in a nested data structure. Here you have a nested data structure, and different [key] lookups mean different things at different levels.
If you use ruamel.yaml in the default round-trip mode, you can add a class variable to the type that represents a mapping, which defines the separator on which the keys are split, and an instance variable that keeps track of the prefix already matched:
import sys
import ruamel.yaml
from ruamel.yaml.compat import ordereddict
from ruamel.yaml.comments import merge_attrib
# Sample document: one dotted top-level key whose value is a nested mapping.
# The two-space indentation is significant: without it, `min` and `max` would
# be parsed as top-level keys and the dotted key would map to null.
yaml_str = """
service.scalingPolicy.capacity:
  min: 1
  max: 1
"""
def mapgetitem(self, key):
    """Replacement ``__getitem__`` that resolves dotted keys one segment at a time.

    When the class attribute ``CommentedMap.sep`` is set (e.g. ``'.'``), chained
    lookups such as ``m['a']['b']['c']`` are joined into ``'a.b.c'``. While the
    joined prefix is only the leading part of an existing key, the map returns
    *itself* and remembers the prefix in the instance attribute ``splitprefix``;
    once the joined prefix is an actual key, the stored value is returned.

    Args:
        key: The key (or key segment) being looked up.

    Returns:
        The value stored under the (joined) key, or ``self`` when the joined
        prefix matches only the beginning of a longer dotted key.

    Raises:
        KeyError: If the (joined) key is neither a key nor a segment-aligned
            prefix of a key of this map.
    """
    # Default to None so a lookup made before `sep` is assigned does not fail
    # with AttributeError.
    sep = getattr(ruamel.yaml.comments.CommentedMap, 'sep', None)
    if sep is not None:
        pending = getattr(self, 'splitprefix', '')
        # Always clear the stored prefix first and re-set it only on a partial
        # match. This way a failed lookup (KeyError) cannot leave a stale
        # prefix behind that would corrupt every later lookup on this map.
        if hasattr(self, 'splitprefix'):
            delattr(self, 'splitprefix')
        # Only string keys can be segments of a dotted path; anything else is
        # looked up directly (any pending prefix is discarded).
        if isinstance(key, str):
            candidate = pending + sep + key if pending else key
            if candidate not in self:
                # Require the separator after the prefix so that 'serv' does
                # not match 'service.x'; skip non-string keys of the map.
                boundary = candidate + sep
                if any(isinstance(k, str) and k.startswith(boundary)
                       for k in self.keys()):
                    self.splitprefix = candidate
                    return self
                raise KeyError(candidate)
            key = candidate
    try:
        return ordereddict.__getitem__(self, key)
    except KeyError:
        # Fall back to the maps attached under the merge attribute.
        for merged in getattr(self, merge_attrib, []):
            if key in merged[1]:
                return merged[1][key]
        raise
# Demo: temporarily monkey-patch CommentedMap so chained lookups resolve
# dotted keys, read and modify a value, then restore the original method
# before dumping.
old_mapgetitem = ruamel.yaml.comments.CommentedMap.__getitem__ # save the original __getitem__
ruamel.yaml.comments.CommentedMap.__getitem__ = mapgetitem
# Class-level separator read by mapgetitem on every lookup.
ruamel.yaml.comments.CommentedMap.sep = '.'
yaml = ruamel.yaml.YAML()
# yaml.indent(mapping=4, sequence=4, offset=2)
# yaml.preserve_quotes = True
config = yaml.load(yaml_str)
# Each [..] below goes through mapgetitem; partial prefixes return the map itself.
print('min:', config['service']['scalingPolicy']['capacity']['min'])
print('max:', config['service']['scalingPolicy']['capacity']['max'])
print('---------')
# The first three lookups resolve to the nested map; the final ['max'] = 42
# is a plain item assignment on that nested map.
config['service']['scalingPolicy']['capacity']['max'] = 42
# and dump with the original routine, as it uses __getitem__
ruamel.yaml.comments.CommentedMap.__getitem__ = old_mapgetitem
yaml.dump(config, sys.stdout)
which gives:
min: 1
max: 1
---------
service.scalingPolicy.capacity:
  min: 1
  max: 42
I have found an alternative solution with PyYAML that recursively replaces dict keys containing dots with nested dicts. I hope it can be helpful.
import yaml
# Sample document: one dotted top-level key whose value is a nested mapping.
# The two-space indentation is significant: without it, `min` and `max` would
# be parsed as top-level keys and the dotted key would map to null.
my_yaml = """
service.scalingPolicy.capacity:
  min: 1
  max: 50
"""
def convert_to_dict(source_string, split_symbol='.', value=None):
    """Turn a delimited key path into a chain of single-key nested dicts.

    ``convert_to_dict('a.b.c', '.', 1)`` returns ``{'a': {'b': {'c': 1}}}``.

    Args:
        source_string: The delimited path, e.g. ``'service.capacity'``.
        split_symbol: The delimiter separating path segments.
        value: The object placed at the innermost level.

    Returns:
        The nested dict, or ``value`` itself when the path contains no
        non-empty segment.
    """
    nested = value
    # Wrap from the innermost segment outwards; empty segments (from leading,
    # trailing or doubled delimiters) are skipped.
    for segment in reversed(source_string.split(split_symbol)):
        if segment:
            nested = {segment: nested}
    return nested
def _merge_nested(target, incoming):
    """Recursively merge ``incoming`` into ``target`` in place (later values win)."""
    for name, item in incoming.items():
        if isinstance(item, dict) and isinstance(target.get(name), dict):
            _merge_nested(target[name], item)
        else:
            target[name] = item


def split_dots(source_dict):
    """Return a copy of ``source_dict`` with dotted string keys expanded.

    A key such as ``'a.b.c'`` becomes the nested path ``['a']['b']['c']``.
    Nested dict values are processed recursively. Keys that expand to the same
    path are merged instead of overwriting each other, so
    ``{'service.a': 1, 'service.b': 2}`` yields ``{'service': {'a': 1, 'b': 2}}``.
    On a real conflict (a scalar and a mapping at the same path) the later
    entry wins.

    Args:
        source_dict: Mapping as loaded from YAML; it is not modified.

    Returns:
        A new dict with every dotted key expanded into nested dicts.
    """
    result = {}
    for key, value in source_dict.items():
        if isinstance(value, dict):
            value = split_dots(value)
        # Only string keys can carry a dotted path; empty segments are dropped.
        # Non-string keys and keys made only of dots are kept unchanged.
        segments = [s for s in key.split('.') if s] if isinstance(key, str) else []
        if not segments:
            segments = [key]
        node = result
        for segment in segments[:-1]:
            child = node.get(segment)
            if not isinstance(child, dict):
                child = {}
                node[segment] = child
            node = child
        leaf = segments[-1]
        if isinstance(value, dict) and isinstance(node.get(leaf), dict):
            # Both sides are mappings: merge rather than replace.
            _merge_nested(node[leaf], value)
        else:
            node[leaf] = value
    return result
def main():
    """Load the sample YAML, expand its dotted keys and print the result.

    Prints the ``min`` and ``max`` values found under
    ``service -> scalingPolicy -> capacity`` and then the re-dumped YAML.
    A ``yaml.YAMLError`` raised along the way is printed instead of propagated.
    """
    try:
        yaml_dict = yaml.safe_load(my_yaml)
        processed_yaml_dict = split_dots(yaml_dict)
        capacity = processed_yaml_dict['service']['scalingPolicy']['capacity']
        print("Min:", capacity['min'])
        print("Max:", capacity['max'])
        # `indent` is a number of spaces, not a flag; the original passed True.
        # NOTE(review): PyYAML appears to ignore indent values <= 1 and fall
        # back to 2, so indent=2 should produce the same output -- confirm.
        data = yaml.dump(processed_yaml_dict, indent=2)
        print("New yaml:", data)
    except yaml.YAMLError as exc:
        print(exc)


if __name__ == "__main__":
    main()