Is there a Python parsing library that can parse a TOML-like format that specifies nested fields with [ParentHeader_ChildSection]?
Question:
I want to parse an externally defined (and undocumented) file format in Python. It looks somewhat similar to TOML, but with different text styles, and no quoting. For example:
[Schedule_Step122]
m_nMaxCurrent=0
m_szAddIn=Relay OFF
m_szLabel=06 - End Charge
m_uLimitNum=2
[Schedule_Step122_Limit0]
Equation0_szCompareSign=>=
Equation0_szRight=F_05_Charge_Capacity
Equation0_szLeft=PV_CHAN_Charge_Capacity
m_bStepLimit=1
m_szGotoStep=End Test
[Schedule_Step122_Limit1]
Equation0_szCompareSign=>=
Equation0_szLeft=PV_CHAN_Voltage
Equation0_szRight=3
m_bStepLimit=1
m_szGotoStep=End Test
(This is Arbin’s test schedule format.)
I would like the parsed structure to be something like:
"steps": [
{
"max_current": 0,
"add_in": RELAY_OFF,
"label": "06 - End Charge",
"limits": [
{
"equations": [
{
"left": PV_CHAN_CHARGE_CAPACITY,
"compare_sign": ">=",
"right": F_05_CHARGE_CAPACITY
}
],
"step_limit": 1,
"goto_step": END_TEST
},
{
"equations": [
{
"left": PV_CHAN_VOLTAGE,
"compare_sign": ">=",
"right": 3
}
],
"step_limit": 1,
"goto_step": END_TEST
}
]
}
]
The format seems superficially similar to TOML, including some of the nesting, but the string handling is different. I would also like to capture certain values as named constants.
I was also looking into defining a context-free grammar and using a lexer/parser like ANTLR, PLY, pyparsing, or Lark. I’m familiar with reading grammars in documentation, but haven’t written or used one with a parser before. However, I don’t know how one would represent the nesting structure (such as `Schedule_Step122_Limit0` being a member of `Schedule_Step122`) or the lack of guaranteed order among related keys (like `Equation0_szCompareSign`, `Equation0_szLeft`, etc.).
Is there a generic parsing tool I could write a definition for, which would give me the parsed/structured output? Or is the best approach here to write custom parsing logic?
Answers:
Tools like ANTLR, PLY, pyparsing, or Lark will give you almost no help with this problem. configparser might help a little, but I suspect it’d be more bother than it’s worth.
The following code is close to what you want. You’ll need to tweak it based on what you discover about the input-format, and what you’d like for the output-structure.
import re, json
def main():
    """Parse 'input.txt' and print the resulting structure as indented JSON."""
    parsed = parse('input.txt')
    rendered = json.dumps(parsed, indent=2)
    print(rendered)
def parse(filename):
    """Parse a schedule file into a tree of nested dictionaries.

    The file is read line by line. Three kinds of line are recognized:

    * blank lines (after stripping trailing whitespace), which are skipped;
    * header lines such as ``[Schedule_Step122_Limit0]``, whose
      underscore-separated pieces form a path from the root object to the
      object that the following name=value lines describe;
    * name=value lines such as ``m_szLabel=06 - End Charge`` or
      ``Equation0_szLeft=PV_CHAN_Voltage``. A lone ``m`` prefix means the
      field belongs directly to the current object; any other prefix is a
      path to an object nested inside the current object.

    The last piece of a name is a lowercase type indicator followed by a
    Pascal-cased field name (``szLabel``). The type indicator is currently
    ignored, so every value is stored as the raw string after the first '='.

    Args:
        filename: Path of the file to parse.

    Returns:
        The root dictionary holding everything that was parsed.

    Raises:
        ValueError: If a line is neither blank, a header, nor a name=value
            line; if a name=value line appears before any header; or if a
            field name does not have the ``<type><PascalName>`` shape.
    """
    root_object = {}
    current_object = None
    # 'with' guarantees the file is closed even when a line is rejected.
    with open(filename) as lines:
        for line_number, raw_line in enumerate(lines, start=1):
            # trim trailing whitespace (including the newline):
            line = raw_line.rstrip()
            if line == '':
                # blank line
                continue

            if mo := re.fullmatch(r'\[(\w+)\]', line):
                # header line
                # This identifies, via a 'path' from the root object,
                # the object that subsequent name-value lines are talking about.
                header_pieces = mo.group(1).split('_')
                current_object = get_nested_object(root_object, header_pieces)

            elif mo := re.fullmatch(r'([^=]+)=(.*)', line):
                # name-value line
                if current_object is None:
                    raise ValueError(
                        f'line {line_number}: name=value line before any header: {line!r}'
                    )
                (name_part, value_str) = mo.groups()
                # The {name_part} identifies a field in {current_object}
                # or some object nested within {current_object}.
                # The {value_str} encodes the value to be assigned to that field.
                name_pieces = name_part.split('_')
                prefix_pieces = name_pieces[:-1]
                field_name_piece = name_pieces[-1]

                field_mo = re.fullmatch(r'([a-z]+)([A-Z][a-zA-Z]*)', field_name_piece)
                if field_mo is None:
                    raise ValueError(
                        f'line {line_number}: unrecognized field name '
                        f'{field_name_piece!r} in {line!r}'
                    )

                if prefix_pieces == ['m']:
                    # This is an 'immediate' field of {current_object}
                    obj_w_field = current_object
                else:
                    # This is a field of some object nested within {current_object}
                    obj_w_field = get_nested_object(current_object, prefix_pieces)

                # Group 1 is the type indicator (e.g. 'sz', 'n'); it is not used yet.
                field_name = to_snake_case(field_mo.group(2))
                obj_w_field[field_name] = value_str

            else:
                raise ValueError(f'line {line_number}: unrecognized line: {line!r}')

    return root_object
def get_nested_object(base_object, header_pieces):
    """Return the object reached by following a path below base_object.

    Each piece of the path is resolved against the object reached so far,
    and any object missing along the way is created as an empty dict:

    * A purely alphabetic piece (e.g. "Schedule") names a field; the field
      key is the snake-cased piece.
    * A piece made of letters followed by digits (e.g. "Step122", "Limit0")
      names an element of a collection; the collection's key is the
      snake-cased letters plus 's' ("steps"), and the element's key is the
      integer value of the digits (122).

    Collections are stored as dicts keyed by int rather than as lists.
    In practice, you might want to make them lists.

    Args:
        base_object: Dict from which the path starts. It is modified in
            place when objects have to be created.
        header_pieces: Sequence of path pieces. An empty sequence yields
            base_object itself.

    Returns:
        The dict at the end of the path.

    Raises:
        ValueError: If a piece matches neither of the two recognized shapes.
    """
    obj = base_object
    for piece in header_pieces:
        if re.fullmatch(r'[A-Za-z]+', piece):
            # e.g. "Schedule"
            # This identifies a field/property/member of {obj},
            # which might or might not exist already.
            obj = obj.setdefault(to_snake_case(piece), {})
        elif mo := re.fullmatch(r'([A-Za-z]+)(\d+)', piece):
            # e.g., "Step122", "Limit0"
            # "Step122" implies that {obj} has a field named "steps"
            # and identifies the element at index 122 in it.
            (array_field_name_pc, index_str) = mo.groups()
            array_field_name = to_snake_case(array_field_name_pc) + 's'
            array = obj.setdefault(array_field_name, {})
            obj = array.setdefault(int(index_str), {})
        else:
            raise ValueError(f'unrecognized path piece: {piece!r}')
    return obj
# "_pc" suffix denotes a Pascal-cased name, e.g. "MaxCurrent"
def to_snake_case(name_pc):
    """Convert a Pascal-cased name (e.g. "MaxCurrent") to snake case.

    Every ASCII capital letter is lowercased and, unless it is the first
    character of the name, preceded by an underscore. All other characters
    are copied unchanged. The name must not already contain an underscore.
    """
    assert '_' not in name_pc
    converted = []
    for position, char in enumerate(name_pc):
        if 'A' <= char <= 'Z':
            # A capital starts a new word, except at the very beginning.
            if position != 0:
                converted.append('_')
            converted.append(char.lower())
        else:
            converted.append(char)
    return ''.join(converted)
# Run only when executed as a script, so that importing this module
# does not try to read 'input.txt' as a side effect.
if __name__ == '__main__':
    main()
For the example input, it prints:
{
"schedule": {
"steps": {
"122": {
"max_current": "0",
"add_in": "Relay OFF",
"label": "06 - End Charge",
"limit_num": "2",
"limits": {
"0": {
"equations": {
"0": {
"compare_sign": ">=",
"right": "F_05_Charge_Capacity",
"left": "PV_CHAN_Charge_Capacity"
}
},
"step_limit": "1",
"goto_step": "End Test"
},
"1": {
"equations": {
"0": {
"compare_sign": ">=",
"left": "PV_CHAN_Voltage",
"right": "3"
}
},
"step_limit": "1",
"goto_step": "End Test"
}
}
}
}
}
}
Is there a generic parsing tool I could write a definition for,
which would give me the parsed/structured output?
The input format is straightforward, so Invisible XML is an option if you have some knowledge of XML tools. Write an `ixml` grammar and convert the input to XML, or directly to JSON, using CoffeePot:
# shellcheck shell=sh
alias coffeepot='java -jar /usr/local/share/java/coffeepot-3.2.6/coffeepot-3.2.6.jar'
coffeepot -g:arbin.ixml -i:input.txt -o:output.xml --pretty-print
coffeepot -g:arbin.ixml -i:input.txt --format:json-data |
python3 -m json.tool --indent 2 > output.json
The `ixml` grammar, XML output, and JSON output are listed below.
A few quick notes on the `ixml` grammar are in order.
A mark before a non-terminal indicates the serialization: `@` (at sign) means XML attribute, `^` (circumflex) means XML element (same as no mark), and `-` (minus) means ignore the node itself but process the data under it.
For example, the `@stepn = ["0"-"9"]+ .` rule says to serialize `stepn` as an XML attribute, but this can be overridden where the rule is used. Replace `stepn` with `^stepn` in the `stepid` rule (or remove the `@`) to serialize it as an XML element instead. Conversely, changing an element to an attribute means loss of structure (flattening).
`[" " | #9 | #d]` specifies a character set, here horizontal whitespace (`#9` is tab and `#d` is CR), and `~[#a]` likewise specifies an exclusion: any character except `#a` (LF).
An ‘Invisible XML’ grammar allows ambiguity; in this case there isn’t any.
File arbin.ixml (‘Invisible XML’ grammar)
ixml version "1.0".
schedule = blank*, step+ .
step = head, limit* .
-head = stephead, headprop+, blank* .
-stephead = -"[", stepid, -"]", NL .
-stepid = -"Schedule_Step", stepn .
@stepn = ["0"-"9"]+ .
-headprop = (max_current | add_in | label | limit_num) .
max_current = -"m_nMaxCurrent", eqval .
add_in = -"m_szAddIn", eqval .
label = -"m_szLabel", eqval .
limit_num = -"m_uLimitNum", eqval .
limit = limithead, equation, limprop+, blank* .
-limithead = -"[", limitid, -"]", NL .
-limitid = -"Schedule_Step", stepn, -"_Limit", limitn .
@limitn = ["0"-"9"]+ .
equation = (compare | left | right)+ .
-eqpre = -"Equation", -["0"-"9"]+, -"_" .
-eqval = -"=", simple-value, NL .
compare = eqpre, -"szCompareSign", eqval .
left = eqpre, -"szLeft", eqval .
right = eqpre, -"szRight", eqval .
-limprop = (step_limit | goto_step) .
step_limit = -"m_bStepLimit", eqval .
goto_step = -"m_szGotoStep", eqval .
-simple-value = ~[" " | #9 | #d], char* .
-blank = S, NL .
-char = ~[#a] .
-NL = -#a .
-S = (-" " | -#9 | -#d)* .
File: output.xml
<schedule>
<step stepn='101'>
<max_current>0</max_current>
<add_in>Relay ON</add_in>
<label>08 - End Charge</label>
<limit_num>0</limit_num>
</step>
<step stepn='122'>
<max_current>0</max_current>
<add_in>Relay OFF</add_in>
<label>06 - End Charge</label>
<limit_num>2</limit_num>
<limit stepn='122' limitn='0'>
<equation>
<compare>>=</compare>
<right>F_05_Charge_Capacity</right>
<left>PV_CHAN_Charge_Capacity</left>
</equation>
<step_limit>1</step_limit>
<goto_step>End Test</goto_step>
</limit>
<limit stepn='122' limitn='1'>
<equation>
<compare>>=</compare>
<left>PV_CHAN_Voltage</left>
<right>3</right>
</equation>
<step_limit>1</step_limit>
<goto_step>End Test</goto_step>
</limit>
</step>
</schedule>
File: output.json
{
"schedule": {
"step": [
{
"stepn": 101,
"max_current": 0,
"add_in": "Relay ON",
"label": "08 - End Charge",
"limit_num": 0
},
{
"stepn": 122,
"max_current": 0,
"add_in": "Relay OFF",
"label": "06 - End Charge",
"limit_num": 2,
"limit": [
{
"stepn": 122,
"limitn": 0,
"equation": {
"compare": ">=",
"right": "F_05_Charge_Capacity",
"left": "PV_CHAN_Charge_Capacity"
},
"step_limit": 1,
"goto_step": "End Test"
},
{
"stepn": 122,
"limitn": 1,
"equation": {
"compare": ">=",
"left": "PV_CHAN_Voltage",
"right": 3
},
"step_limit": 1,
"goto_step": "End Test"
}
]
}
]
}
}
I want to parse an externally defined (and undocumented) file format in Python. It looks somewhat similar to TOML, but with different text styles, and no quoting. For example:
[Schedule_Step122]
m_nMaxCurrent=0
m_szAddIn=Relay OFF
m_szLabel=06 - End Charge
m_uLimitNum=2
[Schedule_Step122_Limit0]
Equation0_szCompareSign=>=
Equation0_szRight=F_05_Charge_Capacity
Equation0_szLeft=PV_CHAN_Charge_Capacity
m_bStepLimit=1
m_szGotoStep=End Test
[Schedule_Step122_Limit1]
Equation0_szCompareSign=>=
Equation0_szLeft=PV_CHAN_Voltage
Equation0_szRight=3
m_bStepLimit=1
m_szGotoStep=End Test
(This is Arbin’s test schedule format.)
I would like the parsed structure to be something like:
"steps": [
{
"max_current": 0,
"add_in": RELAY_OFF,
"label": "06 - End Charge",
"limits": [
{
"equations": [
{
"left": PV_CHAN_CHARGE_CAPACITY,
"compare_sign": ">=",
"right": F_05_CHARGE_CAPACITY
}
],
"step_limit": 1,
"goto_step": END_TEST
},
{
"equations": [
{
"left": PV_CHAN_VOLTAGE,
"compare_sign": ">=",
"right": 3
}
],
"step_limit": 1,
"goto_step": END_TEST
}
]
}
]
The format seems superficially similar to TOML, including some of the nesting, but the string handling is different. I would also like to capture certain values as named constants.
I was also looking into defining a context-free grammar and using a lexer/parser like ANTLR, PLY, pyparsing, or Lark. I’m familiar with reading grammars in documentation, but haven’t written or used one with a parser before. However, I don’t know how one would represent the nesting structure (such as `Schedule_Step122_Limit0` being a member of `Schedule_Step122`) or the lack of guaranteed order among related keys (like `Equation0_szCompareSign`, `Equation0_szLeft`, etc.).
Is there a generic parsing tool I could write a definition for, which would give me the parsed/structured output? Or is the best approach here to write custom parsing logic?
Tools like ANTLR, PLY, pyparsing, or Lark will give you almost no help with this problem. configparser might help a little, but I suspect it’d be more bother than it’s worth.
The following code is close to what you want. You’ll need to tweak it based on what you discover about the input-format, and what you’d like for the output-structure.
import re, json
def main():
    """Parse 'input.txt' and print the resulting structure as indented JSON."""
    parsed = parse('input.txt')
    rendered = json.dumps(parsed, indent=2)
    print(rendered)
def parse(filename):
    """Parse a schedule file into a tree of nested dictionaries.

    The file is read line by line. Three kinds of line are recognized:

    * blank lines (after stripping trailing whitespace), which are skipped;
    * header lines such as ``[Schedule_Step122_Limit0]``, whose
      underscore-separated pieces form a path from the root object to the
      object that the following name=value lines describe;
    * name=value lines such as ``m_szLabel=06 - End Charge`` or
      ``Equation0_szLeft=PV_CHAN_Voltage``. A lone ``m`` prefix means the
      field belongs directly to the current object; any other prefix is a
      path to an object nested inside the current object.

    The last piece of a name is a lowercase type indicator followed by a
    Pascal-cased field name (``szLabel``). The type indicator is currently
    ignored, so every value is stored as the raw string after the first '='.

    Args:
        filename: Path of the file to parse.

    Returns:
        The root dictionary holding everything that was parsed.

    Raises:
        ValueError: If a line is neither blank, a header, nor a name=value
            line; if a name=value line appears before any header; or if a
            field name does not have the ``<type><PascalName>`` shape.
    """
    root_object = {}
    current_object = None
    # 'with' guarantees the file is closed even when a line is rejected.
    with open(filename) as lines:
        for line_number, raw_line in enumerate(lines, start=1):
            # trim trailing whitespace (including the newline):
            line = raw_line.rstrip()
            if line == '':
                # blank line
                continue

            if mo := re.fullmatch(r'\[(\w+)\]', line):
                # header line
                # This identifies, via a 'path' from the root object,
                # the object that subsequent name-value lines are talking about.
                header_pieces = mo.group(1).split('_')
                current_object = get_nested_object(root_object, header_pieces)

            elif mo := re.fullmatch(r'([^=]+)=(.*)', line):
                # name-value line
                if current_object is None:
                    raise ValueError(
                        f'line {line_number}: name=value line before any header: {line!r}'
                    )
                (name_part, value_str) = mo.groups()
                # The {name_part} identifies a field in {current_object}
                # or some object nested within {current_object}.
                # The {value_str} encodes the value to be assigned to that field.
                name_pieces = name_part.split('_')
                prefix_pieces = name_pieces[:-1]
                field_name_piece = name_pieces[-1]

                field_mo = re.fullmatch(r'([a-z]+)([A-Z][a-zA-Z]*)', field_name_piece)
                if field_mo is None:
                    raise ValueError(
                        f'line {line_number}: unrecognized field name '
                        f'{field_name_piece!r} in {line!r}'
                    )

                if prefix_pieces == ['m']:
                    # This is an 'immediate' field of {current_object}
                    obj_w_field = current_object
                else:
                    # This is a field of some object nested within {current_object}
                    obj_w_field = get_nested_object(current_object, prefix_pieces)

                # Group 1 is the type indicator (e.g. 'sz', 'n'); it is not used yet.
                field_name = to_snake_case(field_mo.group(2))
                obj_w_field[field_name] = value_str

            else:
                raise ValueError(f'line {line_number}: unrecognized line: {line!r}')

    return root_object
def get_nested_object(base_object, header_pieces):
    """Return the object reached by following a path below base_object.

    Each piece of the path is resolved against the object reached so far,
    and any object missing along the way is created as an empty dict:

    * A purely alphabetic piece (e.g. "Schedule") names a field; the field
      key is the snake-cased piece.
    * A piece made of letters followed by digits (e.g. "Step122", "Limit0")
      names an element of a collection; the collection's key is the
      snake-cased letters plus 's' ("steps"), and the element's key is the
      integer value of the digits (122).

    Collections are stored as dicts keyed by int rather than as lists.
    In practice, you might want to make them lists.

    Args:
        base_object: Dict from which the path starts. It is modified in
            place when objects have to be created.
        header_pieces: Sequence of path pieces. An empty sequence yields
            base_object itself.

    Returns:
        The dict at the end of the path.

    Raises:
        ValueError: If a piece matches neither of the two recognized shapes.
    """
    obj = base_object
    for piece in header_pieces:
        if re.fullmatch(r'[A-Za-z]+', piece):
            # e.g. "Schedule"
            # This identifies a field/property/member of {obj},
            # which might or might not exist already.
            obj = obj.setdefault(to_snake_case(piece), {})
        elif mo := re.fullmatch(r'([A-Za-z]+)(\d+)', piece):
            # e.g., "Step122", "Limit0"
            # "Step122" implies that {obj} has a field named "steps"
            # and identifies the element at index 122 in it.
            (array_field_name_pc, index_str) = mo.groups()
            array_field_name = to_snake_case(array_field_name_pc) + 's'
            array = obj.setdefault(array_field_name, {})
            obj = array.setdefault(int(index_str), {})
        else:
            raise ValueError(f'unrecognized path piece: {piece!r}')
    return obj
# "_pc" suffix denotes a Pascal-cased name, e.g. "MaxCurrent"
def to_snake_case(name_pc):
    """Convert a Pascal-cased name (e.g. "MaxCurrent") to snake case.

    Every ASCII capital letter is lowercased and, unless it is the first
    character of the name, preceded by an underscore. All other characters
    are copied unchanged. The name must not already contain an underscore.
    """
    assert '_' not in name_pc
    converted = []
    for position, char in enumerate(name_pc):
        if 'A' <= char <= 'Z':
            # A capital starts a new word, except at the very beginning.
            if position != 0:
                converted.append('_')
            converted.append(char.lower())
        else:
            converted.append(char)
    return ''.join(converted)
# Run only when executed as a script, so that importing this module
# does not try to read 'input.txt' as a side effect.
if __name__ == '__main__':
    main()
For the example input, it prints:
{
"schedule": {
"steps": {
"122": {
"max_current": "0",
"add_in": "Relay OFF",
"label": "06 - End Charge",
"limit_num": "2",
"limits": {
"0": {
"equations": {
"0": {
"compare_sign": ">=",
"right": "F_05_Charge_Capacity",
"left": "PV_CHAN_Charge_Capacity"
}
},
"step_limit": "1",
"goto_step": "End Test"
},
"1": {
"equations": {
"0": {
"compare_sign": ">=",
"left": "PV_CHAN_Voltage",
"right": "3"
}
},
"step_limit": "1",
"goto_step": "End Test"
}
}
}
}
}
}
Is there a generic parsing tool I could write a definition for,
which would give me the parsed/structured output?
The input format is straightforward, so Invisible XML is an option if you have some knowledge of XML tools. Write an `ixml` grammar and convert the input to XML, or directly to JSON, using CoffeePot:
# shellcheck shell=sh
alias coffeepot='java -jar /usr/local/share/java/coffeepot-3.2.6/coffeepot-3.2.6.jar'
coffeepot -g:arbin.ixml -i:input.txt -o:output.xml --pretty-print
coffeepot -g:arbin.ixml -i:input.txt --format:json-data |
python3 -m json.tool --indent 2 > output.json
The `ixml` grammar, XML output, and JSON output are listed below.
A few quick notes on the `ixml` grammar are in order.
A mark before a non-terminal indicates the serialization: `@` (at sign) means XML attribute, `^` (circumflex) means XML element (same as no mark), and `-` (minus) means ignore the node itself but process the data under it.
For example, the `@stepn = ["0"-"9"]+ .` rule says to serialize `stepn` as an XML attribute, but this can be overridden where the rule is used. Replace `stepn` with `^stepn` in the `stepid` rule (or remove the `@`) to serialize it as an XML element instead. Conversely, changing an element to an attribute means loss of structure (flattening).
`[" " | #9 | #d]` specifies a character set, here horizontal whitespace (`#9` is tab and `#d` is CR), and `~[#a]` likewise specifies an exclusion: any character except `#a` (LF).
An ‘Invisible XML’ grammar allows ambiguity; in this case there isn’t any.
File arbin.ixml (‘Invisible XML’ grammar)
ixml version "1.0".
schedule = blank*, step+ .
step = head, limit* .
-head = stephead, headprop+, blank* .
-stephead = -"[", stepid, -"]", NL .
-stepid = -"Schedule_Step", stepn .
@stepn = ["0"-"9"]+ .
-headprop = (max_current | add_in | label | limit_num) .
max_current = -"m_nMaxCurrent", eqval .
add_in = -"m_szAddIn", eqval .
label = -"m_szLabel", eqval .
limit_num = -"m_uLimitNum", eqval .
limit = limithead, equation, limprop+, blank* .
-limithead = -"[", limitid, -"]", NL .
-limitid = -"Schedule_Step", stepn, -"_Limit", limitn .
@limitn = ["0"-"9"]+ .
equation = (compare | left | right)+ .
-eqpre = -"Equation", -["0"-"9"]+, -"_" .
-eqval = -"=", simple-value, NL .
compare = eqpre, -"szCompareSign", eqval .
left = eqpre, -"szLeft", eqval .
right = eqpre, -"szRight", eqval .
-limprop = (step_limit | goto_step) .
step_limit = -"m_bStepLimit", eqval .
goto_step = -"m_szGotoStep", eqval .
-simple-value = ~[" " | #9 | #d], char* .
-blank = S, NL .
-char = ~[#a] .
-NL = -#a .
-S = (-" " | -#9 | -#d)* .
File: output.xml
<schedule>
<step stepn='101'>
<max_current>0</max_current>
<add_in>Relay ON</add_in>
<label>08 - End Charge</label>
<limit_num>0</limit_num>
</step>
<step stepn='122'>
<max_current>0</max_current>
<add_in>Relay OFF</add_in>
<label>06 - End Charge</label>
<limit_num>2</limit_num>
<limit stepn='122' limitn='0'>
<equation>
<compare>>=</compare>
<right>F_05_Charge_Capacity</right>
<left>PV_CHAN_Charge_Capacity</left>
</equation>
<step_limit>1</step_limit>
<goto_step>End Test</goto_step>
</limit>
<limit stepn='122' limitn='1'>
<equation>
<compare>>=</compare>
<left>PV_CHAN_Voltage</left>
<right>3</right>
</equation>
<step_limit>1</step_limit>
<goto_step>End Test</goto_step>
</limit>
</step>
</schedule>
File: output.json
{
"schedule": {
"step": [
{
"stepn": 101,
"max_current": 0,
"add_in": "Relay ON",
"label": "08 - End Charge",
"limit_num": 0
},
{
"stepn": 122,
"max_current": 0,
"add_in": "Relay OFF",
"label": "06 - End Charge",
"limit_num": 2,
"limit": [
{
"stepn": 122,
"limitn": 0,
"equation": {
"compare": ">=",
"right": "F_05_Charge_Capacity",
"left": "PV_CHAN_Charge_Capacity"
},
"step_limit": 1,
"goto_step": "End Test"
},
{
"stepn": 122,
"limitn": 1,
"equation": {
"compare": ">=",
"left": "PV_CHAN_Voltage",
"right": 3
},
"step_limit": 1,
"goto_step": "End Test"
}
]
}
]
}
}