How do I retrieve phrases from an NLTK Tree using custom node labels?

Question:

Given an NLTK tree produced using the code below, how do I retrieve the leaf values (phrases) that match any of the node labels assigned using nltk.RegexpParser (e.g. those phrases which match the Present_Indefinite or Present_Perfect tense)?

from nltk import word_tokenize, pos_tag
import nltk

# Sample text to analyse. Adjacent string literals inside the parentheses are
# joined into a single string, so the sentence can be wrapped across lines
# without breaking the literal.
text = (
    "#NOVAVAX has produced the #NUVAXOVID vaccine. "
    "Will that provide a new rally? We see Biotechnology "
    "Stock $NVAX Entering the Buying Area."
)
tokenized = word_tokenize(text)  # Tokenize text
tagged = pos_tag(tokenized)  # Tag tokenized text with PoS tags

# Chunk grammar passed to nltk.RegexpParser: one "Label: {tag pattern}" rule
# per line, where each label names a verb tense and each pattern is a sequence
# of PoS tags. Rules are listed from the longest tag sequence to the shortest.
# NOTE(review): presumably this ordering lets longer verb groups be chunked
# before a shorter rule can claim the same tokens - confirm against the
# RegexpParser stage-ordering documentation.
my_grammar = r"""
Future_Perfect_Continuous: {<MD><VB><VBN><VBG>}
Future_Continuous:         {<MD><VB><VBG>}
Future_Perfect:            {<MD><VB><VBN>}
Past_Perfect_Continuous:   {<VBD><VBN><VBG>}
Present_Perfect_Continuous:{<VBP|VBZ><VBN><VBG>}
Future_Indefinite:         {<MD><VB>}
Past_Continuous:           {<VBD><VBG>}
Past_Perfect:              {<VBD><VBN>}
Present_Continuous:        {<VBZ|VBP><VBG>}
Present_Perfect:           {<VBZ|VBP><VBN>}
Past_Indefinite:           {<VBD>}
Present_Indefinite:        {<VBZ>|<VBP>}"""


def check_grammar(grammar, tags):
    """Chunk PoS-tagged tokens with a regexp grammar.

    Builds an ``nltk.RegexpParser`` from ``grammar`` and returns whatever its
    ``parse`` method produces for ``tags``.
    """
    return nltk.RegexpParser(grammar).parse(tags)

# Chunk the tagged tokens with the grammar; check_grammar() returns the
# resulting parse tree.
result = check_grammar(my_grammar, tagged)
# Show the type of the returned object.
print(type(result))
# Output: <class 'nltk.tree.tree.Tree'>

More specifically, given that the output of print(result) is as shown below, how can I retrieve the phrases labelled as Present_Perfect and Present_Indefinite, or more generally, any other phrases which match the labels in my grammar?

(S
  #/#
  NOVAVAX/NNP
  (Present_Perfect has/VBZ produced/VBN)
  the/DT
  #/#
  NUVAXOVID/NNP
  vaccine/NN
  ./.
  Will/MD
  that/WDT
  provide/VB
  a/DT
  new/JJ
  rally/NN
  ?/.
  We/PRP
  (Present_Indefinite see/VBP)
  Biotechnology/NNP
  Stock/NNP
  $/$
  NVAX/NNP
  Entering/NNP
  the/DT
  Buying/NNP
  Area/NNP
  ./.)
Asked By: Samar Pratap Singh

||

Answers:

I’ve created a get_phrases_using_tense_labels() function which takes:

  • the parse tree returned from your check_grammar() function (I’ve renamed it to get_parse_tree() as this is more meaningful in terms of what the function is doing), and
  • a list of tense labels based on your grammar.

The tense labels are retrieved using the get_labels_from_grammar() function I created, which iterates over the lines in your grammar and splits the string at the ":" retrieving the tense label.

The function then returns, for each node in the NLTK tree whose label matches any of your tense_labels (e.g. "Present_Indefinite" and "Present_Perfect" in the solution below), the first leaf of the matching phrase as a (word, tag) tuple. I’ve used a smaller text as input as an example.

Parse tree with multiple tense labels

Solution

from nltk import word_tokenize, pos_tag
import nltk

# Full example text. Adjacent string literals inside the parentheses are
# joined into a single string, so the sentence can be wrapped across lines
# without breaking the literal. The statements below tokenize textSmall,
# not this string.
text = (
    "#NOVAVAX produces #NUVAXOVID vaccine. "
    "Will that provide a new rally? We see Biotechnology "
    "Stock $NVAX Entering the Buying Area."
)

# Smaller text for testing
textSmall = "We see a surge in sales. It has been a great year."

tokenized = word_tokenize(textSmall)  # Tokenize text
tagged = pos_tag(tokenized)  # Tag tokenized text with PoS tags

# Chunk grammar passed to nltk.RegexpParser: one "Label: {tag pattern}" rule
# per line, where each label names a verb tense and each pattern is a sequence
# of PoS tags. Rules are listed from the longest tag sequence to the shortest.
# NOTE(review): presumably this ordering lets longer verb groups be chunked
# before a shorter rule can claim the same tokens - confirm against the
# RegexpParser stage-ordering documentation.
my_grammar = r"""
Future_Perfect_Continuous: {<MD><VB><VBN><VBG>}
Future_Continuous:         {<MD><VB><VBG>}
Future_Perfect:            {<MD><VB><VBN>}
Past_Perfect_Continuous:   {<VBD><VBN><VBG>}
Present_Perfect_Continuous:{<VBP|VBZ><VBN><VBG>}
Future_Indefinite:         {<MD><VB>}
Past_Continuous:           {<VBD><VBG>}
Past_Perfect:              {<VBD><VBN>}
Present_Continuous:        {<VBZ|VBP><VBG>}
Present_Perfect:           {<VBZ|VBP><VBN>}
Past_Indefinite:           {<VBD>}
Present_Indefinite:        {<VBZ>|<VBP>}"""


def get_parse_tree(grammar, pos_tagged_text):
    """Return the parse tree obtained by chunking PoS-tagged text.

    An ``nltk.RegexpParser`` is built from ``grammar`` and applied to
    ``pos_tagged_text``; the result of its ``parse`` call is returned
    unchanged.

    Tip: call ``.draw()`` on the returned tree to visualise it.
    """
    chunker = nltk.RegexpParser(grammar)
    return chunker.parse(pos_tagged_text)


def get_labels_from_grammar(grammar):
    """Extract the rule labels from a line-separated NLTK regexp grammar.

    Each rule is expected on its own line in the form ``Label: {pattern}``.
    The label is the text before the first ``:`` with surrounding whitespace
    removed. Blank lines, lines starting with ``#`` and lines without a
    ``:`` contribute no label, so the grammar does not have to start with an
    empty line and may contain spacing between rules.

    Args:
        grammar: Grammar text with one rule per line.

    Returns:
        List of label strings in the order they appear in ``grammar``.
    """
    labels = []
    for line in grammar.splitlines():
        stripped = line.strip()
        # Nothing to extract from empty or comment-only lines.
        if not stripped or stripped.startswith("#"):
            continue
        label, colon, _ = stripped.partition(":")
        label = label.strip()
        # Only a line that actually contains "Label:" declares a label.
        if colon and label:
            labels.append(label)
    return labels


def get_phrases_using_tense_labels(parse_tree, tense_labels_to_get, *, whole_phrase=False):
    """Collect the leaves of every subtree whose label is a requested label.

    Args:
        parse_tree: Tree to search; it must provide ``subtrees(filter=...)``
            and its nodes must provide ``label()`` and ``leaves()``.
        tense_labels_to_get: Iterable of node labels to look for.
        whole_phrase: When False (the default), only the first leaf of each
            matching subtree is returned. When True, the complete list of
            leaves of each matching subtree is returned instead, so
            multi-word phrases are kept intact.

    Returns:
        A list with one entry per matching subtree, in the order
        ``parse_tree.subtrees`` yields them: a single leaf by default, or a
        list of leaves when ``whole_phrase`` is True. A matching subtree
        with no leaves is skipped in the default mode.
    """
    # Materialise once so a one-shot iterable is not exhausted by the first node.
    wanted = tuple(tense_labels_to_get)
    matching_phrases = []
    for node in parse_tree.subtrees(filter=lambda subtree: subtree.label() in wanted):
        leaves = node.leaves()
        if whole_phrase:
            matching_phrases.append(leaves)
        elif leaves:
            # Guarded so a leafless node cannot raise IndexError.
            matching_phrases.append(leaves[0])
    return matching_phrases


def get_tense_labels_in_tree(parse_tree, tense_labels_to_get):
    """Return the requested labels that actually occur in the parse tree.

    Every subtree of ``parse_tree`` is visited; the label of each subtree
    whose label equals one of ``tense_labels_to_get`` is collected. The
    result has one entry per matching subtree, in traversal order, so a
    label may appear more than once.
    """
    return [
        subtree.label()
        for subtree in parse_tree.subtrees()
        if subtree.label() in tense_labels_to_get
    ]


# Chunk the tagged sample text into a parse tree
text_parse_tree = get_parse_tree(my_grammar, tagged)
# print(text_parse_tree)  # View parse tree output
# Read every label declared in the grammar, then look them all up in the tree
tense_labels = get_labels_from_grammar(my_grammar)
phrases = get_phrases_using_tense_labels(text_parse_tree, tense_labels)
labels = get_tense_labels_in_tree(text_parse_tree, tense_labels)

print(phrases)
# Output: [('see', 'VBP'), ('has', 'VBZ')]
print([phrase[0] for phrase in phrases])
# Output: ['see', 'has']
print(labels)
# Output: ['Present_Indefinite', 'Present_Perfect']
# (labels come from the same traversal as the phrases above, so they share its order)
Answered By: Kyle F Hartzenberg