Getting a weird behaviour when using Matcher from Spacy several times

Question

I would like to use Matcher from Spacy on a list of span (sents)

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)

            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)

phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.nI love pdf, it is wonderfull.")
print(phrases)

but it seems confused and give me this response

[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
 ]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]

The first element is good but not the second {'NP': [built], 'VP': [is, of], 'VVP': []}
Is there a problem if we use the matcher several times with different text ?

Asked By: Faulheit

||

Source

Answer 1

Instead of using multiple sentence, I check the sentence ID on the callback function, It work but looks a bit gross

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)

        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)

        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases

Answered By: Faulheit

Getting a weird behaviour when using Matcher from Spacy several times

Question:

Answers: