Getting a weird behaviour when using Matcher from Spacy several times

Question:

I would like to use Matcher from Spacy on a list of span (sents)

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)

            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)

phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.nI love pdf, it is wonderfull.")
print(phrases)

but it seems confused and give me this response

[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
 ]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]

The first element is good but not the second {'NP': [built], 'VP': [is, of], 'VVP': []}
Is there a problem if we use the matcher several times with different text ?

Asked By: Faulheit

||

Answers:

Instead of using multiple sentence, I check the sentence ID on the callback function, It work but looks a bit gross

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)

        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)

        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases
Answered By: Faulheit
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.