Getting a weird behaviour when using Matcher from Spacy several times
Question:
I would like to use Matcher from Spacy on a list of span (sents)
class Chunker:
def __init__(self, nlp, matcher):
self.nlp = nlp
self.matcher = matcher
self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
def on_match_callback(self, matcher, doc, i, matches):
match_id, start, end = matches[i]
string_id = self.nlp.vocab.strings[match_id]
span = doc[start:end]
print("(", span, ")")
self.phrase[string_id].append(span)
def chunk(self, text):
self.phrases = []
doc = self.nlp(text)
sents = list(doc.sents)
for sent in sents:
self.phrase = {
"NP": [],
"VP": [],
"VVP": []
}
self.phrases.append(self.phrase)
print("[", sent, "]")
self.matcher(sent)
for phrase in self.phrase.values():
phrase.sort(key=lambda x: x.start)
return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)
phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.nI love pdf, it is wonderfull.")
print(phrases)
but it seems confused and give me this response
[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]
The first element is good but not the second {'NP': [built], 'VP': [is, of], 'VVP': []}
Is there a problem if we use the matcher several times with different text ?
Answers:
Instead of using multiple sentence, I check the sentence ID on the callback function, It work but looks a bit gross
class Chunker:
def __init__(self, nlp, matcher):
self.nlp = nlp
self.matcher = matcher
self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
def on_match_callback(self, matcher, doc, i, matches):
match_id, start, end = matches[i]
string_id = self.nlp.vocab.strings[match_id]
span = doc[start:end]
sents = list(doc.sents)
sent_id = sents.index(span.sent)
print("(", span, ")")
print("Sentence number: ", sent_id)
self.phrases[sent_id][string_id].append(span)
def chunk(self, text):
self.phrases = []
doc = self.nlp(text)
self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
self.matcher(doc)
for phrases in self.phrases:
for phrase in phrases.values():
phrase.sort(key=lambda x: x.start)
return self.phrases
I would like to use Matcher from Spacy on a list of span (sents)
class Chunker:
def __init__(self, nlp, matcher):
self.nlp = nlp
self.matcher = matcher
self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
def on_match_callback(self, matcher, doc, i, matches):
match_id, start, end = matches[i]
string_id = self.nlp.vocab.strings[match_id]
span = doc[start:end]
print("(", span, ")")
self.phrase[string_id].append(span)
def chunk(self, text):
self.phrases = []
doc = self.nlp(text)
sents = list(doc.sents)
for sent in sents:
self.phrase = {
"NP": [],
"VP": [],
"VVP": []
}
self.phrases.append(self.phrase)
print("[", sent, "]")
self.matcher(sent)
for phrase in self.phrase.values():
phrase.sort(key=lambda x: x.start)
return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)
phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.nI love pdf, it is wonderfull.")
print(phrases)
but it seems confused and give me this response
[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]
The first element is good but not the second {'NP': [built], 'VP': [is, of], 'VVP': []}
Is there a problem if we use the matcher several times with different text ?
Instead of using multiple sentence, I check the sentence ID on the callback function, It work but looks a bit gross
class Chunker:
def __init__(self, nlp, matcher):
self.nlp = nlp
self.matcher = matcher
self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
def on_match_callback(self, matcher, doc, i, matches):
match_id, start, end = matches[i]
string_id = self.nlp.vocab.strings[match_id]
span = doc[start:end]
sents = list(doc.sents)
sent_id = sents.index(span.sent)
print("(", span, ")")
print("Sentence number: ", sent_id)
self.phrases[sent_id][string_id].append(span)
def chunk(self, text):
self.phrases = []
doc = self.nlp(text)
self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
self.matcher(doc)
for phrases in self.phrases:
for phrase in phrases.values():
phrase.sort(key=lambda x: x.start)
return self.phrases