Save and load NLP results in spaCy
Question:
I want to use spaCy to analyze many small texts, and I want to store the NLP results for further use to save processing time. I found code at Storing and Loading spaCy Documents Containing Word Vectors, but I get an error and I cannot find out how to fix it. I am fairly new to Python.
In the following code, I store the NLP results to a file and try to read them again. I can write the first file, but I do not find the second file (vocab). I also get two errors: that Doc and Vocab are not defined.
Any idea on how to fix this, or another method to achieve the same result, is more than welcome.
Thanks!
import spacy
nlp = spacy.load('en_core_web_md')
doc = nlp("He eats a green apple")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
NLP_FName = "E:\SaveTest.nlp"
doc.to_disk(NLP_FName)
Vocab_FName = "E:\SaveTest.voc"
doc.vocab.to_disk(Vocab_FName)
#To read the data again:
idoc = Doc(Vocab()).from_disk(NLP_FName)
idoc.vocab.from_disk(Vocab_FName)
for token in idoc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
Answers:
I tried your code and found a few minor issues, which I fixed in the code below.
Note that SaveTest.nlp is a binary file with your doc info, and SaveTest.voc is a folder with all the spaCy model vocab information (vectors, strings, among others).
Changes I made:
- Import the Doc class from spacy.tokens
- Import the Vocab class from spacy.vocab
- Download the en_core_web_md model using the following command (a programmatic variant is sketched right after this list):
python -m spacy download en_core_web_md
Please note that spaCy has multiple models for each language, and usually you have to download them first (typically the sm, md and lg models). Read more about it here.
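If you prefer to stay inside Python, the download can also be triggered programmatically through spacy.cli; a minimal sketch, assuming a regular pip-installed spaCy 3.x:
import spacy

# Download the md model once (equivalent to `python -m spacy download en_core_web_md`),
# then load it as usual. In some setups you may need to restart the interpreter
# before a freshly downloaded model can be loaded.
spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")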
Code:
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en_core_web_md')
doc = nlp("He eats a green apple")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
NLP_FName = "E:\SaveTest.nlp"
doc.to_disk(NLP_FName)
Vocab_FName = "E:\SaveTest.voc"
doc.vocab.to_disk(Vocab_FName)
#To read the data again:
idoc = Doc(Vocab()).from_disk(NLP_FName)
idoc.vocab.from_disk(Vocab_FName)
for token in idoc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
Let me know if this is helpful to you, and if not, please add your error message to your original question so I can help.
The efficient way to do this is to use a DocBin instead: https://spacy.io/usage/saving-loading#docs
Example adapted from the docs (you can use doc_bin.to_disk/from_disk instead of to_bytes/from_bytes):
import spacy
from spacy.tokens import DocBin
doc_bin = DocBin()
texts = ["Some text", "Lots of texts...", "..."]
nlp = spacy.load("en_core_web_sm")
for doc in nlp.pipe(texts):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()
# Deserialize later, e.g. in a new process
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(bytes_data)
docs = list(doc_bin.get_docs(nlp.vocab))
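For completeness, the same round trip with the disk variant mentioned above looks roughly like this (a minimal sketch; "docs.spacy" is just a placeholder filename):
import spacy
from spacy.tokens import DocBin

texts = ["Some text", "Lots of texts...", "..."]
nlp = spacy.load("en_core_web_sm")

doc_bin = DocBin()
for doc in nlp.pipe(texts):
    doc_bin.add(doc)
doc_bin.to_disk("docs.spacy")  # placeholder path; writes a single binary file

# Deserialize later, e.g. in a new process
nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("docs.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))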
It's a long shot to get an answer, but I tried your code and it doesn't work for DocBins. I pasted my code below, including the import part.
import spacy
from spacy.tokens import DocBin
from LanguageIdentifier import predict
import fitz
import glob
import os
from datetime import datetime
import logging
#English-Accuracy: en_core_web_trf
#French-Accuracy: fr_dep_news_trf
#German-Accuracy: de_dep_news_trf
#Multi Language-Accuracy: xx_sent_ud_sm
#DocBins
FRdoc_bin = DocBin(store_user_data=True, attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
ENdoc_bin = DocBin(store_user_data=True, attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
DEdoc_bin = DocBin(store_user_data=True, attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
MULTIdoc_bin = DocBin(store_user_data=True, attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
#NLP modules
frNLP = spacy.load('fr_dep_news_trf')
enNLP = spacy.load('en_core_web_trf')
deNLP = spacy.load('de_dep_news_trf')
multiNLP = spacy.load('xx_sent_ud_sm')
ErroredFiles =[]
def processNLP(text):
    lang = predict(text)
    if 'fr' in lang:
        doc = frNLP(text)
        FRdoc_bin.add(doc)
        return
    elif 'de' in lang:
        DEdoc_bin.add(deNLP(text))
        return
    elif 'en' in lang:
        ENdoc_bin.add(enNLP(text))
        return
    else:
        MULTIdoc_bin.add(multiNLP(text))
        return
def get_text_from_pdf(Path):
    text = ''
    content = fitz.open(Path)
    for page in content:
        if page.number == 1:
            text = page.get_text()[212:]
        else:
            text = text + page.get_text()
    return text
FolderPath = r'C:[Redacted]DataSource**.pdf'
PDFfiles = glob.glob(FolderPath)
counter = 0
for file in PDFfiles:
    counter = counter + 1
    try:
        textPDF = get_text_from_pdf(file)
        processNLP(textPDF)
    except Exception as e:
        ErroredFiles.append(file)
        logging.error('Error with file ' + file)
        logging.error('Error message: ' + str(e))
        MULTIdoc_bin.add(multiNLP(textPDF))
    if counter == 10:  # For testing purposes only
        break
CreatedModelPath = r'C:[Redacted]Results' + datetime.strftime(datetime.now(),"%Y%m%d%H%M%S")
os.mkdir(CreatedModelPath)
FRdoc_bin.to_disk(CreatedModelPath+r'FRdocBin'+'.nlp')
FRdoc_bin.vocab.to_disk(CreatedModelPath+r'FRdocBin'+'.voc')
ENdoc_bin.to_disk(CreatedModelPath+r'ENdocBin'+'.nlp')
DEdoc_bin.to_disk(CreatedModelPath+r'DEdocBin'+'.nlp')
MULTIdoc_bin.to_disk(CreatedModelPath+'MULTIdocBin'+'.nlp')
Error I get:
Traceback (most recent call last):
File "C:[Redacted]ProcessingEngine.py", line 117, in <module>
FRdoc_bin.vocab.to_disk(CreatedModelPath+r'FRdocBin'+'.voc')
AttributeError: 'DocBin' object has no attribute 'vocab'
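For what it's worth, that AttributeError is expected: unlike a Doc, a DocBin does not carry its own Vocab, so there is no separate .voc file to write for it. When you read the docs back, the vocab comes from whatever pipeline you pass to get_docs. A minimal sketch of the reload step, using a placeholder path instead of the redacted one above:
import spacy
from spacy.tokens import DocBin

# Any pipeline for the right language can supply the vocab;
# the French docs above were produced with fr_dep_news_trf.
frNLP = spacy.load('fr_dep_news_trf')

FRdoc_bin = DocBin().from_disk('FRdocBin.nlp')  # placeholder path
FRdocs = list(FRdoc_bin.get_docs(frNLP.vocab))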