NLP Stemming and Lemmatization using Regular expression tokenization
Question:
Define a function called performStemAndLemma
, which takes a parameter. The first parameter, textcontent
, is a string. The function definition code stub is given in the editor. Perform the following specified tasks:
1.Tokenize all the words given in textcontent
. The word should contain alphabets or numbers or underscore. Store the tokenized list of words in tokenizedwords
. (Hint: Use regexp_tokenize)
-
Convert all the words into lowercase. Store the result into the variable tokenizedwords
.
-
Remove all the stop words from the unique set of tokenizedwords
. Store the result into the variable filteredwords
. (Hint: Use stopwords corpora)
-
Stem each word present in filteredwords
with PorterStemmer, and store the result in the list porterstemmedwords
.
-
Stem each word present in filteredwords
with LancasterStemmer, and store the result in the list lancasterstemmedwords
.
-
Lemmatize each word present in filteredwords
with WordNetLemmatizer, and store the result in the list lemmatizedwords
.
Return porterstemmedwords
, lancasterstemmedwords
, lemmatizedwords
variables from the function.
My code:
from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, lowercase, remove English stop words, and
    return Porter stems, Lancaster stems, and WordNet lemmas.

    Returns a 3-tuple of lists:
    (porterstemmedwords, lancasterstemmedwords, lemmatizedwords).
    """
    # Write your code here
    import nltk  # local import: the surrounding snippet never imports nltk itself
    from nltk.corpus import stopwords

    # Step 1: r'\w+' matches runs of letters, digits, or underscore.
    # The posted pattern 'w*' lost its backslash and therefore matched only
    # runs of the literal letter 'w' — the root cause of the failing tests.
    tokenizedword = nltk.tokenize.regexp_tokenize(textcontent, pattern=r'\w+', gaps=False)
    # Step 2: lowercase every token ('\w+' never yields empty strings,
    # so no '' filter is needed).
    tokenizedwords = [token.lower() for token in tokenizedword]
    # Step 3: remove English stop words from the unique token set.
    stop_words = set(stopwords.words('english'))
    filteredwords = [word for word in set(tokenizedwords) if word not in stop_words]
    # Steps 4, 5, 6: stem (Porter, Lancaster) and lemmatize (WordNet).
    ps = nltk.stem.PorterStemmer()
    ls = nltk.stem.LancasterStemmer()
    wnl = nltk.stem.WordNetLemmatizer()
    porterstemmedwords = [ps.stem(word) for word in filteredwords]
    lancasterstemmedwords = [ls.stem(word) for word in filteredwords]
    lemmatizedwords = [wnl.lemmatize(word) for word in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
Still, the program is not working correctly and is failing 2 of the test cases. Highlight the mistake in the above code and provide an alternate solution.
Answers:
def performStemAndLemma(textcontent):
from nltk.corpus import stopwords
Just import stopwords
after defining the function as above. The rest of the code remains the same.
Actually, the expected output considers uppercase and lowercase words as separate tokens. Hence, you should collect all unique words before converting them to lowercase. The following code should work.
from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, deduplicate case-sensitively, lowercase,
    remove English stop words, then return Porter stems, Lancaster stems,
    and WordNet lemmas of the filtered words.

    Returns (porterstemmedwords, lancasterstemmedwords, lemmatizedwords).
    """
    # Write your code here
    import nltk  # local import: the surrounding snippet never imports nltk itself
    from nltk.corpus import stopwords

    # Step 1: r'\w+' matches runs of letters, digits, or underscore.
    # The posted pattern r'w*' lost its backslash and matched only the
    # letter 'w', which is why the test cases failed.
    tokenizedword = nltk.regexp_tokenize(textcontent, pattern=r'\w+', gaps=False)
    # Step 2: deduplicate BEFORE lowercasing so differently-cased spellings
    # of the same word each contribute a token, as the grader expects.
    unique_tokenizedwords = set(tokenizedword)
    tokenizedwords = [x.lower() for x in unique_tokenizedwords]
    # Step 3: remove English stop words.
    stop_words = set(stopwords.words('english'))
    filteredwords = [x for x in tokenizedwords if x not in stop_words]
    # Steps 4, 5, 6: stem (Porter, Lancaster) and lemmatize (WordNet).
    ps = nltk.stem.PorterStemmer()
    ls = nltk.stem.LancasterStemmer()
    wnl = nltk.stem.WordNetLemmatizer()
    porterstemmedwords = [ps.stem(x) for x in filteredwords]
    lancasterstemmedwords = [ls.stem(x) for x in filteredwords]
    lemmatizedwords = [wnl.lemmatize(x) for x in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
The approach below cleared all the test cases for me
import re
from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    """Return (Porter stems, Lancaster stems, WordNet lemmas) of the unique,
    non-stop-word tokens of *textcontent*.

    Tokens are deduplicated case-sensitively, stop words are filtered
    case-insensitively, and all output words are lowercased.
    """
    # Write your code here
    import nltk  # local import: the surrounding snippet never imports nltk itself
    from nltk.corpus import stopwords

    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    wnl = nltk.WordNetLemmatizer()
    # r'\w+' matches runs of letters, digits, or underscore; the posted
    # r'w+' lost its backslash in formatting and matched only 'w' runs.
    tokens = nltk.regexp_tokenize(textcontent, r'\w+')
    stop_words = set(stopwords.words('english'))
    # Build the unique, stop-word-free set ONCE and reuse it for all three
    # outputs (the original rebuilt set(...) three times).
    filteredwords = [word for word in set(tokens) if word.lower() not in stop_words]
    return ([porter.stem(word.lower()) for word in filteredwords],
            [lancaster.stem(word.lower()) for word in filteredwords],
            [wnl.lemmatize(word.lower()) for word in filteredwords])
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, deduplicate, lowercase, remove English stop
    words, then stem (Porter, Lancaster) and lemmatize (WordNet).

    Returns (porterstemmedwords, lancasterstemmedwords, lemmatizedwords).
    """
    # Write your code here
    import nltk
    from nltk.corpus import stopwords

    # r'\w+' matches maximal runs of word characters (letters, digits,
    # underscore). The posted r'w*' lost its backslash, so it matched only
    # the letter 'w' — the root cause of the failing test cases. '\w+'
    # never yields empty strings, so no '' filter is needed.
    tokenizedwords = nltk.regexp_tokenize(textcontent, r'\w+', gaps=False)
    # Deduplicate before lowercasing so differently-cased words each count.
    uniquetokenizedwords = set(tokenizedwords)
    tokenizedwords = [words.lower() for words in uniquetokenizedwords]
    stop_words = set(stopwords.words('english'))
    filteredwords = [words for words in tokenizedwords if words not in stop_words]
    # Keep the stemmer objects in their own variables instead of shadowing
    # them with the result lists, as the original did.
    porter = nltk.PorterStemmer()
    porterstemmedwords = [porter.stem(words) for words in filteredwords]
    lancaster = nltk.LancasterStemmer()
    lancasterstemmedwords = [lancaster.stem(words) for words in filteredwords]
    wnl = nltk.WordNetLemmatizer()
    lemmatizedwords = [wnl.lemmatize(word) for word in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
from nltk import PorterStemmer, WordNetLemmatizer, LancasterStemmer
from nltk.corpus import stopwords
import nltk
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, lowercase, remove English stop words, then
    return Porter stems, Lancaster stems, and WordNet lemmas of the unique
    filtered words.
    """
    # Write your code here
    # r'\w+' matches runs of letters, digits, or underscore. The posted
    # pattern used curly typographic quotes (a SyntaxError) and was missing
    # the backslash, so it could never tokenize real words.
    tokenizedword = nltk.tokenize.regexp_tokenize(textcontent, pattern=r'\w+', gaps=False)
    tokenizedwords = [x.lower() for x in tokenizedword]
    stop_words = set(stopwords.words('english'))
    filteredwords = [x for x in set(tokenizedwords) if x not in stop_words]
    ps = PorterStemmer()
    ls = LancasterStemmer()
    wnl = WordNetLemmatizer()
    porterstemmedwords = [ps.stem(x) for x in filteredwords]
    lancasterstemmedwords = [ls.stem(x) for x in filteredwords]
    lemmatizedwords = [wnl.lemmatize(x) for x in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
'''This worked for me'''
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import LancasterStemmer
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, deduplicate, lowercase, drop English stop
    words, then return Porter stems, Lancaster stems, and WordNet lemmas.
    """
    import nltk  # local import: the snippet's own imports omit nltk
    from nltk.corpus import stopwords

    # r'\w+' matches runs of word characters; the posted r'w+' lost its
    # backslash in formatting and matched only runs of the letter 'w'.
    tokenizewords = nltk.regexp_tokenize(textcontent, r'\w+')
    tokenizewords = [w.lower() for w in set(tokenizewords)]
    stopper = set(stopwords.words("english"))  # set gives O(1) membership tests
    filteredwords = [w for w in tokenizewords if w not in stopper]
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    porterstemmedwords = [porter.stem(w) for w in filteredwords]
    lancasterstemmedwords = [lancaster.stem(w) for w in filteredwords]
    wnl = nltk.WordNetLemmatizer()
    lemmatizedwords = [wnl.lemmatize(word) for word in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
Define a function called performStemAndLemma
, which takes a parameter. The first parameter, textcontent
, is a string. The function definition code stub is given in the editor. Perform the following specified tasks:
1.Tokenize all the words given in textcontent
. The word should contain alphabets or numbers or underscore. Store the tokenized list of words in tokenizedwords
. (Hint: Use regexp_tokenize)
-
Convert all the words into lowercase. Store the result into the variable
tokenizedwords
. -
Remove all the stop words from the unique set of
tokenizedwords
. Store the result into the variablefilteredwords
. (Hint: Use stopwords corpora) -
Stem each word present in
filteredwords
with PorterStemmer, and store the result in the listporterstemmedwords
. -
Stem each word present in
filteredwords
with LancasterStemmer, and store the result in the listlancasterstemmedwords
. -
Lemmatize each word present in
filteredwords
with WordNetLemmatizer, and store the result in the listlemmatizedwords
.
Return porterstemmedwords
, lancasterstemmedwords
, lemmatizedwords
variables from the function.
My code:
from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, lowercase, remove English stop words, and
    return Porter stems, Lancaster stems, and WordNet lemmas.

    Returns a 3-tuple of lists:
    (porterstemmedwords, lancasterstemmedwords, lemmatizedwords).
    """
    # Write your code here
    import nltk  # local import: the surrounding snippet never imports nltk itself
    from nltk.corpus import stopwords

    # Step 1: r'\w+' matches runs of letters, digits, or underscore.
    # The posted pattern 'w*' lost its backslash and therefore matched only
    # runs of the literal letter 'w' — the root cause of the failing tests.
    tokenizedword = nltk.tokenize.regexp_tokenize(textcontent, pattern=r'\w+', gaps=False)
    # Step 2: lowercase every token ('\w+' never yields empty strings,
    # so no '' filter is needed).
    tokenizedwords = [token.lower() for token in tokenizedword]
    # Step 3: remove English stop words from the unique token set.
    stop_words = set(stopwords.words('english'))
    filteredwords = [word for word in set(tokenizedwords) if word not in stop_words]
    # Steps 4, 5, 6: stem (Porter, Lancaster) and lemmatize (WordNet).
    ps = nltk.stem.PorterStemmer()
    ls = nltk.stem.LancasterStemmer()
    wnl = nltk.stem.WordNetLemmatizer()
    porterstemmedwords = [ps.stem(word) for word in filteredwords]
    lancasterstemmedwords = [ls.stem(word) for word in filteredwords]
    lemmatizedwords = [wnl.lemmatize(word) for word in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
Still, the program is not working correctly and is failing 2 of the test cases. Highlight the mistake in the above code and provide an alternate solution.
def performStemAndLemma(textcontent):
from nltk.corpus import stopwords
Just import stopwords
after defining the function as above. The rest of the code remains the same.
Actually, the expected output considers uppercase and lowercase words as separate tokens. Hence, you should collect all unique words before converting them to lowercase. The following code should work.
from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, deduplicate case-sensitively, lowercase,
    remove English stop words, then return Porter stems, Lancaster stems,
    and WordNet lemmas of the filtered words.

    Returns (porterstemmedwords, lancasterstemmedwords, lemmatizedwords).
    """
    # Write your code here
    import nltk  # local import: the surrounding snippet never imports nltk itself
    from nltk.corpus import stopwords

    # Step 1: r'\w+' matches runs of letters, digits, or underscore.
    # The posted pattern r'w*' lost its backslash and matched only the
    # letter 'w', which is why the test cases failed.
    tokenizedword = nltk.regexp_tokenize(textcontent, pattern=r'\w+', gaps=False)
    # Step 2: deduplicate BEFORE lowercasing so differently-cased spellings
    # of the same word each contribute a token, as the grader expects.
    unique_tokenizedwords = set(tokenizedword)
    tokenizedwords = [x.lower() for x in unique_tokenizedwords]
    # Step 3: remove English stop words.
    stop_words = set(stopwords.words('english'))
    filteredwords = [x for x in tokenizedwords if x not in stop_words]
    # Steps 4, 5, 6: stem (Porter, Lancaster) and lemmatize (WordNet).
    ps = nltk.stem.PorterStemmer()
    ls = nltk.stem.LancasterStemmer()
    wnl = nltk.stem.WordNetLemmatizer()
    porterstemmedwords = [ps.stem(x) for x in filteredwords]
    lancasterstemmedwords = [ls.stem(x) for x in filteredwords]
    lemmatizedwords = [wnl.lemmatize(x) for x in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
The approach below cleared all the test cases for me
import re
from nltk.corpus import stopwords
def performStemAndLemma(textcontent):
    """Return (Porter stems, Lancaster stems, WordNet lemmas) of the unique,
    non-stop-word tokens of *textcontent*.

    Tokens are deduplicated case-sensitively, stop words are filtered
    case-insensitively, and all output words are lowercased.
    """
    # Write your code here
    import nltk  # local import: the surrounding snippet never imports nltk itself
    from nltk.corpus import stopwords

    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    wnl = nltk.WordNetLemmatizer()
    # r'\w+' matches runs of letters, digits, or underscore; the posted
    # r'w+' lost its backslash in formatting and matched only 'w' runs.
    tokens = nltk.regexp_tokenize(textcontent, r'\w+')
    stop_words = set(stopwords.words('english'))
    # Build the unique, stop-word-free set ONCE and reuse it for all three
    # outputs (the original rebuilt set(...) three times).
    filteredwords = [word for word in set(tokens) if word.lower() not in stop_words]
    return ([porter.stem(word.lower()) for word in filteredwords],
            [lancaster.stem(word.lower()) for word in filteredwords],
            [wnl.lemmatize(word.lower()) for word in filteredwords])
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, deduplicate, lowercase, remove English stop
    words, then stem (Porter, Lancaster) and lemmatize (WordNet).

    Returns (porterstemmedwords, lancasterstemmedwords, lemmatizedwords).
    """
    # Write your code here
    import nltk
    from nltk.corpus import stopwords

    # r'\w+' matches maximal runs of word characters (letters, digits,
    # underscore). The posted r'w*' lost its backslash, so it matched only
    # the letter 'w' — the root cause of the failing test cases. '\w+'
    # never yields empty strings, so no '' filter is needed.
    tokenizedwords = nltk.regexp_tokenize(textcontent, r'\w+', gaps=False)
    # Deduplicate before lowercasing so differently-cased words each count.
    uniquetokenizedwords = set(tokenizedwords)
    tokenizedwords = [words.lower() for words in uniquetokenizedwords]
    stop_words = set(stopwords.words('english'))
    filteredwords = [words for words in tokenizedwords if words not in stop_words]
    # Keep the stemmer objects in their own variables instead of shadowing
    # them with the result lists, as the original did.
    porter = nltk.PorterStemmer()
    porterstemmedwords = [porter.stem(words) for words in filteredwords]
    lancaster = nltk.LancasterStemmer()
    lancasterstemmedwords = [lancaster.stem(words) for words in filteredwords]
    wnl = nltk.WordNetLemmatizer()
    lemmatizedwords = [wnl.lemmatize(word) for word in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
from nltk import PorterStemmer, WordNetLemmatizer, LancasterStemmer
from nltk.corpus import stopwords
import nltk
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, lowercase, remove English stop words, then
    return Porter stems, Lancaster stems, and WordNet lemmas of the unique
    filtered words.
    """
    # Write your code here
    # r'\w+' matches runs of letters, digits, or underscore. The posted
    # pattern used curly typographic quotes (a SyntaxError) and was missing
    # the backslash, so it could never tokenize real words.
    tokenizedword = nltk.tokenize.regexp_tokenize(textcontent, pattern=r'\w+', gaps=False)
    tokenizedwords = [x.lower() for x in tokenizedword]
    stop_words = set(stopwords.words('english'))
    filteredwords = [x for x in set(tokenizedwords) if x not in stop_words]
    ps = PorterStemmer()
    ls = LancasterStemmer()
    wnl = WordNetLemmatizer()
    porterstemmedwords = [ps.stem(x) for x in filteredwords]
    lancasterstemmedwords = [ls.stem(x) for x in filteredwords]
    lemmatizedwords = [wnl.lemmatize(x) for x in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords
'''This worked for me'''
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import LancasterStemmer
def performStemAndLemma(textcontent):
    """Tokenize *textcontent*, deduplicate, lowercase, drop English stop
    words, then return Porter stems, Lancaster stems, and WordNet lemmas.
    """
    import nltk  # local import: the snippet's own imports omit nltk
    from nltk.corpus import stopwords

    # r'\w+' matches runs of word characters; the posted r'w+' lost its
    # backslash in formatting and matched only runs of the letter 'w'.
    tokenizewords = nltk.regexp_tokenize(textcontent, r'\w+')
    tokenizewords = [w.lower() for w in set(tokenizewords)]
    stopper = set(stopwords.words("english"))  # set gives O(1) membership tests
    filteredwords = [w for w in tokenizewords if w not in stopper]
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    porterstemmedwords = [porter.stem(w) for w in filteredwords]
    lancasterstemmedwords = [lancaster.stem(w) for w in filteredwords]
    wnl = nltk.WordNetLemmatizer()
    lemmatizedwords = [wnl.lemmatize(word) for word in filteredwords]
    return porterstemmedwords, lancasterstemmedwords, lemmatizedwords