Text summarization of a .txt file using Python
Question:
Is there any good code to summarize the text of one .txt file and write the summary to an output .txt file in Python? I am working on a project and am open to any library, such as NLTK.
Answers:
This uses the NLTK library and will require some tweaking to your preferences, but it is a start. It reads the text from one text file and writes a summarized version to another text file.
# importing the required libraries
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize


def summarize_text(text, max_sentences=7):
    """Return an extractive summary of *text*.

    Sentences are scored by the summed frequency of their non-stopword
    tokens; the *max_sentences* highest-scoring sentences are returned,
    joined in their original document order.
    """
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    # Drop stopwords so common function words don't dominate the scores.
    stop_words = set(stopwords.words("english"))
    frequency = FreqDist(w for w in words if w not in stop_words)
    # Score each sentence by the total frequency of its content words.
    # (The original appended every sentence containing any non-stopword
    # word and then kept the 7 longest, which simply selects the longest
    # sentences rather than the most representative ones.)
    ranked = sorted(
        enumerate(sentences),
        key=lambda item: sum(frequency[w] for w in word_tokenize(item[1])),
        reverse=True,
    )
    # Restore document order among the selected sentences.
    top = sorted(ranked[:max_sentences])
    # Sentences keep their own terminal punctuation, so join with a
    # space (the original ". ".join produced doubled periods).
    return " ".join(sentence for _, sentence in top)


def main():
    """Prompt for input/output file names and write the summary."""
    filename = input("Enter the file name to be summarized: ")
    # `with` closes the handle even on error; the original leaked the
    # input file object and reused the name for the output file.
    with open(filename, "r", encoding="utf-8") as infile:
        text = infile.read()
    summary = summarize_text(text)
    output_filename = input("Enter the output file name: ")
    # Explicit encoding on output too — the original relied on the
    # platform default, which breaks on non-ASCII text on some systems.
    with open(output_filename, "w", encoding="utf-8") as outfile:
        outfile.write(summary)


if __name__ == "__main__":
    main()
Is there any good code to summarize the text of one .txt file and write the summary to an output .txt file in Python? I am working on a project and am open to any library, such as NLTK.
This uses the NLTK library and will require some tweaking to your preferences, but it is a start. It reads the text from one text file and writes a summarized version to another text file.
# importing the required libraries
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

# read the input file; `with` closes the handle even on error
# (the original never closed the input file)
filename = input("Enter the file name to be summarized: ")
with open(filename, "r", encoding="utf-8") as infile:
    text = infile.read()

# tokenize the text into sentences and words
sentences = sent_tokenize(text)
words = word_tokenize(text)

# remove stopwords so common function words don't dominate the scores
stop_words = set(stopwords.words("english"))
frequency = FreqDist(w for w in words if w not in stop_words)

# Score each sentence by the summed frequency of its content words.
# (The original kept every sentence containing any non-stopword word
# and then took the 7 longest — i.e. it just selected the longest
# sentences, not the most representative ones.)
scored = [
    (sum(frequency[w] for w in word_tokenize(sentence)), index)
    for index, sentence in enumerate(sentences)
]
scored.sort(reverse=True)
# keep the top 7 sentences, restored to document order
top_indices = sorted(index for _, index in scored[:7])

# Sentences keep their own terminal punctuation, so join with a space
# (the original ". ".join produced doubled periods).
summary = " ".join(sentences[i] for i in top_indices)

# write the summary with an explicit encoding; the original relied on
# the platform default and never guaranteed the handle was closed
output_filename = input("Enter the output file name: ")
with open(output_filename, "w", encoding="utf-8") as outfile:
    outfile.write(summary)