pandas: text analysis: Transfer raw data to dataframe
Question:
I need to read lines from a text file and extract the
quoted person name and quoted text from each line.
lines look similar to this:
"Am I ever!", Homer Simpson responded.
Remarks:
Hint: Use the returned object from the ‘open
‘ method to get the file
handler. Each line you read is expected to contain a new-line in the
end of the line. Remove the new-line as following: line_cln =line.strip()
There are the options for each line (assume one of these
three options): The first set of patterns, for which the person name
appears before the quoted text. The second set of patterns, for which
the quoted text appears before the person. Empty lines.
Complete the transfer_raw_text_to_dataframe
function to return a
dataframe with the extracted person name and text as explained
above. The information is expected to be extracted from the lines of
the given 'filename'
file.
The returned dataframe should include two columns:
person_name
– containing the extracted person name for each line.
extracted_text
– containing the extracted quoted text for each line.
The returned values:
- dataframe – The dataframe with the extracted information as described above.
- Important Note: if a line does not contain any quotation pattern, no information should be saved in the
corresponding row in the dataframe.
what I got so far: [edited]
def transfer_raw_text_to_dataframe(filename):
data = open(filename)
quote_pattern ='"(.*)"'
name_pattern = "w+sw+"
df = open(filename, encoding='utf8')
lines = df.readlines()
df.close()
dataframe = pd.DataFrame(columns=('person_name', 'extracted_text'))
i = 0
for line in lines:
quote = re.search(quote_pattern,line)
extracted_quotation = quote.group(1)
name = re.search(name_pattern,line)
extracted_person_name = name.group(0)
df2 = {'person_name': extracted_person_name, 'extracted_text': extracted_quotation}
dataframe = dataframe.append(df2, ignore_index = True)
dataframe.loc[i] = [person_name, extracted_text]
i =i+1
return dataframe
the dataframe is created with the correct shape, problem is, the person name in each row is: ‘Oh man’ and the quote is ‘Oh man, that guy’s tough to love.’ (in all of them)
which is weird because it’s not even in the txt file…
can anyone help me fix this?
Edit: I need to extract from a simple txt file that contains these lines only:
"Am I ever!", Homer Simpson responded.
"Hmmm. So... is it okay if I go to the women's conference with Chloe?", Lisa Simpson answered.
"Really? Uh, sure.", Bart Simpson answered.
"Sounds great.", Bart Simpson replied.
Homer Simpson responded: "Danica Patrick in my thoughts!"
C. Montgomery Burns: "Trust me, he'll say it, or I'll bust him down to Thursday night vespers."
"Gimme that torch." Lisa Simpson said.
"No! No, I've got a lot more mothering left in me!", Marge Simpson said.
"Oh, Homie, I don't care if you're a billionaire. I love you just because you're..." Marge Simpson said.
"Damn you, e-Bay!" Homer Simpson answered.
Answers:
for loop in folder:
# All files acc. mask ending with .txt
print(glob.glob("C:\MyFolder\*.txt"))
mylist=[ff for ff in glob.glob("C:\MyFolder\*.txt")]
print("file_list:n", mylist)
for filepath in mylist:
# do smth with each filepath
to collect all dfs you’re getting from files – smth like this (e.g. reading csv-files by-mask):
import glob
import pandas as pd
def dfs_collect():
mylist=[ff for ff in glob.glob("C:\MyFolder\*.txt")] # all files by-mask
print("file_list:n", mylist)
dfa=pd.concat((pd.read_csv(file, sep=';', encoding='windows-1250', index_col=False) for file in mylist), ignore_index=True)
but to get the content of your files – the example of the content is needed… without the example of your txt file (having dummy_info but left its real structure), I doubt, that anybody will try to imagine how it should look like
possibly in such a way:
import pandas as pd
import re
# do smth
with open("12.txt", "r") as f:
data = f.read()
# print(data)
# ########## findall text in quotes
m = re.findall(r'"(.+)"', data)
print("RESULT: n", m)
df = pd.DataFrame({'rep': m})
print(df)
# ########## retrieve and replace text in quotes for nothing
m = re.sub(r'"(.+)"', r'', data)
# ########## get First Name & Last Name from the rest text in each line
regex = re.compile("([A-Z]{1}[a-z]+ [A-Z]{1}[a-z]+)")
mm = regex.findall(m)
df1 = pd.DataFrame({'author': mm})
print(df1)
# ########## join 2 dataframes
fin = pd.concat([df, df1], axis=1)
print(fin)
all print just for checking (get them away for cleaner code).
Just "C. Montgomery Burns" is loosing his first letter…
I think that following does what you need. Please verify whether the output is accurate. I’ll explain any line that is unclear
import pandas as pd
import numpy as np
import nltk
from nltk.tree import ParentedTree
import typing as t # This is optional
# Using `read_csv` to read in the text because I find it easier
data = pd.read_csv("dialog.txt", header = None, sep = "~", quoting=3)
dialouges = data.squeeze() # Getting a series from the above DF with one column
def tag_sentence(tokenized: t.List[str]) -> t.List[t.Tuple[str, str]]:
tagged = nltk.pos_tag(tokenized)
tagged = [(token, tag) if tag not in {"``", "''"} else (token, "Q") for token, tag in tagged]
keep = {"Q", "NNP"}
renamed = [(token, "TEXT") if tag not in keep else (token, tag) for token, tag in tagged]
return renamed
def get_parse_tree(tagged_sent):
grammar = """
NAME: {<NNP>+}
WORDS: {<TEXT>+}
DIALOUGE: {<Q><WORDS|NAME>+<Q>}
"""
cp = nltk.RegexpParser(grammar)
parse_tree = cp.parse(tagged_sent)
return parse_tree
def extract_info(parse_tree):
ptree = ParentedTree.convert(parse_tree)
trees = list(ptree.subtrees())
root = ptree.root()
for subtree in trees[1:]:
if subtree.parent() == root:
if subtree.label() == "DIALOUGE":
dialouge = ' '.join(word for word, _ in subtree.leaves()[1:-1]) # Skipping quotaton marks
if subtree.label() == "NAME":
person = ' '.join(word for word, _ in subtree.leaves())
return dialouge, person
def process_sentence(sentence):
return extract_info(get_parse_tree(tag_sentence(nltk.word_tokenize(sentence))))
processed = [process_sentence(line) for line in dialouges]
result = pd.DataFrame(processed, columns=["extracted_text", "person_name"])
The resulting DataFrame looks like this:
I need to read lines from a text file and extract the
quoted person name and quoted text from each line.
lines look similar to this:
"Am I ever!", Homer Simpson responded.
Remarks:
Hint: Use the returned object from the ‘
open
‘ method to get the file
handler. Each line you read is expected to contain a new-line in the
end of the line. Remove the new-line as following:line_cln =line.strip()
There are the options for each line (assume one of these
three options): The first set of patterns, for which the person name
appears before the quoted text. The second set of patterns, for which
the quoted text appears before the person. Empty lines.
Complete the
transfer_raw_text_to_dataframe
function to return a
dataframe with the extracted person name and text as explained
above. The information is expected to be extracted from the lines of
the given'filename'
file.
The returned dataframe should include two columns:
person_name
– containing the extracted person name for each line.extracted_text
– containing the extracted quoted text for each line.The returned values:
- dataframe – The dataframe with the extracted information as described above.
- Important Note: if a line does not contain any quotation pattern, no information should be saved in the
corresponding row in the dataframe.
what I got so far: [edited]
def transfer_raw_text_to_dataframe(filename):
data = open(filename)
quote_pattern ='"(.*)"'
name_pattern = "w+sw+"
df = open(filename, encoding='utf8')
lines = df.readlines()
df.close()
dataframe = pd.DataFrame(columns=('person_name', 'extracted_text'))
i = 0
for line in lines:
quote = re.search(quote_pattern,line)
extracted_quotation = quote.group(1)
name = re.search(name_pattern,line)
extracted_person_name = name.group(0)
df2 = {'person_name': extracted_person_name, 'extracted_text': extracted_quotation}
dataframe = dataframe.append(df2, ignore_index = True)
dataframe.loc[i] = [person_name, extracted_text]
i =i+1
return dataframe
the dataframe is created with the correct shape, problem is, the person name in each row is: ‘Oh man’ and the quote is ‘Oh man, that guy’s tough to love.’ (in all of them)
which is weird because it’s not even in the txt file…
can anyone help me fix this?
Edit: I need to extract from a simple txt file that contains these lines only:
"Am I ever!", Homer Simpson responded.
"Hmmm. So... is it okay if I go to the women's conference with Chloe?", Lisa Simpson answered.
"Really? Uh, sure.", Bart Simpson answered.
"Sounds great.", Bart Simpson replied.
Homer Simpson responded: "Danica Patrick in my thoughts!"
C. Montgomery Burns: "Trust me, he'll say it, or I'll bust him down to Thursday night vespers."
"Gimme that torch." Lisa Simpson said.
"No! No, I've got a lot more mothering left in me!", Marge Simpson said.
"Oh, Homie, I don't care if you're a billionaire. I love you just because you're..." Marge Simpson said.
"Damn you, e-Bay!" Homer Simpson answered.
for loop in folder:
# All files acc. mask ending with .txt
print(glob.glob("C:\MyFolder\*.txt"))
mylist=[ff for ff in glob.glob("C:\MyFolder\*.txt")]
print("file_list:n", mylist)
for filepath in mylist:
# do smth with each filepath
to collect all dfs you’re getting from files – smth like this (e.g. reading csv-files by-mask):
import glob
import pandas as pd
def dfs_collect():
mylist=[ff for ff in glob.glob("C:\MyFolder\*.txt")] # all files by-mask
print("file_list:n", mylist)
dfa=pd.concat((pd.read_csv(file, sep=';', encoding='windows-1250', index_col=False) for file in mylist), ignore_index=True)
but to get the content of your files – the example of the content is needed… without the example of your txt file (having dummy_info but left its real structure), I doubt, that anybody will try to imagine how it should look like
possibly in such a way:
import pandas as pd
import re
# do smth
with open("12.txt", "r") as f:
data = f.read()
# print(data)
# ########## findall text in quotes
m = re.findall(r'"(.+)"', data)
print("RESULT: n", m)
df = pd.DataFrame({'rep': m})
print(df)
# ########## retrieve and replace text in quotes for nothing
m = re.sub(r'"(.+)"', r'', data)
# ########## get First Name & Last Name from the rest text in each line
regex = re.compile("([A-Z]{1}[a-z]+ [A-Z]{1}[a-z]+)")
mm = regex.findall(m)
df1 = pd.DataFrame({'author': mm})
print(df1)
# ########## join 2 dataframes
fin = pd.concat([df, df1], axis=1)
print(fin)
all print just for checking (get them away for cleaner code).
Just "C. Montgomery Burns" is loosing his first letter…
I think that following does what you need. Please verify whether the output is accurate. I’ll explain any line that is unclear
import pandas as pd
import numpy as np
import nltk
from nltk.tree import ParentedTree
import typing as t # This is optional
# Using `read_csv` to read in the text because I find it easier
data = pd.read_csv("dialog.txt", header = None, sep = "~", quoting=3)
dialouges = data.squeeze() # Getting a series from the above DF with one column
def tag_sentence(tokenized: t.List[str]) -> t.List[t.Tuple[str, str]]:
tagged = nltk.pos_tag(tokenized)
tagged = [(token, tag) if tag not in {"``", "''"} else (token, "Q") for token, tag in tagged]
keep = {"Q", "NNP"}
renamed = [(token, "TEXT") if tag not in keep else (token, tag) for token, tag in tagged]
return renamed
def get_parse_tree(tagged_sent):
grammar = """
NAME: {<NNP>+}
WORDS: {<TEXT>+}
DIALOUGE: {<Q><WORDS|NAME>+<Q>}
"""
cp = nltk.RegexpParser(grammar)
parse_tree = cp.parse(tagged_sent)
return parse_tree
def extract_info(parse_tree):
ptree = ParentedTree.convert(parse_tree)
trees = list(ptree.subtrees())
root = ptree.root()
for subtree in trees[1:]:
if subtree.parent() == root:
if subtree.label() == "DIALOUGE":
dialouge = ' '.join(word for word, _ in subtree.leaves()[1:-1]) # Skipping quotaton marks
if subtree.label() == "NAME":
person = ' '.join(word for word, _ in subtree.leaves())
return dialouge, person
def process_sentence(sentence):
return extract_info(get_parse_tree(tag_sentence(nltk.word_tokenize(sentence))))
processed = [process_sentence(line) for line in dialouges]
result = pd.DataFrame(processed, columns=["extracted_text", "person_name"])
The resulting DataFrame looks like this: