CSV read columns corresponding to other columns values
Question:
I need to parse a csv
file.
Input: file + name:
Index | writer | year | words
0 | Philip | 1994 | this is first row
1 | Heinz | 2000 | python is wonderful (new line) second line
2 | Thomas | 1993 | i don't like this
3 | Heinz | 1898 | this is another row
. | . | . | .
. | . | . | .
N | Fritz | 2014 | i hate man united
Output: list of all words corresponding to name
l = ['python is wonderful second line', 'this is another row']
What have I tried?
import csv
import sys
class artist:
    """Collect column 3 of every CSV row whose column 1 matches a name.

    ``file`` is the path of the CSV file, ``name`` is the value compared
    against column 1, and ``list`` receives the matching cells.
    """

    def __init__(self, name, file):
        self.file = file
        self.name = name
        self.list = []

    def extractText(self):
        """Append column 3 of each matching row to ``self.list``.

        The first row is skipped. Cell text is appended unchanged, so line
        breaks embedded in a quoted cell are still present in the result.
        """
        # The csv module expects a text-mode file opened with newline=''
        # so that it can handle line endings inside quoted cells itself.
        with open(self.file, newline='') as f:
            reader = csv.reader(f)
            # Materialise every row before filtering.
            temp = list(reader)
        k = len(temp)
        for i in range(1, k):
            s = temp[i]
            if s[1] == self.name:
                self.list.append(str(s[3]))
if __name__ == '__main__':
    # arguments: the first command-line argument is the CSV file path
    # (sys.argv entries are already strings, so no conversion is needed)
    inputFile = sys.argv[1]
    Heinz = artist('Heinz', inputFile)
    Heinz.extractText()
    print(Heinz.list)
Output is:
["python is wonderful\r\nsecond line", 'this is another row']
How do I get rid of \r\n
for cells that contain more than one line of words, and could the loop be improved, as it's extremely slow?
Answers:
You could simply use pandas to get the list:
import pandas

df = pandas.read_csv('test1.csv')
# Column 3 of every row whose 'writer' is Heinz. Filtering first and then
# slicing by position avoids mixing index labels with positional iloc.
cells = df[df['writer'] == "Heinz"].iloc[:, 3]
# Join the physical lines of each cell with a single space; splitlines()
# handles '\r\n' as well as a lone '\n'.
l = [' '.join(cell.splitlines()) for cell in cells]
print(l)
Output:
['python is wonderful second line', 'this is another row']
Getting rid of newlines in s[3]: I’d suggest ' '.join(s[3].splitlines()). See the documentation for "".splitlines; see also "".translate.
Improving the loop:
def extractText(self):
    """Append column 3 of each row whose column 1 equals ``self.name``.

    Rows are filtered while the file is being read, so the file is never
    held in memory as a whole. Line breaks inside a cell are replaced by a
    single space before the text is appended to ``self.list``.
    """
    # Text mode with newline='' is what the csv module expects.
    with open(self.file, newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for s in reader:
            if s[1] == self.name:
                self.list.append(' '.join(s[3].splitlines()))
This saves one pass over the data.
But please consider @Tiny.D’s advice and give pandas a try.
This should at least be faster since you are parsing as you are reading the file, and then stripping out the unwanted carriage return and new line characters if they are there.
# newline='' keeps '\r\n' inside quoted cells intact, so the replace below
# can actually find it (default text mode would translate it to '\n').
with open(self.file, newline='') as csv_fh:
    for n in csv.reader(csv_fh):
        if n[1] == self.name:
            # Replace Windows line breaks first, then any lone newline.
            self.list.append(n[3].replace('\r\n', ' ').replace('\n', ' '))
To collapse multiple whitespace characters you can use a regular expression, and to speed things up a bit, try a list comprehension:
import re
def extractText(self):
    """Set ``self.list`` to column 3 of each row whose column 1 matches.

    Every run of spaces, tabs, carriage returns and newlines inside a cell
    is collapsed to a single space. The first row of the file is skipped.
    """
    RE_WHITESPACE = re.compile(r'[ \t\r\n]+')
    # newline='' is the mode the csv module expects; the old 'rU' mode is
    # no longer accepted by open() on current Python versions.
    with open(self.file, newline='') as f:
        reader = csv.reader(f)
        # skip the first line (the default keeps an empty file from raising)
        next(reader, None)
        # put all of the words into a list if the artist matches
        self.list = [RE_WHITESPACE.sub(' ', s[3])
                     for s in reader if s[1] == self.name]
I need to parse a csv
file.
Input: file + name:
Index | writer | year | words
0 | Philip | 1994 | this is first row
1 | Heinz | 2000 | python is wonderful (new line) second line
2 | Thomas | 1993 | i don't like this
3 | Heinz | 1898 | this is another row
. | . | . | .
. | . | . | .
N | Fritz | 2014 | i hate man united
Output: list of all words corresponding to name
l = ['python is wonderful second line', 'this is another row']
What have I tried?
import csv
import sys
class artist:
    """Collect column 3 of every CSV row whose column 1 matches a name.

    ``file`` is the path of the CSV file, ``name`` is the value compared
    against column 1, and ``list`` receives the matching cells.
    """

    def __init__(self, name, file):
        self.file = file
        self.name = name
        self.list = []

    def extractText(self):
        """Append column 3 of each matching row to ``self.list``.

        The first row is skipped. Cell text is appended unchanged, so line
        breaks embedded in a quoted cell are still present in the result.
        """
        # The csv module expects a text-mode file opened with newline=''
        # so that it can handle line endings inside quoted cells itself.
        with open(self.file, newline='') as f:
            reader = csv.reader(f)
            # Materialise every row before filtering.
            temp = list(reader)
        k = len(temp)
        for i in range(1, k):
            s = temp[i]
            if s[1] == self.name:
                self.list.append(str(s[3]))
if __name__ == '__main__':
    # arguments: the first command-line argument is the CSV file path
    # (sys.argv entries are already strings, so no conversion is needed)
    inputFile = sys.argv[1]
    Heinz = artist('Heinz', inputFile)
    Heinz.extractText()
    print(Heinz.list)
Output is:
["python is wonderful\r\nsecond line", 'this is another row']
How do I get rid of \r\n
for cells that contain more than one line of words, and could the loop be improved, as it's extremely slow?
You could simply use pandas to get the list:
import pandas

df = pandas.read_csv('test1.csv')
# Column 3 of every row whose 'writer' is Heinz. Filtering first and then
# slicing by position avoids mixing index labels with positional iloc.
cells = df[df['writer'] == "Heinz"].iloc[:, 3]
# Join the physical lines of each cell with a single space; splitlines()
# handles '\r\n' as well as a lone '\n'.
l = [' '.join(cell.splitlines()) for cell in cells]
print(l)
Output:
['python is wonderful second line', 'this is another row']
Getting rid of newlines in s[3]: I’d suggest ' '.join(s[3].splitlines()). See the documentation for "".splitlines; see also "".translate.
Improving the loop:
def extractText(self):
    """Append column 3 of each row whose column 1 equals ``self.name``.

    Rows are filtered while the file is being read, so the file is never
    held in memory as a whole. Line breaks inside a cell are replaced by a
    single space before the text is appended to ``self.list``.
    """
    # Text mode with newline='' is what the csv module expects.
    with open(self.file, newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for s in reader:
            if s[1] == self.name:
                self.list.append(' '.join(s[3].splitlines()))
This saves one pass over the data.
But please consider @Tiny.D’s advice and give pandas a try.
This should at least be faster since you are parsing as you are reading the file, and then stripping out the unwanted carriage return and new line characters if they are there.
# newline='' keeps '\r\n' inside quoted cells intact, so the replace below
# can actually find it (default text mode would translate it to '\n').
with open(self.file, newline='') as csv_fh:
    for n in csv.reader(csv_fh):
        if n[1] == self.name:
            # Replace Windows line breaks first, then any lone newline.
            self.list.append(n[3].replace('\r\n', ' ').replace('\n', ' '))
To collapse multiple whitespace characters you can use a regular expression, and to speed things up a bit, try a list comprehension:
import re
def extractText(self):
    """Set ``self.list`` to column 3 of each row whose column 1 matches.

    Every run of spaces, tabs, carriage returns and newlines inside a cell
    is collapsed to a single space. The first row of the file is skipped.
    """
    RE_WHITESPACE = re.compile(r'[ \t\r\n]+')
    # newline='' is the mode the csv module expects; the old 'rU' mode is
    # no longer accepted by open() on current Python versions.
    with open(self.file, newline='') as f:
        reader = csv.reader(f)
        # skip the first line (the default keeps an empty file from raising)
        next(reader, None)
        # put all of the words into a list if the artist matches
        self.list = [RE_WHITESPACE.sub(' ', s[3])
                     for s in reader if s[1] == self.name]