Count letter frequency in word list, excluding duplicates in the same word
Question:
I’m trying to find the most frequent letter in a list of words. I’m struggling with the algorithm because I need to count the letter frequency in a word only once skipping duplicates, so I need help finding a way to count the frequency of the letters in the entire list with only one occurrence per word, ignoring the second occurrence.
For example, if I have:
words = ["tree", "bone", "indigo", "developer"]
The frequency will be:
letters={a:0, b:1, c:0, d:2, e:3, f:0, g:1, h:0, i:1, j:0, k:0, l:1, m:0, n:2, o:3, p:1, q:0, r:2, s:0, t:1, u:0, v:1, w:0, x:0, y:0, z:0}
As you can see from the letters dictionary: ‘e’ is 3 and not 5 because if ‘e’ repeats more than once in the same word it should be ignored.
This is the algorithm that I came up with, it’s implemented in Python:
# Count each alphabetic letter at most once per word.
# Fixes to the original attempt:
#   * `&&` and `||` are not Python operators (they are syntax errors);
#     Python uses `and` / `or`.
#   * A single `count` flag cannot track per-letter duplicates; a per-word
#     set of already-counted letters can.
# Assumes `words` (list of strings) and `letters` (dict mapping 'a'..'z'
# to ints) are defined earlier, as in the question.
for word in words:
    seen = set()  # letters already tallied for this word
    for letter in word:
        if letter.isalpha():
            key = letter.lower()
            if key not in seen:
                letters[key] += 1
                seen.add(key)
But it still needs work and I can’t think of anything else; I’d be grateful to anyone who can help me come up with a working solution.
Answers:
Create a counter object and then update it with sets for each word:
from collections import Counter

wordlist = ["tree", "bone", "indigo", "developer"]

# A Counter accumulates totals across words; feeding update() a set of the
# word's lowercased letters counts each distinct letter once per word.
c = Counter()
for word in wordlist:
    unique_letters = set(word.lower())
    c.update(unique_letters)
print(c)
Output:
Counter({'e': 3, 'o': 3, 'r': 2, 'n': 2, 'd': 2, 't': 1, 'b': 1, 'i': 1, 'g': 1, 'v': 1, 'p': 1, 'l': 1})
Note that although letters that weren’t present in wordlist
aren’t present in the Counter
, this is fine because a Counter
behaves like a defaultdict(int)
, so accessing a value not present automatically returns a default value of 0.
A variation on @Primusa answer without using update:
from collections import Counter

words = ["tree", "bone", "indigo", "developer"]

# Single-pass variant: each word is reduced to its unique alphabetic
# letters, and one Counter consumes the whole stream.
counts = Counter(
    letter
    for word in words
    for letter in {ch for ch in word.lower() if ch.isalpha()}
)
Output
Counter({'e': 3, 'o': 3, 'r': 2, 'd': 2, 'n': 2, 'p': 1, 'i': 1, 'b': 1, 'v': 1, 'g': 1, 'l': 1, 't': 1})
Basically convert each word to a set and then iterate over each set.
One without Counter
words = ["tree", "bone", "indigo", "developer"]

# Plain-dict version: set() drops repeated letters within a word, then one
# counter entry is bumped per unique letter (no Counter needed).
d = {}
for word in words:
    for letter in set(word):
        if letter in d:
            d[letter] += 1
        else:
            d[letter] = 1
Output
{'b': 1,
'd': 2,
'e': 3,
'g': 1,
'i': 1,
'l': 1,
'n': 2,
'o': 3,
'p': 1,
'r': 2,
't': 1,
'v': 1}
Comparing speed of the solutions presented so far:
def f1(words):
    """Counter.update() with one deduplicated set per word."""
    tally = Counter()
    for w in words:
        tally.update(set(w.lower()))
    return tally


def f2(words):
    """One Counter fed by a generator of each word's unique letters."""
    return Counter(ch for w in words for ch in set(w.lower()))


def f3(words):
    """Plain dict accumulator: one increment per unique letter per word."""
    tally = {}
    for w in words:
        for ch in set(w.lower()):
            tally[ch] = tally.get(ch, 0) + 1
    return tally
My timing function (using different sizes for the list of words):
word_list = [
    'tree', 'bone', 'indigo', 'developer', 'python',
    'language', 'timeit', 'xerox', 'printer', 'offset',
]

# Benchmark f1/f2/f3 on word lists of increasing size (10 .. 100,000 words).
# Each function runs 100 times per size via timeit; results are printed as
# one line per list size.
for exp in range(5):
    words = word_list * 10 ** exp
    timings = []
    for idx in range(1, 4):
        elapsed = timeit.timeit(
            'f(words)',
            'from __main__ import words, f{} as f'.format(idx),
            number=100)
        timings.append((idx, elapsed))
    summary = ' | '.join(
        'f{} {:8.4f} sec'.format(idx, elapsed) for idx, elapsed in timings)
    print('{:10,d} words | {}'.format(len(words), summary))
The results:
10 words | f1 0.0028 sec | f2 0.0012 sec | f3 0.0011 sec
100 words | f1 0.0245 sec | f2 0.0082 sec | f3 0.0113 sec
1,000 words | f1 0.2450 sec | f2 0.0812 sec | f3 0.1134 sec
10,000 words | f1 2.4601 sec | f2 0.8113 sec | f3 1.1335 sec
100,000 words | f1 24.4195 sec | f2 8.1828 sec | f3 11.2167 sec
The Counter
with list comprehension (here as f2()
) seems to be the fastest. Using counter.update()
seems to be a slow point (here as f1()
).
The other solutions are good, but they specifically don’t include the letters with zero frequency. Here’s an approach which does, but is approximately 2-3 times slower than the others.
import string

# For every lowercase letter, count how many words contain it at least once.
# Unlike the Counter-based solutions, this keeps zero-frequency letters in
# the result.  Assumes `words` is defined earlier.
counts = {
    letter: sum(1 for w in words if letter in w.lower())
    for letter in string.ascii_lowercase
}
which produces a dict like this:
{'a': 4, 'b': 2, 'c': 2, 'd': 4, 'e': 7, 'f': 2, 'g': 2, 'h': 3, 'i': 7, 'j': 0, 'k': 0, 'l': 4, 'm': 5, 'n': 4, 'o': 4, 'p': 1, 'q': 0, 'r': 5, 's': 3, 't': 3, 'u': 2, 'v': 0, 'w': 3, 'x': 0, 'y': 2, 'z': 1}
Here’s my update of Ralf’s timings:
10 words | f1 0.0004 sec | f2 0.0004 sec | f3 0.0003 sec | f4 0.0010 sec
100 words | f1 0.0019 sec | f2 0.0014 sec | f3 0.0013 sec | f4 0.0034 sec
1,000 words | f1 0.0180 sec | f2 0.0118 sec | f3 0.0140 sec | f4 0.0298 sec
10,000 words | f1 0.1960 sec | f2 0.1278 sec | f3 0.1542 sec | f4 0.2648 sec
100,000 words | f1 2.0859 sec | f2 1.3971 sec | f3 1.6815 sec | f4 3.5196 sec
based on the following code and the word list from https://github.com/dwyl/english-words/
import string
import timeit
import random
from collections import Counter
def f1(words):
    """Counter.update() with one deduplicated set per word."""
    tally = Counter()
    for w in words:
        tally.update(set(w.lower()))
    return tally


def f2(words):
    """One Counter fed by a generator of each word's unique letters."""
    return Counter(ch for w in words for ch in set(w.lower()))


def f3(words):
    """Plain dict accumulator: one increment per unique letter per word."""
    tally = {}
    for w in words:
        for ch in set(w.lower()):
            tally[ch] = tally.get(ch, 0) + 1
    return tally


def f4(words):
    """Per-letter scan over the whole list; includes zero-count letters."""
    return {ch: len([w for w in words if ch in w.lower()])
            for ch in string.ascii_lowercase}
# Load the benchmark word list (one word per whitespace-separated token).
# BUG FIX: random.sample() rejects sets on Python 3.11+ (TypeError), so
# deduplicate with set() but store a list.
with open('words.txt') as word_file:
    valid_words = list(set(word_file.read().split()))

for exp in range(5):
    sample_size = 10 ** exp
    result_list = []
    for i in range(1, 5):
        t = timeit.timeit(
            'f(words)',
            'from __main__ import f{} as f, valid_words, exp; import random; words = random.sample(valid_words, 10**exp)'.format(i),
            number=100)
        result_list.append((i, t))
    # BUG FIX: the original printed len(words), but `words` only exists
    # inside the timeit setup string (NameError here); report the sample
    # size computed in this scope instead.
    print('{:10,d} words | {}'.format(
        sample_size,
        ' | '.join(
            'f{} {:8.4f} sec'.format(i, t) for i, t in result_list)))

# Show f4's zero-inclusive output for a few sample sizes.
print(f4(random.sample(valid_words, 10000)))
print(f4(random.sample(valid_words, 1000)))
print(f4(random.sample(valid_words, 100)))
print(f4(random.sample(valid_words, 10)))
Try using a dictionary comprehension:
import string

# BUG FIX: the original used max(i.count(k) for i in words), which returns
# the highest number of occurrences of the letter inside any single word —
# e.g. 'r' would be 1 even though it appears in both "tree" and "developer"
# (expected 2), and 'e' would be 3 only by coincidence ("developer" has
# three e's).  Count each word at most once instead.
# Assumes `words` is defined earlier.
print({k: sum(1 for i in words if k in i) for k in string.ascii_lowercase})
A bit too late to the party, but here you go:
# For every letter that occurs anywhere in the list, count how many words
# contain it (booleans sum as 0/1).  Assumes `words` is defined earlier.
freq = {letter: sum(letter in word for word in words)
        for letter in set(''.join(words))}
which returns:
{'i': 1, 'v': 1, 'p': 1, 'b': 1, 'e': 3, 'g': 1, 't': 1, 'n': 2, 'd': 2, 'o': 3, 'l': 1, 'r': 2}
from collections import Counter  # kept from the original snippet; no longer needed below
import string

words = ["tree", "bone", "indigo", "developer"]

# Start every lowercase letter at zero.  (The original built a Counter of
# the alphabet and then manually reset each value to 0 — dict.fromkeys does
# the same in one step.)  A for loop over the words replaces the manual
# while/index bookkeeping, and the redundant str(let) casts are dropped.
new_dict = dict.fromkeys(string.ascii_lowercase, 0)
for word in words:
    for let in set(word):  # set(): count repeated letters once per word
        if let in new_dict:
            new_dict[let] += 1
print(new_dict)
import collections
import itertools
import string


def main():
    """Print a sorted letter -> count table, counting each letter once per word."""
    words = ["tree", "bone", "indigo", "developer"]

    # Deduplicate within each word, then chain the unique-letter sets into
    # one stream for the Counter.
    unique_letter_sets = (set(word) for word in words)
    counter = collections.Counter(
        itertools.chain.from_iterable(unique_letter_sets))

    # Add explicit zeros for unseen letters, to match the poster's answer.
    for letter in string.ascii_lowercase:
        counter.setdefault(letter, 0)

    # Print result.
    for key in sorted(counter):
        print(key, counter[key])


if __name__ == '__main__':
    main()
I’m trying to find the most frequent letter in a list of words. I’m struggling with the algorithm because I need to count the letter frequency in a word only once skipping duplicates, so I need help finding a way to count the frequency of the letters in the entire list with only one occurrence per word, ignoring the second occurrence.
For example, if I have:
words = ["tree", "bone", "indigo", "developer"]
The frequency will be:
letters={a:0, b:1, c:0, d:2, e:3, f:0, g:1, h:0, i:1, j:0, k:0, l:1, m:0, n:2, o:3, p:1, q:0, r:2, s:0, t:1, u:0, v:1, w:0, x:0, y:0, z:0}
As you can see from the letters dictionary: ‘e’ is 3 and not 5 because if ‘e’ repeats more than once in the same word it should be ignored.
This is the algorithm that I came up with, it’s implemented in Python:
# Count each alphabetic letter at most once per word.
# Fixes to the original attempt:
#   * `&&` and `||` are not Python operators (they are syntax errors);
#     Python uses `and` / `or`.
#   * A single `count` flag cannot track per-letter duplicates; a per-word
#     set of already-counted letters can.
# Assumes `words` (list of strings) and `letters` (dict mapping 'a'..'z'
# to ints) are defined earlier, as in the question.
for word in words:
    seen = set()  # letters already tallied for this word
    for letter in word:
        if letter.isalpha():
            key = letter.lower()
            if key not in seen:
                letters[key] += 1
                seen.add(key)
But it still needs work and I can’t think of anything else; I’d be grateful to anyone who can help me come up with a working solution.
Create a counter object and then update it with sets for each word:
from collections import Counter

wordlist = ["tree", "bone", "indigo", "developer"]

# A Counter accumulates totals across words; feeding update() a set of the
# word's lowercased letters counts each distinct letter once per word.
c = Counter()
for word in wordlist:
    unique_letters = set(word.lower())
    c.update(unique_letters)
print(c)
Output:
Counter({'e': 3, 'o': 3, 'r': 2, 'n': 2, 'd': 2, 't': 1, 'b': 1, 'i': 1, 'g': 1, 'v': 1, 'p': 1, 'l': 1})
Note that although letters that weren’t present in wordlist
aren’t present in the Counter
, this is fine because a Counter
behaves like a defaultdict(int)
, so accessing a value not present automatically returns a default value of 0.
A variation on @Primusa answer without using update:
from collections import Counter

words = ["tree", "bone", "indigo", "developer"]

# Single-pass variant: each word is reduced to its unique alphabetic
# letters, and one Counter consumes the whole stream.
counts = Counter(
    letter
    for word in words
    for letter in {ch for ch in word.lower() if ch.isalpha()}
)
Output
Counter({'e': 3, 'o': 3, 'r': 2, 'd': 2, 'n': 2, 'p': 1, 'i': 1, 'b': 1, 'v': 1, 'g': 1, 'l': 1, 't': 1})
Basically convert each word to a set and then iterate over each set.
One without Counter
words = ["tree", "bone", "indigo", "developer"]

# Plain-dict version: set() drops repeated letters within a word, then one
# counter entry is bumped per unique letter (no Counter needed).
d = {}
for word in words:
    for letter in set(word):
        if letter in d:
            d[letter] += 1
        else:
            d[letter] = 1
Output
{'b': 1,
'd': 2,
'e': 3,
'g': 1,
'i': 1,
'l': 1,
'n': 2,
'o': 3,
'p': 1,
'r': 2,
't': 1,
'v': 1}
Comparing speed of the solutions presented so far:
def f1(words):
    """Counter.update() with one deduplicated set per word."""
    tally = Counter()
    for w in words:
        tally.update(set(w.lower()))
    return tally


def f2(words):
    """One Counter fed by a generator of each word's unique letters."""
    return Counter(ch for w in words for ch in set(w.lower()))


def f3(words):
    """Plain dict accumulator: one increment per unique letter per word."""
    tally = {}
    for w in words:
        for ch in set(w.lower()):
            tally[ch] = tally.get(ch, 0) + 1
    return tally
My timing function (using different sizes for the list of words):
word_list = [
    'tree', 'bone', 'indigo', 'developer', 'python',
    'language', 'timeit', 'xerox', 'printer', 'offset',
]

# Benchmark f1/f2/f3 on word lists of increasing size (10 .. 100,000 words).
# Each function runs 100 times per size via timeit; results are printed as
# one line per list size.
for exp in range(5):
    words = word_list * 10 ** exp
    timings = []
    for idx in range(1, 4):
        elapsed = timeit.timeit(
            'f(words)',
            'from __main__ import words, f{} as f'.format(idx),
            number=100)
        timings.append((idx, elapsed))
    summary = ' | '.join(
        'f{} {:8.4f} sec'.format(idx, elapsed) for idx, elapsed in timings)
    print('{:10,d} words | {}'.format(len(words), summary))
The results:
10 words | f1 0.0028 sec | f2 0.0012 sec | f3 0.0011 sec
100 words | f1 0.0245 sec | f2 0.0082 sec | f3 0.0113 sec
1,000 words | f1 0.2450 sec | f2 0.0812 sec | f3 0.1134 sec
10,000 words | f1 2.4601 sec | f2 0.8113 sec | f3 1.1335 sec
100,000 words | f1 24.4195 sec | f2 8.1828 sec | f3 11.2167 sec
The Counter
with list comprehension (here as f2()
) seems to be the fastest. Using counter.update()
seems to be a slow point (here as f1()
).
The other solutions are good, but they specifically don’t include the letters with zero frequency. Here’s an approach which does, but is approximately 2-3 times slower than the others.
import string

# For every lowercase letter, count how many words contain it at least once.
# Unlike the Counter-based solutions, this keeps zero-frequency letters in
# the result.  Assumes `words` is defined earlier.
counts = {
    letter: sum(1 for w in words if letter in w.lower())
    for letter in string.ascii_lowercase
}
which produces a dict like this:
{'a': 4, 'b': 2, 'c': 2, 'd': 4, 'e': 7, 'f': 2, 'g': 2, 'h': 3, 'i': 7, 'j': 0, 'k': 0, 'l': 4, 'm': 5, 'n': 4, 'o': 4, 'p': 1, 'q': 0, 'r': 5, 's': 3, 't': 3, 'u': 2, 'v': 0, 'w': 3, 'x': 0, 'y': 2, 'z': 1}
Here’s my update of Ralf’s timings:
10 words | f1 0.0004 sec | f2 0.0004 sec | f3 0.0003 sec | f4 0.0010 sec
100 words | f1 0.0019 sec | f2 0.0014 sec | f3 0.0013 sec | f4 0.0034 sec
1,000 words | f1 0.0180 sec | f2 0.0118 sec | f3 0.0140 sec | f4 0.0298 sec
10,000 words | f1 0.1960 sec | f2 0.1278 sec | f3 0.1542 sec | f4 0.2648 sec
100,000 words | f1 2.0859 sec | f2 1.3971 sec | f3 1.6815 sec | f4 3.5196 sec
based on the following code and the word list from https://github.com/dwyl/english-words/
import string
import timeit
import random
from collections import Counter
def f1(words):
    """Counter.update() with one deduplicated set per word."""
    tally = Counter()
    for w in words:
        tally.update(set(w.lower()))
    return tally


def f2(words):
    """One Counter fed by a generator of each word's unique letters."""
    return Counter(ch for w in words for ch in set(w.lower()))


def f3(words):
    """Plain dict accumulator: one increment per unique letter per word."""
    tally = {}
    for w in words:
        for ch in set(w.lower()):
            tally[ch] = tally.get(ch, 0) + 1
    return tally


def f4(words):
    """Per-letter scan over the whole list; includes zero-count letters."""
    return {ch: len([w for w in words if ch in w.lower()])
            for ch in string.ascii_lowercase}
# Load the benchmark word list (one word per whitespace-separated token).
# BUG FIX: random.sample() rejects sets on Python 3.11+ (TypeError), so
# deduplicate with set() but store a list.
with open('words.txt') as word_file:
    valid_words = list(set(word_file.read().split()))

for exp in range(5):
    sample_size = 10 ** exp
    result_list = []
    for i in range(1, 5):
        t = timeit.timeit(
            'f(words)',
            'from __main__ import f{} as f, valid_words, exp; import random; words = random.sample(valid_words, 10**exp)'.format(i),
            number=100)
        result_list.append((i, t))
    # BUG FIX: the original printed len(words), but `words` only exists
    # inside the timeit setup string (NameError here); report the sample
    # size computed in this scope instead.
    print('{:10,d} words | {}'.format(
        sample_size,
        ' | '.join(
            'f{} {:8.4f} sec'.format(i, t) for i, t in result_list)))

# Show f4's zero-inclusive output for a few sample sizes.
print(f4(random.sample(valid_words, 10000)))
print(f4(random.sample(valid_words, 1000)))
print(f4(random.sample(valid_words, 100)))
print(f4(random.sample(valid_words, 10)))
Try using a dictionary comprehension:
import string

# BUG FIX: the original used max(i.count(k) for i in words), which returns
# the highest number of occurrences of the letter inside any single word —
# e.g. 'r' would be 1 even though it appears in both "tree" and "developer"
# (expected 2), and 'e' would be 3 only by coincidence ("developer" has
# three e's).  Count each word at most once instead.
# Assumes `words` is defined earlier.
print({k: sum(1 for i in words if k in i) for k in string.ascii_lowercase})
A bit too late to the party, but here you go:
# For every letter that occurs anywhere in the list, count how many words
# contain it (booleans sum as 0/1).  Assumes `words` is defined earlier.
freq = {letter: sum(letter in word for word in words)
        for letter in set(''.join(words))}
which returns:
{'i': 1, 'v': 1, 'p': 1, 'b': 1, 'e': 3, 'g': 1, 't': 1, 'n': 2, 'd': 2, 'o': 3, 'l': 1, 'r': 2}
from collections import Counter  # kept from the original snippet; no longer needed below
import string

words = ["tree", "bone", "indigo", "developer"]

# Start every lowercase letter at zero.  (The original built a Counter of
# the alphabet and then manually reset each value to 0 — dict.fromkeys does
# the same in one step.)  A for loop over the words replaces the manual
# while/index bookkeeping, and the redundant str(let) casts are dropped.
new_dict = dict.fromkeys(string.ascii_lowercase, 0)
for word in words:
    for let in set(word):  # set(): count repeated letters once per word
        if let in new_dict:
            new_dict[let] += 1
print(new_dict)
import collections
import itertools
import string


def main():
    """Print a sorted letter -> count table, counting each letter once per word."""
    words = ["tree", "bone", "indigo", "developer"]

    # Deduplicate within each word, then chain the unique-letter sets into
    # one stream for the Counter.
    unique_letter_sets = (set(word) for word in words)
    counter = collections.Counter(
        itertools.chain.from_iterable(unique_letter_sets))

    # Add explicit zeros for unseen letters, to match the poster's answer.
    for letter in string.ascii_lowercase:
        counter.setdefault(letter, 0)

    # Print result.
    for key in sorted(counter):
        print(key, counter[key])


if __name__ == '__main__':
    main()