elegant find sub-list in list
Question:
Given a list containing a known pattern surrounded by noise, is there an elegant way to get all items that equal the pattern. See below for my crude code.
list_with_noise = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
known_pattern = [1,2,3,4]
res = []
for i in list_with_noise:
for j in known_pattern:
if i == j:
res.append(i)
continue
print res
we would get 2, 1, 2, 3, 4, 2, 1, 2, 3, 4, 1, 2, 3, 4, 4, 3
bonus: avoid appending i if the full pattern is not present (ie., allow 1,2,3,4 but not 1,2,3)
examples:
find_sublists_in_list([7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5],[1,2,3,4])
[1,2,3,4],[1,2,3,4],[1,2,3,4]
find_sublists_in_list([7,2,1,2,3,2,1,2,3,6,9,9,1,2,3,4,7,4,3,1,2,6],[1,2,3,4])
[1,2,3],[1,2,3],[1,2,3]
The lists contain named tuples.
Answers:
This will get the “bonus” part of your question:
pattern = [1, 2, 3, 4]
search_list = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
cursor = 0
found = []
for i in search_list:
if i == pattern[cursor]:
cursor += 1
if cursor == len(pattern):
found.append(pattern)
cursor = 0
else:
cursor = 0
For non-bonus:
pattern = [1, 2, 3, 4]
search_list = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
cursor = 0
found = []
for i in search_list:
if i != pattern[cursor]:
if cursor > 0:
found.append(pattern[:cursor])
cursor = 0
else:
cursor += 1
Finally, this one handles overlaps:
def find_matches(pattern_list, search_list):
cursor_list = []
found = []
for element in search_list:
cursors_to_kill = []
for cursor_index in range(len(cursor_list)):
if element == pattern_list[cursor_list[cursor_index]]:
cursor_list[cursor_index] += 1
if cursor_list[cursor_index] == len(pattern_list):
found.append(pattern_list)
cursors_to_kill.append(cursor_index)
else:
cursors_to_kill.append(cursor_index)
cursors_to_kill.reverse()
for cursor_index in cursors_to_kill:
cursor_list.pop(cursor_index)
if element == pattern_list[0]:
cursor_list.append(1)
return found
list_with_noise = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
string_withNoise = "".join(str(i) for i in list_with_noise)
known_pattern = [1,2,3,4]
string_pattern = "".join(str(i) for i in known_pattern)
string_withNoise.count(string_pattern)
Given:
a_list = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
pat = [1,2,3,4]
Here is an inefficient one liner:
res = [pat for i in range(len(a_list)) if a_list[i:i+len(pat)] == pat]
Here is a more efficient itertools version:
from itertools import izip_longest, islice
res = []
i = 0
while True:
try:
i = a_list.index(pat[0], i)
except ValueError:
break
if all(a==b for (a,b) in izip_longest(pat, islice(a_list, i, i+len(pat)))):
res.append(pat)
i += len(pat)
i += 1
I know this question is 5 months old and already “accepted”, but googling a very similar problem brought me to this question and all the answers seem to have a couple of rather significant problems, plus I’m bored and want to try my hand at a SO answer, so I’m just going to rattle off what I’ve found.
The first part of the question, as I understand it, is pretty trivial: just return the original list with all the elements not in the “pattern” filtered out. Following that thinking, the first code I thought of used the filter() function:
def subfinder(mylist, pattern):
return list(filter(lambda x: x in pattern, mylist))
I would say that this solution is definitely more succinct than the original solution, but it’s not any faster, or at least not appreciably, and I try to avoid lambda expressions if there’s not a very good reason for using them. In fact, the best solution I could come up with involved a simple list comprehension:
def subfinder(mylist, pattern):
pattern = set(pattern)
return [x for x in mylist if x in pattern]
This solution is both more elegant and significantly faster than the original: the comprehension is about 120% faster than the original, while casting the pattern into a set first bumps that up to a whopping 320% faster in my tests.
Now for the bonus: I’ll just jump right into it, my solution is as follows:
def subfinder(mylist, pattern):
matches = []
for i in range(len(mylist)):
if mylist[i] == pattern[0] and mylist[i:i+len(pattern)] == pattern:
matches.append(pattern)
return matches
This is a variation of Steven Rumbalski’s “inefficient one liner”, that, with the addition of the “mylist[i] == pattern[0]” check and thanks to python’s short-circuit evaluation, is significantly faster than both the original statement and the itertools version (and every other offered solution as far as I can tell) and it even supports overlapping patterns. So there you go.
An idiomatic, composable solution to the problem.
First off, we need to borrow an itertools
recipe, consume
(which consumes and discards a given number of elements from an iterator. Then we take the itertools
recipe for pairwise
, and extend it to an nwise
function using consume
:
import itertools
def nwise(iterable, size=2):
its = itertools.tee(iterable, size)
for i, it in enumerate(its):
consume(it, i) # Discards i elements from it
return zip(*its)
Now that we have that, solving the bonus problem is really easy:
def find_sublists_in_list(biglist, searchlist):
searchtup = tuple(searchlist)
return [list(subtup) for subtup in nwise(biglist, len(searchlist)) if subtup == searchtup]
# Or for more obscure but faster one-liner:
return map(list, filter(tuple(searchlist).__eq__, nwise(biglist, len(searchlist))))
Similarly, a more succinct and speedier (if somewhat less pretty) solution to the main problem replaces:
def subfinder(mylist, pattern):
pattern = set(pattern)
return [x for x in mylist if x in pattern]
with:
def subfinder(mylist, pattern):
# Wrap filter call in list() if on Python 3 and you need a list, not a generator
return filter(set(pattern).__contains__, mylist)
This behaves the same way, but avoids needing to store the temporary set
to a name, and pushes all the filtering work to C.
def sublist_in_list(sub, lis):
return str(sub).strip('[]') in str(lis).strip('[]')
An iterator-based approach that is still based on the naive algorithm, but tries to do as much implicit looping as possible making use of .index()
:
def find_pivot(seq, subseq):
n = len(seq)
m = len(subseq)
stop = n - m + 1
if n > 0:
item = subseq[0]
i = 0
try:
while i < stop:
i = seq.index(item, i)
if seq[i:i + m] == subseq:
yield i
i += 1
except ValueError:
return
compared to a couple of others approaches with various degrees of explicit looping:
def find_loop(seq, subseq):
n = len(seq)
m = len(subseq)
for i in range(n - m + 1):
if all(seq[i + j] == subseq[j] for j in (range(m))):
yield i
def find_slice(seq, subseq):
n = len(seq)
m = len(subseq)
for i in range(n - m + 1):
if seq[i:i + m] == subseq:
yield i
def find_mix(seq, subseq):
n = len(seq)
m = len(subseq)
for i in range(n - m + 1):
if seq[i] == subseq[0] and seq[i:i + m] == subseq:
yield i
one would get:
a = list(range(10))
print(a)
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
b = list(range(5, 10))
print(b)
# [5, 6, 7, 8, 9]
funcs = find_pivot, find_loop, find_slice, find_mix,
for func in funcs:
print()
print(func.__name__)
print(list(func(a * 10, b)))
aa = a * 100
%timeit list(func(aa, b))
random.shuffle(aa)
%timeit list(func(aa, b))
# find_pivot
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 10000 loops, best of 3: 49.6 µs per loop
# 10000 loops, best of 3: 50.1 µs per loop
# find_loop
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 1000 loops, best of 3: 712 µs per loop
# 1000 loops, best of 3: 680 µs per loop
# find_slice
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 10000 loops, best of 3: 162 µs per loop
# 10000 loops, best of 3: 162 µs per loop
# find_mix
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 10000 loops, best of 3: 82.2 µs per loop
# 10000 loops, best of 3: 83.9 µs per loop
Note that this is ~30% faster than the currently accepted answer with the tested input.
Given a list containing a known pattern surrounded by noise, is there an elegant way to get all items that equal the pattern. See below for my crude code.
list_with_noise = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
known_pattern = [1,2,3,4]
res = []
for i in list_with_noise:
for j in known_pattern:
if i == j:
res.append(i)
continue
print res
we would get 2, 1, 2, 3, 4, 2, 1, 2, 3, 4, 1, 2, 3, 4, 4, 3
bonus: avoid appending i if the full pattern is not present (ie., allow 1,2,3,4 but not 1,2,3)
examples:
find_sublists_in_list([7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5],[1,2,3,4])
[1,2,3,4],[1,2,3,4],[1,2,3,4]
find_sublists_in_list([7,2,1,2,3,2,1,2,3,6,9,9,1,2,3,4,7,4,3,1,2,6],[1,2,3,4])
[1,2,3],[1,2,3],[1,2,3]
The lists contain named tuples.
This will get the “bonus” part of your question:
pattern = [1, 2, 3, 4]
search_list = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
cursor = 0
found = []
for i in search_list:
if i == pattern[cursor]:
cursor += 1
if cursor == len(pattern):
found.append(pattern)
cursor = 0
else:
cursor = 0
For non-bonus:
pattern = [1, 2, 3, 4]
search_list = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
cursor = 0
found = []
for i in search_list:
if i != pattern[cursor]:
if cursor > 0:
found.append(pattern[:cursor])
cursor = 0
else:
cursor += 1
Finally, this one handles overlaps:
def find_matches(pattern_list, search_list):
cursor_list = []
found = []
for element in search_list:
cursors_to_kill = []
for cursor_index in range(len(cursor_list)):
if element == pattern_list[cursor_list[cursor_index]]:
cursor_list[cursor_index] += 1
if cursor_list[cursor_index] == len(pattern_list):
found.append(pattern_list)
cursors_to_kill.append(cursor_index)
else:
cursors_to_kill.append(cursor_index)
cursors_to_kill.reverse()
for cursor_index in cursors_to_kill:
cursor_list.pop(cursor_index)
if element == pattern_list[0]:
cursor_list.append(1)
return found
list_with_noise = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
string_withNoise = "".join(str(i) for i in list_with_noise)
known_pattern = [1,2,3,4]
string_pattern = "".join(str(i) for i in known_pattern)
string_withNoise.count(string_pattern)
Given:
a_list = [7,2,1,2,3,4,2,1,2,3,4,9,9,1,2,3,4,7,4,3,1,2,3,5]
pat = [1,2,3,4]
Here is an inefficient one liner:
res = [pat for i in range(len(a_list)) if a_list[i:i+len(pat)] == pat]
Here is a more efficient itertools version:
from itertools import izip_longest, islice
res = []
i = 0
while True:
try:
i = a_list.index(pat[0], i)
except ValueError:
break
if all(a==b for (a,b) in izip_longest(pat, islice(a_list, i, i+len(pat)))):
res.append(pat)
i += len(pat)
i += 1
I know this question is 5 months old and already “accepted”, but googling a very similar problem brought me to this question and all the answers seem to have a couple of rather significant problems, plus I’m bored and want to try my hand at a SO answer, so I’m just going to rattle off what I’ve found.
The first part of the question, as I understand it, is pretty trivial: just return the original list with all the elements not in the “pattern” filtered out. Following that thinking, the first code I thought of used the filter() function:
def subfinder(mylist, pattern):
return list(filter(lambda x: x in pattern, mylist))
I would say that this solution is definitely more succinct than the original solution, but it’s not any faster, or at least not appreciably, and I try to avoid lambda expressions if there’s not a very good reason for using them. In fact, the best solution I could come up with involved a simple list comprehension:
def subfinder(mylist, pattern):
pattern = set(pattern)
return [x for x in mylist if x in pattern]
This solution is both more elegant and significantly faster than the original: the comprehension is about 120% faster than the original, while casting the pattern into a set first bumps that up to a whopping 320% faster in my tests.
Now for the bonus: I’ll just jump right into it, my solution is as follows:
def subfinder(mylist, pattern):
matches = []
for i in range(len(mylist)):
if mylist[i] == pattern[0] and mylist[i:i+len(pattern)] == pattern:
matches.append(pattern)
return matches
This is a variation of Steven Rumbalski’s “inefficient one liner”, that, with the addition of the “mylist[i] == pattern[0]” check and thanks to python’s short-circuit evaluation, is significantly faster than both the original statement and the itertools version (and every other offered solution as far as I can tell) and it even supports overlapping patterns. So there you go.
An idiomatic, composable solution to the problem.
First off, we need to borrow an itertools
recipe, consume
(which consumes and discards a given number of elements from an iterator. Then we take the itertools
recipe for pairwise
, and extend it to an nwise
function using consume
:
import itertools
def nwise(iterable, size=2):
its = itertools.tee(iterable, size)
for i, it in enumerate(its):
consume(it, i) # Discards i elements from it
return zip(*its)
Now that we have that, solving the bonus problem is really easy:
def find_sublists_in_list(biglist, searchlist):
searchtup = tuple(searchlist)
return [list(subtup) for subtup in nwise(biglist, len(searchlist)) if subtup == searchtup]
# Or for more obscure but faster one-liner:
return map(list, filter(tuple(searchlist).__eq__, nwise(biglist, len(searchlist))))
Similarly, a more succinct and speedier (if somewhat less pretty) solution to the main problem replaces:
def subfinder(mylist, pattern):
pattern = set(pattern)
return [x for x in mylist if x in pattern]
with:
def subfinder(mylist, pattern):
# Wrap filter call in list() if on Python 3 and you need a list, not a generator
return filter(set(pattern).__contains__, mylist)
This behaves the same way, but avoids needing to store the temporary set
to a name, and pushes all the filtering work to C.
def sublist_in_list(sub, lis):
return str(sub).strip('[]') in str(lis).strip('[]')
An iterator-based approach that is still based on the naive algorithm, but tries to do as much implicit looping as possible making use of .index()
:
def find_pivot(seq, subseq):
n = len(seq)
m = len(subseq)
stop = n - m + 1
if n > 0:
item = subseq[0]
i = 0
try:
while i < stop:
i = seq.index(item, i)
if seq[i:i + m] == subseq:
yield i
i += 1
except ValueError:
return
compared to a couple of others approaches with various degrees of explicit looping:
def find_loop(seq, subseq):
n = len(seq)
m = len(subseq)
for i in range(n - m + 1):
if all(seq[i + j] == subseq[j] for j in (range(m))):
yield i
def find_slice(seq, subseq):
n = len(seq)
m = len(subseq)
for i in range(n - m + 1):
if seq[i:i + m] == subseq:
yield i
def find_mix(seq, subseq):
n = len(seq)
m = len(subseq)
for i in range(n - m + 1):
if seq[i] == subseq[0] and seq[i:i + m] == subseq:
yield i
one would get:
a = list(range(10))
print(a)
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
b = list(range(5, 10))
print(b)
# [5, 6, 7, 8, 9]
funcs = find_pivot, find_loop, find_slice, find_mix,
for func in funcs:
print()
print(func.__name__)
print(list(func(a * 10, b)))
aa = a * 100
%timeit list(func(aa, b))
random.shuffle(aa)
%timeit list(func(aa, b))
# find_pivot
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 10000 loops, best of 3: 49.6 µs per loop
# 10000 loops, best of 3: 50.1 µs per loop
# find_loop
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 1000 loops, best of 3: 712 µs per loop
# 1000 loops, best of 3: 680 µs per loop
# find_slice
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 10000 loops, best of 3: 162 µs per loop
# 10000 loops, best of 3: 162 µs per loop
# find_mix
# [5, 15, 25, 35, 45, 55, 65, 75, 85, 95]
# 10000 loops, best of 3: 82.2 µs per loop
# 10000 loops, best of 3: 83.9 µs per loop
Note that this is ~30% faster than the currently accepted answer with the tested input.