Most efficient way to search the last X lines of a file?
Question:
I have a file and I don’t know how big it’s going to be (it could be quite large, but the size will vary greatly). I want to search the last 10 lines or so to see if any of them match a string. I need to do this as quickly and efficiently as possible and was wondering if there’s anything better than:
s = "foo"
last_bit = fileObj.readlines()[-10:]
for line in last_bit:
if line == s:
print "FOUND"
Answers:
read the last few Ks of the file, and split that into lines to return only the last 10.
it’s quite unlikely the start of that chunk to fall on a line boundary, but you’ll discard the first lines anyway.
You could read chunks of 1,000 bytes or so from the end of the file into a buffer until you have 10 lines.
I think reading the last 2 KB or so of the file should make sure you get 10 lines, and shouldn’t be too much of a resource hog.
file_handle = open("somefile")
file_size = file_handle.tell()
file_handle.seek(max(file_size - 2*1024, 0))
# this will get rid of trailing newlines, unlike readlines()
last_10 = file_handle.read().splitlines()[-10:]
assert len(last_10) == 10, "Only read %d lines" % len(last_10)
Personally I’d be tempted to break out to the shell and call tail -n10 to load the file. But then I’m not really a Python programmer 😉
I think I remember adapting the code from this blog post from Manu Garg when I had to do something similar.
# Tail
from __future__ import with_statement
find_str = "FIREFOX" # String to find
fname = "g:/autoIt/ActiveWin.log_2" # File to check
with open(fname, "r") as f:
f.seek (0, 2) # Seek @ EOF
fsize = f.tell() # Get Size
f.seek (max (fsize-1024, 0), 0) # Set pos @ last n chars
lines = f.readlines() # Read to end
lines = lines[-10:] # Get last 10 lines
# This returns True if any line is exactly find_str + "n"
print find_str + "n" in lines
# If you're searching for a substring
for line in lines:
if find_str in line:
print True
break
First, a function that returns a list:
def lastNLines(file, N=10, chunksize=1024):
lines = None
file.seek(0,2) # go to eof
size = file.tell()
for pos in xrange(chunksize,size-1,chunksize):
# read a chunk
file.seek(pos,2)
chunk = file.read(chunksize)
if lines is None:
# first time
lines = chunk.splitlines()
else:
# other times, update the 'first' line with
# the new data, and re-split
lines[0:1] = (chunk + lines[0]).splitlines()
if len(lines) > N:
return lines[-N:]
file.seek(0)
chunk = file.read(size-pos)
lines[0:1] = (chunk + lines[0]).splitlines()
return lines[-N:]
Second, a function that iterates over the lines in reverse order:
def iter_lines_reversed(file, chunksize=1024):
file.seek(0,2)
size = file.tell()
last_line = ""
for pos in xrange(chunksize,size-1,chunksize):
# read a chunk
file.seek(pos,2)
chunk = file.read(chunksize) + last_line
# split into lines
lines = chunk.splitlines()
last_line = lines[0]
# iterate in reverse order
for index,line in enumerate(reversed(lines)):
if index > 0:
yield line
# handle the remaining data at the beginning of the file
file.seek(0)
chunk = file.read(size-pos) + last_line
lines = chunk.splitlines()
for line in reversed(lines):
yield line
For your example:
s = "foo"
for index, line in enumerate(iter_lines_reversed(fileObj)):
if line == s:
print "FOUND"
break
elif index+1 >= 10:
break
Edit: Now gets the file-size automaticly
Edit2: Now only iterates for 10 lines.
If you are running Python on a POSIX system, you can use ‘tail -10’ to retrieve the last few lines. This may be faster than writing your own Python code to get the last 10 lines. Rather than opening the file directly, open a pipe from the command ‘tail -10 filename’. If you are certain of the log output though (for example, you know that there are never any very long lines that are hundreds or thousands of characters long) then using one of the ‘read the last 2KB’ approaches listed would be fine.
If you’re on a unix box, os.popen("tail -10 " + filepath).readlines()
will probably be the fastest way. Otherwise, it depends on how robust you want it to be. The methods proposed so far will all fall down, one way or another. For robustness and speed in the most common case you probably want something like a logarithmic search: use file.seek to go to end of the file minus 1000 characters, read it in, check how many lines it contains, then to EOF minus 3000 characters, read in 2000 characters, count the lines, then EOF minus 7000, read in 4000 characters, count the lines, etc. until you have as many lines as you need. But if you know for sure that it’s always going to be run on files with sensible line lengths, you may not need that.
You might also find some inspiration in the source code for the unix tail
command.
Here’s an answer like MizardX’s, but without its apparent problem of taking quadratic time in the worst case from rescanning the working string repeatedly for newlines as chunks are added.
Compared to the Active State solution (which also seems to be quadratic), this doesn’t blow up given an empty file and does one seek per block read instead of two.
Compared to spawning ‘tail’, this is self-contained. (But ‘tail’ is best if you have it.)
Compared to grabbing a few kB off the end and hoping it’s enough, this works for any line length.
import os
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
for block in reversed_blocks(file):
for c in reversed(block):
if c == 'n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def reversed_blocks(file, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
To use it as requested:
from itertools import islice
def check_last_10_lines(file, key):
for line in islice(reversed_lines(file), 10):
if line.rstrip('n') == key:
print 'FOUND'
break
Edit: changed map() to itertools.imap() in head(). Edit 2: simplified reversed_blocks(). Edit 3: avoid rescanning tail for newlines. Edit 4: rewrote reversed_lines() because str.splitlines() ignores a final ‘n’, as BrianB noticed (thanks).
Note that in very old Python versions the string concatenation in a loop here will take quadratic time. CPython from at least the last few years avoids this problem automatically.
I ran into that problem, parsing the last hour of LARGE syslog files, and used this function from activestate’s recipe site… (http://code.activestate.com/recipes/439045/)
!/usr/bin/env python
# -*-mode: python; coding: iso-8859-1 -*-
#
# Copyright (c) Peter Astrand <[email protected]>
import os
import string
class BackwardsReader:
"""Read a file line by line, backwards"""
BLKSIZE = 4096
def readline(self):
while 1:
newline_pos = string.rfind(self.buf, "n")
pos = self.file.tell()
if newline_pos != -1:
# Found a newline
line = self.buf[newline_pos+1:]
self.buf = self.buf[:newline_pos]
if pos != 0 or newline_pos != 0 or self.trailing_newline:
line += "n"
return line
else:
if pos == 0:
# Start-of-file
return ""
else:
# Need to fill buffer
toread = min(self.BLKSIZE, pos)
self.file.seek(-toread, 1)
self.buf = self.file.read(toread) + self.buf
self.file.seek(-toread, 1)
if pos - toread == 0:
self.buf = "n" + self.buf
def __init__(self, file):
self.file = file
self.buf = ""
self.file.seek(-1, 2)
self.trailing_newline = 0
lastchar = self.file.read(1)
if lastchar == "n":
self.trailing_newline = 1
self.file.seek(-1, 2)
# Example usage
br = BackwardsReader(open('bar'))
while 1:
line = br.readline()
if not line:
break
print repr(line)
It works really well and is much more efficient then anything like fileObj.readlines()[-10:], which makes python read the entire file into memory and then chops the last ten lines off of it.
Here is a version using mmap
that seems pretty efficient. The big plus is that mmap
will automatically handle the file to memory paging requirements for you.
import os
from mmap import mmap
def lastn(filename, n):
# open the file and mmap it
f = open(filename, 'r+')
m = mmap(f.fileno(), os.path.getsize(f.name))
nlcount = 0
i = m.size() - 1
if m[i] == 'n': n += 1
while nlcount < n and i > 0:
if m[i] == 'n': nlcount += 1
i -= 1
if i > 0: i += 2
return m[i:].splitlines()
target = "target string"
print [l for l in lastn('somefile', 10) if l == target]
This solution will read the file only once, but using 2 file object pointers to be able obtain the last N lines of file without re-reading it:
def getLastLines (path, n):
# return the las N lines from the file indicated in path
fp = open(path)
for i in range(n):
line = fp.readline()
if line == '':
return []
back = open(path)
for each in fp:
back.readline()
result = []
for line in back:
result.append(line[:-1])
return result
s = "foo"
last_bit = getLastLines(r'C:Documents and Settingsricardo.m.reyesMy Documentsdesarrollotail.py', 10)
for line in last_bit:
if line == s:
print "FOUND"
You could also count the lines as you reverse through the file, instead of guessing at a byte offset.
lines = 0
chunk_size = 1024
f = file('filename')
f.seek(0, 2)
f.seek(f.tell() - chunk_size)
while True:
s = f.read(chunk_size)
lines += s.count('n')
if lines > NUM_OF_LINES:
break
f.seek(f.tell() - chunk_size*2)
Now the file is at a good position to run readlines()
. You also could cache the strings you read the first time, to eliminate reading the same portion of the file twice.
Maybe this might be useful:
import os.path
path = 'path_to_file'
os.system('tail -n1 ' + path)
I took mhawke’s suggestion to use mmap
and wrote a version that uses rfind
:
from mmap import mmap
import sys
def reverse_file(f):
mm = mmap(f.fileno(), 0)
nl = mm.size() - 1
prev_nl = mm.size()
while nl > -1:
nl = mm.rfind('n', 0, nl)
yield mm[nl + 1:prev_nl]
prev_nl = nl + 1
def main():
# Example usage
with open('test.txt', 'r+') as infile:
for line in reverse_file(infile):
sys.stdout.write(line)
Thanks to the solution by 18 Darius Bacon but with a 30% faster implementation and wrapping into io.BaseIO class.
class ReverseFile(io.IOBase):
def __init__ (self, filename, headers=1):
self.fp = open(filename)
self.headers = headers
self.reverse = self.reversed_lines()
self.end_position = -1
self.current_position = -1
def readline(self, size=-1):
if self.headers > 0:
self.headers -= 1
raw = self.fp.readline(size)
self.end_position = self.fp.tell()
return raw
raw = next(self.reverse)
if self.current_position > self.end_position:
return raw
raise StopIteration
def reversed_lines(self):
"""Generate the lines of file in reverse order.
"""
part = ''
for block in self.reversed_blocks():
block = block + part
block = block.split('n')
block.reverse()
part = block.pop()
if block[0] == '':
block.pop(0)
for line in block:
yield line + 'n'
if part:
yield part
def reversed_blocks(self, blocksize=0xFFFF):
"Generate blocks of file's contents in reverse order."
file = self.fp
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
self.current_position = file.tell()
yield file.read(delta)
An example
rev = ReverseFile(filename)
for i, line in enumerate(rev):
print("{0}: {1}".format(i, line.strip()))
This will return the last 10 lines as a list then you can search your line easily. (Python 3 compatible)
def read_last_n_lines_new(lines_need=10):
with open('Log.txt', 'rb') as f:
f.seek(0, 2)
data = []
lines_found = 0
while True:
try:
f.seek(-1, 1)
except:
break
finally:
c = f.read(1)
f.seek(-1, 1)
if c == b'n':
lines_found = lines_found+1
if lines_found > lines_need or not c:
break
data.insert(0, c.decode('utf-8'))
lines = []
cur = ""
for l in data:
if(l == 'n'):
lines.append(cur)
cur = ''
else:
cur = cur + l
return lines
I have a file and I don’t know how big it’s going to be (it could be quite large, but the size will vary greatly). I want to search the last 10 lines or so to see if any of them match a string. I need to do this as quickly and efficiently as possible and was wondering if there’s anything better than:
s = "foo"
last_bit = fileObj.readlines()[-10:]
for line in last_bit:
if line == s:
print "FOUND"
read the last few Ks of the file, and split that into lines to return only the last 10.
it’s quite unlikely the start of that chunk to fall on a line boundary, but you’ll discard the first lines anyway.
You could read chunks of 1,000 bytes or so from the end of the file into a buffer until you have 10 lines.
I think reading the last 2 KB or so of the file should make sure you get 10 lines, and shouldn’t be too much of a resource hog.
file_handle = open("somefile")
file_size = file_handle.tell()
file_handle.seek(max(file_size - 2*1024, 0))
# this will get rid of trailing newlines, unlike readlines()
last_10 = file_handle.read().splitlines()[-10:]
assert len(last_10) == 10, "Only read %d lines" % len(last_10)
Personally I’d be tempted to break out to the shell and call tail -n10 to load the file. But then I’m not really a Python programmer 😉
I think I remember adapting the code from this blog post from Manu Garg when I had to do something similar.
# Tail
from __future__ import with_statement
find_str = "FIREFOX" # String to find
fname = "g:/autoIt/ActiveWin.log_2" # File to check
with open(fname, "r") as f:
f.seek (0, 2) # Seek @ EOF
fsize = f.tell() # Get Size
f.seek (max (fsize-1024, 0), 0) # Set pos @ last n chars
lines = f.readlines() # Read to end
lines = lines[-10:] # Get last 10 lines
# This returns True if any line is exactly find_str + "n"
print find_str + "n" in lines
# If you're searching for a substring
for line in lines:
if find_str in line:
print True
break
First, a function that returns a list:
def lastNLines(file, N=10, chunksize=1024):
lines = None
file.seek(0,2) # go to eof
size = file.tell()
for pos in xrange(chunksize,size-1,chunksize):
# read a chunk
file.seek(pos,2)
chunk = file.read(chunksize)
if lines is None:
# first time
lines = chunk.splitlines()
else:
# other times, update the 'first' line with
# the new data, and re-split
lines[0:1] = (chunk + lines[0]).splitlines()
if len(lines) > N:
return lines[-N:]
file.seek(0)
chunk = file.read(size-pos)
lines[0:1] = (chunk + lines[0]).splitlines()
return lines[-N:]
Second, a function that iterates over the lines in reverse order:
def iter_lines_reversed(file, chunksize=1024):
file.seek(0,2)
size = file.tell()
last_line = ""
for pos in xrange(chunksize,size-1,chunksize):
# read a chunk
file.seek(pos,2)
chunk = file.read(chunksize) + last_line
# split into lines
lines = chunk.splitlines()
last_line = lines[0]
# iterate in reverse order
for index,line in enumerate(reversed(lines)):
if index > 0:
yield line
# handle the remaining data at the beginning of the file
file.seek(0)
chunk = file.read(size-pos) + last_line
lines = chunk.splitlines()
for line in reversed(lines):
yield line
For your example:
s = "foo"
for index, line in enumerate(iter_lines_reversed(fileObj)):
if line == s:
print "FOUND"
break
elif index+1 >= 10:
break
Edit: Now gets the file-size automaticly
Edit2: Now only iterates for 10 lines.
If you are running Python on a POSIX system, you can use ‘tail -10’ to retrieve the last few lines. This may be faster than writing your own Python code to get the last 10 lines. Rather than opening the file directly, open a pipe from the command ‘tail -10 filename’. If you are certain of the log output though (for example, you know that there are never any very long lines that are hundreds or thousands of characters long) then using one of the ‘read the last 2KB’ approaches listed would be fine.
If you’re on a unix box, os.popen("tail -10 " + filepath).readlines()
will probably be the fastest way. Otherwise, it depends on how robust you want it to be. The methods proposed so far will all fall down, one way or another. For robustness and speed in the most common case you probably want something like a logarithmic search: use file.seek to go to end of the file minus 1000 characters, read it in, check how many lines it contains, then to EOF minus 3000 characters, read in 2000 characters, count the lines, then EOF minus 7000, read in 4000 characters, count the lines, etc. until you have as many lines as you need. But if you know for sure that it’s always going to be run on files with sensible line lengths, you may not need that.
You might also find some inspiration in the source code for the unix tail
command.
Here’s an answer like MizardX’s, but without its apparent problem of taking quadratic time in the worst case from rescanning the working string repeatedly for newlines as chunks are added.
Compared to the Active State solution (which also seems to be quadratic), this doesn’t blow up given an empty file and does one seek per block read instead of two.
Compared to spawning ‘tail’, this is self-contained. (But ‘tail’ is best if you have it.)
Compared to grabbing a few kB off the end and hoping it’s enough, this works for any line length.
import os
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
for block in reversed_blocks(file):
for c in reversed(block):
if c == 'n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def reversed_blocks(file, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
To use it as requested:
from itertools import islice
def check_last_10_lines(file, key):
for line in islice(reversed_lines(file), 10):
if line.rstrip('n') == key:
print 'FOUND'
break
Edit: changed map() to itertools.imap() in head(). Edit 2: simplified reversed_blocks(). Edit 3: avoid rescanning tail for newlines. Edit 4: rewrote reversed_lines() because str.splitlines() ignores a final ‘n’, as BrianB noticed (thanks).
Note that in very old Python versions the string concatenation in a loop here will take quadratic time. CPython from at least the last few years avoids this problem automatically.
I ran into that problem, parsing the last hour of LARGE syslog files, and used this function from activestate’s recipe site… (http://code.activestate.com/recipes/439045/)
!/usr/bin/env python
# -*-mode: python; coding: iso-8859-1 -*-
#
# Copyright (c) Peter Astrand <[email protected]>
import os
import string
class BackwardsReader:
"""Read a file line by line, backwards"""
BLKSIZE = 4096
def readline(self):
while 1:
newline_pos = string.rfind(self.buf, "n")
pos = self.file.tell()
if newline_pos != -1:
# Found a newline
line = self.buf[newline_pos+1:]
self.buf = self.buf[:newline_pos]
if pos != 0 or newline_pos != 0 or self.trailing_newline:
line += "n"
return line
else:
if pos == 0:
# Start-of-file
return ""
else:
# Need to fill buffer
toread = min(self.BLKSIZE, pos)
self.file.seek(-toread, 1)
self.buf = self.file.read(toread) + self.buf
self.file.seek(-toread, 1)
if pos - toread == 0:
self.buf = "n" + self.buf
def __init__(self, file):
self.file = file
self.buf = ""
self.file.seek(-1, 2)
self.trailing_newline = 0
lastchar = self.file.read(1)
if lastchar == "n":
self.trailing_newline = 1
self.file.seek(-1, 2)
# Example usage
br = BackwardsReader(open('bar'))
while 1:
line = br.readline()
if not line:
break
print repr(line)
It works really well and is much more efficient then anything like fileObj.readlines()[-10:], which makes python read the entire file into memory and then chops the last ten lines off of it.
Here is a version using mmap
that seems pretty efficient. The big plus is that mmap
will automatically handle the file to memory paging requirements for you.
import os
from mmap import mmap
def lastn(filename, n):
# open the file and mmap it
f = open(filename, 'r+')
m = mmap(f.fileno(), os.path.getsize(f.name))
nlcount = 0
i = m.size() - 1
if m[i] == 'n': n += 1
while nlcount < n and i > 0:
if m[i] == 'n': nlcount += 1
i -= 1
if i > 0: i += 2
return m[i:].splitlines()
target = "target string"
print [l for l in lastn('somefile', 10) if l == target]
This solution will read the file only once, but using 2 file object pointers to be able obtain the last N lines of file without re-reading it:
def getLastLines (path, n):
# return the las N lines from the file indicated in path
fp = open(path)
for i in range(n):
line = fp.readline()
if line == '':
return []
back = open(path)
for each in fp:
back.readline()
result = []
for line in back:
result.append(line[:-1])
return result
s = "foo"
last_bit = getLastLines(r'C:Documents and Settingsricardo.m.reyesMy Documentsdesarrollotail.py', 10)
for line in last_bit:
if line == s:
print "FOUND"
You could also count the lines as you reverse through the file, instead of guessing at a byte offset.
lines = 0
chunk_size = 1024
f = file('filename')
f.seek(0, 2)
f.seek(f.tell() - chunk_size)
while True:
s = f.read(chunk_size)
lines += s.count('n')
if lines > NUM_OF_LINES:
break
f.seek(f.tell() - chunk_size*2)
Now the file is at a good position to run readlines()
. You also could cache the strings you read the first time, to eliminate reading the same portion of the file twice.
Maybe this might be useful:
import os.path
path = 'path_to_file'
os.system('tail -n1 ' + path)
I took mhawke’s suggestion to use mmap
and wrote a version that uses rfind
:
from mmap import mmap
import sys
def reverse_file(f):
mm = mmap(f.fileno(), 0)
nl = mm.size() - 1
prev_nl = mm.size()
while nl > -1:
nl = mm.rfind('n', 0, nl)
yield mm[nl + 1:prev_nl]
prev_nl = nl + 1
def main():
# Example usage
with open('test.txt', 'r+') as infile:
for line in reverse_file(infile):
sys.stdout.write(line)
Thanks to the solution by 18 Darius Bacon but with a 30% faster implementation and wrapping into io.BaseIO class.
class ReverseFile(io.IOBase):
def __init__ (self, filename, headers=1):
self.fp = open(filename)
self.headers = headers
self.reverse = self.reversed_lines()
self.end_position = -1
self.current_position = -1
def readline(self, size=-1):
if self.headers > 0:
self.headers -= 1
raw = self.fp.readline(size)
self.end_position = self.fp.tell()
return raw
raw = next(self.reverse)
if self.current_position > self.end_position:
return raw
raise StopIteration
def reversed_lines(self):
"""Generate the lines of file in reverse order.
"""
part = ''
for block in self.reversed_blocks():
block = block + part
block = block.split('n')
block.reverse()
part = block.pop()
if block[0] == '':
block.pop(0)
for line in block:
yield line + 'n'
if part:
yield part
def reversed_blocks(self, blocksize=0xFFFF):
"Generate blocks of file's contents in reverse order."
file = self.fp
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
self.current_position = file.tell()
yield file.read(delta)
An example
rev = ReverseFile(filename)
for i, line in enumerate(rev):
print("{0}: {1}".format(i, line.strip()))
This will return the last 10 lines as a list then you can search your line easily. (Python 3 compatible)
def read_last_n_lines_new(lines_need=10):
with open('Log.txt', 'rb') as f:
f.seek(0, 2)
data = []
lines_found = 0
while True:
try:
f.seek(-1, 1)
except:
break
finally:
c = f.read(1)
f.seek(-1, 1)
if c == b'n':
lines_found = lines_found+1
if lines_found > lines_need or not c:
break
data.insert(0, c.decode('utf-8'))
lines = []
cur = ""
for l in data:
if(l == 'n'):
lines.append(cur)
cur = ''
else:
cur = cur + l
return lines