Split large text file(around 50GB) into multiple files

Question:

I would like to split a large text file around size of 50GB into multiple files.
Data in the file looks like this — [x = any digit between 0 and 9]

xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............

There might be a few billion lines in the file, and I would like to write, for example, 30/40 million lines per file.
I guess the steps would be-

  • I’ve to open the file
  • then using readline() have to read the file line by line and write at the same time to a new file
  • and as soon as it hits the maximum number of lines it will create another file and
    starts writing again.

I’m wondering how to put all these steps together in a memory-efficient and fast way. I’ve seen some examples on Stack Overflow, but none of them does exactly what I need. I would really appreciate it if anyone could help me out.

Asked By: saz

||

Answers:

This working solution uses split command available in shell. Since the author has already accepted a possibility of a non-python solution, please do not downvote.

First, I created a test file with 1000M entries (15 GB) with

awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt

Then I used split:

split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t

It took 5 min to produce a set of 34 small files with names t00 through t33. The first 33 files are 458 MB each and the last one, t33, is 153 MB.

Answered By: Andrey

I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is however a pure Python solution:

import contextlib

file_large = 'large_file.txt'
l = 30*10**6  # lines per split file
with contextlib.ExitStack() as stack:
    fd_in = stack.enter_context(open(file_large))
    for i, line in enumerate(fd_in):
        if not i % l:
            # Every `l` lines, start a new output part named
            # "<input>.<part_number>"; ExitStack closes every file on exit.
            # NOTE(review): all parts stay open until the end — fine for a
            # few dozen parts, but watch the fd limit for very small `l`.
            file_split = '{}.{}'.format(file_large, i//l)
            fd_out = stack.enter_context(open(file_split, 'w'))
        # `line` already ends with its newline; the original appended a
        # literal "n" (a scrape-garbled "\n") and would have doubled the
        # newline even if fixed, so write the line unchanged.
        fd_out.write(line)

If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.

Answered By: tommy.carstensen
from itertools import chain, islice

def chunks(iterable, n):
    """chunks(ABCDE, 2) => AB CD E

    Lazily yield iterators over successive chunks of `n` items each; the
    last chunk may be shorter. Only one item per chunk is held in memory —
    the rest is chained lazily via islice.
    """
    iterable = iter(iterable)
    while True:
        try:
            # Pull the chunk's first item eagerly so exhaustion is detected
            # before yielding an empty chunk.
            first = next(iterable)
        except StopIteration:
            # PEP 479: letting StopIteration escape a generator raises
            # RuntimeError on Python 3.7+, so end iteration explicitly.
            return
        # chain the stored item to an iterator over the rest of the chunk
        yield chain([first], islice(iterable, n-1))

l = 30*10**6  # lines per output part
file_large = 'large_file.txt'
# Stream the big file through chunks() and write each chunk out as
# "<input>.<part_number>", one part file at a time.
with open(file_large) as bigfile:
    for part_no, chunk in enumerate(chunks(bigfile, l)):
        part_name = '{}.{}'.format(file_large, part_no)
        with open(part_name, 'w') as part_fd:
            part_fd.writelines(chunk)
Answered By: log0

This class may solve your problem.
I’ve tested it on the Linux and Windows operating systems, and it worked perfectly on both of them.
Also, I’ve tested binary and text file with different sizes each time and it was great.
Enjoy 🙂

import os
import math

class FileSpliter:
    """Split a file into fixed-size parts and rebuild the original from them.

    If FileType is "t", CHUNK_SIZE counts characters; if "b", it counts bytes.
    Part files are named OutFile + part number ("outFile0", "outFile1", ...).
    Typical use: Prepare() -> Split() -> Rebuild().
    """

    # If file type is text then CHUNK_SIZE is count of chars
    # If file type is binary then CHUNK_SIZE is count of bytes
    def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
        self.CHUNK_SIZE = CHUNK_SIZE    # byte or char, per part
        self.InputFile = InputFile
        self.FileType = FileType        # b: binary,  t: text
        self.OutFile = OutFile          # prefix for part files / rebuilt file name
        self.FileSize = 0               # set by Prepare()
        self.Parts = None               # number of parts; set by Prepare()
        self.CurrentPartNo = 0
        self.Progress = 0.0             # 0.0..1.0, maintained by ProgressBar()

    def Prepare(self):
        """Validate the input file and compute the part count. Returns bool."""
        if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
            print("ERROR: The file is not exists or empty!")
            return False
        self.FileSize = os.path.getsize(self.InputFile)
        # NOTE(review): Parts is derived from the byte size; in text mode
        # with multi-byte encodings read(CHUNK_SIZE) counts characters, so
        # the actual part count may differ — confirm for non-ASCII input.
        if self.CHUNK_SIZE >= self.FileSize:
            self.Parts = 1
        else:
            self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
        return True

    def Split(self):
        """Write the input out as numbered part files. Returns bool."""
        if self.FileSize == 0 or self.Parts == None:
            print("ERROR: File is not prepared for split!")
            return False
        if self.FileType not in ("b", "t"):
            # The original printed this inside the read loop and then hit a
            # NameError on an unassigned buffer; fail fast instead.
            print("ERROR: File type error!")
            return False
        with open(self.InputFile, "r" + self.FileType) as f:
            while True:
                # read() returns bytes in "rb" mode and str in "rt" mode;
                # either writes directly, so no bytearray copy is needed.
                buf = f.read(self.CHUNK_SIZE)
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                of = self.OutFile + str(self.CurrentPartNo)
                # `with` guarantees the part file is closed even on error
                with open(of, "w" + self.FileType) as outFile:
                    outFile.write(buf)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def Rebuild(self):
        """Concatenate the part files back into OutFile, deleting each part.

        Returns False if Prepare() was never run or a part is missing/empty.
        """
        self.CurrentPartNo = 0
        if self.Parts == None:
            return False
        with open(self.OutFile, "w" + self.FileType) as f:
            while self.CurrentPartNo < self.Parts:
                If = self.OutFile + str(self.CurrentPartNo)
                if not(os.path.isfile(If) and os.path.getsize(If) > 0):
                    print("ERROR: The file [" + If + "] is not exists or empty!")
                    return False
                # `with` closes the part before it is removed (the original
                # left it open if the read came back empty)
                with open(If, "r" + self.FileType) as InputFile:
                    buf = InputFile.read()
                if not buf:
                    # we've read the entire file in, so we're done.
                    break
                f.write(buf)
                os.remove(If)
                self.CurrentPartNo += 1
                self.ProgressBar()
        return True

    def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
        """Redraw an in-place text progress bar for CurrentPartNo/Parts."""
        try:
            # You can't have a progress bar with zero or negative length.
            if BarLength < 1:
                BarLength = 20
            # Status moves the cursor to the next line once progress completes.
            Status = ""
            # Progress as a fraction between 0 and 1.
            self.Progress = float(self.CurrentPartNo) / float(self.Parts)
            if self.Progress >= 1.:
                self.Progress = 1
                Status = "\r\n"    # Going to the next line
            # How many bar cells should be filled.
            Block = int(round(BarLength * self.Progress))
            # "\r" returns the cursor to column 0 so the bar redraws in place
            # (the original's scrape dropped the backslashes).
            Bar = "\r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
            print(Bar, end="")
        except Exception:
            # Cosmetic only: a broken bar must never abort a split/rebuild,
            # but don't swallow KeyboardInterrupt/SystemExit (bare except did).
            print("\rERROR")

def main():
    # Split "inFile" into binary parts, then stitch them back together.
    splitter = FileSpliter(InputFile="inFile", FileType="b") #, CHUNK_SIZE=300000)
    if not splitter.Prepare():
        return
    # Spliting ...
    print("Spliting ...")
    if splitter.Split() == True:
        print("The file splited successfully.")
    print()
    # Rebuilding ...
    print("Rebuilding ...")
    if splitter.Rebuild() == True:
        print("The file rebuilded successfully.")

if __name__ == "__main__":
    main()

I am writing a Python3 code solution which I usually use to split files having size in MBs.

However, I have not yet tried for files having size in GBs.

TextFileSplitter.py

import traceback

#get a file name to be read
fileToRead = input("Enter file name : ")

# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1

try:
    print('Start splitting...')
    # `with` closes the reader even on error (the original's finally could
    # also hit an undefined name if open() itself failed)
    with open(fileToRead) as fileReader:
        fileWriter = None
        for line in fileReader:
            if lineCount == 0:
                # start a new numbered output part; "w" instead of the
                # original "a" so a re-run doesn't append duplicates, and
                # without the leaked duplicate open of "1.txt"
                fileWriter = open(str(fileCount)+".txt", "w")
                #increment file count, use it for new file name
                fileCount += 1
            # `line` keeps its trailing newline, so write it unchanged —
            # the original appended a literal "n" (a scrape-garbled "\n")
            fileWriter.write(line)
            lineCount += 1
            if lineCount == fileLineCount:
                lineCount = 0
                fileWriter.close()
                fileWriter = None
        # close the last, partially filled part if one is open
        if fileWriter is not None:
            fileWriter.close()
except Exception as e:
    #print the exception if any
    print(e.__traceback__)
    traceback.print_exc()

The output will be a set of files, each having fileLineCount (i.e. 2000) lines, created in the same directory:

1.txt
2.txt
3.txt
.
.
.
.
n.txt
Answered By: Jyo the Whiff

This script will allow you to split a UTF-8 text file into multiple files with a specified size limit.

If the input file is input.txt, it will create input-1.txt, input-2.txt, and so on in the same directory.

It works by reading each line of an input text file (input.txt), determining how many bytes it is, and checking if writing it to the current output file (input-X.txt) will exceed the maximum file size limit. If writing the line will exceed the limit, the script will create a new file (input-X+1.txt) to write the line to. If not, it will write to the current output file. Note that the input file will not be modified or overwritten.

import os
from pathlib import Path

input_file_path = Path('test.txt')

input_file_name = Path(input_file_path).stem
input_file_extension = Path(input_file_path).suffix

# Max output file size in bytes (here 5 MB; the original comment mislabeled
# this value as "in megabytes").
max_file_size = 5 * 1000000

with open(input_file_path, 'r') as input_file:
    # Read total number of lines in file so the outer loop knows when to stop.
    number_of_lines = sum(1 for line in input_file)
    input_file.seek(0, 0)

    if number_of_lines < 1:
        quit()

    # Keep track of how many lines have been read.
    lines_read = 0

    # Keep track of how many bytes have been written per file.
    total_bytes_written_to_file = 0

    # Each output file is the input file's name plus a dash plus this number.
    output_file_number = 1

    # Store the last read line.
    last_read_line = None

    # True when the last read line overflowed the current part and must be
    # carried over as the first line of the next part.
    write_last_read_line_to_new_file = False

    # Write output files.
    while True:
        output_file_path = os.path.join(
            Path(input_file_path).parent.resolve(),
            input_file_name +
            '-' +
            str(output_file_number) +
            input_file_extension
        )

        with open(output_file_path, 'w') as output_file:
            if write_last_read_line_to_new_file:
                output_file.write(last_read_line)
                write_last_read_line_to_new_file = False
                total_bytes_written_to_file = len(
                    last_read_line.encode('utf8')
                )

            for line in input_file:
                last_read_line = line
                lines_read += 1

                # Measure size of each line as we go along.
                line_size_in_bytes = len(line.encode('utf8'))

                # Check if max file size will be exceeded by writing this
                # line (the original lost the "\" line continuation here).
                if (total_bytes_written_to_file + line_size_in_bytes
                        >= max_file_size):
                    # Create new output file on next while loop iteration.
                    total_bytes_written_to_file = 0
                    output_file_number += 1

                    # Write this line to next file.
                    write_last_read_line_to_new_file = True

                    # Advance to the next while loop iteration.
                    break
                else:
                    output_file.write(line)
                    total_bytes_written_to_file += line_size_in_bytes

        # Stop only when every line has been read AND none is still pending;
        # the original dropped the final line whenever it was the one that
        # tripped the size limit.
        if lines_read == number_of_lines and not write_last_read_line_to_new_file:
            break
Answered By: Tyler
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.