batch renaming 100K files with python
Question:
I have a folder with over 100,000 files, all numbered with the same stub, but without leading zeros, and the numbers aren’t always contiguous (usually they are, but there are gaps) e.g:
file-21.png,
file-22.png,
file-640.png,
file-641.png,
file-642.png,
file-645.png,
file-2130.png,
file-2131.png,
file-3012.png,
etc.
I would like to batch process this to create padded, contiguous files. e.g:
file-000000.png,
file-000001.png,
file-000002.png,
file-000003.png,
When I parse the folder with for filename in os.listdir('.'):
the files don’t come up in the order I’d like to them to. Understandably they come up
file-1,
file-1x,
file-1xx,
file-1xxx,
etc. then
file-2,
file-2x,
file-2xx,
etc. How can I get it to go through in the order of the numeric value? I am a complete python noob, but looking at the docs i’m guessing I could use map to create a new list filtering out only the numerical part, and then sort that list, then iterate that? With over 100K files this could be heavy. Any tips welcome!
Answers:
1) Take the number in the filename.
2) Left-pad it with zeros
3) Save name.
Why don’t you do it in a two step process. Parse all the files and rename with padded numbers and then run another script that takes those files, which are sorted correctly now, and renames them so they’re contiguous?
There are three steps. The first is getting all the filenames. The second is converting the filenames. The third is renaming them.
If all the files are in the same folder, then glob should work.
import glob
filenames = glob.glob("/path/to/folder/*.txt")
Next, you want to change the name of the file. You can print with padding to do this.
>>> filename = "file-338.txt"
>>> import os
>>> fnpart = os.path.splitext(filename)[0]
>>> fnpart
'file-338'
>>> _, num = fnpart.split("-")
>>> num.rjust(5, "0")
'00338'
>>> newname = "file-%s.txt" % num.rjust(5, "0")
>>> newname
'file-00338.txt'
Now, you need to rename them all. os.rename
does just that.
os.rename(filename, newname)
To put it together:
for filename in glob.glob("/path/to/folder/*.txt"): # loop through each file
newname = make_new_filename(filename) # create a function that does step 2, above
os.rename(filename, newname)
import re
thenum = re.compile('^file-(d+).png$')
def bynumber(fn):
mo = thenum.match(fn)
if mo: return int(mo.group(1))
allnames = os.listdir('.')
allnames.sort(key=bynumber)
Now you have the files in the order you want them and can loop
for i, fn in enumerate(allnames):
...
using the progressive number i
(which will be 0, 1, 2, …) padded as you wish in the destination-name.
def renamer():
for iname in os.listdir('.'):
first, second = iname.replace(" ", "").split("-")
number, ext = second.split('.')
first, number, ext = first.strip(), number.strip(), ext.strip()
number = '0'*(6-len(number)) + number # pad the number to be 7 digits long
oname = first + "-" + number + '.' + ext
os.rename(iname, oname)
print "Done"
Hope this helps
Thank you all for your suggestions, I will try them all to learn the different approaches. The solution I went for is based on using a natural sort on my filelist, and then iterating that to rename. This was one of the suggested answers but for some reason it has disappeared now so I cannot mark it as accepted!
import os
files = os.listdir('.')
natsort(files)
index = 0
for filename in files:
os.rename(filename, str(index).zfill(7)+'.png')
index += 1
where natsort is defined in http://code.activestate.com/recipes/285264-natural-string-sorting/
The simplest method is given below. You can also modify for recursive search this script.
- use
os
module.
- get filenames
os.rename
import os
class Renamer:
def __init__(self, pattern, extension):
self.ext = extension
self.pat = pattern
return
def rename(self):
p, e = (self.pat, self.ext)
number = 0
for x in os.listdir(os.getcwd()):
if str(x).endswith(f".{e}") == True:
os.rename(x, f'{p}_{number}.{e}')
number+=1
return
if __name__ == "__main__":
pattern = "myfile"
extension = "txt"
r = Renamer(pattern=pattern, extension=extension)
r.rename()
I have a folder with over 100,000 files, all numbered with the same stub, but without leading zeros, and the numbers aren’t always contiguous (usually they are, but there are gaps) e.g:
file-21.png,
file-22.png,
file-640.png,
file-641.png,
file-642.png,
file-645.png,
file-2130.png,
file-2131.png,
file-3012.png,
etc.
I would like to batch process this to create padded, contiguous files. e.g:
file-000000.png,
file-000001.png,
file-000002.png,
file-000003.png,
When I parse the folder with for filename in os.listdir('.'):
the files don’t come up in the order I’d like to them to. Understandably they come up
file-1,
file-1x,
file-1xx,
file-1xxx,
etc. then
file-2,
file-2x,
file-2xx,
etc. How can I get it to go through in the order of the numeric value? I am a complete python noob, but looking at the docs i’m guessing I could use map to create a new list filtering out only the numerical part, and then sort that list, then iterate that? With over 100K files this could be heavy. Any tips welcome!
1) Take the number in the filename.
2) Left-pad it with zeros
3) Save name.
Why don’t you do it in a two step process. Parse all the files and rename with padded numbers and then run another script that takes those files, which are sorted correctly now, and renames them so they’re contiguous?
There are three steps. The first is getting all the filenames. The second is converting the filenames. The third is renaming them.
If all the files are in the same folder, then glob should work.
import glob
filenames = glob.glob("/path/to/folder/*.txt")
Next, you want to change the name of the file. You can print with padding to do this.
>>> filename = "file-338.txt"
>>> import os
>>> fnpart = os.path.splitext(filename)[0]
>>> fnpart
'file-338'
>>> _, num = fnpart.split("-")
>>> num.rjust(5, "0")
'00338'
>>> newname = "file-%s.txt" % num.rjust(5, "0")
>>> newname
'file-00338.txt'
Now, you need to rename them all. os.rename
does just that.
os.rename(filename, newname)
To put it together:
for filename in glob.glob("/path/to/folder/*.txt"): # loop through each file
newname = make_new_filename(filename) # create a function that does step 2, above
os.rename(filename, newname)
import re
thenum = re.compile('^file-(d+).png$')
def bynumber(fn):
mo = thenum.match(fn)
if mo: return int(mo.group(1))
allnames = os.listdir('.')
allnames.sort(key=bynumber)
Now you have the files in the order you want them and can loop
for i, fn in enumerate(allnames):
...
using the progressive number i
(which will be 0, 1, 2, …) padded as you wish in the destination-name.
def renamer():
for iname in os.listdir('.'):
first, second = iname.replace(" ", "").split("-")
number, ext = second.split('.')
first, number, ext = first.strip(), number.strip(), ext.strip()
number = '0'*(6-len(number)) + number # pad the number to be 7 digits long
oname = first + "-" + number + '.' + ext
os.rename(iname, oname)
print "Done"
Hope this helps
Thank you all for your suggestions, I will try them all to learn the different approaches. The solution I went for is based on using a natural sort on my filelist, and then iterating that to rename. This was one of the suggested answers but for some reason it has disappeared now so I cannot mark it as accepted!
import os
files = os.listdir('.')
natsort(files)
index = 0
for filename in files:
os.rename(filename, str(index).zfill(7)+'.png')
index += 1
where natsort is defined in http://code.activestate.com/recipes/285264-natural-string-sorting/
The simplest method is given below. You can also modify for recursive search this script.
- use
os
module. - get filenames
os.rename
import os
class Renamer:
def __init__(self, pattern, extension):
self.ext = extension
self.pat = pattern
return
def rename(self):
p, e = (self.pat, self.ext)
number = 0
for x in os.listdir(os.getcwd()):
if str(x).endswith(f".{e}") == True:
os.rename(x, f'{p}_{number}.{e}')
number+=1
return
if __name__ == "__main__":
pattern = "myfile"
extension = "txt"
r = Renamer(pattern=pattern, extension=extension)
r.rename()