How to copy only non-duplicate files whilst maintaining folder structure?

Question:

I am trying to find duplicates between two folders and copy only the unique image files to the ‘dest’ folder. I can copy all the non-dupes using the code below; however, it doesn’t maintain the source directory structure. I think os.walk yields 3-tuples (dirpath, dirnames, filenames), but they don’t seem to be linked, so I’m not sure how to reconstruct the sub-directory.

Example:

import shutil, os
from difPy import dif
# Prompt for the folder to back up and the folder to copy into.
source = input('Input source folder:')
dest = input('Input backup  destination folder:')

# Only file names ending with one of these suffixes are considered.
ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')

# Run difPy's duplicate search over both folders.
search = dif(source, dest)
result = search.result
result  # bare expression: only displays the value in an interactive session


# Bookkeeping lists; `removed` is never used in this snippet.
dupes = []
srcfiles = []
filecount = []
failed = []
removed = []

# Collect the 'location' entry of every record in the search result.
for i in result.values(): 
        dupes.append(i['location'])

# Walk the source tree and gather the full path of every matching image.
for dirpath, subdirs, files in os.walk(source):
    for x in files:
        if x.endswith(ext):
            srcfiles.append(os.path.join(dirpath, x))

# Copy every gathered file that is not listed in `dupes`.
for f in srcfiles:
                if f not in dupes:
                        # NOTE: the target is always the top-level `dest` folder,
                        # so the file lands directly in it under its base name.
                        # This is why the source sub-folder structure is lost.
                        shutil.copy(f, dest)
                        print('File copied successfully - '+f)
                        filecount.append(f)
                else:
                        # This branch is reached for duplicates (skipped on
                        # purpose), not for copies that were attempted and failed.
                        print('File not copied successfully !!!! - '+f)
                        failed.append(f)

I have also tried using the shutil.copytree function with an ignore list; however, it requires a new folder, and I can’t get the ignore-list function to work.

shutil.copytree example:

# Intended to build a list of duplicate file names from the search result.
for i in result.values(): 
        # NOTE(review): df is re-created on every pass, so after the loop it
        # holds only the filename of the last record.
        df = []
        df.append(i['filename'])

# NOTE(review): shutil.copytree calls ignore(directory, names), so the second
# parameter receives the entry names of the directory being visited, not the
# module-level df list built above (the parameter shadows it).
def ignorelist(source, df):
        return [f for f in df if os.path.isfile(os.path.join(source, f))]

# `destnew` is not defined in this snippet; presumably a folder that does not
# exist yet, since copytree by default refuses an existing destination.
shutil.copytree(source, destnew, ignore=ignorelist)
Asked By: W4K1NG

||

Answers:

import sysrsync

# Prompt for the two folders, then pass them to sysrsync.run().
source = input('Input source folder:')
dest = input('Input backup  destination folder:')
# NOTE(review): sysrsync is a third-party package; presumably
# sync_source_contents=False makes it copy the source folder itself rather
# than only its contents. Confirm against the sysrsync README linked below.
sysrsync.run(source=source,
             destination=dest,
             sync_source_contents=False)

from: https://github.com/gchamon/sysrsync

Answered By: Marty

This function ignorelist should do the trick:

import shutil, os
from difPy import dif
# Prompt for the folder to back up and the (already existing) backup folder.
source = input('Input source folder:')
dest = input('Input backup  destination folder:')

# Lower-case only: file names are lower-cased before the suffix test below.
ext = ('.jpg', '.jpeg', '.gif')

search = dif(source, dest)

# A set gives constant-time membership tests; ignorelist() performs one per file.
dupes = {value['location'] for value in search.result.values()}


def ignorelist(source, files):
    """Return the entries of one directory that copytree must not copy.

    shutil.copytree calls this once per visited directory.

    Args:
        source: the directory currently being copied.
        files: the names of the entries inside that directory.

    Returns:
        The names of regular files that are either listed in ``dupes`` or do
        not have one of the image suffixes in ``ext``. Sub-directories are
        never ignored, so the whole tree is still traversed.
    """
    ignored = []
    for file in files:
        path = os.path.join(source, file)
        if not os.path.isfile(path):
            continue
        if path in dupes or not file.lower().endswith(ext):
            ignored.append(file)
    return ignored


# dest already exists (it has just been scanned for duplicates), and copytree
# raises FileExistsError for an existing destination unless dirs_exist_ok=True
# is passed (available since Python 3.8).
shutil.copytree(source, dest, ignore=ignorelist, dirs_exist_ok=True)

And the other "more manual" way would be:

import shutil, os
from difPy import dif
# Lower-case only: file names are lower-cased before the suffix test.
ext = ('.jpg', '.jpeg', '.gif')


def copy_unique_images(source, dest, dupes):
    """Copy the non-duplicate images under source into dest, keeping sub-folders.

    Args:
        source: root folder that is walked recursively.
        dest: root folder that receives the copies.
        dupes: iterable of full source paths that must not be copied.

    Returns:
        A ``(srcfiles, copied, skipped, failed)`` tuple of lists of source
        paths: every image found, those copied, those skipped as duplicates,
        and those whose copy raised an OSError.
    """
    dupes = set(dupes)  # constant-time membership tests inside the loop
    srcfiles = []
    copied = []
    failed = []
    skipped = []

    for dirpath, subdirs, files in os.walk(source):
        for file in files:
            if not file.lower().endswith(ext):
                continue
            srcfile = os.path.join(dirpath, file)
            srcfiles.append(srcfile)
            if srcfile in dupes:
                print('File not copied (duplicate) - '+srcfile)
                skipped.append(srcfile)
                continue
            # Re-create the path relative to source underneath dest.
            destfile = os.path.join(dest, os.path.relpath(srcfile, source))
            try:
                os.makedirs(os.path.dirname(destfile), exist_ok=True)
                shutil.copy(srcfile, destfile)
            except OSError as err:
                print('File not copied (error %s) - %s' % (str(err),srcfile))
                failed.append(srcfile)
            else:
                print('File copied successfully - '+srcfile)
                copied.append(srcfile)

    return srcfiles, copied, skipped, failed


if __name__ == '__main__':
    # Drop trailing path separators so that the paths built by os.walk have
    # the same shape as the folder names handed to dif().
    source = input('Input source folder:').rstrip('/\\')
    dest = input('Input backup  destination folder:').rstrip('/\\')

    search = dif(source, dest)

    dupes = {value['location'] for value in search.result.values()}

    srcfiles, copied, skipped, failed = copy_unique_images(source, dest, dupes)
Answered By: Frank

I have changed some variable names to make them more descriptive. Also, what you call failed is really just a list of files that were not copied because they are duplicates, rather than files whose copy was attempted but failed.

import shutil, os
from difPy import dif

ext = ('.jpg','.jpeg','.gif','.JPG','.JPEG','.GIF')


def strip_trailing_separators(path):
    """Return path without any trailing '/' or '\\' characters."""
    return path.rstrip('/\\')


def use_native_separators(path):
    """Return path with '/' and '\\' both replaced by this platform's os.sep."""
    if os.sep == '/':
        return path.replace('\\', os.sep)
    if os.sep == '\\':
        return path.replace('/', os.sep)
    return path


def copy_non_duplicates(source, dest, dupes):
    """Copy the non-duplicate images under source into dest, keeping sub-folders.

    Args:
        source: root folder that is walked recursively.
        dest: existing root folder that receives the copies.
        dupes: collection of full source paths that must not be copied.

    Returns:
        A ``(copied, not_copied)`` tuple of lists of source paths.
    """
    copied = []
    not_copied = []
    for dirpath, subdirs, files in os.walk(source):
        for file in files:
            if not file.endswith(ext):
                continue
            source_path = os.path.join(dirpath, file)
            if source_path in dupes:
                print('File not copied -', source_path)
                not_copied.append(source_path)
                continue
            # Path of the file relative to the source folder, re-rooted at dest:
            dest_path = os.path.join(dest, os.path.relpath(source_path, source))
            # ensure directory exists:
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.copy(source_path, dest_path)
            print('File copied successfully -', source_path)
            copied.append(source_path)
    return copied, not_copied


if __name__ == '__main__':
    source = input('Input source folder: ')
    dest = input('Input backup  destination folder: ')

    # Remove trailing path separators if they exist:
    source = strip_trailing_separators(source)
    dest = strip_trailing_separators(dest)

    # Use the correct path separator to
    # ensure correct matching with dif results:
    source = use_native_separators(source)

    search = dif(source, dest)
    result = search.result

    # Set comprehension:
    dupes = {duplicate['location'] for duplicate in result.values()}

    copied, not_copied = copy_non_duplicates(source, dest, dupes)
Answered By: Booboo
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.