Using python difflib to compare more than two files

Question:

I would like to get an overview of, e.g., the ldd dependency lists of multiple (3+) computers by comparing them with each other and highlighting the differences. For example, if I have a dict that looks as follows:

# Raw ldd output per computer. The keys are strings because integer literals
# with a leading zero (01, 02, ...) are a SyntaxError in Python 3.
my_ldd_outputs = {
    "01": "<ldd_output>",
    "02": "<ldd_output>",
    # ... one entry per computer ...
    "09": "<ldd_output>",
    "10": "<ldd_output>",
}

I would like the output to look something like

<identical line 1>
<identical line 2>
<identical line 3>
<differing line 4> (computer 01 02)
<differing line 4> (computer 04 05 06 07)
<differing line 4> (computer 08 09 10)
<identical line 5>
<identical line 6>
...

My first approach involved python difflib, where my idea was to first get to a data structure where all the ldd_output lists (just the result split on "\n") from the above-mentioned my_ldd_outputs dictionary have the same length, and any line that is missing from one output but exists in another ldd_output string is filled in with a placeholder string. So if two files looked like this:

ldd_1 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<extra line 5>
<identical line 6>
"""

ldd_2 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<identical line 6>
"""

My goal was to store those files as

ldd_1 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<extra line 5>
<identical line 6>
"""

ldd_2 = """
<identical line 1>
<identical line 2>
<differing line 3>
<identical line 4>
<None>
<identical line 6>
"""

And ultimately just iterate over every line of the converted files (which now all have the same length) and compare each line in terms of their differences and ignore any <None> entries so the diff can be printed consecutively.

I created a function that uses python difflib to fill the missing lines from other files with a <None> string. However, I am not sure how to expand this function to incorporate an arbitrary amount of diffs

def generate_diff(file_1, file_2):
    """Align two ldd outputs line by line, padding gaps with "<None>".

    Both inputs are normalised (hash directories and parenthesised load
    addresses become "<>", surrounding whitespace is stripped, blank lines
    are dropped) and then compared with ``difflib.ndiff``.

    Args:
        file_1: Complete text of the first ldd output.
        file_2: Complete text of the second ldd output.

    Returns:
        A tuple ``(list_1, list_2)`` of two lists with the same length.
        Index ``k`` of both lists refers to the same position in the
        alignment; a line that exists on only one side is paired with the
        placeholder string "<None>" on the other side.
    """
    # differing hashvalues from ldd can be ignored, we only care about version and path
    def remove_hashvalues(input):
        # The parentheses around the load address have to be escaped;
        # unescaped they form a regex group and the pattern matches every token.
        return re.sub(r"([a-zA-Z0-9_.-]{32}/|\([a-zA-Z0-9_.-]*\))", "<>", input)

    def normalized_lines(text):
        # Strip before diffing so the two-character ndiff prefix stays intact.
        stripped = (line.strip() for line in remove_hashvalues(text).splitlines())
        return [line for line in stripped if line]

    diff = list(difflib.ndiff(normalized_lines(file_1), normalized_lines(file_2)))
    list_1 = []
    list_2 = []
    i = 0
    while i < len(diff):
        prefix = diff[i][:2]
        if prefix == "- ":
            # Collect a run of removed lines followed by a run of added lines;
            # "? " lines are ndiff's intraline hints and carry no content.
            lost = []
            gained = []
            while i < len(diff) and diff[i][:2] in ("- ", "? "):
                if diff[i][:2] == "- ":
                    lost.append(diff[i][2:])
                i += 1
            while i < len(diff) and diff[i][:2] in ("+ ", "? "):
                if diff[i][:2] == "+ ":
                    gained.append(diff[i][2:])
                i += 1
            # Pad the shorter side so both runs occupy the same number of rows.
            shortfall = len(gained) - len(lost)
            if shortfall > 0:
                lost.extend(["<None>"] * shortfall)
            elif shortfall < 0:
                gained[:0] = ["<None>"] * -shortfall
            list_1 += lost
            list_2 += gained
            # i already points at the first unprocessed line.
            continue
        if prefix == "+ ":
            list_1.append("<None>")
            list_2.append(diff[i][2:])
        elif prefix == "  ":
            list_1.append(diff[i][2:])
            list_2.append(diff[i][2:])
        i += 1
    return list_1, list_2

I also found this tool that allows the comparison of multiple files, but unfortunately it's not designed to compare code.

EDIT: I adjusted the solution suggestion of @AyoubKaanich to create a more simplified version that does what I want:

from collections import defaultdict
import re
def transform(input):
    """Normalise one ldd output and return its lines in sorted order.

    32-character hash directories (the hash followed by "/") and
    parenthesised tokens such as the load address "(0x00007f...)" are
    replaced by "<>", so outputs that differ only in those values compare
    equal.

    Args:
        input: Complete ldd output as a single string. The name shadows the
            builtin but is kept so keyword callers keep working.

    Returns:
        The normalised lines, sorted alphabetically.
    """
    # differing hashvalues can be ignored, we only care about version and path
    # The literal parentheses must be escaped; unescaped they act as a regex
    # group and the pattern matches every token (and the empty string).
    input = re.sub(r"([a-zA-Z0-9_.-]{32}/|\([a-zA-Z0-9_.-]*\))", "<>", input)
    return sorted(input.splitlines())
def generate_diff(outputs: dict):
    """Merge several ldd outputs into one sorted, annotated list.

    Args:
        outputs: Maps a machine label to that machine's complete ldd output.

    Returns:
        A list in alphabetical line order. A line present in every output is
        stored as a plain string. Any other line is stored as a tuple
        ``(colored_line, labels)`` where ``labels`` is the set of machines
        that have the line and ``colored_line`` is wrapped in an ANSI colour
        escape. Consecutive differing lines that share the same first
        space-separated token get the same colour.
    """
    mapping = defaultdict(set)
    for target, output in outputs.items():
        for line in transform(output):
            mapping[line.strip()].add(target)

    result = []
    current_line = None  # first token of the differing group being coloured
    color_index = 0
    for line in sorted(mapping):
        if len(outputs) == len(mapping[line]):
            # A common line ends the current differing group.
            current_line = None
            result.append(line)
        else:
            first_token = line.split(" ")[0]
            if current_line != first_token:
                current_line = first_token
                color_index += 1
            # "\033[3Xm" selects foreground colour 31-36 and "\033[0m" resets it;
            # without the ESC character the codes are printed as literal text.
            result.append((f"\033[3{color_index % 6 + 1}m{line}\033[0m", mapping[line]))
    return result

The only downside is that this does not apply to diffs where the string varies in an arbitrary section as opposed to just the beginning, which is what difflib is good at detecting. However, for the case of ldd, since the dependency is always listed at first, sorting alphabetically and taking the first section of the string works.

Asked By: Yes

||

Answers:

Pure Python solution, no libraries or extra dependencies.

Note: this solution works thanks to some assumptions:

  • Order of lines does not matter
  • A line either exists, or is missing (no logic to check similarity between lines)

from collections import defaultdict
import re

def transform(input):
    """Return the normalised lines of one ldd output, sorted alphabetically.

    Parenthesised tokens such as the load address "(0x00007f...)" and
    32-character hash directories (the hash followed by "/") are replaced by
    "<>" so that they do not show up as differences.

    Args:
        input: Complete ldd output as a single string. The name shadows the
            builtin but is kept so keyword callers keep working.

    Returns:
        A sorted list of normalised lines.
    """
    # differing hashvalues from ldd can be ignored, we only care about version and path
    # Escape the literal parentheses: unescaped they form a regex group, and
    # the pattern then matches every token as well as the empty string.
    input = re.sub(r"([a-zA-Z0-9_.-]{32}/|\([a-zA-Z0-9_.-]*\))", "<>", input)
    return sorted(input.splitlines())

def generate_diff(outputs: dict, common_threshold=0):
    """Print a merged, alphabetically sorted diff of several ldd outputs.

    Each distinct normalised line is printed once with a prefix:
      "  "  the line is present in every output;
      "- "  the line is present in at least ``common_threshold`` outputs;
            the labels in parentheses are the outputs that lack it;
      "+ "  the line is present in fewer than ``common_threshold`` outputs;
            the labels in parentheses are the outputs that have it.

    Args:
        outputs: Maps a label to the complete ldd output for that label.
        common_threshold: how many outputs need to contain line to consider
            it common and mark outputs that do not have it as missing.

    Returns:
        None; the result is written to standard output.
    """
    assert(common_threshold <= len(outputs))

    mapping = defaultdict(set)
    for target, output in outputs.items():
        for line in transform(output):
            mapping[line].add(target)

    all_targets = set(outputs)
    for line in sorted(mapping):
        found = mapping[line]
        if len(outputs) == len(found):
            print('  ' + line)
        elif len(found) >= common_threshold:
            # Sort the labels: set iteration order varies between runs, which
            # would make the printed report nondeterministic.
            missed_str = ",".join(sorted(map(str, all_targets - found)))
            print(f'- {line}  ({missed_str})')
        else:
            added_str = ",".join(sorted(map(str, found)))
            print(f'+ {line}  ({added_str})')

Sample execution


# Sample input: five ldd outputs keyed by the labels 'A' to 'E'. Every value is
# a multi-line string that starts with a newline, followed by one dependency per line.
my_ldd_outputs = {
'A': """
linux-vdso.so.1 (0x00007ffde4f09000)
libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 (0x00007fe0594f3000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fe0592cb000)
/lib64/ld-linux-x86-64.so.2 (0x00007fe059690000)
""",
'B': """
linux-vdso.so.1 (0x00007fff697b6000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1c54045000)
/lib64/ld-linux-x86-64.so.2 (0x00007f1c54299000)
""",
'C': """
linux-vdso.so.1 (0x00007fffd61f9000)
libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007f08a51a3000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f08a4f7b000)
/lib64/ld-linux-x86-64.so.2 (0x00007f08a5612000)
""",
'D': """
linux-vdso.so.1 (0x00007ffcf9ddd000)
libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007fa2e381b000)
libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 (0x00007fa2e37ef000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fa2e35c7000)
libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 (0x00007fa2e3530000)
/lib64/ld-linux-x86-64.so.2 (0x00007fa2e3cd7000)
""",
'E': """
linux-vdso.so.1 (0x00007ffc2deab000)
libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 (0x00007f31fed91000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007f31fed75000)
libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 (0x00007f31fed49000)
libgssapi_krb5.so.2 => /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 (0x00007f31fecf5000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f31feacd000)
libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 (0x00007f31fea34000)
/lib64/ld-linux-x86-64.so.2 (0x00007f31ff2af000)
libkrb5.so.3 => /lib/x86_64-linux-gnu/libkrb5.so.3 (0x00007f31fe969000)
libk5crypto.so.3 => /lib/x86_64-linux-gnu/libk5crypto.so.3 (0x00007f31fe93a000)
libcom_err.so.2 => /lib/x86_64-linux-gnu/libcom_err.so.2 (0x00007f31fe934000)
libkrb5support.so.0 => /lib/x86_64-linux-gnu/libkrb5support.so.0 (0x00007f31fe926000)
libkeyutils.so.1 => /lib/x86_64-linux-gnu/libkeyutils.so.1 (0x00007f31fe91f000)
libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 (0x00007f31fe909000)
"""
}
# The second argument is common_threshold: a line found in at least 2 (but not all)
# outputs is reported with "- ", a line found in fewer than 2 with "+ ".
generate_diff(my_ldd_outputs, 2)

Outputs

  /lib64/ld-linux-x86-64.so.2 <>
  libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 <>
+ libcom_err.so.2 => /lib/x86_64-linux-gnu/libcom_err.so.2 <>  (E)
- libcrypto.so.3 => /lib/x86_64-linux-gnu/libcrypto.so.3 <>  (B,A)
+ libgssapi_krb5.so.2 => /lib/x86_64-linux-gnu/libgssapi_krb5.so.2 <>  (E)
+ libk5crypto.so.3 => /lib/x86_64-linux-gnu/libk5crypto.so.3 <>  (E)
+ libkeyutils.so.1 => /lib/x86_64-linux-gnu/libkeyutils.so.1 <>  (E)
+ libkrb5.so.3 => /lib/x86_64-linux-gnu/libkrb5.so.3 <>  (E)
+ libkrb5support.so.0 => /lib/x86_64-linux-gnu/libkrb5support.so.0 <>  (E)
- libpcre2-8.so.0 => /lib/x86_64-linux-gnu/libpcre2-8.so.0 <>  (C,B,A)
+ libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 <>  (E)
- libselinux.so.1 => /lib/x86_64-linux-gnu/libselinux.so.1 <>  (C,B,A)
+ libtinfo.so.6 => /lib/x86_64-linux-gnu/libtinfo.so.6 <>  (A)
+ libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 <>  (E)
  linux-vdso.so.1 <>
Answered By: Ayoub Kaanich

git diff from Git.

# Stage the three ldd dumps that should be compared.
git add ldd.tmp1 ldd.tmp2 ldd.tmp3
# Keep only the diff lines that contain a "+" and write them to result.tmp.
# NOTE(review): a plain `git diff` compares the working tree with the index, so
# right after `git add` it presumably prints nothing - confirm against the
# git-diff documentation before relying on this.
git diff -- | grep + > result.tmp

see https://git-scm.com/docs/git-diff

Also:

  • Meld
  • Kdiff3
Answered By: Mo. Atairu
Categories: questions Tags: , , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.