How do I edit this python script to make sure all the file paths are unique too?

Question:

Here is the script in question –

import os
import re

# Scan every .html file under start_folder, collect the http/https links it
# contains, and append each unique link (per file) to output_file.

# Define the folder to start the search from
start_folder = "path"

# Define the name of the output file
output_file = "output.txt"

# Walk through all the subdirectories and files starting from the start folder
for root, dirs, files in os.walk(start_folder):

    # Loop through all the files in the current directory
    for file_name in files:

        # Check if the file is an HTML file
        if file_name.endswith(".html"):

            # Get the full path of the file
            file_path = os.path.join(root, file_name)

            # Open the file for reading
            with open(file_path, "r", encoding="UTF-8") as html_file:

                # Read the contents of the file into a string
                file_content = html_file.read()

                # Find all the http links in the file
                # (a link ends at the first <, >, ', " or space; this pattern
                # also matches https links, the set below removes the repeats)
                http_links = re.findall("(http[^<>'\" ]+)", file_content)

                # Find all the https links in the file
                https_links = re.findall("(https[^<>'\" ]+)", file_content)

                # Combine the http and https links into a single list
                all_links = http_links + https_links

                # Keep track of the unique links for this file
                unique_links = set()

                # Loop through all the links found in the file
                for link in all_links:

                    # Check if the link is already in the set of unique links for this file
                    if link not in unique_links:

                        # If the link is not in the set, add it to the set and write it to the output file
                        unique_links.add(link)
                        with open(output_file, "a", encoding="UTF-8") as f:
                            f.write(file_path + "\n-\n")
                            f.write(link + "\n")
                            f.write("\n")

# Print "Done scanning" when the script is finished
print("Done scanning")

If I don’t want the code to be too long, what’s the best way to make the file paths unique as well? I want the format to change from:

samepath-

link

samepath-

nextlink

To this:

samepath-

link

nextlink

Asked By: CluelessDumbo

||

Answers:

this should do the trick:

change this part

# Keep track of the unique links for this file
unique_links = set()

# Loop through all the links found in the file
for link in all_links:

    # Check if the link is already in the set of unique links for this file
    if link not in unique_links:

        # If the link is not in the set, add it to the set and write it to the output file
        unique_links.add(link)
        with open(output_file, "a", encoding="UTF-8") as f:
            # The file path is written once per link, which is the repetition
            # the question wants to get rid of
            f.write(file_path + "\n-\n")
            f.write(link + "\n")
            f.write("\n")

for this

# Deduplicate the links found in this file in one step
unique_links = set(all_links)

# Write the new links into the output file

# Only write if at least one link was found
if unique_links:  # this is equivalent to checking len(unique_links) > 0

    with open(output_file, "a", encoding="UTF-8") as f:

        # Identify the origin file (written once per file)
        f.write(file_path + "\n-\n")

        # Write each unique link found, followed by a blank line
        for link in unique_links:
            f.write(link + "\n")
            f.write("\n")
            

Alternatively, to remove one level of indentation, and given that this code is inside a for-loop, you can use the `continue` keyword:

unique_links = set(all_links)

#write the new links into the output file

#check if at least one or more links were found, if there are write to the file
if not unique_links: # this is equivalent to checking not len(unique_links)>0
    continue #in this case go to the next html-file skipping the writing part below 

with open(output_file, "a", encoding="UTF-8") as f:
    
    # identify the origin file
    f.write(file_path + "n-n") 
  
    # write each unique link found 
    for link in unique_links:
        f.write(link + "n")
        f.write("n")
Answered By: Copperfield

`Path.iterdir()` is a useful method for getting all the entries of a specified directory. Here, those entries are stored in a list that is then iterated over for further processing.

import pathlib
# Point at the directory whose entries should be processed
MY_PATH = pathlib.Path("path/to/my/links")
# iterdir() returns an iterator over the entries of the directory
# (per the pathlib docs the order is arbitrary, and entries may be
# sub-directories as well as files);
# list() collects all of those entries up front
all_links = list(MY_PATH.iterdir())
# Iterate through each entry
for link in all_links:
    # Open each entry
    # NOTE(review): open(link) uses the default read mode, so the
    # f.write(...) below looks like it would fail; confirm whether
    # a write/append mode was intended
    with open(link) as f:
        # Placeholder for the per-file processing
        f.write(...)
Answered By: Chicken McNuggets
Categories: questions Tags:
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.