How do I edit this python script to make sure all the file paths are unique too?
Question:
Here is the script in question –
import os
import re

# Define the folder to start the search from
start_folder = "path"
# Define the name of the output file
output_file = "output.txt"

# Walk through all the subdirectories and files starting from the start folder
for root, dirs, files in os.walk(start_folder):
    # Loop through all the files in the current directory
    for file_name in files:
        # Check if the file is an HTML file
        if file_name.endswith(".html"):
            # Get the full path of the file
            file_path = os.path.join(root, file_name)
            # Open the file for reading
            with open(file_path, "r", encoding="UTF-8") as html_file:
                # Read the contents of the file into a string
                file_content = html_file.read()
                # Find all the http links in the file.
                # NOTE(review): the "http" prefix also matches "https" URLs, so the
                # second pattern below only re-adds duplicates that the per-file
                # set deduplicates later — one pattern r"https?..." would suffice.
                http_links = re.findall("(http[^<>'\" ]+)", file_content)
                # Find all the https links in the file
                https_links = re.findall("(https[^<>'\" ]+)", file_content)
                # Combine the http and https links into a single list
                all_links = http_links + https_links
                # Keep track of the unique links for this file
                unique_links = set()
                # Loop through all the links found in the file
                for link in all_links:
                    # Check if the link is already in the set of unique links for this file
                    if link not in unique_links:
                        # If the link is not in the set, add it to the set
                        # and write it to the output file
                        unique_links.add(link)
                        with open(output_file, "a", encoding="UTF-8") as f:
                            f.write(file_path + "\n-\n")
                            f.write(link + "\n")
                            f.write("\n")

# Print "Done scanning" when the script is finished
print("Done scanning")
If I don’t want the code to be too long, what’s the best way to make the file paths unique as well? I want the format to change from:
samepath-
link
samepath-
nextlink
To this:
samepath-
link
nextlink
Answers:
this should do the trick:
change this part
# Per-file set of links already written, used to skip duplicates.
unique_links = set()
# Loop through all the links found in the file
for link in all_links:
    # Check if the link is already in the set of unique links for this file
    if link not in unique_links:
        # If the link is not in the set, add it to the set and write it to the output file
        unique_links.add(link)
        with open(output_file, "a", encoding="UTF-8") as f:
            # NOTE: the file path is re-written before every link — this is
            # the repetition the question asks to remove.
            f.write(file_path + "\n-\n")
            f.write(link + "\n")
            f.write("\n")
for this
# Deduplicate all links found in this file in one step.
unique_links = set(all_links)
# Write the new links into the output file.
# Check if at least one link was found; if so, write to the file.
if unique_links:  # equivalent to checking len(unique_links) > 0
    with open(output_file, "a", encoding="UTF-8") as f:
        # identify the origin file (written once per file, not once per link)
        f.write(file_path + "\n-\n")
        # write each unique link found
        for link in unique_links:
            f.write(link + "\n")
        f.write("\n")
Alternatively, to reduce one level of indentation — and given that this code is inside a for-loop — you can use the `continue`
keyword:
unique_links = set(all_links)
#write the new links into the output file
#check if at least one or more links were found, if there are write to the file
if not unique_links: # this is equivalent to checking not len(unique_links)>0
continue #in this case go to the next html-file skipping the writing part below
with open(output_file, "a", encoding="UTF-8") as f:
# identify the origin file
f.write(file_path + "n-n")
# write each unique link found
for link in unique_links:
f.write(link + "n")
f.write("n")
`.iterdir()` is a useful technique for extracting all files out of a specified directory path. Here, those files are stored into a list that is later iterated over for further processing. Note that `.iterdir()` yields entries in arbitrary order (not sorted), and it yields subdirectories as well as files.
import pathlib

# Go to specified directory path
MY_PATH = pathlib.Path("path/to/my/links")
# iterdir() creates an iterator over the directory entries (in arbitrary order);
# materialize it into a list for further processing
all_links = list(MY_PATH.iterdir())
# iterate through each file
for link in all_links:
    # open each file for appending — the original snippet opened in the
    # default read-only mode, which would make f.write raise an error
    with open(link, "a") as f:
        # write to the current file (placeholder — replace ... with real content)
        f.write(...)
Here is the script in question –
import os
import re

# Define the folder to start the search from
start_folder = "path"
# Define the name of the output file
output_file = "output.txt"

# Walk through all the subdirectories and files starting from the start folder
for root, dirs, files in os.walk(start_folder):
    # Loop through all the files in the current directory
    for file_name in files:
        # Check if the file is an HTML file
        if file_name.endswith(".html"):
            # Get the full path of the file
            file_path = os.path.join(root, file_name)
            # Open the file for reading
            with open(file_path, "r", encoding="UTF-8") as html_file:
                # Read the contents of the file into a string
                file_content = html_file.read()
                # Find all the http links in the file.
                # NOTE(review): the "http" prefix also matches "https" URLs, so the
                # second pattern below only re-adds duplicates that the per-file
                # set deduplicates later — one pattern r"https?..." would suffice.
                http_links = re.findall("(http[^<>'\" ]+)", file_content)
                # Find all the https links in the file
                https_links = re.findall("(https[^<>'\" ]+)", file_content)
                # Combine the http and https links into a single list
                all_links = http_links + https_links
                # Keep track of the unique links for this file
                unique_links = set()
                # Loop through all the links found in the file
                for link in all_links:
                    # Check if the link is already in the set of unique links for this file
                    if link not in unique_links:
                        # If the link is not in the set, add it to the set
                        # and write it to the output file
                        unique_links.add(link)
                        with open(output_file, "a", encoding="UTF-8") as f:
                            f.write(file_path + "\n-\n")
                            f.write(link + "\n")
                            f.write("\n")

# Print "Done scanning" when the script is finished
print("Done scanning")
If I don’t want the code to be too long, what’s the best way to make the file paths unique as well? I want the format to change from:
samepath-
link
samepath-
nextlink
To this:
samepath-
link
nextlink
this should do the trick:
change this part
# Per-file set of links already written, used to skip duplicates.
unique_links = set()
# Loop through all the links found in the file
for link in all_links:
    # Check if the link is already in the set of unique links for this file
    if link not in unique_links:
        # If the link is not in the set, add it to the set and write it to the output file
        unique_links.add(link)
        with open(output_file, "a", encoding="UTF-8") as f:
            # NOTE: the file path is re-written before every link — this is
            # the repetition the question asks to remove.
            f.write(file_path + "\n-\n")
            f.write(link + "\n")
            f.write("\n")
for this
# Deduplicate all links found in this file in one step.
unique_links = set(all_links)
# Write the new links into the output file.
# Check if at least one link was found; if so, write to the file.
if unique_links:  # equivalent to checking len(unique_links) > 0
    with open(output_file, "a", encoding="UTF-8") as f:
        # identify the origin file (written once per file, not once per link)
        f.write(file_path + "\n-\n")
        # write each unique link found
        for link in unique_links:
            f.write(link + "\n")
        f.write("\n")
Alternatively, to reduce one level of indentation — and given that this code is inside a for-loop — you can use the `continue`
keyword:
unique_links = set(all_links)
#write the new links into the output file
#check if at least one or more links were found, if there are write to the file
if not unique_links: # this is equivalent to checking not len(unique_links)>0
continue #in this case go to the next html-file skipping the writing part below
with open(output_file, "a", encoding="UTF-8") as f:
# identify the origin file
f.write(file_path + "n-n")
# write each unique link found
for link in unique_links:
f.write(link + "n")
f.write("n")
`.iterdir()` is a useful technique for extracting all files out of a specified directory path. Here, those files are stored into a list that is later iterated over for further processing. Note that `.iterdir()` yields entries in arbitrary order (not sorted), and it yields subdirectories as well as files.
import pathlib

# Go to specified directory path
MY_PATH = pathlib.Path("path/to/my/links")
# iterdir() creates an iterator over the directory entries (in arbitrary order);
# materialize it into a list for further processing
all_links = list(MY_PATH.iterdir())
# iterate through each file
for link in all_links:
    # open each file for appending — the original snippet opened in the
    # default read-only mode, which would make f.write raise an error
    with open(link, "a") as f:
        # write to the current file (placeholder — replace ... with real content)
        f.write(...)