Python: Convert multiple .docx files in a particular folder from ANSI to UTF-8
Question:
I am not a very good programmer, but I want to write Python code that converts multiple .docx files in a particular folder from ANSI to UTF-8.
I will start with this, but I don't know how to go further or how to select the files from the folder. Maybe someone can help me a little bit.
# Third-party helper; presumably transliterates Unicode text to plain ASCII - confirm against the unidecode docs.
from unidecode import unidecode
# docx_paragraph is not defined in this snippet (presumably a python-docx paragraph object - confirm).
python2_text = docx_paragraph.text
# Python 2 idiom: a byte string (str) is decoded as UTF-8 with undecodable bytes replaced;
# any other value is passed through unchanged. In Python 3, str has no .decode() method.
unicode_text = python2_text.decode("utf-8", "replace") if isinstance(python2_text , str) else python2_text
# The return value of unidecode() is discarded here.
unidecode(unicode_text)
Answers:
import os
import zipfile
import io
import chardet
# Batch script: for every .docx file in the folder, detect the encoding of its
# main document part (word/document.xml) and, when that part is not already
# UTF-8, write a copy named "<name>_utf8.docx" whose document part is UTF-8.
# A .docx file is a ZIP package, so every other member of the package is copied
# unchanged; an archive holding only word/document.xml is not a valid document.

# Set the folder path where the .docx files are located
folder_path = os.getcwd()

# Name of the package member that holds the main document body
document_part = 'word/document.xml'

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Accept any capitalisation of the extension (.docx, .DOCX, ...)
    if not filename.lower().endswith(".docx"):
        continue

    # Open the .docx file
    file_path = os.path.join(folder_path, filename)
    try:
        with zipfile.ZipFile(file_path) as docx_file:
            # Read the contents of the document.xml file
            xml_content = docx_file.read(document_part)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"Error opening {file_path}: {e}")
        continue

    # Detect the current encoding of the file
    detected_encoding = chardet.detect(xml_content)['encoding']
    print(f"{file_path} is encoded in {detected_encoding}")

    # chardet reports None when it cannot make a guess (e.g. empty input);
    # calling bytes.decode(None) would raise TypeError.
    if detected_encoding is None:
        print(f"Could not detect the encoding of {file_path}; skipping")
        continue

    # Pure ASCII is already valid UTF-8, and the reported name is compared
    # case-insensitively so that "UTF-8" and "utf-8" are treated alike.
    if detected_encoding.lower() in ("utf-8", "ascii"):
        print(f"{file_path} is already in UTF-8 format")
        continue

    # The detection is only a guess, so decoding may still fail
    try:
        xml_text = xml_content.decode(detected_encoding)
    except (UnicodeDecodeError, LookupError) as e:
        print(f"Error decoding {file_path} as {detected_encoding}: {e}")
        continue

    # Drop a leading byte-order mark if the codec left one in the text
    xml_text = xml_text.lstrip("\ufeff")

    # Keep the XML declaration truthful: after re-encoding, an attribute such
    # as encoding="UTF-16" would make parsers misread the UTF-8 bytes.
    # (Only the usual form without spaces around "=" is handled.)
    decl_end = xml_text.find("?>") if xml_text.startswith("<?xml") else -1
    enc_pos = xml_text.find("encoding=", 0, decl_end) if decl_end != -1 else -1
    if enc_pos != -1:
        quote = xml_text[enc_pos + 9]
        close = xml_text.find(quote, enc_pos + 10, decl_end)
        if quote in "\"'" and close != -1:
            xml_text = xml_text[:enc_pos + 10] + "UTF-8" + xml_text[close:]

    utf8_content = xml_text.encode('utf-8')

    # Save the converted document next to the original
    new_filename = os.path.splitext(filename)[0] + "_utf8.docx"
    new_file_path = os.path.join(folder_path, new_filename)
    with zipfile.ZipFile(file_path) as source, \
            zipfile.ZipFile(new_file_path, "w") as target:
        for member in source.infolist():
            # Replace only the main document part; copy everything else as-is.
            # Passing the ZipInfo keeps each member's name and compression.
            if member.filename == document_part:
                target.writestr(member, utf8_content)
            else:
                target.writestr(member, source.read(member))
    print(f"Converted {file_path} from {detected_encoding} to UTF-8 and saved as {new_file_path}")
I am not a very good programmer, but I want to write Python code that converts multiple .docx files in a particular folder from ANSI to UTF-8.
I will start with this, but I don't know how to go further or how to select the files from the folder. Maybe someone can help me a little bit.
# Third-party helper; presumably transliterates Unicode text to plain ASCII - confirm against the unidecode docs.
from unidecode import unidecode
# docx_paragraph is not defined in this snippet (presumably a python-docx paragraph object - confirm).
python2_text = docx_paragraph.text
# Python 2 idiom: a byte string (str) is decoded as UTF-8 with undecodable bytes replaced;
# any other value is passed through unchanged. In Python 3, str has no .decode() method.
unicode_text = python2_text.decode("utf-8", "replace") if isinstance(python2_text , str) else python2_text
# The return value of unidecode() is discarded here.
unidecode(unicode_text)
import os
import zipfile
import io
import chardet
# Batch script: for every .docx file in the folder, detect the encoding of its
# main document part (word/document.xml) and, when that part is not already
# UTF-8, write a copy named "<name>_utf8.docx" whose document part is UTF-8.
# A .docx file is a ZIP package, so every other member of the package is copied
# unchanged; an archive holding only word/document.xml is not a valid document.

# Set the folder path where the .docx files are located
folder_path = os.getcwd()

# Name of the package member that holds the main document body
document_part = 'word/document.xml'

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Accept any capitalisation of the extension (.docx, .DOCX, ...)
    if not filename.lower().endswith(".docx"):
        continue

    # Open the .docx file
    file_path = os.path.join(folder_path, filename)
    try:
        with zipfile.ZipFile(file_path) as docx_file:
            # Read the contents of the document.xml file
            xml_content = docx_file.read(document_part)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"Error opening {file_path}: {e}")
        continue

    # Detect the current encoding of the file
    detected_encoding = chardet.detect(xml_content)['encoding']
    print(f"{file_path} is encoded in {detected_encoding}")

    # chardet reports None when it cannot make a guess (e.g. empty input);
    # calling bytes.decode(None) would raise TypeError.
    if detected_encoding is None:
        print(f"Could not detect the encoding of {file_path}; skipping")
        continue

    # Pure ASCII is already valid UTF-8, and the reported name is compared
    # case-insensitively so that "UTF-8" and "utf-8" are treated alike.
    if detected_encoding.lower() in ("utf-8", "ascii"):
        print(f"{file_path} is already in UTF-8 format")
        continue

    # The detection is only a guess, so decoding may still fail
    try:
        xml_text = xml_content.decode(detected_encoding)
    except (UnicodeDecodeError, LookupError) as e:
        print(f"Error decoding {file_path} as {detected_encoding}: {e}")
        continue

    # Drop a leading byte-order mark if the codec left one in the text
    xml_text = xml_text.lstrip("\ufeff")

    # Keep the XML declaration truthful: after re-encoding, an attribute such
    # as encoding="UTF-16" would make parsers misread the UTF-8 bytes.
    # (Only the usual form without spaces around "=" is handled.)
    decl_end = xml_text.find("?>") if xml_text.startswith("<?xml") else -1
    enc_pos = xml_text.find("encoding=", 0, decl_end) if decl_end != -1 else -1
    if enc_pos != -1:
        quote = xml_text[enc_pos + 9]
        close = xml_text.find(quote, enc_pos + 10, decl_end)
        if quote in "\"'" and close != -1:
            xml_text = xml_text[:enc_pos + 10] + "UTF-8" + xml_text[close:]

    utf8_content = xml_text.encode('utf-8')

    # Save the converted document next to the original
    new_filename = os.path.splitext(filename)[0] + "_utf8.docx"
    new_file_path = os.path.join(folder_path, new_filename)
    with zipfile.ZipFile(file_path) as source, \
            zipfile.ZipFile(new_file_path, "w") as target:
        for member in source.infolist():
            # Replace only the main document part; copy everything else as-is.
            # Passing the ZipInfo keeps each member's name and compression.
            if member.filename == document_part:
                target.writestr(member, utf8_content)
            else:
                target.writestr(member, source.read(member))
    print(f"Converted {file_path} from {detected_encoding} to UTF-8 and saved as {new_file_path}")