Remove specific chars in text file Python
Question:
I am searching for some specific strings; however, I find them in places where they shouldn't be, because I convert PDF files to .txt via HTML. I found a pattern that lets me locate the string I don't want.
This is the string I want to delete: "6n5n4n3n2n1nDnCnBnA". I am looking for " nCn" in some other text block, not here. So if I find the pattern " nCnB", I can delete it, but I can't figure out how. I wrote an inplace_change function, but I got an error message like this:
"subprocess.CalledProcessError: Command '['java', '-jar', 'C:\\Users\\Kronos\\AppData\\Roaming\\Python\\Python310\\site-packages\\tabula\\tabula-1.0.5-jar-with-dependencies.jar', '--pages', '1', '--guess', '--format', 'JSON', '383026_C.pdf']' returned non-zero exit status 1."
def findWordInText(name):
    """Scrub the unwanted "nCnB" marker from ``<name>.txt`` and report "nCn".

    The file ``name + ".txt"`` is read as UTF-8 (undecodable bytes are
    ignored).  If it contains ``"nCnB"``, that text is replaced in the file
    with ``"removed"`` via ``inplace_change``.  If ``"nCn"`` is still present
    afterwards, ``writeOnExcel(name, "C")`` is called.

    Args:
        name: Path of the text file without its ``.txt`` extension; it is
            also passed through unchanged to ``writeOnExcel``.
    """
    path = name + ".txt"
    marker = r"nCn"
    unwanted = marker + "B"

    with open(path, 'r', encoding='utf-8', errors="ignore") as f:
        text = f.read()

    # The rewrite happens only after the ``with`` block above has closed the
    # file, so no explicit ``f.close()`` is needed before reopening it.
    # ``unwanted`` contains ``marker``, so a separate pre-check for ``marker``
    # would be redundant here.
    if unwanted in text:
        print('Need to remove')
        inplace_change(path, unwanted, "removed")
        # Re-read so the check below sees the cleaned-up contents.
        with open(path, 'r', encoding='utf-8', errors="ignore") as f:
            text = f.read()

    if marker in text:
        writeOnExcel(name, "C")
def inplace_change(file, old, new, encoding=None):
    """Replace every occurrence of ``old`` with ``new`` inside ``file``.

    The file is read completely, and rewritten only when ``old`` actually
    occurs in it.  A short status message is printed in either case.

    Args:
        file: Path of the text file to modify.
        old: Substring to search for.
        new: Replacement text.
        encoding: Optional text encoding used for both reading and writing.
            ``None`` (the default) keeps ``open``'s platform default.

    Returns:
        None.
    """
    with open(file, encoding=encoding) as f:
        s = f.read()

    if old not in s:
        # The messages must use this function's real parameter names; the
        # former ``{old_string}``/``{filename}`` placeholders did not exist in
        # ``locals()`` and raised KeyError.
        print(f'"{old}" not found in {file}.')
        return

    # Do everything that could fail *before* opening the file for writing:
    # mode 'w' truncates immediately, so an exception raised after the open
    # would leave the file empty.
    print(f'Changing "{old}" to "{new}" in {file}')
    s = s.replace(old, new)
    with open(file, 'w', encoding=encoding) as f:
        f.write(s)
Answers:
If you have a string with a lot of newline characters, each followed by a character such as a digit or a letter, you can use a regex to replace them with an empty string. Instead of going through the text line by line, you can process it all at once, like this:
import re

# Sample input.  This is a raw string, so every "\n" below is the two literal
# characters backslash + "n", not a real line break.
a = r"""python
\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz
\n1\n2\n3\n4\n5\n6\n7\n8\n9\n0Message Here\na\nb\nc\nd\ne
\nA\nB\nC\nD\nE\nF\nG\nH\nI\nJ\nK\nL\nM\nN\nO\nPSome Other Message\nQ\nR\nS\nT\nU\nV\nW\nX\nY\nZ
\n!\n@\n$\n%\n^\n&\n*\n(\n)\n_\n-\n=\n+\n{\n}\n[\n]\n|\n;\n:\n'\n,\n.\n<\n>\n?\n/\n`\n~
"""
# Regex to clear a literal backslash-n followed by one character.  Both
# backslashes in the pattern have to be escaped: ``\\n`` matches backslash +
# "n", and ``[^\\]`` matches any single character except a backslash.  (An
# unescaped ``[^\]`` is an unterminated character class and raises re.error.)
a = re.sub(r"\\n[^\\]", "", a)
print(a)
Output:
Message Here
Some Other Message
I am searching for some specific strings; however, I find them in places where they shouldn't be, because I convert PDF files to .txt via HTML. I found a pattern that lets me locate the string I don't want.
This is the string I want to delete: "6n5n4n3n2n1nDnCnBnA". I am looking for " nCn" in some other text block, not here. So if I find the pattern " nCnB", I can delete it, but I can't figure out how. I wrote an inplace_change function, but I got an error message like this:
"subprocess.CalledProcessError: Command '['java', '-jar', 'C:\\Users\\Kronos\\AppData\\Roaming\\Python\\Python310\\site-packages\\tabula\\tabula-1.0.5-jar-with-dependencies.jar', '--pages', '1', '--guess', '--format', 'JSON', '383026_C.pdf']' returned non-zero exit status 1."
def findWordInText(name):
    """Scrub the unwanted "nCnB" marker from ``<name>.txt`` and report "nCn".

    The file ``name + ".txt"`` is read as UTF-8 (undecodable bytes are
    ignored).  If it contains ``"nCnB"``, that text is replaced in the file
    with ``"removed"`` via ``inplace_change``.  If ``"nCn"`` is still present
    afterwards, ``writeOnExcel(name, "C")`` is called.

    Args:
        name: Path of the text file without its ``.txt`` extension; it is
            also passed through unchanged to ``writeOnExcel``.
    """
    path = name + ".txt"
    marker = r"nCn"
    unwanted = marker + "B"

    with open(path, 'r', encoding='utf-8', errors="ignore") as f:
        text = f.read()

    # The rewrite happens only after the ``with`` block above has closed the
    # file, so no explicit ``f.close()`` is needed before reopening it.
    # ``unwanted`` contains ``marker``, so a separate pre-check for ``marker``
    # would be redundant here.
    if unwanted in text:
        print('Need to remove')
        inplace_change(path, unwanted, "removed")
        # Re-read so the check below sees the cleaned-up contents.
        with open(path, 'r', encoding='utf-8', errors="ignore") as f:
            text = f.read()

    if marker in text:
        writeOnExcel(name, "C")
def inplace_change(file, old, new, encoding=None):
    """Replace every occurrence of ``old`` with ``new`` inside ``file``.

    The file is read completely, and rewritten only when ``old`` actually
    occurs in it.  A short status message is printed in either case.

    Args:
        file: Path of the text file to modify.
        old: Substring to search for.
        new: Replacement text.
        encoding: Optional text encoding used for both reading and writing.
            ``None`` (the default) keeps ``open``'s platform default.

    Returns:
        None.
    """
    with open(file, encoding=encoding) as f:
        s = f.read()

    if old not in s:
        # The messages must use this function's real parameter names; the
        # former ``{old_string}``/``{filename}`` placeholders did not exist in
        # ``locals()`` and raised KeyError.
        print(f'"{old}" not found in {file}.')
        return

    # Do everything that could fail *before* opening the file for writing:
    # mode 'w' truncates immediately, so an exception raised after the open
    # would leave the file empty.
    print(f'Changing "{old}" to "{new}" in {file}')
    s = s.replace(old, new)
    with open(file, 'w', encoding=encoding) as f:
        f.write(s)
If you have a string with a lot of newline characters, each followed by a character such as a digit or a letter, you can use a regex to replace them with an empty string. Instead of going through the text line by line, you can process it all at once, like this:
import re

# Sample input.  This is a raw string, so every "\n" below is the two literal
# characters backslash + "n", not a real line break.
a = r"""python
\na\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\nq\nr\ns\nt\nu\nv\nw\nx\ny\nz
\n1\n2\n3\n4\n5\n6\n7\n8\n9\n0Message Here\na\nb\nc\nd\ne
\nA\nB\nC\nD\nE\nF\nG\nH\nI\nJ\nK\nL\nM\nN\nO\nPSome Other Message\nQ\nR\nS\nT\nU\nV\nW\nX\nY\nZ
\n!\n@\n$\n%\n^\n&\n*\n(\n)\n_\n-\n=\n+\n{\n}\n[\n]\n|\n;\n:\n'\n,\n.\n<\n>\n?\n/\n`\n~
"""
# Regex to clear a literal backslash-n followed by one character.  Both
# backslashes in the pattern have to be escaped: ``\\n`` matches backslash +
# "n", and ``[^\\]`` matches any single character except a backslash.  (An
# unescaped ``[^\]`` is an unterminated character class and raises re.error.)
a = re.sub(r"\\n[^\\]", "", a)
print(a)
Output:
Message Here
Some Other Message