HTML tag validation in Python
Question:
I want to validate HTML tag to determine whether it is valid HTML or not.
I tried the following code.
def must_have_proper_htmltag(self, filename):
    """Return True when *filename* (an HTML string) contains at least one
    parseable tag.

    Note: this only detects the presence of a tag BeautifulSoup can find;
    it does not check that the document as a whole is valid HTML.

    :param filename: HTML source text to inspect.
    :return: bool -- truthiness of the first tag found, False if none.
    """
    print(filename)
    first_tag = BeautifulSoup(filename, 'html.parser').find()
    return bool(first_tag)
'''BeautifulSoup is a library for pulling data out of HTML or XML;
'html.parser' selects the built-in HTML parser, and find() checks
for the occurrence of at least one tag.'''
# Sample inputs: one complete document, one with unclosed tags.
# (Fixed: removed a stray backtick, renamed ``htmltags`` -> ``htmltag`` to
# match its use below, and joined the string literal that was broken
# across two lines.)
htmltag = ('<html><head><title>Test</title></head>'
           '<body><h1>Parse me!</h1></body></html>')
nohtmltag = '<html><head><title>Test</title></head><body><h1>Parse me!'
print('html checkers:-', qc.must_have_proper_htmltag(htmltag))
print('html checkers:-', qc.must_have_proper_htmltag(nohtmltag))
This function checks whether an HTML tag is present; it doesn't validate whether the HTML itself is well-formed.
How can I validate the HTML? I want output where the complete snippet gives True and the unclosed one gives False.
Answers:
While not an exact match to your requirement, maybe it’s easier to leverage work others have already done. For example:
It does not check for single tags but for the whole of the HTML to be correct, which is apparently what you’re after.
Possibly this approach may help you as well:
import HTMLParser
import urllib
import sys
import urlparse
##################################################
# config
# base_url: root of the site tree to crawl; only links under this prefix
# are followed (see url_normalize).
base_url = 'http://foo.com/bill_reid/'
# depth: number of passes over the pending-URL set in the main loop.
depth = 100
# w3c_validator: validation service queried for each fetched page.
w3c_validator = 'http://validator.w3.org/'
##################################################
# classes and functions
# HTML parser class: collects every in-tree href found in <a> tags into
# the module-level dict ``l`` (url -> "still to crawl" flag).
class parseLinks(HTMLParser.HTMLParser):
    def handle_starttag(self, tag, attrs):
        # Only anchor tags can contribute links.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    url = url_normalize(value)
                    # Fixed: ``dict.has_key`` is Python-2-only and was
                    # removed in Python 3; ``in`` behaves identically on
                    # both.  (Also dropped a stray trailing semicolon.)
                    if url != "" and url not in l:
                        l[url] = True
# HTML parsing function (use the class): fetch *url* and feed the body to
# the link-collecting parser.  Best effort: fetch/parse problems are ignored.
def parse_links(url):
    try:
        lParser = parseLinks()
        lParser.feed(urllib.urlopen(url).read())
        lParser.close()
    except Exception:
        # Fixed: was a bare ``except:``, which also swallowed SystemExit
        # and KeyboardInterrupt.  Keep the deliberate best-effort
        # behaviour, but let interpreter-level exceptions propagate.
        pass
# clean/normalize/reject url
def url_normalize(url):
    """Normalize *url* relative to the page currently being crawled.

    Returns "" to reject the link (mailto:, outside the base tree, or a
    non-HTML resource); otherwise returns the absolute, anchor-free URL.
    """
    candidate = url.strip()
    # Reject email links outright.
    if candidate.startswith('mailto:'):
        return ""
    # Drop any fragment/anchor part.
    candidate = candidate.split('#', 1)[0]
    # Make it absolute with respect to the page it came from, then make
    # sure it stays inside the crawl tree rooted at base_url.
    candidate = urlparse.urljoin(current_url, candidate)
    if not candidate.startswith(base_url):
        return ""
    # Only HTML pages are worth crawling further.
    if urllib.urlopen(candidate).info().gettype() != 'text/html':
        return ""
    return candidate
# W3C validation
def url_w3c_validate(url):
    """Return True when the W3C validator reports *url* as valid HTML."""
    check_url = w3c_validator + 'check?uri=' + url
    status = urllib.urlopen(check_url).info().getheader('x-w3c-validator-status')
    return status == 'Valid'
##################################################
# main
##################################################
# ``l`` maps url -> True while the url still needs crawling; the flag is
# set to False once processed.  ``l_error`` collects urls whose HTML
# failed W3C validation.  (Python 2 script: print statements.)
l = {base_url: True}
l_error = []
n = 0
for i in range(depth):
    # Iterate over a copy because parse_links() mutates ``l`` via the
    # parser's handle_starttag callback.
    for url in l.copy():
        if l[url]:
            n += 1
            # ``current_url`` is a module-level global read by
            # url_normalize() to resolve relative links.
            current_url = url
            print n,
            print "-",
            print current_url,
            print " parsing...",
            parse_links(url)
            print "done -",
            print "validating...",
            is_valid = url_w3c_validate(url)
            print "done ->",
            if is_valid:
                print "Valid"
            else:
                l_error.append(url)
                print "Invalid"
            # Mark this url as processed so later passes skip it.
            l[url] = False
# report: summary of crawl + list of invalid pages
print """
-------------------------------------
URLs parsed: %d
URLS with invalid HTML: %d""" % (len(l), len(l_error))
for url in l_error:
    print url
You can validate it using w3c validator
from py_w3c.validators.html.validator import HTMLValidator

def must_have_proper_htmltag(self, filename):
    """Return True when *filename* (an HTML string) validates cleanly.

    Sends the fragment to the W3C validator via py_w3c.

    :param filename: HTML source text to validate.
    :return: True when the validator reports no errors, else False.
    """
    print(filename)
    html_validator = HTMLValidator()
    html_validator.validate_fragment(filename)
    # ``errors`` is an empty list when the fragment is valid, so the
    # verbose if/else returning True/False collapses to one expression.
    return not html_validator.errors
# Fixed: the string literal was broken across two lines (unterminated);
# use implicit adjacent-literal concatenation inside the call instead.
print('html checkers:-', qc.must_have_proper_htmltag(
    '<!DOCTYPE html><html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'))
I want to validate HTML tag to determine whether it is valid HTML or not.
I tried the following code.
def must_have_proper_htmltag(self, filename):
    """Return True when *filename* (an HTML string) contains at least one
    parseable tag.

    Note: this only detects the presence of a tag BeautifulSoup can find;
    it does not check that the document as a whole is valid HTML.

    :param filename: HTML source text to inspect.
    :return: bool -- truthiness of the first tag found, False if none.
    """
    print(filename)
    first_tag = BeautifulSoup(filename, 'html.parser').find()
    return bool(first_tag)
'''BeautifulSoup is a library for pulling data out of HTML or XML;
'html.parser' selects the built-in HTML parser, and find() checks
for the occurrence of at least one tag.'''
# Sample inputs: one complete document, one with unclosed tags.
# (Fixed: removed a stray backtick, renamed ``htmltags`` -> ``htmltag`` to
# match its use below, and joined the string literal that was broken
# across two lines.)
htmltag = ('<html><head><title>Test</title></head>'
           '<body><h1>Parse me!</h1></body></html>')
nohtmltag = '<html><head><title>Test</title></head><body><h1>Parse me!'
print('html checkers:-', qc.must_have_proper_htmltag(htmltag))
print('html checkers:-', qc.must_have_proper_htmltag(nohtmltag))
This function checks whether an HTML tag is present; it doesn't validate whether the HTML itself is well-formed.
How can I validate the HTML? I want output where the complete snippet gives True and the unclosed one gives False.
While not an exact match to your requirement, maybe it’s easier to leverage work others have already done. For example:
It does not check for single tags but for the whole of the HTML to be correct, which is apparently what you’re after.
Possibly this approach may help you as well:
import HTMLParser
import urllib
import sys
import urlparse
##################################################
# config
# base_url: root of the site tree to crawl; only links under this prefix
# are followed (see url_normalize).
base_url = 'http://foo.com/bill_reid/'
# depth: number of passes over the pending-URL set in the main loop.
depth = 100
# w3c_validator: validation service queried for each fetched page.
w3c_validator = 'http://validator.w3.org/'
##################################################
# classes and functions
# HTML parser class: collects every in-tree href found in <a> tags into
# the module-level dict ``l`` (url -> "still to crawl" flag).
class parseLinks(HTMLParser.HTMLParser):
    def handle_starttag(self, tag, attrs):
        # Only anchor tags can contribute links.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    url = url_normalize(value)
                    # Fixed: ``dict.has_key`` is Python-2-only and was
                    # removed in Python 3; ``in`` behaves identically on
                    # both.  (Also dropped a stray trailing semicolon.)
                    if url != "" and url not in l:
                        l[url] = True
# HTML parsing function (use the class): fetch *url* and feed the body to
# the link-collecting parser.  Best effort: fetch/parse problems are ignored.
def parse_links(url):
    try:
        lParser = parseLinks()
        lParser.feed(urllib.urlopen(url).read())
        lParser.close()
    except Exception:
        # Fixed: was a bare ``except:``, which also swallowed SystemExit
        # and KeyboardInterrupt.  Keep the deliberate best-effort
        # behaviour, but let interpreter-level exceptions propagate.
        pass
# clean/normalize/reject url
def url_normalize(url):
    """Normalize *url* relative to the page currently being crawled.

    Returns "" to reject the link (mailto:, outside the base tree, or a
    non-HTML resource); otherwise returns the absolute, anchor-free URL.
    """
    candidate = url.strip()
    # Reject email links outright.
    if candidate.startswith('mailto:'):
        return ""
    # Drop any fragment/anchor part.
    candidate = candidate.split('#', 1)[0]
    # Make it absolute with respect to the page it came from, then make
    # sure it stays inside the crawl tree rooted at base_url.
    candidate = urlparse.urljoin(current_url, candidate)
    if not candidate.startswith(base_url):
        return ""
    # Only HTML pages are worth crawling further.
    if urllib.urlopen(candidate).info().gettype() != 'text/html':
        return ""
    return candidate
# W3C validation
def url_w3c_validate(url):
    """Return True when the W3C validator reports *url* as valid HTML."""
    check_url = w3c_validator + 'check?uri=' + url
    status = urllib.urlopen(check_url).info().getheader('x-w3c-validator-status')
    return status == 'Valid'
##################################################
# main
##################################################
# ``l`` maps url -> True while the url still needs crawling; the flag is
# set to False once processed.  ``l_error`` collects urls whose HTML
# failed W3C validation.  (Python 2 script: print statements.)
l = {base_url: True}
l_error = []
n = 0
for i in range(depth):
    # Iterate over a copy because parse_links() mutates ``l`` via the
    # parser's handle_starttag callback.
    for url in l.copy():
        if l[url]:
            n += 1
            # ``current_url`` is a module-level global read by
            # url_normalize() to resolve relative links.
            current_url = url
            print n,
            print "-",
            print current_url,
            print " parsing...",
            parse_links(url)
            print "done -",
            print "validating...",
            is_valid = url_w3c_validate(url)
            print "done ->",
            if is_valid:
                print "Valid"
            else:
                l_error.append(url)
                print "Invalid"
            # Mark this url as processed so later passes skip it.
            l[url] = False
# report: summary of crawl + list of invalid pages
print """
-------------------------------------
URLs parsed: %d
URLS with invalid HTML: %d""" % (len(l), len(l_error))
for url in l_error:
    print url
You can validate it using w3c validator
from py_w3c.validators.html.validator import HTMLValidator

def must_have_proper_htmltag(self, filename):
    """Return True when *filename* (an HTML string) validates cleanly.

    Sends the fragment to the W3C validator via py_w3c.

    :param filename: HTML source text to validate.
    :return: True when the validator reports no errors, else False.
    """
    print(filename)
    html_validator = HTMLValidator()
    html_validator.validate_fragment(filename)
    # ``errors`` is an empty list when the fragment is valid, so the
    # verbose if/else returning True/False collapses to one expression.
    return not html_validator.errors
# Fixed: the string literal was broken across two lines (unterminated);
# use implicit adjacent-literal concatenation inside the call instead.
print('html checkers:-', qc.must_have_proper_htmltag(
    '<!DOCTYPE html><html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'))