HTML entity fixer
25th of November 2004
Here's a little program I wrote recently that converts characters which should have been written as HTML entities into their proper entity form. For example, this is incorrect:
<b>Bärs & Öl</b>
But this is correct:
<b>B&auml;rs &amp; &Ouml;l</b>
To demonstrate, I have set up a little test page here where you can try converting your own impure HTML content.
Run test program
Here's the source code for the program:
import re
from htmlentitydefs import entitydefs
# Reverse lookup table: replacement text -> entity name.
entitydefs_inverted = dict([(char, name) for name, char in entitydefs.items()])

# Matches any replacement text listed in entitydefs.  Every alternative is
# escaped so that no value can ever be misread as regex syntax.
_badchars_regex = re.compile(
    '|'.join([re.escape(char) for char in entitydefs.values()]))

# Matches something that already looks like a named (&name;) or decimal
# (&#123;) character reference.  Raw string, because '\w' in a plain string
# literal is an invalid string escape.
_been_fixed_regex = re.compile(r'&\w+;|&#[0-9]+;')


def html_entity_fixer(text, skipchars=(), extra_careful=1):
    """Return *text* with special characters rewritten as named HTML entities.

    Every character that has an entry in ``entitydefs`` is replaced by
    ``&name;``.

    Args:
        text: The string to convert.
        skipchars: Characters to leave untouched.  A single string is
            treated as one entry, not as a sequence of characters.  Note
            that '&' is always escaped, even when it is listed here.
        extra_careful: When true, a text that already contains something
            that looks like a character reference is returned unchanged,
            so an already converted string is not escaped a second time.

    Returns:
        The converted string, or *text* itself when ``extra_careful``
        decided it was converted already.
    """
    # If extra_careful we don't attempt to do anything to the string if it
    # might have been converted already.
    if extra_careful and _been_fixed_regex.search(text):
        return text

    if isinstance(skipchars, str):
        skipchars = [skipchars]

    # Collect each distinct convertible character once.
    to_fix = {}
    for char in _badchars_regex.findall(text):
        if char not in skipchars:
            to_fix[char] = 1

    # Ampersands must be escaped first: every entity inserted below contains
    # an '&' of its own that must not be escaped again.
    text = text.replace('&', '&amp;')
    # '\x80' is mapped by hand -- presumably the Windows-1252 euro sign.
    text = text.replace('\x80', '&euro;')

    for char in to_fix:
        if char == '&':
            # Already handled above.
            continue
        text = text.replace(char, '&%s;' % entitydefs_inverted[char])
    return text
# Reverse lookup table: replacement text -> entity name.
entitydefs_inverted = dict([(char, name) for name, char in entitydefs.items()])

# Matches any replacement text listed in entitydefs.  Every alternative is
# escaped so that no value can ever be misread as regex syntax.
_badchars_regex = re.compile(
    '|'.join([re.escape(char) for char in entitydefs.values()]))

# Matches something that already looks like a named (&name;) or decimal
# (&#123;) character reference.  Raw string, because '\w' in a plain string
# literal is an invalid string escape.
_been_fixed_regex = re.compile(r'&\w+;|&#[0-9]+;')


def html_entity_fixer(text, skipchars=(), extra_careful=1):
    """Return *text* with special characters rewritten as named HTML entities.

    Every character that has an entry in ``entitydefs`` is replaced by
    ``&name;``.

    Args:
        text: The string to convert.
        skipchars: Characters to leave untouched.  A single string is
            treated as one entry, not as a sequence of characters.  Note
            that '&' is always escaped, even when it is listed here.
        extra_careful: When true, a text that already contains something
            that looks like a character reference is returned unchanged,
            so an already converted string is not escaped a second time.

    Returns:
        The converted string, or *text* itself when ``extra_careful``
        decided it was converted already.
    """
    # If extra_careful we don't attempt to do anything to the string if it
    # might have been converted already.
    if extra_careful and _been_fixed_regex.search(text):
        return text

    if isinstance(skipchars, str):
        skipchars = [skipchars]

    # Collect each distinct convertible character once.
    to_fix = {}
    for char in _badchars_regex.findall(text):
        if char not in skipchars:
            to_fix[char] = 1

    # Ampersands must be escaped first: every entity inserted below contains
    # an '&' of its own that must not be escaped again.
    text = text.replace('&', '&amp;')
    # '\x80' is mapped by hand -- presumably the Windows-1252 euro sign.
    text = text.replace('\x80', '&euro;')

    for char in to_fix:
        if char == '&':
            # Already handled above.
            continue
        text = text.replace(char, '&%s;' % entitydefs_inverted[char])
    return text
Comment
Show all 9 comments. Commenting is currently disabled in the mobile version.