sanitize.py

"""
sanitize: bringing sanitiy to world of messed-up data
"""

__author__ = ["Mark Pilgrim <http://diveintomark.org/>", 
              "Aaron Swartz <http://www.aaronsw.com/>"]
__contributors__ = ["Sam Ruby <http://intertwingly.net/>"]
__license__ = "BSD"
__version__ = "0.33"

_debug = 0

# If you want sanitize to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

import sgmllib, re, urlparse, sys

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1

    _chardet = lambda data: chardet.detect(data)['encoding']
except:
    chardet = None
    _chardet = lambda data: None

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']
    
    _r_barebang = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE)
    _r_bareamp = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    _r_shorttag = re.compile(r'<([^<\s]+?)\s*/>')
    
    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)
        
    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'
        
    def feed(self, data):
        if _debug: sys.stderr.write('_BaseHTMLProcessor, feed, data=%s\n' % repr(data))
        data = self._r_barebang.sub(r'&lt;!\1', data)
        data = self._r_bareamp.sub("&amp;", data)
        data = self._r_shorttag.sub(self._shorttag_replace, data) 
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        
        def attrquote(data):
            data = self._r_bareamp.sub("&amp;", data)
            data = data.replace('"', '&quot;')
            return data
        
        newattrs = []
        # hack to reverse attribute decoding in py2.5
        for key, value in attrs:
            newvalue = []
            for c in value:
                if ord(c) > 127:
                    c = '&#' + str(ord(c)) + ';'
                newvalue.append(c)
            newattrs.append((key, ''.join(newvalue)))
        strattrs = ''.join([' %s="%s"' % (key, attrquote(value)) for key, value in newattrs])
        
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())
        
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)
        
    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())
        
    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())
        
    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join(self.pieces) 
        # used to be: [str(p) for p in self.pieces]
        # not sure why... -- ASw

class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'textarea', 'tbody', 'td', 'tfoot', 'th', 
      'thead', 'tr', 'tt', 'u', 'ul', 'var']
    
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']
    
    # http://www.iana.org/assignments/uri-schemes.html
    acceptable_uri_schemes = [
      'cid', 'crid', 'data', 'dav', 'dict', 'dns', 'fax', 
      'ftp', 'go', 'gopher', 'h323', 'http', 'https', 'im',
      'imap', 'info', 'ipp', 'iris.beep', 'ldap', 'mailto',
      'mid', 'modem', 'news', 'nfs', 'nntp', 'pres', 'rtsp',
      'sip', 'sips', 'snmp', 'tag', 'tel', 'telnet', 'tftp', 
      'urn',
      
      # unspecified
      # http://esw.w3.org/topic/UriSchemes
      
      'aim', 'irc', 'feed', 'webcal']

    ignorable_elements = ['script', 'applet', 'style']
    
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]
    
    def __init__(self, baseuri, encoding, required_attributes=None):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri
        self.required_attributes = required_attributes
        # urlparse caches URL parsing for some reason
        # and its cache doesn't distinguish between Unicode and non-unicode
        # so it caches the Unicode version feedparser sends it
        # which causes breakage
        urlparse._parse_cache = {}

    def resolveURI(self, uri):
        if ':' in uri:
            scheme, rest = uri.split(':', 1)
            if scheme not in self.acceptable_uri_schemes:
                uri = '#' + rest
        if self.baseuri:
            return urlparse.urljoin(self.baseuri, uri)
        else:
            return uri

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.tag_stack = []
        self.ignore_level = 0

    def feed(self, data):
        _BaseHTMLProcessor.feed(self, data)
        while self.tag_stack:
            _BaseHTMLProcessor.unknown_endtag(self, self.tag_stack.pop())
        
    def unknown_starttag(self, tag, attrs):
        if tag in self.ignorable_elements:
            self.ignore_level += 1
            return
        
        if self.ignore_level:
            return
        
        if tag in self.acceptable_elements:
            attrs = self.normalize_attrs(attrs)
            attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
            attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
            if self.required_attributes and tag in self.required_attributes:
                attrs = [(key, value) for key, value in attrs if key not in [k for k, v in self.required_attributes[tag]]]
                attrs += self.required_attributes[tag]
            
            if tag not in self.elements_no_end_tag:
                self.tag_stack.append(tag)
            _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
        
    def unknown_endtag(self, tag):
        if tag in self.ignorable_elements:
            self.ignore_level -= 1
            return
        
        if self.ignore_level:
            return
        
        if tag in self.acceptable_elements and tag not in self.elements_no_end_tag:
            match = False
            while self.tag_stack:
                top = self.tag_stack.pop()
                if top == tag:
                    match = True
                    break
                _BaseHTMLProcessor.unknown_endtag(self, top)

            if match:
                _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.ignore_level:
            text = text.replace('<', '')
            _BaseHTMLProcessor.handle_data(self, text)

def HTML(htmlSource, encoding='utf8', baseuri=None, required_attributes=None, addnofollow=False):
    if not required_attributes:
        required_attributes = {}
    if addnofollow:
        required_attributes['a'] = [('rel', 'nofollow')]
    p = _HTMLSanitizer(baseuri, encoding, required_attributes)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data

unicode_bom_map = {
  '\x00\x00\xfe\xff': 'utf-32be',
  '\xff\xfe\x00\x00': 'utf-32le',
  '\xfe\xff##': 'utf-16be',
  '\xff\xfe##': 'utf-16le',
  '\xef\bb\bf': 'utf-8'
}
xml_bom_map = {
  '\x00\x00\x00\x3c': 'utf-32be',
  '\x3c\x00\x00\x00': 'utf-32le',
  '\x00\x3c\x00\x3f': 'utf-16be',
  '\x3c\x00\x3f\x00': 'utf-16le',
  '\x3c\x3f\x78\x6d': 'utf-8', # or equivalent
  '\x4c\x6f\xa7\x94': 'ebcdic'
}

_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

def _startswithbom(text, bom):
    for i, c in enumerate(bom):
        if c == '#':
            if text[i] == '\x00':
                return False
        else:
            if text[i] != c:
                return False
    return True

def _detectbom(text, bom_map=unicode_bom_map):
    for bom, encoding in bom_map.iteritems():
        if _startswithbom(text, bom):
            return encoding
    return None

def characters(text, isXML=False, guess=None):
    """
    Takes a string text of unknown encoding and tries to 
    provide a Unicode string for it.
    """
    _triedEncodings = []
    def tryEncoding(encoding):
        if encoding and encoding not in _triedEncodings:
            if encoding == 'ebcdic':
                return _ebcdic_to_ascii(text)
            try:
                return unicode(text, encoding)
            except UnicodeDecodeError:
                pass
            _triedEncodings.append(encoding)
    
    return (
      tryEncoding(guess) or 
      tryEncoding(_detectbom(text)) or 
      isXML and tryEncoding(_detectbom(text, xml_bom_map)) or
      tryEncoding(_chardet(text)) or
      tryEncoding('utf8') or
      tryEncoding('windows-1252') or
      tryEncoding('iso-8859-1'))