From cd99c1ed07f30163ff0ad2e01c8dc26ec98d94db Mon Sep 17 00:00:00 2001 From: MomentXu Date: Fri, 15 Mar 2019 17:24:15 +0800 Subject: [PATCH] lrm character and rlm character throw exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the code parses HTML in which a left-to-right mark (LRM) or right-to-left mark (RLM) character reference precedes text, for example a numeric reference such as &#8206; directly before "June, 2016", the program throws an IndexError exception. The bug is reached through handle_charref, which passes the converted character to handle_data. handle_data may inspect the first character of its data (data[0]), but the lrm and rlm characters are defined as '' (the empty string). So, when the program evaluates the following condition on lrm or rlm character data, data[0] is out of range: +++++++++++++++++++++++++++++++++++ elif (self.preceding_stressed and re.match(r'[^\s.!?]', data[0]) and not hn(self.current_tag) and self.current_tag not in ['a', 'code', 'pre']): +++++++++++++++++++++++++++++++++++ This is the traceback: Traceback (most recent call last): File "get_email.py", line 37, in <module> text = h.handle(mail_content_string) # html格式 转成 markdown 格式 File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 149, in handle self.feed(data) File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 146, in feed HTMLParser.HTMLParser.feed(self, data) File "/usr/lib64/python3.4/html/parser.py", line 165, in feed self.goahead(0) File "/usr/lib64/python3.4/html/parser.py", line 268, in goahead self.handle_charref(name) File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 186, in handle_charref self.handle_data(self.charref(c), True) File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 802, in handle_data and re.match(r'[^\s.!?]', data[0]) IndexError: string index out of range --- html2text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html2text.py b/html2text.py index 17528901..b2c5d54d 100755 --- a/html2text.py +++ b/html2text.py @@ -80,7 +80,7 @@ def name2cp(k): 'igrave':'i', 'iacute':'i', 'icirc':'i', 
'iuml':'i', 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u', -'lrm':'', 'rlm':''} +'lrm':' ', 'rlm':' '} unifiable_n = {}