From cd99c1ed07f30163ff0ad2e01c8dc26ec98d94db Mon Sep 17 00:00:00 2001
From: MomentXu <momentxu@gmail.com>
Date: Fri, 15 Mar 2019 17:24:15 +0800
Subject: [PATCH] lrm and rlm character references throw an exception
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the code parses HTML such as:
<b> &#8206;June, 2016</b>
the program throws an IndexError exception.
I found this bug in the implementation of handle_charref.

handle_data may match against element zero (the first character) of the data, but the lrm and rlm characters are defined as '' (an empty string).
So, when the program indexes element zero of the lrm or rlm character data in the condition below, data[0] raises IndexError:
+++++++++++++++++++++++++++++++++++
        elif (self.preceding_stressed
              and re.match(r'[^\s.!?]', data[0])
              and not hn(self.current_tag)
              and self.current_tag not in ['a', 'code', 'pre']):
+++++++++++++++++++++++++++++++++++
This is the traceback:
Traceback (most recent call last):
  File "get_email.py", line 37, in <module>
    text = h.handle(mail_content_string)  # html格式 转成 markdown 格式
  File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 149, in handle
    self.feed(data)
  File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 146, in feed
    HTMLParser.HTMLParser.feed(self, data)
  File "/usr/lib64/python3.4/html/parser.py", line 165, in feed
    self.goahead(0)
  File "/usr/lib64/python3.4/html/parser.py", line 268, in goahead
    self.handle_charref(name)
  File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 186, in handle_charref
    self.handle_data(self.charref(c), True)
  File "/data/vijay/emailcont_venv/lib/python3.4/site-packages/html2text/__init__.py", line 802, in handle_data
    and re.match(r'[^\s.!?]', data[0])
IndexError: string index out of range
---
 html2text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/html2text.py b/html2text.py
index 17528901..b2c5d54d 100755
--- a/html2text.py
+++ b/html2text.py
@@ -80,7 +80,7 @@ def name2cp(k):
 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
-'lrm':'', 'rlm':''}
+'lrm':' ', 'rlm':' '}
 
 unifiable_n = {}