diff --git a/.eggs/README.txt b/.eggs/README.txt new file mode 100644 index 0000000..5d01668 --- /dev/null +++ b/.eggs/README.txt @@ -0,0 +1,6 @@ +This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins. + +This directory caches those eggs to prevent repeated downloads. + +However, it is safe to delete this directory. + diff --git a/htmlslacker/htmlslacker.py b/htmlslacker/htmlslacker.py index a81c5de..c104c4e 100644 --- a/htmlslacker/htmlslacker.py +++ b/htmlslacker/htmlslacker.py @@ -4,6 +4,7 @@ except ImportError: from HTMLParser import HTMLParser from htmlentitydefs import name2codepoint +import re LINEBR = "::LINEBR::" @@ -23,6 +24,9 @@ def __init__(self, html, *args, **kwargs): except TypeError: HTMLParser.__init__(self, *args, **kwargs) self.skip = False + self.isProcessingList = False + self.isProcessingOrderedList = False + self.orderedNumber = 0 # slackified string self.output = '' @@ -43,9 +47,11 @@ def handle_starttag(self, tag, attrs): if tag == 'br' or tag == 'p': self.output += LINEBR if tag == 'b' or tag == 'strong': - self.output += '*' + self.output += ' *' + if re.match("h[1-6]{1}", tag): + self.output += ' *' if tag == 'i' or tag == 'em': - self.output += '_' + self.output += ' _' if tag == 'code': self.output += '`' if tag == 'a': @@ -55,6 +61,16 @@ def handle_starttag(self, tag, attrs): self.output += attr[1] + '|' if tag == 'style' or tag == 'script': self.skip = True + if tag == 'ul': + self.isProcessingList = True + if tag == 'li' and self.isProcessingList: + self.output += '• ' + if tag == 'ol': + self.orderedNumber = 1 + self.isProcessingOrderedList = True + if tag == 'li' and self.isProcessingOrderedList: + self.output += '{}. '.format(self.orderedNumber) + self.orderedNumber = self.orderedNumber + 1 def handle_endtag(self, tag): """ @@ -63,15 +79,25 @@ def handle_endtag(self, tag): :return: """ if tag == 'b' or tag == 'strong': - self.output += '*' + self.output += '* ' + if re.match("h[1-6]{1}", tag): + self.output += '* '+LINEBR if tag == 'i' or tag == 'em': - self.output += '_' + self.output += '_ ' if tag == 'a': self.output += '>' if tag == 'code': self.output += '`' if tag == 'style' or tag == 'script': self.skip = False + if tag == 'ul': + self.isProcessingList = False + if tag == 'li' and self.isProcessingList: + self.output += LINEBR + if tag == 'ol': + self.isProcessingOrderedList = False + if tag == 'li' and self.isProcessingOrderedList: + self.output += LINEBR def handle_data(self, data): """ @@ -105,4 +131,12 @@ def get_output(self): link: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python :return: """ - return ' '.join(self.output.split()).replace(LINEBR, "\n") + output = self.output + output = re.sub(r'\*(\s\*)+', '*', output) + output = re.sub(r'_( _)+', '_', output) + output = output.replace('[] ', '☐ ').replace('[x] ', '☑︎ ') + output = ' '.join(output.split()) + output = output.replace(LINEBR, "\n") + output = re.sub(r' *\n *', '\n', output) + output = output.strip() + return output diff --git a/test_general.py b/test_general.py index 3bd2a2f..8f80fe1 100644 --- a/test_general.py +++ b/test_general.py @@ -11,7 +11,7 @@ def test_example_1(): link in a paragraph!
""" - expected = "*Hello*\n There is _something_ interesting about `this doc` \n And