-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathhtmlslacker.py
142 lines (124 loc) · 4.25 KB
/
htmlslacker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
try:
from html.parser import HTMLParser
from html.entities import name2codepoint
except ImportError:
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
import re
# Placeholder for a line break. It contains no whitespace, so it survives the
# whitespace collapsing done in HTMLSlacker.get_output() and is only turned
# into a real "\n" afterwards.
LINEBR = "::LINEBR::"

try:
    _unichr = unichr  # Python 2: chr() only covers code points 0-255
except NameError:
    _unichr = chr  # Python 3: chr() already covers all of Unicode


class HTMLSlacker(HTMLParser):
    """
    Convert an HTML fragment into Slack message markup.

    The HTML is parsed immediately by the constructor; call ``get_output()``
    to obtain the converted text.

    >>> from htmlslacker import HTMLSlacker
    >>> HTMLSlacker('<b>Hello</b>, <i>Slack</i>!').get_output()
    '*Hello* , _Slack_ !'
    """

    # Tag groups that map onto the same Slack markup.
    _LINE_BREAK_TAGS = ('br', 'p')
    _BOLD_TAGS = ('b', 'strong')
    _ITALIC_TAGS = ('i', 'em')
    _SKIPPED_TAGS = ('style', 'script')
    _LIST_TAGS = ('ul', 'ol')
    # Anchored at the end so that only h1..h6 match (not e.g. "h1foo").
    _HEADING_TAG = re.compile(r'h[1-6]\Z')

    def __init__(self, html, *args, **kwargs):
        """
        Parse ``html`` right away and accumulate the result in ``self.output``.

        :param html: HTML string to convert
        :param args: extra positional arguments forwarded to ``HTMLParser``
        :param kwargs: extra keyword arguments forwarded to ``HTMLParser``
        """
        # call parent constructor __init__
        try:
            super().__init__(*args, **kwargs)
        except TypeError:
            # Python 2: zero-argument super() is not available
            HTMLParser.__init__(self, *args, **kwargs)
        # True while inside <style>/<script>; their text is not emitted
        self.skip = False
        # State of the innermost open list, kept in sync with self._lists
        self.isProcessingList = False
        self.isProcessingOrderedList = False
        self.orderedNumber = 0
        # Stack of open lists, innermost last: [tag, next item number]
        self._lists = []
        # slackified string
        self.output = ''
        # send to HTMLParser feed function to parse HTML string
        self.feed(html)
        # Flush text the parser is still buffering. Without this, trailing
        # text such as "AT&T" (an "&" with no terminator) is silently dropped.
        self.close()

    def _sync_list_flags(self):
        """Mirror the innermost entry of the list stack into the public flags."""
        innermost = self._lists[-1] if self._lists else None
        self.isProcessingList = bool(innermost) and innermost[0] == 'ul'
        self.isProcessingOrderedList = bool(innermost) and innermost[0] == 'ol'
        self.orderedNumber = innermost[1] if self.isProcessingOrderedList else 0

    def _open_list(self, kind):
        """Push a new ``ul``/``ol`` onto the list stack."""
        if self._lists:
            # A nested list starts on its own line instead of being glued
            # to the text of the enclosing item.
            self.output += LINEBR
        self._lists.append([kind, 1])
        self._sync_list_flags()

    def _close_list(self, kind):
        """Pop the nearest open list of ``kind`` (and anything nested in it)."""
        for depth in range(len(self._lists) - 1, -1, -1):
            if self._lists[depth][0] == kind:
                del self._lists[depth:]
                break
        self._sync_list_flags()

    def _start_list_item(self):
        """Emit the bullet or number for an ``li`` of the innermost list."""
        if not self._lists:
            # <li> outside of any list: no marker
            return
        current = self._lists[-1]
        if current[0] == 'ul':
            self.output += '• '
        else:
            self.output += '{}. '.format(current[1])
            current[1] += 1
            self.orderedNumber = current[1]

    def handle_starttag(self, tag, attrs):
        """
        Create slack markdown
        https://api.slack.com/docs/message-formatting
        :param tag: incoming tag, that will be switched into slack supported markdown
        :param attrs: list of (name, value) pairs; used to recover the href of an anchor
        :return:
        """
        if tag in self._LINE_BREAK_TAGS:
            self.output += LINEBR
        elif tag in self._BOLD_TAGS or self._HEADING_TAG.match(tag):
            self.output += ' *'
        elif tag in self._ITALIC_TAGS:
            self.output += ' _'
        elif tag == 'code':
            self.output += '`'
        elif tag == 'a':
            self.output += '<'
            for name, value in attrs:
                if name == 'href':
                    # value is None for a bare ``<a href>``; treat as empty
                    self.output += (value or '') + '|'
                    # only the first href counts if the attribute is repeated
                    break
        elif tag in self._SKIPPED_TAGS:
            self.skip = True
        elif tag in self._LIST_TAGS:
            self._open_list(tag)
        elif tag == 'li':
            self._start_list_item()

    def handle_endtag(self, tag):
        """
        https://api.slack.com/docs/message-formatting
        :param tag: endtag. Close tag via markdown
        :return:
        """
        if tag in self._BOLD_TAGS:
            self.output += '* '
        elif self._HEADING_TAG.match(tag):
            # headings are rendered bold and always end the line
            self.output += '* ' + LINEBR
        elif tag in self._ITALIC_TAGS:
            self.output += '_ '
        elif tag == 'a':
            self.output += '>'
        elif tag == 'code':
            self.output += '`'
        elif tag in self._SKIPPED_TAGS:
            self.skip = False
        elif tag in self._LIST_TAGS:
            self._close_list(tag)
        elif tag == 'li' and self._lists:
            self.output += LINEBR

    def handle_data(self, data):
        """
        concatenate TEXT nodes into output
        :param data: text between tags
        :return:
        """
        if not self.skip:
            self.output += data

    def handle_comment(self, data):
        """HTML comments are dropped."""
        pass

    def handle_entityref(self, name):
        """
        Append the character for a named entity such as ``&amp;``.

        Only called by the parser when it does not convert character
        references itself (``convert_charrefs=False``, or Python 2).
        :param name: entity name without ``&`` and ``;``
        :return:
        """
        if self.skip:
            return
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: keep it as text rather than raising KeyError.
            self.output += '&{};'.format(name)
        else:
            self.output += _unichr(codepoint)

    def handle_charref(self, name):
        """
        Append the character for a numeric reference such as ``&#65;`` or ``&#x41;``.

        Only called by the parser when it does not convert character
        references itself (``convert_charrefs=False``, or Python 2).
        :param name: reference without ``&#`` and ``;`` (e.g. ``65`` or ``x41``)
        :return:
        """
        if self.skip:
            return
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            self.output += _unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a valid code point: keep the reference as text.
            self.output += '&#{};'.format(name)

    def handle_decl(self, data):
        """Declarations such as ``<!DOCTYPE html>`` are dropped."""
        pass

    def get_output(self):
        """
        Return the converted text with whitespace normalised.

        substitute multiple whitespace with single whitespace
        link: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
        :return: the slack-formatted string
        """
        output = self.output
        # merge markers produced by directly nested bold / italic tags
        output = re.sub(r'\*(\s\*)+', '*', output)
        output = re.sub(r'_( _)+', '_', output)
        # textual checkboxes become check box symbols
        output = output.replace('[] ', '☐ ').replace('[x] ', '☑︎ ')
        # collapse all whitespace runs (including source newlines) to one space
        output = ' '.join(output.split())
        # only now turn the placeholders into real line breaks
        output = output.replace(LINEBR, "\n")
        output = re.sub(r' *\n *', '\n', output)
        output = output.strip()
        return output