Skip to content

Commit 7eedbca

Browse files
authored
Multi-table support (#2)
* update readme; add url in setup * add get_workbook option * add consideration for multiple tables * update requirements * update gitignore, setup
1 parent a9e6334 commit 7eedbca

File tree

7 files changed

+56
-32
lines changed

7 files changed

+56
-32
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
tmp/
55
**egg-info/
66
build/
7-
dist/
7+
dist/
8+
**tests/

README.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
# HTML2Excel Documentation
22
A library for converting HTML tables to Excel files.
33

4+
While libraries like pandas can read HTML files, they often mishandle merged cells and line breaks.
5+
This library was created with the sole intention of converting HTML tables to Excel files
6+
as they appear when opened with software such as MS Excel and LibreOffice.
7+
8+
Sample Flask host code is available [in this repository](https://github.com/Annmayn/sample-host-html2excel).
49

510
## Installation
611
```pip install html2excel```

html2excel/__main__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import sys
22
from html2excel import ExcelParser
33

4+
45
def run():
56
if len(sys.argv) == 3:
67
file_path = sys.argv[1]
@@ -11,6 +12,7 @@ def run():
1112
# First argument for file name, we'll ignore that
1213
print("Expected 2 arguments. Got {num}".format(num=len(sys.argv)-1))
1314

15+
1416
if __name__ == "__main__":
1517
run()
1618

html2excel/base/parser.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,12 @@ def __init__(self, file_path):
99
self.wb = Workbook()
1010
self.ws = self.wb.active
1111
self.file_path = file_path
12+
self.load_workbook()
1213

13-
def read_file(self):
14+
def load_workbook(self):
15+
raise NotImplemented
16+
17+
def _read_file(self):
1418
"""
1519
returns the data contained in a file
1620
"""
@@ -21,12 +25,12 @@ def read_file(self):
2125
except:
2226
raise Exception("Error while reading input file")
2327

24-
def get_row(self, table: Tag, tags: Union[List, str]) -> Iterator[Tag]:
28+
def _get_row(self, table: Tag, tags: Union[List, str]) -> Iterator[Tag]:
2529
row_data = table.find_all(tags)
2630
for each in row_data:
2731
yield each
2832

29-
def pre_validate_and_format(self, i: int, j: int, col: Tag) -> Tuple[int, str]:
33+
def _pre_validate_and_format(self, i: int, j: int, col: Tag) -> Tuple[int, str]:
3034
attrs = col.attrs
3135
end = j
3236
if "colspan" in attrs:
@@ -38,13 +42,13 @@ def pre_validate_and_format(self, i: int, j: int, col: Tag) -> Tuple[int, str]:
3842
end += 1
3943
return (end, col.text.strip())
4044

41-
def write_cell(self, row, col, val) -> None:
45+
def _write_cell(self, row, col, val) -> None:
4246
self.ws.cell(row=row, column=col).value = val
4347

4448
def get_workbook(self) -> Workbook:
4549
return self.wb
4650

47-
def save_workbook(self, loc) -> bool:
51+
def _save_workbook(self, loc) -> bool:
4852
try:
4953
self.wb.save(loc)
5054
return True

html2excel/excel/parser.py

+25-15
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,30 @@ class ExcelParser(Parser):
66
def __init__(self, file_path: str):
77
super().__init__(file_path)
88

9-
def to_excel(self, save_file_path: str, ignore_merged_row: bool = True) -> bool:
10-
# TODO: handle case when rows are merged
11-
data = self.read_file()
12-
soup = BeautifulSoup(data, features='html.parser')
13-
table_data = soup.table
14-
if table_data is None:
9+
def load_workbook(self):
10+
data = self._read_file()
11+
soup = BeautifulSoup(data, features='html5lib')
12+
13+
all_data = soup.html.body.find_all(recursive=False)
14+
if all_data is None:
1515
raise Exception("No table found")
16-
data_rows = self.get_row(table_data, ["tr"])
17-
for i, row in enumerate(data_rows, 1):
18-
columns = self.get_row(row, ["th", "td"])
19-
next_j = 1
20-
for j, col in enumerate(columns, 1):
21-
j = next_j
22-
next_j, col_data = self.pre_validate_and_format(i, j, col)
23-
self.write_cell(i, j, col_data)
16+
i, offset = 0, 0
17+
for each in all_data:
18+
if each.name == 'br':
19+
offset += 1
20+
elif each.name == 'table':
21+
data_rows = self._get_row(each, ["tr"])
22+
for i, row in enumerate(data_rows, 1):
23+
i += offset
24+
columns = self._get_row(row, ["th", "td"])
25+
next_j = 1
26+
for j, col in enumerate(columns, 1):
27+
j = next_j
28+
next_j, col_data = self._pre_validate_and_format(
29+
i, j, col)
30+
self._write_cell(i, j, col_data)
31+
offset += i
2432

25-
self.save_workbook(save_file_path)
33+
def to_excel(self, save_file_path: str, ignore_merged_row: bool = True) -> bool:
34+
# TODO: handle case when rows are merged
35+
self._save_workbook(save_file_path)

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ certifi==2020.12.5
22
et-xmlfile==1.0.1
33
jdcal==1.4.1
44
openpyxl==3.0.5
5+
html5lib==1.1

setup.py

+12-11
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
readme = f.read()
55

66
setup(
7-
name = "html2excel",
8-
version = "0.0.4",
9-
author = "Neema Tsering",
10-
author_email = "[email protected]",
11-
description = ("Convert HTML Table to Excel file"),
12-
long_description = readme,
13-
long_description_content_type = "text/markdown",
14-
install_requires = ['bs4', 'openpyxl'],
15-
license = "MIT",
16-
packages = find_packages(),
17-
)
7+
name="html2excel",
8+
version="0.0.5",
9+
author="Neema Tsering",
10+
author_email="[email protected]",
11+
description=("Convert HTML Table to Excel file"),
12+
long_description=readme,
13+
long_description_content_type="text/markdown",
14+
install_requires=['bs4', 'openpyxl', 'html5lib'],
15+
license="MIT",
16+
packages=find_packages(),
17+
url="https://github.com/Annmayn/html2excel",
18+
)

0 commit comments

Comments
 (0)