Multi-table support (#2)

Annmayn · web-flow · commit 7eedbcac638b · 2021-01-25T20:51:54.000+05:45
* update readme; add url in setup

* add get_workbook option

* add consideration for multiple tables

* update requirements

* update gitignore, setup
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,5 @@
 tmp/
 **egg-info/
 build/
-dist/
+dist/
+**tests/
diff --git a/README.md b/README.md
@@ -1,6 +1,11 @@
 # HTML2Excel Documentation
 Library to convert HTML Tables to Excel file.
 
+While libraries like pandas can read HTML files, they often don't handle merged cells and line breaks correctly.
+This library was created with the sole intention of converting HTML tables to Excel files
+so that they look the way they do when opened in software such as MS Excel and LibreOffice.
+
+Sample Flask host code is provided [in this link](https://github.com/Annmayn/sample-host-html2excel).
 
 ## Installation
 ```pip install html2excel```
diff --git a/html2excel/__main__.py b/html2excel/__main__.py
@@ -1,6 +1,7 @@
 import sys
 from html2excel import ExcelParser
 
+
 def run():
     if len(sys.argv) == 3:
         file_path = sys.argv[1]
@@ -11,6 +12,7 @@ def run():
         # First argument for file name, we'll ignore that
         print("Expected 2 arguments. Got {num}".format(num=len(sys.argv)-1))
 
+
 if __name__ == "__main__":
     run()
 
diff --git a/html2excel/base/parser.py b/html2excel/base/parser.py
@@ -9,8 +9,12 @@ def __init__(self, file_path):
         self.wb = Workbook()
         self.ws = self.wb.active
         self.file_path = file_path
+        self.load_workbook()
 
-    def read_file(self):
+    def load_workbook(self):
+        raise NotImplementedError("load_workbook() must be overridden by a subclass")
+
+    def _read_file(self):
         """
         returns the data contained in a file
         """
@@ -21,12 +25,12 @@ def read_file(self):
         except:
             raise Exception("Error while reading input file")
 
-    def get_row(self, table: Tag, tags: Union[List, str]) -> Iterator[Tag]:
+    def _get_row(self, table: Tag, tags: Union[List, str]) -> Iterator[Tag]:
         row_data = table.find_all(tags)
         for each in row_data:
             yield each
 
-    def pre_validate_and_format(self, i: int, j: int, col: Tag) -> Tuple[int, str]:
+    def _pre_validate_and_format(self, i: int, j: int, col: Tag) -> Tuple[int, str]:
         attrs = col.attrs
         end = j
         if "colspan" in attrs:
@@ -38,13 +42,13 @@ def pre_validate_and_format(self, i: int, j: int, col: Tag) -> Tuple[int, str]:
         end += 1
         return (end, col.text.strip())
 
-    def write_cell(self, row, col, val) -> None:
+    def _write_cell(self, row, col, val) -> None:
         self.ws.cell(row=row, column=col).value = val
 
     def get_workbook(self) -> Workbook:
         return self.wb
 
-    def save_workbook(self, loc) -> bool:
+    def _save_workbook(self, loc) -> bool:
         try:
             self.wb.save(loc)
             return True
diff --git a/html2excel/excel/parser.py b/html2excel/excel/parser.py
@@ -6,20 +6,30 @@ class ExcelParser(Parser):
     def __init__(self, file_path: str):
         super().__init__(file_path)
 
-    def to_excel(self, save_file_path: str, ignore_merged_row: bool = True) -> bool:
-        # TODO: handle case when rows are merged
-        data = self.read_file()
-        soup = BeautifulSoup(data, features='html.parser')
-        table_data = soup.table
-        if table_data is None:
+    def load_workbook(self):
+        """Write each top-level <table> of the HTML file into the worksheet."""
+        data = self._read_file()
+        soup = BeautifulSoup(data, features='html5lib')
+        all_data = soup.html.body.find_all(recursive=False)
+        if not any(each.name == 'table' for each in all_data):
             raise Exception("No table found")
-        data_rows = self.get_row(table_data, ["tr"])
-        for i, row in enumerate(data_rows, 1):
-            columns = self.get_row(row, ["th", "td"])
-            next_j = 1
-            for j, col in enumerate(columns, 1):
-                j = next_j
-                next_j, col_data = self.pre_validate_and_format(i, j, col)
-                self.write_cell(i, j, col_data)
+        offset = 0  # absolute index of the last worksheet row used so far
+        for each in all_data:
+            if each.name == 'br':
+                offset += 1  # a top-level <br> leaves one blank row
+            elif each.name == 'table':
+                data_rows = self._get_row(each, ["tr"])
+                # Enumerate from offset + 1 so that i is the absolute row.
+                for i, row in enumerate(data_rows, offset + 1):
+                    columns = self._get_row(row, ["th", "td"])
+                    j = 1
+                    for col in columns:
+                        next_j, col_data = self._pre_validate_and_format(
+                            i, j, col)
+                        self._write_cell(i, j, col_data)
+                        j = next_j
+                    offset = i  # safe: enumerate's start is already bound
 
-        self.save_workbook(save_file_path)
+    def to_excel(self, save_file_path: str, ignore_merged_row: bool = True) -> bool:
+        """Save the workbook to save_file_path. TODO: handle case when rows are merged."""
+        return self._save_workbook(save_file_path)
diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,4 @@ certifi==2020.12.5
 et-xmlfile==1.0.1
 jdcal==1.4.1
 openpyxl==3.0.5
+html5lib==1.1
diff --git a/setup.py b/setup.py
@@ -4,14 +4,15 @@
     readme = f.read()
 
 setup(
-    name = "html2excel",
-    version = "0.0.4",
-    author = "Neema Tsering",
-    author_email = "ntvirus333@gmail.com",
-    description = ("Convert HTML Table to Excel file"),
-    long_description = readme,
-    long_description_content_type = "text/markdown",
-    install_requires = ['bs4', 'openpyxl'],
-    license = "MIT",
-    packages = find_packages(),
-)
+    name="html2excel",
+    version="0.0.5",
+    author="Neema Tsering",
+    author_email="ntvirus333@gmail.com",
+    description=("Convert HTML Table to Excel file"),
+    long_description=readme,
+    long_description_content_type="text/markdown",
+    install_requires=['bs4', 'openpyxl', 'html5lib'],
+    license="MIT",
+    packages=find_packages(),
+    url="https://github.com/Annmayn/html2excel",
+)