Unstructured-IO
diff --git a/‎.github/dependabot.yml
Lines changed: 13 additions & 0 deletions b/‎.github/dependabot.yml
Lines changed: 13 additions & 0 deletions
diff --git a/‎.github/workflows/ci.yml
Lines changed: 102 additions & 0 deletions b/‎.github/workflows/ci.yml
Lines changed: 102 additions & 0 deletions
diff --git a/‎.gitignore
Lines changed: 7 additions & 0 deletions b/‎.gitignore
Lines changed: 7 additions & 0 deletions
diff --git a/‎.models/.gitkeep b/‎.models/.gitkeep
diff --git a/‎CHANGELOG.md
Lines changed: 3 additions & 0 deletions b/‎CHANGELOG.md
Lines changed: 3 additions & 0 deletions
diff --git a/‎Makefile
Lines changed: 136 additions & 0 deletions b/‎Makefile
Lines changed: 136 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 74 additions & 2 deletions b/‎README.md
Lines changed: 74 additions & 2 deletions
@@ -0,0 +1,13 @@
+# Dependabot configuration: weekly update checks for the pip requirement
+# files and for the GitHub Actions referenced by the workflows.
+version: 2
+updates:
+  # Python requirements kept under /requirements
+  - package-ecosystem: 'pip'
+    directory: '/requirements'
+    schedule:
+      interval: 'weekly'
+
+  # NOTE(robinson) - Workflow files stored in the
+  # default location of `.github/workflows`
+  - package-ecosystem: 'github-actions'
+    directory: '/'
+    schedule:
+      interval: 'weekly'
@@ -0,0 +1,102 @@
+# Runs on pushes to the listed branches and on pull requests targeting main.
+name: CI
+
+on:
+  push:
+    branches: [ main, robinson/initial-repo-setup ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  # Quoted so YAML keeps the version as a string rather than parsing it as a
+  # float (an unquoted value such as 3.10 would collapse to 3.1).
+  PYTHON_VERSION: "3.8"
+
+jobs:
+  # Builds the virtualenv in .venv when the cache step does not restore it.
+  # The cache key combines runner OS, Python version and the hash of
+  # requirements/*.txt.
+  setup:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - id: virtualenv-cache
+      uses: actions/cache@v3
+      with:
+        key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
+        path: |
+          .venv
+    - name: Set up Python ${{ env.PYTHON_VERSION }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    # Skipped when the cache step reports an exact hit.
+    - name: Setup virtual environment (no cache hit)
+      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
+      run: |
+        python${{ env.PYTHON_VERSION }} -m venv .venv
+        source .venv/bin/activate
+        make install-ci
+
+  # Runs `make check` inside the cached virtualenv once the setup job is done.
+  lint:
+    runs-on: ubuntu-latest
+    needs: setup
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/cache@v3
+      id: virtualenv-cache
+      with:
+        path: .venv
+        key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
+    # The fallback step below invokes the versioned python executable, so make
+    # sure that interpreter is installed on this runner as well.
+    - name: Set up Python ${{ env.PYTHON_VERSION }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    # NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
+    # We can take this out when we implement the fix in CORE-99
+    - name: Setup virtual environment (no cache hit)
+      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
+      run: |
+        python${{ env.PYTHON_VERSION }} -m venv .venv
+        source .venv/bin/activate
+        make install-ci
+    - name: Lint
+      run: |
+        source .venv/bin/activate
+        make check
+
+  # Runs the ShellCheck action against the checked-out repository.
+  shellcheck:
+    runs-on: ubuntu-latest
+    steps:
+    # Same checkout major version as every other job in this workflow.
+    - uses: actions/checkout@v3
+    # NOTE(review): this action tracks the moving `master` branch; consider
+    # pinning it to a release tag or commit SHA for reproducible runs.
+    - name: ShellCheck
+      uses: ludeeus/action-shellcheck@master
+
+  # Runs the unit tests and the coverage gate after setup and lint succeed.
+  test:
+    runs-on: ubuntu-latest
+    needs: [setup, lint]
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/cache@v3
+      id: virtualenv-cache
+      with:
+        path: |
+          .venv
+        key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
+    # The fallback step below invokes the versioned python executable, so make
+    # sure that interpreter is installed on this runner as well.
+    - name: Set up Python ${{ env.PYTHON_VERSION }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    # NOTE(robinson) - This is a fallback in case the test job does not find the cache.
+    # We can take this out when we implement the fix in CORE-99
+    - name: Setup virtual environment (no cache hit)
+      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
+      run: |
+        python${{ env.PYTHON_VERSION }} -m venv .venv
+        source .venv/bin/activate
+        make install-ci
+    - name: Test
+      run: |
+        source .venv/bin/activate
+        make test
+        make check-coverage
+
+  # Runs the changelog-enforcer action when files under unstructured_inference/
+  # changed and the ref is not main.
+  changelog:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    # Exposes `steps.changes.outputs.src` for the path filter named `src`.
+    - id: changes
+      if: github.ref != 'refs/heads/main'
+      uses: dorny/paths-filter@v2
+      with:
+        filters: |
+          src:
+            - 'unstructured_inference/**'
+
+    - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
+      uses: dangoslen/changelog-enforcer@v3
@@ -127,3 +127,10 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Model artifacts
+.models/*
+!.models/.gitkeep
+
+# Mac stuff
+.DS_Store
@@ -0,0 +1,3 @@
+## 0.2.0
+
+* Initial release of unstructured-inference
@@ -0,0 +1,136 @@
+# Package directory used by the test/lint targets, and the pip release that
+# the install targets pin.
+PACKAGE_NAME := unstructured_inference
+PIP_VERSION := 22.3
+
+
+# Prints every line of this Makefile that starts with "## " followed by a
+# letter, with the leading "## " marker removed.
+.PHONY: help
+help: Makefile
+	@sed -n 's/^## \([a-zA-Z]\)/\1/p' $<
+
+
+###########
+# Install #
+###########
+
+## install-base:            installs core requirements needed for text processing bricks
+.PHONY: install-base
+install-base: install-base-pip-packages
+
+## install:                 installs all test, dev, and experimental requirements
+.PHONY: install
+install: install-base-pip-packages install-dev install-detectron2 install-test
+
+# Reduced install used by the CI workflow: base and test requirements only.
+.PHONY: install-ci
+install-ci: install-base-pip-packages install-test
+
+# Pins pip to PIP_VERSION, then installs the compiled base requirements.
+.PHONY: install-base-pip-packages
+install-base-pip-packages:
+	python3 -m pip install pip==$(PIP_VERSION)
+	pip install -r requirements/base.txt
+
+# NOTE(review): the URL below looks mangled by e-mail obfuscation
+# ("[email protected]"); presumably it should read
+# `detectron2.git@<ref>#egg=detectron2` - confirm the ref against upstream.
+.PHONY: install-detectron2
+install-detectron2:
+	pip install "detectron2@git+https://github.com/facebookresearch/[email protected]#egg=detectron2"
+
+# Installs the compiled test requirements.
+.PHONY: install-test
+install-test:
+	pip install -r requirements/test.txt
+
+# Installs the compiled dev requirements.
+.PHONY: install-dev
+install-dev:
+	pip install -r requirements/dev.txt
+
+## pip-compile:             compiles all base/dev/test requirements
+.PHONY: pip-compile
+pip-compile:
+	pip-compile requirements/base.in
+	# NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not
+	# the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of
+	# operations issue related to the torch library causes the install to fail
+	# Edit the file in place: without -i sed only prints the result to stdout and
+	# base.txt keeps the detectron2 line. The -i.bak form works with both GNU and
+	# BSD sed; the backup file is removed right after.
+	sed -i.bak 's/^detectron2 @/# detectron2 @/g' requirements/base.txt
+	rm -f requirements/base.txt.bak
+	pip-compile requirements/dev.in
+	pip-compile requirements/test.in
+
+#########
+# Build #
+#########
+
+## docker-build:            builds the docker container for detectron2
+.PHONY: docker-build
+# PIP_VERSION is handed to the build script through its environment.
+docker-build:
+	PIP_VERSION=$(PIP_VERSION) ./scripts/docker-build.sh
+
+#########
+# Local #
+#########
+
+# The colon after the target name matches every other "##" help entry, so
+# `make help` prints a uniformly formatted list.
+## download-models:         downloads unstructured models (AWS credentials must be in environment variables)
+.PHONY: download-models
+download-models:
+	./scripts/dl-models.sh
+
+## run-app-dev:             runs the FastAPI api with hot reloading
+.PHONY: run-app-dev
+run-app-dev:
+	PYTHONPATH=. uvicorn unstructured_inference.api:app --reload
+
+## start-app-local:         runs FastAPI in the container with hot reloading
+.PHONY: start-app-local
+start-app-local:
+	docker run --name=ml-inference-container -p 127.0.0.1:5000:5000 ml-inference-dev
+
+## stop-app-local:          stops the container
+.PHONY: stop-app-local
+# `docker stop` prints the container name, which xargs passes to `docker rm`.
+stop-app-local:
+	docker stop ml-inference-container | xargs docker rm
+
+#################
+# Test and Lint #
+#################
+
+## test:                    runs all unittests
+.PHONY: test
+test:
+	PYTHONPATH=. pytest test_$(PACKAGE_NAME) --cov=$(PACKAGE_NAME) --cov-report term-missing
+
+## check:                   runs linters (includes tests)
+.PHONY: check
+check: check-src check-tests check-version
+
+## check-src:               runs linters (source only, no tests)
+.PHONY: check-src
+check-src:
+	black --line-length 100 $(PACKAGE_NAME) --check
+	flake8 $(PACKAGE_NAME)
+	mypy $(PACKAGE_NAME) --ignore-missing-imports
+
+# Formatting and flake8 checks for the test package only.
+.PHONY: check-tests
+check-tests:
+	black --line-length 100 test_$(PACKAGE_NAME) --check
+	flake8 test_$(PACKAGE_NAME)
+
+## check-scripts:           run shellcheck
+.PHONY: check-scripts
+# Fail if any of these files have warnings
+check-scripts:
+	scripts/shellcheck.sh
+
+## check-version:           run check to ensure version in CHANGELOG.md matches version in package
+.PHONY: check-version
+# Fail if syncing version would produce changes
+check-version:
+	scripts/version-sync.sh -c
+
+## tidy:                    run black
+.PHONY: tidy
+tidy:
+	black --line-length 100 $(PACKAGE_NAME)
+	black --line-length 100 test_$(PACKAGE_NAME)
+
+## version-sync:            update __version__.py with most recent version from CHANGELOG.md
+.PHONY: version-sync
+version-sync:
+	scripts/version-sync.sh
+
+# Fails when total coverage reported by `coverage report` is below 95.
+.PHONY: check-coverage
+check-coverage:
+	coverage report --fail-under=95
@@ -1,2 +1,74 @@
-# ml-inference
-Repo for housing Unstructured ML inference code
+<h3 align="center">
+  <img src="img/unstructured_logo.png" height="200">
+</h3>
+
+<h3 align="center">
+  <p>Open-Source Pre-Processing Tools for Unstructured Data</p>
+</h3>
+
+The `unstructured-inference` repo contains hosted model inference code for layout parsing models. 
+These models are invoked via API as part of the partitioning bricks in the `unstructured` package.
+
+## Installation
+
+### Package
+
+Requires [`torch>=1.8`](https://pytorch.org/get-started/locally/). Once this is satisfied, run 
+`pip install unstructured-inference`.
+
+### Repository
+
+Clone the repo and run `make install` to install dependencies.
+Run `make help` for a full list of install options.
+
+## Getting Started
+
+To get started with the layout parsing model, use the following commands:
+
+```python
+from unstructured_inference.inference.layout import DocumentLayout
+
+layout = DocumentLayout.from_file("sample-docs/loremipsum.pdf")
+
+print(layout.pages[0].elements)
+```
+
+Once the model has detected the layout and OCR'd the document, the text extracted from the first 
+page of the sample document will be displayed.
+You can convert a given element to a `dict` by running the `.to_dict()` method.
+
+To build the Docker container, run `make docker-build`. Note that Apple hardware with an M1 chip 
+has trouble building `Detectron2` on Docker and for best results you should build it on Linux. To 
+run the API locally, use `make start-app-local`. You can stop the API with `make stop-app-local`. 
+The API will run at `http://localhost:5000`.
+You can then `POST` a PDF file to the API endpoint to see its layout with the command:
+```
+curl -X 'POST' 'http://localhost:5000/layout/pdf' -F 'file=@<your_pdf_file>' | jq -C . | less -R
+```
+
+You can also choose the types of elements you want to return from the output of PDF parsing by 
+passing a list of types to the `include_elems` parameter. For example, if you only want to return 
+`Text` elements and `Title` elements, you can curl:
+```
+curl -X 'POST' 'http://localhost:5000/layout/pdf' \
+-F 'file=@<your_pdf_file>' \
+-F include_elems=Text \
+-F include_elems=Title \
+ | jq -C | less -R
+```
+If you are using an Apple M1 chip, use `make run-app-dev` instead of `make start-app-local` to 
+start the API with hot reloading. The API will run at `http://localhost:8000`.
+
+View the swagger documentation at `http://localhost:5000/docs`.
+## Security Policy
+
+See our [security policy](https://github.com/Unstructured-IO/unstructured-inference/security/policy) for
+information on how to report security vulnerabilities.
+
+## Learn more
+
+| Section | Description |
+|-|-|
+| [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects  |
+| [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories |
+| [Company Website](https://unstructured.io) | Unstructured.io product and company info |
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+## 0.2.0`
	`2`	`+`
	`3`	`+* Initial release of unstructured-inference`