Skip to content

Commit 5df3bec

Browse files
committed
Public release
1 parent 4930309 commit 5df3bec

36 files changed

+1634
-2
lines changed

.github/dependabot.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
version: 2
2+
updates:
3+
- package-ecosystem: "pip"
4+
directory: "/requirements"
5+
schedule:
6+
interval: "weekly"
7+
8+
- package-ecosystem: "github-actions"
9+
# NOTE(robinson) - Workflow files stored in the
10+
# default location of `.github/workflows`
11+
directory: "/"
12+
schedule:
13+
interval: "weekly"

.github/workflows/ci.yml

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches: [ main, robinson/initial-repo-setup ]
6+
pull_request:
7+
branches: [ main ]
8+
9+
env:
10+
# Quoted so YAML keeps the version a string (an unquoted 3.10 would parse as the float 3.1).
PYTHON_VERSION: "3.8"
11+
12+
jobs:
13+
setup:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/checkout@v3
17+
- uses: actions/cache@v3
18+
id: virtualenv-cache
19+
with:
20+
path: |
21+
.venv
22+
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
23+
- name: Set up Python ${{ env.PYTHON_VERSION }}
24+
uses: actions/setup-python@v4
25+
with:
26+
python-version: ${{ env.PYTHON_VERSION }}
27+
- name: Setup virtual environment (no cache hit)
28+
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
29+
run: |
30+
python${{ env.PYTHON_VERSION }} -m venv .venv
31+
source .venv/bin/activate
32+
make install-ci
33+
34+
lint:
35+
runs-on: ubuntu-latest
36+
needs: setup
37+
steps:
38+
- uses: actions/checkout@v3
39+
- uses: actions/cache@v3
40+
id: virtualenv-cache
41+
with:
42+
path: .venv
43+
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
44+
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
45+
# We can take this out when we implement the fix in CORE-99
46+
- name: Setup virtual environment (no cache hit)
47+
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
48+
run: |
49+
python${{ env.PYTHON_VERSION }} -m venv .venv
50+
source .venv/bin/activate
51+
make install-ci
52+
- name: Lint
53+
run: |
54+
source .venv/bin/activate
55+
make check
56+
57+
shellcheck:
58+
runs-on: ubuntu-latest
59+
steps:
60+
- uses: actions/checkout@v3
61+
- name: ShellCheck
62+
# Pinned to a release tag; a mutable branch ref such as `master` can change underneath CI.
uses: ludeeus/action-shellcheck@2.0.0
63+
64+
test:
65+
runs-on: ubuntu-latest
66+
needs: [setup, lint]
67+
steps:
68+
- uses: actions/checkout@v3
69+
- uses: actions/cache@v3
70+
id: virtualenv-cache
71+
with:
72+
path: |
73+
.venv
74+
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
75+
# NOTE(robinson) - This is a fallback in case the test job does not find the cache.
76+
# We can take this out when we implement the fix in CORE-99
77+
- name: Setup virtual environment (no cache hit)
78+
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
79+
run: |
80+
python${{ env.PYTHON_VERSION }} -m venv .venv
81+
source .venv/bin/activate
82+
make install-ci
83+
- name: Test
84+
run: |
85+
source .venv/bin/activate
86+
make test
87+
make check-coverage
88+
89+
changelog:
90+
runs-on: ubuntu-latest
91+
steps:
92+
- uses: actions/checkout@v3
93+
- if: github.ref != 'refs/heads/main'
94+
uses: dorny/paths-filter@v2
95+
id: changes
96+
with:
97+
filters: |
98+
src:
99+
- 'unstructured_inference/**'
100+
101+
- if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
102+
uses: dangoslen/changelog-enforcer@v3

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,10 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
# Model artifacts
132+
.models/*
133+
!.models/.gitkeep
134+
135+
# Mac stuff
136+
.DS_Store

.models/.gitkeep

Whitespace-only changes.

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
## 0.2.0
2+
3+
* Initial release of unstructured-inference

Makefile

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
PACKAGE_NAME := unstructured_inference
2+
PIP_VERSION := 22.3
3+
4+
5+
.PHONY: help
6+
help: Makefile
7+
@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
8+
9+
10+
###########
11+
# Install #
12+
###########
13+
14+
## install-base: installs core requirements needed for text processing bricks
15+
.PHONY: install-base
16+
install-base: install-base-pip-packages
17+
18+
## install: installs all test, dev, and experimental requirements
19+
.PHONY: install
20+
install: install-base-pip-packages install-dev install-detectron2 install-test
21+
22+
.PHONY: install-ci
23+
install-ci: install-base-pip-packages install-test
24+
25+
.PHONY: install-base-pip-packages
26+
install-base-pip-packages:
27+
python3 -m pip install pip==${PIP_VERSION}
28+
pip install -r requirements/base.txt
29+
30+
.PHONY: install-detectron2
31+
install-detectron2:
32+
pip install "detectron2@git+https://github.com/facebookresearch/[email protected]#egg=detectron2"
33+
34+
.PHONY: install-test
35+
install-test:
36+
pip install -r requirements/test.txt
37+
38+
.PHONY: install-dev
39+
install-dev:
40+
pip install -r requirements/dev.txt
41+
42+
## pip-compile: compiles all base/dev/test requirements
43+
.PHONY: pip-compile
44+
pip-compile:
45+
pip-compile requirements/base.in
46+
# NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not
47+
# the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of
48+
# operations issue related to the torch library causes the install to fail
49+
# Edit base.txt in place (-i.bak works with both GNU and BSD sed); plain sed only prints to stdout
sed -i.bak 's/^detectron2 @/# detectron2 @/g' requirements/base.txt && rm -f requirements/base.txt.bak
50+
pip-compile requirements/dev.in
51+
pip-compile requirements/test.in
52+
53+
#########
54+
# Build #
55+
#########
56+
57+
## docker-build: builds the docker container for detectron2
58+
.PHONY: docker-build
59+
docker-build:
60+
PIP_VERSION=${PIP_VERSION} ./scripts/docker-build.sh
61+
62+
#########
63+
# Local #
64+
#########
65+
66+
## download-models: downloads unstructured models (AWS credentials must be in environment variables)
67+
.PHONY: download-models
68+
download-models:
69+
./scripts/dl-models.sh
70+
71+
## run-app-dev: runs the FastAPI api with hot reloading
72+
.PHONY: run-app-dev
73+
run-app-dev:
74+
PYTHONPATH=. uvicorn unstructured_inference.api:app --reload
75+
76+
## start-app-local: runs FastAPI in the container with hot reloading
77+
.PHONY: start-app-local
78+
start-app-local:
79+
docker run --name=ml-inference-container -p 127.0.0.1:5000:5000 ml-inference-dev
80+
81+
## stop-app-local: stops the container
82+
.PHONY: stop-app-local
83+
stop-app-local:
84+
docker stop ml-inference-container | xargs docker rm
85+
86+
#################
87+
# Test and Lint #
88+
#################
89+
90+
## test: runs all unittests
91+
.PHONY: test
92+
test:
93+
PYTHONPATH=. pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
94+
95+
## check: runs linters (includes tests)
96+
.PHONY: check
97+
check: check-src check-tests check-version
98+
99+
## check-src: runs linters (source only, no tests)
100+
.PHONY: check-src
101+
check-src:
102+
black --line-length 100 ${PACKAGE_NAME} --check
103+
flake8 ${PACKAGE_NAME}
104+
mypy ${PACKAGE_NAME} --ignore-missing-imports
105+
106+
.PHONY: check-tests
107+
check-tests:
108+
black --line-length 100 test_${PACKAGE_NAME} --check
109+
flake8 test_${PACKAGE_NAME}
110+
111+
## check-scripts: run shellcheck
112+
.PHONY: check-scripts
113+
check-scripts:
114+
# Fail if any of these files have warnings
115+
scripts/shellcheck.sh
116+
117+
## check-version: run check to ensure version in CHANGELOG.md matches version in package
118+
.PHONY: check-version
119+
check-version:
120+
# Fail if syncing version would produce changes
121+
scripts/version-sync.sh -c
122+
123+
## tidy: run black
124+
.PHONY: tidy
125+
tidy:
126+
black --line-length 100 ${PACKAGE_NAME}
127+
black --line-length 100 test_${PACKAGE_NAME}
128+
129+
## version-sync: update __version__.py with most recent version from CHANGELOG.md
130+
.PHONY: version-sync
131+
version-sync:
132+
scripts/version-sync.sh
133+
134+
.PHONY: check-coverage
135+
check-coverage:
136+
coverage report --fail-under=95

README.md

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,74 @@
1-
# ml-inference
2-
Repo for housing Unstructured ML inference code
1+
<h3 align="center">
2+
<img src="img/unstructured_logo.png" height="200">
3+
</h3>
4+
5+
<h3 align="center">
6+
<p>Open-Source Pre-Processing Tools for Unstructured Data</p>
7+
</h3>
8+
9+
The `unstructured-inference` repo contains hosted model inference code for layout parsing models.
10+
These models are invoked via API as part of the partitioning bricks in the `unstructured` package.
11+
12+
## Installation
13+
14+
### Package
15+
16+
Requires [`torch>=1.8`](https://pytorch.org/get-started/locally/). Once this is satisfied, run
17+
`pip install unstructured-inference`.
18+
19+
### Repository
20+
21+
Clone the repo and run `make install` to install dependencies.
22+
Run `make help` for a full list of install options.
23+
24+
## Getting Started
25+
26+
To get started with the layout parsing model, use the following commands:
27+
28+
```python
29+
from unstructured_inference.inference.layout import DocumentLayout
30+
31+
layout = DocumentLayout.from_file("sample-docs/loremipsum.pdf")
32+
33+
print(layout.pages[0].elements)
34+
```
35+
36+
Once the model has detected the layout and OCR'd the document, the text extracted from the first
37+
page of the sample document will be displayed.
38+
You can convert a given element to a `dict` by running the `.to_dict()` method.
39+
40+
To build the Docker container, run `make docker-build`. Note that Apple hardware with an M1 chip
41+
has trouble building `Detectron2` on Docker and for best results you should build it on Linux. To
42+
run the API locally, use `make start-app-local`. You can stop the API with `make stop-app-local`.
43+
The API will run at `http://localhost:5000`.
44+
You can then `POST` a PDF file to the API endpoint to see its layout with the command:
45+
```
46+
curl -X 'POST' 'http://localhost:5000/layout/pdf' -F 'file=@<your_pdf_file>' | jq -C . | less -R
47+
```
48+
49+
You can also choose the types of elements you want to return from the output of PDF parsing by
50+
passing a list of types to the `include_elems` parameter. For example, if you only want to return
51+
`Text` elements and `Title` elements, you can curl:
52+
```
53+
curl -X 'POST' 'http://localhost:5000/layout/pdf' \
54+
-F 'file=@<your_pdf_file>' \
55+
-F include_elems=Text \
56+
-F include_elems=Title \
57+
| jq -C . | less -R
58+
```
59+
If you are using an Apple M1 chip, use `make run-app-dev` instead of `make start-app-local` to
60+
start the API with hot reloading. The API will run at `http://localhost:8000`.
61+
62+
View the swagger documentation at `http://localhost:5000/docs`.
63+
## Security Policy
64+
65+
See our [security policy](https://github.com/Unstructured-IO/unstructured-inference/security/policy) for
66+
information on how to report security vulnerabilities.
67+
68+
## Learn more
69+
70+
| Section | Description |
71+
|-|-|
72+
| [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects |
73+
| [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories |
74+
| [Company Website](https://unstructured.io) | Unstructured.io product and company info |

0 commit comments

Comments
 (0)