Skip to content

Add tests #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 7, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -9,22 +9,34 @@ on:
jobs:
  # Single job: install the project with Poetry and run the pytest suite.
  test:
    runs-on: ubuntu-latest

    steps:
      # Current major versions of the official actions; the older v3/v4 majors
      # run on a deprecated Node runtime.
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'

      - name: Install Poetry
        uses: snok/install-poetry@v1

      - name: Install dependencies
        run: |
          poetry install

      - name: Run tests
        # The test suite reads these values from the environment; they are
        # supplied from the repository's configured variables and secrets.
        env:
          TEXTUAL_HOST: ${{ vars.TEXTUAL_HOST }}
          TEXTUAL_API_KEY: ${{ secrets.TEXTUAL_API_KEY }}
          S3_UPLOAD_ACCESS_KEY: ${{ secrets.S3_UPLOAD_ACCESS_KEY }}
          S3_UPLOAD_SECRET_KEY: ${{ secrets.S3_UPLOAD_SECRET_KEY }}
          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
          AZURE_ACCOUNT_KEY: ${{ secrets.AZURE_ACCOUNT_KEY }}
          AZURE_ACCOUNT_NAME: ${{ secrets.AZURE_ACCOUNT_NAME }}
          DATABRICKS_URL: ${{ secrets.DATABRICKS_URL }}
          DATABRICKS_ACCESS_TOKEN: ${{ secrets.DATABRICKS_ACCESS_TOKEN }}
          S3_UPLOAD_BUCKET: ${{ secrets.S3_UPLOAD_BUCKET }}
          S3_OUTPUT_BUCKET: ${{ secrets.S3_OUTPUT_BUCKET }}
        run: |
          poetry run pytest
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -132,4 +132,7 @@ dmypy.json
.pyre/

# Rider project settings
.idea/
.idea/

# Ruff
.ruff_cache/
1,317 changes: 1,306 additions & 11 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -24,6 +24,17 @@ ruff = "^0.7.3"
pytest = "^7.4.4"
pytest-dotenv = "^0.5.2"
pytest-regressions = "^2.4.3"
# Numpy (which Pandas depends on) put out a version (1.21.1) that breaks on Python 3.11, but they did not mark it as incompatible with 3.11.
# In a following release (1.21.2), Numpy did mark it as incompatible with 3.11 to fix the issue.
# However, Poetry still tries to install the incompatible version as it's the only one that meets the requirements for ^3.7 when your machine is running Python 3.11.
# So we need to specify different versions of Pandas for different versions of Python to avoid it installing the faulty version.
pandas = [
{version = "^1.1.5", python = ">=3.7,<3.8"},
{version = "^2.0.3", python = ">=3.8,<4.0"}
]
pymupdf = "^1.22.5"
boto3 = "^1.33.13"
azure-storage-blob = "12.19.1"

[build-system]
requires = ["poetry-core>=1.0.0"]
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
filterwarnings =
ignore::DeprecationWarning
Empty file removed tests/__init__.py
Empty file.
138 changes: 138 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import os
from typing import Generator, List, Tuple
import pytest
import uuid
import time
import boto3

from tonic_textual.parse_api import TonicTextualParse
from tonic_textual.redact_api import TonicTextual
from dotenv import load_dotenv
from tonic_textual.classes.common_api_responses.single_detection_result import (
SingleDetectionResult,
)

from tests.utils.resource_utils import get_resource_path


def assert_spans_match_python_indices(s: str, spans: List[SingleDetectionResult]):
    """
    Verify that each span's reported text equals the slice of ``s`` it points at.

    For every span, ``span["text"]`` must be identical to
    ``s[span["start"]:span["end"]]`` using ordinary Python string indexing.
    The first mismatch fails with an ``AssertionError``.
    """
    for span in spans:
        reported = span["text"]
        sliced = s[span["start"] : span["end"]]
        assert reported == sliced


def wait_for_file_processing(
    textual: TonicTextual,
    dataset_name: str,
    timeout_seconds: float = 600.0,
    poll_interval_seconds: float = 5.0,
):
    """
    Block until the named dataset has no queued and no running files.

    The dataset is re-fetched on every poll so the queued/running lists
    reflect its current state. The dataset is always polled at least once.

    Args:
        textual: Client used to look up the dataset by name.
        dataset_name: Name of the dataset to poll.
        timeout_seconds: Upper bound on the total wait, so a file that never
            finishes fails the test run instead of hanging it forever.
        poll_interval_seconds: Pause between consecutive polls.

    Raises:
        TimeoutError: If files are still queued or running once
            ``timeout_seconds`` have elapsed.
    """
    deadline = time.monotonic() + timeout_seconds
    while True:
        dataset = textual.get_dataset(dataset_name)
        queued_files = dataset.get_queued_files()
        running_files = dataset.get_running_files()

        if not queued_files and not running_files:
            print("All files processed.")
            return

        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout_seconds}s waiting for files in "
                f"dataset '{dataset_name}' to finish processing"
            )
        time.sleep(poll_interval_seconds)


@pytest.fixture(scope="session", autouse=True)
def load_env():
    """Session-wide, automatically applied fixture that loads the ``.env`` file next to this module."""
    # Resolve the path relative to this file rather than the working directory.
    tests_dir = os.path.dirname(__file__)
    load_dotenv(os.path.join(tests_dir, ".env"))


@pytest.fixture(scope="module")
def textual():
    """
    Module-scoped ``TonicTextual`` client configured from the environment.

    Reads ``TEXTUAL_HOST`` and ``TEXTUAL_API_KEY`` (both required). ``verify``
    is True only when ``GITHUB_ACTIONS`` is exactly ``"true"``.
    NOTE(review): ``verify`` presumably toggles TLS certificate verification;
    confirm against the client.
    """
    running_in_ci = os.environ.get("GITHUB_ACTIONS") == "true"
    host = os.environ["TEXTUAL_HOST"]
    key = os.environ["TEXTUAL_API_KEY"]
    return TonicTextual(base_url=host, api_key=key, verify=running_in_ci)


@pytest.fixture(scope="module")
def textual_parse():
    """
    Module-scoped ``TonicTextualParse`` client configured from the environment.

    Reads ``TEXTUAL_HOST`` and ``TEXTUAL_API_KEY`` (both required) and passes
    them positionally. ``verify`` is True only when ``GITHUB_ACTIONS`` is
    exactly ``"true"``.
    """
    running_in_ci = os.environ.get("GITHUB_ACTIONS") == "true"
    host = os.environ["TEXTUAL_HOST"]
    key = os.environ["TEXTUAL_API_KEY"]
    return TonicTextualParse(host, key, verify=running_in_ci)


@pytest.fixture(scope="module")
def setup_bill_gates_txt_dataset(
    textual,
) -> Generator[Tuple[TonicTextual, str, str], None, None]:
    """
    Module-scoped fixture that provisions a dataset from the Bill Gates text resource.

    The dataset name carries a random UUID suffix. Creation, the yielded
    ``(textual, dataset_name, dataset_path)`` tuple and teardown are all
    delegated to ``setup_dataset``.
    """
    unique_name = f"bill_gates-{uuid.uuid4()}"
    source_file = get_resource_path("William Henry Gates III (born Octob.txt")
    yield from setup_dataset(unique_name, source_file, textual)


@pytest.fixture(scope="module")
def pipeline_with_files(textual_parse):
    """Module-scoped fixture returning a uniquely named pipeline built by ``setup_pipeline``."""
    unique_name = f"pipeline-{uuid.uuid4()}"
    return setup_pipeline(unique_name, textual_parse)


@pytest.fixture(scope="module")
def s3_boto_client():
    """
    Module-scoped boto3 S3 client.

    Credentials and region come from the required environment variables
    ``S3_UPLOAD_ACCESS_KEY``, ``S3_UPLOAD_SECRET_KEY`` and ``AWS_DEFAULT_REGION``.
    """
    client_options = {
        "aws_access_key_id": os.environ["S3_UPLOAD_ACCESS_KEY"],
        "aws_secret_access_key": os.environ["S3_UPLOAD_SECRET_KEY"],
        "region_name": os.environ["AWS_DEFAULT_REGION"],
    }
    return boto3.client("s3", **client_options)


def setup_dataset(
    dataset_name, dataset_path, textual
) -> Generator[Tuple[TonicTextual, str, str], None, None]:
    """
    Create a dataset, upload one file, wait for processing, and clean up afterwards.

    Intended to be driven by a generator fixture via ``yield from``.

    Args:
        dataset_name: Name of the dataset to create.
        dataset_path: Path of the file to add to the dataset.
        textual: Client used to create, query and delete the dataset.

    Yields:
        ``(textual, dataset_name, dataset_path)`` once the file has been
        processed without failures.

    Raises:
        AssertionError: If the dataset reports any failed files.
    """
    dataset = textual.create_dataset(dataset_name)
    # Everything after creation sits inside try/finally so the dataset is
    # deleted even when the upload, the wait, or the failed-files check raises
    # before the yield; otherwise it would be leaked.
    try:
        dataset.add_file(dataset_path)
        wait_for_file_processing(textual, dataset_name)
        failed_files = textual.get_dataset(dataset_name).get_failed_files()
        assert len(failed_files) == 0, "Expected no failed files"
        yield textual, dataset_name, dataset_path
    finally:
        # Runs after the last test that uses the fixture, or on setup failure.
        textual.delete_dataset(dataset_name)


def setup_pipeline(pipeline_name, textual_parse):
    """
    Create a local pipeline, upload the sample resource files, and wait for a completed run.

    Args:
        pipeline_name: Name of the local pipeline to create.
        textual_parse: Client exposing ``create_local_pipeline``.

    Returns:
        The created pipeline, after at least one run reported "Completed".

    Raises:
        Exception: If no run completes within 60 polls spaced one second apart.
    """
    pipeline = textual_parse.create_local_pipeline(pipeline_name)

    resource_names = (
        "multiple_sheets_multiple_cells_with_inline_strings.xlsx",
        "utterances_twocol.csv",
        "chat_transcript.txt",
        "Sample Invoice.pdf",
        "ocean_report.docx",
    )
    for resource_name in resource_names:
        with open(get_resource_path(resource_name), "rb") as handle:
            payload = handle.read()
            pipeline.add_file(payload, resource_name)

    # Poll until at least one run has completed, giving up after 60 attempts.
    for _ in range(60):
        runs = pipeline.get_runs()
        completed_runs = [run for run in runs if run.status == "Completed"]
        if completed_runs:
            break
        print(f"Runs:{len(runs)}; Successful:{len(completed_runs)}")
        time.sleep(1)
    else:
        raise Exception("Failed to process uploaded files")

    # This sleep can be removed later. Polling currently stops as soon as one
    # job has completed, but depending on de-bounce times for local files the
    # five uploads may be spread across multiple jobs, so the real fix is to
    # count the total number of files processed.
    time.sleep(10)
    return pipeline
Binary file added tests/resources/Coachella.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/resources/Redacted_Robin_Hood.pdf
Binary file not shown.
Binary file added tests/resources/Robin_Hood.pdf
Binary file not shown.
Binary file added tests/resources/Sample Invoice.pdf
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/resources/William Henry Gates III (born Octob.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
William Henry Gates III (born October 28, 1955) is an American business magnate, investor, philanthropist, and writer best known for co-founding the software giant Microsoft, along with his childhood friend Paul Allen.[2][3] During his career at Microsoft, Gates held the positions of chairman, chief executive officer (CEO), president, and chief software architect, while also being its largest individual shareholder until May 2014.[4] He was a major entrepreneur of the microcomputer revolution of the 1970s and 1980s.

Gates was born and raised in Seattle, Washington. In 1975, he and Allen founded Microsoft in Albuquerque, New Mexico. It later became the world's largest personal computer software company.[5][a] Gates led the company as its chairman and chief executive officer until stepping down as CEO in January 2000, succeeded by Steve Ballmer, but he remained chairman of the board of directors and became chief software architect.[8] During the late 1990s, he was criticized for his business tactics, which were considered anti-competitive. This opinion has been upheld by numerous court rulings.[9] In June 2008, Gates transitioned into a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation he and his then-wife Melinda had established in 2000.[10] He stepped down as chairman of the Microsoft board in February 2014 and assumed the role of technology adviser to support newly appointed CEO Satya Nadella.[11] In March 2020, Gates left his board positions at Microsoft and Berkshire Hathaway to focus on his philanthropic efforts on climate change, global health and development, and education.[12]

Since 1987, Gates has been included in the Forbes list of the world's billionaires.[13][14] From 1995 to 2017, he held the Forbes title of the richest person in the world every year except in 2008 and from 2010 to 2013.[15] In October 2017, he was surpassed by Amazon founder and CEO Jeff Bezos, who had an estimated net worth of US$90.6 billion compared to Gates's net worth of US$89.9 billion at the time.[16] In the Forbes 400 list of wealthiest Americans in 2023, he was ranked 6th with a wealth of $111.0 billion.[17] As of September 2023, Gates has an estimated net worth of US$123 billion, making him the fourth-richest person in the world according to Bloomberg Billionaires Index.[18]

Later in his career and since leaving day-to-day operations at Microsoft in 2008, Gates has pursued other business and philanthropic endeavors. He is the founder and chairman of several companies, including BEN, Cascade Investment, TerraPower, bgC3, and Breakthrough Energy. He has donated sizable amounts of money to various charitable organizations and scientific research programs through the Bill & Melinda Gates Foundation, reported to be the world's largest private charity.[19] Through the foundation, he led an early 21st century vaccination campaign that significantly contributed to the eradication of the wild poliovirus in Africa.[20][21] In 2010, Gates and Warren Buffett founded The Giving Pledge, whereby they and other billionaires pledge to give at least half of their wealth to philanthropy.[22]
11 changes: 11 additions & 0 deletions tests/resources/chat_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
agent: Hi, welcome to Nike customer support. This is Shannon how may I help you?
customer: I'm trying to place a return but don't have a shipping label.
agent: I can help with that, who am I speaking with?
customer: This is John Lemy
agent: Can you spell your last name for me?
customer: Sure, L E M Y
agent: sorry, was than an M or N?
customer: M as in man.
agent: Thanks so much. If you can give me your address I'll have a box with a return label sent to your house.
customer: Sure, its 5843 abbot drive. A B B O T drive sacremento california 94234
agent: Thanks so much, we'll get that out today or tomorrow.
3 changes: 3 additions & 0 deletions tests/resources/emoji_file.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Today is my birthday. 🌮 Thanks 🙏 Adam🙏🏼! I live in Atlanta
🙏🏼 Adam Kamor
The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary.
1 change: 1 addition & 0 deletions tests/resources/emoji_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Today is my birthday. 🌮 Thanks 🙏 Adam🙏🏼! I live in Atlanta. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary. The remainder of this text is not interesting. It contains nothing sensitive and is just meant to add ascii characters so we do not incorrectly treat this file as binary.
Binary file not shown.
Binary file added tests/resources/ocean_report.docx
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/resources/simple_file.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
my name is adam kamor. I live in atlanta.
my name is andrew colombi. I live in san francisco.
1 change: 1 addition & 0 deletions tests/resources/simple_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
my name is adam kamor. I live in atlanta.
1 change: 1 addition & 0 deletions tests/resources/simple_text_with_no_pii.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
There is no pii here.
5 changes: 5 additions & 0 deletions tests/resources/utterances_twocol.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"my name is john smith. i live in atlanta, ga.", "i am married to sally"
my name is sally smith. I am married to John and also live in atlanta., He is 45 years old
I was born on January 1 1987., Nothing here
Richard Jenkins is the name of a guy but he isn't related to the ci/cd product Jenkins, \N
I gotta head over to Walmart so I can pick up 7 apples, Oops, I meant I'll go to Target
12 changes: 12 additions & 0 deletions tests/sample.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Retrieve these values from 1Password; the note is called "pytest env file".
TEXTUAL_HOST=http://localhost:9002
TEXTUAL_API_KEY=
S3_UPLOAD_ACCESS_KEY=
S3_UPLOAD_SECRET_KEY=
S3_UPLOAD_BUCKET=
S3_OUTPUT_BUCKET=
AWS_DEFAULT_REGION=us-east-1
AZURE_ACCOUNT_KEY=
AZURE_ACCOUNT_NAME=
DATABRICKS_URL=
DATABRICKS_ACCESS_TOKEN=
Loading