Skip to content

fix: DEV-2523: Support webhook data loading in NER ml backend example #138

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions label_studio_ml/examples/bert/bert_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@

from label_studio_ml.model import LabelStudioMLBase

from utils import prepare_texts, calc_slope

from utils import prepare_texts, calc_slope, get_annotated_dataset

if torch.cuda.is_available():
device = torch.device("cuda")
Expand Down Expand Up @@ -128,6 +127,10 @@ def predict(self, tasks, **kwargs):
return predictions

def fit(self, completions, workdir=None, cache_dir=None, **kwargs):
# check if training is from web hook and load tasks from api
if kwargs.get('data'):
project_id = kwargs['data']['project']['id']
completions = get_annotated_dataset(project_id)
input_texts = []
output_labels, output_labels_idx = [], []
label2idx = {l: i for i, l in enumerate(self.labels)}
Expand Down
7 changes: 7 additions & 0 deletions label_studio_ml/examples/flair/ner_ml_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import os

# writing class with inheritance
from label_studio_ml.utils import get_annotated_dataset


class SequenceTaggerModel(LabelStudioMLBase):
def __init__(self, **kwargs):
#initialize base class
Expand Down Expand Up @@ -87,6 +90,10 @@ def convert_to_ls_annotation(self, flair_sentences):
return results

def fit(self, completions, workdir=None, **kwargs):
# check if training is from web hook
if kwargs.get('data'):
project_id = kwargs['data']['project']['id']
completions = get_annotated_dataset(project_id)
#completions contain ALL the annotated samples.
#train a model from scratch here.
flair_sents = []
Expand Down
3 changes: 3 additions & 0 deletions label_studio_ml/examples/mmdetection/mmdetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ def predict(self, tasks, **kwargs):
'score': avg_score
}]

def fit(self, completions, workdir=None, **kwargs):
    """Training hook that performs no training.

    All arguments (``completions``, ``workdir`` and any extra keyword
    arguments) are accepted but ignored; an empty dict is always returned.
    """
    train_output = {}
    return train_output


def json_load(file, int_keys=False):
with io.open(file, encoding='utf8') as f:
Expand Down
11 changes: 9 additions & 2 deletions label_studio_ml/examples/ner/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
from transformers import AdamW, get_linear_schedule_with_warmup

from label_studio_ml.model import LabelStudioMLBase
from label_studio_ml.utils import get_annotated_dataset
from utils import calc_slope


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -342,7 +342,7 @@ def __init__(self, **kwargs):
self.to_name = self.info['to_name'][0]
self.value = self.info['inputs'][0]['value']

if not self.train_output:
if not self.train_output or (not self.train_output.get('model_path')):
self.labels = self.info['labels']
else:
self.load(self.train_output)
Expand Down Expand Up @@ -464,6 +464,13 @@ def fit(
warmup_steps=0, save_steps=50, dump_dataset=True, cache_dir='~/.heartex/cache', train_logs=None,
**kwargs
):
# check if training is from web hook and load tasks from api
if kwargs.get('data'):
project_id = kwargs['data']['project']['id']
completions = get_annotated_dataset(project_id)
# assert that there are annotations
assert len(completions) > 0

train_logs = train_logs or os.path.join(workdir, 'train_logs')
os.makedirs(train_logs, exist_ok=True)
logger.debug('Prepare models')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from torchvision import models, transforms

from label_studio_ml.model import LabelStudioMLBase
from label_studio_ml.utils import get_single_tag_keys, get_choice, is_skipped, get_local_path
from label_studio_ml.utils import get_single_tag_keys, get_choice, is_skipped, get_local_path, get_annotated_dataset

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Expand Down Expand Up @@ -177,6 +177,10 @@ def predict(self, tasks, **kwargs):
return predictions

def fit(self, completions, workdir=None, batch_size=32, num_epochs=10, **kwargs):
# check if training is from web hook and load tasks from api
if kwargs.get('data'):
project_id = kwargs['data']['project']['id']
completions = get_annotated_dataset(project_id)
image_urls, image_classes = [], []
print('Collecting annotations...')
for completion in completions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, **kwargs):
self.to_name = self.info['to_name'][0]
self.value = self.info['inputs'][0]['value']

if not self.train_output:
if (not self.train_output) or (self.train_output and not self.train_output.get('model_file')):
# If there has been no training yet, cold-start with the simple TF-IDF text classifier
self.reset_model()
# This is an array of <Choice> labels
Expand Down Expand Up @@ -102,7 +102,7 @@ def _get_annotated_dataset(self, project_id):
return json.loads(response.content)

def fit(self, annotations, workdir=None, **kwargs):
# check if training is from web hook
# check if training is from web hook and load tasks from api
if kwargs.get('data'):
project_id = kwargs['data']['project']['id']
tasks = self._get_annotated_dataset(project_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,7 @@ def _extract_meta(task):
meta['start'] = task['value']['start']
meta['end'] = task['value']['end']
return meta

def fit(self, completions, workdir=None, **kwargs):
    """Placeholder training step that only reports a random number.

    ``completions``, ``workdir`` and any extra keyword arguments are accepted
    but not used. The returned dict has a single ``'random'`` key holding an
    integer drawn from 1..10 inclusive.
    """
    # Per the original comment, this dict is meant to be saved as the
    # training output in the job result.
    lucky_number = random.randint(1, 10)
    return {'random': lucky_number}
7 changes: 6 additions & 1 deletion label_studio_ml/examples/tensorflow/mobilenet_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

from PIL import Image
from label_studio_ml.model import LabelStudioMLBase
from label_studio_ml.utils import get_image_local_path, get_single_tag_keys, get_choice, is_skipped
from label_studio_ml.utils import get_image_local_path, get_single_tag_keys, get_choice, is_skipped, \
get_annotated_dataset

logger = logging.getLogger(__name__)
feature_extractor_model = 'https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4'
Expand Down Expand Up @@ -62,6 +63,10 @@ def predict(self, tasks, **kwargs):
}]

def fit(self, completions, workdir=None, **kwargs):
# check if training is from web hook and load tasks from api
if kwargs.get('data'):
project_id = kwargs['data']['project']['id']
completions = get_annotated_dataset(project_id)

annotations = []
for completion in completions:
Expand Down
6 changes: 6 additions & 0 deletions label_studio_ml/examples/tesseract/tesseract.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import random

from PIL import Image
import pytesseract as pt
from label_studio_ml.model import LabelStudioMLBase
Expand Down Expand Up @@ -74,3 +76,7 @@ def _extract_meta(task):
meta["original_width"] = task['original_width']
meta["original_height"] = task['original_height']
return meta

def fit(self, completions, workdir=None, **kwargs):
    """Stub training hook: no model is trained here.

    The arguments are ignored. The result is a dict with one key,
    ``'random'``, mapped to an integer between 1 and 10 inclusive.
    """
    # Stand-in for real training outputs that would go into the job result.
    demo_value = random.randint(1, 10)
    return {'random': demo_value}
7 changes: 4 additions & 3 deletions label_studio_ml/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@
from rq.job import Job
from colorama import Fore

from label_studio_tools.core.utils.params import get_bool_env
from label_studio_tools.core.utils.params import get_bool_env, get_env
from label_studio_tools.core.label_config import parse_config
from label_studio_tools.core.utils.io import get_local_path

logger = logging.getLogger(__name__)

LABEL_STUDIO_ML_BACKEND_V2_DEFAULT = False
LABEL_STUDIO_STRICT_ERRORS = get_env("LS_STRICT_ERRORS", False)

@attr.s
class ModelWrapper(object):
Expand Down Expand Up @@ -189,12 +190,12 @@ def _get_result_from_job_id(self, job_id):
if not os.path.exists(job_dir):
logger.warning(f"=> Warning: {job_id} dir doesn't exist. "
f"It seems that you don't have specified model dir.")
return None
return None if LABEL_STUDIO_STRICT_ERRORS else {}
result_file = os.path.join(job_dir, self.JOB_RESULT)
if not os.path.exists(result_file):
logger.warning(f"=> Warning: {job_id} dir doesn't contain result file. "
f"It seems that previous training session ended with error.")
return None
return None if LABEL_STUDIO_STRICT_ERRORS else {}
logger.debug(f'Read result from {result_file}')
with open(result_file) as f:
result = json.load(f)
Expand Down
16 changes: 16 additions & 0 deletions label_studio_ml/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import logging
import requests

from PIL import Image

Expand Down Expand Up @@ -48,3 +50,17 @@ def get_image_local_path(url, image_cache_dir=None, project_dir=None, image_dir=

def get_image_size(filepath):
return Image.open(filepath).size


def get_annotated_dataset(project_id, hostname=None, api_key=None):
    """Retrieve annotated data for a project from the Label Studio API.

    Just for demo purposes.

    Args:
        project_id: Project identifier interpolated into the export URL.
        hostname: Base URL of the Label Studio instance. When ``None`` it is
            looked up with ``get_env('HOSTNAME')``.
        api_key: Token sent in the ``Authorization`` header. When ``None`` it
            is looked up with ``get_env('API_KEY')``.

    Returns:
        The JSON-decoded body of the export response.

    Raises:
        ValueError: If no hostname was passed and none could be looked up.
        Exception: If the export endpoint answers with a non-200 status code.
    """
    if hostname is None:
        hostname = get_env('HOSTNAME')
    if api_key is None:
        api_key = get_env('API_KEY')
    # Fail early with a clear message: a missing hostname would otherwise
    # surface as an AttributeError on ``None.rstrip`` or as a schemeless URL.
    if not hostname:
        raise ValueError(
            "Label Studio hostname is not set: pass `hostname` explicitly "
            "or configure HOSTNAME"
        )
    download_url = f'{hostname.rstrip("/")}/api/projects/{project_id}/export'
    response = requests.get(download_url, headers={'Authorization': f'Token {api_key}'})
    if response.status_code != 200:
        raise Exception(f"Can't load task data using {download_url}, "
                        f"response status_code = {response.status_code}")
    return json.loads(response.content)