HumanSignal · robot-ci-heartex · Dec 16, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/docs/source/guide/webhook_reference.md b/docs/source/guide/webhook_reference.md
@@ -129,7 +129,6 @@ The webhook payload includes the name of the action and some additional task dat
         "created_at": "2021-08-17T13:49:34.326416Z",
         "updated_at": "2021-08-17T13:49:35.911271Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,
@@ -219,7 +218,6 @@ Sent when a task is deleted from Label Studio. See how to [set up a webhook for
         "created_at": "2021-08-17T13:49:34.326416Z",
         "updated_at": "2021-08-17T13:52:09.334425Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,
@@ -328,7 +326,6 @@ The webhook payload includes the name of the action and some additional annotati
         "created_at": "2021-08-17T13:49:34.326416Z",
         "updated_at": "2021-08-17T13:52:09.334425Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,
@@ -463,7 +460,6 @@ The webhook payload includes the name of the action and some additional annotati
         "created_at": "2021-08-12T14:15:01.744507Z",
         "updated_at": "2021-08-17T13:35:25.697471Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,
@@ -538,7 +534,6 @@ Sent when an annotation is deleted. See how to [set up a webhook for this event]
         "created_at": "2021-08-17T13:49:34.326416Z",
         "updated_at": "2021-08-17T13:52:09.334425Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,
@@ -603,7 +598,6 @@ The webhook payload includes the name of the action and some additional project
         "created_at": "2021-08-17T13:55:58.809065Z",
         "updated_at": "2021-08-17T13:55:58.809098Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,
@@ -674,7 +668,6 @@ The webhook payload includes the name of the action and some additional project
         "created_at": "2021-08-12T14:15:01.744507Z",
         "updated_at": "2021-08-17T13:39:14.054849Z",
         "sampling": "Sequential sampling",
-        "show_ground_truth_first": true,
         "show_overlap_first": true,
         "overlap_cohort_percentage": 100,
         "task_data_login": null,

diff --git a/label_studio/projects/api.py b/label_studio/projects/api.py
@@ -307,7 +307,7 @@ def get_queryset(self):
                             'total_annotations_number': 10,
                             'total_predictions_number': 0,
                             'sampling': 'Sequential sampling',
-                            'show_ground_truth_first': True,
+                            'annotator_evaluation_enabled': False,
                             'show_overlap_first': True,
                             'overlap_cohort_percentage': 100,
                             'task_data_login': 'user',

diff --git a/label_studio/projects/functions/next_task.md b/label_studio/projects/functions/next_task.md
@@ -21,7 +21,7 @@ flowchart TD
     B3 -- no --> B4{"LSE low-agreement path?<br/>fflag OPTIC-161<br/>agreement_threshold set<br/>user is annotator"}
     B4 -- yes --> B6["Filter by agreement threshold<br/>and annotator capacity"] --> B7[Optionally prioritize by low agreement]
 
-    B4 -- no --> B8{"Evaluation mode?<br/>fflag ALL-LEAP-1825<br/>show_ground_truth_first"}
+    B4 -- no --> B8{"Evaluation mode?<br/>fflag ALL-LEAP-1825<br/>annotator_evaluation_enabled"}
     B8 -- yes --> B7
     B8 -- no --> B9[Filter: is_labeled=false] --> B7
   end
@@ -69,9 +69,7 @@ flowchart TD
 
 ### GT-first gating
 - `should_attempt_ground_truth_first(user, project)` returns true when:
-  - `show_ground_truth_first=True` and either no `lse_project` or `annotator_evaluation_minimum_tasks` is not set, or
-  - the user's completed GT-equipped tasks < `annotator_evaluation_minimum_tasks`, or
-  - minimum tasks reached but the user's GT agreement score is missing or below `annotator_evaluation_minimum_score` (percent).
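+  - `annotator_evaluation_enabled=True` and either no `SHOULD_ATTEMPT_GROUND_TRUTH_FIRST` override is loaded (the open-source default applies no onboarding gate), or `annotator_evaluation_onboarding_tasks > 0` and the user's completed GT-equipped tasks < `annotator_evaluation_onboarding_tasks`.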
 - Otherwise returns false (GT-first disabled; proceed via low-agreement/overlap/sampling).
 
 ## Queue labels appended to response

diff --git a/label_studio/projects/functions/next_task.py b/label_studio/projects/functions/next_task.py
@@ -17,14 +17,14 @@
 
 
 # Hook for GT-first gating (Enterprise can override via settings)
-def _oss_should_attempt_gt_first(user: User, project: Project) -> bool:
-    # Open-source default: if project enables GT-first, allow it without onboarding gates
-    return bool(project.show_ground_truth_first)
+def _lso_should_attempt_gt_first(user: User, project: Project) -> bool:
+    # Open-source default: if project enables annotator evaluation, allow it without onboarding gates
+    return bool(project.annotator_evaluation_enabled)
 
 
 get_tasks_agreement_queryset = load_func(settings.GET_TASKS_AGREEMENT_QUERYSET)
 should_attempt_ground_truth_first = (
-    load_func(settings.SHOULD_ATTEMPT_GROUND_TRUTH_FIRST) or _oss_should_attempt_gt_first
+    load_func(settings.SHOULD_ATTEMPT_GROUND_TRUTH_FIRST) or _lso_should_attempt_gt_first
 )
 
 
@@ -59,10 +59,7 @@ def _get_first_unlocked(tasks_query: QuerySet[Task], user) -> Union[Task, None]:
 
 def _try_ground_truth(tasks: QuerySet[Task], project: Project, user: User) -> Union[Task, None]:
     """Returns task from ground truth set"""
-    ground_truth = Annotation.objects.filter(task=OuterRef('pk'), ground_truth=True)
-    not_solved_tasks_with_ground_truths = tasks.annotate(has_ground_truths=Exists(ground_truth)).filter(
-        has_ground_truths=True
-    )
+    not_solved_tasks_with_ground_truths = _annotate_has_ground_truths(tasks).filter(has_ground_truths=True)
     if not_solved_tasks_with_ground_truths.exists():
         if project.sampling == project.SEQUENCE:
             return _get_first_unlocked(not_solved_tasks_with_ground_truths, user)
@@ -78,13 +75,15 @@ def _try_tasks_with_overlap(tasks: QuerySet[Task]) -> Tuple[Union[Task, None], Q
         return None, tasks.filter(overlap=1)
 
 
-def _try_breadth_first(tasks: QuerySet[Task], user: User, project: Project) -> Union[Task, None]:
+def _try_breadth_first(
+    tasks: QuerySet[Task], user: User, project: Project, attempt_gt_first: bool = False
+) -> Union[Task, None]:
     """Try to find tasks with maximum amount of annotations, since we are trying to label tasks as fast as possible"""
 
-    # Exclude ground truth annotations from the count when not in onboarding mode
+    # Exclude ground truth annotations from the count when outside the onboarding window
     # to prevent GT tasks from being prioritized via breadth-first logic
     annotation_filter = ~Q(annotations__completed_by=user)
-    if not project.show_ground_truth_first:
+    if not attempt_gt_first:
         annotation_filter &= ~Q(annotations__ground_truth=True)
 
     tasks = tasks.annotate(annotations_count=Count('annotations', filter=annotation_filter))
@@ -158,13 +157,18 @@ def _try_uncertainty_sampling(
     return next_task
 
 
+def _annotate_has_ground_truths(tasks: QuerySet[Task]) -> QuerySet[Task]:
+    ground_truth = Annotation.objects.filter(task=OuterRef('pk'), ground_truth=True)
+    return tasks.annotate(has_ground_truths=Exists(ground_truth))
+
+
 def get_not_solved_tasks_qs(
     user: User,
     project: Project,
     prepared_tasks: QuerySet[Task],
     assigned_flag: Union[bool, None],
     queue_info: str,
-    allow_gt_first: bool,
+    attempt_gt_first: bool,
 ) -> Tuple[QuerySet[Task], List[int], str, bool]:
     user_solved_tasks_array = user.annotations.filter(project=project, task__isnull=False)
     user_solved_tasks_array = user_solved_tasks_array.distinct().values_list('task__pk', flat=True)
@@ -188,7 +192,6 @@ def get_not_solved_tasks_qs(
             and get_tasks_agreement_queryset
             and user.is_project_annotator(project)
         ):
-            # Onboarding mode (GT-first) should keep GT tasks eligible regardless of is_labeled/agreement
             qs = get_tasks_agreement_queryset(not_solved_tasks)
             qs = qs.annotate(annotators=Count('annotations__completed_by', distinct=True))
 
@@ -197,13 +200,10 @@ def get_not_solved_tasks_qs(
             )
             capacity_pred = Q(annotators__lt=F('overlap') + (lse_project.max_additional_annotators_assignable or 0))
 
-            if project.show_ground_truth_first:
-                gt_subq = Annotation.objects.filter(task=OuterRef('pk'), ground_truth=True)
-                qs = qs.annotate(has_ground_truths=Exists(gt_subq))
-                # Keep all GT tasks + apply low-agreement+capacity to the rest. For sure, we can do:
-                # - if user.solved_tasks_array.count < lse_project.annotator_evaluation_minimum_tasks
-                # - else, apply low-agreement+capacity to the rest (maybe performance will be better)
-                # but it's a question - what is better here. This version is simpler at least from the code perspective.
+            if project.annotator_evaluation_enabled:
+                # Annotator evaluation is enabled: flag tasks that have a ground truth annotation
+                qs = _annotate_has_ground_truths(qs)
+                # Keep all GT tasks + apply low-agreement+capacity to the rest.
                 not_solved_tasks = qs.filter(Q(has_ground_truths=True) | (low_agreement_pred & capacity_pred))
             else:
                 not_solved_tasks = qs.filter(low_agreement_pred & capacity_pred)
@@ -212,9 +212,15 @@ def get_not_solved_tasks_qs(
 
         # otherwise, filtering out completed tasks is sufficient
         else:
-            # ignore tasks that are already labeled when GT-first is NOT allowed
-            if not allow_gt_first:
-                not_solved_tasks = not_solved_tasks.filter(is_labeled=False)
+            if not attempt_gt_first:
+                # Outside of onboarding window
+                if project.annotator_evaluation_enabled:
+                    # Annotator evaluation is enabled: keep unlabeled tasks plus any task that has a ground truth annotation
+                    not_solved_tasks = _annotate_has_ground_truths(not_solved_tasks)
+                    not_solved_tasks = not_solved_tasks.filter(Q(is_labeled=False) | Q(has_ground_truths=True))
+                else:
+                    # Annotator evaluation is not enabled: only tasks that are not yet labeled stay eligible
+                    not_solved_tasks = not_solved_tasks.filter(is_labeled=False)
 
     if not flag_set('fflag_fix_back_lsdv_4523_show_overlap_first_order_27022023_short'):
         # show tasks with overlap > 1 first (unless tasks are already prioritized on agreement)
@@ -244,7 +250,7 @@ def get_next_task_without_dm_queue(
     not_solved_tasks: QuerySet,
     assigned_flag: Union[bool, None],
     prioritized_low_agreement: bool,
-    allow_gt_first: bool,
+    attempt_gt_first: bool,
 ) -> Tuple[Union[Task, None], bool, str]:
     next_task = None
     use_task_lock = True
@@ -265,8 +271,8 @@ def get_next_task_without_dm_queue(
             use_task_lock = False
             queue_info += (' & ' if queue_info else '') + 'Task lock'
 
-    # Ground truth: use precomputed gating for GT-first
-    if not next_task and allow_gt_first:
+    # Ground truth: attempt to label ground truth tasks in onboarding window
+    if not next_task and attempt_gt_first:
         logger.debug(f'User={user} tries ground truth from prepared tasks')
         next_task = _try_ground_truth(not_solved_tasks, project, user)
         if next_task:
@@ -283,7 +289,7 @@ def get_next_task_without_dm_queue(
     if not next_task and project.maximum_annotations > 1:
         # if there are already labeled tasks, but task.overlap still < project.maximum_annotations, randomly sampling from them
         logger.debug(f'User={user} tries depth first from prepared tasks')
-        next_task = _try_breadth_first(not_solved_tasks, user, project)
+        next_task = _try_breadth_first(not_solved_tasks, user, project, attempt_gt_first)
         if next_task:
             queue_info += (' & ' if queue_info else '') + 'Breadth first queue'
 
@@ -378,16 +384,16 @@ def get_next_task(
         use_task_lock = True
         queue_info = ''
 
-        # Ground truth: label GT first only during onboarding window for user (gated by min tasks and min score)
-        allow_gt_first = should_attempt_ground_truth_first(user, project)
+        # Ground truth: label GT first only during onboarding window for user (gated by onboarding task number)
+        attempt_gt_first = should_attempt_ground_truth_first(user, project)
 
         not_solved_tasks, user_solved_tasks_array, queue_info, prioritized_low_agreement = get_not_solved_tasks_qs(
-            user, project, prepared_tasks, assigned_flag, queue_info, allow_gt_first
+            user, project, prepared_tasks, assigned_flag, queue_info, attempt_gt_first
         )
 
         if not dm_queue:
             next_task, use_task_lock, queue_info = get_next_task_without_dm_queue(
-                user, project, not_solved_tasks, assigned_flag, prioritized_low_agreement, allow_gt_first
+                user, project, not_solved_tasks, assigned_flag, prioritized_low_agreement, attempt_gt_first
             )
 
         if flag_set('fflag_fix_back_lsdv_4523_show_overlap_first_order_27022023_short'):
@@ -452,7 +458,7 @@ def get_next_task(
                         'maximum_annotations': project.maximum_annotations,
                         'skip_queue': project.skip_queue,
                         'sampling': project.sampling,
-                        'show_ground_truth_first': project.show_ground_truth_first,
+                        'annotator_evaluation_enabled': project.annotator_evaluation_enabled,
                         'show_overlap_first': project.show_overlap_first,
                         'overlap_cohort_percentage': project.overlap_cohort_percentage,
                         'project_id': project.id,

@@ -0,0 +1,23 @@
+# Generated by Django 5.1.14 on 2025-12-09 21:37
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("projects", "0033_projects_soft_delete_indexes_async"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="project",
+            name="annotator_evaluation_enabled",
+            field=models.BooleanField(
+                db_default=False,
+                default=False,
+                help_text="Enable annotator evaluation for the project",
+                verbose_name="annotator evaluation enabled",
+            ),
+        ),
+    ]
diff --git a/label_studio/projects/models.py b/label_studio/projects/models.py
@@ -315,11 +315,21 @@ class SkipQueue(models.TextChoices):
     skip_queue = models.CharField(
         max_length=100, choices=SkipQueue.choices, null=True, default=SkipQueue.REQUEUE_FOR_OTHERS
     )
+
+    # Deprecated in favor of annotator_evaluation_enabled
     show_ground_truth_first = models.BooleanField(
         _('show ground truth first'),
         default=False,
         help_text='Onboarding mode (true): show ground truth tasks first in the labeling stream',
     )
+
+    annotator_evaluation_enabled = models.BooleanField(
+        _('annotator evaluation enabled'),
+        default=False,
+        db_default=False,
+        help_text='Enable annotator evaluation for the project',
+    )
+
     show_overlap_first = models.BooleanField(_('show overlap first'), default=False)
     overlap_cohort_percentage = models.IntegerField(_('overlap_cohort_percentage'), default=100)
 

diff --git a/label_studio/projects/serializers.py b/label_studio/projects/serializers.py
@@ -3,6 +3,7 @@
 import bleach
 from constants import SAFE_HTML_ATTRIBUTES, SAFE_HTML_TAGS
 from django.db.models import Q
+from drf_spectacular.utils import extend_schema_serializer
 from fsm.serializer_fields import FSMStateField
 from label_studio_sdk.label_interface import LabelInterface
 from label_studio_sdk.label_interface.control_tags import (
@@ -43,6 +44,7 @@ def __call__(self, serializer_field):
         return serializer_field.context.get('created_by')
 
 
+@extend_schema_serializer(deprecate_fields=['show_ground_truth_first'])
 class ProjectSerializer(FlexFieldsModelSerializer):
     """Serializer get numbers from project queryset annotation,
     make sure, that you use correct one(Project.objects.with_counts())
@@ -236,6 +238,7 @@ class Meta:
             'total_predictions_number',
             'sampling',
             'show_ground_truth_first',
+            'annotator_evaluation_enabled',
             'show_overlap_first',
             'overlap_cohort_percentage',
             'task_data_login',

diff --git a/label_studio/tasks/models.py b/label_studio/tasks/models.py
@@ -289,10 +289,8 @@ def has_lock(self, user=None):
         """
         from projects.functions.next_task import get_next_task_logging_level
 
-        if self.project.show_ground_truth_first:
-            # in show_ground_truth_first mode(onboarding)
-            # we ignore overlap setting for ground_truth tasks
-            # https://humansignal.atlassian.net/browse/LEAP-1963
+        if self.project.annotator_evaluation_enabled:
+            # In annotator evaluation mode, ignore overlap setting for ground truth tasks
             if self.annotations.filter(ground_truth=True).exists():
                 return False
 

diff --git a/label_studio/tests/data_manager/columns.tavern.yml b/label_studio/tests/data_manager/columns.tavern.yml
@@ -42,7 +42,7 @@ stages:
              "start_training_on_annotation_update": false, "show_collab_predictions": true, "num_tasks_with_annotations": null,
              "task_number": null, "useful_annotation_number": null, "ground_truth_number": null, "skipped_annotations_number": null,
              "total_annotations_number": null, "total_predictions_number": null, "sampling": "Sequential sampling",
-             "show_ground_truth_first": false, "show_overlap_first": false, "overlap_cohort_percentage": 100,
+             "show_ground_truth_first": false, "annotator_evaluation_enabled": false, "show_overlap_first": false, "overlap_cohort_percentage": 100,
              "task_data_login": null, "task_data_password": null,
              "control_weights": {"label": {"overall": 1.0, "type": "Choices", "labels": {"pos": 1.0, "neg": 1.0}}},
              "parsed_label_config": {

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -74,7 +74,7 @@ dependencies = [
     "tldextract (>=5.1.3)",
     "uuid-utils (>=0.11.0,<1.0.0)",
     ## HumanSignal repo dependencies :start
-    "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/58970c14d2c1683e5093eae848a8265cbbc4acac.zip",
+    "label-studio-sdk @ https://github.com/HumanSignal/label-studio-sdk/archive/8ce1b4f80f12780da5d07184e4bea25a5c05fe96.zip",
     ## HumanSignal repo dependencies :end
 ]