
Commit d5e954e

gpengzhi, liuyike-xiaomi, and liuyike3 authored

Merge dev into main

* guievalkit reconstructed (#13)

  Co-authored-by: liuyike3 <liuyike3@xiaomi.com>

* feat profiler (#14)

  * guievalkit reconstructed
  * feat: profiler
  * fix
  * Update results.md

---------

Co-authored-by: liuyike-xiaomi <liuyike_xiaomi@163.com>
Co-authored-by: liuyike3 <liuyike3@xiaomi.com>
1 parent 14e3cb3 commit d5e954e

128 files changed: 10912 additions & 4557 deletions


.gitignore

Lines changed: 4 additions & 2 deletions
```diff
@@ -48,6 +48,7 @@ coverage.xml
 *.cover
 .hypothesis/
 .pytest_cache/
+test/
 
 # Translations
 *.mo
@@ -117,10 +118,11 @@ dmypy.json
 # customize
 /data/android_control/
 /data/android_in_the_zoo/
-/data/cagui_agent/
+cagui_agent/
 /data/gui_odyssey/
-/data/hypertrack/
 /outputs
+/models
+/logs
 .idea/
 .DS_Store
 .vscode/
```

README.md

Lines changed: 126 additions & 21 deletions
````diff
@@ -6,15 +6,31 @@ GUIEvalKit is an open-source evaluation toolkit for GUI agents, allowing practit
 
 This work has been tested in the following environment:
 * `python == 3.10.12`
-* `torch == 2.7.1+cu126`
-* `transformers == 4.56.1`
-* `vllm == 0.10.1`
+* `torch == 2.8.1+cu128`
+* `transformers == 4.57.1`
+* `vllm == 0.11.0`
+
+### Installation
+
+Install the required dependencies:
+
+```bash
+pip install uv
+
+pushd ./guievalkit/
+uv venv ur_venv
+source ur_venv/bin/activate
+uv pip install -r requirements.txt -i accessible_url  # uv won't read accessible url from pip.conf
+```
+
+Make sure you have CUDA 12.8.x installed for GPU acceleration with vLLM.
 
 ## Supported Models
 
 | Model | Model Name | Organization |
 |---------------------------------------------------------|-----------------------------------------------|--------------|
 | [Qwen2.5-VL](https://github.com/QwenLM/Qwen2.5-VL) | `qwen2.5-vl-3/7/32/72b-instruct` | Alibaba |
+| [Qwen3-VL](https://github.com/QwenLM/Qwen3-VL) | `qwen3-vl-4/8b-instruct/thinking` | Alibaba |
 | [GUI-Owl](https://github.com/X-PLUG/MobileAgent) | `gui-owl-7/32b` | Alibaba |
 | [UI-Venus](https://github.com/inclusionAI/UI-Venus) | `ui-venus-navi-7/72b` | Ant Group |
 | [UI-TARS](https://github.com/bytedance/UI-TARS) | `ui-tars-2/7/72b-sft`, `ui-tars-7/72b-dpo` | Bytedance |
@@ -38,29 +54,118 @@ This work has been tested in the following environment:
 
 Please follow the [instructions](./data/README.md) to download and preprocess the datasets.
 
-## Data & Model Registration
+## Development
+
+### Configuration
 
-Please update the files [dataset_info.json](./config/dataset_info.json) and [model_info.json](./config/model_info.json) with your own information.
+Please update the configuration files or objects with your own information:
+- **[dataset_info.json](./config/dataset_info.json)**: Configure dataset paths and settings
+- **[guieval/config.py](./guieval/config.py)**: `DATASET` for clear type notation and static checking
+- **[model_paths.json](./config/model_paths.json)**: Configure default model paths for supported models
+
+### Model Core Implementation
+- **[ur_model.py](./guieval/models/ur_model.py)**: Implement your model's core methods
+- **[__init__.py](./guieval/models/__init__.py)**: Register your model
 
 ## Evaluation
 
+### Quick Start
+
+You can use the provided `run.sh` script as a template, or run directly with Python:
+
 ```bash
-python3 run.py \
-    --model agentcpm-gui-8b \
-    --dataset cagui_agent \
-    --mode all \
-    --outputs outputs/agentcpm-gui-8b/cagui_agent \
-    --use-vllm
+python3 run.py all \
+    --setup.datasets cagui_agent \
+    --setup.model.model_name agentcpm-gui-8b \
+    --setup.eval_mode offline_rule \
+    --setup.vllm_mode online
 ```
-**Arguments:**
-- `--model (str)`: Set the model name that is supported in GUIEvalKit (defined in `config/model_info.json`).
-- `--dataset (str)`: Set the benchmark name that is supported in GUIEvalKit (defined in `config/dataset_info.json`).
-- `--mode (str, default to 'all', choices are ['all', 'infer', 'eval'])`: When `mode` set to `all`, will perform both inference and evaluation; when set to `infer`, will only perform the inference; when set to `eval`, will only perform the evaluation.
-- `--outputs (str, default to './outputs')`: The directory to save evaluation results.
-- `--batch-size (int, default to 64)`: The batch size used for inference.
-- `--no-think`: Use this argument if you want to disable the thinking mode (if applicable).
-- `--use-vllm`: Use this argument if you want to inference with `vllm`, otherwise `transformers` will be adopted.
-- `--over-size`: Use this argument for deploying large models on four GPUs and inferring with `vllm`.
+
+### Command Structure
+
+The evaluation command follows this structure:
+
+```bash
+python3 run.py <mode> [--setup.<config_path> <value> ...]
+```
+
+**Mode Options:**
+- `all`: Perform both inference and evaluation (default)
+- `infer`: Only perform inference
+- `eval`: Only perform evaluation (currently not implemented)
+
+### Configuration Options
+
+#### Dataset Configuration
+- `--setup.datasets (str | list)`: Comma-separated list of datasets to evaluate. Supported datasets: `androidcontrol_low`, `androidcontrol_high`, `cagui_agent`, `gui_odyssey`, `aitz`
+
+#### Model Configuration
+- `--setup.model.model_name (str)`: Model name from the supported models list (required)
+- `--setup.model.model (str)`: Custom model path (optional, defaults to the path in `model_paths.json`)
+- `--setup.model.model_alias (str)`: Human-readable model identifier for logs (optional, defaults to `model_name`)
+- `--setup.model.max_model_len (int)`: Maximum context length (default: 8192)
+- `--setup.model.tensor_parallel_size (int)`: Number of GPUs for tensor parallelism (default: 1)
+- `--setup.model.data_parallel_size (int)`: Number of GPUs for data parallelism (default: 1)
+- `--setup.model.pipeline_parallel_size (int)`: Number of GPUs for pipeline parallelism (default: 1)
+- `--setup.model.max_num_batched_tokens (int)`: Maximum batched tokens per inference (default: 4096)
+- `--setup.model.max_num_seqs (int)`: Maximum sequences per inference (default: 32)
+- `--setup.model.image_limit (int)`: Maximum images per prompt (default: 3)
+
+#### Evaluation Configuration
+- `--setup.eval_mode (str)`: Evaluation mode (default: `offline_rule`)
+  - `offline_rule`: Evaluate the model off-policy based on predefined rules
+  - `semi_online`: Evaluate on-policy with the model's own outputs when the task succeeds
+  - ...
+- `--setup.vllm_mode (str)`: vLLM inference mode (default: `online`)
+  - `online`: Use vLLM online serving for concurrent generation
+  - `offline`: Use vLLM batched generation
+- `--setup.enable_thinking (bool)`: Enable thinking mode for models that support it (default: `true`)
+- `--setup.batch_size (int)`: Task batch size for offline vLLM mode (default: 64)
+- `--setup.max_concurrent_tasks (int)`: Maximum concurrent tasks for online vLLM mode (default: 128)
+
+#### Output Configuration
+- `--setup.output_dir (str)`: Directory to save evaluation results (default: `./outputs`)
+- `--setup.log_dir (str)`: Directory to save logs (default: `./logs/guieval`)
+
+### Example: Using run.sh
+
+You can modify `run.sh` to customize your evaluation:
+
+```bash
+datasets=androidcontrol_high,gui_odyssey,cagui_agent
+model="ui-tars-1.5-7b"
+model_path="None"   # or /path/to/specific_model
+model_alias="None"  # or custom alias
+mode=all
+vllm_mode=online
+max_model_len=40960
+tp=1
+dp=8
+pp=1
+tokens_batch_size=16384
+seq_box=32
+image_limit=1
+concurrent=32
+eval_mode=offline_rule
+enable_thinking=false
+
+python3 run.py ${mode} \
+    --setup.datasets ${datasets} \
+    --setup.model.model_name ${model} \
+    --setup.model.model_alias ${model_alias} \
+    --setup.model.model ${model_path} \
+    --setup.model.max_model_len ${max_model_len} \
+    --setup.model.tensor_parallel_size ${tp} \
+    --setup.model.data_parallel_size ${dp} \
+    --setup.model.pipeline_parallel_size ${pp} \
+    --setup.model.max_num_batched_tokens ${tokens_batch_size} \
+    --setup.model.max_num_seqs ${seq_box} \
+    --setup.model.image_limit ${image_limit} \
+    --setup.eval_mode ${eval_mode} \
+    --setup.vllm_mode ${vllm_mode} \
+    --setup.max_concurrent_tasks ${concurrent} \
+    --setup.enable_thinking ${enable_thinking}
+```
 
 **Please check [here](./docs/results.md) for the detailed evaluation results.**
 
@@ -74,4 +179,4 @@ To add new GUI agents and benchmarks to GUIEvalKit, please refer to the [Develop
 
 ## Acknowledgement
 
-This repo benefits from [AgentCPM-GUI/eval](https://github.com/OpenBMB/AgentCPM-GUI/tree/main/eval) and [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). Thanks for their wonderful works.
\ No newline at end of file
+This repo benefits from [AgentCPM-GUI/eval](https://github.com/OpenBMB/AgentCPM-GUI/tree/main/eval) and [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). Thanks for their wonderful works.
````
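The README diff introduces a CLI that takes dotted `--setup.<config_path>` overrides. As an illustration of how such dotted keys can fold into a nested configuration, here is a small sketch; the `apply_setup_overrides` helper is hypothetical, not GUIEvalKit's actual parser:

```python
def apply_setup_overrides(pairs: list[tuple[str, str]]) -> dict:
    """Fold dotted --setup.* keys into a nested dict (illustrative only)."""
    config: dict = {}
    for dotted, value in pairs:
        *parents, leaf = dotted.split(".")
        node = config
        for key in parents:
            # Create intermediate dicts on demand, e.g. config["setup"]["model"].
            node = node.setdefault(key, {})
        node[leaf] = value
    return config


cfg = apply_setup_overrides([
    ("setup.datasets", "cagui_agent"),
    ("setup.model.model_name", "agentcpm-gui-8b"),
    ("setup.eval_mode", "offline_rule"),
])
print(cfg["setup"]["model"]["model_name"])  # agentcpm-gui-8b
```

The dotted form keeps flat shell variables (as in `run.sh`) while still yielding grouped model/eval/output settings.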

config/__init__.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -0,0 +1,12 @@
+import json
+
+from config.config_utils import CONFIG_BASE, model_config_handler
+
+MODEL_PATH_FILE = CONFIG_BASE / 'model_paths.json'
+MODEL_PATHS: dict = json.loads(MODEL_PATH_FILE.read_text())
+
+
+__all__ = ['CONFIG_BASE',
+           'model_config_handler',
+           'MODEL_PATH_FILE',
+           'MODEL_PATHS']
```
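`config/__init__.py` joins a package-relative base with `/` and parses the JSON via `read_text`. A self-contained sketch of the same pattern, using `pathlib` against a throwaway directory (the file contents below are illustrative, not the repo's real `model_paths.json`):

```python
import json
import tempfile
from pathlib import Path

# Stand-in for CONFIG_BASE: importlib.resources.files('config') returns a
# Traversable supporting the same '/' join and read_text() calls as Path.
config_base = Path(tempfile.mkdtemp())

# Illustrative contents mirroring the shape of model_paths.json.
(config_base / "model_paths.json").write_text(
    json.dumps({"agentcpm-gui-8b": {"folder_name": "models/AgentCPM-GUI"}})
)

model_path_file = config_base / "model_paths.json"
model_paths: dict = json.loads(model_path_file.read_text())
print(model_paths["agentcpm-gui-8b"]["folder_name"])  # models/AgentCPM-GUI
```

Using `importlib.resources` in the real module means the JSON is found relative to the installed `config` package, not the current working directory.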

config/config_utils.py

Lines changed: 78 additions & 0 deletions
```diff
@@ -0,0 +1,78 @@
+try:
+    import flash_attn  # noqa: F401
+    hf_attn_implementation = "flash_attention_2"
+except Exception:
+    hf_attn_implementation = "sdpa"
+
+import importlib.resources as res
+
+from pydantic import BaseModel
+from typing import Any
+from transformers import (AutoProcessor, AutoTokenizer, AutoModelForCausalLM,
+                          Qwen2VLForConditionalGeneration, Qwen2VLProcessor, Qwen2_5_VLForConditionalGeneration,
+                          Qwen3VLForConditionalGeneration,
+                          Glm4vForConditionalGeneration, Glm4vMoeForConditionalGeneration)
+
+CONFIG_BASE = res.files('config')
+
+
+class ModelConfig(BaseModel):
+    llm_class: Any
+    tokenizer_class: Any
+    attn_implementation: str | Any
+
+
+MODEL_CONFIGS: dict[tuple[str], ModelConfig] = {
+    ("agentcpm-gui-8b", ): ModelConfig(
+        llm_class=AutoModelForCausalLM,
+        tokenizer_class=AutoTokenizer,
+        attn_implementation="sdpa"
+    ),
+    ("qwen2.5-vl-3b-instruct", "qwen2.5-vl-7b-instruct",
+     "ui-tars-1.5-7b",
+     "mimo-vl-7b-sft", "mimo-vl-7b-rl", "mimo-vl-7b-sft-2508", "mimo-vl-7b-rl-2508",
+     "gui-owl-7b", "gui-owl-32b",
+     "ui-venus-navi-7b", "ui-venus-navi-72b"): ModelConfig(
+        llm_class=Qwen2_5_VLForConditionalGeneration,
+        tokenizer_class=AutoProcessor,
+        attn_implementation=hf_attn_implementation
+    ),
+    ("qwen3-vl-4b-instruct", "qwen3-vl-4b-thinking", "qwen3-vl-8b-instruct", "qwen3-vl-8b-thinking"): ModelConfig(
+        llm_class=Qwen3VLForConditionalGeneration,
+        tokenizer_class=AutoProcessor,
+        attn_implementation=hf_attn_implementation
+    ),
+    ("ui-tars-2b-sft", "ui-tars-7b-sft", "ui-tars-7b-dpo", "ui-tars-72b-sft", "ui-tars-72b-dpo"): ModelConfig(
+        llm_class=Qwen2VLForConditionalGeneration,
+        tokenizer_class=AutoProcessor,
+        attn_implementation=hf_attn_implementation
+    ),
+    ("glm-4.1v-9b-thinking", ): ModelConfig(
+        llm_class=Glm4vForConditionalGeneration,
+        tokenizer_class=AutoProcessor,
+        attn_implementation=hf_attn_implementation
+    ),
+    ("glm-4.5v", ): ModelConfig(
+        llm_class=Glm4vMoeForConditionalGeneration,
+        tokenizer_class=AutoProcessor,
+        attn_implementation=hf_attn_implementation
+    ),
+    ("magicgui-cpt", "magicgui-rft"): ModelConfig(
+        llm_class=Qwen2VLForConditionalGeneration,
+        tokenizer_class=Qwen2VLProcessor,
+        attn_implementation=hf_attn_implementation
+    ),
+}
+
+
+def model_config_handler(model_name: str) -> ModelConfig:
+    '''
+    Get the model config for a given model name.
+
+    If the model name is not found, a ValueError will be raised.
+    '''
+    for _names, _config in MODEL_CONFIGS.items():
+        if model_name in _names:
+            return _config
+    else:
+        raise ValueError(f"Model {model_name} not found in {MODEL_CONFIGS}")
```
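`model_config_handler` relies on Python's `for ... else`: the `else` branch runs only when the loop finishes without returning. A standalone sketch of that lookup, with plain strings standing in for the transformers classes and the pydantic model:

```python
# Tuple-keyed registry, as in MODEL_CONFIGS; values simplified to strings.
REGISTRY = {
    ("agentcpm-gui-8b",): "AutoModelForCausalLM",
    ("qwen2.5-vl-7b-instruct", "ui-tars-1.5-7b"): "Qwen2_5_VLForConditionalGeneration",
}


def lookup(model_name: str) -> str:
    for names, config in REGISTRY.items():
        if model_name in names:
            return config
    else:
        # Reached only if no key tuple contained model_name
        # (the loop completed without hitting the return).
        raise ValueError(f"Model {model_name} not found")


print(lookup("ui-tars-1.5-7b"))  # Qwen2_5_VLForConditionalGeneration
```

Grouping aliases into one tuple key lets many model names share a single `ModelConfig` without duplicating entries.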

config/dataset_info.json

Lines changed: 6 additions & 6 deletions
```diff
@@ -1,27 +1,27 @@
 {
     "aitz": {
-        "folder_name": "data/android_in_the_zoo",
+        "folder_name": "android_in_the_zoo",
         "split": "test",
         "subset": ["general", "google_apps", "install", "web_shopping"]
     },
     "androidcontrol_high":{
-        "folder_name": "data/android_control",
+        "folder_name": "android_control",
         "split": "test",
         "subset": ["android_control"]
     },
     "androidcontrol_low":{
-        "folder_name": "data/android_control",
+        "folder_name": "android_control",
         "split": "test",
         "subset": ["android_control"]
     },
     "cagui_agent": {
-        "folder_name": "data/cagui_agent",
+        "folder_name": "cagui_agent",
         "split": "test",
         "subset": ["domestic"]
     },
     "gui_odyssey": {
-        "folder_name": "data/gui_odyssey",
+        "folder_name": "gui_odyssey",
         "split": "test",
         "subset": ["gui_odyssey"]
     }
-}
\ No newline at end of file
+}
```
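The dataset_info.json change strips the hard-coded `data/` prefix from every `folder_name`, so entries can compose with a configurable data root. A sketch of that resolution; the `resolve_dataset_dir` helper and its `data_root` default are assumptions, not code from the repo:

```python
from pathlib import Path

# Subset of dataset_info.json after the change: folder names are relative.
DATASET_INFO = {
    "aitz": {"folder_name": "android_in_the_zoo", "split": "test"},
    "cagui_agent": {"folder_name": "cagui_agent", "split": "test"},
}


def resolve_dataset_dir(name: str, data_root: str = "data") -> Path:
    # A relative folder_name joins cleanly with any root directory,
    # which a hard-coded "data/" prefix would prevent.
    return Path(data_root) / DATASET_INFO[name]["folder_name"]


print(resolve_dataset_dir("aitz").as_posix())         # data/android_in_the_zoo
print(resolve_dataset_dir("aitz", "/mnt").as_posix())  # /mnt/android_in_the_zoo
```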
config/model_paths.json

Lines changed: 14 additions & 2 deletions
```diff
@@ -3,7 +3,7 @@
         "folder_name": "models/AgentCPM-GUI"
     },
     "ui-tars-1.5-7b": {
-        "folder_name": "models/UI-TARS-1.5-7B"
+        "folder_name": "models/UI-TARS/UI-TARS-1.5-7B"
     },
     "ui-tars-2b-sft": {
         "folder_name": "models/UI-TARS-2B-SFT"
@@ -26,6 +26,18 @@
     "qwen2.5-vl-7b-instruct": {
         "folder_name": "models/Qwen2.5-VL-7B-Instruct"
     },
+    "qwen3-vl-4b-instruct": {
+        "folder_name": "models/Qwen3-VL-4B-Instruct"
+    },
+    "qwen3-vl-4b-thinking": {
+        "folder_name": "models/Qwen3-VL-4B-Thinking"
+    },
+    "qwen3-vl-8b-instruct": {
+        "folder_name": "models/Qwen3-VL-8B-Instruct"
+    },
+    "qwen3-vl-8b-thinking": {
+        "folder_name": "models/Qwen3-VL-8B-Thinking"
+    },
     "mimo-vl-7b-sft": {
         "folder_name": "models/MiMo-VL-7B-SFT"
     },
@@ -62,4 +74,4 @@
     "magicgui-rft":{
         "folder_name": "models/MagicGUI_RFT"
     }
-}
\ No newline at end of file
+}
```

0 commit comments
