feat: vLLM backend #2010

Draft: gau-nernst wants to merge 93 commits into dev from thien/python_engine

Commits (93)
60b13bb
wip: download uv
gau-nernst Feb 14, 2025
3ddce8c
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 14, 2025
f9817c8
fix: has_value -> has_error
gau-nernst Feb 14, 2025
2dbc296
move uv stuff to python_engine. use uv to start process
gau-nernst Feb 18, 2025
eec24bd
redirect stdout/stderr
gau-nernst Feb 18, 2025
26fdbd3
simplify code
gau-nernst Feb 18, 2025
3ba7994
rename python engine interface
gau-nernst Feb 19, 2025
5e7125f
use PythonEngineI
gau-nernst Feb 19, 2025
c5da0ee
more checks to match all EngineV variants
gau-nernst Feb 19, 2025
3c097fb
improve Python load model
gau-nernst Feb 19, 2025
84db8b0
consolidate process-related functions
gau-nernst Feb 19, 2025
8ee815c
update PythonModelConfig. add UnloadModel
gau-nernst Feb 19, 2025
29f5344
implement PythonEngine::GetModels
gau-nernst Feb 19, 2025
75ce355
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 19, 2025
7949dcc
implement getModelStatus. add some notes
gau-nernst Feb 19, 2025
e2f0323
add router for python
gau-nernst Feb 19, 2025
607d2cb
call PythonEngine destructor
gau-nernst Feb 19, 2025
f58b773
remove unused method
gau-nernst Feb 19, 2025
bf23c9f
remove unnecessary headers
gau-nernst Feb 19, 2025
d7818d5
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 19, 2025
8ebee7c
remove unused stuff
gau-nernst Feb 20, 2025
8f36adc
download uv directly from github release
gau-nernst Feb 20, 2025
5ebfbb7
check for entrypoint
gau-nernst Feb 20, 2025
5d310d1
only record model size for llama.cpp
gau-nernst Feb 20, 2025
c4c622c
don't include headers
gau-nernst Feb 20, 2025
fc0369c
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 21, 2025
6b59878
don't use std::optional to support < c++17
gau-nernst Feb 21, 2025
250a2ac
fix stringstream usage
gau-nernst Feb 21, 2025
bb38a56
define pid_t for windows
gau-nernst Feb 21, 2025
723c5db
explicit call .string() on filesystem::path to support windows
gau-nernst Feb 21, 2025
26ec20a
include extra_args in entrypoint
gau-nernst Feb 21, 2025
376deeb
add python engine install test
gau-nernst Feb 21, 2025
a9ed820
add start time
gau-nernst Feb 21, 2025
db82134
add back python engine to default supported engine so that cortex eng…
gau-nernst Feb 21, 2025
64f5451
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 25, 2025
79464a2
format
gau-nernst Feb 25, 2025
1768826
run uv sync after model download
gau-nernst Feb 26, 2025
7627eac
download CUDA for python engine
gau-nernst Feb 26, 2025
06503c0
add .exe for windows
gau-nernst Feb 26, 2025
176f878
destroy file action in posix
gau-nernst Feb 26, 2025
48f50f5
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 27, 2025
f7bddc2
revert name change to avoid conflict
gau-nernst Feb 27, 2025
728e7eb
check for NULL before destroy file action
gau-nernst Feb 27, 2025
560b9fe
fix windows
gau-nernst Feb 27, 2025
f481c2f
fix windows
gau-nernst Feb 27, 2025
48e2015
fix windows subprocess
gau-nernst Feb 27, 2025
f02fc93
update test
gau-nernst Feb 27, 2025
c104d77
Merge branch 'dev' into thien/python_engine
gau-nernst Feb 27, 2025
9918672
more robust checks and cleanup
gau-nernst Feb 28, 2025
99a0035
support engines uninstall
gau-nernst Mar 3, 2025
b96fd69
follow reverse proxy example
gau-nernst Mar 3, 2025
62da415
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 3, 2025
e2e2ccc
update uv to 0.6.3
gau-nernst Mar 3, 2025
57c30d3
support engines list
gau-nernst Mar 3, 2025
408d66b
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 4, 2025
5dbb5c5
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 12, 2025
4a4cd1d
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 13, 2025
49df6af
remove checks against supportedEngines
gau-nernst Mar 17, 2025
f1dcdde
remove supportedEngines check for more commands
gau-nernst Mar 17, 2025
1a7576b
Merge branch 'dev' into thien/cli_engines_install
gau-nernst Mar 17, 2025
64124b3
Merge branch 'thien/cli_engines_install' into thien/python_engine
gau-nernst Mar 17, 2025
f030615
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 17, 2025
13652ca
init vllm engine
gau-nernst Mar 17, 2025
4d13014
fix issues with progress streaming
gau-nernst Mar 17, 2025
5c451d8
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 18, 2025
591d461
support download HF model
gau-nernst Mar 18, 2025
c3d41bf
use / for HF model
gau-nernst Mar 18, 2025
dc42ddd
fix thread-unsafe
gau-nernst Mar 18, 2025
13d9e3f
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 18, 2025
70151e2
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 19, 2025
73fe3e5
remove methods
gau-nernst Mar 19, 2025
7bf287d
remove old remnants
gau-nernst Mar 19, 2025
2a2b607
support models list. add --relocatable for venv
gau-nernst Mar 19, 2025
fffc686
preparation works for start model
gau-nernst Mar 19, 2025
cea8020
add sync download util. add vLLM version config. some boilerplate cod…
gau-nernst Mar 19, 2025
86d4c01
list engines
gau-nernst Mar 19, 2025
ec8b36d
load and unload model
gau-nernst Mar 19, 2025
9226110
retrieve cortex port from yaml file
gau-nernst Mar 19, 2025
eeccd3a
add env vars support. log stdout and stderr
gau-nernst Mar 20, 2025
6fe7ae8
add GetModelStatus and GetModels
gau-nernst Mar 20, 2025
074a04a
fix typo
gau-nernst Mar 20, 2025
cd55d64
Merge branch 'dev' into thien/python_engine
gau-nernst Mar 21, 2025
368a4f3
add non-stream chat completions
gau-nernst Mar 21, 2025
c0e0fca
Merge pull request #2186 from menloresearch/s/chore/sync-dev
vansangpfiev Mar 27, 2025
e141891
Merge branch 'main' into thien/python_engine
gau-nernst Apr 1, 2025
807b201
add uninstall cmd
gau-nernst Apr 1, 2025
d38eca8
support streaming
gau-nernst Apr 1, 2025
7e002cd
fix cortex run
gau-nernst Apr 1, 2025
1ebbbdb
wait for vLLM server to be up
gau-nernst Apr 2, 2025
b5d8315
use health check for some stuff
gau-nernst Apr 2, 2025
5feda51
add some notes. support embeddings. support some extra vLLM args
gau-nernst Apr 2, 2025
5eea345
remove old tests. some chores
gau-nernst Apr 2, 2025
2bde26a
remove unused function
gau-nernst Apr 2, 2025
2 changes: 2 additions & 0 deletions engine/CMakeLists.txt
@@ -177,6 +177,8 @@ add_executable(${TARGET_NAME} main.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/file_logger.cc

${CMAKE_CURRENT_SOURCE_DIR}/extensions/template_renderer.cc
${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engines/python_utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/extensions/python-engines/vllm_engine.cc

${CMAKE_CURRENT_SOURCE_DIR}/utils/dylib_path_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc
2 changes: 2 additions & 0 deletions engine/cli/CMakeLists.txt
@@ -74,6 +74,8 @@ add_executable(${TARGET_NAME} main.cc
${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc

${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engines/python_utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/python-engines/vllm_engine.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc

${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc
6 changes: 5 additions & 1 deletion engine/cli/commands/chat_completion_cmd.cc
@@ -137,7 +137,11 @@ void ChatCompletionCmd::Exec(const std::string& host, int port,
new_data["content"] = user_input;
histories_.push_back(std::move(new_data));

Json::Value json_data = mc.ToJson();
// vLLM doesn't support params used in model config
Json::Value json_data;
if (mc.engine != kVllmEngine) {
json_data = mc.ToJson();
}
json_data["engine"] = mc.engine;

Json::Value msgs_array(Json::arrayValue);
41 changes: 27 additions & 14 deletions engine/cli/commands/engine_install_cmd.cc
@@ -7,6 +7,13 @@
#include "utils/string_utils.h"

namespace commands {

// NOTE: should have a single source of truth between CLI and server
static bool NeedCudaDownload(const std::string& engine) {
return !system_info_utils::GetDriverAndCudaVersion().second.empty() &&
engine == kLlamaRepo;
}

bool EngineInstallCmd::Exec(const std::string& engine,
const std::string& version,
const std::string& src) {
@@ -35,15 +42,18 @@ bool EngineInstallCmd::Exec(const std::string& engine,
 if (show_menu_) {
   DownloadProgress dp;
   dp.Connect(host_, port_);
+  bool need_cuda_download = NeedCudaDownload(engine);
   // engine can be small, so need to start ws first
-  auto dp_res = std::async(std::launch::deferred, [&dp] {
-    bool need_cuda_download =
-        !system_info_utils::GetDriverAndCudaVersion().second.empty();
-    if (need_cuda_download) {
+  auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download, engine] {
+    // if (need_cuda_download) {
+    //   return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
+    // } else {
+    //   return dp.Handle({DownloadType::Engine});
+    // }
+    if (engine == kLlamaRepo)
       return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
-    } else {
-      return dp.Handle({DownloadType::Engine});
-    }
+    else
+      return dp.Handle({});
   });

auto releases_url = url_parser::Url{
@@ -151,15 +161,18 @@ bool EngineInstallCmd::Exec(const std::string& engine,
 // default
 DownloadProgress dp;
 dp.Connect(host_, port_);
+bool need_cuda_download = NeedCudaDownload(engine);
 // engine can be small, so need to start ws first
-auto dp_res = std::async(std::launch::deferred, [&dp] {
-  bool need_cuda_download =
-      !system_info_utils::GetDriverAndCudaVersion().second.empty();
-  if (need_cuda_download) {
+auto dp_res = std::async(std::launch::deferred, [&dp, need_cuda_download, engine] {
+  // if (need_cuda_download) {
+  //   return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
+  // } else {
+  //   return dp.Handle({DownloadType::Engine});
+  // }
+  if (engine == kLlamaRepo)
    return dp.Handle({DownloadType::Engine, DownloadType::CudaToolkit});
-  } else {
-    return dp.Handle({DownloadType::Engine});
-  }
+  else
+    return dp.Handle({});
 });

auto install_url = url_parser::Url{
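Both call sites above now duplicate the download-type selection (and carry the old branches as comments). A minimal consolidation sketch, reusing only names from this diff (DownloadType, kLlamaRepo, NeedCudaDownload); the helper itself and its use of std::vector are hypothetical, not part of the PR:

#include <string>
#include <vector>

// Sketch: pick download types once so the menu path and the default path
// cannot drift apart. Assumes dp.Handle() accepts the same list type.
static std::vector<DownloadType> ChooseDownloadTypes(const std::string& engine) {
  if (engine != kLlamaRepo)
    return {};  // python-based engines (e.g. vLLM) are installed via uv
  if (NeedCudaDownload(engine))
    return {DownloadType::Engine, DownloadType::CudaToolkit};
  return {DownloadType::Engine};
}

Each lambda body would then reduce to `return dp.Handle(ChooseDownloadTypes(engine));`.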
8 changes: 6 additions & 2 deletions engine/cli/commands/model_pull_cmd.cc
@@ -67,8 +67,12 @@ std::optional<std::string> ModelPullCmd::Exec(const std::string& host, int port,
auto download_url = res.value()["downloadUrl"].asString();

if (downloaded.empty() && avails.empty()) {
model_id = id;
model = download_url;
if (res.value()["modelSource"].asString() == "huggingface") {
model = id;
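// HF models are addressed by repo id (e.g. "org/model"), not a download URL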
} else {
model_id = id;
model = download_url;
}
} else {
if (is_cortexso) {
auto selection = cli_selection_utils::PrintModelSelection(
17 changes: 12 additions & 5 deletions engine/cli/commands/run_cmd.cc
@@ -84,11 +84,18 @@ void RunCmd::Exec(bool run_detach,
CLI_LOG("Error: " + model_entry.error());
return;
}
yaml_handler.ModelConfigFromFile(
fmu::ToAbsoluteCortexDataPath(
fs::path(model_entry.value().path_to_model_yaml))
.string());
auto mc = yaml_handler.GetModelConfig();

config::ModelConfig mc;
if (model_entry.value().engine == kVllmEngine) {
// vLLM engine doesn't have model config
mc.engine = kVllmEngine;
} else {
yaml_handler.ModelConfigFromFile(
fmu::ToAbsoluteCortexDataPath(
fs::path(model_entry.value().path_to_model_yaml))
.string());
mc = yaml_handler.GetModelConfig();
}

// Check if engine existed. If not, download it
{
54 changes: 16 additions & 38 deletions engine/controllers/models.cc
@@ -28,7 +28,7 @@ void Models::PullModel(const HttpRequestPtr& req,
return;
}

auto model_handle = (*(req->getJsonObject())).get("model", "").asString();
auto model_handle = req->getJsonObject()->get("model", "").asString();
if (model_handle.empty()) {
Json::Value ret;
ret["result"] = "Bad Request";
@@ -39,52 +39,19 @@
}

std::optional<std::string> desired_model_id = std::nullopt;
auto id = (*(req->getJsonObject())).get("id", "").asString();
auto id = req->getJsonObject()->get("id", "").asString();
if (!id.empty()) {
desired_model_id = id;
}

std::optional<std::string> desired_model_name = std::nullopt;
auto name_value = (*(req->getJsonObject())).get("name", "").asString();

auto name_value = req->getJsonObject()->get("name", "").asString();
if (!name_value.empty()) {
desired_model_name = name_value;
}

auto handle_model_input =
[&, model_handle]() -> cpp::result<DownloadTask, std::string> {
CTL_INF("Handle model input, model handle: " + model_handle);
if (string_utils::StartsWith(model_handle, "https")) {
return model_service_->HandleDownloadUrlAsync(
model_handle, desired_model_id, desired_model_name);
} else if (model_handle.find(":") != std::string::npos) {
auto model_and_branch = string_utils::SplitBy(model_handle, ":");
if (model_and_branch.size() == 3) {
auto mh = url_parser::Url{
/* .protocol = */ "https",
/* .host = */ kHuggingFaceHost,
/* .pathParams = */
{
model_and_branch[0],
model_and_branch[1],
"resolve",
"main",
model_and_branch[2],
},
/* queries= */ {},
}
.ToFullPath();
return model_service_->HandleDownloadUrlAsync(mh, desired_model_id,
desired_model_name);
}
return model_service_->DownloadModelFromCortexsoAsync(
model_and_branch[0], model_and_branch[1], desired_model_id);
}

return cpp::fail("Invalid model handle or not supported!");
};

auto result = handle_model_input();
auto result = model_service_->PullModel(model_handle, desired_model_id,
desired_model_name);
if (result.has_error()) {
Json::Value ret;
ret["message"] = result.error();
@@ -213,6 +180,17 @@ void Models::ListModel(
data.append(std::move(obj));
continue;
}

if (model_entry.engine == kVllmEngine) {
Json::Value obj;
obj["id"] = model_entry.model;
obj["model"] = model_entry.model;
obj["engine"] = model_entry.engine;
obj["status"] = "downloaded";
data.append(std::move(obj));
continue;
}

yaml_handler.ModelConfigFromFile(
fmu::ToAbsoluteCortexDataPath(
fs::path(model_entry.path_to_model_yaml))
2 changes: 1 addition & 1 deletion engine/controllers/server.cc
@@ -138,7 +138,7 @@ void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
auto err_or_done = std::make_shared<std::atomic_bool>(false);
auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id](
char* buf,
std::size_t buf_size) -> std::size_t {
std::size_t buf_size) -> std::size_t {
if (buf == nullptr) {
LOG_TRACE << "Buf is null";
if (!(*err_or_done)) {
6 changes: 3 additions & 3 deletions engine/e2e-test/api/engines/test_api_engine.py
@@ -20,12 +20,12 @@ def setup_and_teardown(self):

# Teardown
stop_server()

# engines get
def test_engines_get_llamacpp_should_be_successful(self):
response = requests.get("http://localhost:3928/engines/llama-cpp")
assert response.status_code == 200

# engines install
def test_engines_install_llamacpp_specific_version_and_variant(self):
data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx"}
@@ -40,7 +40,7 @@ def test_engines_install_llamacpp_specific_version_and_null_variant(self):
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
)
assert response.status_code == 200

# engines uninstall
@pytest.mark.asyncio
async def test_engines_install_uninstall_llamacpp_should_be_successful(self):
119 changes: 119 additions & 0 deletions engine/extensions/python-engines/python_utils.cc
@@ -0,0 +1,119 @@
#include "python_utils.h"
#include <filesystem>

#include "utils/archive_utils.h"
#include "utils/curl_utils.h"
#include "utils/file_manager_utils.h"
#include "utils/set_permission_utils.h"
#include "utils/system_info_utils.h"

namespace python_utils {

std::filesystem::path GetPythonEnginesPath() {
return file_manager_utils::GetCortexDataPath() / "python_engines";
}
std::filesystem::path GetEnvsPath() {
return GetPythonEnginesPath() / "envs";
}
std::filesystem::path GetUvPath() {
auto system_info = system_info_utils::GetSystemInfo();
const auto bin_name = system_info->os == kWindowsOs ? "uv.exe" : "uv";
return GetPythonEnginesPath() / "bin" / bin_name;
}
bool UvCleanCache() {
auto cmd = UvBuildCommand("cache");
cmd.push_back("clean");
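  // resulting argv: {<uv path>, "--cache-dir", <cache dir>, "cache", "clean"}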
auto result = cortex::process::SpawnProcess(cmd);
if (result.has_error()) {
CTL_INF(result.error());
return false;
}
return cortex::process::WaitProcess(result.value());
}

bool UvIsInstalled() {
return std::filesystem::exists(GetUvPath());
}
cpp::result<void, std::string> UvInstall() {
const auto py_bin_path = GetPythonEnginesPath() / "bin";
std::filesystem::create_directories(py_bin_path);

// NOTE: do we need a mechanism to update uv, or just pin uv version with cortex release?
const std::string uv_version = "0.6.11";

// build download url based on system info
std::stringstream fname_stream;
fname_stream << "uv-";

auto system_info = system_info_utils::GetSystemInfo();
if (system_info->arch == "amd64")
fname_stream << "x86_64";
else if (system_info->arch == "arm64")
fname_stream << "aarch64";

// NOTE: there is also a musl linux version
if (system_info->os == kMacOs)
fname_stream << "-apple-darwin.tar.gz";
else if (system_info->os == kWindowsOs)
fname_stream << "-pc-windows-msvc.zip";
else if (system_info->os == kLinuxOs)
fname_stream << "-unknown-linux-gnu.tar.gz";

const std::string fname = fname_stream.str();
const std::string base_url =
"https://github.com/astral-sh/uv/releases/download/";

std::stringstream url_stream;
url_stream << base_url << uv_version << "/" << fname;
const std::string url = url_stream.str();
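  // e.g. on x86_64 Linux:
  // https://github.com/astral-sh/uv/releases/download/0.6.11/uv-x86_64-unknown-linux-gnu.tar.gz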
CTL_INF("Download uv from " << url);

const auto save_path = py_bin_path / fname;
auto res = curl_utils::SimpleDownload(url, save_path.string());
if (res.has_error())
return res;

archive_utils::ExtractArchive(save_path, py_bin_path.string(), true);
set_permission_utils::SetExecutePermissionsRecursive(py_bin_path);
std::filesystem::remove(save_path);

// install Python3.10 from Astral. this will be preferred over system
// Python when possible.
// NOTE: currently this will install to a user-wide directory. we can
// install to a specific location using `--install-dir`, but later
// invocation of `uv run` needs to have `UV_PYTHON_INSTALL_DIR` set to use
// this Python installation.
// we can add this once we allow passing custom env var to SpawnProcess().
// https://docs.astral.sh/uv/reference/cli/#uv-python-install
std::vector<std::string> command = UvBuildCommand("python");
command.push_back("install");
command.push_back("3.10");

auto result = cortex::process::SpawnProcess(command);
if (result.has_error())
return cpp::fail(result.error());

if (!cortex::process::WaitProcess(result.value())) {
const auto msg = "Process spawned but failed to wait";
CTL_ERR(msg);
return cpp::fail(msg);
}

return {};
}

std::vector<std::string> UvBuildCommand(const std::string& action,
const std::string& directory) {
// use our own cache dir so that when users delete cortexcpp/, everything is deleted.
const auto cache_dir = GetPythonEnginesPath() / "cache" / "uv";
std::vector<std::string> command = {GetUvPath().string(), "--cache-dir",
cache_dir.string()};
if (!directory.empty()) {
command.push_back("--directory");
command.push_back(directory);
}
command.push_back(action);
return command;
}
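
// Example: UvBuildCommand("sync", "/path/to/envs/my-model") yields
// {<uv path>, "--cache-dir", <cache dir>, "--directory",
//  "/path/to/envs/my-model", "sync"}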

} // namespace python_utils
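
The NOTE inside UvInstall() flags a follow-up: pinning uv's Python installs inside cortex's data directory once SpawnProcess() can pass environment variables. A minimal sketch of that follow-up, assuming a SpawnProcess(command, env) overload that does not exist yet (`--install-dir` and `UV_PYTHON_INSTALL_DIR` are the uv options the NOTE itself references):

#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical sketch, not part of this PR: pin uv's Python installs inside
// python_engines/ so deleting cortexcpp/ removes them too.
cpp::result<void, std::string> UvInstallPythonPinned() {
  const auto py_dir = (GetPythonEnginesPath() / "python").string();

  auto command = UvBuildCommand("python");
  command.push_back("install");
  command.push_back("3.10");
  command.push_back("--install-dir");  // per the NOTE in UvInstall()
  command.push_back(py_dir);

  // Later `uv run` invocations would need this env var to find the install.
  std::unordered_map<std::string, std::string> env{
      {"UV_PYTHON_INSTALL_DIR", py_dir}};

  // Assumed overload; today's SpawnProcess() takes no env parameter.
  // auto proc = cortex::process::SpawnProcess(command, env);
  return {};
}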