diff --git a/docs/api/api-endpoints.md b/docs/api/api-endpoints.md index 42361511..0c2c79f4 100644 --- a/docs/api/api-endpoints.md +++ b/docs/api/api-endpoints.md @@ -1,6 +1,6 @@ --- -title: "API Endpoints" -description: "Unlock the power of RunPod's API Endpoints, manage models without managing pods, and retrieve results via the status endpoint within 30 minutes for privacy protection; rate limits enforced per user." +title: "API endpoints" +description: "Unlock the power of RunPod's API endpoints, manage models without managing pods, and retrieve results via the status endpoint within 30 minutes for privacy protection; rate limits enforced per user." sidebar_position: 1 --- @@ -13,8 +13,8 @@ We don't keep your inputs or outputs longer than that to protect your privacy! ::: -API Endpoints are Endpoints managed by RunPod that you can use to interact with your favorite models without managing the pods yourself. -These Endpoints are available to all users. +API endpoints are endpoints managed by RunPod that you can use to interact with your favorite models without managing the pods yourself. +These endpoints are available to all users. ## Overview @@ -22,13 +22,13 @@ The API Endpoint implementation works asynchronously as well as synchronous. Let's take a look at the differences between the two different implementations. -### Asynchronous Endpoints +### Asynchronous endpoints Asynchronous endpoints are useful for long-running jobs that you don't want to wait for. You can submit a job and then check back later to see if it's done. When you fire an Asynchronous request with the API Endpoint, your input parameters are sent to our endpoint and you immediately get a response with a unique job ID. You can then query the response by passing the job ID to the status endpoint. The status endpoint will give you the job results when completed. -### Synchronous Endpoints +### Synchronous endpoints Synchronous endpoints are useful for short-running jobs that you want to wait for. You can submit a job and get the results back immediately. @@ -137,4 +137,4 @@ Exceeding limits returns a `429` error. `/run` - 1000 requests/10s, max 200 concurrent `/runsync` - 2000 requests/10s, max 400 concurrent -For more information, see [Job operations](/serverless/references/operations). +For more information, see [Job operations](/serverless/endpoints/operations). diff --git a/docs/glossary.md b/docs/glossary.md index 4392e788..f6bf0a8c 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -19,77 +19,15 @@ A [worker](./serverless/workers/overview.md) is a single compute resource that p ## Endpoint -An Endpoint refers to a specific REST API (URL) provided by RunPod that your applications or services can interact with. These endpoints enable standard functionality for submitting jobs and retrieving their outputs. +An endpoint refers to a specific REST API (URL) provided by RunPod that your applications or services can interact with. These endpoints enable standard functionality for submitting jobs and retrieving their outputs. ## Handler -A Handler is a function you create that takes in submitted inputs, processes them (like generating images, text, or audio), and returns the final output. +A handler is a function you create that takes in submitted inputs, processes them (like generating images, text, or audio), and returns the final output. -## Serverless [SDK](https://github.com/runpod/runpod-python?tab=readme-ov-file#--serverless-worker-sdk) +## Serverless SDK -A Python package used when creating a handler function. 
This package helps your code receive requests from our serverless system, triggers your handler function to execute, and returns the function’s result back to the serverless system. - -## Endpoint Settings - -### Idle Timeout - -The amount of time a worker remains running after completing its current request. During this period, the worker stays active, continuously checking the queue for new jobs, and continues to incur charges. If no new requests arrive within this time, the worker will go to sleep. - -Default: 5 seconds - -### Execution Timeout - -The maximum time a job can run before the system terminates the worker. This prevents “bad” jobs from running indefinitely and draining your credit. - -You can disable this setting, but we highly recommend keeping it enabled. The default maximum value is 24 hours, but if you need a longer duration, you can use job TTL to override it. - -Default: 600 seconds (10 minutes) - -### Job [TTL](/serverless/endpoints/send-requests#execution-policies)(Time-To-Live) - -Defines the maximum time a job can remain in the queue before it's automatically terminated. This parameter ensures that jobs don't stay in the queue indefinitely. You should set this if your job runs longer than 24 hours or if you want to remove job data as soon as it is finished. - -Minimum value: 10,000 milliseconds (10 seconds) -Default value: 86,400,000 milliseconds (24 hours) - -### Flashboot - -FlashBoot is RunPod’s magic solution for reducing the average cold-start times on your endpoint. It works probabilistically. When your endpoint has consistent traffic, your workers have a higher chance of benefiting from FlashBoot for faster spin-ups. However, if your endpoint isn’t receiving frequent requests, FlashBoot has fewer opportunities to optimize performance. There’s no additional cost associated with FlashBoot. - -### Scale Type - -- Queue Delay scaling strategy adjusts worker numbers based on request wait times. With zero workers initially, the first request adds one worker. Subsequent requests add workers only after waiting in the queue for the defined number of delay seconds. -- Request Count scaling strategy adjusts worker numbers according to total requests in the queue and in progress. It automatically adds workers as the number of requests increases, ensuring tasks are handled efficiently. - -### Expose HTTP/TCP Ports - -We allow direct communication with your worker using its public IP and port. This is especially useful for real-time applications that require minimal latency. Check out this [WebSocket example](https://github.com/runpod-workers/worker-websocket) to see how it works! - -## Endpoint Metrics - -### Requests - -Displays the total number of requests received by your endpoint, along with the number of completed, failed, and retried requests. - -### Execution Time - -Displays the P70, P90, and P98 execution times for requests on your endpoint. These percentiles help analyze execution time distribution and identify potential performance bottlenecks. - -### Delay Time - -Delay time is the duration a request spends waiting in the queue before being picked up by a worker. Displays the P70, P90, and P98 delay times for requests on your endpoint. These percentiles help assess whether your endpoint is scaling efficiently. - -### Cold Start Time - -Cold start time measures how long it takes to wake up a worker. This includes the time needed to start the container, load the model into GPU VRAM, and get the worker ready to process a job. 
Displays the P70, P90, and P98 cold start times for your endpoint. - -### Cold Start Count - -Displays the number of cold starts your endpoint has during a given period. The fewer, the better, as fewer cold starts mean faster response times. - -### WebhookRequest Responses - -Displays the number of webhook requests sent and their corresponding responses, including success and failure counts. +The [Serverless SDK](https://github.com/runpod/runpod-python) is a Python package used when creating a handler function. This package helps your code receive requests from our serverless system, triggers your handler function to execute, and returns the function's result back to the serverless system. # Pod @@ -101,13 +39,14 @@ GPU instances that run in T3/T4 data centers, providing high reliability and sec GPU instances connect individual compute providers to consumers through a vetted, secure peer-to-peer system. -## Datacenter +## Data center + +A data center is a secure location where RunPod's cloud computing services, such as GPU instances and storage instances, are hosted. These data centers are equipped with redundant power, multiple ISP connections, and data backups to ensure the safety and reliability of your compute services and data. -A data center is a secure location where RunPod's cloud computing services, such as Secure Cloud and GPU Instances, are hosted. These data centers are equipped with redundancy and data backups to ensure the safety and reliability of your data. +## GPU instance -## GPU Instance +A GPU instance is a container-based compute resource that you can deploy. -GPU Instance is a container-based GPU instance that you can deploy. These instances spin up in seconds using both public and private repositories. They are available in two different types: diff --git a/docs/hosting/burn-testing.md b/docs/hosting/burn-testing.md index 942e76e3..13e53791 100644 --- a/docs/hosting/burn-testing.md +++ b/docs/hosting/burn-testing.md @@ -1,5 +1,5 @@ --- -title: "Burn Testing" +title: "Burn testing" description: "Before listing a machine on the RunPod platform, thoroughly test it with a burn test, verifying memory, CPU, and disk capabilities, and ensure compatibility with popular templates by self-renting the machine after verifying its performance." 
--- diff --git a/docs/hosting/partner-requirements.md b/docs/hosting/partner-requirements.md index c9ec7d2b..c73634b1 100644 --- a/docs/hosting/partner-requirements.md +++ b/docs/hosting/partner-requirements.md @@ -1,4 +1,8 @@ -# RunPod Secure Cloud Partner Requirements - Release 2025 +--- +title: "Secure Cloud partner requirements" +--- + +# RunPod Secure Cloud partner requirements (2025) # Introduction diff --git a/docs/integrations/mods/mods.md b/docs/integrations/mods/mods.md index f9a0bc1d..c32a4104 100644 --- a/docs/integrations/mods/mods.md +++ b/docs/integrations/mods/mods.md @@ -29,7 +29,7 @@ To start using Mods, follow these step-by-step instructions: ```yml runpod: - # https://docs.runpod.io/serverless/workers/vllm/openai-compatibility + # https://docs.runpod.io/serverless/vllm/openai-compatibility base-url: https://api.runpod.ai/v2/${YOUR_ENDPOINT}/openai/v1 api-key: api-key-env: RUNPOD_API_KEY diff --git a/docs/overview.md b/docs/overview.md index b49f6a1e..13046efe 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -30,7 +30,7 @@ Use Serverless to: ### Get started with Serverless - [Build your first Serverless app](/serverless/get-started) -- [Run any LLM as an endpoint using vLLM workers](/serverless/workers/vllm/get-started) +- [Run any LLM as an endpoint using vLLM workers](/serverless/vllm/get-started) - [Tutorial: Create a Serverless endpoint with Stable Diffusion](/tutorials/serverless/gpu/run-your-first) ## Pods diff --git a/docs/references/troubleshooting/storage-full.md b/docs/references/troubleshooting/storage-full.md index bd3fc477..9cbd717f 100644 --- a/docs/references/troubleshooting/storage-full.md +++ b/docs/references/troubleshooting/storage-full.md @@ -1,12 +1,12 @@ --- -title: "Storage Full" +title: "Storage full" id: "storage-full" description: "This document provides guidance to troubleshoot the storage full, which may occur when users generate many files, transfer files, or perform other storage-intensive tasks." --- Storage full can occur when users generate many files, transfer files, or perform other storage-intensive tasks. This document provides guidance to help you troubleshoot this. -## Check Disk Usage +## Check disk usage When encountering a storage full, the first step is to check your container’s disk usage. You can use the `df -h` command to display a summary of disk usage. @@ -34,7 +34,7 @@ tmpfs 252G 0 252G 0% /sys/firmware tmpfs 252G 0 252G 0% /sys/devices/virtual/powercap ``` -## Key Areas to Check +## Key areas to check **Container Disk Usage**: The primary storage area for your container is mounted on the `overlay` filesystem. This indicates the container’s root directory. @@ -62,7 +62,7 @@ root@9b8e325167b2:/# find /workspace -type f -exec du -h {} + | sort -rh | head 512 /workspace/a.txt ``` -## Removing Files and Directories +## Removing files and directories Once you’ve identified large files or directories that are no longer needed, you can remove them to free up space. 
diff --git a/docs/references/troubleshooting/troubleshooting-502-errors.md b/docs/references/troubleshooting/troubleshooting-502-errors.md index 254ec98a..e0d65e60 100644 --- a/docs/references/troubleshooting/troubleshooting-502-errors.md +++ b/docs/references/troubleshooting/troubleshooting-502-errors.md @@ -1,12 +1,12 @@ --- -title: "502 Errors" +title: "502 errors" id: "troubleshooting-502-errors" description: "Troubleshoot 502 errors in your deployed pod by checking GPU attachment, pod logs, and official template instructions to resolve issues and enable seamless access." --- 502 errors can occur when users attempt to access a program running on a specific port of a deployed pod and the program isn't running or has encountered an error. This document provides guidance to help you troubleshoot this error. -### Check Your Pod's GPU +### Check your Pod's GPU The first step to troubleshooting a 502 error is to check whether your pod has a GPU attached. @@ -18,7 +18,7 @@ If a GPU is attached, you will see it under the Pods screen (e.g. 1 x A6000). If ![](/img/docs/fb4c0dd-image.png) -### Check Your Pod's Logs +### Check your Pod's logs After confirming that your pod has a GPU attached, the next step is to check your pod's logs for any errors. @@ -27,7 +27,7 @@ After confirming that your pod has a GPU attached, the next step is to check you 2. ![](/img/docs/3500eba-image.png)\ **Look for errors**: Browse through the logs to find any error messages that may provide clues about why you're experiencing a 502 error. -### Verify Additional Steps for Official Templates +### Verify additional steps for official templates In some cases, for our official templates, the user interface does not work right away and may require additional steps to be performed by the user. diff --git a/docs/sdks/javascript/endpoints.md b/docs/sdks/javascript/endpoints.md index f2d78f78..5826ad6b 100644 --- a/docs/sdks/javascript/endpoints.md +++ b/docs/sdks/javascript/endpoints.md @@ -626,7 +626,7 @@ console.log(result); -For more information, see [Execution policy](/serverless/endpoints/job-operations). +For more information, see [Execution policy](/serverless/endpoints/operations). ## Purge queue diff --git a/docs/serverless/workers/development/_category_.json b/docs/serverless/development/_category_.json similarity index 88% rename from docs/serverless/workers/development/_category_.json rename to docs/serverless/development/_category_.json index 297d4a64..796ac3ec 100644 --- a/docs/serverless/workers/development/_category_.json +++ b/docs/serverless/development/_category_.json @@ -1,6 +1,6 @@ { "label": "Development", - "position": 6, + "position": 10, "link": { "type": "generated-index", "description": "Learn to develop your application." 
diff --git a/docs/serverless/workers/development/cleanup.md b/docs/serverless/development/cleanup.md similarity index 100% rename from docs/serverless/workers/development/cleanup.md rename to docs/serverless/development/cleanup.md diff --git a/docs/serverless/workers/development/concurrency.md b/docs/serverless/development/concurrency.md similarity index 100% rename from docs/serverless/workers/development/concurrency.md rename to docs/serverless/development/concurrency.md diff --git a/docs/serverless/workers/development/debugger.md b/docs/serverless/development/debugger.md similarity index 100% rename from docs/serverless/workers/development/debugger.md rename to docs/serverless/development/debugger.md diff --git a/docs/serverless/workers/development/environment-variables.md b/docs/serverless/development/environment-variables.md similarity index 100% rename from docs/serverless/workers/development/environment-variables.md rename to docs/serverless/development/environment-variables.md diff --git a/docs/serverless/workers/development/local-testing.md b/docs/serverless/development/local-testing.md similarity index 100% rename from docs/serverless/workers/development/local-testing.md rename to docs/serverless/development/local-testing.md diff --git a/docs/serverless/workers/development/overview.md b/docs/serverless/development/overview.md similarity index 93% rename from docs/serverless/workers/development/overview.md rename to docs/serverless/development/overview.md index 30f212d7..34883a1c 100644 --- a/docs/serverless/workers/development/overview.md +++ b/docs/serverless/development/overview.md @@ -1,11 +1,11 @@ --- -title: "Local Server Flags" +title: "Local server flags" description: "A comprehensive guide to all flags available when starting your RunPod local server for endpoint testing" sidebar_position: 1 --- -When developing RunPod serverless functions, it's crucial to test them thoroughly before deployment. -The RunPod SDK provides a powerful local testing environment that allows you to simulate your serverless endpoints right on your development machine. +When developing RunPod Serverless functions, it's crucial to test them thoroughly before deployment. +The RunPod SDK provides a powerful local testing environment that allows you to simulate your Serverless endpoints right on your development machine. This local server eliminates the need for constant Docker container rebuilds, uploads, and endpoint updates during the development and testing phase. To facilitate this local testing environment, the RunPod SDK offers a variety of flags that allow you to customize your setup. @@ -20,7 +20,7 @@ By using these flags, you can create a local environment that closely mimics the This guide provides a comprehensive overview of all available flags, their purposes, and how to use them effectively in your local testing workflow. -## Basic Usage +## Basic usage To start your local server with additional flags, use the following format: @@ -30,7 +30,7 @@ python your_function.py [flags] Replace `your_function.py` with the name of your Python file containing the RunPod handler. -## Available Flags +## Available flags ### --rp_serve_api @@ -138,6 +138,6 @@ python main.py --rp_serve_api \ This command starts the local server on port `8080` with 4 concurrent workers, sets the log level to `DEBUG`, and provides test input data. -These flags provide powerful tools for customizing your local testing environment. 
By using them effectively, you can simulate various scenarios, debug issues, and ensure your serverless functions are robust and ready for deployment to the RunPod cloud. +These flags provide powerful tools for customizing your local testing environment. By using them effectively, you can simulate various scenarios, debug issues, and ensure your Serverless functions are robust and ready for deployment to the RunPod cloud. For more detailed information on each flag and advanced usage scenarios, refer to the individual tutorials in this documentation. diff --git a/docs/serverless/workers/development/test-response-times.md b/docs/serverless/development/test-response-times.md similarity index 100% rename from docs/serverless/workers/development/test-response-times.md rename to docs/serverless/development/test-response-times.md diff --git a/docs/serverless/workers/development/validator.md b/docs/serverless/development/validator.md similarity index 100% rename from docs/serverless/workers/development/validator.md rename to docs/serverless/development/validator.md diff --git a/docs/serverless/endpoints/_category_.json b/docs/serverless/endpoints/_category_.json index 96c5162c..00abeee5 100644 --- a/docs/serverless/endpoints/_category_.json +++ b/docs/serverless/endpoints/_category_.json @@ -1,6 +1,6 @@ { "label": "Endpoints", - "position": 5, + "position": 6, "link": { "type": "generated-index", "description": "Learn how to customize the serverless functions used by in your applications." diff --git a/docs/serverless/endpoints/endpoint-configurations.md b/docs/serverless/endpoints/endpoint-configurations.md new file mode 100644 index 00000000..06f85975 --- /dev/null +++ b/docs/serverless/endpoints/endpoint-configurations.md @@ -0,0 +1,134 @@ +--- +title: "Endpoint configurations" +sidebar_position: 8 +description: Configure your endpoint settings to optimize performance and cost, including GPU selection, worker count, idle timeout, and advanced options like data centers, network volumes, and scaling strategies. +--- + +This guide explains all configurable settings for RunPod Serverless endpoints, helping you optimize for performance, cost, and reliability. + +## Basic configurations + +### Endpoint name + +The name you assign to your endpoint for easy identification in your dashboard. This name is only visible to you and doesn't affect the endpoint ID used for API calls. + +### GPU selection + +Choose one or more GPU types for your endpoint in order of preference. RunPod prioritizes allocating the first GPU type in your list and falls back to subsequent GPU types if your first choice is unavailable. Selecting multiple GPU types improves availability, especially for high-demand GPUs. + +### Worker configuration + +#### Active (min) workers + +Sets the minimum number of workers that remain running at all times. Setting this at one or higher eliminates cold start delays for faster response times. Active workers incur charges immediately, but receive up to 30% discount from regular pricing. + +Default: 0 + +#### Max workers + +The maximum number of concurrent workers your endpoint can scale to. + +Default: 3 + +:::tip + +We recommend that you set this value 20% higher than your expected maximum concurrency. If requests are frequently throttled, consider increasing this value to 5 or more. + +::: + +#### GPUs per worker + +The number of GPUs assigned to each worker instance. 
+
+Default: 1
+
+### Timeout settings
+
+#### Idle timeout
+
+The amount of time that a worker continues running after completing a request. You’re still charged for this time, even if the worker isn’t actively processing any requests.
+
+By default, the idle timeout is set to 5 seconds to help avoid frequent start/stop cycles and reduce the likelihood of cold starts. Setting a longer idle timeout can help minimize cold starts for intermittent traffic, but it may also increase your costs.
+
+#### Execution timeout
+
+The maximum time a job can run before automatic termination. This prevents runaway jobs from consuming excessive resources. You can turn off this setting, but we highly recommend keeping it on.
+
+Default: 600 seconds (10 minutes)
+Maximum: 24 hours (can be extended using job TTL)
+
+#### Job TTL (time-to-live)
+
+The maximum time a job remains in the queue before automatic termination.
+
+Default: 86,400,000 milliseconds (24 hours)
+Minimum: 10,000 milliseconds (10 seconds)
+
+See [Execution policies](/serverless/endpoints/send-requests#execution-policies) for more information.
+
+:::tip
+
+You can use the `/status` operation to configure the time-to-live (TTL) for an individual job by appending a TTL parameter when checking the status of a job. For example, `https://api.runpod.ai/v2/{endpoint_id}/status/{job_id}?ttl=6000` sets the TTL for the job to 6 seconds. Use this when you want to tell the system to remove a job result sooner than the default retention time.
+
+:::
+
+### FlashBoot
+
+FlashBoot is RunPod's solution for reducing the average cold-start times on your endpoint. It works probabilistically. When your endpoint has consistent traffic, your workers have a higher chance of benefiting from FlashBoot for faster spin-ups. However, if your endpoint isn't receiving frequent requests, FlashBoot has fewer opportunities to optimize performance. There is no additional cost associated with FlashBoot.
+
+## Advanced configurations
+
+### Data centers
+
+Control which data centers can deploy and cache your workers. Allowing multiple data centers improves availability, while using a network volume restricts your endpoint to a single data center.
+
+Default: All data centers
+
+### Network volumes
+
+Attach persistent storage to your workers. Network volumes have higher latency than local storage, and restrict workers to the data center containing your volume. However, they're very useful for sharing large models or data between workers on an endpoint.
+
+See [Create a network volume](/pods/storage/create-network-volumes) for more information.
+
+### Auto-scaling type
+
+#### Queue delay
+
+Adds workers based on request wait times.
+
+The queue delay scaling strategy adjusts worker numbers based on request wait times. Workers are added if requests spend more than X seconds in the queue, where X is a threshold you define. By default, this threshold is set at 4 seconds.
+
+#### Request count
+
+The request count scaling strategy adjusts worker numbers according to total requests in the queue and in progress. It automatically adds workers as the number of requests increases, ensuring tasks are handled efficiently.
+
+Total workers formula: `Math.ceil((requestsInQueue + requestsInProgress) / 4)`
+
+### Expose HTTP/TCP ports
+
+Enables direct communication with your worker via its public IP and port. This can be useful for real-time applications requiring minimal latency, such as [WebSocket applications](https://github.com/runpod-workers/worker-websocket).
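+As a minimal sketch, the snippet below connects to a worker over an exposed port. It assumes your worker runs a WebSocket server similar to the worker-websocket example linked above; the IP address, port, and message format are placeholders for the connection details shown on your worker, not an official RunPod client API.
+
+```python
+import asyncio
+
+import websockets  # pip install websockets
+
+
+async def main():
+    # Placeholder values: use the public IP and external port displayed for your worker.
+    uri = "ws://203.0.113.7:60123"
+    async with websockets.connect(uri) as ws:
+        await ws.send("ping")      # send a message to the worker
+        reply = await ws.recv()    # wait for the worker's response
+        print(f"Worker replied: {reply}")
+
+
+asyncio.run(main())
+```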
+
+### Enabled GPU types
+
+Here you can specify which [GPU types](/references/gpu-types) to use within your selected GPU size categories. By default, all GPU types are enabled.
+
+### CUDA version selection
+
+Specify which CUDA versions can be used with your workload to ensure your code runs on compatible GPU hardware. RunPod will match your workload to GPU instances with the selected CUDA versions.
+
+:::tip
+
+CUDA is generally backward compatible, so we recommend that you check for the version you need and any higher versions. For example, if your code requires CUDA 12.4, you should also try running it on 12.5, 12.6, and so on.
+
+Limiting your endpoint to just one or two CUDA versions can significantly reduce GPU availability. RunPod continuously updates GPU drivers to support the latest CUDA versions, so keeping more CUDA versions selected gives you access to more resources.
+
+:::
+
+## Best practices
+
+- **Start conservative** with max workers and scale up as needed.
+- **Monitor throttling** and adjust max workers accordingly.
+- **Use active workers** for latency-sensitive applications.
+- **Select multiple GPU types** to improve availability.
+- **Choose appropriate timeouts** based on your workload characteristics.
+- **Consider data locality** when using network volumes.
diff --git a/docs/serverless/endpoints/get-started.md b/docs/serverless/endpoints/get-started.md
deleted file mode 100644
index 36f24068..00000000
--- a/docs/serverless/endpoints/get-started.md
+++ /dev/null
@@ -1,69 +0,0 @@
----
-title: Get started
-sidebar_position: 2
-description: "Learn how to test your deployed Endpoint with a sample request, view the response, and send requests using cURL or an HTTP client, then customize your Handler Function for more control over your API."
----
-
-Now that your Endpoint is deployed, send a test.
-This is a great way to test your Endpoint before sending a request from your application.
-
-## Send a Request
-
-1. From the Endpoint's page, select **Requests**.
-2. Choose **Run**.
-3. You should see a successful response with the following:
-
-```json
-{
-  "id": "6de99fd1-4474-4565-9243-694ffeb65218-u1",
-  "status": "IN_QUEUE"
-}
-```
-
-After a few minutes, the stream will show the full response.
-
-You can now begin sending requests to your Endpoint from your terminal and an application.
-
-## Send a request using cURL
-
-Once your Endpoint is deployed, you can send a request.
-This example sends a response to the Endpoint using cURL; however, you can use any HTTP client.
-
-```curl
-curl --request POST \
-     --url https://api.runpod.ai/v2/${endpoint_id}/runsync
-     --header "accept: application/json" \
-     --header "authorization: ${YOUR_API_KEY}" \
-     --header "content-type: application/json" \
-     --data '
-{
-  "input": {
-    "prompt": "A coffee cup.",
-    "height": 512,
-    "width": 512,
-    "num_outputs": 1,
-    "num_inference_steps": 50,
-    "guidance_scale": 7.5,
-    "scheduler": "KLMS"
-  }
-}
-'
-```
-
-Where `endpoint_id` is the name of your Endpoint and `YOUR_API_KEY` is your API Key.
-
-:::note
-
-Depending on any modifications you made to your Handler Function, you may need to modify the request.
- -::: - -## Next steps - -Now that you have successfully launched an endpoint using a template, you can: - -- [Invoke jobs](/serverless/endpoints/job-operations) - -If the models provided aren't enough, you can write your own customize Function Handler: - -- [Customize the Handler Function](/serverless/workers/handlers/overview) diff --git a/docs/serverless/endpoints/job-operations.md b/docs/serverless/endpoints/job-operations.md deleted file mode 100644 index 6316a951..00000000 --- a/docs/serverless/endpoints/job-operations.md +++ /dev/null @@ -1,337 +0,0 @@ ---- -title: Job operations -description: "Learn how to use the Runpod Endpoint to manage job operations, including running, checking status, purging queues, and streaming results, with cURL and SDK examples." -sidebar_position: 2 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -This page provides instructions on job operations using the Runpod Endpoint. -You can invoke a job to run Endpoints the way you would interact with an API, get a status of a job, purge your job queue, and more with operations. - -The following guide demonstrates how to use cURL to interact with an Endpoint. -You can also use the following SDK to interact with Endpoints programmatically: - -- [Python SDK](/sdks/python/endpoints) - -For information on sending requests, see [Send a request](/serverless/endpoints/send-requests). - -## Asynchronous Endpoints - -Asynchronous endpoints are designed for long-running tasks. When you submit a job through these endpoints, you receive a Job ID in response. -You can use this Job ID to check the status of your job at a later time, allowing your application to continue processing without waiting for the job to complete immediately. -This approach is particularly useful for tasks that require significant processing time or when you want to manage multiple jobs concurrently. - - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/run \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer ${API_KEY}' \ - -d '{"input": {"prompt": "Your prompt"}}' -``` - - - - - -```json -{ - "id": "eaebd6e7-6a92-4bb8-a911-f996ac5ea99d", - "status": "IN_QUEUE" -} -``` - - - - -## Synchronous Endpoints - -Synchronous endpoints are ideal for short-lived tasks where immediate results are necessary. -Unlike asynchronous calls, synchronous endpoints wait for the job to complete and return the result directly in the response. -This method is suitable for operations that are expected to complete quickly and where the client can afford to wait for the result. - - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/runsync \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer ${API_KEY}' \ - -d '{"input": {"prompt": "Your prompt"}}' -``` - - - - -```json -{ - "delayTime": 824, - "executionTime": 3391, - "id": "sync-79164ff4-d212-44bc-9fe3-389e199a5c15", - "output": [ - { - "image": "https://image.url", - "seed": 46578 - } - ], - "status": "COMPLETED" -} -``` - - - -## Health Endpoint - -The `/health` endpoint provides insights into the operational status of the endpoint, including the number of workers available and job statistics. -This information can be used to monitor the health and performance of the API, helping you manage workload and troubleshoot issues more effectively. 
- - - - -```bash -curl --request GET \ - --url https://api.runpod.ai/v2/{endpoint_id}/health \ - --header 'accept: application/json' \ - --header 'Authorization: Bearer ${API_KEY}' -``` - - - - - -```json -{ - "jobs": { - "completed": 1, - "failed": 5, - "inProgress": 0, - "inQueue": 2, - "retried": 0 - }, - "workers": { - "idle": 0, - "running": 0 - } -} -``` - - - - -## Cancel Job - -To cancel a job in progress, specify the `cancel` parameter with the endpoint ID and the job ID. - - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/cancel/{job_id} \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer ${API_KEY}' -``` - - - - - -```json -{ - "id": "724907fe-7bcc-4e42-998d-52cb93e1421f-u1", - "status": "CANCELLED" -} -``` - - - - -## Purge Queue Endpoint - -The `/purge-queue` endpoint allows you to clear all jobs that are currently in the queue. -This operation does not affect jobs that are already in progress. -It is a useful tool for managing your job queue, especially in situations where you need to reset or clear pending tasks due to operational changes or errors. - - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/purge-queue \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer ${API_KEY}' -``` - - - - - -```json -{ - "removed": 2, - "status": "completed" -} -``` - - - - -## Check Job Status - -To track the progress or result of an asynchronous job, you can check its status using the Job ID. -This endpoint provides detailed information about the job, including its current status, execution time, and the output if the job has completed. - - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/status/{job_id} \ - -H 'Authorization: Bearer ${API_KEY}' -``` - - - - - -```json -{ - "delayTime": 31618, - "executionTime": 1437, - "id": "60902e6c-08a1-426e-9cb9-9eaec90f5e2b-u1", - "output": { - "input_tokens": 22, - "output_tokens": 16, - "text": ["Hello! How can I assist you today?\nUSER: I'm having"] - }, - "status": "COMPLETED" -} -``` - - - - -## Retry a job - -To retry a job that has failed or encountered an error, send a POST request to `/retry/{job_id}` with the Job ID. -The system will automatically requeue and retry the job. - -- You can retry any job with a `FAILED` or `TIMED_OUT` status, as long as the job hasn’t expired. -- Jobs submitted via `/run` expire 30 minutes after completion. -- Jobs submitted via `/runsync` expire 1 minute after completion. -- When a job is retried, the previous output is removed. If you call `/status` right after the retry, it will return no output until the new job run is complete. - - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/retry/{job_id} \ - -H 'Authorization: Bearer ${API_KEY}' -``` - - - - - -```json -{ - "id": "60902e6c-08a1-426e-9cb9-9eaec90f5e2b-u1", - "status": "IN_QUEUE" -} -``` - - - - -## Stream results - -For jobs that produce output incrementally, the stream endpoint allows you to receive results as they are generated. -This is particularly useful for tasks that involve continuous data processing or where immediate partial results are beneficial. 
- - - - -```bash -curl -X POST https://api.runpod.ai/v2/{endpoint_id}/stream/{job_id} \ - -H 'Content-Type: application/json' \ - -H 'Authorization: Bearer ${API_KEY}' -``` - - - - - -```json -[ - { - "metrics": { - "avg_gen_throughput": 0, - "avg_prompt_throughput": 0, - "cpu_kv_cache_usage": 0, - "gpu_kv_cache_usage": 0.0016722408026755853, - "input_tokens": 0, - "output_tokens": 1, - "pending": 0, - "running": 1, - "scenario": "stream", - "stream_index": 2, - "swapped": 0 - }, - "output": { - "input_tokens": 0, - "output_tokens": 1, - "text": [" How"] - } - } - // omitted for brevity -] -``` - - - - -:::note - -The maximum size for a payload that can be sent using yield to stream results is 1 MB. - -::: - -## Rate Limits - -RunPod's Endpoints facilitate submitting jobs and retrieving outputs. -Access these endpoints at: `https://api.runpod.ai/v2/{endpoint_id}/{operation}` - -- `/run` - - - 1000 requests per 10 seconds, 200 concurrent - -- `/runsync` - - - 2000 requests per 10 seconds, 400 concurrent - -- `/status`, `/status-sync`, `/stream` - - - 2000 requests per 10 seconds, 400 concurrent - -- `/cancel` - - - 100 requests per 10 seconds, 20 concurrent - -- `/purge-queue` - - - 2 requests per 10 seconds - -- `/openai/*` - - - 2000 requests per 10 seconds, 400 concurrent - -- `/requests` - - 10 requests per 10 seconds, 2 concurrent - -:::note - -Retrieve results from `/status` within 30 minutes for privacy protection. - -::: - -For reference information on Endpoints, see [Endpoint Operations](/serverless/references/operations.md). diff --git a/docs/serverless/endpoints/job-states.md b/docs/serverless/endpoints/job-states.md new file mode 100644 index 00000000..9018d904 --- /dev/null +++ b/docs/serverless/endpoints/job-states.md @@ -0,0 +1,30 @@ +--- +title: "Job states and metrics" +description: "Learn how to monitor your RunPod endpoints, including job states, performance metrics, and system health indicators to effectively manage and optimize your serverless workloads." +sidebar_position: 9 +--- + +Understanding job states and metrics is essential for effectively managing your Serverless endpoints. This documentation covers the different states your jobs can be in and the key metrics available to monitor endpoint performance and health. + +## Request job states + +Understanding job states helps you track the progress of individual requests and identify where potential issues might occur in your workflow. + +- `IN_QUEUE`: The job is waiting in the endpoint queue for an available worker to process it. +- `RUNNING`: A worker has picked up the job and is actively processing it. +- `COMPLETED`: The job has finished processing successfully and returned a result. +- `FAILED`: The job encountered an error during execution and did not complete successfully. +- `CANCELLED`: The job was manually cancelled using the `/cancel/job_id` endpoint before completion. +- `TIMED_OUT`: The job either expired before it was picked up by a worker or the worker failed to report back before reaching the timeout threshold. + +## Endpoint metrics + +You can find endpoint metrics in the **Metrics** tab of the Serverless endpoint details page in the [RunPod web interface](https://www.runpod.io/console/serverless). + +- **Requests**: Displays the total number of requests received by your endpoint, along with the number of completed, failed, and retried requests. +- **Execution time**: Displays the P70, P90, and P98 execution times for requests on your endpoint. 
These percentiles help analyze execution time distribution and identify potential performance bottlenecks. +- **Delay time**: Delay time is the duration a request spends waiting in the queue before it is picked up by a worker. Displays the P70, P90, and P98 delay times for requests on your endpoint. These percentiles help assess whether your endpoint is scaling efficiently. +- **Cold start time**: Cold start time measures how long it takes to wake up a worker. This includes the time needed to start the container, load the model into GPU VRAM, and get the worker ready to process a job. Displays the P70, P90, and P98 cold start times for your endpoint. +- **Cold start count**: Displays the number of cold starts your endpoint has during a given period. The fewer, the better, as fewer cold starts mean faster response times. +- **WebhookRequest responses**: Displays the number of webhook requests sent and their corresponding responses, including success and failure counts. +- **Worker states**: Displays the number of workers that are [running, idle, throttled, etc.](/serverless/workers/overview) across the selected time interval. diff --git a/docs/serverless/endpoints/manage-endpoints.md b/docs/serverless/endpoints/manage-endpoints.md index d413801b..35536b82 100644 --- a/docs/serverless/endpoints/manage-endpoints.md +++ b/docs/serverless/endpoints/manage-endpoints.md @@ -1,68 +1,92 @@ --- -title: "Manage Endpoints" -description: "Learn to create, edit, and manage Serverless Endpoints, including adding network volumes and setting GPU prioritization, with step-by-step guides and tutorials." -sidebar_position: 10 +title: "Manage endpoints" +description: "Learn how to create, configure, and manage your RunPod Serverless endpoints, including GPU prioritization and network volumes for optimal performance and cost efficiency." +sidebar_position: 2 --- -Learn to manage Severless Endpoints. +# Manage Serverless endpoints -## Create an Endpoint +This guide covers the essential management operations for RunPod Serverless endpoints, helping you deploy, configure, and maintain your Serverless applications effectively. -You can create an Endpoint in the Web interface. +## Create an endpoint -1. Navigate to [Serverless Endpoints](https://www.runpod.io/console/serverless). -2. Select **+ New Endpoint** and enter the following: - 1. Endpoint Name. - 2. Select your GPUs. - 3. Configure your workers. - 4. Add a container image. - 5. Select **Deploy**. +Create a new Serverless endpoint through the RunPod web interface: -## Delete an Endpoint +1. Navigate to the [Serverless section](https://www.runpod.io/console/serverless) of the RunPod console. +2. Click **New Endpoint**. +3. Select a source for your endpoint, such as a Docker image, GitHub repo, or a preset model. Click **Next**. +4. Follow the UI steps to select a Docker image, GitHub repo, or Hugging Face model. Click **Next**. +5. Configure your endpoint, setting the **Endpoint Name**, the number of **Max Workers**, **Environment Variables**, etc. For a full list of options, see [Endpoint configurations](/serverless/endpoints/endpoint-configurations) +6. Click **Create Endpoint** to deploy. -You can delete an Endpoint in the Web interface. -Before an Endpoint can be deleted, all workers must be removed. +:::tip -1. Navigate to [Serverless Endpoints](https://www.runpod.io/console/serverless). -2. Select the Endpoint you'd like to remove. -3. Select **Edit Endpoint** and set **Max Workers** to `0`. -4. Choose **Update** and then **Delete Endpoint**. 
+You can optimize cost and availability by specifying GPU preferences in order of priority. RunPod attempts to allocate your first choice GPU. If unavailable, it automatically uses the next GPU in your priority list, ensuring your workloads run on the best available resources. -## Edit an Endpoint +You can enable or disable particular GPU types using the **Advanced > Enabled GPU Types** section. -You can edit a running Endpoint in the Web interface after you've deployed it. +::: -1. Navigate to [Serverless Endpoints](https://www.runpod.io/console/serverless). -2. Select the Endpoint you'd like to edit. -3. Select **Edit Endpoint** and make your changes. -4. Choose **Update**. +After deployment, your endpoint takes time to initialize before it is ready to process requests. You can monitor the deployment status on the endpoint details page, which shows worker status and initialization progress. Once active, your endpoint displays a unique API URL (`https://api.runpod.ai/v2/{endpoint_id}/`) that you can use to send requests. For information on how to interact with your endpoint, see [Endpoint operations](/serverless/endpoints/operations). -## Set GPU prioritization an Endpoint +## Edit an endpoint -When creating or modifying a Worker Endpoint, specify your GPU preferences in descending order of priority. -This allows you to configure the desired GPU models for your Worker Endpoints. +You can modify your endpoint's configuration at any time: -RunPod attempts to allocate your first choice if it's available. -If your preferred GPU isn't available, the system automatically defaults to the next available GPU in your priority list. +1. Navigate to the [Serverless section](https://www.runpod.io/console/serverless) in the RunPod console. +2. Click the three dots in the bottom right corner of the endpoint you want to modify. +3. Click **Edit Endpoint**. +4. Update any [configuration parameters](/serverless/endpoints/endpoint-configurations) as needed: + - Endpoint name + - Worker configuration + - Docker configuration (container image or version) + - Environment variables + - Storage +5. Click **Save Endpoint** to apply your changes. -1. Navigate to [Serverless Endpoints](https://www.runpod.io/console/serverless). -2. Select the Endpoint you'd like to update. -3. Select the priority of the GPUs you'd like to use. -4. Choose **Update**. +Changes take effect over time as each worker is updated to the new configuration. -:::note +:::tip -You can force a configuration update by setting **Max Workers** to 0, selecting **Update**, then updating your max workers back to your needed value. +To force an immediate configuration update, temporarily set **Max Workers** to 0, trigger the **Release**, then restore your desired worker count and update again. ::: -## Add a Network Volume +## Add a network volume + +Attach persistent storage to share data across workers: + +1. Navigate to the [Serverless section](https://www.runpod.io/console/serverless) in the RunPod console. +2. Click the three dots in the bottom right corner of the endpoint you want to modify. +3. Click **Edit Endpoint**. +4. Expand the **Advanced** section. +5. Select a volume from the dropdown below **Network Volume**. +7. Click **Save Endpoint** to attach the volume to your endpoint. + +Network volumes are mounted to the same path on each worker, making them ideal for sharing large models, datasets, or any data that needs to persist across worker instances. 
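+As a rough sketch, a handler can read shared files directly from the volume's mount path. The mount point and file names below are illustrative assumptions (network volumes are commonly mounted at `/runpod-volume` on Serverless workers); check your endpoint's storage settings for the actual location.
+
+```python
+import os
+
+import runpod
+
+# Assumed mount point for the attached network volume; adjust to match your endpoint.
+VOLUME_PATH = "/runpod-volume"
+MODEL_PATH = os.path.join(VOLUME_PATH, "models", "example-model.bin")  # hypothetical file
+
+
+def handler(job):
+    """Report whether the shared model file is visible to this worker."""
+    if not os.path.exists(MODEL_PATH):
+        return {"error": f"Model not found at {MODEL_PATH}"}
+    size_mb = os.path.getsize(MODEL_PATH) / (1024 * 1024)
+    return {"model": MODEL_PATH, "size_mb": round(size_mb, 2)}
+
+
+runpod.serverless.start({"handler": handler})
+```
+
+Because every worker on the endpoint mounts the same volume, a model downloaded to the volume once is available to all of them without being baked into the container image.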
+ +## Delete an endpoint + +When you no longer need an endpoint, you can remove it from your account: + +1. Navigate to the [Serverless section](https://www.runpod.io/console/serverless) in the RunPod console. +2. Click the three dots in the bottom right corner of the endpoint you want to delete. +3. Click **Delete Endpoint**. +4. Type the name of the endpoint, then click **Confirm**. + +After confirmation, the endpoint will be removed from your account, and you'll no longer be charged for its resources. + +## Best practices for endpoint management + +- **Start small and scale**: Begin with fewer workers and scale up as demand increases. +- **Monitor usage**: Regularly check your endpoint metrics to optimize worker count and GPU allocation. +- **Use GPU prioritization**: Set up fallback GPU options to balance cost and availability. +- **Leverage network volumes** for large models or datasets rather than embedding them in your container image. +- **Set appropriate timeouts** based on your workload's processing requirements. -Network volumes are a way to share data between Workers: they are mounted to the same path on each Worker. -For example, if a Worker contains a large-language model, you can use a network volume to share the model across all Workers. +## Next steps -1. Navigate to [Serverless Endpoints](https://www.runpod.io/console/serverless). -2. Select the Endpoint you'd like to edit. -3. Select **Edit Endpoint** and make your changes. -4. Under **Advanced** choose **Select Network Volume**. -5. Select the storage device and then choose **Update** to continue. +- [Learn how to send requests to your endpoints.](/serverless/endpoints/send-requests) +- [Explore advanced endpoint operations.](/serverless/endpoints/operations) +- [Optimize your endpoints for cost and performance.](/serverless/endpoints/endpoint-configurations) +- [Learn about endpoint job states and metrics.](/serverless/endpoints/job-states) diff --git a/docs/serverless/endpoints/operations.md b/docs/serverless/endpoints/operations.md new file mode 100644 index 00000000..7b8d9769 --- /dev/null +++ b/docs/serverless/endpoints/operations.md @@ -0,0 +1,593 @@ +--- +title: Endpoint operations +description: "Learn how to effectively manage RunPod Serverless jobs throughout their lifecycle, from submission to completion, using asynchronous and synchronous endpoints, status tracking, cancellation, and streaming capabilities." +sidebar_position: 4 +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Endpoint operations + +RunPod's endpoint operations allow you to control the complete lifecycle of your Serverless workloads. This guide demonstrates how to submit, monitor, manage, and retrieve results from jobs running on your Serverless endpoints. + +## Operation overview + +- **`/run`**: Submit an asynchronous job that processes in the background while you receive an immediate job ID. +- **`/runsync`**: Submit a synchronous job and wait for the complete results in a single response. +- **`/status`**: Check the current status, execution details, and results of a previously submitted job. +- **`/stream`**: Receive incremental results from a job as they become available. +- **`/cancel`**: Stop a job that is in progress or waiting in the queue. +- **`/retry`**: Requeue a failed or timed-out job using the same job ID and input parameters. +- **`/purge-queue`**: Clear all pending jobs from the queue without affecting jobs already in progress. 
+- **`/health`**: Monitor the operational status of your endpoint, including worker and job statistics. + +## Submitting jobs + +RunPod offers two primary methods for submitting jobs, each suited for different use cases. + +### Asynchronous jobs (`/run`) + +Use asynchronous jobs for longer-running tasks that don't require immediate results. This approach returns immediately with a job ID and then processes the job in the background. This approach is particularly useful for operations that require significant processing time, or when you want to manage multiple jobs concurrently. + +- **Payload limit**: 10 MB +- **Job availability**: Results are available for 30 minutes after completion + + + + +```bash +# Submit an asynchronous job +curl -X POST https://api.runpod.ai/v2/{endpoint_id}/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer ${API_KEY}' \ + -d '{"input": {"prompt": "Your prompt"}}' +``` + + + + +```python +import requests + +def submit_async_job(endpoint_id, api_key, input_data): + """ + Submit an asynchronous job to a RunPod endpoint. + + Args: + endpoint_id: Your RunPod endpoint ID + api_key: Your RunPod API key + input_data: Dictionary containing the job input + + Returns: + Dictionary containing job ID and status + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/run" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + payload = {"input": input_data} + + response = requests.post(url, headers=headers, json=payload) + return response.json() + +# Example usage +if __name__ == "__main__": + endpoint_id = "your-endpoint-id" + api_key = "your-api-key" + input_data = {"prompt": "Your prompt"} + + result = submit_async_job(endpoint_id, api_key, input_data) + print(f"Job ID: {result['id']}") + print(f"Status: {result['status']}") +``` + + + + +```json +{ + "id": "eaebd6e7-6a92-4bb8-a911-f996ac5ea99d", + "status": "IN_QUEUE" +} +``` + + + + +### Synchronous jobs (`/runsync`) + +Use synchronous jobs for shorter tasks where you need immediate results. Synchronous jobs waits for job completion before returning the complete result in a single response. This simplifies your code by eliminating the need for status polling, which works best for quick operations (under 30 seconds). + +- **Payload limit**: 20 MB +- **Job availability**: Results are available for 60 seconds after completion + + + + +```bash +# Submit a synchronous job +curl -X POST https://api.runpod.ai/v2/{endpoint_id}/runsync \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer ${API_KEY}' \ + -d '{"input": {"prompt": "Your prompt"}}' +``` + + + + +```python +import requests + +def submit_sync_job(endpoint_id, api_key, input_data): + """ + Submit a synchronous job to a RunPod endpoint. 
+ + Args: + endpoint_id: Your RunPod endpoint ID + api_key: Your RunPod API key + input_data: Dictionary containing the job input + + Returns: + Dictionary containing complete job results + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/runsync" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + payload = {"input": input_data} + + response = requests.post(url, headers=headers, json=payload) + return response.json() +``` + + + + +```json +{ + "delayTime": 824, // Time in queue (ms) + "executionTime": 3391, // Processing time (ms) + "id": "sync-79164ff4-d212-44bc-9fe3-389e199a5c15", + "output": [ + { + "image": "https://image.url", + "seed": 46578 + } + ], + "status": "COMPLETED" +} +``` + + + + +## Monitoring jobs + +### Checking job status (`/status`) + +For asynchronous jobs, you can check the status at any time using the job ID. The status endpoint provides: + +- Current job state (`IN_QUEUE`, `IN_PROGRESS`, `COMPLETED`, `FAILED`, etc.). +- Execution statistics (queue delay, processing time). +- Job output (if completed). + + + + +```bash +# Check job status +curl -X GET https://api.runpod.ai/v2/{endpoint_id}/status/{job_id} \ + -H 'Authorization: Bearer ${API_KEY}' +``` + + + + +```python +import requests + +def check_job_status(endpoint_id, job_id, api_key): + """ + Check the status of a RunPod job. + + Args: + endpoint_id: Your RunPod endpoint ID + job_id: The ID of the job to check + api_key: Your RunPod API key + + Returns: + Dictionary containing job status and results (if complete) + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/status/{job_id}" + headers = {"Authorization": f"Bearer {api_key}"} + + response = requests.get(url, headers=headers) + return response.json() +``` + + + + +```json +{ + "delayTime": 31618, // Time in queue (ms) + "executionTime": 1437, // Processing time (ms) + "id": "60902e6c-08a1-426e-9cb9-9eaec90f5e2b-u1", + "output": { + "input_tokens": 22, + "output_tokens": 16, + "text": ["Hello! How can I assist you today?\nUSER: I'm having"] + }, + "status": "COMPLETED" +} +``` + + + + +:::tip + +You can use the `/status` operation to configure the time-to-live (TTL) for an individual job by appending a TTL parameter when checking the status of a job. For example, `https://api.runpod.ai/v2/{endpoint_id}/status/{job_id}?ttl=6000` sets the TTL for the job to 6 seconds. Use this when you want to tell the system to remove a job result sooner than the default retention time. + +::: + +### Streaming results (`/stream`) + +For jobs that generate output incrementally or for very large outputs, use the stream endpoint to receive partial results as they become available. This is especially useful for: + +- Text generation tasks where you want to display output as it's created +- Long-running jobs where you want to show progress +- Large outputs that benefit from incremental processing + + + + +```bash +# Stream job results +curl -X GET https://api.runpod.ai/v2/{endpoint_id}/stream/{job_id} \ + -H 'Authorization: Bearer ${API_KEY}' +``` + + + + +```python +import requests + +def stream_job_results(endpoint_id, job_id, api_key): + """ + Stream results from a RunPod job. 
+ + Args: + endpoint_id: Your RunPod endpoint ID + job_id: The ID of the job to stream + api_key: Your RunPod API key + + Returns: + List of partial results + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/stream/{job_id}" + headers = {"Authorization": f"Bearer {api_key}"} + + response = requests.get(url, headers=headers) + return response.json() +``` + + + + +```json +[ + { + "metrics": { + "avg_gen_throughput": 0, + "avg_prompt_throughput": 0, + "cpu_kv_cache_usage": 0, + "gpu_kv_cache_usage": 0.0016722408026755853, + "input_tokens": 0, + "output_tokens": 1, + "pending": 0, + "running": 1, + "scenario": "stream", + "stream_index": 2, + "swapped": 0 + }, + "output": { + "input_tokens": 0, + "output_tokens": 1, + "text": [" How"] + } + }, + { + // Additional stream chunks... + } +] +``` + + + + +:::note + +The maximum size for a single streamed payload chunk is 1 MB. Larger outputs will be split across multiple chunks. + +::: + +### Endpoint health monitoring (`/health`) + +The health endpoint provides a quick overview of your endpoint's operational status. Use it to monitor worker availability, track job queue status, identify potential bottlenecks, and determine if scaling adjustments are needed. + + + + +```bash +# Check endpoint health +curl -X GET https://api.runpod.ai/v2/{endpoint_id}/health \ + -H 'Authorization: Bearer ${API_KEY}' +``` + + + + +```python +import requests + +def check_endpoint_health(endpoint_id, api_key): + """ + Check the health of a RunPod endpoint. + + Args: + endpoint_id: Your RunPod endpoint ID + api_key: Your RunPod API key + + Returns: + Dictionary containing endpoint health information + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/health" + headers = {"Authorization": f"Bearer {api_key}"} + + response = requests.get(url, headers=headers) + return response.json() +``` + + + + +```json +{ + "jobs": { + "completed": 1, + "failed": 5, + "inProgress": 0, + "inQueue": 2, + "retried": 0 + }, + "workers": { + "idle": 0, + "running": 0 + } +} +``` + + + + +## Managing jobs + +### Cancelling jobs (`/cancel`) + +Cancel jobs that are no longer needed or taking too long to complete. This operation stops jobs that are in progress, removes jobs from the queue if they are not yet started, and returns immediately with the job's canceled status. + + + + +```bash +# Cancel a job +curl -X POST https://api.runpod.ai/v2/{endpoint_id}/cancel/{job_id} \ + -H 'Authorization: Bearer ${API_KEY}' +``` + + + + +```python +import requests + +def cancel_job(endpoint_id, job_id, api_key): + """ + Cancel a RunPod job. + + Args: + endpoint_id: Your RunPod endpoint ID + job_id: The ID of the job to cancel + api_key: Your RunPod API key + + Returns: + Dictionary containing job status after cancellation + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/cancel/{job_id}" + headers = {"Authorization": f"Bearer {api_key}"} + + response = requests.post(url, headers=headers) + return response.json() +``` + + + + +```json +{ + "id": "724907fe-7bcc-4e42-998d-52cb93e1421f-u1", + "status": "CANCELLED" +} +``` + + + + +### Retrying failed jobs (`/retry`) + +Retry jobs that have failed or timed out without having to submit a new job request. This operation maintains the same job ID for tracking and requeues the job with the original input parameters, removing the previous output (if any). It can only be used for jobs with a `FAILED` or `TIMED_OUT` status. 
+ + + + +```bash +# Retry a failed job +curl -X POST https://api.runpod.ai/v2/{endpoint_id}/retry/{job_id} \ + -H 'Authorization: Bearer ${API_KEY}' +``` + + + + +```python +import requests + +def retry_job(endpoint_id, job_id, api_key): + """ + Retry a failed or timed-out RunPod job. + + Args: + endpoint_id: Your RunPod endpoint ID + job_id: The ID of the job to retry + api_key: Your RunPod API key + + Returns: + Dictionary containing job ID and new status + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/retry/{job_id}" + headers = {"Authorization": f"Bearer {api_key}"} + + response = requests.post(url, headers=headers) + return response.json() +``` + + + + +```json +{ + "id": "60902e6c-08a1-426e-9cb9-9eaec90f5e2b-u1", + "status": "IN_QUEUE" +} +``` + + + + +:::important + +Job results expire after a set period: + +- Asynchronous jobs (`/run`): Results available for 30 minutes +- Synchronous jobs (`/runsync`): Results available for 1 minute + +Once expired, jobs cannot be retried. + +::: + +### Purging the queue (`/purge-queue`) + +Clear all pending jobs from the queue when you need to reset or cancel multiple jobs at once. This is useful for error recovery, clearing outdated requests, resetting after configuration changes, and managing resource allocation. + + + + +```bash +# Purge the job queue +curl -X POST https://api.runpod.ai/v2/{endpoint_id}/purge-queue \ + -H 'Authorization: Bearer ${API_KEY}' +``` + + + + +```python +import requests + +def purge_queue(endpoint_id, api_key): + """ + Purge all queued jobs for a RunPod endpoint. + + Args: + endpoint_id: Your RunPod endpoint ID + api_key: Your RunPod API key + + Returns: + Dictionary with number of removed jobs and status + """ + url = f"https://api.runpod.ai/v2/{endpoint_id}/purge-queue" + headers = {"Authorization": f"Bearer {api_key}"} + + response = requests.post(url, headers=headers) + return response.json() +``` + + + + +```json +{ + "removed": 2, + "status": "completed" +} +``` + + + + +:::caution + +The purge-queue operation only affects jobs waiting in the queue. Jobs already in progress will continue to run. + +::: + +## Rate limits and quotas + +RunPod enforces rate limits to ensure fair platform usage. These limits apply per endpoint and operation: + +| Operation | Method | Rate Limit | Concurrent Limit | +|-----------|--------|------------|------------------| +| `/run` | POST | 1000 requests per 10 seconds | 200 concurrent | +| `/runsync` | POST | 2000 requests per 10 seconds | 400 concurrent | +| `/status`, `/status-sync`, `/stream` | GET/POST | 2000 requests per 10 seconds | 400 concurrent | +| `/cancel` | POST | 100 requests per 10 seconds | 20 concurrent | +| `/purge-queue` | POST | 2 requests per 10 seconds | N/A | +| `/openai/*` | POST | 2000 requests per 10 seconds | 400 concurrent | +| `/requests` | GET | 10 requests per 10 seconds | 2 concurrent | + +Requests will receive a `429 (Too Many Requests)` status if: +- Queue size exceeds 50 jobs AND +- Queue size exceeds `endpoint.WorkersMax * 500` + +Exceeding these limits will result in HTTP 429 (Too Many Requests) responses. Implement appropriate retry logic with exponential backoff in your applications to handle rate limiting gracefully. + +## Best practices + +- **Use asynchronous endpoints** for jobs that take more than a few seconds to complete. +- **Implement polling with backoff** when checking status of asynchronous jobs. +- **Set appropriate timeouts** in your client applications. +- **Monitor endpoint health** regularly to detect issues early. 
+- **Implement error handling** for all API calls. +- **Use webhooks** for notification-based workflows instead of polling. See [Send requests](/serverless/endpoints/send-requests#webhook-notifications) for implementation details. +- **Cancel unneeded jobs** to free up resources and reduce costs. + +## Troubleshooting + +| Issue | Possible Causes | Solutions | +|-------|-----------------|-----------| +| Job stuck in queue | No available workers, max workers limit reached | Increase max workers, check endpoint health | +| Timeout errors | Job takes longer than execution timeout | Increase timeout in job policy, optimize job processing | +| Failed jobs | Worker errors, input validation issues | Check logs, verify input format, retry with fixed input | +| Rate limiting | Too many requests in short time | Implement backoff strategy, batch requests when possible | +| Missing results | Results expired | Retrieve results within expiration window (30 min for async, 1 min for sync) | + +## Related resources + +- [Send requests to endpoints.](/serverless/endpoints/send-requests) +- [Python SDK for endpoints.](/sdks/python/endpoints) +- [Endpoint configurations.](/serverless/endpoints/endpoint-configurations) \ No newline at end of file diff --git a/docs/serverless/endpoints/overview.md b/docs/serverless/endpoints/overview.md index 819cbdc4..8ab494cf 100644 --- a/docs/serverless/endpoints/overview.md +++ b/docs/serverless/endpoints/overview.md @@ -1,46 +1,66 @@ --- title: Overview sidebar_position: 1 -description: "Deploy and manage serverless workers with RunPod Endpoints, featuring asynchronous and synchronous operations, scalability, and flexibility for modern computing tasks." +description: "Deploy and manage Serverless workers with RunPod endpoints, featuring asynchronous and synchronous operations, scalability, and flexibility for modern computing tasks." --- -RunPod Endpoints serve as the gateway to deploying and managing your Serverless Workers. -These endpoints allow for flexible interaction with a variety of models, supporting both asynchronous and synchronous operations tailored to your computational needs. -Whether you're processing large data sets, requiring immediate results, or scheduling tasks to run in the background, RunPod's API Endpoints provide the versatility and scalability essential for modern computing tasks. +# Endpoints overview -### Key features +Endpoints are the foundation of RunPod Serverless, serving as the gateway for deploying and managing your Serverless workers. They provide a consistent API interface that allows your applications to interact with powerful computational resources on demand. -- **Asynchronous and synchronous jobs:** Choose the execution mode that best fits your workflow, whether it's a task that runs in the background or one that delivers immediate results. -- **Serverless Workers:** Deploy your computational tasks without worrying about server management, enjoying the benefits of a fully managed infrastructure. -- **Scalability and flexibility:** Easily scale your operations up or down based on demand, with the flexibility to handle various computational loads. +Whether you're processing large datasets, running AI inference, or performing compute-intensive tasks, endpoints give you the flexibility to deploy and scale your workloads. -### Key Concepts +## What are endpoints? -Check out these two links for fundamental endpoint concepts, including key definitions and basic settings. 
+RunPod endpoints are RESTful APIs that accept HTTP requests, execute your code, and return the result via HTTP response. Each endpoint provides a unique URL and abstracts away the complexity of managing infrastructure. Behind the scenes, RunPod handles the entire lifecycle of Serverless workers, including job queuing, execution, and result delivery, so you can focus on your code, not the infrastructure. -- [Glossary](../../glossary.md) -- [Settings](../references/endpoint-configurations.md) +## Key features -### Getting started +### Execution modes -Before you begin, ensure you have obtained your [RunPod API key](/get-started/api-keys). -This key is essential for authentication, billing, and accessing the API. +Serverless offers **asynchronous processing** via the `/run` endpoint operation, which lets you submit jobs that run in the background and check results later, making this ideal for long-running tasks. -You can find your API key in the [user settings section](https://www.runpod.io/console/user/settings) of your RunPod account. +It also provides **synchronous operations** through the `/runsync` endpoint operation, allowing you to receive immediate results in the same request, which is perfect for interactive applications. -:::note +To learn more, see [Endpoint operations](/serverless/endpoints/operations). -**Privacy and security:** RunPod prioritizes your data's privacy and security. -Inputs and outputs are retained for a maximum of 30 minutes for asynchronous requests and 1 minute for synchronous requests to protect your information. +### Deployment and scaling -::: +RunPod endpoints are **auto-scaling**, automatically scaling from zero to hundreds of workers based on demand. You can **customize your endpoint configuration** to adjust the minimum and maximum worker count, GPU allocation, and memory settings. The system also offers **GPU prioritization**, allowing you to specify preferred GPU types in order of priority. -### Exploring RunPod Endpoints +To learn more, see [Endpoint configurations](/serverless/endpoints/endpoint-configurations). -Dive deeper into what you can achieve with RunPod Endpoints through the following resources: +### Integration options -- [Use the vLLM Worker](/serverless/workers/vllm/overview): Learn how to deploy a vLLM Worker as a Serverless Endpoint, with detailed guides on configuration and sending requests. -- [Invoke Jobs](/serverless/endpoints/job-operations): Learn how to submit jobs to your serverless workers, with detailed guides on both asynchronous and synchronous operations. -- [Send Requests](/serverless/endpoints/send-requests): Discover how to communicate with your endpoints, including tips on structuring requests for optimal performance. -- [Manage Endpoints](/serverless/endpoints/manage-endpoints): Find out how to manage your endpoints effectively, from deployment to scaling and monitoring. -- [Endpoint Operations](/serverless/references/operations): Access a comprehensive list of operations supported by RunPod Endpoints, including detailed documentation and examples. +RunPod endpoints support [webhook notifications](/serverless/endpoints/send-requests#webhook-notifications), allowing you to configure endpoints to call your webhook when jobs complete. + +It also includes [S3-compatible storage integration](/serverless/endpoints/send-requests#s3-compatible-storage-integration) for working with object storage for larger inputs and outputs. 
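
Whichever execution mode you choose, every operation is exposed under the same per-endpoint URL pattern, for example:

```
POST https://api.runpod.ai/v2/{endpoint_id}/run        (asynchronous)
POST https://api.runpod.ai/v2/{endpoint_id}/runsync    (synchronous)
```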
+ +## Key concepts + +Understanding these fundamental concepts will help you work effectively with Serverless endpoints: + +An **endpoint** is a RESTful API, which provides a URL that serves as the entry point for your Serverless worker, allowing you to send requests and receive responses. + +A **request** is an HTTP request that you send to an endpoint, which can include parameters, payloads, and headers that define what the endpoint should process. For example, a `POST` request to run a job, or a `GET` request to check status of a job or endpoint health. + +When a request is sent to an endpoint, it creates a **job** that gets processed by a worker. **Jobs** can be either synchronous (immediate response) or asynchronous (background processing). + +A **worker** is the containerized environment that executes your handler code, providing the compute resources (CPU, GPU, memory) needed to process requests. + +The **handler** is the code that processes incoming requests and returns responses, defining the business logic of your endpoint. + +A diagram demonstrating the Serverless endpoint request flow + +## Getting started + +[Follow this step-by-step guide](/serverless/get-started) to create your first custom endpoint. This tutorial walks you through the process of setting up your development environment, creating a handler file, testing your endpoint locally, building and deploying a worker image, and sending endpoint requests using the RunPod console. + +## Next steps + +Dive deeper into what you can achieve with RunPod Serverless endpoints: + +- [Learn how to deploy a vLLM worker as a Serverless endpoint.](/serverless/vllm/overview) +- [Learn how to submit jobs to your Serverless workers.](/serverless/endpoints/operations) +- [Send requests to your endpoints programmatically.](/serverless/endpoints/send-requests) +- [Learn how to manage your endpoints using the RunPod console.](/serverless/endpoints/manage-endpoints) \ No newline at end of file diff --git a/docs/serverless/endpoints/send-requests.md b/docs/serverless/endpoints/send-requests.md index 2c770f07..eb90d35b 100644 --- a/docs/serverless/endpoints/send-requests.md +++ b/docs/serverless/endpoints/send-requests.md @@ -1,103 +1,287 @@ --- -title: "Send a request" -description: "Learn how to construct a JSON request body to send to your custom endpoint, including optional inputs for webhooks, execution policies, and S3-compatible storage, to optimize job execution and resource management." -sidebar_position: 4 +title: "Send requests" +description: "Learn how to send requests to your Serverless endpoint, including constructing JSON request bodies, testing with the UI, using cURL, and configuring optional inputs for webhooks, execution policies, and S3-compatible storage." +sidebar_position: 3 --- -Before sending a job request, ensure you have deployed your custom endpoint. +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -Let's start by constructing our request body to send to the endpoint. +# Send requests to an endpoint -## JSON Request Body +After deploying a Serverless endpoint, you need to know how to interact with it effectively. This guide covers everything from testing your endpoint in the console to sending requests programmatically and configuring advanced options. -You can make requests to your endpoint with JSON. Your request must include a JSON object containing an `input` key. 
For example, if your handler requires an input prompt, you might send in something like this: +## Understanding request structure + +Before sending requests, it's important to understand the basic structure. All requests to RunPod endpoints must: + +1. Include an `input` object that contains the parameters for your worker's [handler function](/serverless/handlers/overview). +2. Be formatted as valid JSON. +3. Include your API key for authentication (unless sent from the RunPod console). + +### Basic request structure + +Every request must include a JSON object containing an `input` key. This is the minimum valid request: ```json { "input": { - "prompt": "The lazy brown fox jumps over the" + "prompt": "Your input here" } } ``` -## Optional Inputs +The exact parameters inside the `input` object depend on your specific worker implementation. Check your worker's documentation for the required and optional parameters. + +## Testing methods -Along with an input key, you can include other top-level inputs to access different functions. If a key is passed in at the top level and not included in the body of your request, it will be discarded and unavailable to your handler. +### Test in the RunPod console -The following optional inputs are available to all endpoints regardless of the worker. +The quickest way to test your endpoint is directly in the RunPod console: -- Webhooks -- Execution policies -- S3-compatible storage +1. Navigate to the [Serverless section](https://www.runpod.io/console/serverless). +2. Select your endpoint. +3. Click the **Requests** tab. -### Webhooks +Screenshot of the endpoint details page. -To see notifications for completed jobs, pass a URL in the top level of the request: +You'll see a default test request in the editor: ```json { - "input": {}, - "webhook": "https://URL.TO.YOUR.WEBHOOK" + "input": { + "prompt": "Hello World" + } } ``` -Your webhook endpoint should respond with a `200` status to acknowledge the successful call. If the call is not successful, the request waits 10 seconds and sends the call again up to two more times. +You can modify this request as needed, then click **Run** to test your endpoint. On first execution, your workers will need to initialize, which may take a moment. -A `POST` request goes to your URL when the job is complete. This request contains the same information as fetching the results from the `/status/{job_id}` endpoint. +The initial response will look similar to this: -### Execution Policies +```json +{ + "id": "6de99fd1-4474-4565-9243-694ffeb65218-u1", + "status": "IN_QUEUE" +} +``` -By default, if a job remains `IN_PROGRESS` for longer than 10 minutes without completion, it's automatically terminated. +After processing completes, you'll see the full response. If there are any errors, the console will display error logs to help you troubleshoot. -This default behavior keeps a hanging request from draining your account credits. +### Send requests programmatically -To customize the management of job lifecycles and resource consumption, the following policies can be configured: +Once your endpoint is working correctly, you'll likely want to integrate it with your applications. Here are examples using common methods: -- **Execution Timeout**: Specifies the maximum duration that a job can run before it's automatically terminated. This limit helps prevent jobs from running indefinitely and consuming resources. You can overwrite the value for a request by specifying `executionTimeout` in the job input. 
+ + -:::note +For command-line testing or scripting: -Changing the **Execution Timeout** value through the Web UI sets the value for all requests to an Endpoint. -You can still overwrite the value for individual requests with `executionTimeout` in the job input. +```bash +curl --request POST \ + --url https://api.runpod.ai/v2/[ENDPOINT_ID]/runsync \ + --header "accept: application/json" \ + --header "authorization: [YOUR_API_KEY]" \ + --header "content-type: application/json" \ + --data ' +{ + "input": { + "prompt": "A coffee cup.", + "height": 512, + "width": 512, + "num_outputs": 1, + "num_inference_steps": 50, + "guidance_scale": 7.5, + "scheduler": "KLMS" + } +} +' +``` -::: +Replace `[ENDPOINT_ID]` with your endpoint ID and `[YOUR_API_KEY]` with your RunPod API key. + + + + + +To send a request using the RunPod Python SDK: + +```python +import runpod +import os + +runpod.api_key = os.getenv("[YOUR_API_KEY]") + +endpoint = runpod.Endpoint("[ENDPOINT_ID]") + +try: + run_request = endpoint.run_sync( + { + "prompt": "Hello, world!", + }, + timeout=60, # Timeout in seconds. + ) + + print(run_request) +except TimeoutError: + print("Job timed out.") +``` + +For more details, see the [RunPod Python SDK reference](/sdks/python/endpoints). + + + + + +To send a request using the RunPod JavaScript SDK: + +```javascript +const { [YOUR_API_KEY], [ENDPOINT_ID] } = process.env; +import runpodSdk from "runpod-sdk"; + +const runpod = runpodSdk([YOUR_API_KEY]); +const endpoint = runpod.endpoint([ENDPOINT_ID]); +const result = await endpoint.runSync({ + "input": { + "prompt": "Hello, World!", + }, +}); + +console.log(result); +``` + +For more details, see the [RunPod JavaScript SDK reference](/sdks/javascript/endpoints). + + + + + + +## Advanced configuration options + +In addition to the required `input` object, you can include optional top-level parameters to enable additional functionality. + +### Webhook notifications -- **Low Priority**: When true, the job does not trigger scaling up resources to execute. Instead, it executes when there are no pending higher priority jobs in the queue. Use this option for tasks that are not time-sensitive. -- **TTL (Time-to-Live)**: Defines the maximum time a job can remain in the queue before it's automatically terminated. This parameter ensures that jobs don't stay in the queue indefinitely. +To receive notifications when your job completes, specify a webhook URL: ```json { - "input": {}, + "input": { + "prompt": "Your input here" + }, + "webhook": "https://your-webhook-url.com" +} +``` + +When your job completes, RunPod will send a `POST` request to your webhook URL containing the same information as the `/status/{job_id}` endpoint. Your webhook should return a `200` status code to acknowledge receipt. If the call fails, RunPod will retry up to 2 more times with a 10-second delay between attempts. + +### Execution policies + +By default, if a job remains `IN_PROGRESS` for longer than 10 minutes without completion, it's automatically terminated. This default behavior keeps a hanging request from draining your account credits. You can manually control job execution behavior with policies: + +```json +{ + "input": { + "prompt": "Your input here" + }, "policy": { - "executionTimeout": int, // Time in milliseconds. Must be greater than 5 seconds. - "lowPriority": bool, // Sets the job's priority to low. Default behavior escalates to high under certain conditions. - "ttl": int // Time in milliseconds. Must be greater than or equal to 10 seconds. Default is 24 hours. 
Maximum is one week. + "executionTimeout": 900000, // 15 minutes in milliseconds + "lowPriority": false, + "ttl": 3600000 // 1 hour in milliseconds } } ``` -By configuring the execution timeout, priority, and TTL policies, you have more control over job execution and efficient system resource management. +Policy options include: -### S3-Compatible Storage +| Option | Description | Default | Constraints | +|--------|-------------|---------|------------| +| `executionTimeout` | Maximum job runtime in milliseconds | 600000 (10 minutes) | Must be > 5000 ms | +| `lowPriority` | When true, job won't trigger worker scaling | false | - | +| `ttl` | Maximum queue time in milliseconds | 86400000 (24 hours) | Must be ≥ 10000 ms, max 1 week | -Pass in the credentials for S3-compatible object storage as follows: +:::note + +Setting the `executionTimeout` in a request overrides the default endpoint setting for that specific job only. + +::: + +### S3-compatible storage integration + +For endpoints that need to work with large files, configure S3-compatible storage: ```json { - "input": {}, + "input": { + "prompt": "Your input here" + }, "s3Config": { - "accessId": "key_id_or_username", - "accessSecret": "key_secret_or_password", - "bucketName": "storage_location_name", - "endpointUrl": "storage_location_address" + "accessId": "your-access-id", + "accessSecret": "your-access-secret", + "bucketName": "your-bucket-name", + "endpointUrl": "your-s3-endpoint-url" } } ``` -The configuration only passes to the worker. It is not returned as part of the job request output. +This configuration is passed directly to your worker but is not included in the response. Your worker must contain logic to use this information for storage operations. -:::note +:::tip +The S3 integration works with any S3-compatible storage provider, not just AWS S3. You can use MinIO, Backblaze B2, DigitalOcean Spaces, and other compatible providers. +::: -The serverless worker must contain logic that allows it to use this input. If you build a custom endpoint and request s3Config in the job input, your worker is ultimately responsible for using the information passed in to upload the output. +## Asynchronous vs. synchronous requests -::: +RunPod endpoints support two types of requests: synchronous and asynchronous. To learn about them in more detail, see [Endpoint operations](/serverless/endpoints/operations). + +### Synchronous requests (`/runsync`) + +Synchronous requests wait for the job to complete and return the result in a single response: + +``` +POST https://api.runpod.ai/v2/{endpoint_id}/runsync +``` + +Best for: +- Short-running tasks (under 30 seconds) +- Interactive applications where immediate results are needed +- Simpler client code (no need to poll for status) + +### Asynchronous requests (`/run`) + +Asynchronous requests return immediately with a job ID, allowing your application to continue while the job processes in the background: + +``` +POST https://api.runpod.ai/v2/{endpoint_id}/run +``` + +Best for: +- Long-running tasks +- Batch processing +- Workflows with webhooks + +When using asynchronous requests, you'll need to check the job status using the `/status/{job_id}` endpoint or configure a webhook to receive the result. 
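
As a rough sketch of that polling pattern, the example below submits a job to `/run` and then checks `/status` with a capped exponential backoff. It assumes the `requests` library and placeholder values for the endpoint ID and API key; the backoff timings are illustrative, not prescribed.

```python
import time
import requests

ENDPOINT_ID = "your-endpoint-id"   # placeholder: replace with your endpoint ID
API_KEY = "your-api-key"           # placeholder: replace with your RunPod API key

BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

def run_and_wait(input_data, max_wait=300):
    """Submit a job with /run, then poll /status until it reaches a terminal state."""
    job = requests.post(f"{BASE_URL}/run", headers=HEADERS, json={"input": input_data}).json()
    job_id = job["id"]

    delay = 1
    waited = 0
    while waited < max_wait:
        status = requests.get(f"{BASE_URL}/status/{job_id}", headers=HEADERS).json()
        if status["status"] in ("COMPLETED", "FAILED", "CANCELLED", "TIMED_OUT"):
            return status
        time.sleep(delay)
        waited += delay
        delay = min(delay * 2, 30)  # exponential backoff, capped at 30 seconds

    raise TimeoutError(f"Job {job_id} did not finish within {max_wait} seconds")

# Example usage:
# result = run_and_wait({"prompt": "Hello World"})
# print(result)
```

The same loop works with a webhook instead of polling: omit the status checks and let RunPod `POST` the result to your webhook URL when the job completes.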
+ +## Error handling + +When sending requests, be prepared to handle these common errors: + +| HTTP Status | Meaning | Solution | +|-------------|---------|----------| +| 400 | Bad Request | Check your request format and parameters | +| 401 | Unauthorized | Verify your API key is correct and has permission | +| 404 | Not Found | Check your endpoint ID | +| 429 | Too Many Requests | Implement backoff and retry logic | +| 500 | Internal Server Error | Check endpoint logs; worker may have crashed | + +Implementing proper error handling and retry logic will make your integrations more robust. + +## Next steps + +Now that you've learned how to send requests to your endpoint, you can: + +- [Manage job operations.](/serverless/endpoints/operations) +- [Create more advanced handler functions.](/serverless/handlers/overview) +- [Learn about local testing.](/serverless/development/local-testing) +- [Deploy your endpoints with GitHub.](/serverless/github-integration) \ No newline at end of file diff --git a/docs/serverless/get-started.md b/docs/serverless/get-started.md index 918efdcd..6ac7b86c 100644 --- a/docs/serverless/get-started.md +++ b/docs/serverless/get-started.md @@ -1,5 +1,5 @@ --- -title: "Get started" +title: "Create a custom endpoint" sidebar_position: 2 description: Create and deploy your first custom Serverless endpoint. Learn to create a handler, test it locally, build a Docker image, deploy an endpoint, and send requests with this step-by-step tutorial. --- @@ -7,7 +7,7 @@ description: Create and deploy your first custom Serverless endpoint. Learn to c import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Get started with Serverless +# Create a custom endpoint Learn how to create, deploy, and test your first custom Serverless endpoint. @@ -146,7 +146,6 @@ DEBUG | local_test | run_job return: {'output': 'Hey there!'} INFO | Job local_test completed successfully. INFO | Job result: {'output': 'Hey there!'} INFO | Local testing complete, exiting. -(venv) moking@Mos-MacBook-Pro-2 serverless-test % ``` ## Step 5: Create a Dockerfile @@ -185,7 +184,7 @@ CMD ["python3", "-u", "rp_handler.py"] 1. Then push the image to your container registry: ```bash - docker push yourusername/yourrepo:yourtag + docker push yourusername/serverless-test:latest ``` ## Step 7: Deploy your endpoint using the web interface @@ -193,8 +192,8 @@ CMD ["python3", "-u", "rp_handler.py"] 1. Go to the [Serverless section](https://www.runpod.io/console/serverless) of the RunPod web interface. 2. Click **New Endpoint**. 3. Under **Custom Source**, select **Docker Image**, then click **Next**. -4. In the **Container Image** field, enter your Docker image URL: `docker.io/[YOUR_USERNAME]/serverless-test:latest` -5. Enter a name for your endpoint (or leave the randomly generated name in place). +4. In the **Container Image** field, enter your Docker image URL: `docker.io/yourusername/serverless-test:latest` +5. (Optional) Enter a custom name for your endpoint, or use the randomly generated name. 6. Under **Worker Configuration**, check the box for **16 GB** GPUs. 7. Leave the rest of the settings at their defaults. 8. Click **Create Endpoint**. 
@@ -238,8 +237,8 @@ Congratulations, you've successfully deployed and tested your first Serverless e Now that you've learned the basics, you're ready to: -- [Deploy large language models from Hugging Face as Serverless endpoints.](/serverless/workers/vllm/get-started) -- [Manage your Serverless endpoints using the web interface.](/serverless/endpoints/manage-endpoints) -- [Create more advanced handler functions.](/serverless/workers/handlers/overview) -- [Deploy your endpoints with GitHub.](/serverless/github-integration) -- [Learn more about local testing.](/serverless/workers/development/local-testing) \ No newline at end of file +- [Send endpoint requests using cURL and the Serverless SDK.](/serverless/endpoints/send-requests) +- [Learn how to use endpoint operations like `/run` and `/status`.](/serverless/endpoints/operations) +- [Manage your Serverless endpoints using the RunPod console.](/serverless/endpoints/manage-endpoints) +- [Create more advanced handler functions.](/serverless/handlers/overview) +- [Learn more about local testing.](/serverless/development/local-testing) \ No newline at end of file diff --git a/docs/serverless/github-integration.md b/docs/serverless/github-integration.md index d3dec21d..dc3f1dfb 100644 --- a/docs/serverless/github-integration.md +++ b/docs/serverless/github-integration.md @@ -15,7 +15,7 @@ This integration enables you to focus on development while RunPod handles the in :::note -You must use [RunPod](/serverless/workers/development/overview) Python library to develop your Serverless worker. +You must use [RunPod](/serverless/development/overview) Python library to develop your Serverless worker. ::: diff --git a/docs/serverless/workers/handlers/_category_.json b/docs/serverless/handlers/_category_.json similarity index 67% rename from docs/serverless/workers/handlers/_category_.json rename to docs/serverless/handlers/_category_.json index 01c88dac..82e0c207 100644 --- a/docs/serverless/workers/handlers/_category_.json +++ b/docs/serverless/handlers/_category_.json @@ -1,6 +1,6 @@ { - "label": "Handler Functions", - "position": 3, + "label": "Handler functions", + "position": 7, "link": { "type": "generated-index", "description": "Learn more about Serverless." diff --git a/docs/serverless/workers/handlers/handler-additional-controls.md b/docs/serverless/handlers/handler-additional-controls.md similarity index 100% rename from docs/serverless/workers/handlers/handler-additional-controls.md rename to docs/serverless/handlers/handler-additional-controls.md diff --git a/docs/serverless/workers/handlers/handler-async.md b/docs/serverless/handlers/handler-async.md similarity index 96% rename from docs/serverless/workers/handlers/handler-async.md rename to docs/serverless/handlers/handler-async.md index 5dfc53c5..2560d2f9 100644 --- a/docs/serverless/workers/handlers/handler-async.md +++ b/docs/serverless/handlers/handler-async.md @@ -1,5 +1,5 @@ --- -title: "Asynchronous Handler" +title: "Asynchronous handler" id: "handler-async" sidebar_position: 3 description: "RunPod supports asynchronous handlers in Python, enabling efficient handling of tasks with non-blocking operations, such as processing large datasets, API interactions, or I/O-bound operations, boosting efficiency, scalability, and flexibility." @@ -7,7 +7,7 @@ description: "RunPod supports asynchronous handlers in Python, enabling efficien RunPod supports the use of asynchronous handlers, enabling efficient handling of tasks that benefit from non-blocking operations. 
This feature is particularly useful for tasks like processing large datasets, interacting with APIs, or handling I/O-bound operations. -## Writing asynchronous Handlers +## Writing asynchronous handlers Asynchronous handlers in RunPod are written using Python's `async` and `await` syntax. Below is a sample implementation of an asynchronous generator handler. This example demonstrates how you can yield multiple outputs over time, simulating tasks such as processing data streams or generating responses incrementally. @@ -35,7 +35,7 @@ runpod.serverless.start( ) ``` -### Benefits of asynchronous Handlers +### Benefits of asynchronous handlers - **Efficiency**: Asynchronous handlers can perform non-blocking operations, allowing for more tasks to be handled concurrently. - **Scalability**: They are ideal for scaling applications, particularly when dealing with high-frequency requests or large-scale data processing. diff --git a/docs/serverless/workers/handlers/handler-concurrency.md b/docs/serverless/handlers/handler-concurrency.md similarity index 90% rename from docs/serverless/workers/handlers/handler-concurrency.md rename to docs/serverless/handlers/handler-concurrency.md index c1fe61f7..a5db8af7 100644 --- a/docs/serverless/workers/handlers/handler-concurrency.md +++ b/docs/serverless/handlers/handler-concurrency.md @@ -1,5 +1,5 @@ --- -title: Concurrent Handlers +title: Concurrent handlers description: "RunPod's concurrency functionality enables efficient task handling through asynchronous requests, allowing a single worker to manage multiple tasks concurrently. The concurrency_modifier configures the worker's concurrency level to optimize resource consumption and performance." --- @@ -11,7 +11,7 @@ Serverless architectures allow each worker to process multiple requests simultan The `concurrency_modifier` is a configuration option within `runpod.serverless.start` that dynamically adjusts a worker's concurrency level. This adjustment enables the optimization of resource consumption and performance by regulating the number of tasks a worker can handle concurrently. -### Step 1: Define an asynchronous Handler function +### Step 1: Define an asynchronous handler function Create an asynchronous function dedicated to processing incoming requests. This function should efficiently yield results, ideally in batches, to enhance throughput. @@ -56,9 +56,9 @@ def adjust_concurrency(current_concurrency): return current_concurrency ``` -### Step 3: Initialize the serverless function +### Step 3: Initialize the Serverless function -Start the serverless function with the defined handler and `concurrency_modifier` to enable dynamic concurrency adjustment. +Start the Serverless function with the defined handler and `concurrency_modifier` to enable dynamic concurrency adjustment. ```python runpod.serverless.start( @@ -73,7 +73,7 @@ runpod.serverless.start( ## Example code -Here is an example demonstrating the setup for a RunPod serverless function capable of handling multiple concurrent requests. +Here is an example demonstrating the setup for a RunPod Serverless function capable of handling multiple concurrent requests. 
```python import runpod @@ -121,10 +121,10 @@ def update_request_rate(): request_rate = random.randint(20, 100) -# Start the serverless function with the handler and concurrency modifier +# Start the Serverless function with the handler and concurrency modifier runpod.serverless.start( {"handler": process_request, "concurrency_modifier": adjust_concurrency} ) ``` -Using the `concurrency_modifier` in RunPod, serverless functions can efficiently handle multiple requests concurrently, optimizing resource usage and improving performance. This approach allows for scalable and responsive serverless applications. +Using the `concurrency_modifier` in RunPod, Serverless functions can efficiently handle multiple requests concurrently, optimizing resource usage and improving performance. This approach allows for scalable and responsive Serverless applications. diff --git a/docs/serverless/workers/handlers/handler-error-handling.md b/docs/serverless/handlers/handler-error-handling.md similarity index 92% rename from docs/serverless/workers/handlers/handler-error-handling.md rename to docs/serverless/handlers/handler-error-handling.md index f8665e6f..5b79059e 100644 --- a/docs/serverless/workers/handlers/handler-error-handling.md +++ b/docs/serverless/handlers/handler-error-handling.md @@ -1,5 +1,5 @@ --- -title: Handling Errors +title: Handling errors description: "Learn how to handle exceptions and implement custom error responses in your RunPod SDK handler function, including how to validate input and return customized error messages." sidebar_position: 4 --- @@ -37,4 +37,4 @@ Be cautious with `try/except` blocks in your handler function. Avoid suppressing ::: -One design pattern to consider, is to [Refresh your Worker](/serverless/workers/handlers/handler-additional-controls#refresh-worker) when an error occurs. +One design pattern to consider, is to [Refresh your Worker](/serverless/handlers/handler-additional-controls#refresh-worker) when an error occurs. diff --git a/docs/serverless/workers/handlers/handler-generator.md b/docs/serverless/handlers/handler-generator.md similarity index 95% rename from docs/serverless/workers/handlers/handler-generator.md rename to docs/serverless/handlers/handler-generator.md index 67713f25..680b8421 100644 --- a/docs/serverless/workers/handlers/handler-generator.md +++ b/docs/serverless/handlers/handler-generator.md @@ -1,5 +1,5 @@ --- -title: "Generator Handler" +title: "Generator handler" description: "RunPod offers real-time streaming for Language Model tasks, providing users with instant updates on job outputs. Two types of generator functions are supported, including regular and async generators, with the option to enable aggregate streaming for seamless access to results." sidebar_position: 2 --- @@ -24,6 +24,6 @@ runpod.serverless.start( ) ``` -### Return aggregate Stream +### Return aggregate stream By default, when a generator handler is running, the fractional outputs will only be available at the `/stream` endpoint, if you would also like the outputs to be available from the `/run` and `/runsync` endpoints you will need to set `return_aggregate_stream` to True when starting your handler. 
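
A minimal sketch of a generator handler started with aggregate streaming enabled might look like this (the handler body is illustrative):

```python
import runpod

def generator_handler(job):
    # Yield partial outputs; each chunk becomes available on the /stream endpoint.
    for count in range(3):
        yield f"This is the {count} generated output."

runpod.serverless.start(
    {
        "handler": generator_handler,
        # Also expose the aggregated output on /run and /runsync.
        "return_aggregate_stream": True,
    }
)
```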
diff --git a/docs/serverless/workers/handlers/overview.md b/docs/serverless/handlers/overview.md similarity index 99% rename from docs/serverless/workers/handlers/overview.md rename to docs/serverless/handlers/overview.md index 23aeef20..07fa7a7a 100644 --- a/docs/serverless/workers/handlers/overview.md +++ b/docs/serverless/handlers/overview.md @@ -159,5 +159,5 @@ If any errors are returned by the worker while running a `test_input` job, the w Otherwise, the worker will exit with a zero exit code. This can be used to check if the worker ran successfully, for example, in a CI/CD pipeline. -- For information on testing your handler locally, see [Local testing](/serverless/workers/development/local-testing). +- For information on testing your handler locally, see [Local testing](/serverless/development/local-testing). - For information on setting a continuous integration pipeline, see [Continuous integration](/serverless/workers/deploy). diff --git a/docs/serverless/overview.md b/docs/serverless/overview.md index 68ba3244..a777d238 100644 --- a/docs/serverless/overview.md +++ b/docs/serverless/overview.md @@ -20,39 +20,40 @@ RunPod Serverless is a cloud computing platform that lets you run AI models and RunPod Serverless offers three ways to deploy your workloads, each designed for different use cases: -### 1. Quick Deploys +### Quick Deploys **Best for**: Getting popular AI models running quickly with minimal setup. Quick Deploys are pre-configured templates for popular AI models that you can deploy with just a few clicks: -* No coding required -* Pre-optimized configurations -* Wide selection of popular AI models -* Minimal technical knowledge needed +* No coding required. +* Pre-optimized configurations. +* Wide selection of popular AI models. +* Minimal technical knowledge needed. [Get started with Quick Deploys →](/serverless/quick-deploys) -### 2. vLLM endpoints +### vLLM endpoints **Best for**: Deploying and serving large language models (LLMs). vLLM endpoints are specifically optimized for running LLMs: -* Support for any [Hugging Face model](https://huggingface.co/models) -* Optimized for LLM inference -* Simple configuration via environment variables -* High-performance serving with vLLM +* Support for any [Hugging Face model](https://huggingface.co/models). +* Optimized for LLM inference. +* Simple configuration via environment variables. +* High-performance serving with vLLM. -[Get started with vLLM endpoints →](/serverless/workers/vllm/get-started) +[Get started with vLLM endpoints →](/serverless/vllm/get-started) -### 3. Custom endpoints +### Custom endpoints **Best for**: Running custom code or specialized AI workloads. Custom endpoints give you complete control over your application: -* Write your own Python code -* Package in Docker containers -* Full flexibility for any use case -* Custom processing logic + +* Write your own Python code. +* Package in Docker containers. +* Full flexibility for any use case. +* Custom processing logic. [Get started with custom endpoints →](/serverless/get-started) @@ -68,7 +69,7 @@ An [endpoint](/serverless/endpoints/overview) is the access point for your Serve ### Handler functions -[Handler functions](/serverless/workers/handlers/overview) are the core of your Serverless application. These are the functions that process incoming requests and return results. They follow a simple pattern: +[Handler functions](/serverless/handlers/overview) are the core of your Serverless application. 
These are the functions that process incoming requests and return results. They follow a simple pattern: ```python # rp_handler.py import runpod # Required @@ -93,11 +94,11 @@ When a user/client sends a request to your Serverless endpoint: 1. If no workers are active, RunPod automatically starts one (cold start). 2. The request is queued until a worker is available. 3. Your handler function processes the request. -4. The result is returned to the user/client. +4. The result is returned to the user/client after they call `/status` (see [Job operations](/serverless/endpoints/operations)). 5. Workers remain active for a period to handle additional requests. 6. Idle workers eventually shut down if no new requests arrive. -A diagram demonstrating the Serverless endpoint request flow +Diagram showing the complete flow of a request through a Serverless endpoint, from initial request to response ## Common use cases @@ -113,6 +114,6 @@ Ready to get started with RunPod Serverless? - [Deploy your first Serverless endpoint.](/serverless/get-started) - [Try a Quick Deploy model.](/serverless/quick-deploys) -- [Deploy large language models in minutes with vLLM.](/serverless/workers/vllm/overview) -- [Learn about handler functions.](/serverless/workers/handlers/overview) +- [Deploy large language models in minutes with vLLM.](/serverless/vllm/overview) +- [Learn about handler functions.](/serverless/handlers/overview) - [Learn about endpoints.](/serverless/endpoints/overview) \ No newline at end of file diff --git a/docs/serverless/quick-deploys.md b/docs/serverless/quick-deploys.md index ab8b4bab..1cddf553 100644 --- a/docs/serverless/quick-deploys.md +++ b/docs/serverless/quick-deploys.md @@ -1,5 +1,5 @@ --- -title: Quick Deploys +title: Quick Deploy an endpoint sidebar_position: 2 description: "Quickly deploy Serverless endpoints using popular AI models with minimal configuration through the web interface, following a simple 5-step process. Customize your deployments using RunPod's GitHub repositories and Handler Functions." --- @@ -25,4 +25,4 @@ Here, you can fork the programming and compute model templates. Begin with the [worker-template](https://github.com/runpod-workers/worker-template) and modify it as needed. These RunPod workers incorporate CI/CD features to streamline your project setup. -For detailed guidance on customizing your interaction Endpoints, refer to [Handler Functions](/serverless/workers/handlers/overview). \ No newline at end of file +For detailed guidance on customizing your interaction Endpoints, refer to [Handler Functions](/serverless/handlers/overview). \ No newline at end of file diff --git a/docs/serverless/references/_category_.json b/docs/serverless/references/_category_.json deleted file mode 100644 index 69bc3d2c..00000000 --- a/docs/serverless/references/_category_.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "label": "References", - "position": 9, - "link": { - "type": "generated-index", - "description": "Reference documentation for Serverless." 
- } -} diff --git a/docs/serverless/references/endpoint-configurations.md b/docs/serverless/references/endpoint-configurations.md deleted file mode 100644 index f39b4f5d..00000000 --- a/docs/serverless/references/endpoint-configurations.md +++ /dev/null @@ -1,138 +0,0 @@ ---- -title: "Endpoint configurations" -sidebar_position: 1 -description: Configure your Endpoint settings to optimize performance and cost, including GPU selection, worker count, idle timeout, and advanced options like data centers, network volumes, and scaling strategies. ---- - -The following are configurable settings within an Endpoint. - -## Endpoint Name - -Create a name you'd like to use for the Endpoint configuration. -The resulting Endpoint is assigned a random ID to be used for making calls. - -The name is only visible to you. - -## GPU Selection - -Select one or more GPUs that you want your Endpoint to run on. RunPod matches you with GPUs in the order that you select them, so the first GPU type that you select is prioritized, then the second, and so on. Selecting multiple GPU types can help you get a worker more quickly, especially if your first selection is an in-demand GPU. - -## Active (Min) Workers - -Setting the active workers to 1 or more ensures you have “always on” workers, ready to respond to job requests without cold start delays. - -Default: 0 - -:::note - -Active workers incur charges as soon as you enable them (set to >0), but they come with a discount of up to 30% off the regular price. - -::: - -## Max Workers - -Max workers set a limit on the number of workers your endpoint can run simultaneously. If the max workers are set too low, you might experience [throttled workers](/glossary#throttled-worker). To prevent this, consider increasing the max workers to 5 or more if you see frequent throttling. - -Default: 3 - -
- - -How to configure Max Workers - - -You can also configure a max worker count. This is the top limit of what RunPod will attempt to auto-scale for you. Use this to cap your concurrent request count and also limit your cost ceiling. - -:::note - -We currently base your caching coefficient by this number, so an endpoint with higher max worker count will also receive a higher priority when caching workers. - -This is partially why we limit new accounts to a relatively low max concurrency at the account level. -If you want to get this number raised, you generally will need to have a higher history of spending, or commit to a relatively high spend per month. - -You should generally aim to set your max worker count to be 20% higher than you expect your max concurrency to be. - -::: - -
- -## GPUs / Worker - -The number of GPUs you would like assigned to your worker. - -:::note - -Currently only available for 48 GB GPUs. - -::: - -## Idle Timeout - -The amount of time a worker remains running after completing its current request. During this period, the worker stays active, continuously checking the queue for new jobs, and continues to incur charges. If no new requests arrive within this time, the worker will go to sleep. - -Default: 5 seconds - -## FlashBoot - -FlashBoot is RunPod’s magic solution for reducing the average cold-start times on your endpoint. It works probabilistically. When your endpoint has consistent traffic, your workers have a higher chance of benefiting from FlashBoot for faster spin-ups. However, if your endpoint isn’t receiving frequent requests, FlashBoot has fewer opportunities to optimize performance. There’s no additional cost associated with FlashBoot. - -## Advanced - -Additional controls to help you control where your endpoint is deployed and how it responds to incoming requests. - -### Data Centers - -Control which data centers can deploy and cache your workers. Allowing multiple data centers can help you get a worker more quickly. - -Default: all data centers - -### Select Network Volume - -Attach a network storage volume to your deployed workers. - -Network volumes will be mounted to `/runpod-volume/`. - -:::note - -While this is a high performance network drive, do keep in mind that it will have higher latency than a local drive. - -This will limit the availability of cards, as your endpoint workers will be locked to the datacenter that houses your network volume. - -::: - -### Scale Type - -- **Queue Delay** scaling strategy adjusts worker numbers based on request wait times. With zero workers initially, the first request adds one worker. Subsequent requests add workers only after waiting in the queue for the defined number of delay seconds. -- **Request Count** scaling strategy adjusts worker numbers according to total requests in the queue and in progress. It automatically adds workers as the number of requests increases, ensuring tasks are handled efficiently. - -```text -_Total Workers Formula: Math.ceil((requestsInQueue + requestsInProgress) / - -What's the difference between GPU models. - -A100s are about 2-3x faster than A5000s and also allow double the VRAM with very high bandwidth throughout. 3090s and A5000s are 1.5-2x faster than A4000s. Sometimes, it may make more sense to use 24 GB even if you don't need it compared to 16 GB due to faster response times. Depending on the nature of the task, it's also possible that execution speeds may be bottlenecked and not significantly improved simply by using a higher-end card. Do your own calculations and experimentation to determine out what's most cost-effective for your workload and task type. - -Want access to different flavors? [Let us know](https://www.runpod.io/contact) and we can look at expanding our offerings! - - - -## CUDA version selection - -You have the ability to select the allowed CUDA versions for your workloads. -The CUDA version selection determines the compatible GPU types that will be used to execute your serverless tasks. - -Specifically, the CUDA version selection works as follows: - -- You can choose one or more CUDA versions that your workload is compatible with or requires. -- RunPod will then match your workload to available GPU instances that have the selected CUDA versions installed. 
-- This ensures that your serverless tasks run on GPU hardware that meets the CUDA version requirements. - -For example, if you select CUDA 11.6, your serverless tasks will be scheduled to run on GPU instances that have CUDA 11.6 or a compatible version installed. This allows you to target specific CUDA versions based on your workload's dependencies or performance requirements. diff --git a/docs/serverless/references/job-states.md b/docs/serverless/references/job-states.md deleted file mode 100644 index b013f742..00000000 --- a/docs/serverless/references/job-states.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: "Job states" -id: "job-states" -description: "Understand the various states of a job in RunPod's Handler Functions, including IN_QUEUE, IN_PROGRESS, COMPLETED, FAILED, CANCELLED, and TIMED_OUT, to effectively manage job flow and troubleshoot issues." -sidebar_position: 5 ---- - -When working with Handler Functions in RunPod, it's essential to understand the various states a job can go through from initiation to completion. -Each state provides insight into the job's current status and helps in managing the job flow effectively. - -## Job state - -Here are the states a job can be in: - -- `IN_QUEUE`: This state indicates that the job is currently in the endpoint queue. It's waiting for an available worker to pick it up for processing. -- `IN_PROGRESS`: Once a worker picks up the job, its state changes to `IN_PROGRESS`. This means the job is actively being processed and is no longer in the queue. -- `COMPLETED`: After the job successfully finishes processing and returns a result, it moves to the `COMPLETED` state. This indicates the successful execution of the job. -- `FAILED`: If a job encounters an error during its execution and returns with an error, it is marked as `FAILED`. This state signifies that the job did not complete successfully and encountered issues. -- `CANCELLED`: Jobs can be manually cancelled using the `/cancel/job_id` endpoint. If a job is cancelled before it completes or fails, it will be in the `CANCELLED` state. -- `TIMED_OUT`: This state occurs in two scenarios: when a job expires before a worker picks it up, or if the worker fails to report back a result for the job before it reaches its timeout threshold. diff --git a/docs/serverless/references/operations.md b/docs/serverless/references/operations.md deleted file mode 100644 index fb038c41..00000000 --- a/docs/serverless/references/operations.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: "Endpoint operations" -description: "RunPod's Endpoints enable job submission and output retrieval, using a constructed URL starting with https://api.runpod.ai/v2/{endpoint_id}/{operation}. Operations include job submission, synchronous execution, job status checking, and more." -sidebar_position: 2 ---- - -RunPod's Endpoints facilitate submitting jobs and retrieving outputs. 
Access these endpoints at: `https://api.runpod.ai/v2/{endpoint_id}/{operation}` - -### /run - -- **Method**: `POST` -- **Description**: Asynchronous endpoint for submitting jobs -- **Returns**: Unique Job ID -- **Payload Limit**: 10 MB -- **Rate Limit**: 1000 requests per 10 seconds, 200 concurrent -- **Job Availability**: 30 minutes after completion - -### /runsync - -- **Method**: `POST` -- **Description**: Synchronous endpoint for shorter running jobs -- **Returns**: Immediate results -- **Payload Limit**: 20 MB -- **Rate Limit**: 2000 requests per 10 seconds, 400 concurrent -- **Job Availability**: 60 seconds after completion - -### Queue Limits - -- Requests will receive a `429 (Too Many Requests)` status if: - - Queue size exceeds 50 jobs AND - - Queue size exceeds endpoint.WorkersMax * 500 - -### Status and Stream Operations - -All status and stream operations share a rate limit of 2000 requests per 10 seconds, 400 concurrent: - -- `/status/{job_id}` - Check job status and retrieve outputs - - Methods: `GET` | `POST` -- `/status-sync/{job_id}` - Synchronous status check - - Methods: `GET` | `POST` -- `/stream/{job_id}` - Stream results from generator-type handlers - - Methods: `GET` | `POST` - -### Additional Operations - -- `/cancel/{job_id}` - - **Method**: `POST` - - **Rate Limit**: 100 requests per 10 seconds, 20 concurrent - -- `/purge-queue` - - **Method**: `POST` - - **Description**: Clears all queued jobs (does not affect running jobs) - - **Rate Limit**: 2 requests per 10 seconds - -- `/health` - - **Method**: `GET` - - **Description**: Provides worker statistics and endpoint health - -- `/requests` - - **Method**: `GET` - - **Rate Limit**: 10 requests per 10 seconds, 2 concurrent - -To see how to run these Endpoint Operations, see [Invoke a Job](/serverless/endpoints/job-operations). diff --git a/docs/serverless/workers/vllm/_category_.json b/docs/serverless/vllm/_category_.json similarity index 76% rename from docs/serverless/workers/vllm/_category_.json rename to docs/serverless/vllm/_category_.json index 2c610594..13d76af6 100644 --- a/docs/serverless/workers/vllm/_category_.json +++ b/docs/serverless/vllm/_category_.json @@ -1,6 +1,6 @@ { - "label": "vLLM Endpoint", - "position": 8, + "label": "vLLM endpoints", + "position": 9, "link": { "type": "generated-index", "description": "Deploy blazingly fast OpenAI-compatible serverless endpoints for any LLM." diff --git a/docs/serverless/workers/vllm/configurable-endpoints.md b/docs/serverless/vllm/configurable-endpoints.md similarity index 84% rename from docs/serverless/workers/vllm/configurable-endpoints.md rename to docs/serverless/vllm/configurable-endpoints.md index dcac62b1..0f154589 100644 --- a/docs/serverless/workers/vllm/configurable-endpoints.md +++ b/docs/serverless/vllm/configurable-endpoints.md @@ -1,9 +1,9 @@ --- -title: Configurable Endpoints -description: "Deploy large language models with ease using RunPod's Configurable Endpoints feature, leveraging vLLM to simplify model loading, hardware configuration, and execution, allowing you to focus on model selection and customization." +title: Configurable endpoints +description: "Deploy large language models with ease using RunPod's configurable endpoints, leveraging vLLM to simplify model loading, hardware configuration, and execution, allowing you to focus on model selection and customization." --- -RunPod's Configurable Endpoints feature leverages vLLM to enable the deployment of any large language model. 
+RunPod's configurable endpoints feature leverages vLLM to enable the deployment of any large language model. When you select the **Serverless vLLM** option, RunPod utilizes vLLM's capabilities to load and run the specified Hugging Face model. By integrating vLLM into the configurable endpoints, RunPod simplifies the process of deploying and running large language models. @@ -28,7 +28,7 @@ Focus on selecting your desired model and customizing the template parameters, w 4. Review the **Environment Variables**. 5. Select **Deploy**. -Your LLM is now deployed to an Endpoint. +Your LLM is now deployed to an endpoint. You can now use the API to interact with your model. :::note diff --git a/docs/serverless/workers/vllm/environment-variables.md b/docs/serverless/vllm/environment-variables.md similarity index 100% rename from docs/serverless/workers/vllm/environment-variables.md rename to docs/serverless/vllm/environment-variables.md diff --git a/docs/serverless/workers/vllm/get-started.md b/docs/serverless/vllm/get-started.md similarity index 94% rename from docs/serverless/workers/vllm/get-started.md rename to docs/serverless/vllm/get-started.md index 5721a6f7..f78113f9 100644 --- a/docs/serverless/workers/vllm/get-started.md +++ b/docs/serverless/vllm/get-started.md @@ -9,7 +9,7 @@ import TabItem from '@theme/TabItem'; RunPod provides a simple way to run large language models (LLMs) as Serverless Endpoints. vLLM Workers are pre-built Docker images that you can configure entirely within the RunPod UI. -This tutorial will guide you through deploying an OpenAI compatible Endpoint with a vLLM inference engine on RunPod. +This tutorial will guide you through deploying an OpenAI compatible endpoint with a vLLM inference engine on RunPod. ## Prerequisites @@ -26,7 +26,7 @@ You can use RunPod's Web UI to deploy a vLLM Worker with a model directly from H 1. Log in to your RunPod account and go to the [Serverless page](https://www.runpod.io/console/serverless). 2. Under **Quick Deploy**, find **Serverless vLLM** and choose **Start**. -You will now enter the vLLM module. Follow the on-screen instructions to add your LLM as a Serverless Endpoint: +You will now enter the vLLM module. Follow the on-screen instructions to add your LLM as a Serverless endpoint: 1. Select a vLLM version. 2. Add a Hugging Face model (e.g., `openchat/openchat-3.5-0106`). @@ -38,7 +38,7 @@ On the **vLLM parameters** page, provide additional parameters and options for y 1. In **LLM Settings**, enter **8192** for the **Max Model Length** parameter. 2. Review your options and choose **Next**. -On the **Endpoint parameters** page, configure your deployment: +On the **endpoint Parameters** page, configure your deployment: 1. Specify your GPU configuration for your Worker. 2. Configure your Worker deployment. @@ -48,19 +48,19 @@ On the **Endpoint parameters** page, configure your deployment: 4. Select **Deploy**. -Once the Endpoint initializes, you can send requests to your [Endpoint](/serverless/endpoints/get-started). +Once the endpoint initializes, you can send requests to your [endpoint](/serverless/endpoints/get-started). Continue to the [Send a request](#send-a-request) section. ## Deploy using the Worker image One advantage of deploying your model with the vLLM Worker is the minimal configuration required. For most models, you only need to provide the pre-built vLLM Worker image name and the LLM model name. 
-Follow these steps to run the vLLM Worker on a Serverless Endpoint:
+Follow these steps to run the vLLM Worker on a Serverless endpoint:
 1. Log in to the [RunPod Serverless console](https://www.runpod.io/console/serverless).
-2. Select **+ New Endpoint**.
+2. Select **+ New endpoint**.
 3. Provide the following:
-   - Endpoint name
+   - endpoint name
    - Select a GPU (filter for CUDA 12.1.0+ support under the **Advanced** tab if needed)
    - Configure the number of Workers
    - (Optional) Select **FlashBoot** to speed up Worker startup times
@@ -73,20 +73,20 @@ Follow these steps to run the vLLM Worker on a Serverless Endpoint:
    - `HF_TOKEN`: (Optional) Your Hugging Face API token for private models
 4. Select **Deploy**.
-Once the Endpoint initializes, you can send requests to your [Endpoint](/serverless/endpoints/get-started).
+Once the endpoint initializes, you can send requests to your [endpoint](/serverless/endpoints/get-started).
 Continue to the [Send a request](#send-a-request) section.
-For a complete list of available environment variables, see the [vLLM Worker variables](/serverless/workers/vllm/environment-variables).
+For a complete list of available environment variables, see the [vLLM Worker variables](/serverless/vllm/environment-variables).
 ## Send a request
-This section walks you through sending a request to your Serverless Endpoint.
+This section walks you through sending a request to your Serverless endpoint.
 The vLLM Worker can use any Hugging Face model and is compatible with OpenAI's API. If you have the OpenAI library installed, you can continue using it with the vLLM Worker.
 See the [OpenAI documentation](https://platform.openai.com/docs/libraries/) for more information.
 ### Environment setup
-Set the `RUNPOD_ENDPOINT_ID` and `RUNPOD_API_KEY` environment variables with your Endpoint ID and API Key.
+Set the `RUNPOD_ENDPOINT_ID` and `RUNPOD_API_KEY` environment variables with your endpoint ID and API Key.
@@ -258,8 +258,8 @@ curl -X POST "https://api.runpod.ai/v2/yf2k4t0vl3ciaf/run" \
 If you encounter issues deploying or using vLLM Workers, check the following:
 - Ensure your RunPod API Key has the necessary permissions to deploy and access Serverless Endpoints.
-- Double-check that you have set the correct environment variables for your Endpoint ID and API Key.
+- Double-check that you have set the correct environment variables for your endpoint ID and API Key.
 - Verify that you are using the correct CUDA version for your selected GPU.
 - If using a gated model, ensure your Hugging Face token is valid and has access to the model.
-To learn more about managing your Serverless Endpoints, see the [Manage Endpoints](/serverless/endpoints/manage-endpoints) guide. For a complete reference of the vLLM Worker environment variables, see the [vLLM Worker variables](/serverless/workers/vllm/environment-variables) documentation.
+To learn more about managing your Serverless endpoints, see the [Manage Endpoints](/serverless/endpoints/manage-endpoints) guide. For a complete reference of the vLLM Worker environment variables, see the [vLLM Worker variables](/serverless/vllm/environment-variables) documentation.
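Beyond the curl call referenced in the hunk above, the OpenAI compatibility means the standard `openai` Python client can talk to the deployed worker once `RUNPOD_ENDPOINT_ID` and `RUNPOD_API_KEY` are set. The sketch below is illustrative rather than the tutorial's own snippet: the `/openai/v1` base path is an assumption about where the vLLM Worker exposes its OpenAI-compatible route, and the model name reuses the `openchat/openchat-3.5-0106` example from earlier in the guide.

```python
import os

from openai import OpenAI

# Described in the environment setup step above; assumed to be exported already.
endpoint_id = os.environ["RUNPOD_ENDPOINT_ID"]
api_key = os.environ["RUNPOD_API_KEY"]

# Point the standard OpenAI client at the worker's OpenAI-compatible route.
# The /openai/v1 path is an assumption; check the openai-compatibility page for the exact URL.
client = OpenAI(
    api_key=api_key,
    base_url=f"https://api.runpod.ai/v2/{endpoint_id}/openai/v1",
)

response = client.chat.completions.create(
    model="openchat/openchat-3.5-0106",  # the Hugging Face model the endpoint was deployed with
    messages=[{"role": "user", "content": "Explain what a Serverless endpoint is in one sentence."}],
    max_tokens=200,
)
print(response.choices[0].message.content)
```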
diff --git a/docs/serverless/workers/vllm/openai-compatibility.md b/docs/serverless/vllm/openai-compatibility.md
similarity index 100%
rename from docs/serverless/workers/vllm/openai-compatibility.md
rename to docs/serverless/vllm/openai-compatibility.md
diff --git a/docs/serverless/workers/vllm/overview.md b/docs/serverless/vllm/overview.md
similarity index 72%
rename from docs/serverless/workers/vllm/overview.md
rename to docs/serverless/vllm/overview.md
index 0295cbb7..1b55b5e2 100644
--- a/docs/serverless/workers/vllm/overview.md
+++ b/docs/serverless/vllm/overview.md
@@ -35,13 +35,13 @@ At a high level, you can set up the vLLM Worker by:
 - Configure any necessary environment variables
 - Deploy your model
-For detailed guidance on setting up, configuring, and deploying your vLLM Serverless Endpoint Worker, including compatibility details, environment variable settings, and usage examples, see [Get started](/serverless/workers/vllm/get-started).
+For detailed instructions, configuration options, and usage examples, see [Get started](/serverless/vllm/get-started).
 ### Deployment options
-- **[Configurable Endpoints](/serverless/workers/vllm/get-started#deploy-using-the-web-ui)**: (recommended) Use RunPod's Web UI to quickly deploy the OpenAI compatible LLM with the vLLM Worker.
+- **[Configurable endpoints](/serverless/vllm/get-started#deploy-using-the-web-ui)**: (recommended) Use RunPod's Web UI to quickly deploy an OpenAI-compatible LLM with the vLLM Worker.
-- **[Pre-Built docker image](/serverless/workers/vllm/get-started#deploy-using-the-worker-image)**: Leverage pre-configured Docker image for hassle-free deployment. Ideal for users seeking a quick and straightforward setup process
+- **[Pre-built Docker image](/serverless/vllm/get-started#deploy-using-the-worker-image)**: Leverage a pre-configured Docker image for hassle-free deployment. Ideal for users seeking a quick and straightforward setup process.
 - **Custom docker image**: For advanced users, customize and build your Docker image with the model baked in, offering greater control over the deployment process.
@@ -54,7 +54,7 @@ For more information on creating a custom docker image, see [Build Docker Image
 ## Next steps
-- [Get started](/serverless/workers/vllm/get-started): Learn how to deploy a vLLM Worker as a Serverless Endpoint, with detailed guides on configuration and sending requests.
-- [Configurable Endpoints](/serverless/workers/vllm/configurable-endpoints): Select your Hugging Face model and vLLM takes care of the low-level details of model loading, hardware configuration, and execution.
-- [Environment variables](/serverless/workers/vllm/environment-variables): Explore the environment variables available for the vLLM Worker, including detailed documentation and examples.
+- [Get started](/serverless/vllm/get-started): Learn how to deploy a vLLM Worker as a Serverless endpoint, with detailed guides on configuration and sending requests.
+- [Configurable endpoints](/serverless/vllm/configurable-endpoints): Select your Hugging Face model and vLLM takes care of the low-level details of model loading, hardware configuration, and execution.
+- [Environment variables](/serverless/vllm/environment-variables): Explore the environment variables available for the vLLM Worker, including detailed documentation and examples.
 - [Run Gemma 7b](/tutorials/serverless/gpu/run-gemma-7b): Walk through deploying Google's Gemma model using RunPod's vLLM Worker, guiding you to set up a Serverless Endpoint with a gated large language model (LLM).
diff --git a/docs/serverless/workers/_category_.json b/docs/serverless/workers/_category_.json
index 237cd83d..b749c721 100644
--- a/docs/serverless/workers/_category_.json
+++ b/docs/serverless/workers/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Workers",
-  "position": 7,
+  "position": 8,
   "link": {
     "type": "generated-index",
     "description": "Build your LLM with serverless workers."
diff --git a/docs/serverless/workers/deploy/deploy.md b/docs/serverless/workers/deploy.md
similarity index 96%
rename from docs/serverless/workers/deploy/deploy.md
rename to docs/serverless/workers/deploy.md
index b7a522c7..fec76b8d 100644
--- a/docs/serverless/workers/deploy/deploy.md
+++ b/docs/serverless/workers/deploy.md
@@ -9,7 +9,7 @@ This is accomplished by defining a Docker file to import everything required to
 :::note
-For deploying large language models (LLMs), you can use the [Configurable Endpoints](/serverless/workers/vllm/configurable-endpoints) feature instead of working directly with Docker.
+For deploying large language models (LLMs), you can use the [Configurable Endpoints](/serverless/vllm/configurable-endpoints) feature instead of working directly with Docker.
 Configurable Endpoints simplify the deployment process by allowing you to select a pre-configured template and customize it according to your needs.
@@ -45,7 +45,7 @@ ADD handler.py .
 CMD [ "python", "-u", "/handler.py" ]
 ```
-To build and push the image, review the steps in [Get started](/serverless/workers/overview).
+To build and push the image, review the steps in [Get started](/serverless/overview).
 > 🚧 If your handler requires external files such as model weights, be sure to cache them into your docker image. You are striving for a completely self-contained worker that doesn't need to download or fetch external files to run.
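The Dockerfile fragment in the hunk above copies a `handler.py` into the image and runs it as the container entry point. For orientation, a minimal sketch of such a handler, written against the RunPod Python SDK with echo logic standing in for real model inference, could look like this:

```python
import runpod


def handler(job):
    # job["input"] carries whatever the request placed in its "input" field.
    prompt = job["input"].get("prompt", "")
    # Run your model here; echoing the prompt stands in for real inference.
    return {"echo": prompt}


# Start the worker loop so the platform can route queued jobs to the handler.
runpod.serverless.start({"handler": handler})
```

Baking any required model weights into the image, as the note above advises, keeps a worker like this fully self-contained.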
diff --git a/docs/serverless/workers/deploy/_category_.json b/docs/serverless/workers/deploy/_category_.json
deleted file mode 100644
index 397c6ea5..00000000
--- a/docs/serverless/workers/deploy/_category_.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "label": "Deploy",
-  "position": 7,
-  "link": {
-    "type": "generated-index",
-    "description": "Learn to deploy your application to a serverless Worker."
-  }
-}
diff --git a/docs/tutorials/migrations/banana/overview.md b/docs/tutorials/migrations/banana/overview.md
index 977be6b1..2abffc03 100644
--- a/docs/tutorials/migrations/banana/overview.md
+++ b/docs/tutorials/migrations/banana/overview.md
@@ -182,7 +182,7 @@ Banana uses a `banana_config.json` file which contains things like Idle Timeout,
 **Idle Timeout**
-RunPod allows you to set an [Idle Timeout](/serverless/references/endpoint-configurations#idle-timeout) when creating the Endpoint.
+RunPod allows you to set an [Idle Timeout](/serverless/endpoints/endpoint-configurations#idle-timeout) when creating the endpoint.
 The default value is 5 seconds.
 **Inference Timeout**
@@ -193,8 +193,8 @@ For runs that take longer than 30 seconds to execute, you should use the `sync`
 **Max Replicas**
-When creating a Worker in RunPod, you can set the max Workers that will scale up depending on the amount of Worker sent to your Endpoint.
-For more information, see [Scale Type](/serverless/references/endpoint-configurations#scale-type).
+When creating a Worker in RunPod, you can set the max Workers that will scale up depending on the amount of work sent to your endpoint.
+For more information, see [Scale Type](/serverless/endpoints/endpoint-configurations#scale-type).
 :::note
diff --git a/docs/tutorials/migrations/cog/overview.md b/docs/tutorials/migrations/cog/overview.md
index a5bd0f98..6813b4df 100644
--- a/docs/tutorials/migrations/cog/overview.md
+++ b/docs/tutorials/migrations/cog/overview.md
@@ -19,7 +19,7 @@ This guide assumes you are operating within a Linux terminal environment and hav
 This method might incur a delay when working with RunPod Serverless Endpoints.
 This delay is due to the FastAPI server that is used to run the Cog model.
-To eliminate this delay, consider using [RunPod Handler](/serverless/workers/overview) functions in a future iteration.
+To eliminate this delay, consider using [RunPod Handler](/serverless/handlers/overview) functions in a future iteration.
 :::
diff --git a/docusaurus.config.js b/docusaurus.config.js
index 1d895979..e9be839f 100644
--- a/docusaurus.config.js
+++ b/docusaurus.config.js
@@ -251,6 +251,41 @@ const config = {
         apiKey: "phc_1ku7R949l2D5wsXgMCBNSRIVRMiAn8FyKFNoJWDCcOb",
       },
     ],
+    [
+      "@docusaurus/plugin-client-redirects",
+      {
+        createRedirects(existingPath) {
+          const redirects = [];
+          // Redirect from /serverless/workers/vllm/ to /serverless/vllm/
+          if (existingPath.startsWith("/serverless/vllm/")) {
+            redirects.push(existingPath.replace("/serverless/vllm/", "/serverless/workers/vllm/"));
+          }
+          // Redirect from /serverless/workers/handlers/ to /serverless/handlers/
+          else if (existingPath.startsWith("/serverless/handlers/")) {
+            redirects.push(existingPath.replace("/serverless/handlers/", "/serverless/workers/handlers/"));
+          }
+          // Redirect from /serverless/workers/development/ to /serverless/development/
+          else if (existingPath.startsWith("/serverless/development/")) {
+            redirects.push(existingPath.replace("/serverless/development/", "/serverless/workers/development/"));
+          }
+          if (existingPath.includes('/serverless/endpoints/')) {
+            redirects.push(existingPath.replace('/serverless/endpoints/', '/serverless/references/'));
+          }
+          return redirects;
+        },
+
+        redirects: [
+          {
+            to: '/serverless/endpoints/send-requests',
+            from: '/serverless/endpoints/get-started',
+          },
+          {
+            to: '/serverless/endpoints/operations',
+            from: '/serverless/endpoints/job-operations',
+          },
+        ]
+      },
+    ]
   ],
 };
diff --git a/package.json b/package.json
index e0c5b76d..46d4676c 100644
--- a/package.json
+++ b/package.json
@@ -18,6 +18,7 @@
   "dependencies": {
     "@docusaurus/core": "^3.7.0",
     "@docusaurus/faster": "^3.7.0",
+    "@docusaurus/plugin-client-redirects": "^3.7.0",
     "@docusaurus/preset-classic": "^3.7.0",
     "@mdx-js/react": "^3.0.0",
     "@vercel/speed-insights": "^1.0.3",
diff --git a/src/pages/index.js b/src/pages/index.js
index e090789c..e97b0860 100644
--- a/src/pages/index.js
+++ b/src/pages/index.js
@@ -69,7 +69,7 @@ const ActionCards = [
       "Make inference requests via API",
     ],
     cta: "Deploy vLLM",
-    url: "/serverless/workers/vllm/get-started",
+    url: "/serverless/vllm/get-started",
     color: "primary",
   },
   {
diff --git a/yarn.lock b/yarn.lock
index 13cde1f1..9fc6943f 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1698,6 +1698,21 @@
     react-helmet-async "*"
     react-loadable "npm:@docusaurus/react-loadable@6.0.0"

+"@docusaurus/plugin-client-redirects@^3.7.0":
+  version "3.7.0"
+  resolved "https://registry.yarnpkg.com/@docusaurus/plugin-client-redirects/-/plugin-client-redirects-3.7.0.tgz#b5cf92529768c457c01ad350bfc50862c6149463"
"https://registry.yarnpkg.com/@docusaurus/plugin-client-redirects/-/plugin-client-redirects-3.7.0.tgz#b5cf92529768c457c01ad350bfc50862c6149463" + integrity sha512-6B4XAtE5ZVKOyhPgpgMkb7LwCkN+Hgd4vOnlbwR8nCdTQhLjz8MHbGlwwvZ/cay2SPNRX5KssqKAlcHVZP2m8g== + dependencies: + "@docusaurus/core" "3.7.0" + "@docusaurus/logger" "3.7.0" + "@docusaurus/utils" "3.7.0" + "@docusaurus/utils-common" "3.7.0" + "@docusaurus/utils-validation" "3.7.0" + eta "^2.2.0" + fs-extra "^11.1.1" + lodash "^4.17.21" + tslib "^2.6.0" + "@docusaurus/plugin-content-blog@3.7.0": version "3.7.0" resolved "https://registry.yarnpkg.com/@docusaurus/plugin-content-blog/-/plugin-content-blog-3.7.0.tgz#7bd69de87a1f3adb652e1473ef5b7ccc9468f47e"