Unable to correctly schedule a Pod that requests a GPU #661
Comments
Based on the error, the current Pod is still being scheduled by
This looks similar to #653; you can refer to that issue.
OK, thanks a lot, I'll take a look.
Reference:
I found the problem: it seems I had installed nvidia-device-plugin before installing HAMi, so the two conflict with each other at scheduling time.
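For anyone hitting the same conflict, a minimal cleanup sketch (the DaemonSet name nvidia-device-plugin-daemonset and the kube-system namespace are assumptions; adjust them to your install) that leaves only the HAMi device plugin advertising nvidia.com/gpu:

# See which device-plugin DaemonSets are currently deployed
kubectl get daemonset -A | grep -iE 'nvidia-device-plugin|hami'

# Delete the standalone NVIDIA device plugin so it stops advertising nvidia.com/gpu
# (name and namespace are assumptions)
kubectl delete daemonset nvidia-device-plugin-daemonset -n kube-system

# Confirm the node's allocatable nvidia.com/* resources are now served by HAMi
kubectl get node <node-name> -o jsonpath='{.status.allocatable}'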
Please provide an in-depth description of the question you have:
When I create a Pod with the following YAML:
apiVersion: v1          # API version of the Kubernetes object defined in this file
kind: Pod               # type of object defined in this file
metadata:
  name: gpu-pod1
spec:
  containers:
    - name: ubuntu-container
      image: ubuntu:18.04
      command: ["bash", "-c", "sleep 86400"]  # keep the container asleep for 24 hours so it stays running;
                                              # while idle it uses no compute and only consumes resources when a request arrives
      resources:
        limits:
          nvidia.com/gpu: 1                   # request 1 vGPU
the Pod fails with an UnexpectedAdmissionError, as shown in the screenshot below.
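The full admission message and the node's advertised GPU resources can be inspected like this (a diagnostic sketch; gpu-pod1 in the default namespace and <node-name> are placeholders):

# Show the events and the exact UnexpectedAdmissionError reason reported by the kubelet
kubectl describe pod gpu-pod1

# Check which nvidia.com/* resources the node actually advertises
kubectl get node <node-name> -o jsonpath='{.status.allocatable}'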
And when I create a Pod with the following YAML:
apiVersion: v1          # API version of the Kubernetes object defined in this file
kind: Pod               # type of object defined in this file
metadata:
  name: gpu-pod2
spec:
  containers:
    - name: ubuntu-container
      image: ubuntu:18.04
      command: ["bash", "-c", "sleep 86400"]  # keep the container asleep for 24 hours so it stays running
      resources:
        limits:
          nvidia.com/gpu: 1        # request 1 vGPU
          nvidia.com/gpumem: 100   # device memory requested per vGPU, in MiB (optional, integer)
          nvidia.com/gpucores: 5   # percentage of the GPU's compute allocated to each vGPU (optional, integer)
the Pod stays in Pending indefinitely, as shown in the screenshot below.
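For the Pending case, the scheduler events usually say why no node fits; a diagnostic sketch:

# Show the scheduling events (e.g. "0/1 nodes are available: ...") for the pending pod
kubectl describe pod gpu-pod2

# Check whether the Pod was actually handed to the HAMi scheduler or to the default scheduler
kubectl get pod gpu-pod2 -o jsonpath='{.spec.schedulerName}'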
What do you think about this question?:
Environment:
NVIDIA Container Runtime version 1.17.2
commit: fa66e4cd562804509055e44a88f666673e6d27c0
spec: 1.2.0
runc version 1.1.12-0ubuntu2~22.04.1
spec: 1.0.2-dev
go: go1.21.1
libseccomp: 2.5.3
1. containerd config.toml:
disabled_plugins = []
imports = ["/etc/containerd/config.toml"]
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
temp = ""
version = 2
[cgroup]
path = ""
[debug]
address = ""
format = ""
gid = 0
level = ""
uid = 0
[grpc]
address = "/run/containerd/containerd.sock"
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
tcp_address = ""
tcp_tls_ca = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
[metrics]
address = ""
grpc_histogram = false
[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
deletion_threshold = 0
mutation_threshold = 100
pause_threshold = 0.02
schedule_delay = "0s"
startup_delay = "100ms"
[plugins."io.containerd.grpc.v1.cri"]
cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
device_ownership_from_security_context = false
disable_apparmor = false
disable_cgroup = false
disable_hugetlb_controller = true
disable_proc_mount = false
disable_tcp_service = true
drain_exec_sync_io_timeout = "0s"
enable_cdi = false
enable_selinux = false
enable_tls_streaming = false
enable_unprivileged_icmp = false
enable_unprivileged_ports = false
ignore_deprecation_warnings = []
ignore_image_defined_volumes = false
image_pull_progress_timeout = "5m0s"
image_pull_with_sync_fs = false
max_concurrent_downloads = 3
max_container_log_line_size = 16384
netns_mounts_under_state_dir = false
restrict_oom_score_adj = false
sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"
selinux_category_range = 1024
stats_collect_period = 10
stream_idle_timeout = "4h0m0s"
stream_server_address = "127.0.0.1"
stream_server_port = "0"
systemd_cgroup = false
tolerate_missing_hugetlb_controller = true
unset_seccomp_profile = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.internal.v1.tracing"]
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
[plugins."io.containerd.nri.v1.nri"]
disable = true
disable_connections = false
plugin_config_path = "/etc/nri/conf.d"
plugin_path = "/opt/nri/plugins"
plugin_registration_timeout = "5s"
plugin_request_timeout = "2s"
socket_path = "/var/run/nri/nri.sock"
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "runc"
runtime_root = ""
shim = "containerd-shim"
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
sched_core = false
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
[plugins."io.containerd.service.v1.tasks-service"]
blockio_config_file = ""
rdt_config_file = ""
[plugins."io.containerd.snapshotter.v1.aufs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.blockfile"]
fs_type = ""
mount_options = []
root_path = ""
scratch_file = ""
[plugins."io.containerd.snapshotter.v1.btrfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.devmapper"]
async_remove = false
base_image_size = ""
discard_blocks = false
fs_options = ""
fs_type = ""
pool_name = ""
root_path = ""
[plugins."io.containerd.snapshotter.v1.native"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.overlayfs"]
mount_options = []
root_path = ""
sync_remove = false
upperdir_label = false
[plugins."io.containerd.snapshotter.v1.zfs"]
root_path = ""
[plugins."io.containerd.tracing.processor.v1.otlp"]
[plugins."io.containerd.transfer.v1.local"]
config_path = ""
max_concurrent_downloads = 3
max_concurrent_uploaded_layers = 3
[proxy_plugins]
[stream_processors]
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar"
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar+gzip"
[timeouts]
"io.containerd.timeout.bolt.open" = "0s"
"io.containerd.timeout.metrics.shimstats" = "2s"
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[ttrpc]
address = ""
gid = 0
uid = 0
2. Status of the HAMi-related pods installed by helm:
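The same status can be listed with the commands below (release name and namespace are assumptions; adjust to your install):

# Show the HAMi helm release and the status of its pods
helm list -A | grep -i hami
kubectl get pods -n kube-system | grep -i hami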