[TOC]
故障 pod describe
[root@master1 ~]# kubectl -n kube-system describe pod ascend-device-plugin-ll46f
Name: ascend-device-plugin-ll46f
Namespace: kube-system
Priority: 2000001000
Priority Class Name: system-node-critical
Service Account: ascend-device-plugin-sa
Node: master1/10.17.30.131
Start Time: Mon, 30 Mar 2026 11:08:32 +0800
Labels: app.kubernetes.io/managed-by=npu-operator
controller-revision-hash=7df5dcb887
helm.sh/chart=npu-operator-0.15.0
name=ascend-device-plugin-ds
pod-template-generation=1
Annotations: cni.projectcalico.org/containerID: c1f2adcaeaaf2bdcf0a6e09730f68231a293074e31d58f61997f714dfb520878
cni.projectcalico.org/podIP: 192.168.137.118/32
cni.projectcalico.org/podIPs: 192.168.137.118/32
scheduler.alpha.kubernetes.io/critical-pod:
seccomp.security.alpha.kubernetes.io/pod: runtime/default
Status: Running
IP: 192.168.137.118
IPs:
IP: 192.168.137.118
Controlled By: DaemonSet/ascend-device-plugin
Init Containers:
init-permission:
Container ID: containerd://4406968a522bea48dfefebae81ec53644312762af4781c25de689952ed6c2d27
Image: cr.openfuyao.cn/openfuyao/busybox:1.36.1
Image ID: cr.openfuyao.cn/openfuyao/busybox@sha256:4b8407fadd8100c61b097d63efe992b2c033e7d371c9117f7a9462fe87e31176
Port: <none>
Host Port: <none>
Command:
sh
-c
chown 9000:9000 /var/log/mindx-dl /var/log/mindx-dl/devicePlugin
chmod 750 /var/log/mindx-dl/devicePlugin
State: Terminated
Reason: Completed
Exit Code: 0
Started: Mon, 30 Mar 2026 15:28:32 +0800
Finished: Mon, 30 Mar 2026 15:28:32 +0800
Ready: True
Restart Count: 1
Environment: <none>
Mounts:
/var/log/mindx-dl/devicePlugin from log-path (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-gfldg (ro)
Containers:
device-plugin-01:
Container ID: containerd://fcc0c4742285847e2621a9a9217502307fc7e28644fbf86b32f9c11d67a2c0ab
Image: cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin:v6.0.0
Image ID: cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin@sha256:a5b9612b21bcd35384f9f19a05b2d7915b865e7b2be6a30bfd7806a9b8a86f58
Port: <none>
Host Port: <none>
Command:
/bin/bash
-c
--
Args:
device-plugin -useAscendDocker=true -volcanoType=false -logFile=/var/log/mindx-dl/devicePlugin/devicePlugin.log -logLevel=0
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Completed
Exit Code: 0
Started: Tue, 31 Mar 2026 10:28:58 +0800
Finished: Tue, 31 Mar 2026 10:28:58 +0800
Ready: False
Restart Count: 274
Limits:
cpu: 500m
memory: 500Mi
Requests:
cpu: 500m
memory: 500Mi
Environment:
NODE_NAME: (v1:spec.nodeName)
Mounts:
/tmp from tmp (rw)
/usr/local/Ascend/driver from hiai-driver (ro)
/var/lib/kubelet/device-plugins from device-plugin (rw)
/var/lib/kubelet/pod-resources from pod-resource (rw)
/var/log/mindx-dl/devicePlugin from log-path (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-gfldg (ro)
Conditions:
Type Status
PodReadyToStartContainers True
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
device-plugin:
Type: HostPath (bare host directory volume)
Path: /var/lib/kubelet/device-plugins
HostPathType:
pod-resource:
Type: HostPath (bare host directory volume)
Path: /var/lib/kubelet/pod-resources
HostPathType:
hiai-driver:
Type: HostPath (bare host directory volume)
Path: /usr/local/Ascend/driver
HostPathType:
log-path:
Type: HostPath (bare host directory volume)
Path: /var/log/mindx-dl/devicePlugin
HostPathType: DirectoryOrCreate
tmp:
Type: HostPath (bare host directory volume)
Path: /tmp
HostPathType:
kube-api-access-gfldg:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
Optional: false
DownwardAPI: true
QoS Class: Burstable
Node-Selectors: openfuyao.com/npu.present=
Tolerations: CriticalAddonsOnly op=Exists
device-plugin=v2:NoSchedule
huawei.com/Ascend910:NoSchedule op=Exists
node-role.kubernetes.io/control-plane:NoSchedule
node-role.kubernetes.io/master:NoSchedule
node.kubernetes.io/disk-pressure:NoSchedule op=Exists
node.kubernetes.io/memory-pressure:NoSchedule op=Exists
node.kubernetes.io/not-ready:NoExecute op=Exists
node.kubernetes.io/pid-pressure:NoSchedule op=Exists
node.kubernetes.io/unreachable:NoExecute op=Exists
node.kubernetes.io/unschedulable:NoSchedule op=Exists
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Pulled 16m (x205 over 18h) kubelet (combined from similar events): Successfully pulled image "cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin:v6.0.0" in 403ms (403ms including waiting). Image size: 48017174 bytes.
Warning BackOff 2m47s (x5216 over 18h) kubelet Back-off restarting failed container device-plugin-01 in pod ascend-device-plugin-ll46f_kube-system(8edcd384-ab2d-4998-8077-5ac58801c79e)
Normal Pulling 66s (x227 over 19h) kubelet Pulling image "cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin:v6.0.0"
故障 pod /dev 检查
[root@master1 fuyao-26.3-rc3]# kubectl -n kube-system exec -it daemonsets/ascend-device-plugin -- ls /dev
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
autofs null tty10 tty34 tty58 vcs5
bsg ppp tty11 tty35 tty59 vcs6
btrfs-control ptmx tty12 tty36 tty6 vcsa
bus pts tty13 tty37 tty60 vcsa1
core random tty14 tty38 tty61 vcsa2
cpu_dma_latency raw tty15 tty39 tty62 vcsa3
cuse relationship_ctrl tty16 tty4 tty63 vcsa4
davinci0 rfkill tty17 tty40 tty7 vcsa5
davinci_manager rtc0 tty18 tty41 tty8 vcsa6
devmm_svm sda tty19 tty42 tty9 vcsu
dri sda1 tty2 tty43 ttyAMA0 vcsu1
fb0 sda2 tty20 tty44 ttyS0 vcsu2
fd sg0 tty21 tty45 ttyS1 vcsu3
full sg1 tty22 tty46 ttyS2 vcsu4
fuse sg2 tty23 tty47 ttyS3 vcsu5
hidraw0 shm tty24 tty48 uhid vcsu6
hidraw1 snapshot tty25 tty49 uinput vfio
hisi_hdc sr0 tty26 tty5 urandom vga_arbiter
hwrng sr1 tty27 tty50 usbmon0 vhost-net
input stderr tty28 tty51 usbmon1 vhost-vsock
kmsg stdin tty29 tty52 usbmon2 vport2p1
loop-control stdout tty3 tty53 vcs zero
mapper termination-log tty30 tty54 vcs1
mem tty tty31 tty55 vcs2
mqueue tty0 tty32 tty56 vcs3
net tty1 tty33 tty57 vcs4
故障 pod 驱动检查
[root@master1 fuyao-26.3-rc3]# kubectl -n kube-system exec -it daemonsets/ascend-device-plugin -- ls -lha /usr/local/Ascend/driver
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
total 44K
drwxr-xr-x 8 root root 4.0K Mar 27 08:03 .
drwxr-xr-x 3 root root 4.0K Mar 31 02:34 ..
drwxr-xr-x 2 root root 4.0K Mar 27 08:01 bin
-r--r--r-- 1 root root 20 Mar 27 08:01 build.info
dr-xr-x--- 2 root root 4.0K Mar 27 08:01 device
dr-x------ 41 root root 4.0K Mar 27 08:01 kernel
drwxr-xr-x 6 root root 4.0K Mar 27 08:01 lib64
-r--r----- 1 root root 56 Mar 27 08:01 scene.info
dr-xr-x--- 2 root root 4.0K Mar 27 08:01 script
drwxr-xr-x 2 root root 4.0K Mar 27 08:01 tools
-r--r--r-- 1 root root 352 Mar 27 08:03 version.info
故障 pod 日志
[root@master1 ~]# kubectl -n kube-system logs daemonsets/ascend-device-plugin --previous
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
[INFO] 2026/03/31 06:46:54.593254 1 hwlog/api.go:108 devicePlugin.log's logger init success
[INFO] 2026/03/31 06:46:54.593449 1 main.go:187 ascend device plugin starting and the version is v6.0.0_linux-aarch64
[INFO] 2026/03/31 06:46:54.593494 1 main.go:188 ascend device plugin starting scene is center
[INFO] 2026/03/31 06:46:54.787930 1 devmanager/devmanager.go:104 the dcmi version is 24.1.rc3
[ERROR] 2026/03/31 06:46:54.788019 1 devmanager/devmanager.go:211 get error card quantity: 0
[ERROR] 2026/03/31 06:46:54.788052 1 devmanager/devmanager.go:195 get card list failed for init
[ERROR] 2026/03/31 06:46:54.788101 1 main.go:203 init devmanager failed, err: auto init failed, err: get card list failed for init
故障 pod 驱动检查
[root@master1 ~]# kubectl -n kube-system exec -it daemonsets/ascend-device-plugin -- bash -c 'find /usr/local/Ascend/driver -name libdcmi.so 2>/dev/null; echo $LD_LIBRARY_PATH'
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
/usr/local/Ascend/driver/lib64/driver/libdcmi.so
command terminated with exit code 137
[root@master1 ~]# ps -ef | grep -E 'dmp_daemon|slogd' | grep -v grep
root 21578 1 0 Mar30 ? 00:00:19 /usr/sbin/rsyslogd -n -i/var/run/rsyslogd.pid
检查服务状态?
[root@master1 ~]# systemctl status ascend-dmi
Unit ascend-dmi.service could not be found.
[root@master1 ~]# systemctl status ascend-dkms
Unit ascend-dkms.service could not be found.
[root@master1 ~]# systemctl status npu-smi
Unit npu-smi.service could not be found.
[root@master1 ~]# find / -name dmp_daemon 2>/dev/null
[root@master1 ~]# find / -name slogd 2>/dev/null
[root@master1 ~]# ls -l /var/dmp_daemon /var/slogd 2>/dev/null
[root@master1 ~]#
dcmi 问题,需硬件排查
检查卡获取
#include <stdlib.h>
#include <stdio.h>
#include "dcmi_interface_api.h"
/* Forward declaration: the helper is defined after main(). */
int my_get_card_list(void);

/*
 * Entry point of the probe. Command-line arguments are not used.
 * Exits with EXIT_SUCCESS when the card list could be read and
 * EXIT_FAILURE when DCMI initialization or the query failed.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    return my_get_card_list() == DCMI_OK ? EXIT_SUCCESS : EXIT_FAILURE;
}

/*
 * Initializes DCMI, queries the card list and prints it to stdout.
 *
 * Returns DCMI_OK on success, otherwise the return code of the DCMI call
 * that failed (a diagnostic line is printed before returning).
 */
int my_get_card_list(void)
{
    int card_num = 0;
    int card_list[16] = {0};
    const int capacity = (int)(sizeof(card_list) / sizeof(card_list[0]));
    int ret;

    printf("\n==================================card id info list=========================\n");

    /* Stop early if the library cannot be initialized; later calls would be meaningless. */
    ret = dcmi_init();
    if (ret != DCMI_OK) {
        printf("dcmi init failed ret=%d\n", ret);
        return ret;
    }

    ret = dcmi_get_card_list(&card_num, card_list, capacity);
    if (ret != DCMI_OK) {
        printf("dcmi get card list failed ret=%d\n", ret);
        return ret;
    }

    /*
     * The list is printed without a closing bracket or trailing newline;
     * the success-path output format is deliberately left unchanged.
     */
    printf("card_num=%d, card_list:[",card_num);
    /* Never read past the local buffer, whatever count was reported. */
    for (int i = 0; i < card_num && i < capacity; i++) {
        printf("%d ", card_list[i]);
    }
    return DCMI_OK;
}
cc ./test1.c -o test1 -I /usr/local/dcmi -L /usr/local/dcmi -ldcmi
-I头文件(.h)搜索路径
-L库文件(.so/.a)搜索路径
-l链接的库名(去掉 lib 前缀)
nerdctl run --rm \
-v /usr/local/Ascend:/usr/local/Ascend \
-v /usr/local/dcmi:/usr/local/dcmi \
-v $(pwd):/build \
ubuntu:18.04 bash -c "
sed -i -e 's@http*://ports.ubuntu.com/\? @http://10.17.31.217:8081/repository/mirror-ubuntu-ports/@g' \
-e 's@http*://ports.ubuntu.com@http://10.17.31.217:8081/repository/mirror-ubuntu-ports@g' \
/etc/apt/sources.list
apt update && apt install -y gcc
cd /build
cc ./test1.c -o test1 \
-I /usr/local/dcmi \
-L /usr/local/dcmi \
-L /usr/local/Ascend/driver/lib64/common \
-L /usr/local/Ascend/driver/lib64/driver \
-ldcmi \
-Wl,-rpath,/usr/local/Ascend/driver/lib64/common \
-Wl,-rpath,/usr/local/Ascend/driver/lib64/driver \
-Wl,-rpath,/usr/local/dcmi
"
分析二进制:
[root@master1 ascend_debug]# ldd ./test1 | grep -i dcmi
libdcmi.so => /usr/local/Ascend/driver/lib64/driver/libdcmi.so (0x0000ffffa6dd0000)
[root@master1 ascend_debug]# LD_DEBUG=libs ./test1 2>&1 | grep -i dcmi
284830: find library=libdcmi.so [0]; searching
284830: search path=/usr/local/Ascend/driver/lib64/common/tls/aarch64/atomics:/usr/local/Ascend/driver/lib64/common/tls/aarch64:/usr/local/Ascend/driver/lib64/common/tls/atomics:/usr/local/Ascend/driver/lib64/common/tls:/usr/local/Ascend/driver/lib64/common/aarch64/atomics:/usr/local/Ascend/driver/lib64/common/aarch64:/usr/local/Ascend/driver/lib64/common/atomics:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver/tls/aarch64/atomics:/usr/local/Ascend/driver/lib64/driver/tls/aarch64:/usr/local/Ascend/driver/lib64/driver/tls/atomics:/usr/local/Ascend/driver/lib64/driver/tls:/usr/local/Ascend/driver/lib64/driver/aarch64/atomics:/usr/local/Ascend/driver/lib64/driver/aarch64:/usr/local/Ascend/driver/lib64/driver/atomics:/usr/local/Ascend/driver/lib64/driver:/usr/local/dcmi/tls/aarch64/atomics:/usr/local/dcmi/tls/aarch64:/usr/local/dcmi/tls/atomics:/usr/local/dcmi/tls:/usr/local/dcmi/aarch64/atomics:/usr/local/dcmi/aarch64:/usr/local/dcmi/atomics:/usr/local/dcmi (RUNPATH from file ./test1)
284830: trying file=/usr/local/Ascend/driver/lib64/common/tls/aarch64/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/tls/aarch64/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/tls/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/tls/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/aarch64/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/aarch64/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/common/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/tls/aarch64/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/tls/aarch64/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/tls/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/tls/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/aarch64/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/aarch64/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/atomics/libdcmi.so
284830: trying file=/usr/local/Ascend/driver/lib64/driver/libdcmi.so
284830: search path=/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/dcmi/tls/aarch64/atomics:/usr/local/dcmi/tls/aarch64:/usr/local/dcmi/tls/atomics:/usr/local/dcmi/tls:/usr/local/dcmi/aarch64/atomics:/usr/local/dcmi/aarch64:/usr/local/dcmi/atomics:/usr/local/dcmi (RUNPATH from file ./test1)
284830: trying file=/usr/local/dcmi/tls/aarch64/atomics/libc.so.6
284830: trying file=/usr/local/dcmi/tls/aarch64/libc.so.6
284830: trying file=/usr/local/dcmi/tls/atomics/libc.so.6
284830: trying file=/usr/local/dcmi/tls/libc.so.6
284830: trying file=/usr/local/dcmi/aarch64/atomics/libc.so.6
284830: trying file=/usr/local/dcmi/aarch64/libc.so.6
284830: trying file=/usr/local/dcmi/atomics/libc.so.6
284830: trying file=/usr/local/dcmi/libc.so.6
284830: calling init: /usr/local/Ascend/driver/lib64/driver/libdcmi.so
284830: calling fini: /usr/local/Ascend/driver/lib64/driver/libdcmi.so [0]
[root@master1 ascend_debug]# readlink -f /usr/local/dcmi/libdcmi.so
/usr/local/dcmi/libdcmi.so
[root@master1 ascend_debug]# readlink -f /usr/local/Ascend/driver/lib64/driver/libdcmi.so
/usr/local/Ascend/driver/lib64/driver/libdcmi.so
[root@master1 ascend_debug]# sha256sum /usr/local/dcmi/libdcmi.so /usr/local/Ascend/driver/lib64/driver/libdcmi.so
13a38cae84bad0f06367ff9280016e372c0608ca16465b5ae5f000d3844ee401 /usr/local/dcmi/libdcmi.so
13a38cae84bad0f06367ff9280016e372c0608ca16465b5ae5f000d3844ee401 /usr/local/Ascend/driver/lib64/driver/libdcmi.so
跟踪 strace
宿主机跑
strace -f -o /tmp/host.strace -e trace=file,ioctl ./test1
容器里跑
strace -f -o /tmp/container.strace -e trace=file,ioctl ./test1
新增挂载继续跟踪
volumeMounts:
- name: hdc-basic
mountPath: /etc/hdcBasic.cfg
readOnly: true
- name: localtime
mountPath: /etc/localtime
readOnly: true
volumes:
- name: hdc-basic
hostPath:
path: /etc/hdcBasic.cfg
type: File
- name: localtime
hostPath:
path: /etc/localtime
type: File
跟踪并查看日志
kubectl -n kube-system exec -it ascend-device-plugin-69q5t -c device-plugin-01 -- bash
strace -f -o /tmp/container.strace -e trace=file,ioctl ./test1
root@ascend-device-plugin-69q5t:/tmp# strace -f -o /tmp/container.strace -e trace=file,ioctl ./test1
==================================card id info list=========================
card_num=0, card_list:[
root@ascend-device-plugin-69q5t:/tmp# cat /var/log/nputools_LOG_INFO.log > /tmp/nputools_LOG_INFO.log
root@ascend-device-plugin-69q5t:/tmp# cat /var/log/nputools_LOG_ERR.log > /tmp/nputools_LOG_ERR.log
cat: /var/log/nputools_LOG_ERR.log: No such file or directory
root@ascend-device-plugin-69q5t:/tmp# cat /tmp/nputools_LOG_INFO.log
[2026/04/01 11:18:12][0583][root][127.0.0.1][dcmi_api.c,dcmi_board_init,86]:dcmi board init success. device_count=1.
[2026/04/01 11:18:12][0583][root][127.0.0.1][dcmi_api.c,dcmi_init,119]:dcmi init all success.
检查代码2
#include <stdio.h>
#include <stdlib.h>
#include "dcmi_interface_api.h"
#ifndef DCMI_OK
#define DCMI_OK 0
#endif
/* 头文件里没看到这个声明,手动补一个 */
extern int dcmi_get_card_num_list(int *card_num, int *card_list, int list_length);
/*
 * Print one query result on a single line in the form
 *   <name> ret=<ret> num=<num> list=[<id> <id> ... ]
 * where the ids are the first `num` entries of `list`.
 */
static void print_list(const char *name, int ret, int num, int *list)
{
    int idx = 0;

    printf("%s ret=%d num=%d list=[", name, ret, num);
    while (idx < num) {
        printf("%d ", list[idx]);
        idx++;
    }
    printf("]\n");
}
/*
 * Probe program: initialize DCMI, print the results of both card-list
 * queries, then print the device count of every card reported by
 * dcmi_get_card_list. Exits with 1 if dcmi_init fails, 0 otherwise.
 */
int main(void)
{
    int card_num = 0;
    int card_num2 = 0;
    int card_list[16] = {0};
    int card_list2[16] = {0};
    int ret;
    int idx;

    ret = dcmi_init();
    printf("dcmi_init ret=%d\n", ret);
    if (ret != DCMI_OK) {
        return 1;
    }

    ret = dcmi_get_card_list(&card_num, card_list, 16);
    print_list("dcmi_get_card_list", ret, card_num, card_list);

    ret = dcmi_get_card_num_list(&card_num2, card_list2, 16);
    print_list("dcmi_get_card_num_list", ret, card_num2, card_list2);

    /* Query each reported card; the index is capped at the 16-entry buffer. */
    idx = 0;
    while (idx < card_num && idx < 16) {
        int dev_num = -1;

        ret = dcmi_get_device_num_in_card(card_list[idx], &dev_num);
        printf("dcmi_get_device_num_in_card card=%d ret=%d dev_num=%d\n",
               card_list[idx], ret, dev_num);
        idx++;
    }
    return 0;
}
主机编译
cc ./test2.c -o test2 -I /usr/local/dcmi -L /usr/local/dcmi -ldcmi
容器编译
nerdctl run --rm \
-v /usr/local/Ascend:/usr/local/Ascend \
-v /usr/local/dcmi:/usr/local/dcmi \
-v $(pwd):/build \
ubuntu:18.04 bash -c "
sed -i -e 's@http*://ports.ubuntu.com/\? @http://10.17.31.217:8081/repository/mirror-ubuntu-ports/@g' \
-e 's@http*://ports.ubuntu.com@http://10.17.31.217:8081/repository/mirror-ubuntu-ports@g' \
/etc/apt/sources.list
apt update && apt install -y gcc
cd /build
cc ./test2.c -o test2 \
-I /usr/local/dcmi \
-L /usr/local/dcmi \
-L /usr/local/Ascend/driver/lib64/common \
-L /usr/local/Ascend/driver/lib64/driver \
-ldcmi \
-Wl,-rpath,/usr/local/Ascend/driver/lib64/common \
-Wl,-rpath,/usr/local/Ascend/driver/lib64/driver \
-Wl,-rpath,/usr/local/dcmi
"
拷入容器运行
kubectl -n kube-system cp ./test2 ascend-device-plugin-69q5t:/tmp/
# 主机运行
[root@master1 ascend_debug]# ./test2
dcmi_init ret=0
dcmi_get_card_list ret=0 num=1 list=[176 ]
dcmi_get_card_num_list ret=0 num=1 list=[176 ]
dcmi_get_device_num_in_card card=176 ret=0 dev_num=1
# 容器运行
root@ascend-device-plugin-69q5t:/tmp# ./test2
dcmi_init ret=0
dcmi_get_card_list ret=0 num=0 list=[]
dcmi_get_card_num_list ret=0 num=0 list=[]
虚拟机场景
经过许老师认真定位,最终发现是因为非裸金属环境。虚拟机场景需要定制镜像。
根据官网文档:
如果在虚拟机场景下部署Ascend Device Plugin,需要在Ascend Device Plugin的镜像中安装systemd,推荐在Dockerfile中加入RUN apt-get update && apt-get install -y systemd命令进行安装。
为了使用 nerdctl 构建镜像首先安装 buildkit
wget https://github.com/moby/buildkit/releases/download/v0.29.0/buildkit-v0.29.0.linux-arm64.tar.gz
tar zxvf buildkit-v0.29.0.linux-arm64.tar.gz
cp bin/* /usr/local/bin/
之后找一个新终端启动 buildkit,这里是为了让 nerdctl 能构建 image,如果不需要构建则不用启动。
buildkitd --oci-worker=false --containerd-worker=true --containerd-worker-namespace=k8s.io
Dockerfile 如下:
镜像源部分按需修改
FROM hub.oepkgs.net/openfuyao/ascendhub/ascend-k8sdeviceplugin:v6.0.0
# 替换 apt 镜像源
RUN sed -i \
-e 's@http*://ports.ubuntu.com/\? @http://10.17.31.217:8081/repository/mirror-ubuntu-ports/@g' \
-e 's@http*://ports.ubuntu.com@http://10.17.31.217:8081/repository/mirror-ubuntu-ports@g' \
/etc/apt/sources.list
# 安装 systemd
RUN apt-get update && \
apt-get install -y --no-install-recommends systemd systemd-sysv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# 设置 systemd 为 init
STOPSIGNAL SIGRTMIN+3
STOPSIGNAL SIGRTMIN+3 是告诉容器运行时(containerd/docker)停止这个容器时应该发送哪个信号。
如果你的容器里 不跑 systemd 作为主进程(比如 entrypoint 是业务程序),这行可以删掉,没有任何作用。
如果确实用 systemd 管理容器内服务,保留它能避免 kubectl delete pod 时等待 30 秒超时再强杀的问题。
构建命令如下:
nerdctl build \
--namespace k8s.io \
-t hub.oepkgs.net/openfuyao/ascendhub/ascend-k8sdeviceplugin:v6.0.0-systemd \
-f Dockerfile \
.
之后将出问题的镜像替换为新构建的镜像即可。
npu-operator 有同样的问题,一样修改即可。
修复确认
最终在 node 中能看到 npu 资源即成功。
[root@master1 ~]# kubectl describe node master1
Name: master1
Roles: control-plane,master,node,worker
Labels: accelerator=huawei-Ascend310P
beta.kubernetes.io/arch=arm64
beta.kubernetes.io/os=linux
...
servertype=Ascend310P-8
workerselector=dls-worker-node
Annotations: baseDeviceInfos: {"Ascend310P-0":{"IP":"","SuperDeviceID":0}}
...
Capacity:
cpu: 16
ephemeral-storage: 129724184Ki
huawei.com/Ascend310P: 1
hugepages-1Gi: 0
hugepages-2Mi: 0
hugepages-32Mi: 0
hugepages-64Ki: 0
memory: 32595632Ki
pods: 110
Allocatable:
cpu: 16
ephemeral-storage: 119553807777
huawei.com/Ascend310P: 1
hugepages-1Gi: 0
hugepages-2Mi: 0
hugepages-32Mi: 0
hugepages-64Ki: 0
memory: 32493232Ki
pods: 110
...
Allocated resources:
(Total limits may be over 100 percent, i.e., overcommitted.)
Resource Requests Limits
-------- -------- ------
cpu 15565m (97%) 16910m (105%)
memory 17492Mi (55%) 30900Mi (97%)
ephemeral-storage 0 (0%) 0 (0%)
hugepages-1Gi 0 (0%) 0 (0%)
hugepages-2Mi 0 (0%) 0 (0%)
hugepages-32Mi 0 (0%) 0 (0%)
hugepages-64Ki 0 (0%) 0 (0%)
huawei.com/Ascend310P 0 0
...