[TOC]

故障 pod describe

[root@master1 ~]# kubectl -n kube-system describe pod ascend-device-plugin-ll46f 
Name:                 ascend-device-plugin-ll46f
Namespace:            kube-system
Priority:             2000001000
Priority Class Name:  system-node-critical
Service Account:      ascend-device-plugin-sa
Node:                 master1/10.17.30.131
Start Time:           Mon, 30 Mar 2026 11:08:32 +0800
Labels:               app.kubernetes.io/managed-by=npu-operator
                      controller-revision-hash=7df5dcb887
                      helm.sh/chart=npu-operator-0.15.0
                      name=ascend-device-plugin-ds
                      pod-template-generation=1
Annotations:          cni.projectcalico.org/containerID: c1f2adcaeaaf2bdcf0a6e09730f68231a293074e31d58f61997f714dfb520878
                      cni.projectcalico.org/podIP: 192.168.137.118/32
                      cni.projectcalico.org/podIPs: 192.168.137.118/32
                      scheduler.alpha.kubernetes.io/critical-pod: 
                      seccomp.security.alpha.kubernetes.io/pod: runtime/default
Status:               Running
IP:                   192.168.137.118
IPs:
  IP:           192.168.137.118
Controlled By:  DaemonSet/ascend-device-plugin
Init Containers:
  init-permission:
    Container ID:  containerd://4406968a522bea48dfefebae81ec53644312762af4781c25de689952ed6c2d27
    Image:         cr.openfuyao.cn/openfuyao/busybox:1.36.1
    Image ID:      cr.openfuyao.cn/openfuyao/busybox@sha256:4b8407fadd8100c61b097d63efe992b2c033e7d371c9117f7a9462fe87e31176
    Port:          <none>
    Host Port:     <none>
    Command:
      sh
      -c
      chown 9000:9000 /var/log/mindx-dl /var/log/mindx-dl/devicePlugin
      chmod 750 /var/log/mindx-dl/devicePlugin
      
    State:          Terminated
      Reason:       Completed
      Exit Code:    0
      Started:      Mon, 30 Mar 2026 15:28:32 +0800
      Finished:     Mon, 30 Mar 2026 15:28:32 +0800
    Ready:          True
    Restart Count:  1
    Environment:    <none>
    Mounts:
      /var/log/mindx-dl/devicePlugin from log-path (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-gfldg (ro)
Containers:
  device-plugin-01:
    Container ID:  containerd://fcc0c4742285847e2621a9a9217502307fc7e28644fbf86b32f9c11d67a2c0ab
    Image:         cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin:v6.0.0
    Image ID:      cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin@sha256:a5b9612b21bcd35384f9f19a05b2d7915b865e7b2be6a30bfd7806a9b8a86f58
    Port:          <none>
    Host Port:     <none>
    Command:
      /bin/bash
      -c
      --
    Args:
      device-plugin  -useAscendDocker=true -volcanoType=false -logFile=/var/log/mindx-dl/devicePlugin/devicePlugin.log -logLevel=0
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Completed
      Exit Code:    0
      Started:      Tue, 31 Mar 2026 10:28:58 +0800
      Finished:     Tue, 31 Mar 2026 10:28:58 +0800
    Ready:          False
    Restart Count:  274
    Limits:
      cpu:     500m
      memory:  500Mi
    Requests:
      cpu:     500m
      memory:  500Mi
    Environment:
      NODE_NAME:   (v1:spec.nodeName)
    Mounts:
      /tmp from tmp (rw)
      /usr/local/Ascend/driver from hiai-driver (ro)
      /var/lib/kubelet/device-plugins from device-plugin (rw)
      /var/lib/kubelet/pod-resources from pod-resource (rw)
      /var/log/mindx-dl/devicePlugin from log-path (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-gfldg (ro)
Conditions:
  Type                        Status
  PodReadyToStartContainers   True 
  Initialized                 True 
  Ready                       False 
  ContainersReady             False 
  PodScheduled                True 
Volumes:
  device-plugin:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/kubelet/device-plugins
    HostPathType:  
  pod-resource:
    Type:          HostPath (bare host directory volume)
    Path:          /var/lib/kubelet/pod-resources
    HostPathType:  
  hiai-driver:
    Type:          HostPath (bare host directory volume)
    Path:          /usr/local/Ascend/driver
    HostPathType:  
  log-path:
    Type:          HostPath (bare host directory volume)
    Path:          /var/log/mindx-dl/devicePlugin
    HostPathType:  DirectoryOrCreate
  tmp:
    Type:          HostPath (bare host directory volume)
    Path:          /tmp
    HostPathType:  
  kube-api-access-gfldg:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    Optional:                false
    DownwardAPI:             true
QoS Class:                   Burstable
Node-Selectors:              openfuyao.com/npu.present=
Tolerations:                 CriticalAddonsOnly op=Exists
                             device-plugin=v2:NoSchedule
                             huawei.com/Ascend910:NoSchedule op=Exists
                             node-role.kubernetes.io/control-plane:NoSchedule
                             node-role.kubernetes.io/master:NoSchedule
                             node.kubernetes.io/disk-pressure:NoSchedule op=Exists
                             node.kubernetes.io/memory-pressure:NoSchedule op=Exists
                             node.kubernetes.io/not-ready:NoExecute op=Exists
                             node.kubernetes.io/pid-pressure:NoSchedule op=Exists
                             node.kubernetes.io/unreachable:NoExecute op=Exists
                             node.kubernetes.io/unschedulable:NoSchedule op=Exists
Events:
  Type     Reason   Age                     From     Message
  ----     ------   ----                    ----     -------
  Normal   Pulled   16m (x205 over 18h)     kubelet  (combined from similar events): Successfully pulled image "cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin:v6.0.0" in 403ms (403ms including waiting). Image size: 48017174 bytes.
  Warning  BackOff  2m47s (x5216 over 18h)  kubelet  Back-off restarting failed container device-plugin-01 in pod ascend-device-plugin-ll46f_kube-system(8edcd384-ab2d-4998-8077-5ac58801c79e)
  Normal   Pulling  66s (x227 over 19h)     kubelet  Pulling image "cr.openfuyao.cn/openfuyao/ascend-image/ascend-k8sdeviceplugin:v6.0.0"

故障 pod /dev 检查

[root@master1 fuyao-26.3-rc3]# kubectl  -n kube-system exec -it daemonsets/ascend-device-plugin -- ls /dev
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
autofs           null               tty10  tty34  tty58    vcs5
bsg              ppp                tty11  tty35  tty59    vcs6
btrfs-control    ptmx               tty12  tty36  tty6     vcsa
bus              pts                tty13  tty37  tty60    vcsa1
core             random             tty14  tty38  tty61    vcsa2
cpu_dma_latency  raw                tty15  tty39  tty62    vcsa3
cuse             relationship_ctrl  tty16  tty4   tty63    vcsa4
davinci0         rfkill             tty17  tty40  tty7     vcsa5
davinci_manager  rtc0               tty18  tty41  tty8     vcsa6
devmm_svm        sda                tty19  tty42  tty9     vcsu
dri              sda1               tty2   tty43  ttyAMA0  vcsu1
fb0              sda2               tty20  tty44  ttyS0    vcsu2
fd               sg0                tty21  tty45  ttyS1    vcsu3
full             sg1                tty22  tty46  ttyS2    vcsu4
fuse             sg2                tty23  tty47  ttyS3    vcsu5
hidraw0          shm                tty24  tty48  uhid     vcsu6
hidraw1          snapshot           tty25  tty49  uinput   vfio
hisi_hdc         sr0                tty26  tty5   urandom  vga_arbiter
hwrng            sr1                tty27  tty50  usbmon0  vhost-net
input            stderr             tty28  tty51  usbmon1  vhost-vsock
kmsg             stdin              tty29  tty52  usbmon2  vport2p1
loop-control     stdout             tty3   tty53  vcs      zero
mapper           termination-log    tty30  tty54  vcs1
mem              tty                tty31  tty55  vcs2
mqueue           tty0               tty32  tty56  vcs3
net              tty1               tty33  tty57  vcs4

故障 pod 驱动检查

[root@master1 fuyao-26.3-rc3]# kubectl  -n kube-system exec -it daemonsets/ascend-device-plugin -- ls -lha /usr/local/Ascend/driver
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
total 44K
drwxr-xr-x  8 root root 4.0K Mar 27 08:03 .
drwxr-xr-x  3 root root 4.0K Mar 31 02:34 ..
drwxr-xr-x  2 root root 4.0K Mar 27 08:01 bin
-r--r--r--  1 root root   20 Mar 27 08:01 build.info
dr-xr-x---  2 root root 4.0K Mar 27 08:01 device
dr-x------ 41 root root 4.0K Mar 27 08:01 kernel
drwxr-xr-x  6 root root 4.0K Mar 27 08:01 lib64
-r--r-----  1 root root   56 Mar 27 08:01 scene.info
dr-xr-x---  2 root root 4.0K Mar 27 08:01 script
drwxr-xr-x  2 root root 4.0K Mar 27 08:01 tools
-r--r--r--  1 root root  352 Mar 27 08:03 version.info

故障 pod 日志

[root@master1 ~]# kubectl -n kube-system logs daemonsets/ascend-device-plugin --previous
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
[INFO]     2026/03/31 06:46:54.593254 1       hwlog/api.go:108    devicePlugin.log's logger init success
[INFO]     2026/03/31 06:46:54.593449 1       main.go:187    ascend device plugin starting and the version is v6.0.0_linux-aarch64
[INFO]     2026/03/31 06:46:54.593494 1       main.go:188    ascend device plugin starting scene is center
[INFO]     2026/03/31 06:46:54.787930 1       devmanager/devmanager.go:104    the dcmi version is 24.1.rc3
[ERROR]    2026/03/31 06:46:54.788019 1       devmanager/devmanager.go:211    get error card quantity: 0
[ERROR]    2026/03/31 06:46:54.788052 1       devmanager/devmanager.go:195    get card list failed for init
[ERROR]    2026/03/31 06:46:54.788101 1       main.go:203    init devmanager failed, err: auto init failed, err: get card list failed for init

故障 pod 驱动检查

[root@master1 ~]# kubectl -n kube-system exec -it daemonsets/ascend-device-plugin -- bash -c 'find /usr/local/Ascend/driver -name libdcmi.so 2>/dev/null; echo $LD_LIBRARY_PATH'
Defaulted container "device-plugin-01" out of: device-plugin-01, init-permission (init)
/usr/local/Ascend/driver/lib64/driver/libdcmi.so
command terminated with exit code 137
[root@master1 ~]# ps -ef | grep -E 'dmp_daemon|slogd' | grep -v grep
root       21578       1  0 Mar30 ?        00:00:19 /usr/sbin/rsyslogd -n -i/var/run/rsyslogd.pid

检查服务状态?

[root@master1 ~]# systemctl status ascend-dmi
Unit ascend-dmi.service could not be found.
[root@master1 ~]# systemctl status ascend-dkms
Unit ascend-dkms.service could not be found.
[root@master1 ~]# systemctl status npu-smi
Unit npu-smi.service could not be found.
[root@master1 ~]# find / -name dmp_daemon 2>/dev/null
[root@master1 ~]# find / -name slogd 2>/dev/null
[root@master1 ~]# ls -l /var/dmp_daemon /var/slogd 2>/dev/null
[root@master1 ~]# 

dcmi 问题,需硬件排查

检查卡获取

#include <stdlib.h>
#include <stdio.h>
#include "dcmi_interface_api.h"

int my_get_card_list();

/*
 * Program entry point: run the card-list probe once.
 * The command-line arguments are not used, the probe's return value is
 * ignored, and the process always exits with status 0.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    (void)my_get_card_list();

    return 0;
}

/*
 * Initialise DCMI, query the card list and print it to stdout.
 *
 * Output: a banner line, then "card_num=<n>, card_list:[<id> <id> ... ]".
 * Returns DCMI_OK on success; otherwise the non-zero code returned by
 * dcmi_init() or dcmi_get_card_list().
 */
int my_get_card_list()
{
    enum { CARD_LIST_CAPACITY = 16 };
    int card_num = 0;
    int card_list[CARD_LIST_CAPACITY] = {0};
    int ret;

    printf("\n==================================card id info list=========================\n");

    /* Querying the card list is pointless if the library failed to initialise. */
    ret = dcmi_init();
    if (ret != DCMI_OK) {
        printf("dcmi init failed ret=%d\n", ret);
        return ret;
    }

    ret = dcmi_get_card_list(&card_num, card_list, CARD_LIST_CAPACITY);
    if (ret != DCMI_OK) {
        printf("dcmi get card list failed ret=%d\n", ret);
    }

    /* Still print what was returned so a failure and an empty list look different. */
    printf("card_num=%d, card_list:[", card_num);
    /* Never read past the local array, whatever count the library reported. */
    for (int i = 0; i < card_num && i < CARD_LIST_CAPACITY; i++) {
        printf("%d ", card_list[i]);
    }
    /* Close the bracket and terminate the line so the output is flushed cleanly. */
    printf("]\n");

    return ret;
}
cc ./test1.c -o test1 -I /usr/local/dcmi -L /usr/local/dcmi -ldcmi

-I:头文件(.h)的搜索路径
-L:库文件(.so/.a)的搜索路径
-l:要链接的库名(去掉 lib 前缀和扩展名,例如 -ldcmi 对应 libdcmi.so)
# Compile test1.c inside a throw-away ubuntu:18.04 container.
# The host's Ascend driver and DCMI directories, plus the current directory
# (as /build), are bind-mounted so the header, the library and the source are
# visible inside the container.
#   - "$(pwd)" is quoted so a working directory containing spaces still mounts.
#   - set -e aborts the in-container script at the first failing step, so cc
#     is not attempted after a failed apt-get or cd.
#   - 'https\?' matches http:// and https:// ('http*' would only mean "htt"
#     followed by any number of "p"). One expression is enough: it leaves the
#     rest of each sources.list line, including the separating blank, intact.
nerdctl run --rm \
  -v /usr/local/Ascend:/usr/local/Ascend \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v "$(pwd)":/build \
  ubuntu:18.04 bash -c "
    set -e
    sed -i -e 's@https\?://ports.ubuntu.com@http://10.17.31.217:8081/repository/mirror-ubuntu-ports@g' \
           /etc/apt/sources.list
    apt-get update && apt-get install -y gcc
    cd /build
    cc ./test1.c -o test1 \
      -I /usr/local/dcmi \
      -L /usr/local/dcmi \
      -L /usr/local/Ascend/driver/lib64/common \
      -L /usr/local/Ascend/driver/lib64/driver \
      -ldcmi \
      -Wl,-rpath,/usr/local/Ascend/driver/lib64/common \
      -Wl,-rpath,/usr/local/Ascend/driver/lib64/driver \
      -Wl,-rpath,/usr/local/dcmi
  "

分析二进制:

[root@master1 ascend_debug]# ldd ./test1 | grep -i dcmi
        libdcmi.so => /usr/local/Ascend/driver/lib64/driver/libdcmi.so (0x0000ffffa6dd0000)
[root@master1 ascend_debug]# LD_DEBUG=libs ./test1 2>&1 | grep -i dcmi
    284830:     find library=libdcmi.so [0]; searching
    284830:      search path=/usr/local/Ascend/driver/lib64/common/tls/aarch64/atomics:/usr/local/Ascend/driver/lib64/common/tls/aarch64:/usr/local/Ascend/driver/lib64/common/tls/atomics:/usr/local/Ascend/driver/lib64/common/tls:/usr/local/Ascend/driver/lib64/common/aarch64/atomics:/usr/local/Ascend/driver/lib64/common/aarch64:/usr/local/Ascend/driver/lib64/common/atomics:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver/tls/aarch64/atomics:/usr/local/Ascend/driver/lib64/driver/tls/aarch64:/usr/local/Ascend/driver/lib64/driver/tls/atomics:/usr/local/Ascend/driver/lib64/driver/tls:/usr/local/Ascend/driver/lib64/driver/aarch64/atomics:/usr/local/Ascend/driver/lib64/driver/aarch64:/usr/local/Ascend/driver/lib64/driver/atomics:/usr/local/Ascend/driver/lib64/driver:/usr/local/dcmi/tls/aarch64/atomics:/usr/local/dcmi/tls/aarch64:/usr/local/dcmi/tls/atomics:/usr/local/dcmi/tls:/usr/local/dcmi/aarch64/atomics:/usr/local/dcmi/aarch64:/usr/local/dcmi/atomics:/usr/local/dcmi            (RUNPATH from file ./test1)
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/tls/aarch64/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/tls/aarch64/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/tls/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/tls/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/aarch64/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/aarch64/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/common/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/tls/aarch64/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/tls/aarch64/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/tls/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/tls/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/aarch64/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/aarch64/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/atomics/libdcmi.so
    284830:       trying file=/usr/local/Ascend/driver/lib64/driver/libdcmi.so
    284830:      search path=/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/dcmi/tls/aarch64/atomics:/usr/local/dcmi/tls/aarch64:/usr/local/dcmi/tls/atomics:/usr/local/dcmi/tls:/usr/local/dcmi/aarch64/atomics:/usr/local/dcmi/aarch64:/usr/local/dcmi/atomics:/usr/local/dcmi          (RUNPATH from file ./test1)
    284830:       trying file=/usr/local/dcmi/tls/aarch64/atomics/libc.so.6
    284830:       trying file=/usr/local/dcmi/tls/aarch64/libc.so.6
    284830:       trying file=/usr/local/dcmi/tls/atomics/libc.so.6
    284830:       trying file=/usr/local/dcmi/tls/libc.so.6
    284830:       trying file=/usr/local/dcmi/aarch64/atomics/libc.so.6
    284830:       trying file=/usr/local/dcmi/aarch64/libc.so.6
    284830:       trying file=/usr/local/dcmi/atomics/libc.so.6
    284830:       trying file=/usr/local/dcmi/libc.so.6
    284830:     calling init: /usr/local/Ascend/driver/lib64/driver/libdcmi.so
    284830:     calling fini: /usr/local/Ascend/driver/lib64/driver/libdcmi.so [0]
[root@master1 ascend_debug]# readlink -f /usr/local/dcmi/libdcmi.so
/usr/local/dcmi/libdcmi.so
[root@master1 ascend_debug]# readlink -f /usr/local/Ascend/driver/lib64/driver/libdcmi.so
/usr/local/Ascend/driver/lib64/driver/libdcmi.so
[root@master1 ascend_debug]# sha256sum /usr/local/dcmi/libdcmi.so /usr/local/Ascend/driver/lib64/driver/libdcmi.so
13a38cae84bad0f06367ff9280016e372c0608ca16465b5ae5f000d3844ee401  /usr/local/dcmi/libdcmi.so
13a38cae84bad0f06367ff9280016e372c0608ca16465b5ae5f000d3844ee401  /usr/local/Ascend/driver/lib64/driver/libdcmi.so

跟踪 strace

宿主机跑
strace -f -o /tmp/host.strace -e trace=file,ioctl ./test1
容器里跑
strace -f -o /tmp/container.strace -e trace=file,ioctl ./test1

新增挂载继续跟踪

# Fragment to merge into the ascend-device-plugin DaemonSet; not a standalone manifest.
# Add these entries to the volumeMounts list of the container being debugged
# (device-plugin-01 in this write-up):
volumeMounts:
  - name: hdc-basic
    mountPath: /etc/hdcBasic.cfg
    readOnly: true
  - name: localtime
    mountPath: /etc/localtime
    readOnly: true
# Add these entries to the volumes list of the pod spec (spec.template.spec.volumes):
volumes:
  - name: hdc-basic
    hostPath:
      path: /etc/hdcBasic.cfg
      type: File
  - name: localtime
    hostPath:
      path: /etc/localtime
      type: File

跟踪并查看日志

kubectl -n kube-system exec -it ascend-device-plugin-69q5t -c device-plugin-01 -- bash

strace -f -o /tmp/container.strace -e trace=file,ioctl ./test1

root@ascend-device-plugin-69q5t:/tmp# strace -f -o /tmp/container.strace -e trace=file,ioctl ./test1

==================================card id info list=========================
card_num=0, card_list:[


root@ascend-device-plugin-69q5t:/tmp# cat /var/log/nputools_LOG_INFO.log > /tmp/nputools_LOG_INFO.log
root@ascend-device-plugin-69q5t:/tmp# cat /var/log/nputools_LOG_ERR.log > /tmp/nputools_LOG_ERR.log
cat: /var/log/nputools_LOG_ERR.log: No such file or directory
root@ascend-device-plugin-69q5t:/tmp# cat /tmp/nputools_LOG_INFO.log 
[2026/04/01 11:18:12][0583][root][127.0.0.1][dcmi_api.c,dcmi_board_init,86]:dcmi board init success. device_count=1.
[2026/04/01 11:18:12][0583][root][127.0.0.1][dcmi_api.c,dcmi_init,119]:dcmi init all success.

检查代码2

#include <stdio.h>
#include <stdlib.h>
#include "dcmi_interface_api.h"

#ifndef DCMI_OK
#define DCMI_OK 0
#endif

/* This declaration was not found in the header, so it is added here by hand. */
extern int dcmi_get_card_num_list(int *card_num, int *card_list, int list_length);

/*
 * Print one query result on a single line:
 *   "<name> ret=<ret> num=<num> list=[<v0> <v1> ... ]"
 * The first `num` entries of `list` are printed, each followed by a space.
 * Nothing is printed between the brackets when `num` is zero or negative.
 */
static void print_list(const char *name, int ret, int num, int *list) {
    const int *cursor = list;
    int remaining = num;

    printf("%s ret=%d num=%d list=[", name, ret, num);
    while (remaining > 0) {
        printf("%d ", *cursor);
        ++cursor;
        --remaining;
    }
    printf("]\n");
}

/*
 * Diagnostic driver: initialise DCMI, print the results of both card-list
 * queries, then print the device count of every card returned by
 * dcmi_get_card_list().
 * Exits with 1 when dcmi_init() fails, otherwise with 0.
 */
int main(void) {
    enum { MAX_CARDS = 16 };
    int cards[MAX_CARDS] = {0};
    int card_count = 0;
    int alt_cards[MAX_CARDS] = {0};
    int alt_count = 0;
    int rc;

    rc = dcmi_init();
    printf("dcmi_init ret=%d\n", rc);
    if (rc != DCMI_OK) {
        return 1;
    }

    /* First query variant. */
    rc = dcmi_get_card_list(&card_count, cards, MAX_CARDS);
    print_list("dcmi_get_card_list", rc, card_count, cards);

    /* Second query variant (declared by hand above main). */
    rc = dcmi_get_card_num_list(&alt_count, alt_cards, MAX_CARDS);
    print_list("dcmi_get_card_num_list", rc, alt_count, alt_cards);

    /* Visit at most MAX_CARDS entries, whatever count was reported. */
    int limit = (card_count < MAX_CARDS) ? card_count : MAX_CARDS;
    for (int idx = 0; idx < limit; ++idx) {
        int devices = -1;

        rc = dcmi_get_device_num_in_card(cards[idx], &devices);
        printf("dcmi_get_device_num_in_card card=%d ret=%d dev_num=%d\n",
               cards[idx], rc, devices);
    }

    return 0;
}

主机编译

cc ./test2.c -o test2 -I /usr/local/dcmi -L /usr/local/dcmi -ldcmi

容器编译

# Compile test2.c inside a throw-away ubuntu:18.04 container.
# The host's Ascend driver and DCMI directories, plus the current directory
# (as /build), are bind-mounted so the header, the library and the source are
# visible inside the container.
#   - "$(pwd)" is quoted so a working directory containing spaces still mounts.
#   - set -e aborts the in-container script at the first failing step, so cc
#     is not attempted after a failed apt-get or cd.
#   - 'https\?' matches http:// and https:// ('http*' would only mean "htt"
#     followed by any number of "p"). One expression is enough: it leaves the
#     rest of each sources.list line, including the separating blank, intact.
nerdctl run --rm \
  -v /usr/local/Ascend:/usr/local/Ascend \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v "$(pwd)":/build \
  ubuntu:18.04 bash -c "
    set -e
    sed -i -e 's@https\?://ports.ubuntu.com@http://10.17.31.217:8081/repository/mirror-ubuntu-ports@g' \
           /etc/apt/sources.list
    apt-get update && apt-get install -y gcc
    cd /build
    cc ./test2.c -o test2 \
      -I /usr/local/dcmi \
      -L /usr/local/dcmi \
      -L /usr/local/Ascend/driver/lib64/common \
      -L /usr/local/Ascend/driver/lib64/driver \
      -ldcmi \
      -Wl,-rpath,/usr/local/Ascend/driver/lib64/common \
      -Wl,-rpath,/usr/local/Ascend/driver/lib64/driver \
      -Wl,-rpath,/usr/local/dcmi
  "

拷入容器运行

kubectl -n kube-system cp ./test2 ascend-device-plugin-69q5t:/tmp/
# 主机运行
[root@master1 ascend_debug]# ./test2
dcmi_init ret=0
dcmi_get_card_list ret=0 num=1 list=[176 ]
dcmi_get_card_num_list ret=0 num=1 list=[176 ]
dcmi_get_device_num_in_card card=176 ret=0 dev_num=1

# 容器运行
root@ascend-device-plugin-69q5t:/tmp# ./test2 
dcmi_init ret=0
dcmi_get_card_list ret=0 num=0 list=[]
dcmi_get_card_num_list ret=0 num=0 list=[]

虚拟机场景

经过许老师认真定位,最终发现是因为非裸金属环境。虚拟机场景需要定制镜像。

根据官网文档

如果在虚拟机场景下部署Ascend Device Plugin,需要在Ascend Device Plugin的镜像中安装systemd,推荐在Dockerfile中加入RUN apt-get update && apt-get install -y systemd命令进行安装。

为了使用 nerdctl 构建镜像首先安装 buildkit

# Download the buildkit v0.29.0 linux-arm64 release tarball, unpack it in the
# current directory and copy the extracted bin/ contents into /usr/local/bin.
wget https://github.com/moby/buildkit/releases/download/v0.29.0/buildkit-v0.29.0.linux-arm64.tar.gz
tar zxvf buildkit-v0.29.0.linux-arm64.tar.gz
cp bin/* /usr/local/bin/

之后在一个新终端中启动 buildkitd。它只是为了让 nerdctl 能够构建镜像;如果不需要构建镜像,则不必启动。

buildkitd --oci-worker=false --containerd-worker=true --containerd-worker-namespace=k8s.io 

Dockerfile 如下:

镜像源部分按需修改
FROM hub.oepkgs.net/openfuyao/ascendhub/ascend-k8sdeviceplugin:v6.0.0

# Point apt at the internal mirror instead of ports.ubuntu.com.
# 'https\?' matches both http:// and https:// ('http*' would only mean "htt"
# followed by any number of "p"). Only the scheme and host are rewritten, so
# the rest of each sources.list line stays intact.
RUN sed -i \
    -e 's@https\?://ports.ubuntu.com@http://10.17.31.217:8081/repository/mirror-ubuntu-ports@g' \
    /etc/apt/sources.list

# Install systemd, then drop the apt caches to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends systemd systemd-sysv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Signal the container runtime sends when stopping the container.
# This does not make systemd the init process; it only matters when systemd
# actually runs as the container's main process.
STOPSIGNAL SIGRTMIN+3
STOPSIGNAL SIGRTMIN+3 用于告诉容器运行时(containerd/docker)停止这个容器时应该发送哪个信号。
如果容器里不以 systemd 作为主进程(比如 entrypoint 是业务程序),这行可以删掉,没有任何作用。
如果确实用 systemd 管理容器内的服务,保留它可以避免 kubectl delete pod 时等待 30 秒超时后才被强杀的问题。

构建命令如下:

# Build the image from ./Dockerfile using the current directory as the build
# context and tag it ...:v6.0.0-systemd.
# --namespace k8s.io presumably places the result in the containerd namespace
# used by Kubernetes, so the node can use it without a pull; confirm for your setup.
nerdctl build \
  --namespace k8s.io \
  -t hub.oepkgs.net/openfuyao/ascendhub/ascend-k8sdeviceplugin:v6.0.0-systemd \
  -f Dockerfile \
  .

之后将出问题的镜像替换为新构建的镜像即可。

npu-operator 有同样的问题,一样修改即可。

修复确认

最终在 node 中能看到 npu 资源即成功。
[root@master1 ~]# kubectl  describe node master1 
Name:               master1
Roles:              control-plane,master,node,worker
Labels:             accelerator=huawei-Ascend310P
                    beta.kubernetes.io/arch=arm64
                    beta.kubernetes.io/os=linux
...
                    servertype=Ascend310P-8
                    workerselector=dls-worker-node
Annotations:        baseDeviceInfos: {"Ascend310P-0":{"IP":"","SuperDeviceID":0}}
...
Capacity:
  cpu:                    16
  ephemeral-storage:      129724184Ki
  huawei.com/Ascend310P:  1
  hugepages-1Gi:          0
  hugepages-2Mi:          0
  hugepages-32Mi:         0
  hugepages-64Ki:         0
  memory:                 32595632Ki
  pods:                   110
Allocatable:
  cpu:                    16
  ephemeral-storage:      119553807777
  huawei.com/Ascend310P:  1
  hugepages-1Gi:          0
  hugepages-2Mi:          0
  hugepages-32Mi:         0
  hugepages-64Ki:         0
  memory:                 32493232Ki
  pods:                   110
...
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource               Requests       Limits
  --------               --------       ------
  cpu                    15565m (97%)   16910m (105%)
  memory                 17492Mi (55%)  30900Mi (97%)
  ephemeral-storage      0 (0%)         0 (0%)
  hugepages-1Gi          0 (0%)         0 (0%)
  hugepages-2Mi          0 (0%)         0 (0%)
  hugepages-32Mi         0 (0%)         0 (0%)
  hugepages-64Ki         0 (0%)         0 (0%)
  huawei.com/Ascend310P  0              0
...
最后修改:2026 年 04 月 01 日
如果觉得我的文章对你有用,请随意赞赏