20230924: おうちでConnectX-4 Lx

便利コマンド

良く使うWrapper

# opt IFACE
# Print a short summary of the transceiver module plugged into IFACE, taken
# from `sudo ethtool -m IFACE`: transceiver type, vendor name / part number /
# serial number, and the optical power and module temperature rows.
# Rows mentioning "threshold", "warning" or "alarm" are dropped so that only
# the current readings remain.
# Arguments: $1 - network interface name (e.g. enp1s0f0np0)
# Outputs:   the selected ethtool rows on stdout
# Returns:   2 when no interface name is given, otherwise the pipeline status
opt(){
        if [ -z "${1:-}" ]; then
                echo "usage: opt IFACE" >&2
                return 2
        fi
        # "$1" is quoted so the name reaches ethtool as exactly one argument.
        # grep -E replaces egrep, which GNU grep has declared obsolescent.
        # NOTE(review): 'aser output power ' has no leading letter, presumably
        # so that both "Laser" and "laser" match - confirm before changing it.
        sudo ethtool -m "$1" \
                | grep -E '(Transceiver type|Vendor name|Vendor PN|Vendor SN|aser output power |Receiver signal average|Module temperature|Transmit avg optical power|Rcvr signal avg optical power)' \
                | grep -Ev 'threshold|warning|alarm'
}

# ops [NET_DIR]
# List, one per line, the network interfaces whose device/uevent file contains
# a PCI_SLOT_NAME entry.
# Arguments: $1 - optional directory to scan instead of /sys/class/net
# Outputs:   interface names on stdout, in glob (sorted) order
ops(){
        local net_dir=${1:-/sys/class/net}
        local uevent iface
        # Walk the files in the shell instead of parsing "file:line" output
        # from grep: grep omits the file name when only one file matches the
        # glob, which made the old sed/cut pipeline print "PCI_SLOT_NAME".
        for uevent in "$net_dir"/*/device/uevent; do
                # Also skips the literal pattern left behind by an unmatched glob.
                [ -r "$uevent" ] || continue
                grep -q PCI_SLOT_NAME -- "$uevent" || continue
                iface=${uevent%/device/uevent}
                printf '%s\n' "${iface##*/}"
        done
}

# optics
# For every interface name printed by `ops`, print a "=== NAME ===" header
# followed by the transceiver summary produced by `opt NAME`.
# Outputs: headers and per-interface summaries on stdout
optics(){
        local ifname
        # Read the names line by line on fd 3 so they are neither word-split
        # nor glob-expanded, and so commands run by opt keep the caller's stdin.
        while IFS= read -r ifname <&3; do
                [ -n "$ifname" ] || continue
                printf '=== %s ===\n' "$ifname"
                opt "$ifname"
        done 3< <(ops)
}

簡易的な一覧

# 調べる
$ sudo lshw -c network -businfo
Bus info          Device        Class          Description
==========================================================
pci@0000:01:00.0  enp1s0f0np0   network        MT27710 Family [ConnectX-4 Lx]
pci@0000:01:00.1  enp1s0f1np1   network        MT27710 Family [ConnectX-4 Lx]
pci@0000:02:00.0  enp2s0        network        RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller

NICやトランシーバの状態の確認

kanai@kanaipc1:~/MLNX_OFED_LINUX-23.04-0.5.3.3-ubuntu22.04-x86_64$ sudo ethtool enp1s0f0np0
Settings for enp1s0f0np0:
        Supported ports: [ Backplane ]
        Supported link modes:   1000baseKX/Full
                                10000baseKR/Full
        Supported pause frame use: Symmetric
        Supports auto-negotiation: Yes
        Supported FEC modes: None        RS      BASER
        Advertised link modes:  1000baseKX/Full
                                10000baseKR/Full
        Advertised pause frame use: Symmetric
        Advertised auto-negotiation: Yes
        Advertised FEC modes: None
        Link partner advertised link modes:  Not reported
        Link partner advertised pause frame use: No
        Link partner advertised auto-negotiation: Yes
        Link partner advertised FEC modes: Not reported
        Speed: 10000Mb/s
        Duplex: Full
        Auto-negotiation: on
        Port: Direct Attach Copper
        PHYAD: 0
        Transceiver: internal
        Supports Wake-on: d
        Wake-on: d
        Current message level: 0x00000004 (4)
                               link
        Link detected: yes


kanai@kanaipc1:~/MLNX_OFED_LINUX-23.04-0.5.3.3-ubuntu22.04-x86_64$ sudo ethtool -m enp1s0f0np0
        Identifier                                : 0x03 (SFP)
        Extended identifier                       : 0x04 (GBIC/SFP defined by 2-wire interface ID)
        Connector                                 : 0x21 (Copper pigtail)
        Transceiver codes                         : 0x01 0x00 0x00 0x04 0x00 0x04 0x80 0xd5 0x00
        Transceiver type                          : Infiniband: 1X Copper Passive
        Transceiver type                          : Ethernet: 1000BASE-CX
        Transceiver type                          : Passive Cable
        Transceiver type                          : FC: Twin Axial Pair (TW)
        Transceiver type                          : FC: 1200 MBytes/sec
        Transceiver type                          : FC: 800 MBytes/sec
        Transceiver type                          : FC: 400 MBytes/sec
        Transceiver type                          : FC: 200 MBytes/sec
        Transceiver type                          : FC: 100 MBytes/sec
        Encoding                                  : 0x00 (unspecified)
        BR, Nominal                               : 10300MBd
        Rate identifier                           : 0x00 (unspecified)
        Length (SMF,km)                           : 0km
        Length (SMF)                              : 0m
        Length (50um)                             : 0m
        Length (62.5um)                           : 0m
        Length (Copper)                           : 1m
        Length (OM3)                              : 0m
        Passive Cu cmplnce.                       : 0x01 (SFF-8431 appendix E) [SFF-8472 rev10.4 only]
        Vendor name                               : Mellanox
        Vendor OUI                                : 00:02:c9
        Vendor PN                                 : MC3309130-001
        Vendor rev                                : A1
        Option values                             : 0x00 0x00
        BR margin, max                            : 0%
        BR margin, min                            : 0%
        Vendor SN                                 : MT1419VS13961

pcieトポロジを出す

$ sudo lspci -tv
-[0000:00]-+-00.0  Intel Corporation Xeon E3-1200 v5/E3-1500 v5/6th Gen Core Processor Host Bridge/DRAM Registers
           +-01.0-[01]--+-00.0  Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
           |            \-00.1  Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
           +-02.0  Intel Corporation HD Graphics 530
           +-14.0  Intel Corporation 100 Series/C230 Series Chipset Family USB 3.0 xHCI Controller
           +-14.2  Intel Corporation 100 Series/C230 Series Chipset Family Thermal Subsystem
           +-16.0  Intel Corporation 100 Series/C230 Series Chipset Family MEI Controller #1
           +-17.0  Intel Corporation Q170/Q150/B150/H170/H110/Z170/CM236 Chipset SATA Controller [AHCI Mode]
           +-1c.0-[02]----00.0  Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
           +-1f.0  Intel Corporation H110 Chipset LPC/eSPI Controller
           +-1f.2  Intel Corporation 100 Series/C230 Series Chipset Family Power Management Controller
           +-1f.3  Intel Corporation 100 Series/C230 Series Chipset Family HD Audio Controller
           \-1f.4  Intel Corporation 100 Series/C230 Series Chipset Family SMBus

lspci view

kanai@kanaipc1:~$ lspci -v | grep Eth
01:00.0 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
01:00.1 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
02:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 15)
        Subsystem: Dell RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller

カードの種別を調べる

kanai@kanaipc1:~$ sudo lspci -s 01:00 -vv  | grep Sub
        Subsystem: Mellanox Technologies ConnectX-4 Lx Stand-up dual-port 10GbE MCX4121A-XCAT
        Subsystem: Mellanox Technologies ConnectX-4 Lx Stand-up dual-port 10GbE MCX4121A-XCAT

PCIeカードの認識確認

LnkCapとLnkStaが一致していることを確認する。PCIeが古かったり、x16スロットなのにx8までのレーンしか上がらないと不一致が出るかも。

$ sudo lspci -s 01:00 -vv | grep -E '(^[0-9]|LnkCap|LnkSta)'
01:00.0 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
                LnkCap: Port #0, Speed 8GT/s, Width x8, ASPM L1, Exit Latency L1 <4us
                LnkSta: Speed 8GT/s (ok), Width x8 (ok)
                LnkCap2: Supported Link Speeds: 2.5-8GT/s, Crosslink- Retimer- 2Retimers- DRS-
                LnkSta2: Current De-emphasis Level: -6dB, EqualizationComplete+ EqualizationPhase1+
01:00.1 Ethernet controller: Mellanox Technologies MT27710 Family [ConnectX-4 Lx]
                LnkCap: Port #0, Speed 8GT/s, Width x8, ASPM L1, Exit Latency L1 <4us
                LnkSta: Speed 8GT/s (ok), Width x8 (ok)
                LnkSta2: Current De-emphasis Level: -6dB, EqualizationComplete- EqualizationPhase1-

OFEDドライバのインストール

> これはやらなくてもどうせ入る
sudo apt -y install flex debhelper autoconf libnl-route-3-200 bison pkg-config m4 libfuse2 gfortran libnl-3-dev quilt automake libgfortran5 swig chrpath libnl-route-3-dev autotools-dev libltdl-dev graphviz tk
tar zxfv MLNX_OFED_LINUX-23.04-0.5.3.3-ubuntu22.04-x86_64.tgz
cd MLNX_OFED_LINUX-23.04-0.5.3.3-ubuntu22.04-x86_64/
sudo ./install.sh
sudo ./mlnxofedinstall --with-nvmf
 > この実行はサイレントに行われるので進捗が知りたければtopなどする
sudo /etc/init.d/openibd restart
sudo update-initramfs -u
sudo modprobe nvmet
sudo modprobe nvmet-rdma
sudo modprobe nvme-rdma
sudo modprobe mlx5_core
# このrebootは必須
sudo reboot

確認

ofed_info -n
> 23.07-0.5.1.2

ファームウェアのバージョンアップ(mlxfwmanager --online)

sudo mlxfwmanager --online
Querying Mellanox devices firmware ...

Device #1:
----------

  Device Type:      ConnectX4LX
  Part Number:      MCX4121A-XCA_Ax
  Description:      ConnectX-4 Lx EN network interface card; 10GbE dual-port SFP28; PCIe3.0 x8; ROHS R6
  PSID:             MT_2420110004
  PCI Device Name:  0000:01:00.0
  Base MAC:         b83fd20fd7ee
  Versions:         Current        Available
     FW             14.30.1004     14.32.1010
     PXE            3.6.0301       3.6.0502
     UEFI           14.23.0017     14.25.0017

  Status:           Update required
sudo mlxfwmanager --online -u

ethtoolのバージョンアップ

古いethtoolを使っていたり新しいトランシーバを使っているとethtoolでうまくトランシーバー情報を読めないことがあります。

sudo ethtool -m eth0
Offset		Values
------		------
0x0000:		00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0010:		00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

ethtoolをコンパイルします。

https://cdn.kernel.org/pub/software/network/ethtool/

wget https://cdn.kernel.org/pub/software/network/ethtool/ethtool-6.5.tar.gz
tar zxfv ethtool-6.5.tar.gz
cd ethtool-6.5/
sudo apt -y install gcc pkg-config libmnl-dev make
./configure
make
./ethtool --version
> ethtool version 6.5

ファームウェアの手動書き込み(バージョン指定・ダウングレード)

# firmをダウンロード
ls -1
>fw-ConnectX4Lx-rel-14_29_1016-MCX4121A-XCA_Ax-UEFI-14.22.14-FlexBoot-3.6.204.bin
>fw-ConnectX4Lx-rel-14_30_1004-MCX4121A-XCA_Ax-UEFI-14.23.17-FlexBoot-3.6.301.bin
>fw-ConnectX4Lx-rel-14_32_1010-MCX4121A-XCA_Ax-UEFI-14.25.17-FlexBoot-3.6.502.bin
sudo mlxfwmanager -d /dev/mst/mt4117_pciconf0
> 確認
sudo mlxfwmanager -d /dev/mst/mt4117_pciconf0 -u -i fw-ConnectX4Lx-rel-14_30_1004-MCX4121A-XCA_Ax-UEFI-14.23.17-FlexBoot-3.6.301.bin
> 注意! downgradeの場合は上のコマンドでは書き込まれない。その場合は下のように -f (強制) を付けて実行する
sudo mlxfwmanager -d /dev/mst/mt4117_pciconf0 -u -i fw-ConnectX4Lx-rel-14_30_1004-MCX4121A-XCA_Ax-UEFI-14.23.17-FlexBoot-3.6.301.bin -f
>
ubuntu@optiplex:~/tmp$ sudo mlxfwmanager -d /dev/mst/mt4117_pciconf0
Querying Mellanox devices firmware ...
     FW             14.30.1004     N/A
     FW (Running)   14.32.1010     --
     PXE            3.6.0502       --
     UEFI           14.25.0017     --

sudo reboot

mst(Mellanox Software Tools)

もし、LINK TYPEがIBと認識されている場合は、以下のようなコマンドが必要になる(先に下記の sudo mst start で /dev/mst 配下のデバイスを作成しておくこと)

sudo mlxconfig -d /dev/mst/mt4117_pciconf0 set LINK_TYPE_P1=ETH
kanai@kanaipc1:~$ sudo mst  start
Starting MST (Mellanox Software Tools) driver set
Loading MST PCI module - Success
Loading MST PCI configuration module - Success
Create devices
Unloading MST PCI module (unused) - Success

kanai@kanaipc1:~$ sudo mst  status
MST modules:
------------
    MST PCI module is not loaded
    MST PCI configuration module loaded

MST devices:
------------
/dev/mst/mt4117_pciconf0         - PCI configuration cycles access.
                                   domain:bus:dev.fn=0000:01:00.0 addr.reg=88 data.reg=92 cr_bar.gw_offset=-1
                                   Chip revision is: 00

sudo mst status -v
MST modules:
------------
    MST PCI module is not loaded
    MST PCI configuration module loaded
PCI devices:
------------
DEVICE_TYPE             MST                           PCI       RDMA            NET                                     NUMA
ConnectX4LX(rev:0)      /dev/mst/mt4117_pciconf0.1    01:00.1   mlx5_1          net-enp1s0f1np1                         -1

ConnectX4LX(rev:0)      /dev/mst/mt4117_pciconf0      01:00.0   mlx5_0          net-enp1s0f0np0                         -1

kanai@kanaipc1:~$ sudo mlxconfig -d /dev/mst/mt4117_pciconf0 query

Device #1:
----------

Device type:    ConnectX4LX
Name:           MCX4121A-XCA_Ax
Description:    ConnectX-4 Lx EN network interface card; 10GbE dual-port SFP28; PCIe3.0 x8; ROHS R6
Device:         /dev/mst/mt4117_pciconf0

Configurations:                                      Next Boot
         MEMIC_BAR_SIZE                              0
         MEMIC_SIZE_LIMIT                            _256KB(1)
         FLEX_PARSER_PROFILE_ENABLE                  0
         FLEX_IPV4_OVER_VXLAN_PORT                   0
         ROCE_NEXT_PROTOCOL                          254
         PF_NUM_OF_VF_VALID                          False(0)
         NON_PREFETCHABLE_PF_BAR                     False(0)
         VF_VPD_ENABLE                               False(0)
         STRICT_VF_MSIX_NUM                          False(0)
         VF_NODNIC_ENABLE                            False(0)
         NUM_PF_MSIX_VALID                           True(1)
         NUM_OF_VFS                                  8
         NUM_OF_PF                                   2
         SRIOV_EN                                    True(1)
         PF_LOG_BAR_SIZE                             5
         VF_LOG_BAR_SIZE                             0
         NUM_PF_MSIX                                 63
         NUM_VF_MSIX                                 11
         INT_LOG_MAX_PAYLOAD_SIZE                    AUTOMATIC(0)
         PCIE_CREDIT_TOKEN_TIMEOUT                   0
         ACCURATE_TX_SCHEDULER                       False(0)
         PARTIAL_RESET_EN                            False(0)
         SW_RECOVERY_ON_ERRORS                       False(0)
         RESET_WITH_HOST_ON_ERRORS                   False(0)
         PCI_BUS0_RESTRICT_SPEED                     PCI_GEN_1(0)
         PCI_BUS0_RESTRICT_ASPM                      False(0)
         PCI_BUS0_RESTRICT_WIDTH                     PCI_X1(0)
         PCI_BUS0_RESTRICT                           False(0)
         PCI_DOWNSTREAM_PORT_OWNER                   Array[0..15]
         CQE_COMPRESSION                             BALANCED(0)
         IP_OVER_VXLAN_EN                            False(0)
         MKEY_BY_NAME                                False(0)
         UCTX_EN                                     True(1)
         PCI_ATOMIC_MODE                             PCI_ATOMIC_DISABLED_EXT_ATOMIC_ENABLED(0)
         TUNNEL_ECN_COPY_DISABLE                     False(0)
         LRO_LOG_TIMEOUT0                            6
         LRO_LOG_TIMEOUT1                            7
         LRO_LOG_TIMEOUT2                            8
         LRO_LOG_TIMEOUT3                            13
         ICM_CACHE_MODE                              DEVICE_DEFAULT(0)
         TX_SCHEDULER_BURST                          0
         LOG_MAX_QUEUE                               17
         LOG_DCR_HASH_TABLE_SIZE                     14
         MAX_PACKET_LIFETIME                         0
         DCR_LIFO_SIZE                               16384
         ROCE_CC_PRIO_MASK_P1                        255
         ROCE_CC_PRIO_MASK_P2                        255
         CLAMP_TGT_RATE_AFTER_TIME_INC_P1            True(1)
         CLAMP_TGT_RATE_P1                           False(0)
         RPG_TIME_RESET_P1                           300
         RPG_BYTE_RESET_P1                           32767
         RPG_THRESHOLD_P1                            1
         RPG_MAX_RATE_P1                             0
         RPG_AI_RATE_P1                              5
         RPG_HAI_RATE_P1                             50
         RPG_GD_P1                                   11
         RPG_MIN_DEC_FAC_P1                          50
         RPG_MIN_RATE_P1                             1
         RATE_TO_SET_ON_FIRST_CNP_P1                 0
         DCE_TCP_G_P1                                1019
         DCE_TCP_RTT_P1                              1
         RATE_REDUCE_MONITOR_PERIOD_P1               4
         INITIAL_ALPHA_VALUE_P1                      1023
         MIN_TIME_BETWEEN_CNPS_P1                    4
         CNP_802P_PRIO_P1                            6
         CNP_DSCP_P1                                 48
         CLAMP_TGT_RATE_AFTER_TIME_INC_P2            True(1)
         CLAMP_TGT_RATE_P2                           False(0)
         RPG_TIME_RESET_P2                           300
         RPG_BYTE_RESET_P2                           32767
         RPG_THRESHOLD_P2                            1
         RPG_MAX_RATE_P2                             0
         RPG_AI_RATE_P2                              5
         RPG_HAI_RATE_P2                             50
         RPG_GD_P2                                   11
         RPG_MIN_DEC_FAC_P2                          50
         RPG_MIN_RATE_P2                             1
         RATE_TO_SET_ON_FIRST_CNP_P2                 0
         DCE_TCP_G_P2                                1019
         DCE_TCP_RTT_P2                              1
         RATE_REDUCE_MONITOR_PERIOD_P2               4
         INITIAL_ALPHA_VALUE_P2                      1023
         MIN_TIME_BETWEEN_CNPS_P2                    4
         CNP_802P_PRIO_P2                            6
         CNP_DSCP_P2                                 48
         LLDP_NB_DCBX_P1                             False(0)
         LLDP_NB_RX_MODE_P1                          OFF(0)
         LLDP_NB_TX_MODE_P1                          OFF(0)
         LLDP_NB_DCBX_P2                             False(0)
         LLDP_NB_RX_MODE_P2                          OFF(0)
         LLDP_NB_TX_MODE_P2                          OFF(0)
         ROCE_RTT_RESP_DSCP_P1                       0
         ROCE_RTT_RESP_DSCP_MODE_P1                  DEVICE_DEFAULT(0)
         ROCE_RTT_RESP_DSCP_P2                       0
         ROCE_RTT_RESP_DSCP_MODE_P2                  DEVICE_DEFAULT(0)
         DCBX_IEEE_P1                                True(1)
         DCBX_CEE_P1                                 True(1)
         DCBX_WILLING_P1                             True(1)
         DCBX_IEEE_P2                                True(1)
         DCBX_CEE_P2                                 True(1)
         DCBX_WILLING_P2                             True(1)
         KEEP_ETH_LINK_UP_P1                         True(1)
         KEEP_IB_LINK_UP_P1                          False(0)
         KEEP_LINK_UP_ON_BOOT_P1                     False(0)
         KEEP_LINK_UP_ON_STANDBY_P1                  False(0)
         DO_NOT_CLEAR_PORT_STATS_P1                  False(0)
         AUTO_POWER_SAVE_LINK_DOWN_P1                False(0)
         KEEP_ETH_LINK_UP_P2                         True(1)
         KEEP_IB_LINK_UP_P2                          False(0)
         KEEP_LINK_UP_ON_BOOT_P2                     False(0)
         KEEP_LINK_UP_ON_STANDBY_P2                  False(0)
         DO_NOT_CLEAR_PORT_STATS_P2                  False(0)
         AUTO_POWER_SAVE_LINK_DOWN_P2                False(0)
         NUM_OF_VL_P1                                _4_VLs(3)
         NUM_OF_TC_P1                                _8_TCs(0)
         NUM_OF_PFC_P1                               8
         VL15_BUFFER_SIZE_P1                         0
         NUM_OF_VL_P2                                _4_VLs(3)
         NUM_OF_TC_P2                                _8_TCs(0)
         NUM_OF_PFC_P2                               8
         VL15_BUFFER_SIZE_P2                         0
         DUP_MAC_ACTION_P1                           LAST_CFG(0)
         SRIOV_IB_ROUTING_MODE_P1                    LID(1)
         IB_ROUTING_MODE_P1                          LID(1)
         DUP_MAC_ACTION_P2                           LAST_CFG(0)
         SRIOV_IB_ROUTING_MODE_P2                    LID(1)
         IB_ROUTING_MODE_P2                          LID(1)
         PHY_FEC_OVERRIDE_P1                         DEVICE_DEFAULT(0)
         PHY_FEC_OVERRIDE_P2                         DEVICE_DEFAULT(0)
         ROCE_CONTROL                                ROCE_ENABLE(2)
         PCI_WR_ORDERING                             per_mkey(0)
         MULTI_PORT_VHCA_EN                          False(0)
         PORT_OWNER                                  True(1)
         ALLOW_RD_COUNTERS                           True(1)
         RENEG_ON_CHANGE                             True(1)
         TRACER_ENABLE                               True(1)
         IP_VER                                      IPv4(0)
         BOOT_UNDI_NETWORK_WAIT                      0
         UEFI_HII_EN                                 True(1)
         BOOT_DBG_LOG                                False(0)
         UEFI_LOGS                                   DISABLED(0)
         BOOT_VLAN                                   1
         LEGACY_BOOT_PROTOCOL                        PXE(1)
         BOOT_INTERRUPT_DIS                          False(0)
         BOOT_LACP_DIS                               True(1)
         BOOT_VLAN_EN                                False(0)
         BOOT_PKEY                                   0
         DYNAMIC_VF_MSIX_TABLE                       False(0)
         EXP_ROM_UEFI_ARM_ENABLE                     False(0)
         EXP_ROM_UEFI_x86_ENABLE                     False(0)
         EXP_ROM_PXE_ENABLE                          True(1)
         ADVANCED_PCI_SETTINGS                       False(0)
         SAFE_MODE_THRESHOLD                         10
         SAFE_MODE_ENABLE                            True(1)

おうち10Gワイヤレート(iperf)

1台で流します。

sudo ip netns add pc2
# sudo ip link add name enp1s0f0np0 type veth peer name enp1s0f1np1

sudo ip link set enp1s0f1np1 netns pc2

sudo ip addr add 192.168.0.1/24 dev enp1s0f0np0
sudo ip netns exec pc2 ip addr add 192.168.0.2/24 dev enp1s0f1np1
sudo ip link set enp1s0f0np0 up
sudo ip netns exec pc2 ip link set enp1s0f1np1 up
sudo ip addr
sudo ip netns exec pc2 ip addr
# 以下2行は後片付け用。測定が終わってから実行する(先に実行するとpc2が消えて以降のiperfが失敗する)。上の手順ではpc1は作成していないので、pc1の削除は作成した場合のみ必要
sudo ip netns del pc1
sudo ip netns del pc2
# 以下のコマンドは iperf (iperf2) を使うので、iperf3 ではなく iperf パッケージを入れる
sudo apt-get install iperf

# 双方からiperfする
iperf -s -i 1 -P 10
sudo ip netns exec pc2 iperf -c 192.168.0.1 -i 1 -P 10 -i 10

おうちRDMA

一台で折り返しNVMe-OF

https://blog.nishi.network/2022/06/27/nvme-over-rdma-with-rocev2/ を見る

sudo modprobe mlx5_core
sudo apt install nvme-cli
git clone https://github.com/spdk/spdk
cd spdk/
git submodule update --init
sudo scripts/pkgdep.sh --rdma
./configure --with-rdma --enable-debug
make
./test/unit/unittest.sh
sudo scripts/setup.sh status
sudo scripts/setup.sh
sudo ./build/bin/nvmf_tgt

sudo scripts/rpc.py nvmf_create_transport -t RDMA -u 8192 -i 131072 -c 8192
sudo scripts/rpc.py bdev_malloc_create -b Malloc0 512 512
sudo scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1
sudo  scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 Malloc0
sudo scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t rdma -a 192.168.0.1 -s 4420