Skip to content

migration fails at destination with "Unable to write to socket: Bad file descriptor" #6

@tmakatos

Description

@tmakatos

When trying to migrate SPDK NVMf/vfio-user target which creates an NVMe controller with one namespace in the guest (/dev/nvme0n1), destination QEMU fails with:

Unable to write to socket: Bad file descriptor

Using the following setup (component versions omitted from this excerpt):

Debugging further, this happens here:

#0  0x0000555555cb8eff in qio_channel_socket_writev (ioc=0x5555568a2400, iov=0x7fffec35da90, niov=1, fds=0x555557314934, nfds=3, errp=0x7fffec35da70) at ../io/channel-socket.c:571
#1  0x0000555555cb2627 in qio_channel_writev_full (ioc=0x5555568a2400, iov=0x7fffec35da90, niov=1, fds=0x555557314934, nfds=3, errp=0x7fffec35da70) at ../io/channel.c:86
#2  0x0000555555c812d5 in vfio_user_send_locked (proxy=0x5555575af7e0, msg=0x55555747eb70, fds=0x7fffec35db40) at ../hw/vfio/user.c:278
#3  0x0000555555c815c9 in vfio_user_send_recv (proxy=0x5555575af7e0, msg=0x55555747eb70, fds=0x7fffec35db40, rsize=0) at ../hw/vfio/user.c:351
#4  0x0000555555c82c38 in vfio_user_set_irqs (vbasedev=0x5555575a9c70, irq=0x555557314920) at ../hw/vfio/user.c:898
#5  0x0000555555c6b79d in vfio_enable_vectors (vdev=0x5555575a9370, msix=true) at ../hw/vfio/pci.c:413
#6  0x0000555555c6bb4c in vfio_msix_vector_do_use (pdev=0x5555575a9370, nr=3, msg=0x0, handler=0x0) at ../hw/vfio/pci.c:516
#7  0x0000555555c6be8c in vfio_msix_enable (vdev=0x5555575a9370) at ../hw/vfio/pci.c:615
#8  0x0000555555c70b0b in vfio_pci_load_config (vbasedev=0x5555575a9c70, f=0x5555568f5af0) at ../hw/vfio/pci.c:2528
#9  0x0000555555bab3df in vfio_load_device_config_state (f=0x5555568f5af0, opaque=0x5555575a9c70) at ../hw/vfio/migration.c:382
#10 0x0000555555babbe2 in vfio_load_state (f=0x5555568f5af0, opaque=0x5555575a9c70, version_id=1) at ../hw/vfio/migration.c:649
#11 0x00005555558a5cb9 in vmstate_load (f=0x5555568f5af0, se=0x555556964df0) at ../migration/savevm.c:908
#12 0x00005555558a8dec in qemu_loadvm_section_start_full (f=0x5555568f5af0, mis=0x5555568cec70) at ../migration/savevm.c:2433
#13 0x00005555558a944a in qemu_loadvm_state_main (f=0x5555568f5af0, mis=0x5555568cec70) at ../migration/savevm.c:2619
#14 0x00005555558a95c5 in qemu_loadvm_state (f=0x5555568f5af0) at ../migration/savevm.c:2698
#15 0x00005555558e437d in process_incoming_migration_co (opaque=0x0) at ../migration/migration.c:555
#16 0x0000555555e28cb6 in coroutine_trampoline (i0=1457783792, i1=21845) at ../util/coroutine-ucontext.c:173
#17 0x00007ffff75a4b50 in __correctly_grouped_prefixwc (begin=0x7fffec35da70 L"\x56965b50啕\003", end=0x0, thousands=-175363960 L'\xf58c2888', grouping=0x555556650010 "") at grouping.c:171
#18 0x0000000000000000 in  ()
(gdb) p errno
$2 = 9
(gdb) p sioc->fd
$3 = 13

Looking at the FD:

# ls -lh /proc/1816/fd/13
lrwx------ 1 root root 64 Jun  8 11:43 /proc/1816/fd/13 -> 'socket:[30949]'
# cat /proc/1816/fdinfo/13
pos:    0
flags:  02000002
mnt_id: 10

The source QEMU is run as follows:

/opt/qemu/bin/qemu-system-x86_64 -smp 4 -nographic -m 2G -object memory-backend-file,id=mem0,size=2G,mem-path=/dev/hugepages,share=on,prealloc=yes, -numa node,memdev=mem0 -kernel bionic-server-cloudimg-amd64-vmlinuz-generic -initrd bionic-server-cloudimg-amd64-initrd-generic -append console=ttyS0 root=/dev/sda1 single intel_iommu=on -hda bionic-server-cloudimg-amd64-0.raw -hdb nvme.img -nic user,model=virtio-net-pci -machine pc-q35-3.1 -device vfio-user-pci,socket=/var/run/vfio-user.sock,x-enable-migration=on -D qemu.out -trace enable=vfio*

and destination QEMU:

/opt/qemu/bin/qemu-system-x86_64 -smp 4 -nographic -m 2G -object memory-backend-file,id=mem0,size=2G,mem-path=/dev/hugepages,share=on,prealloc=yes, -numa node,memdev=mem0 -kernel bionic-server-cloudimg-amd64-vmlinuz-generic -initrd bionic-server-cloudimg-amd64-initrd-generic -append console=ttyS0 root=/dev/sda1 single intel_iommu=on -hda bionic-server-cloudimg-amd64-0.raw -hdb nvme.img -nic user,model=virtio-net-pci -machine pc-q35-3.1 -device vfio-user-pci,socket=/var/run/vfio-user.sock,x-enable-migration=on -D qemu.out -trace enable=vfio* -incoming tcp:0:4444

I migrate using:

migrate -d tcp:<IP address>:4444

In the source QEMU log:

vfio_msi_interrupt  (VFIO user </var/run/vfio-user.sock>) vector 2 0xfee04004/0x4023
vfio_get_dirty_bitmap container fd=-1, iova=0x0 size= 0xa0000 bitmap_size=0x18 start=0x0
vfio_get_dirty_bitmap container fd=-1, iova=0xc0000 size= 0xb000 bitmap_size=0x8 start=0xc0000
vfio_get_dirty_bitmap container fd=-1, iova=0xcb000 size= 0x3000 bitmap_size=0x8 start=0xcb000
vfio_get_dirty_bitmap container fd=-1, iova=0xce000 size= 0x1e000 bitmap_size=0x8 start=0xce000
vfio_msi_interrupt  (VFIO user </var/run/vfio-user.sock>) vector 2 0xfee04004/0x4023
vfio_get_dirty_bitmap container fd=-1, iova=0xec000 size= 0x4000 bitmap_size=0x8 start=0xec000
vfio_get_dirty_bitmap container fd=-1, iova=0xf0000 size= 0x10000 bitmap_size=0x8 start=0xf0000
vfio_get_dirty_bitmap container fd=-1, iova=0x100000 size= 0x7ff00000 bitmap_size=0xffe0 start=0x100000
vfio_get_dirty_bitmap container fd=-1, iova=0xfd000000 size= 0x1000000 bitmap_size=0x200 start=0x80080000
vfio_get_dirty_bitmap container fd=-1, iova=0xfebd1000 size= 0x1000 bitmap_size=0x8 start=0x81100000
vfio_get_dirty_bitmap container fd=-1, iova=0xfffc0000 size= 0x40000 bitmap_size=0x8 start=0x80000000
vfio_update_pending  (VFIO user </var/run/vfio-user.sock>) pending 0x8000
vfio_save_pending  (VFIO user </var/run/vfio-user.sock>) precopy 0x1195000 postcopy 0x0 compatible 0x0
vfio_migration_set_state  (VFIO user </var/run/vfio-user.sock>) state 2
vfio_vmstate_change  (VFIO user </var/run/vfio-user.sock>) running 0 reason finish-migrate device state 2
vfio_get_dirty_bitmap container fd=-1, iova=0x0 size= 0xa0000 bitmap_size=0x18 start=0x0
vfio_get_dirty_bitmap container fd=-1, iova=0xc0000 size= 0xb000 bitmap_size=0x8 start=0xc0000
vfio_get_dirty_bitmap container fd=-1, iova=0xcb000 size= 0x3000 bitmap_size=0x8 start=0xcb000
vfio_get_dirty_bitmap container fd=-1, iova=0xce000 size= 0x1e000 bitmap_size=0x8 start=0xce000
vfio_get_dirty_bitmap container fd=-1, iova=0xec000 size= 0x4000 bitmap_size=0x8 start=0xec000
vfio_get_dirty_bitmap container fd=-1, iova=0xf0000 size= 0x10000 bitmap_size=0x8 start=0xf0000
vfio_get_dirty_bitmap container fd=-1, iova=0x100000 size= 0x7ff00000 bitmap_size=0xffe0 start=0x100000
vfio_get_dirty_bitmap container fd=-1, iova=0xfd000000 size= 0x1000000 bitmap_size=0x200 start=0x80080000
vfio_get_dirty_bitmap container fd=-1, iova=0xfebd1000 size= 0x1000 bitmap_size=0x8 start=0x81100000
vfio_get_dirty_bitmap container fd=-1, iova=0xfffc0000 size= 0x40000 bitmap_size=0x8 start=0x80000000
vfio_migration_set_state  (VFIO user </var/run/vfio-user.sock>) state 2
vfio_update_pending  (VFIO user </var/run/vfio-user.sock>) pending 0x8000
vfio_save_buffer  (VFIO user </var/run/vfio-user.sock>) Offset 0x1000 size 0x8000 pending 0x8000
vfio_update_pending  (VFIO user </var/run/vfio-user.sock>) pending 0x8000
vfio_save_buffer  (VFIO user </var/run/vfio-user.sock>) Offset 0x9000 size 0x0 pending 0x8000
vfio_migration_set_state  (VFIO user </var/run/vfio-user.sock>) state 0
vfio_save_complete_precopy  (VFIO user </var/run/vfio-user.sock>)
vfio_save_device_config_state  (VFIO user </var/run/vfio-user.sock>)
vfio_region_unmap Region migration mmaps[0] unmap [0x1000 - 0x8fff]
vfio_save_cleanup  (VFIO user </var/run/vfio-user.sock>)
vfio_migration_state_notifier  (VFIO user </var/run/vfio-user.sock>) state completed

And in the destination QEMU:

...
vfio_region_mmap Region migration mmaps[0] [0x1000 - 0x8fff]
vfio_migration_set_state  (VFIO user </var/run/vfio-user.sock>) state 4
vfio_load_state  (VFIO user </var/run/vfio-user.sock>) data 0xffffffffef100003
vfio_load_state  (VFIO user </var/run/vfio-user.sock>) data 0xffffffffef100004
vfio_load_state_device_data  (VFIO user </var/run/vfio-user.sock>) Offset 0x1000 size 0x8000
vfio_load_state  (VFIO user </var/run/vfio-user.sock>) data 0xffffffffef100004
vfio_listener_region_del region_del 0xc0000 - 0xdffff
vfio_listener_region_add_ram region_add [ram] 0xc0000 - 0xcafff [0x7fa250200000]
vfio_listener_region_add_ram region_add [ram] 0xcb000 - 0xcdfff [0x7fa2506cb000]
vfio_listener_region_add_ram region_add [ram] 0xce000 - 0xdffff [0x7fa25020e000]
vfio_listener_region_add_skip SKIPPING region_add 0xb0000000 - 0xbfffffff
vfio_listener_region_del region_del 0xc0000 - 0xcafff
vfio_listener_region_del region_del 0xce000 - 0xdffff
vfio_listener_region_del region_del 0xe0000 - 0xfffff
vfio_listener_region_add_ram region_add [ram] 0xc0000 - 0xcafff [0x7fa2506c0000]
vfio_listener_region_add_ram region_add [ram] 0xce000 - 0xebfff [0x7fa2506ce000]
vfio_listener_region_add_ram region_add [ram] 0xec000 - 0xeffff [0x7fa2506ec000]
vfio_listener_region_add_ram region_add [ram] 0xf0000 - 0xfffff [0x7fa2506f0000]
vfio_listener_region_add_skip SKIPPING region_add 0xfed1c000 - 0xfed1ffff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd7000 - 0xfebd7fff
vfio_listener_region_add_ram region_add [ram] 0xfd000000 - 0xfdffffff [0x7fa241400000]
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4000 - 0xfebd43ff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4400 - 0xfebd441f
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4420 - 0xfebd44ff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4500 - 0xfebd4515
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4516 - 0xfebd45ff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4600 - 0xfebd4607
vfio_listener_region_add_skip SKIPPING region_add 0xfebd4608 - 0xfebd4fff
vfio_listener_region_add_skip SKIPPING region_add 0xfe000000 - 0xfe000fff
vfio_listener_region_add_skip SKIPPING region_add 0xfe001000 - 0xfe001fff
vfio_listener_region_add_skip SKIPPING region_add 0xfe002000 - 0xfe002fff
vfio_listener_region_add_skip SKIPPING region_add 0xfe003000 - 0xfe003fff
vfio_load_state  (VFIO user </var/run/vfio-user.sock>) data 0xffffffffef100002
vfio_listener_region_add_skip SKIPPING region_add 0xfebd0000 - 0xfebd0fff
vfio_listener_region_add_ram region_add [ram] 0xfebd1000 - 0xfebd1fff [0x7fa35db96000]
vfio_listener_region_add_skip SKIPPING region_add 0xfebd0000 - 0xfebd0fff
vfio_listener_region_add_ram region_add [ram] 0xfebd1000 - 0xfebd1fff [0x7fa35db96000]
vfio_listener_region_add_skip SKIPPING region_add 0xfebd2000 - 0xfebd3fff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd5000 - 0xfebd53ff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd5400 - 0xfebd5fff
vfio_listener_region_add_skip SKIPPING region_add 0xfebd6000 - 0xfebd6fff
vfio_pci_write_config  (VFIO user </var/run/vfio-user.sock>, @0x4, 0x507, len=0x2)
vfio_listener_region_add_skip SKIPPING region_add 0xfebd2000 - 0xfebd3fff
vfio_region_mmaps_set_enabled Region VFIO user </var/run/vfio-user.sock> BAR 0 mmaps enabled: 1
vfio_region_mmaps_set_enabled Region VFIO user </var/run/vfio-user.sock> BAR 4 mmaps enabled: 1
vfio_region_mmaps_set_enabled Region VFIO user </var/run/vfio-user.sock> BAR 5 mmaps enabled: 1
vfio_intx_disable  (VFIO user </var/run/vfio-user.sock>)
vfio_msix_vector_do_use  (VFIO user </var/run/vfio-user.sock>) vector 3 used

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions