kvm虚拟磁盘设备全过程

读者慎入,没有虚拟化相关知识可能完全看不懂。

虚拟PCI设备配对过程:
以kvmtool中的pci disk为例:

disk pci配置空间中有:
vendor_id = 0x1af4
device_id = 0x1001
subsys_id = 2
步骤:guest linux kernel 通过 0xcf8(地址)/0xcfc(数据)端口遍历 pci 配置空间;这些端口访问会被 KVM 截获并转交给用户态的 kvmtool(作用相当于 qemu),kvmtool 把自己保存的 pci 配置信息返回给 guest。guest 据此建立对应的 struct pci_dev,然后注册 pci dev。
在 drivers/virtio/virtio_pci_common.c 中有对应的 pci_driver 与之匹配(id 表中 vendor_id 为 0x1af4,device_id 为 PCI_ANY_ID)。

/*
 * PCI driver descriptor for "virtio-pci": names the driver, points at the
 * ID table used for matching and at the probe/remove callbacks.
 */
static struct pci_driver virtio_pci_driver = {
        .name = "virtio-pci",
        .id_table = virtio_pci_id_table,

        /* Callbacks for a matched device. */
        .probe = virtio_pci_probe,
        .remove = virtio_pci_remove,

#if defined(CONFIG_PM_SLEEP)
        /* Power-management ops are only wired up with CONFIG_PM_SLEEP. */
        .driver = {
                .pm = &virtio_pci_pm_ops,
        },
#endif
};
/* PCI vendor ID used in the table below; same value as the disk's vendor_id. */
#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4
/* All bits set; presumably treated as a wildcard by PCI id matching — confirm. */
#define PCI_ANY_ID (~0)
/* ID table referenced by virtio_pci_driver.id_table. */
static const struct pci_device_id virtio_pci_id_table[] = {
        /* Single entry: the vendor ID above paired with PCI_ANY_ID as device ID. */
        { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
        /* Zeroed entry; presumably the end-of-table sentinel. */
        { 0 }
};
/*
 * Abridged view of struct pci_dev: only the ID and class fields are listed,
 * every other member is elided.
 */
struct pci_dev {
  /* ... */
  unsigned short  vendor;
  unsigned short  device;
  unsigned short  subsystem_vendor;
  unsigned short  subsystem_device;
  unsigned int  class;    /* 3 bytes: (base,sub,prog-if) */
  /* ... */
};

pci驱动和设备匹配之后,运行pci驱动中的probe函数virtio_pci_probe()

/*
 * PCI probe callback (installed as virtio_pci_driver.probe). Abridged
 * excerpt: it fills in the virtio_pci_device, enables the PCI device, runs
 * the legacy/modern transport probe and finally registers the embedded
 * virtio device.
 *
 * NOTE(review): vp_dev is dereferenced without a visible allocation, rc is
 * never declared, and neither the goto labels nor a return statement are
 * shown — presumably elided from this excerpt; confirm against upstream.
 */
static int virtio_pci_probe(struct pci_dev *pci_dev,
          const struct pci_device_id *id)
{
  struct virtio_pci_device *vp_dev, *reg_dev = NULL;
  /* Attach our per-device structure to the pci_dev as driver data. */
  pci_set_drvdata(pci_dev, vp_dev);
  /* The virtio device is a child of the PCI device. */
  vp_dev->vdev.dev.parent = &pci_dev->dev;
  vp_dev->vdev.dev.release = virtio_pci_release_dev;
  vp_dev->pci_dev = pci_dev;
  INIT_LIST_HEAD(&vp_dev->virtqueues);
  spin_lock_init(&vp_dev->lock);

  /* enable the device */
  rc = pci_enable_device(pci_dev);
  if (rc)
    goto err_enable_device;

  /* force_legacy picks which transport probe is tried first. */
  if (force_legacy) {
    rc = virtio_pci_legacy_probe(vp_dev);
    /* Also try modern mode if we can't map BAR0 (no IO space). */
    if (rc == -ENODEV || rc == -ENOMEM)
      rc = virtio_pci_modern_probe(vp_dev);
    if (rc)
      goto err_probe;
  } else {
    /* Modern first; fall back to legacy only when modern reports -ENODEV. */
    rc = virtio_pci_modern_probe(vp_dev);
    if (rc == -ENODEV)
      rc = virtio_pci_legacy_probe(vp_dev);
    /* NOTE(review): rc is not checked on this path in the excerpt — confirm. */
  }
  pci_set_master(pci_dev);
  /* Register the embedded virtio_device (see register_virtio_device()). */
  rc = register_virtio_device(&vp_dev->vdev);
}

值得注意的是上面代码中的 virtio_pci_legacy_probe() 和 register_virtio_device()。

首先看下 struct virtio_pci_device结构

/*
 * Per-device state of the virtio-pci transport (abridged). It embeds the
 * generic virtio_device and points back at the underlying pci_dev.
 *
 * NOTE(review): other snippets in this article access ->virtqueues, ->lock,
 * ->ioaddr and ->msix_enabled, which are not listed here — presumably those
 * members were elided from this excerpt.
 */
struct virtio_pci_device {
  /* Embedded generic virtio device; this is what gets registered. */
  struct virtio_device vdev;
  /* Underlying PCI device. */
  struct pci_dev *pci_dev;
  /* Number of available vectors */
  unsigned msix_vectors;
  /* Vectors allocated, excluding per-vq vectors if any */
  unsigned msix_used_vectors;

  /* Whether we have vector per vq */
  bool per_vq_vectors;

  /*
   * Function-pointer hooks for creating and deleting a virtqueue; their
   * implementations are not shown in this article.
   */
  struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
              struct virtio_pci_vq_info *info,
              unsigned idx,
              void (*callback)(struct virtqueue *vq),
              const char *name,
              bool ctx,
              u16 msix_vec);
  void (*del_vq)(struct virtio_pci_vq_info *info);

  /* Hook taking a vector number and returning a u16; semantics not shown here. */
  u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
};

再看下 virtio_pci_legacy_probe()

/ the PCI probing function /
intvirtio_pci_legacy_probe(struct virtio_pci_device vp_dev)
{
struct pci_dev
pci_dev =vp_dev->pci_dev;
int rc;

     if (pci_dev->device < 0x1000 ||pci_dev->device > 0x103f)
               return -ENODEV;

     rc = dma_set_mask(&pci_dev->dev,DMA_BIT_MASK(64));

     rc = pci_request_region(pci_dev, 0,"virtio-pci-legacy");

     vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
     vp_dev->vdev.id.device = pci_dev->subsystem_device;
     vp_dev->vdev.config = &virtio_pci_config_ops;

}

/*
 * The virtio_config_ops table that the legacy probe stores in vdev.config.
 * Every hook points at the corresponding vp_* function.
 */
static const struct virtio_config_ops virtio_pci_config_ops = {
  /* config read/write hooks (.get is vp_get, shown later in this article) */
  .get = vp_get,
  .set = vp_set,

  /* status and reset hooks */
  .get_status = vp_get_status,
  .set_status = vp_set_status,
  .reset = vp_reset,

  /* feature hooks */
  .get_features = vp_get_features,
  .finalize_features = vp_finalize_features,

  /* virtqueue hooks */
  .find_vqs = vp_find_vqs,
  .del_vqs = vp_del_vqs,
  .set_vq_affinity = vp_set_vq_affinity,
  .get_vq_affinity = vp_get_vq_affinity,

  .bus_name = vp_bus_name,
};

现在开始注册虚拟设备(磁盘)virtio_device

注册 virtio_device 的函数如下:
/*
 * register_virtio_device - add a virtio device to virtio_bus (abridged)
 * @dev: device to register; dev->config must already be set, because its
 *       reset hook is invoked here.
 *
 * Returns the result of device_add().
 *
 * NOTE(review): dev->index is used for the name but is not assigned anywhere
 * in this excerpt — presumably elided; confirm against upstream.
 */
int register_virtio_device(struct virtio_device *dev)
{
  int err;

  dev->dev.bus = &virtio_bus;
  device_initialize(&dev->dev);
  dev_set_name(&dev->dev, "virtio%u", dev->index);

  /* We always start by resetting the device, in case a previous
   * driver messed it up.  This also tests that code path a little. */
  dev->config->reset(dev);

  /* Acknowledge that we've seen the device. */
  virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);

  /*
   * Hand the device to the driver core. Presumably this is the point where
   * a matching virtio driver on virtio_bus gets bound — confirm.
   */
  err = device_add(&dev->dev);
  return err;
}

在 drivers/block/virtio_blk.c 中通过 register_virtio_driver(&virtio_blk) 注册了 virtio 块设备驱动,它的 id_table(VIRTIO_ID_BLOCK = 2)正好和上面 register_virtio_device() 注册的设备(id.device 取自 subsys_id = 2)匹配上。

/*
 * Module init of the virtio-blk driver (abridged): creates the "virtio-blk"
 * workqueue, obtains a block major under the name "virtblk" and registers
 * the virtio_blk driver.
 *
 * Returns -ENOMEM if the workqueue cannot be allocated, otherwise the result
 * of register_virtio_driver().
 *
 * NOTE(review): a register_blkdev() failure is not checked and nothing is
 * cleaned up on failure in this excerpt — presumably elided; confirm.
 */
static int __init init(void)
{
       virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
       if (!virtblk_wq)
                return -ENOMEM;

       /* 0 as the first argument; the returned value is kept in `major`. */
       major = register_blkdev(0, "virtblk");

       return register_virtio_driver(&virtio_blk);
}

/* Same value (2) as the subsys_id listed for the disk's PCI config space. */
#define VIRTIO_ID_BLOCK    2 /* virtio block */

/* ID table of the virtio_blk driver (referenced through its .id_table). */
static const struct virtio_device_id id_table[] = {
  /* Presumably { device, vendor }: block device, any vendor — confirm field order. */
  { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
  /* Zeroed entry; presumably the end-of-table sentinel. */
  { 0 },
};

static struct virtio_driver virtio_blk = {
  .feature_table      = features,
  .feature_table_size    = ARRAY_SIZE(features),
  .feature_table_legacy    = features_legacy,
  .feature_table_size_legacy  = ARRAY_SIZE(features_legacy),
  .driver.name      = KBUILD_MODNAME,
  .driver.owner      = THIS_MODULE,
  .id_table      = id_table,
  .probe        = virtblk_probe,
  .remove        = virtblk_remove,
  .config_changed      = virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
  .freeze        = virtblk_freeze,
  .restore      = virtblk_restore,
#endif
};

/*
 * Per-disk driver state of virtio-blk: ties the virtio device to its
 * gendisk, blk-mq tag set and virtqueues.
 */
struct virtio_blk {
  /* The virtio device this disk sits on. */
  struct virtio_device *vdev;

  /* The disk structure for the kernel. */
  struct gendisk *disk;

  /* Block layer tags. */
  struct blk_mq_tag_set tag_set;

  /* Process context for config space updates */
  struct work_struct config_work;

  /* What host tells us, plus 2 for header & trailer. */
  unsigned int sg_elems;

  /* Ida index - used to track minor number allocations. */
  int index;

  /* num of vqs */
  int num_vqs;
  /* Array of per-queue state; presumably num_vqs entries — confirm. */
  struct virtio_blk_vq *vqs;
};

匹配上之后执行virtblk_probe().

总的过程就是:首先虚拟磁盘的 pci 设备与 virtio-pci 驱动匹配上,匹配上之后执行该驱动的 probe 函数 virtio_pci_probe(),其中通过 register_virtio_device() 注册 virtio 磁盘设备;
然后该设备再与系统中的 virtio 磁盘驱动(virtio_blk)匹配一次,执行 virtblk_probe()。

下面看 virtblk_probe() 的具体实现:

staticint virtblk_probe(struct virtio_device *vdev)

{

         struct virtio_blk *vblk;
         struct request_queue *q;
         err = init_vq(vblk);
         if (err)
                   goto out_free_vblk;
         vblk->disk = alloc_disk(1 <<PART_BITS);
         if (!vblk->disk) {
                   err = -ENOMEM;
                   goto out_free_vq;
         }
         memset(&vblk->tag_set, 0,sizeof(vblk->tag_set));
         vblk->tag_set.ops =&virtio_mq_ops;
         vblk->tag_set.queue_depth =virtblk_queue_depth;
         vblk->tag_set.numa_node =NUMA_NO_NODE;
         vblk->tag_st.flags =BLK_MQ_F_SHOULD_MERGE;
         vblk->tag_set.cmd_size =
                   sizeof(struct virtblk_req) +
                   sizeof(struct scatterlist) *sg_elems;
         vblk->tag_set.driver_data = vblk;
         vblk->tag_set.nr_hw_queues =vblk->num_vqs;
         err =blk_mq_alloc_tag_set(&vblk->tag_set);
         if (err)
                   goto out_put_disk;
         q =blk_mq_init_queue(&vblk->tag_set);
         if (IS_ERR(q)) {
                   err = -ENOMEM;
                   goto out_free_tags;
         }
         vblk->disk->queue = q;
         q->queuedata = vblk;
         virtblk_name_format("vd",index, vblk->disk->disk_name, DISK_NAME_LEN);
/* 虚拟磁盘与 major = register_blkdev(0,"virtblk") 关联起来 */
         vblk->disk->major = major;
         vblk->disk->first_minor =index_to_minor(index);
         vblk->disk->private_data = vblk;
         vblk->disk->fops =&virtblk_fops;
/*
 * blk-mq operations of virtio-blk; virtblk_probe() installs this table in
 * vblk->tag_set.ops.
 */
static const struct blk_mq_ops virtio_mq_ops = {
        .queue_rq         = virtio_queue_rq,
        .commit_rqs       = virtio_commit_rqs,
        .complete         = virtblk_request_done,
        .init_request     = virtblk_init_request,
#ifdef CONFIG_VIRTIO_BLK_SCSI
        /* Only present when CONFIG_VIRTIO_BLK_SCSI is defined. */
        .initialize_rq_fn = virtblk_initialize_rq,
#endif
        .map_queues       = virtblk_map_queues,
};

staticblk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,

                               const struct blk_mq_queue_data *bd)

{
         switch (req_op(req)) {
         case REQ_OP_READ:
         case REQ_OP_WRITE:
                   type = 0;
                   break;
         case REQ_OP_FLUSH:
                   type = VIRTIO_BLK_T_FLUSH;
                   break;
         case REQ_OP_DISCARD:
                   type = VIRTIO_BLK_T_DISCARD;
                   break;
         case REQ_OP_WRITE_ZEROES:
                   type =VIRTIO_BLK_T_WRITE_ZEROES;
                   unmap = !(req->cmd_flags& REQ_NOUNMAP);
                   break;
         case REQ_OP_SCSI_IN:
         case REQ_OP_SCSI_OUT:
                   type = VIRTIO_BLK_T_SCSI_CMD;
                   break;
         case REQ_OP_DRV_IN:
                   type = VIRTIO_BLK_T_GET_ID;
                   break;
         default:
                   WARN_ON_ONCE(1);
                   return BLK_STS_IOERR;
         }
         blk_mq_start_request(req);
         if (notify)
                   virtqueue_notify(vblk->vqs[qid].vq);
         return BLK_STS_OK;
}

在drivers/virtio/virtio_ring.c中

/*
 * virtqueue_notify - notify the other side of a virtqueue
 * @_vq: the virtqueue to notify
 *
 * Returns false if the queue is already marked broken or if its notify hook
 * fails (in which case the queue is marked broken); true otherwise.
 */
bool virtqueue_notify(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (unlikely(vq->broken))
                return false;

        /* Prod other side to tell it about changes. */
        if (!vq->notify(_vq)) {
                vq->broken = true;
                return false;
        }
        return true;
}
   /* (Presumably a continuation of virtblk_probe() — confirm.) */
   /* Host must always specify the capacity. */
     /* Read the `capacity` field of struct virtio_blk_config into cap. */
     virtio_cread(vdev, struct virtio_blk_config, capacity, &cap);

这里获取虚拟硬盘的容量,读取配置最终会走到 virtio_pci_config_ops 的 .get,即 vp_get():

/*
 * vp_get - .get hook of virtio_pci_config_ops
 * @vdev:   virtio device whose config is read
 * @offset: byte offset of the field inside the device-specific config
 * @buf:    destination buffer
 * @len:    number of bytes to read
 *
 * Copies @len bytes, one ioread8() at a time, from the device's I/O region
 * into @buf. The read starts VIRTIO_PCI_CONFIG_OFF(msix_enabled) + @offset
 * bytes after vp_dev->ioaddr.
 */
static void vp_get(struct virtio_device *vdev, unsigned offset,
                   void *buf, unsigned len)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        void __iomem *ioaddr = vp_dev->ioaddr +
                        VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
                        offset;
        u8 *ptr = buf;
        int i;

        for (i = 0; i < len; i++)
                ptr[i] = ioread8(ioaddr + i);
}

vp_get() 通过 io 端口逐字节读取硬盘容量。这些 io 操作会被 KVM 截获并转交给 kvmtool(相当于 qemu),由 kvmtool 把下面结构中保存的配置信息返回给 guest:
kvmtool/virtio/blk.c

/*
 * set_guest_features - record the features and convert the block config
 * @kvm:      VM instance (not used in the lines shown)
 * @dev:      the struct blk_dev, passed as void *
 * @features: feature bits to store in bdev->features
 *
 * Each multi-byte field of bdev->blk_config is passed through the matching
 * virtio_host_to_guest_uNN() helper and written back in place (presumably a
 * host-to-guest byte-order conversion — confirm).
 */
static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
{
        struct blk_dev *bdev = dev;
        struct virtio_blk_config *conf = &bdev->blk_config;
        struct virtio_blk_geometry *geo = &conf->geometry;

        bdev->features = features;

        conf->capacity = virtio_host_to_guest_u64(&bdev->vdev, conf->capacity);
        conf->size_max = virtio_host_to_guest_u32(&bdev->vdev, conf->size_max);
        conf->seg_max = virtio_host_to_guest_u32(&bdev->vdev, conf->seg_max);

        /* Geometry */
        geo->cylinders = virtio_host_to_guest_u16(&bdev->vdev, geo->cylinders);

        conf->blk_size = virtio_host_to_guest_u32(&bdev->vdev, conf->blk_size);
        conf->min_io_size = virtio_host_to_guest_u16(&bdev->vdev, conf->min_io_size);
        conf->opt_io_size = virtio_host_to_guest_u32(&bdev->vdev, conf->opt_io_size);
}
*bdev = (structblk_dev) {
                   .mutex                        = MUTEX_INITIALIZER,
                   .disk                   = disk,
                   .blk_config                 = (struct virtio_blk_config) {
                            .capacity  =disk->size / SECTOR_SIZE,
                            .seg_max          =DISK_SEG_MAX,
                   },
                   .io_efd                        = eventfd(0, 0),
                   .kvm                            = kvm,
         };
}
  /*
   * (Presumably a continuation of virtblk_probe() — confirm.)
   * Set the disk capacity from cap, the value read with virtio_cread().
   */
  set_capacity(vblk->disk, cap);
    /* Add the virtual disk to the system. */
  device_add_disk(&vdev->dev,vblk->disk);

系统里其他 virtio 设备也都是通过虚拟 pci 的方式添加的,原理类似。相关代码:
drivers/block/virtio_blk.c
drivers/net/virtio_net.c
drivers/char/virtio_console.c
drivers/char/hw_random/virtio-rng.c
drivers/scsi/virtio_scsi.c
block/blk-mq-virtio.c

上一篇:proxmox安装virtio驱动


下一篇:多线程案例