读者慎入,没有虚拟化相关知识可能完全看不懂。
虚拟PCI设备配对过程:
以kvmtool中的pci disk为例:
disk pci配置空间中有:
vendor_id = 0x1af4
device_id = 0x1001
subsys_id = 2
步骤:guest linux kernel 通过 0xcf8(地址端口)和 0xcfc(数据端口)遍历 pci 配置空间;这些端口访问会被 KVM 截获并转交给用户态的 kvmtool(或 qemu),由它把自己维护的 pci 配置信息反馈给 guest;guest 据此建立对应的 struct pci_dev,然后注册该 pci 设备。
在drivers/virtio/virtio_pci_common.c中有 对应的pci_driver与之匹配。
/*
 * PCI driver object for virtio-over-PCI devices.
 * Devices are matched against .id_table; once a device matches,
 * .probe (virtio_pci_probe) is run for it.
 */
static struct pci_driver virtio_pci_driver = {
.name = "virtio-pci",
.id_table = virtio_pci_id_table, /* which PCI IDs this driver claims */
.probe = virtio_pci_probe, /* called after a successful match */
.remove = virtio_pci_remove,
#ifdef CONFIG_PM_SLEEP
.driver.pm = &virtio_pci_pm_ops, /* only present when CONFIG_PM_SLEEP is set */
#endif
};
/* Vendor ID reported by the example disk (vendor_id = 0x1af4). */
#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4
/* Wildcard value: all bits set. */
#define PCI_ANY_ID (~0)
/*
 * Match table of virtio_pci_driver: vendor 0x1af4 combined with PCI_ANY_ID in
 * the device slot, so the device ID (0x1001 for the example disk) does not
 * restrict the match. The trailing all-zero entry marks the end of the table.
 */
static const struct pci_device_id virtio_pci_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
{ 0 }
};
/*
 * Excerpt of struct pci_dev: only the identification fields are listed.
 * The dotted lines stand for members omitted from this listing.
 */
struct pci_dev {
....
....
unsigned short vendor; /* 0x1af4 for the example disk */
unsigned short device; /* 0x1001 for the example disk */
unsigned short subsystem_vendor;
unsigned short subsystem_device; /* copied into vdev.id.device by the legacy probe */
unsigned int class; /* 3 bytes: (base,sub,prog-if) */
.....
....
}
pci驱动和设备匹配之后,运行pci驱动中的probe函数virtio_pci_probe()
/*
 * Probe callback of virtio_pci_driver (abridged listing).
 *
 * Visible flow: fill in the virtio_pci_device, call pci_enable_device(),
 * run the legacy and/or modern transport probe, call pci_set_master() and
 * finally register the embedded virtio_device.
 *
 * NOTE(review): the allocation of vp_dev, the declaration of rc, the
 * err_enable_device / err_probe labels and the return statement are not
 * shown in this listing.
 */
static int virtio_pci_probe(struct pci_dev *pci_dev,
const struct pci_device_id *id)
{
struct virtio_pci_device *vp_dev, *reg_dev = NULL;
pci_set_drvdata(pci_dev, vp_dev);
vp_dev->vdev.dev.parent = &pci_dev->dev;
vp_dev->vdev.dev.release = virtio_pci_release_dev;
vp_dev->pci_dev = pci_dev;
INIT_LIST_HEAD(&vp_dev->virtqueues);
spin_lock_init(&vp_dev->lock);
/* enable the device */
rc = pci_enable_device(pci_dev);
if (rc)
goto err_enable_device;
/* force_legacy: try the legacy transport first, then fall back to modern. */
if (force_legacy) {
rc = virtio_pci_legacy_probe(vp_dev);
/* Also try modern mode if we can't map BAR0 (no IO space). */
if (rc == -ENODEV || rc == -ENOMEM)
rc = virtio_pci_modern_probe(vp_dev);
if (rc)
goto err_probe;
} else {
/* Default order: modern first, legacy only if modern reports -ENODEV. */
rc = virtio_pci_modern_probe(vp_dev);
if (rc == -ENODEV)
rc = virtio_pci_legacy_probe(vp_dev);
}
pci_set_master(pci_dev);
/* Hand the virtio device to the virtio bus; a virtio driver is matched next. */
rc = register_virtio_device(&vp_dev->vdev);
}
值得注意的是上面代码中的 virtio_pci_legacy_probe() 和 register_virtio_device()。
首先看下 struct virtio_pci_device结构
/*
 * Per-device state of the virtio PCI transport (abridged listing).
 * Embeds the generic virtio_device (vdev) and points back to the pci_dev.
 *
 * NOTE(review): members that other listings in this text use (virtqueues,
 * lock, ioaddr, msix_enabled) are not shown here.
 */
struct virtio_pci_device {
struct virtio_device vdev;
struct pci_dev *pci_dev;
/* Number of available vectors */
unsigned msix_vectors;
/* Vectors allocated, excluding per-vq vectors if any */
unsigned msix_used_vectors;
/* Whether we have vector per vq */
bool per_vq_vectors;
/* Transport-specific hooks (function pointers) for virtqueue setup/teardown. */
struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
struct virtio_pci_vq_info *info,
unsigned idx,
void (*callback)(struct virtqueue *vq),
const char *name,
bool ctx,
u16 msix_vec);
void (*del_vq)(struct virtio_pci_vq_info *info);
u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
};
再看下 virtio_pci_legacy_probe()
/ the PCI probing function /
intvirtio_pci_legacy_probe(struct virtio_pci_device vp_dev)
{
struct pci_dev pci_dev =vp_dev->pci_dev;
int rc;
if (pci_dev->device < 0x1000 ||pci_dev->device > 0x103f)
return -ENODEV;
rc = dma_set_mask(&pci_dev->dev,DMA_BIT_MASK(64));
rc = pci_request_region(pci_dev, 0,"virtio-pci-legacy");
vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
vp_dev->vdev.id.device = pci_dev->subsystem_device;
vp_dev->vdev.config = &virtio_pci_config_ops;
}
/*
 * Config operations installed into vdev.config by virtio_pci_legacy_probe().
 * register_virtio_device() calls .reset through this table; .get is vp_get(),
 * the routine shown later for reading the disk capacity.
 */
static const struct virtio_config_ops virtio_pci_config_ops = {
.get = vp_get,
.set = vp_set,
.get_status = vp_get_status,
.set_status = vp_set_status,
.reset = vp_reset,
.find_vqs = vp_find_vqs,
.del_vqs = vp_del_vqs,
.get_features = vp_get_features,
.finalize_features = vp_finalize_features,
.bus_name = vp_bus_name,
.set_vq_affinity = vp_set_vq_affinity,
.get_vq_affinity = vp_get_vq_affinity,
};
现在开始注册虚拟设备(磁盘)virtio_device
注册virtio_device
/*
 * Register a virtio device on the virtio bus (abridged listing).
 *
 * Visible flow: attach the device to virtio_bus, initialise and name it,
 * reset it through its config ops, set the ACKNOWLEDGE status bit and add
 * it to the driver model with device_add().
 *
 * NOTE(review): err is declared but unused and no return statement is shown;
 * the assignment of dev->index is also not part of this listing.
 */
int register_virtio_device(struct virtio_device *dev)
{
int err;
dev->dev.bus = &virtio_bus;
device_initialize(&dev->dev);
dev_set_name(&dev->dev, "virtio%u", dev->index);
/* We always start by resetting the device, in case a previous
 * driver messed it up. This also tests that code path a little. */
dev->config->reset(dev);
/* Acknowledge that we've seen the device. */
virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
device_add(&dev->dev);
}
在drivers/block/virtio_blk.c中有 register_virtio_driver(&virtio_blk)和上面的 register_virtio_device正好匹配上。
/*
 * Module init of the virtio block driver (abridged listing).
 * Creates the "virtio-blk" workqueue, stores the value returned by
 * register_blkdev() in major and registers the virtio_blk driver.
 *
 * NOTE(review): error handling after register_blkdev() and
 * register_virtio_driver(), and the final return, are not shown.
 */
static int __init init(void)
{
virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
if (!virtblk_wq)
return -ENOMEM;
/* major is later assigned to vblk->disk->major in virtblk_probe(). */
major = register_blkdev(0, "virtblk");
register_virtio_driver(&virtio_blk);
}
#define VIRTIO_ID_BLOCK 2 /* virtio block */
/*
 * Match table of the virtio block driver: device ID 2 with a wildcard
 * (VIRTIO_DEV_ANY_ID) in the second slot. The value 2 is the same as the
 * example disk's PCI subsys_id, which virtio_pci_legacy_probe() copies into
 * vdev.id.device.
 */
static const struct virtio_device_id id_table[] = {
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
{ 0 },
};
/*
 * virtio driver object for block devices. It is registered from init() via
 * register_virtio_driver(); devices are matched through .id_table and
 * .probe (virtblk_probe) runs for a matching device.
 */
static struct virtio_driver virtio_blk = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.feature_table_legacy = features_legacy,
.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table, /* virtio device IDs this driver claims */
.probe = virtblk_probe, /* called after a successful match */
.remove = virtblk_remove,
.config_changed = virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
.freeze = virtblk_freeze, /* only present when CONFIG_PM_SLEEP is set */
.restore = virtblk_restore,
#endif
};
/*
 * Per-disk driver state of the virtio block driver.
 * virtblk_probe() fills in disk and tag_set and stores a pointer to this
 * structure in the request queue and in the gendisk.
 */
struct virtio_blk {
/* The virtio device this disk sits on. */
struct virtio_device *vdev;
/* The disk structure for the kernel. */
struct gendisk *disk;
/* Block layer tags. */
struct blk_mq_tag_set tag_set;
/* Process context for config space updates */
struct work_struct config_work;
/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
/* Ida index - used to track minor number allocations. */
int index;
/* num of vqs */
int num_vqs;
/* One entry per virtqueue; indexed as vqs[qid] in virtio_queue_rq(). */
struct virtio_blk_vq *vqs;
};
匹配上之后执行virtblk_probe().
总的过程就是首先虚拟的磁盘pci设备匹配上,匹配上之后执行驱动的probe函数然后注册虚拟磁盘设备register_virtio_device(),
然后再与系统中的虚拟磁盘驱动匹配一次,执行virtblk_probe()
匹配上之后执行的 virtblk_probe() 主要代码如下:
/*
 * Probe callback of the virtio block driver (abridged listing, first part).
 *
 * Visible flow: set up the virtqueues, allocate the gendisk, describe and
 * allocate the blk-mq tag set, create the request queue and fill in the
 * gendisk identity (name, major/minor, fops).
 *
 * NOTE(review): the declarations of err, index and sg_elems, the allocation
 * of vblk and the out_* error labels are not shown in this listing.
 */
static int virtblk_probe(struct virtio_device *vdev)
{
	struct virtio_blk *vblk;
	struct request_queue *q;

	err = init_vq(vblk);
	if (err)
		goto out_free_vblk;

	vblk->disk = alloc_disk(1 << PART_BITS);
	if (!vblk->disk) {
		err = -ENOMEM;
		goto out_free_vq;
	}

	/* Describe the tag set; virtio_mq_ops supplies the queue callbacks. */
	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.queue_depth = virtblk_queue_depth;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	/* Per-request payload: one virtblk_req plus sg_elems scatterlist entries. */
	vblk->tag_set.cmd_size =
		sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
	vblk->tag_set.driver_data = vblk;
	vblk->tag_set.nr_hw_queues = vblk->num_vqs;

	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

	q = blk_mq_init_queue(&vblk->tag_set);
	if (IS_ERR(q)) {
		err = -ENOMEM;
		goto out_free_tags;
	}
	vblk->disk->queue = q;
	q->queuedata = vblk;

	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);

	/* Tie the disk to the major obtained by register_blkdev(0, "virtblk"). */
	vblk->disk->major = major;
	vblk->disk->first_minor = index_to_minor(index);
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;
	/* ... virtblk_probe() continues further below (capacity, add disk) ... */
/*
 * blk-mq callbacks of the virtio block driver; installed through
 * vblk->tag_set.ops in virtblk_probe(). .queue_rq is the per-request entry
 * point shown below.
 */
static const struct blk_mq_ops virtio_mq_ops = {
	.queue_rq	= virtio_queue_rq,
	.commit_rqs	= virtio_commit_rqs,
	.complete	= virtblk_request_done,
	.init_request	= virtblk_init_request,
#ifdef CONFIG_VIRTIO_BLK_SCSI
	.initialize_rq_fn = virtblk_initialize_rq,
#endif
	.map_queues	= virtblk_map_queues,
};
staticblk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
switch (req_op(req)) {
case REQ_OP_READ:
case REQ_OP_WRITE:
type = 0;
break;
case REQ_OP_FLUSH:
type = VIRTIO_BLK_T_FLUSH;
break;
case REQ_OP_DISCARD:
type = VIRTIO_BLK_T_DISCARD;
break;
case REQ_OP_WRITE_ZEROES:
type =VIRTIO_BLK_T_WRITE_ZEROES;
unmap = !(req->cmd_flags& REQ_NOUNMAP);
break;
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
type = VIRTIO_BLK_T_SCSI_CMD;
break;
case REQ_OP_DRV_IN:
type = VIRTIO_BLK_T_GET_ID;
break;
default:
WARN_ON_ONCE(1);
return BLK_STS_IOERR;
}
blk_mq_start_request(req);
if (notify)
virtqueue_notify(vblk->vqs[qid].vq);
return BLK_STS_OK;
}
在drivers/virtio/virtio_ring.c中
boolvirtqueue_notify(struct virtqueue *_vq)
{
struct vring_virtqueue *vq =to_vvq(_vq);
if (unlikely(vq->broken))
return false;
/* Prod other side to tell it aboutchanges. */
if (!vq->notify(_vq)) {
vq->broken = true;
return false;
}
return true;
}
/* (continuation of virtblk_probe) Host must always specify the capacity. */
virtio_cread(vdev, struct virtio_blk_config, capacity, &cap);
这里获取虚拟硬盘的容量:
/*
 * .get hook of virtio_pci_config_ops: read len bytes of device configuration
 * into buf.
 *
 * The source address is vp_dev->ioaddr plus the offset returned by
 * VIRTIO_PCI_CONFIG_OFF() for the current msix_enabled state, plus offset.
 * The data is read one byte at a time with ioread8().
 */
static void vp_get(struct virtio_device *vdev, unsigned offset,
		   void *buf, unsigned len)
{
	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
	void __iomem *ioaddr = vp_dev->ioaddr +
			VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
			offset;
	u8 *ptr = buf;
	int i;

	for (i = 0; i < len; i++)
		ptr[i] = ioread8(ioaddr + i);
}
通过 io 端口读取硬盘容量:这些 io 操作会被 KVM 截获并转交给 kvmtool(相当于 qemu),kvmtool 中有相应的结构保存磁盘配置(包括容量),并把读到的内容返回给 guest。
kvmtool/virtio/blk.c
/*
 * Record the features selected by the guest and convert the block config
 * fields in place with the virtio_host_to_guest_*() helpers.
 *
 * kvm:      unused in the visible code
 * dev:      the struct blk_dev of this disk
 * features: feature bits stored in bdev->features
 */
static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
{
	struct blk_dev *bdev = dev;
	struct virtio_blk_config *conf = &bdev->blk_config;
	struct virtio_blk_geometry *geo = &conf->geometry;

	bdev->features = features;

	conf->capacity = virtio_host_to_guest_u64(&bdev->vdev, conf->capacity);
	conf->size_max = virtio_host_to_guest_u32(&bdev->vdev, conf->size_max);
	conf->seg_max = virtio_host_to_guest_u32(&bdev->vdev, conf->seg_max);

	/* Geometry */
	geo->cylinders = virtio_host_to_guest_u16(&bdev->vdev, geo->cylinders);

	conf->blk_size = virtio_host_to_guest_u32(&bdev->vdev, conf->blk_size);
	conf->min_io_size = virtio_host_to_guest_u16(&bdev->vdev, conf->min_io_size);
	conf->opt_io_size = virtio_host_to_guest_u32(&bdev->vdev, conf->opt_io_size);
}
*bdev = (structblk_dev) {
.mutex = MUTEX_INITIALIZER,
.disk = disk,
.blk_config = (struct virtio_blk_config) {
.capacity =disk->size / SECTOR_SIZE,
.seg_max =DISK_SEG_MAX,
},
.io_efd = eventfd(0, 0),
.kvm = kvm,
};
}
/* (continuation of virtblk_probe) Publish the capacity read via virtio_cread(). */
set_capacity(vblk->disk, cap);
/* Add the virtual disk to the system. */
device_add_disk(&vdev->dev,vblk->disk);
系统里面通过虚拟pci的方式添加虚拟设备,原理都是类似的。
drivers/block/virtio_blk.c
drivers/net/virtio_net.c
drivers/char/virtio_console.c
drivers/char/hw_random/virtio-rng.c
drivers/scsi/virtio_scsi.c
block/blk-mq-virtio.c