Analysis of the Cinder Volume Service Startup Flow and Its Periodic Tasks

1. Entry point of the cinder-volume service

#!/usr/bin/python2
# PBR Generated from u'console_scripts'
import sys
from cinder.cmd.volume import main
if __name__ == "__main__":
    sys.exit(main())

2. Implementation of main() in cinder/cmd/volume.py

def main():
    objects.register_all()       # import all modules under cinder/objects
    gmr_opts.set_defaults(CONF)  # oslo_reports: used to generate error reports, e.g. for memory leaks
    CONF(sys.argv[1:], project='cinder',
         version=version.version_string())
    logging.setup(CONF, "cinder")
    python_logging.captureWarnings(True)
    # oslo_privsep: configure the execution privileges of the service
    priv_context.init(root_helper=shlex.split(utils.get_root_helper()))
    # monkey-patch libraries such as socket and thread; imports are unchanged,
    # only the functions the operations resolve to are replaced
    utils.monkey_patch()
    gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)
    launcher = service.get_launcher()  # step 1
    LOG = logging.getLogger(__name__)
    service_started = False

    if CONF.enabled_backends:  # the storage backends enabled in cinder.conf
        # Iterate over the enabled_backends values from the configuration file.
        for backend in filter(None, CONF.enabled_backends):
            CONF.register_opt(host_opt, group=backend)
            backend_host = getattr(CONF, backend).backend_host
            # Build the cinder-volume host name; this is the name shown by
            # `cinder service-list`.
            host = "%s@%s" % (backend_host or CONF.host, backend)
            # We also want to set cluster to None on empty strings, and we
            # ignore leading and trailing spaces.
            cluster = CONF.cluster and CONF.cluster.strip()
            cluster = (cluster or None) and '%s@%s' % (cluster, backend)
            try:
                # Create one service object per backend (see section 3).
                server = service.Service.create(host=host,
                                                service_name=backend,
                                                binary='cinder-volume',
                                                coordination=True,
                                                cluster=cluster)
            except Exception:
                msg = _('Volume service %s failed to start.') % host
                LOG.exception(msg)
            else:  # taken when no exception occurred
                # Dispose of the whole DB connection pool here before
                # starting another process. Otherwise we run into cases where
                # child processes share DB connections which results in errors.
                session.dispose_engine()
                launcher.launch_service(server)  # step 2: call launch_service on the launcher
                service_started = True
    else:
        LOG.error(_LE('Configuration for cinder-volume does not specify '
                      '"enabled_backends". Using DEFAULT section to configure '
                      'drivers is not supported since Ocata.'))

    if not service_started:
        msg = _('No volume service(s) started successfully, terminating.')
        LOG.error(msg)
        sys.exit(1)

    launcher.wait()
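For reference, a hypothetical cinder.conf fragment (backend names and drivers chosen only for illustration) that would make the loop above start two backends. With CONF.host set to, say, node1, the resulting service hosts would appear in `cinder service-list` as node1@lvm-1 and node1@ceph-1, unless a per-backend backend_host option overrides the first part:

[DEFAULT]
enabled_backends = lvm-1,ceph-1

[lvm-1]
volume_driver = cinder.volume.drivers.lvm.LVMVolumeDriver
volume_backend_name = lvm-1

[ceph-1]
volume_driver = cinder.volume.drivers.rbd.RBDDriver
volume_backend_name = ceph-1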

3. A closer look at service.Service.create()

cinder/service.py: a classmethod of class Service(service.Service):

@classmethod
def create(cls, host=None, binary=None, topic=None, manager=None,
           report_interval=None, periodic_interval=None,
           periodic_fuzzy_delay=None, service_name=None,
           coordination=False, cluster=None):
    """Instantiates class and passes back application object.

    :param host: defaults to CONF.host
    :param binary: defaults to basename of executable
    :param topic: defaults to bin_name - 'cinder-' part
    :param manager: defaults to CONF.<topic>_manager
    :param report_interval: defaults to CONF.report_interval
    :param periodic_interval: defaults to CONF.periodic_interval
    :param periodic_fuzzy_delay: defaults to CONF.periodic_fuzzy_delay
    :param cluster: Defaults to None, as only some services will have it
    """
    if not host:
        host = CONF.host
    if not binary:
        binary = os.path.basename(inspect.stack()[-1][1])
    if not topic:
        topic = binary
    if not manager:
        subtopic = topic.rpartition('cinder-')[2]
        manager = CONF.get('%s_manager' % subtopic, None)
    if report_interval is None:
        # Read the state-reporting interval from the configuration file.
        report_interval = CONF.report_interval
    if periodic_interval is None:
        # Read the periodic-task polling interval from the configuration file.
        periodic_interval = CONF.periodic_interval
    if periodic_fuzzy_delay is None:
        periodic_fuzzy_delay = CONF.periodic_fuzzy_delay
    # Instantiate the class to create a service object.
    service_obj = cls(host, binary, topic, manager,
                      report_interval=report_interval,
                      periodic_interval=periodic_interval,
                      periodic_fuzzy_delay=periodic_fuzzy_delay,
                      service_name=service_name,
                      coordination=coordination,
                      cluster=cluster)

    return service_obj
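As a worked example of the defaults above (a standalone illustration, not code taken from Cinder): for the cinder-volume binary the topic falls back to the binary name, and the manager is then looked up from the corresponding *_manager option, which in the Ocata-era code points at cinder.volume.manager.VolumeManager by default:

binary = 'cinder-volume'
topic = binary                              # 'cinder-volume'
subtopic = topic.rpartition('cinder-')[2]   # 'volume'
option = '%s_manager' % subtopic            # 'volume_manager'
print(option)
# CONF.volume_manager defaults to 'cinder.volume.manager.VolumeManager'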

4. A closer look at launcher = service.get_launcher()

cinder/cmd/volume.py:

from cinder import service
launcher = service.get_launcher()

cinder/service.py:

from oslo_service import service

def get_launcher():
    # Note(lpetrut): ProcessLauncher uses green pipes which fail on Windows
    # due to missing support of non-blocking I/O pipes. For this reason, the
    # service must be spawned differently on Windows, using the ServiceLauncher
    # class instead.
    if os.name == 'nt':
        return Launcher()          # on Windows
    else:
        return process_launcher()  # on Linux

def process_launcher():
    return service.ProcessLauncher(CONF)
oslo_service/service.py:

class ProcessLauncher(object):
    """Launch a service with a given number of workers."""

    def __init__(self, conf, wait_interval=0.01, restart_method='reload'):
        """Constructor.

        :param conf: an instance of ConfigOpts
        :param wait_interval: The interval to sleep for between checks
                              of child process exit.
        :param restart_method: If 'reload', calls reload_config_files on
            SIGHUP. If 'mutate', calls mutate_config_files on SIGHUP. Other
            values produce a ValueError.
        """
        self.conf = conf
        conf.register_opts(_options.service_opts)
        self.children = {}
        self.sigcaught = None
        self.running = True
        self.wait_interval = wait_interval
        self.launcher = None
        rfd, self.writepipe = os.pipe()
        self.readpipe = eventlet.greenio.GreenPipe(rfd, 'r')
        self.signal_handler = SignalHandler()
        self.handle_signal()
        self.restart_method = restart_method
        if restart_method not in _LAUNCHER_RESTART_METHODS:
            raise ValueError(_("Invalid restart_method: %s") % restart_method)

    def launch_service(self, service, workers=1):  # one worker (one process) by default
        """Launch a service with a given number of workers.

        :param service: a service to launch, must be an instance of
                        :class:`oslo_service.service.ServiceBase`
        :param workers: a number of processes in which a service
                        will be running
        """
        _check_service_base(service)
        wrap = ServiceWrapper(service, workers)

        # Hide existing objects from the garbage collector, so that most
        # existing pages will remain in shared memory rather than being
        # duplicated between subprocesses in the GC mark-and-sweep. (Requires
        # Python 3.7 or later.)
        if hasattr(gc, 'freeze'):
            gc.freeze()

        LOG.info('Starting %d workers', wrap.workers)
        while self.running and len(wrap.children) < wrap.workers:
            self._start_child(wrap)

    def _start_child(self, wrap):
        if len(wrap.forktimes) > wrap.workers:
            # Limit ourselves to one process a second (over the period of
            # number of workers * 1 second). This will allow workers to
            # start up quickly but ensure we don't fork off children that
            # die instantly too quickly.
            if time.time() - wrap.forktimes[0] < wrap.workers:
                LOG.info('Forking too fast, sleeping')
                time.sleep(1)

            wrap.forktimes.pop(0)

        wrap.forktimes.append(time.time())

        pid = os.fork()
        # After os.fork() the child receives a copy of the parent's data, and
        # both processes continue executing the code that follows. fork()
        # returns 0 in the child process and the child's PID in the parent.
        if pid == 0:
            # The forked child process takes over the service.
            self.launcher = self._child_process(wrap.service)
            while True:
                self._child_process_handle_signal()
                status, signo = self._child_wait_for_exit_or_signal(
                    self.launcher)
                if not _is_sighup_and_daemon(signo):
                    self.launcher.wait()
                    break
                self.launcher.restart()  # restart the service inside this child process

            os._exit(status)

        LOG.debug('Started child %d', pid)

        wrap.children.add(pid)
        self.children[pid] = wrap

        return pid
It follows that the cinder-volume service started for each backend runs as an independent child process; in other words, each launched service lives in its own child process.
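As a minimal standalone illustration of the fork() semantics described above (not Cinder code), the following shows the two return values and how both processes continue past the call:

import os

pid = os.fork()
if pid == 0:
    # Child branch: fork() returned 0 here.
    print('child: my pid is %d, my parent is %d' % (os.getpid(), os.getppid()))
    os._exit(0)          # leave the child without running the parent's cleanup code
else:
    # Parent branch: fork() returned the child's PID.
    print('parent: my pid is %d, forked child %d' % (os.getpid(), pid))
    os.waitpid(pid, 0)   # reap the child so it does not linger as a zombie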
oslo_service/service.py: class Launcher(object):

    def restart(self):
        """Reload config files and restart service.

        :returns: The return value from reload_config_files or
                  mutate_config_files, according to the restart_method.
        """
        if self.restart_method == 'reload':
            self.conf.reload_config_files()
        elif self.restart_method == 'mutate':
            self.conf.mutate_config_files()
        self.services.restart()
oslo_service/service.py: class Services(object):

    def restart(self):
        """Reset services and start them in new threads."""
        self.stop()
        self.done = event.Event()
        for restart_service in self.services:
            restart_service.reset()
            self.tg.add_thread(self.run_service, restart_service, self.done)

    @staticmethod
    def run_service(service, done):
        """Service start wrapper.

        :param service: service to run
        :param done: event to wait on until a shutdown is triggered
        :returns: None
        """
        try:
            # Call the service's start() method. Because cinder's Service class
            # inherits from oslo_service's Service, this ends up in
            # cinder/service.py: Service.start().
            service.start()
        except Exception:
            LOG.exception('Error starting thread.')
            raise SystemExit(1)
        else:
            done.wait()

cinder/service.py: the start() method of class Service:
def start(self):
    version_string = version.version_string()
    LOG.info(_LI('Starting %(topic)s node (version %(version_string)s)'),
             {'topic': self.topic, 'version_string': version_string})
    self.model_disconnected = False

    if self.coordination:
        coordination.COORDINATOR.start()

    # Initialize the volume driver.
    self.manager.init_host(added_to_cluster=self.added_to_cluster,
                           service_id=Service.service_id)

    LOG.debug("Creating RPC server for service %s", self.topic)

    ctxt = context.get_admin_context()
    endpoints = [self.manager]
    endpoints.extend(self.manager.additional_endpoints)
    obj_version_cap = objects.Service.get_minimum_obj_version(ctxt)
    LOG.debug("Pinning object versions for RPC server serializer to %s",
              obj_version_cap)
    serializer = objects_base.CinderObjectSerializer(obj_version_cap)

    target = messaging.Target(topic=self.topic, server=self.host)
    self.rpcserver = rpc.get_server(target, endpoints, serializer)
    self.rpcserver.start()

    # NOTE(dulek): Kids, don't do that at home. We're relying here on
    # oslo.messaging implementation details to keep backward compatibility
    # with pre-Ocata services. This will not matter once we drop
    # compatibility with them.
    if self.topic == constants.VOLUME_TOPIC:
        target = messaging.Target(
            topic='%(topic)s.%(host)s' % {'topic': self.topic,
                                          'host': self.host},
            server=vol_utils.extract_host(self.host, 'host'))
        self.backend_rpcserver = rpc.get_server(target, endpoints,
                                                serializer)
        self.backend_rpcserver.start()

    # TODO(geguileo): In O - Remove the is_svc_upgrading_to_n part
    if self.cluster and not self.is_svc_upgrading_to_n(self.binary):
        LOG.info(_LI('Starting %(topic)s cluster %(cluster)s (version '
                     '%(version)s)'),
                 {'topic': self.topic, 'version': version_string,
                  'cluster': self.cluster})
        target = messaging.Target(
            topic='%s.%s' % (self.topic, self.cluster),
            server=vol_utils.extract_host(self.cluster, 'host'))
        serializer = objects_base.CinderObjectSerializer(obj_version_cap)
        self.cluster_rpcserver = rpc.get_server(target, endpoints,
                                                serializer)
        self.cluster_rpcserver.start()

    self.manager.init_host_with_rpc()

    if self.report_interval:
        # Start the periodic service-state reporting (heartbeat) task, whose
        # result is what the scheduler sees via the service table.
        pulse = loopingcall.FixedIntervalLoopingCall(
            self.report_state)
        pulse.start(interval=self.report_interval,
                    initial_delay=self.report_interval)
        self.timers.append(pulse)

    if self.periodic_interval:
        if self.periodic_fuzzy_delay:
            initial_delay = random.randint(0, self.periodic_fuzzy_delay)
        else:
            initial_delay = None
        # Start the timer that polls the manager's periodic tasks.
        periodic = loopingcall.FixedIntervalLoopingCall(
            self.periodic_tasks)
        periodic.start(interval=self.periodic_interval,
                       initial_delay=initial_delay)
        self.timers.append(periodic)
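Both timers are instances of oslo_service's FixedIntervalLoopingCall. Below is a minimal sketch of how such a timer behaves, assuming oslo_service is installed; report_state here is only a hypothetical stand-in for the real heartbeat method:

from oslo_service import loopingcall

state = {'count': 0}

def report_state():
    # Hypothetical stand-in for cinder's Service.report_state() heartbeat.
    state['count'] += 1
    print('state report #%d' % state['count'])
    if state['count'] >= 3:
        raise loopingcall.LoopingCallDone()  # raising this stops the loop cleanly

pulse = loopingcall.FixedIntervalLoopingCall(report_state)
pulse.start(interval=1, initial_delay=1)     # cinder passes CONF.report_interval here
pulse.wait()                                 # returns once LoopingCallDone is raised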
cinder/volume/manager.py: VolumeManager — analysis of the init_host() process
def init_host(self, added_to_cluster=None, **kwargs):
    """Perform any required initialization."""
    ctxt = context.get_admin_context()
    if not self.driver.supported:
        utils.log_unsupported_driver_warning(self.driver)

        if not self.configuration.enable_unsupported_driver:
            LOG.error(_LE("Unsupported drivers are disabled."
                          " You can re-enable by adding "
                          "enable_unsupported_driver=True to the "
                          "driver section in cinder.conf"),
                      resource={'type': 'driver',
                                'id': self.__class__.__name__})
            return

    # If we have just added this host to a cluster we have to include all
    # our resources in that cluster.
    if added_to_cluster:
        self._include_resources_in_cluster(ctxt)

    LOG.info(_LI("Starting volume driver %(driver_name)s (%(version)s)"),
             {'driver_name': self.driver.__class__.__name__,
              'version': self.driver.get_version()})
    try:
        # The backend-specific initialization lives in do_setup(). If it
        # fails, the service startup itself does not fail and the other
        # backends in a multi-backend deployment are unaffected; init_host()
        # simply returns below and this driver stays uninitialized.
        self.driver.do_setup(ctxt)
        self.driver.check_for_setup_error()
    except Exception:
        LOG.exception(_LE("Failed to initialize driver."),
                      resource={'type': 'driver',
                                'id': self.__class__.__name__})
        # we don't want to continue since we failed
        # to initialize the driver correctly.
        return

    # Initialize backend capabilities list
    self.driver.init_capabilities()

    volumes = self._get_my_volumes(ctxt)
    snapshots = self._get_my_snapshots(ctxt)
    self._sync_provider_info(ctxt, volumes, snapshots)
    # FIXME volume count for exporting is wrong

    self.stats['pools'] = {}
    self.stats.update({'allocated_capacity_gb': 0})

    try:
        for volume in volumes:
            # available volume should also be counted into allocated
            if volume['status'] in ['in-use', 'available']:
                # calculate allocated capacity for driver
                self._count_allocated_capacity(ctxt, volume)

                try:
                    if volume['status'] in ['in-use']:
                        self.driver.ensure_export(ctxt, volume)
                except Exception:
                    LOG.exception(_LE("Failed to re-export volume, "
                                      "setting to ERROR."),
                                  resource=volume)
                    volume.conditional_update({'status': 'error'},
                                              {'status': 'in-use'})
        # All other cleanups are processed by parent class CleanableManager
    except Exception:
        LOG.exception(_LE("Error during re-export on driver init."),
                      resource=volume)
        return

    self.driver.set_throttle()

    # At this point the driver is considered initialized: set_initialized()
    # sets the driver's `initialized` attribute to True, marking that the
    # driver started up and completed its initialization successfully.
    # NOTE(jdg): Careful though because that doesn't mean
    # that an entry exists in the service table
    self.driver.set_initialized()

    # Keep the image tmp file clean when init host.
    backend_name = vol_utils.extract_host(self.service_topic_queue)
    image_utils.cleanup_temporary_file(backend_name)

    # collect and publish service capabilities
    self.publish_service_capabilities(ctxt)
    LOG.info(_LI("Driver initialization completed successfully."),
             resource={'type': 'driver',
                       'id': self.driver.__class__.__name__})

    # Make sure to call CleanableManager to do the cleanup
    super(VolumeManager, self).init_host(added_to_cluster=added_to_cluster,
                                         **kwargs)
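To summarize which driver hooks init_host() exercises, here is a minimal, hypothetical driver skeleton. A real driver inherits from cinder.volume.driver.VolumeDriver and implements many more methods; this sketch only lists the entry points touched during startup:

class ExampleVolumeDriver(object):
    """Hypothetical skeleton showing the hooks called by init_host()."""

    VERSION = '1.0.0'

    def do_setup(self, context):
        # Connect to the storage backend here (sessions, REST clients, pools).
        # If this raises, init_host() logs the error and returns; the
        # cinder-volume process keeps running, but this backend stays
        # uninitialized.
        pass

    def check_for_setup_error(self):
        # Verify that do_setup() left the backend in a usable state.
        pass

    def get_version(self):
        return self.VERSION

    def init_capabilities(self):
        # Build the capabilities later reported via publish_service_capabilities().
        pass

    def ensure_export(self, context, volume):
        # Re-create the export for an in-use volume after a service restart.
        pass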
