ML之catboost:catboost模型中常用的Pool类型数据结构源代码解读、案例应用之详细攻略(二)

 def slice(self, rindex):

       if not isinstance(rindex, ARRAY_TYPES):

           raise CatBoostError("Invalid rindex type={} : must be list

            or numpy.ndarray".format(type(rindex)))

       slicedPool = Pool(None)

       slicedPool._take_slice(self, rindex)

       return slicedPool

   

   def set_pairs(self, pairs):

       self._check_pairs_type(pairs)

       if isinstance(pairs, DataFrame):

           pairs = pairs.values

       self._check_pairs_value(pairs)

       self._set_pairs(pairs)

       return self

   

   def set_feature_names(self, feature_names):

       self._check_feature_names(feature_names)

       self._set_feature_names(feature_names)

       return self

   

   def set_baseline(self, baseline):

       self._check_baseline_type(baseline)

       baseline = self._if_pandas_to_numpy(baseline)

       baseline = np.reshape(baseline, (self.num_row(), -1))

       self._check_baseline_shape(baseline, self.num_row())

       self._set_baseline(baseline)

       return self

   

   def set_weight(self, weight):

       self._check_weight_type(weight)

       weight = self._if_pandas_to_numpy(weight)

       self._check_weight_shape(weight, self.num_row())

       self._set_weight(weight)

       return self

   

   def set_group_id(self, group_id):

       self._check_group_id_type(group_id)

       group_id = self._if_pandas_to_numpy(group_id)

       self._check_group_id_shape(group_id, self.num_row())

       self._set_group_id(group_id)

       return self

   

   def set_group_weight(self, group_weight):

       self._check_group_weight_type(group_weight)

       group_weight = self._if_pandas_to_numpy(group_weight)

       self._check_group_weight_shape(group_weight, self.

        num_row())

       self._set_group_weight(group_weight)

       return self

   

   def set_subgroup_id(self, subgroup_id):

       self._check_subgroup_id_type(subgroup_id)

       subgroup_id = self._if_pandas_to_numpy(subgroup_id)

       self._check_subgroup_id_shape(subgroup_id, self.

        num_row())

       self._set_subgroup_id(subgroup_id)

       return self

   

   def set_pairs_weight(self, pairs_weight):

       self._check_weight_type(pairs_weight)

       pairs_weight = self._if_pandas_to_numpy(pairs_weight)

       self._check_weight_shape(pairs_weight, self.num_pairs())

       self._set_pairs_weight(pairs_weight)

       return self

   

   def save(self, fname):

       """

       Save the quantized pool to a file.        Parameters

       ----------

       fname : string

           Output file name.

       """

       if not self.is_quantized():

           raise CatBoostError('Pool is not quantized')

       if not isinstance(fname, STRING_TYPES):

           raise CatBoostError("Invalid fname type={}: must be

            str().".format(type(fname)))

       self._save(fname)

   def quantize(self, ignored_features=None,

    per_float_feature_quantization=None, border_count=None,

       max_bin=None, feature_border_type=None,

        sparse_features_conflict_fraction=None,

       nan_mode=None, input_borders=None, task_type=None,

        used_ram_limit=None, random_seed=None, **kwargs):

       """

       Quantize this pool

       Parameters

       ----------

       pool : catboost.Pool

           Dataset to quantize.

       ignored_features : list, [default=None]

           Indices or names of features that should be excluded

            when training.

       per_float_feature_quantization : list of strings,

        [default=None]

           List of float binarization descriptions.

           Format : described in documentation on catboost.ai

           Example 1: ['0:1024'] means that feature 0 will have 1024

            borders.

           Example 2: ['0:border_count=1024', '1:

            border_count=1024', ...] means that two first features have

            1024 borders.

           Example 3: ['0:nan_mode=Forbidden,border_count=32,

            border_type=GreedyLogSum',

                       '1:nan_mode=Forbidden,border_count=32,

                        border_type=GreedyLogSum'] - defines more quantization

                        properties for first two features.

       border_count : int, [default = 254 for training on CPU or

        128 for training on GPU]

           The number of partitions in numeric features

            binarization. Used in the preliminary calculation.

           range: [1,65535] on CPU, [1,255] on GPU

       max_bin : float, synonym for border_count.

       feature_border_type : string, [default='GreedyLogSum']

           The binarization mode in numeric features binarization.

            Used in the preliminary calculation.

           Possible values:

               - 'Median'

               - 'Uniform'

               - 'UniformAndQuantiles'

               - 'GreedyLogSum'

               - 'MaxLogSum'

               - 'MinEntropy'

       sparse_features_conflict_fraction : float, [default=0.0]

           CPU only. Maximum allowed fraction of conflicting non-

            default values for features in exclusive features bundle.

           Should be a real value in [0, 1) interval.

       nan_mode : string, [default=None]

           Way to process missing values for numeric features.

           Possible values:

               - 'Forbidden' - raises an exception if there is a missing

                value for a numeric feature in a dataset.

               - 'Min' - each missing value will be processed as the

                minimum numerical value.

               - 'Max' - each missing value will be processed as the

                maximum numerical value.

           If None, then nan_mode=Min.

       input_borders : string, [default=None]

           input file with borders used in numeric features

            binarization.

       task_type : string, [default=None]

           The calcer type used to train the model.

           Possible values:

               - 'CPU'

               - 'GPU'

       used_ram_limit=None

       random_seed : int, [default=None]

           The random seed used for data sampling.

           If None, 0 is used.

       """

       if self.is_quantized():

           raise CatBoostError('Pool is already quantized')

       params = {}

       _process_synonyms(params)

       if border_count is None:

           border_count = max_bin

       dev_efb_max_buckets = kwargs.pop

        ('dev_efb_max_buckets', None)

       dev_max_subset_size_for_build_borders = kwargs.pop

        ('dev_max_subset_size_for_build_borders', None)

       if kwargs:

           raise CatBoostError("got an unexpected keyword

            arguments: {}".format(kwargs.keys()))

       _update_params_quantize_part(params, ignored_features,

        per_float_feature_quantization, border_count,

        feature_border_type, sparse_features_conflict_fraction,

        dev_efb_max_buckets, nan_mode, input_borders, task_type,

        used_ram_limit, random_seed,

        dev_max_subset_size_for_build_borders)

       self._quantize(params)

   

   def _if_pandas_to_numpy(self, array):

       if isinstance(array, Series):

           array = array.values

       if isinstance(array, DataFrame):

           array = np.transpose(array.values)[0]

       return array

   

   def _label_if_pandas_to_numpy(self, label):

       if isinstance(label, Series):

           label = label.values

       if isinstance(label, DataFrame):

           label = label.values

       return label

   

   def _read(

       self,

       pool_file,

       column_description,

       pairs,

       feature_names_path,

       delimiter,

       has_header,

       ignore_csv_quoting,

       thread_count,

       quantization_params=None):

       """

       Read Pool from file.

       """

       with log_fixup():

           self._check_files(pool_file, column_description, pairs)

           self._check_delimiter(delimiter)

           if column_description is None:

               column_description = ''

           else:

               self._check_column_description_type

                (column_description)

           if pairs is None:

               pairs = ''

           if feature_names_path is None:

               feature_names_path = ''

           self._check_thread_count(thread_count)

           self._read_pool(pool_file, column_description, pairs,

            feature_names_path, delimiter[0], has_header,

            ignore_csv_quoting, thread_count, quantization_params)

   

   def _init(

       self,

       data,

       label,

       cat_features,

       text_features,

       embedding_features,

       pairs, weight,

       group_id,

       group_weight,

       subgroup_id,

       pairs_weight,

       baseline,

       feature_names,

       thread_count):

       """

       Initialize Pool from array like data.

       """

       if isinstance(data, DataFrame):

           if feature_names is None:

               feature_names = list(data.columns)

       if isinstance(data, Series):

           data = data.values.tolist()

       if isinstance(data, FeaturesData):

           samples_count = data.get_object_count()

           features_count = data.get_feature_count()

       else:

           if len(np.shape(data)) == 1:

               data = np.expand_dims(data, 1)

           samples_count, features_count = np.shape(data)

       pairs_len = 0

       if label is not None:

           self._check_label_type(label)

           self._check_label_empty(label)

           label = self._label_if_pandas_to_numpy(label)

           if len(np.shape(label)) == 1:

               label = np.expand_dims(label, 1)

           self._check_label_shape(label, samples_count)

       if feature_names is not None:

           self._check_feature_names(feature_names,

            features_count)

       if cat_features is not None:

           cat_features = _get_features_indices(cat_features,

            feature_names)

           self._check_string_feature_type(cat_features,

            'cat_features')

           self._check_string_feature_value(cat_features,

            features_count, 'cat_features')

       if text_features is not None:

           text_features = _get_features_indices(text_features,

            feature_names)

           self._check_string_feature_type(text_features,

            'text_features')

           self._check_string_feature_value(text_features,

            features_count, 'text_features')

       if embedding_features is not None:

           embedding_features = _get_features_indices

            (embedding_features, feature_names)

           self._check_string_feature_type(embedding_features,

            'embedding_features')

           self._check_string_feature_value(embedding_features,

            features_count, 'embedding_features')

       if pairs is not None:

           self._check_pairs_type(pairs)

           if isinstance(pairs, DataFrame):

               pairs = pairs.values

           self._check_pairs_value(pairs)

           pairs_len = np.shape(pairs)[0]

       if weight is not None:

           self._check_weight_type(weight)

           weight = self._if_pandas_to_numpy(weight)

           self._check_weight_shape(weight, samples_count)

       if group_id is not None:

           self._check_group_id_type(group_id)

           group_id = self._if_pandas_to_numpy(group_id)

           self._check_group_id_shape(group_id, samples_count)

       if group_weight is not None:

           self._check_group_weight_type(group_weight)

           group_weight = self._if_pandas_to_numpy

            (group_weight)

           self._check_group_weight_shape(group_weight,

            samples_count)

       if subgroup_id is not None:

           self._check_subgroup_id_type(subgroup_id)

           subgroup_id = self._if_pandas_to_numpy(subgroup_id)

           self._check_subgroup_id_shape(subgroup_id,

            samples_count)

       if pairs_weight is not None:

           self._check_weight_type(pairs_weight)

           pairs_weight = self._if_pandas_to_numpy(pairs_weight)

           self._check_weight_shape(pairs_weight, pairs_len)

       if baseline is not None:

           self._check_baseline_type(baseline)

           baseline = self._if_pandas_to_numpy(baseline)

           baseline = np.reshape(baseline, (samples_count, -1))

           self._check_baseline_shape(baseline, samples_count)

       self._init_pool(data, label, cat_features, text_features,

        embedding_features, pairs, weight, group_id, group_weight,

        subgroup_id, pairs_weight, baseline, feature_names,

        thread_count)



上一篇:make 随笔


下一篇:ML:模型训练评估中常用的两种方法代码实现(留一法一次性切分训练和K折交叉验证训练)