def slice(self, rindex):
    """
    Create a new Pool containing only the rows selected by rindex.

    Parameters
    ----------
    rindex : list or numpy.ndarray
        Indices of the rows to keep.

    Returns
    -------
    catboost.Pool
        A new pool built from the selected rows of this pool.
    """
    if not isinstance(rindex, ARRAY_TYPES):
        # fixed: the message string was broken across two lines (syntax error)
        raise CatBoostError("Invalid rindex type={} : must be list or numpy.ndarray".format(type(rindex)))
    sliced_pool = Pool(None)
    sliced_pool._take_slice(self, rindex)
    return sliced_pool
def set_pairs(self, pairs):
    """
    Set pairs for the pool.

    Parameters
    ----------
    pairs : list or numpy.ndarray or pandas.DataFrame
        Pairs description; a DataFrame is converted to its underlying values.

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_pairs_type(pairs)
    data = pairs.values if isinstance(pairs, DataFrame) else pairs
    self._check_pairs_value(data)
    self._set_pairs(data)
    return self
def set_feature_names(self, feature_names):
    """
    Set names for the pool's features.

    Parameters
    ----------
    feature_names : sequence
        One name per feature; validated before being applied.

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_feature_names(feature_names)
    self._set_feature_names(feature_names)
    return self
def set_baseline(self, baseline):
    """
    Set baseline (initial prediction) values for the pool.

    Parameters
    ----------
    baseline : list or numpy.ndarray or pandas.Series or pandas.DataFrame
        Baseline values; reshaped to (num_row, -1) before validation.

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_baseline_type(baseline)
    values = self._if_pandas_to_numpy(baseline)
    # one row per object; remaining values spread across columns
    values = np.reshape(values, (self.num_row(), -1))
    self._check_baseline_shape(values, self.num_row())
    self._set_baseline(values)
    return self
def set_weight(self, weight):
    """
    Set per-object weights for the pool.

    Parameters
    ----------
    weight : list or numpy.ndarray or pandas.Series or pandas.DataFrame
        One weight per row; shape is validated against num_row().

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_weight_type(weight)
    values = self._if_pandas_to_numpy(weight)
    self._check_weight_shape(values, self.num_row())
    self._set_weight(values)
    return self
def set_group_id(self, group_id):
    """
    Set per-object group identifiers for the pool.

    Parameters
    ----------
    group_id : list or numpy.ndarray or pandas.Series or pandas.DataFrame
        One group id per row; shape is validated against num_row().

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_group_id_type(group_id)
    values = self._if_pandas_to_numpy(group_id)
    self._check_group_id_shape(values, self.num_row())
    self._set_group_id(values)
    return self
def set_group_weight(self, group_weight):
    """
    Set per-object group weights for the pool.

    Parameters
    ----------
    group_weight : list or numpy.ndarray or pandas.Series or pandas.DataFrame
        One group weight per row; shape is validated against num_row().

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_group_weight_type(group_weight)
    group_weight = self._if_pandas_to_numpy(group_weight)
    # fixed: `self.` and `num_row())` were split across two lines (syntax error)
    self._check_group_weight_shape(group_weight, self.num_row())
    self._set_group_weight(group_weight)
    return self
def set_subgroup_id(self, subgroup_id):
    """
    Set per-object subgroup identifiers for the pool.

    Parameters
    ----------
    subgroup_id : list or numpy.ndarray or pandas.Series or pandas.DataFrame
        One subgroup id per row; shape is validated against num_row().

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_subgroup_id_type(subgroup_id)
    subgroup_id = self._if_pandas_to_numpy(subgroup_id)
    # fixed: `self.` and `num_row())` were split across two lines (syntax error)
    self._check_subgroup_id_shape(subgroup_id, self.num_row())
    self._set_subgroup_id(subgroup_id)
    return self
def set_pairs_weight(self, pairs_weight):
    """
    Set per-pair weights for the pool.

    Parameters
    ----------
    pairs_weight : list or numpy.ndarray or pandas.Series or pandas.DataFrame
        One weight per pair; shape is validated against num_pairs().

    Returns
    -------
    catboost.Pool
        This pool, to allow call chaining.
    """
    self._check_weight_type(pairs_weight)
    values = self._if_pandas_to_numpy(pairs_weight)
    self._check_weight_shape(values, self.num_pairs())
    self._set_pairs_weight(values)
    return self
def save(self, fname):
    """
    Save the quantized pool to a file.

    Parameters
    ----------
    fname : string
        Output file name.

    Raises
    ------
    CatBoostError
        If the pool is not quantized or `fname` is not a string.
    """
    if not self.is_quantized():
        raise CatBoostError('Pool is not quantized')
    if not isinstance(fname, STRING_TYPES):
        # fixed: the message string was broken across two lines (syntax error)
        raise CatBoostError("Invalid fname type={}: must be str().".format(type(fname)))
    self._save(fname)
def quantize(self, ignored_features=None, per_float_feature_quantization=None, border_count=None,
             max_bin=None, feature_border_type=None, sparse_features_conflict_fraction=None,
             nan_mode=None, input_borders=None, task_type=None,
             used_ram_limit=None, random_seed=None, **kwargs):
    """
    Quantize this pool in place.

    Parameters
    ----------
    ignored_features : list, [default=None]
        Indices or names of features that should be excluded when training.
    per_float_feature_quantization : list of strings, [default=None]
        List of float binarization descriptions.
        Format : described in documentation on catboost.ai
        Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
        Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means that
            two first features have 1024 borders.
        Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
            '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] -
            defines more quantization properties for first two features.
    border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
        The number of partitions in numeric features binarization.
        Used in the preliminary calculation.
        range: [1,65535] on CPU, [1,255] on GPU
    max_bin : float, synonym for border_count.
    feature_border_type : string, [default='GreedyLogSum']
        The binarization mode in numeric features binarization.
        Used in the preliminary calculation.
        Possible values:
            - 'Median'
            - 'Uniform'
            - 'UniformAndQuantiles'
            - 'GreedyLogSum'
            - 'MaxLogSum'
            - 'MinEntropy'
    sparse_features_conflict_fraction : float, [default=0.0]
        CPU only. Maximum allowed fraction of conflicting non-default values for
        features in exclusive features bundle.
        Should be a real value in [0, 1) interval.
    nan_mode : string, [default=None]
        Way to process missing values for numeric features.
        Possible values:
            - 'Forbidden' - raises an exception if there is a missing value for a
              numeric feature in a dataset.
            - 'Min' - each missing value will be processed as the minimum numerical value.
            - 'Max' - each missing value will be processed as the maximum numerical value.
        If None, then nan_mode=Min.
    input_borders : string, [default=None]
        input file with borders used in numeric features binarization.
    task_type : string, [default=None]
        The calcer type used to train the model.
        Possible values:
            - 'CPU'
            - 'GPU'
    used_ram_limit : [default=None]
        RAM usage limit for quantization.
    random_seed : int, [default=None]
        The random seed used for data sampling.
        If None, 0 is used.

    Raises
    ------
    CatBoostError
        If the pool is already quantized or unexpected keyword arguments are passed.
    """
    if self.is_quantized():
        raise CatBoostError('Pool is already quantized')
    params = {}
    _process_synonyms(params)
    # max_bin is a synonym for border_count; border_count wins when both are given
    if border_count is None:
        border_count = max_bin
    # fixed: `kwargs.pop` was split from its argument list across two lines,
    # which would have bound the method instead of calling it
    dev_efb_max_buckets = kwargs.pop('dev_efb_max_buckets', None)
    dev_max_subset_size_for_build_borders = kwargs.pop('dev_max_subset_size_for_build_borders', None)
    if kwargs:
        # fixed: the message string was broken across two lines (syntax error)
        raise CatBoostError("got an unexpected keyword arguments: {}".format(kwargs.keys()))
    _update_params_quantize_part(
        params, ignored_features, per_float_feature_quantization, border_count,
        feature_border_type, sparse_features_conflict_fraction, dev_efb_max_buckets,
        nan_mode, input_borders, task_type, used_ram_limit, random_seed,
        dev_max_subset_size_for_build_borders)
    self._quantize(params)
def _if_pandas_to_numpy(self, array):
if isinstance(array, Series):
array = array.values
if isinstance(array, DataFrame):
array = np.transpose(array.values)[0]
return array
def _label_if_pandas_to_numpy(self, label):
if isinstance(label, Series):
label = label.values
if isinstance(label, DataFrame):
label = label.values
return label
def _read(
        self,
        pool_file,
        column_description,
        pairs,
        feature_names_path,
        delimiter,
        has_header,
        ignore_csv_quoting,
        thread_count,
        quantization_params=None):
    """
    Read Pool from file.

    Validates the input paths, delimiter and thread count, normalizes
    None arguments to empty strings, then delegates to the native reader.
    """
    with log_fixup():
        self._check_files(pool_file, column_description, pairs)
        self._check_delimiter(delimiter)
        if column_description is None:
            column_description = ''
        else:
            # fixed: the call was split from its argument list across two
            # lines, which would not have invoked the check
            self._check_column_description_type(column_description)
        if pairs is None:
            pairs = ''
        if feature_names_path is None:
            feature_names_path = ''
        self._check_thread_count(thread_count)
        # delimiter[0]: only the first character of the delimiter is used
        self._read_pool(pool_file, column_description, pairs,
                        feature_names_path, delimiter[0], has_header,
                        ignore_csv_quoting, thread_count, quantization_params)
def _init(
        self,
        data,
        label,
        cat_features,
        text_features,
        embedding_features,
        pairs, weight,
        group_id,
        group_weight,
        subgroup_id,
        pairs_weight,
        baseline,
        feature_names,
        thread_count):
    """
    Initialize Pool from array like data.

    Validates and normalizes every optional component (label, pairs,
    weights, group/subgroup ids, baseline, feature names), converting
    pandas containers to numpy where needed, then delegates to the
    native pool constructor.
    """
    if isinstance(data, DataFrame):
        # take feature names from the DataFrame columns unless given explicitly
        if feature_names is None:
            feature_names = list(data.columns)
    if isinstance(data, Series):
        data = data.values.tolist()
    if isinstance(data, FeaturesData):
        samples_count = data.get_object_count()
        features_count = data.get_feature_count()
    else:
        # a flat sequence is treated as a single-feature column
        if len(np.shape(data)) == 1:
            data = np.expand_dims(data, 1)
        samples_count, features_count = np.shape(data)
    pairs_len = 0
    if label is not None:
        self._check_label_type(label)
        self._check_label_empty(label)
        label = self._label_if_pandas_to_numpy(label)
        if len(np.shape(label)) == 1:
            label = np.expand_dims(label, 1)
        self._check_label_shape(label, samples_count)
    if feature_names is not None:
        self._check_feature_names(feature_names, features_count)
    if cat_features is not None:
        cat_features = _get_features_indices(cat_features, feature_names)
        self._check_string_feature_type(cat_features, 'cat_features')
        self._check_string_feature_value(cat_features, features_count, 'cat_features')
    if text_features is not None:
        text_features = _get_features_indices(text_features, feature_names)
        self._check_string_feature_type(text_features, 'text_features')
        self._check_string_feature_value(text_features, features_count, 'text_features')
    if embedding_features is not None:
        # fixed: the call was split from its argument list across two lines (syntax error)
        embedding_features = _get_features_indices(embedding_features, feature_names)
        self._check_string_feature_type(embedding_features, 'embedding_features')
        self._check_string_feature_value(embedding_features, features_count, 'embedding_features')
    if pairs is not None:
        self._check_pairs_type(pairs)
        if isinstance(pairs, DataFrame):
            pairs = pairs.values
        self._check_pairs_value(pairs)
        pairs_len = np.shape(pairs)[0]
    if weight is not None:
        self._check_weight_type(weight)
        weight = self._if_pandas_to_numpy(weight)
        self._check_weight_shape(weight, samples_count)
    if group_id is not None:
        self._check_group_id_type(group_id)
        group_id = self._if_pandas_to_numpy(group_id)
        self._check_group_id_shape(group_id, samples_count)
    if group_weight is not None:
        self._check_group_weight_type(group_weight)
        # fixed: the call was split from its argument list across two lines (syntax error)
        group_weight = self._if_pandas_to_numpy(group_weight)
        self._check_group_weight_shape(group_weight, samples_count)
    if subgroup_id is not None:
        self._check_subgroup_id_type(subgroup_id)
        subgroup_id = self._if_pandas_to_numpy(subgroup_id)
        self._check_subgroup_id_shape(subgroup_id, samples_count)
    if pairs_weight is not None:
        # pairs weights are validated against the number of pairs, not rows
        self._check_weight_type(pairs_weight)
        pairs_weight = self._if_pandas_to_numpy(pairs_weight)
        self._check_weight_shape(pairs_weight, pairs_len)
    if baseline is not None:
        self._check_baseline_type(baseline)
        baseline = self._if_pandas_to_numpy(baseline)
        baseline = np.reshape(baseline, (samples_count, -1))
        self._check_baseline_shape(baseline, samples_count)
    self._init_pool(data, label, cat_features, text_features,
                    embedding_features, pairs, weight, group_id, group_weight,
                    subgroup_id, pairs_weight, baseline, feature_names,
                    thread_count)