Output results
Contents of X_train:
[[ 3. 102. 44. ... 30.8 0.4 26. ]
[ 1. 77. 56. ... 33.3 1.251 24. ]
[ 9. 124. 70. ... 35.4 0.282 34. ]
...
[ 0. 57. 60. ... 21.7 0.735 67. ]
[ 1. 105. 58. ... 24.3 0.187 21. ]
[ 8. 179. 72. ... 32.7 0.719 36. ]]
Contents of y_train:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1.
0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1.
1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0.
0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1.
0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1.
0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0.
0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.
0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0.
0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0.
0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.
1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1.
0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1.
1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1.
0. 1. 0. 0. 0. 1. 1. 0. 0. 1.]
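The eight feature columns and 0/1 labels printed above match the layout of the Pima Indians diabetes data, so the following sketch shows one plausible way such X_train / y_train arrays could have been produced. The file name pima-indians-diabetes.csv, the 80/20 split and the random_state value are illustrative assumptions, not details taken from the original article.

# Hypothetical reconstruction of the printed arrays: load a Pima-style CSV
# (8 feature columns + 1 binary outcome column) and split it into train/test sets.
import numpy as np
from sklearn.model_selection import train_test_split

data = np.loadtxt('pima-indians-diabetes.csv', delimiter=',')  # assumed file name
X, y = data[:, :8], data[:, 8]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7)  # assumed split settings

print('Contents of X_train:')
print(X_train)
print('Contents of y_train:')
print(y_train)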
Core code
# class XGBClassifier -- found at: xgboost.sklearn
class XGBClassifier(XGBModel, XGBClassifierBase):
    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
    __doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
        + '\n'.join(XGBModel.__doc__.split('\n')[2:])

    def __init__(self, max_depth=3, learning_rate=0.1,
                 n_estimators=100, silent=True,
                 objective="binary:logistic", booster='gbtree',
                 n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
        super(XGBClassifier, self).__init__(
            max_depth, learning_rate, n_estimators, silent,
            objective, booster, n_jobs, nthread, gamma, min_child_weight, max_delta_step,
            subsample, colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda,
            scale_pos_weight, base_score, random_state, seed, missing, **kwargs)
    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True, xgb_model=None,
            sample_weight_eval_set=None, callbacks=None):
        # pylint: disable = attribute-defined-outside-init,arguments-differ
        """
        Fit gradient boosting classifier

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            Weight for each instance
        eval_set : list, optional
            A list of (X, y) pairs to use as a validation set for
            early-stopping
        sample_weight_eval_set : list, optional
            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
            instance weights on the i-th validation set.
        eval_metric : str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.rst. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            DMatrix object such that you may need to call the get_label
            method. It must return a str, value pair where the str is a name
            for the evaluation and value is the value of the evaluation
            function. This objective is always minimized.
        early_stopping_rounds : int, optional
            Activates early stopping. Validation error needs to decrease at
            least every <early_stopping_rounds> round(s) to continue training.
            Requires at least one item in evals. If there's more than one,
            will use the last. If early stopping occurs, the model will have
            three additional fields: bst.best_score, bst.best_iteration and
            bst.best_ntree_limit (bst.best_ntree_limit is the ntree_limit parameter
            default value in predict method if not any other value is specified).
            (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
            and/or num_class appears in the parameters)
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.
        xgb_model : str
            file name of stored xgb model or 'Booster' instance Xgb model to be
            loaded before training (allows training continuation).
        callbacks : list of callback functions
            List of callback functions that are applied at end of each iteration.
            It is possible to use predefined callbacks by using :ref:`callback_api`.
            Example:

            .. code-block:: python

                [xgb.callback.reset_learning_rate(custom_rates)]
        """
        evals_result = {}
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        xgb_options = self.get_xgb_params()

        if callable(self.objective):
            obj = _objective_decorator(self.objective)
            # Use default value. Is it really not used ?
            xgb_options["objective"] = "binary:logistic"
        else:
            obj = None

        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying XGB instance
            xgb_options["objective"] = "multi:softprob"
            xgb_options['num_class'] = self.n_classes_

        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None:
            if callable(eval_metric):
                eval_metric = None
            else:
                xgb_options.update({"eval_metric": eval_metric})

        self._le = XGBLabelEncoder().fit(y)
        training_labels = self._le.transform(y)

        if eval_set is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [None] * len(eval_set)
            evals = list(
                DMatrix(eval_set[i][0], label=self._le.transform(eval_set[i][1]),
                        missing=self.missing, weight=sample_weight_eval_set[i],
                        nthread=self.n_jobs)
                for i in range(len(eval_set)))
            nevals = len(evals)
            eval_names = ["validation_{}".format(i) for i in range(nevals)]
            evals = list(zip(evals, eval_names))
        else:
            evals = ()

        self._features_count = X.shape[1]

        if sample_weight is not None:
            train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                    missing=self.missing, nthread=self.n_jobs)
        else:
            train_dmatrix = DMatrix(X, label=training_labels,
                                    missing=self.missing, nthread=self.n_jobs)

        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
                              evals=evals,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, obj=obj, feval=feval,
                              verbose_eval=verbose, xgb_model=xgb_model,
                              callbacks=callbacks)

        self.objective = xgb_options["objective"]
        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_score = self._Booster.best_score
            self.best_iteration = self._Booster.best_iteration
            self.best_ntree_limit = self._Booster.best_ntree_limit
        return self
    def predict(self, data, output_margin=False, ntree_limit=None, validate_features=True):
        """
        Predict with `data`.

        .. note:: This function is not thread safe.

            For each booster object, predict can only be called from one thread.
            If you want to run prediction using multiple thread, call ``xgb.copy()`` to make
            copies of model object and then call ``predict()``.

        .. note:: Using ``predict()`` with DART booster

            If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
            some of the trees will be evaluated. This will produce incorrect results if ``data``
            is not the training data. To obtain correct results on test sets, set ``ntree_limit``
            to a nonzero value, e.g.

            .. code-block:: python

                preds = bst.predict(dtest, ntree_limit=num_round)

        Parameters
        ----------
        data : DMatrix
            The dmatrix storing the input.
        output_margin : bool
            Whether to output the raw untransformed margin value.
        ntree_limit : int
            Limit number of trees in the prediction; defaults to best_ntree_limit if defined
            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
        validate_features : bool
            When this is True, validate that the Booster's and data's feature_names are
            identical. Otherwise, it is assumed that the feature_names are the same.

        Returns
        -------
        prediction : numpy array
        """
        test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
        if ntree_limit is None:
            ntree_limit = getattr(self, "best_ntree_limit", 0)
        class_probs = self.get_booster().predict(test_dmatrix,
                                                 output_margin=output_margin,
                                                 ntree_limit=ntree_limit,
                                                 validate_features=validate_features)
        if output_margin:
            # If output_margin is active, simply return the scores
            return class_probs
        if len(class_probs.shape) > 1:
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)
    def predict_proba(self, data, ntree_limit=None, validate_features=True):
        """
        Predict the probability of each `data` example being of a given class.

        .. note:: This function is not thread safe

            For each booster object, predict can only be called from one thread.
            If you want to run prediction using multiple thread, call ``xgb.copy()`` to make
            copies of model object and then call predict

        Parameters
        ----------
        data : DMatrix
            The dmatrix storing the input.
        ntree_limit : int
            Limit number of trees in the prediction; defaults to best_ntree_limit if defined
            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
        validate_features : bool
            When this is True, validate that the Booster's and data's feature_names are
            identical. Otherwise, it is assumed that the feature_names are the same.

        Returns
        -------
        prediction : numpy array
            a numpy array with the probability of each data example being of a given class.
        """
        test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
        if ntree_limit is None:
            ntree_limit = getattr(self, "best_ntree_limit", 0)
        class_probs = self.get_booster().predict(test_dmatrix,
                                                 ntree_limit=ntree_limit,
                                                 validate_features=validate_features)
        if self.objective == "multi:softprob":
            return class_probs
        else:
            classone_probs = class_probs
            classzero_probs = 1.0 - classone_probs
            return np.vstack((classzero_probs, classone_probs)).transpose()
    def evals_result(self):
        """Return the evaluation results.

        If **eval_set** is passed to the `fit` function, you can call
        ``evals_result()`` to get evaluation results for all passed **eval_sets**.
        When **eval_metric** is also passed to the `fit` function, the
        **evals_result** will contain the **eval_metrics** passed to the `fit` function.

        Returns
        -------
        evals_result : dictionary

        Example
        -------

        .. code-block:: python

            param_dist = {'objective':'binary:logistic', 'n_estimators':2}

            clf = xgb.XGBClassifier(**param_dist)

            clf.fit(X_train, y_train,
                    eval_set=[(X_train, y_train), (X_test, y_test)],
                    eval_metric='logloss',
                    verbose=True)

            evals_result = clf.evals_result()

        The variable **evals_result** will contain

        .. code-block:: python

            {'validation_0': {'logloss': ['0.604835', '0.531479']},
             'validation_1': {'logloss': ['0.41965', '0.17686']}}
        """
        if self.evals_result_:
            evals_result = self.evals_result_
        else:
            raise XGBoostError('No results.')
        return evals_result
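For context, here is a minimal usage sketch of the API listed above, reusing the X_train / X_test split from the earlier sketch. The hyper-parameter values, the choice of logloss as eval_metric and the early-stopping setting are illustrative assumptions, not settings from the original article.

# Illustrative usage of XGBClassifier.fit / predict / predict_proba / evals_result.
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100,
                      objective='binary:logistic')  # example parameters only
model.fit(X_train, y_train,
          eval_set=[(X_test, y_test)],
          eval_metric='logloss',
          early_stopping_rounds=10,
          verbose=False)

y_pred = model.predict(X_test)         # hard 0/1 labels via the 0.5 threshold in predict()
y_proba = model.predict_proba(X_test)  # per-class probabilities, shape (n_samples, 2)
print(model.evals_result())            # e.g. {'validation_0': {'logloss': [...]}}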