文章目录
import pandas as pd, numpy as np, warnings
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
# Ignore every DeprecationWarning raised after this point in the script.
warnings.filterwarnings("ignore", category=DeprecationWarning)
假数据构造
# Toy frame with two integer columns; 'advert_place' is one-hot encoded below.
iddict = {
    'advert_place': [1, 2, 11, 4, 3],
    'provinced_id': [10, 10, 30, 2, 10],
}
data1 = pd.DataFrame(data=iddict)
data1  # bare expression, presumably kept for notebook-style display

# Toy training frame: feature columns A, B, C plus a 0/1 label column y.
iddict = {
    'A': [1, 2, 11, 4, 3],
    'B': [10, 10, 30, 2, 10],
    'C': [100, 200, 300, 400, 500],
    'y': [1, 0, 0, 0, 1],
}
data2 = pd.DataFrame(data=iddict)
data2  # bare expression, presumably kept for notebook-style display
one-hot
# One-hot encode the single 'advert_place' column. The values are reshaped
# into a one-column 2-D array before being handed to the encoder.
ohecodel = OneHotEncoder()
place_column = data1['advert_place'].to_numpy().reshape(-1, 1)
advert_place = ohecodel.fit_transform(place_column)
print(advert_place)
print(advert_place.shape)
# print(advert_place.toarray())  # convert the sparse matrix to an array
# Feature matrix (columns A, B, C) and label vector (column y) from data2.
feature_columns = ['A', 'B', 'C']
x_train = data2[feature_columns].to_numpy()
y_train = data2['y'].to_numpy()

# NOTE(review): data2 has only 5 rows while min_samples_leaf=50, so the
# trees presumably cannot split at all -- confirm this is intended for the demo.
gbdt = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.743,
    max_depth=3,
    min_samples_leaf=50,
    min_samples_split=5,
    min_impurity_decrease=0.2,
)
gbdt.fit(x_train, y_train.ravel())

# Index of the leaf each sample lands in, per estimator; only index 0 of the
# trailing axis returned by apply() is kept.
x_train_leaves = gbdt.apply(x_train)[:, :, 0]

# One-hot encode the leaf indices (this rebinds `ohecodel` to a new encoder).
ohecodel = OneHotEncoder()
x_train_trans = ohecodel.fit_transform(x_train_leaves)
print(x_train_trans)
print(x_train_trans.shape)
拼接one-hot后的稀疏矩阵
思路一
将两个稀疏矩阵通过 toarray() 或 todense() 转为稠密的 array / matrix,再沿水平方向(按列)拼接
# Variant 1: densify both one-hot results with todense(), then join them
# side by side (along axis 1).
a = x_train_trans.todense()
b = advert_place.todense()
c = np.concatenate((a, b), axis=1)
print(c)
# Variant 2: same concatenation, but densify with toarray() instead.
a = x_train_trans.toarray()
b = advert_place.toarray()
c = np.concatenate((a, b), axis=1)
print(c)
思路二
通过 scipy.sparse 的 hstack 直接拼接稀疏矩阵,无需先转为稠密矩阵
from scipy.sparse import hstack

# Stack the two sparse one-hot blocks horizontally while they are still
# sparse; the result is densified only for printing.
a, b = x_train_trans, advert_place
c = hstack([a, b])
print(c.todense())