聚类 K-means 案例

注:本案例为黑马的课堂案例,上传仅为方便查看

# 1.获取数据
# 2.数据基本处理
# 2.1 合并表格
# 2.2 交叉表合并
# 2.3 数据截取
# 3.特征工程 — pca
# 4.机器学习(k-means)
# 5.模型评估
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# 1. Load the data
# Four Instacart CSV tables: order line items, product catalogue, orders, aisles.
order_product = pd.read_csv("./data/instacart/order_products__prior.csv")
products = pd.read_csv("./data/instacart/products.csv")
orders = pd.read_csv("./data/instacart/orders.csv")
aisles = pd.read_csv("./data/instacart/aisles.csv")
# 2. Basic data processing
# 2.1 Merge the tables
# Each join uses a single key column; listing the same column twice in `on`
# only forces a redundant multi-key join without changing the result.
table1 = pd.merge(order_product, products, on="product_id")
table2 = pd.merge(table1, orders, on="order_id")
table = pd.merge(table2, aisles, on="aisle_id")
table.shape
(32434489, 14)
table.head()
order_id product_id add_to_cart_order reordered product_name aisle_id department_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order aisle
0 2 33120 1 1 Organic Egg Whites 86 16 202279 prior 3 5 9 8.0 eggs
1 26 33120 5 0 Organic Egg Whites 86 16 153404 prior 2 0 16 7.0 eggs
2 120 33120 13 0 Organic Egg Whites 86 16 23750 prior 11 6 8 10.0 eggs
3 327 33120 5 1 Organic Egg Whites 86 16 58707 prior 21 6 9 8.0 eggs
4 390 33120 28 1 Organic Egg Whites 86 16 166654 prior 48 0 12 9.0 eggs
# 2.2 Build the cross table
# Rows are user_id values, columns are aisle names, and each cell counts how
# many merged rows pair that user with that aisle.
data = pd.crosstab(index=table["user_id"], columns=table["aisle"])
data.head()
aisle air fresheners candles asian foods baby accessories baby bath body care baby food formula bakery desserts baking ingredients baking supplies decor beauty beers coolers ... spreads tea tofu meat alternatives tortillas flat bread trail mix snack mix trash bags liners vitamins supplements water seltzer sparkling water white wines yogurt
user_id
1 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 1
2 0 3 0 0 0 0 2 0 0 0 ... 3 1 1 0 0 0 0 2 0 42
3 0 0 0 0 0 0 0 0 0 0 ... 4 1 0 0 0 0 0 2 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 1 0 0
5 0 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 3

5 rows × 134 columns

data.shape
(206209, 134)
# 2.3 Truncate the data
# Keep only the first 1000 rows (by position) of the cross table.
new_data = data.iloc[:1000]
new_data.shape
(1000, 134)
# 3. Feature engineering - PCA
# A float n_components in (0, 1) asks PCA to keep the smallest number of
# components whose cumulative explained variance reaches that fraction (90% here).
transfer = PCA(n_components=0.9)
# Fit on the truncated cross table and project it onto the retained components.
trans_data = transfer.fit_transform(new_data)
trans_data.shape
(1000, 22)
trans_data
array([[-2.27452872e+01, -7.32942365e-01, -2.48945893e+00, ...,
        -4.78491473e+00, -3.10742945e+00, -2.45192316e+00],
       [ 5.28638801e+00, -3.00176267e+01, -1.11226906e+00, ...,
         9.24145693e+00, -3.11309382e+00,  2.20144174e+00],
       [-6.52593099e+00, -3.87333123e+00, -9.23859508e+00, ...,
        -1.33929081e+00,  1.25062993e+00,  6.12717485e-01],
       ...,
       [ 1.31226615e+01, -2.77296885e+01, -4.62403246e+00, ...,
         7.40793534e+00,  1.03829352e+00, -1.39058393e+01],
       [ 1.64905900e+02, -8.54916188e+01,  1.90577481e-02, ...,
        -5.62014943e+00, -1.38488891e+01, -7.11424774e+00],
       [-1.60244724e+00,  1.82037661e+00,  8.55756408e+00, ...,
         3.69860152e+00,  2.82248188e+00, -3.79491023e+00]])
# 4. Machine learning (k-means)
# random_state fixes the centroid initialisation so the cluster labels and the
# score below are reproducible between runs; n_init is set explicitly because
# its default differs between scikit-learn releases.
estimator = KMeans(n_clusters=5, n_init=10, random_state=22)
y_pre = estimator.fit_predict(trans_data)
# 5. Model evaluation
# Mean silhouette coefficient over all samples, in [-1, 1]; higher means
# tighter, better-separated clusters.
silhouette_score(trans_data, y_pre)

0.4472179873751538
上一篇:它说你的代码有 Bug「GitHub 热点速览 v.21.44」


下一篇:机器学习常用sklearn库