使用C++实现kmeans并使用Python可视化聚簇结果

2022-07-17 09:14:55

文章目录

C++实现过程

算法流程
数据结构设计
代码:

Python可视化过程
部分截图

注: 本文代码及方法仅供参考，请勿直接使用

C++实现过程

算法流程

kmeans - 点作为数据，cluster是点的聚簇
BEGIN
     选出来 k 个点作为中心点生成聚簇
     循环
          计算点与聚簇的距离
          每个点加入到距离最近的聚簇中
          更新聚簇中心点
          聚簇中心点未变 或 循环次数足够？退出
     输出聚簇
END

数据结构设计

为了设计出更为通用的结构，选择采用OOP面向对象设计，结构比较复杂，尤其是距离计算，求中心这两个函数。想要通用，那么就不能限定距离的计算方法，同理，求中心点的方法也可能是任意的，因此需要作为参数传递给算法。

结构概要

VirtualPoint - 虚拟点类(抽象类)，无数据成员，定义了 == != 两个纯虚函数
Cluster - 聚簇类，数据成员: VirtualPoint的集合 和 中心点(VirtualPoint类型)  
          函数成员: 设置中心 更新中心 清空点...
KmeansAlg - 算法框架，run方法实现了聚类算法，提供必要参数(点之间距离计算，求平均点方法)，无需重写算法即可运行

------------------

NDimenPoint - 多维点类，继承VirtualPoint，用来处理多维数据

首先是两个通用类 - 虚拟点与聚簇，实际使用的时候，继承VirtualPoint类，实现两个运算符之后即可(当然由于avgPoints和calcDis两个函数，可能需要添加其它方法，不过这是另一回事儿了)。

class VirtualPoint {
private:
public:
    VirtualPoint() {}
    virtual ~VirtualPoint() {}
    // 如下的 相等 判断主要在判断中心是否更改时用到
    virtual bool operator==(const VirtualPoint &p) = 0;
    virtual bool operator!=(const VirtualPoint &p) = 0;
    virtual string toString() = 0;
};

typedef shared_ptr<VirtualPoint> sharedVPoint;
typedef sharedVPoint avgPointFunc(const vector<sharedVPoint> &); 
// 聚簇类
class Cluster {
private:
    vector<sharedVPoint> points; // 所有的点
    sharedVPoint centroid; // 中心
    avgPointFunc *avgPoints; // 计算所有点的中心的方法
public:
    Cluster(avgPointFunc avg);
    ~Cluster() {}
    Cluster &setCentroid(sharedVPoint p); // 设置中心
    bool updateCentroid(); // 更新中心，使用 avgPoints 函数更新得到新的中心，并且返回新中心是否与旧中心不同
    void clear(); // 清空点
    void addPoint(sharedVPoint p); // 添加点
    string toString(); 
    // 获取中心与所有的点，输出时用
    sharedVPoint getCentroid(); 
    const vector<sharedVPoint> &getPoints(); 
};

然后是kmeans主要过程类，注意下面的run方法为算法框架，已经实现，因此如果要针对其他数据类型实现kmeans，无需修改该类，而是继承VirtualPoint然后调用该类即可。

// 计算 VirtualPoint 与 Cluster的质心 之间的距离
typedef double calcFunc(const VirtualPoint &, const Cluster &);

class KmeansAlg {
public:
    KmeansAlg() {}
    ~KmeansAlg() {}
    // 生成 k 个 位于 [0, n) 中的随机数, n < 100000000
    static vector<int> randDiffNumbers(int n, int k);
    static vector<Cluster> run(vector<sharedVPoint> data, int k, calcFunc calcDis, avgPointFunc avgPoints, const int maxRuond = 2000);
};

然后是一个继承VirtualPoint的多维点类，能够处理任意维度的点

class NDimenPoint : public VirtualPoint {
private:
    int dimension; // 维度
    vector<double> xs; // x1 x2 x3 ...

public:
    NDimenPoint(const int d);
    NDimenPoint(const int d, vector<double> l);
    NDimenPoint(const NDimenPoint &p); 
    ~NDimenPoint();
    bool operator==(const VirtualPoint &p) override; // 重载，需要 static_cast
    bool operator!=(const VirtualPoint &p) override; // 重载，需要 static_cast
    NDimenPoint operator+(const NDimenPoint &p); // + / 主要用来计算点的平均值
    NDimenPoint operator/(const int n);
    double disTo(const NDimenPoint &p); // 计算到某个点的距离
    string toString() override;
    // 两个静态函数，计算点到聚簇距离 以及 计算点的中心值
    static double calcDisToCluster(const VirtualPoint &p, const Cluster &c);
    static sharedVPoint avgPoints(const vector<sharedVPoint> &points);
};

和多维点类一样，对于其他非点类型的数据，通过继承VirtualPoint，实现必要的函数之后即可调用前述KmeansAlg的run方法从而实现kmeans聚类。

代码:

kmeans_oop.h

#include <algorithm>
#include <cmath>
#include <ctime>
#include <exception>
#include <iostream>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <vector>
using std::cerr;
using std::endl;
using std::make_shared;
using std::pow;
using std::shared_ptr;
using std::sqrt;
using std::string;
using std::stringstream;
using std::to_string;
using std::vector;

/**
 * kmeans - 点作为数据，cluster是点的聚簇
 * BEGIN
 *      选出来 k 个点作为中心点生成聚簇
 *      循环
 *           计算点与聚簇的距离
 *           每个点加入到距离最近的聚簇中
 *           更新聚簇中心点
 *           聚簇中心点未变？退出
 *      输出聚簇
 * END
 * 
 * 数据结构
 * 点 - ==() toString()
 * 聚簇 - 计算中心点() 
 * calcDis(point cluster)
 * kmeans() - 
*/

class VirtualPoint {
private:
public:
    VirtualPoint() {}
    virtual ~VirtualPoint() {}
    virtual bool operator==(const VirtualPoint &p) = 0;
    virtual bool operator!=(const VirtualPoint &p) = 0;
    virtual string toString() = 0;
};

typedef shared_ptr<VirtualPoint> sharedVPoint;
typedef sharedVPoint avgPointFunc(const vector<sharedVPoint> &);

class Cluster {
private:
    vector<sharedVPoint> points;
    sharedVPoint centroid;
    avgPointFunc *avgPoints;

public:
    Cluster(avgPointFunc avg) { avgPoints = avg; }
    ~Cluster() {}
    Cluster &setCentroid(sharedVPoint p) {
        centroid = p;
        points.push_back(p);
        return *this;
    }
    bool updateCentroid() {
        sharedVPoint tmpPoint = avgPoints(points);
        if (tmpPoint == nullptr) return false;
        bool changed;
        if (tmpPoint != nullptr && centroid != nullptr)
            changed = (*tmpPoint) != (*centroid);
        else
            changed = true;
        centroid = tmpPoint;
        return changed;
    }
    void clear() { points.clear(); }
    void addPoint(sharedVPoint p) {
        points.push_back(p);
    }
    string toString() const {
        stringstream ss;
        if (centroid == nullptr || points.size() == 0) return "{}";
        ss << "{\"centroid\": " << centroid->toString() << ",\"points\": [";
        for (int i = 0; i < points.size(); i++) {
            if (i > 0) ss << ", ";
            ss << points[i]->toString();
        }
        ss << "]}";
        return ss.str();
    }
    sharedVPoint getCentroid() const { return centroid; }
    const vector<sharedVPoint> &getPoints() { return points; }
};

// 计算 VirtualPoint 与 Cluster的质心 之间的距离
typedef double calcFunc(const VirtualPoint &, const Cluster &);

class KmeansAlg {
public:
    KmeansAlg() {}
    ~KmeansAlg() {}
    // 生成 k 个 位于 [0, n) 中的随机数, n < 100000000
    static vector<int> randDiffNumbers(int n, int k) {
        const int maxn = 100000000;
        vector<int> res;
        if (n <= 0 || n >= maxn)
            throw std::runtime_error("n is less than zero or greater than maxn(100,000,000)");
        for (int i = 0; i < n; i++)
            res.push_back(i);
        random_shuffle(res.begin(), res.end());
        res.resize(k);
        return res;
    }
    static vector<Cluster> run(vector<sharedVPoint> data, int k, calcFunc calcDis, avgPointFunc avgPoints, const int maxRuond = 2000) {
        if (k <= 1) throw std::runtime_error("k is less than 1");
        vector<Cluster> clusters;
        for (auto &&i : randDiffNumbers(data.size(), k))
            clusters.push_back(Cluster(avgPoints).setCentroid(data[i]));
        for (int round = 0; round < maxRuond; round++) {
            // 清空
            for (auto &&c : clusters) c.clear();
            for (size_t i = 0; i < data.size(); i++) {
                // 计算距离，加入到最近聚簇中
                double minDis = calcDis(*(data[i]), clusters[0]);
                int minIndex = 0;
                for (size_t j = 1; j < clusters.size(); j++) {
                    double tmpDis = calcDis(*(data[i]), clusters[j]);
                    if (tmpDis < minDis) minDis = tmpDis, minIndex = j;
                }
                clusters[minIndex].addPoint(data[i]);
            }
            bool changed = false;
            for (auto &&c : clusters) changed = changed || c.updateCentroid();
            if (!changed) break;

            // cerr << "debug\t\tround: " << round << " ";
            // for (auto &&c : clusters)
            //     if (c.getPoints().size() > 0)
            //         cerr << c.getCentroid()->toString() << ", ";
            // cerr << endl;
        }
        return clusters;
    }
};

kmeans_h

#include "kmeans_oop.h"

using std::cin;
using std::cout;
using std::initializer_list;
using std::runtime_error;

class NDimenPoint : public VirtualPoint {
private:
    int dimension;
    vector<double> xs;

public:
    NDimenPoint(const int d) : dimension(d) { xs.resize(d); }
    NDimenPoint(const int d, vector<double> l) : dimension(d), xs(l){};
    NDimenPoint(const NDimenPoint &p) : dimension(p.dimension), xs(p.xs) {}
    ~NDimenPoint(){};
    bool operator==(const VirtualPoint &p) override {
        auto pp = static_cast<const NDimenPoint &>(p);
        if (dimension != pp.dimension) return false;
        for (size_t i = 0; i < xs.size(); i++)
            if (xs[i] != pp.xs[i]) return false;
        return true;
    }
    bool operator!=(const VirtualPoint &p) override {
        auto pp = static_cast<const NDimenPoint &>(p);
        if (dimension != pp.dimension) return true;
        for (size_t i = 0; i < xs.size(); i++)
            if (xs[i] != pp.xs[i]) return true;
        return false;
    }
    NDimenPoint operator+(const NDimenPoint &p) {
        if (p.dimension != dimension) throw runtime_error("dimension mismatch");
        NDimenPoint res(p.dimension);
        for (size_t i = 0; i < xs.size(); i++) res.xs[i] += p.xs[i];
        return res;
    }
    NDimenPoint operator/(const int n) {
        if (n == 0) throw std::runtime_error("divisor zero error!");
        NDimenPoint res(dimension);
        for (size_t i = 0; i < dimension; i++) {
            res.xs[i] = xs[i] / n;
        }
        return res;
    }
    double disTo(const NDimenPoint &p) {
        double tmp = 0;
        for (size_t i = 0; i < dimension; i++) tmp += pow(xs[i] - p.xs[i], 2);
        return sqrt(tmp);
    }
    string toString() override {
        stringstream ss;
        ss << "[";
        for (size_t i = 0; i < dimension; i++) {
            if (i > 0) ss << ", ";
            ss << xs[i];
        }
        ss << "]";
        return ss.str();
    }

    static double calcDisToCluster(const VirtualPoint &p, const Cluster &c) {
        auto pp = static_cast<const NDimenPoint &>(p);
        auto cp = static_cast<const NDimenPoint &>(*(c.getCentroid()));
        return pp.disTo(cp);
    }
    static sharedVPoint avgPoints(const vector<sharedVPoint> &points) {
        if (points.size() <= 0) return nullptr;
        NDimenPoint resPoint(static_cast<const NDimenPoint &>(*points[0]).dimension);
        for (auto &&p : points)
            resPoint = resPoint + static_cast<const NDimenPoint &>(*p);
        resPoint = resPoint / points.size();
        return make_shared<NDimenPoint>(resPoint);
    };
};
vector<NDimenPoint> geneData(int num, const int dimension, double maxVal = 1000) {
    std::default_random_engine generator(time(NULL));
    std::uniform_real_distribution<double> distribution(0, maxVal);
    vector<NDimenPoint> points;
    for (size_t i = 0; i < num; i++) {
        vector<double> tmpVec;
        for (size_t j = 0; j < dimension; j++) tmpVec.push_back(distribution(generator));
        points.push_back(NDimenPoint(dimension, tmpVec));
    }
    return points;
}
void output(const vector<Cluster> &clusters, const int dimension) {
    cout << "{"
         << "\"dimension\":" << dimension << "," << endl
         << "\"clusters\":[";
    for (int i = 0; i < clusters.size(); i++) {
        if (i > 0) cout << ", ";
        std::cout << clusters[i].toString() << std::endl;
    }
    cout << "]}" << endl;
}
void kmeans_work() {

    const int maxRound = 10000;
    const int pointCnt = 150;
    int dimension = 1;
    int k = 0;

    cerr << "dimension, k: ";
    cin >> dimension >> k;
    vector<sharedVPoint> points;
    for (auto &&p : geneData(pointCnt, dimension))
        points.push_back(make_shared<NDimenPoint>(p));
    auto clusters = KmeansAlg::run(points, k, NDimenPoint::calcDisToCluster, NDimenPoint::avgPoints, maxRound);
    output(clusters, dimension);
}

main.cpp

int main(int argc, char const *argv[]) {
    kmeans_work();
    return 0;
}

Python可视化过程

原本打算使用opengl可视化，但是绘制一个三角形就需要一二百行代码实在难以接受且低效，则选择使用matplotlib实现，支持二维和三维

实现过程的tips:

matplotlib 绘制三维图 - plt.figure().add_subplot(111, projection='3d')
1. 二维参数 - ax.scatter(xs=xs, ys=ys, zs=zs, zdir='z', c=color, marker=marker)
2. 三维参数 - ax.scatter(x=xs, y=ys, c=color, marker=marker)
散点图scatter
1. 可以在一个ax(fig.add_subplot返回值)上多次scatter
2. 每次scatter的时候可以指定一个颜色’#000000’
3. marker - “.”: 点, “,”:像素 , “o”: 圈, “^”: 倒三角, “+”: 加, 参考官方文档

具体实现过程与代码如下

# 运行kmeans算法
# 将结果(JSON化)输出到文件中
# 使用Python读取文件内容
# 使用pyplot可视化

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import json
import random
colors = [
    "#ff0000", "#00ff00", "#0000ff", "#404040", "#ff00ff", "#00ffff", "#C0ff00", "#ffC000", "#ff00C0", "#000070", "#007000", "#700000",
]
def paint(ax, xs, ys, color, zs=None, marker='.'):
    if zs != None:
        ax.scatter(xs=xs, ys=ys, zs=zs, zdir='z', c=color, marker=marker)
    else:
        ax.scatter(x=xs, y=ys, c=color, marker=marker)
def readData():
    random.shuffle(colors)
    data = json.load(open("foo.json", mode="r", encoding="utf-8"))
    dimension = data["dimension"]
    clusters = []
    clusterCnt = 0
    for tmpRawCluster in data["clusters"]:
        tmpCluster = {"centroid": None, "xss": [],
                      "color": colors[clusterCnt % 140]}
        if "centroid" in tmpRawCluster:
            tmpCluster["centroid"] = tmpRawCluster["centroid"]
        for i in range(0, dimension):
            tmpCluster["xss"].append([])
        if "points" in tmpRawCluster:
            for tmpRawPoint in tmpRawCluster["points"]:
                for j in range(0, len(tmpRawPoint)):
                    tmpCluster["xss"][j].append(tmpRawPoint[j])
        clusters.append(tmpCluster)
        clusterCnt += 1
    return {"dimension": dimension, "clusters": clusters}
def work():
    data = readData()
    fig = plt.figure()
    if data["dimension"] == 2:
        ax = fig.add_subplot(111)
        for cluster in data["clusters"]:
            if cluster["centroid"]:
                paint(ax, cluster["xss"][0],
                      cluster["xss"][1], cluster["color"], marker='o')
                paint(ax, [cluster["centroid"][0]], [
                      cluster["centroid"][1]], "#000000", marker='^')
    elif data["dimension"] == 3:
        ax = fig.add_subplot(111, projection='3d')
        for cluster in data["clusters"]:
            paint(ax, cluster["xss"][0], cluster["xss"]
                  [1], cluster["color"], cluster["xss"][2])
    plt.show()
    pass
if __name__ == "__main__":
    work()

部分截图

目前算法效果欠佳，如下效果图仅供参考，后续考虑使用更优化的算法。

EricJeffrey 发布了222 篇原创文章 · 获赞 12 · 访问量 1万+ 私信关注

码农公寓