Parallel Computing Lab Report

Implementing the KNN Algorithm with Parallel Computing

0. Environment

The GPU is a GeForce GTX 1050; the full deviceQuery output is listed below:

 

Device 0: "GeForce GTX 1050"
  CUDA Driver Version / Runtime Version          11.1 / 11.1
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 2048 MBytes (2147483648 bytes)
  ( 5) Multiprocessors, (128) CUDA Cores/MP:     640 CUDA Cores
  GPU Max Clock rate:                            1493 MHz (1.49 GHz)
  Memory Clock rate:                             3504 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               zu bytes
  Total amount of shared memory per block:       zu bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          zu bytes
  Texture alignment:                             zu bytes
  Concurrent copy and kernel execution:          Yes with 5 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 11.1, CUDA Runtime Version = 11.1, NumDevs = 1, Device0 = GeForce GTX 1050
Result = PASS

1. Computing Vector Euclidean Distances with OpenMP

Requirements analysis:

We want to solve the following problem: given a D-dimensional vector A[D] and an N x D matrix B[N][D], compute the Euclidean distance from A to every row of B and store the N results in a vector dis[N].
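For reference, the quantity to compute for each row k is the Euclidean distance

$$ dis[k] = \sqrt{\sum_{i=0}^{D-1} \left(B[k][i] - A[i]\right)^2 } $$

The serial code below actually stores the squared distance (it omits the final square root); this does not change which rows are nearest, so it is harmless for KNN.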

Serial code

The implementation is a straightforward double loop: for each row, accumulate the squared differences of the components and store the result.

 

#include<iostream>                                                                                                       
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
 
using namespace std;
 
const int N = 100;
const int D = 8;
const int MAX = 10;
 
void cal_dis(int **train_data, int *test_data, int *dis)
{
    for(int k=0;k<N;k++)
    {
        int sum = 0;
        int temp = 0;
        for(int i=0;i<D;i++)
        {
            temp =  *(*(train_data+k)+i) - test_data[i];
            sum += temp * temp;
        }
        dis[k] = sum;
    }
}
 
void print(int **data)
{
    cout<<"training data:"<<endl;
    for(int i=0;i<N;i++)
    {
        for(int j=0;j<D;j++)
        {
            cout<<*(*(data+i)+j)<<" ";      
        }
        cout<<endl;
    }
}
 
void print(int *data,int n)
{
    for(int i=0;i<n;i++)
    {
        cout<<data[i]<<" ";
    }
    cout<<endl;
}
 
int main()
{
    int **h_train_data , *h_test_data , *distance;
    
    //allocate space in heap for the variable
    h_train_data = new int*[N];
    for(int i=0;i<N;i++)
    {
        h_train_data[i] = new int[D];
    }
    h_test_data = new int[D];
    distance = new int[N];
 
    //initialize training data
    srand( (unsigned)time(NULL) );
    for( int i=0;i<N;i++ )
    {
        for( int j=0;j<D;j++)
        {
            h_train_data[i][j] = rand()%MAX;
        }
    }
    print(h_train_data);
 
    //initialize testing data
    for( int j=0;j<D;j++ )
    {
        h_test_data[j] = rand() % MAX;
    }
    cout<<"testing data:"<<endl;
    print(h_test_data,D);
 
    //calculate the distance
    cal_dis( h_train_data,h_test_data,distance );
 
    cout<<"distance data:"<<endl;
    print(distance,N);
 
    return 0;
}     

Both the space complexity and the time complexity are O(N*D).
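Parallelizing this function is essentially a one-pragma change, because the iterations of the outer loop over k are independent. A minimal sketch (same names and constants as the serial code above; the full benchmark program actually used for the measurements follows):

// Minimal OpenMP sketch of cal_dis: the outer loop over training rows is split
// across threads; sum and temp are local to each iteration, so nothing is shared.
void cal_dis_omp(int **train_data, int *test_data, int *dis)
{
    #pragma omp parallel for
    for (int k = 0; k < N; k++)
    {
        int sum = 0;
        for (int i = 0; i < D; i++)
        {
            int temp = train_data[k][i] - test_data[i];
            sum += temp * temp;
        }
        dis[k] = sum;
    }
}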

OpenMP implementation:

 

#include <omp.h>
//#include "omp.h"
#include <stdio.h>
#include <iostream>
#include <time.h>
#include<fstream>
#include<string.h>
#define N 8
#define TRAIN_NUM 3600
#define TEST_NUM 400
const int rowLen = TRAIN_NUM + TEST_NUM;
const int colLen = N;
typedef float tData;
using namespace::std;
ifstream fin;
class blog4
{
public:
    clock_t parallel(float a[][N], float b[][N], float c[TEST_NUM][TRAIN_NUM]);
    clock_t serial(float a[][N], float b[][N], float c[TEST_NUM][TRAIN_NUM]);
};



clock_t blog4::serial(float a[][N], float b[][N], float c[TEST_NUM][TRAIN_NUM])
{
    int i, j, k;
    clock_t startTime = clock();
    float sum = 0;
    for (i = 0; i < TEST_NUM; i++)
    {
        for (j = 0; j < TRAIN_NUM; j++)
        {
            c[i][j] = 0;
            for (k = 0; k < N; k++)
                c[i][j] += (a[i][k] - b[j][k])* (a[i][k] - b[j][k]);
        }
    }
    clock_t endTime = clock();
    // partial checksum over the first N x N entries of c (sanity check, and it
    // keeps the compiler from optimizing the distance computation away)
    for (i = 0, sum = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            sum += c[i][j];
        }
    }
    std::cout << "serial time: " << endTime - startTime << std::endl;
    cout << "sum:" << sum<<endl;
    return endTime - startTime;
}

clock_t blog4::parallel(float a[][N], float b[][N], float c[TEST_NUM][TRAIN_NUM])
{
    int i, j, k;
    float sum = 0;
    clock_t t1 = clock();
#pragma omp parallel shared(a,b,c) private(i,j,k)
    {
#pragma omp for schedule(dynamic)
        for (i = 0; i < TEST_NUM; i++)
        {
            for (j = 0; j < TRAIN_NUM; j++)
            {
                c[i][j] = 0;
                for (k = 0; k < N; k++)
                    c[i][j] += (a[i][k] - b[j][k]) * (a[i][k] - b[j][k]);

            }
        }
    }
    clock_t t2 = clock();
    // same partial checksum as in the serial version
    for (i = 0, sum = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            sum += c[i][j];
        }
    }
    std::cout << "Parallel time: " << t2 - t1 << std::endl;
    cout << "sum:" << sum<<endl;
    return t2 - t1;
}



int main()
{
    int i, j;
    float trainData[TRAIN_NUM][N], testData[TEST_NUM][N], dis[TEST_NUM][TRAIN_NUM], t;
    srand((unsigned)time(NULL));
    tData in;
    char filename[80]= "C:\\Users\\HP-PC\\source\\repos\\CUDA 11.1 Runtime1\\CUDA 11.1 Runtime1\\input2.txt";
    fin.open(filename);
    if (!fin)
    {
        cout << "can not open the file" << endl;
        exit(0);
    }
    string label;
    for (int i = 0; i < TEST_NUM; i++)
    {
        for (int j = 0; j < N; j++)
            fin >> testData[i][j];
        fin >> label;
    }
    for (int i = 0; i < TRAIN_NUM; i++)
    {
        for (int j = 0; j < N; j++)
            fin >> trainData[i][j];
        fin >> label;
    }
    blog4 test;
    clock_t serialtime = test.serial(trainData,testData ,dis );
    clock_t paralleltime = test.parallel(trainData, testData, dis);
    printf("加速比为:%f s/%f s=%f \n",
        (double(serialtime)/ CLOCKS_PER_SEC), (double(paralleltime)/ CLOCKS_PER_SEC), (float)(serialtime) / paralleltime);
}
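A hedged side note on the timing (not part of the original program): on POSIX systems clock() accumulates CPU time over all threads, which can make a parallel region look no faster than the serial one; MSVC's clock() returns wall-clock time, so the numbers below are wall time. The portable way to time OpenMP code is omp_get_wtime(), roughly:

#include <omp.h>
#include <cstdio>

// Sketch: omp_get_wtime() always returns wall-clock seconds, on every platform.
void timed_example(float* data, int n)
{
    double t0 = omp_get_wtime();
#pragma omp parallel for
    for (int i = 0; i < n; i++)
        data[i] = data[i] * data[i];   // stand-in for the real distance work
    double t1 = omp_get_wtime();
    printf("parallel loop took %f s\n", t1 - t0);
}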

The OpenMP version is straightforward: it only takes adding the pragma to the corresponding for loop. Measured results:

| size          | 900*4 & 100*4 | 900*8 & 100*8 | 3600*8 & 400*8 | 14400*8 & 1600*8 |
|---------------|---------------|---------------|----------------|------------------|
| serial (ms)   | 4             | 3             | 58             | 947              |
| parallel (ms) | 1             | 2             | 15             | 643              |
| speedup       | 2             | 3             | 3.86           | 1.483            |

For the largest arrays the speedup actually drops; could this be a stack-space problem?
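If stack space really is the limit (the matrices in main() are stack arrays, and dis alone is TEST_NUM*TRAIN_NUM floats), one possible fix is to move them to the heap, for example with std::vector, and index the flattened arrays manually. A sketch under that assumption:

#include <vector>

// Heap-allocated storage instead of large stack arrays; trainData[i][j] becomes
// trainData[i * N + j]. Sizes use the same TRAIN_NUM/TEST_NUM/N constants as above.
std::vector<float> trainData((size_t)TRAIN_NUM * N);
std::vector<float> testData((size_t)TEST_NUM * N);
std::vector<float> dis((size_t)TEST_NUM * TRAIN_NUM);   // about 5.5 MB at 3600 x 400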

 

(Figure: program output for the 3600*8 & 400*8 case)

2. Computing Vector Euclidean Distances with CUDA

1. Plain parallelization, no optimization

 

__global__ void cal_dis_origin(tData* train_data, tData* test_data, tData* dis, int pitch, int N, int D)
    {
        int tid = blockIdx.x;   // one single-thread block per training row
        if (tid < N)
        {
            tData temp = 0;
            tData sum = 0;
            for (int i = 0; i < D; i++)
            {
                temp = *((tData*)((char*)train_data + tid * pitch) + i) - test_data[i];
                sum += temp * temp;
            }
            dis[tid] = sum;
        }
    }

void KNN::get_all_distance_parall()
{
    int height = rowLen - test_data_num;
    tData* distance = new tData[height];
    tData* d_train_data, * d_test_data, * d_dis;
    //allocate memory on GPU
    cudaEvent_t startTime_all, endTime_all;
    cudaEventCreate(&startTime_all);
    cudaEventCreate(&endTime_all);
    cudaEventRecord(startTime_all, 0);
    size_t pitch_d;
    size_t pitch_h = colLen * sizeof(tData);
    //allocate memory on GPU
    cudaMallocPitch(&d_train_data, &pitch_d, colLen * sizeof(tData), height);
    cudaMalloc(&d_test_data, colLen * sizeof(tData));
    cudaMalloc(&d_dis, height * sizeof(tData));

    cudaMemset(d_train_data, 0, height * colLen * sizeof(tData));
    cudaMemset(d_test_data, 0, colLen * sizeof(tData));
    cudaMemset(d_dis, 0, height * sizeof(tData));

    //copy training and testing data from host to device
    cudaMemcpy2D(d_train_data, pitch_d, trainingData, pitch_h, colLen * sizeof(tData), height, cudaMemcpyHostToDevice);
    cudaMemcpy(d_test_data, testData, colLen * sizeof(tData), cudaMemcpyHostToDevice);

    cudaEvent_t startTime_cal, endTime_cal;
    cudaEventCreate(&startTime_cal);
    cudaEventCreate(&endTime_cal);
    cudaEventRecord(startTime_cal, 0);

    //calculate the distance

    cal_dis_origin << <height, 1 >> > (d_train_data, d_test_data, d_dis, pitch_d, height, colLen);


    cudaEventRecord(endTime_cal, 0);
    cudaEventSynchronize(startTime_cal);
    cudaEventSynchronize(endTime_cal);
    //copy distance data from device to host
    cudaMemcpy(distance, d_dis, height * sizeof(tData), cudaMemcpyDeviceToHost);
    cudaEventRecord(endTime_all, 0);
    cudaEventSynchronize(startTime_all);
    cudaEventSynchronize(endTime_all);
    float cal_time, all_time;
    cudaEventElapsedTime(&cal_time, startTime_cal, endTime_cal);
    cudaEventElapsedTime(&all_time, startTime_all, endTime_all);
    this->sum_cal_parall += cal_time ;
    this->sum_all_parall += all_time ;
    cudaEventDestroy(startTime_cal);
    cudaEventDestroy(startTime_all);
    cudaEventDestroy(endTime_cal);
    cudaEventDestroy(endTime_all);

    int i;
    for (i = 0; i < height; i++)
    {
        map_index_dis[i + test_data_num] = distance[i];
    }
    cudaFree(d_test_data);
    cudaFree(d_train_data);
    cudaFree(d_dis);
    delete[] distance;   // allocated with new[], so delete[] (free() here is undefined behavior)
}
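A hedged aside (none of this is in the original code): the CUDA calls above are never checked for errors, so a failed allocation or launch goes unnoticed. A small checking macro built on the standard cudaGetLastError/cudaGetErrorString runtime calls might look like this (the CHECK name is our own):

#include <cstdio>
#include <cuda_runtime.h>

// Wrap a runtime call and report (but do not abort on) any error it returns.
#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
        }                                                                  \
    } while (0)

// Typical use after the launch in get_all_distance_parall:
//   cal_dis_origin<<<height, 1>>>(d_train_data, d_test_data, d_dis, pitch_d, height, colLen);
//   CHECK(cudaGetLastError());        // catches bad launch configurations
//   CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs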

As shown above, the kernel is launched with one block per training row and a single thread per block (the block count equals the number of training rows). Because global memory is accessed repeatedly and each thread reads a contiguous stretch of it on its own (so accesses are not coalesced across threads), efficiency is low; with small data sizes it can even be slower than the serial version.
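A hedged first improvement, not taken from the report's code: keep the same per-row computation but launch many threads per block, so one thread handles one training row and each block is fully occupied. Roughly:

// Same logic as cal_dis_origin, but indexed by the global thread id instead of the
// block id; the launch below assumes 256 threads per block (an arbitrary choice).
__global__ void cal_dis_flat(tData* train_data, tData* test_data, tData* dis,
                             int pitch, int N, int D)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;   // one thread per training row
    if (tid < N)
    {
        tData sum = 0;
        for (int i = 0; i < D; i++)
        {
            tData temp = *((tData*)((char*)train_data + tid * pitch) + i) - test_data[i];
            sum += temp * temp;
        }
        dis[tid] = sum;
    }
}

// launch: cal_dis_flat<<<(height + 255) / 256, 256>>>(d_train_data, d_test_data,
//                                                     d_dis, pitch_d, height, colLen);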

2. Using shared memory for the intermediate matrix with an intra-block reduction

 

void KNN::get_all_distance_2() {
    int height = rowLen - test_data_num;
    // store the result matrix contiguously so the device-to-host copy below lands in valid memory
    tData* distance = new tData[(size_t)test_data_num * height];
    tData* d_train_data, * d_test_data, * d_dis;
    dim3 blockdim(8, 1, 1), threaddim(TILE_WIDTH, DIM_NUM, 1);   // dim3 components must be at least 1
    cudaMalloc((void**)&d_test_data, 513 *colLen * sizeof(tData));
    cudaMalloc((void**)&d_dis, test_data_num* height * sizeof(tData));
    cudaMalloc((void**)&d_train_data, colLen * 3713 * sizeof(tData));
    cudaMemset(d_dis, 0, test_data_num * height * sizeof(tData));
    cudaMemset(d_test_data, 0, 513 * colLen * sizeof(tData));
    cudaMemset(d_train_data, 0, colLen * 3713 * sizeof(tData));
    cudaMemcpy(d_train_data, trainingData, height * colLen * sizeof(tData), cudaMemcpyHostToDevice);
    cudaMemcpy(d_test_data, testdata2, test_data_num *colLen * sizeof(tData), cudaMemcpyHostToDevice);



    cal_dis2 << <blockdim, threaddim >> > (d_train_data, d_test_data, d_dis);


    cudaMemcpy(distance, d_dis, test_data_num * height * sizeof(tData), cudaMemcpyDeviceToHost);


    float cal_time, all_time;

    double sum = 0;
    for (int i = 0; i < test_data_num; i++)
    {
        for (int j = 0; j < height; j++)
        {
            sum += distance[(size_t)i * height + j];
        }
    }

    cout << "sum=" << sum<<endl;
    cudaFree(d_test_data);
    cudaFree(d_train_data);
    cudaFree(d_dis);
    delete[] distance;
}
//global function

__global__ void cal_dis(tData* train_data, tData* test_data, tData* dis)
{
    __shared__ tData tmp[THREAD_NUM / DIM_NUM][DIM_NUM]; // shared tile: the block's threads each fill one element
    int TmpIndexX, TmpIndexY, i, n;
    tData tempx;
    n = TRAIN_NUM * DIM_NUM;
    const int tidx = threadIdx.x;
    const int bidx = blockIdx.x;
    int tid = bidx * THREAD_NUM + tidx;
    const int footstep = THREAD_NUM * BLOCK_NUM;
    int fstepn = 0;
    int RightSwift = 3;
    while (tid < n) {
        TmpIndexX = (tid & (THREAD_NUM - 1)) >> RightSwift;// X=(tid%thread_num)/dim
        TmpIndexY = tid & (DIM_NUM - 1);
        tempx = train_data[tid] - test_data[tidx & (DIM_NUM - 1)];
        tmp[TmpIndexX][TmpIndexY] = tempx * tempx;
        __syncthreads();
        tData sum;
        if (tidx < THREAD_NUM / DIM_NUM) {
            for (i = 0, sum = 0; i < DIM_NUM; i++)
            {
                /*sum += tmp[tidx][(i+tidx)&(DIM_NUM - 1)];*/
                sum += tmp[tidx][i];
            }
            dis[fstepn * THREAD_NUM / DIM_NUM * BLOCK_NUM + bidx * THREAD_NUM / DIM_NUM + tidx] = sum;
        }
        __syncthreads();
        tid += footstep;
        fstepn++;
    }
}

//Parallel calculate the distance
void KNN::get_all_distance()
{
    int height = rowLen - test_data_num;
    tData* distance = new tData[height];
    tData* d_train_data, * d_test_data, * d_dis;
    //allocate memory on GPU
    cudaEvent_t startTime_all, endTime_all;
    cudaEventCreate(&startTime_all);
    cudaEventCreate(&endTime_all);
    cudaEventRecord(startTime_all, 0);

    cudaMalloc((void**)&d_test_data, colLen * sizeof(tData));
    cudaMalloc((void**)&d_dis, height * sizeof(tData));
    cudaMalloc((void**)&d_train_data, colLen * height * sizeof(tData));

    cudaMemset(d_train_data, 0, height * colLen * sizeof(tData));
    cudaMemset(d_test_data, 0, colLen * sizeof(tData));
    cudaMemset(d_dis, 0, height * sizeof(tData));

    //copy training and testing data from host to device
    cudaMemcpy(d_train_data, trainingData, height * colLen * sizeof(tData), cudaMemcpyHostToDevice);
    cudaMemcpy(d_test_data, testData, colLen * sizeof(tData), cudaMemcpyHostToDevice);
    //calculate the distance

    cudaEvent_t startTime_cal, endTime_cal;
    cudaEventCreate(&startTime_cal);
    cudaEventCreate(&endTime_cal);
    cudaEventRecord(startTime_cal, 0);


    cal_dis << <BLOCK_NUM, THREAD_NUM >> > (d_train_data, d_test_data, d_dis);

    cudaEventRecord(endTime_cal, 0);
    cudaEventSynchronize(startTime_cal);
    cudaEventSynchronize(endTime_cal);
    //copy distance data from device to host
    cudaMemcpy(distance, d_dis, height * sizeof(tData), cudaMemcpyDeviceToHost);
    cudaEventRecord(endTime_all, 0);
    cudaEventSynchronize(startTime_all);
    cudaEventSynchronize(endTime_all);
    float cal_time, all_time;
    cudaEventElapsedTime(&cal_time, startTime_cal, endTime_cal);
    cudaEventElapsedTime(&all_time, startTime_all, endTime_all);
    this->sum_cal_time += cal_time;
    this->sum_all_time += all_time ;
    cudaEventDestroy(startTime_cal);
    cudaEventDestroy(startTime_all);
    cudaEventDestroy(endTime_cal);
    cudaEventDestroy(endTime_all);

    int i;
    for (i = 0; i < height; i++)
    {
        map_index_dis[i + test_data_num] = distance[i];
    }
    cudaFree(d_test_data);
    cudaFree(d_train_data);
    cudaFree(d_dis);
    delete[] distance;
}

The idea here is as follows. The test_data matrix has test_num rows of dimension 8 (I fix the vectors to be 8-dimensional) and the train_data matrix has train_num rows of dimension 8. When the distance from every test vector to every train vector is computed naively, the two 8-dimensional vectors are fetched from global memory again for every pair, i.e. on the order of test_num*train_num redundant loads, which adds a lot of latency. Instead, we can tile the computation: each block loads one tile of train_data and test_data into shared memory, and the threads of that block do the arithmetic on that tile, which cuts the number of global loads by a factor of width (the tile size). When summing the 8 dimensions of each distance, a reduction is used, which is more efficient; a rough sketch follows.
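A rough sketch of that per-row reduction (our own illustration, not the exact kernel above): row and lane stand for tidx / DIM_NUM and tidx % DIM_NUM, DIM_NUM is assumed to be a power of two, and every thread of the block must call the function because of the barriers inside it.

// Tree reduction over one row of the shared tile declared in cal_dis. Afterwards
// tmp[row][0] holds the sum of that row's DIM_NUM squared differences.
__device__ void row_reduce(tData tmp[][DIM_NUM], int row, int lane)
{
    for (int stride = DIM_NUM / 2; stride > 0; stride >>= 1)
    {
        if (lane < stride)
            tmp[row][lane] += tmp[row][lane + stride];
        __syncthreads();   // every thread of the block must reach this barrier
    }
}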

3. Parallel Sorting with CUDA

CPU sort: call the C++ library function sort()

 

sort(vec_index_dis.begin(), vec_index_dis.end(), CmpByValue());

GPU sort: merge sort

Merge sort decomposes naturally, so it suits parallel execution well; the drawback of a conventional merge sort is that only a few threads remain active during the final merge steps. The approach here is therefore to first sort the data along each column, and then merge all columns together at once rather than pairwise.
The per-column sort is a binary (0/1) bucket (radix) sort, shown below.
Radix sort

 

__device__ void radix_sort2(unsigned int * const sort_tmp, 
                            unsigned int * const sort_tmp_1,
                            const unsigned int tid) // per-list binary radix sort
{
    for(unsigned int bit_mask = 1; bit_mask > 0; bit_mask <<= 1)    // loop over all 32 bits
    {
        unsigned int base_cnt_0 = 0;
        unsigned int base_cnt_1 = 0;

        for (unsigned int i = 0; i < NUM_ELEMENT; i+=NUM_LISTS) 
        {
            if(sort_tmp[i+tid] & bit_mask)  //该位是1,放到sort_tmp_1中
            {
                sort_tmp_1[base_cnt_1+tid] = sort_tmp[i+tid];
                base_cnt_1 += NUM_LISTS;
            }
            else    // bit is 0: keep at the front of sort_tmp
            {
                sort_tmp[base_cnt_0+tid] = sort_tmp[i+tid];
                base_cnt_0 += NUM_LISTS;
            }
        }

        for (unsigned int i = 0; i < base_cnt_1; i+=NUM_LISTS)  //将sort_tmp_1的数据放到sort_tmp后面
        {
            sort_tmp[base_cnt_0+i+tid] = sort_tmp_1[i+tid];
        }
        __syncthreads();
    }
}
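To make the 0/1 radix pass easier to follow, here is a single-list CPU version of the same idea (our own illustration, plain C++): for each bit, elements with a 0 in that bit are compacted to the front in their original order, and elements with a 1 are appended after them, which keeps the sort stable.

#include <cstdint>
#include <vector>

// Stable least-significant-bit binary radix sort of one list.
void radix_sort_cpu(std::vector<uint32_t>& a)
{
    std::vector<uint32_t> ones;
    ones.reserve(a.size());
    for (uint32_t bit_mask = 1; bit_mask != 0; bit_mask <<= 1)   // all 32 bits
    {
        size_t zero_cnt = 0;
        ones.clear();
        for (size_t i = 0; i < a.size(); ++i)
        {
            if (a[i] & bit_mask) ones.push_back(a[i]);   // bit is 1: set aside
            else                 a[zero_cnt++] = a[i];   // bit is 0: keep at the front
        }
        for (size_t i = 0; i < ones.size(); ++i)         // append the 1-bucket
            a[zero_cnt + i] = ones[i];
    }
}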

After the radix sort the data is sorted within each column; the columns now have to be merged. Five approaches are compared in total (the CPU library sort plus four GPU merge strategies); the code and measured performance are below.

1. Single-thread merge

Single-thread merge means one GPU thread performs the whole merge. Since it does not exploit multithreading at all, it is slow.

 

__device__ void merge_1(unsigned int * const srcData, 
                        unsigned int * const dstData,
                        const unsigned int tid) //单线程合并
{
    __shared__ unsigned int list_index[NUM_LISTS];  // without __shared__ the array would live in per-thread local memory (backed by global memory), which is slow
    list_index[tid] = tid;    // initialize with all threads in parallel
    __syncthreads();

    if(tid == 0)    // a single thread does the merge
    {
        for(int i = 0; i < NUM_ELEMENT; i++)    //执行NUM_ELEMENT次
        {
            unsigned int min_val = 0xFFFFFFFF;
            unsigned int min_idx = 0;
            for(int j = 0; j < NUM_LISTS; j++)  //遍历每个list的头指针
            {
                if(list_index[j] >= NUM_ELEMENT)    //列表已经走完则跳过
                    continue;
                if(srcData[list_index[j]] < min_val)    
                {
                    min_val = srcData[list_index[j]];
                    min_idx = j;
                }
            }
            list_index[min_idx] += NUM_LISTS;   // advance the pointer of the list that held the minimum
            dstData[i] = min_val;
        }
    }
}

2. Multi-thread merge

The idea is straightforward: every thread participates, and atomicMin() is used to find the minimum.

 

__device__ void merge_atomicMin(unsigned int* const srcData,
    unsigned int* const dstData,
    const unsigned int tid)   // multi-thread merge
{
    unsigned int self_index = tid;

    for (int i = 0; i < NUM_ELEMENT; i++)
    {
        __shared__ unsigned int min_val;
        unsigned int self_data = 0xFFFFFFFF;

        if (self_index < NUM_ELEMENT)
        {
            self_data = srcData[self_index];
        }

        if (tid == 0)               // reset the shared minimum before this round
        {
            min_val = 0xFFFFFFFF;
        }

        __syncthreads();

        atomicMin(&min_val, self_data);

        __syncthreads();            // wait until every thread's atomicMin is visible

        if (min_val == self_data && self_data != 0xFFFFFFFF)
        {
            self_index += NUM_LISTS;
            dstData[i] = min_val;
        }

        __syncthreads();            // keep the next round from resetting min_val too early
    }
}

3. Pairwise reduction merge

 

__device__ void merge_two(unsigned int * const srcData, 
                        unsigned int * dstData, 
                        const unsigned int tid) // pairwise reduction merge
{
    unsigned int self_index = tid;
    __shared__ unsigned int data[NUM_LISTS];
    __shared__ unsigned int tid_max;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        data[tid] = 0xFFFFFFFF;
        
        if(self_index < NUM_ELEMENT)
        {
            data[tid] = srcData[self_index];
        }

        if(tid == 0)
        {
            tid_max = NUM_LISTS >> 1;
        }

        __syncthreads();

        while(tid_max > 0)
        {
            if(tid < tid_max)
            {
                if(data[tid] > data[tid + tid_max])   //小的换到前半段
                {
                    data[tid] = data[tid + tid_max];
                }
            }
            __syncthreads();
            if(tid == 0)    // not sure why the book does not restrict updates of this shared variable to a single thread; wouldn't that cause problems?
            {
                tid_max >>= 1;
            }
            __syncthreads();
        }

        if(self_index < NUM_ELEMENT && srcData[self_index] == data[0])
        {
            dstData[i] = data[0];
            self_index += NUM_LISTS;
        }
        __syncthreads();    // finish this round before data[] is reset for the next one
    }
}

4. Block-wise (two-level) reduction

Toward the end, pairwise reduction also suffers from having too few active threads, so a two-level reduction is used instead: X threads first reduce to Y partial values, and those Y values are then reduced to a single one, with X*Y = NUM_LISTS. For example, with NUM_LISTS = 128 and REDUCTION_SIZE = 8, the 128 candidates are first reduced to 16 partial minima, which are then reduced to one.

 

#define REDUCTION_SIZE  8
#define REDUCTION_SHIFT 3

__device__ void merge_final(unsigned int * const srcData, 
                            unsigned int * const dstData, 
                            const unsigned int tid) // two-level (block-wise) reduction merge
{
    __shared__ unsigned int min_val_reduction[NUM_LISTS/REDUCTION_SIZE];
    unsigned int s_tid = tid >> REDUCTION_SHIFT;
    unsigned int self_index = tid;
    __shared__ unsigned int min_val;

    for(int i = 0; i < NUM_ELEMENT; i++)
    {
        unsigned int self_data = 0xFFFFFFFF;

        if(self_index < NUM_ELEMENT)
        {
            self_data = srcData[self_index];
        }

        if(tid < NUM_LISTS/REDUCTION_SIZE)
        {
            min_val_reduction[tid] = 0xFFFFFFFF;
        }

        __syncthreads();

        atomicMin(&(min_val_reduction[s_tid]), self_data);  //分块归约

        __syncthreads();

        if(tid == 0)
        {
            min_val = 0xFFFFFFFF;
        }

        __syncthreads();

        if(tid < NUM_LISTS/REDUCTION_SIZE)
        {
            atomicMin(&min_val, min_val_reduction[tid]);    //归约起来的值再归约
        }

        __syncthreads();

        if(min_val == self_data)
        {
            dstData[i] = min_val;
            self_index += NUM_LISTS;
            min_val = 0xFFFFFFFF;
        }

    }
}

Test results:

| method (NUM_LISTS)           | 8          | 16         | 32         | 64         | 128        |
|------------------------------|------------|------------|------------|------------|------------|
| cpu sort()                   | 0.00050800 | 0.00050700 | 0.00050700 | 0.00050700 | 0.00050800 |
| gpu single-thread            | 0.00135200 | 0.00127200 | 0.00169200 | 0.00285700 | 0.00637600 |
| gpu multi-thread (atomicMin) | 0.00122300 | 0.00088400 | 0.00107800 | 0.00100100 | 0.00094100 |
| gpu pairwise reduction       | 0.00340800 | 0.00378200 | 0.00431200 | 0.00493000 | 0.00578400 |
| gpu block-wise reduction     | 0.00186800 | 0.00148400 | 0.00130000 | 0.00124200 | 0.00124200 |

(Figure: run output for 4096 elements with NUM_LISTS = 128)

 

It turns out that the library sort() is actually the fastest of all, so the KNN program simply calls the library instead of my own CUDA sort.
Still, the comparison shows that the single-thread CUDA merge (essentially a serial program running on the GPU) performs worst; the pairwise reduction is also slow, presumably because too few threads remain active toward the end; among the reduction-based merges, the block-wise reduction is the most efficient.

4. Application

The experiments below use k = 7.

Application background: the basic KNN workflow

KNN (k-Nearest Neighbors): every sample can be represented by its k closest neighbors. KNN is a supervised classification algorithm: a new sample is assigned the class that is most common among its k nearest labeled neighbors.

 


 

Advantages: high accuracy, insensitive to outliers, no assumptions about the input data.

Disadvantages: high computational complexity and high space complexity.

Applicable data: numeric and nominal values.

How it works: there is a training set in which every sample carries a label, i.e. we know which class each training sample belongs to. When a new, unlabeled sample arrives, each of its features is compared with the corresponding features of the training samples, and the labels of the most similar (nearest) training samples are extracted. Usually only the k most similar samples are considered (this is where the k in k-nearest neighbors comes from), and k is typically an integer no larger than 20. Finally, the label that occurs most often among those k samples is taken as the class of the new sample.

The general KNN workflow:

(1) Collect data: any method will do.

(2) Prepare data: the numeric values needed for the distance computation, preferably in a structured format.

(3) Analyze data: any method will do.

(4) Train the algorithm: this step does not apply to k-nearest neighbors.

(5) Test the algorithm: compute the error rate.

(6) Use the algorithm: feed in sample data, run KNN to decide which class the input belongs to, and then act on that classification.

Now assume the following scenario: we want to classify points in the plane, as shown in the figure below:

(Figure: the twelve labeled sample points in the plane)

The figure above contains 12 coordinate points; each point has coordinates (x, y) and a class label, A or B. The task is: given a new point (x1, y1), decide whether it belongs to class A or class B.
The point coordinates are stored in the file input.txt.

step1:

The class's constructor initializes the training set dataSet and the test data testData.

step2:

get_distance() computes the distance between testData and every training sample dataSet[index]; the key-value pairs <index, distance> are stored in map_index_dis, where index identifies the training sample and distance is its distance to the test data.

step3:

Sort map_index_dis by value (the distance) in ascending order, take the k smallest entries, and count how often each class label occurs using map_label_freq.

step4:

Traverse map_label_freq and return the key with the largest value; that label is the class of the test data.

The serial C++ implementation of KNN is therefore as follows:

 

#include<iostream>
#include<map>
#include<vector>
#include<stdio.h>
#include<cmath>
#include<cstdlib>
#include<algorithm>
#include<fstream>
 
using namespace std;
 
typedef char tLabel;
typedef double tData;
typedef pair<int,double>  PAIR;
const int colLen = 2;
const int rowLen = 12;
ifstream fin;
ofstream fout;
 
class KNN
{
private:
        tData dataSet[rowLen][colLen];
        tLabel labels[rowLen];
        tData testData[colLen];
        int k;
        map<int,double> map_index_dis;
        map<tLabel,int> map_label_freq;
        double get_distance(tData *d1,tData *d2);
public:
 
        KNN(int k);
 
        void get_all_distance();
        
        void get_max_freq_label();
 
        struct CmpByValue
        {
            bool operator() (const PAIR& lhs,const PAIR& rhs)
            {
                return lhs.second < rhs.second;
            }
        };
        
};
 
KNN::KNN(int k)
{
    this->k = k;
 
    fin.open("data.txt");
 
    if(!fin)
    {
        cout<<"can not open the file data.txt"<<endl;
        exit(1);
    }
 
    /* input the dataSet */ 
    for(int i=0;i<rowLen;i++)
    {
        for(int j=0;j<colLen;j++)
        {
            fin>>dataSet[i][j];
        }
        fin>>labels[i];
    }
 
    cout<<"please input the test data :"<<endl;
    /* input the test data */
    for(int i=0;i<colLen;i++)
        cin>>testData[i];
    
}
 
/*
 * calculate the distance between test data and dataSet[i]
 */
double KNN:: get_distance(tData *d1,tData *d2)
{
    double sum = 0;
    for(int i=0;i<colLen;i++)
    {
        sum += pow( (d1[i]-d2[i]) , 2 );
    }
 
//  cout<<"the sum is = "<<sum<<endl;
    return sqrt(sum);
}
 
/*
 * calculate all the distance between test data and each training data
 */
void KNN:: get_all_distance()
{
    double distance;
    int i;
    for(i=0;i<rowLen;i++)
    {
        distance = get_distance(dataSet[i],testData);
        //<key,value> => <i,distance>
        map_index_dis[i] = distance;
    }
 
    //traverse the map to print the index and distance
    map<int,double>::const_iterator it = map_index_dis.begin();
    while(it!=map_index_dis.end())
    {
        cout<<"index = "<<it->first<<" distance = "<<it->second<<endl;
        it++;
    }
}
 
/*
 * check which label the test data belongs to, i.e. classify the test data
 */
void KNN:: get_max_freq_label()
{
    //transform the map_index_dis to vec_index_dis
    vector<PAIR> vec_index_dis( map_index_dis.begin(),map_index_dis.end() );
    //sort the vec_index_dis by distance from low to high to get the nearest data
    sort(vec_index_dis.begin(),vec_index_dis.end(),CmpByValue());
 
    for(int i=0;i<k;i++)
    {
        cout<<"the index = "<<vec_index_dis[i].first<<" the distance = "<<vec_index_dis[i].second<<" the label = "<<labels[vec_index_dis[i].first]<<" the coordinate ( "<<dataSet[ vec_index_dis[i].first ][0]<<","<<dataSet[ vec_index_dis[i].first ][1]<<" )"<<endl;
        //calculate the count of each label
        map_label_freq[ labels[ vec_index_dis[i].first ]  ]++;
    }
 
    map<tLabel,int>::const_iterator map_it = map_label_freq.begin();
    tLabel label;
    int max_freq = 0;
    //find the most frequent label
    while( map_it != map_label_freq.end() )
    {
        if( map_it->second > max_freq )
        {
            max_freq = map_it->second;
            label = map_it->first;
        }
        map_it++;
    }
    cout<<"The test data belongs to the "<<label<<" label"<<endl;
}
 
int main()
{
    int k ;
    cout<<"please input the k value : "<<endl;
    cin>>k;
    KNN knn(k);
    knn.get_all_distance();
    knn.get_max_freq_label();
    system("pause"); 
    return 0;
}

Of the steps above, the ones that can be parallelized are step 2 (computing the Euclidean distances between testData and the training data) and step 3 (sorting all the distances for one testData). The CUDA code above can therefore be applied to this concrete problem; the final optimized code is shown below (only CUDA is used here, not OpenMP):

 

__global__ void cal_dis(tData* train_data, tData* test_data, tData* dis)
{
    __shared__ tData tmp[THREAD_NUM / DIM_NUM][DIM_NUM]; // shared tile: the block's threads each fill one element
    int TmpIndexX, TmpIndexY, i, n;
    tData tempx;
    n = TRAIN_NUM * DIM_NUM;
    const int tidx = threadIdx.x;
    const int bidx = blockIdx.x;
    int tid = bidx * THREAD_NUM + tidx;
    const int footstep = THREAD_NUM * BLOCK_NUM;
    int fstepn = 0;
    int RightSwift = 3;
    while (tid < n) {
        TmpIndexX = (tid & (THREAD_NUM - 1)) >> RightSwift;// X=(tid%thread_num)/dim
        TmpIndexY = tid & (DIM_NUM - 1);
        tempx = train_data[tid] - test_data[tidx & (DIM_NUM - 1)];
        tmp[TmpIndexX][TmpIndexY] = tempx * tempx;
        __syncthreads();
        tData sum;
        if (tidx < THREAD_NUM / DIM_NUM) {
            for (i = 0, sum = 0; i < DIM_NUM; i++)
            {
                /*sum += tmp[tidx][(i+tidx)&(DIM_NUM - 1)];*/
                sum += tmp[tidx][i];
            }
            dis[fstepn * THREAD_NUM / DIM_NUM * BLOCK_NUM + bidx * THREAD_NUM / DIM_NUM + tidx] = sum;
        }
        __syncthreads();
        tid += footstep;
        fstepn++;
    }
}

//Parallel calculate the distance
void KNN::get_all_distance()
{
    int height = rowLen - test_data_num;
    tData* distance = new tData[height];
    tData* d_train_data, * d_test_data, * d_dis;
    //allocate memory on GPU
    cudaEvent_t startTime_all, endTime_all;
    cudaEventCreate(&startTime_all);
    cudaEventCreate(&endTime_all);
    cudaEventRecord(startTime_all, 0);

    cudaMalloc((void**)&d_test_data, colLen * sizeof(tData));
    cudaMalloc((void**)&d_dis, height * sizeof(tData));
    cudaMalloc((void**)&d_train_data, colLen * height * sizeof(tData));

    cudaMemset(d_train_data, 0, height * colLen * sizeof(tData));
    cudaMemset(d_test_data, 0, colLen * sizeof(tData));
    cudaMemset(d_dis, 0, height * sizeof(tData));

    //copy training and testing data from host to device
    cudaMemcpy(d_train_data, trainingData, height * colLen * sizeof(tData), cudaMemcpyHostToDevice);
    cudaMemcpy(d_test_data, testData, colLen * sizeof(tData), cudaMemcpyHostToDevice);
    //calculate the distance

    cudaEvent_t startTime_cal, endTime_cal;
    cudaEventCreate(&startTime_cal);
    cudaEventCreate(&endTime_cal);
    cudaEventRecord(startTime_cal, 0);


    cal_dis << <BLOCK_NUM, THREAD_NUM >> > (d_train_data, d_test_data, d_dis);

    cudaEventRecord(endTime_cal, 0);
    cudaEventSynchronize(startTime_cal);
    cudaEventSynchronize(endTime_cal);
    //copy distance data from device to host
    cudaMemcpy(distance, d_dis, height * sizeof(tData), cudaMemcpyDeviceToHost);
    cudaEventRecord(endTime_all, 0);
    cudaEventSynchronize(startTime_all);
    cudaEventSynchronize(endTime_all);
    float cal_time, all_time;
    cudaEventElapsedTime(&cal_time, startTime_cal, endTime_cal);
    cudaEventElapsedTime(&all_time, startTime_all, endTime_all);
    this->sum_cal_time += cal_time;
    this->sum_all_time += all_time ;
    cudaEventDestroy(startTime_cal);
    cudaEventDestroy(startTime_all);
    cudaEventDestroy(endTime_cal);
    cudaEventDestroy(endTime_all);

    int i;
    for (i = 0; i < height; i++)
    {
        map_index_dis[i + test_data_num] = distance[i];
    }
    cudaFree(d_test_data);
    cudaFree(d_train_data);
    cudaFree(d_dis);
    delete[] distance;
}

The table below shows the running times of the serial and parallel programs for different data sizes:

| size                                    | 900*4 & 100*4 | 900*8 & 100*8 | 3600*8 & 400*8 | 14400*8 & 1600*8 |
|-----------------------------------------|---------------|---------------|----------------|------------------|
| serial (ms)                             | 7             | 5             | 77.0           | 1301.4           |
| parallel_orig (ms)                      | 5.2437        | 7.719257      | 14.19          | 48.31            |
| parallel_upgrade (ms)                   | 5.1606        | 4.816661      | 8.64           | 38.73            |
| speedup (serial / parallel_upgrade)     | 1.356408      | 1.038         | 9.625          | 33.60            |
| error rate (misclassified test samples) | 0.01          | 0.06          | 0.0525         | 0.091            |

With this algorithm the recall of KNN is around 95% (KNN is simple, yet it performs quite well).
parallel_orig is the unoptimized version and parallel_upgrade is the optimized one.
As the table shows, when the data is small the parallel version is sometimes no faster than the serial one, but as the data size grows the speedup over the serial version rises sharply.

Summary

1. I learned and practiced real CUDA C++ and OpenMP parallel programming, applied parallel thinking to actual code, and deepened my understanding of parallel computing.
2. By computing large matrices in tiles and experimenting repeatedly, I arrived at a reasonably good configuration, learned how grid and thread-block dimensions affect kernel performance, and learned how to choose the number of blocks and threads for my particular GPU.
3. I gained a deeper understanding of GPU architecture, especially the memory hierarchy: for example, how to reduce global-memory accesses whose latency is hundreds of cycles, and how to make sensible use of the small but fast shared memory.
4. I understand the KNN algorithm better. The algorithm itself is not difficult; time permitting, it would be worth testing the program's accuracy on a real classification problem.

 
