#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Huffman tree node. Nodes are stored in an array and refer to each other by
// array index; index 0 is never a real node, so 0 doubles as "none".
typedef struct
{
    int weight;  // weight of this node (for internal nodes: sum of both children)
    int parent;  // index of the parent node, 0 while the node has no parent
    int lchild;  // index of the left child, 0 for a leaf
    int rchild;  // index of the right child, 0 for a leaf
} HTNode, *HuffmanTree;
// Input text; N[k] is the symbol represented by leaf k + 1 of the tree.
static char N[100];
// Huffman code table: an array of pointers to NUL-terminated '0'/'1' strings.
typedef char **HuffmanCode;
// Pair of node indices produced by Select(): the lightest and the
// second-lightest nodes that do not have a parent yet.
typedef struct
{
    int s1;  // index of the minimum-weight node
    int s2;  // index of the second-minimum-weight node
} MinCode;
- //函数声明
- void Error(char *message);
- HuffmanCode HuffmanCoding(HuffmanTree &HT,HuffmanCode HC,int *w,int n);
- MinCode Select(HuffmanTree HT,int n);
// Report a fatal error and terminate the program.
//
// Writes "Error:<message>" followed by a newline to stderr, then exits with
// status 1. This function does not return. It is called when fewer than two
// symbols are supplied for encoding.
void Error(char *message)
{
    fprintf(stderr, "Error:%s\n", message);
    exit(1);
}
- //构造哈夫曼树HT,编码存放在HC中,w为权值,n为结点个数
- HuffmanCode HuffmanCoding(HuffmanTree &HT,HuffmanCode HC,int *w,int n)
- {
- int i,s1=0,s2=0;
- HuffmanTree p;
- char *cd;
- int f,c,start,m;
- MinCode min;
- if(n<=1)
- {
- Error("Code too small!");//只有一个结点不进行编码,直接exit(1)退出。非return,如果return 会造成main函数HT[i]无值
- }
- m=2*n-1;//哈弗曼编码需要开辟的结点大小为2n-1
- HT=(HuffmanTree)malloc((m+1)*sizeof(HTNode));//开辟哈夫曼树结点空间 m+1 。为了对应关系,我们第0个空间不用。
- //初始化n个叶子结点,w[0] = 0,main函数已赋值
- for(p=HT,i=0;i<=n;i++,p++,w++)
- {
- p->weight=*w;
- p->parent=0;
- p->lchild=0;
- p->rchild=0;
- }
- //将n-1个非叶子结点的初始化
- for(;i<=m;i++,p++)
- {
- p->weight=0;
- p->parent=0;
- p->lchild=0;
- p->rchild=0;
- }
- //构造哈夫曼树
- for(i=n+1;i<=m;i++)
- {
- min=Select(HT,i-1);//找出最小和次小的两个结点
- s1=min.s1 ; //最小结点下标
- s2=min.s2;//次小结点下标
- HT[s1].parent=i;
- HT[s2].parent=i;
- HT[i].lchild=s1;
- HT[i].rchild=s2;
- HT[i].weight=HT[s1].weight+HT[s2].weight;//赋权和
- }
- //打印哈弗曼树
- printf("HT List:\n");
- printf("Number\t\tweight\t\tparent\t\tlchild\t\trchild\n");
- for(i=1;i<=m;i++)
- {
- printf("%d\t\t%d\t\t%d\t\t%d\t\t%d\t\n",i,HT[i].weight,HT[i].parent,HT[i].lchild,HT[i].rchild);
- }
- //从叶子结点到根节点求每个字符的哈弗曼编码
- HC=(HuffmanCode)malloc((n+1)*sizeof(char *));
- cd=(char *)malloc(n*sizeof(char *));//为哈弗曼编码动态分配空间
- cd[n-1]='\0';//如:3个结点编码最长为2。cd[3-1] = '\0';
- //求叶子结点的哈弗曼编码
- for(i=1;i<=n;i++)
- {
- start=n-1;
- //定义左子树为0,右子树为1
- /*
- 从最下面的1号节点开始往顶部编码(逆序存放),然后编码2号节点,3号......
- */
- for(c=i,f=HT[i].parent; f!=0; c=f,f=HT[f].parent)
- {
- if(HT[f].lchild==c)
- cd[--start]='0';
- else
- cd[--start]='1';
- }
- //为第i个字符分配编码空间
- HC[i]=(char *)malloc((n-start)*sizeof(char *));
- //将当前求出结点的哈弗曼编码复制到HC
- strcpy(HC[i],&cd[start]);
- }
- free(cd);
- return HC;
- }
- MinCode Select(HuffmanTree HT,int n)
- {
- int min,secmin;
- int temp = 0;
- int i,s1,s2,tempi = 0;
- MinCode code ;
- s1=1;
- s2=1;
- min = 66666;//足够大
- //找出权值weight最小的结点,下标保存在s1中
- for(i=1;i<=n;i++)
- {
- if(HT[i].weight<min && HT[i].parent==0)
- {
- min=HT[i].weight;
- s1=i;
- }
- }
- secmin = 66666;//足够大
- //找出权值weight次小的结点,下标保存在s2中
- for(i=1;i<=n;i++)
- {
- if((HT[i].weight<secmin) && (i!=s1) && HT[i].parent==0)
- {
- secmin=HT[i].weight;
- s2=i;
- }
- }
- //放进封装中
- code.s1=s1;
- code.s2=s2;
- return code;
- }
- void HuffmanTranslateCoding(HuffmanTree HT, int n,char* ch)
- {//译码过程
- int m=2*n-1;
- int i,j=0;
- printf("After Translation:");
- while(ch[j]!='\0')//ch[]:你输入的要译码的0101010串
- {
- i=m;
- while(0 != HT[i].lchild && 0 != HT[i].rchild)//从顶部找到最下面
- {
- if('0' == ch[j])//0 往左子树走
- {
- i=HT[i].lchild;
- }
- else//1 往右子树走
- {
- i=HT[i].rchild;
- }
- ++j;//下一个路径
- }
- printf("%c",N[i-1]);//打印出来
- }
- printf("\n");
- }
- void main()
- {
- HuffmanTree HT=NULL;
- HuffmanCode HC=NULL;
- int *w=NULL;
- int i,n;
- char tran[100];
- printf("Input N(char):");
- gets(N);
- fflush(stdin);
- n = strlen(N);
- w=(int *)malloc((n+1)*sizeof(int *));//开辟n+1个长度的int指针空间
- w[0]=0;
- printf("Enter weight:\n");
- //输入结点权值
- for(i=1;i<=n;i++)
- {
- printf("w[%d]=",i);
- scanf("%d",&w[i]);
- }
- fflush(stdin);
- //构造哈夫曼树HT,编码存放在HC中,w为权值,n为结点个数
- HC=HuffmanCoding(HT,HC,w,n);
- //输出哈弗曼编码
- printf("HuffmanCode:\n");
- printf("Number\t\tWeight\t\tCode\n");
- for(i=1;i<=n;i++)
- {
- printf("%c\t\t%d\t\t%s\n",N[i-1],w[i],HC[i]);
- }
- fflush(stdin);
- //译码过程
- printf("Input HuffmanTranslateCoding:");
- gets(tran);
- HuffmanTranslateCoding(HT, n, tran);
- return;
- }
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>

#define MAX_CODE_LENGTH 40

long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;

/**
 * A vocabulary word together with its Huffman code.
 */
struct vocab_word {
    long long cn;   // frequency of the word in the training set
    int *point;     // node path of the code (MAX_CODE_LENGTH entries)
    char *word,     // the word itself
         *code,     // Huffman code, one 0/1 value per element
         codelen;   // number of elements used in code
};

struct vocab_word *vocab;

/* Print a label followed by the first 2 * vocab_size values of an array. */
static void printRow(const char *label, const long long *values) {
    printf("%s", label);
    for (long long x = 0; x < vocab_size * 2; x++) {
        printf("%lld", values[x]);
        printf(" ");
    }
    printf("\n");
}

/*
 * Print the intermediate state of the tree construction: the count, binary
 * and parent arrays, one per line.
 */
void printState(long long* count, long long* binary, long long* parent_node) {
    printRow("count[]:\t", count);
    printRow("binary[]:\t", binary);
    printRow("parent[]:\t", parent_node);
}

/*
 * Return the index of the smaller of the two current candidates and advance
 * the cursor it came from. *pos1 walks down through the leaves, *pos2 walks
 * up through the internal nodes; once the leaves are exhausted (*pos1 < 0)
 * only *pos2 is used.
 */
static long long popSmallest(const long long *count, long long *pos1, long long *pos2) {
    if (*pos1 >= 0 && count[*pos1] < count[*pos2]) {
        return (*pos1)--;
    }
    return (*pos2)++;
}

/**
 * Create a binary Huffman tree from the word counts in vocab[] and store the
 * resulting code, path and code length in every vocab entry. Frequent words
 * get short codes.
 *
 * Requires vocab[] to be sorted by descending cn, and every vocab[a].code /
 * vocab[a].point to have room for MAX_CODE_LENGTH entries. Codes are assumed
 * to be shorter than MAX_CODE_LENGTH; this is not checked.
 * Prints the construction steps to stdout and exits with status 1 if memory
 * cannot be allocated.
 */
void CreateBinaryTree() {
    long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
    char code[MAX_CODE_LENGTH];
    // count:       weight of every node (leaves first, then internal nodes)
    // binary:      branch bit of every node (1 for the second node of a pair)
    // parent_node: index of every node's parent
    long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
    long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
    long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
    if (count == NULL || binary == NULL || parent_node == NULL) {
        fprintf(stderr, "Memory allocation failed\n");
        exit(1);
    }

    // First half of count: the word frequencies.
    for (a = 0; a < vocab_size; a++) {
        count[a] = vocab[a].cn;
    }
    // Second half: internal nodes, initialised to a very large value so that
    // an unused slot is never selected as a minimum.
    for (a = vocab_size; a < vocab_size * 2; a++) {
        count[a] = (long long)1e15;
    }

    pos1 = vocab_size - 1;
    pos2 = vocab_size;
    printState(count, binary, parent_node);

    // Construct the Huffman tree by adding one internal node at a time.
    for (a = 0; a < vocab_size - 1; a++) {
        printf("----------------\n");
        printf("pos1=%lld, pos2=%lld\n", pos1, pos2);

        // First, find the two smallest nodes 'min1, min2'.
        min1i = popSmallest(count, &pos1, &pos2);
        // min2i has not been chosen yet at this point, so only min1i is shown.
        printf("min1i=%lld\n", min1i);
        printf("pos1=%lld, pos2=%lld\n", pos1, pos2);
        min2i = popSmallest(count, &pos1, &pos2);

        printf("min1i=%lld, min2i=%lld\n", min1i, min2i);
        printf("count[min1i]=%lld, count[min2i]=%lld\n", count[min1i], count[min2i]);

        // Merge them into the new internal node vocab_size + a.
        count[vocab_size + a] = count[min1i] + count[min2i];
        parent_node[min1i] = vocab_size + a;
        parent_node[min2i] = vocab_size + a;
        binary[min2i] = 1;

        printf("count[vocab_size + a] = %lld\n", count[vocab_size + a]);
        printf("parent_node[%lld] = %lld\n", min1i, parent_node[min1i]);
        printf("parent_node[%lld] = %lld\n", min2i, parent_node[min2i]);
        printf("binary[%lld] = %lld\n", min2i, binary[min2i]);
        printState(count, binary, parent_node);
    }

    // Now assign a binary code to each vocabulary word.
    for (a = 0; a < vocab_size; a++) {
        // Climb from the leaf to the root (index vocab_size * 2 - 2),
        // collecting branch bits and node indices in leaf-to-root order.
        b = a;
        i = 0;
        while (1) {
            code[i] = (char)binary[b];
            point[i] = b;
            i++;
            b = parent_node[b];
            if (b == vocab_size * 2 - 2) break;
        }
        vocab[a].codelen = (char)i;
        // Store code and path reversed, i.e. in root-to-leaf order. The path
        // starts at the root, expressed relative to the first internal node.
        vocab[a].point[0] = (int)(vocab_size - 2);
        for (b = 0; b < i; b++) {
            vocab[a].code[i - b - 1] = code[b];
            vocab[a].point[i - b] = (int)(point[b] - vocab_size);
        }
    }

    free(count);
    free(binary);
    free(parent_node);
}

/**
 * Demo: build the Huffman codes for six hand-sorted words and print them.
 *
 * Build and run:
 *   gcc ./huffman_tree.cpp; ./a.out
 */
int main()
{
    // The vocabulary must be sorted by descending frequency (word2vec does
    // this with qsort beforehand); the sample data is already sorted by hand.
    static char words[][2] = {"T", "E", "G", "R", "O", "F"};
    static const long long counts[] = {7, 5, 4, 4, 3, 2};

    vocab_size = 6;
    vocab = (struct vocab_word *)calloc((size_t)vocab_size, sizeof(struct vocab_word));
    if (vocab == NULL) {
        fprintf(stderr, "Memory allocation failed\n");
        return 1;
    }

    // Initialise count, word and the code/point buffers of every entry.
    for (int a = 0; a < vocab_size; a++) {
        vocab[a].cn = counts[a];
        vocab[a].word = words[a];
        vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
        vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
        if (vocab[a].code == NULL || vocab[a].point == NULL) {
            fprintf(stderr, "Memory allocation failed\n");
            return 1;
        }
    }

    CreateBinaryTree();

    for (int a = 0; a < vocab_size; a++) {
        printf("word=%s\t", vocab[a].word);
        printf("cn=%lld\t", vocab[a].cn);
        printf("codelen=%d\t", vocab[a].codelen);
        printf("code=");
        for (int i = 0; i < vocab[a].codelen; i++) {
            printf("%d", vocab[a].code[i]);
        }
        printf("\t");
        printf("point=");
        for (int i = 0; i < vocab[a].codelen; i++) {
            printf("%d-", vocab[a].point[i]);
        }
        printf("\n");
    }

    for (int a = 0; a < vocab_size; a++) {
        free(vocab[a].code);
        free(vocab[a].point);
    }
    free(vocab);
    vocab = NULL;
    return 0;
}
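/*
 * ————————————————
 * Copyright notice: this is an original article by the CSDN blogger "桃根仙",
 * published under the CC 4.0 BY-SA license. When reposting, please include
 * the link to the original source and this notice.
 * Original link: https://blog.csdn.net/taotaobaobei/java/article/details/78513979
 */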