这个作业属于哪个课程 | <网络1934-软件工程> |
---|---|
这个作业的要求在哪里 | <作业要求> |
这个作业的目标 | 完成论文查重代码并上传到GitHub上,完成GitHub项目的编写 |
PSP表格
PSP2.1 | Personal Software Process Stages | 预估耗时(分钟) | 实际耗时(分钟) |
---|---|---|---|
Planning | 计划 | ||
· Estimate | · 估计这个任务需要多少时间 | 605 | 590 |
Development | 开发 | ||
· Analysis | · 需求分析 (包括学习新技术) | 120 | 100 |
· Design Spec | · 生成设计文档 | 30 | 30 |
· Design Review | · 设计复审 | 30 | 30 |
· Coding Standard | · 代码规范 (为目前的开发制定合适的规范) | 20 | 20 |
· Design | · 具体设计 | 60 | 80 |
· Coding | · 具体编码 | 120 | 150 |
· Code Review | · 代码复审 | 30 | 30 |
· Test | · 测试(自我测试,修改代码,提交修改) | 90 | 90 |
Reporting | 报告 | ||
· Test Report | · 测试报告 | 30 | 20 |
· Size Measurement | · 计算工作量 | 15 | 10 |
· Postmortem & Process Improvement Plan | · 事后总结, 并提出过程改进计划 | 60 | 30 |
· 合计 | | 605 | 590 |
2.计算模块接口的设计与实现过程
2.1程序实现
第三方依赖--gojieba
实现一个md5hash方法和SimHash算法
File包:
File.go 打开文件以及输出文件
hash包:
hash.go 定义了一个hash接口
Haohasher.go 实现MD5哈希
gosimhash包:
gosimhash.go 实现simhash算法
main.go 主函数
项目结构:
项目流程:
2.2关键函数的分析与实现
gosimhash.go
Simhasher结构体包含一个jieba分词提取器和一个hash方法
// Simhasher bundles a jieba keyword extractor with the hasher that is applied
// to each extracted word.
type Simhasher struct {
// extractor is the jieba instance whose ExtractWithWeight is called by MakeSimHasher.
extractor *jieba.Jieba
// hasher supplies Hash64, used by ConvertFeatureToHash to hash each word.
hasher hash.Hasher
}
通过jieba分词提取出词频权重,
// MakeSimHasher passes data and topk to the jieba extractor to obtain weighted
// features, then hashes them. It returns 0 and an error when the extractor
// yields no features.
// NOTE(review): the remainder of the body is elided ("...") in this excerpt.
func (simhasher *Simhasher) MakeSimHasher(data string, topk int) (uint64, error) {
// Extract the features and their weights.
fws := simhasher.extractor.ExtractWithWeight(data, topk)
var err error
if len(fws) == 0 {
err = errors.New("输入文本数据为空,无数据可提取")
return 0, err
}
// Convert each feature to a uint64 hash via the MD5 hasher and store the hash together with its weight in hws.
hws := simhasher.ConvertFeatureToHash(fws)
...
再将数据进行hash,
// ConvertFeatureToHash builds one HashWeight per extracted word, in input
// order, holding the word's hash and its original weight.
//
// NOTE: the error returned by Hash64 is deliberately discarded here; an entry
// whose hash failed simply keeps whatever value Hash64 returned.
func (simhasher *Simhasher) ConvertFeatureToHash(fws []jieba.WordWeight) []HashWeight {
	result := make([]HashWeight, len(fws))
	for i := range fws {
		entry := &result[i]
		entry.hash, _ = simhasher.hasher.Hash64(fws[i].Word)
		entry.weight = fws[i].Weight
	}
	return result
}
加权降维
// Weighted reduction: every feature votes on each of the 64 bit positions,
// adding its weight when its hash has that bit set and subtracting it
// otherwise.
var one uint64 = 1
var vector [64]float64
for _, hw := range hws {
	for i := range vector {
		if hw.hash&(one<<uint(i)) != 0 {
			vector[i] += hw.weight
			continue
		}
		vector[i] -= hw.weight
	}
}

// Collapse the votes into the fingerprint: bit i is set only when the
// accumulated weight at position i is strictly positive.
var res uint64
for i := range vector {
	if vector[i] > 0.0 {
		res |= one << uint(i)
	}
}
流程如下图
3.计算模块接口的性能改进
通过对main函数进行10s的基准测试
// BenchmarkMain invokes the program's main function b.N times so the whole
// run can be timed and profiled end to end.
func BenchmarkMain(b *testing.B) {
	for remaining := b.N; remaining > 0; remaining-- {
		main()
	}
}
go test -run=xxx -bench=. -benchtime="10s" -cpuprofile profile_cpu.out
可以看到,耗时基本都被jieba分词提取器的创建初始化和分词提取所占用
4.计算模块部分单元测试展示
gosimhash_test.go
package test
import (
"PapeCheck/gosimhash"
"fmt"
"testing"
)
// Package-level state shared by the tests in this file: r1–r4 receive the
// results of MakeSimHasher in TestSimhasher_MakeSimHasher and are read again
// by TestGetSimilarity.
var (
r1 uint64
r2 uint64
r3 uint64
r4 uint64
err error
)
func TestSimhasher_MakeSimHasher(t *testing.T) {
s1 := "今天是星期一,下午要去上课"
s2 := "今天是星期日,一整天不用上课"
s3 := "昨天是星期一,上午不用去上课"
s4 := ""
hasher := gosimhash.NewSimhasher()
r1, err = hasher.MakeSimHasher(s1, 100)
if err != nil {
t.Log(err)
} else {
fmt.Printf("Test1 succeed:Hash %s result is %v\n", s1, r1)
}
r2, err = hasher.MakeSimHasher(s2, 100)
if err != nil {
t.Log(err)
} else {
fmt.Printf("Test2 succeed:Hash %s result is %v\n", s2, r2)
}
r3, err = hasher.MakeSimHasher(s3, 100)
if err != nil {
t.Log(err)
} else {
fmt.Printf("Test3 succeed:Hash %s result is %v\n", s3, r3)
}
r4, err = hasher.MakeSimHasher(s4, 100)
if err != nil {
t.Logf("Test4 failed:%v\n", err)
} else {
fmt.Printf("Test4 succeed:Hash %s result is %v\n", s4, r4)
}
}
// TestGetSimilarity prints the similarity of every pair among the
// package-level hashes r1–r4, logging any error GetSimilarity returns.
//
// The hashes are assigned by TestSimhasher_MakeSimHasher, so the output is
// only meaningful when that test has already run in the same process.
func TestGetSimilarity(t *testing.T) {
	fmt.Printf("%v %v %v %v\n", r1, r2, r3, r4)

	// The table is built here, at run time, so it captures the current
	// values of r1–r4.
	pairs := []struct {
		left, right uint64
		failFmt     string
		okFmt       string
	}{
		{r1, r2, "similarity1 failed:%v\n", "Similarity1:%v\n"},
		{r1, r3, "similarity2 failed:%v\n", "Similarity2:%v\n"},
		{r1, r4, "similarity3 failed:%v\n", "Similarity3:%v\n"},
		{r2, r3, "similarity4 failed:%v\n", "Similarity4:%v\n"},
		{r2, r4, "similarity5 failed:%v\n", "Similarity5:%v\n"},
		{r3, r4, "similarity6 failed:%v\n", "Similarity6:%v\n"},
	}

	for _, p := range pairs {
		similarity, err := gosimhash.GetSimilarity(p.left, p.right)
		if err != nil {
			t.Logf(p.failFmt, err)
			continue
		}
		fmt.Printf(p.okFmt, similarity)
	}
}
代码覆盖率100%
hash_test.go
package test
import (
"PapeCheck/hash"
"fmt"
"testing"
)
// TestHaoHasher_Hash64 runs Hash64 over four sample strings (plain text, text
// with a space, text with punctuation, and the empty string) and prints each
// hash, logging any error instead.
func TestHaoHasher_Hash64(t *testing.T) {
	samples := []struct {
		text    string
		okFmt   string
		failFmt string // empty: log the bare error with t.Log
	}{
		{"今天是星期一,下午要去上课", "Test1 succeed:Hash %s result is %v\n", ""},
		{"今天是 期日,一整天不用上课", "Test2 succeed:Hash %s result is %v\n", ""},
		{"昨天是%#期一,上@不用!上课", "Test3 succeed:Hash %s result is %v\n", ""},
		{"", "Test4 succeed:Hash %s result is %v\n", "Test4 failed:%v\n"},
	}

	hasher := hash.NewHaoHasher()
	for _, s := range samples {
		sum, err := hasher.Hash64(s.text)
		switch {
		case err == nil:
			fmt.Printf(s.okFmt, s.text, sum)
		case s.failFmt != "":
			t.Logf(s.failFmt, err)
		default:
			t.Log(err)
		}
	}
}
代码覆盖率100%
5.模块部分异常处理说明
利用一些函数返回error类型来处理异常情况