文章目录
前言
最近在做dblp数据集相关预处理工作,根据老师给定的论文,需要按照年份划分提取信息,年份是从1970--2008年。
一、分析dblp数据结构
从官网上下载dblp最早版本的数据集(不是最新版,最新版约有2.1GB),大小约为1.12GB,格式为XML。刚开始老师和我准备手工清洗,但只清洗了一年的数据就已经非常困难,于是果断放弃,老师让我想办法通过编程来处理。
查看各种博客后,发现可以用sax进行解析,十分方便。
二、python代码
1.提取article类型文章
代码如下:
#!/usr/bin/python
# use xml.sax, because it supports external entity expansion
# this program extracts all article elements from data/dblp.xml into data/article.xml
import xml.sax
import codecs
from xml.sax.saxutils import escape
# a handler class for xml.sax.parse(source,handler)
class dblpHandler(xml.sax.handler.ContentHandler):
    """SAX handler that copies every <article> element into a new XML file.

    Only element names and character data are copied; element attributes
    (the ``attrs`` argument of ``startElement``) are not written out.
    Everything outside <article> elements is ignored.
    """

    def __init__(self, out_path='data/article.xml'):
        """Open the output file and write the XML prolog and root tag.

        Args:
            out_path: File to write. Defaults to ``data/article.xml``, the
                path that used to be hard-coded.
        """
        super().__init__()
        self.isArticle = 0  # 1 while the parser is inside an <article>
        # Number of currently open elements below the <article> itself;
        # 0 means "directly inside <article>, between two fields".
        self._depth = 0
        # newline='\n' disables newline translation, so the file contains
        # exactly the '\n' characters written below on every platform.
        self.outfile = open(out_path, 'w', encoding='utf-8', newline='\n')
        self.outfile.write('<?xml version="1.0" encoding="utf-8"?>\n')
        self.outfile.write('<!DOCTYPE dblp SYSTEM "../dblp.dtd">\n')
        self.outfile.write('<dblp>\n\n')

    def startElement(self, name, attrs):
        """Write the opening tag of an article or of an element inside one."""
        if name == 'article':
            self.isArticle = 1
            self._depth = 0
            self.outfile.write('<article>\n')
        elif self.isArticle == 1:
            self._depth += 1
            self.outfile.write('<' + name + '>')

    def endElement(self, name):
        """Write the closing tag of an article or of an element inside one."""
        if name == 'article':
            self.outfile.write('</article>\n\n')
            self.isArticle = 0
        elif self.isArticle == 1:
            self._depth -= 1
            self.outfile.write('</' + name + '>\n')

    def characters(self, content):
        """Write (escaped) character data found inside an article.

        A SAX parser may deliver the text of one element in several chunks,
        so a chunk can consist of a single space that separates two words.
        Whitespace-only chunks are therefore dropped only when they sit
        directly inside <article> (layout between fields); inside a field
        they are part of the text and must be kept.
        """
        if self.isArticle != 1:
            return
        if self._depth == 0 and content.isspace():
            return
        self.outfile.write(escape(content))

    def close_file(self):
        """Write the closing root tag and close the output file."""
        self.outfile.write('</dblp>')
        self.outfile.close()
if __name__ == '__main__':
    # Extract all <article> elements of ./data/dblp.xml via dblpHandler.
    dblp = dblpHandler()
    parser = xml.sax.make_parser()
    # Since Python 3.7.1 the SAX parser no longer processes external general
    # entities by default, so entity references that depend on the external
    # DTD would otherwise not be expanded. The input is a trusted local
    # file, so it is re-enabled here; do not do this for untrusted XML.
    parser.setFeature(xml.sax.handler.feature_external_ges, True)
    parser.setContentHandler(dblp)
    try:
        # Binary mode lets the parser honor the encoding declared in the
        # XML header instead of the platform's default text encoding.
        with open('./data/dblp.xml', 'rb') as source:
            parser.parse(source)
    finally:
        # Always finish and close the output, even if parsing fails.
        dblp.close_file()
    print("extract successfully!")
和老师进行讨论之后,确定只需要提取article和inproceedings两种类型的文章。将上述提取article类型的代码中的article改成inproceedings,即可提取inproceedings类型的文章。
2.按照年份提取
代码如下:
#!/usr/bin/python
#use xml.sax, because it supports external entity expansion
#this program extracts, for each year from 1970 to 2008, the article elements of that year from data/article.xml into data/<year>article.xml
import xml.sax
import codecs
from xml.sax.saxutils import escape
from multiprocessing import Pool
# a handler class for xml.sax.parse(source,handler)
class dblpHandler(xml.sax.handler.ContentHandler):
    """SAX handler that keeps only the <article> elements of one year.

    Each article is collected in ``self.buffer`` while it is being parsed
    and written to the output file at its closing tag if the value of its
    <year> element equals ``self.year``. Element attributes are not copied.
    """

    def __init__(self, year, out_path=None):
        """Open the output file and write the XML prolog and root tag.

        Args:
            year: Year to keep; compared with ``==`` against the integer
                parsed from each article's <year> element.
            out_path: File to write. Defaults to
                ``'data/' + str(year) + 'article.xml'``, the path that used
                to be hard-coded.
        """
        super().__init__()
        self.isArticle = 0    # 1 while the parser is inside an <article>
        self.buffer = ''      # serialized text of the article being parsed
        self.isYear = 0       # 1 while the parser is inside a <year>
        self.year = year
        self.currentYear = 0  # year of the article being parsed, 0 if unknown
        # Number of currently open elements below the <article> itself.
        self._depth = 0
        # Text of the current <year>, joined across characters() chunks.
        self._year_text = ''
        if out_path is None:
            out_path = 'data/' + str(year) + 'article.xml'
        # newline='\n' disables newline translation, so the file contains
        # exactly the '\n' characters written below on every platform.
        self.outfile = open(out_path, 'w', encoding='utf-8', newline='\n')
        self.outfile.write('<?xml version="1.0" encoding="utf-8"?>\n')
        self.outfile.write('<!DOCTYPE dblp SYSTEM "../dblp.dtd">\n')
        self.outfile.write('<dblp>\n\n')

    def startElement(self, name, attrs):
        """Buffer the opening tag of an article or of an element inside one."""
        if name == 'article':
            self.isArticle = 1
            self._depth = 0
            # Forget the previous article's year so that an article without
            # a <year> element is never filed under a stale year.
            self.currentYear = 0
            self.buffer += '<article>\n'
        elif self.isArticle == 1:
            self._depth += 1
            self.buffer += '<' + name + '>'
            if name == 'year':
                self.isYear = 1
                self._year_text = ''

    def endElement(self, name):
        """Buffer the closing tag; at </article> write the article if it matches."""
        if name == 'article':
            self.buffer += '</article>\n\n'
            if self.currentYear == self.year:
                self.outfile.write(self.buffer)
            self.buffer = ''
            self.isArticle = 0
        elif self.isArticle == 1:
            self._depth -= 1
            self.buffer += '</' + name + '>\n'
            if name == 'year':
                self.isYear = 0
                # Parse the year only once the whole element has been seen;
                # the parser may deliver its text in more than one chunk.
                try:
                    self.currentYear = int(self._year_text)
                except ValueError:
                    # Empty or non-numeric <year>: treat as unknown.
                    self.currentYear = 0

    def characters(self, content):
        """Buffer (escaped) character data found inside an article.

        Whitespace-only chunks are dropped only when they sit directly
        inside <article> (layout between fields); inside a field they can
        be a word separator delivered as its own chunk and are kept.
        """
        if self.isArticle != 1:
            return
        if self._depth > 0 or not content.isspace():
            self.buffer += escape(content)
        if self.isYear == 1:
            self._year_text += content

    def close_file(self):
        """Write the closing root tag and close the output file."""
        self.outfile.write('</dblp>')
        self.outfile.close()
if __name__ == '__main__':
    # One full pass over ./data/article.xml per year, newest year first;
    # each pass writes that year's articles through its own dblpHandler.
    for year in reversed(range(1970, 2009)):
        dblp = dblpHandler(year)
        try:
            # The with-block closes the input after every pass instead of
            # leaving one open file handle behind per year.
            with open('./data/article.xml', 'r', encoding='utf-8') as source:
                xml.sax.parse(source, dblp)
        finally:
            # Always finish and close this year's output, even on error.
            dblp.close_file()
    print("compelete!")
最后,会得到按年份划分的xml文件:article类型为1970article.xml至2008article.xml;按同样方式修改代码后,inproceedings类型也会得到1970年至2008年对应的xml文件。
总结
本次代码是根据GitHub上的相关代码改编的,可能不是很高效,但最后还是得到了清洗后的数据。下一步要进行作者信息的提取。