# encoding:utf-8
'''
Created on 2014-07-14
@author: caoshouxin
'''
import os
import re
import os.path
from lxml import etree
from sogou import offdb,docid
import traceback
import struct
import logging as L
from time import localtime,strftime
# Configure the root logger once at import time: timestamp, padded level
# name, then the message.  (The format string was delimited with mis-encoded
# curly quotes, which is a syntax error; plain quotes are restored.)
L.basicConfig(level=L.INFO, format='[%(asctime)s] %(levelname)-8s %(message)s')

# Name of an upload file; nothing in the visible part of this file reads it.
filename = "baike_soso_upload_20140717-160704.26523.xml"

# Show the current working directory at start-up.  The parenthesised form
# prints the same text under Python 2 and Python 3.
print("文件操作" + os.getcwd())
class sosobaikeProcess():
    """Process one baike XML upload file: record its URL and store it in offdb.

    All work happens in the constructor: it opens an offdb connection,
    extracts the lemma id and action type from the head of the file, appends
    the lemma URL to a per-day/per-type list file in the current working
    directory, and finally puts the file content into offdb keyed by the
    lemma id.
    """

    # "<lemmaId>" followed (possibly lines later) by "<action>".  re.S lets
    # ".*?" cross line breaks; the previous re.M flag only affects "^"/"$",
    # which this pattern does not use, so multi-line XML never matched.
    _LEMMA_ACTION_RE = re.compile(
        "<lemmaId>(.*?)</lemmaId>.*?<action>(.*?)</action>", re.S)
    # Fallback when the head of the file carries no "<action>" tag.
    _LEMMA_RE = re.compile("<lemmaId>(.*?)</lemmaId>")

    def __init__(self, filename, ip="127.0.0.1", port="9999"):
        """Open the offdb connection and process ``filename`` immediately.

        Args:
            filename: path of the XML upload file to process.
            ip: address passed to ``offdb.QuickAdapter.open``.
            port: port passed to ``offdb.QuickAdapter.open``.
        """
        url_beg = "http://baike.sogou.com/v"
        url_end = ".htm\n"
        self.file_name = filename
        self.offdb_rand = offdb.QuickAdapter()
        self.offdb_rand.open(ip, port, 5)
        now_time = strftime("%Y-%m-%d", localtime())

        result_tup = self.getlemmaId_type()
        if result_tup is None:
            return
        # getlemmaId_type returns exactly two items; the old three-way
        # unpacking always raised ValueError.
        lemmaId, baike_type = result_tup

        # Append this lemma's URL to today's list for its action type.  The
        # lemma id has to be part of the URL; previously the same
        # ".../v.htm" line was written for every file.
        outputFile = "sosobaike_" + now_time + "_" + baike_type
        with open(outputFile, 'a') as outf:
            outf.write(url_beg + lemmaId + url_end)

        # NOTE(review): the original referenced a `value` that was never
        # produced anywhere.  Storing the complete raw file content is an
        # assumption -- confirm against the offdb consumers.
        with open(self.file_name, 'rb') as src:
            value = src.read()
        self.put_qdb(lemmaId, value)

    def put_qdb(self, lemmaId, value):
        """Store ``value`` in offdb under the packed integer ``lemmaId``.

        Args:
            lemmaId: lemma id as a string (or int); converted with ``int()``
                and packed as a native C int to form the key.
            value: payload handed to ``offdb_rand.put`` unchanged.

        Any exception is logged with its traceback and the connection is
        closed; nothing is re-raised.
        """
        try:
            key = struct.pack('i', int(lemmaId))
            ret = self.offdb_rand.put(key, value, 0, 5)
            if ret == 0 or ret == 1:
                L.info("put file %s/%s success %d" % (self.file_name, lemmaId, 1))
            else:
                L.warning("put file %s/%s failed ret=%s, reconnecting"
                          % (self.file_name, lemmaId, ret))
                # NOTE(review): offdb_reconnect is not defined in the visible
                # part of this class; if it is missing, the AttributeError is
                # caught and logged by the handler below.
                self.offdb_reconnect(5, 3)
        except Exception:
            L.error("put file %s/%s err %d because:%s"
                    % (self.file_name, lemmaId, 1, traceback.format_exc()))
            # NOTE(review): the file's indentation was lost, so it is unclear
            # whether close() was meant to run only on failure -- confirm.
            self.offdb_rand.close()

    def getlemmaId_type(self):
        """Extract the lemma id and action type from ``self.file_name``.

        Only the first 1024 characters of the file are inspected, to save
        memory and keep matching fast.

        Returns:
            ``(lemmaId, baike_type)`` as strings.  ``baike_type`` is the
            content of the ``<action>`` tag, or ``"update"`` when only a
            ``<lemmaId>`` tag is found.  ``None`` when the file does not
            exist or carries no ``<lemmaId>`` tag in its head.
        """
        if not os.path.isfile(self.file_name):
            L.info("put file%s not found" % (self.file_name))
            return None

        with open(self.file_name) as f:
            filecontent = f.read(1024)

        m = self._LEMMA_ACTION_RE.search(filecontent)
        if m is not None:
            lemmaId = m.group(1)
            baike_type = m.group(2)
        else:
            m_1 = self._LEMMA_RE.search(filecontent)
            if m_1 is None:
                L.info("put file %s not found result" % (self.file_name))
                return None
            # Read the group from the fallback match; the first match object
            # is None on this path.
            lemmaId = m_1.group(1)
            baike_type = "update"

        L.info("put file %s lemmaId=%s, type=%s " % (self.file_name, lemmaId, baike_type))
        return (lemmaId, baike_type)