#coding=gbk
import os
import struct
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module has to be reloaded before the process-wide default codec can be
# switched to GBK.  Other interpreter versions have no such hook (and no
# builtin reload), so the hack is skipped there instead of crashing on import.
# NOTE(review): changing the default encoding affects the whole process, not
# just this module -- confirm callers really depend on it.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('gbk')

XDB_VERSION = 34     # 0x01 ~ 0xff
XDB_TAGNAME = 'XDB'  # First bytes
XDB_MAXKLEN = 0xf0   # maxklen: < 255
class XDB_R(object):
    """Read-only accessor for an XDB key/value database file.

    Layout, as read by this class:

    * a 32-byte header (tag, version, hash base, hash prime, file size,
      a float check field and 12 reserved bytes);
    * a table of 8-byte ``(offset, length)`` slots starting at byte 32,
      one slot per hash bucket;
    * one binary search tree per bucket.  Every node starts with a 17-byte
      header ``(loff, llen, roff, rlen, klen)`` followed by the key bytes
      and then the value bytes.

    With ``mem=True`` the whole file is loaded into memory by ``Open`` and
    all lookups are served from that buffer; otherwise every lookup seeks
    and reads the open file.
    """

    fd = False          # open file object, or False while no database is open
    hash_base = 0       # hash seed taken from the file header
    hash_prime = 0      # number of hash buckets taken from the file header
    memread = None      # complete file contents (memory mode only)
    mem = False         # whether memory mode is enabled
    off = 0             # current read position inside memread
    len = 0             # number of bytes held in memread
    key_encoding = 'gbk'  # codec applied to text (non-bytes) keys
    _io_times = 0       # bucket/node reads performed by the last lookup

    def __init__(self, mem=False):
        """Create a reader; ``mem`` selects memory mode for later ``Open`` calls."""
        self.mem = mem

    def __del__(self):
        self.Close()

    def Open(self, fpath):
        """Open the database at ``fpath``, closing any previously opened one.

        Returns True on success.  Raises ``Exception`` when the file cannot
        be opened or when its header is rejected by ``_check_header``.
        """
        self.Close()
        try:
            fd = open(fpath, 'rb')
        except IOError:
            raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb failed.')
        try:
            valid = self._check_header(fd) is not False
            if valid and self.mem:
                # The header check moved the file position; rewind before
                # loading the whole file.
                fd.seek(0, os.SEEK_SET)
                self.memread = fd.read()
                self.len = len(self.memread)
                self.off = 0
        except Exception:
            fd.close()
            raise
        if not valid:
            # Close before raising so a rejected file does not leak its
            # handle, and never publish it as self.fd.
            fd.close()
            raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb format.')
        self.fd = fd
        return True

    def _key_bytes(self, key):
        """Return ``key`` as a byte string, encoding text with ``key_encoding``."""
        if isinstance(key, (bytes, bytearray)):
            return bytes(key)
        return key.encode(self.key_encoding)

    def _read(self, size):
        """Read up to ``size`` bytes from the current position and advance it."""
        if self.mem:
            chunk = self.memread[self.off:self.off + size]
            # Advance like a real file so both modes behave the same.
            self.off += len(chunk)
            return chunk
        return self.fd.read(size)

    def _seek(self, seek, flag=False):
        """Move the read position to ``seek``.

        ``flag`` is the ``whence`` value handed to the file object; the
        default ``False`` equals ``os.SEEK_SET``.  Memory mode always treats
        ``seek`` as an absolute offset and raises ``Exception`` when it lies
        outside the loaded buffer.
        """
        if self.mem:
            # Validate the requested offset, not the previous one: a bad
            # target must be rejected here rather than poison later seeks.
            if seek < 0 or seek > self.len:
                raise Exception('Mem offset !')
            self.off = seek
        else:
            self.fd.seek(seek, int(flag))

    def _close(self):
        """Release the memory buffer and the file object, in either mode."""
        self.memread = None
        self.off = 0
        self.len = 0
        fd, self.fd = self.fd, False
        if fd is not False:
            fd.close()

    def Get(self, key):
        """Return the value stored under ``key``, or False when there is none.

        False is also returned for an empty key, for a key longer than
        ``XDB_MAXKLEN`` bytes and for a record with an empty value.  Raises
        ``Exception`` when no database is open.
        """
        if self.fd is False:
            raise Exception('XDB:Get(), null db handler.')
        key = self._key_bytes(key)
        klen = len(key)
        if klen == 0 or klen > XDB_MAXKLEN:
            return False
        rec = self._get_record(key)
        if rec.get('vlen', 0) <= 0:
            return False
        return rec['value']

    def Close(self):
        """Close the database if one is open; otherwise do nothing."""
        if self.fd is False:
            return
        self._close()

    def _get_index(self, key):
        """Return the hash bucket number for ``key``.

        The key bytes are folded in from last to first with
        ``h = (h * 33) ^ byte`` truncated to 31 bits, then reduced modulo
        ``hash_prime``.
        """
        h = self.hash_base
        # bytearray yields integers on every Python version, so no ord().
        for byte in reversed(bytearray(self._key_bytes(key))):
            h += (h << 5)
            h ^= byte
            h &= 0x7fffffff
        return h % self.hash_prime

    def _check_header(self, fd):
        """Validate the 32-byte header of ``fd`` and load the hash settings.

        Returns False when the header is short, the tag does not match
        ``XDB_TAGNAME`` or the recorded size differs from the real file
        size; returns True otherwise, after storing ``hash_base``,
        ``hash_prime``, ``version`` and ``fsize`` on the instance.
        """
        fd.seek(0, os.SEEK_SET)
        buf = fd.read(32)
        if len(buf) != 32:
            return False
        # The check and reserved fields are unpacked but not used.
        tag, ver, base, prime, fsize, _check, _reserved = struct.unpack('3s B I I I f 12s', buf)
        expected_tag = XDB_TAGNAME
        if not isinstance(expected_tag, bytes):
            # The unpacked tag is bytes; compare like with like.
            expected_tag = expected_tag.encode('ascii')
        if tag != expected_tag:
            return False
        if os.fstat(fd.fileno()).st_size != fsize:
            return False
        self.hash_base = base
        self.hash_prime = prime
        self.version = ver
        self.fsize = fsize
        return True

    def _get_record(self, key):
        """Look up ``key``: read its bucket slot, then search that bucket's tree.

        Returns the dictionary produced by ``_tree_get_record``.
        """
        key = self._key_bytes(key)
        self._io_times = 1
        index = self._get_index(key) if self.hash_prime > 1 else 0
        poff = index * 8 + 32
        self._seek(poff, os.SEEK_SET)
        buf = self._read(8)
        if len(buf) == 8:
            off, length = struct.unpack('I I', buf)
        else:
            # Short read: treat the bucket as empty.
            off, length = 0, 0
        return self._tree_get_record(off, length, poff, key)

    def _tree_get_record(self, off, len, poff=0, key=''):
        """Search the tree whose root node is at ``off`` (``len`` bytes long).

        ``poff`` is the file position of the ``(offset, length)`` pair that
        points at the current node.  Returns ``{'poff': poff}`` when the key
        is absent or a node is too short to hold its 17-byte header.  On a
        match, returns a dictionary with the node header fields plus
        ``poff``, ``off``, ``len``, ``voff``, ``vlen``, ``key`` and
        ``value``.  An empty ``key`` matches the first node visited.
        """
        # NOTE: the ``len`` parameter shadows the builtin inside this method.
        key = self._key_bytes(key)
        # Walk iteratively so a deep tree cannot hit the recursion limit.
        while True:
            if len == 0:
                return {'poff': poff}
            self._io_times += 1
            self._seek(off, os.SEEK_SET)
            # One read covers the node header plus the longest allowed key.
            buf = self._read(min(XDB_MAXKLEN + 17, len))
            try:
                loff, llen, roff, rlen, klen = struct.unpack('I I I I B', buf[0:17])
            except struct.error:
                # Truncated node: report "not found", as _get_record does
                # for a short bucket slot.
                return {'poff': poff}
            fkey = buf[17:17 + klen]
            if key and key != fkey:
                if key > fkey:
                    # The right-child pair sits 8 bytes into the node.
                    off, len, poff = roff, rlen, off + 8
                else:
                    # The left-child pair sits at the start of the node.
                    off, len, poff = loff, llen, off
                continue
            voff = off + 17 + klen
            vlen = len - 17 - klen
            rec = {
                'loff': loff, 'llen': llen, 'roff': roff, 'rlen': rlen,
                'klen': klen, 'poff': poff, 'off': off, 'len': len,
                'voff': voff, 'vlen': vlen, 'key': fkey,
            }
            if vlen > 0:
                self._seek(voff, os.SEEK_SET)
                rec['value'] = self._read(vlen)
            else:
                rec['value'] = b''
            return rec
# Example usage (memory mode):
#aa = XDB_R(True)
#aa.Open('./dict.xdb')
#aab = aa.Get('上海')
#print aab
# This article comes from 源码世界 (Source Code World): http://www.ymsky.net/views/65091.shtml
# pyscws4 is a Python word-segmentation program -- 布布扣, bubuko.com
# pyscws4 is a Python word-segmentation program