pyscws4 是一个python的分词程序

#coding=gbk
import os
import struct
import sys
# Python 2 only: switch the interpreter's implicit str/unicode codec to GBK.
# reload() is needed because Python 2's site module removes
# sys.setdefaultencoding at startup.  Python 3 has neither the builtin
# reload() nor setdefaultencoding(), so the whole step is skipped there
# instead of failing at import time with a NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('gbk')
XDB_VERSION = 34  # format version number, valid range 0x01 ~ 0xff
XDB_TAGNAME = 'XDB'  # tag expected in the first bytes of an XDB file
XDB_MAXKLEN = 0xf0  # maximum key length in bytes (must stay < 255)

class XDB_R(object):
    """Read-only reader for an XDB key/value dictionary file.

    File layout, as parsed by this class:

    * a 32-byte header: 3-byte tag, 1-byte version, three unsigned ints
      (hash base, hash prime, file size), one float and 12 trailing bytes;
    * a hash table starting at offset 32 with one 8-byte slot per hash
      value, each slot holding the (offset, length) of a tree root record;
    * records made of a 17-byte node header (left offset, left length,
      right offset, right length, key length) followed by the key bytes
      and then the value bytes.

    Keys and values are byte strings (``str`` on Python 2, ``bytes`` on
    Python 3).  With ``mem=True`` the whole file is loaded into memory on
    ``Open`` and every lookup is served from that buffer.
    """

    fd = False  # open file object, or False while no database is open
    hash_base = 0  # hash seed taken from the file header
    hash_prime = 0  # hash modulus (number of slots) taken from the header
    memread = None  # full file contents when in-memory mode is enabled
    mem = False  # whether in-memory mode is enabled
    off = 0  # current read position inside memread
    len = 0  # length of memread in bytes
    _io_times = 0  # seek/read rounds performed by the latest lookup

    def __init__(self, mem=False):
        """Create a reader; ``mem=True`` selects in-memory mode."""
        self.mem = mem

    def __del__(self):
        self.Close()

    def Open(self, fpath):
        """Open the XDB file at ``fpath``, closing any database open before.

        Returns True on success.  Raises Exception when the file cannot be
        opened or when its header is not a valid XDB header.  On failure
        the file handle is closed and the reader is left with no database
        attached.
        """
        self.Close()
        try:
            fd = open(fpath, 'rb')
        except IOError:
            raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb failed.')
        try:
            # Validate before publishing the handle so that a rejected file
            # is never left attached to (or held open by) this reader.
            if self._check_header(fd) is False:
                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb format.')
            if self.mem:
                fd.seek(0, os.SEEK_SET)
                self.memread = fd.read()
                self.len = len(self.memread)
                self.off = 0
        except Exception:
            fd.close()
            self.memread = None
            self.len = 0
            raise
        self.fd = fd
        return True

    def _read(self, size):
        """Return up to ``size`` bytes from the current position and advance it."""
        if self.mem:
            chunk = self.memread[self.off:self.off + size]
            # Advance like a file read would, so both modes behave alike.
            self.off += len(chunk)
            return chunk
        return self.fd.read(size)

    def _seek(self, seek, flag=False):
        """Move the read position to ``seek``.

        ``flag`` is the ``whence`` argument passed to the file object's
        ``seek``; the default ``False`` equals 0.  In-memory mode ignores
        ``flag`` and always treats ``seek`` as an absolute offset.

        Raises Exception in in-memory mode when ``seek`` lies outside the
        buffer; the current position is left untouched in that case.
        """
        if self.mem:
            # Validate the requested offset, not the previous one: otherwise
            # a bad seek is accepted and every later seek fails instead.
            if seek < 0 or seek > self.len:
                raise Exception('Mem offset !')
            self.off = seek
        else:
            self.fd.seek(seek, flag)

    def _close(self):
        """Release the in-memory buffer and close the underlying file."""
        self.memread = None
        self.off = 0
        self.len = 0
        # The handle is kept open in in-memory mode too, so close it in
        # both modes.
        if self.fd is not False:
            self.fd.close()
        self.fd = False

    def Get(self, key):
        """Return the value stored under ``key``, or False when absent.

        False is also returned for an empty key, a key longer than
        XDB_MAXKLEN bytes, or a record whose value is empty.  Raises
        Exception when no database is open.
        """
        if self.fd is False:
            raise Exception('XDB:Get(), null db handler.')
        klen = len(key)
        if klen == 0 or klen > XDB_MAXKLEN:
            return False
        rec = self._get_record(key)
        if 'vlen' not in rec or rec['vlen'] == 0:
            return False
        return rec['value']

    def Close(self):
        """Close the database if one is open; otherwise do nothing."""
        if self.fd is False:
            return
        self._close()

    def _get_index(self, key):
        """Return the hash slot index of ``key`` (a byte string).

        The key bytes are folded from last to first with h = h * 33 ^ byte,
        masked to 31 bits, starting from ``hash_base``; the result is
        reduced modulo ``hash_prime``.
        """
        h = self.hash_base
        # bytearray yields integer byte values on both Python 2 and 3.
        for byte in reversed(bytearray(key)):
            h += (h << 5)
            h ^= byte
            h &= 0x7fffffff
        return h % self.hash_prime

    def _check_header(self, fd):
        """Validate the 32-byte header of ``fd`` and load its hash settings.

        Returns False when the header is short, the tag does not match
        XDB_TAGNAME, or the recorded file size differs from the real one.
        Otherwise stores ``hash_base``, ``hash_prime``, ``version`` and
        ``fsize`` on the instance and returns True.  The float and the 12
        trailing header bytes are not inspected.
        """
        fd.seek(0, os.SEEK_SET)
        buf = fd.read(32)
        if len(buf) != 32:
            return False
        tag, ver, base, prime, fsize = struct.unpack('3s B I I I f 12s', buf)[:5]
        # The tag read from a binary file is a byte string; bring the
        # expected tag to the same type before comparing.
        expected = XDB_TAGNAME
        if not isinstance(expected, bytes):
            expected = expected.encode('ascii')
        if tag != expected:
            return False
        if os.fstat(fd.fileno()).st_size != fsize:
            return False
        self.hash_base = base
        self.hash_prime = prime
        self.version = ver
        self.fsize = fsize
        return True

    def _get_record(self, key):
        """Look up ``key`` and return its record dict.

        Reads the key's hash slot to find the tree root, then searches the
        tree.  A missing key yields a dict holding only ``'poff'``.
        """
        self._io_times = 1
        index = self._get_index(key) if self.hash_prime > 1 else 0
        poff = index * 8 + 32  # slots are 8 bytes each, after the header
        self._seek(poff, os.SEEK_SET)
        buf = self._read(8)
        if len(buf) == 8:
            root_off, root_len = struct.unpack('I I', buf)
        else:
            # A short read is treated as an empty slot.
            root_off, root_len = 0, 0
        return self._tree_get_record(root_off, root_len, poff, key)

    def _tree_get_record(self, off, len, poff=0, key=''):
        """Search the binary tree rooted at (``off``, ``len``) for ``key``.

        ``poff`` is the file offset of the pointer that referenced the
        current node.  An empty ``key`` matches the first node visited.

        Returns ``{'poff': poff}`` when the search reaches an empty child.
        On a match returns a dict with the node fields (``loff``, ``llen``,
        ``roff``, ``rlen``, ``klen``) plus ``poff``, ``off``, ``len``,
        ``voff``, ``vlen``, ``key`` and ``value``.
        """
        # NOTE: the parameter named ``len`` hides the builtin inside this
        # method; it is kept for interface compatibility.
        while True:
            if len == 0:
                return {'poff': poff}
            self._io_times += 1
            self._seek(off, os.SEEK_SET)
            # Read the node header plus the longest possible key, but never
            # more than the record itself.
            chunk = XDB_MAXKLEN + 17
            if chunk > len:
                chunk = len
            buf = self._read(chunk)
            fields = struct.unpack('I I I I B', buf[0:17])
            rec = {
                'loff': fields[0],
                'llen': fields[1],
                'roff': fields[2],
                'rlen': fields[3],
                'klen': fields[4],
            }
            fkey = buf[17:17 + rec['klen']]
            # Three-way comparison written without cmp(), which Python 3
            # no longer provides.
            order = (key > fkey) - (key < fkey) if key else 0
            if order > 0:
                # The right-child pointer sits 8 bytes into the node header.
                poff, off, len = off + 8, rec['roff'], rec['rlen']
            elif order < 0:
                # The left-child pointer sits at the start of the node header.
                poff, off, len = off, rec['loff'], rec['llen']
            else:
                rec['poff'] = poff
                rec['off'] = off
                rec['len'] = len
                rec['voff'] = off + 17 + rec['klen']
                rec['vlen'] = len - 17 - rec['klen']
                rec['key'] = fkey
                self._seek(rec['voff'], os.SEEK_SET)
                rec['value'] = self._read(rec['vlen'])
                return rec
#


# Usage example:
#aa = XDB_R(True)
#aa.Open('./dict.xdb')
#aab = aa.Get('上海')
#print aab


本文章来自源码世界    http://www.ymsky.net/views/65091.shtml

pyscws4 是一个python的分词程序,布布扣,bubuko.com

pyscws4 是一个python的分词程序

上一篇:Java接口回调是个什么玩意儿


下一篇:vue-cli3构建的项目使用compression-webpack-plugin 进行Gzip压缩