数据结构-字典树

数据结构-字典树

什么是字典树

字典树又称Trie,单词查找树,典型用于统计,查找大量字符串,典型应用有通讯录,前缀搜索,搜索自动补全,打字补全等。如下图所示,树中存储单词cat,dog,deer, pan, panda
数据结构-字典树

BST搜索树的时间复杂度为O(logn), trie的时间复杂度为O(w),w为单词长度

源代码

基于TreeMap的存储方案

public class Trie {

    private class Node{

        public int use;//字符使用次数
        public int weight;//某个单词的权重
        public TreeMap<Character,Node> next;//映射

        public Node(){
            next = new TreeMap<>();
        }
    }


    private Node root;//根
    private int size;//字典树总单词数


    public Trie(){
        root = new Node();
        size = 0;
    }

    /**
     * 判断某个节点是否是某个单词的结束
     * @param node
     * @return
     */
    private boolean isWord(Node node){
        if(null==node)
            return false;
        return node.weight!=0;
    }

    // 获得Trie中存储的单词数量
    public int size(){
        return size;
    }

    /**
     * 添加一个单词
     * @param word
     */
    public void add(String word){
        char[] cs = checkWord(word);
        add(cs,cs.length);
    }

    /**
     * 添加一个单词
     * @param word 单词
     * @param weight 权重
     */
    public void add(String word, int weight){
        if(0==weight)
            throw new RuntimeException("weight must not zero");
        char[] cs = checkWord(word);
        add(cs,weight);
    }

    private void add(char[] cs, int weight){
        Node cur = root;
        for(char c:cs){
            if(null==cur.next.get(c)){
                cur.next.put(c,new Node());
            }
            cur = cur.next.get(c);
            cur.use++;
        }
        if(!isWord(cur)){
            ++size;
            cur.weight = weight;
        }
    }

    /**
     * 带.的匹配
     * @param pattern
     * @return
     */
    public boolean match(String pattern){
        char[] cs = checkWord(pattern);
        return match(root,cs,0);
    }

    private boolean match(Node node,char[] cs,int index){
        if(index==cs.length){
            return isWord(node);
        }
        char c = cs[index];
        if('.'!=c){
            Node cur = node.next.get(c);
            if(null==cur)
                return false;
            return match(cur,cs,index+1);
        }else{
            for(char ch:node.next.keySet()){
                if(match(node.next.get(ch),cs,index+1)){
                    return true;
                }
            }
            return false;
        }

    }

    /**
     * 是否包含某个单词
     * @param word
     * @return
     */
    public boolean contains(String word){
        char[]cs = checkWord(word);
        Node cur = root;
        for(char c:cs){
            if(null==cur.next.get(c)){
                return false;
            }
            cur = cur.next.get(c);
        }
        return isWord(cur);
    }


    /**
     * 查询是否在Trie中有单词以prefix为前缀
     * @param prefix
     * @return
     */
    public boolean isPrefix(String prefix){
        char[]cs = checkWord(prefix);
        Node cur = root;
        for(char c:cs){
            if(null==cur.next.get(c)){
                return false;
            }
            cur = cur.next.get(c);
        }
        return true;
    }


    private char[] checkWord(String word){
        if(null==word||word.isEmpty())
            throw new RuntimeException("word must not null or empty");
        return word.toCharArray();
    }

    /**
     * 某个单词的权重
     * @param word
     * @return
     */
    public int getWeight(String word){
        char[]cs = checkWord(word);
        Node cur = root;

        for(char c:cs){
            if(cur.next.get(c)==null)
                throw new RuntimeException("could not find word:"+word);
            cur = cur.next.get(c);
        }
        if(!isWord(cur)){
            throw new RuntimeException("could not find word:"+word);
        }
        return cur.weight;
    }


    /**
     * 找出以prefix为前缀的所有单词的权重和
     * @param prefix
     * @return
     */
    public int sumWeight(String prefix){
        char[]cs = checkWord(prefix);
        Node cur = root;
        for(char c:cs){
            if(cur.next.get(c)==null){
                return 0;
            }
            cur = cur.next.get(c);
        }
        return sumWeight(cur);
    }

    private int sumWeight(Node node){
        int ans = node.weight;
        for(char c:node.next.keySet()){
            ans+= sumWeight(node.next.get(c));
        }
        return ans;
    }

    /**
     *
     * @return 所有单词
     */
    public java.util.Set<String> getWords(){
        java.util.Set<String> words = new TreeSet<String>();
        getWords(root,words,"");
        return words;
    }

    private void getWords(Node node, Set<String> words,String word) {
        if(isWord(node)){
            words.add(word);
        }
        for(char c:node.next.keySet()){
            getWords(node.next.get(c),words,word+c);
        }
    }

    /**
     * 删除某个单词
     * @param word
     * @return
     */
    public boolean delete(String word){
        if(!contains(word)){
            throw new RuntimeException("word is not exists");
        }
        char[] cs = checkWord(word);
        Node cur = root;
        for(char c:cs){
            //如果某个节点只使用了一次,而且只有一个字节点,直接删除即可
            if(cur.next.get(c).use==1&&cur.next.size()==1){
                cur.next.remove(c);
                cur = null;
                break;
            }
            //否则置为下一个节点
            cur = cur.next.get(c);
            cur.use--;
        }
        //如果走完了,是一个单词的话,置为不是单词
        if(isWord(cur)){
            cur.weight = 0;
        }
        size--;
        return true;
    }
}


测试代码

public static void main(String[] args) {
        Trie trie = new Trie();
        trie.add("hello",1);
        trie.add("helloooo");
        System.out.println(trie.match("he..o"));
        trie.add("cat",1);
        trie.add("dog",1);
        trie.add("deer",1);
        trie.add("pan",1);
        trie.add("panda",1);

        System.out.println(trie.getWords());
        System.out.println(trie.getWeight("hello"));
//        System.out.println(trie.getWeight("hel"));

        System.out.println(trie.sumWeight("hel"));

        trie.delete("pan");
        //trie.delete("panda");
        System.out.println(trie.getWords());

    }

基于数组的存储方式

public class Trie {

    private class Node{

        public Node[] nodes;
        public boolean isWord;

        public Node(){
            nodes = new Node[26];
        }


        public Node get(char c){
            return nodes[c-'a'];
        }

        public boolean containsKey(char c){
            return get(c)!=null;
        }

        public void put(char c ,Node node){
            nodes[c-'a'] = node;
        }
    }

    private int size;
    private Node root;
    public Trie(){
        root = new Node();
    }


    public void add(String word){
        char[] cs = checkWord(word);
        Node cur = root;
        for(char c:cs){
            if(!cur.containsKey(c)){
                cur.put(c,new Node());
            }
            cur = cur.get(c);
        }
        if(!cur.isWord){
            cur.isWord = true;
            ++size;
        }
    }

    private char[] checkWord(String word){
        if(null==word||word.isEmpty())
            throw new RuntimeException("word must not null or empty");
        char[]cs = word.toCharArray();

        for(char c:cs){
            if(!(c>='a'&&c<='z')){
                throw new RuntimeException("字符必须a~z");
            }
        }
        return cs;
    }

    private char[] checkPattern(String pattern){
        if(null==pattern||pattern.isEmpty())
            throw new RuntimeException("pattern must not null or empty");
        char[]cs = pattern.toCharArray();

        for(char c:cs){
            if('.'==c)
                continue;
            if(!(c>='a'&&c<='z')){
                throw new RuntimeException("字符必须a~z");
            }
        }
        return cs;
    }

    public boolean contains(String word){
        char[] cs = checkWord(word);
        Node cur = root;
        for(char c:cs){
            if(!cur.containsKey(c))
                return false;
            cur = cur.get(c);
        }
        return cur.isWord;
    }

    //pattern可以写.匹配一个字符
    public boolean match(String pattern){
        char[] cs = checkPattern(pattern);

        return match(root,cs,0);
    }

    private boolean match(Node node, char[] cs, int index) {
        if(null==node)
            return false;

        if(index==cs.length){
            return node.isWord;
        }

        if(cs[index]=='.'){
            for(Node n:node.nodes){
                if(match(n,cs,index+1)){
                    return true;
                }
            }
            return false;
        }else{
            if(!node.containsKey(cs[index])){
                return false;
            }
            return match(node.get(cs[index]),cs,index+1);
        }
    }


    // 查询是否在Trie中有单词以prefix为前缀
    public boolean isPrefix(String prefix){
        char[] cs = checkWord(prefix);
        Node cur = root;
        for(char c:cs){
            if(!cur.containsKey(c))
                return false;
            cur = cur.get(c);
        }
        return true;
    }


    public int size() {
        return size;
    }


    public static void main(String[] args) {
        Trie trie = new Trie();
        trie.add("hello");
        System.out.println(trie.contains("hello"));
        System.out.println(trie.match("he.o"));
        System.out.println(trie.isPrefix("he"));
    }
}
部分算法解析

match算法

private boolean match(Node node,char[] cs,int index){
        if(index==cs.length){
            return isWord(node);
        }
        char c = cs[index];
        //不是. 就直接match下一个字符
        if('.'!=c){
            Node cur = node.next.get(c);
            if(null==cur)
                return false;
            return match(cur,cs,index+1);
        }else{
        	//是.的话需要匹配所有可能的路径,有一个满足即可
            for(char ch:node.next.keySet()){
                if(match(node.next.get(ch),cs,index+1)){
                    return true;
                }
            }
            return false;
        }

    }

数据结构-字典树

delete算法

public boolean delete(String word){
        if(!contains(word)){
            throw new RuntimeException("word is not exists");
        }
        char[] cs = checkWord(word);
        Node cur = root;
        for(char c:cs){
            //如果某个节点只使用了一次,而且只有一个字节点,直接删除即可
            if(cur.next.get(c).use==1&&cur.next.size()==1){
                cur.next.remove(c);
                cur = null;
                break;
            }
            //否则置为下一个节点
            cur = cur.next.get(c);
            cur.use--;
        }
        //如果走完了,是一个单词的话,置为不是单词
        if(isWord(cur)){
            cur.weight = 0;
        }
        size--;
        return true;
    }

数据结构-字典树
case1: 删除cat,从根节点出发,遍历到c的时候发现c只被使用了一次,直接从根节点的TreeMap移除掉key为c即可

case2: 删除dog, 从根节点出发,遍历到d的时候发现d被使用了两次,不可以删除d,当前节点置为下一个节点o,并且把d的使用次数减一,发现o的使用次数为1,直接从d的TreeMap移除掉key为o即可

case3: 删除pan, 从根节点出发,遍历到p的时候发现p被使用了两次, p的use–, cur=p, 发现a被使用了两次,a的use–, cur=a, 发现n被使用了两次, n的use–, cur=n, 循环执行完毕后发现n节点是pan的尾节点,把cur的weight改为0,标识pan不是单词

case4: 删除panda,p,a,n的use依次减一,然后直接从n的TreeMap移除掉key为d即可

参考

慕课网liuyubobo玩转数据结构

上一篇:python基础练习题(题目 递归求阶乘)


下一篇:2021-07-21