# vivo工具开发:在线计算语言模型打分

import json
import time
import uuid
from tqdm import tqdm
from json import JSONDecodeError
from flask import Flask, jsonify, request
import logging

from call_tokenize import get_tokenize_and_query

# Flask application object; the routes below are registered on it.
app = Flask(__name__)

# N-gram table filled by arpa_read(): maps the n-gram text (second column of
# the file) to a (v1, v2) tuple parsed from the first and third columns.
kv_dict = {}


def arpa_read(files):
    """Load n-gram entries from a tab-separated file into ``kv_dict``.

    A usable line has two or three TAB-separated fields:
    ``value<TAB>ngram`` or ``value<TAB>ngram<TAB>second_value``.  The n-gram
    text becomes the key and the value stored is ``(v1, v2)``, where ``v2``
    is ``0`` when the third field is absent.  Lines with any other number of
    fields, or whose numeric fields cannot be parsed, are skipped.

    Args:
        files: Path of the file to read (opened as UTF-8 text).

    Returns:
        The module-level ``kv_dict``, updated in place.  An n-gram that
        appears more than once keeps the values of its last occurrence.
    """
    with open(files, mode='r', encoding='UTF-8') as f1:
        # Read everything up front so tqdm gets a sized list to iterate.
        arpa_list = f1.readlines()

    for line in tqdm(arpa_list):
        fields = line.strip("\n").split("\t")
        if len(fields) not in (2, 3):
            # Lines without exactly 2 or 3 fields carry no entry.
            continue

        try:
            v1 = float(fields[0])
            v2 = float(fields[2]) if len(fields) == 3 else 0
        except ValueError:
            # Only a non-numeric column is an expected failure here; skip the
            # malformed line and keep loading instead of hiding every error.
            continue

        kv_dict[fields[1]] = (v1, v2)

    return kv_dict


#arpa_read("D:\\Users\\72152411\\Documents\\vchat\\ChatFiles\\trainfile.lm")  # call the function to convert the arpa file into a dict
# Runs at import time, so kv_dict is filled as soon as this module is loaded.
arpa_read("./1_9_arpa")  # call the function to convert the arpa file into a dict


def _score(sentence):
    """Score a space-separated sentence against the n-gram table ``kv_dict``.

    ``"<s>"`` is prepended as the sentence-start token.  The first word is
    scored as a bigram with ``"<s>"``; every following word is scored as a
    trigram with its two predecessors.  When an n-gram is missing from the
    table, the score backs off to the next shorter n-gram and adds the
    back-off value (second tuple element) of the context that was dropped,
    when the table has one.

    Args:
        sentence: Words separated by single spaces.  Leading, trailing and
            repeated spaces are ignored.

    Returns:
        The sum of the per-word scores.  For an empty sentence this is the
        first value stored for ``"<s>"``.  Returns ``-1`` if scoring fails
        for any reason (for example a word that is not in ``kv_dict``); the
        failure is logged through ``app.logger``.
        NOTE(review): if the table holds log-probabilities, ``-1`` can also
        be a legitimate score, so callers cannot tell the two apart.
    """

    def bigram_score(prev, word):
        """Score ``word`` after ``prev``, backing off to the unigram."""
        entry = kv_dict.get(prev + " " + word)
        if entry is not None:
            return entry[0]
        # Back off: unigram value of the word plus the back-off of its context.
        # A KeyError here (unknown word) is handled by the caller's except.
        return kv_dict[word][0] + kv_dict[prev][1]

    def trigram_score(first, second, third):
        """Score ``third`` after ``first second``, backing off as needed."""
        entry = kv_dict.get(" ".join((first, second, third)))
        if entry is not None:
            return entry[0]

        # Back-off value of the dropped two-word context, 0 when not listed.
        context = kv_dict.get(first + " " + second)
        context_bow = context[1] if context is not None else 0

        # Back off to the bigram "second third".
        bigram = kv_dict.get(second + " " + third)
        if bigram is not None:
            return bigram[0] + context_bow

        # Back off to the unigram "third".
        if third not in kv_dict:
            raise ValueError("word not in n-gram table: %r" % (third,))
        second_entry = kv_dict.get(second)
        second_bow = second_entry[1] if second_entry is not None else 0
        return kv_dict[third][0] + second_bow + context_bow

    try:
        # Drop empty tokens so leading or repeated spaces do not produce ""
        # words, which would make the whole sentence fail.
        words = [tok for tok in sentence.strip().split(" ") if tok]
        tokens = ["<s>"] + words

        if len(tokens) == 1:  # only "<s>"
            return kv_dict[tokens[0]][0]

        total_score = bigram_score(tokens[0], tokens[1])
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            total_score += trigram_score(first, second, third)
        return total_score

    except Exception as e:
        app.logger.exception(e)
        return -1


@app.route('/ngram_score', methods=['POST'])
def score():
    """Handle ``POST /ngram_score``: score the form field ``sentence``.

    Returns:
        A JSON response ``{"score": <value>}`` where the value comes from
        ``_score``, or ``('Internal error', 500)`` if anything raises while
        handling the request (including a missing ``sentence`` field).
    """
    try:
        parameters = request.form
        print(parameters)
        sentence = parameters['sentence']
        print(sentence)

        response = jsonify({"score": _score(sentence)})
    except Exception as e:
        # Record the failure, as input_score does, instead of discarding it.
        app.logger.exception(e)
        response = 'Internal error', 500

    return response


@app.route('/input_score', methods=['POST'])
def input_score():
    """Handle ``POST /input_score``: tokenize raw input and score each result.

    Reads the form fields ``user_input`` and ``keyboard_type``, passes them
    to ``get_tokenize_and_query`` and scores every returned candidate with
    ``_score``.

    Returns:
        A JSON list with one ``[sentence, ngram_score, tokenizer_score]``
        entry per candidate, where ``sentence`` is the candidate's words
        joined by single spaces.  On any exception the error is logged and
        ``('Internal error', 500)`` is returned.
    """
    try:
        form = request.form
        candidates = get_tokenize_and_query(form['user_input'], form['keyboard_type'])

        scored = []
        for tokens, tokenizer_score in candidates:
            text = " ".join(tokens)
            scored.append((text, _score(text), tokenizer_score))

        return jsonify(scored)
    except Exception as err:
        app.logger.exception(err)
        return 'Internal error', 500


if __name__ == '__main__':
    # When run as a script, serve the app on all interfaces (0.0.0.0), port 9494.
    app.run(host='0.0.0.0', port=9494)

 

# 上一篇:Ceph性能瓶颈分析与优化(混合盘篇)


# 下一篇:Asp.Net Core中的 IdentityServer4 (转自Jlion)