vosk实时语音识别

vosk介绍以及安装,参考地址:https://blog.csdn.net/qq_35385687/article/details/119209189?spm=1001.2014.3001.5501

文章目录

命令行方式直接转写

#!/usr/bin/env python3

import argparse
import os
import queue
import sounddevice as sd
import sys
import vosk

q = queue.Queue()


def int_or_str(text):
    """Helper function for argument parsing."""
    try:
        return int(text)
    except ValueError:
        return text


def callback(indata, frames, time, status):
    """This is called (from a separate thread) for each audio block."""
    if status:
        print(status, file=sys.stderr)
    q.put(bytes(indata))


parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
    '-l', '--list-devices', action='store_true',
    help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
    print(sd.query_devices())
    parser.exit(0)
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    parents=[parser])
parser.add_argument(
    '-f', '--filename', type=str, metavar='FILENAME',
    help='audio file to store recording to')
parser.add_argument(
    '-m', '--model', type=str, metavar='MODEL_PATH',
    help='Path to the model')
parser.add_argument(
    '-d', '--device', type=int_or_str,
    help='input device (numeric ID or substring)')
parser.add_argument(
    '-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)

try:
    if args.model is None:
        args.model = "model"
    if not os.path.exists(args.model):
        print("Please download a model for your language from https://alphacephei.com/vosk/models")
        print("and unpack as 'model' in the current folder.")
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])

    model = vosk.Model(args.model)

    if args.filename:
        dump_fn = open(args.filename, "wb")
    else:
        dump_fn = None

    with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000, device=args.device, dtype='int16',
                           channels=1, callback=callback):
        print('#' * 80)
        print('Press Ctrl+C to stop the recording')
        print('#' * 80)

        rec = vosk.KaldiRecognizer(model, args.samplerate)
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())
            if dump_fn is not None:
                dump_fn.write(data)

except KeyboardInterrupt:
    print('\nDone')
    parser.exit(0)
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))

websoket实现实时转写

#!/usr/bin/env python3

import argparse
import os
import queue
import sounddevice as sd
import sys
import vosk

q = queue.Queue()


def int_or_str(text):
    """Helper function for argument parsing."""
    try:
        return int(text)
    except ValueError:
        return text


def callback(indata, frames, time, status):
    """This is called (from a separate thread) for each audio block."""
    if status:
        print(status, file=sys.stderr)
    q.put(bytes(indata))


parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
    '-l', '--list-devices', action='store_true',
    help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
    print(sd.query_devices())
    parser.exit(0)
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    parents=[parser])
parser.add_argument(
    '-f', '--filename', type=str, metavar='FILENAME',
    help='audio file to store recording to')
parser.add_argument(
    '-m', '--model', type=str, metavar='MODEL_PATH',
    help='Path to the model')
parser.add_argument(
    '-d', '--device', type=int_or_str,
    help='input device (numeric ID or substring)')
parser.add_argument(
    '-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)

try:
    if args.model is None:
        args.model = "model"
    if not os.path.exists(args.model):
        print("Please download a model for your language from https://alphacephei.com/vosk/models")
        print("and unpack as 'model' in the current folder.")
        parser.exit(0)
    if args.samplerate is None:
        device_info = sd.query_devices(args.device, 'input')
        # soundfile expects an int, sounddevice provides a float:
        args.samplerate = int(device_info['default_samplerate'])

    model = vosk.Model(args.model)

    if args.filename:
        dump_fn = open(args.filename, "wb")
    else:
        dump_fn = None

    with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000, device=args.device, dtype='int16',
                           channels=1, callback=callback):
        print('#' * 80)
        print('Press Ctrl+C to stop the recording')
        print('#' * 80)

        rec = vosk.KaldiRecognizer(model, args.samplerate)
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())
            if dump_fn is not None:
                dump_fn.write(data)

except KeyboardInterrupt:
    print('\nDone')
    parser.exit(0)
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))

前端获取pcm实时传输至后台

<html>

<head>
    <meta charset="UTF-8">
    <title>Simple Recorder.js demo with record, stop and pause</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!-- 控制宽度的自动适应 -->
    <style type="text/css">
        .comments {
            width: 100%; /*自动适应父布局宽度*/
            overflow: auto;
            word-break: break-all;
            /*在ie中解决断行问题(防止自动变为在一行显示,主要解决ie兼容问题,ie8中当设宽度为100%时,文本域类容超过一行时,
            当我们双击文本内容就会自动变为一行显示,所以只能用ie的专有断行属性“word-break或word-wrap”控制其断行)*/
        }
    </style>
</head>

<body>
<div id="controls">
    <button id="recordButton">Record</button>
    <button id="stopButton">Stop</button>
</div>

<textarea id="textResult" class="comments" rows="10" cols="10"></textarea>

</body>
<script type="text/javascript" src="./js/recorder3.js"></script>
<script>

    var ws = null; //实现WebSocket

    var interval; // 定时器

    let recorder = new Recorder({
        sampleBits: 16,                 // 采样位数,支持 8 或 16,默认是16
        sampleRate: 16000,              // 采样率,支持 11025、16000、22050、24000、44100、48000,根据浏览器默认值,
        numChannels: 1,                 // 声道,支持 1 或 2, 默认是1
        // compiling: false,(0.x版本中生效,1.x增加中)  // 是否边录边转换,默认是false
        compiling: true
    });

    var recordButton = document.getElementById("recordButton");
    var stopButton = document.getElementById("stopButton");
    var textResult = document.getElementById("textResult");

    recordButton.addEventListener("click", startRecording);
    stopButton.addEventListener("click", stopRecording);

    // 录音
    function startRecording() {
        console.log("recordButton clicked");
        recorder.start().then(() => {
            // 开始录音
            useWebSocket();
        }, (error) => {
            // 出错了
            console.log(`出错了`);
        });

    }

    // 停止录音
    function stopRecording() {
        console.log("stopButton clicked", recorder.getPCMBlob());

        recorder.stop();

        if (ws) {
            ws.close();
        }

        clearInterval(interval);

        textResult.innerText = '';

        // recorder.getPCMBlob();
        // recorder.downloadPCM('aaa');

    }

    /*
    * WebSocket
    */
    function useWebSocket() {
        // console.log(recorder.getNextData())
        ws = new WebSocket("ws://localhost:5678");

        ws.binaryType = 'arraybuffer'; //传输的是 ArrayBuffer 类型的数据
        ws.onopen = function () {
            console.log('握手成功');
            if (ws.readyState === 1) { //ws进入连接状态,则每隔500毫秒发送一包数据
                interval = setInterval(() => {
                    // recorder.getNextData();
                    // recorder.getWholeData();
                    // console.log(recorder.getNextData());
                    ws.send(recorder.getNextData());
                }, 500)

            }

        };

        ws.onmessage = function (msg) {
            var jsonStr = msg.data;
            console.info(jsonStr);
            textResult.innerText = jsonStr;
            autoTextarea(document.getElementById("textResult"));
        };

        ws.onerror = function (err) {
            console.error(err);
            textResult.innerText = '';
        };

        ws.onclose = function (msg) {
            console.info(msg);
            textResult.innerText = '';
        };

    }

    /**
     * 文本框根据输入内容自适应高度
     * @param                {HTMLElement}        输入框元素
     * @param                {Number}                设置光标与输入框保持的距离(默认0)
     * @param                {Number}                设置最大高度(可选)
     */
    var autoTextarea = function (elem, extra, maxHeight) {
        //判断elem是否为数组
        if (elem.length > 0) {
            for (var i = 0; i < elem.length; i++) {
                e(elem[i]);
            }
        } else {
            e(elem);
        }

        function e(elem) {
            extra = extra || 0;
            var isFirefox = !!document.getBoxObjectFor || 'mozInnerScreenX' in window,
                isOpera = !!window.opera && !!window.opera.toString().indexOf('Opera'),
                addEvent = function (type, callback) {
                    elem.addEventListener ?
                        elem.addEventListener(type, callback, false) :
                        elem.attachEvent('on' + type, callback);
                },
                getStyle = elem.currentStyle ? function (name) {
                    var val = elem.currentStyle[name];

                    if (name === 'height' && val.search(/px/i) !== 1) {
                        var rect = elem.getBoundingClientRect();
                        return rect.bottom - rect.top -
                            parseFloat(getStyle('paddingTop')) -
                            parseFloat(getStyle('paddingBottom')) + 'px';
                    }
                    ;

                    return val;
                } : function (name) {
                    return getComputedStyle(elem, null)[name];
                },
                minHeight = parseFloat(getStyle('height'));

            elem.style.resize = 'none';

            var change = function () {
                var scrollTop, height,
                    padding = 0,
                    style = elem.style;

                if (elem._length === elem.value.length) return;
                elem._length = elem.value.length;

                if (!isFirefox && !isOpera) {
                    padding = parseInt(getStyle('paddingTop')) + parseInt(getStyle('paddingBottom'));
                }
                ;
                scrollTop = document.body.scrollTop || document.documentElement.scrollTop;

                elem.style.height = minHeight + 'px';
                if (elem.scrollHeight > minHeight) {
                    if (maxHeight && elem.scrollHeight > maxHeight) {
                        height = maxHeight - padding;
                        style.overflowY = 'auto';
                    } else {
                        height = elem.scrollHeight - padding;
                        style.overflowY = 'hidden';
                    }
                    ;
                    style.height = height + extra + 'px';
                    scrollTop += parseInt(style.height) - elem.currHeight;
                    document.body.scrollTop = scrollTop;
                    document.documentElement.scrollTop = scrollTop;
                    elem.currHeight = parseInt(style.height);
                }
                ;
            };

            addEvent('propertychange', change);
            addEvent('input', change);
            addEvent('focus', change);
            change();
        }
    };

</script>
</html>

完整项目地址

https://gitee.com/yzdyzdyzd/speechToText

上一篇:前端三种埋点


下一篇:并查集实验