vosk介绍以及安装,参考地址:https://blog.csdn.net/qq_35385687/article/details/119209189?spm=1001.2014.3001.5501
文章目录
命令行方式直接转写
#!/usr/bin/env python3
import argparse
import os
import queue
import sounddevice as sd
import sys
import vosk
q = queue.Queue()
def int_or_str(text):
"""Helper function for argument parsing."""
try:
return int(text)
except ValueError:
return text
def callback(indata, frames, time, status):
"""This is called (from a separate thread) for each audio block."""
if status:
print(status, file=sys.stderr)
q.put(bytes(indata))
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
'-l', '--list-devices', action='store_true',
help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
print(sd.query_devices())
parser.exit(0)
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
parents=[parser])
parser.add_argument(
'-f', '--filename', type=str, metavar='FILENAME',
help='audio file to store recording to')
parser.add_argument(
'-m', '--model', type=str, metavar='MODEL_PATH',
help='Path to the model')
parser.add_argument(
'-d', '--device', type=int_or_str,
help='input device (numeric ID or substring)')
parser.add_argument(
'-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)
try:
if args.model is None:
args.model = "model"
if not os.path.exists(args.model):
print("Please download a model for your language from https://alphacephei.com/vosk/models")
print("and unpack as 'model' in the current folder.")
parser.exit(0)
if args.samplerate is None:
device_info = sd.query_devices(args.device, 'input')
# soundfile expects an int, sounddevice provides a float:
args.samplerate = int(device_info['default_samplerate'])
model = vosk.Model(args.model)
if args.filename:
dump_fn = open(args.filename, "wb")
else:
dump_fn = None
with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000, device=args.device, dtype='int16',
channels=1, callback=callback):
print('#' * 80)
print('Press Ctrl+C to stop the recording')
print('#' * 80)
rec = vosk.KaldiRecognizer(model, args.samplerate)
while True:
data = q.get()
if rec.AcceptWaveform(data):
print(rec.Result())
else:
print(rec.PartialResult())
if dump_fn is not None:
dump_fn.write(data)
except KeyboardInterrupt:
print('\nDone')
parser.exit(0)
except Exception as e:
parser.exit(type(e).__name__ + ': ' + str(e))
websoket实现实时转写
#!/usr/bin/env python3
import argparse
import os
import queue
import sounddevice as sd
import sys
import vosk
q = queue.Queue()
def int_or_str(text):
"""Helper function for argument parsing."""
try:
return int(text)
except ValueError:
return text
def callback(indata, frames, time, status):
"""This is called (from a separate thread) for each audio block."""
if status:
print(status, file=sys.stderr)
q.put(bytes(indata))
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
'-l', '--list-devices', action='store_true',
help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
print(sd.query_devices())
parser.exit(0)
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
parents=[parser])
parser.add_argument(
'-f', '--filename', type=str, metavar='FILENAME',
help='audio file to store recording to')
parser.add_argument(
'-m', '--model', type=str, metavar='MODEL_PATH',
help='Path to the model')
parser.add_argument(
'-d', '--device', type=int_or_str,
help='input device (numeric ID or substring)')
parser.add_argument(
'-r', '--samplerate', type=int, help='sampling rate')
args = parser.parse_args(remaining)
try:
if args.model is None:
args.model = "model"
if not os.path.exists(args.model):
print("Please download a model for your language from https://alphacephei.com/vosk/models")
print("and unpack as 'model' in the current folder.")
parser.exit(0)
if args.samplerate is None:
device_info = sd.query_devices(args.device, 'input')
# soundfile expects an int, sounddevice provides a float:
args.samplerate = int(device_info['default_samplerate'])
model = vosk.Model(args.model)
if args.filename:
dump_fn = open(args.filename, "wb")
else:
dump_fn = None
with sd.RawInputStream(samplerate=args.samplerate, blocksize=16000, device=args.device, dtype='int16',
channels=1, callback=callback):
print('#' * 80)
print('Press Ctrl+C to stop the recording')
print('#' * 80)
rec = vosk.KaldiRecognizer(model, args.samplerate)
while True:
data = q.get()
if rec.AcceptWaveform(data):
print(rec.Result())
else:
print(rec.PartialResult())
if dump_fn is not None:
dump_fn.write(data)
except KeyboardInterrupt:
print('\nDone')
parser.exit(0)
except Exception as e:
parser.exit(type(e).__name__ + ': ' + str(e))
前端获取pcm实时传输至后台
<html>
<head>
<meta charset="UTF-8">
<title>Simple Recorder.js demo with record, stop and pause</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- 控制宽度的自动适应 -->
<style type="text/css">
.comments {
width: 100%; /*自动适应父布局宽度*/
overflow: auto;
word-break: break-all;
/*在ie中解决断行问题(防止自动变为在一行显示,主要解决ie兼容问题,ie8中当设宽度为100%时,文本域类容超过一行时,
当我们双击文本内容就会自动变为一行显示,所以只能用ie的专有断行属性“word-break或word-wrap”控制其断行)*/
}
</style>
</head>
<body>
<div id="controls">
<button id="recordButton">Record</button>
<button id="stopButton">Stop</button>
</div>
<textarea id="textResult" class="comments" rows="10" cols="10"></textarea>
</body>
<script type="text/javascript" src="./js/recorder3.js"></script>
<script>
var ws = null; //实现WebSocket
var interval; // 定时器
let recorder = new Recorder({
sampleBits: 16, // 采样位数,支持 8 或 16,默认是16
sampleRate: 16000, // 采样率,支持 11025、16000、22050、24000、44100、48000,根据浏览器默认值,
numChannels: 1, // 声道,支持 1 或 2, 默认是1
// compiling: false,(0.x版本中生效,1.x增加中) // 是否边录边转换,默认是false
compiling: true
});
var recordButton = document.getElementById("recordButton");
var stopButton = document.getElementById("stopButton");
var textResult = document.getElementById("textResult");
recordButton.addEventListener("click", startRecording);
stopButton.addEventListener("click", stopRecording);
// 录音
function startRecording() {
console.log("recordButton clicked");
recorder.start().then(() => {
// 开始录音
useWebSocket();
}, (error) => {
// 出错了
console.log(`出错了`);
});
}
// 停止录音
function stopRecording() {
console.log("stopButton clicked", recorder.getPCMBlob());
recorder.stop();
if (ws) {
ws.close();
}
clearInterval(interval);
textResult.innerText = '';
// recorder.getPCMBlob();
// recorder.downloadPCM('aaa');
}
/*
* WebSocket
*/
function useWebSocket() {
// console.log(recorder.getNextData())
ws = new WebSocket("ws://localhost:5678");
ws.binaryType = 'arraybuffer'; //传输的是 ArrayBuffer 类型的数据
ws.onopen = function () {
console.log('握手成功');
if (ws.readyState === 1) { //ws进入连接状态,则每隔500毫秒发送一包数据
interval = setInterval(() => {
// recorder.getNextData();
// recorder.getWholeData();
// console.log(recorder.getNextData());
ws.send(recorder.getNextData());
}, 500)
}
};
ws.onmessage = function (msg) {
var jsonStr = msg.data;
console.info(jsonStr);
textResult.innerText = jsonStr;
autoTextarea(document.getElementById("textResult"));
};
ws.onerror = function (err) {
console.error(err);
textResult.innerText = '';
};
ws.onclose = function (msg) {
console.info(msg);
textResult.innerText = '';
};
}
/**
* 文本框根据输入内容自适应高度
* @param {HTMLElement} 输入框元素
* @param {Number} 设置光标与输入框保持的距离(默认0)
* @param {Number} 设置最大高度(可选)
*/
var autoTextarea = function (elem, extra, maxHeight) {
//判断elem是否为数组
if (elem.length > 0) {
for (var i = 0; i < elem.length; i++) {
e(elem[i]);
}
} else {
e(elem);
}
function e(elem) {
extra = extra || 0;
var isFirefox = !!document.getBoxObjectFor || 'mozInnerScreenX' in window,
isOpera = !!window.opera && !!window.opera.toString().indexOf('Opera'),
addEvent = function (type, callback) {
elem.addEventListener ?
elem.addEventListener(type, callback, false) :
elem.attachEvent('on' + type, callback);
},
getStyle = elem.currentStyle ? function (name) {
var val = elem.currentStyle[name];
if (name === 'height' && val.search(/px/i) !== 1) {
var rect = elem.getBoundingClientRect();
return rect.bottom - rect.top -
parseFloat(getStyle('paddingTop')) -
parseFloat(getStyle('paddingBottom')) + 'px';
}
;
return val;
} : function (name) {
return getComputedStyle(elem, null)[name];
},
minHeight = parseFloat(getStyle('height'));
elem.style.resize = 'none';
var change = function () {
var scrollTop, height,
padding = 0,
style = elem.style;
if (elem._length === elem.value.length) return;
elem._length = elem.value.length;
if (!isFirefox && !isOpera) {
padding = parseInt(getStyle('paddingTop')) + parseInt(getStyle('paddingBottom'));
}
;
scrollTop = document.body.scrollTop || document.documentElement.scrollTop;
elem.style.height = minHeight + 'px';
if (elem.scrollHeight > minHeight) {
if (maxHeight && elem.scrollHeight > maxHeight) {
height = maxHeight - padding;
style.overflowY = 'auto';
} else {
height = elem.scrollHeight - padding;
style.overflowY = 'hidden';
}
;
style.height = height + extra + 'px';
scrollTop += parseInt(style.height) - elem.currHeight;
document.body.scrollTop = scrollTop;
document.documentElement.scrollTop = scrollTop;
elem.currHeight = parseInt(style.height);
}
;
};
addEvent('propertychange', change);
addEvent('input', change);
addEvent('focus', change);
change();
}
};
</script>
</html>
完整项目地址
https://gitee.com/yzdyzdyzd/speechToText