This article does not produce any technology of its own; it only ships existing technology around!!
Preface
A recent customer requirement called for converting speech input into text that is then fed to a large language model. I tested both whisper and sherpa-onnx: whisper turned out to be too slow, while sherpa-onnx offered a good balance of speed and accuracy, so I deployed and tested with the sherpa-onnx model.
Implementation
import wave
import numpy as np
import sherpa_onnx
from flask import Flask, request, jsonify
import io

app = Flask(__name__)

# Paths to the files of the pretrained streaming zipformer transducer model.
encoder_model_path = '/home/project_python/whisper/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/encoder-epoch-20-avg-1-chunk-16-left-128.onnx'
decoder_model_path = '/home/project_python/whisper/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/decoder-epoch-20-avg-1-chunk-16-left-128.onnx'
joiner_model_path = '/home/project_python/whisper/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/joiner-epoch-20-avg-1-chunk-16-left-128.onnx'
tokens_path = '/home/project_python/whisper/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12/tokens.txt'

# Create a streaming (online) recognizer from the transducer model, running on CPU.
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
    encoder=encoder_model_path,
    decoder=decoder_model_path,
    joiner=joiner_model_path,
    tokens=tokens_path,
    sample_rate=16000,
    feature_dim=80,
    provider="cpu",
)
def read_wave(wave_data):
    """
    Args:
      wave_data:
        Bytes of a wave file. It should be single channel and each sample
        should be 16-bit. Its sample rate does not need to be 16 kHz.
    Returns:
      Return a tuple containing:
       - A 1-D array of dtype np.float32 containing the samples, which are
         normalized to the range [-1, 1].
       - The sample rate of the wave file.
    """
    with wave.open(io.BytesIO(wave_data), 'rb') as f:
        assert f.getnchannels() == 1, f.getnchannels()
        assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
        num_samples = f.getnframes()
        samples = f.readframes(num_samples)
        samples_int16 = np.frombuffer(samples, dtype=np.int16)
        samples_float32 = samples_int16.astype(np.float32)
        samples_float32 = samples_float32 / 32768
        return samples_float32, f.getframerate()
@app.route('/transcribe', methods=['POST'])
def transcribe():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Read the uploaded WAV file and convert it to normalized float32 samples.
    wave_data = file.read()
    samples, sample_rate = read_wave(wave_data)

    # Feed the whole utterance into a single streaming decoder stream.
    streams = []
    s = recognizer.create_stream()
    s.accept_waveform(sample_rate, samples)
    # Append about 0.66 s of silence so the trailing frames are flushed
    # through the model before decoding finishes.
    tail_paddings = np.zeros(int(0.66 * sample_rate), dtype=np.float32)
    s.accept_waveform(sample_rate, tail_paddings)
    s.input_finished()
    streams.append(s)

    # Decode until no stream has frames left to process.
    while True:
        ready_list = [s for s in streams if recognizer.is_ready(s)]
        if len(ready_list) == 0:
            break
        recognizer.decode_streams(ready_list)

    results = [recognizer.get_result(s) for s in streams]
    return jsonify({'results': results})
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8079)
Testing with Postman
The test file is a clip of the song "后会无期", sung by me; my pronunciation is not very standard, so the overall recognition quality was only mediocre.
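Besides Postman, the endpoint can also be exercised with a small Python client. The sketch below is only an illustration: it assumes the server above is running locally on port 8079, and test.wav is a placeholder name for any existing 16-bit, single-channel WAV file.

import requests

wav_path = 'test.wav'  # placeholder: any 16-bit mono WAV file

with open(wav_path, 'rb') as f:
    # The form field name must be 'file', matching request.files['file'] on the server.
    resp = requests.post(
        'http://127.0.0.1:8079/transcribe',
        files={'file': (wav_path, f, 'audio/wav')},
    )

print(resp.status_code)
print(resp.json())  # e.g. {'results': ['<recognized text>']}

On success, the JSON body contains a results list with the recognized text for the uploaded file.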