speech_handler.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
"""
Baidu realtime speech recognition handling module.
"""
  4. from utils.logger import logger
  5. import json
  6. import struct
  7. import os
  8. import threading
  9. import queue
  10. import base64
  11. import requests
  12. import random
  13. import sounddevice as sd
  14. import platform
  15. system = platform.system().lower()
  16. if system == "linux":
  17. sd.default.device = 'pulse'
  18. elif system == "windows":
  19. sd.default.device = None
  20. elif system == "darwin":
  21. sd.default.device = None
  22. from socket import socket, AF_INET, SOCK_STREAM
  23. from websockets.sync.client import connect
  24. from websockets import ConnectionClosedOK
  25. from utils.load_config import load_config
  26. class BaiduSpeechHandler:
  27. """百度实时语音识别处理类"""
  28. def __init__(self):
  29. # 根据文字长度分类的语气助词列表
  30. self.short_thinking_phrases = [
  31. "嗯",
  32. "这个……",
  33. "嗯……",
  34. ]
  35. self.medium_thinking_phrases = [
  36. "稍等",
  37. "我想想",
  38. "等一下",
  39. ]
  40. self.long_thinking_phrases = [
  41. "嗯,我想一想",
  42. "我琢磨一下",
  43. "我思考一下",
  44. ]
  45. # 加载配置
  46. self.config = load_config()
  47. self.baidu_config = self.config.get(
  48. 'speech_recognition', {}).get('baidu_realtime', {})
  49. # 百度实时语音识别相关
  50. self.audio_socket = None
  51. self.baidu_ws = None
  52. self.play_buffer = queue.Queue()
  53. self.chatting = threading.Event()
  54. self.recording = threading.Event()
  55. self.is_running = False
  56. self.access_token = None
  57. # 音频播放相关
  58. self.output_device = None
  59. self.audio_stream = None
  60. # 初始化百度实时语音识别
  61. self._init_baidu_realtime()
  62. def _init_baidu_realtime(self):
  63. """初始化百度实时语音识别"""
  64. try:
  65. # 获取百度配置
  66. client_id = self.baidu_config.get('client_id')
  67. client_secret = self.baidu_config.get('client_secret')
  68. if not client_id or not client_secret:
  69. logger.error("百度配置中缺少client_id或client_secret")
  70. return
  71. # 获取access token
  72. token_url = f"https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={client_id}&client_secret={client_secret}"
  73. response = requests.get(token_url)
  74. if response.status_code == 200:
  75. self.access_token = response.json().get('access_token')
  76. logger.info("百度实时语音识别初始化成功")
  77. else:
  78. logger.error("获取百度access token失败")
  79. return
  80. except Exception as e:
  81. logger.error(f"初始化百度实时语音识别失败: {e}")
  82. def _init_audio_socket(self):
  83. """初始化音频Socket连接"""
  84. try:
  85. self.audio_socket = socket(AF_INET, SOCK_STREAM)
  86. server_ip = self.config.get(
  87. 'server', {}).get('ip', '192.168.123.21')
  88. self.audio_socket.connect((server_ip, 9080))
  89. logger.info("音频Socket连接建立成功")
  90. return True
  91. except Exception as e:
  92. from utils.tts_client import play_text_async
  93. play_text_async('讯飞套件连接失败,请检查讯飞套件是否正常', use_cache=True)
  94. logger.error(f"音频Socket连接失败: {e}")
  95. return False
  96. def _recv_all(self, num_byte):
  97. """接收指定字节数的数据"""
  98. data = b''
  99. while len(data) < num_byte:
  100. packet = self.audio_socket.recv(num_byte - len(data))
  101. if not packet:
  102. return None
  103. data += packet
  104. return data
  105. def _process_audio_data(self):
  106. """处理音频数据"""
  107. try:
  108. recv_data = self._recv_all(9)
  109. if not recv_data:
  110. return None
  111. sync_head, user_id, msg_type, msg_length, msg_id = struct.unpack(
  112. '<BBBIH', recv_data)
  113. if sync_head == 0xa5 and user_id == 0x01:
  114. recv_data = self._recv_all(msg_length + 1)
  115. if recv_data and recv_data[1] == 0: # 只处理第0路音频
  116. audio_data = recv_data[8:-1]
  117. return audio_data
  118. return None
  119. except Exception as e:
  120. logger.error(f"处理音频数据异常: {e}")
  121. return None
  122. def _process_audio_to_baidu(self):
  123. """将音频数据发送给百度"""
  124. try:
  125. while self.is_running:
  126. audio_data = self._process_audio_data()
  127. if audio_data:
  128. audio_base64 = base64.b64encode(audio_data).decode()
  129. self.baidu_ws.send(json.dumps({
  130. "type": "input_audio_buffer.append",
  131. "audio": audio_base64
  132. }))
  133. except Exception as e:
  134. logger.error(f"发送音频到百度异常: {e}")
  135. def _play_audio(self):
  136. """音频播放线程"""
  137. try:
  138. # 获取输出设备 - 修复设备选择逻辑
  139. try:
  140. devices = sd.query_devices()
  141. output_device = None
  142. # 查找默认输出设备
  143. for i, device in enumerate(devices):
  144. if device['max_outputs'] > 0: # 有输出能力的设备
  145. output_device = i
  146. break
  147. if output_device is None:
  148. logger.warning("未找到输出设备,使用默认设备")
  149. output_device = None
  150. else:
  151. logger.info(f"使用输出设备: {devices[output_device]['name']}")
  152. except Exception as e:
  153. logger.warning(f"设备查询失败,使用默认设备: {e}")
  154. output_device = None
  155. # 获取音频配置
  156. output_audio_config = self.baidu_config.get('output_audio', {})
  157. sample_rate = output_audio_config.get('sample_rate', 24000)
  158. channels = output_audio_config.get('channels', 1)
  159. logger.info(
  160. f"启动音频播放线程,采样率: {sample_rate}, 声道: {channels}, 设备: {output_device}")
  161. with sd.RawOutputStream(
  162. samplerate=sample_rate,
  163. channels=channels,
  164. dtype='int16',
  165. blocksize=1024,
  166. device=output_device
  167. ) as stream:
  168. while self.is_running:
  169. try:
  170. chunk = self.play_buffer.get(timeout=1)
  171. if chunk == 'EOF':
  172. break
  173. stream.write(chunk)
  174. except queue.Empty:
  175. continue
  176. except Exception as e:
  177. logger.error(f"音频播放异常: {e}")
  178. break
  179. except Exception as e:
  180. logger.error(f"音频播放线程异常: {e}")
  181. def _start_baidu_realtime(self):
  182. """启动百度实时语音识别"""
  183. try:
  184. if not self.access_token:
  185. logger.error("百度access token未初始化")
  186. return False
  187. # 获取百度配置
  188. streaming_url = self.baidu_config.get('streaming_url')
  189. model_name = self.baidu_config.get('model_name', 'audio-realtime')
  190. url = f'{streaming_url}?model={model_name}&access_token={self.access_token}'
  191. self.baidu_ws = connect(url)
  192. self.is_running = True
  193. # 启动音频播放线程
  194. play_thread = threading.Thread(target=self._play_audio)
  195. play_thread.daemon = True
  196. play_thread.start()
  197. # 启动接收线程
  198. receive_thread = threading.Thread(target=self._receive_baidu_data)
  199. receive_thread.daemon = True
  200. receive_thread.start()
  201. # 启动音频处理线程
  202. audio_thread = threading.Thread(
  203. target=self._process_audio_to_baidu)
  204. audio_thread.daemon = True
  205. audio_thread.start()
  206. # 更新会话配置 - 确保启用音频输出
  207. session_config = {
  208. "type": "session.update",
  209. "session": {
  210. "input_audio_transcription": {
  211. "model": "default"
  212. },
  213. "output_audio": {
  214. "format": "pcm16",
  215. "sample_rate": 24000
  216. },
  217. "max_output_tokens": "inf",
  218. "voice": "default"
  219. }
  220. }
  221. self.baidu_ws.send(json.dumps(session_config))
  222. logger.info("百度实时语音识别启动成功")
  223. return True
  224. except Exception as e:
  225. logger.error(f"启动百度实时语音识别失败: {e}")
  226. return False
  227. def _handle_baidu_response_done(self, data):
  228. """处理百度response.done消息"""
  229. try:
  230. if 'response' in data:
  231. response = data['response']
  232. status = response.get('status', '')
  233. status_details = response.get('status_details', {})
  234. logger.info(f"百度响应状态: {status}")
  235. if status == 'completed':
  236. # 正常完成的响应
  237. if 'output' in response:
  238. output = response['output']
  239. for item in output:
  240. if item.get('type') == 'message' and 'content' in item:
  241. content = item['content']
  242. for content_item in content:
  243. if content_item.get('type') == 'audio':
  244. # 处理音频内容
  245. transcript = content_item.get(
  246. 'transcript', '')
  247. if transcript:
  248. logger.info(
  249. f"百度识别结果: {transcript}")
  250. # 检查是否有音频数据
  251. if 'audio' in content_item:
  252. audio_data = base64.b64decode(
  253. content_item['audio'])
  254. self.play_buffer.put(
  255. audio_data)
  256. logger.info(
  257. f"从content_item中提取音频数据,大小: {len(audio_data)} 字节")
  258. else:
  259. logger.warning(
  260. "content_item中没有找到音频数据")
  261. # 直接播放百度返回的音频,不使用TTS
  262. self._play_baidu_audio_response(
  263. response)
  264. elif status == 'incomplete':
  265. # 不完整的响应,可能是内容过滤等原因
  266. reason = status_details.get('reason', 'unknown')
  267. logger.warning(f"百度响应不完整,原因: {reason}")
  268. if reason == 'content_filter':
  269. logger.info("内容被过滤,使用TTS播放提示音")
  270. # 播放提示音告知用户内容被过滤
  271. # play_text_async("抱歉,我无法回答这个问题", use_cache=True)
  272. else:
  273. logger.warning(f"未知的不完整原因: {reason}")
  274. else:
  275. logger.warning(f"未知的响应状态: {status}")
  276. except Exception as e:
  277. logger.error(f"处理百度response.done异常: {e}")
  278. def _play_baidu_audio_response(self, response):
  279. """播放百度返回的音频响应"""
  280. try:
  281. # 检查响应中是否包含音频数据
  282. if 'output' in response:
  283. output = response['output']
  284. for item in output:
  285. if item.get('type') == 'message' and 'content' in item:
  286. content = item['content']
  287. for content_item in content:
  288. if content_item.get('type') == 'audio':
  289. # 检查是否有音频数据
  290. if 'audio' in content_item:
  291. audio_data = base64.b64decode(
  292. content_item['audio'])
  293. self.play_buffer.put(audio_data)
  294. logger.info(
  295. f"从响应中提取音频数据,大小: {len(audio_data)} 字节")
  296. else:
  297. logger.warning("响应中没有找到音频数据")
  298. logger.info("等待百度音频数据播放...")
  299. except Exception as e:
  300. logger.error(f"播放百度音频响应异常: {e}")
  301. def _receive_baidu_data(self):
  302. """接收百度返回的数据"""
  303. try:
  304. while self.is_running:
  305. data = self.baidu_ws.recv()
  306. if isinstance(data, str):
  307. data = json.loads(data)
  308. # 记录所有消息类型,帮助调试
  309. msg_type = data.get('type', 'unknown')
  310. logger.info(f"收到百度消息类型: {msg_type}")
  311. if data['type'] == 'response.audio.delta':
  312. # 处理音频输出 - 这是百度返回的实际音频数据
  313. audio = base64.b64decode(data['delta'])
  314. self.play_buffer.put(audio)
  315. logger.info(f"收到百度音频数据,大小: {len(audio)} 字节")
  316. data['delta'] = '...'
  317. elif data['type'] == 'response.created':
  318. # 清空播放缓冲区
  319. while True:
  320. try:
  321. self.play_buffer.get(block=False)
  322. except queue.Empty:
  323. break
  324. logger.info("清空播放缓冲区,准备接收新的音频")
  325. elif data['type'] == 'input_audio_buffer.speech_started':
  326. self.chatting.set()
  327. logger.info("语音开始")
  328. elif data['type'] == 'response.done':
  329. self.chatting.clear()
  330. logger.info("响应完成")
  331. # 处理完整的响应,包括音频数据
  332. self._handle_baidu_response_done(data)
  333. elif data['type'] == 'input_audio_buffer.speech_ended':
  334. # 处理语音识别结果
  335. self._handle_baidu_recognition_result(data)
  336. logger.info("语音结束")
  337. elif data['type'] == 'response.audio':
  338. # 处理完整的音频响应
  339. logger.info("收到完整的音频响应")
  340. if 'audio' in data:
  341. audio = base64.b64decode(data['audio'])
  342. self.play_buffer.put(audio)
  343. logger.info(f"收到完整音频数据,大小: {len(audio)} 字节")
  344. elif data['type'] == 'response.text':
  345. # 处理文本响应
  346. logger.info("收到文本响应")
  347. if 'text' in data:
  348. logger.info(f"文本内容: {data['text']}")
  349. elif data['type'] == 'session.created':
  350. # 处理会话创建
  351. logger.info("会话创建成功")
  352. if 'session' in data:
  353. session = data['session']
  354. logger.info(f"会话ID: {session.get('id')}")
  355. elif data['type'] == 'conversation.created':
  356. # 处理对话创建
  357. logger.info("对话创建成功")
  358. if 'conversation' in data:
  359. conversation = data['conversation']
  360. logger.info(f"对话ID: {conversation.get('id')}")
  361. elif data['type'] == 'error':
  362. # 处理错误消息
  363. logger.error("收到错误消息")
  364. if 'error' in data:
  365. error = data['error']
  366. error_type = error.get('type', 'unknown')
  367. error_code = error.get('code', 'unknown')
  368. error_message = error.get('message', 'unknown')
  369. logger.error(
  370. f"错误类型: {error_type}, 代码: {error_code}, 消息: {error_message}")
  371. else:
  372. # 记录其他类型的消息
  373. logger.info(f"收到其他类型消息: {msg_type}")
  374. logger.info(json.dumps(data, ensure_ascii=False))
  375. except ConnectionClosedOK:
  376. logger.info("百度WebSocket连接已关闭")
  377. except Exception as e:
  378. logger.error(f"接收百度数据异常: {e}")
  379. def _handle_baidu_recognition_result(self, data):
  380. """处理百度语音识别结果"""
  381. try:
  382. if 'result' in data:
  383. result_text = data['result'].get('text', '')
  384. if result_text:
  385. logger.info(f"百度识别结果: {result_text}")
  386. # 不播放思考语气词,因为百度会直接返回音频
  387. except Exception as e:
  388. logger.error(f"处理百度识别结果异常: {e}")
  389. def play_thinking_phrase(self, text_length: int = 0, type="thinking"):
  390. """根据文字长度播放合适的语气助词(使用缓存)"""
  391. # 百度实时语音识别不使用TTS播放思考语气词
  392. # 因为百度会直接返回音频响应
  393. logger.info("百度实时语音识别模式,跳过TTS思考语气词播放")
  394. return
  395. def start_recognition(self):
  396. """启动百度实时语音识别服务"""
  397. if not self._init_audio_socket():
  398. return False
  399. return self._start_baidu_realtime()
  400. def stop_recognition(self):
  401. """停止百度实时语音识别服务"""
  402. self.is_running = False
  403. # 发送结束信号给音频播放线程
  404. self.play_buffer.put('EOF')
  405. if self.audio_socket:
  406. self.audio_socket.close()
  407. self.audio_socket = None
  408. if self.baidu_ws:
  409. self.baidu_ws.close()
  410. self.baidu_ws = None
  411. logger.info("百度实时语音识别服务已停止")