echo_cancellation.py 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297
  1. # -*- coding: utf-8 -*-
  2. """
  3. 自我声音检测和回声消除模块
  4. 防止系统在播放TTS时误触发语音识别
  5. """
  6. import threading
  7. import time
  8. import numpy as np
  9. from typing import Optional, Callable, Dict, List, Tuple
  10. from utils.logger import logger
  11. import hashlib
  12. import json
  13. from collections import deque
  14. from dataclasses import dataclass
  15. import struct
  16. from config.config.echo_cancellation_conf import EchoCancellationConf
  17. @dataclass
  18. class AudioFingerprint:
  19. """音频指纹数据结构"""
  20. fingerprint: str
  21. timestamp: float
  22. duration: float
  23. energy: float
  24. spectral_features: List[float]
  25. class EchoCancellationEngine:
  26. """回声消除引擎"""
  27. def __init__(self):
  28. self.is_enabled = EchoCancellationConf.ENABLE_ECHO_CANCELLATION
  29. self.is_playing_tts = False
  30. self._current_playing_status = False # 添加这个属性初始化
  31. self.tts_audio_fingerprints: deque = deque(
  32. maxlen=EchoCancellationConf.MAX_TTS_FINGERPRINTS)
  33. self.recording_buffer: deque = deque(
  34. maxlen=EchoCancellationConf.MAX_RECORDING_BUFFER)
  35. # 配置参数
  36. self.sample_rate = EchoCancellationConf.SAMPLE_RATE
  37. self.frame_size = EchoCancellationConf.FRAME_SIZE
  38. self.overlap_ratio = EchoCancellationConf.OVERLAP_RATIO
  39. self.energy_threshold = EchoCancellationConf.ENERGY_THRESHOLD
  40. self.correlation_threshold = EchoCancellationConf.CORRELATION_THRESHOLD
  41. self.time_window = EchoCancellationConf.TIME_WINDOW
  42. # 状态管理
  43. self._lock = threading.RLock()
  44. self.last_tts_end_time = 0
  45. self.tts_fade_out_duration = EchoCancellationConf.TTS_FADE_OUT_DURATION
  46. # 频谱分析参数
  47. self.fft_size = EchoCancellationConf.FFT_SIZE
  48. self.mel_filters = EchoCancellationConf.MEL_FILTERS
  49. # 用户语音检测配置
  50. self.voice_detection_config = EchoCancellationConf.get_voice_detection_config()
  51. # TTS过滤配置
  52. self.tts_filtering_config = EchoCancellationConf.TTS_FILTERING.copy()
  53. # 统计信息
  54. self.stats = {
  55. 'total_processed': 0,
  56. 'echo_detected': 0,
  57. 'false_positives': 0,
  58. 'processing_time_avg': 0.0,
  59. 'user_interrupts_detected': 0
  60. }
  61. logger.info("🔇 回声消除引擎已初始化")
  62. # 验证配置
  63. config_errors = EchoCancellationConf.validate_config()
  64. if config_errors:
  65. logger.warning(f"⚠️ 配置验证发现问题: {config_errors}")
  66. else:
  67. logger.info("✅ 回声消除配置验证通过")
  68. def set_tts_playing_status(self, is_playing: bool, audio_data: Optional[bytes] = None):
  69. """设置TTS播放状态并记录音频指纹"""
  70. with self._lock:
  71. # 避免重复设置相同状态
  72. if hasattr(self, '_current_playing_status') and self._current_playing_status == is_playing:
  73. return
  74. self._current_playing_status = is_playing
  75. self.is_playing_tts = is_playing
  76. current_time = time.time()
  77. if is_playing:
  78. # 记录TTS开始时间
  79. self._tts_start_time = current_time
  80. if audio_data:
  81. # 生成TTS音频指纹
  82. fingerprint = self._generate_audio_fingerprint(
  83. audio_data, current_time)
  84. if fingerprint:
  85. self.tts_audio_fingerprints.append(fingerprint)
  86. logger.debug(
  87. f"🎵 记录TTS音频指纹: {fingerprint.fingerprint[:16]}...")
  88. else:
  89. self.last_tts_end_time = current_time
  90. # logger.debug("🔇 TTS播放结束,开始淡出期")
  91. def is_echo_audio(self, audio_data: bytes) -> bool:
  92. """检测音频是否为回声(自我声音)"""
  93. if not self.is_enabled or not audio_data:
  94. return False
  95. start_time = time.time()
  96. try:
  97. with self._lock:
  98. current_time = time.time()
  99. # 如果正在播放TTS,采用更严格的过滤策略
  100. if self.is_playing_tts:
  101. # 检查TTS播放是否刚刚开始,给音频指纹建立一些时间
  102. tts_start_time = getattr(self, '_tts_start_time', 0)
  103. if current_time - tts_start_time < 1.2: # TTS开始1.2秒内,延长过滤时间
  104. # 直接认为是回声,给系统时间建立指纹,防止误触发打断
  105. self.stats['echo_detected'] += 1
  106. # logger.debug("🚨 TTS刚开始播放,预防性过滤音频")
  107. return True
  108. # 生成当前音频的指纹进行分析
  109. current_fingerprint = self._generate_audio_fingerprint(
  110. audio_data, current_time)
  111. if not current_fingerprint:
  112. # 无法生成指纹时默认认为是回声
  113. self.stats['echo_detected'] += 1
  114. # logger.debug("🚨 无法生成音频指纹,默认过滤")
  115. return True
  116. # TTS播放期间,采用更严格的过滤策略
  117. # 只有非常明显的用户语音才允许通过
  118. if self._is_very_strong_user_voice(current_fingerprint):
  119. # 进一步检查与TTS的差异,使用更严格的标准
  120. if self._has_extreme_difference_from_tts(current_fingerprint):
  121. # logger.debug("🎤 检测到非常强烈的用户语音特征且与TTS有极大差异,允许通过")
  122. return False
  123. else:
  124. # 即使能量很强,但与TTS差异不够大,仍然过滤
  125. self.stats['echo_detected'] += 1
  126. logger.debug("🚨 强用户语音但与TTS差异不够,仍然过滤")
  127. return True
  128. # 其他情况全部过滤,防止回声
  129. self.stats['echo_detected'] += 1
  130. # logger.debug("🚨 TTS播放期间严格过滤音频,防止回声")
  131. return True
  132. # 检查是否在TTS结束后的淡出期内
  133. if current_time - self.last_tts_end_time < self.tts_fade_out_duration:
  134. # 在TTS淡出期内,采用更宽松的过滤策略
  135. # 生成当前音频的指纹进行分析
  136. current_fingerprint = self._generate_audio_fingerprint(
  137. audio_data, current_time)
  138. if not current_fingerprint:
  139. # 无法生成指纹时默认过滤
  140. self.stats['echo_detected'] += 1
  141. # logger.debug("🚨 TTS淡出期内无法生成音频指纹,默认过滤")
  142. return True
  143. # 在淡出期内,允许明显的用户语音通过
  144. if self._has_obvious_voice_characteristics(current_fingerprint):
  145. # 检查与TTS的差异,但使用更宽松的标准
  146. if self._has_basic_difference_from_tts(current_fingerprint):
  147. # logger.debug("🎤 TTS淡出期内检测到明显用户语音,允许通过")
  148. return False
  149. # 其他情况仍然过滤,但记录更详细的信息
  150. self.stats['echo_detected'] += 1
  151. logger.debug(
  152. f"🚨 TTS淡出期内过滤音频(距离TTS结束 {current_time - self.last_tts_end_time:.1f}s)")
  153. return True
  154. # 生成当前音频的指纹
  155. current_fingerprint = self._generate_audio_fingerprint(
  156. audio_data, current_time)
  157. if not current_fingerprint:
  158. return False
  159. # 与最近的TTS指纹进行比较
  160. is_echo = self._compare_with_tts_fingerprints(
  161. current_fingerprint)
  162. if is_echo:
  163. self.stats['echo_detected'] += 1
  164. # 更新统计信息
  165. self.stats['total_processed'] += 1
  166. processing_time = time.time() - start_time
  167. self.stats['processing_time_avg'] = (
  168. self.stats['processing_time_avg'] * (self.stats['total_processed'] - 1) +
  169. processing_time
  170. ) / self.stats['total_processed']
  171. return is_echo
  172. except Exception as e:
  173. logger.error(f"❌ 回声检测失败: {e}")
  174. # 出错时默认不认为是回声,避免影响正常功能
  175. return False
  176. def _generate_audio_fingerprint(self, audio_data: bytes, timestamp: float) -> Optional[AudioFingerprint]:
  177. """生成音频指纹"""
  178. try:
  179. # 转换为numpy数组
  180. samples = np.frombuffer(
  181. audio_data, dtype=np.int16).astype(np.float32)
  182. if len(samples) == 0:
  183. return None
  184. # 计算能量
  185. energy = float(np.mean(samples ** 2))
  186. if energy < self.energy_threshold:
  187. return None
  188. # 计算频谱特征
  189. spectral_features = self._extract_spectral_features(samples)
  190. # 生成更详细的指纹哈希 - 包含更多特征
  191. feature_str = f"{energy:.2f}_{len(samples)}_{np.mean(spectral_features):.4f}_{np.std(spectral_features):.4f}"
  192. fingerprint = hashlib.md5(feature_str.encode()).hexdigest()
  193. duration = len(samples) / self.sample_rate
  194. # 记录更详细的调试信息
  195. if EchoCancellationConf.should_log_audio_fingerprints():
  196. logger.debug(
  197. f"🎵 生成音频指纹: 能量={energy:.1f}, 时长={duration:.3f}s, 特征均值={np.mean(spectral_features):.3f}")
  198. return AudioFingerprint(
  199. fingerprint=fingerprint,
  200. timestamp=timestamp,
  201. duration=duration,
  202. energy=energy,
  203. spectral_features=spectral_features
  204. )
  205. except Exception as e:
  206. logger.error(f"❌ 生成音频指纹失败: {e}")
  207. return None
  208. def _extract_spectral_features(self, samples: np.ndarray) -> List[float]:
  209. """提取频谱特征"""
  210. try:
  211. # 确保样本长度足够进行FFT
  212. if len(samples) < self.fft_size:
  213. # 零填充
  214. padded_samples = np.zeros(self.fft_size)
  215. padded_samples[:len(samples)] = samples
  216. samples = padded_samples
  217. # 应用窗函数
  218. windowed = samples[:self.fft_size] * np.hanning(self.fft_size)
  219. # FFT变换
  220. fft_result = np.fft.fft(windowed)
  221. magnitude_spectrum = np.abs(fft_result[:self.fft_size//2])
  222. # 计算mel频率特征
  223. mel_features = self._compute_mel_features(magnitude_spectrum)
  224. return mel_features.tolist()
  225. except Exception as e:
  226. logger.error(f"❌ 提取频谱特征失败: {e}")
  227. return [0.0] * self.mel_filters
  228. def _compute_mel_features(self, magnitude_spectrum: np.ndarray) -> np.ndarray:
  229. """计算Mel频率特征"""
  230. try:
  231. # 简化的Mel滤波器组
  232. mel_filters = np.linspace(
  233. 0, len(magnitude_spectrum), self.mel_filters + 2)
  234. mel_features = np.zeros(self.mel_filters)
  235. for i in range(self.mel_filters):
  236. start_idx = int(mel_filters[i])
  237. end_idx = int(mel_filters[i + 2])
  238. if end_idx > start_idx:
  239. mel_features[i] = np.mean(
  240. magnitude_spectrum[start_idx:end_idx])
  241. # 对数变换
  242. mel_features = np.log(mel_features + 1e-10)
  243. return mel_features
  244. except Exception as e:
  245. logger.error(f"❌ 计算Mel特征失败: {e}")
  246. return np.zeros(self.mel_filters)
  247. def _compare_with_tts_fingerprints(self, current_fingerprint: AudioFingerprint) -> bool:
  248. """与TTS指纹进行比较"""
  249. try:
  250. current_time = current_fingerprint.timestamp
  251. for tts_fingerprint in self.tts_audio_fingerprints:
  252. # 检查时间窗口
  253. time_diff = current_time - tts_fingerprint.timestamp
  254. if time_diff > self.time_window:
  255. continue
  256. # 比较指纹哈希
  257. if current_fingerprint.fingerprint == tts_fingerprint.fingerprint:
  258. return True
  259. # 比较能量和频谱特征
  260. if self._is_similar_audio(current_fingerprint, tts_fingerprint):
  261. return True
  262. return False
  263. except Exception as e:
  264. logger.error(f"❌ 指纹比较失败: {e}")
  265. return False
  266. def _is_similar_audio(self, fp1: AudioFingerprint, fp2: AudioFingerprint) -> bool:
  267. """判断两个音频指纹是否相似"""
  268. try:
  269. # 能量相似性检查
  270. energy_ratio = min(fp1.energy, fp2.energy) / \
  271. max(fp1.energy, fp2.energy)
  272. if energy_ratio < 0.5:
  273. return False
  274. # 频谱特征相似性检查
  275. if len(fp1.spectral_features) != len(fp2.spectral_features):
  276. return False
  277. # 计算余弦相似度
  278. features1 = np.array(fp1.spectral_features)
  279. features2 = np.array(fp2.spectral_features)
  280. norm1 = np.linalg.norm(features1)
  281. norm2 = np.linalg.norm(features2)
  282. if norm1 == 0 or norm2 == 0:
  283. return False
  284. cosine_similarity = np.dot(features1, features2) / (norm1 * norm2)
  285. return cosine_similarity > self.correlation_threshold
  286. except Exception as e:
  287. logger.error(f"❌ 音频相似性计算失败: {e}")
  288. return False
  289. def _is_likely_user_voice(self, fingerprint: AudioFingerprint) -> bool:
  290. """判断是否可能是用户语音(用于打断检测)"""
  291. try:
  292. # 1. 能量阈值检查 - 用户语音通常有足够的能量
  293. user_voice_threshold = self.energy_threshold * \
  294. self.voice_detection_config['energy_multiplier']
  295. if fingerprint.energy < user_voice_threshold:
  296. return False
  297. # 2. 频谱特征分析 - 人声有特定的频谱特征
  298. features = np.array(fingerprint.spectral_features)
  299. # 检查频谱分布是否符合人声特征
  300. # 人声通常在中低频有较强的能量
  301. if len(features) >= 8:
  302. low_freq_energy = np.mean(features[:4]) # 低频部分
  303. mid_freq_energy = np.mean(features[4:8]) # 中频部分
  304. high_freq_energy = np.mean(features[8:]) if len(
  305. features) > 8 else 0 # 高频部分
  306. # 人声特征:中低频能量较强,高频相对较弱
  307. low_freq_weight = self.voice_detection_config['low_freq_weight']
  308. if mid_freq_energy > low_freq_energy * low_freq_weight and mid_freq_energy > high_freq_energy:
  309. # 3. 与最近TTS音频的差异检查
  310. if self._has_significant_difference_from_tts(fingerprint):
  311. self.stats['user_interrupts_detected'] += 1
  312. return True
  313. return False
  314. except Exception as e:
  315. logger.error(f"❌ 用户语音判断失败: {e}")
  316. return False
  317. def _has_significant_difference_from_tts(self, fingerprint: AudioFingerprint) -> bool:
  318. """检查与TTS音频是否有显著差异"""
  319. try:
  320. if not self.tts_audio_fingerprints:
  321. return True # 没有TTS参考,认为是用户语音
  322. current_time = fingerprint.timestamp
  323. tts_reference_window = EchoCancellationConf.TTS_REFERENCE_WINDOW
  324. # 找到最近的TTS指纹进行比较
  325. recent_tts_fingerprints = [
  326. fp for fp in self.tts_audio_fingerprints
  327. if current_time - fp.timestamp < tts_reference_window
  328. ]
  329. if not recent_tts_fingerprints:
  330. return True # 没有最近的TTS参考
  331. # 与最近的TTS指纹比较
  332. check_count = self.voice_detection_config['recent_tts_check_count']
  333. energy_diff_threshold = self.voice_detection_config['energy_diff_threshold']
  334. spectral_diff_threshold = self.voice_detection_config['spectral_diff_threshold']
  335. for tts_fp in recent_tts_fingerprints[-check_count:]:
  336. # 能量差异检查
  337. energy_diff = abs(fingerprint.energy - tts_fp.energy) / \
  338. max(fingerprint.energy, tts_fp.energy)
  339. if energy_diff > energy_diff_threshold:
  340. continue
  341. # 频谱特征差异检查
  342. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  343. features1 = np.array(fingerprint.spectral_features)
  344. features2 = np.array(tts_fp.spectral_features)
  345. # 计算频谱差异
  346. spectral_diff = np.mean(np.abs(features1 - features2))
  347. if spectral_diff < spectral_diff_threshold:
  348. return False # 频谱过于相似,可能是回声
  349. return True # 与TTS有显著差异,可能是用户语音
  350. except Exception as e:
  351. logger.error(f"❌ TTS差异检查失败: {e}")
  352. return True # 出错时倾向于认为是用户语音
  353. def _update_processing_time(self, processing_time: float):
  354. """更新处理时间统计"""
  355. if self.stats['total_processed'] > 0:
  356. alpha = 0.1 # 平滑因子
  357. self.stats['processing_time_avg'] = (
  358. alpha * processing_time +
  359. (1 - alpha) * self.stats['processing_time_avg']
  360. )
  361. else:
  362. self.stats['processing_time_avg'] = processing_time
  363. def cleanup_old_fingerprints(self):
  364. """清理过期的指纹"""
  365. try:
  366. with self._lock:
  367. current_time = time.time()
  368. # 清理过期的TTS指纹
  369. while (self.tts_audio_fingerprints and
  370. current_time - self.tts_audio_fingerprints[0].timestamp > self.time_window):
  371. self.tts_audio_fingerprints.popleft()
  372. except Exception as e:
  373. logger.error(f"❌ 清理指纹失败: {e}")
  374. def get_stats(self) -> Dict:
  375. """获取统计信息"""
  376. with self._lock:
  377. total = self.stats['total_processed']
  378. return {
  379. 'total_processed': total,
  380. 'echo_detected': self.stats['echo_detected'],
  381. 'user_interrupts_detected': self.stats['user_interrupts_detected'],
  382. 'echo_detection_rate': self.stats['echo_detected'] / max(total, 1),
  383. 'interrupt_success_rate': self.stats['user_interrupts_detected'] / max(total, 1),
  384. 'processing_time_avg_ms': self.stats['processing_time_avg'] * 1000,
  385. 'fingerprints_stored': len(self.tts_audio_fingerprints),
  386. 'config': {
  387. 'is_enabled': self.is_enabled,
  388. 'interrupt_during_playback': EchoCancellationConf.ENABLE_INTERRUPT_DURING_PLAYBACK,
  389. 'energy_threshold': self.energy_threshold,
  390. 'user_voice_threshold': self.energy_threshold * self.voice_detection_config['energy_multiplier']
  391. }
  392. }
  393. def enable(self):
  394. """启用回声消除"""
  395. self.is_enabled = True
  396. logger.info("✅ 回声消除已启用")
  397. def disable(self):
  398. """禁用回声消除"""
  399. self.is_enabled = False
  400. logger.info("❌ 回声消除已禁用")
  401. def _is_tts_variant_audio(self, fingerprint: AudioFingerprint) -> bool:
  402. """检测是否为TTS音频的变种(经过扬声器-麦克风传输后的音频)"""
  403. try:
  404. if not self.tts_audio_fingerprints:
  405. return False
  406. current_time = fingerprint.timestamp
  407. detection_window = self.tts_filtering_config['variant_detection_window']
  408. # 检查最近的TTS指纹
  409. recent_tts_fingerprints = [
  410. fp for fp in self.tts_audio_fingerprints
  411. if current_time - fp.timestamp < detection_window
  412. ]
  413. if not recent_tts_fingerprints:
  414. return False
  415. energy_range = self.tts_filtering_config['energy_attenuation_range']
  416. similarity_threshold = self.tts_filtering_config['variant_similarity_threshold']
  417. correlation_threshold = self.tts_filtering_config['frequency_correlation_threshold']
  418. # 检查所有最近的TTS指纹,而不仅仅是最后几个
  419. for tts_fp in recent_tts_fingerprints:
  420. # 1. 时间相关性检查 - 播放开始后短时间内的音频很可能是回声
  421. time_diff = current_time - tts_fp.timestamp
  422. if time_diff < 1.0: # 1秒内
  423. # 2. 能量衰减检查 - 扬声器播放的音频通过麦克风录制会有能量衰减
  424. if tts_fp.energy > 0: # 避免除零
  425. energy_ratio = fingerprint.energy / tts_fp.energy
  426. if energy_range[0] <= energy_ratio <= energy_range[1]:
  427. # 3. 频谱形状相似性检查
  428. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  429. features1 = np.array(
  430. fingerprint.spectral_features)
  431. features2 = np.array(tts_fp.spectral_features)
  432. # 归一化频谱特征以消除能量差异的影响
  433. if np.linalg.norm(features1) > 0 and np.linalg.norm(features2) > 0:
  434. features1_norm = features1 / \
  435. np.linalg.norm(features1)
  436. features2_norm = features2 / \
  437. np.linalg.norm(features2)
  438. # 计算归一化后的相似度
  439. similarity = np.dot(
  440. features1_norm, features2_norm)
  441. # 如果归一化后的相似度高,很可能是TTS音频的变种
  442. if similarity > similarity_threshold:
  443. if EchoCancellationConf.should_log_detection_details():
  444. logger.debug(
  445. f"🚨 TTS变种检测: 时间差={time_diff:.3f}s, 能量比={energy_ratio:.3f}, 相似度={similarity:.3f}")
  446. return True
  447. # 4. 频率分布相关性检查
  448. if self._has_similar_frequency_distribution(fingerprint, tts_fp, correlation_threshold):
  449. if EchoCancellationConf.should_log_detection_details():
  450. logger.debug(f"🚨 TTS变种检测: 频率分布相似,时间差={time_diff:.3f}s")
  451. return True
  452. return False
  453. except Exception as e:
  454. logger.error(f"❌ TTS变种检测失败: {e}")
  455. return False
  456. def _has_similar_frequency_distribution(self, fp1: AudioFingerprint, fp2: AudioFingerprint, threshold: float = 0.5) -> bool:
  457. """检查两个音频指纹是否有相似的频率分布"""
  458. try:
  459. if len(fp1.spectral_features) != len(fp2.spectral_features):
  460. return False
  461. features1 = np.array(fp1.spectral_features)
  462. features2 = np.array(fp2.spectral_features)
  463. # 计算频率分布的相关系数
  464. if len(features1) > 1:
  465. correlation = np.corrcoef(features1, features2)[0, 1]
  466. return not np.isnan(correlation) and correlation > threshold
  467. return False
  468. except Exception as e:
  469. logger.error(f"❌ 频率分布比较失败: {e}")
  470. return False
  471. def _is_definitely_user_voice(self, fingerprint: AudioFingerprint) -> bool:
  472. """严格判断是否为确定的用户语音(用于播放中的打断检测)"""
  473. try:
  474. # 1. 更高的能量阈值 - 用户打断时通常会更大声
  475. energy_multiplier = self.tts_filtering_config['definite_voice_energy_multiplier']
  476. high_energy_threshold = self.energy_threshold * \
  477. self.voice_detection_config['energy_multiplier'] * \
  478. energy_multiplier
  479. if fingerprint.energy < high_energy_threshold:
  480. return False
  481. # 2. 严格的频谱特征分析
  482. features = np.array(fingerprint.spectral_features)
  483. if len(features) >= 8:
  484. low_freq_energy = np.mean(features[:4]) # 低频部分
  485. mid_freq_energy = np.mean(features[4:8]) # 中频部分
  486. high_freq_energy = np.mean(features[8:]) if len(
  487. features) > 8 else 0 # 高频部分
  488. # 人声特征检查 - 更严格的标准
  489. # 中频能量应该明显高于低频和高频
  490. if not (mid_freq_energy > low_freq_energy * 0.8 and
  491. mid_freq_energy > high_freq_energy * 1.2):
  492. return False
  493. # 3. 检查频谱的动态范围 - 人声通常有较大的动态范围
  494. min_spectral_range = self.tts_filtering_config['min_spectral_range']
  495. spectral_range = np.max(features) - np.min(features)
  496. if spectral_range < min_spectral_range:
  497. return False
  498. # 4. 与所有TTS音频的差异检查 - 必须与所有TTS音频都有显著差异
  499. if not self._has_significant_difference_from_all_tts(fingerprint):
  500. return False
  501. # 5. 检查音频的复杂度 - 人声通常比TTS更复杂
  502. if not self._has_sufficient_complexity(fingerprint):
  503. return False
  504. # 所有检查都通过,认为是确定的用户语音
  505. self.stats['user_interrupts_detected'] += 1
  506. return True
  507. except Exception as e:
  508. logger.error(f"❌ 确定用户语音判断失败: {e}")
  509. return False
  510. def _has_significant_difference_from_all_tts(self, fingerprint: AudioFingerprint) -> bool:
  511. """检查与所有TTS音频是否都有显著差异"""
  512. try:
  513. if not self.tts_audio_fingerprints:
  514. return True
  515. current_time = fingerprint.timestamp
  516. max_similarity = self.tts_filtering_config['max_similarity_with_tts']
  517. # 检查所有最近的TTS指纹
  518. recent_tts_fingerprints = [
  519. fp for fp in self.tts_audio_fingerprints
  520. if current_time - fp.timestamp < 3.0 # 3秒内的所有TTS
  521. ]
  522. if not recent_tts_fingerprints:
  523. return True
  524. # 必须与所有TTS音频都有显著差异
  525. for tts_fp in recent_tts_fingerprints:
  526. # 能量差异检查 - 更严格
  527. energy_ratio = min(fingerprint.energy, tts_fp.energy) / \
  528. max(fingerprint.energy, tts_fp.energy)
  529. if energy_ratio > 0.7: # 能量过于相似
  530. return False
  531. # 频谱相似度检查 - 更严格
  532. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  533. features1 = np.array(fingerprint.spectral_features)
  534. features2 = np.array(tts_fp.spectral_features)
  535. if np.linalg.norm(features1) > 0 and np.linalg.norm(features2) > 0:
  536. # 归一化比较
  537. features1_norm = features1 / np.linalg.norm(features1)
  538. features2_norm = features2 / np.linalg.norm(features2)
  539. similarity = np.dot(features1_norm, features2_norm)
  540. if similarity > max_similarity: # 相似度过高
  541. return False
  542. return True
  543. except Exception as e:
  544. logger.error(f"❌ 全TTS差异检查失败: {e}")
  545. return False
  546. def _has_sufficient_complexity(self, fingerprint: AudioFingerprint) -> bool:
  547. """检查音频是否有足够的复杂度(人声特征)"""
  548. try:
  549. features = np.array(fingerprint.spectral_features)
  550. min_variation = self.tts_filtering_config['min_spectral_variation']
  551. # 1. 频谱变化检查 - 人声通常有更多的频谱变化
  552. if len(features) > 1:
  553. spectral_variation = np.std(features)
  554. if spectral_variation < min_variation:
  555. return False
  556. # 2. 频谱分布检查 - 人声应该有特定的频率分布
  557. if len(features) >= 6:
  558. # 检查是否有明显的共振峰特征
  559. # 人声通常在某些频段有能量集中
  560. max_energy_idx = np.argmax(features)
  561. if max_energy_idx < 2 or max_energy_idx > len(features) - 2:
  562. # 能量峰值在边缘,可能不是人声
  563. return False
  564. return True
  565. except Exception as e:
  566. logger.error(f"❌ 复杂度检查失败: {e}")
  567. return False
  568. def _is_likely_user_voice_relaxed(self, fingerprint: AudioFingerprint) -> bool:
  569. """宽松判断是否为用户语音(用于非严格模式的打断检测)"""
  570. try:
  571. # 1. 极严格的能量阈值 - 避免嘈杂环境误判
  572. user_voice_threshold = self.energy_threshold * 5.0 # 大幅提高阈值
  573. if fingerprint.energy < user_voice_threshold:
  574. return False
  575. # 2. 如果音频能量足够,需要极严格的检查
  576. moderate_energy_threshold = self.energy_threshold * 20.0 # 极大幅提高阈值
  577. if fingerprint.energy > moderate_energy_threshold:
  578. # 能量足够时,需要通过更严格的差异检查
  579. if self._has_strict_difference_from_tts(fingerprint):
  580. logger.debug(
  581. f"🎤 高能量({fingerprint.energy:.1f}),通过严格差异检查,认为是用户语音")
  582. return True
  583. else:
  584. logger.debug(
  585. f"🚫 高能量({fingerprint.energy:.1f}),但未通过严格差异检查,可能是噪音")
  586. return False
  587. # 3. 基本的频谱特征检查(更宽松)
  588. features = np.array(fingerprint.spectral_features)
  589. if len(features) >= 4: # 降低要求
  590. # 检查是否有人声的基本特征
  591. if len(features) >= 6:
  592. mid_freq_energy = np.mean(features[2:5]) # 中频部分
  593. total_energy = np.mean(features)
  594. # 中频能量占比检查(更宽松)
  595. if mid_freq_energy > total_energy * 0.3: # 从0.5降低到0.3
  596. # 简单的与TTS差异检查
  597. if self._has_basic_difference_from_tts(fingerprint):
  598. return True
  599. # 4. 如果音频足够大声,需要极严格的检查
  600. high_energy_threshold = self.energy_threshold * 50.0 # 提高到50.0,极严格
  601. if fingerprint.energy > high_energy_threshold:
  602. # 即使能量很高,也要检查与TTS的差异,并且需要更严格的条件
  603. if self._has_strict_difference_from_tts(fingerprint):
  604. logger.debug(
  605. f"🎤 极高能量({fingerprint.energy:.1f})且通过严格差异检查,认为是用户语音")
  606. return True
  607. else:
  608. logger.debug(
  609. f"🚫 极高能量({fingerprint.energy:.1f}),但未通过严格差异检查,可能是强噪音")
  610. return False
  611. # 5. 时间窗口检查 - 距离TTS较远时更容易认为是用户语音
  612. if self.tts_audio_fingerprints:
  613. last_tts_time = max(
  614. fp.timestamp for fp in self.tts_audio_fingerprints)
  615. time_since_last_tts = fingerprint.timestamp - last_tts_time
  616. if time_since_last_tts > 1.5: # 从1.0提高到1.5秒
  617. logger.debug(
  618. f"🎤 距离最后TTS较远({time_since_last_tts:.1f}s),认为是用户语音")
  619. return True
  620. return False
  621. except Exception as e:
  622. logger.error(f"❌ 宽松用户语音判断失败: {e}")
  623. # 出错时倾向于认为是用户语音,允许打断
  624. return True
  625. def _has_basic_difference_from_tts(self, fingerprint: AudioFingerprint) -> bool:
  626. """基本的与TTS差异检查(更宽松)"""
  627. try:
  628. if not self.tts_audio_fingerprints:
  629. return True # 没有TTS参考,认为是用户语音
  630. current_time = fingerprint.timestamp
  631. # 检查是否在TTS播放期间
  632. if self.is_playing_tts:
  633. # TTS播放期间采用更严格的标准
  634. # 只检查最近0.3秒内的TTS,缩短时间窗口
  635. recent_tts_fingerprints = [
  636. fp for fp in self.tts_audio_fingerprints
  637. if current_time - fp.timestamp < 0.3
  638. ]
  639. if not recent_tts_fingerprints:
  640. return True # 没有最近的TTS参考
  641. # 与最近的TTS指纹比较(TTS播放期间更严格的标准)
  642. for tts_fp in recent_tts_fingerprints[-1:]: # 只检查最近1个
  643. # 能量差异检查(TTS播放期间更严格)
  644. energy_ratio = min(fingerprint.energy, tts_fp.energy) / \
  645. max(fingerprint.energy, tts_fp.energy)
  646. if energy_ratio > 0.6: # TTS播放期间需要更大的能量差异
  647. logger.debug(
  648. f"🚨 TTS播放期间能量过于相似({energy_ratio:.2f}),可能是回声")
  649. return False
  650. # 频谱特征差异检查(TTS播放期间更严格)
  651. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  652. features1 = np.array(fingerprint.spectral_features)
  653. features2 = np.array(tts_fp.spectral_features)
  654. # 计算简单的欧氏距离
  655. distance = np.linalg.norm(features1 - features2)
  656. if distance < 0.8: # TTS播放期间需要更大的频谱差异
  657. logger.debug(
  658. f"🚨 TTS播放期间频谱距离过小({distance:.2f}),可能是回声")
  659. return False
  660. # 检查相关性
  661. if np.linalg.norm(features1) > 0 and np.linalg.norm(features2) > 0:
  662. features1_norm = features1 / \
  663. np.linalg.norm(features1)
  664. features2_norm = features2 / \
  665. np.linalg.norm(features2)
  666. correlation = np.dot(
  667. features1_norm, features2_norm)
  668. if correlation > 0.5: # TTS播放期间相关性要求更严格
  669. logger.debug(
  670. f"🚨 TTS播放期间频谱相关性过高({correlation:.2f}),可能是回声")
  671. return False
  672. return True # 通过TTS播放期间的严格检查
  673. else:
  674. # 非TTS播放期间采用原来的宽松标准
  675. # 只检查最近0.5秒内的TTS,缩短时间窗口
  676. recent_tts_fingerprints = [
  677. fp for fp in self.tts_audio_fingerprints
  678. if current_time - fp.timestamp < 0.5
  679. ]
  680. if not recent_tts_fingerprints:
  681. return True # 没有最近的TTS参考
  682. # 与最近的TTS指纹比较(非常宽松的标准)
  683. for tts_fp in recent_tts_fingerprints[-1:]: # 只检查最近1个
  684. # 能量差异检查(更宽松)
  685. energy_ratio = min(fingerprint.energy, tts_fp.energy) / \
  686. max(fingerprint.energy, tts_fp.energy)
  687. if energy_ratio > 0.8: # 从0.5提高到0.8,需要更相似才认为可疑
  688. # 频谱特征差异检查(更宽松)
  689. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  690. features1 = np.array(fingerprint.spectral_features)
  691. features2 = np.array(tts_fp.spectral_features)
  692. # 计算简单的欧氏距离
  693. distance = np.linalg.norm(features1 - features2)
  694. if distance < 0.5: # 从1.0降低到0.5,需要更相似才认为是回声
  695. logger.debug(f"🚨 频谱距离过小({distance:.2f}),可能是回声")
  696. return False
  697. return True # 通过基本检查,认为是用户语音
  698. except Exception as e:
  699. logger.error(f"❌ 基本TTS差异检查失败: {e}")
  700. return True # 出错时倾向于认为是用户语音
  701. def _has_strict_difference_from_tts(self, fingerprint: AudioFingerprint) -> bool:
  702. """严格的与TTS差异检查(用于高能量音频)"""
  703. try:
  704. if not self.tts_audio_fingerprints:
  705. return True # 没有TTS参考,认为是用户语音
  706. current_time = fingerprint.timestamp
  707. # 检查最近1秒内的TTS,时间窗口更严格
  708. recent_tts_fingerprints = [
  709. fp for fp in self.tts_audio_fingerprints
  710. if current_time - fp.timestamp < 1.0
  711. ]
  712. if not recent_tts_fingerprints:
  713. return True # 没有最近的TTS参考
  714. # 与最近的TTS指纹比较(非常严格的标准)
  715. for tts_fp in recent_tts_fingerprints:
  716. # 1. 时间相关性检查 - 如果在TTS播放后很短时间内出现,更可能是回声
  717. time_diff = current_time - tts_fp.timestamp
  718. if time_diff < 0.3: # 300ms内
  719. logger.debug(f"🚫 时间过近({time_diff:.2f}s),可能是回声")
  720. return False
  721. # 2. 能量差异检查(非常严格)
  722. energy_ratio = min(fingerprint.energy, tts_fp.energy) / \
  723. max(fingerprint.energy, tts_fp.energy)
  724. if energy_ratio > 0.9: # 需要能量差异很大才认为不是回声
  725. logger.debug(f"🚫 能量过于相似({energy_ratio:.2f}),可能是回声")
  726. return False
  727. # 3. 频谱特征差异检查(非常严格)
  728. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  729. features1 = np.array(fingerprint.spectral_features)
  730. features2 = np.array(tts_fp.spectral_features)
  731. # 计算欧氏距离和相关性
  732. distance = np.linalg.norm(features1 - features2)
  733. if distance < 0.3: # 需要频谱差异很大
  734. logger.debug(f"🚫 频谱过于相似({distance:.2f}),可能是回声")
  735. return False
  736. # 检查相关性
  737. if np.linalg.norm(features1) > 0 and np.linalg.norm(features2) > 0:
  738. features1_norm = features1 / np.linalg.norm(features1)
  739. features2_norm = features2 / np.linalg.norm(features2)
  740. correlation = np.dot(features1_norm, features2_norm)
  741. if correlation > 0.7: # 相关性过高
  742. logger.debug(f"🚫 频谱相关性过高({correlation:.2f}),可能是回声")
  743. return False
  744. return True # 通过严格检查,认为是用户语音
  745. except Exception as e:
  746. logger.error(f"❌ 严格TTS差异检查失败: {e}")
  747. return False # 出错时倾向于认为是回声,减少误判
  748. def _is_definitely_echo(self, fingerprint: AudioFingerprint) -> bool:
  749. """确定判断是否为回声(用于非严格模式)"""
  750. try:
  751. if not self.tts_audio_fingerprints:
  752. return False # 没有TTS参考,不能确定是回声
  753. current_time = fingerprint.timestamp
  754. # 检查最近的TTS指纹
  755. recent_tts_fingerprints = [
  756. fp for fp in self.tts_audio_fingerprints
  757. if current_time - fp.timestamp < 1.5 # 1.5秒内的TTS
  758. ]
  759. if not recent_tts_fingerprints:
  760. return False # 没有最近的TTS参考
  761. for tts_fp in recent_tts_fingerprints:
  762. # 1. 时间相关性检查 - 如果在TTS播放后很短时间内出现
  763. time_diff = current_time - tts_fp.timestamp
  764. if time_diff < 0.3: # 300ms内
  765. # 2. 能量衰减检查 - 符合扬声器到麦克风的衰减特征
  766. energy_ratio = fingerprint.energy / tts_fp.energy
  767. if 0.1 <= energy_ratio <= 0.7: # 能量衰减10%-70%
  768. # 3. 频谱相似性检查 - 高度相似
  769. if len(fingerprint.spectral_features) == len(tts_fp.spectral_features):
  770. features1 = np.array(fingerprint.spectral_features)
  771. features2 = np.array(tts_fp.spectral_features)
  772. if np.linalg.norm(features1) > 0 and np.linalg.norm(features2) > 0:
  773. # 归一化比较
  774. features1_norm = features1 / \
  775. np.linalg.norm(features1)
  776. features2_norm = features2 / \
  777. np.linalg.norm(features2)
  778. similarity = np.dot(
  779. features1_norm, features2_norm)
  780. # 如果相似度很高,几乎确定是回声
  781. if similarity > 0.8:
  782. return True
  783. # 4. 完全匹配检查
  784. if fingerprint.fingerprint == tts_fp.fingerprint:
  785. return True
  786. return False # 不能确定是回声
  787. except Exception as e:
  788. logger.error(f"❌ 确定回声判断失败: {e}")
  789. return False # 出错时不确定是回声
  790. def _trigger_interrupt(self):
  791. """已移除:不再触发打断信号,语音正常播放进行消音"""
  792. try:
  793. # 防止重复触发
  794. current_time = time.time()
  795. if hasattr(self, '_last_trigger_time') and current_time - self._last_trigger_time < 0.5:
  796. logger.debug("🔇 打断信号防抖:跳过重复触发")
  797. return
  798. self._last_trigger_time = current_time
  799. # 不再创建待验证的打断请求,只进行消音处理
  800. logger.debug("🔇 回声消除检测到音频,进行消音处理但不打断播放")
  801. except Exception as e:
  802. logger.error(f"❌ 消音处理失败: {e}")
  803. import traceback
  804. logger.error(f"消音处理异常详情: {traceback.format_exc()}")
  805. def _is_very_strong_user_voice(self, fingerprint: AudioFingerprint) -> bool:
  806. """
  807. 检测是否为非常强的用户语音
  808. 用于在TTS播放期间让明显的用户语音通过,触发VAD和IAT
  809. """
  810. try:
  811. # 1. 适中的能量阈值 - 让更多用户语音通过
  812. high_energy_threshold = self.energy_threshold * 3.0 # 降低到3倍基础阈值
  813. if fingerprint.energy < high_energy_threshold:
  814. return False
  815. # 2. 检查是否有明显的人声特征
  816. if not self._has_obvious_voice_characteristics(fingerprint):
  817. return False
  818. # 3. 与TTS音频有足够差异
  819. if not self._has_significant_difference_from_tts(fingerprint):
  820. return False
  821. # logger.debug(f"🔊 检测到强用户语音,能量: {fingerprint.energy:.1f}")
  822. return True
  823. except Exception as e:
  824. logger.error(f"❌ 强用户语音检测失败: {e}")
  825. return False
  826. def _has_extreme_difference_from_tts(self, fingerprint: AudioFingerprint) -> bool:
  827. """检查与TTS音频是否有极大差异"""
  828. try:
  829. if not self.tts_audio_fingerprints:
  830. return True
  831. current_time = fingerprint.timestamp
  832. # 检查最近的TTS指纹
  833. recent_tts_fingerprints = [
  834. fp for fp in self.tts_audio_fingerprints
  835. if current_time - fp.timestamp < 3.0 # 3秒内的TTS,延长检查时间
  836. ]
  837. if not recent_tts_fingerprints:
  838. return True
  839. # 必须与所有TTS音频都有极大的能量差异
  840. for tts_fp in recent_tts_fingerprints:
  841. energy_ratio = min(fingerprint.energy, tts_fp.energy) / \
  842. max(fingerprint.energy, tts_fp.energy)
  843. if energy_ratio > 0.3: # 能量差异阈值降低到0.3,更严格
  844. return False
  845. # 进一步检查频谱特征差异
  846. for tts_fp in recent_tts_fingerprints:
  847. if self._has_similar_frequency_distribution(fingerprint, tts_fp, threshold=0.6):
  848. return False
  849. return True
  850. except Exception as e:
  851. logger.error(f"❌ 极大TTS差异检查失败: {e}")
  852. return False
  853. def _has_obvious_voice_characteristics(self, fingerprint: AudioFingerprint) -> bool:
  854. """检查是否有明显的人声特征"""
  855. try:
  856. features = np.array(fingerprint.spectral_features)
  857. if len(features) < 6:
  858. return False
  859. # 简单但有效的人声检查
  860. low_freq = np.mean(features[:2]) # 低频
  861. mid_freq = np.mean(features[2:5]) # 中频
  862. high_freq = np.mean(features[5:]) # 高频
  863. # 放宽人声检测条件
  864. # 1. 中频能量较强(主要条件)
  865. if mid_freq > low_freq * 0.6 and mid_freq > high_freq * 0.6:
  866. return True
  867. # 2. 或者能量分布相对均匀(人声特征)
  868. total_energy = low_freq + mid_freq + high_freq
  869. if total_energy > 0:
  870. mid_ratio = mid_freq / total_energy
  871. if 0.25 < mid_ratio < 0.6: # 中频占比合理范围
  872. return True
  873. return False
  874. except Exception as e:
  875. logger.error(f"❌ 明显人声特征检测失败: {e}")
  876. return False
  877. # 已移除:_is_very_likely_user_voice_during_tts 方法(不再需要TTS期间的打断检测)
  878. def _has_strong_voice_characteristics(self, fingerprint: AudioFingerprint) -> bool:
  879. """检查是否具有强烈的人声特征"""
  880. try:
  881. features = np.array(fingerprint.spectral_features)
  882. if len(features) < 8:
  883. return False
  884. # 更严格的频谱分布检查
  885. low_freq = np.mean(features[:2]) # 低频
  886. mid_freq = np.mean(features[2:6]) # 中频
  887. high_freq = np.mean(features[6:]) # 高频
  888. # 中频能量必须明显强于低频和高频(人声特征)
  889. if mid_freq < low_freq * 1.2 or mid_freq < high_freq * 1.5:
  890. return False
  891. # 检查频谱的平滑度 - 人声通常有特定的共振峰
  892. spectral_variance = np.var(features)
  893. if spectral_variance < 0.1: # 频谱过于平滑可能是TTS
  894. return False
  895. return True
  896. except Exception as e:
  897. logger.error(f"❌ 强人声特征检测失败: {e}")
  898. return False
  899. def _has_high_spectral_complexity(self, fingerprint: AudioFingerprint) -> bool:
  900. """检查是否具有高频谱复杂度"""
  901. try:
  902. features = np.array(fingerprint.spectral_features)
  903. if len(features) < 4:
  904. return False
  905. # 计算频谱的熵(复杂度指标)
  906. normalized_features = features / \
  907. np.sum(features) if np.sum(features) > 0 else features
  908. entropy = -np.sum(normalized_features *
  909. np.log(normalized_features + 1e-10))
  910. # 人声通常有较高的熵值
  911. min_entropy = 2.0 # 根据实际情况调整
  912. if entropy < min_entropy:
  913. return False
  914. return True
  915. except Exception as e:
  916. logger.error(f"❌ 频谱复杂度检测失败: {e}")
  917. return False
  918. def _is_possibly_user_voice_during_tts(self, fingerprint: AudioFingerprint) -> bool:
  919. """
  920. 在TTS播放期间判断是否可能是用户语音
  921. 使用更宽松的标准,主要用于创建待验证的打断请求
  922. """
  923. try:
  924. # 1. 基本能量检查 - 需要足够的能量
  925. min_energy_threshold = self.energy_threshold * 2.0 # 降低能量要求
  926. if fingerprint.energy < min_energy_threshold:
  927. return False
  928. # 2. 与TTS音频的基本差异检查
  929. if not self._has_basic_difference_from_tts(fingerprint):
  930. return False
  931. # 3. 检查是否有人声特征
  932. if not self._has_voice_characteristics(fingerprint):
  933. return False
  934. return True
  935. except Exception as e:
  936. logger.error(f"❌ TTS期间用户语音检测失败: {e}")
  937. return False
  938. def _has_voice_characteristics(self, fingerprint: AudioFingerprint) -> bool:
  939. """检查音频是否具有人声特征"""
  940. try:
  941. features = np.array(fingerprint.spectral_features)
  942. if len(features) < 4:
  943. return False
  944. # 检查频谱分布 - 人声通常在中频有较强能量
  945. if len(features) >= 8:
  946. low_freq = np.mean(features[:3])
  947. mid_freq = np.mean(features[3:7])
  948. high_freq = np.mean(features[7:]) if len(features) > 7 else 0
  949. # 中频能量应该相对较强
  950. if mid_freq < low_freq * 0.5:
  951. return False
  952. return True
  953. except Exception as e:
  954. logger.error(f"❌ 人声特征检测失败: {e}")
  955. return False
  956. # 已移除:_create_pending_interrupt_request 方法(不再需要创建待验证的打断请求)
  class SelfVoiceDetector:
      """Self-voice detector built on top of an ``EchoCancellationEngine``.

      The detector forwards TTS playback state to the engine, notifies
      registered callbacks about state changes, answers whether captured
      audio should be ignored as echo, and runs a daemon thread that
      periodically calls ``echo_engine.cleanup_old_fingerprints()``.
      """

      # Seconds to wait between two cleanup passes.
      _CLEANUP_INTERVAL = 0.1
      # Seconds to wait after a cleanup pass raised an exception.
      _CLEANUP_ERROR_BACKOFF = 1.0

      def __init__(self):
          """Create the engine, the callback registry and start the cleanup thread."""
          self.echo_engine = EchoCancellationEngine()
          self.voice_callbacks: Dict[str, Callable] = {}
          self._lock = threading.RLock()

          # The stop event exists before the thread starts so the loop can
          # always read it.
          self._stop_cleanup = threading.Event()
          self._cleanup_thread = threading.Thread(
              target=self._periodic_cleanup, daemon=True)
          self._cleanup_thread.start()

          logger.info("🎯 自我声音检测器已初始化")

      def register_voice_callback(self, name: str, callback: Callable):
          """Register ``callback`` under ``name``, replacing any previous entry.

          Callbacks are invoked as ``callback(event_name, payload_dict)``.
          """
          with self._lock:
              self.voice_callbacks[name] = callback
              logger.info(f"📝 注册声音检测回调: {name}")

      def unregister_voice_callback(self, name: str):
          """Remove the callback registered under ``name``; unknown names are ignored."""
          with self._lock:
              if name in self.voice_callbacks:
                  del self.voice_callbacks[name]
                  logger.info(f"🗑️ 注销声音检测回调: {name}")

      def set_tts_playing(self, is_playing: bool, audio_data: Optional[bytes] = None):
          """Forward the TTS playback state to the engine and notify callbacks.

          Each callback receives ``'tts_status_changed'`` and a dict with the
          keys ``'is_playing'`` and ``'timestamp'``. An exception raised by a
          callback is logged and does not prevent the others from running.

          Args:
              is_playing: Whether TTS audio is currently being played.
              audio_data: Optional audio bytes handed to the engine unchanged.
          """
          self.echo_engine.set_tts_playing_status(is_playing, audio_data)

          with self._lock:
              # Iterate over a snapshot: the lock is re-entrant, so a callback
              # may register or unregister callbacks, and mutating the dict
              # while iterating it would raise RuntimeError outside the
              # per-callback try block. Callbacks removed during dispatch are
              # still notified for this event.
              for name, callback in list(self.voice_callbacks.items()):
                  try:
                      callback('tts_status_changed', {
                          'is_playing': is_playing,
                          'timestamp': time.time()
                      })
                  except Exception as e:
                      logger.error(f"❌ 声音检测回调 {name} 执行失败: {e}")

      def should_ignore_audio(self, audio_data: bytes) -> bool:
          """Return the engine's verdict on whether ``audio_data`` is echo."""
          return self.echo_engine.is_echo_audio(audio_data)

      def process_recording_audio(self, audio_data: bytes) -> bool:
          """Return True when recorded audio should be processed further.

          Audio that ``should_ignore_audio`` classifies as echo yields False.
          """
          if self.should_ignore_audio(audio_data):
              return False
          return True

      def _periodic_cleanup(self):
          """Cleanup loop executed by the daemon thread until shutdown.

          Waiting is done on the stop event rather than with ``time.sleep``
          so that ``shutdown()`` wakes the loop immediately instead of
          waiting out the current pause.
          """
          while not self._stop_cleanup.is_set():
              try:
                  self.echo_engine.cleanup_old_fingerprints()
                  pause = self._CLEANUP_INTERVAL
              except Exception as e:
                  logger.error(f"❌ 定期清理失败: {e}")
                  pause = self._CLEANUP_ERROR_BACKOFF  # back off after a failure
              self._stop_cleanup.wait(pause)

      def get_detection_stats(self) -> Dict:
          """Return the statistics reported by ``echo_engine.get_stats()``."""
          return self.echo_engine.get_stats()

      def enable_echo_cancellation(self):
          """Enable echo cancellation on the engine."""
          self.echo_engine.enable()

      def disable_echo_cancellation(self):
          """Disable echo cancellation on the engine."""
          self.echo_engine.disable()

      def shutdown(self):
          """Stop the cleanup thread, waiting at most two seconds for it to exit."""
          logger.info("🔄 关闭自我声音检测器...")
          self._stop_cleanup.set()
          if self._cleanup_thread.is_alive():
              self._cleanup_thread.join(timeout=2.0)
          logger.info("✅ 自我声音检测器已关闭")
  1027. # 全局实例
  1028. _self_voice_detector: Optional[SelfVoiceDetector] = None
  1029. _detector_lock = threading.Lock()
  def get_self_voice_detector() -> SelfVoiceDetector:
      """Return the shared ``SelfVoiceDetector``, creating it on first use.

      Creation happens while holding ``_detector_lock`` so that concurrent
      callers receive the same instance.
      """
      global _self_voice_detector
      with _detector_lock:
          detector = _self_voice_detector
          if detector is None:
              detector = SelfVoiceDetector()
              _self_voice_detector = detector
      return detector
  def cleanup_self_voice_detector():
      """Shut down and discard the shared detector, if one exists.

      Runs under ``_detector_lock``. The global reference is cleared only
      after ``shutdown()`` has returned; a later call to
      ``get_self_voice_detector`` then creates a fresh instance.
      """
      global _self_voice_detector
      with _detector_lock:
          detector = _self_voice_detector
          if detector is None:
              return
          detector.shutdown()
          _self_voice_detector = None