move claude-marketplace to ai-proj-helper
This commit is contained in:
186
plugins/doubao-voice-plugin/scripts/README.md
Normal file
186
plugins/doubao-voice-plugin/scripts/README.md
Normal file
@@ -0,0 +1,186 @@
|
||||
# 豆包语音工具使用指南
|
||||
|
||||
简单易用的豆包语音命令行工具,支持**文字转语音(TTS)**和**唱歌**。
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 1. 配置环境变量
|
||||
|
||||
```bash
|
||||
# 在 ~/.zshrc 或 ~/.bashrc 中添加
|
||||
export DOUBAO_APP_ID="your_app_id"
|
||||
export DOUBAO_ACCESS_TOKEN="your_access_token"
|
||||
|
||||
# 使配置生效
|
||||
source ~/.zshrc
|
||||
```
|
||||
|
||||
### 2. 安装依赖
|
||||
|
||||
```bash
|
||||
pip install requests websockets
|
||||
```
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 📝 文字转语音 (TTS)
|
||||
|
||||
**基础用法:**
|
||||
```bash
|
||||
python voice_converter.py tts "你好,我是豆包语音助手"
|
||||
```
|
||||
|
||||
**指定输出文件和音色:**
|
||||
```bash
|
||||
python voice_converter.py tts "欢迎使用豆包语音" -o welcome.mp3 -v BV701_V2_streaming
|
||||
```
|
||||
|
||||
**可用音色:**
|
||||
- `BV700_V2_streaming` - 通用女声(默认,推荐)
|
||||
- `BV701_V2_streaming` - 通用男声
|
||||
- `BV406_streaming` - 温柔女声
|
||||
- `BV158_streaming` - 活泼女声
|
||||
- `BV115_streaming` - 磁性男声
|
||||
|
||||
### 🎵 唱歌 (Singing)
|
||||
|
||||
**基础用法:**
|
||||
```bash
|
||||
python singing.py sing "请唱一首关于春天的歌"
|
||||
```
|
||||
|
||||
**指定输出文件(当前实现保存的是原始 PCM 音频,24000Hz 单声道;文件名以 `.mp3` 结尾时会改存为同名的 `.pcm` 文件):**
|
||||
```bash
|
||||
python singing.py sing "唱一个温柔的摇篮曲" -o lullaby.mp3
|
||||
```
|
||||
|
||||
**交互式模式(实时对话):**
|
||||
```bash
|
||||
python singing.py interactive
|
||||
```
|
||||
|
||||
在交互模式下可以自然地与豆包对话,要求她唱歌、讲故事等。输入 `quit` 退出。注意:当前的 `singing.py` 只实现了 `sing` 子命令,`interactive` 子命令尚未实现。
|
||||
|
||||
## Python 代码调用
|
||||
|
||||
```python
|
||||
# TTS - 文字转语音
|
||||
from voice_converter import DoubaoVoiceConverter
|
||||
|
||||
converter = DoubaoVoiceConverter()
|
||||
audio_file = converter.text_to_speech(
|
||||
"你好,欢迎使用豆包语音",
|
||||
output_file="hello.mp3",
|
||||
voice_type="BV700_V2_streaming"
|
||||
)
|
||||
print(f"生成语音: {audio_file}")
|
||||
|
||||
# 唱歌
|
||||
import asyncio
|
||||
from singing import DoubaoSinging
|
||||
|
||||
async def main():
|
||||
singing = DoubaoSinging()
|
||||
|
||||
# 让豆包唱歌
|
||||
audio_file = await singing.sing(
|
||||
"请唱一首情歌",
|
||||
output_file="love_song.mp3",
|
||||
language="zh-CN"
|
||||
)
|
||||
print(f"唱歌完成: {audio_file}")
|
||||
|
||||
# 或启动交互模式
|
||||
# await singing.interactive_singing()
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## 完整示例
|
||||
|
||||
### 示例1:生成通知语音
|
||||
|
||||
```bash
|
||||
# 生成女声通知
|
||||
python voice_converter.py tts "您有一条新消息,请注意查收" -o notification.mp3
|
||||
|
||||
# 生成男声通知
|
||||
python voice_converter.py tts "系统将在5分钟后进行维护" -o maintenance.mp3 -v BV701_V2_streaming
|
||||
```
|
||||
|
||||
### 示例2:唱歌
|
||||
|
||||
```bash
|
||||
# 让豆包唱一首情歌
|
||||
python singing.py sing "请唱一首温柔的情歌" -o love_song.mp3
|
||||
|
||||
# 让豆包唱一首儿歌
|
||||
python singing.py sing "唱一首欢快的儿歌" -o kids_song.mp3
|
||||
|
||||
# 启动交互式模式与豆包对话
|
||||
python singing.py interactive
|
||||
```
|
||||
|
||||
|
||||
## 错误处理
|
||||
|
||||
### 常见错误
|
||||
|
||||
**1. 环境变量未设置**
|
||||
```
|
||||
❌ 错误: 请先设置环境变量:
|
||||
export DOUBAO_APP_ID='your_app_id'
|
||||
export DOUBAO_ACCESS_TOKEN='your_access_token'
|
||||
```
|
||||
**解决:** 确保已正确设置环境变量并 `source ~/.zshrc`
|
||||
|
||||
**2. API 调用失败**
|
||||
```
|
||||
❌ 错误: TTS 失败 (code: 4001): Invalid token
|
||||
```
|
||||
**解决:** 检查 Access Token 是否正确或已过期
|
||||
|
||||
## 技术参数
|
||||
|
||||
### 音频格式要求
|
||||
|
||||
**TTS 输出:**
|
||||
- 格式:MP3
|
||||
- 采样率:16000 Hz
|
||||
- 声道:单声道
|
||||
|
||||
### API 限制
|
||||
|
||||
- **TTS**: 单次最长 5000 字符
|
||||
- **并发限制**: 根据购买的并发数
|
||||
|
||||
## 在 Claude Code 中使用
|
||||
|
||||
在 Claude Code 中可以直接用自然语言调用:
|
||||
|
||||
**TTS - 文字转语音**:
|
||||
```
|
||||
"把这段话转成语音:你好世界"
|
||||
"用温柔女声合成:欢迎光临"
|
||||
```
|
||||
|
||||
**唱歌**:
|
||||
```
|
||||
"请唱一首关于春天的歌"
|
||||
"唱一个温柔的摇篮曲"
|
||||
"开启与豆包的实时语音对话模式"
|
||||
```
|
||||
|
||||
## 获取 API 凭证
|
||||
|
||||
1. 访问 [火山引擎控制台](https://console.volcengine.com/speech/app)
|
||||
2. 创建应用
|
||||
3. 获取 App ID 和 Access Token
|
||||
4. 开通所需服务:
|
||||
- 豆包语音合成模型2.0
- 端到端实时语音大模型(唱歌功能需要)
|
||||
|
||||
## 参考链接
|
||||
|
||||
- [火山引擎豆包语音文档](https://www.volcengine.com/docs/6561)
|
||||
- [API 接口文档](https://www.volcengine.com/docs/6561/1096680)
|
||||
- [计费说明](https://www.volcengine.com/docs/6561/1359370)
|
||||
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
# Doubao voice API environment variables (local copy).
#
# Usage:
#   1. Copy this file:  cp setup_env.local.sh.example setup_env.local.sh
#   2. Edit setup_env.local.sh and fill in your real credentials
#   3. Load it into the current shell:  source setup_env.local.sh
#   4. .gitignore is configured to ignore setup_env.local.sh, so your
#      credentials are not committed to Git

# IMPORTANT: put your real credentials below (local use only).
export DOUBAO_APP_ID="your_app_id_here"
export DOUBAO_ACCESS_TOKEN="your_access_token_here"

# V3 API settings (optional; needed for the Doubao 2.0 voices)
# export DOUBAO_USE_V3="true"
# export DOUBAO_RESOURCE_ID="volc.bigmodel.tts"

# Confirmation banner followed by an empty line.
printf '%s\n' "✅ 豆包语音 API 环境变量已设置(本地配置)" ""
# Only a prefix of each value is shown so the full secret never reaches the terminal.
printf 'App ID: %s...\n' "${DOUBAO_APP_ID:0:10}"
printf 'Access Token: %s...\n' "${DOUBAO_ACCESS_TOKEN:0:20}"
|
||||
22
plugins/doubao-voice-plugin/scripts/setup_env.sh
Executable file
22
plugins/doubao-voice-plugin/scripts/setup_env.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/bin/bash
# Doubao voice API environment variables (example).
#
# IMPORTANT: this is a sample script that only contains placeholders.
# For local use, create setup_env.local.sh from setup_env.local.sh.example
# and put your real credentials there. .gitignore is configured to ignore
# .local files.
#
# Intended to be sourced so the exports land in the calling shell.

export DOUBAO_APP_ID="your_app_id"
export DOUBAO_ACCESS_TOKEN="your_access_token"

# V3 API settings (optional; needed for the Doubao 2.0 voices)
# export DOUBAO_USE_V3="true"
# export DOUBAO_RESOURCE_ID="volc.bigmodel.tts"

echo "✅ 豆包语音 API 环境变量已设置"
echo ""
echo "App ID: $DOUBAO_APP_ID"
# Show only a prefix of the token so the full secret is never printed.
echo "Access Token: ${DOUBAO_ACCESS_TOKEN:0:20}..."
echo ""
echo "现在可以运行:"
# Only the tts sub-command is advertised: it is the only sub-command that
# voice_converter.py registers, so any other hint would fail in argparse.
echo " python3 voice_converter.py tts \"你好世界\" -o hello.mp3"
|
||||
327
plugins/doubao-voice-plugin/scripts/singing.py
Executable file
327
plugins/doubao-voice-plugin/scripts/singing.py
Executable file
@@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
豆包唱歌工具
|
||||
基于豆包端到端实时语音大模型,支持让豆包唱歌
|
||||
使用WebSocket实时对话和生成音频
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import asyncio
|
||||
import websockets
|
||||
import struct
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# Connection-level events: frames for these carry no session_id field.
CONNECTION_EVENTS = {1, 2, 50, 51, 52}


class DoubaoSinging:
    """Text-in, audio-out client for the Doubao realtime dialogue WebSocket API.

    Credentials are read from the DOUBAO_APP_ID and DOUBAO_ACCESS_TOKEN
    environment variables when the object is constructed.
    """

    def __init__(self):
        """Load credentials from the environment and set the endpoint constants.

        Raises:
            ValueError: if DOUBAO_APP_ID or DOUBAO_ACCESS_TOKEN is unset or empty.
        """
        self.app_id = os.environ.get("DOUBAO_APP_ID")
        self.access_token = os.environ.get("DOUBAO_ACCESS_TOKEN")

        if not self.app_id or not self.access_token:
            raise ValueError(
                "请先设置环境变量:\n"
                "export DOUBAO_APP_ID='your_app_id'\n"
                "export DOUBAO_ACCESS_TOKEN='your_access_token'"
            )

        # Realtime dialogue WebSocket endpoint.
        self.ws_url = "wss://openspeech.bytedance.com/api/v3/realtime/dialogue"
        self.app_key = "PlgvMymc7f3tQnJ6"  # fixed value
        self.resource_id = "volc.speech.dialog"  # fixed value

    def _build_message(
        self,
        event_id: int,
        payload: Optional[dict] = None,
        session_id: Optional[str] = None,
    ) -> bytes:
        """Serialize one client frame.

        Frame layout:
        - header (4 bytes)
        - event_id (4 bytes, big-endian)
        - [session_id_len (4 bytes) + session_id] -- only for events that are
          not in CONNECTION_EVENTS
        - payload_len (4 bytes, big-endian)
        - payload (JSON, UTF-8)

        Args:
            event_id: numeric event code.
            payload: JSON-serializable dict; None or an empty dict is sent as ``{}``.
            session_id: session identifier; None is sent as an empty string.

        Returns:
            The encoded frame.
        """
        buf = bytearray()

        # Header (4 bytes)
        buf.append(0x11)  # version=1, header_size=1
        buf.append(0x14)  # FULL_CLIENT_REQUEST(0x1) + WITH_EVENT(0x4)
        buf.append(0x10)  # JSON serialization, no compression
        buf.append(0x00)  # reserved

        # Event ID
        buf.extend(struct.pack('>I', event_id))

        # Session ID (required for non-connection events)
        if event_id not in CONNECTION_EVENTS:
            sid_bytes = (session_id or "").encode('utf-8')
            buf.extend(struct.pack('>I', len(sid_bytes)))
            buf.extend(sid_bytes)

        # Payload
        if payload:
            payload_bytes = json.dumps(payload, ensure_ascii=False).encode('utf-8')
        else:
            payload_bytes = b'{}'
        buf.extend(struct.pack('>I', len(payload_bytes)))
        buf.extend(payload_bytes)

        return bytes(buf)

    def _parse_response(self, data: bytes) -> dict:
        """Decode one server frame into a dict.

        Parsing is best-effort: fields that cannot be read because the frame
        is too short are simply left out of the result.

        Args:
            data: the raw frame. Anything that is not bytes-like (for example
                a text frame delivered as ``str``) or is shorter than the
                4-byte header is returned untouched under ``"raw"``.

        Returns:
            dict that always contains ``raw`` and may contain ``msg_type``,
            ``event_id``, ``connect_id``, ``session_id``, ``payload_bytes``,
            ``is_audio``, ``payload`` (decoded JSON) or ``payload_text``
            (payload that was not valid UTF-8 JSON).
        """
        result = {"raw": data}
        # Text frames cannot be bit-shifted; return them as-is instead of raising TypeError.
        if not isinstance(data, (bytes, bytearray)) or len(data) < 4:
            return result

        # Header: high nibble of byte 1 is the message type, low nibble the flags.
        msg_type = (data[1] >> 4) & 0x0F
        flags = data[1] & 0x0F
        result["msg_type"] = msg_type

        offset = 4

        # Event ID (if WITH_EVENT flag)
        if flags & 0x04 and len(data) >= offset + 4:
            event_id = struct.unpack('>I', data[offset:offset + 4])[0]
            result["event_id"] = event_id
            offset += 4

            # Connect ID for connection events (50, 51, 52)
            if event_id in {50, 51, 52} and len(data) >= offset + 4:
                cid_len = struct.unpack('>I', data[offset:offset + 4])[0]
                offset += 4
                if len(data) >= offset + cid_len:
                    result["connect_id"] = data[offset:offset + cid_len].decode('utf-8', errors='ignore')
                    offset += cid_len
            # Session ID for session-level events
            elif event_id not in CONNECTION_EVENTS and len(data) >= offset + 4:
                sid_len = struct.unpack('>I', data[offset:offset + 4])[0]
                offset += 4
                if len(data) >= offset + sid_len:
                    result["session_id"] = data[offset:offset + sid_len].decode('utf-8', errors='ignore')
                    offset += sid_len

        # Payload
        if len(data) >= offset + 4:
            payload_len = struct.unpack('>I', data[offset:offset + 4])[0]
            offset += 4
            if len(data) >= offset + payload_len:
                payload_raw = data[offset:offset + payload_len]
                result["payload_bytes"] = payload_raw
                # Audio-only responses (msg_type 0xB) have raw audio
                if msg_type == 0x0B:
                    result["is_audio"] = True
                else:
                    try:
                        result["payload"] = json.loads(payload_raw.decode('utf-8'))
                    except ValueError:
                        # UnicodeDecodeError and json.JSONDecodeError are both
                        # ValueError subclasses; keep the text for diagnostics.
                        result["payload_text"] = payload_raw.decode('utf-8', errors='ignore')

        return result

    @staticmethod
    def _pcm_output_path(output_file: str) -> str:
        """Return the path the PCM stream is written to.

        Only a trailing ``.mp3`` suffix is swapped for ``.pcm``; any other
        ``.mp3`` occurring earlier in the path (e.g. in a directory name) is
        left alone. Other names are returned unchanged.
        """
        if output_file.endswith('.mp3'):
            return output_file[:-len('.mp3')] + '.pcm'
        return output_file

    async def sing(
        self,
        song_request: str,
        output_file: str = "singing_output.mp3",
        language: str = "zh-CN",
        model: str = "1.2.1.0",
    ) -> Optional[str]:
        """Send a text request to Doubao and save the audio it streams back.

        The session asks the server for 24000 Hz mono PCM, and the received
        audio payloads are written to disk verbatim.

        Args:
            song_request: the request text, e.g. "请唱一首关于春天的歌".
            output_file: destination path; a trailing ``.mp3`` is replaced by
                ``.pcm`` because the data written is raw PCM.
            language: language code (zh-CN/en-US). Accepted for interface
                compatibility; it is not sent to the server by this method.
            model: model version placed in the StartSession payload.

        Returns:
            The path actually written, or None when the session reported an
            error or no audio was received.

        Raises:
            Exception: wrapping any WebSocket or other failure (original
                error chained as ``__cause__``).
        """
        print(f"🎵 豆包唱歌中...")
        print(f" 请求: {song_request}")
        print(f" 模型: {model}")

        try:
            audio_data = bytearray()
            session_id = str(uuid.uuid4())

            # WebSocket handshake headers
            headers = {
                "X-Api-App-ID": self.app_id,
                "X-Api-Access-Key": self.access_token,
                "X-Api-Resource-Id": self.resource_id,
                "X-Api-App-Key": self.app_key,
                "X-Api-Connect-Id": str(uuid.uuid4()),
            }

            async with websockets.connect(self.ws_url, additional_headers=headers) as websocket:
                print("✅ WebSocket连接成功")

                # 1. StartConnection (event_id=1, no session_id)
                await websocket.send(self._build_message(1))
                response = await asyncio.wait_for(websocket.recv(), timeout=5)
                resp = self._parse_response(response)
                if resp.get("event_id") == 50:
                    print(f"✅ 连接已建立")
                else:
                    print(f"⚠️ 连接响应: {resp}")

                # 2. StartSession (event_id=100, needs session_id)
                start_session_payload = {
                    "tts": {
                        "audio_config": {
                            "channel": 1,
                            "format": "pcm",
                            "sample_rate": 24000
                        }
                    },
                    "dialog": {
                        "extra": {
                            "enable_music": True,
                            "input_mod": "text",
                            "model": model
                        }
                    }
                }
                await websocket.send(self._build_message(100, start_session_payload, session_id))
                response = await asyncio.wait_for(websocket.recv(), timeout=5)
                resp = self._parse_response(response)
                if resp.get("event_id") == 150:
                    print(f"✅ 会话已建立")
                elif resp.get("payload", {}).get("error"):
                    print(f"❌ 会话错误: {resp['payload']['error']}")
                    return None
                else:
                    print(f"📋 会话响应: {resp}")

                # 3. SayHello/ChatTextQuery (event_id=300, needs session_id)
                chat_payload = {"content": song_request}
                await websocket.send(self._build_message(300, chat_payload, session_id))
                print(f"📤 已发送唱歌请求")

                # 4. Receive the audio stream; a quiet period marks the end.
                print("\n📋 接收音频流...")
                tts_started = False
                recv_timeout = 5  # no data for 5 seconds is treated as end of stream

                while True:
                    try:
                        message = await asyncio.wait_for(websocket.recv(), timeout=recv_timeout)
                    except asyncio.TimeoutError:
                        break
                    except websockets.exceptions.ConnectionClosed:
                        break

                    if isinstance(message, bytes) and len(message) >= 4:
                        resp = self._parse_response(message)
                        msg_type = resp.get("msg_type", 0)
                        flags = message[1] & 0x0F

                        # Audio-only response (0xB = 11)
                        if resp.get("is_audio") and resp.get("payload_bytes"):
                            audio_data.extend(resp["payload_bytes"])
                            if not tts_started:
                                print(f" 接收音频中...", end="", flush=True)
                                tts_started = True
                            else:
                                print(".", end="", flush=True)

                            # NEG_SEQUENCE flag = last packet
                            if flags & 0x02:
                                break

                        # Server error (0xF = 15)
                        elif msg_type == 0x0F:
                            error = resp.get("payload", {}).get("error", "unknown")
                            print(f"\n❌ 服务器错误: {error}")
                            break

                        # Full server response (0x9): events 152 and 52 end the loop
                        elif msg_type == 0x09:
                            event_id = resp.get("event_id", 0)
                            if event_id in {152, 52}:
                                break

                # 5. Save the audio
                if audio_data:
                    # The stream is raw PCM, so a ".mp3" suffix would be misleading.
                    actual_output = self._pcm_output_path(output_file)

                    with open(actual_output, "wb") as f:
                        f.write(audio_data)

                    file_size = len(audio_data) / 1024
                    print(f"\n\n✅ 唱歌完成!")
                    print(f" 输出: {actual_output} ({file_size:.1f} KB)")
                    print(f" 格式: PCM (24000Hz, 单声道)")
                    return actual_output
                else:
                    print("\n⚠️ 未收到音频数据,请检查:")
                    print(" 1. 凭证是否正确")
                    print(" 2. 端到端实时语音大模型是否已开通")
                    print(" 3. 网络连接是否正常")
                    return None

        except websockets.exceptions.WebSocketException as e:
            raise Exception(f"WebSocket连接错误: {str(e)}") from e
        except Exception as e:
            raise Exception(f"唱歌调用失败: {str(e)}") from e
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the singing tool.

    Sub-commands:
        sing REQUEST [-o OUTPUT] [-l LANGUAGE] [-m MODEL]

    With no sub-command the help text is printed and the function returns.
    The process exits with status 1 when an exception is raised or when
    ``sing`` returns None (no audio was produced), so shell callers can
    detect the failure.
    """
    import argparse

    parser = argparse.ArgumentParser(description="豆包唱歌工具")
    subparsers = parser.add_subparsers(dest="command", help="选择功能")

    # "sing" sub-command
    sing_parser = subparsers.add_parser("sing", help="让豆包唱歌")
    sing_parser.add_argument("request", help="唱歌请求,如 '请唱一首关于春天的歌'")
    sing_parser.add_argument(
        "-o", "--output", default="singing_output.mp3", help="输出音频文件(默认: singing_output.mp3)"
    )
    sing_parser.add_argument(
        "-l", "--language", default="zh-CN", help="语言代码(默认: zh-CN)"
    )
    sing_parser.add_argument(
        "-m", "--model", default="1.2.1.0", help="模型版本(默认: 1.2.1.0=O2.0版本)"
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    try:
        singing = DoubaoSinging()

        if args.command == "sing":
            result = asyncio.run(singing.sing(args.request, args.output, args.language, args.model))
            if result is None:
                # sing() has already printed the reason; SystemExit is not an
                # Exception subclass, so the handler below does not catch it.
                sys.exit(1)

    except Exception as e:
        print(f"❌ 错误: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
171
plugins/doubao-voice-plugin/scripts/voice_converter.py
Executable file
171
plugins/doubao-voice-plugin/scripts/voice_converter.py
Executable file
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
豆包语音转换工具
|
||||
支持:文字转语音 (TTS)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import base64
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class DoubaoVoiceConverter:
    """Text-to-speech client for the Doubao voice HTTP API.

    Credentials are read from the DOUBAO_APP_ID and DOUBAO_ACCESS_TOKEN
    environment variables when the object is constructed.
    """

    def __init__(self):
        """Load credentials and choose the API endpoint.

        DOUBAO_USE_V3="true" (case-insensitive) selects the V3 endpoint and
        reads DOUBAO_RESOURCE_ID (default "volc.bigmodel.tts"); otherwise the
        V1 endpoint is used.

        Raises:
            ValueError: if DOUBAO_APP_ID or DOUBAO_ACCESS_TOKEN is unset or empty.
        """
        self.app_id = os.environ.get("DOUBAO_APP_ID")
        self.access_token = os.environ.get("DOUBAO_ACCESS_TOKEN")

        if not self.app_id or not self.access_token:
            raise ValueError(
                "请先设置环境变量:\n"
                "export DOUBAO_APP_ID='your_app_id'\n"
                "export DOUBAO_ACCESS_TOKEN='your_access_token'"
            )

        # API version: V1 (default, basic voices) or V3 (Doubao 2.0, extra configuration)
        self.use_v3 = os.environ.get("DOUBAO_USE_V3", "false").lower() == "true"

        if self.use_v3:
            self.tts_url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
            self.resource_id = os.environ.get("DOUBAO_RESOURCE_ID", "volc.bigmodel.tts")
        else:
            # V1 API - basic voices
            self.tts_url = "https://openspeech.bytedance.com/api/v1/tts"

    @staticmethod
    def _printable_response(data):
        """Return a copy of the response that is safe to dump to the console.

        A long string in the "data" field (the base64 audio) is replaced by a
        short placeholder; everything else is passed through unchanged.
        """
        if not isinstance(data, dict):
            return data
        audio_field = data.get("data")
        if isinstance(audio_field, str) and len(audio_field) > 64:
            shown = dict(data)
            shown["data"] = f"<base64 audio omitted: {len(audio_field)} chars>"
            return shown
        return data

    def text_to_speech(
        self,
        text: str,
        output_file: str = "output.mp3",
        voice_type: str = "BV700_V2_streaming"
    ) -> str:
        """Synthesize ``text`` and write the MP3 audio to ``output_file``.

        Args:
            text: the text to synthesize.
            output_file: path of the audio file to write.
            voice_type: voice identifier, for example
                - BV700_V2_streaming: general female voice (recommended)
                - BV701_V2_streaming: general male voice
                - BV406_streaming: gentle female voice
                - BV158_streaming: lively female voice
                - BV115_streaming: magnetic male voice

        Returns:
            ``output_file``.

        Raises:
            Exception: on timeout, on a response whose ``code`` is not 3000,
                or on any other failure (original error chained as ``__cause__``).
        """
        print(f"📝 文字转语音中...")
        print(f" 文字: {text[:50]}{'...' if len(text) > 50 else ''}")
        print(f" 音色: {voice_type}")

        headers = {
            # "Bearer;" with a semicolon is the separator this API is called with.
            "Authorization": f"Bearer;{self.access_token}",
            "Content-Type": "application/json"
        }

        # The V3 API additionally needs a Resource-Id header.
        if self.use_v3:
            headers["Resource-Id"] = self.resource_id

        payload = {
            "app": {
                "appid": self.app_id,
                "token": self.access_token,
                "cluster": "volcano_tts"
            },
            "user": {
                "uid": "user_001"
            },
            "audio": {
                "voice_type": voice_type,
                "encoding": "mp3",
                "speed_ratio": 1.0,
                "volume_ratio": 1.0,
                "pitch_ratio": 1.0
            },
            "request": {
                "reqid": f"tts_{os.urandom(8).hex()}",
                "text": text,
                "text_type": "plain",
                "operation": "query"
            }
        }

        try:
            response = requests.post(self.tts_url, headers=headers, json=payload, timeout=30)

            # Response metadata
            print(f"\n📋 响应信息:")
            print(f" HTTP状态码: {response.status_code}")
            if 'X-Tt-Logid' in response.headers:
                print(f" RequestId: {response.headers['X-Tt-Logid']}")
            if 'X-Request-Id' in response.headers:
                print(f" X-Request-Id: {response.headers['X-Request-Id']}")

            data = response.json()

            # Dump the response for diagnostics, minus the bulky base64 audio.
            print(f"\n📄 完整响应:")
            print(json.dumps(self._printable_response(data), indent=2, ensure_ascii=False))
            print()

            if data.get("code") == 3000:
                # Success: decode and store the audio.
                audio_data = base64.b64decode(data["data"])
                with open(output_file, "wb") as f:
                    f.write(audio_data)

                file_size = len(audio_data) / 1024  # KB
                print(f"✅ 语音合成成功!")
                print(f" 输出: {output_file} ({file_size:.1f} KB)")
                return output_file
            else:
                error_msg = data.get("message", "未知错误")
                reqid = data.get("reqid", "未知")
                raise Exception(f"TTS 失败\n 错误码: {data.get('code')}\n 错误信息: {error_msg}\n RequestId: {reqid}")

        except requests.exceptions.Timeout as e:
            raise Exception("请求超时,请检查网络连接") from e
        except Exception as e:
            raise Exception(f"TTS 调用失败: {str(e)}") from e
|
||||
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the voice converter.

    Sub-commands:
        tts TEXT [-o OUTPUT] [-v VOICE]

    Prints the help text when no sub-command is given. Any exception raised
    while running a sub-command is reported on stderr and the process exits
    with status 1.
    """
    import argparse

    cli = argparse.ArgumentParser(description="豆包语音转换工具")
    commands = cli.add_subparsers(dest="command", help="选择功能")

    # "tts" sub-command
    tts_cmd = commands.add_parser("tts", help="文字转语音")
    tts_cmd.add_argument("text", help="要转换的文字")
    tts_cmd.add_argument(
        "-o",
        "--output",
        default="output.mp3",
        help="输出音频文件(默认: output.mp3)",
    )
    tts_cmd.add_argument(
        "-v",
        "--voice",
        default="BV700_V2_streaming",
        help="音色类型(默认: BV700_V2_streaming 通用女声)",
    )

    options = cli.parse_args()

    if options.command:
        try:
            tool = DoubaoVoiceConverter()
            if options.command == "tts":
                tool.text_to_speech(options.text, options.output, options.voice)
        except Exception as err:
            print(f"❌ 错误: {err}", file=sys.stderr)
            sys.exit(1)
    else:
        cli.print_help()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user