# ai-proj-helper/skills-integration/doubao-voice-plugin/scripts/voice_converter.py

#!/usr/bin/env python3
"""
豆包语音转换工具
支持：文字转语音 (TTS)
"""

import os
import sys
import json
import base64
import requests
from pathlib import Path


class DoubaoVoiceConverter:
    """Client for the Doubao speech HTTP API; currently text-to-speech only.

    Configuration is read from environment variables at construction time:

    - ``DOUBAO_APP_ID`` and ``DOUBAO_ACCESS_TOKEN``: required credentials.
    - ``DOUBAO_USE_V3``: ``"true"`` (case-insensitive) selects the V3
      endpoint; any other value (or unset) selects the V1 endpoint.
    - ``DOUBAO_RESOURCE_ID``: only read in V3 mode; defaults to
      ``"volc.bigmodel.tts"``.
    """

    def __init__(self):
        """Load credentials and endpoint selection from the environment.

        Raises:
            ValueError: If ``DOUBAO_APP_ID`` or ``DOUBAO_ACCESS_TOKEN`` is
                unset or empty.
        """
        # Credentials come from the environment so they never live in code.
        self.app_id = os.environ.get("DOUBAO_APP_ID")
        self.access_token = os.environ.get("DOUBAO_ACCESS_TOKEN")

        if not self.app_id or not self.access_token:
            raise ValueError(
                "请先设置环境变量:\n"
                "export DOUBAO_APP_ID='your_app_id'\n"
                "export DOUBAO_ACCESS_TOKEN='your_access_token'"
            )

        # API version: V1 (default, basic voices) or V3 (Doubao 2.0, needs
        # extra configuration).
        self.use_v3 = os.environ.get("DOUBAO_USE_V3", "false").lower() == "true"

        # Define the attribute in both modes so reading it never raises
        # AttributeError; it is only sent to the server in V3 mode.
        self.resource_id = None

        if self.use_v3:
            self.tts_url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
            self.resource_id = os.environ.get("DOUBAO_RESOURCE_ID", "volc.bigmodel.tts")
        else:
            # V1 API: stable, supports the basic voices.
            self.tts_url = "https://openspeech.bytedance.com/api/v1/tts"

    @staticmethod
    def _loggable_response(data: dict) -> dict:
        """Return *data* with a non-empty string ``data`` field replaced by a placeholder.

        The input dict is not modified; a shallow copy is returned when a
        replacement is made.
        """
        audio = data.get("data")
        if not isinstance(audio, str) or not audio:
            return data
        redacted = dict(data)
        redacted["data"] = f"<base64 音频数据, 共 {len(audio)} 字符, 已省略>"
        return redacted

    def text_to_speech(
        self,
        text: str,
        output_file: str = "output.mp3",
        voice_type: str = "BV700_V2_streaming"
    ) -> str:
        """
        Convert text to speech (TTS) and write the audio to a file.

        Progress, response headers and the decoded JSON response are printed
        to stdout. The base64 audio field is replaced by a short placeholder
        in that printout so the terminal is not flooded.

        Args:
            text: Text to synthesize. Must contain at least one
                non-whitespace character.
            output_file: Path of the audio file to write. Missing parent
                directories are created.
            voice_type: Voice identifier, e.g.
                - BV700_V2_streaming: general female voice (recommended)
                - BV701_V2_streaming: general male voice
                - BV406_streaming: gentle female voice
                - BV158_streaming: lively female voice
                - BV115_streaming: magnetic male voice

        Returns:
            str: ``output_file``, unchanged, once the audio has been written.

        Raises:
            ValueError: If ``text`` is empty or whitespace-only.
            Exception: On request timeout, or on any other failure (network
                error, non-JSON response, API error code, file write error);
                the original error is attached as ``__cause__``.
        """
        # Fail fast: an empty request cannot produce audio, so skip the
        # remote call entirely.
        if not text or not text.strip():
            raise ValueError("要转换的文字不能为空")

        print("📝 文字转语音中...")
        print(f"   文字: {text[:50]}{'...' if len(text) > 50 else ''}")
        print(f"   音色: {voice_type}")

        headers = {
            # The "Bearer;<token>" form (semicolon, no space) is intentional.
            "Authorization": f"Bearer;{self.access_token}",
            "Content-Type": "application/json"
        }

        # The V3 API needs a resource id (only when V3 is enabled).
        # NOTE(review): apart from the URL, this header is the only V3-specific
        # part of the request — confirm the V3 endpoint accepts the same auth
        # header and payload shape as V1.
        if self.use_v3:
            headers["Resource-Id"] = self.resource_id

        payload = {
            "app": {
                "appid": self.app_id,
                "token": self.access_token,
                "cluster": "volcano_tts"
            },
            "user": {
                "uid": "user_001"
            },
            "audio": {
                "voice_type": voice_type,
                "encoding": "mp3",
                "speed_ratio": 1.0,
                "volume_ratio": 1.0,
                "pitch_ratio": 1.0
            },
            "request": {
                # Random per-request id.
                "reqid": f"tts_{os.urandom(8).hex()}",
                "text": text,
                "text_type": "plain",
                "operation": "query"
            }
        }

        try:
            response = requests.post(self.tts_url, headers=headers, json=payload, timeout=30)

            # Print response metadata that helps when reporting problems.
            print("\n📋 响应信息:")
            print(f"   HTTP状态码: {response.status_code}")
            if 'X-Tt-Logid' in response.headers:
                print(f"   RequestId: {response.headers['X-Tt-Logid']}")
            if 'X-Request-Id' in response.headers:
                print(f"   X-Request-Id: {response.headers['X-Request-Id']}")

            # JSON decode errors are ValueError subclasses; turn them into a
            # message that names the HTTP status instead of a parser offset.
            try:
                data = response.json()
            except ValueError as e:
                raise Exception(
                    f"响应不是有效的 JSON (HTTP {response.status_code})"
                ) from e
            if not isinstance(data, dict):
                raise Exception(
                    f"响应格式异常: 期望 JSON 对象, 实际为 {type(data).__name__}"
                )

            # Print the response, minus the (potentially huge) audio payload.
            print("\n📄 完整响应:")
            print(json.dumps(self._loggable_response(data), indent=2, ensure_ascii=False))
            print()

            if data.get("code") == 3000:
                # Success: decode the base64 audio and save it.
                audio_data = base64.b64decode(data["data"])
                # Make nested output paths work out of the box.
                Path(output_file).parent.mkdir(parents=True, exist_ok=True)
                with open(output_file, "wb") as f:
                    f.write(audio_data)

                file_size = len(audio_data) / 1024  # KB
                print("✅ 语音合成成功!")
                print(f"   输出: {output_file} ({file_size:.1f} KB)")
                return output_file
            else:
                error_msg = data.get("message", "未知错误")
                reqid = data.get("reqid", "未知")
                raise Exception(f"TTS 失败\n   错误码: {data.get('code')}\n   错误信息: {error_msg}\n   RequestId: {reqid}")

        except requests.exceptions.Timeout as e:
            raise Exception("请求超时，请检查网络连接") from e
        except Exception as e:
            # Single wrapping point so every failure carries the same prefix;
            # the original error stays reachable through __cause__.
            raise Exception(f"TTS 调用失败: {str(e)}") from e


def main():
    """Command-line entry point.

    Parses ``sys.argv``, and for the ``tts`` sub-command builds a
    ``DoubaoVoiceConverter`` and synthesizes the given text. With no
    sub-command the help text is printed and the function returns. Any
    exception from the converter is reported on stderr and the process
    exits with status 1.
    """
    import argparse

    cli = argparse.ArgumentParser(description="豆包语音转换工具")
    commands = cli.add_subparsers(dest="command", help="选择功能")

    # "tts" sub-command: text to speech.
    tts_cmd = commands.add_parser("tts", help="文字转语音")
    tts_cmd.add_argument("text", help="要转换的文字")
    tts_cmd.add_argument(
        "-o", "--output",
        default="output.mp3",
        help="输出音频文件（默认: output.mp3）",
    )
    tts_cmd.add_argument(
        "-v", "--voice",
        default="BV700_V2_streaming",
        help="音色类型（默认: BV700_V2_streaming 通用女声）",
    )

    options = cli.parse_args()

    if options.command:
        try:
            converter = DoubaoVoiceConverter()

            if options.command == "tts":
                converter.text_to_speech(
                    text=options.text,
                    output_file=options.output,
                    voice_type=options.voice,
                )

        except Exception as failure:
            # Top-level boundary: report the problem and signal failure.
            print(f"❌ 错误: {failure}", file=sys.stderr)
            sys.exit(1)
    else:
        # No sub-command given: show usage instead of failing.
        cli.print_help()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()