#!/bin/bash
#
# Listing metadata: BMAD-METHOD/.claude/hooks/play-tts-elevenlabs.sh
# (405 lines, 14 KiB, Bash, executable file)
#
# File: .claude/hooks/play-tts-elevenlabs.sh
#
# AgentVibes - Finally, your AI Agents can Talk Back! Text-to-Speech WITH personality for AI Assistants!
# Website: https://agentvibes.org
# Repository: https://github.com/paulpreibisch/AgentVibes
#
# Co-created by Paul Preibisch with Claude AI
# Copyright (c) 2025 Paul Preibisch
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# DISCLAIMER: This software is provided "AS IS", WITHOUT WARRANTY OF ANY KIND,
# express or implied. Use at your own risk. See the Apache License for details.
#
# ---
#
# @fileoverview ElevenLabs TTS Provider Implementation - Premium cloud-based TTS
# @context Provider-specific implementation for ElevenLabs API integration with multilingual support
# @architecture Part of multi-provider TTS system - implements provider interface contract
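# @dependencies Requires ELEVENLABS_API_KEY, curl, jq and an audio player (paplay/aplay/mpg123); ffmpeg and ffprobe are optional (silence padding, SSH conversion, duration detection)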
# @entrypoints Called by play-tts.sh router with ($1=text, $2=voice_name) when provider=elevenlabs
# @patterns Follows provider contract: accept text/voice, output audio file path, API error handling, SSH audio optimization
# @related play-tts.sh, provider-manager.sh, voices-config.sh, language-manager.sh, GitHub Issue #25
#
# Script setup: locale, arguments, optional pretext, length limit, shared config.
#
# Force the C locale so that child tools do not emit locale warnings.
# NOTE(review): with LC_ALL=C bash measures and slices strings in bytes, so the
# 500-"character" limit below is effectively a byte limit for non-ASCII text
# and may cut through a multi-byte character - confirm whether that matters.
export LC_ALL=C

TEXT="$1"
VOICE_OVERRIDE="$2" # Optional: voice name or direct voice ID
API_KEY="${ELEVENLABS_API_KEY}"

# Check for project-local pretext configuration
CONFIG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/config"
CONFIG_FILE="$CONFIG_DIR/agentvibes.json"

# Only prefix real text. Prepending the pretext to an empty message would turn
# it into "<pretext>: ", which is non-empty and would therefore slip past the
# "missing text" usage check performed later in this script.
if [[ -n "$TEXT" && -f "$CONFIG_FILE" ]] && command -v jq &> /dev/null; then
  PRETEXT=$(jq -r '.pretext // empty' "$CONFIG_FILE" 2>/dev/null)
  if [[ -n "$PRETEXT" ]]; then
    TEXT="$PRETEXT: $TEXT"
  fi
fi

# Limit text length to prevent API issues (max 500 chars for safety):
# keep the first 497 and append "..." so the result is exactly 500 long.
if [[ ${#TEXT} -gt 500 ]]; then
  TEXT="${TEXT:0:497}..."
  echo "⚠️ Text truncated to 500 characters for API safety"
fi

# Source the shared voice table and the language helpers that live next to
# this script (resolved through BASH_SOURCE so the caller's cwd is irrelevant).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/voices-config.sh"
source "$SCRIPT_DIR/language-manager.sh"
# @function determine_voice_and_language
# @intent Resolve voice name/ID and language for multilingual support
# @why Supports both voice names and direct IDs, plus language-specific voices
# @param $VOICE_OVERRIDE {string} Voice name or ID (optional)
# @returns Sets $VOICE_ID, $VOICE_NAME, $LANGUAGE_CODE and $CURRENT_LANGUAGE global variables
# @sideeffects Prints a status line to stdout describing which voice was picked
# @edgecases Unknown override falls through to the language voice, then the
#            voice manager, then the "Aria" entry of the VOICES table
VOICE_ID=""
# Always record a name for the chosen voice: the summary printed at the end of
# the script shows it, and previously it was only set on the voice-manager path.
VOICE_NAME=""
LANGUAGE_CODE="en" # Default to English

# Get current language setting
CURRENT_LANGUAGE=$(get_language_code)

# Get language code for API
# ElevenLabs uses 2-letter ISO codes
case "$CURRENT_LANGUAGE" in
  spanish) LANGUAGE_CODE="es" ;;
  french) LANGUAGE_CODE="fr" ;;
  german) LANGUAGE_CODE="de" ;;
  italian) LANGUAGE_CODE="it" ;;
  portuguese) LANGUAGE_CODE="pt" ;;
  chinese) LANGUAGE_CODE="zh" ;;
  japanese) LANGUAGE_CODE="ja" ;;
  korean) LANGUAGE_CODE="ko" ;;
  russian) LANGUAGE_CODE="ru" ;;
  polish) LANGUAGE_CODE="pl" ;;
  dutch) LANGUAGE_CODE="nl" ;;
  turkish) LANGUAGE_CODE="tr" ;;
  arabic) LANGUAGE_CODE="ar" ;;
  hindi) LANGUAGE_CODE="hi" ;;
  swedish) LANGUAGE_CODE="sv" ;;
  danish) LANGUAGE_CODE="da" ;;
  norwegian) LANGUAGE_CODE="no" ;;
  finnish) LANGUAGE_CODE="fi" ;;
  czech) LANGUAGE_CODE="cs" ;;
  romanian) LANGUAGE_CODE="ro" ;;
  ukrainian) LANGUAGE_CODE="uk" ;;
  greek) LANGUAGE_CODE="el" ;;
  bulgarian) LANGUAGE_CODE="bg" ;;
  croatian) LANGUAGE_CODE="hr" ;;
  slovak) LANGUAGE_CODE="sk" ;;
  english | *) LANGUAGE_CODE="en" ;;
esac

if [[ -n "$VOICE_OVERRIDE" ]]; then
  # Check if override is a voice name (lookup in mapping)
  if [[ -n "${VOICES[$VOICE_OVERRIDE]}" ]]; then
    VOICE_ID="${VOICES[$VOICE_OVERRIDE]}"
    VOICE_NAME="$VOICE_OVERRIDE"
    echo "🎤 Using voice: $VOICE_OVERRIDE (session-specific)"
  # Check if override looks like a voice ID (alphanumeric string ~20 chars)
  elif [[ "$VOICE_OVERRIDE" =~ ^[a-zA-Z0-9]{15,30}$ ]]; then
    VOICE_ID="$VOICE_OVERRIDE"
    VOICE_NAME="custom"
    echo "🎤 Using custom voice ID (session-specific)"
  else
    echo "⚠️ Unknown voice '$VOICE_OVERRIDE', trying language-specific voice"
  fi
fi

# If no override or invalid override, use language-specific voice
if [[ -z "$VOICE_ID" ]]; then
  # Try to get voice for current language
  LANG_VOICE=$(get_voice_for_language "$CURRENT_LANGUAGE" "elevenlabs" 2>/dev/null)
  if [[ -n "$LANG_VOICE" ]] && [[ -n "${VOICES[$LANG_VOICE]}" ]]; then
    VOICE_ID="${VOICES[$LANG_VOICE]}"
    VOICE_NAME="$LANG_VOICE"
    echo "🌍 Using $CURRENT_LANGUAGE voice: $LANG_VOICE"
  else
    # Fall back to voice manager
    VOICE_MANAGER_SCRIPT="$(dirname "$0")/voice-manager.sh"
    if [[ -f "$VOICE_MANAGER_SCRIPT" ]]; then
      VOICE_NAME=$("$VOICE_MANAGER_SCRIPT" get)
      # An empty key is not a valid associative-array subscript, so only look
      # the name up when the voice manager actually returned one.
      if [[ -n "$VOICE_NAME" ]]; then
        VOICE_ID="${VOICES[$VOICE_NAME]}"
      fi
    fi
    # Final fallback to default
    if [[ -z "$VOICE_ID" ]]; then
      echo "⚠️ No voice configured, using default"
      # Overwrite any name the voice manager reported: it did not resolve to
      # an ID, so the voice actually used is the default one.
      VOICE_NAME="Aria"
      VOICE_ID="${VOICES[Aria]}"
    fi
  fi
fi
# @function validate_inputs
# @intent Stop early when there is nothing to speak or no credentials to speak with
# @why A clear message up front beats an opaque API failure later
# @param Uses globals: $TEXT, $API_KEY
# @exitcode 1=missing text, 2=missing API key

# Guard 1: a message to synthesize is mandatory.
[[ -n "$TEXT" ]] || {
  printf '%s\n' "Usage: $0 \"text to speak\" [voice_name_or_id]"
  exit 1
}

# Guard 2: the ElevenLabs API key must have been provided.
[[ -n "$API_KEY" ]] || {
  printf '%s\n' "Error: ELEVENLABS_API_KEY not set"
  printf '%s\n' "Set your API key: export ELEVENLABS_API_KEY=your_key_here"
  exit 2
}
# @function determine_audio_directory
# @intent Find appropriate directory for audio file storage
# @why Supports project-local and global storage
# @param Uses globals: $CLAUDE_PROJECT_DIR, $PWD, $HOME
# @returns Sets $AUDIO_DIR and $TEMP_FILE global variables
# @sideeffects Creates $AUDIO_DIR if it does not exist
# @edgecases Warns (but continues) when the directory cannot be created
# AI NOTE: Check project dir first, then search up tree, finally fall back to global

# Start from a known-empty value so a stray AUDIO_DIR inherited from the
# environment cannot be mistaken for a directory discovered below.
AUDIO_DIR=""
if [[ -n "$CLAUDE_PROJECT_DIR" ]]; then
  AUDIO_DIR="$CLAUDE_PROJECT_DIR/.claude/audio"
else
  # Fallback: try to find .claude directory in current path.
  # The pattern only matches absolute paths other than "/", so the walk stops
  # at the root and cannot spin forever on a relative or empty start value
  # (dirname "." is "." again).
  CURRENT_DIR="$PWD"
  while [[ "$CURRENT_DIR" == /?* ]]; do
    if [[ -d "$CURRENT_DIR/.claude" ]]; then
      AUDIO_DIR="$CURRENT_DIR/.claude/audio"
      break
    fi
    CURRENT_DIR=$(dirname "$CURRENT_DIR")
  done
  # Final fallback to global if no project .claude found
  if [[ -z "$AUDIO_DIR" ]]; then
    AUDIO_DIR="$HOME/.claude/audio"
  fi
fi

if ! mkdir -p "$AUDIO_DIR"; then
  # Best effort: keep going, but leave a hint for the failure that will follow.
  echo "⚠️ Could not create audio directory: $AUDIO_DIR"
fi

# Epoch seconds alone collide when two invocations start within the same
# second (the later download would overwrite the earlier file), so the
# process ID is appended. The "tts-" prefix and ".mp3" suffix are unchanged.
TEMP_FILE="$AUDIO_DIR/tts-$(date +%s)-$$.mp3"
# @function synthesize_with_elevenlabs
# @intent Produce speech for $TEXT through the ElevenLabs HTTP API
# @why Keeps the request set-up (model, speed, payload) in one place
# @param Uses globals: $TEXT, $VOICE_ID, $API_KEY, $LANGUAGE_CODE
# @returns Audio is written to $TEMP_FILE
# @exitcode 0=success, 3=API error
# @sideeffects Creates MP3 file in audio directory

# Model selection: English text goes to the monolingual model, every other
# language code goes to the multilingual one.
case "$LANGUAGE_CODE" in
  en) MODEL_ID="eleven_monolingual_v1" ;;
  *) MODEL_ID="eleven_multilingual_v2" ;;
esac
# @function get_speech_speed
# @intent Read speed config and map to ElevenLabs API range (0.7-1.2)
# @why ElevenLabs only supports 0.7 (slower) to 1.2 (faster), must map user scale
# @param Uses globals: $CLAUDE_PROJECT_DIR, $PWD, $HOME, $CURRENT_LANGUAGE
# @returns Prints the speed for the ElevenLabs API on stdout, clamped to
#          0.7-1.2 and always formatted with a leading digit and two decimals
#          (e.g. 0.85) so it is a valid JSON number
# @edgecases Missing, empty, commented-out or non-numeric config -> 1.0;
#            non-English language without any target config -> 0.5 (slowest)
get_speech_speed() {
  local config_dir=""

  # Determine config directory
  if [[ -n "$CLAUDE_PROJECT_DIR" && -d "$CLAUDE_PROJECT_DIR/.claude" ]]; then
    config_dir="$CLAUDE_PROJECT_DIR/.claude/config"
  else
    # Try to find .claude in current path. Only absolute paths other than "/"
    # match, so the walk ends at the root and cannot loop on a relative path.
    local current_dir="$PWD"
    while [[ "$current_dir" == /?* ]]; do
      if [[ -d "$current_dir/.claude" ]]; then
        config_dir="$current_dir/.claude/config"
        break
      fi
      current_dir=$(dirname "$current_dir")
    done
    # Fallback to global
    if [[ -z "$config_dir" ]]; then
      config_dir="$HOME/.claude/config"
    fi
  fi

  local main_speed_file="$config_dir/tts-speech-rate.txt"
  local target_speed_file="$config_dir/tts-target-speech-rate.txt"
  # Legacy file paths for backward compatibility
  local legacy_main_speed_file="$config_dir/piper-speech-rate.txt"
  local legacy_target_speed_file="$config_dir/piper-target-speech-rate.txt"

  local user_speed="1.0"
  local speed_file=""

  # Pick the file to read: the target-language files for non-English speech,
  # the main files otherwise; the current name wins over the legacy one.
  if [[ "$CURRENT_LANGUAGE" != "english" ]]; then
    if [[ -f "$target_speed_file" ]]; then
      speed_file="$target_speed_file"
    elif [[ -f "$legacy_target_speed_file" ]]; then
      speed_file="$legacy_target_speed_file"
    else
      user_speed="0.5" # Default slower for learning
    fi
  elif [[ -f "$main_speed_file" ]]; then
    speed_file="$main_speed_file"
  elif [[ -f "$legacy_main_speed_file" ]]; then
    speed_file="$legacy_main_speed_file"
  fi

  if [[ -n "$speed_file" ]]; then
    # Same parsing for every file: the last line that is neither blank nor a
    # "#" comment, with surrounding whitespace and a trailing CR removed.
    user_speed=$(LC_ALL=C awk '
      { sub(/\r$/, ""); gsub(/^[ \t]+|[ \t]+$/, "") }
      $0 == "" || /^#/ { next }
      { last = $0 }
      END { print last }
    ' "$speed_file" 2>/dev/null)
  fi

  # Anything that is not a plain non-negative decimal (empty file, stray
  # text, unreadable file) falls back to normal speed instead of reaching the
  # arithmetic below.
  local numeric_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
  if [[ ! "$user_speed" =~ $numeric_re ]]; then
    user_speed="1.0"
  fi

  # Map user scale (0.5=slower, 1.0=normal, 3.0=very fast) to the ElevenLabs
  # range with a piecewise-linear curve:
  #   <=0.5 -> 0.70 (slowest ElevenLabs)
  #   1.0   -> 1.00 (normal)
  #   2.0   -> 1.10
  #   >=3.0 -> 1.20 (fastest ElevenLabs)
  # awk does the floating-point work so the result does not depend on bc
  # being installed; the value is passed with -v rather than spliced into the
  # program text, and printf guarantees a leading zero.
  LC_ALL=C awk -v s="$user_speed" 'BEGIN {
    s += 0
    if (s <= 0.5)      e = 0.7
    else if (s >= 3.0) e = 1.2
    else if (s <= 1.0) e = 0.7 + (s - 0.5) * 0.6
    else               e = 1.0 + (s - 1.0) * 0.1
    printf "%.2f\n", e
  }' 2>/dev/null || echo "1.0"
}
# Build the request and call the ElevenLabs text-to-speech endpoint.
# Uses globals: $TEXT, $MODEL_ID, $LANGUAGE_CODE, $VOICE_ID, $API_KEY, $TEMP_FILE
# Result: on success the audio is stored at $TEMP_FILE; on any failure no file
# is left at $TEMP_FILE, so the check that follows reports the error.
SPEECH_SPEED=$(get_speech_speed)

# --argjson needs a valid JSON number: add the leading zero that calculators
# such as bc omit (".85"), and fall back to normal speed for anything else
# that is not a plain decimal.
SPEED_BARE_FRACTION_RE='^\.[0-9]+$'
SPEED_DECIMAL_RE='^[0-9]+(\.[0-9]+)?$'
if [[ "$SPEECH_SPEED" =~ $SPEED_BARE_FRACTION_RE ]]; then
  SPEECH_SPEED="0$SPEECH_SPEED"
elif [[ ! "$SPEECH_SPEED" =~ $SPEED_DECIMAL_RE ]]; then
  SPEECH_SPEED="1.0"
fi

# Build JSON payload with jq for proper escaping
PAYLOAD=$(jq -n \
  --arg text "$TEXT" \
  --arg model "$MODEL_ID" \
  --arg lang "$LANGUAGE_CODE" \
  --argjson speed "$SPEECH_SPEED" \
  '{
    text: $text,
    model_id: $model,
    language_code: $lang,
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75,
      speed: $speed
    }
  }')

if [[ -z "$PAYLOAD" ]]; then
  # jq is missing or rejected its input; posting an empty body is pointless.
  echo "⚠️ Could not build the request payload (is jq installed?)"
else
  # NOTE(review): the API key is passed on the command line and is therefore
  # visible in the process list while curl runs.
  # The timeouts keep a stalled connection from hanging the hook forever.
  # -w prints the HTTP status so that an error response (whose JSON body curl
  # still writes to the output file) is not mistaken for audio.
  HTTP_STATUS=$(curl -s -X POST "https://api.elevenlabs.io/v1/text-to-speech/${VOICE_ID}" \
    --connect-timeout 10 --max-time 60 \
    -H "xi-api-key: ${API_KEY}" \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD" \
    -o "${TEMP_FILE}" \
    -w '%{http_code}')

  if [[ "$HTTP_STATUS" != 2?? ]]; then
    # Surface the API's own explanation when the body carries one.
    API_ERROR=""
    if [[ -s "${TEMP_FILE}" ]]; then
      API_ERROR=$(jq -r '(.detail | if type == "object" then .message else . end) // empty' \
        "${TEMP_FILE}" 2>/dev/null)
    fi
    echo "⚠️ ElevenLabs request failed (HTTP ${HTTP_STATUS:-none})${API_ERROR:+: $API_ERROR}"
    # Drop the error body so it is never played or kept as an .mp3.
    rm -f -- "${TEMP_FILE}"
  fi
fi
# @function add_silence_padding
# @intent Add silence to beginning of audio to prevent WSL static
# @why WSL audio subsystem cuts off first ~200ms, causing static/clipping
# @param Uses globals: $TEMP_FILE, $AUDIO_DIR
# @returns Updates $TEMP_FILE to padded version
# @sideeffects Modifies audio file, removes original
# @edgecases Gracefully falls back to unpadded if ffmpeg unavailable or failing
# @exitcode 3=no usable audio was produced

# A usable download exists, is non-empty and is not a JSON document: without
# an HTTP status check the API's error body ends up in the .mp3 file, and it
# always starts with "{" whereas MP3 data starts with "ID3" or a 0xFF frame
# sync byte.
if [[ ! -s "${TEMP_FILE}" ]] ||
  [[ "$(head -c 1 "${TEMP_FILE}" 2>/dev/null | tr -d '\0')" == "{" ]]; then
  echo "❌ Failed to generate audio - API may be unavailable"
  echo "Check your API key and network connection"
  exit 3
fi

# Add silence padding to prevent WSL audio static
if command -v ffmpeg &> /dev/null; then
  # PID suffix: two invocations in the same second must not share a file.
  PADDED_FILE="$AUDIO_DIR/tts-padded-$(date +%s)-$$.mp3"
  # Add 200ms of silence at the beginning to prevent static
  # Note: ElevenLabs returns mono audio, so we use mono silence
  if ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono:d=0.2 -i "${TEMP_FILE}" \
    -filter_complex "[0:a][1:a]concat=n=2:v=0:a=1[out]" \
    -map "[out]" -c:a libmp3lame -b:a 128k -y "${PADDED_FILE}" 2>/dev/null &&
    [[ -s "${PADDED_FILE}" ]]; then
    # Use padded file and clean up original
    rm -f "${TEMP_FILE}"
    TEMP_FILE="${PADDED_FILE}"
  else
    # Padding failed: discard any partial output and keep the original file.
    rm -f "${PADDED_FILE}"
  fi
fi

# @function play_audio
# @intent Play generated audio file using available player with sequential playback
# @why Support multiple audio players and prevent overlapping audio in learning mode
# @param Uses global: $TEMP_FILE, $CURRENT_LANGUAGE
# @sideeffects Plays audio with lock mechanism for sequential playback
# @edgecases Falls through players until one works
LOCK_FILE="/tmp/agentvibes-audio.lock"

# Wait for previous audio to finish (max 30 seconds = 60 polls of 0.5s)
for _ in {1..60}; do
  if [[ ! -f "$LOCK_FILE" ]]; then
    break
  fi
  sleep 0.5
done

# Track last target language audio for replay command
if [[ "$CURRENT_LANGUAGE" != "english" ]]; then
  TARGET_AUDIO_FILE="${CLAUDE_PROJECT_DIR:-.}/.claude/last-target-audio.txt"
  echo "${TEMP_FILE}" > "$TARGET_AUDIO_FILE"
fi

# Create lock and play audio
touch "$LOCK_FILE"

# Get audio duration for proper lock timing
DURATION=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${TEMP_FILE}" 2>/dev/null)
DURATION=${DURATION%.*} # Whole seconds
if [[ "$DURATION" =~ ^[0-9]+$ ]]; then
  # Round up: dropping the fraction would release the lock while the last
  # part of the clip is still playing.
  DURATION=$((10#$DURATION + 1))
else
  # ffprobe missing, or it printed something non-numeric such as "N/A".
  DURATION=1
fi

# Convert to 48kHz stereo WAV for better SSH tunnel compatibility
# ElevenLabs returns 44.1kHz mono MP3, which causes static over SSH audio tunnels
# Converting to 48kHz stereo (Windows/PulseAudio native format) eliminates the static
if [[ -n "$SSH_CONNECTION" ]] || [[ -n "$SSH_CLIENT" ]] || [[ -n "$VSCODE_IPC_HOOK_CLI" ]]; then
  CONVERTED_FILE="${TEMP_FILE%.mp3}.wav"
  if ffmpeg -i "${TEMP_FILE}" -ar 48000 -ac 2 "${CONVERTED_FILE}" -y 2>/dev/null; then
    TEMP_FILE="${CONVERTED_FILE}"
  fi
fi

# Play audio (WSL/Linux) in background to avoid blocking, fully detached (skip if in test mode)
if [[ "${AGENTVIBES_TEST_MODE:-false}" != "true" ]]; then
  if [[ "${TEMP_FILE}" == *.wav ]]; then
    (paplay "${TEMP_FILE}" || aplay "${TEMP_FILE}" || mpg123 "${TEMP_FILE}") > /dev/null 2>&1 &
  else
    # aplay does not decode MP3 (it would play the bytes as raw samples), so
    # for MP3 files mpg123 is tried before it; aplay stays as the last resort.
    (paplay "${TEMP_FILE}" || mpg123 "${TEMP_FILE}" || aplay "${TEMP_FILE}") > /dev/null 2>&1 &
  fi
  PLAYER_PID=$!
fi

# Wait for audio to finish, then release lock
(
  sleep "$DURATION"
  rm -f "$LOCK_FILE"
) &
disown

# Keep temp files for later review - cleaned up weekly by cron
echo "🎵 Saved to: ${TEMP_FILE}"
echo "🎤 Voice used: ${VOICE_NAME} (${VOICE_ID})"