#!/bin/bash
|
|
#
|
|
# File: .claude/hooks/play-tts-elevenlabs.sh
|
|
#
|
|
# AgentVibes - Finally, your AI Agents can Talk Back! Text-to-Speech WITH personality for AI Assistants!
|
|
# Website: https://agentvibes.org
|
|
# Repository: https://github.com/paulpreibisch/AgentVibes
|
|
#
|
|
# Co-created by Paul Preibisch with Claude AI
|
|
# Copyright (c) 2025 Paul Preibisch
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
# DISCLAIMER: This software is provided "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
# express or implied. Use at your own risk. See the Apache License for details.
|
|
#
|
|
# ---
|
|
#
|
|
# @fileoverview ElevenLabs TTS Provider Implementation - Premium cloud-based TTS
|
|
# @context Provider-specific implementation for ElevenLabs API integration with multilingual support
|
|
# @architecture Part of multi-provider TTS system - implements provider interface contract
|
|
# @dependencies Requires ELEVENLABS_API_KEY, curl, ffmpeg, paplay/aplay/mpg123, jq
|
|
# @entrypoints Called by play-tts.sh router with ($1=text, $2=voice_name) when provider=elevenlabs
|
|
# @patterns Follows provider contract: accept text/voice, output audio file path, API error handling, SSH audio optimization
|
|
# @related play-tts.sh, provider-manager.sh, voices-config.sh, language-manager.sh, GitHub Issue #25
|
|
#
|
|
|
|
# --- Input and configuration loading ----------------------------------------
# Reads the positional arguments and the API key, optionally prefixes the
# text with a project-local "pretext", caps the text length, and sources the
# voice/language helper scripts that live next to this file.

# Fix locale warnings
export LC_ALL=C

TEXT="$1"
VOICE_OVERRIDE="$2"  # Optional: voice name or direct voice ID
API_KEY="${ELEVENLABS_API_KEY}"

# Check for project-local pretext configuration
CONFIG_DIR="${CLAUDE_PROJECT_DIR:-.}/.claude/config"
CONFIG_FILE="$CONFIG_DIR/agentvibes.json"

if [[ -f "$CONFIG_FILE" ]] && command -v jq &> /dev/null; then
  PRETEXT=$(jq -r '.pretext // empty' "$CONFIG_FILE" 2>/dev/null)
  # Only prefix real text. Prefixing an empty $1 would turn it into
  # "<pretext>: ", which is non-empty and would defeat the later
  # "missing text" usage check.
  if [[ -n "$PRETEXT" && -n "$TEXT" ]]; then
    TEXT="$PRETEXT: $TEXT"
  fi
fi

# Limit text length to prevent API issues (max 500 chars for safety).
# NOTE: LC_ALL=C is exported above, so bash measures and slices TEXT in
# bytes here; multi-byte (non-ASCII) text is cut at 497 bytes, not 497
# characters.
if (( ${#TEXT} > 500 )); then
  TEXT="${TEXT:0:497}..."
  echo "⚠️ Text truncated to 500 characters for API safety"
fi

# Source the single voice configuration file and the language helpers from
# the directory this script lives in.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/voices-config.sh"
source "$SCRIPT_DIR/language-manager.sh"
|
|
|
|
# @function determine_voice_and_language
# @intent Resolve voice name/ID and language for multilingual support
# @why Supports both voice names and direct IDs, plus language-specific voices
# @param $VOICE_OVERRIDE {string} Voice name or ID (optional)
# @returns Sets $VOICE_ID, $VOICE_NAME, $LANGUAGE_CODE and $CURRENT_LANGUAGE globals
# @sideeffects Prints the chosen voice; may run voice-manager.sh
# @edgecases Unknown override falls through to the language voice, then the
#            voice manager, then the "Aria" default
VOICE_ID=""
LANGUAGE_CODE="en"  # Default to English

# Get current language setting (helper comes from language-manager.sh)
CURRENT_LANGUAGE=$(get_language_code)

# Get language code for API
# ElevenLabs uses 2-letter ISO codes
case "$CURRENT_LANGUAGE" in
  spanish)    LANGUAGE_CODE="es" ;;
  french)     LANGUAGE_CODE="fr" ;;
  german)     LANGUAGE_CODE="de" ;;
  italian)    LANGUAGE_CODE="it" ;;
  portuguese) LANGUAGE_CODE="pt" ;;
  chinese)    LANGUAGE_CODE="zh" ;;
  japanese)   LANGUAGE_CODE="ja" ;;
  korean)     LANGUAGE_CODE="ko" ;;
  russian)    LANGUAGE_CODE="ru" ;;
  polish)     LANGUAGE_CODE="pl" ;;
  dutch)      LANGUAGE_CODE="nl" ;;
  turkish)    LANGUAGE_CODE="tr" ;;
  arabic)     LANGUAGE_CODE="ar" ;;
  hindi)      LANGUAGE_CODE="hi" ;;
  swedish)    LANGUAGE_CODE="sv" ;;
  danish)     LANGUAGE_CODE="da" ;;
  norwegian)  LANGUAGE_CODE="no" ;;
  finnish)    LANGUAGE_CODE="fi" ;;
  czech)      LANGUAGE_CODE="cs" ;;
  romanian)   LANGUAGE_CODE="ro" ;;
  ukrainian)  LANGUAGE_CODE="uk" ;;
  greek)      LANGUAGE_CODE="el" ;;
  bulgarian)  LANGUAGE_CODE="bg" ;;
  croatian)   LANGUAGE_CODE="hr" ;;
  slovak)     LANGUAGE_CODE="sk" ;;
  english|*)  LANGUAGE_CODE="en" ;;
esac

if [[ -n "$VOICE_OVERRIDE" ]]; then
  # Check if override is a voice name (lookup in mapping)
  if [[ -n "${VOICES[$VOICE_OVERRIDE]}" ]]; then
    VOICE_ID="${VOICES[$VOICE_OVERRIDE]}"
    VOICE_NAME="$VOICE_OVERRIDE"
    echo "🎤 Using voice: $VOICE_OVERRIDE (session-specific)"
  # Check if override looks like a voice ID (alphanumeric string ~20 chars)
  elif [[ "$VOICE_OVERRIDE" =~ ^[a-zA-Z0-9]{15,30}$ ]]; then
    VOICE_ID="$VOICE_OVERRIDE"
    VOICE_NAME="custom"
    echo "🎤 Using custom voice ID (session-specific)"
  else
    echo "⚠️ Unknown voice '$VOICE_OVERRIDE', trying language-specific voice"
  fi
fi

# If no override or invalid override, use language-specific voice
if [[ -z "$VOICE_ID" ]]; then
  # Try to get voice for current language
  LANG_VOICE=$(get_voice_for_language "$CURRENT_LANGUAGE" "elevenlabs" 2>/dev/null)

  if [[ -n "$LANG_VOICE" ]] && [[ -n "${VOICES[$LANG_VOICE]}" ]]; then
    VOICE_ID="${VOICES[$LANG_VOICE]}"
    VOICE_NAME="$LANG_VOICE"
    echo "🌍 Using $CURRENT_LANGUAGE voice: $LANG_VOICE"
  else
    # Fall back to voice manager
    VOICE_MANAGER_SCRIPT="$(dirname "$0")/voice-manager.sh"
    if [[ -f "$VOICE_MANAGER_SCRIPT" ]]; then
      VOICE_NAME=$("$VOICE_MANAGER_SCRIPT" get)
      # An empty name must not be used as an array subscript.
      if [[ -n "$VOICE_NAME" ]]; then
        VOICE_ID="${VOICES[$VOICE_NAME]}"
      fi
    fi

    # Final fallback to default
    if [[ -z "$VOICE_ID" ]]; then
      echo "⚠️ No voice configured, using default"
      # Record the name too, so the final "Voice used:" report is accurate.
      VOICE_NAME="Aria"
      VOICE_ID="${VOICES[Aria]}"
    fi
  fi
fi
|
|
|
|
# @function validate_inputs
# @intent Check required parameters and API key
# @why Stop with a clear message before any request is made if inputs are missing
# @param Uses globals: $TEXT, $API_KEY
# @exitcode 1=missing text, 2=missing API key
if [[ -z "$TEXT" ]]; then
  echo "Usage: $0 \"text to speak\" [voice_name_or_id]"
  exit 1
fi

if [[ -z "$API_KEY" ]]; then
  echo "Error: ELEVENLABS_API_KEY not set"
  echo "Set your API key: export ELEVENLABS_API_KEY=your_key_here"
  exit 2
fi
|
|
|
|
# @function determine_audio_directory
# @intent Find appropriate directory for audio file storage
# @why Supports project-local and global storage
# @returns Sets $AUDIO_DIR and $TEMP_FILE global variables
# @exitcode 3=audio directory could not be created
# @sideeffects Creates the audio directory if it does not exist
# @edgecases Walks up from $PWD looking for a .claude directory; falls back
#            to $HOME/.claude/audio when none is found
# AI NOTE: Check project dir first, then search up tree, finally fall back to global
if [[ -n "$CLAUDE_PROJECT_DIR" ]]; then
  AUDIO_DIR="$CLAUDE_PROJECT_DIR/.claude/audio"
else
  # Fallback: try to find .claude directory in current path.
  # The extra guards stop the walk if PWD is empty or relative, where
  # dirname would settle on "." and never reach "/".
  CURRENT_DIR="$PWD"
  while [[ -n "$CURRENT_DIR" && "$CURRENT_DIR" != "/" && "$CURRENT_DIR" != "." ]]; do
    if [[ -d "$CURRENT_DIR/.claude" ]]; then
      AUDIO_DIR="$CURRENT_DIR/.claude/audio"
      break
    fi
    CURRENT_DIR=$(dirname "$CURRENT_DIR")
  done

  # Final fallback to global if no project .claude found
  if [[ -z "$AUDIO_DIR" ]]; then
    AUDIO_DIR="$HOME/.claude/audio"
  fi
fi

# Without a writable directory the download has nowhere to go, so stop
# before the API is called.
if ! mkdir -p "$AUDIO_DIR"; then
  echo "❌ Failed to create audio directory: $AUDIO_DIR"
  exit 3
fi

# Include the PID: the timestamp alone has one-second resolution, so two
# invocations in the same second would write to the same file.
TEMP_FILE="$AUDIO_DIR/tts-$(date +%s)-$$.mp3"
|
|
|
|
# @function synthesize_with_elevenlabs
# @intent Call ElevenLabs API to generate speech
# @why Encapsulates API call with error handling
# @param Uses globals: $TEXT, $VOICE_ID, $API_KEY, $LANGUAGE_CODE
# @returns Creates audio file at $TEMP_FILE
# @exitcode 0=success, 3=API error
# @sideeffects Creates MP3 file in audio directory

# Choose model based on language: English uses the monolingual model,
# every other language code uses the multilingual one.
# NOTE(review): confirm both model IDs are still offered by the API.
case "$LANGUAGE_CODE" in
  en) MODEL_ID="eleven_monolingual_v1" ;;
  *)  MODEL_ID="eleven_multilingual_v2" ;;
esac
|
|
|
|
# @function get_speech_speed
# @intent Read speed config and map to ElevenLabs API range (0.7-1.2)
# @why ElevenLabs only supports 0.7 (slower) to 1.2 (faster), must map user scale
# @param Uses globals: $CLAUDE_PROJECT_DIR, $PWD, $HOME, $CURRENT_LANGUAGE
# @returns Prints the speed for the ElevenLabs API, clamped to 0.70-1.20 and
#          always formatted with a leading digit and two decimals so it is a
#          valid JSON number
# @edgecases Missing config -> 1.0 (English) or 0.5 (any other language);
#            empty or non-numeric config -> 1.0
get_speech_speed() {
  local config_dir=""
  local search_dir

  # Determine config directory: project dir, else nearest .claude above
  # $PWD, else the global one under $HOME.
  if [[ -n "$CLAUDE_PROJECT_DIR" && -d "$CLAUDE_PROJECT_DIR/.claude" ]]; then
    config_dir="$CLAUDE_PROJECT_DIR/.claude/config"
  else
    search_dir="$PWD"
    # Guards against an empty/relative PWD, where dirname would settle on
    # "." and never reach "/".
    while [[ -n "$search_dir" && "$search_dir" != "/" && "$search_dir" != "." ]]; do
      if [[ -d "$search_dir/.claude" ]]; then
        config_dir="$search_dir/.claude/config"
        break
      fi
      search_dir=$(dirname "$search_dir")
    done
    if [[ -z "$config_dir" ]]; then
      config_dir="$HOME/.claude/config"
    fi
  fi

  # Candidate files in priority order; the piper-* names are legacy paths
  # kept for backward compatibility.
  local -a speed_files
  local user_speed
  if [[ "$CURRENT_LANGUAGE" != "english" ]]; then
    speed_files=(
      "$config_dir/tts-target-speech-rate.txt"
      "$config_dir/piper-target-speech-rate.txt"
    )
    user_speed="0.5"  # Default slower for learning
  else
    speed_files=(
      "$config_dir/tts-speech-rate.txt"
      "$config_dir/piper-speech-rate.txt"
    )
    user_speed="1.0"
  fi

  local speed_file raw_speed
  local number_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
  for speed_file in "${speed_files[@]}"; do
    [[ -f "$speed_file" ]] || continue
    # Last line that is neither blank nor a comment.
    raw_speed=$(grep -Ev '^[[:space:]]*(#|$)' "$speed_file" 2>/dev/null | tail -1)
    raw_speed=${raw_speed//[[:space:]]/}  # drop spaces and a trailing CR
    if [[ "$raw_speed" =~ $number_re ]]; then
      user_speed="$raw_speed"
    else
      # File exists but holds nothing usable: treat as normal speed.
      user_speed="1.0"
    fi
    break  # the first existing file wins, as before
  done

  # Map user scale to the ElevenLabs range (piecewise linear, clamped):
  #   <=0.5 -> 0.70   0.75 -> 0.85   1.0 -> 1.00   2.0 -> 1.10   >=3.0 -> 1.20
  # awk does the arithmetic (no bc needed); the value is passed with -v
  # rather than spliced into the program text, and printf guarantees a
  # leading zero, which jq's --argjson requires.
  LC_ALL=C awk -v speed="$user_speed" 'BEGIN {
    speed += 0
    if (speed <= 0.5) {
      mapped = 0.7
    } else if (speed >= 3.0) {
      mapped = 1.2
    } else if (speed <= 1.0) {
      mapped = 0.7 + (speed - 0.5) * 0.6
    } else {
      mapped = 1.0 + (speed - 1.0) * 0.1
    }
    printf "%.2f\n", mapped
  }'
}
|
|
|
|
# Request stage: build the JSON body and download the synthesized audio
# into $TEMP_FILE. On any failure no file is left at $TEMP_FILE.
SPEECH_SPEED=$(get_speech_speed)

# --argjson needs a valid JSON number. Repair a missing leading zero
# (".85") and fall back to normal speed for anything else non-numeric.
case "$SPEECH_SPEED" in
  .*) SPEECH_SPEED="0$SPEECH_SPEED" ;;
esac
if [[ ! "$SPEECH_SPEED" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
  SPEECH_SPEED="1.0"
fi

# Build JSON payload with jq for proper escaping
PAYLOAD=$(jq -n \
  --arg text "$TEXT" \
  --arg model "$MODEL_ID" \
  --arg lang "$LANGUAGE_CODE" \
  --argjson speed "$SPEECH_SPEED" \
  '{
    text: $text,
    model_id: $model,
    language_code: $lang,
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75,
      speed: $speed
    }
  }')

if [[ -z "$PAYLOAD" ]]; then
  # jq missing or failed: do not send an empty request.
  echo "❌ Failed to build request payload (is jq installed?)" >&2
else
  # -w prints the HTTP status so an API error (401, 429, ...) can be told
  # apart from audio; otherwise the JSON error body would be saved as the
  # .mp3 and treated as a successful synthesis.
  HTTP_STATUS=$(curl -s -X POST "https://api.elevenlabs.io/v1/text-to-speech/${VOICE_ID}" \
    -H "xi-api-key: ${API_KEY}" \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD" \
    -o "${TEMP_FILE}" \
    -w '%{http_code}')
  CURL_STATUS=$?

  if [[ $CURL_STATUS -ne 0 || "$HTTP_STATUS" != 2* ]]; then
    echo "❌ ElevenLabs request failed (curl exit ${CURL_STATUS}, HTTP ${HTTP_STATUS:-000})" >&2
    rm -f "${TEMP_FILE}"
  fi
fi
|
|
|
|
# @function add_silence_padding
# @intent Add silence to beginning of audio to prevent WSL static
# @why WSL audio subsystem cuts off first ~200ms, causing static/clipping
# @param Uses global: $TEMP_FILE, $AUDIO_DIR
# @returns Updates $TEMP_FILE to padded version
# @sideeffects Modifies audio file, removes original
# @edgecases Keeps the unpadded file if ffmpeg is unavailable or fails
# @exitcode 3=no audio file was produced by the request stage
if [[ -s "${TEMP_FILE}" ]]; then
  # Check if ffmpeg is available for adding padding
  if command -v ffmpeg &> /dev/null; then
    PADDED_FILE="$AUDIO_DIR/tts-padded-$(date +%s)-$$.mp3"
    # Add 200ms of silence at the beginning to prevent static
    # Note: ElevenLabs returns mono audio, so we use mono silence
    # Only switch to the padded file when ffmpeg succeeded AND wrote data;
    # a failed run can leave a file that must not replace the original.
    if ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono:d=0.2 -i "${TEMP_FILE}" \
        -filter_complex "[0:a][1:a]concat=n=2:v=0:a=1[out]" \
        -map "[out]" -c:a libmp3lame -b:a 128k -y "${PADDED_FILE}" \
        < /dev/null 2>/dev/null \
        && [[ -s "${PADDED_FILE}" ]]; then
      # Use padded file and clean up original
      rm -f "${TEMP_FILE}"
      TEMP_FILE="${PADDED_FILE}"
    else
      # If padding failed, just use original file
      rm -f "${PADDED_FILE}"
    fi
  fi

  # @function play_audio
  # @intent Play generated audio file using available player with sequential playback
  # @why Support multiple audio players and prevent overlapping audio in learning mode
  # @param Uses global: $TEMP_FILE, $CURRENT_LANGUAGE
  # @sideeffects Plays audio with lock mechanism for sequential playback
  # @edgecases Falls through players until one works
  LOCK_FILE="/tmp/agentvibes-audio.lock"

  # Wait for previous audio to finish (max 30 seconds = 60 x 0.5s), then
  # carry on regardless so a stale lock cannot block forever.
  for _ in {1..60}; do
    if [[ ! -f "$LOCK_FILE" ]]; then
      break
    fi
    sleep 0.5
  done

  # Track last target language audio for replay command
  if [[ "$CURRENT_LANGUAGE" != "english" ]]; then
    TARGET_AUDIO_FILE="${CLAUDE_PROJECT_DIR:-.}/.claude/last-target-audio.txt"
    echo "${TEMP_FILE}" > "$TARGET_AUDIO_FILE"
  fi

  # Create lock and play audio
  touch "$LOCK_FILE"

  # Get audio duration for proper lock timing
  DURATION=$(ffprobe -v error -show_entries format=duration \
    -of default=noprint_wrappers=1:nokey=1 "${TEMP_FILE}" 2>/dev/null)
  DURATION=${DURATION%.*}  # Drop the fractional part
  if [[ "$DURATION" =~ ^[0-9]+$ ]]; then
    # Round up so the lock is not released before playback ends.
    DURATION=$(( 10#$DURATION + 1 ))
  else
    # Detection failed (ffprobe missing or non-numeric output).
    DURATION=1
  fi

  # Convert to 48kHz stereo WAV for better SSH tunnel compatibility
  # ElevenLabs returns 44.1kHz mono MP3, which causes static over SSH audio tunnels
  # Converting to 48kHz stereo (Windows/PulseAudio native format) eliminates the static
  if [[ -n "$SSH_CONNECTION" ]] || [[ -n "$SSH_CLIENT" ]] || [[ -n "$VSCODE_IPC_HOOK_CLI" ]]; then
    CONVERTED_FILE="${TEMP_FILE%.mp3}.wav"
    if ffmpeg -i "${TEMP_FILE}" -ar 48000 -ac 2 "${CONVERTED_FILE}" -y \
        < /dev/null 2>/dev/null; then
      TEMP_FILE="${CONVERTED_FILE}"
    fi
  fi

  # Play audio (WSL/Linux) in background to avoid blocking, fully detached (skip if in test mode)
  if [[ "${AGENTVIBES_TEST_MODE:-false}" != "true" ]]; then
    if [[ "${TEMP_FILE}" == *.wav ]]; then
      (paplay "${TEMP_FILE}" || aplay "${TEMP_FILE}" || mpg123 "${TEMP_FILE}") \
        < /dev/null > /dev/null 2>&1 &
    else
      # For MP3, try mpg123 before aplay: aplay plays formats it does not
      # recognise as raw data (noise) and still reports success.
      (paplay "${TEMP_FILE}" || mpg123 "${TEMP_FILE}" || aplay "${TEMP_FILE}") \
        < /dev/null > /dev/null 2>&1 &
    fi
    PLAYER_PID=$!
    disown "$PLAYER_PID" 2>/dev/null
  fi

  # Wait for audio to finish, then release lock. All streams are
  # redirected so a caller capturing this script's output is not kept
  # waiting on the sleeping subshell.
  (sleep "$DURATION"; rm -f "$LOCK_FILE") < /dev/null > /dev/null 2>&1 &
  disown

  # Keep temp files for later review - cleaned up weekly by cron
  echo "🎵 Saved to: ${TEMP_FILE}"
  echo "🎤 Voice used: ${VOICE_NAME} (${VOICE_ID})"
else
  echo "❌ Failed to generate audio - API may be unavailable"
  echo "Check your API key and network connection"
  exit 3
fi
|