discord-fishbowl/src/llm/client.py
matt 004f0325ec Fix comprehensive system issues and implement proper vector database backend selection
- Fix reflection memory spam despite zero active characters in scheduler.py
- Add character enable/disable functionality to admin interface
- Fix Docker configuration with proper network setup and service dependencies
- Resolve admin interface JavaScript errors and login issues
- Fix MCP import paths for updated package structure
- Add comprehensive character management with audit logging
- Implement proper character state management and persistence
- Fix database connectivity and initialization issues
- Add missing audit service for admin operations
- Complete Docker stack integration with all required services

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-06 19:54:49 -07:00


import asyncio
import hashlib
import httpx
import json
import random
import time
from typing import Dict, Any, Optional, List
from datetime import datetime, timedelta, timezone
from utils.config import get_settings
from utils.logging import log_llm_interaction, log_error_with_context, log_system_health
from admin.services.audit_service import AuditService
import logging
logger = logging.getLogger(__name__)
class LLMClient:
"""Async LLM client for interacting with local LLM APIs (Ollama, etc.)"""
def __init__(self):
self.settings = get_settings()
self.base_url = self.settings.llm.base_url
self.model = self.settings.llm.model
# Force 5-minute timeout for self-hosted large models
self.timeout = 300
self.max_tokens = self.settings.llm.max_tokens
self.temperature = self.settings.llm.temperature
# Rate limiting
self.request_times = []
self.max_requests_per_minute = 30
# Response caching
self.cache = {}
self.cache_ttl = 300 # 5 minutes
# Background task queue for long-running requests
self.pending_requests = {}
        self.max_timeout = 300       # cap for background (no-fallback) requests
        self.fallback_timeout = 300  # timeout used when a canned fallback response is acceptable
# Health monitoring
self.health_stats = {
'total_requests': 0,
'successful_requests': 0,
'failed_requests': 0,
'average_response_time': 0,
'last_health_check': datetime.now(timezone.utc)
}
async def generate_response(self, prompt: str, character_name: str = None,
max_tokens: int = None, temperature: float = None,
use_fallback: bool = True) -> Optional[str]:
"""Generate response using LLM"""
try:
# Rate limiting check
if not await self._check_rate_limit():
logger.warning(f"Rate limit exceeded for {character_name}")
return None
# Check cache first
cache_key = self._generate_cache_key(prompt, character_name, max_tokens, temperature)
cached_response = self._get_cached_response(cache_key)
if cached_response:
return cached_response
start_time = time.time()
            # Choose the timeout for this call: fallback_timeout when a canned fallback
            # is acceptable, otherwise the full timeout capped at max_timeout
            # (all three are currently 300 s)
effective_timeout = self.fallback_timeout if use_fallback else min(self.timeout, self.max_timeout)
# Try OpenAI-compatible API first (KoboldCPP, etc.)
async with httpx.AsyncClient(timeout=effective_timeout) as client:
try:
# OpenAI-compatible request
request_data = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"temperature": temperature or self.temperature,
"max_tokens": max_tokens or self.max_tokens,
"top_p": 0.9,
"stream": False
}
# Debug logging
logger.debug(f"LLM Request for {character_name}:")
logger.debug(f"Model: {self.model}")
logger.debug(f"Prompt (first 500 chars): {prompt[:500]}...")
logger.debug(f"Full prompt length: {len(prompt)} chars")
response = await client.post(
f"{self.base_url}/chat/completions",
json=request_data,
headers={"Content-Type": "application/json"}
)
response.raise_for_status()
result = response.json()
if 'choices' in result and result['choices'] and 'message' in result['choices'][0]:
generated_text = result['choices'][0]['message']['content'].strip()
logger.debug(f"LLM Response for {character_name}: {generated_text[:200]}...")
else:
generated_text = None
logger.debug(f"LLM Response for {character_name}: Invalid response format")
except (httpx.HTTPStatusError, httpx.RequestError, KeyError):
# Fallback to Ollama API
request_data = {
"model": self.model,
"prompt": prompt,
"options": {
"temperature": temperature or self.temperature,
"num_predict": max_tokens or self.max_tokens,
"top_p": 0.9,
"top_k": 40,
"repeat_penalty": 1.1
},
"stream": False
}
response = await client.post(
f"{self.base_url}/api/generate",
json=request_data,
headers={"Content-Type": "application/json"}
)
response.raise_for_status()
result = response.json()
if 'response' in result and result['response']:
generated_text = result['response'].strip()
else:
generated_text = None
if generated_text:
# Cache the response
self._cache_response(cache_key, generated_text)
# Update stats
duration = time.time() - start_time
self._update_stats(True, duration)
# Log interaction
log_llm_interaction(
character_name or "unknown",
len(prompt),
len(generated_text),
self.model,
duration
)
# AUDIT: Log performance metric
await AuditService.log_performance_metric(
metric_name="llm_response_time",
metric_value=duration,
metric_unit="seconds",
component="llm_client",
additional_data={
"model": self.model,
"character_name": character_name,
"prompt_length": len(prompt),
"response_length": len(generated_text)
}
)
return generated_text
else:
logger.error(f"No response from LLM: {result}")
self._update_stats(False, time.time() - start_time)
return None
except httpx.TimeoutException:
if use_fallback:
logger.warning(f"LLM request timeout for {character_name}, using fallback response")
# Queue for background processing if needed
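                # Note: with self.timeout and self.max_timeout both hard-coded to 300,
                # this condition never fires, so no background retry is actually queued.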
if self.timeout > self.max_timeout:
background_task = asyncio.create_task(self.generate_response(
prompt, character_name, max_tokens, temperature, use_fallback=False
))
request_id = f"{character_name}_{time.time()}"
self.pending_requests[request_id] = background_task
# Return a fallback response immediately
fallback_response = self._get_fallback_response(character_name)
self._update_stats(False, effective_timeout)
return fallback_response
else:
logger.error(f"LLM background request timeout for {character_name}")
self._update_stats(False, effective_timeout)
return None
except httpx.HTTPError as e:
logger.error(f"LLM HTTP error for {character_name}: {e}")
self._update_stats(False, time.time() - start_time)
return None
except Exception as e:
log_error_with_context(e, {
"character_name": character_name,
"prompt_length": len(prompt),
"model": self.model
})
self._update_stats(False, time.time() - start_time)
return None
async def generate_batch_responses(self, prompts: List[Dict[str, Any]]) -> List[Optional[str]]:
"""Generate multiple responses in batch"""
tasks = []
for prompt_data in prompts:
task = self.generate_response(
prompt=prompt_data['prompt'],
character_name=prompt_data.get('character_name'),
max_tokens=prompt_data.get('max_tokens'),
temperature=prompt_data.get('temperature')
)
tasks.append(task)
results = await asyncio.gather(*tasks, return_exceptions=True)
# Convert exceptions to None
return [result if not isinstance(result, Exception) else None for result in results]
async def check_model_availability(self) -> bool:
"""Check if the LLM model is available"""
try:
async with httpx.AsyncClient(timeout=10) as client:
# Try OpenAI-compatible API first (KoboldCPP, etc.)
try:
response = await client.get(f"{self.base_url}/models")
response.raise_for_status()
models = response.json()
available_models = [model.get('id', '') for model in models.get('data', [])]
except (httpx.HTTPStatusError, httpx.RequestError):
# Fallback to Ollama API
response = await client.get(f"{self.base_url}/api/tags")
response.raise_for_status()
models = response.json()
available_models = [model.get('name', '') for model in models.get('models', [])]
is_available = any(self.model in model_name for model_name in available_models)
log_system_health(
"llm_client",
"available" if is_available else "model_not_found",
{"model": self.model, "available_models": available_models}
)
return is_available
except Exception as e:
log_error_with_context(e, {"model": self.model})
log_system_health("llm_client", "unavailable", {"error": str(e)})
return False
async def get_model_info(self) -> Dict[str, Any]:
"""Get information about the current model"""
try:
async with httpx.AsyncClient(timeout=10) as client:
response = await client.post(
f"{self.base_url}/api/show",
json={"name": self.model}
)
response.raise_for_status()
return response.json()
except Exception as e:
log_error_with_context(e, {"model": self.model})
return {}
async def health_check(self) -> Dict[str, Any]:
"""Perform health check on LLM service"""
try:
start_time = time.time()
# Test with simple prompt
test_prompt = "Respond with 'OK' if you can understand this message."
response = await self.generate_response(test_prompt, "health_check")
duration = time.time() - start_time
health_status = {
'status': 'healthy' if response else 'unhealthy',
'response_time': duration,
'model': self.model,
'base_url': self.base_url,
'timestamp': datetime.now(timezone.utc).isoformat()
}
# Update health check time
self.health_stats['last_health_check'] = datetime.now(timezone.utc)
return health_status
except Exception as e:
log_error_with_context(e, {"component": "llm_health_check"})
return {
'status': 'error',
'error': str(e),
'model': self.model,
'base_url': self.base_url,
'timestamp': datetime.now(timezone.utc).isoformat()
}
def get_statistics(self) -> Dict[str, Any]:
"""Get client statistics"""
return {
'total_requests': self.health_stats['total_requests'],
'successful_requests': self.health_stats['successful_requests'],
'failed_requests': self.health_stats['failed_requests'],
'success_rate': (
self.health_stats['successful_requests'] / self.health_stats['total_requests']
if self.health_stats['total_requests'] > 0 else 0
),
'average_response_time': self.health_stats['average_response_time'],
'cache_size': len(self.cache),
'last_health_check': self.health_stats['last_health_check'].isoformat()
}
async def _check_rate_limit(self) -> bool:
"""Check if we're within rate limits"""
now = time.time()
# Remove old requests (older than 1 minute)
self.request_times = [t for t in self.request_times if now - t < 60]
# Check if we can make another request
if len(self.request_times) >= self.max_requests_per_minute:
return False
# Add current request time
self.request_times.append(now)
return True
def _generate_cache_key(self, prompt: str, character_name: str = None,
max_tokens: int = None, temperature: float = None) -> str:
"""Generate cache key for response"""
cache_data = {
'prompt': prompt,
'character_name': character_name,
'max_tokens': max_tokens or self.max_tokens,
'temperature': temperature or self.temperature,
'model': self.model
}
cache_string = json.dumps(cache_data, sort_keys=True)
return hashlib.md5(cache_string.encode()).hexdigest()
def _get_cached_response(self, cache_key: str) -> Optional[str]:
"""Get cached response if available and not expired"""
if cache_key in self.cache:
cached_data = self.cache[cache_key]
if time.time() - cached_data['timestamp'] < self.cache_ttl:
return cached_data['response']
else:
# Remove expired cache entry
del self.cache[cache_key]
return None
def _cache_response(self, cache_key: str, response: str):
"""Cache response"""
self.cache[cache_key] = {
'response': response,
'timestamp': time.time()
}
# Clean up old cache entries if cache is too large
if len(self.cache) > 100:
# Remove oldest entries
oldest_keys = sorted(
self.cache.keys(),
key=lambda k: self.cache[k]['timestamp']
)[:20]
for key in oldest_keys:
del self.cache[key]
def _update_stats(self, success: bool, duration: float):
"""Update health statistics"""
self.health_stats['total_requests'] += 1
if success:
self.health_stats['successful_requests'] += 1
else:
self.health_stats['failed_requests'] += 1
# Update average response time
total_requests = self.health_stats['total_requests']
current_avg = self.health_stats['average_response_time']
# Rolling average
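        #   new_avg = (old_avg * (n - 1) + duration) / n, where n = total_requests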
self.health_stats['average_response_time'] = (
(current_avg * (total_requests - 1) + duration) / total_requests
)
def _get_fallback_response(self, character_name: str = None) -> str:
"""Generate a character-aware fallback response when LLM is slow"""
if character_name:
# Character-specific fallbacks based on their personalities
character_fallbacks = {
"Alex": [
"*processing all the technical implications...*",
"Let me analyze this from a different angle.",
"That's fascinating - I need to think through the logic here.",
"*running diagnostics on my thoughts...*"
],
"Sage": [
"*contemplating the deeper meaning...*",
"The philosophical implications are worth considering carefully.",
"*reflecting on the nature of this question...*",
"This touches on something profound - give me a moment."
],
"Luna": [
"*feeling the creative energy flow...*",
"Oh, this sparks so many artistic ideas! Let me gather my thoughts.",
"*painting mental images of possibilities...*",
"The beauty of this thought needs careful expression."
],
"Echo": [
"*drifting between dimensions of thought...*",
"The echoes of meaning reverberate... patience.",
"*sensing the hidden patterns...*",
"Reality shifts... understanding emerges slowly."
]
}
if character_name in character_fallbacks:
                return random.choice(character_fallbacks[character_name])
# Generic fallbacks
fallback_responses = [
"*thinking deeply about this...*",
"*processing thoughts...*",
"*contemplating the discussion...*",
"*reflecting on what you've said...*",
"*considering different perspectives...*",
"Hmm, that's an interesting point to consider.",
"I need a moment to think about that.",
"That's worth reflecting on carefully.",
"*taking time to formulate thoughts...*"
]
        return random.choice(fallback_responses)
async def generate_response_with_fallback(self, prompt: str, character_name: str = None,
max_tokens: int = None, temperature: float = None) -> str:
"""Generate response with guaranteed fallback if LLM is slow"""
try:
# Try immediate response first
response = await self.generate_response(
prompt, character_name, max_tokens, temperature, use_fallback=True
)
if response:
return response
else:
# Return fallback if no response
return self._get_fallback_response(character_name)
except Exception as e:
log_error_with_context(e, {
"character_name": character_name,
"prompt_length": len(prompt)
})
return self._get_fallback_response(character_name)
async def cleanup_pending_requests(self):
"""Clean up completed background requests"""
completed_requests = []
for request_id, task in self.pending_requests.items():
if task.done():
completed_requests.append(request_id)
try:
result = await task
if result:
logger.info(f"Background LLM request {request_id} completed successfully")
except Exception as e:
logger.error(f"Background LLM request {request_id} failed: {e}")
# Remove completed requests
for request_id in completed_requests:
del self.pending_requests[request_id]
def get_pending_count(self) -> int:
"""Get number of pending background requests"""
return len(self.pending_requests)
class PromptManager:
"""Manages prompt templates and optimization"""
def __init__(self):
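        # Templates are plain str.format strings; build_prompt() raises ValueError
        # when a required placeholder is missing.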
self.templates = {
'character_response': """You are {character_name}, responding in a Discord chat.
{personality_context}
{conversation_context}
{memory_context}
{relationship_context}
Respond naturally as {character_name}. Keep it conversational and authentic to your personality.""",
'conversation_starter': """You are {character_name} in a Discord chat.
{personality_context}
Start a conversation about: {topic}
Be natural and engaging. Your response should invite others to participate.""",
'self_reflection': """You are {character_name}. Reflect on your recent experiences:
{personality_context}
{recent_experiences}
Consider:
- How these experiences have affected you
- Any changes in your perspective
- Your relationships with others
- Your personal growth
Share your thoughtful reflection."""
}
def build_prompt(self, template_name: str, **kwargs) -> str:
"""Build prompt from template"""
template = self.templates.get(template_name)
if not template:
raise ValueError(f"Template '{template_name}' not found")
try:
return template.format(**kwargs)
except KeyError as e:
raise ValueError(f"Missing required parameter for template '{template_name}': {e}")
def optimize_prompt(self, prompt: str, max_length: int = 2000) -> str:
"""Optimize prompt for better performance"""
# Truncate if too long
if len(prompt) > max_length:
# Try to cut at paragraph boundaries
paragraphs = prompt.split('\n\n')
optimized = ""
for paragraph in paragraphs:
if len(optimized + paragraph) <= max_length:
optimized += paragraph + '\n\n'
else:
break
if optimized:
return optimized.strip()
else:
# Fallback to simple truncation
return prompt[:max_length] + "..."
return prompt
def add_template(self, name: str, template: str):
"""Add custom prompt template"""
self.templates[name] = template
def get_template_names(self) -> List[str]:
"""Get list of available template names"""
return list(self.templates.keys())
# Global instances
llm_client = LLMClient()
prompt_manager = PromptManager()
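# Example usage (illustrative sketch, not executed at import time; the _demo
# coroutine and its argument values are placeholders). Assumes an OpenAI-compatible
# or Ollama server is reachable at settings.llm.base_url with the configured model loaded:
#
#     async def _demo():
#         if await llm_client.check_model_availability():
#             prompt = prompt_manager.build_prompt(
#                 'conversation_starter',
#                 character_name="Alex",
#                 personality_context="Analytical, curious, loves debugging.",
#                 topic="running large language models locally",
#             )
#             reply = await llm_client.generate_response_with_fallback(prompt, "Alex")
#             print(reply)
#
#     asyncio.run(_demo())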