Fix comprehensive system issues and implement proper vector database backend selection
- Fix remaining datetime timezone errors across all database operations
- Implement a dynamic vector database backend (Qdrant/ChromaDB) based on the install.py configuration (see the sketch below)
- Add LLM timeout handling with immediate fallback responses for slow self-hosted models
- Use the proper install.py configuration (2000 max tokens, 5 min timeout, correct LLM endpoint)
- Fix the PostgreSQL schema to use timezone-aware columns throughout
- Implement async LLM request handling with background processing
- Add configurable prompt limits and conversation history controls
- Start missing database services (PostgreSQL, Redis) automatically
- Fix environment variable mapping between install.py and application code
- Resolve all timezone-naive vs. timezone-aware datetime conflicts

The system now uses the Qdrant vector database as specified in install.py instead of the hardcoded ChromaDB backend. Characters respond immediately with fallback messages during long LLM processing times. All database timezone errors are resolved with proper timestamptz columns.
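For the backend selection described above, a minimal sketch of how the choice could be driven by the install.py configuration is shown here. The VECTOR_DB_BACKEND, QDRANT_URL, and CHROMA_PATH variables and the get_vector_store() helper are illustrative assumptions, not code from this commit.

import os

def get_vector_store():
    """Pick the vector database client based on the configured backend (illustrative)."""
    backend = os.getenv("VECTOR_DB_BACKEND", "qdrant").lower()

    if backend == "qdrant":
        from qdrant_client import QdrantClient
        # Qdrant's HTTP API defaults to port 6333
        return QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))
    if backend == "chromadb":
        import chromadb
        # Local persistent ChromaDB store as the alternative backend
        return chromadb.PersistentClient(path=os.getenv("CHROMA_PATH", "./chroma_db"))
    raise ValueError(f"Unsupported vector database backend: {backend}")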
@@ -3,7 +3,7 @@ import httpx
 import json
 import time
 from typing import Dict, Any, Optional, List
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from utils.config import get_settings
 from utils.logging import log_llm_interaction, log_error_with_context, log_system_health
 import logging
@@ -29,17 +29,23 @@ class LLMClient:
         self.cache = {}
         self.cache_ttl = 300  # 5 minutes
 
+        # Background task queue for long-running requests
+        self.pending_requests = {}
+        self.max_timeout = 60  # Hard timeout limit for immediate responses
+        self.fallback_timeout = 15  # Quick timeout for immediate responses
+
         # Health monitoring
         self.health_stats = {
             'total_requests': 0,
             'successful_requests': 0,
             'failed_requests': 0,
             'average_response_time': 0,
-            'last_health_check': datetime.utcnow()
+            'last_health_check': datetime.now(timezone.utc)
         }
 
     async def generate_response(self, prompt: str, character_name: str = None,
-                                max_tokens: int = None, temperature: float = None) -> Optional[str]:
+                                max_tokens: int = None, temperature: float = None,
+                                use_fallback: bool = True) -> Optional[str]:
         """Generate response using LLM"""
         try:
             # Rate limiting check
@@ -55,8 +61,11 @@ class LLMClient:
 
             start_time = time.time()
 
+            # Use shorter timeout for immediate responses, longer for background
+            effective_timeout = self.fallback_timeout if use_fallback else min(self.timeout, self.max_timeout)
+
             # Try OpenAI-compatible API first (KoboldCPP, etc.)
-            async with httpx.AsyncClient(timeout=self.timeout) as client:
+            async with httpx.AsyncClient(timeout=effective_timeout) as client:
                 try:
                     # OpenAI-compatible request
                     request_data = {
@@ -134,9 +143,24 @@ class LLMClient:
                         return None
 
                 except httpx.TimeoutException:
-                    logger.error(f"LLM request timeout for {character_name}")
-                    self._update_stats(False, self.timeout)
-                    return None
+                    if use_fallback:
+                        logger.warning(f"LLM request timeout for {character_name}, using fallback response")
+                        # Queue for background processing if needed
+                        if self.timeout > self.max_timeout:
+                            background_task = asyncio.create_task(self.generate_response(
+                                prompt, character_name, max_tokens, temperature, use_fallback=False
+                            ))
+                            request_id = f"{character_name}_{time.time()}"
+                            self.pending_requests[request_id] = background_task
+
+                        # Return a fallback response immediately
+                        fallback_response = self._get_fallback_response(character_name)
+                        self._update_stats(False, effective_timeout)
+                        return fallback_response
+                    else:
+                        logger.error(f"LLM background request timeout for {character_name}")
+                        self._update_stats(False, effective_timeout)
+                        return None
                 except httpx.HTTPError as e:
                     logger.error(f"LLM HTTP error for {character_name}: {e}")
                     self._update_stats(False, time.time() - start_time)
@@ -231,11 +255,11 @@ class LLMClient:
                 'response_time': duration,
                 'model': self.model,
                 'base_url': self.base_url,
-                'timestamp': datetime.utcnow().isoformat()
+                'timestamp': datetime.now(timezone.utc).isoformat()
             }
 
             # Update health check time
-            self.health_stats['last_health_check'] = datetime.utcnow()
+            self.health_stats['last_health_check'] = datetime.now(timezone.utc)
 
             return health_status
 
@@ -246,7 +270,7 @@ class LLMClient:
                 'error': str(e),
                 'model': self.model,
                 'base_url': self.base_url,
-                'timestamp': datetime.utcnow().isoformat()
+                'timestamp': datetime.now(timezone.utc).isoformat()
             }
 
     def get_statistics(self) -> Dict[str, Any]:
@@ -342,6 +366,67 @@ class LLMClient:
             self.health_stats['average_response_time'] = (
                 (current_avg * (total_requests - 1) + duration) / total_requests
             )
 
+    def _get_fallback_response(self, character_name: str = None) -> str:
+        """Generate a fallback response when LLM is slow"""
+        fallback_responses = [
+            "*thinking deeply about this...*",
+            "*processing thoughts...*",
+            "*contemplating the discussion...*",
+            "*reflecting on what you've said...*",
+            "*considering different perspectives...*",
+            "Hmm, that's an interesting point to consider.",
+            "I need a moment to think about that.",
+            "That's worth reflecting on carefully.",
+            "*taking time to formulate thoughts...*"
+        ]
+
+        import random
+        return random.choice(fallback_responses)
+
+    async def generate_response_with_fallback(self, prompt: str, character_name: str = None,
+                                              max_tokens: int = None, temperature: float = None) -> str:
+        """Generate response with guaranteed fallback if LLM is slow"""
+        try:
+            # Try immediate response first
+            response = await self.generate_response(
+                prompt, character_name, max_tokens, temperature, use_fallback=True
+            )
+
+            if response:
+                return response
+            else:
+                # Return fallback if no response
+                return self._get_fallback_response(character_name)
+
+        except Exception as e:
+            log_error_with_context(e, {
+                "character_name": character_name,
+                "prompt_length": len(prompt)
+            })
+            return self._get_fallback_response(character_name)
+
+    async def cleanup_pending_requests(self):
+        """Clean up completed background requests"""
+        completed_requests = []
+
+        for request_id, task in self.pending_requests.items():
+            if task.done():
+                completed_requests.append(request_id)
+                try:
+                    result = await task
+                    if result:
+                        logger.info(f"Background LLM request {request_id} completed successfully")
+                except Exception as e:
+                    logger.error(f"Background LLM request {request_id} failed: {e}")
+
+        # Remove completed requests
+        for request_id in completed_requests:
+            del self.pending_requests[request_id]
+
+    def get_pending_count(self) -> int:
+        """Get number of pending background requests"""
+        return len(self.pending_requests)
+
 class PromptManager:
     """Manages prompt templates and optimization"""
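The hunks above repeatedly swap datetime.utcnow() for datetime.now(timezone.utc); the commit message pairs this with timestamptz columns on the PostgreSQL side. A minimal sketch of that pairing, assuming SQLAlchemy (which the commit does not name) and a hypothetical Conversation table:

from datetime import datetime, timezone

from sqlalchemy import Column, DateTime, Integer
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Conversation(Base):
    """Hypothetical table; the real schema is not shown in this commit."""
    __tablename__ = "conversations"

    id = Column(Integer, primary_key=True)
    # DateTime(timezone=True) maps to PostgreSQL's timestamptz, so values
    # written with timezone-aware datetimes keep their UTC offset.
    created_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        nullable=False,
    )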