diff --git a/api_server.py b/api_server.py index 9a9cfc2..642c60d 100644 --- a/api_server.py +++ b/api_server.py @@ -69,7 +69,8 @@ class ModelDispatcher: if model_config["type"] == "openai_proxy": self.backends[model_name] = OpenAIProxyBackend(model_config) elif model_config["type"] in ("llm", "vlm"): - while len(self.llm_models) >= 2: + count = model_config["pool_size"] + while len(self.llm_models) >= count: oldest_model = self.llm_models.pop(0) old_instance = self.backends.pop(oldest_model, None) if old_instance: @@ -307,7 +308,7 @@ async def create_translation( @app.get("/v1/models") async def list_models(): models_info = [] - for model_name in _dispatcher.backends.keys(): + for model_name in config.data["models"].keys(): model_config = config.data["models"].get(model_name, {}) models_info.append({ "id": model_name, diff --git a/backend/llm_client_backend.py b/backend/llm_client_backend.py index 3f7557c..847196a 100644 --- a/backend/llm_client_backend.py +++ b/backend/llm_client_backend.py @@ -20,7 +20,7 @@ class LlmClientBackend(BaseModelBackend): self._active_clients = {} self._pool_lock = asyncio.Lock() self.logger = logging.getLogger("api.llm") - self.MAX_CONTEXT_LENGTH = model_config.get("max_context_length", 500) + self.MAX_CONTEXT_LENGTH = model_config.get("max_context_length", 200) self.POOL_SIZE = model_config.get("pool_size", 2) self._inference_executor = ThreadPoolExecutor(max_workers=self.POOL_SIZE) self._active_tasks = weakref.WeakSet() diff --git a/config/config.yaml b/config/config.yaml index c1e8153..a7d5b83 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,119 +1,3 @@ -# config.yaml server: - host: 0.0.0.0 - port: 8000 - -models: - llama2-7b: - type: llama.cpp - - gpt-3.5-turbo-proxy: - type: openai_proxy - api_key: sk- - base_url: https://api.openai.com/v1 - model: gpt-3.5-turbo - - deepseek-r1: - type: openai_proxy - api_key: sk- - base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 - model: deepseek-r1 - - qwen2.5-0.5B-p256-ax630c: - type: tcp_client - host: "192.168.20.56" - port: 10001 - model_name: "qwen2.5-0.5B-p256-ax630c" - object: "llm.setup" - pool_size: 2 - max_context_length: 128 - response_format: "llm.utf-8.stream" - input: "llm.utf-8" - memory_required: 560460 - system_prompt: | - You are a helpful assistant. - - qwen2.5-1.5B-p256-ax630c: - type: tcp_client - host: "192.168.20.56" - port: 10001 - model_name: "qwen2.5-1.5B-p256-ax630c" - object: "llm.setup" - pool_size: 1 - max_context_length: 128 - response_format: "llm.utf-8.stream" - input: "llm.utf-8" - memory_required: 1686216 - system_prompt: | - You are a helpful assistant. - - deepseek-r1-1.5B-p256-ax630c: - type: tcp_client - host: "192.168.20.56" - port: 10001 - model_name: "deepseek-r1-1.5B-p256-ax630c" - object: "llm.setup" - pool_size: 1 - max_context_length: 128 - response_format: "llm.utf-8.stream" - input: "llm.utf-8" - memory_required: 1686552 - system_prompt: | - You are a helpful assistant. - - llama3.2-1B-p256-ax630c: - type: tcp_client - host: "192.168.20.56" - port: 10001 - model_name: "llama3.2-1B-p256-ax630c" - object: "llm.setup" - pool_size: 2 - max_context_length: 128 - response_format: "llm.utf-8.stream" - input: "llm.utf-8" - memory_required: 1336288 - system_prompt: | - You are a helpful assistant. - - internvl2.5-1B-ax630c: - type: tcp_client - host: "192.168.20.56" - port: 10001 - model_name: "internvl2.5-1B-ax630c" - object: "vlm.setup" - pool_size: 2 - max_context_length: 256 - response_format: "vlm.utf-8.stream" - input: "vlm.utf-8" - memory_required: 905356 - system_prompt: | - You are a helpful assistant. - - qwen-vl-plus: - type: vision_model - api_key: sk- - base_url: https://dashscope.aliyuncs.com/compatible-mode/v1 - model: qwen-vl-plus - max_image_size: 4194304 - image_timeout: 20 - - melotts: - type: tts - host: "192.168.20.56" - port: 10001 - model_name: "melotts_zh-cn" - object: "melotts.setup" - response_format: "wav.base64" - memory_required: 59764 - input: "tts.utf-8" - - whisper-tiny: - type: asr - host: "192.168.20.56" - port: 10001 - model_name: "whisper-tiny" - object: "whisper.setup" - response_format: "asr.utf-8" - memory_required: 289132 - language: "en" - input: "pcm.base64" \ No newline at end of file + host: 127.0.0.1 + port: 10001 \ No newline at end of file