diff --git a/servers/x_composer_2/openai_api_example.py b/openai_api_example.py
similarity index 83%
rename from servers/x_composer_2/openai_api_example.py
rename to openai_api_example.py
index 39ed576..b12b9a7 100644
--- a/servers/x_composer_2/openai_api_example.py
+++ b/openai_api_example.py
@@ -1,12 +1,12 @@
 from openai import OpenAI
 
-openai_api_key = "sk-23232323"
-openai_api_base = "http://199.204.135.78:23333/v1"
+swarms_api_key = "sk-23232323"
+swarms_base_url = "http://api.swarms.world/v1"
 model = "internlm-xcomposer2-4khd-7b"
 
 client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
+    api_key=swarms_api_key,
+    base_url=swarms_base_url,
 )
 
 chat_response = client.chat.completions.create(
diff --git a/scripts/PERFORMANCE_MONITORING.md b/scripts/monitoring/PERFORMANCE_MONITORING.md
similarity index 100%
rename from scripts/PERFORMANCE_MONITORING.md
rename to scripts/monitoring/PERFORMANCE_MONITORING.md
diff --git a/scripts/grafana.json b/scripts/monitoring/grafana.json
similarity index 100%
rename from scripts/grafana.json
rename to scripts/monitoring/grafana.json
diff --git a/scripts/prometheus.yaml b/scripts/monitoring/prometheus.yaml
similarity index 100%
rename from scripts/prometheus.yaml
rename to scripts/monitoring/prometheus.yaml
diff --git a/servers/.DS_Store b/servers/.DS_Store
new file mode 100644
index 0000000..d5eccce
Binary files /dev/null and b/servers/.DS_Store differ
diff --git a/servers/agent/api.py b/servers/agent/api.py
index 1515bdc..39008b9 100644
--- a/servers/agent/api.py
+++ b/servers/agent/api.py
@@ -185,6 +185,4 @@ async def agent_completions(agent_input: AgentInput):
 if __name__ == "__main__":
     import uvicorn
 
-    uvicorn.run(
-        app, host="0.0.0.0", port=8000, use_colors=True, log_level="info"
-    )
+    uvicorn.run(app, host="0.0.0.0", port=8000, use_colors=True, log_level="info")
diff --git a/servers/kubernetes_skypilot/kubeconfig.yaml b/servers/kubernetes_skypilot/kubeconfig.yaml
deleted file mode 100644
index 6c2fd3a..0000000
--- a/servers/kubernetes_skypilot/kubeconfig.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-kubernetes:
-  pod_config:
-    spec:
-      imagePullSecrets:
-        - name: your-secret-here
\ No newline at end of file
diff --git a/servers/kubernetes_skypilot/pre_install.sh b/servers/kubernetes_skypilot/pre_install.sh
deleted file mode 100644
index 391411f..0000000
--- a/servers/kubernetes_skypilot/pre_install.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-# MacOS
-brew install kubectl socat netcat
-
-# Linux (may have socat already installed)
-sudo apt-get install kubectl socat netcat
-go install sigs.k8s.io/kind@v0.22.0 && kind create cluster
-pip3 install "skypilot[all]"
-
-# Copy kubeconfig
-mkdir -p ~/.kube
-cp /path/to/kubeconfig ~/.kube/config
\ No newline at end of file
diff --git a/servers/kubernetes_skypilot/sky_config.yaml b/servers/kubernetes_skypilot/sky_config.yaml
deleted file mode 100644
index 6e89b42..0000000
--- a/servers/kubernetes_skypilot/sky_config.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-kubernetes:
-  pod_config:
-    spec:
-      containers:
-        - env:
-            - name: MY_ENV_VAR
-              value: MY_ENV_VALUE
-          volumeMounts: # Custom volume mounts for the pod
-            - mountPath: /foo
-              name: example-volume
-      volumes:
-        - name: example-volume
-          hostPath:
-            path: /tmp
-            type: Directory
\ No newline at end of file
diff --git a/servers/kubernetes_skypilot/sky_task.yaml b/servers/kubernetes_skypilot/sky_task.yaml
deleted file mode 100644
index 6ef759c..0000000
--- a/servers/kubernetes_skypilot/sky_task.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-# task.yaml
-resources:
-  ports: 8888
-
-run: 
| - python -m http.server 8888 \ No newline at end of file diff --git a/servers/llava/api.py b/servers/llava/api.py deleted file mode 100644 index ea84fe6..0000000 --- a/servers/llava/api.py +++ /dev/null @@ -1,178 +0,0 @@ -import asyncio -import importlib -import inspect -import os -from contextlib import asynccontextmanager -from http import HTTPStatus - -import fastapi -import uvicorn -from fastapi import Request -from fastapi.exceptions import RequestValidationError -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, Response, StreamingResponse -from prometheus_client import make_asgi_app -import vllm -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, - ChatCompletionResponse, - CompletionRequest, - ErrorResponse, -) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion -from vllm.logger import init_logger -from vllm.usage.usage_lib import UsageContext - -TIMEOUT_KEEP_ALIVE = 5 # seconds - -openai_serving_chat: OpenAIServingChat -openai_serving_completion: OpenAIServingCompletion -logger = init_logger(__name__) - - -@asynccontextmanager -async def lifespan(app: fastapi.FastAPI): - async def _force_log(): - while True: - await asyncio.sleep(10) - await engine.do_log_stats() - - if not engine_args.disable_log_stats: - asyncio.create_task(_force_log()) - - yield - - -app = fastapi.FastAPI(lifespan=lifespan, debug=True) - - -def parse_args(): - parser = make_arg_parser() - return parser.parse_args() - - -# Add prometheus asgi middleware to route /metrics requests -metrics_app = make_asgi_app() -app.mount("/metrics", metrics_app) - - -@app.exception_handler(RequestValidationError) -async def validation_exception_handler(_, exc): - err = openai_serving_chat.create_error_response(message=str(exc)) - return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) - - -@app.get("/health") -async def health() -> Response: - """Health check.""" - await openai_serving_chat.engine.check_health() - return Response(status_code=200) - - -@app.get("/v1/models") -async def show_available_models(): - models = await openai_serving_chat.show_available_models() - return JSONResponse(content=models.model_dump()) - - -@app.get("/version") -async def show_version(): - ver = {"version": vllm.__version__} - return JSONResponse(content=ver) - - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): - generator = await openai_serving_chat.create_chat_completion(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), status_code=generator.code) - if request.stream: - return StreamingResponse(content=generator, media_type="text/event-stream") - else: - assert isinstance(generator, ChatCompletionResponse) - return JSONResponse(content=generator.model_dump()) - - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest, raw_request: Request): - generator = await openai_serving_completion.create_completion(request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), status_code=generator.code) - if request.stream: - return StreamingResponse(content=generator, 
media_type="text/event-stream") - else: - return JSONResponse(content=generator.model_dump()) - - -if __name__ == "__main__": - args = parse_args() - - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) - - if token := os.environ.get("VLLM_API_KEY") or args.api_key: - - @app.middleware("http") - async def authentication(request: Request, call_next): - root_path = "" if args.root_path is None else args.root_path - if not request.url.path.startswith(f"{root_path}/v1"): - return await call_next(request) - if request.headers.get("Authorization") != "Bearer " + token: - return JSONResponse(content={"error": "Unauthorized"}, status_code=401) - return await call_next(request) - - for middleware in args.middleware: - module_path, object_name = middleware.rsplit(".", 1) - imported = getattr(importlib.import_module(module_path), object_name) - if inspect.isclass(imported): - app.add_middleware(imported) - elif inspect.iscoroutinefunction(imported): - app.middleware("http")(imported) - else: - raise ValueError( - f"Invalid middleware {middleware}. " f"Must be a function or a class." - ) - - logger.info("vLLM API server version %s", vllm.__version__) - logger.info("args: %s", args) - - if args.served_model_name is not None: - served_model_names = args.served_model_name - else: - served_model_names = [args.model] - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args( - engine_args, usage_context=UsageContext.OPENAI_API_SERVER - ) - openai_serving_chat = OpenAIServingChat( - engine, - served_model_names, - args.response_role, - args.lora_modules, - args.chat_template, - ) - openai_serving_completion = OpenAIServingCompletion( - engine, served_model_names, args.lora_modules - ) - - app.root_path = args.root_path - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs, - ) diff --git a/servers/llms/llm.py b/servers/llms/llm.py deleted file mode 100644 index bd4c9d6..0000000 --- a/servers/llms/llm.py +++ /dev/null @@ -1,1092 +0,0 @@ -import asyncio -import os -import random -import time -from http import HTTPStatus -from typing import AsyncGenerator, List, Literal, Optional, Union - -import uvicorn -from fastapi import Depends, FastAPI, HTTPException, Request -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, StreamingResponse -from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer - -from lmdeploy.messages import ( - PytorchEngineConfig, - TurbomindEngineConfig, -) -from lmdeploy.model import ChatTemplateConfig -from lmdeploy.serve.async_engine import AsyncEngine -from swarms_cloud.schema.openai_protocol import ( # noqa: E501 - ChatCompletionRequest, - ChatCompletionRequestQos, - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, - ChatMessage, - CompletionRequest, - CompletionRequestQos, - CompletionResponse, - CompletionResponseChoice, - CompletionResponseStreamChoice, - CompletionStreamResponse, - DeltaMessage, - EmbeddingsRequest, - EncodeRequest, - EncodeResponse, - ErrorResponse, - GenerateRequest, - GenerateRequestQos, - GenerateResponse, - ModelCard, - ModelList, 
- ModelPermission, - UsageInfo, - GenerationConfig, -) -from lmdeploy.serve.qos_engine.qos_engine import QosEngine - - -class VariableInterface: - """A IO interface maintaining variables.""" - - async_engine: AsyncEngine = None - api_keys: Optional[List[str]] = None - qos_engine: QosEngine = None - request_hosts = [] - - -app = FastAPI(docs_url="/") -get_bearer_token = HTTPBearer(auto_error=False) - - -async def check_api_key( - auth: Optional[HTTPAuthorizationCredentials] = Depends(get_bearer_token), -) -> str: - """Check if client provide valid api key. - - Adopted from https://github.com/lm-sys/FastChat/blob/v0.2.35/fastchat/serve/openai_api_server.py#L108-L127 - """ # noqa - if VariableInterface.api_keys: - if ( - auth is None - or (token := auth.credentials) not in VariableInterface.api_keys - ): - raise HTTPException( - status_code=401, - detail={ - "error": { - "message": "Please request with valid api key!", - "type": "invalid_request_error", - "param": None, - "code": "invalid_api_key", - } - }, - ) - return token - else: - # api_keys not set; allow all - return None - - -def get_model_list(): - """Available models. - - Only provided one now. - """ - return [VariableInterface.async_engine.engine.model_name] - - -@app.get("/v1/models", dependencies=[Depends(check_api_key)]) -def available_models(): - """Show available models.""" - model_cards = [] - for model_name in get_model_list(): - model_cards.append( - ModelCard(id=model_name, root=model_name, permission=[ModelPermission()]) - ) - return ModelList(data=model_cards) - - -def create_error_response(status: HTTPStatus, message: str): - """Create error response according to http status and message. - - Args: - status (HTTPStatus): HTTP status codes and reason phrases - message (str): error message - """ - return JSONResponse( - ErrorResponse( - message=message, type="invalid_request_error", code=status.value - ).model_dump() - ) - - -async def check_request(request) -> Optional[JSONResponse]: - """Check if a request is valid.""" - if request.model in get_model_list(): - return - ret = create_error_response( - HTTPStatus.NOT_FOUND, f"The model `{request.model}` does not exist." - ) - return ret - - -@app.post("/v1/chat/completions_qos") -async def chat_completions_v1_qos( - request: ChatCompletionRequestQos, raw_request: Request = None -): - """Completion API similar to OpenAI's API. - - Refer to `https://platform.openai.com/docs/api-reference/chat/create` - for the API specification. - - The request should be a JSON object with the following fields: - - model: model name. Available from /v1/models. - - messages: string prompt or chat history in OpenAI format. - - temperature (float): to modulate the next token probability - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - n (int): How many chat completion choices to generate for each input - message. Only support one here. - - stream: whether to stream the results or not. Default to false. - - max_tokens (int): output token nums - - repetition_penalty (float): The parameter for repetition penalty. 
- 1.0 means no penalty - - Additional arguments supported by LMDeploy: - - ignore_eos (bool): indicator for ignoring eos - - session_id (int): if not specified, will set random value - - user_id (str): for qos; if not specified, will set to "default" - - Currently we do not support the following features: - - function_call (Users should implement this by themselves) - - logit_bias (not supported yet) - - presence_penalty (replaced with repetition_penalty) - - frequency_penalty (replaced with repetition_penalty) - """ - if request.session_id == -1: - request.session_id = random.randint(1, 10086) - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - - model_name = request.model - request_id = str(request.session_id) - created_time = int(time.time()) - - if VariableInterface.qos_engine is None: - return create_error_response( - HTTPStatus.NOT_FOUND, "cannot parse qos engine config, this api is not work" - ) - - result_generator = await VariableInterface.qos_engine.generate_with_qos(request) - - if result_generator is None: - return create_error_response( - HTTPStatus.INTERNAL_SERVER_ERROR, "Failed to generate completions" - ) - - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - ) -> str: - choice_data = ChatCompletionResponseStreamChoice( - index=index, - delta=DeltaMessage(role="assistant", content=text), - finish_reason=finish_reason, - ) - response = ChatCompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[choice_data], - ) - response_json = response.model_dump_json() - - return response_json - - async def completion_stream_generator() -> AsyncGenerator[str, None]: - # First chunk with role - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(role="assistant"), - finish_reason=None, - ) - chunk = ChatCompletionStreamResponse( - id=request_id, choices=[choice_data], model=model_name - ) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - async for res in result_generator: - response_json = create_stream_response_json( - index=0, - text=res.response, - ) - yield f"data: {response_json}\n\n" - yield "data: [DONE]\n\n" - - # Streaming response - if request.stream: - return StreamingResponse( - completion_stream_generator(), media_type="text/event-stream" - ) - - # Non-streaming response - final_res = None - text = "" - async for res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- VariableInterface.async_engine.stop_session(request.session_id) - return create_error_response(HTTPStatus.BAD_REQUEST, "Client disconnected") - final_res = res - text += res.response - assert final_res is not None - choices = [] - choice_data = ChatCompletionResponseChoice( - index=0, - message=ChatMessage(role="assistant", content=text), - finish_reason=final_res.finish_reason, - ) - choices.append(choice_data) - - total_tokens = sum( - [ - final_res.history_token_len, - final_res.input_token_len, - final_res.generate_token_len, - ] - ) - usage = UsageInfo( - prompt_tokens=final_res.input_token_len, - completion_tokens=final_res.generate_token_len, - total_tokens=total_tokens, - ) - response = ChatCompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - -@app.post("/v1/chat/completions", dependencies=[Depends(check_api_key)]) -async def chat_completions_v1( - request: ChatCompletionRequest, raw_request: Request = None -): - """Completion API similar to OpenAI's API. - - Refer to `https://platform.openai.com/docs/api-reference/chat/create` - for the API specification. - - The request should be a JSON object with the following fields: - - model: model name. Available from /v1/models. - - messages: string prompt or chat history in OpenAI format. Chat history - example: `[{"role": "user", "content": "hi"}]`. - - temperature (float): to modulate the next token probability - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - n (int): How many chat completion choices to generate for each input - message. Only support one here. - - stream: whether to stream the results or not. Default to false. - - max_tokens (int): output token nums - - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - - stop (str | List[str] | None): To stop generating further - tokens. Only accept stop words that's encoded to one token idex. 
- - Additional arguments supported by LMDeploy: - - ignore_eos (bool): indicator for ignoring eos - - session_id (int): if not specified, will set random value - - Currently we do not support the following features: - - function_call (Users should implement this by themselves) - - logit_bias (not supported yet) - - presence_penalty (replaced with repetition_penalty) - - frequency_penalty (replaced with repetition_penalty) - """ - if request.session_id == -1: - request.session_id = random.randint(1, 10086) - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - - model_name = request.model - request_id = str(request.session_id) - created_time = int(time.time()) - - if isinstance(request.stop, str): - request.stop = [request.stop] - - gen_config = GenerationConfig( - max_new_tokens=request.max_tokens if request.max_tokens else 512, - top_p=request.top_p, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos, - stop_words=request.stop, - ) - - result_generator = VariableInterface.async_engine.generate( - request.messages, - request.session_id, - gen_config=gen_config, - stream_response=True, # always use stream to enable batching - sequence_start=True, - sequence_end=True, - do_preprocess=not isinstance( - request.messages, str - ), # text completion for string input - ) - - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - ) -> str: - choice_data = ChatCompletionResponseStreamChoice( - index=index, - delta=DeltaMessage(role="assistant", content=text), - finish_reason=finish_reason, - ) - response = ChatCompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[choice_data], - ) - response_json = response.model_dump_json() - - return response_json - - async def completion_stream_generator() -> AsyncGenerator[str, None]: - # First chunk with role - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(role="assistant"), - finish_reason=None, - ) - chunk = ChatCompletionStreamResponse( - id=request_id, choices=[choice_data], model=model_name - ) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - async for res in result_generator: - response_json = create_stream_response_json( - index=0, - text=res.response, - finish_reason=res.finish_reason, - ) - yield f"data: {response_json}\n\n" - yield "data: [DONE]\n\n" - - # Streaming response - if request.stream: - return StreamingResponse( - completion_stream_generator(), media_type="text/event-stream" - ) - - # Non-streaming response - final_res = None - text = "" - async for res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- VariableInterface.async_engine.stop_session(request.session_id) - return create_error_response(HTTPStatus.BAD_REQUEST, "Client disconnected") - final_res = res - text += res.response - assert final_res is not None - choices = [] - choice_data = ChatCompletionResponseChoice( - index=0, - message=ChatMessage(role="assistant", content=text), - finish_reason=final_res.finish_reason, - ) - choices.append(choice_data) - - total_tokens = sum( - [ - final_res.history_token_len, - final_res.input_token_len, - final_res.generate_token_len, - ] - ) - usage = UsageInfo( - prompt_tokens=final_res.input_token_len, - completion_tokens=final_res.generate_token_len, - total_tokens=total_tokens, - ) - response = ChatCompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - -@app.post("/v1/completions_qos") -async def completions_v1_qos( - request: CompletionRequestQos, raw_request: Request = None -): - """Completion API similar to OpenAI's API. - - Go to `https://platform.openai.com/docs/api-reference/completions/create` - for the API specification. - - The request should be a JSON object with the following fields: - - model (str): model name. Available from /v1/models. - - prompt (str): the input prompt. - - suffix (str): The suffix that comes after a completion of inserted text. - - max_tokens (int): output token nums - - temperature (float): to modulate the next token probability - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - n (int): How many chat completion choices to generate for each input - message. Only support one here. - - stream: whether to stream the results or not. Default to false. - - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - - user (str): A unique identifier representing your end-user. 
- - Additional arguments supported by LMDeploy: - - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - ignore_eos (bool): indicator for ignoring eos - - session_id (int): if not specified, will set random value - - user_id (str): for qos; if not specified, will set to "default" - - Currently we do not support the following features: - - logprobs (not supported yet) - - presence_penalty (replaced with repetition_penalty) - - frequency_penalty (replaced with repetition_penalty) - """ - if request.session_id == -1: - request.session_id = random.randint(1, 10086) - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - - model_name = request.model - request_id = str(request.session_id) - created_time = int(time.time()) - if isinstance(request.prompt, str): - request.prompt = [request.prompt] - - if VariableInterface.qos_engine is None: - return create_error_response( - HTTPStatus.NOT_FOUND, "cannot parse qos engine config, this api is not work" - ) - - generators = await VariableInterface.qos_engine.generate_with_qos(request) - - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - ) -> str: - choice_data = CompletionResponseStreamChoice( - index=index, - text=text, - finish_reason=finish_reason, - ) - response = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[choice_data], - ) - response_json = response.model_dump_json() - - return response_json - - async def completion_stream_generator() -> AsyncGenerator[str, None]: - # First chunk with role - for generator in generators: - for i in range(request.n): - choice_data = CompletionResponseStreamChoice( - index=i, - text="", - finish_reason=None, - ) - chunk = CompletionStreamResponse( - id=request_id, choices=[choice_data], model=model_name - ) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - async for res in generator: - response_json = create_stream_response_json( - index=0, - text=res.response, - ) - yield f"data: {response_json}\n\n" - yield "data: [DONE]\n\n" - - # Streaming response - if request.stream: - return StreamingResponse( - completion_stream_generator(), media_type="text/event-stream" - ) - - # Non-streaming response - usage = UsageInfo() - choices = [] - - async def _inner_call(i, generator): - final_res = None - text = "" - async for res in generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- VariableInterface.async_engine.stop_session(request.session_id) - return create_error_response( - HTTPStatus.BAD_REQUEST, "Client disconnected" - ) - final_res = res - text += res.response - assert final_res is not None - choice_data = CompletionResponseChoice( - index=0, - text=text, - finish_reason=final_res.finish_reason, - ) - choices.append(choice_data) - - total_tokens = sum( - [ - final_res.history_token_len, - final_res.input_token_len, - final_res.generate_token_len, - ] - ) - usage.prompt_tokens += final_res.input_token_len - usage.completion_tokens += final_res.generate_token_len - usage.total_tokens += total_tokens - - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(generators))] - ) - - response = CompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - -@app.post("/v1/completions", dependencies=[Depends(check_api_key)]) -async def completions_v1(request: CompletionRequest, raw_request: Request = None): - """Completion API similar to OpenAI's API. - - Go to `https://platform.openai.com/docs/api-reference/completions/create` - for the API specification. - - The request should be a JSON object with the following fields: - - model (str): model name. Available from /v1/models. - - prompt (str): the input prompt. - - suffix (str): The suffix that comes after a completion of inserted text. - - max_tokens (int): output token nums - - temperature (float): to modulate the next token probability - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - n (int): How many chat completion choices to generate for each input - message. Only support one here. - - stream: whether to stream the results or not. Default to false. - - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - - user (str): A unique identifier representing your end-user. - - stop (str | List[str] | None): To stop generating further - tokens. Only accept stop words that's encoded to one token idex. 
- - Additional arguments supported by LMDeploy: - - ignore_eos (bool): indicator for ignoring eos - - session_id (int): if not specified, will set random value - - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - Currently we do not support the following features: - - logprobs (not supported yet) - - presence_penalty (replaced with repetition_penalty) - - frequency_penalty (replaced with repetition_penalty) - """ - if request.session_id == -1: - request.session_id = random.randint(1, 10086) - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - - model_name = request.model - request_id = str(request.session_id) - created_time = int(time.time()) - if isinstance(request.prompt, str): - request.prompt = [request.prompt] - if isinstance(request.stop, str): - request.stop = [request.stop] - gen_config = GenerationConfig( - max_new_tokens=request.max_tokens if request.max_tokens else 512, - top_k=request.top_k, - top_p=request.top_p, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos, - stop_words=request.stop, - ) - generators = [] - for i in range(len(request.prompt)): - result_generator = VariableInterface.async_engine.generate( - request.prompt[i], - request.session_id + i, - gen_config=gen_config, - stream_response=True, # always use stream to enable batching - sequence_start=True, - sequence_end=True, - do_preprocess=False, - ) - generators.append(result_generator) - - def create_stream_response_json( - index: int, - text: str, - finish_reason: Optional[str] = None, - ) -> str: - choice_data = CompletionResponseStreamChoice( - index=index, - text=text, - finish_reason=finish_reason, - ) - response = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[choice_data], - ) - response_json = response.model_dump_json() - - return response_json - - async def completion_stream_generator() -> AsyncGenerator[str, None]: - # First chunk with role - for generator in generators: - for i in range(request.n): - choice_data = CompletionResponseStreamChoice( - index=i, - text="", - finish_reason=None, - ) - chunk = CompletionStreamResponse( - id=request_id, choices=[choice_data], model=model_name - ) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - async for res in generator: - response_json = create_stream_response_json( - index=0, - text=res.response, - finish_reason=res.finish_reason, - ) - yield f"data: {response_json}\n\n" - yield "data: [DONE]\n\n" - - # Streaming response - if request.stream: - return StreamingResponse( - completion_stream_generator(), media_type="text/event-stream" - ) - - # Non-streaming response - usage = UsageInfo() - choices = [] - - async def _inner_call(i, generator): - final_res = None - text = "" - async for res in generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- VariableInterface.async_engine.stop_session(request.session_id) - return create_error_response( - HTTPStatus.BAD_REQUEST, "Client disconnected" - ) - final_res = res - text += res.response - assert final_res is not None - choice_data = CompletionResponseChoice( - index=0, - text=text, - finish_reason=final_res.finish_reason, - ) - choices.append(choice_data) - - total_tokens = sum( - [ - final_res.history_token_len, - final_res.input_token_len, - final_res.generate_token_len, - ] - ) - usage.prompt_tokens += final_res.input_token_len - usage.completion_tokens += final_res.generate_token_len - usage.total_tokens += total_tokens - - await asyncio.gather( - *[_inner_call(i, generators[i]) for i in range(len(generators))] - ) - - response = CompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - -@app.post("/v1/embeddings", tags=["unsupported"]) -async def create_embeddings(request: EmbeddingsRequest, raw_request: Request = None): - """Creates embeddings for the text.""" - return create_error_response(HTTPStatus.BAD_REQUEST, "Unsupported by turbomind.") - - -@app.post("/v1/encode", dependencies=[Depends(check_api_key)]) -async def encode(request: EncodeRequest, raw_request: Request = None): - """Encode prompts. - - The request should be a JSON object with the following fields: - - input: the prompt to be encoded. In str or List[str] format. - - do_preprocess: whether do preprocess or not. Default to False. - - add_bos: True when it is the beginning of a conversation. False when it - is not. Default to True. - """ - - def encode(prompt: str, do_preprocess: bool, add_bos: bool): - if do_preprocess: - prompt = VariableInterface.async_engine.chat_template.get_prompt( - prompt, sequence_start=add_bos - ) - input_ids = VariableInterface.async_engine.tokenizer.encode( - prompt, add_bos=add_bos - ) - return input_ids - - if isinstance(request.input, str): - encoded = encode(request.input, request.do_preprocess, request.add_bos) - return EncodeResponse(input_ids=encoded, length=len(encoded)) - else: - encoded, length = [], [] - for prompt in request.input: - ids = encode(prompt, request.do_preprocess, request.add_bos) - encoded.append(ids) - length.append(len(ids)) - return EncodeResponse(input_ids=encoded, length=length) - - -@app.post("/v1/chat/interactive_qos") -async def chat_interactive_v1_qos( - request: GenerateRequestQos, raw_request: Request = None -): - """Generate completion for the request. - - - On interactive mode, the chat history is kept on the server. Please set - `interactive_mode = True`. - - On normal mode, no chat history is kept on the server. Set - `interactive_mode = False`. - - The request should be a JSON object with the following fields: - - prompt: the prompt to use for the generation. - - session_id: determine which instance will be called. If not specified - with a value other than -1, using random value directly. - - interactive_mode (bool): turn on interactive mode or not. On interactive - mode, session history is kept on the server (and vice versa). - - stream: whether to stream the results or not. - - stop: whether to stop the session response or not. - - request_output_len (int): output token nums - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. 
- - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - temperature (float): to modulate the next token probability - - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - - ignore_eos (bool): indicator for ignoring eos - - user_id (str): for qos; if not specified, will set to "default" - """ - if request.session_id == -1: - request.session_id = random.randint(10087, 23333) - - if VariableInterface.qos_engine is None: - return create_error_response( - HTTPStatus.NOT_FOUND, "cannot parse qos engine config, this api is not work" - ) - - generation = await VariableInterface.qos_engine.generate_with_qos(request) - - # Streaming case - async def stream_results() -> AsyncGenerator[bytes, None]: - async for out in generation: - chunk = GenerateResponse( - text=out.response, - tokens=out.generate_token_len, - finish_reason=out.finish_reason, - ) - data = chunk.model_dump_json() - yield f"{data}\n" - - if request.stream: - return StreamingResponse(stream_results(), media_type="text/event-stream") - else: - ret = {} - text = "" - tokens = 0 - finish_reason = None - async for out in generation: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. - VariableInterface.qos_engine.stop_session(request.session_id) - return create_error_response( - HTTPStatus.BAD_REQUEST, "Client disconnected" - ) - text += out.response - tokens = out.generate_token_len - finish_reason = out.finish_reason - ret = {"text": text, "tokens": tokens, "finish_reason": finish_reason} - return JSONResponse(ret) - - -@app.post("/v1/chat/interactive", dependencies=[Depends(check_api_key)]) -async def chat_interactive_v1(request: GenerateRequest, raw_request: Request = None): - """Generate completion for the request. - - - On interactive mode, the chat history is kept on the server. Please set - `interactive_mode = True`. - - On normal mode, no chat history is kept on the server. Set - `interactive_mode = False`. - - The request should be a JSON object with the following fields: - - prompt: the prompt to use for the generation. - - session_id: determine which instance will be called. If not specified - with a value other than -1, using random value directly. - - interactive_mode (bool): turn on interactive mode or not. On interactive - mode, session history is kept on the server (and vice versa). - - stream: whether to stream the results or not. - - stop (str | List[str] | None): To stop generating further - tokens. Only accept stop words that's encoded to one token idex. - - request_output_len (int): output token nums - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - temperature (float): to modulate the next token probability - - repetition_penalty (float): The parameter for repetition penalty. 
- 1.0 means no penalty - - ignore_eos (bool): indicator for ignoring eos - """ - if request.cancel and request.session_id != -1: - VariableInterface.async_engine.stop_session(request.session_id) - return {"text": "", "tokens": 0, "finish_reason": None} - if request.session_id == -1: - request.session_id = random.randint(10087, 23333) - - async_engine = VariableInterface.async_engine - sequence_start = async_engine.id2step.get(str(request.session_id), 0) == 0 - sequence_end = not request.interactive_mode - if isinstance(request.stop, str): - request.stop = [request.stop] - - gen_config = GenerationConfig( - max_new_tokens=request.request_output_len, - top_p=request.top_p, - top_k=request.top_k, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos, - stop_words=request.stop, - ) - generation = async_engine.generate( - request.prompt, - request.session_id, - gen_config=gen_config, - stream_response=True, # always use stream to enable batching - sequence_start=sequence_start, - sequence_end=sequence_end, - ) - - # Streaming case - async def stream_results() -> AsyncGenerator[bytes, None]: - async for out in generation: - chunk = GenerateResponse( - text=out.response, - tokens=out.generate_token_len, - finish_reason=out.finish_reason, - ) - data = chunk.model_dump_json() - yield f"{data}\n" - - if request.stream: - return StreamingResponse(stream_results(), media_type="text/event-stream") - else: - ret = {} - text = "" - tokens = 0 - finish_reason = None - async for out in generation: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. - async_engine.stop_session(request.session_id) - return create_error_response( - HTTPStatus.BAD_REQUEST, "Client disconnected" - ) - text += out.response - tokens = out.generate_token_len - finish_reason = out.finish_reason - ret = {"text": text, "tokens": tokens, "finish_reason": finish_reason} - return JSONResponse(ret) - - -def serve( - model_path: str, - model_name: Optional[str] = None, - backend: Literal["turbomind", "pytorch"] = "turbomind", - backend_config: Optional[Union[PytorchEngineConfig, TurbomindEngineConfig]] = None, - chat_template_config: Optional[ChatTemplateConfig] = None, - server_name: str = "0.0.0.0", - server_port: int = 23333, - tp: int = 1, - allow_origins: List[str] = ["*"], - allow_credentials: bool = True, - allow_methods: List[str] = ["*"], - allow_headers: List[str] = ["*"], - log_level: str = "ERROR", - api_keys: Optional[Union[List[str], str]] = None, - ssl: bool = False, - qos_config_path: str = "", - **kwargs, -): - """An example to perform model inference through the command line - interface. - - Args: - model_path (str): the path of a model. - It could be one of the following options: - - i) A local directory path of a turbomind model which is - converted by `lmdeploy convert` command or download from - ii) and iii). - - ii) The model_id of a lmdeploy-quantized model hosted - inside a model repo on huggingface.co, such as - "InternLM/internlm-chat-20b-4bit", - "lmdeploy/llama2-chat-70b-4bit", etc. - - iii) The model_id of a model hosted inside a model repo - on huggingface.co, such as "internlm/internlm-chat-7b", - "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" - and so on. - model_name (str): needed when model_path is a pytorch model on - huggingface.co, such as "InternLM/internlm-chat-7b" - backend (str): either `turbomind` or `pytorch` backend. Default to - `turbomind` backend. 
- backend_config (TurbomindEngineConfig | PytorchEngineConfig): beckend - config instance. Default to none. - chat_template_config (ChatTemplateConfig): chat template configuration. - Default to None. - server_name (str): host ip for serving - server_port (int): server port - tp (int): tensor parallel - allow_origins (List[str]): a list of allowed origins for CORS - allow_credentials (bool): whether to allow credentials for CORS - allow_methods (List[str]): a list of allowed HTTP methods for CORS - allow_headers (List[str]): a list of allowed HTTP headers for CORS - log_level(str): set log level whose value among [CRITICAL, ERROR, WARNING, INFO, DEBUG] - api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as - a single api_key. Default to None, which means no api key applied. - ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'. - qos_config_path (str): qos policy config path - """ # noqa E501 - if os.getenv("TM_LOG_LEVEL") is None: - os.environ["TM_LOG_LEVEL"] = log_level - - if allow_origins: - app.add_middleware( - CORSMiddleware, - allow_origins=allow_origins, - allow_credentials=allow_credentials, - allow_methods=allow_methods, - allow_headers=allow_headers, - ) - if api_keys is not None: - if isinstance(api_keys, str): - api_keys = api_keys.split(",") - VariableInterface.api_keys = api_keys - ssl_keyfile, ssl_certfile, http_or_https = None, None, "http" - if ssl: - ssl_keyfile = os.environ["SSL_KEYFILE"] - ssl_certfile = os.environ["SSL_CERTFILE"] - http_or_https = "https" - - VariableInterface.async_engine = AsyncEngine( - model_path=model_path, - model_name=model_name, - backend=backend, - backend_config=backend_config, - chat_template_config=chat_template_config, - tp=tp, - **kwargs, - ) - - if qos_config_path: - try: - with open(qos_config_path, "r") as file: - qos_config_str = file.read() - VariableInterface.qos_engine = QosEngine( - qos_tag=qos_config_str, - engine=VariableInterface.async_engine, - **kwargs, - ) - VariableInterface.qos_engine.start() - except FileNotFoundError: - VariableInterface.qos_engine = None - - for i in range(3): - print( - f"HINT: Please open \033[93m\033[1m{http_or_https}://" - f"{server_name}:{server_port}\033[0m in a browser for detailed api" - " usage!!!" - ) - uvicorn.run( - app=app, - host=server_name, - port=server_port, - log_level="info", - ssl_keyfile=ssl_keyfile, - ssl_certfile=ssl_certfile, - ) - - -if __name__ == "__main__": - import fire - - fire.Fire(serve) diff --git a/servers/pali_gemma/deployment.yaml b/servers/pali_gemma/deployment.yaml new file mode 100644 index 0000000..0916923 --- /dev/null +++ b/servers/pali_gemma/deployment.yaml @@ -0,0 +1,47 @@ +envs: + MODEL_NAME: google/paligemma-3b-pt-224 + MODEL_ARCH: internlm + HUGGING_FACE_HUB_TOKEN: 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 + MODEL_PORT: 8080 + +resources: + # accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} ## Large models + accelerators: [A10g, A100, A100, A100-80GB, T4, M60] ## Small models + # cpus: 32+ + memory: 32+ + use_spot: True + disk_size: 512 # Ensure model checkpoints (~246GB) can fit. + # disk_tier: best + ports: 8080 # Expose to internet traffic. + +service: + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? 
+ max_tokens: 1 + readiness_probe: /v1/models + + # Replica Policy + replica_policy: + min_replicas: 3 # Minimum number of replicas + max_replicas: 100 # Maximum number of replicas + target_qps_per_replica: 2.5 # Target queries per second per replica + upscale_delay_seconds: 40 # Delay before upscaling replicas + downscale_delay_seconds: 20 # Delay before downscaling replicas + + +setup: | + docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 + +run: | + lmdeploy serve api_server google/paligemma-3b-pt-224 --server-port 8080 \ No newline at end of file diff --git a/servers/swarm_agents/Devin_state.json b/servers/swarm_agents/Devin_state.json deleted file mode 100644 index 9e9b490..0000000 --- a/servers/swarm_agents/Devin_state.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "agent_id": "", - "agent_name": "Devin", - "agent_description": null, - "system_prompt": "Autonomous agent that can interact with humans and other agents. Be Helpful and Kind. Use the tools provided to assist the user. Return all code in markdown format.", - "sop": null, - "short_memory": "system: Autonomous agent that can interact with humans and other agents. Be Helpful and Kind. Use the tools provided to assist the user. Return all code in markdown format.\n\n\nDevin: \n You are Devin,\n Your decisions must always be made independently without seeking user assistance. \n Play to your strengths as an LLM and pursue simple strategies with no legal complications.\n If you have completed all your tasks, make sure to use the 'finish' command.\n \n GOALS:\n \n 1. Hello, how are you? Create an image of how you are doing!\n \n Constraints:\n \n 1. ~4000 word limit for short term memory. Your short term memory is short, so immediately save important information to files.\n 2. If you are unsure how you previously did something or want to recall past events, thinking about similar events will help you remember.\n 3. No user assistance\n 4. Exclusively use the commands listed in double quotes e.g. 'command name'\n \n Commands:\n \n 1. finish: use this to signal that you have finished all your objectives, args: 'response': 'final response to let people know you have finished your objectives'\n \n Resources:\n \n 1. Internet access for searches and information gathering.\n 2. Long Term memory management.\n 3. Agents for delegation of simple tasks.\n 4. File output.\n \n Performance Evaluation:\n \n 1. Continuously review and analyze your actions to ensure you are performing to the best of your abilities.\n 2. Constructively self-criticize your big-picture behavior constantly.\n 3. Reflect on past decisions and strategies to refine your approach.\n 4. Every command has a cost, so be smart and efficient. Aim to complete tasks in the least number of steps.\n \n You should only respond in JSON format as described below Response Format, you will respond only in markdown format within 6 backticks. 
The JSON will be in markdown format.\n \n ```\n {'$defs': {'Command': {'properties': {'name': {'title': 'Command Name', 'type': 'string'}, 'args': {'default': {}, 'title': 'Command Arguments', 'type': 'object'}}, 'required': ['name'], 'title': 'Command', 'type': 'object'}, 'Thoughts': {'properties': {'text': {'title': 'Thoughts', 'type': 'string'}, 'reasoning': {'title': 'Reasoning', 'type': 'string'}, 'plan': {'title': 'Plan', 'type': 'string'}}, 'required': ['text', 'reasoning', 'plan'], 'title': 'Thoughts', 'type': 'object'}}, 'properties': {'thoughts': {'$ref': '#/$defs/Thoughts'}, 'command': {'$ref': '#/$defs/Command'}}, 'required': ['thoughts', 'command'], 'title': 'ResponseFormat', 'type': 'object'}\n ```\n \n Ensure the response can be parsed by Python json.loads\n System: The current time and date is 2024-04-16 12:45:09\n System: This reminds you of these events from your past:\n [system: Autonomous agent that can interact with humans and other agents. Be Helpful and Kind. Use the tools provided to assist the user. Return all code in markdown format.\n\n]\n \n Human: Determine which next command to use, and respond using the format specified above:\n \n\n\nDevin: Class Name: StructuredTool\n\nDocumentation:\nTool that can operate on any number of inputs.\n\n\nclass StructuredTool(BaseTool):\n \"\"\"Tool that can operate on any number of inputs.\"\"\"\n\n description: str = \"\"\n args_schema: Type[BaseModel] = Field(..., description=\"The tool schema.\")\n \"\"\"The input arguments' schema.\"\"\"\n func: Optional[Callable[..., Any]]\n \"\"\"The function to run when the tool is called.\"\"\"\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None\n \"\"\"The asynchronous version of the function.\"\"\"\n\n # --- Runnable ---\n\n async def ainvoke(\n self,\n input: Union[str, Dict],\n config: Optional[RunnableConfig] = None,\n **kwargs: Any,\n ) -> Any:\n if not self.coroutine:\n # If the tool does not implement async, fall back to default implementation\n return await run_in_executor(config, self.invoke, input, config, **kwargs)\n\n return await super().ainvoke(input, config, **kwargs)\n\n # --- Tool ---\n\n @property\n def args(self) -> dict:\n \"\"\"The tool's input arguments.\"\"\"\n return self.args_schema.schema()[\"properties\"]\n\n def _run(\n self,\n *args: Any,\n run_manager: Optional[CallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Use the tool.\"\"\"\n if self.func:\n new_argument_supported = signature(self.func).parameters.get(\"callbacks\")\n return (\n self.func(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else self.func(*args, **kwargs)\n )\n raise NotImplementedError(\"Tool does not support sync\")\n\n async def _arun(\n self,\n *args: Any,\n run_manager: Optional[AsyncCallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> str:\n \"\"\"Use the tool asynchronously.\"\"\"\n if self.coroutine:\n new_argument_supported = signature(self.coroutine).parameters.get(\n \"callbacks\"\n )\n return (\n await self.coroutine(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else await self.coroutine(*args, **kwargs)\n )\n return await run_in_executor(\n None,\n self._run,\n run_manager=run_manager.get_sync() if run_manager else None,\n *args,\n **kwargs,\n )\n\n @classmethod\n def from_function(\n cls,\n func: Optional[Callable] = None,\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None,\n name: 
Optional[str] = None,\n description: Optional[str] = None,\n return_direct: bool = False,\n args_schema: Optional[Type[BaseModel]] = None,\n infer_schema: bool = True,\n **kwargs: Any,\n ) -> StructuredTool:\n \"\"\"Create tool from a given function.\n\n A classmethod that helps to create a tool from a function.\n\n Args:\n func: The function from which to create a tool\n coroutine: The async function from which to create a tool\n name: The name of the tool. Defaults to the function name\n description: The description of the tool. Defaults to the function docstring\n return_direct: Whether to return the result directly or as a callback\n args_schema: The schema of the tool's input arguments\n infer_schema: Whether to infer the schema from the function's signature\n **kwargs: Additional arguments to pass to the tool\n\n Returns:\n The tool\n\n Examples:\n\n .. code-block:: python\n\n def add(a: int, b: int) -> int:\n \\\"\\\"\\\"Add two numbers\\\"\\\"\\\"\n return a + b\n tool = StructuredTool.from_function(add)\n tool.run(1, 2) # 3\n \"\"\"\n\n if func is not None:\n source_function = func\n elif coroutine is not None:\n source_function = coroutine\n else:\n raise ValueError(\"Function and/or coroutine must be provided\")\n name = name or source_function.__name__\n description = description or source_function.__doc__\n if description is None:\n raise ValueError(\n \"Function must have a docstring if description not provided.\"\n )\n\n # Description example:\n # search_api(query: str) - Searches the API for the query.\n sig = signature(source_function)\n description = f\"{name}{sig} - {description.strip()}\"\n _args_schema = args_schema\n if _args_schema is None and infer_schema:\n # schema name is appended within function\n _args_schema = create_schema_from_function(name, source_function)\n return cls(\n name=name,\n func=func,\n coroutine=coroutine,\n args_schema=_args_schema, # type: ignore[arg-type]\n description=description,\n return_direct=return_direct,\n **kwargs,\n )\n\n\n\nDevin: Class Name: StructuredTool\n\nDocumentation:\nTool that can operate on any number of inputs.\n\n\nclass StructuredTool(BaseTool):\n \"\"\"Tool that can operate on any number of inputs.\"\"\"\n\n description: str = \"\"\n args_schema: Type[BaseModel] = Field(..., description=\"The tool schema.\")\n \"\"\"The input arguments' schema.\"\"\"\n func: Optional[Callable[..., Any]]\n \"\"\"The function to run when the tool is called.\"\"\"\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None\n \"\"\"The asynchronous version of the function.\"\"\"\n\n # --- Runnable ---\n\n async def ainvoke(\n self,\n input: Union[str, Dict],\n config: Optional[RunnableConfig] = None,\n **kwargs: Any,\n ) -> Any:\n if not self.coroutine:\n # If the tool does not implement async, fall back to default implementation\n return await run_in_executor(config, self.invoke, input, config, **kwargs)\n\n return await super().ainvoke(input, config, **kwargs)\n\n # --- Tool ---\n\n @property\n def args(self) -> dict:\n \"\"\"The tool's input arguments.\"\"\"\n return self.args_schema.schema()[\"properties\"]\n\n def _run(\n self,\n *args: Any,\n run_manager: Optional[CallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Use the tool.\"\"\"\n if self.func:\n new_argument_supported = signature(self.func).parameters.get(\"callbacks\")\n return (\n self.func(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else self.func(*args, **kwargs)\n )\n raise 
NotImplementedError(\"Tool does not support sync\")\n\n async def _arun(\n self,\n *args: Any,\n run_manager: Optional[AsyncCallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> str:\n \"\"\"Use the tool asynchronously.\"\"\"\n if self.coroutine:\n new_argument_supported = signature(self.coroutine).parameters.get(\n \"callbacks\"\n )\n return (\n await self.coroutine(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else await self.coroutine(*args, **kwargs)\n )\n return await run_in_executor(\n None,\n self._run,\n run_manager=run_manager.get_sync() if run_manager else None,\n *args,\n **kwargs,\n )\n\n @classmethod\n def from_function(\n cls,\n func: Optional[Callable] = None,\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n return_direct: bool = False,\n args_schema: Optional[Type[BaseModel]] = None,\n infer_schema: bool = True,\n **kwargs: Any,\n ) -> StructuredTool:\n \"\"\"Create tool from a given function.\n\n A classmethod that helps to create a tool from a function.\n\n Args:\n func: The function from which to create a tool\n coroutine: The async function from which to create a tool\n name: The name of the tool. Defaults to the function name\n description: The description of the tool. Defaults to the function docstring\n return_direct: Whether to return the result directly or as a callback\n args_schema: The schema of the tool's input arguments\n infer_schema: Whether to infer the schema from the function's signature\n **kwargs: Additional arguments to pass to the tool\n\n Returns:\n The tool\n\n Examples:\n\n .. code-block:: python\n\n def add(a: int, b: int) -> int:\n \\\"\\\"\\\"Add two numbers\\\"\\\"\\\"\n return a + b\n tool = StructuredTool.from_function(add)\n tool.run(1, 2) # 3\n \"\"\"\n\n if func is not None:\n source_function = func\n elif coroutine is not None:\n source_function = coroutine\n else:\n raise ValueError(\"Function and/or coroutine must be provided\")\n name = name or source_function.__name__\n description = description or source_function.__doc__\n if description is None:\n raise ValueError(\n \"Function must have a docstring if description not provided.\"\n )\n\n # Description example:\n # search_api(query: str) - Searches the API for the query.\n sig = signature(source_function)\n description = f\"{name}{sig} - {description.strip()}\"\n _args_schema = args_schema\n if _args_schema is None and infer_schema:\n # schema name is appended within function\n _args_schema = create_schema_from_function(name, source_function)\n return cls(\n name=name,\n func=func,\n coroutine=coroutine,\n args_schema=_args_schema, # type: ignore[arg-type]\n description=description,\n return_direct=return_direct,\n **kwargs,\n )\n\n\n\nDevin: Class Name: StructuredTool\n\nDocumentation:\nTool that can operate on any number of inputs.\n\n\nclass StructuredTool(BaseTool):\n \"\"\"Tool that can operate on any number of inputs.\"\"\"\n\n description: str = \"\"\n args_schema: Type[BaseModel] = Field(..., description=\"The tool schema.\")\n \"\"\"The input arguments' schema.\"\"\"\n func: Optional[Callable[..., Any]]\n \"\"\"The function to run when the tool is called.\"\"\"\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None\n \"\"\"The asynchronous version of the function.\"\"\"\n\n # --- Runnable ---\n\n async def ainvoke(\n self,\n input: Union[str, Dict],\n config: Optional[RunnableConfig] = None,\n **kwargs: Any,\n ) -> 
Any:\n if not self.coroutine:\n # If the tool does not implement async, fall back to default implementation\n return await run_in_executor(config, self.invoke, input, config, **kwargs)\n\n return await super().ainvoke(input, config, **kwargs)\n\n # --- Tool ---\n\n @property\n def args(self) -> dict:\n \"\"\"The tool's input arguments.\"\"\"\n return self.args_schema.schema()[\"properties\"]\n\n def _run(\n self,\n *args: Any,\n run_manager: Optional[CallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Use the tool.\"\"\"\n if self.func:\n new_argument_supported = signature(self.func).parameters.get(\"callbacks\")\n return (\n self.func(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else self.func(*args, **kwargs)\n )\n raise NotImplementedError(\"Tool does not support sync\")\n\n async def _arun(\n self,\n *args: Any,\n run_manager: Optional[AsyncCallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> str:\n \"\"\"Use the tool asynchronously.\"\"\"\n if self.coroutine:\n new_argument_supported = signature(self.coroutine).parameters.get(\n \"callbacks\"\n )\n return (\n await self.coroutine(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else await self.coroutine(*args, **kwargs)\n )\n return await run_in_executor(\n None,\n self._run,\n run_manager=run_manager.get_sync() if run_manager else None,\n *args,\n **kwargs,\n )\n\n @classmethod\n def from_function(\n cls,\n func: Optional[Callable] = None,\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None,\n name: Optional[str] = None,\n description: Optional[str] = None,\n return_direct: bool = False,\n args_schema: Optional[Type[BaseModel]] = None,\n infer_schema: bool = True,\n **kwargs: Any,\n ) -> StructuredTool:\n \"\"\"Create tool from a given function.\n\n A classmethod that helps to create a tool from a function.\n\n Args:\n func: The function from which to create a tool\n coroutine: The async function from which to create a tool\n name: The name of the tool. Defaults to the function name\n description: The description of the tool. Defaults to the function docstring\n return_direct: Whether to return the result directly or as a callback\n args_schema: The schema of the tool's input arguments\n infer_schema: Whether to infer the schema from the function's signature\n **kwargs: Additional arguments to pass to the tool\n\n Returns:\n The tool\n\n Examples:\n\n .. 
code-block:: python\n\n def add(a: int, b: int) -> int:\n \\\"\\\"\\\"Add two numbers\\\"\\\"\\\"\n return a + b\n tool = StructuredTool.from_function(add)\n tool.run(1, 2) # 3\n \"\"\"\n\n if func is not None:\n source_function = func\n elif coroutine is not None:\n source_function = coroutine\n else:\n raise ValueError(\"Function and/or coroutine must be provided\")\n name = name or source_function.__name__\n description = description or source_function.__doc__\n if description is None:\n raise ValueError(\n \"Function must have a docstring if description not provided.\"\n )\n\n # Description example:\n # search_api(query: str) - Searches the API for the query.\n sig = signature(source_function)\n description = f\"{name}{sig} - {description.strip()}\"\n _args_schema = args_schema\n if _args_schema is None and infer_schema:\n # schema name is appended within function\n _args_schema = create_schema_from_function(name, source_function)\n return cls(\n name=name,\n func=func,\n coroutine=coroutine,\n args_schema=_args_schema, # type: ignore[arg-type]\n description=description,\n return_direct=return_direct,\n **kwargs,\n )\n\n\n\nDevin: Class Name: StructuredTool\n\nDocumentation:\nTool that can operate on any number of inputs.\n\n\nclass StructuredTool(BaseTool):\n \"\"\"Tool that can operate on any number of inputs.\"\"\"\n\n description: str = \"\"\n args_schema: Type[BaseModel] = Field(..., description=\"The tool schema.\")\n \"\"\"The input arguments' schema.\"\"\"\n func: Optional[Callable[..., Any]]\n \"\"\"The function to run when the tool is called.\"\"\"\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None\n \"\"\"The asynchronous version of the function.\"\"\"\n\n # --- Runnable ---\n\n async def ainvoke(\n self,\n input: Union[str, Dict],\n config: Optional[RunnableConfig] = None,\n **kwargs: Any,\n ) -> Any:\n if not self.coroutine:\n # If the tool does not implement async, fall back to default implementation\n return await run_in_executor(config, self.invoke, input, config, **kwargs)\n\n return await super().ainvoke(input, config, **kwargs)\n\n # --- Tool ---\n\n @property\n def args(self) -> dict:\n \"\"\"The tool's input arguments.\"\"\"\n return self.args_schema.schema()[\"properties\"]\n\n def _run(\n self,\n *args: Any,\n run_manager: Optional[CallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Use the tool.\"\"\"\n if self.func:\n new_argument_supported = signature(self.func).parameters.get(\"callbacks\")\n return (\n self.func(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else self.func(*args, **kwargs)\n )\n raise NotImplementedError(\"Tool does not support sync\")\n\n async def _arun(\n self,\n *args: Any,\n run_manager: Optional[AsyncCallbackManagerForToolRun] = None,\n **kwargs: Any,\n ) -> str:\n \"\"\"Use the tool asynchronously.\"\"\"\n if self.coroutine:\n new_argument_supported = signature(self.coroutine).parameters.get(\n \"callbacks\"\n )\n return (\n await self.coroutine(\n *args,\n callbacks=run_manager.get_child() if run_manager else None,\n **kwargs,\n )\n if new_argument_supported\n else await self.coroutine(*args, **kwargs)\n )\n return await run_in_executor(\n None,\n self._run,\n run_manager=run_manager.get_sync() if run_manager else None,\n *args,\n **kwargs,\n )\n\n @classmethod\n def from_function(\n cls,\n func: Optional[Callable] = None,\n coroutine: Optional[Callable[..., Awaitable[Any]]] = None,\n name: Optional[str] = None,\n description: 
Optional[str] = None,\n return_direct: bool = False,\n args_schema: Optional[Type[BaseModel]] = None,\n infer_schema: bool = True,\n **kwargs: Any,\n ) -> StructuredTool:\n \"\"\"Create tool from a given function.\n\n A classmethod that helps to create a tool from a function.\n\n Args:\n func: The function from which to create a tool\n coroutine: The async function from which to create a tool\n name: The name of the tool. Defaults to the function name\n description: The description of the tool. Defaults to the function docstring\n return_direct: Whether to return the result directly or as a callback\n args_schema: The schema of the tool's input arguments\n infer_schema: Whether to infer the schema from the function's signature\n **kwargs: Additional arguments to pass to the tool\n\n Returns:\n The tool\n\n Examples:\n\n .. code-block:: python\n\n def add(a: int, b: int) -> int:\n \\\"\\\"\\\"Add two numbers\\\"\\\"\\\"\n return a + b\n tool = StructuredTool.from_function(add)\n tool.run(1, 2) # 3\n \"\"\"\n\n if func is not None:\n source_function = func\n elif coroutine is not None:\n source_function = coroutine\n else:\n raise ValueError(\"Function and/or coroutine must be provided\")\n name = name or source_function.__name__\n description = description or source_function.__doc__\n if description is None:\n raise ValueError(\n \"Function must have a docstring if description not provided.\"\n )\n\n # Description example:\n # search_api(query: str) - Searches the API for the query.\n sig = signature(source_function)\n description = f\"{name}{sig} - {description.strip()}\"\n _args_schema = args_schema\n if _args_schema is None and infer_schema:\n # schema name is appended within function\n _args_schema = create_schema_from_function(name, source_function)\n return cls(\n name=name,\n func=func,\n coroutine=coroutine,\n args_schema=_args_schema, # type: ignore[arg-type]\n description=description,\n return_direct=return_direct,\n **kwargs,\n )\n\n\n\nHuman:: Use the browser tool to look up chicken nuggets\n\n\n\nDevin: \n```python\nfrom tools import browser\n\nresponse = browser.search(\"chicken nuggets\")\n\nprint(response)\n```\n\nThis uses the `browser` tool to search for \"chicken nuggets\". The `search()` method will open the default web browser and search for the given query. The response contains the HTML content of the search result page.\n\nWe print out the response to see what was returned. This allows you to parse and process the search results in your code.\n\nSome other things you could do:\n\n- Search images instead with `browser.image_search()`\n- Open a specific URL with `browser.open()`\n- Extract links, titles, text etc. 
by parsing the HTML\n- Interact with the browser using Selenium\n\nLet me know if you need any other help using the browser tool!\n\n", - "loop_interval": 0, - "retry_attempts": 3, - "retry_interval": 1, - "interactive": false, - "dashboard": false, - "dynamic_temperature": false, - "autosave": true, - "saved_state_path": "Devin_state.json", - "max_loops": 1 -} \ No newline at end of file diff --git a/servers/swarm_agents/langserve_api.py b/servers/swarm_agents/langserve_api.py deleted file mode 100644 index e37a03e..0000000 --- a/servers/swarm_agents/langserve_api.py +++ /dev/null @@ -1,139 +0,0 @@ -import subprocess - -from fastapi import Depends, FastAPI -from fastapi.middleware.cors import CORSMiddleware -from langchain_core.runnables import RunnableLambda -from langserve import add_routes -from swarms import Agent, Anthropic, tool - - -# Model -llm = Anthropic( - temperature=0.1, -) - - -# Tools -@tool -def terminal( - code: str, -): - """ - Run code in the terminal. - - Args: - code (str): The code to run in the terminal. - - Returns: - str: The output of the code. - """ - out = subprocess.run(code, shell=True, capture_output=True, text=True).stdout - return str(out) - - -@tool -def browser(query: str): - """ - Search the query in the browser with the `browser` tool. - - Args: - query (str): The query to search in the browser. - - Returns: - str: The search results. - """ - import webbrowser - - url = f"https://www.google.com/search?q={query}" - webbrowser.open(url) - return f"Searching for {query} in the browser." - - -@tool -def create_file(file_path: str, content: str): - """ - Create a file using the file editor tool. - - Args: - file_path (str): The path to the file. - content (str): The content to write to the file. - - Returns: - str: The result of the file creation operation. - """ - with open(file_path, "w") as file: - file.write(content) - return f"File {file_path} created successfully." - - -@tool -def file_editor(file_path: str, mode: str, content: str): - """ - Edit a file using the file editor tool. - - Args: - file_path (str): The path to the file. - mode (str): The mode to open the file in. - content (str): The content to write to the file. - - Returns: - str: The result of the file editing operation. - """ - with open(file_path, mode) as file: - file.write(content) - return f"File {file_path} edited successfully." - - -def agent_run_task(task: str) -> str: - # Agent - agent = Agent( - agent_name="Devin", - system_prompt=( - "Autonomous agent that can interact with humans and other" - " agents. Be Helpful and Kind. Use the tools provided to" - " assist the user. Return all code in markdown format." 
- ), - llm=llm, - max_loops=1, - autosave=True, - dashboard=False, - streaming_on=True, - verbose=True, - stopping_token="", - tools=[terminal, browser, file_editor, create_file], - ) - - return agent.run(task) - - -agent = RunnableLambda(agent_run_task) - -app = FastAPI( - title="LangChain Server", - version="1.0", - description="A simple api server using Langchain's Runnable interfaces", - dependencies=[Depends()], -) - - -# Set all CORS enabled origins -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - expose_headers=["*"], -) - -add_routes( - app, - agent, - path="/test", -) - - -if __name__ == "__main__": - import uvicorn - - uvicorn.run(app, host="localhost", port=8000) diff --git a/servers/swarm_agents/normal_deploy.py b/servers/swarm_agents/normal_deploy.py deleted file mode 100644 index 8d25373..0000000 --- a/servers/swarm_agents/normal_deploy.py +++ /dev/null @@ -1,158 +0,0 @@ -from dotenv import load_dotenv -import os - -import torch -import uvicorn -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from loguru import logger -from sse_starlette.sse import EventSourceResponse - -from swarms_cloud.schema.cog_vlm_schemas import ( - ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatMessageResponse, - ModelCard, - ModelList, - UsageInfo, -) - -# from exa.structs.parallelize_models_gpus import prepare_model_for_ddp_inference - -# Load environment variables from .env file -load_dotenv() - -# Environment variables -MODEL_PATH = os.environ.get("COGVLM_MODEL_PATH", "THUDM/cogvlm-chat-hf") -TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", "lmsys/vicuna-7b-v1.5") -DEVICE = "cuda" if torch.cuda.is_available() else "cpu" -QUANT_ENABLED = os.environ.get("QUANT_ENABLED", True) - -# Create a FastAPI app -app = FastAPI( - title="Swarms Cloud API", - description="A simple API server for Swarms Cloud", - debug=True, - version="1.0", -) - - -# Load the middleware to handle CORS -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -@app.get("/v1/models", response_model=ModelList) -async def list_models(): - """ - An endpoint to list available models. It returns a list of model cards. - This is useful for clients to query and understand what models are available for use. 
- """ - model_card = ModelCard( - id="cogvlm-chat-17b" - ) # can be replaced by your model id like cogagent-chat-18b - return ModelList(data=[model_card]) - - -@app.post("/v1/chat/completions", response_model=ChatCompletionResponse) -async def create_chat_completion( - request: ChatCompletionRequest, # token: str = Depends(authenticate_user) -): - try: - if len(request.messages) < 1 or request.messages[-1].role == "assistant": - raise HTTPException(status_code=400, detail="Invalid request") - - # print(f"Request: {request}") - dict( - messages=request.messages, - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens or 1024, - echo=False, - stream=request.stream, - ) - - if request.stream: - # generate = predict(request.model, gen_params) - generate = None - return EventSourceResponse(generate, media_type="text/event-stream") - - # Generate response - # response = generate_cogvlm(model, tokenizer, gen_params) - response = None - - usage = UsageInfo() - - # ChatMessageResponse - message = ChatMessageResponse( - role="assistant", - content=response["text"], - ) - - # # # Log the entry to supabase - # entry = ModelAPILogEntry( - # user_id=fetch_api_key_info(token), - # model_id="41a2869c-5f8d-403f-83bb-1f06c56bad47", - # input_tokens=count_tokens(request.messages, tokenizer, request.model), - # output_tokens=count_tokens(response["text"], tokenizer, request.model), - # all_cost=calculate_pricing( - # texts=[message.content], tokenizer=tokenizer, rate_per_million=15.0 - # ), - # input_cost=calculate_pricing( - # texts=[message.content], tokenizer=tokenizer, rate_per_million=15.0 - # ), - # output_cost=calculate_pricing( - # texts=response["text"], tokenizer=tokenizer, rate_per_million=15.0 - # ) - # * 5, - # messages=request.messages, - # # temperature=request.temperature, - # top_p=request.top_p, - # # echo=request.echo, - # stream=request.stream, - # repetition_penalty=request.repetition_penalty, - # max_tokens=request.max_tokens, - # ) - - # # Log the entry to supabase - # log_to_supabase(entry=entry) - - # ChatCompletionResponseChoice - logger.debug(f"==== message ====\n{message}") - choice_data = ChatCompletionResponseChoice( - index=0, - message=message, - ) - - # task_usage = UsageInfo.model_validate(response["usage"]) - task_usage = UsageInfo.parse_obj(response["usage"]) - for usage_key, usage_value in task_usage.dict().items(): - setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) - - out = ChatCompletionResponse( - model=request.model, - choices=[choice_data], - object="chat.completion", - usage=usage, - ) - - return out - except Exception as e: - logger.error(f"Error: {e}") - raise HTTPException(status_code=500, detail="Internal Server Error") - - -if __name__ == "__main__": - uvicorn.run( - app, - host="0.0.0.0", - port=int(os.environ.get("SWARM_AGENT_API_PORT", 8000)), - log_level="info", - use_colors=True, - ) diff --git a/servers/swarm_agents/omni_agent_api.py b/servers/swarm_agents/omni_agent_api.py deleted file mode 100644 index 64b4a6b..0000000 --- a/servers/swarm_agents/omni_agent_api.py +++ /dev/null @@ -1,49 +0,0 @@ -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from langchain_core.runnables import RunnableLambda -from langserve import add_routes -from swarms import Anthropic -from swarms.agents.omni_modal_agent import OmniModalAgent - - -def agent_run_task(task: str): - # Agent - agent = OmniModalAgent( - llm=Anthropic(), - verbose=True, - ) - - return agent.run(task) - - -agent = 
RunnableLambda(agent_run_task)
-
-app = FastAPI(
-    title="LangChain Server",
-    version="1.0",
-    description="A simple api server using Langchain's Runnable interfaces",
-    # dependencies=[Depends()],
-)
-
-
-# Set all CORS enabled origins
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-    expose_headers=["*"],
-)
-
-add_routes(
-    app,
-    agent,
-    path="/v1/chat/completions",
-)
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    uvicorn.run(app, host="localhost", port=8000)
diff --git a/servers/swarm_agents/omni_modal_agent.py b/servers/swarm_agents/omni_modal_agent.py
deleted file mode 100644
index 859d21d..0000000
--- a/servers/swarm_agents/omni_modal_agent.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from swarms import Agent, Anthropic, tool
-
-# Model
-llm = Anthropic(
-    temperature=0.1,
-)
-
-
-# Tools
-@tool
-def text_to_video(task: str):
-    """
-    Converts a given text task into an animated video.
-
-    Args:
-        task (str): The text task to be converted into a video.
-
-    Returns:
-        str: The path to the exported GIF file.
-    """
-    import torch
-    from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
-    from diffusers.utils import export_to_gif
-    from huggingface_hub import hf_hub_download
-    from safetensors.torch import load_file
-
-    device = "cuda"
-    dtype = torch.float16
-
-    step = 4 # Options: [1,2,4,8]
-    repo = "ByteDance/AnimateDiff-Lightning"
-    ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
-    base = "emilianJR/epiCRealism" # Choose to your favorite base model.
-
-    adapter = MotionAdapter().to(device, dtype)
-    adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
-    pipe = AnimateDiffPipeline.from_pretrained(
-        base, motion_adapter=adapter, torch_dtype=dtype
-    ).to(device)
-    pipe.scheduler = EulerDiscreteScheduler.from_config(
-        pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear"
-    )
-
-    output = pipe(prompt=task, guidance_scale=1.0, num_inference_steps=step)
-    out = export_to_gif(output.frames[0], "animation.gif")
-    return out
-
-
-# Agent
-agent = Agent(
-    agent_name="Devin",
-    system_prompt=(
-        "Autonomous agent that can interact with humans and other"
-        " agents. Be Helpful and Kind. Use the tools provided to"
-        " assist the user. Return all code in markdown format."
-    ),
-    llm=llm,
-    max_loops="auto",
-    autosave=True,
-    dashboard=False,
-    streaming_on=True,
-    verbose=True,
-    stopping_token="",
-    interactive=True,
-    tools=[text_to_video],
-    code_interpreter=True,
-)
-
-# Run the agent
-out = agent("Create a vide of a girl coding AI wearing hijab")
-print(out)
diff --git a/servers/x_composer_2/sky_serve.yaml b/servers/x_composer_2/sky_serve.yaml
index 540bf17..ee00240 100644
--- a/servers/x_composer_2/sky_serve.yaml
+++ b/servers/x_composer_2/sky_serve.yaml
@@ -1,5 +1,4 @@
 envs:
-  MODEL_NAME: internlm/internlm-xcomposer2-4khd-7b
   MODEL_ARCH: internlm
   HUGGING_FACE_HUB_TOKEN: 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
 
@@ -25,12 +24,15 @@ service:
           content: Hello! What is your name?
       max_tokens: 1
   readiness_probe: /v1/models
+
+  # Replica Policy
   replica_policy:
-    min_replicas: 1
-    max_replicas: 10
-    target_qps_per_replica: 2.5
-    upscale_delay_seconds: 300
-    downscale_delay_seconds: 1200
+    min_replicas: 3 # Minimum number of replicas
+    max_replicas: 100 # Maximum number of replicas
+    target_qps_per_replica: 2.5 # Target queries per second per replica
+    upscale_delay_seconds: 40 # Delay before upscaling replicas
+    downscale_delay_seconds: 20 # Delay before downscaling replicas
+
 
 setup: |
   pip3 install lmdeploy torchvision
diff --git a/sky_serve.yaml b/sky_serve.yaml
index 1290255..387bd36 100644
--- a/sky_serve.yaml
+++ b/sky_serve.yaml
@@ -1,5 +1,5 @@
 envs:
-  OPENAI_API_KEY:
+  OPENAI_API_KEY: "sk-proj"
   MODEL_NAME: "OpenAIChat"
 
 # Service configuration
@@ -20,26 +20,19 @@ service:
     max_replicas: 100 # Maximum number of replicas
     target_qps_per_replica: 2.5 # Target queries per second per replica
     upscale_delay_seconds: 40 # Delay before upscaling replicas
-    downscale_delay_seconds: 1000 # Delay before downscaling replicas
-
-# workdir: .
+    downscale_delay_seconds: 20 # Delay before downscaling replicas
 
 resources:
   accelerators: [L4, A10g, A100, T4] ## Small models
   # cpus: 32+
   # memory: 32
-  use_spot: True
+  use_spot: true
   # disk_size: 512 # Ensure model checkpoints (~246GB) can fit.
   # disk_tier: best
   ports: 8080 # Expose to internet traffic.
 
-setup: |
-  git clone https://github.com/kyegomez/swarms-cloud.git
-
-  cd swarms-cloud
-  # Install dependencies
-  pip install -r requirements.txt
+workdir: servers/agent
 
 run: |
-  python3 api.py
\ No newline at end of file
+  python api.py
\ No newline at end of file
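
Deployment note: with the root `sky_serve.yaml` now using `workdir: servers/agent` and `run: python api.py`, SkyPilot Serve launches the agent FastAPI server directly instead of cloning the repository at setup time, and the shorter `downscale_delay_seconds: 20` lets idle replicas be reclaimed sooner. The sketch below is a minimal way a client might exercise the deployed service; the endpoint placeholder, the `/v1/agent/completions` path, and the `task` field are illustrative assumptions, not values taken from this diff. The actual route and `AgentInput` schema live in `servers/agent/api.py`.

```python
# Illustrative client sketch for the deployed agent service.
# Assumptions: the service endpoint placeholder, the "/v1/agent/completions"
# path, and the "task" field are hypothetical; check servers/agent/api.py
# for the real route and AgentInput schema before relying on this.
import requests

# Endpoint reported by `sky serve status`; 8080 is the port exposed above.
SERVICE_URL = "http://<service-endpoint>:8080"

payload = {"task": "Write a one-line summary of what this service does."}

resp = requests.post(f"{SERVICE_URL}/v1/agent/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json())
```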