From f059388b26b834f5ea45beb6e5f00f940b5050d9 Mon Sep 17 00:00:00 2001
From: josh
Date: Thu, 18 Sep 2025 21:49:46 +0000
Subject: [PATCH] fixes

---
 agent/agent.py | 103 +++++++++++++++++++++++++------------------------
 stack.yml      |  50 +++++++++++-------------
 2 files changed, 76 insertions(+), 77 deletions(-)

diff --git a/agent/agent.py b/agent/agent.py
index 71337a0..05a41f9 100644
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -1,17 +1,15 @@
-
-import os
-import re
-import yaml
-import json
+import os, re, yaml
 from fastapi import FastAPI, Request, HTTPException
 from pydantic import BaseModel
-import docker
 
-# Basic AI Ops Agent for Docker Swarm
+try:
+    import docker
+except Exception:
+    docker = None
+
 PORT = int(os.getenv("PORT", "8080"))
 GUARD_DEFAULT_MAX = 25
 
-# Load rules file if present
 RULES_PATH = "rules.yaml"
 if os.path.exists(RULES_PATH):
     with open(RULES_PATH, "r") as f:
@@ -22,88 +20,104 @@ else:
 ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
 MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
 
-# Docker client using the socket
-client = docker.DockerClient(base_url='unix://var/run/docker.sock')
+DOCKER_SOCK = "/var/run/docker.sock"
+_client = None
+
+def get_client():
+    global _client
+    if _client is not None:
+        return _client
+    if docker is None:
+        raise HTTPException(status_code=500, detail="docker SDK not available in image.")
+    if not os.path.exists(DOCKER_SOCK):
+        raise HTTPException(status_code=500, detail=f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
+    try:
+        _client = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
+        _client.ping()
+        return _client
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Cannot connect to Docker at {DOCKER_SOCK}: {e}")
 
 app = FastAPI(title="BurnServ AI Ops Agent")
 
-
 def _guard_service(service_name: str):
     if not ALLOWED_REGEX.match(service_name):
         raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
 
-
 def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
+    cli = get_client()
     _guard_service(service_name)
     try:
-        svc = client.services.get(service_name)
-    except docker.errors.NotFound:
+        svc = cli.services.get(service_name)
+    except Exception:
         raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
 
     spec = svc.attrs.get('Spec', {}).copy()
     mode = spec.get('Mode', {})
     replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
-
     if replicas is None:
-        # step-based scaling
         tgt = int(replicas_current) + int(step or 1)
         if min_replicas is not None:
             tgt = max(tgt, int(min_replicas))
         if max_replicas is not None:
             tgt = min(tgt, int(max_replicas))
         replicas = tgt
-
     replicas = max(1, min(int(replicas), MAX_SCALE))
     mode['Replicated'] = {'Replicas': replicas}
     spec['Mode'] = mode
-
-    # svc.update expects keyword args matching the service spec shape; pass the full spec with update
     try:
         svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
     except Exception:
-        # Fallback: use the update with the raw spec (works in many docker-py versions)
         svc.update(**spec)
 
     return {"service": service_name, "replicas": replicas}
 
-
 def _restart_service(service_name: str):
+    cli = get_client()
     _guard_service(service_name)
     try:
-        svc = client.services.get(service_name)
-    except docker.errors.NotFound:
+        svc = cli.services.get(service_name)
+    except Exception:
         raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
 
     spec = svc.attrs.get('Spec', {}).copy()
-    # Force a rolling update by bumping ForceUpdate / Version index
     try:
         current_index = svc.attrs.get('Version', {}).get('Index', 0)
         svc.update(**spec, force_update=current_index + 1)
     except Exception:
-        # If update signature differs, try a simple update
         svc.update(**spec)
     return {"service": service_name, "status": "rolling-restart-issued"}
 
-
 class Command(BaseModel):
     text: str = ""
     action: str | None = None
     params: dict | None = None
 
-
 @app.get("/health")
 def health():
     return {"ok": True}
 
+@app.get("/diagnostics")
+def diagnostics():
+    info = {
+        "docker_sock_exists": os.path.exists(DOCKER_SOCK),
+        "docker_sock_path": DOCKER_SOCK,
+        "uid": os.getuid() if hasattr(os, "getuid") else "n/a",
+        "gid": os.getgid() if hasattr(os, "getgid") else "n/a",
+        "env_PORT": PORT,
+        "rules_loaded": bool(RULES),
+    }
+    try:
+        cli = get_client()
+        info["docker_ping"] = True
+        info["server_version"] = cli.version()
+    except HTTPException as he:
+        info["docker_ping"] = False
+        info["error"] = he.detail
+    except Exception as e:
+        info["docker_ping"] = False
+        info["error"] = str(e)
+    return info
+
 @app.post("/command")
 def command(cmd: Command):
-    """Accepts either structured commands or simple text instructions.
-
-    Structured example:
-    {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
-
-    Text example:
-    {"text":"scale weblabs_php to 3"}
-    """
-    # Structured commands first
     if cmd.action:
         if cmd.action == "scale":
             p = cmd.params or {}
@@ -115,16 +129,13 @@ def command(cmd: Command):
             if "service" not in p:
                 raise HTTPException(status_code=400, detail="Missing service param for restart_service")
             return _restart_service(p["service"])
-        else:
-            raise HTTPException(status_code=400, detail="Unknown action")
+        raise HTTPException(status_code=400, detail="Unknown action")
 
-    # Free-text parsing (simple)
     t = (cmd.text or "").strip().lower()
     if not t:
         raise HTTPException(status_code=400, detail="Empty command")
 
     if t.startswith("scale "):
-        # "scale weblabs_php to 3"
         parts = t.split()
         try:
             svc = parts[1]
@@ -132,13 +143,10 @@ def command(cmd: Command):
                 idx_to = parts.index("to")
                 reps = int(parts[idx_to + 1])
                 return _scale(svc, replicas=reps)
-            else:
-                # if no explicit amount, treat as step +1
-                return _scale(svc, step=1)
+            return _scale(svc, step=1)
         except Exception:
             raise HTTPException(status_code=400, detail="Format: 'scale <service> to <replicas>'")
-    elif t.startswith("restart "):
-        # "restart weblabs_php"
+    if t.startswith("restart "):
         try:
             svc = t.split()[1]
             return _restart_service(svc)
@@ -147,17 +155,13 @@ def command(cmd: Command):
 
     raise HTTPException(status_code=400, detail="Unrecognized command")
 
-
 @app.post("/alert")
 async def alert(request: Request):
-    """Receive Alertmanager webhooks and execute configured actions based on rules.yaml."""
     payload = await request.json()
     alerts = payload.get("alerts", [])
     executed = []
-
     for a in alerts:
         labels = a.get("labels", {}) or {}
-        # For each rule in RULES, check if the match conditions apply
         for rule in RULES.get("alerts", []):
             match = rule.get("match", {})
             if all(labels.get(k) == v for k, v in match.items()):
@@ -178,5 +182,4 @@ async def alert(request: Request):
                     svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
                     res = _restart_service(svc)
                     executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
-
     return {"executed": executed}
diff --git a/stack.yml b/stack.yml
index 3f0d78a..be91162 100644
--- a/stack.yml
+++ b/stack.yml
@@ -19,18 +19,16 @@ secrets:
 
 services:
   ai-agent:
-    build:
-      context: ./agent
-    image: burnserv/ai-agent:latest
+    image: hostlabs/ai-agent:latest
     networks: [opsNet]
     ports:
-      - "8080:8080"
+      - "8080:8080" # expose only if you want host access; remove if internal-only
     deploy:
       mode: replicated
       replicas: 1
       placement:
         constraints:
-          - node.role == manager
+          - node.role == manager # must be on a manager to control Swarm via docker.sock
       labels:
         - "ai.agent=true"
     volumes:
@@ -39,6 +37,26 @@ services:
       - source: rules.yaml
         target: /app/rules.yaml
 
+  relay:
+    image: hostlabs/ai-relay:latest
+    networks: [opsNet]
+    depends_on: [ai-agent]
+    environment:
+      - OPENAI_MODEL=gpt-4o-mini
+      - AGENT_URL=http://ai-agent:8080 # DNS name on opsNet
+      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
+    secrets:
+      - source: openai_api_key
+        target: openai_api_key
+    ports:
+      - "8090:8090"
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.role == manager
+
   prometheus:
     image: prom/prometheus:v2.55.0
     networks: [opsNet]
@@ -95,25 +113,3 @@ services:
       - --path.rootfs=/host
     volumes:
       - /:/host:ro,rslave
-
-  relay:
-    build:
-      context: ./relay
-    image: burnserv/ai-relay:latest
-    networks: [opsNet]
-    depends_on: [ai-agent]
-    deploy:
-      mode: replicated
-      replicas: 1
-      placement:
-        constraints:
-          - node.role == manager
-    environment:
-      - OPENAI_MODEL=gpt-4o-mini
-      - AGENT_URL=http://ai-agent:8080
-      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
-    secrets:
-      - source: openai_api_key
-        target: openai_api_key
-    ports:
-      - "8090:8090"
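
Quick way to exercise the new /diagnostics endpoint and both /command forms once the stack is up. This is a minimal sketch using only the Python standard library; it assumes the agent's published port 8080 is reachable on localhost and reuses the weblabs_php service name from the old docstring example, so adjust both for your environment.

# smoke_test_agent.py - minimal sketch; assumes the agent's published port 8080
# is reachable on localhost and that 'weblabs_php' (the example service from
# the removed docstring) exists and passes the guardrail regex.
import json
import urllib.request

AGENT = "http://localhost:8080"

def call(path, payload=None):
    # GET when payload is None, otherwise POST the payload as JSON.
    data = json.dumps(payload).encode() if payload is not None else None
    headers = {"Content-Type": "application/json"} if data else {}
    req = urllib.request.Request(f"{AGENT}{path}", data=data, headers=headers)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode())

print(call("/health"))       # expect {'ok': True}
print(call("/diagnostics"))  # socket presence, docker ping, rules_loaded, ...
print(call("/command", {"action": "scale",                       # structured form
                        "params": {"service": "weblabs_php", "replicas": 3}}))
print(call("/command", {"text": "scale weblabs_php to 3"}))       # free-text form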
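
The /alert handler only inspects each alert's labels, so it can be exercised without a running Alertmanager. A hand-built webhook sketch follows; the label values are placeholders, and nothing executes unless they match a rule in rules.yaml.

# post_test_alert.py - hand-built Alertmanager-style webhook for /alert.
# 'alertname' and 'service_name' are the label keys the handler reads; the
# values below are placeholders, so expect {"executed": []} unless a rule in
# rules.yaml matches them.
import json
import urllib.request

payload = {"alerts": [{"labels": {"alertname": "HighCPU",
                                  "service_name": "weblabs_php"}}]}

req = urllib.request.Request(
    "http://localhost:8080/alert",           # assumes the published agent port
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read().decode()))  # {"executed": [...]}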