From f059388b26b834f5ea45beb6e5f00f940b5050d9 Mon Sep 17 00:00:00 2001
From: josh
Date: Thu, 18 Sep 2025 21:49:46 +0000
Subject: [PATCH] fixes

---
 agent/agent.py | 103 +++++++++++++++++++++++++------------------------
 stack.yml      |  50 +++++++++++-------------
 2 files changed, 76 insertions(+), 77 deletions(-)

diff --git a/agent/agent.py b/agent/agent.py
index 71337a0..05a41f9 100644
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -1,17 +1,15 @@
-
-import os
-import re
-import yaml
-import json
+import os, re, yaml
 from fastapi import FastAPI, Request, HTTPException
 from pydantic import BaseModel
-import docker
 
-# Basic AI Ops Agent for Docker Swarm
+try:
+    import docker
+except Exception:
+    docker = None
+
 PORT = int(os.getenv("PORT", "8080"))
 GUARD_DEFAULT_MAX = 25
 
-# Load rules file if present
 RULES_PATH = "rules.yaml"
 if os.path.exists(RULES_PATH):
     with open(RULES_PATH, "r") as f:
@@ -22,88 +20,104 @@ else:
 ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
 MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
 
-# Docker client using the socket
-client = docker.DockerClient(base_url='unix://var/run/docker.sock')
+DOCKER_SOCK = "/var/run/docker.sock"
+_client = None
+
+def get_client():
+    global _client
+    if _client is not None:
+        return _client
+    if docker is None:
+        raise HTTPException(status_code=500, detail="docker SDK not available in image.")
+    if not os.path.exists(DOCKER_SOCK):
+        raise HTTPException(status_code=500, detail=f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
+    try:
+        _client = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
+        _client.ping()
+        return _client
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Cannot connect to Docker at {DOCKER_SOCK}: {e}")
 
 app = FastAPI(title="BurnServ AI Ops Agent")
 
-
 def _guard_service(service_name: str):
     if not ALLOWED_REGEX.match(service_name):
         raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
 
-
 def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
+    cli = get_client()
     _guard_service(service_name)
     try:
-        svc = client.services.get(service_name)
-    except docker.errors.NotFound:
+        svc = cli.services.get(service_name)
+    except Exception:
         raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
 
     spec = svc.attrs.get('Spec', {}).copy()
     mode = spec.get('Mode', {})
     replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
-
     if replicas is None:
-        # step-based scaling
         tgt = int(replicas_current) + int(step or 1)
         if min_replicas is not None:
             tgt = max(tgt, int(min_replicas))
         if max_replicas is not None:
             tgt = min(tgt, int(max_replicas))
         replicas = tgt
-
     replicas = max(1, min(int(replicas), MAX_SCALE))
     mode['Replicated'] = {'Replicas': replicas}
     spec['Mode'] = mode
-
-    # svc.update expects keyword args matching the service spec shape; pass the full spec with update
     try:
         svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
     except Exception:
-        # Fallback: use the update with the raw spec (works in many docker-py versions)
         svc.update(**spec)
 
     return {"service": service_name, "replicas": replicas}
 
-
 def _restart_service(service_name: str):
+    cli = get_client()
     _guard_service(service_name)
     try:
-        svc = client.services.get(service_name)
-    except docker.errors.NotFound:
+        svc = cli.services.get(service_name)
+    except Exception:
         raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
 
     spec = svc.attrs.get('Spec', {}).copy()
-    # Force a rolling update by bumping ForceUpdate / Version index
     try:
         current_index = svc.attrs.get('Version', {}).get('Index', 0)
         svc.update(**spec, force_update=current_index + 1)
     except Exception:
-        # If update signature differs, try a simple update
         svc.update(**spec)
     return {"service": service_name, "status": "rolling-restart-issued"}
 
-
 class Command(BaseModel):
     text: str = ""
     action: str | None = None
     params: dict | None = None
 
-
 @app.get("/health")
 def health():
     return {"ok": True}
 
+@app.get("/diagnostics")
+def diagnostics():
+    info = {
+        "docker_sock_exists": os.path.exists(DOCKER_SOCK),
+        "docker_sock_path": DOCKER_SOCK,
+        "uid": os.getuid() if hasattr(os, "getuid") else "n/a",
+        "gid": os.getgid() if hasattr(os, "getgid") else "n/a",
+        "env_PORT": PORT,
+        "rules_loaded": bool(RULES),
+    }
+    try:
+        cli = get_client()
+        info["docker_ping"] = True
+        info["server_version"] = cli.version()
+    except HTTPException as he:
+        info["docker_ping"] = False
+        info["error"] = he.detail
+    except Exception as e:
+        info["docker_ping"] = False
+        info["error"] = str(e)
+    return info
+
 @app.post("/command")
 def command(cmd: Command):
-    """Accepts either structured commands or simple text instructions.
-
-    Structured example:
-    {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
-
-    Text example:
-    {"text":"scale weblabs_php to 3"}
-    """
-    # Structured commands first
     if cmd.action:
         if cmd.action == "scale":
             p = cmd.params or {}
@@ -115,16 +129,13 @@ def command(cmd: Command):
             if "service" not in p:
                 raise HTTPException(status_code=400, detail="Missing service param for restart_service")
             return _restart_service(p["service"])
-        else:
-            raise HTTPException(status_code=400, detail="Unknown action")
+        raise HTTPException(status_code=400, detail="Unknown action")
 
-    # Free-text parsing (simple)
     t = (cmd.text or "").strip().lower()
     if not t:
         raise HTTPException(status_code=400, detail="Empty command")
 
     if t.startswith("scale "):
-        # "scale weblabs_php to 3"
         parts = t.split()
         try:
             svc = parts[1]
@@ -132,13 +143,10 @@ def command(cmd: Command):
                 idx_to = parts.index("to")
                 reps = int(parts[idx_to + 1])
                 return _scale(svc, replicas=reps)
-            else:
-                # if no explicit amount, treat as step +1
-                return _scale(svc, step=1)
+            return _scale(svc, step=1)
         except Exception:
             raise HTTPException(status_code=400, detail="Format: 'scale <service> to <replicas>'")
-    elif t.startswith("restart "):
-        # "restart weblabs_php"
+    if t.startswith("restart "):
         try:
             svc = t.split()[1]
             return _restart_service(svc)
@@ -147,17 +155,13 @@ def command(cmd: Command):
 
     raise HTTPException(status_code=400, detail="Unrecognized command")
 
-
 @app.post("/alert")
 async def alert(request: Request):
-    """Receive Alertmanager webhooks and execute configured actions based on rules.yaml."""
     payload = await request.json()
     alerts = payload.get("alerts", [])
     executed = []
-
     for a in alerts:
         labels = a.get("labels", {}) or {}
-        # For each rule in RULES, check if the match conditions apply
         for rule in RULES.get("alerts", []):
             match = rule.get("match", {})
             if all(labels.get(k) == v for k, v in match.items()):
@@ -178,5 +182,4 @@ async def alert(request: Request):
                     svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
                     res = _restart_service(svc)
                     executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
-
     return {"executed": executed}
diff --git a/stack.yml b/stack.yml
index 3f0d78a..be91162 100644
--- a/stack.yml
+++ b/stack.yml
@@ -19,18 +19,16 @@ secrets:
 
 services:
   ai-agent:
-    build:
-      context: ./agent
-    image: burnserv/ai-agent:latest
+    image: hostlabs/ai-agent:latest
     networks: [opsNet]
     ports:
-      - "8080:8080"
+      - "8080:8080" # expose only if you want host access; remove if internal-only
     deploy:
       mode: replicated
       replicas: 1
       placement:
         constraints:
-          - node.role == manager
+          - node.role == manager # must be on a manager to control Swarm via docker.sock
       labels:
         - "ai.agent=true"
     volumes:
@@ -39,6 +37,26 @@ services:
       - source: rules.yaml
         target: /app/rules.yaml
 
+  relay:
+    image: hostlabs/ai-relay:latest
+    networks: [opsNet]
+    depends_on: [ai-agent]
+    environment:
+      - OPENAI_MODEL=gpt-4o-mini
+      - AGENT_URL=http://ai-agent:8080 # DNS name on opsNet
+      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
+    secrets:
+      - source: openai_api_key
+        target: openai_api_key
+    ports:
+      - "8090:8090"
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.role == manager
+
   prometheus:
     image: prom/prometheus:v2.55.0
     networks: [opsNet]
@@ -95,25 +113,3 @@ services:
       - --path.rootfs=/host
     volumes:
       - /:/host:ro,rslave
-
-  relay:
-    build:
-      context: ./relay
-    image: burnserv/ai-relay:latest
-    networks: [opsNet]
-    depends_on: [ai-agent]
-    deploy:
-      mode: replicated
-      replicas: 1
-      placement:
-        constraints:
-          - node.role == manager
-    environment:
-      - OPENAI_MODEL=gpt-4o-mini
-      - AGENT_URL=http://ai-agent:8080
-      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
-    secrets:
-      - source: openai_api_key
-        target: openai_api_key
-    ports:
-      - "8090:8090"
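
Quick way to exercise the new /diagnostics endpoint and both /command forms once the stack is up. This is a minimal sketch using only the Python standard library; it assumes the agent's published port 8080 is reachable on localhost and reuses the weblabs_php service name from the old docstring example, so adjust both for your environment.

# smoke_test_agent.py - minimal sketch; assumes the agent's published port 8080
# is reachable on localhost and that 'weblabs_php' (the example service from
# the removed docstring) exists and passes the guardrail regex.
import json
import urllib.request

AGENT = "http://localhost:8080"

def call(path, payload=None):
    # GET when payload is None, otherwise POST the payload as JSON.
    data = json.dumps(payload).encode() if payload is not None else None
    headers = {"Content-Type": "application/json"} if data else {}
    req = urllib.request.Request(f"{AGENT}{path}", data=data, headers=headers)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode())

print(call("/health"))       # expect {'ok': True}
print(call("/diagnostics"))  # socket presence, docker ping, rules_loaded, ...
print(call("/command", {"action": "scale",                       # structured form
                        "params": {"service": "weblabs_php", "replicas": 3}}))
print(call("/command", {"text": "scale weblabs_php to 3"}))       # free-text form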
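
The /alert handler only inspects each alert's labels, so it can be exercised without a running Alertmanager. A hand-built webhook sketch follows; the label values are placeholders, and nothing executes unless they match a rule in rules.yaml.

# post_test_alert.py - hand-built Alertmanager-style webhook for /alert.
# 'alertname' and 'service_name' are the label keys the handler reads; the
# values below are placeholders, so expect {"executed": []} unless a rule in
# rules.yaml matches them.
import json
import urllib.request

payload = {"alerts": [{"labels": {"alertname": "HighCPU",
                                  "service_name": "weblabs_php"}}]}

req = urllib.request.Request(
    "http://localhost:8080/alert",           # assumes the published agent port
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read().decode()))  # {"executed": [...]}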