fixes

2025-09-18 21:49:46 +00:00
parent db3c2336ac
commit f059388b26
2 changed files with 76 additions and 77 deletions
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -1,17 +1,15 @@
-
+import os, re, yaml
 import os
 import re
 import yaml
 import json
 from fastapi import FastAPI, Request, HTTPException
 from pydantic import BaseModel
 import docker
-# Basic AI Ops Agent for Docker Swarm
+try:
    import docker
 except Exception:
    docker = None
 PORT = int(os.getenv("PORT", "8080"))
 GUARD_DEFAULT_MAX = 25
 # Load rules file if present
 RULES_PATH = "rules.yaml"
 if os.path.exists(RULES_PATH):
    with open(RULES_PATH, "r") as f:
@@ -22,88 +20,104 @@ else:
 ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
 MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
-# Docker client using the socket
+DOCKER_SOCK = "/var/run/docker.sock"
-client = docker.DockerClient(base_url='unix://var/run/docker.sock')
+_client = None
 # Return the process-wide Docker client, creating and verifying it on first use.
 #
 # The client is cached in the module-level `_client` only after ping() has
 # succeeded, so a failed connection attempt is retried on the next call
 # instead of leaving an unverified client in the cache.
 #
 # Raises HTTPException(500) when the module-level `docker` name is None,
 # when DOCKER_SOCK does not exist on disk, or when creating/pinging the
 # client fails for any reason.
 def get_client():
    global _client
    if _client is not None:
        return _client
    if docker is None:
        raise HTTPException(status_code=500, detail="docker SDK not available in image.")
    if not os.path.exists(DOCKER_SOCK):
        raise HTTPException(status_code=500, detail=f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
    try:
        # Hold the new client in a local until ping() proves it works; binding
        # the global first would cache a broken client after a failed ping.
        candidate = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
        candidate.ping()
    except Exception as e:
        # Chain the original error so the traceback keeps the root cause.
        raise HTTPException(status_code=500, detail=f"Cannot connect to Docker at {DOCKER_SOCK}: {e}") from e
    _client = candidate
    return _client
 # FastAPI application object; the @app.get / @app.post decorators in this file register their handlers on it.
 app = FastAPI(title="BurnServ AI Ops Agent")
 # Guardrail check: let the call through only when ALLOWED_REGEX matches the
 # service name (re `match`, i.e. anchored at the start of the name only);
 # otherwise reject the request with HTTP 403.
 def _guard_service(service_name: str):
    if ALLOWED_REGEX.match(service_name):
        return
    raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
 # Set the replica count of a Swarm service, either absolutely or by a step.
 #
 # Parameters:
 #   service_name  name passed to _guard_service() and to services.get().
 #   replicas      absolute target; when None the target is derived from `step`.
 #   step          delta added to the current replica count (step mode only);
 #                 None or 0 is treated as +1 because of `step or 1`.
 #   min_replicas  optional lower bound, applied in step mode only.
 #   max_replicas  optional upper bound, applied in step mode only.
 #
 # Returns {"service": <name>, "replicas": <count that was requested from Docker>}.
 #
 # Raises HTTPException(404) when the service lookup fails, plus whatever
 # get_client() / _guard_service() raise. An exception from the fallback
 # svc.update(**spec) call propagates unchanged.
 def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
    # The Docker connection is obtained before the guardrail runs, so a Docker
    # failure is reported even for a service name the guardrail would reject.
    cli = get_client()
    _guard_service(service_name)
    # NOTE(review): the '-'/'+' pairs below are the commit's before/after lines.
    # After the change, any exception from services.get() (not only "not found")
    # is reported to the caller as 404.
    try:
-        svc = client.services.get(service_name)
+        svc = cli.services.get(service_name)
-    except docker.errors.NotFound:
+    except Exception:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    # Shallow copy: nested dicts such as 'Mode' are still shared with svc.attrs,
    # so the mutation of `mode` further down also changes the cached attrs.
    spec = svc.attrs.get('Spec', {}).copy()
    mode = spec.get('Mode', {})
    # Falls back to 1 when the spec has no Mode.Replicated.Replicas entry.
    replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
    if replicas is None:
        # step-based scaling
        tgt = int(replicas_current) + int(step or 1)
        if min_replicas is not None:
            tgt = max(tgt, int(min_replicas))
        if max_replicas is not None:
            tgt = min(tgt, int(max_replicas))
        replicas = tgt
    # Final clamp to [1, MAX_SCALE]; it applies to explicit and step-derived
    # targets alike, so scaling to 0 is impossible and MAX_SCALE overrides a
    # larger max_replicas.
    replicas = max(1, min(int(replicas), MAX_SCALE))
    # Replaces the whole 'Replicated' entry, not just its 'Replicas' key.
    mode['Replicated'] = {'Replicas': replicas}
    spec['Mode'] = mode
    # svc.update expects keyword args matching the service spec shape; pass the full spec with update
    # NOTE(review): whether svc.update() accepts these raw spec keys as keyword
    # arguments depends on the docker SDK version — TODO confirm against its docs.
    try:
        svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
    except Exception:
        # Fallback: use the update with the raw spec (works in many docker-py versions)
        svc.update(**spec)
    return {"service": service_name, "replicas": replicas}
 # Ask Docker to re-deploy a Swarm service by re-submitting its current spec.
 #
 # Parameters:
 #   service_name  name passed to _guard_service() and to services.get().
 #
 # Returns {"service": <name>, "status": "rolling-restart-issued"}; the same
 # status is returned whether the first update call or the fallback ran.
 #
 # Raises HTTPException(404) when the service lookup fails, plus whatever
 # get_client() / _guard_service() raise. An exception from the fallback
 # svc.update(**spec) call propagates unchanged.
 def _restart_service(service_name: str):
    # The Docker connection is obtained before the guardrail runs.
    cli = get_client()
    _guard_service(service_name)
    # NOTE(review): the '-'/'+' pairs below are the commit's before/after lines.
    # After the change, any exception from services.get() is reported as 404.
    try:
-        svc = client.services.get(service_name)
+        svc = cli.services.get(service_name)
-    except docker.errors.NotFound:
+    except Exception:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    # Shallow copy of the service's current spec; it is passed back unchanged.
    spec = svc.attrs.get('Spec', {}).copy()
    # Force a rolling update by bumping ForceUpdate / Version index
    try:
        # Uses the service's Version.Index (0 when absent) + 1 as the force_update value.
        current_index = svc.attrs.get('Version', {}).get('Index', 0)
        svc.update(**spec, force_update=current_index + 1)
    except Exception:
        # If update signature differs, try a simple update
        # NOTE(review): this path passes no force_update; whether an unchanged
        # spec still restarts tasks is not visible here — TODO confirm.
        svc.update(**spec)
    return {"service": service_name, "status": "rolling-restart-issued"}
 # Request body of POST /command. A request is either structured
 # (`action` + `params`) or free text (`text`); the handler checks `action` first.
 class Command(BaseModel):
    # Free-text instruction, e.g. "scale weblabs_php to 3"; defaults to empty.
    text: str = ""
    # Structured action name, e.g. "scale"; None selects free-text parsing.
    action: str | None = None
    # Arguments for the structured action, e.g. {"service": ..., "replicas": 3}.
    params: dict | None = None
@app.get("/health")
 def health():
    return {"ok": True}
@app.get("/diagnostics")
 def diagnostics():
    info = {
        "docker_sock_exists": os.path.exists(DOCKER_SOCK),
        "docker_sock_path": DOCKER_SOCK,
        "uid": os.getuid() if hasattr(os, "getuid") else "n/a",
        "gid": os.getgid() if hasattr(os, "getgid") else "n/a",
        "env_PORT": PORT,
        "rules_loaded": bool(RULES),
    }
    try:
        cli = get_client()
        info["docker_ping"] = True
        info["server_version"] = cli.version()
    except HTTPException as he:
        info["docker_ping"] = False
        info["error"] = he.detail
    except Exception as e:
        info["docker_ping"] = False
        info["error"] = str(e)
    return info
@app.post("/command")
 def command(cmd: Command):
    \"\"\"Accepts either structured commands or simple text instructions.
    Structured example:
      {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
    Text example:
      {"text":"scale weblabs_php to 3"}
    \"\"\"
    # Structured commands first
    if cmd.action:
        if cmd.action == "scale":
            p = cmd.params or {}
@@ -115,16 +129,13 @@ def command(cmd: Command):
            if "service" not in p:
                raise HTTPException(status_code=400, detail="Missing service param for restart_service")
            return _restart_service(p["service"])
-        else:
+        raise HTTPException(status_code=400, detail="Unknown action")
            raise HTTPException(status_code=400, detail="Unknown action")
    # Free-text parsing (simple)
    t = (cmd.text or "").strip().lower()
    if not t:
        raise HTTPException(status_code=400, detail="Empty command")
    if t.startswith("scale "):
        # "scale weblabs_php to 3"
        parts = t.split()
        try:
            svc = parts[1]
@@ -132,13 +143,10 @@ def command(cmd: Command):
                idx_to = parts.index("to")
                reps = int(parts[idx_to + 1])
                return _scale(svc, replicas=reps)
-            else:
+            return _scale(svc, step=1)
                # if no explicit amount, treat as step +1
                return _scale(svc, step=1)
        except Exception:
            raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
-    elif t.startswith("restart "):
+    if t.startswith("restart "):
        # "restart weblabs_php"
        try:
            svc = t.split()[1]
            return _restart_service(svc)
@@ -147,17 +155,13 @@ def command(cmd: Command):
    raise HTTPException(status_code=400, detail="Unrecognized command")
@app.post("/alert")
 async def alert(request: Request):
    \"\"\"Receive Alertmanager webhooks and execute configured actions based on rules.yaml.\"\"\"
    payload = await request.json()
    alerts = payload.get("alerts", [])
    executed = []
    for a in alerts:
        labels = a.get("labels", {}) or {}
        # For each rule in RULES, check if the match conditions apply
        for rule in RULES.get("alerts", []):
            match = rule.get("match", {})
            if all(labels.get(k) == v for k, v in match.items()):
@@ -178,5 +182,4 @@ async def alert(request: Request):
                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _restart_service(svc)
                        executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
    return {"executed": executed}
--- a/stack.yml
+++ b/stack.yml
@@ -19,18 +19,16 @@ secrets:
 services:
  ai-agent:
-    build:
+    image: hostlabs/ai-agent:latest
      context: ./agent
    image: burnserv/ai-agent:latest
    networks: [opsNet]
    ports:
-      - "8080:8080"
+      - "8080:8080"        # expose only if you want host access; remove if internal-only
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
-          - node.role == manager
+          - node.role == manager    # must be on a manager to control Swarm via docker.sock
      labels:
        - "ai.agent=true"
    volumes:
@@ -39,6 +37,26 @@ services:
      - source: rules.yaml
        target: /app/rules.yaml
  relay:
    image: hostlabs/ai-relay:latest
    networks: [opsNet]
    depends_on: [ai-agent]
    environment:
      - OPENAI_MODEL=gpt-4o-mini
      - AGENT_URL=http://ai-agent:8080           # DNS name on opsNet
      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
    secrets:
      - source: openai_api_key
        target: openai_api_key
    ports:
      - "8090:8090"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
  prometheus:
    image: prom/prometheus:v2.55.0
    networks: [opsNet]
@@ -95,25 +113,3 @@ services:
      - --path.rootfs=/host
    volumes:
      - /:/host:ro,rslave
  relay:
    build:
      context: ./relay
    image: burnserv/ai-relay:latest
    networks: [opsNet]
    depends_on: [ai-agent]
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    environment:
      - OPENAI_MODEL=gpt-4o-mini
      - AGENT_URL=http://ai-agent:8080
      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
    secrets:
      - source: openai_api_key
        target: openai_api_key
    ports:
      - "8090:8090"