first commit

2025-09-18 14:37:09 -04:00
commit 0473fdb225
13 changed files with 481 additions and 0 deletions
--- a/agent/Dockerfile
+++ b/agent/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY agent.py rules.yaml ./
+ENV PORT=8080
+
+CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8080"]
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -0,0 +1,182 @@
+
+import os
+import re
+import yaml
+import json
+from fastapi import FastAPI, Request, HTTPException
+from pydantic import BaseModel
+import docker
+
+# Basic AI Ops Agent for Docker Swarm
+PORT = int(os.getenv("PORT", "8080"))
+GUARD_DEFAULT_MAX = 25
+
+# Load rules file if present
+RULES_PATH = "rules.yaml"
+if os.path.exists(RULES_PATH):
+    with open(RULES_PATH, "r") as f:
+        RULES = yaml.safe_load(f) or {}
+else:
+    RULES = {}
+
+ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
+MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
+
+# Docker client using the socket
+client = docker.DockerClient(base_url='unix://var/run/docker.sock')
+
+app = FastAPI(title="BurnServ AI Ops Agent")
+
+
+def _guard_service(service_name: str):
+    if not ALLOWED_REGEX.match(service_name):
+        raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
+
+
+def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
+    _guard_service(service_name)
+    try:
+        svc = client.services.get(service_name)
+    except docker.errors.NotFound:
+        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
+    spec = svc.attrs.get('Spec', {}).copy()
+    mode = spec.get('Mode', {})
+    replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
+
+    if replicas is None:
+        # step-based scaling
+        tgt = int(replicas_current) + int(step or 1)
+        if min_replicas is not None:
+            tgt = max(tgt, int(min_replicas))
+        if max_replicas is not None:
+            tgt = min(tgt, int(max_replicas))
+        replicas = tgt
+
+    replicas = max(1, min(int(replicas), MAX_SCALE))
+    mode['Replicated'] = {'Replicas': replicas}
+    spec['Mode'] = mode
+
+    # svc.update expects keyword args matching the service spec shape; pass the full spec with update
+    try:
+        svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
+    except Exception:
+        # Fallback: use the update with the raw spec (works in many docker-py versions)
+        svc.update(**spec)
+    return {"service": service_name, "replicas": replicas}
+
+
+def _restart_service(service_name: str):
+    _guard_service(service_name)
+    try:
+        svc = client.services.get(service_name)
+    except docker.errors.NotFound:
+        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
+    spec = svc.attrs.get('Spec', {}).copy()
+    # Force a rolling update by bumping ForceUpdate / Version index
+    try:
+        current_index = svc.attrs.get('Version', {}).get('Index', 0)
+        svc.update(**spec, force_update=current_index + 1)
+    except Exception:
+        # If update signature differs, try a simple update
+        svc.update(**spec)
+    return {"service": service_name, "status": "rolling-restart-issued"}
+
+
+class Command(BaseModel):
+    text: str = ""
+    action: str | None = None
+    params: dict | None = None
+
+
+@app.get("/health")
+def health():
+    return {"ok": True}
+
+
+@app.post("/command")
+def command(cmd: Command):
+    \"\"\"Accepts either structured commands or simple text instructions.
+
+    Structured example:
+      {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
+
+    Text example:
+      {"text":"scale weblabs_php to 3"}
+    \"\"\"
+    # Structured commands first
+    if cmd.action:
+        if cmd.action == "scale":
+            p = cmd.params or {}
+            if "service" not in p or "replicas" not in p:
+                raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
+            return _scale(p["service"], replicas=int(p["replicas"]))
+        elif cmd.action == "restart_service":
+            p = cmd.params or {}
+            if "service" not in p:
+                raise HTTPException(status_code=400, detail="Missing service param for restart_service")
+            return _restart_service(p["service"])
+        else:
+            raise HTTPException(status_code=400, detail="Unknown action")
+
+    # Free-text parsing (simple)
+    t = (cmd.text or "").strip().lower()
+    if not t:
+        raise HTTPException(status_code=400, detail="Empty command")
+
+    if t.startswith("scale "):
+        # "scale weblabs_php to 3"
+        parts = t.split()
+        try:
+            svc = parts[1]
+            if "to" in parts:
+                idx_to = parts.index("to")
+                reps = int(parts[idx_to + 1])
+                return _scale(svc, replicas=reps)
+            else:
+                # if no explicit amount, treat as step +1
+                return _scale(svc, step=1)
+        except Exception:
+            raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
+    elif t.startswith("restart "):
+        # "restart weblabs_php"
+        try:
+            svc = t.split()[1]
+            return _restart_service(svc)
+        except Exception:
+            raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
+
+    raise HTTPException(status_code=400, detail="Unrecognized command")
+
+
+@app.post("/alert")
+async def alert(request: Request):
+    \"\"\"Receive Alertmanager webhooks and execute configured actions based on rules.yaml.\"\"\"
+    payload = await request.json()
+    alerts = payload.get("alerts", [])
+    executed = []
+
+    for a in alerts:
+        labels = a.get("labels", {}) or {}
+        # For each rule in RULES, check if the match conditions apply
+        for rule in RULES.get("alerts", []):
+            match = rule.get("match", {})
+            if all(labels.get(k) == v for k, v in match.items()):
+                for act in rule.get("actions", []):
+                    if "scale" in act:
+                        cfg = act["scale"].copy()
+                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
+                        res = _scale(
+                            service_name=svc,
+                            replicas=cfg.get("replicas"),
+                            step=cfg.get("step"),
+                            min_replicas=cfg.get("min_replicas"),
+                            max_replicas=cfg.get("max_replicas", MAX_SCALE),
+                        )
+                        executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
+                    elif "restart_service" in act:
+                        cfg = act["restart_service"].copy()
+                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
+                        res = _restart_service(svc)
+                        executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
+
+    return {"executed": executed}
--- a/agent/requirements.txt
+++ b/agent/requirements.txt
@@ -0,0 +1,5 @@
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+PyYAML==6.0.2
+docker==7.1.0
+httpx==0.27.2
--- a/agent/rules.yaml
+++ b/agent/rules.yaml
@@ -0,0 +1,30 @@
+alerts:
+  - match:
+      alertname: HighCPU
+      severity: warning
+    actions:
+      - scale:
+          service: "weblabs_php"
+          min_replicas: 2
+          step: 1
+          max_replicas: 10
+  - match:
+      alertname: ServiceDown
+      severity: critical
+    actions:
+      - restart_service:
+          service: "{{ $labels.service_name }}"
+
+commands:
+  - intent: "scale"
+    schema:
+      service: str
+      replicas: int
+    action:
+      scale:
+        service: "{{service}}"
+        replicas: "{{replicas}}"
+
+guardrails:
+  allowed_services_regex: "^(weblabs_.*|wordpress_.*|nginx_.*|php_.*|redis_.*|mysql_.*)$"
+  max_scale_replicas: 25