first commit

commit 0473fdb225
2025-09-18 14:37:09 -04:00
13 changed files with 481 additions and 0 deletions

.env.example Normal file

@@ -0,0 +1,5 @@
# Copy to .env and customize (local runs only; the Swarm stack reads the key from a Docker secret)
OPENAI_API_KEY=YOUR_KEY_HERE
OPENAI_MODEL=gpt-4o-mini
AGENT_URL=http://ai-agent:8080

README.md Normal file (empty)

agent/Dockerfile Normal file

@@ -0,0 +1,10 @@
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY agent.py rules.yaml ./
ENV PORT=8080
CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8080"]

agent/agent.py Normal file

@@ -0,0 +1,182 @@
import os
import re

import docker
import yaml
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel

# Basic AI Ops Agent for Docker Swarm
PORT = int(os.getenv("PORT", "8080"))
GUARD_DEFAULT_MAX = 25

# Load rules file if present
RULES_PATH = "rules.yaml"
if os.path.exists(RULES_PATH):
    with open(RULES_PATH, "r") as f:
        RULES = yaml.safe_load(f) or {}
else:
    RULES = {}

ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))

# Docker client using the socket mounted into the container
client = docker.DockerClient(base_url="unix://var/run/docker.sock")

app = FastAPI(title="BurnServ AI Ops Agent")


def _guard_service(service_name: str):
    if not ALLOWED_REGEX.match(service_name):
        raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")


def _scale(service_name: str, replicas: int | None = None, step: int | None = None,
           min_replicas: int | None = None, max_replicas: int | None = None):
    _guard_service(service_name)
    try:
        svc = client.services.get(service_name)
    except docker.errors.NotFound:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    mode = svc.attrs.get("Spec", {}).get("Mode", {})
    replicas_current = mode.get("Replicated", {}).get("Replicas", 1)
    if replicas is None:
        # Step-based scaling relative to the current replica count
        tgt = int(replicas_current) + int(step or 1)
        if min_replicas is not None:
            tgt = max(tgt, int(min_replicas))
        if max_replicas is not None:
            tgt = min(tgt, int(max_replicas))
        replicas = tgt
    # Clamp to the global guardrail either way
    replicas = max(1, min(int(replicas), MAX_SCALE))
    # docker-py ships Service.scale() for replicated services, so there is no
    # need to hand-roll a spec update
    svc.scale(replicas)
    return {"service": service_name, "replicas": replicas}


def _restart_service(service_name: str):
    _guard_service(service_name)
    try:
        svc = client.services.get(service_name)
    except docker.errors.NotFound:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    # Service.force_update() bumps ForceUpdate in the task template, which
    # triggers a rolling restart without touching the rest of the spec
    svc.force_update()
    return {"service": service_name, "status": "rolling-restart-issued"}


class Command(BaseModel):
    text: str = ""
    action: str | None = None
    params: dict | None = None


@app.get("/health")
def health():
    return {"ok": True}


@app.post("/command")
def command(cmd: Command):
    """Accept either structured commands or simple text instructions.

    Structured example:
        {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
    Text example:
        {"text":"scale weblabs_php to 3"}
    """
    # Structured commands first
    if cmd.action:
        if cmd.action == "scale":
            p = cmd.params or {}
            if "service" not in p or "replicas" not in p:
                raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
            return _scale(p["service"], replicas=int(p["replicas"]))
        elif cmd.action == "restart_service":
            p = cmd.params or {}
            if "service" not in p:
                raise HTTPException(status_code=400, detail="Missing service param for restart_service")
            return _restart_service(p["service"])
        else:
            raise HTTPException(status_code=400, detail="Unknown action")

    # Free-text parsing (simple)
    t = (cmd.text or "").strip().lower()
    if not t:
        raise HTTPException(status_code=400, detail="Empty command")
    if t.startswith("scale "):
        # "scale weblabs_php to 3"
        parts = t.split()
        try:
            svc = parts[1]
            if "to" in parts:
                idx_to = parts.index("to")
                reps = int(parts[idx_to + 1])
                return _scale(svc, replicas=reps)
            else:
                # No explicit amount: treat as step +1
                return _scale(svc, step=1)
        except HTTPException:
            raise  # let guardrail / not-found errors through unchanged
        except Exception:
            raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
    elif t.startswith("restart "):
        # "restart weblabs_php"
        try:
            svc = t.split()[1]
            return _restart_service(svc)
        except HTTPException:
            raise
        except Exception:
            raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
    raise HTTPException(status_code=400, detail="Unrecognized command")


@app.post("/alert")
async def alert(request: Request):
    """Receive Alertmanager webhooks and run the actions configured in rules.yaml."""
    payload = await request.json()
    alerts = payload.get("alerts", [])
    executed = []
    for a in alerts:
        labels = a.get("labels", {}) or {}
        # For each rule in RULES, check whether the match conditions apply
        for rule in RULES.get("alerts", []):
            match = rule.get("match", {})
            if all(labels.get(k) == v for k, v in match.items()):
                for act in rule.get("actions", []):
                    if "scale" in act:
                        cfg = act["scale"].copy()
                        svc = cfg.get("service", "").replace(
                            "{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _scale(
                            service_name=svc,
                            replicas=cfg.get("replicas"),
                            step=cfg.get("step"),
                            min_replicas=cfg.get("min_replicas"),
                            max_replicas=cfg.get("max_replicas", MAX_SCALE),
                        )
                        executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
                    elif "restart_service" in act:
                        cfg = act["restart_service"].copy()
                        svc = cfg.get("service", "").replace(
                            "{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _restart_service(svc)
                        executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
    return {"executed": executed}

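The /command endpoint can be exercised directly before the LLM relay is involved. A quick smoke test, assuming the agent's published port 8080 from stack.yml (the payloads mirror the docstring above):

# Structured command
curl -s -X POST http://localhost:8080/command \
  -H 'Content-Type: application/json' \
  -d '{"action":"scale","params":{"service":"weblabs_php","replicas":3}}'

# Free-text command
curl -s -X POST http://localhost:8080/command \
  -H 'Content-Type: application/json' \
  -d '{"text":"scale weblabs_php to 3"}'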
agent/requirements.txt Normal file

@@ -0,0 +1,5 @@
fastapi==0.115.0
uvicorn[standard]==0.30.6
PyYAML==6.0.2
docker==7.1.0
httpx==0.27.2

agent/rules.yaml Normal file

@@ -0,0 +1,30 @@
alerts:
  - match:
      alertname: HighCPU
      severity: warning
    actions:
      - scale:
          service: "weblabs_php"
          min_replicas: 2
          step: 1
          max_replicas: 10
  - match:
      alertname: ServiceDown
      severity: critical
    actions:
      - restart_service:
          service: "{{ $labels.service_name }}"
commands:
  - intent: "scale"
    schema:
      service: str
      replicas: int
    action:
      scale:
        service: "{{service}}"
        replicas: "{{replicas}}"
guardrails:
  allowed_services_regex: "^(weblabs_.*|wordpress_.*|nginx_.*|php_.*|redis_.*|mysql_.*)$"
  max_scale_replicas: 25

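Rule matching in /alert can be tested without Prometheus or Alertmanager by posting a hand-rolled payload in Alertmanager's webhook shape; a sketch that should trigger the HighCPU scale rule above:

curl -s -X POST http://localhost:8080/alert \
  -H 'Content-Type: application/json' \
  -d '{"alerts":[{"labels":{"alertname":"HighCPU","severity":"warning","service_name":"weblabs_php"}}]}'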
monitoring/alertmanager.yml Normal file

@@ -0,0 +1,12 @@
route:
  receiver: 'ai-agent'
  group_by: ['alertname', 'service_name']
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 15m
receivers:
  - name: 'ai-agent'
    webhook_configs:
      - url: 'http://ai-agent:8080/alert'
        max_alerts: 10

monitoring/prometheus.yml Normal file

@@ -0,0 +1,12 @@
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'prometheus'
    static_configs: [{ targets: ['prometheus:9090'] }]
  - job_name: 'cadvisor'
    static_configs: [{ targets: ['cadvisor:8080'] }]
  - job_name: 'node'
    static_configs: [{ targets: ['node-exporter:9100'] }]

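Note that this commit ships only scrape targets: nothing here defines the HighCPU or ServiceDown alerts that agent/rules.yaml reacts to, and Prometheus is not yet pointed at Alertmanager. A minimal sketch of the missing wiring (the rule file name, expression, and 80% threshold are assumptions, not part of this commit):

# Extra prometheus.yml sections (hypothetical)
alerting:
  alertmanagers:
    - static_configs: [{ targets: ['alertmanager:9093'] }]
rule_files:
  - /etc/prometheus/alert-rules.yml

# /etc/prometheus/alert-rules.yml (hypothetical example, deriving the
# service_name label from the swarm label cAdvisor puts on container metrics)
groups:
  - name: swarm-alerts
    rules:
      - alert: HighCPU
        expr: |
          sum by (service_name) (
            label_replace(rate(container_cpu_usage_seconds_total[2m]),
              "service_name", "$1",
              "container_label_com_docker_swarm_service_name", "(.+)")
          ) > 0.8
        for: 2m
        labels:
          severity: warning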
relay/Dockerfile Normal file

@@ -0,0 +1,13 @@
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY relay.py ./
ENV PORT=8090
# Reads API key from OPENAI_API_KEY or OPENAI_API_KEY_FILE (/run/secrets/openai_api_key)
# AGENT_URL defaults to http://ai-agent:8080
# OPENAI_MODEL defaults to gpt-4o-mini
CMD ["uvicorn", "relay:app", "--host", "0.0.0.0", "--port", "8090"]

relay/client.sh Normal file

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Simple CLI to talk to the relay
# Usage: ./client.sh "scale weblabs_php to 3"
set -euo pipefail
PROMPT="${1:-}"
if [ -z "$PROMPT" ]; then
  echo "Usage: $0 \"your request\"" >&2
  exit 1
fi
curl -s -X POST http://localhost:8090/chat \
  -H 'Content-Type: application/json' \
  -d "$(jq -n --arg p "$PROMPT" '{prompt:$p}')" | jq .

relay/relay.py Normal file

@@ -0,0 +1,76 @@
import json
import os

import httpx
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

AGENT_URL = os.getenv("AGENT_URL", "http://ai-agent:8080")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")


def _read_api_key():
    # Prefer the file from a Docker secret if present
    path = os.getenv("OPENAI_API_KEY_FILE", "/run/secrets/openai_api_key")
    if os.path.exists(path):
        with open(path, "r") as f:
            return f.read().strip()
    return os.getenv("OPENAI_API_KEY", "")


SYSTEM_PROMPT = (
    "You are an ops command planner. Convert the user's intent into a STRICT JSON object "
    "with fields: action (scale|restart_service), params (dict). No prose. Examples: "
    '{"action":"scale","params":{"service":"weblabs_php","replicas":3}} '
    'or {"action":"restart_service","params":{"service":"weblabs_php"}}. '
    "Only produce valid JSON. If unclear, choose the safest no-op."
)


class ChatIn(BaseModel):
    prompt: str


app = FastAPI(title="AI Relay (LLM -> Agent)")


@app.get("/health")
def health():
    return {"ok": True}


def _extract_text(data: dict) -> str:
    # The raw Responses API payload carries an 'output' list whose message
    # items hold 'content' parts of type 'output_text'; SDK-shaped payloads
    # may expose a flat 'output_text' field instead, so try that first.
    text = data.get("output_text")
    if isinstance(text, str) and text:
        return text
    for item in data.get("output", []):
        for part in item.get("content", []) or []:
            if part.get("type") in ("output_text", "text") and part.get("text"):
                return part["text"]
    return ""


@app.post("/chat")
async def chat(inp: ChatIn):
    api_key = _read_api_key()
    if not api_key:
        raise HTTPException(500, "Missing OPENAI_API_KEY (env or secret).")

    # Call the OpenAI Responses API
    url = "https://api.openai.com/v1/responses"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    body = {
        "model": OPENAI_MODEL,
        "input": f"{SYSTEM_PROMPT}\nUSER: {inp.prompt}",
        "max_output_tokens": 300,
        "temperature": 0.1,
    }
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.post(url, headers=headers, json=body)
    if r.status_code >= 400:
        raise HTTPException(502, f"OpenAI error: {r.text}")
    content = _extract_text(r.json())

    # Parse JSON from the model output
    try:
        cmd = json.loads(content)
    except Exception as e:
        raise HTTPException(500, f"Failed to parse model JSON: {e}; content={content[:200]}")

    # Forward the structured command to the agent
    async with httpx.AsyncClient(timeout=15) as client:
        r = await client.post(f"{AGENT_URL}/command", json=cmd)
    if r.status_code >= 400:
        raise HTTPException(r.status_code, f"Agent error: {r.text}")
    return r.json()

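End to end, the relay turns a prompt into a structured /command call and passes the agent's JSON response back to the caller. A direct check without client.sh, assuming the published port 8090 from stack.yml:

curl -s -X POST http://localhost:8090/chat \
  -H 'Content-Type: application/json' \
  -d '{"prompt":"scale weblabs_php to 3"}'
# Expected shape on success (from the agent): {"service":"weblabs_php","replicas":3}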
relay/requirements.txt Normal file

@@ -0,0 +1,6 @@
fastapi==0.115.0
uvicorn[standard]==0.30.6
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1

stack.yml Normal file

@@ -0,0 +1,119 @@
version: "3.9"

networks:
  opsNet:
    driver: overlay
    attachable: true

configs:
  prometheus.yml:
    file: ./monitoring/prometheus.yml
  alertmanager.yml:
    file: ./monitoring/alertmanager.yml
  rules.yaml:
    file: ./agent/rules.yaml

secrets:
  openai_api_key:
    external: true

services:
  ai-agent:
    build:
      context: ./agent
    image: burnserv/ai-agent:latest
    networks: [opsNet]
    ports:
      - "8080:8080"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      labels:
        - "ai.agent=true"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    configs:
      - source: rules.yaml
        target: /app/rules.yaml

  prometheus:
    image: prom/prometheus:v2.55.0
    networks: [opsNet]
    deploy:
      mode: replicated
      replicas: 1
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
    configs:
      - source: prometheus.yml
        target: /etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  alertmanager:
    image: prom/alertmanager:v0.27.0
    networks: [opsNet]
    deploy:
      mode: replicated
      replicas: 1
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
    configs:
      - source: alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    networks: [opsNet]
    deploy:
      mode: global
      placement:
        constraints:
          - node.platform.os == linux
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8081:8080"

  node-exporter:
    image: prom/node-exporter:v1.8.2
    networks: [opsNet]
    deploy:
      mode: global
      placement:
        constraints:
          - node.platform.os == linux
    command:
      - --path.rootfs=/host
    volumes:
      - /:/host:ro,rslave

  relay:
    build:
      context: ./relay
    image: burnserv/ai-relay:latest
    networks: [opsNet]
    depends_on: [ai-agent]
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    environment:
      - OPENAI_MODEL=gpt-4o-mini
      - AGENT_URL=http://ai-agent:8080
      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
    secrets:
      - source: openai_api_key
        target: openai_api_key
    ports:
      - "8090:8090"