commit 0473fdb22566f426f4563daf2d396a67fee61f7e
Author: Josh Stevens
Date:   Thu Sep 18 14:37:09 2025 -0400

    first commit

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..9aa40d8
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,5 @@
+
+# Copy to .env and customize (not used by Docker secrets)
+OPENAI_API_KEY=YOUR_KEY_HERE
+OPENAI_MODEL=gpt-4o-mini
+AGENT_URL=http://ai-agent:8080
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/agent/Dockerfile b/agent/Dockerfile
new file mode 100644
index 0000000..8a4cd8b
--- /dev/null
+++ b/agent/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY agent.py rules.yaml ./
+ENV PORT=8080
+
+CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/agent/agent.py b/agent/agent.py
new file mode 100644
index 0000000..71337a0
--- /dev/null
+++ b/agent/agent.py
@@ -0,0 +1,173 @@
+
+import os
+import re
+import yaml
+import json
+from fastapi import FastAPI, Request, HTTPException
+from pydantic import BaseModel
+import docker
+
+# Basic AI Ops Agent for Docker Swarm
+PORT = int(os.getenv("PORT", "8080"))
+GUARD_DEFAULT_MAX = 25
+
+# Load rules file if present
+RULES_PATH = "rules.yaml"
+if os.path.exists(RULES_PATH):
+    with open(RULES_PATH, "r") as f:
+        RULES = yaml.safe_load(f) or {}
+else:
+    RULES = {}
+
+ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
+MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
+
+# Docker client using the socket
+client = docker.DockerClient(base_url='unix:///var/run/docker.sock')
+
+app = FastAPI(title="BurnServ AI Ops Agent")
+
+
+def _guard_service(service_name: str):
+    if not ALLOWED_REGEX.match(service_name):
+        raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
+
+
+def _scale(service_name: str, replicas: int | None = None, step: int | None = None, min_replicas: int | None = None, max_replicas: int | None = None):
+    _guard_service(service_name)
+    try:
+        svc = client.services.get(service_name)
+    except docker.errors.NotFound:
+        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
+    mode = svc.attrs.get('Spec', {}).get('Mode', {})
+    replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
+
+    if replicas is None:
+        # step-based scaling
+        tgt = int(replicas_current) + int(step or 1)
+        if min_replicas is not None:
+            tgt = max(tgt, int(min_replicas))
+        if max_replicas is not None:
+            tgt = min(tgt, int(max_replicas))
+        replicas = tgt
+
+    replicas = max(1, min(int(replicas), MAX_SCALE))
+    # docker-py's Service.scale() updates the replica count in place
+    svc.scale(replicas)
+    return {"service": service_name, "replicas": replicas}
+
+
+def _restart_service(service_name: str):
+    _guard_service(service_name)
+    try:
+        svc = client.services.get(service_name)
+    except docker.errors.NotFound:
+        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
+    # Force a rolling update without changing the spec; docker-py bumps
+    # TaskTemplate.ForceUpdate under the hood.
+    svc.force_update()
+    return {"service": service_name, "status": "rolling-restart-issued"}
+
+
+class Command(BaseModel):
+    text: str = ""
+    action: str | None = None
+    params: dict | None = None
+
+
+@app.get("/health")
+def health():
+    return {"ok": True}
+
+
+@app.post("/command")
+def command(cmd: Command):
+    """Accepts either structured commands or simple text instructions.
+
+    Structured example:
+        {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
+
+    Text example:
+        {"text":"scale weblabs_php to 3"}
+    """
+    # Structured commands first
+    if cmd.action:
+        if cmd.action == "scale":
+            p = cmd.params or {}
+            if "service" not in p or "replicas" not in p:
+                raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
+            return _scale(p["service"], replicas=int(p["replicas"]))
+        elif cmd.action == "restart_service":
+            p = cmd.params or {}
+            if "service" not in p:
+                raise HTTPException(status_code=400, detail="Missing service param for restart_service")
+            return _restart_service(p["service"])
+        else:
+            raise HTTPException(status_code=400, detail="Unknown action")
+
+    # Free-text parsing (simple)
+    t = (cmd.text or "").strip().lower()
+    if not t:
+        raise HTTPException(status_code=400, detail="Empty command")
+
+    if t.startswith("scale "):
+        # "scale weblabs_php to 3"
+        parts = t.split()
+        try:
+            svc = parts[1]
+            if "to" in parts:
+                idx_to = parts.index("to")
+                reps = int(parts[idx_to + 1])
+                return _scale(svc, replicas=reps)
+            else:
+                # if no explicit amount, treat as step +1
+                return _scale(svc, step=1)
+        except HTTPException:
+            raise
+        except Exception:
+            raise HTTPException(status_code=400, detail="Format: 'scale <service> to <replicas>'")
+    elif t.startswith("restart "):
+        # "restart weblabs_php"
+        try:
+            svc = t.split()[1]
+            return _restart_service(svc)
+        except HTTPException:
+            raise
+        except Exception:
+            raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
+
+    raise HTTPException(status_code=400, detail="Unrecognized command")
+
+
+@app.post("/alert")
+async def alert(request: Request):
+    """Receive Alertmanager webhooks and execute configured actions based on rules.yaml."""
+    payload = await request.json()
+    alerts = payload.get("alerts", [])
+    executed = []
+
+    for a in alerts:
+        labels = a.get("labels", {}) or {}
+        # For each rule in RULES, check if the match conditions apply
+        for rule in RULES.get("alerts", []):
+            match = rule.get("match", {})
+            if all(labels.get(k) == v for k, v in match.items()):
+                for act in rule.get("actions", []):
+                    if "scale" in act:
+                        cfg = act["scale"].copy()
+                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
+                        res = _scale(
+                            service_name=svc,
+                            replicas=cfg.get("replicas"),
+                            step=cfg.get("step"),
+                            min_replicas=cfg.get("min_replicas"),
+                            max_replicas=cfg.get("max_replicas", MAX_SCALE),
+                        )
+                        executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
+                    elif "restart_service" in act:
+                        cfg = act["restart_service"].copy()
+                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
+                        res = _restart_service(svc)
+                        executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
+
+    return {"executed": executed}
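
A quick smoke test of the /command endpoint above, assuming the agent is reachable on its published port 8080 and a Swarm service named weblabs_php exists (both names come from the docstring examples and stack.yml):

    # structured form
    curl -s -X POST http://localhost:8080/command \
      -H 'Content-Type: application/json' \
      -d '{"action":"scale","params":{"service":"weblabs_php","replicas":3}}'

    # free-text form, handled by the simple parser
    curl -s -X POST http://localhost:8080/command \
      -H 'Content-Type: application/json' \
      -d '{"text":"scale weblabs_php to 3"}'

Both return {"service": "weblabs_php", "replicas": 3} on success.
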
diff --git a/agent/requirements.txt b/agent/requirements.txt
new file mode 100644
index 0000000..754ae51
--- /dev/null
+++ b/agent/requirements.txt
@@ -0,0 +1,5 @@
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+PyYAML==6.0.2
+docker==7.1.0
+httpx==0.27.2
diff --git a/agent/rules.yaml b/agent/rules.yaml
new file mode 100644
index 0000000..3c1cf80
--- /dev/null
+++ b/agent/rules.yaml
@@ -0,0 +1,30 @@
+alerts:
+  - match:
+      alertname: HighCPU
+      severity: warning
+    actions:
+      - scale:
+          service: "weblabs_php"
+          min_replicas: 2
+          step: 1
+          max_replicas: 10
+  - match:
+      alertname: ServiceDown
+      severity: critical
+    actions:
+      - restart_service:
+          service: "{{ $labels.service_name }}"
+
+commands:
+  - intent: "scale"
+    schema:
+      service: str
+      replicas: int
+    action:
+      scale:
+        service: "{{service}}"
+        replicas: "{{replicas}}"
+
+guardrails:
+  allowed_services_regex: "^(weblabs_.*|wordpress_.*|nginx_.*|php_.*|redis_.*|mysql_.*)$"
+  max_scale_replicas: 25
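
The agent's /alert handler matches incoming webhook labels against the match blocks above. A hand-rolled payload in Alertmanager's webhook shape, trimmed to the fields the agent actually reads, that would trigger the HighCPU rule:

    curl -s -X POST http://localhost:8080/alert \
      -H 'Content-Type: application/json' \
      -d '{"alerts":[{"labels":{"alertname":"HighCPU","severity":"warning","service_name":"weblabs_php"}}]}'

With the rule above, this step-scales weblabs_php by 1, clamped between 2 and 10 replicas.
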
diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml
new file mode 100644
index 0000000..60882e2
--- /dev/null
+++ b/monitoring/alertmanager.yml
@@ -0,0 +1,12 @@
+route:
+  receiver: 'ai-agent'
+  group_by: ['alertname', 'service_name']
+  group_wait: 10s
+  group_interval: 1m
+  repeat_interval: 15m
+
+receivers:
+  - name: 'ai-agent'
+    webhook_configs:
+      - url: 'http://ai-agent:8080/alert'
+        max_alerts: 10
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
new file mode 100644
index 0000000..1be0d7a
--- /dev/null
+++ b/monitoring/prometheus.yml
@@ -0,0 +1,12 @@
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs: [{ targets: ['prometheus:9090'] }]
+
+  - job_name: 'cadvisor'
+    static_configs: [{ targets: ['cadvisor:8080'] }]
+
+  - job_name: 'node'
+    static_configs: [{ targets: ['node-exporter:9100'] }]
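
One gap worth flagging: prometheus.yml above only defines scrape jobs. Nothing in this commit tells Prometheus where Alertmanager lives, and no rule file defines the HighCPU or ServiceDown alerts that rules.yaml reacts to, so the webhook path cannot fire as committed. A minimal sketch of the missing pieces (the alerts.yml file, its expression, and the static service_name label are illustrative assumptions, not part of this commit):

    # additions to monitoring/prometheus.yml
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ['alertmanager:9093']
    rule_files:
      - /etc/prometheus/alerts.yml

    # /etc/prometheus/alerts.yml (hypothetical)
    groups:
      - name: burnserv-alerts
        rules:
          - alert: HighCPU
            expr: rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_service_name="weblabs_php"}[5m]) > 0.9
            for: 2m
            labels:
              severity: warning
              service_name: weblabs_php

The rule file would also need to be shipped to the Prometheus container, e.g. as another entry under configs: in stack.yml.
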
diff --git a/relay/Dockerfile b/relay/Dockerfile
new file mode 100644
index 0000000..271be8e
--- /dev/null
+++ b/relay/Dockerfile
@@ -0,0 +1,13 @@
+
+FROM python:3.12-slim
+
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY relay.py ./
+ENV PORT=8090
+# Reads API key from OPENAI_API_KEY or OPENAI_API_KEY_FILE (/run/secrets/openai_api_key)
+# AGENT_URL defaults to http://ai-agent:8080
+# OPENAI_MODEL defaults to gpt-4o-mini
+CMD ["uvicorn", "relay:app", "--host", "0.0.0.0", "--port", "8090"]
diff --git a/relay/client.sh b/relay/client.sh
new file mode 100644
index 0000000..68090fb
--- /dev/null
+++ b/relay/client.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# Simple CLI to talk to the relay
+# Usage: ./client.sh "scale weblabs_php to 3"
+set -euo pipefail
+PROMPT="${1:-}"
+if [ -z "$PROMPT" ]; then
+  echo "Usage: $0 \"your request\"" >&2
+  exit 1
+fi
+curl -s -X POST http://localhost:8090/chat -H 'Content-Type: application/json' -d "$(jq -n --arg p "$PROMPT" '{prompt:$p}')" | jq .
diff --git a/relay/relay.py b/relay/relay.py
new file mode 100644
index 0000000..3c90a44
--- /dev/null
+++ b/relay/relay.py
@@ -0,0 +1,82 @@
+
+import os
+import json
+import httpx
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+AGENT_URL = os.getenv("AGENT_URL", "http://ai-agent:8080")
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+
+def _read_api_key():
+    # Prefer file from Docker secret if present
+    path = os.getenv("OPENAI_API_KEY_FILE", "/run/secrets/openai_api_key")
+    if os.path.exists(path):
+        with open(path, "r") as f:
+            return f.read().strip()
+    return os.getenv("OPENAI_API_KEY", "")
+
+SYSTEM_PROMPT = (
+    "You are an ops command planner. Convert the user's intent into a STRICT JSON object "
+    "with fields: action (scale|restart_service), params (dict). No prose. Examples: "
+    '{"action":"scale","params":{"service":"weblabs_php","replicas":3}} '
+    'or {"action":"restart_service","params":{"service":"weblabs_php"}}. '
+    "Only produce valid JSON. If unclear, choose the safest no-op."
+)
+
+class ChatIn(BaseModel):
+    prompt: str
+
+app = FastAPI(title="AI Relay (LLM -> Agent)")
+
+@app.get("/health")
+def health():
+    return {"ok": True}
+
+@app.post("/chat")
+async def chat(inp: ChatIn):
+    api_key = _read_api_key()
+    if not api_key:
+        raise HTTPException(500, "Missing OPENAI_API_KEY (env or secret).")
+
+    # Call the OpenAI Responses API
+    url = "https://api.openai.com/v1/responses"
+    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+    body = {
+        "model": OPENAI_MODEL,
+        "input": f"{SYSTEM_PROMPT}\nUSER: {inp.prompt}",
+        "max_output_tokens": 300,
+        "temperature": 0.1
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        r = await client.post(url, headers=headers, json=body)
+        if r.status_code >= 400:
+            raise HTTPException(502, f"OpenAI error: {r.text}")
+        data = r.json()
+
+    # The Responses API returns an 'output' list of message items whose
+    # content parts carry the generated text (part type 'output_text').
+    content = ""
+    for item in data.get("output", []):
+        for part in item.get("content") or []:
+            if part.get("type") == "output_text" and part.get("text"):
+                content = part["text"]
+                break
+        if content:
+            break
+    if not content:
+        content = data.get("output_text") or ""
+
+    # Parse JSON from the model output
+    try:
+        cmd = json.loads(content)
+    except Exception as e:
+        raise HTTPException(500, f"Failed to parse model JSON: {e}; content={content[:200]}")
+
+    # Forward to the agent
+    async with httpx.AsyncClient(timeout=15) as client:
+        r = await client.post(f"{AGENT_URL}/command", json=cmd)
+        if r.status_code >= 400:
+            raise HTTPException(r.status_code, f"Agent error: {r.text}")
+        return r.json()
diff --git a/relay/requirements.txt b/relay/requirements.txt
new file mode 100644
index 0000000..6a59831
--- /dev/null
+++ b/relay/requirements.txt
@@ -0,0 +1,6 @@
+
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+httpx==0.27.2
+pydantic==2.9.2
+python-dotenv==1.0.1
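
End to end, a request flows client.sh -> relay /chat -> OpenAI -> agent /command. Assuming the stack is up, jq is installed locally (client.sh shells out to it), and the model returns the structured command, a session might look like:

    $ ./relay/client.sh "scale weblabs_php to 3"
    {
      "service": "weblabs_php",
      "replicas": 3
    }
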
diff --git a/stack.yml b/stack.yml
new file mode 100644
index 0000000..3f0d78a
--- /dev/null
+++ b/stack.yml
@@ -0,0 +1,119 @@
+version: "3.9"
+
+networks:
+  opsNet:
+    driver: overlay
+    attachable: true
+
+configs:
+  prometheus.yml:
+    file: ./monitoring/prometheus.yml
+  alertmanager.yml:
+    file: ./monitoring/alertmanager.yml
+  rules.yaml:
+    file: ./agent/rules.yaml
+
+secrets:
+  openai_api_key:
+    external: true
+
+services:
+  ai-agent:
+    build:
+      context: ./agent
+    image: burnserv/ai-agent:latest
+    networks: [opsNet]
+    ports:
+      - "8080:8080"
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.role == manager
+      labels:
+        - "ai.agent=true"
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    configs:
+      - source: rules.yaml
+        target: /app/rules.yaml
+
+  prometheus:
+    image: prom/prometheus:v2.55.0
+    networks: [opsNet]
+    deploy:
+      mode: replicated
+      replicas: 1
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+    configs:
+      - source: prometheus.yml
+        target: /etc/prometheus/prometheus.yml
+    ports:
+      - "9090:9090"
+
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    networks: [opsNet]
+    deploy:
+      mode: replicated
+      replicas: 1
+    command:
+      - "--config.file=/etc/alertmanager/alertmanager.yml"
+    configs:
+      - source: alertmanager.yml
+        target: /etc/alertmanager/alertmanager.yml
+    ports:
+      - "9093:9093"
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:v0.49.1
+    networks: [opsNet]
+    deploy:
+      mode: global
+      placement:
+        constraints:
+          - node.platform.os == linux
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+    ports:
+      - "8081:8080"
+
+  node-exporter:
+    image: prom/node-exporter:v1.8.2
+    networks: [opsNet]
+    deploy:
+      mode: global
+      placement:
+        constraints:
+          - node.platform.os == linux
+    command:
+      - --path.rootfs=/host
+    volumes:
+      - /:/host:ro,rslave
+
+  relay:
+    build:
+      context: ./relay
+    image: burnserv/ai-relay:latest
+    networks: [opsNet]
+    depends_on: [ai-agent]
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.role == manager
+    environment:
+      - OPENAI_MODEL=gpt-4o-mini
+      - AGENT_URL=http://ai-agent:8080
+      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
+    secrets:
+      - source: openai_api_key
+        target: openai_api_key
+    ports:
+      - "8090:8090"
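
Deployment note: docker stack deploy ignores build: sections, and the openai_api_key secret is declared external, so both must exist before the stack comes up. A sketch, assuming the stack is named burnserv (any name works; it prefixes service names, e.g. burnserv_ai-agent):

    # on a manager node, from the repo root
    printf '%s' 'YOUR_KEY_HERE' | docker secret create openai_api_key -
    docker build -t burnserv/ai-agent:latest ./agent
    docker build -t burnserv/ai-relay:latest ./relay
    docker stack deploy -c stack.yml burnserv

Note that the guardrail regex in rules.yaml does not match burnserv_* services, so the agent cannot be turned on the stack itself.
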