first commit

commit 0473fdb225
2025-09-18 14:37:09 -04:00
13 changed files with 481 additions and 0 deletions

.env.example Normal file

@@ -0,0 +1,5 @@
# Copy to .env and customize (local runs only; the Swarm stack reads the key from a Docker secret)
OPENAI_API_KEY=YOUR_KEY_HERE
OPENAI_MODEL=gpt-4o-mini
AGENT_URL=http://ai-agent:8080

README.md Normal file (empty)

agent/Dockerfile Normal file

@@ -0,0 +1,10 @@
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY agent.py rules.yaml ./
ENV PORT=8080
CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8080"]

agent/agent.py Normal file

@@ -0,0 +1,182 @@
import os
import re

import docker
import yaml
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel

# Basic AI Ops Agent for Docker Swarm
PORT = int(os.getenv("PORT", "8080"))
GUARD_DEFAULT_MAX = 25

# Load rules file if present
RULES_PATH = "rules.yaml"
if os.path.exists(RULES_PATH):
    with open(RULES_PATH, "r") as f:
        RULES = yaml.safe_load(f) or {}
else:
    RULES = {}

ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))

# Docker client using the socket mounted into the container
client = docker.DockerClient(base_url="unix://var/run/docker.sock")

app = FastAPI(title="BurnServ AI Ops Agent")


def _guard_service(service_name: str):
    if not ALLOWED_REGEX.match(service_name):
        raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")


def _scale(service_name: str, replicas: int | None = None, step: int | None = None,
           min_replicas: int | None = None, max_replicas: int | None = None):
    _guard_service(service_name)
    try:
        svc = client.services.get(service_name)
    except docker.errors.NotFound:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    mode = svc.attrs.get("Spec", {}).get("Mode", {})
    replicas_current = mode.get("Replicated", {}).get("Replicas", 1)
    if replicas is None:
        # Step-based scaling relative to the current replica count
        tgt = int(replicas_current) + int(step or 1)
        if min_replicas is not None:
            tgt = max(tgt, int(min_replicas))
        if max_replicas is not None:
            tgt = min(tgt, int(max_replicas))
        replicas = tgt
    # Clamp to the global guardrail either way
    replicas = max(1, min(int(replicas), MAX_SCALE))
    # docker-py ships Service.scale() for replicated services, so there is no
    # need to hand-roll a spec update
    svc.scale(replicas)
    return {"service": service_name, "replicas": replicas}


def _restart_service(service_name: str):
    _guard_service(service_name)
    try:
        svc = client.services.get(service_name)
    except docker.errors.NotFound:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    # Service.force_update() bumps ForceUpdate in the task template, which
    # triggers a rolling restart without touching the rest of the spec
    svc.force_update()
    return {"service": service_name, "status": "rolling-restart-issued"}


class Command(BaseModel):
    text: str = ""
    action: str | None = None
    params: dict | None = None


@app.get("/health")
def health():
    return {"ok": True}


@app.post("/command")
def command(cmd: Command):
    """Accept either structured commands or simple text instructions.

    Structured example:
        {"action":"scale","params":{"service":"weblabs_php","replicas":3}}
    Text example:
        {"text":"scale weblabs_php to 3"}
    """
    # Structured commands first
    if cmd.action:
        if cmd.action == "scale":
            p = cmd.params or {}
            if "service" not in p or "replicas" not in p:
                raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
            return _scale(p["service"], replicas=int(p["replicas"]))
        elif cmd.action == "restart_service":
            p = cmd.params or {}
            if "service" not in p:
                raise HTTPException(status_code=400, detail="Missing service param for restart_service")
            return _restart_service(p["service"])
        else:
            raise HTTPException(status_code=400, detail="Unknown action")

    # Free-text parsing (simple)
    t = (cmd.text or "").strip().lower()
    if not t:
        raise HTTPException(status_code=400, detail="Empty command")
    if t.startswith("scale "):
        # "scale weblabs_php to 3"
        parts = t.split()
        try:
            svc = parts[1]
            if "to" in parts:
                idx_to = parts.index("to")
                reps = int(parts[idx_to + 1])
                return _scale(svc, replicas=reps)
            else:
                # No explicit amount: treat as step +1
                return _scale(svc, step=1)
        except HTTPException:
            raise  # let guardrail / not-found errors through unchanged
        except Exception:
            raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
    elif t.startswith("restart "):
        # "restart weblabs_php"
        try:
            svc = t.split()[1]
            return _restart_service(svc)
        except HTTPException:
            raise
        except Exception:
            raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
    raise HTTPException(status_code=400, detail="Unrecognized command")


@app.post("/alert")
async def alert(request: Request):
    """Receive Alertmanager webhooks and run the actions configured in rules.yaml."""
    payload = await request.json()
    alerts = payload.get("alerts", [])
    executed = []
    for a in alerts:
        labels = a.get("labels", {}) or {}
        # For each rule in RULES, check whether the match conditions apply
        for rule in RULES.get("alerts", []):
            match = rule.get("match", {})
            if all(labels.get(k) == v for k, v in match.items()):
                for act in rule.get("actions", []):
                    if "scale" in act:
                        cfg = act["scale"].copy()
                        svc = cfg.get("service", "").replace(
                            "{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _scale(
                            service_name=svc,
                            replicas=cfg.get("replicas"),
                            step=cfg.get("step"),
                            min_replicas=cfg.get("min_replicas"),
                            max_replicas=cfg.get("max_replicas", MAX_SCALE),
                        )
                        executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
                    elif "restart_service" in act:
                        cfg = act["restart_service"].copy()
                        svc = cfg.get("service", "").replace(
                            "{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _restart_service(svc)
                        executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
    return {"executed": executed}

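The /command endpoint can be exercised directly before the LLM relay is involved. A quick smoke test, assuming the agent's published port 8080 from stack.yml (the payloads mirror the docstring above):

# Structured command
curl -s -X POST http://localhost:8080/command \
  -H 'Content-Type: application/json' \
  -d '{"action":"scale","params":{"service":"weblabs_php","replicas":3}}'

# Free-text command
curl -s -X POST http://localhost:8080/command \
  -H 'Content-Type: application/json' \
  -d '{"text":"scale weblabs_php to 3"}'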
agent/requirements.txt Normal file

@@ -0,0 +1,5 @@
fastapi==0.115.0
uvicorn[standard]==0.30.6
PyYAML==6.0.2
docker==7.1.0
httpx==0.27.2

agent/rules.yaml Normal file

@@ -0,0 +1,30 @@
alerts:
  - match:
      alertname: HighCPU
      severity: warning
    actions:
      - scale:
          service: "weblabs_php"
          min_replicas: 2
          step: 1
          max_replicas: 10
  - match:
      alertname: ServiceDown
      severity: critical
    actions:
      - restart_service:
          service: "{{ $labels.service_name }}"
commands:
  - intent: "scale"
    schema:
      service: str
      replicas: int
    action:
      scale:
        service: "{{service}}"
        replicas: "{{replicas}}"
guardrails:
  allowed_services_regex: "^(weblabs_.*|wordpress_.*|nginx_.*|php_.*|redis_.*|mysql_.*)$"
  max_scale_replicas: 25

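Rule matching in /alert can be tested without Prometheus or Alertmanager by posting a hand-rolled payload in Alertmanager's webhook shape; a sketch that should trigger the HighCPU scale rule above:

curl -s -X POST http://localhost:8080/alert \
  -H 'Content-Type: application/json' \
  -d '{"alerts":[{"labels":{"alertname":"HighCPU","severity":"warning","service_name":"weblabs_php"}}]}'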
monitoring/alertmanager.yml Normal file

@@ -0,0 +1,12 @@
route:
  receiver: 'ai-agent'
  group_by: ['alertname', 'service_name']
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 15m
receivers:
  - name: 'ai-agent'
    webhook_configs:
      - url: 'http://ai-agent:8080/alert'
        max_alerts: 10

monitoring/prometheus.yml Normal file

@@ -0,0 +1,12 @@
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'prometheus'
    static_configs: [{ targets: ['prometheus:9090'] }]
  - job_name: 'cadvisor'
    static_configs: [{ targets: ['cadvisor:8080'] }]
  - job_name: 'node'
    static_configs: [{ targets: ['node-exporter:9100'] }]

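Note that this commit ships only scrape targets: nothing here defines the HighCPU or ServiceDown alerts that agent/rules.yaml reacts to, and Prometheus is not yet pointed at Alertmanager. A minimal sketch of the missing wiring (the rule file name, expression, and 80% threshold are assumptions, not part of this commit):

# Extra prometheus.yml sections (hypothetical)
alerting:
  alertmanagers:
    - static_configs: [{ targets: ['alertmanager:9093'] }]
rule_files:
  - /etc/prometheus/alert-rules.yml

# /etc/prometheus/alert-rules.yml (hypothetical example, deriving the
# service_name label from the swarm label cAdvisor puts on container metrics)
groups:
  - name: swarm-alerts
    rules:
      - alert: HighCPU
        expr: |
          sum by (service_name) (
            label_replace(rate(container_cpu_usage_seconds_total[2m]),
              "service_name", "$1",
              "container_label_com_docker_swarm_service_name", "(.+)")
          ) > 0.8
        for: 2m
        labels:
          severity: warning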
relay/Dockerfile Normal file

@@ -0,0 +1,13 @@
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY relay.py ./
ENV PORT=8090
# Reads API key from OPENAI_API_KEY or OPENAI_API_KEY_FILE (/run/secrets/openai_api_key)
# AGENT_URL defaults to http://ai-agent:8080
# OPENAI_MODEL defaults to gpt-4o-mini
CMD ["uvicorn", "relay:app", "--host", "0.0.0.0", "--port", "8090"]

relay/client.sh Normal file

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Simple CLI to talk to the relay
# Usage: ./client.sh "scale weblabs_php to 3"
set -euo pipefail
PROMPT="${1:-}"
if [ -z "$PROMPT" ]; then
  echo "Usage: $0 \"your request\"" >&2
  exit 1
fi
curl -s -X POST http://localhost:8090/chat \
  -H 'Content-Type: application/json' \
  -d "$(jq -n --arg p "$PROMPT" '{prompt:$p}')" | jq .

relay/relay.py Normal file

@@ -0,0 +1,76 @@
import json
import os

import httpx
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

AGENT_URL = os.getenv("AGENT_URL", "http://ai-agent:8080")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")


def _read_api_key():
    # Prefer the file from a Docker secret if present
    path = os.getenv("OPENAI_API_KEY_FILE", "/run/secrets/openai_api_key")
    if os.path.exists(path):
        with open(path, "r") as f:
            return f.read().strip()
    return os.getenv("OPENAI_API_KEY", "")


SYSTEM_PROMPT = (
    "You are an ops command planner. Convert the user's intent into a STRICT JSON object "
    "with fields: action (scale|restart_service), params (dict). No prose. Examples: "
    '{"action":"scale","params":{"service":"weblabs_php","replicas":3}} '
    'or {"action":"restart_service","params":{"service":"weblabs_php"}}. '
    "Only produce valid JSON. If unclear, choose the safest no-op."
)


class ChatIn(BaseModel):
    prompt: str


app = FastAPI(title="AI Relay (LLM -> Agent)")


@app.get("/health")
def health():
    return {"ok": True}


def _extract_text(data: dict) -> str:
    # The raw Responses API payload carries an 'output' list whose message
    # items hold 'content' parts of type 'output_text'; SDK-shaped payloads
    # may expose a flat 'output_text' field instead, so try that first.
    text = data.get("output_text")
    if isinstance(text, str) and text:
        return text
    for item in data.get("output", []):
        for part in item.get("content", []) or []:
            if part.get("type") in ("output_text", "text") and part.get("text"):
                return part["text"]
    return ""


@app.post("/chat")
async def chat(inp: ChatIn):
    api_key = _read_api_key()
    if not api_key:
        raise HTTPException(500, "Missing OPENAI_API_KEY (env or secret).")

    # Call the OpenAI Responses API
    url = "https://api.openai.com/v1/responses"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    body = {
        "model": OPENAI_MODEL,
        "input": f"{SYSTEM_PROMPT}\nUSER: {inp.prompt}",
        "max_output_tokens": 300,
        "temperature": 0.1,
    }
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.post(url, headers=headers, json=body)
    if r.status_code >= 400:
        raise HTTPException(502, f"OpenAI error: {r.text}")
    content = _extract_text(r.json())

    # Parse JSON from the model output
    try:
        cmd = json.loads(content)
    except Exception as e:
        raise HTTPException(500, f"Failed to parse model JSON: {e}; content={content[:200]}")

    # Forward the structured command to the agent
    async with httpx.AsyncClient(timeout=15) as client:
        r = await client.post(f"{AGENT_URL}/command", json=cmd)
    if r.status_code >= 400:
        raise HTTPException(r.status_code, f"Agent error: {r.text}")
    return r.json()

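End to end, the relay turns a prompt into a structured /command call and passes the agent's JSON response back to the caller. A direct check without client.sh, assuming the published port 8090 from stack.yml:

curl -s -X POST http://localhost:8090/chat \
  -H 'Content-Type: application/json' \
  -d '{"prompt":"scale weblabs_php to 3"}'
# Expected shape on success (from the agent): {"service":"weblabs_php","replicas":3}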
relay/requirements.txt Normal file

@@ -0,0 +1,6 @@
fastapi==0.115.0
uvicorn[standard]==0.30.6
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1

stack.yml Normal file

@@ -0,0 +1,119 @@
version: "3.9"

networks:
  opsNet:
    driver: overlay
    attachable: true

configs:
  prometheus.yml:
    file: ./monitoring/prometheus.yml
  alertmanager.yml:
    file: ./monitoring/alertmanager.yml
  rules.yaml:
    file: ./agent/rules.yaml

secrets:
  openai_api_key:
    external: true

services:
  ai-agent:
    build:
      context: ./agent
    image: burnserv/ai-agent:latest
    networks: [opsNet]
    ports:
      - "8080:8080"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      labels:
        - "ai.agent=true"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    configs:
      - source: rules.yaml
        target: /app/rules.yaml

  prometheus:
    image: prom/prometheus:v2.55.0
    networks: [opsNet]
    deploy:
      mode: replicated
      replicas: 1
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
    configs:
      - source: prometheus.yml
        target: /etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  alertmanager:
    image: prom/alertmanager:v0.27.0
    networks: [opsNet]
    deploy:
      mode: replicated
      replicas: 1
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
    configs:
      - source: alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    networks: [opsNet]
    deploy:
      mode: global
      placement:
        constraints:
          - node.platform.os == linux
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8081:8080"

  node-exporter:
    image: prom/node-exporter:v1.8.2
    networks: [opsNet]
    deploy:
      mode: global
      placement:
        constraints:
          - node.platform.os == linux
    command:
      - --path.rootfs=/host
    volumes:
      - /:/host:ro,rslave

  relay:
    build:
      context: ./relay
    image: burnserv/ai-relay:latest
    networks: [opsNet]
    depends_on: [ai-agent]
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    environment:
      - OPENAI_MODEL=gpt-4o-mini
      - AGENT_URL=http://ai-agent:8080
      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
    secrets:
      - source: openai_api_key
        target: openai_api_key
    ports:
      - "8090:8090"