first commit
.env.example (new file)
@@ -0,0 +1,5 @@
# Copy to .env and customize (not used by Docker secrets)
OPENAI_API_KEY=YOUR_KEY_HERE
OPENAI_MODEL=gpt-4o-mini
AGENT_URL=http://ai-agent:8080
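When the stack is deployed, the relay reads the key from a Docker secret instead of this file. A minimal sketch of creating that secret on a Swarm manager (the name openai_api_key matches the external secret declared in stack.yml):

    printf '%s' "$OPENAI_API_KEY" | docker secret create openai_api_key -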
agent/Dockerfile (new file)
@@ -0,0 +1,10 @@
FROM python:3.12-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY agent.py rules.yaml ./
ENV PORT=8080

CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8080"]
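For a quick smoke test outside the stack, the container needs the Docker socket mounted, mirroring the volumes entry in stack.yml (a sketch, assuming the image was already built as burnserv/ai-agent:latest; service-level calls still require the host to be a Swarm manager):

    docker run --rm -p 8080:8080 \
      -v /var/run/docker.sock:/var/run/docker.sock \
      burnserv/ai-agent:latest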
agent/agent.py (new file)
@@ -0,0 +1,182 @@
import os
import re
import yaml
import json
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel
import docker

# Basic AI Ops Agent for Docker Swarm
PORT = int(os.getenv("PORT", "8080"))
GUARD_DEFAULT_MAX = 25

# Load rules file if present
RULES_PATH = "rules.yaml"
if os.path.exists(RULES_PATH):
    with open(RULES_PATH, "r") as f:
        RULES = yaml.safe_load(f) or {}
else:
    RULES = {}

ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))

# Docker client using the socket
client = docker.DockerClient(base_url='unix://var/run/docker.sock')

app = FastAPI(title="BurnServ AI Ops Agent")


def _guard_service(service_name: str):
    if not ALLOWED_REGEX.match(service_name):
        raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")


def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
    _guard_service(service_name)
    try:
        svc = client.services.get(service_name)
    except docker.errors.NotFound:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    mode = svc.attrs.get('Spec', {}).get('Mode', {})
    replicas_current = mode.get('Replicated', {}).get('Replicas', 1)

    if replicas is None:
        # step-based scaling
        tgt = int(replicas_current) + int(step or 1)
        if min_replicas is not None:
            tgt = max(tgt, int(min_replicas))
        if max_replicas is not None:
            tgt = min(tgt, int(max_replicas))
        replicas = tgt

    replicas = max(1, min(int(replicas), MAX_SCALE))
    # Service.scale() handles the spec/version plumbing for replicated services,
    # so there is no need to rebuild and resubmit the raw spec by hand.
    svc.scale(replicas)
    return {"service": service_name, "replicas": replicas}


def _restart_service(service_name: str):
    _guard_service(service_name)
    try:
        svc = client.services.get(service_name)
    except docker.errors.NotFound:
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
    # force_update() bumps TaskTemplate.ForceUpdate, triggering a rolling
    # restart of the service's tasks without changing the rest of the spec.
    svc.force_update()
    return {"service": service_name, "status": "rolling-restart-issued"}


class Command(BaseModel):
    text: str = ""
    action: str | None = None
    params: dict | None = None


@app.get("/health")
def health():
    return {"ok": True}


@app.post("/command")
def command(cmd: Command):
    """Accepts either structured commands or simple text instructions.

    Structured example:
    {"action":"scale","params":{"service":"weblabs_php","replicas":3}}

    Text example:
    {"text":"scale weblabs_php to 3"}
    """
    # Structured commands first
    if cmd.action:
        if cmd.action == "scale":
            p = cmd.params or {}
            if "service" not in p or "replicas" not in p:
                raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
            return _scale(p["service"], replicas=int(p["replicas"]))
        elif cmd.action == "restart_service":
            p = cmd.params or {}
            if "service" not in p:
                raise HTTPException(status_code=400, detail="Missing service param for restart_service")
            return _restart_service(p["service"])
        else:
            raise HTTPException(status_code=400, detail="Unknown action")

    # Free-text parsing (simple)
    t = (cmd.text or "").strip().lower()
    if not t:
        raise HTTPException(status_code=400, detail="Empty command")

    if t.startswith("scale "):
        # "scale weblabs_php to 3"
        parts = t.split()
        try:
            svc = parts[1]
            if "to" in parts:
                idx_to = parts.index("to")
                reps = int(parts[idx_to + 1])
                return _scale(svc, replicas=reps)
            else:
                # if no explicit amount, treat as step +1
                return _scale(svc, step=1)
        # Catch only parse errors; a bare `except Exception` would also swallow
        # the 403/404 HTTPExceptions raised inside _scale and remap them to 400.
        except (IndexError, ValueError):
            raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
    elif t.startswith("restart "):
        # "restart weblabs_php"
        try:
            svc = t.split()[1]
        except IndexError:
            raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
        return _restart_service(svc)

    raise HTTPException(status_code=400, detail="Unrecognized command")


@app.post("/alert")
async def alert(request: Request):
    """Receive Alertmanager webhooks and execute configured actions based on rules.yaml."""
    payload = await request.json()
    alerts = payload.get("alerts", [])
    executed = []

    for a in alerts:
        labels = a.get("labels", {}) or {}
        # For each rule in RULES, check if the match conditions apply
        for rule in RULES.get("alerts", []):
            match = rule.get("match", {})
            if all(labels.get(k) == v for k, v in match.items()):
                for act in rule.get("actions", []):
                    if "scale" in act:
                        cfg = act["scale"].copy()
                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _scale(
                            service_name=svc,
                            replicas=cfg.get("replicas"),
                            step=cfg.get("step"),
                            min_replicas=cfg.get("min_replicas"),
                            max_replicas=cfg.get("max_replicas", MAX_SCALE),
                        )
                        executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
                    elif "restart_service" in act:
                        cfg = act["restart_service"].copy()
                        svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
                        res = _restart_service(svc)
                        executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})

    return {"executed": executed}
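Both command shapes described in the docstring can be exercised with curl against the port published in stack.yml (a sketch):

    curl -s -X POST http://localhost:8080/command \
      -H 'Content-Type: application/json' \
      -d '{"action":"scale","params":{"service":"weblabs_php","replicas":3}}'

    curl -s -X POST http://localhost:8080/command \
      -H 'Content-Type: application/json' \
      -d '{"text":"restart weblabs_php"}'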
agent/requirements.txt (new file)
@@ -0,0 +1,5 @@
fastapi==0.115.0
uvicorn[standard]==0.30.6
PyYAML==6.0.2
docker==7.1.0
httpx==0.27.2
agent/rules.yaml (new file)
@@ -0,0 +1,30 @@
alerts:
  - match:
      alertname: HighCPU
      severity: warning
    actions:
      - scale:
          service: "weblabs_php"
          min_replicas: 2
          step: 1
          max_replicas: 10
  - match:
      alertname: ServiceDown
      severity: critical
    actions:
      - restart_service:
          service: "{{ $labels.service_name }}"

commands:
  - intent: "scale"
    schema:
      service: str
      replicas: int
    action:
      scale:
        service: "{{service}}"
        replicas: "{{replicas}}"

guardrails:
  allowed_services_regex: "^(weblabs_.*|wordpress_.*|nginx_.*|php_.*|redis_.*|mysql_.*)$"
  max_scale_replicas: 25
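Since /alert matches rules by exact label equality, the HighCPU rule can be smoke-tested with a hand-built payload in Alertmanager's webhook shape, no Alertmanager required (a sketch):

    curl -s -X POST http://localhost:8080/alert \
      -H 'Content-Type: application/json' \
      -d '{"alerts":[{"labels":{"alertname":"HighCPU","severity":"warning"}}]}'

Against the rule above, this bumps weblabs_php by one replica, clamped between min_replicas 2 and max_replicas 10.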
monitoring/alertmanager.yml (new file)
@@ -0,0 +1,12 @@
route:
  receiver: 'ai-agent'
  group_by: ['alertname', 'service_name']
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 15m

receivers:
  - name: 'ai-agent'
    webhook_configs:
      - url: 'http://ai-agent:8080/alert'
        max_alerts: 10
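If amtool is installed, the file can be validated before deploying (a sketch):

    amtool check-config monitoring/alertmanager.yml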
monitoring/prometheus.yml (new file)
@@ -0,0 +1,12 @@
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs: [{ targets: ['prometheus:9090'] }]

  - job_name: 'cadvisor'
    static_configs: [{ targets: ['cadvisor:8080'] }]

  - job_name: 'node'
    static_configs: [{ targets: ['node-exporter:9100'] }]
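As committed, this config only defines scrape jobs: there is no alerting block pointing Prometheus at Alertmanager and no rule_files entry defining HighCPU or ServiceDown, so nothing would ever reach the agent's /alert webhook. A sketch of the missing wiring (the rule file path is hypothetical and would have to define those alerts):

    alerting:
      alertmanagers:
        - static_configs:
            - targets: ['alertmanager:9093']

    rule_files:
      - /etc/prometheus/alert-rules.yml  # hypothetical; define HighCPU / ServiceDown here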
relay/Dockerfile (new file)
@@ -0,0 +1,13 @@
FROM python:3.12-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY relay.py ./
ENV PORT=8090
# Reads API key from OPENAI_API_KEY or OPENAI_API_KEY_FILE (/run/secrets/openai_api_key)
# AGENT_URL defaults to http://ai-agent:8080
# OPENAI_MODEL defaults to gpt-4o-mini
CMD ["uvicorn", "relay:app", "--host", "0.0.0.0", "--port", "8090"]
relay/client.sh (new file)
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Simple CLI to talk to the relay
# Usage: ./client.sh "scale weblabs_php to 3"
set -euo pipefail
PROMPT="${1:-}"
if [ -z "$PROMPT" ]; then
  echo "Usage: $0 \"your request\"" >&2
  exit 1
fi
curl -s -X POST http://localhost:8090/chat -H 'Content-Type: application/json' -d "$(jq -n --arg p "$PROMPT" '{prompt:$p}')" | jq .
relay/relay.py (new file)
@@ -0,0 +1,76 @@
import os, json, httpx
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

AGENT_URL = os.getenv("AGENT_URL", "http://ai-agent:8080")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")

def _read_api_key():
    # Prefer file from Docker secret if present
    path = os.getenv("OPENAI_API_KEY_FILE", "/run/secrets/openai_api_key")
    if os.path.exists(path):
        return open(path, "r").read().strip()
    return os.getenv("OPENAI_API_KEY", "")

SYSTEM_PROMPT = (
    "You are an ops command planner. Convert the user's intent into a STRICT JSON object "
    "with fields: action (scale|restart_service), params (dict). No prose. Examples: "
    '{"action":"scale","params":{"service":"weblabs_php","replicas":3}} '
    'or {"action":"restart_service","params":{"service":"weblabs_php"}}. '
    "Only produce valid JSON. If unclear, choose the safest no-op."
)

class ChatIn(BaseModel):
    prompt: str

app = FastAPI(title="AI Relay (LLM -> Agent)")

@app.get("/health")
def health():
    return {"ok": True}

@app.post("/chat")
async def chat(inp: ChatIn):
    api_key = _read_api_key()
    if not api_key:
        raise HTTPException(500, "Missing OPENAI_API_KEY (env or secret).")

    # Call OpenAI Responses API
    url = "https://api.openai.com/v1/responses"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    body = {
        "model": OPENAI_MODEL,
        "input": f"{SYSTEM_PROMPT}\nUSER: {inp.prompt}",
        "max_output_tokens": 300,
        "temperature": 0.1
    }

    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.post(url, headers=headers, json=body)
        if r.status_code >= 400:
            raise HTTPException(502, f"OpenAI error: {r.text}")
        data = r.json()

    # The raw Responses API JSON carries the text inside an 'output' list of
    # items whose 'content' parts have type 'output_text'; the flat 'output_text'
    # field is an SDK convenience, so check it only as a fallback.
    content = ""
    for item in data.get("output", []):
        for part in item.get("content", []) or []:
            if part.get("type") == "output_text" and part.get("text"):
                content = part["text"]
                break
        if content:
            break
    if not content:
        content = data.get("output_text") or ""

    # Parse JSON from the model output
    try:
        cmd = json.loads(content)
    except Exception as e:
        raise HTTPException(500, f"Failed to parse model JSON: {e}; content={content[:200]}")

    # Forward to the agent
    async with httpx.AsyncClient(timeout=15) as client:
        r = await client.post(f"{AGENT_URL}/command", json=cmd)
        if r.status_code >= 400:
            raise HTTPException(r.status_code, f"Agent error: {r.text}")
        return r.json()
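End to end, a prompt flows client.sh -> relay /chat -> OpenAI -> agent /command -> Docker. From a host that can reach the relay's published port (a sketch; the response shape comes from _scale's return value):

    ./relay/client.sh "scale weblabs_php to 3"
    # {"service": "weblabs_php", "replicas": 3}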
relay/requirements.txt (new file)
@@ -0,0 +1,6 @@
fastapi==0.115.0
uvicorn[standard]==0.30.6
httpx==0.27.2
pydantic==2.9.2
python-dotenv==1.0.1
stack.yml (new file)
@@ -0,0 +1,119 @@
version: "3.9"

networks:
  opsNet:
    driver: overlay
    attachable: true

configs:
  prometheus.yml:
    file: ./monitoring/prometheus.yml
  alertmanager.yml:
    file: ./monitoring/alertmanager.yml
  rules.yaml:
    file: ./agent/rules.yaml

secrets:
  openai_api_key:
    external: true

services:
  ai-agent:
    build:
      context: ./agent
    image: burnserv/ai-agent:latest
    networks: [opsNet]
    ports:
      - "8080:8080"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      labels:
        - "ai.agent=true"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    configs:
      - source: rules.yaml
        target: /app/rules.yaml

  prometheus:
    image: prom/prometheus:v2.55.0
    networks: [opsNet]
    deploy:
      mode: replicated
      replicas: 1
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
    configs:
      - source: prometheus.yml
        target: /etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  alertmanager:
    image: prom/alertmanager:v0.27.0
    networks: [opsNet]
    deploy:
      mode: replicated
      replicas: 1
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
    configs:
      - source: alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    networks: [opsNet]
    deploy:
      mode: global
      placement:
        constraints:
          - node.platform.os == linux
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8081:8080"

  node-exporter:
    image: prom/node-exporter:v1.8.2
    networks: [opsNet]
    deploy:
      mode: global
      placement:
        constraints:
          - node.platform.os == linux
    command:
      - --path.rootfs=/host
    volumes:
      - /:/host:ro,rslave

  relay:
    build:
      context: ./relay
    image: burnserv/ai-relay:latest
    networks: [opsNet]
    depends_on: [ai-agent]
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    environment:
      - OPENAI_MODEL=gpt-4o-mini
      - AGENT_URL=http://ai-agent:8080
      - OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
    secrets:
      - source: openai_api_key
        target: openai_api_key
    ports:
      - "8090:8090"
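Note that docker stack deploy ignores the build: keys, so both images have to be built beforehand (and pushed to a registry on multi-node clusters); a sketch, with the stack name burnserv assumed:

    docker build -t burnserv/ai-agent:latest ./agent
    docker build -t burnserv/ai-relay:latest ./relay
    docker stack deploy -c stack.yml burnserv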