first commit
This commit is contained in:
10
agent/Dockerfile
Normal file
10
agent/Dockerfile
Normal file
@@ -0,0 +1,10 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY agent.py rules.yaml ./
|
||||
ENV PORT=8080
|
||||
|
||||
CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "8080"]
|
||||
182
agent/agent.py
Normal file
182
agent/agent.py
Normal file
@@ -0,0 +1,182 @@
|
||||
|
||||
import os
|
||||
import re
|
||||
import yaml
|
||||
import json
|
||||
from fastapi import FastAPI, Request, HTTPException
|
||||
from pydantic import BaseModel
|
||||
import docker
|
||||
|
||||
# Basic AI Ops Agent for Docker Swarm
|
||||
PORT = int(os.getenv("PORT", "8080"))
|
||||
GUARD_DEFAULT_MAX = 25
|
||||
|
||||
# Load rules file if present
|
||||
RULES_PATH = "rules.yaml"
|
||||
if os.path.exists(RULES_PATH):
|
||||
with open(RULES_PATH, "r") as f:
|
||||
RULES = yaml.safe_load(f) or {}
|
||||
else:
|
||||
RULES = {}
|
||||
|
||||
ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
|
||||
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
|
||||
|
||||
# Docker client using the socket
|
||||
client = docker.DockerClient(base_url='unix://var/run/docker.sock')
|
||||
|
||||
app = FastAPI(title="BurnServ AI Ops Agent")
|
||||
|
||||
|
||||
def _guard_service(service_name: str):
|
||||
if not ALLOWED_REGEX.match(service_name):
|
||||
raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
|
||||
|
||||
|
||||
def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
|
||||
_guard_service(service_name)
|
||||
try:
|
||||
svc = client.services.get(service_name)
|
||||
except docker.errors.NotFound:
|
||||
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
|
||||
spec = svc.attrs.get('Spec', {}).copy()
|
||||
mode = spec.get('Mode', {})
|
||||
replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
|
||||
|
||||
if replicas is None:
|
||||
# step-based scaling
|
||||
tgt = int(replicas_current) + int(step or 1)
|
||||
if min_replicas is not None:
|
||||
tgt = max(tgt, int(min_replicas))
|
||||
if max_replicas is not None:
|
||||
tgt = min(tgt, int(max_replicas))
|
||||
replicas = tgt
|
||||
|
||||
replicas = max(1, min(int(replicas), MAX_SCALE))
|
||||
mode['Replicated'] = {'Replicas': replicas}
|
||||
spec['Mode'] = mode
|
||||
|
||||
# svc.update expects keyword args matching the service spec shape; pass the full spec with update
|
||||
try:
|
||||
svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
|
||||
except Exception:
|
||||
# Fallback: use the update with the raw spec (works in many docker-py versions)
|
||||
svc.update(**spec)
|
||||
return {"service": service_name, "replicas": replicas}
|
||||
|
||||
|
||||
def _restart_service(service_name: str):
|
||||
_guard_service(service_name)
|
||||
try:
|
||||
svc = client.services.get(service_name)
|
||||
except docker.errors.NotFound:
|
||||
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
|
||||
spec = svc.attrs.get('Spec', {}).copy()
|
||||
# Force a rolling update by bumping ForceUpdate / Version index
|
||||
try:
|
||||
current_index = svc.attrs.get('Version', {}).get('Index', 0)
|
||||
svc.update(**spec, force_update=current_index + 1)
|
||||
except Exception:
|
||||
# If update signature differs, try a simple update
|
||||
svc.update(**spec)
|
||||
return {"service": service_name, "status": "rolling-restart-issued"}
|
||||
|
||||
|
||||
class Command(BaseModel):
|
||||
text: str = ""
|
||||
action: str | None = None
|
||||
params: dict | None = None
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
return {"ok": True}
|
||||
|
||||
|
||||
@app.post("/command")
|
||||
def command(cmd: Command):
|
||||
\"\"\"Accepts either structured commands or simple text instructions.
|
||||
|
||||
Structured example:
|
||||
{"action":"scale","params":{"service":"weblabs_php","replicas":3}}
|
||||
|
||||
Text example:
|
||||
{"text":"scale weblabs_php to 3"}
|
||||
\"\"\"
|
||||
# Structured commands first
|
||||
if cmd.action:
|
||||
if cmd.action == "scale":
|
||||
p = cmd.params or {}
|
||||
if "service" not in p or "replicas" not in p:
|
||||
raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
|
||||
return _scale(p["service"], replicas=int(p["replicas"]))
|
||||
elif cmd.action == "restart_service":
|
||||
p = cmd.params or {}
|
||||
if "service" not in p:
|
||||
raise HTTPException(status_code=400, detail="Missing service param for restart_service")
|
||||
return _restart_service(p["service"])
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail="Unknown action")
|
||||
|
||||
# Free-text parsing (simple)
|
||||
t = (cmd.text or "").strip().lower()
|
||||
if not t:
|
||||
raise HTTPException(status_code=400, detail="Empty command")
|
||||
|
||||
if t.startswith("scale "):
|
||||
# "scale weblabs_php to 3"
|
||||
parts = t.split()
|
||||
try:
|
||||
svc = parts[1]
|
||||
if "to" in parts:
|
||||
idx_to = parts.index("to")
|
||||
reps = int(parts[idx_to + 1])
|
||||
return _scale(svc, replicas=reps)
|
||||
else:
|
||||
# if no explicit amount, treat as step +1
|
||||
return _scale(svc, step=1)
|
||||
except Exception:
|
||||
raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
|
||||
elif t.startswith("restart "):
|
||||
# "restart weblabs_php"
|
||||
try:
|
||||
svc = t.split()[1]
|
||||
return _restart_service(svc)
|
||||
except Exception:
|
||||
raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
|
||||
|
||||
raise HTTPException(status_code=400, detail="Unrecognized command")
|
||||
|
||||
|
||||
@app.post("/alert")
|
||||
async def alert(request: Request):
|
||||
\"\"\"Receive Alertmanager webhooks and execute configured actions based on rules.yaml.\"\"\"
|
||||
payload = await request.json()
|
||||
alerts = payload.get("alerts", [])
|
||||
executed = []
|
||||
|
||||
for a in alerts:
|
||||
labels = a.get("labels", {}) or {}
|
||||
# For each rule in RULES, check if the match conditions apply
|
||||
for rule in RULES.get("alerts", []):
|
||||
match = rule.get("match", {})
|
||||
if all(labels.get(k) == v for k, v in match.items()):
|
||||
for act in rule.get("actions", []):
|
||||
if "scale" in act:
|
||||
cfg = act["scale"].copy()
|
||||
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
|
||||
res = _scale(
|
||||
service_name=svc,
|
||||
replicas=cfg.get("replicas"),
|
||||
step=cfg.get("step"),
|
||||
min_replicas=cfg.get("min_replicas"),
|
||||
max_replicas=cfg.get("max_replicas", MAX_SCALE),
|
||||
)
|
||||
executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
|
||||
elif "restart_service" in act:
|
||||
cfg = act["restart_service"].copy()
|
||||
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
|
||||
res = _restart_service(svc)
|
||||
executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
|
||||
|
||||
return {"executed": executed}
|
||||
5
agent/requirements.txt
Normal file
5
agent/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.6
|
||||
PyYAML==6.0.2
|
||||
docker==7.1.0
|
||||
httpx==0.27.2
|
||||
30
agent/rules.yaml
Normal file
30
agent/rules.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
alerts:
|
||||
- match:
|
||||
alertname: HighCPU
|
||||
severity: warning
|
||||
actions:
|
||||
- scale:
|
||||
service: "weblabs_php"
|
||||
min_replicas: 2
|
||||
step: 1
|
||||
max_replicas: 10
|
||||
- match:
|
||||
alertname: ServiceDown
|
||||
severity: critical
|
||||
actions:
|
||||
- restart_service:
|
||||
service: "{{ $labels.service_name }}"
|
||||
|
||||
commands:
|
||||
- intent: "scale"
|
||||
schema:
|
||||
service: str
|
||||
replicas: int
|
||||
action:
|
||||
scale:
|
||||
service: "{{service}}"
|
||||
replicas: "{{replicas}}"
|
||||
|
||||
guardrails:
|
||||
allowed_services_regex: "^(weblabs_.*|wordpress_.*|nginx_.*|php_.*|redis_.*|mysql_.*)$"
|
||||
max_scale_replicas: 25
|
||||
Reference in New Issue
Block a user