This commit is contained in:
2025-09-18 21:49:46 +00:00
parent db3c2336ac
commit f059388b26
2 changed files with 76 additions and 77 deletions

View File

@@ -1,17 +1,15 @@
import os, re, yaml
import os
import re
import yaml
import json
from fastapi import FastAPI, Request, HTTPException from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
import docker
# Basic AI Ops Agent for Docker Swarm try:
import docker
except Exception:
docker = None
PORT = int(os.getenv("PORT", "8080")) PORT = int(os.getenv("PORT", "8080"))
GUARD_DEFAULT_MAX = 25 GUARD_DEFAULT_MAX = 25
# Load rules file if present
RULES_PATH = "rules.yaml" RULES_PATH = "rules.yaml"
if os.path.exists(RULES_PATH): if os.path.exists(RULES_PATH):
with open(RULES_PATH, "r") as f: with open(RULES_PATH, "r") as f:
@@ -22,88 +20,104 @@ else:
ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*")) ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX)) MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
# Docker client using the socket DOCKER_SOCK = "/var/run/docker.sock"
client = docker.DockerClient(base_url='unix://var/run/docker.sock') _client = None
def get_client():
    """Return the shared Docker client, creating it on first use.

    The client is stored in the module-level ``_client`` only after a
    successful ``ping()``. A connection attempt that fails is therefore
    retried on the next call instead of leaving an unverified client cached.

    Returns:
        The cached ``docker.DockerClient`` bound to ``DOCKER_SOCK``.

    Raises:
        HTTPException: 500 when the docker SDK could not be imported, when
            ``DOCKER_SOCK`` does not exist, or when the daemon cannot be
            reached through the socket.
    """
    global _client
    if _client is not None:
        return _client
    if docker is None:
        raise HTTPException(status_code=500, detail="docker SDK not available in image.")
    if not os.path.exists(DOCKER_SOCK):
        raise HTTPException(status_code=500, detail=f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
    try:
        # Build and verify a local candidate first; publishing it to the
        # global before ping() succeeds would cache a broken client forever.
        candidate = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
        candidate.ping()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Cannot connect to Docker at {DOCKER_SOCK}: {e}") from e
    _client = candidate
    return _client
app = FastAPI(title="BurnServ AI Ops Agent") app = FastAPI(title="BurnServ AI Ops Agent")
def _guard_service(service_name: str):
    """Enforce the service-name guardrail.

    Returns ``None`` when ``ALLOWED_REGEX.match`` accepts the name. Note that
    ``match`` anchors only at the start of the string, so the configured
    pattern acts as a prefix test unless it ends with ``$``.

    Raises:
        HTTPException: 403 when the name is not accepted by the pattern.
    """
    if ALLOWED_REGEX.match(service_name):
        return
    raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
    """Scale a replicated Swarm service to an absolute count or by a step.

    Args:
        service_name: Name of the service; must pass ``_guard_service``.
        replicas: Absolute target. When ``None`` the target is computed from
            the current replica count plus ``step``.
        step: Increment used when ``replicas`` is ``None``; a missing or zero
            step is treated as ``1``.
        min_replicas: Optional lower bound applied to a step-based target.
        max_replicas: Optional upper bound applied to a step-based target.

    Returns:
        ``{"service": <name>, "replicas": <final count>}`` where the final
        count is clamped to the range ``1..MAX_SCALE``.

    Raises:
        HTTPException: 403 if the name is not allowed, 404 if the service does
            not exist, 400 if the service runs in global mode, 500 for any
            other Docker failure.
    """
    # Guardrails first: a forbidden name must yield 403 even when Docker is
    # unreachable.
    _guard_service(service_name)
    cli = get_client()
    try:
        svc = cli.services.get(service_name)
    except docker.errors.NotFound:
        # `docker` is known to be importable here because get_client() succeeded.
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found") from None
    except Exception as e:
        # Anything else (daemon error, not a swarm manager, ...) is not a 404.
        raise HTTPException(status_code=500, detail=f"Docker error while looking up service '{service_name}': {e}") from e

    mode = svc.attrs.get('Spec', {}).get('Mode', {})
    if 'Global' in mode:
        raise HTTPException(status_code=400, detail=f"Service '{service_name}' runs in global mode and cannot be scaled")
    replicas_current = mode.get('Replicated', {}).get('Replicas', 1)

    if replicas is None:
        tgt = int(replicas_current) + int(step or 1)
        if min_replicas is not None:
            tgt = max(tgt, int(min_replicas))
        if max_replicas is not None:
            tgt = min(tgt, int(max_replicas))
        replicas = tgt

    # Global guardrail: never below one replica, never above MAX_SCALE.
    replicas = max(1, min(int(replicas), MAX_SCALE))

    try:
        # Service.scale() sends only the new replicated mode and lets the SDK
        # merge it with the current spec; passing the raw Spec dict as kwargs
        # to update() is rejected because its keys are not update() arguments.
        svc.scale(replicas)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to scale service '{service_name}': {e}") from e
    return {"service": service_name, "replicas": replicas}
def _restart_service(service_name: str):
    """Trigger a rolling restart of a Swarm service.

    Args:
        service_name: Name of the service; must pass ``_guard_service``.

    Returns:
        ``{"service": <name>, "status": "rolling-restart-issued"}``.

    Raises:
        HTTPException: 403 if the name is not allowed, 404 if the service does
            not exist, 500 for any other Docker failure.
    """
    # Guardrails first: a forbidden name must yield 403 even when Docker is
    # unreachable.
    _guard_service(service_name)
    cli = get_client()
    try:
        svc = cli.services.get(service_name)
    except docker.errors.NotFound:
        # `docker` is known to be importable here because get_client() succeeded.
        raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found") from None
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Docker error while looking up service '{service_name}': {e}") from e

    try:
        # force_update() bumps the task template's ForceUpdate counter on top
        # of the current spec, which redeploys tasks without changing config.
        svc.force_update()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to restart service '{service_name}': {e}") from e
    return {"service": service_name, "status": "rolling-restart-issued"}
class Command(BaseModel):
    """Request body accepted by the ``/command`` endpoint.

    A request is either structured (``action`` plus ``params``) or free text
    (``text``); every field is optional.
    """

    # Free-text instruction; defaults to the empty string.
    text: str = ""
    # Name of a structured action; None when the request is free text.
    action: str | None = None
    # Arguments for the structured action; None when not supplied.
    params: dict | None = None
@app.get("/health")
def health():
    """Liveness endpoint: reports ``ok`` without contacting Docker."""
    return dict(ok=True)
@app.get("/diagnostics")
def diagnostics():
    """Report the agent's runtime environment and Docker connectivity.

    Returns:
        A dict with the socket path and whether it exists, the process
        uid/gid (``"n/a"`` where the platform lacks them), the configured
        port and whether any rules were loaded. ``docker_ping`` is ``True``
        together with ``server_version`` when the daemon answers; otherwise
        it is ``False`` and ``error`` carries the failure text.
    """
    report = {}
    report["docker_sock_exists"] = os.path.exists(DOCKER_SOCK)
    report["docker_sock_path"] = DOCKER_SOCK
    report["uid"] = os.getuid() if hasattr(os, "getuid") else "n/a"
    report["gid"] = os.getgid() if hasattr(os, "getgid") else "n/a"
    report["env_PORT"] = PORT
    report["rules_loaded"] = bool(RULES)

    try:
        cli = get_client()
        report["docker_ping"] = True
        report["server_version"] = cli.version()
    except HTTPException as http_err:
        # get_client() reports its failures as HTTPException; surface the
        # detail instead of propagating the error to the caller.
        report.update(docker_ping=False, error=http_err.detail)
    except Exception as exc:
        report.update(docker_ping=False, error=str(exc))
    return report
@app.post("/command") @app.post("/command")
def command(cmd: Command): def command(cmd: Command):
\"\"\"Accepts either structured commands or simple text instructions.
Structured example:
{"action":"scale","params":{"service":"weblabs_php","replicas":3}}
Text example:
{"text":"scale weblabs_php to 3"}
\"\"\"
# Structured commands first
if cmd.action: if cmd.action:
if cmd.action == "scale": if cmd.action == "scale":
p = cmd.params or {} p = cmd.params or {}
@@ -115,16 +129,13 @@ def command(cmd: Command):
if "service" not in p: if "service" not in p:
raise HTTPException(status_code=400, detail="Missing service param for restart_service") raise HTTPException(status_code=400, detail="Missing service param for restart_service")
return _restart_service(p["service"]) return _restart_service(p["service"])
else: raise HTTPException(status_code=400, detail="Unknown action")
raise HTTPException(status_code=400, detail="Unknown action")
# Free-text parsing (simple)
t = (cmd.text or "").strip().lower() t = (cmd.text or "").strip().lower()
if not t: if not t:
raise HTTPException(status_code=400, detail="Empty command") raise HTTPException(status_code=400, detail="Empty command")
if t.startswith("scale "): if t.startswith("scale "):
# "scale weblabs_php to 3"
parts = t.split() parts = t.split()
try: try:
svc = parts[1] svc = parts[1]
@@ -132,13 +143,10 @@ def command(cmd: Command):
idx_to = parts.index("to") idx_to = parts.index("to")
reps = int(parts[idx_to + 1]) reps = int(parts[idx_to + 1])
return _scale(svc, replicas=reps) return _scale(svc, replicas=reps)
else: return _scale(svc, step=1)
# if no explicit amount, treat as step +1
return _scale(svc, step=1)
except Exception: except Exception:
raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'") raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
elif t.startswith("restart "): if t.startswith("restart "):
# "restart weblabs_php"
try: try:
svc = t.split()[1] svc = t.split()[1]
return _restart_service(svc) return _restart_service(svc)
@@ -147,17 +155,13 @@ def command(cmd: Command):
raise HTTPException(status_code=400, detail="Unrecognized command") raise HTTPException(status_code=400, detail="Unrecognized command")
@app.post("/alert") @app.post("/alert")
async def alert(request: Request): async def alert(request: Request):
\"\"\"Receive Alertmanager webhooks and execute configured actions based on rules.yaml.\"\"\"
payload = await request.json() payload = await request.json()
alerts = payload.get("alerts", []) alerts = payload.get("alerts", [])
executed = [] executed = []
for a in alerts: for a in alerts:
labels = a.get("labels", {}) or {} labels = a.get("labels", {}) or {}
# For each rule in RULES, check if the match conditions apply
for rule in RULES.get("alerts", []): for rule in RULES.get("alerts", []):
match = rule.get("match", {}) match = rule.get("match", {})
if all(labels.get(k) == v for k, v in match.items()): if all(labels.get(k) == v for k, v in match.items()):
@@ -178,5 +182,4 @@ async def alert(request: Request):
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", "")) svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
res = _restart_service(svc) res = _restart_service(svc)
executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res}) executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
return {"executed": executed} return {"executed": executed}

View File

@@ -19,18 +19,16 @@ secrets:
services: services:
ai-agent: ai-agent:
build: image: hostlabs/ai-agent:latest
context: ./agent
image: burnserv/ai-agent:latest
networks: [opsNet] networks: [opsNet]
ports: ports:
- "8080:8080" - "8080:8080" # expose only if you want host access; remove if internal-only
deploy: deploy:
mode: replicated mode: replicated
replicas: 1 replicas: 1
placement: placement:
constraints: constraints:
- node.role == manager - node.role == manager # must be on a manager to control Swarm via docker.sock
labels: labels:
- "ai.agent=true" - "ai.agent=true"
volumes: volumes:
@@ -39,6 +37,26 @@ services:
- source: rules.yaml - source: rules.yaml
target: /app/rules.yaml target: /app/rules.yaml
relay:
image: hostlabs/ai-relay:latest
networks: [opsNet]
depends_on: [ai-agent]
environment:
- OPENAI_MODEL=gpt-4o-mini
- AGENT_URL=http://ai-agent:8080 # DNS name on opsNet
- OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
secrets:
- source: openai_api_key
target: openai_api_key
ports:
- "8090:8090"
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
prometheus: prometheus:
image: prom/prometheus:v2.55.0 image: prom/prometheus:v2.55.0
networks: [opsNet] networks: [opsNet]
@@ -95,25 +113,3 @@ services:
- --path.rootfs=/host - --path.rootfs=/host
volumes: volumes:
- /:/host:ro,rslave - /:/host:ro,rslave
relay:
build:
context: ./relay
image: burnserv/ai-relay:latest
networks: [opsNet]
depends_on: [ai-agent]
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
environment:
- OPENAI_MODEL=gpt-4o-mini
- AGENT_URL=http://ai-agent:8080
- OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
secrets:
- source: openai_api_key
target: openai_api_key
ports:
- "8090:8090"