fixes
This commit is contained in:
103
agent/agent.py
103
agent/agent.py
@@ -1,17 +1,15 @@
|
|||||||
|
import os, re, yaml
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import yaml
|
|
||||||
import json
|
|
||||||
from fastapi import FastAPI, Request, HTTPException
|
from fastapi import FastAPI, Request, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
import docker
|
|
||||||
|
|
||||||
# Basic AI Ops Agent for Docker Swarm
|
try:
|
||||||
|
import docker
|
||||||
|
except Exception:
|
||||||
|
docker = None
|
||||||
|
|
||||||
PORT = int(os.getenv("PORT", "8080"))
|
PORT = int(os.getenv("PORT", "8080"))
|
||||||
GUARD_DEFAULT_MAX = 25
|
GUARD_DEFAULT_MAX = 25
|
||||||
|
|
||||||
# Load rules file if present
|
|
||||||
RULES_PATH = "rules.yaml"
|
RULES_PATH = "rules.yaml"
|
||||||
if os.path.exists(RULES_PATH):
|
if os.path.exists(RULES_PATH):
|
||||||
with open(RULES_PATH, "r") as f:
|
with open(RULES_PATH, "r") as f:
|
||||||
@@ -22,88 +20,104 @@ else:
|
|||||||
ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
|
ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
|
||||||
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
|
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
|
||||||
|
|
||||||
# Docker client using the socket
|
DOCKER_SOCK = "/var/run/docker.sock"
|
||||||
client = docker.DockerClient(base_url='unix://var/run/docker.sock')
|
_client = None
|
||||||
|
|
||||||
|
def get_client():
|
||||||
|
global _client
|
||||||
|
if _client is not None:
|
||||||
|
return _client
|
||||||
|
if docker is None:
|
||||||
|
raise HTTPException(status_code=500, detail="docker SDK not available in image.")
|
||||||
|
if not os.path.exists(DOCKER_SOCK):
|
||||||
|
raise HTTPException(status_code=500, detail=f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
|
||||||
|
try:
|
||||||
|
_client = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
|
||||||
|
_client.ping()
|
||||||
|
return _client
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Cannot connect to Docker at {DOCKER_SOCK}: {e}")
|
||||||
|
|
||||||
app = FastAPI(title="BurnServ AI Ops Agent")
|
app = FastAPI(title="BurnServ AI Ops Agent")
|
||||||
|
|
||||||
|
|
||||||
def _guard_service(service_name: str):
|
def _guard_service(service_name: str):
|
||||||
if not ALLOWED_REGEX.match(service_name):
|
if not ALLOWED_REGEX.match(service_name):
|
||||||
raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
|
raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
|
||||||
|
|
||||||
|
|
||||||
def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
|
def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
|
||||||
|
cli = get_client()
|
||||||
_guard_service(service_name)
|
_guard_service(service_name)
|
||||||
try:
|
try:
|
||||||
svc = client.services.get(service_name)
|
svc = cli.services.get(service_name)
|
||||||
except docker.errors.NotFound:
|
except Exception:
|
||||||
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
|
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
|
||||||
spec = svc.attrs.get('Spec', {}).copy()
|
spec = svc.attrs.get('Spec', {}).copy()
|
||||||
mode = spec.get('Mode', {})
|
mode = spec.get('Mode', {})
|
||||||
replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
|
replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
|
||||||
|
|
||||||
if replicas is None:
|
if replicas is None:
|
||||||
# step-based scaling
|
|
||||||
tgt = int(replicas_current) + int(step or 1)
|
tgt = int(replicas_current) + int(step or 1)
|
||||||
if min_replicas is not None:
|
if min_replicas is not None:
|
||||||
tgt = max(tgt, int(min_replicas))
|
tgt = max(tgt, int(min_replicas))
|
||||||
if max_replicas is not None:
|
if max_replicas is not None:
|
||||||
tgt = min(tgt, int(max_replicas))
|
tgt = min(tgt, int(max_replicas))
|
||||||
replicas = tgt
|
replicas = tgt
|
||||||
|
|
||||||
replicas = max(1, min(int(replicas), MAX_SCALE))
|
replicas = max(1, min(int(replicas), MAX_SCALE))
|
||||||
mode['Replicated'] = {'Replicas': replicas}
|
mode['Replicated'] = {'Replicas': replicas}
|
||||||
spec['Mode'] = mode
|
spec['Mode'] = mode
|
||||||
|
|
||||||
# svc.update expects keyword args matching the service spec shape; pass the full spec with update
|
|
||||||
try:
|
try:
|
||||||
svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
|
svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback: use the update with the raw spec (works in many docker-py versions)
|
|
||||||
svc.update(**spec)
|
svc.update(**spec)
|
||||||
return {"service": service_name, "replicas": replicas}
|
return {"service": service_name, "replicas": replicas}
|
||||||
|
|
||||||
|
|
||||||
def _restart_service(service_name: str):
|
def _restart_service(service_name: str):
|
||||||
|
cli = get_client()
|
||||||
_guard_service(service_name)
|
_guard_service(service_name)
|
||||||
try:
|
try:
|
||||||
svc = client.services.get(service_name)
|
svc = cli.services.get(service_name)
|
||||||
except docker.errors.NotFound:
|
except Exception:
|
||||||
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
|
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
|
||||||
spec = svc.attrs.get('Spec', {}).copy()
|
spec = svc.attrs.get('Spec', {}).copy()
|
||||||
# Force a rolling update by bumping ForceUpdate / Version index
|
|
||||||
try:
|
try:
|
||||||
current_index = svc.attrs.get('Version', {}).get('Index', 0)
|
current_index = svc.attrs.get('Version', {}).get('Index', 0)
|
||||||
svc.update(**spec, force_update=current_index + 1)
|
svc.update(**spec, force_update=current_index + 1)
|
||||||
except Exception:
|
except Exception:
|
||||||
# If update signature differs, try a simple update
|
|
||||||
svc.update(**spec)
|
svc.update(**spec)
|
||||||
return {"service": service_name, "status": "rolling-restart-issued"}
|
return {"service": service_name, "status": "rolling-restart-issued"}
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseModel):
|
class Command(BaseModel):
|
||||||
text: str = ""
|
text: str = ""
|
||||||
action: str | None = None
|
action: str | None = None
|
||||||
params: dict | None = None
|
params: dict | None = None
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health():
|
def health():
|
||||||
return {"ok": True}
|
return {"ok": True}
|
||||||
|
|
||||||
|
@app.get("/diagnostics")
|
||||||
|
def diagnostics():
|
||||||
|
info = {
|
||||||
|
"docker_sock_exists": os.path.exists(DOCKER_SOCK),
|
||||||
|
"docker_sock_path": DOCKER_SOCK,
|
||||||
|
"uid": os.getuid() if hasattr(os, "getuid") else "n/a",
|
||||||
|
"gid": os.getgid() if hasattr(os, "getgid") else "n/a",
|
||||||
|
"env_PORT": PORT,
|
||||||
|
"rules_loaded": bool(RULES),
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
cli = get_client()
|
||||||
|
info["docker_ping"] = True
|
||||||
|
info["server_version"] = cli.version()
|
||||||
|
except HTTPException as he:
|
||||||
|
info["docker_ping"] = False
|
||||||
|
info["error"] = he.detail
|
||||||
|
except Exception as e:
|
||||||
|
info["docker_ping"] = False
|
||||||
|
info["error"] = str(e)
|
||||||
|
return info
|
||||||
|
|
||||||
@app.post("/command")
|
@app.post("/command")
|
||||||
def command(cmd: Command):
|
def command(cmd: Command):
|
||||||
\"\"\"Accepts either structured commands or simple text instructions.
|
|
||||||
|
|
||||||
Structured example:
|
|
||||||
{"action":"scale","params":{"service":"weblabs_php","replicas":3}}
|
|
||||||
|
|
||||||
Text example:
|
|
||||||
{"text":"scale weblabs_php to 3"}
|
|
||||||
\"\"\"
|
|
||||||
# Structured commands first
|
|
||||||
if cmd.action:
|
if cmd.action:
|
||||||
if cmd.action == "scale":
|
if cmd.action == "scale":
|
||||||
p = cmd.params or {}
|
p = cmd.params or {}
|
||||||
@@ -115,16 +129,13 @@ def command(cmd: Command):
|
|||||||
if "service" not in p:
|
if "service" not in p:
|
||||||
raise HTTPException(status_code=400, detail="Missing service param for restart_service")
|
raise HTTPException(status_code=400, detail="Missing service param for restart_service")
|
||||||
return _restart_service(p["service"])
|
return _restart_service(p["service"])
|
||||||
else:
|
raise HTTPException(status_code=400, detail="Unknown action")
|
||||||
raise HTTPException(status_code=400, detail="Unknown action")
|
|
||||||
|
|
||||||
# Free-text parsing (simple)
|
|
||||||
t = (cmd.text or "").strip().lower()
|
t = (cmd.text or "").strip().lower()
|
||||||
if not t:
|
if not t:
|
||||||
raise HTTPException(status_code=400, detail="Empty command")
|
raise HTTPException(status_code=400, detail="Empty command")
|
||||||
|
|
||||||
if t.startswith("scale "):
|
if t.startswith("scale "):
|
||||||
# "scale weblabs_php to 3"
|
|
||||||
parts = t.split()
|
parts = t.split()
|
||||||
try:
|
try:
|
||||||
svc = parts[1]
|
svc = parts[1]
|
||||||
@@ -132,13 +143,10 @@ def command(cmd: Command):
|
|||||||
idx_to = parts.index("to")
|
idx_to = parts.index("to")
|
||||||
reps = int(parts[idx_to + 1])
|
reps = int(parts[idx_to + 1])
|
||||||
return _scale(svc, replicas=reps)
|
return _scale(svc, replicas=reps)
|
||||||
else:
|
return _scale(svc, step=1)
|
||||||
# if no explicit amount, treat as step +1
|
|
||||||
return _scale(svc, step=1)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
|
raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
|
||||||
elif t.startswith("restart "):
|
if t.startswith("restart "):
|
||||||
# "restart weblabs_php"
|
|
||||||
try:
|
try:
|
||||||
svc = t.split()[1]
|
svc = t.split()[1]
|
||||||
return _restart_service(svc)
|
return _restart_service(svc)
|
||||||
@@ -147,17 +155,13 @@ def command(cmd: Command):
|
|||||||
|
|
||||||
raise HTTPException(status_code=400, detail="Unrecognized command")
|
raise HTTPException(status_code=400, detail="Unrecognized command")
|
||||||
|
|
||||||
|
|
||||||
@app.post("/alert")
|
@app.post("/alert")
|
||||||
async def alert(request: Request):
|
async def alert(request: Request):
|
||||||
\"\"\"Receive Alertmanager webhooks and execute configured actions based on rules.yaml.\"\"\"
|
|
||||||
payload = await request.json()
|
payload = await request.json()
|
||||||
alerts = payload.get("alerts", [])
|
alerts = payload.get("alerts", [])
|
||||||
executed = []
|
executed = []
|
||||||
|
|
||||||
for a in alerts:
|
for a in alerts:
|
||||||
labels = a.get("labels", {}) or {}
|
labels = a.get("labels", {}) or {}
|
||||||
# For each rule in RULES, check if the match conditions apply
|
|
||||||
for rule in RULES.get("alerts", []):
|
for rule in RULES.get("alerts", []):
|
||||||
match = rule.get("match", {})
|
match = rule.get("match", {})
|
||||||
if all(labels.get(k) == v for k, v in match.items()):
|
if all(labels.get(k) == v for k, v in match.items()):
|
||||||
@@ -178,5 +182,4 @@ async def alert(request: Request):
|
|||||||
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
|
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
|
||||||
res = _restart_service(svc)
|
res = _restart_service(svc)
|
||||||
executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
|
executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
|
||||||
|
|
||||||
return {"executed": executed}
|
return {"executed": executed}
|
||||||
|
|||||||
50
stack.yml
50
stack.yml
@@ -19,18 +19,16 @@ secrets:
|
|||||||
|
|
||||||
services:
|
services:
|
||||||
ai-agent:
|
ai-agent:
|
||||||
build:
|
image: hostlabs/ai-agent:latest
|
||||||
context: ./agent
|
|
||||||
image: burnserv/ai-agent:latest
|
|
||||||
networks: [opsNet]
|
networks: [opsNet]
|
||||||
ports:
|
ports:
|
||||||
- "8080:8080"
|
- "8080:8080" # expose only if you want host access; remove if internal-only
|
||||||
deploy:
|
deploy:
|
||||||
mode: replicated
|
mode: replicated
|
||||||
replicas: 1
|
replicas: 1
|
||||||
placement:
|
placement:
|
||||||
constraints:
|
constraints:
|
||||||
- node.role == manager
|
- node.role == manager # must be on a manager to control Swarm via docker.sock
|
||||||
labels:
|
labels:
|
||||||
- "ai.agent=true"
|
- "ai.agent=true"
|
||||||
volumes:
|
volumes:
|
||||||
@@ -39,6 +37,26 @@ services:
|
|||||||
- source: rules.yaml
|
- source: rules.yaml
|
||||||
target: /app/rules.yaml
|
target: /app/rules.yaml
|
||||||
|
|
||||||
|
relay:
|
||||||
|
image: hostlabs/ai-relay:latest
|
||||||
|
networks: [opsNet]
|
||||||
|
depends_on: [ai-agent]
|
||||||
|
environment:
|
||||||
|
- OPENAI_MODEL=gpt-4o-mini
|
||||||
|
- AGENT_URL=http://ai-agent:8080 # DNS name on opsNet
|
||||||
|
- OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
|
||||||
|
secrets:
|
||||||
|
- source: openai_api_key
|
||||||
|
target: openai_api_key
|
||||||
|
ports:
|
||||||
|
- "8090:8090"
|
||||||
|
deploy:
|
||||||
|
mode: replicated
|
||||||
|
replicas: 1
|
||||||
|
placement:
|
||||||
|
constraints:
|
||||||
|
- node.role == manager
|
||||||
|
|
||||||
prometheus:
|
prometheus:
|
||||||
image: prom/prometheus:v2.55.0
|
image: prom/prometheus:v2.55.0
|
||||||
networks: [opsNet]
|
networks: [opsNet]
|
||||||
@@ -95,25 +113,3 @@ services:
|
|||||||
- --path.rootfs=/host
|
- --path.rootfs=/host
|
||||||
volumes:
|
volumes:
|
||||||
- /:/host:ro,rslave
|
- /:/host:ro,rslave
|
||||||
|
|
||||||
relay:
|
|
||||||
build:
|
|
||||||
context: ./relay
|
|
||||||
image: burnserv/ai-relay:latest
|
|
||||||
networks: [opsNet]
|
|
||||||
depends_on: [ai-agent]
|
|
||||||
deploy:
|
|
||||||
mode: replicated
|
|
||||||
replicas: 1
|
|
||||||
placement:
|
|
||||||
constraints:
|
|
||||||
- node.role == manager
|
|
||||||
environment:
|
|
||||||
- OPENAI_MODEL=gpt-4o-mini
|
|
||||||
- AGENT_URL=http://ai-agent:8080
|
|
||||||
- OPENAI_API_KEY_FILE=/run/secrets/openai_api_key
|
|
||||||
secrets:
|
|
||||||
- source: openai_api_key
|
|
||||||
target: openai_api_key
|
|
||||||
ports:
|
|
||||||
- "8090:8090"
|
|
||||||
|
|||||||
Reference in New Issue
Block a user