This commit is contained in:
2025-09-19 18:31:57 +00:00
parent ecc0b6642f
commit 48bde05d50
4 changed files with 393 additions and 6 deletions

185
agent/agent-fixed.py Normal file
View File

@@ -0,0 +1,185 @@
import os, re, yaml
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel
try:
import docker
except Exception:
docker = None
PORT = int(os.getenv("PORT", "8080"))
GUARD_DEFAULT_MAX = 25
RULES_PATH = "rules.yaml"
if os.path.exists(RULES_PATH):
with open(RULES_PATH, "r") as f:
RULES = yaml.safe_load(f) or {}
else:
RULES = {}
ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
DOCKER_SOCK = "/var/run/docker.sock"
_client = None
def get_client():
global _client
if _client is not None:
return _client
if docker is None:
raise HTTPException(status_code=500, detail="docker SDK not available in image.")
if not os.path.exists(DOCKER_SOCK):
raise HTTPException(status_code=500, detail=f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
try:
_client = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
_client.ping()
return _client
except Exception as e:
raise HTTPException(status_code=500, detail=f"Cannot connect to Docker at {DOCKER_SOCK}: {e}")
app = FastAPI(title="BurnServ AI Ops Agent")
def _guard_service(service_name: str):
if not ALLOWED_REGEX.match(service_name):
raise HTTPException(status_code=403, detail=f"Service '{service_name}' not allowed by guardrails.")
def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
cli = get_client()
_guard_service(service_name)
try:
svc = cli.services.get(service_name)
except Exception:
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
spec = svc.attrs.get('Spec', {}).copy()
mode = spec.get('Mode', {})
replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
if replicas is None:
tgt = int(replicas_current) + int(step or 1)
if min_replicas is not None:
tgt = max(tgt, int(min_replicas))
if max_replicas is not None:
tgt = min(tgt, int(max_replicas))
replicas = tgt
replicas = max(1, min(int(replicas), MAX_SCALE))
mode['Replicated'] = {'Replicas': replicas}
spec['Mode'] = mode
try:
svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
except Exception:
svc.update(**spec)
return {"service": service_name, "replicas": replicas}
def _restart_service(service_name: str):
cli = get_client()
_guard_service(service_name)
try:
svc = cli.services.get(service_name)
except Exception:
raise HTTPException(status_code=404, detail=f"Service '{service_name}' not found")
spec = svc.attrs.get('Spec', {}).copy()
try:
current_index = svc.attrs.get('Version', {}).get('Index', 0)
svc.update(**spec, force_update=current_index + 1)
except Exception:
svc.update(**spec)
return {"service": service_name, "status": "rolling-restart-issued"}
class Command(BaseModel):
text: str = ""
action: str | None = None
params: dict | None = None
@app.get("/health")
def health():
return {"ok": True}
@app.get("/diagnostics")
def diagnostics():
info = {
"docker_sock_exists": os.path.exists(DOCKER_SOCK),
"docker_sock_path": DOCKER_SOCK,
"uid": os.getuid() if hasattr(os, "getuid") else "n/a",
"gid": os.getgid() if hasattr(os, "getgid") else "n/a",
"env_PORT": PORT,
"rules_loaded": bool(RULES),
}
try:
cli = get_client()
info["docker_ping"] = True
info["server_version"] = cli.version()
except HTTPException as he:
info["docker_ping"] = False
info["error"] = he.detail
except Exception as e:
info["docker_ping"] = False
info["error"] = str(e)
return info
@app.post("/command")
def command(cmd: Command):
if cmd.action:
if cmd.action == "scale":
p = cmd.params or {}
if "service" not in p or "replicas" not in p:
raise HTTPException(status_code=400, detail="Missing params for scale: service, replicas")
return _scale(p["service"], replicas=int(p["replicas"]))
elif cmd.action == "restart_service":
p = cmd.params or {}
if "service" not in p:
raise HTTPException(status_code=400, detail="Missing service param for restart_service")
return _restart_service(p["service"])
raise HTTPException(status_code=400, detail="Unknown action")
t = (cmd.text or "").strip().lower()
if not t:
raise HTTPException(status_code=400, detail="Empty command")
if t.startswith("scale "):
parts = t.split()
try:
svc = parts[1]
if "to" in parts:
idx_to = parts.index("to")
reps = int(parts[idx_to + 1])
return _scale(svc, replicas=reps)
return _scale(svc, step=1)
except Exception:
raise HTTPException(status_code=400, detail="Format: 'scale <service> to <n>'")
if t.startswith("restart "):
try:
svc = t.split()[1]
return _restart_service(svc)
except Exception:
raise HTTPException(status_code=400, detail="Format: 'restart <service>'")
raise HTTPException(status_code=400, detail="Unrecognized command")
@app.post("/alert")
async def alert(request: Request):
payload = await request.json()
alerts = payload.get("alerts", [])
executed = []
for a in alerts:
labels = a.get("labels", {}) or {}
for rule in RULES.get("alerts", []):
match = rule.get("match", {})
if all(labels.get(k) == v for k, v in match.items()):
for act in rule.get("actions", []):
if "scale" in act:
cfg = act["scale"].copy()
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
res = _scale(
service_name=svc,
replicas=cfg.get("replicas"),
step=cfg.get("step"),
min_replicas=cfg.get("min_replicas"),
max_replicas=cfg.get("max_replicas", MAX_SCALE),
)
executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
elif "restart_service" in act:
cfg = act["restart_service"].copy()
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
res = _restart_service(svc)
executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
return {"executed": executed}

182
agent/agentv2.py Normal file
View File

@@ -0,0 +1,182 @@
import os, re, yaml
from fastapi import FastAPI, Request, HTTPException
from pydantic import BaseModel
try:
import docker
except Exception:
docker = None
PORT = int(os.getenv("PORT", "8080"))
GUARD_DEFAULT_MAX = 25
RULES_PATH = "rules.yaml"
if os.path.exists(RULES_PATH):
with open(RULES_PATH, "r") as f:
RULES = yaml.safe_load(f) or {}
else:
RULES = {}
ALLOWED_REGEX = re.compile(RULES.get("guardrails", {}).get("allowed_services_regex", ".*"))
MAX_SCALE = int(RULES.get("guardrails", {}).get("max_scale_replicas", GUARD_DEFAULT_MAX))
DOCKER_SOCK = "/var/run/docker.sock"
_client = None
def get_client():
global _client
if _client is not None:
return _client
if docker is None:
raise HTTPException(500, "docker SDK not available in image.")
if not os.path.exists(DOCKER_SOCK):
raise HTTPException(500, f"Docker socket not found at {DOCKER_SOCK}. Did you mount it?")
try:
_client = docker.DockerClient(base_url=f'unix://{DOCKER_SOCK}')
_client.ping()
return _client
except Exception as e:
raise HTTPException(500, f"Cannot connect to Docker at {DOCKER_SOCK}: {e}")
app = FastAPI(title="BurnServ AI Ops Agent")
def _guard_service(service_name: str):
if not ALLOWED_REGEX.match(service_name):
raise HTTPException(403, f"Service '{service_name}' not allowed by guardrails.")
def _scale(service_name: str, replicas: int = None, step: int = None, min_replicas: int = None, max_replicas: int = None):
cli = get_client()
_guard_service(service_name)
try:
svc = cli.services.get(service_name)
except Exception:
raise HTTPException(404, f"Service '{service_name}' not found")
spec = svc.attrs.get('Spec', {}).copy()
mode = spec.get('Mode', {})
replicas_current = mode.get('Replicated', {}).get('Replicas', 1)
if replicas is None:
tgt = int(replicas_current) + int(step or 1)
if min_replicas is not None: tgt = max(tgt, int(min_replicas))
if max_replicas is not None: tgt = min(tgt, int(max_replicas))
replicas = tgt
replicas = max(1, min(int(replicas), MAX_SCALE))
mode['Replicated'] = {'Replicas': replicas}
spec['Mode'] = mode
try:
svc.update(task_template=spec.get('TaskTemplate'), **{k: v for k, v in spec.items() if k != 'TaskTemplate'})
except Exception:
svc.update(**spec)
return {"service": service_name, "replicas": replicas}
def _restart_service(service_name: str):
cli = get_client()
_guard_service(service_name)
try:
svc = cli.services.get(service_name)
except Exception:
raise HTTPException(404, f"Service '{service_name}' not found")
# Use SDK-native ForceUpdate bump with current spec fetched by the SDK
try:
# Passing fetch_current_spec=True lets docker-py pull the live spec,
# and force_update tells Swarm to redeploy tasks (rolling restart)
svc.update(fetch_current_spec=True, force_update=(int(os.getenv("FORCE_UPDATE_BUMP", "1"))))
return {"service": service_name, "status": "rolling-restart-issued"}
except Exception as e:
# Fallback: manually bump TaskTemplate.ForceUpdate in-place
spec = svc.attrs.get('Spec', {}).copy()
tt = spec.get('TaskTemplate') or {}
fu = int(tt.get('ForceUpdate') or 0)
tt['ForceUpdate'] = fu + 1
spec['TaskTemplate'] = tt
try:
svc.update(task_template=spec.get('TaskTemplate'),
name=spec.get('Name'),
labels=spec.get('Labels'),
mode=spec.get('Mode'),
update_config=spec.get('UpdateConfig'),
networks=spec.get('Networks'),
endpoint_spec=spec.get('EndpointSpec'))
return {"service": service_name, "status": "rolling-restart-issued"}
except Exception as e2:
raise HTTPException(500, f"Restart failed: {e2}")
class Command(BaseModel):
text: str = ""
action: str | None = None
params: dict | None = None
@app.get("/health")
def health():
return {"ok": True}
@app.get("/diagnostics")
def diagnostics():
info = {"docker_sock_exists": os.path.exists(DOCKER_SOCK), "rules_loaded": bool(RULES)}
try:
cli = get_client()
info["docker_ping"] = True
info["server_version"] = cli.version()
except Exception as e:
info["docker_ping"] = False
info["error"] = str(e)
return info
@app.post("/command")
def command(cmd: Command):
if cmd.action:
if cmd.action == "scale":
p = cmd.params or {}
if "service" not in p or "replicas" not in p:
raise HTTPException(400, "Missing params for scale: service, replicas")
return _scale(p["service"], replicas=int(p["replicas"]))
if cmd.action == "restart_service":
p = cmd.params or {}
if "service" not in p:
raise HTTPException(400, "Missing service param for restart_service")
return _restart_service(p["service"])
raise HTTPException(400, "Unknown action")
t = (cmd.text or "").strip().lower()
if not t:
raise HTTPException(400, "Empty command")
if t.startswith("scale "):
parts = t.split()
try:
svc = parts[1]
if "to" in parts:
idx_to = parts.index("to")
reps = int(parts[idx_to + 1])
return _scale(svc, replicas=reps)
return _scale(svc, step=1)
except Exception:
raise HTTPException(400, "Format: 'scale <service> to <n>'")
if t.startswith("restart "):
try:
svc = t.split()[1]
return _restart_service(svc)
except Exception:
raise HTTPException(400, "Format: 'restart <service>'")
raise HTTPException(400, "Unrecognized command")
@app.post("/alert")
async def alert(request: Request):
payload = await request.json()
alerts = payload.get("alerts", [])
executed = []
for a in alerts:
labels = a.get("labels", {}) or {}
for rule in RULES.get("alerts", []):
match = rule.get("match", {})
if all(labels.get(k) == v for k, v in match.items()):
for act in rule.get("actions", []):
if "scale" in act:
cfg = act["scale"].copy()
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
res = _scale(service_name=svc,
replicas=cfg.get("replicas"),
step=cfg.get("step"),
min_replicas=cfg.get("min_replicas"),
max_replicas=cfg.get("max_replicas", MAX_SCALE))
executed.append({"alert": labels.get("alertname"), "action": "scale", "result": res})
elif "restart_service" in act:
cfg = act["restart_service"].copy()
svc = cfg.get("service", "").replace("{{ $labels.service_name }}", labels.get("service_name", ""))
res = _restart_service(svc)
executed.append({"alert": labels.get("alertname"), "action": "restart_service", "result": res})
return {"executed": executed}

16
agent/rules.safe.yaml Normal file
View File

@@ -0,0 +1,16 @@
alerts: []
commands:
- intent: "scale"
schema:
service: str
replicas: int
action:
scale:
service: "{{service}}"
replicas: "{{replicas}}"
guardrails:
# allow your prefixes and common pools; note the quotes around the regex
allowed_services_regex: "^([a-z0-9-]+_)?(wordpress|php|nginx|redis|mysql|weblabs|burnengine-io).*$
max_scale_replicas: 25

View File

@@ -11,7 +11,9 @@ configs:
alertmanager.yml: alertmanager.yml:
file: ./monitoring/alertmanager.yml file: ./monitoring/alertmanager.yml
rules.yaml: rules.yaml:
file: ./agent/rules.yaml file: ./agent/rules.safe.yaml
agent.py:
file: ./agent/agent-fixed.py
secrets: secrets:
openai_api_key: openai_api_key:
@@ -22,7 +24,7 @@ services:
image: hostlabs/ai-agent:latest image: hostlabs/ai-agent:latest
networks: [opsNet] networks: [opsNet]
ports: ports:
- "8080:8080" # expose only if you want host access; remove if internal-only - "18080:8080" # expose only if you want host access; remove if internal-only
deploy: deploy:
mode: replicated mode: replicated
replicas: 1 replicas: 1
@@ -36,6 +38,8 @@ services:
configs: configs:
- source: rules.yaml - source: rules.yaml
target: /app/rules.yaml target: /app/rules.yaml
- source: agent.py
target: /app/agent.py
relay: relay:
image: hostlabs/ai-relay:latest image: hostlabs/ai-relay:latest
@@ -49,7 +53,7 @@ services:
- source: openai_api_key - source: openai_api_key
target: openai_api_key target: openai_api_key
ports: ports:
- "8090:8090" - "18090:8090"
deploy: deploy:
mode: replicated mode: replicated
replicas: 1 replicas: 1
@@ -69,7 +73,7 @@ services:
- source: prometheus.yml - source: prometheus.yml
target: /etc/prometheus/prometheus.yml target: /etc/prometheus/prometheus.yml
ports: ports:
- "9090:9090" - "19090:9090"
alertmanager: alertmanager:
image: prom/alertmanager:v0.27.0 image: prom/alertmanager:v0.27.0
@@ -83,7 +87,7 @@ services:
- source: alertmanager.yml - source: alertmanager.yml
target: /etc/alertmanager/alertmanager.yml target: /etc/alertmanager/alertmanager.yml
ports: ports:
- "9093:9093" - "19093:9093"
cadvisor: cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.49.1 image: gcr.io/cadvisor/cadvisor:v0.49.1
@@ -99,7 +103,7 @@ services:
- /sys:/sys:ro - /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro - /var/lib/docker/:/var/lib/docker:ro
ports: ports:
- "8081:8080" - "18081:8080"
node-exporter: node-exporter:
image: prom/node-exporter:v1.8.2 image: prom/node-exporter:v1.8.2