May 1, 2025 update

This commit is contained in:
Sharad Ahlawat
2025-05-01 21:19:17 -07:00
parent a2cdf26594
commit b33d54d723
261 changed files with 2451 additions and 12859 deletions

791
scripts/gstat_exporter.py Executable file → Normal file
View File

@ -1,411 +1,396 @@
from prometheus_client import start_http_server, Gauge # type: ignore
from prometheus_client import start_http_server, Gauge
import argparse
import logging
import datetime
from subprocess import Popen, PIPE
from typing import Dict
from importlib.metadata import PackageNotFoundError, version
# get version
try:
__version__ = version("gstat_exporter")
except PackageNotFoundError:
# package is not installed, version unknown
__version__ = "0.0.0"
class GstatExporter:
def __init__(self, interval: int = 30, grace: int = 30, sleep: int = 15) -> None:
"""Define metrics and other neccesary variables."""
# save interval, grace, and sleep
self.interval = interval
self.grace = grace
self.sleep = sleep
# save the version as a class attribute
self.__version__ = __version__
# define the metric labels
self.labels: list[str] = [
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
]
# define the metrics
self.metrics: dict[str, Gauge] = {}
self.metrics["up"] = Gauge(
"gstat_up",
"The value of this Gauge is always 1 when the gstat_exporter is up",
)
self.metrics["queue"] = Gauge(
"gstat_queue_depth",
"The queue depth for this GEOM",
self.labels,
)
self.metrics["totalops"] = Gauge(
"gstat_total_operations_per_second",
"The total number of operations/second for this GEOM",
self.labels,
)
self.metrics["readops"] = Gauge(
"gstat_read_operations_per_second",
"The number of read operations/second for this GEOM",
self.labels,
)
self.metrics["readsize"] = Gauge(
"gstat_read_size_kilobytes",
"The size in kilobytes of read operations for this GEOM",
self.labels,
)
self.metrics["readkbs"] = Gauge(
"gstat_read_kilobytes_per_second",
"The speed in kilobytes/second of read operations for this GEOM",
self.labels,
)
self.metrics["readms"] = Gauge(
"gstat_miliseconds_per_read",
"The speed in miliseconds/read operation for this GEOM",
self.labels,
)
self.metrics["writeops"] = Gauge(
"gstat_write_operations_per_second",
"The number of write operations/second for this GEOM",
self.labels,
)
self.metrics["writesize"] = Gauge(
"gstat_write_size_kilobytes",
"The size in kilobytes of write operations for this GEOM",
self.labels,
)
self.metrics["writekbs"] = Gauge(
"gstat_write_kilobytes_per_second",
"The speed in kilobytes/second of write operations for this GEOM",
self.labels,
)
self.metrics["writems"] = Gauge(
"gstat_miliseconds_per_write",
"The speed in miliseconds/write operation for this GEOM",
self.labels,
)
self.metrics["deleteops"] = Gauge(
"gstat_delete_operations_per_second",
"The number of delete operations/second for this GEOM",
self.labels,
)
self.metrics["deletesize"] = Gauge(
"gstat_delete_size_kilobytes",
"The size in kilobytes of delete operations for this GEOM",
self.labels,
)
self.metrics["deletekbs"] = Gauge(
"gstat_delete_kilobytes_per_second",
"The speed in kilobytes/second of delete operations for this GEOM",
self.labels,
)
self.metrics["deletems"] = Gauge(
"gstat_miliseconds_per_delete",
"The speed in miliseconds/delete operation for this GEOM",
self.labels,
)
self.metrics["otherops"] = Gauge(
"gstat_other_operations_per_second",
"The number of other operations (BIO_FLUSH)/second for this GEOM",
self.labels,
)
self.metrics["otherms"] = Gauge(
"gstat_miliseconds_per_other",
"The speed in miliseconds/other operation (BIO_FLUSH) for this GEOM",
self.labels,
)
self.metrics["busy"] = Gauge(
"gstat_percent_busy",
"The percent of the time this GEOM is busy",
self.labels,
)
# start with an empty deviceinfo dict and add devices as we see them
self.deviceinfo: Dict[str, Dict[str, str]] = {}
# variables used for checking for removed devices
self.lastcheck = datetime.datetime.now()
self.timestamps: Dict[str, datetime.datetime] = {}
logging.debug("Done initialising GstatExporter class")
def get_deviceinfo(self, name: str) -> Dict[str, str]:
"""
Return a dict of GEOM device info for GEOM devices in class DISK,
for use as labels for the metrics.
Sample output from the geom command:
$ geom -p ada0
Geom class: DISK
Geom name: ada0
Providers:
1. Name: ada0
Mediasize: 250059350016 (233G)
Sectorsize: 512
Mode: r2w2e4
descr: Samsung SSD 860 EVO mSATA 250GB
lunid: 5002538e700b753f
ident: S41MNG0K907238X
rotationrate: 0
fwsectors: 63
fwheads: 16
$
"""
logging.debug(f"Getting deviceinfo for GEOM {name}...")
with Popen(
["geom", "-p", name], stdout=PIPE, bufsize=1, universal_newlines=True
) as p:
result = {}
for line in p.stdout: # type: ignore
# remove excess whitespace
line = line.strip()
# we only care about the DISK class for now
if line[0:12] == "Geom class: " and line[-4:] != "DISK":
break
if line[0:11] == "Mediasize: ":
result["mediasize"] = line[11:]
if line[0:12] == "Sectorsize: ":
result["sectorsize"] = line.split(" ")[1]
if line[0:7] == "descr: ":
result["descr"] = " ".join(line.split(" ")[1:])
if line[0:7] == "lunid: ":
result["lunid"] = line.split(" ")[1]
if line[0:7] == "ident: ":
result["ident"] = line.split(" ")[1]
if line[0:14] == "rotationrate: ":
result["rotationrate"] = line.split(" ")[1]
if line[0:11] == "fwsectors: ":
result["fwsectors"] = line.split(" ")[1]
if line[0:9] == "fwheads: ":
result["fwheads"] = line.split(" ")[1]
logging.debug(f"Returning deviceinfo for {name}: {result}")
return result
def run_gstat_forever(self) -> None:
"""
Run gstat in a loop and update stats per line
"""
logging.debug(f"Running 'gstat -pdosCI {self.sleep}s' (will loop forever)...")
with Popen(
["gstat", "-pdosCI", f"{self.sleep}s"], stdout=PIPE, bufsize=1, universal_newlines=True
) as p:
# loop over lines in the output
for line in p.stdout: # type: ignore
(
timestamp,
name,
queue_depth,
total_operations_per_second,
read_operations_per_second,
read_size_kilobytes,
read_kilobytes_per_second,
miliseconds_per_read,
write_operations_per_second,
write_size_kilobytes,
write_kilobytes_per_second,
miliseconds_per_write,
delete_operations_per_second,
delete_size_kilobytes,
delete_kilobytes_per_second,
miliseconds_per_delete,
other_operations_per_second,
miliseconds_per_other,
percent_busy,
) = line.split(",")
if timestamp == "timestamp":
# skip header line
continue
# first check if this GEOM has been seen before
if name not in self.deviceinfo:
logging.info(f"Adding new GEOM to deviceinfo: {name}")
# this is the first time we see this GEOM
self.deviceinfo[name] = {}
# we always need a value for all labels
for key in self.labels:
self.deviceinfo[name][key] = ""
# get real info from the device if it is class DISK
self.deviceinfo[name].update(self.get_deviceinfo(name))
self.deviceinfo[name].update({"name": name})
# update timestamp to track when this GEOM was last seen
self.timestamps[name] = datetime.datetime.strptime(
timestamp.split(".")[0], "%Y-%m-%d %H:%M:%S"
)
# up is always.. up
self.metrics["up"].set(1)
self.metrics["queue"].labels(**self.deviceinfo[name]).set(queue_depth)
self.metrics["totalops"].labels(**self.deviceinfo[name]).set(
total_operations_per_second
)
self.metrics["readops"].labels(**self.deviceinfo[name]).set(
read_operations_per_second
)
self.metrics["readsize"].labels(**self.deviceinfo[name]).set(
read_size_kilobytes
)
self.metrics["readkbs"].labels(**self.deviceinfo[name]).set(
read_kilobytes_per_second
)
self.metrics["readms"].labels(**self.deviceinfo[name]).set(
miliseconds_per_read
)
self.metrics["writeops"].labels(**self.deviceinfo[name]).set(
write_operations_per_second
)
self.metrics["writesize"].labels(**self.deviceinfo[name]).set(
write_size_kilobytes
)
self.metrics["writekbs"].labels(**self.deviceinfo[name]).set(
write_kilobytes_per_second
)
self.metrics["writems"].labels(**self.deviceinfo[name]).set(
miliseconds_per_write
)
self.metrics["deleteops"].labels(**self.deviceinfo[name]).set(
delete_operations_per_second
)
self.metrics["deletesize"].labels(**self.deviceinfo[name]).set(
delete_size_kilobytes
)
self.metrics["deletekbs"].labels(**self.deviceinfo[name]).set(
delete_kilobytes_per_second
)
self.metrics["deletems"].labels(**self.deviceinfo[name]).set(
miliseconds_per_delete
)
self.metrics["otherops"].labels(**self.deviceinfo[name]).set(
other_operations_per_second
)
self.metrics["otherms"].labels(**self.deviceinfo[name]).set(
miliseconds_per_other
)
self.metrics["busy"].labels(**self.deviceinfo[name]).set(percent_busy)
# check for removed GEOMs
now = datetime.datetime.now()
if (now - self.lastcheck).seconds > self.interval:
logging.debug("Running periodic check for removed devices...")
# enough time has passed since the last check
# loop over devices and check timestamp for each
remove = []
for name in self.deviceinfo.keys():
delta = (now - self.timestamps[name]).seconds
if delta > self.grace:
remove.append(name)
logging.info(
f"It has been {self.grace} seconds since gstat last reported data for GEOM {name} - removing metrics"
)
# loop over the GEOMs for which gstat stopped giving data and remove them
for name in remove:
# it has been too long since we have seen this GEOM, remove it
for metric in self.metrics.keys():
if metric == "up":
# skip the up metric
continue
self.metrics[metric].remove(*self.deviceinfo[name].values())
del self.deviceinfo[name]
self.lastcheck = datetime.datetime.now()
def get_deviceinfo(name: str) -> Dict[str, str]:
"""
Return a dict of GEOM device info for GEOM devices in class DISK,
for use as labels for the metrics.
def main() -> None:
"""Run the main function."""
parser = argparse.ArgumentParser()
Sample output from the geom command:
parser.add_argument(
"-g",
"--grace",
type=int,
help="Stop exporting metrics for a GEOM after gstat has not reported data from it for this many seconds. Defaults to 30 seconds.",
default=30,
)
parser.add_argument(
"-i",
"--interval",
type=int,
help="How many seconds to wait between checking for removed devices. Defaults to 30 seconds.",
default=30,
)
parser.add_argument(
"-l",
"--listen-ip",
type=str,
help="Listen IP. Defaults to 0.0.0.0 (all v4 IPs). Set to :: to listen on all v6 IPs.",
default="0.0.0.0",
)
parser.add_argument(
"-p",
"--port",
type=int,
help="Portnumber. Defaults to 9248.",
default=9248,
)
parser.add_argument(
"-s",
"--sleep",
type=int,
help="How long should gstat sleep between reporting data, in seconds. Set this to the same as your Prometheus scrape interval. Defaults to 15.",
default=15,
)
$ geom -p ada0
Geom class: DISK
Geom name: ada0
Providers:
1. Name: ada0
Mediasize: 250059350016 (233G)
Sectorsize: 512
Mode: r2w2e4
descr: Samsung SSD 860 EVO mSATA 250GB
lunid: 5002538e700b753f
ident: S41MNG0K907238X
rotationrate: 0
fwsectors: 63
fwheads: 16
$
"""
with Popen(
["geom", "-p", name], stdout=PIPE, bufsize=1, universal_newlines=True
) as p:
result = {}
for line in p.stdout:
# remove excess whitespace
line = line.strip()
# we only care about the DISK class for now
if line[0:12] == "Geom class: " and line[-4:] != "DISK":
break
parser.add_argument(
"-d",
"--debug",
action="store_const",
dest="loglevel",
const="DEBUG",
help="Debug mode.",
default="INFO",
)
if line[0:11] == "Mediasize: ":
result["mediasize"] = line[11:]
if line[0:12] == "Sectorsize: ":
result["sectorsize"] = line.split(" ")[1]
if line[0:7] == "descr: ":
result["descr"] = " ".join(line.split(" ")[1:])
if line[0:7] == "lunid: ":
result["lunid"] = line.split(" ")[1]
if line[0:7] == "ident: ":
result["ident"] = line.split(" ")[1]
if line[0:14] == "rotationrate: ":
result["rotationrate"] = line.split(" ")[1]
if line[0:11] == "fwsectors: ":
result["fwsectors"] = line.split(" ")[1]
if line[0:9] == "fwheads: ":
result["fwheads"] = line.split(" ")[1]
return result
args = parser.parse_args()
logging.basicConfig(level=args.loglevel, datefmt="%Y-%m-%d %H:%M:%S %z", format="%(asctime)s - %(module)s - %(levelname)s - %(message)s")
logging.info(f"Starting gstat_exporter v{__version__} - logging at level {args.loglevel}")
logging.info(f"Starting HTTP listener on address '{args.listen_ip}' port '{args.port}'")
start_http_server(addr=args.listen_ip, port=args.port)
exporter = GstatExporter(interval=args.interval, grace=args.grace, sleep=args.sleep)
while True:
exporter.run_gstat_forever()
def process_request() -> None:
"""
Run gstat in a loop and update stats per line
"""
# start with an empty deviceinfo dict and add devices as we see them
deviceinfo: Dict[str, Dict[str, str]] = {}
with Popen(
["gstat", "-pdosCI", "5s"], stdout=PIPE, bufsize=1, universal_newlines=True
) as p:
for line in p.stdout:
(
timestamp,
name,
queue_depth,
total_operations_per_second,
read_operations_per_second,
read_size_kilobytes,
read_kilobytes_per_second,
miliseconds_per_read,
write_operations_per_second,
write_size_kilobytes,
write_kilobytes_per_second,
miliseconds_per_write,
delete_operations_per_second,
delete_size_kilobytes,
delete_kilobytes_per_second,
miliseconds_per_delete,
other_operations_per_second,
miliseconds_per_other,
percent_busy,
) = line.split(",")
if timestamp == "timestamp":
# skip header line
continue
if name not in deviceinfo:
# this is the first time we see this GEOM
deviceinfo[name] = {}
# we always need a value for all labels
for key in [
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
]:
deviceinfo[name][key] = ""
# get real info from the device if it is class DISK
deviceinfo[name].update(get_deviceinfo(name))
deviceinfo[name].update({"name": name})
# up is always.. up
up.set(1)
queue.labels(**deviceinfo[name]).set(queue_depth)
totalops.labels(**deviceinfo[name]).set(total_operations_per_second)
readops.labels(**deviceinfo[name]).set(read_operations_per_second)
readsize.labels(**deviceinfo[name]).set(read_size_kilobytes)
readkbs.labels(**deviceinfo[name]).set(read_kilobytes_per_second)
readms.labels(**deviceinfo[name]).set(miliseconds_per_read)
writeops.labels(**deviceinfo[name]).set(write_operations_per_second)
writesize.labels(**deviceinfo[name]).set(write_size_kilobytes)
writekbs.labels(**deviceinfo[name]).set(write_kilobytes_per_second)
writems.labels(**deviceinfo[name]).set(miliseconds_per_write)
deleteops.labels(**deviceinfo[name]).set(delete_operations_per_second)
deletesize.labels(**deviceinfo[name]).set(delete_size_kilobytes)
deletekbs.labels(**deviceinfo[name]).set(delete_kilobytes_per_second)
deletems.labels(**deviceinfo[name]).set(miliseconds_per_delete)
otherops.labels(**deviceinfo[name]).set(other_operations_per_second)
otherms.labels(**deviceinfo[name]).set(miliseconds_per_other)
busy.labels(**deviceinfo[name]).set(percent_busy)
# define metrics
up = Gauge(
"gstat_up", "The value of this Gauge is always 1 when the gstat_exporter is up"
)
queue = Gauge(
"gstat_queue_depth",
"The queue depth for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
totalops = Gauge(
"gstat_total_operations_per_second",
"The total number of operations/second for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
readops = Gauge(
"gstat_read_operations_per_second",
"The number of read operations/second for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
readsize = Gauge(
"gstat_read_size_kilobytes",
"The size in kilobytes of read operations for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
readkbs = Gauge(
"gstat_read_kilobytes_per_second",
"The speed in kilobytes/second of read operations for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
readms = Gauge(
"gstat_miliseconds_per_read",
"The speed in miliseconds/read operation for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
writeops = Gauge(
"gstat_write_operations_per_second",
"The number of write operations/second for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
writesize = Gauge(
"gstat_write_size_kilobytes",
"The size in kilobytes of write operations for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
writekbs = Gauge(
"gstat_write_kilobytes_per_second",
"The speed in kilobytes/second of write operations for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
writems = Gauge(
"gstat_miliseconds_per_write",
"The speed in miliseconds/write operation for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
deleteops = Gauge(
"gstat_delete_operations_per_second",
"The number of delete operations/second for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
deletesize = Gauge(
"gstat_delete_size_kilobytes",
"The size in kilobytes of delete operations for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
deletekbs = Gauge(
"gstat_delete_kilobytes_per_second",
"The speed in kilobytes/second of delete operations for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
deletems = Gauge(
"gstat_miliseconds_per_delete",
"The speed in miliseconds/delete operation for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
otherops = Gauge(
"gstat_other_operations_per_second",
"The number of other operations (BIO_FLUSH)/second for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
otherms = Gauge(
"gstat_miliseconds_per_other",
"The speed in miliseconds/other operation (BIO_FLUSH) for this GEOM",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
busy = Gauge(
"gstat_percent_busy",
"The percent of the time this GEOM is busy",
[
"name",
"descr",
"mediasize",
"sectorsize",
"lunid",
"ident",
"rotationrate",
"fwsectors",
"fwheads",
],
)
start_http_server(9248)
while True:
process_request()
if __name__ == "__main__":
main()

View File

@ -102,12 +102,12 @@ if [ ${problems} -eq 0 ]; then
#scrubDate=$(date -d "$scrubRawDate" +%s)
### FreeBSD 11.2 with *nix supported date format
#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
#scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)
scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)
### FreeBSD 12.0 with *nix supported date format
scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $17 $14 $15}')
scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)
#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $17 $14 $15}')
#scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)
if [ $(($currentDate - $scrubDate)) -ge $scrubExpire ]; then
emailSubject="`hostname` - ZFS pool - Scrub Time Expired. Scrub Needed on Volume(s)"