Feb 25, 2020

This commit is contained in:
Charlie Root
2020-02-25 11:28:31 -08:00
commit f26cf87f5a
436 changed files with 67904 additions and 0 deletions

2
scripts/crontab.txt Normal file
View File

@@ -0,0 +1,2 @@
00 08,12,16,20 * * * /root/FreeBSD/scripts/zfs_health.sh
00 6 * * 0 /usr/local/sbin/zfSnap -d -s -S -a 1m -p weekly_ -r zroot ship data base

411
scripts/gstat_exporter.py Executable file
View File

@@ -0,0 +1,411 @@
from prometheus_client import start_http_server, Gauge # type: ignore
from subprocess import Popen, PIPE
from typing import Dict
def get_deviceinfo(name: str) -> Dict[str, str]:
    """
    Return a dict of GEOM device info for GEOM devices in class DISK,
    for use as labels for the metrics.

    Runs ``geom -p <name>`` and scrapes the output line by line; devices
    in any class other than DISK return whatever was parsed before the
    class line was seen (normally an empty dict).

    Sample output from the geom command:
    $ geom -p ada0
    Geom class: DISK
    Geom name: ada0
    Providers:
    1. Name: ada0
       Mediasize: 250059350016 (233G)
       Sectorsize: 512
       Mode: r2w2e4
       descr: Samsung SSD 860 EVO mSATA 250GB
       lunid: 5002538e700b753f
       ident: S41MNG0K907238X
       rotationrate: 0
       fwsectors: 63
       fwheads: 16
    $
    """
    # output-line prefix -> label key, for the fields whose value is the
    # single token following the prefix
    single_token_fields = {
        "Sectorsize: ": "sectorsize",
        "lunid: ": "lunid",
        "ident: ": "ident",
        "rotationrate: ": "rotationrate",
        "fwsectors: ": "fwsectors",
        "fwheads: ": "fwheads",
    }
    result: Dict[str, str] = {}
    with Popen(
        ["geom", "-p", name], stdout=PIPE, bufsize=1, universal_newlines=True
    ) as p:
        for line in p.stdout:
            # remove excess whitespace
            line = line.strip()
            # we only care about the DISK class for now
            if line.startswith("Geom class: ") and not line.endswith("DISK"):
                break
            if line.startswith("Mediasize: "):
                # keep the whole value, e.g. "250059350016 (233G)"
                result["mediasize"] = line[len("Mediasize: "):]
            elif line.startswith("descr: "):
                # the model description may itself contain spaces
                result["descr"] = line[len("descr: "):]
            else:
                for prefix, key in single_token_fields.items():
                    if line.startswith(prefix):
                        result[key] = line.split(" ")[1]
                        break
    return result
def process_request() -> None:
    """
    Run gstat in a loop and update stats per line
    """
    # the label names every per-device gauge expects
    label_keys = [
        "name",
        "descr",
        "mediasize",
        "sectorsize",
        "lunid",
        "ident",
        "rotationrate",
        "fwsectors",
        "fwheads",
    ]
    # per-device label dicts, filled in the first time a device is seen
    devlabels: Dict[str, Dict[str, str]] = {}
    with Popen(
        ["gstat", "-pdosCI", "5s"], stdout=PIPE, bufsize=1, universal_newlines=True
    ) as proc:
        for row in proc.stdout:
            # gstat CSV output has exactly these 19 columns
            (
                timestamp,
                devname,
                qdepth,
                ops_total,
                ops_read,
                size_read,
                kbs_read,
                ms_read,
                ops_write,
                size_write,
                kbs_write,
                ms_write,
                ops_delete,
                size_delete,
                kbs_delete,
                ms_delete,
                ops_other,
                ms_other,
                busy_pct,
            ) = row.split(",")
            if timestamp == "timestamp":
                # skip header line
                continue
            if devname not in devlabels:
                # first time we see this GEOM: every label needs a value,
                # so start blank and overlay real info for DISK devices
                info = dict.fromkeys(label_keys, "")
                info.update(get_deviceinfo(devname))
                info["name"] = devname
                devlabels[devname] = info
            labels = devlabels[devname]
            # up is always.. up
            up.set(1)
            # push each parsed column into its gauge
            for gauge, value in (
                (queue, qdepth),
                (totalops, ops_total),
                (readops, ops_read),
                (readsize, size_read),
                (readkbs, kbs_read),
                (readms, ms_read),
                (writeops, ops_write),
                (writesize, size_write),
                (writekbs, kbs_write),
                (writems, ms_write),
                (deleteops, ops_delete),
                (deletesize, size_delete),
                (deletekbs, kbs_delete),
                (deletems, ms_delete),
                (otherops, ops_other),
                (otherms, ms_other),
                (busy, busy_pct),
            ):
                gauge.labels(**labels).set(value)
# define metrics
# Every per-device gauge carries the same set of GEOM device labels,
# matching the keys produced by get_deviceinfo(); declare the list once
# instead of repeating it for each gauge.
# NOTE(review): the "miliseconds" spelling is kept as-is — metric names are
# a published interface and renaming them would break existing dashboards.
GEOM_LABELS = [
    "name",
    "descr",
    "mediasize",
    "sectorsize",
    "lunid",
    "ident",
    "rotationrate",
    "fwsectors",
    "fwheads",
]
up = Gauge(
    "gstat_up", "The value of this Gauge is always 1 when the gstat_exporter is up"
)
queue = Gauge(
    "gstat_queue_depth",
    "The queue depth for this GEOM",
    GEOM_LABELS,
)
totalops = Gauge(
    "gstat_total_operations_per_second",
    "The total number of operations/second for this GEOM",
    GEOM_LABELS,
)
readops = Gauge(
    "gstat_read_operations_per_second",
    "The number of read operations/second for this GEOM",
    GEOM_LABELS,
)
readsize = Gauge(
    "gstat_read_size_kilobytes",
    "The size in kilobytes of read operations for this GEOM",
    GEOM_LABELS,
)
readkbs = Gauge(
    "gstat_read_kilobytes_per_second",
    "The speed in kilobytes/second of read operations for this GEOM",
    GEOM_LABELS,
)
readms = Gauge(
    "gstat_miliseconds_per_read",
    "The speed in miliseconds/read operation for this GEOM",
    GEOM_LABELS,
)
writeops = Gauge(
    "gstat_write_operations_per_second",
    "The number of write operations/second for this GEOM",
    GEOM_LABELS,
)
writesize = Gauge(
    "gstat_write_size_kilobytes",
    "The size in kilobytes of write operations for this GEOM",
    GEOM_LABELS,
)
writekbs = Gauge(
    "gstat_write_kilobytes_per_second",
    "The speed in kilobytes/second of write operations for this GEOM",
    GEOM_LABELS,
)
writems = Gauge(
    "gstat_miliseconds_per_write",
    "The speed in miliseconds/write operation for this GEOM",
    GEOM_LABELS,
)
deleteops = Gauge(
    "gstat_delete_operations_per_second",
    "The number of delete operations/second for this GEOM",
    GEOM_LABELS,
)
deletesize = Gauge(
    "gstat_delete_size_kilobytes",
    "The size in kilobytes of delete operations for this GEOM",
    GEOM_LABELS,
)
deletekbs = Gauge(
    "gstat_delete_kilobytes_per_second",
    "The speed in kilobytes/second of delete operations for this GEOM",
    GEOM_LABELS,
)
deletems = Gauge(
    "gstat_miliseconds_per_delete",
    "The speed in miliseconds/delete operation for this GEOM",
    GEOM_LABELS,
)
otherops = Gauge(
    "gstat_other_operations_per_second",
    "The number of other operations (BIO_FLUSH)/second for this GEOM",
    GEOM_LABELS,
)
otherms = Gauge(
    "gstat_miliseconds_per_other",
    "The speed in miliseconds/other operation (BIO_FLUSH) for this GEOM",
    GEOM_LABELS,
)
busy = Gauge(
    "gstat_percent_busy",
    "The percent of the time this GEOM is busy",
    GEOM_LABELS,
)
# expose the metrics over HTTP on port 9248
start_http_server(9248)
# gstat normally runs until interrupted; looping re-spawns it if it ever exits
while True:
    process_request()

172
scripts/zfs-prune-snapshots Executable file
View File

@@ -0,0 +1,172 @@
#!/usr/bin/env bash
#
# script to prune zfs snapshots over a given age
#
# Author: Dave Eddy <dave@daveeddy.com>
# Date: November 20, 2015
# License: MIT
VERSION='v1.0.1'
usage() {
	# Print the help text to stdout.
	# <<- strips leading tabs, so the heredoc can stay indented in the source.
	local prog=${0##*/}
	cat <<-EOF
	usage: $prog [-hnv] [-p <prefix>] <time> [[dataset1] ...]

	remove snapshots from one or more zpools that match given criteria

	examples
	    # $prog 1w
	    remove snapshots older than a week across all zpools

	    # $prog -vn 1w
	    same as above, but with increased verbosity and without
	    actually deleting any snapshots (dry-run)

	    # $prog 3w tank1 tank2/backup
	    remove snapshots older than 3 weeks on tank1 and tank2/backup.
	    note that this script will recurse through *all* of tank1 and
	    *all* datasets below tank2/backup

	    # $prog -p 'autosnap_' 1M zones
	    remove snapshots older than a month on the zones pool that start
	    with the string "autosnap_"

	timespec
	    the first argument denotes how old a snapshot must be for it to
	    be considered for deletion - possible specifiers are

	        s seconds
	        m minutes
	        h hours
	        d days
	        w weeks
	        M months
	        y years

	options
	    -h             print this message and exit
	    -n             dry-run, don't actually delete snapshots
	    -p <prefix>    snapshot prefix string to match
	    -q             quiet, do not printout removed snapshots
	    -v             increase verbosity
	    -V             print the version number and exit
	EOF
}
debug() {
	# Print the arguments only when -v was given at least once;
	# always succeed so callers can use this in && chains safely.
	if ((verbosity >= 1)); then
		echo "$@"
	fi
	return 0
}
# given a time in seconds, return the "human readable" string
human() {
local seconds=$1
if ((seconds < 0)); then
((seconds *= -1))
fi
local times=(
$((seconds / 60 / 60 / 24 / 365)) # years
$((seconds / 60 / 60 / 24 / 30)) # months
$((seconds / 60 / 60 / 24 / 7)) # weeks
$((seconds / 60 / 60 / 24)) # days
$((seconds / 60 / 60)) # hours
$((seconds / 60)) # minutes
$((seconds)) # seconds
)
local names=(year month week day hour minute second)
local i
for ((i = 0; i < ${#names[@]}; i++)); do
if ((${times[$i]} > 1)); then
echo "${times[$i]} ${names[$i]}s"
return
elif ((${times[$i]} == 1)); then
echo "${times[$i]} ${names[$i]}"
return
fi
done
echo '0 seconds'
}
# defaults, overridable from the command line
dryrun=false
verbosity=0
prefix=
quiet=false

# parse command line options
while getopts 'hnqp:vV' option; do
	case "$option" in
		h) usage; exit 0;;
		n) dryrun=true;;
		p) prefix=$OPTARG;;
		q) quiet=true;;
		v) ((verbosity++));;
		V) echo "$VERSION"; exit 0;;
		*) usage; exit 1;;
	esac
done
shift "$((OPTIND - 1))"

# extract the first argument - the timespec - and
# convert it to seconds
t=$1
# a positive integer followed by a single unit character, e.g. "21d"
time_re='^([0-9]+)([smhdwMy])$'
seconds=
if [[ $t =~ $time_re ]]; then
	# ex: "21d" becomes num=21 spec=d
	num=${BASH_REMATCH[1]}
	spec=${BASH_REMATCH[2]}
	# months are approximated as 30 days and years as 365 days
	case "$spec" in
		s) seconds=$((num));;
		m) seconds=$((num * 60));;
		h) seconds=$((num * 60 * 60));;
		d) seconds=$((num * 60 * 60 * 24));;
		w) seconds=$((num * 60 * 60 * 24 * 7));;
		M) seconds=$((num * 60 * 60 * 24 * 30));;
		y) seconds=$((num * 60 * 60 * 24 * 365));;
		*) echo "error: unknown spec '$spec'" >&2; exit 1;;
	esac
elif [[ -z $t ]]; then
	echo 'error: timespec must be specified as the first argument' >&2
	exit 1
else
	echo "error: failed to parse timespec '$t'" >&2
	exit 1
fi
shift

# any remaining arguments are datasets to recurse into
# (empty means all zpools, since "zfs list -r" with no args lists everything)
pools=("$@")
now=$(date +%s)
code=0
# list every snapshot (with creation time in epoch seconds) under the
# requested datasets, then delete the ones that match the prefix and are
# older than the cutoff
while read -r creation snapshot; do
	# ensure optional prefix matches
	# snapname is the part after the "@": "pool/fs@snap" -> "snap"
	snapname=${snapshot#*@}
	if [[ -n $prefix && $prefix != "${snapname:0:${#prefix}}" ]]; then
		debug "skipping $snapshot: doesn't match prefix $prefix"
		continue
	fi

	# ensure snapshot is older than the cutoff time
	delta=$((now - creation))
	human=$(human "$delta")
	if ((delta <= seconds)); then
		debug "skipping $snapshot: $human old"
		continue
	fi

	# remove the snapshot
	if ! $quiet || $dryrun; then
		echo -n "removing $snapshot: $human old"
	fi
	if $dryrun; then
		echo ' <dry-run: no action taken>'
	else
		if ! $quiet; then
			echo
		fi
		# a failed destroy leaves the snapshot and makes the script
		# exit non-zero at the end
		zfs destroy "$snapshot" || code=1
	fi
done < <(zfs list -Hpo creation,name -t snapshot -r "${pools[@]}")
exit "$code"

131
scripts/zfs_health.sh Executable file
View File

@@ -0,0 +1,131 @@
#! /bin/sh
#
# Calomel.org
#     https://calomel.org/zfs_health_check_script.html
#     FreeBSD ZFS Health Check script
#     zfs_health.sh @ Version 0.18

# Check health of ZFS volumes and drives. On any faults send email.
# The checks run in order; once a problem is found the remaining checks
# are skipped and a single notification is sent at the end.

# 99 problems but ZFS aint one
problems=0

# Health - Check if all zfs volumes are in good condition. We are looking for
# any keyword signifying a degraded or broken array.

condition=$(/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)')
if [ "${condition}" ]; then
	emailSubject="`hostname` - ZFS pool - HEALTH fault"
	problems=1
fi

# Capacity - Make sure the pool capacity is below 80% for best performance. The
# percentage really depends on how large your volume is. If you have a 128GB
# SSD then 80% is reasonable. If you have a 60TB raid-z2 array then you can
# probably set the warning closer to 95%.
#
# ZFS uses a copy-on-write scheme. The file system writes new data to
# sequential free blocks first and when the uberblock has been updated the new
# inode pointers become valid. This method is true only when the pool has
# enough free sequential blocks. If the pool is at capacity and space limited,
# ZFS will be have to randomly write blocks. This means ZFS can not create an
# optimal set of sequential writes and write performance is severely impacted.

maxCapacity=80

if [ ${problems} -eq 0 ]; then
	# one capacity percentage (without the "%" sign) per pool
	capacity=$(/sbin/zpool list -H -o capacity | cut -d'%' -f1)
	for line in ${capacity}
	do
		if [ $line -ge $maxCapacity ]; then
			emailSubject="`hostname` - ZFS pool - Capacity Exceeded"
			problems=1
		fi
	done
fi

# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors
# on all volumes and all drives using "zpool status". If any non-zero errors
# are reported an email will be sent out. You should then look to replace the
# faulty drive and run "zpool scrub" on the affected volume after resilvering.
# NOTE(review): this concatenates the three error columns and drops rows equal
# to "000" — it assumes single-digit error counters in the status output.

if [ ${problems} -eq 0 ]; then
	errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000)
	if [ "${errors}" ]; then
		emailSubject="`hostname` - ZFS pool - Drive Errors"
		problems=1
	fi
fi

# Scrub Expired - Check if all volumes have been scrubbed in at least the last
# 40 days. The general guide is to scrub volumes on desktop quality drives once
# a week and volumes on enterprise class drives once a month. You can always
# use cron to schedual "zpool scrub" in off hours. We scrub our volumes every
# Sunday morning for example.
#
# Scrubbing traverses all the data in the pool once and verifies all blocks can
# be read. Scrubbing proceeds as fast as the devices allows, though the
# priority of any I/O remains below that of normal calls. This operation might
# negatively impact performance, but the file system will remain usable and
# responsive while scrubbing occurs. To initiate an explicit scrub, use the
# "zpool scrub" command.
#
# The scrubExpire variable is in seconds. So for 40 days we calculate 40 days
# times 24 hours times 3600 seconds to equal 3456000 seconds.

# 10 days
#scrubExpire=864000

# 40 days
scrubExpire=3456000

if [ ${problems} -eq 0 ]; then
	currentDate=$(date +%s)
	zfsVolumes=$(/sbin/zpool list -H -o name)

	for volume in ${zfsVolumes}
	do
		# a pool that has never been scrubbed has no date to compare against
		if [ $(/sbin/zpool status $volume | egrep -c "none requested") -ge 1 ]; then
			printf "ERROR: You need to run \"zpool scrub $volume\" before this script can monitor the scrub expiration time."
			break
		fi
		# NOTE(review): "break" aborts the whole loop, so volumes after a
		# scrubbing/resilvering one are not checked — possibly intended to
		# be "continue"; confirm before changing.
		if [ $(/sbin/zpool status $volume | egrep -c "scrub in progress|resilver") -ge 1 ]; then
			break
		fi

		### Ubuntu with GNU supported date format
		#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
		#scrubDate=$(date -d "$scrubRawDate" +%s)

		### FreeBSD 11.2 with *nix supported date format
		#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
		#scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)

		### FreeBSD 12.0 with *nix supported date format
		# NOTE(review): the awk column numbers depend on the exact wording of
		# the "scan: scrub repaired ..." line for this OS release — re-verify
		# after any FreeBSD/ZFS upgrade.
		scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $17 $14 $15}')
		scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)

		if [ $(($currentDate - $scrubDate)) -ge $scrubExpire ]; then
			emailSubject="`hostname` - ZFS pool - Scrub Time Expired. Scrub Needed on Volume(s)"
			problems=1
		fi
	done
fi

# Email - On any problems send email with drive status information and
# capacities including a helpful subject line. Also use logger to write the
# email subject to the local logs. This is also the place you may want to put
# any other notifications like playing a sound file, beeping the internal
# speaker, paging someone or updating Nagios or even BigBrother.

if [ "$problems" -ne 0 ]; then
	printf '%s\n' "$emailSubject" "" "`/sbin/zpool list`" "" "`/sbin/zpool status`" | /usr/bin/mail -s "$emailSubject" jail-root@ahlawat.com
	logger $emailSubject
fi

### EOF ###