Feb 25, 2020

This commit is contained in:
Charlie Root
2020-02-25 11:28:31 -08:00
commit f26cf87f5a
436 changed files with 67904 additions and 0 deletions

2
scripts/crontab.txt Normal file
View File

@@ -0,0 +1,2 @@
00 08,12,16,20 * * * /root/FreeBSD/scripts/zfs_health.sh
00 6 * * 0 /usr/local/sbin/zfSnap -d -s -S -a 1m -p weekly_ -r zroot ship data base

411
scripts/gstat_exporter.py Executable file
View File

@@ -0,0 +1,411 @@
from prometheus_client import start_http_server, Gauge # type: ignore
from subprocess import Popen, PIPE
from typing import Dict
def get_deviceinfo(name: str) -> Dict[str, str]:
    """
    Return a dict of GEOM device info for GEOM devices in class DISK,
    for use as labels for the metrics.

    Runs ``geom -p <name>`` and scrapes the output line by line; devices
    in any class other than DISK return whatever was parsed before the
    class line was seen (normally an empty dict).

    Sample output from the geom command:
    $ geom -p ada0
    Geom class: DISK
    Geom name: ada0
    Providers:
    1. Name: ada0
       Mediasize: 250059350016 (233G)
       Sectorsize: 512
       Mode: r2w2e4
       descr: Samsung SSD 860 EVO mSATA 250GB
       lunid: 5002538e700b753f
       ident: S41MNG0K907238X
       rotationrate: 0
       fwsectors: 63
       fwheads: 16
    $
    """
    # output-line prefix -> label key, for the fields whose value is the
    # single token following the prefix
    single_token_fields = {
        "Sectorsize: ": "sectorsize",
        "lunid: ": "lunid",
        "ident: ": "ident",
        "rotationrate: ": "rotationrate",
        "fwsectors: ": "fwsectors",
        "fwheads: ": "fwheads",
    }
    result: Dict[str, str] = {}
    with Popen(
        ["geom", "-p", name], stdout=PIPE, bufsize=1, universal_newlines=True
    ) as p:
        for line in p.stdout:
            # remove excess whitespace
            line = line.strip()
            # we only care about the DISK class for now
            if line.startswith("Geom class: ") and not line.endswith("DISK"):
                break
            if line.startswith("Mediasize: "):
                # keep the whole value, e.g. "250059350016 (233G)"
                result["mediasize"] = line[len("Mediasize: "):]
            elif line.startswith("descr: "):
                # the model description may itself contain spaces
                result["descr"] = line[len("descr: "):]
            else:
                for prefix, key in single_token_fields.items():
                    if line.startswith(prefix):
                        result[key] = line.split(" ")[1]
                        break
    return result
def process_request() -> None:
    """
    Run gstat in a loop and update stats per line
    """
    # the label names every per-device gauge expects
    label_keys = [
        "name",
        "descr",
        "mediasize",
        "sectorsize",
        "lunid",
        "ident",
        "rotationrate",
        "fwsectors",
        "fwheads",
    ]
    # per-device label dicts, filled in the first time a device is seen
    devlabels: Dict[str, Dict[str, str]] = {}
    with Popen(
        ["gstat", "-pdosCI", "5s"], stdout=PIPE, bufsize=1, universal_newlines=True
    ) as proc:
        for row in proc.stdout:
            # gstat CSV output has exactly these 19 columns
            (
                timestamp,
                devname,
                qdepth,
                ops_total,
                ops_read,
                size_read,
                kbs_read,
                ms_read,
                ops_write,
                size_write,
                kbs_write,
                ms_write,
                ops_delete,
                size_delete,
                kbs_delete,
                ms_delete,
                ops_other,
                ms_other,
                busy_pct,
            ) = row.split(",")
            if timestamp == "timestamp":
                # skip header line
                continue
            if devname not in devlabels:
                # first time we see this GEOM: every label needs a value,
                # so start blank and overlay real info for DISK devices
                info = dict.fromkeys(label_keys, "")
                info.update(get_deviceinfo(devname))
                info["name"] = devname
                devlabels[devname] = info
            labels = devlabels[devname]
            # up is always.. up
            up.set(1)
            # push each parsed column into its gauge
            for gauge, value in (
                (queue, qdepth),
                (totalops, ops_total),
                (readops, ops_read),
                (readsize, size_read),
                (readkbs, kbs_read),
                (readms, ms_read),
                (writeops, ops_write),
                (writesize, size_write),
                (writekbs, kbs_write),
                (writems, ms_write),
                (deleteops, ops_delete),
                (deletesize, size_delete),
                (deletekbs, kbs_delete),
                (deletems, ms_delete),
                (otherops, ops_other),
                (otherms, ms_other),
                (busy, busy_pct),
            ):
                gauge.labels(**labels).set(value)
# define metrics
# Every per-device gauge carries the same set of GEOM device labels,
# matching the keys produced by get_deviceinfo(); declare the list once
# instead of repeating it for each gauge.
# NOTE(review): the "miliseconds" spelling is kept as-is — metric names are
# a published interface and renaming them would break existing dashboards.
GEOM_LABELS = [
    "name",
    "descr",
    "mediasize",
    "sectorsize",
    "lunid",
    "ident",
    "rotationrate",
    "fwsectors",
    "fwheads",
]
up = Gauge(
    "gstat_up", "The value of this Gauge is always 1 when the gstat_exporter is up"
)
queue = Gauge(
    "gstat_queue_depth",
    "The queue depth for this GEOM",
    GEOM_LABELS,
)
totalops = Gauge(
    "gstat_total_operations_per_second",
    "The total number of operations/second for this GEOM",
    GEOM_LABELS,
)
readops = Gauge(
    "gstat_read_operations_per_second",
    "The number of read operations/second for this GEOM",
    GEOM_LABELS,
)
readsize = Gauge(
    "gstat_read_size_kilobytes",
    "The size in kilobytes of read operations for this GEOM",
    GEOM_LABELS,
)
readkbs = Gauge(
    "gstat_read_kilobytes_per_second",
    "The speed in kilobytes/second of read operations for this GEOM",
    GEOM_LABELS,
)
readms = Gauge(
    "gstat_miliseconds_per_read",
    "The speed in miliseconds/read operation for this GEOM",
    GEOM_LABELS,
)
writeops = Gauge(
    "gstat_write_operations_per_second",
    "The number of write operations/second for this GEOM",
    GEOM_LABELS,
)
writesize = Gauge(
    "gstat_write_size_kilobytes",
    "The size in kilobytes of write operations for this GEOM",
    GEOM_LABELS,
)
writekbs = Gauge(
    "gstat_write_kilobytes_per_second",
    "The speed in kilobytes/second of write operations for this GEOM",
    GEOM_LABELS,
)
writems = Gauge(
    "gstat_miliseconds_per_write",
    "The speed in miliseconds/write operation for this GEOM",
    GEOM_LABELS,
)
deleteops = Gauge(
    "gstat_delete_operations_per_second",
    "The number of delete operations/second for this GEOM",
    GEOM_LABELS,
)
deletesize = Gauge(
    "gstat_delete_size_kilobytes",
    "The size in kilobytes of delete operations for this GEOM",
    GEOM_LABELS,
)
deletekbs = Gauge(
    "gstat_delete_kilobytes_per_second",
    "The speed in kilobytes/second of delete operations for this GEOM",
    GEOM_LABELS,
)
deletems = Gauge(
    "gstat_miliseconds_per_delete",
    "The speed in miliseconds/delete operation for this GEOM",
    GEOM_LABELS,
)
otherops = Gauge(
    "gstat_other_operations_per_second",
    "The number of other operations (BIO_FLUSH)/second for this GEOM",
    GEOM_LABELS,
)
otherms = Gauge(
    "gstat_miliseconds_per_other",
    "The speed in miliseconds/other operation (BIO_FLUSH) for this GEOM",
    GEOM_LABELS,
)
busy = Gauge(
    "gstat_percent_busy",
    "The percent of the time this GEOM is busy",
    GEOM_LABELS,
)
# expose the metrics over HTTP on port 9248
start_http_server(9248)
# gstat normally runs until interrupted; looping re-spawns it if it ever exits
while True:
    process_request()

172
scripts/zfs-prune-snapshots Executable file
View File

@@ -0,0 +1,172 @@
#!/usr/bin/env bash
#
# script to prune zfs snapshots over a given age
#
# Author: Dave Eddy <dave@daveeddy.com>
# Date: November 20, 2015
# License: MIT
VERSION='v1.0.1'
usage() {
	# Print the help text to stdout.
	# <<- strips leading tabs, so the heredoc can stay indented in the source.
	local prog=${0##*/}
	cat <<-EOF
	usage: $prog [-hnv] [-p <prefix>] <time> [[dataset1] ...]

	remove snapshots from one or more zpools that match given criteria

	examples
	    # $prog 1w
	    remove snapshots older than a week across all zpools

	    # $prog -vn 1w
	    same as above, but with increased verbosity and without
	    actually deleting any snapshots (dry-run)

	    # $prog 3w tank1 tank2/backup
	    remove snapshots older than 3 weeks on tank1 and tank2/backup.
	    note that this script will recurse through *all* of tank1 and
	    *all* datasets below tank2/backup

	    # $prog -p 'autosnap_' 1M zones
	    remove snapshots older than a month on the zones pool that start
	    with the string "autosnap_"

	timespec
	    the first argument denotes how old a snapshot must be for it to
	    be considered for deletion - possible specifiers are

	        s seconds
	        m minutes
	        h hours
	        d days
	        w weeks
	        M months
	        y years

	options
	    -h             print this message and exit
	    -n             dry-run, don't actually delete snapshots
	    -p <prefix>    snapshot prefix string to match
	    -q             quiet, do not printout removed snapshots
	    -v             increase verbosity
	    -V             print the version number and exit
	EOF
}
debug() {
	# Print the arguments only when -v was given at least once;
	# always succeed so callers can use this in && chains safely.
	if ((verbosity >= 1)); then
		echo "$@"
	fi
	return 0
}
# given a time in seconds, return the "human readable" string
human() {
local seconds=$1
if ((seconds < 0)); then
((seconds *= -1))
fi
local times=(
$((seconds / 60 / 60 / 24 / 365)) # years
$((seconds / 60 / 60 / 24 / 30)) # months
$((seconds / 60 / 60 / 24 / 7)) # weeks
$((seconds / 60 / 60 / 24)) # days
$((seconds / 60 / 60)) # hours
$((seconds / 60)) # minutes
$((seconds)) # seconds
)
local names=(year month week day hour minute second)
local i
for ((i = 0; i < ${#names[@]}; i++)); do
if ((${times[$i]} > 1)); then
echo "${times[$i]} ${names[$i]}s"
return
elif ((${times[$i]} == 1)); then
echo "${times[$i]} ${names[$i]}"
return
fi
done
echo '0 seconds'
}
# defaults, overridable from the command line
dryrun=false
verbosity=0
prefix=
quiet=false

# parse command line options
while getopts 'hnqp:vV' option; do
	case "$option" in
		h) usage; exit 0;;
		n) dryrun=true;;
		p) prefix=$OPTARG;;
		q) quiet=true;;
		v) ((verbosity++));;
		V) echo "$VERSION"; exit 0;;
		*) usage; exit 1;;
	esac
done
shift "$((OPTIND - 1))"

# extract the first argument - the timespec - and
# convert it to seconds
t=$1
# a positive integer followed by a single unit character, e.g. "21d"
time_re='^([0-9]+)([smhdwMy])$'
seconds=
if [[ $t =~ $time_re ]]; then
	# ex: "21d" becomes num=21 spec=d
	num=${BASH_REMATCH[1]}
	spec=${BASH_REMATCH[2]}
	# months are approximated as 30 days and years as 365 days
	case "$spec" in
		s) seconds=$((num));;
		m) seconds=$((num * 60));;
		h) seconds=$((num * 60 * 60));;
		d) seconds=$((num * 60 * 60 * 24));;
		w) seconds=$((num * 60 * 60 * 24 * 7));;
		M) seconds=$((num * 60 * 60 * 24 * 30));;
		y) seconds=$((num * 60 * 60 * 24 * 365));;
		*) echo "error: unknown spec '$spec'" >&2; exit 1;;
	esac
elif [[ -z $t ]]; then
	echo 'error: timespec must be specified as the first argument' >&2
	exit 1
else
	echo "error: failed to parse timespec '$t'" >&2
	exit 1
fi
shift

# any remaining arguments are datasets to recurse into
# (empty means all zpools, since "zfs list -r" with no args lists everything)
pools=("$@")
now=$(date +%s)
code=0
# list every snapshot (with creation time in epoch seconds) under the
# requested datasets, then delete the ones that match the prefix and are
# older than the cutoff
while read -r creation snapshot; do
	# ensure optional prefix matches
	# snapname is the part after the "@": "pool/fs@snap" -> "snap"
	snapname=${snapshot#*@}
	if [[ -n $prefix && $prefix != "${snapname:0:${#prefix}}" ]]; then
		debug "skipping $snapshot: doesn't match prefix $prefix"
		continue
	fi

	# ensure snapshot is older than the cutoff time
	delta=$((now - creation))
	human=$(human "$delta")
	if ((delta <= seconds)); then
		debug "skipping $snapshot: $human old"
		continue
	fi

	# remove the snapshot
	if ! $quiet || $dryrun; then
		echo -n "removing $snapshot: $human old"
	fi
	if $dryrun; then
		echo ' <dry-run: no action taken>'
	else
		if ! $quiet; then
			echo
		fi
		# a failed destroy leaves the snapshot and makes the script
		# exit non-zero at the end
		zfs destroy "$snapshot" || code=1
	fi
done < <(zfs list -Hpo creation,name -t snapshot -r "${pools[@]}")
exit "$code"

131
scripts/zfs_health.sh Executable file
View File

@@ -0,0 +1,131 @@
#! /bin/sh
#
# Calomel.org
#     https://calomel.org/zfs_health_check_script.html
#     FreeBSD ZFS Health Check script
#     zfs_health.sh @ Version 0.18

# Check health of ZFS volumes and drives. On any faults send email.
# The checks run in order; once a problem is found the remaining checks
# are skipped and a single notification is sent at the end.

# 99 problems but ZFS aint one
problems=0

# Health - Check if all zfs volumes are in good condition. We are looking for
# any keyword signifying a degraded or broken array.

condition=$(/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)')
if [ "${condition}" ]; then
	emailSubject="`hostname` - ZFS pool - HEALTH fault"
	problems=1
fi

# Capacity - Make sure the pool capacity is below 80% for best performance. The
# percentage really depends on how large your volume is. If you have a 128GB
# SSD then 80% is reasonable. If you have a 60TB raid-z2 array then you can
# probably set the warning closer to 95%.
#
# ZFS uses a copy-on-write scheme. The file system writes new data to
# sequential free blocks first and when the uberblock has been updated the new
# inode pointers become valid. This method is true only when the pool has
# enough free sequential blocks. If the pool is at capacity and space limited,
# ZFS will be have to randomly write blocks. This means ZFS can not create an
# optimal set of sequential writes and write performance is severely impacted.

maxCapacity=80

if [ ${problems} -eq 0 ]; then
	# one capacity percentage (without the "%" sign) per pool
	capacity=$(/sbin/zpool list -H -o capacity | cut -d'%' -f1)
	for line in ${capacity}
	do
		if [ $line -ge $maxCapacity ]; then
			emailSubject="`hostname` - ZFS pool - Capacity Exceeded"
			problems=1
		fi
	done
fi

# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors
# on all volumes and all drives using "zpool status". If any non-zero errors
# are reported an email will be sent out. You should then look to replace the
# faulty drive and run "zpool scrub" on the affected volume after resilvering.
# NOTE(review): this concatenates the three error columns and drops rows equal
# to "000" — it assumes single-digit error counters in the status output.

if [ ${problems} -eq 0 ]; then
	errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000)
	if [ "${errors}" ]; then
		emailSubject="`hostname` - ZFS pool - Drive Errors"
		problems=1
	fi
fi

# Scrub Expired - Check if all volumes have been scrubbed in at least the last
# 40 days. The general guide is to scrub volumes on desktop quality drives once
# a week and volumes on enterprise class drives once a month. You can always
# use cron to schedual "zpool scrub" in off hours. We scrub our volumes every
# Sunday morning for example.
#
# Scrubbing traverses all the data in the pool once and verifies all blocks can
# be read. Scrubbing proceeds as fast as the devices allows, though the
# priority of any I/O remains below that of normal calls. This operation might
# negatively impact performance, but the file system will remain usable and
# responsive while scrubbing occurs. To initiate an explicit scrub, use the
# "zpool scrub" command.
#
# The scrubExpire variable is in seconds. So for 40 days we calculate 40 days
# times 24 hours times 3600 seconds to equal 3456000 seconds.

# 10 days
#scrubExpire=864000

# 40 days
scrubExpire=3456000

if [ ${problems} -eq 0 ]; then
	currentDate=$(date +%s)
	zfsVolumes=$(/sbin/zpool list -H -o name)

	for volume in ${zfsVolumes}
	do
		# a pool that has never been scrubbed has no date to compare against
		if [ $(/sbin/zpool status $volume | egrep -c "none requested") -ge 1 ]; then
			printf "ERROR: You need to run \"zpool scrub $volume\" before this script can monitor the scrub expiration time."
			break
		fi
		# NOTE(review): "break" aborts the whole loop, so volumes after a
		# scrubbing/resilvering one are not checked — possibly intended to
		# be "continue"; confirm before changing.
		if [ $(/sbin/zpool status $volume | egrep -c "scrub in progress|resilver") -ge 1 ]; then
			break
		fi

		### Ubuntu with GNU supported date format
		#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
		#scrubDate=$(date -d "$scrubRawDate" +%s)

		### FreeBSD 11.2 with *nix supported date format
		#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
		#scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)

		### FreeBSD 12.0 with *nix supported date format
		# NOTE(review): the awk column numbers depend on the exact wording of
		# the "scan: scrub repaired ..." line for this OS release — re-verify
		# after any FreeBSD/ZFS upgrade.
		scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $17 $14 $15}')
		scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)

		if [ $(($currentDate - $scrubDate)) -ge $scrubExpire ]; then
			emailSubject="`hostname` - ZFS pool - Scrub Time Expired. Scrub Needed on Volume(s)"
			problems=1
		fi
	done
fi

# Email - On any problems send email with drive status information and
# capacities including a helpful subject line. Also use logger to write the
# email subject to the local logs. This is also the place you may want to put
# any other notifications like playing a sound file, beeping the internal
# speaker, paging someone or updating Nagios or even BigBrother.

if [ "$problems" -ne 0 ]; then
	printf '%s\n' "$emailSubject" "" "`/sbin/zpool list`" "" "`/sbin/zpool status`" | /usr/bin/mail -s "$emailSubject" jail-root@ahlawat.com
	logger $emailSubject
fi

### EOF ###