#!/bin/bash
# ceph_disable_hdd_write_cache.sh - Disable the volatile on-disk write cache
# on every HDD-class OSD data device on this host, via sdparm --set=WCE=0.
#
# After sdparm flips WCE the kernel re-reads the SCSI mode page, so
# /sys/block/<dev>/device/scsi_disk/*/cache_type goes from "write back" to
# "write through" -- we read that back and verify. Note: hdparm -W 0 also
# disables WCE on the drive but does NOT update the kernel's cache_type, so
# the drive and the kernel's view of it then disagree. sdparm avoids that.
#
# Default mode is dry-run. Pass --apply to actually change anything.
#
# block.db, block.wal, and journal devices are added to a skip set so the
# SSD that backs deferred writes / RocksDB WAL is left alone.
#
# Note: changes don't survive a reboot on these drives (the WCE mode page
# is reported non-savable). Re-run on boot via a unit/cron if you need
# persistence.

# Everything below relies on bash-only features (set -o pipefail, [[,
# mapfile, read -ra, associative arrays). When started by a plain POSIX
# shell (sh, dash, ...) hand control over to bash before any of that syntax
# is reached. This guard itself must stay POSIX-sh compatible.
[ -n "${BASH_VERSION:-}" ] || exec /bin/bash "$0" "$@"

# Option defaults; the command-line flags parsed below may override them.
VERIFY=true     # read back cache_type after each change (--no-verify turns this off)
APPLY=false     # dry-run unless --apply is given

# usage
# Print the help text to stdout. Only the first line depends on runtime state
# (the invocation name, $0); the remainder is static, so it is emitted from a
# quoted here-doc in which nothing is expanded.
usage() {
    printf 'Usage: %s [options]\n' "$0"
    cat <<'EOF'

Disables the volatile on-disk write cache (WCE=0) on every HDD-class OSD
data device on this host, via sdparm. After each change, reads back
/sys/block/<dev>/device/scsi_disk/*/cache_type and verifies it flipped
to "write through". block.db, block.wal, and journal devices are skipped.

Default mode is dry-run.

Options:
      --apply       Actually apply changes (default is dry-run)
      --no-verify   Skip post-change cache_type readback
  -h, --help        Show this help message

Walks /var/run/ceph/ceph-osd.*.asok to find local OSDs and uses
ceph_osd_device --osd <id> --local to discover backing devices.

Requires: ceph, ceph_osd_device, sdparm.
EOF
}

# Command-line parsing. Each recognised flag is consumed by the single shift
# at the bottom of the loop; --help and unknown options leave the script
# immediately (status 0 and 1 respectively).
while (( $# > 0 )); do
    case "$1" in
        --apply)
            APPLY=true
            ;;
        --no-verify)
            VERIFY=false
            ;;
        -h | --help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            usage >&2
            exit 1
            ;;
    esac
    shift
done

# Strict mode from here on: abort on unhandled errors (errexit), on expansion
# of unset variables (nounset), and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ANSI colour escapes. They are stored unexpanded (single-quoted) and only
# become real escape bytes when passed through 'echo -e' or used inside a
# printf format string.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'

# need CMD
# Succeed silently when the shell can resolve CMD (command -v). Otherwise
# print an error to stderr and terminate the whole script with status 1.
need() {
    if ! command -v "$1" >/dev/null 2>&1; then
        echo -e "${RED}Error: '$1' not found in PATH${NC}" >&2
        exit 1
    fi
}
# Fail fast, before touching anything, if a required external tool is missing.
for required_tool in ceph ceph_osd_device sdparm; do
    need "$required_tool"
done

# Only --apply needs root. Dry-run reads ceph state and /sys cache_type as
# any user.
if $APPLY && [[ $EUID -ne 0 ]]; then
    # End the message with ${NC}: without the reset the terminal stays red
    # after the script has exited.
    echo -e "${RED}Error: --apply requires root.${NC}" >&2
    exit 1
fi

# --- 1. Local OSD IDs from admin sockets ------------------------------------
# Local OSDs are found by globbing /var/run/ceph/ceph-osd.<id>.asok. nullglob
# is enabled only around the glob so that "no match" yields an empty array
# instead of the literal pattern.
shopt -s nullglob
sockets=(/var/run/ceph/ceph-osd.*.asok)
shopt -u nullglob
if (( ${#sockets[@]} == 0 )); then
    echo -e "${RED}No OSDs on this host (no admin sockets in /var/run/ceph).${NC}" >&2
    exit 1
fi

local_osds=()
for sock in "${sockets[@]}"; do
    # Strip the directory + "ceph-osd." prefix and the ".asok" suffix.
    base=${sock##*/ceph-osd.}
    id=${base%.asok}
    # Ignore sockets whose middle part is not a plain decimal OSD id.
    [[ "$id" =~ ^[0-9]+$ ]] || continue
    local_osds+=("$id")
done

# Sockets may exist without any of them carrying a numeric id. Stop here in
# that case: printf '%s\n' with an empty array still prints one blank line,
# which mapfile would turn into a phantom OSD id "", and on bash < 4.4 the
# empty expansion itself aborts under 'set -u'.
if (( ${#local_osds[@]} == 0 )); then
    echo -e "${RED}No OSDs on this host (no numeric OSD admin sockets in /var/run/ceph).${NC}" >&2
    exit 1
fi
mapfile -t local_osds < <(printf '%s\n' "${local_osds[@]}" | sort -n)

# --- 2. Cluster-wide HDD class membership -----------------------------------
# Plain text output is one ID per line, no jq needed. The IDs are flattened
# into a space-padded string so membership can be tested with a glob match
# on " <id> ".
#
# The query is checked explicitly: its stderr is discarded, so a failing
# ceph call inside a bare assignment would otherwise abort the script under
# 'set -e -o pipefail' without printing anything at all.
if ! hdd_osd_ids=$(ceph osd crush class ls-osd hdd 2>/dev/null); then
    echo -e "${RED}Error: 'ceph osd crush class ls-osd hdd' failed; cannot determine HDD-class OSDs.${NC}" >&2
    exit 1
fi
hdd_set=" ${hdd_osd_ids//$'\n'/ } "

# --- 3. Walk local OSDs, classify each backing device -----------------------
# Two associative arrays keyed by device name:
#   target_devs[dev] = OSD id (HDD data device)
#   skip_devs[dev]   = reason (block.db / block.wal / journal)
declare -A target_devs skip_devs

# sed expression that strips ANSI colour sequences (ESC [ <digits/;> m).
strip_ansi='s/\x1b\[[0-9;]*m//g'

# add_devs BUCKET LIST TAG
#   BUCKET  "target" or "skip": which global map (target_devs / skip_devs)
#           receives the entries; any other value records nothing
#   LIST    comma-separated device names relative to /dev; surrounding
#           whitespace on each entry is ignored
#   TAG     value stored for every accepted device
# Entries that are empty, equal to "(unknown)", or that are not an existing
# block device under /dev are silently dropped.
add_devs() {
    local bucket=$1 list=$2 tag=$3
    # Keep the scratch variables local so callers' globals are not clobbered.
    local -a entries=()
    local dev

    IFS=',' read -ra entries <<<"$list"

    # ${entries[@]+...} keeps an empty list safe under 'set -u' on bash < 4.4.
    for dev in ${entries[@]+"${entries[@]}"}; do
        # Trim leading/trailing whitespace with parameter expansion: no fork
        # per entry and no xargs-style quote/backslash interpretation.
        dev=${dev#"${dev%%[![:space:]]*}"}
        dev=${dev%"${dev##*[![:space:]]}"}

        [[ -n "$dev" && "$dev" != "(unknown)" ]] || continue
        [[ -b "/dev/$dev" ]] || continue

        case "$bucket" in
            target) target_devs[$dev]=$tag ;;
            skip)   skip_devs[$dev]=$tag ;;
        esac
    done
}

echo -e "${CYAN}Local OSDs:${NC} ${local_osds[*]}"

# For every local OSD, ask ceph_osd_device for its backing devices and sort
# them into target_devs (data device of an HDD-class OSD) or skip_devs
# (block.db / block.wal / journal of any OSD).
for id in "${local_osds[@]}"; do
    # The helper's stderr is discarded; a non-zero exit only skips this OSD.
    out=$(ceph_osd_device --osd "$id" --local 2>/dev/null) || {
        echo -e "${YELLOW}warn: ceph_osd_device --osd $id failed; skipping${NC}" >&2
        continue
    }
    # Remove colour codes so the line prefixes below match literally.
    out=$(sed "$strip_ansi" <<<"$out")

    # hdd_set is a space-separated id list; pad both sides for a whole-word match.
    if [[ " $hdd_set " == *" $id "* ]]; then
        is_hdd=true
    else
        is_hdd=false
    fi

    while IFS= read -r line; do
        if [[ "$line" == "  Device: "* ]]; then
            devs=${line#"  Device: "}
            # Data devices only become targets when the OSD is HDD-class.
            if $is_hdd; then
                add_devs target "$devs" "$id"
            fi
        elif [[ "$line" == "  block.db: "* ]]; then
            devs=${line#"  block.db: "}
            add_devs skip "$devs" "block.db"
        elif [[ "$line" == "  block.wal: "* ]]; then
            devs=${line#"  block.wal: "}
            add_devs skip "$devs" "block.wal"
        elif [[ "$line" == "  Journal: "* ]]; then
            devs=${line#"  Journal: "}
            add_devs skip "$devs" "journal"
        fi
    done <<<"$out"
done

# Nothing to do (and not an error) when no HDD-class data device was found.
(( ${#target_devs[@]} > 0 )) || {
    echo -e "${YELLOW}No HDD-class OSD data devices on this host.${NC}"
    exit 0
}

# --- 4. Apply (or dry-run) -------------------------------------------------
echo
if ! $APPLY; then
    echo -e "${CYAN}Mode:${NC} ${BLUE}dry-run${NC} (pass --apply to actually run sdparm)"
else
    echo -e "${CYAN}Mode:${NC} ${GREEN}APPLY${NC}"
fi
echo
# One printf call emits both the header and its underline: printf reuses the
# format for each further group of five arguments.
printf "%-12s %-8s %-14s %-14s %s\n" \
    "DEVICE" "OSD" "BEFORE" "AFTER" "RESULT" \
    "------------" "--------" "--------------" "--------------" "------"

# cache_type_path DEV
# Print the first path matching
# /sys/block/DEV/device/scsi_disk/*/cache_type, or an empty line when that
# first candidate does not exist (an unmatched glob stays a literal pattern,
# which fails the -e test).
cache_type_path() {
    local disk=$1 candidate
    # Only the first glob result is ever inspected; the loop returns at once.
    for candidate in /sys/block/"$disk"/device/scsi_disk/*/cache_type; do
        if [[ -e "$candidate" ]]; then
            echo "$candidate"
        else
            echo ""
        fi
        return
    done
}

# cache_type_for DEV
# Print the contents of DEV's cache_type file as located by cache_type_path.
# Prints "(no scsi)" when no such file was found, and "(unknown)" when the
# file exists but cannot be read.
cache_type_for() {
    local disk=$1 sysfs_file
    sysfs_file=$(cache_type_path "$disk")
    if [[ -z "$sysfs_file" ]]; then
        echo "(no scsi)"
        return
    fi
    if ! cat "$sysfs_file" 2>/dev/null; then
        echo "(unknown)"
    fi
}

# Process devices in sorted name order so the table is stable between runs.
mapfile -t sorted_targets < <(printf '%s\n' "${!target_devs[@]}" | sort)

# report_row COLOR DEVICE OSD BEFORE AFTER RESULT
# Print one table row wrapped in COLOR ... NC. The colour codes sit in the
# printf format on purpose so their backslash escapes are interpreted.
report_row() {
    local color=$1
    shift
    printf "${color}%-12s %-8s %-14s %-14s %s${NC}\n" "$@"
}

ok=0 skipped=0 failed=0 would=0
for dev in "${sorted_targets[@]}"; do
    osd=${target_devs[$dev]}

    # A device that also backs block.db / block.wal / a journal is left alone.
    skip_reason=${skip_devs[$dev]:-}
    if [[ -n "$skip_reason" ]]; then
        report_row "$YELLOW" "$dev" "osd.$osd" "-" "-" "skip ($skip_reason)"
        skipped=$(( skipped + 1 ))
        continue
    fi

    before=$(cache_type_for "$dev")

    # Dry-run: report what would happen and touch nothing.
    if ! $APPLY; then
        report_row "$BLUE" "$dev" "osd.$osd" "$before" "(dry-run)" "would set WCE=0"
        would=$(( would + 1 ))
        continue
    fi

    if ! sdparm --set=WCE=0 "/dev/$dev" >/dev/null; then
        report_row "$RED" "$dev" "osd.$osd" "$before" "?" "sdparm failed"
        failed=$(( failed + 1 ))
        continue
    fi

    # sdparm succeeded; the row counts as ok unless verification is enabled
    # and the kernel does not report "write through".
    after=$(cache_type_for "$dev")
    if ! $VERIFY || [[ "$after" == "write through" ]]; then
        report_row "$GREEN" "$dev" "osd.$osd" "$before" "$after" "ok"
        ok=$(( ok + 1 ))
    else
        report_row "$RED" "$dev" "osd.$osd" "$before" "$after" "FAIL (sdparm ok but cache_type unchanged)"
        failed=$(( failed + 1 ))
    fi
done

# Final summary. A dry-run always exits 0; an apply run exits 1 when at least
# one device failed, 0 otherwise.
echo
if ! $APPLY; then
    echo -e "${CYAN}Summary:${NC} ${BLUE}${would} would change${NC}, ${YELLOW}${skipped} skipped${NC} (dry-run; pass --apply to commit)"
    exit 0
fi

echo -e "${CYAN}Summary:${NC} ${GREEN}${ok} changed${NC}, ${YELLOW}${skipped} skipped${NC}, ${RED}${failed} failed${NC}"
if (( failed > 0 )); then
    exit 1
fi
exit 0
