#!/usr/bin/env ruby

# Script to alert on DRBD badness
# Ref: http://www.drbd.org/users-guide/ch-admin.html#s-proc-drbd

# Example:
# "10: cs:StandAlone ro:Secondary/Unknown ds:UpToDate/DUnknown   r-----"
# "20: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate B r-----"
# "21: cs:Unconfigured"
# "22: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate B r-----"
#
# Certain shares can be ignored by adding either their guest name
# or their DRBD device (/dev/drbdN) to /etc/drbd_alert.ignore

require "yaml"
require "rexml/document"


# Connection states (the "cs:" field of /proc/drbd) that do not raise an alert.
OKCSTATES = ["Connected"].freeze
# Disk states (the "ds:" field of /proc/drbd) that do not raise an alert.
OKDSTATES = ["UpToDate/UpToDate"].freeze

# Subtracted from a machine's "drbd_port" to obtain the number under which
# the machine is looked up against the resource numbers in /proc/drbd.
DRBD_BASE_PORT = 45000



# Reads /proc/drbd and classifies every DRBD resource as OK or not OK.
#
# Resources are labelled with a guest name where one can be found (from
# the /machines/* YAML files on a "PairVM" host, otherwise from
# `drbdadm` / `drbd-overview`).  Resources can be skipped via an ignore
# file, or (PairVM only) because the guest is currently being backed up.
class DRBDChecker

  # True when checking has been switched off by the marker file.
  def disabled?
    File.exist? "/etc/drbd_alert.disabled"
  end

  # True when this host exposes /proc/drbd.
  def drbd?
    File.exist? "/proc/drbd"
  end

  # True when /machines/_global exists (treated as a "PairVM" host).
  def pairvm?
    File.exist? "/machines/_global"
  end

  # Entries we are ignoring: one per non-blank line of the ignore file,
  # with surrounding whitespace removed.
  #
  # ignore_file - path of the ignore list (defaults to the system one).
  #
  # Returns an Array of Strings; empty when the file does not exist.
  def ignored_guests(ignore_file = "/etc/drbd_alert.ignore")
    return [] unless File.exist?(ignore_file)

    File.readlines(ignore_file).map { |line| line.strip }.reject { |line| line.empty? }
  end

  # Guests which are backing up (locally).
  #
  # Returns an Array of guest names taken from the local process list.
  def backing_up_locally
    guests_from_ps(%x[ps -ef|grep -i cpipe])
  end

  # Guests which are backing up (remotely).
  #
  # The other host is read from ["ips"]["there"] in /machines/_global
  # and queried over ssh.
  #
  # Returns an Array of guest names, or nil when this is not a PairVM host.
  def backing_up_remotely
    return nil unless pairvm?

    other_host = YAML.load_file("/machines/_global")["ips"]["there"]
    guests_from_ps(%x[ssh #{other_host} 'ps -ef|grep -i cpipe'])
  end

  # Builds the map used to label DRBD resources.
  #
  # On a PairVM host each /machines/<name> YAML file carrying a
  # "drbd_port" contributes (drbd_port - DRBD_BASE_PORT) => <name>.
  # Otherwise the resources reported by `drbdadm status all` (XML) are
  # used, falling back to scraping `drbd-overview` when drbdadm fails;
  # those values have the form "<resource>.drbd<minor>".
  #
  # Returns a Hash of String number => String name.
  def drbd_ports
    guestports = {}

    if pairvm?
      Dir.glob("/machines/*").each do |machine|
        next if machine =~ /\/_/
        next if machine.end_with? "~"
        next if machine.end_with? "_dump"
        # A directory here would make YAML.load_file raise.
        next unless File.file?(machine)

        mfile = YAML.load_file(machine)
        # Empty or scalar YAML documents do not load as a Hash.
        next unless mfile.is_a?(Hash) && mfile["drbd_port"]

        found_port = (mfile["drbd_port"] - DRBD_BASE_PORT).to_s
        guestports[found_port] = File.basename(machine)
      end
    else
      # use drbdadm to get the status
      xml = %x[drbdadm status all 2>/dev/null]

      # Only parse the output once we know the command succeeded, so a
      # failed run can never abort the fallback with a parse error.
      if $?.success?
        REXML::Document.new(xml).elements.each("drbd-status/resources/resource") do |r|
          resname = r.attribute("name").to_s
          drbdnum = r.attribute("minor").to_s
          guestports[drbdnum] = "#{resname}.drbd#{drbdnum}"
        end
      else
        %x(drbd-overview).scan(/(\d+):([^\/]+)\//).each do |drbdnum, resname|
          guestports[drbdnum] = "#{resname}.drbd#{drbdnum}"
        end
      end
    end

    guestports
  end

  # Classifies every resource line found in the DRBD status text.
  #
  # drbd_state - status text to examine; when nil (the default) it is
  #              read from /proc/drbd.
  #
  # A resource is skipped when the ignore list contains its guest name,
  # its label, or "/dev/drbd<N>", or (PairVM only) when its guest is
  # being backed up on either host.
  #
  # Returns a two-element Array [notok, ok].  Each member is an Array of
  # [resource_number, :raise, detail] or [resource_number, :clear, nil].
  def check(drbd_state = nil)
    guestports = drbd_ports
    ignored    = ignored_guests || []
    paired     = pairvm?
    # The backup lists are only consulted on PairVM hosts, so do not
    # bother collecting them elsewhere.
    local      = paired ? (backing_up_locally || []) : []
    remote     = paired ? (backing_up_remotely || []) : []

    # File.read closes the handle for us.
    drbd_state ||= File.read("/proc/drbd")

    statuses = drbd_state.scan(/([0-9]+): cs:([^\s]+) ro:([^\s]+) ds:([^\s]+) ([ABC]?) ([^\s]+)?/).map do |res, cs, _ro, ds, _protocol, _flags|
      name    = guestports[res]
      machine = name || res
      # A resource with no known guest name is labelled by its device only.
      label   = name ? "#{name} (drbd#{res})" : "drbd#{res}"

      next if ignored.include?(machine)
      next if paired && (local.include?(machine) || remote.include?(machine))
      next if ignored.include?(label)
      next if ignored.include?("/dev/drbd#{res}")

      problems = []
      problems << "Connection state is #{cs}" unless OKCSTATES.include?(cs)
      problems << "Resyncing/Bad disk state: Disk state is #{ds}" unless OKDSTATES.include?(ds)

      if problems.empty?
        [res, :clear, nil]
      else
        [res, :raise, problems.map { |s| "* #{label}: #{s}" }.join("\n")]
      end
    end.compact

    statuses.partition { |status| status[1] == :raise }
  end

  private

  # Extracts guest names from `ps -ef` output describing backup pipelines
  # of the form:
  #   gzip -c SNAPSHOT | cpipe -s SPEED | ssh DEST cat > TEMP && mv TEMP DPATH
  #
  # Only lines mentioning gzip are considered.  The 12th
  # whitespace-separated field of the text before the first "|" is taken
  # as the snapshot path; the guest name is the part of its basename
  # before the first "_".  Lines too short to carry that field are skipped.
  def guests_from_ps(ps_output)
    ps_output.split(/[\r\n]/).map do |line|
      next unless line =~ /gzip/

      snapshot = line.split("|").first.to_s.split[11]
      next if snapshot.nil?

      File.basename(snapshot).split("_").first
    end.compact
  end

end



# Command-line entry point: prints a YAML list of alerts to raise on
# stdout (empty when everything is fine or checking does not apply) and
# always exits 0.
if __FILE__ == $PROGRAM_NAME

  # Diagnostics go to stderr so that stdout carries only the YAML.
  def verbose(str)
    STDERR.puts(str)
  end

  # Alerts we'll raise
  to_raise = []

  # Create the helper
  x = DRBDChecker.new

  # Are we disabled?
  if x.disabled?
    verbose("Checking disabled")
    puts YAML.dump(to_raise)
    exit(0)
  end

  # Are we on a non-DRBD host?
  unless x.drbd?
    verbose("Non-DRBD host")
    puts YAML.dump(to_raise)
    exit(0)
  end

  # Now we're cooking on gas
  notok, ok = x.check

  if notok.length > 0
    # Everything check() looked at ended up in exactly one of the two lists.
    checked = notok.length + ok.length
    verbose("#{notok.length} (out of #{checked}) DRBD shares are not OK.")

    # The checker's own locals are not visible here, so fetch the raw
    # status text and the ignore list again for the report.
    drbd_state = File.read("/proc/drbd")
    ignored    = x.ignored_guests || []

    h = {}
    h[:summary] = "#{notok.length} DRBD shares are not OK"
    h[:detail] = notok.map { |a| a[2] }.join("\n")
    h[:detail] << "\n\nFull dump follows:\n\n"
    h[:detail] << "<pre>"
    h[:detail] << drbd_state
    h[:detail] << "</pre>"
    h[:detail] << "\n\n#{ignored.length} shares were ignored."

    if x.pairvm?
      local_backups  = x.backing_up_locally || []
      remote_backups = x.backing_up_remotely || []
      h[:detail] << "\n\n#{local_backups.length} shares were ignored because they were being backed up at the time (PairVM)."
      h[:detail] << "\n\n#{remote_backups.length} shares were ignored because they were being backed (on the other host) up at the time (PairVM)."
    end
    to_raise.push(h)
  else
    verbose("All #{ok.length} shares are OK.")
  end

  puts YAML.dump(to_raise)
  exit(0)

end
