#!/usr/bin/ruby
# coding: utf-8

require 'fileutils'
require 'getoptlong'
require 'socket'
require 'yaml'
require 'open3'
require 'mauve/sender'
require 'mauve/proto'
require 'English'

#
#  Use the current time to schedule execution of various checks and
# tests from numbered directories beneath /etc/bytemark-healthcheck/.
#
#  Each check is designed to return an array of hashes, one hash for
# each alert which should be raised.
#
#  The hashes *must* have the following three keys:
#
#     :id
#     :summary
#     :detail
#
#  In this script we add extra fields ":filename", and ":timestamp", so
# that we can replace previous alerts with more current ones.
#
class HealthcheckDriver
  #
  # The prefix of our run-parts-like directory tree.
  #
  attr_reader :prefix

  #
  # The file we used to save state
  #
  attr_reader :state_file

  #
  # The numbered directories we have discovered.
  #
  attr_reader :directories

  #
  # The alerts we will raise.
  #
  attr_reader :alerts

  #
  # The time used for all scheduling decisions.
  #
  attr_reader :now

  #
  # A hash of failed jobs, mapping job-path to a failure description.
  #
  attr_reader :failures

  #
  #  Constructor
  #
  #  prefix_dir - the directory containing the period-named job directories.
  #  state_file - the YAML file used to persist alerts/failures between runs.
  #  now        - the Time used for scheduling (injectable for testing).
  #
  def initialize(prefix_dir, state_file, now = Time.now)
    @prefix = File.expand_path(prefix_dir)
    @state_file = File.expand_path(state_file)
    @directories = []
    @alerts      = []

    # Catch failures, so we can alert about them
    @failures    = {}

    # Set the time
    @now = now

    #
    # Jobs we've already executed during this run, so that symlinked
    # duplicates are only executed once.
    #
    @jobs_run = []
  end

  #
  # Print if we're verbose
  #
  def verbose(str)
    puts(str) if $VERBOSE || ENV['VERBOSE']
  end

  #
  #  Load any previous alerts - taking into account the TTL which is
  # specified in minutes.  Expired alerts are discarded.
  #
  def load_previous
    return unless File.exist? @state_file

    begin
      #
      #  The state-file is our own output, and contains Symbol keys.
      #
      #  Under Psych 4 (Ruby 3.1+) `YAML.load_file` safe-loads, and
      # would reject Symbols, so prefer `unsafe_load_file` when it
      # exists.
      #
      old = if YAML.respond_to?(:unsafe_load_file)
              YAML.unsafe_load_file(@state_file)
            else
              YAML.load_file(@state_file)
            end
    rescue Psych::SyntaxError, SyntaxError => _e
      #
      #  If we failed to parse the state remove it.
      #
      #  NOTE: YAML parse-failures raise Psych::SyntaxError which is
      # *not* a subclass of ::SyntaxError, so both must be rescued.
      #
      verbose("Failed to parse YAML from dump - #{@state_file} - deleting")
      File.delete(@state_file)
      return
    end

    if old.is_a?(Array)
      # Legacy format: a bare array of alerts.
      @alerts   = old
      @failures = {}
    elsif old.is_a?(Hash)
      # Guard against nil members, so @alerts/@failures never end up nil.
      @alerts    = old[:alerts] || old['alerts'] || []
      @failures  = old[:failures] || old['failures'] || {}
    end

    @alerts.delete_if do |prev|
      # Get the old timestamp
      ts = prev[:timestamp] || 0

      # Get the old TTL, defaulting to one day.
      ttl = prev[:ttl] || (60 * 24)

      (ts + (ttl * 60) + 1) < @now.to_i
    end
  end

  #
  # Should the given period-directory be executed at the Time given by @now?
  #
  # period - the run-frequency in minutes; must be a positive Integer.
  #
  # Raises ArgumentError for a non-integer or non-positive period.
  #
  def should_run?(period)
    raise ArgumentError, 'period must be an integer' unless period.is_a?(Integer)
    raise ArgumentError, 'period must be greater than one' if period < 1

    #
    # Simple test of whether a given frequency should run: is the
    # number of minutes since the epoch divisible by the period?
    #
    ((now.to_i / 60.0).floor.to_i % period == 0)
  end

  #
  # Look at each of the period-directories, and execute those that
  # need to be triggered.
  #
  # The "at" argument allows an argument on the command line to be processed:
  # only directories matching that period are run ("all" matches every one).
  #
  def run(at = nil)
    #
    # Look for sub-directories which will be named after the run-frequency,
    # along with some others which have more humane-names.
    #
    #  ( "Hourly", "daily", "jobs.d", etc)
    #
    Dir.glob(File.join(prefix, '/*')).each do |directory|
      #
      # Stat the directory, but don't follow symlinks to stop running tests twice.
      # TODO Maybe this could be improved with keeping an idea of which directories have been run.
      #
      unless File.lstat(directory).directory?
        verbose("Ignoring #{directory} as not a directory")
        next
      end

      #
      # The period is the directory name, and could be a number or symbolic, like weekly/monthly etc.
      #
      period = self.class.parse_period(File.basename(directory))

      #
      # We can't do periods less than a minute.
      #
      if period < 1
        # NB: compare the basename - `directory` is a full path here.
        verbose("Ignoring #{directory} as it does not parse to a sensible period (#{period})") unless File.basename(directory) == 'jobs.d'
        next
      end

      verbose("Found directory: #{directory}")

      if at.nil?
        #
        # If no at parameter has been given, use our simple test.
        #
        next unless should_run?(period)

      else
        #
        # If an "at" parameter has been given parse it, and if it matches the
        # period, continue.  ("all" parses to 1, which matches everything.)
        #
        at = self.class.parse_period(at)
        next unless (at == 1) || (at == period)
      end

      run_parts(directory, period)
    end
  end

  #
  # Collect the output of the given directory.
  #
  # The output of each test will be YAML which contains alerts to raise.
  #
  def run_parts(directory, period)
    verbose("Running the scripts in: #{directory}")

    #
    #  For each executable run it, and capture the output as YAML.
    #
    #  If the output of the YAML was an array (of hashes) then update
    # our @alerts array to include it.
    #
    Dir.glob(File.join(directory, '*')).each do |job|
      if File.symlink?(job)
        # Attempt to cope with both absolute AND relative symlinks.
        job = File.expand_path(File.readlink(job), directory)
      end

      if @jobs_run.include?(job)
        verbose "Skipping #{job} as it has already been run."
        next
      end

      @jobs_run << job

      # Catch-all error-handling around the job.
      begin
        data = run_job(job, period)
      rescue => e
        @failures[job] = "Exception processing job: #{e}"
        verbose "Exception processing job #{job} - #{e}"
        verbose e.backtrace.join("\n")
        next
      end

      unless data
        verbose "No data returned by #{job}. Skipping"
        next
      end

      #
      # Remove any alerts that were previously produced by this script,
      # since they are about to be replaced.
      #
      # NB: use delete_if rather than deleting from within #each, which
      # can skip elements when mutating the array being iterated.
      #
      @alerts.delete_if { |cur| cur[:filename] == job }

      #
      # Each alert which needs to be raised should now be added
      #
      data.each do |item|
        # Add some meta-data about each alert.
        item[:timestamp] = @now.to_i
        item[:filename] = job
        item[:ttl] = period + 1

        # Add the alert
        @alerts << item
      end
    end
  end

  #
  # Execute a single job, returning the Array of alert-hashes it emitted,
  # or nil if the job was skipped or failed (in which case @failures is
  # updated with the reason).
  #
  def run_job(job, _period)
    #
    # Stat the file once.
    #
    fstat = File.lstat(job)

    unless fstat.file?
      verbose "Skipping #{job} as it is not a file."
      return
    end

    unless fstat.executable?
      verbose "Skipping #{job} as it is not executable."
      return
    end

    #
    # Run the command, hiding the verbose output which has been
    # sent to STDERR.
    #
    stdout = ''
    stderr = ''
    status = 0

    begin
      stdout, stderr, status = Open3.capture3(job)
    rescue NoMethodError
      # Very old rubies lack Open3.capture3; fall back to backticks.
      stdout = `#{job} 2>/dev/null`
    end

    #
    # If the script generated no output, or didn't exit cleanly,
    # then we're going to ignore it and move on.
    #
    # (Process::Status#== compares against Integers, so this also
    # works for the capture3 path.)
    #
    if status != 0
      @failures[job] = "Exited with non-zero status #{status}, STDERR:#{stderr}"
      return
    end

    if stdout.nil? || stdout.empty?
      @failures[job] = 'Did not return any output.'
      return
    end

    #
    # Parse the YAML, if we can.
    #
    begin
      data = parse_alert_yaml(stdout)
    rescue StandardError => e
      @failures[job] = "Failed to parse YAML: #{e}"
      return
    end

    #
    #  We expect to receive an array of hashes.
    #
    #  (Each hash-item representing an alert to raise.)
    #
    unless data.is_a?(Array) && (data.empty? || data.all? { |d| d.is_a?(Hash) })
      @failures[job] = 'Did not return an Array of Hashes, which is odd.  Maybe the script is written incorrectly.'
      return
    end

    #
    # If we get this far, delete any failures that have persisted.
    #
    @failures.delete(job)

    data
  end

  #
  #  Clear all outstanding alerts.
  #
  # Returns the update object (for testing).
  #
  def clear_alerts
    hostname = Socket.gethostname

    #
    # The clear-update we're going to send
    #
    # The key here is that we set "replace" to be true.
    #
    update = Mauve::Proto::AlertUpdate.new
    update.alert   = []
    update.source  = File.basename(__FILE__) + '@' + hostname
    update.replace = true

    #
    # Add the heartbeat, which will `clear` now.
    #
    update.alert << build_heartbeat(hostname)

    #
    #  Do the necessary.
    #
    if send_alerts?
      update.transmission_id = rand(2**63)
      Mauve::Sender.new.send(update)
    else
      verbose 'Should clear all alerts here'
    end

    # For testing.
    update
  end

  #
  #  Unlink our log of previously-raised alerts
  #
  def clean_alerts
    File.unlink(@state_file) if File.exist?(@state_file)
  end

  #
  #  Raise any outstanding alerts, plus the heartbeat, plus a summary
  # alert covering any jobs which failed.
  #
  # Returns the update object (for testing).
  #
  def raise_alerts
    hostname = Socket.gethostname

    #
    # These are the alerts we're going to raise
    #
    update = Mauve::Proto::AlertUpdate.new
    update.alert   = []
    update.source  = File.basename(__FILE__) + '@' + hostname
    update.replace = true

    #
    # Add the heartbeat.
    #
    # Use the current time, in case the tests have taken ages to run.
    #
    # Raise in 5½ minutes time (the extra 30s is a bodge to avoid race
    # conditions)
    #
    update.alert << build_heartbeat(hostname, 330)

    #
    # Alerts we're going to raise now.
    #
    # This is a union of any saved-alerts, and any new ones.
    #
    @alerts.each do |value|
      verbose("Raising alert: #{value[:summary]}")

      alert = Mauve::Proto::Alert.new
      alert.id         = value[:id]
      alert.summary    = value[:summary]
      alert.subject    = hostname
      alert.detail     = value[:detail]
      alert.raise_time = now.to_i

      # Truncate over-long detail fields to ~1KiB.  (The original
      # author noted "This is annoying" - presumably a mauve limit.)
      alert.detail = alert.detail[0..1024] if alert.detail.length > 1025

      update.alert << alert
    end

    #
    #  Update the failures - if we saw any failures.
    #
    unless @failures.empty?

      healthcheck_alert = Mauve::Proto::Alert.new
      healthcheck_alert.id = 'healthcheck-failures'
      healthcheck_alert.subject = hostname
      healthcheck_alert.summary = 'Host health check has not completed properly (this needs manual clearing).'
      healthcheck_alert.detail  = "The following checks failed:\n" + @failures.collect { |check, failure| " * #{check}: #{failure}" }.join("\n")
      healthcheck_alert.raise_time = now.to_i

      update.alert << healthcheck_alert
    end

    #
    #  Do the necessary.
    #
    if send_alerts?
      update.transmission_id = rand(2**63)
      Mauve::Sender.new.send(update)
    else
      verbose "Should send: #{update.alert.size} alerts"
      verbose YAML.dump(update.alert)
    end

    # For testing.
    update
  end

  #
  # Update our state, which is the list of alerts we know about.
  #
  # Written to a temporary file and renamed into place, so a crash
  # mid-write cannot leave a truncated state-file.
  #
  def update_state
    state = { :alerts => @alerts, :failures => @failures }

    begin
      File.open(@state_file + ".tmp.#{$PROCESS_ID}", 'w') do |f|
        f.write(YAML.dump(state))
      end

      File.rename(@state_file + ".tmp.#{$PROCESS_ID}", @state_file)
    rescue Errno::ENOENT
      puts "Failed to write state, no such directory #{@state_file}"
    end
  end

  #
  # This is used to parse command line arguments, as well as directory names.
  #
  # Returns the period in minutes; 0 for anything unrecognised ("all"
  # maps to 1, which matches every period).
  #
  def self.parse_period(arg)
    return arg.to_i if arg.is_a?(Numeric)

    arg = arg.to_s.strip

    #
    # Convert period to an integer, failing gracefully
    #
    case arg
    when /^\d+$/
      # NB: use to_i rather than Integer(), so that zero-padded names
      # such as "08" don't trip octal parsing and blow up.
      arg.to_i
    when /all/i
      1
    when /hourly/i
      60
    when /daily/i
      60 * 24
    when /weekly/i
      60 * 24 * 7
    when /monthly/i
      60 * 24 * 30
    else
      0
    end
  end

  private

  #
  # Build the heartbeat alert.  It clears now; when raise_in (seconds)
  # is given it will re-raise that far in the future, which is how a
  # host which stops checking-in is detected.
  #
  def build_heartbeat(hostname, raise_in = nil)
    heartbeat = Mauve::Proto::Alert.new
    heartbeat.id = 'heartbeat'
    heartbeat.clear_time = Time.now.to_i
    heartbeat.raise_time = Time.now.to_i + raise_in unless raise_in.nil?
    heartbeat.summary = "heartbeat failed for #{hostname}"
    heartbeat.detail  = "The heartbeat wasn't sent for the host #{hostname}\nThis indicates that the host might be down."
    heartbeat
  end

  #
  # Should we actually transmit alerts?  (Disabled via --nop, which
  # sets NOP=NOP in the environment.)
  #
  def send_alerts?
    ENV['NOP'] != 'NOP'
  end

  #
  # Parse job output as YAML, using whichever safe_load API the
  # installed Psych version supports.
  #
  # Psych >= 3.1 takes keyword arguments; older versions took
  # positional whitelists; very old rubies lack safe_load entirely.
  #
  def parse_alert_yaml(text)
    return YAML.load(text) unless YAML.respond_to?(:safe_load)

    keywords = YAML.method(:safe_load).parameters.any? do |type, _name|
      [:key, :keyreq, :keyrest].include?(type)
    end

    if keywords
      YAML.safe_load(text, permitted_classes: [Symbol], permitted_symbols: [:id, :summary, :detail])
    else
      YAML.safe_load(text, [Symbol], [:id, :summary, :detail])
    end
  end
end

#
# Load and execute our driver.
#
#
# Load and execute our driver.
#
if __FILE__ == $PROGRAM_NAME

  #
  # Ensure we have a sane PATH, such that any jobs we execute
  # will find binaries they expect - even if not explicitly qualified
  #
  ENV['PATH'] = '/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin:/usr/local/sbin'

  #
  # Option defaults.
  #
  help   = false
  clear  = false
  prefix = '/etc/bytemark-healthcheck'
  state  = '/dev/shm/bytemark-healthcheck.yml'
  lock   = '/dev/shm/healthcheck.lock'

  begin
    opts = GetoptLong.new(
      ['--clear-all', '-c', GetoptLong::NO_ARGUMENT],
      ['--help', '-h', GetoptLong::NO_ARGUMENT],
      ['--lock', '-l', GetoptLong::REQUIRED_ARGUMENT],
      ['--nop',  '-n', GetoptLong::NO_ARGUMENT],
      ['--prefix', '-p', GetoptLong::REQUIRED_ARGUMENT],
      ['--state', '-s', GetoptLong::REQUIRED_ARGUMENT],
      ['--verbose',  '-v', GetoptLong::NO_ARGUMENT]
    )

    opts.each do |opt, arg|
      case opt
      when '--help' then
        help = true
      when '--lock' then
        lock = arg
      when '--clear-all' then
        clear = true
      when '--nop' then
        # NOP=NOP in the environment suppresses sending alerts.
        ENV['NOP'] = 'NOP'
      when '--prefix' then
        prefix = arg
      when '--state' then
        state = arg
      when '--verbose' then
        ENV['VERBOSE'] = '1'
      end
    end
  rescue StandardError => ex
    puts "Option parsing failed: #{ex}"
    exit
  end

  #
  # Show the usage-text which follows __END__.
  #
  if help
    puts DATA.read
    exit(0)
  end

  #
  # Test the user is root
  #
  # NOTE(review): this deliberately exits zero - presumably to avoid
  # cron-noise on mis-invocation; confirm before changing.
  #
  if Process.uid != 0
    puts 'You must be root to invoke this script'
    exit(0)
  end

  #
  # Attempt to acquire a lock-file to guarantee that
  # our execution is race-free.
  #
  begin
    #
    # CREAT|EXCL means the open fails if the file already exists.  It
    # is the file's *existence* which acts as the lock (not flock()),
    # so close the handle immediately rather than leaking it.
    #
    File.open(lock, File::CREAT | File::EXCL) { |_f| nil }

    #
    # If the previous statement didn't raise an exception
    # then we successfully acquired a lockfile.
    #
    # We should ensure that when this process terminates
    # we release it.
    #
    # NOTE: Even if an exception occurs in our script this
    # `at_exit` call will definitely be invoked - so we're not
    # at risk of orphaning the lockfile.
    #
    at_exit { File.unlink(lock) }

  rescue Errno::EEXIST => ex
    puts("Failed to acquire lockfile (#{lock}) - #{ex}")
    exit(0)
  end

  #
  # Set up the driver
  #
  c = HealthcheckDriver.new(prefix, state)

  #
  # If we're just clearing events then do that, then terminate.
  #
  if clear

    c.clear_alerts

    c.clean_alerts

    exit(0)
  end

  #
  # Load any previous alerts
  #
  c.load_previous

  #
  #  Run our scheduled tests - recording alerts generated.
  #
  #  Each command-line argument is treated as a period to run; with no
  # arguments ARGV.pop yields nil and the time-based schedule is used.
  #
  loop do
    c.run ARGV.pop
    break if ARGV.empty?
  end

  #
  #  Raise any alerts that we should.
  #
  c.raise_alerts

  #
  #  Update our state.
  #
  c.update_state

  #
  #  Terminate
  #
  exit(0)
end

__END__

bytemark-healthcheck:  Script to run system healthchecks

  --help      Show this message.
  --clear-all Clear all outstanding alerts, including our heartbeat.
  --nop       Do not send alerts.
  --prefix    Directory where the jobs are found.  Defaults to /etc/bytemark-healthcheck
  --state     File where YAML state file is kept.  Defaults to /dev/shm/bytemark-healthcheck.yml
  --verbose   Give more output.

Usage:

This script can just be run off the command line.  If arguments are given, scripts that match the period are run.

e.g. If you specify "all", all scripts will be run.

     If you specify "5", all scripts in the 5 minute directory will be run.

     If you specify "hourly", all scripts in the 60 minute directory will be run.

     If you do not specify any argument, then the scripts due at that time will
     be run.  This is decided based on the number of minutes since the start of the
     unix epoch.
