use spot instances for compute nodes
[arvados.git] / app / models / node.rb
index 6f73d4abeb36f2b38a889df6a7a3c54d8c1af910..853b2712c7ac2c2d54d3fbc9e879c65ddf97015b 100644 (file)
@@ -1,15 +1,57 @@
-class Node < ActiveRecord::Base
+class Node < OrvosModel
   include AssignUuid
+  include KindAndEtag
+  include CommonApiTemplate
   serialize :info, Hash
   before_validation :ensure_ping_secret
+  after_update :dnsmasq_update
 
   MAX_SLOTS = 64
 
+  @@confdir = if Rails.configuration.respond_to? :dnsmasq_conf_dir
+                Rails.configuration.dnsmasq_conf_dir
+              elsif File.exists? '/etc/dnsmasq.d/.'
+                '/etc/dnsmasq.d'
+              else
+                nil
+              end
+  @@domain = Rails.configuration.compute_node_domain rescue `hostname --domain`.strip
+  @@nameservers = Rails.configuration.compute_node_nameservers
+
+  api_accessible :superuser, :extend => :common do |t|
+    t.add :hostname
+    t.add :domain
+    t.add :ip_address
+    t.add :first_ping_at
+    t.add :last_ping_at
+    t.add :info
+    t.add :status
+    t.add lambda { |x| @@nameservers }, :as => :nameservers
+  end
+
   def info
     @info ||= Hash.new
     super
   end
 
+  def domain
+    super || @@domain
+  end
+
+  def status
+    if !self.last_ping_at
+      if Time.now - self.created_at > 5.minutes
+        'startup-fail'
+      else
+        'pending'
+      end
+    elsif Time.now - self.last_ping_at > 1.hours
+      'missing'
+    else
+      'running'
+    end
+  end
+
   def ping(o)
     raise "must have :ip and :ping_secret" unless o[:ip] and o[:ping_secret]
 
@@ -19,6 +61,8 @@ class Node < ActiveRecord::Base
     end
     self.last_ping_at = Time.now
 
+    @bypass_orvos_authorization = true
+
     # Record IP address
     if self.ip_address.nil?
       logger.info "#{self.uuid} ip_address= #{o[:ip]}"
@@ -27,39 +71,65 @@ class Node < ActiveRecord::Base
     end
 
     # Record instance ID if not already known
-    self.info[:ec2_instance_id] ||= o[:ec2_instance_id]
+    if !self.info[:ec2_instance_id] and o[:ec2_instance_id]
+      self.info[:ec2_instance_id] = o[:ec2_instance_id]
+      `ec2-create-tags #{self.info[:ec2_instance_id]} --tag 'Name=#{self.uuid}'`
+    end
 
     # Assign hostname
     if self.slot_number.nil?
       try_slot = 0
       begin
         self.slot_number = try_slot
-        try_slot += 1
-        break if self.save rescue nil
+        begin
+          self.save!
+          break
+        rescue ActiveRecord::RecordNotUnique
+          try_slot += 1
+        end
         raise "No available node slots" if try_slot == MAX_SLOTS
       end while true
-      self.hostname = "compute#{self.slot_number}"
+      self.hostname = self.class.hostname_for_slot(self.slot_number)
+      if info[:ec2_instance_id]
+        `ec2-create-tags #{self.info[:ec2_instance_id]} --tag 'hostname=#{self.hostname}'`
+      end
     end
 
     save
   end
 
   def start!(ping_url_method)
+    ensure_permission_to_update
     ping_url = ping_url_method.call({ uuid: self.uuid, ping_secret: self.info[:ping_secret] })
-    cmd = ["ec2-run-instances",
-           "--user-data '#{ping_url}'",
-           "-t c1.xlarge -n 1 -g orvos-compute",
-           "ami-68ca6901"
-          ].join(' ')
-    self.info[:ec2_start_command] = cmd
-    logger.info "#{self.uuid} ec2_start_command= #{cmd.inspect}"
-    result = `#{cmd} 2>&1`
+    ec2_args = ["--user-data '#{ping_url}'",
+                "-t c1.xlarge -n 1",
+                "-g", Rails.configuration.compute_node_security_group,
+                Rails.configuration.compute_node_ami
+               ]
+    ec2run_cmd = ["ec2-run-instances",
+                  "--client-token", self.uuid,
+                  ec2_args].flatten.join(' ')
+    ec2spot_cmd = ["ec2-request-spot-instances",
+                   "-p #{Rails.configuration.compute_node_spot_bid} --type one-time",
+                   ec2_args].flatten.join(' ')
+    self.info[:ec2_run_command] = ec2run_cmd
+    self.info[:ec2_spot_command] = ec2spot_cmd
+    self.info[:ec2_start_command] = ec2spot_cmd
+    logger.info "#{self.uuid} ec2_start_command= #{ec2spot_cmd.inspect}"
+    result = `#{ec2spot_cmd} 2>&1`
     self.info[:ec2_start_result] = result
     logger.info "#{self.uuid} ec2_start_result= #{result.inspect}"
     result.match(/INSTANCE\s*(i-[0-9a-f]+)/) do |m|
-      self.info[:ec2_instance_id] = m[1]
-      self.save!
+      instance_id = m[1]
+      self.info[:ec2_instance_id] = instance_id
+      `ec2-create-tags #{instance_id} --tag 'Name=#{self.uuid}'`
+    end
+    result.match(/SPOTINSTANCEREQUEST\s*(sir-[0-9a-f]+)/) do |m|
+      sir_id = m[1]
+      self.info[:ec2_sir_id] = sir_id
+      `ec2-create-tags #{sir_id} --tag 'Name=#{self.uuid}'`
     end
+    self.save!
   end
 
   protected
@@ -67,4 +137,52 @@ class Node < ActiveRecord::Base
   def ensure_ping_secret
     self.info[:ping_secret] ||= rand(2**256).to_s(36)
   end
+
+  def dnsmasq_update
+    if self.hostname_changed? or self.ip_address_changed?
+      if self.hostname and self.ip_address
+        self.class.dnsmasq_update(self.hostname, self.ip_address)
+      end
+    end
+  end
+
+  def self.dnsmasq_update(hostname, ip_address)
+    return unless @@confdir
+    ptr_domain = ip_address.
+      split('.').reverse.join('.').concat('.in-addr.arpa')
+    hostfile = File.join @@confdir, hostname
+    File.open hostfile, 'w' do |f|
+      f.puts "address=/#{hostname}/#{ip_address}"
+      f.puts "address=/#{hostname}.#{@@domain}/#{ip_address}" if @@domain
+      f.puts "ptr-record=#{ptr_domain},#{hostname}"
+    end
+    File.open(File.join(@@confdir, 'restart.txt'), 'w') do |f|
+      # this should trigger a dnsmasq restart
+    end
+  end
+
+  def self.hostname_for_slot(slot_number)
+    "compute#{slot_number}"
+  end
+
+  # At startup, make sure all DNS entries exist.  Otherwise, slurmctld
+  # will refuse to start.
+  if @@confdir and
+      !File.exists? (File.join(@@confdir, hostname_for_slot(MAX_SLOTS-1)))
+    (0..MAX_SLOTS-1).each do |slot_number|
+      hostname = hostname_for_slot(slot_number)
+      hostfile = File.join @@confdir, hostname
+      if !File.exists? hostfile
+        dnsmasq_update(hostname, '127.40.4.0')
+      end
+    end
+  end
+
+  def permission_to_update
+    @bypass_orvos_authorization or super
+  end
+
+  def permission_to_create
+    current_user and current_user.is_admin
+  end
 end