Merge branch '14484-collection-record-update'
authorEric Biagiotti <ebiagiotti@veritasgenetics.com>
Thu, 4 Apr 2019 14:54:49 +0000 (10:54 -0400)
committerEric Biagiotti <ebiagiotti@veritasgenetics.com>
Thu, 4 Apr 2019 14:54:49 +0000 (10:54 -0400)
refs #14484

Arvados-DCO-1.1-Signed-off-by: Eric Biagiotti <ebiagiotti@veritasgenetics.com>

doc/api/methods/collections.html.textile.liquid
services/api/app/models/collection.rb
services/api/db/migrate/20190322174136_add_file_info_to_collection.rb [new file with mode: 0755]
services/api/db/structure.sql
services/api/lib/group_pdhs.rb [new file with mode: 0644]
services/api/test/fixtures/collections.yml
services/api/test/functional/arvados/v1/collections_controller_test.rb
services/api/test/unit/collection_test.rb
services/api/test/unit/group_pdhs_test.rb [new file with mode: 0644]

index c68773d900fd3dec88ec389ce73cc1410ac9ec2d..d611c5b1613ce4ba93c48039c6c335457c5584e4 100644 (file)
@@ -39,6 +39,8 @@ table(table table-bordered table-condensed).
 |current_version_uuid|string|UUID of the collection's current version. On new collections, it'll be equal to the @uuid@ attribute.||
 |version|number|Version number, starting at 1 on new collections. This attribute is read-only.||
 |preserve_version|boolean|When set to true on a current version, it will be saved on the next versionable update.||
+|file_count|number|The total number of files in the collection. This attribute is read-only.||
+|file_size_total|number|The sum of the file sizes in the collection. This attribute is read-only.||
 
 h3. Conditions of creating a Collection
 
index 6147b79f9f5aa16c6b9e24ef5164bab43139373a..578f25a62fc1eee9a081dfd6a9bac767a44e90d9 100644 (file)
@@ -29,6 +29,7 @@ class Collection < ArvadosModel
   validate :ensure_storage_classes_contain_non_empty_strings
   validate :versioning_metadata_updates, on: :update
   validate :past_versions_cannot_be_updated, on: :update
+  after_validation :set_file_count_and_total_size
   before_save :set_file_names
   around_update :manage_versioning
 
@@ -51,6 +52,8 @@ class Collection < ArvadosModel
     t.add :version
     t.add :current_version_uuid
     t.add :preserve_version
+    t.add :file_count
+    t.add :file_size_total
   end
 
   after_initialize do
@@ -195,6 +198,20 @@ class Collection < ArvadosModel
     true
   end
 
+  def set_file_count_and_total_size
+    # Only update the file stats if the manifest changed
+    if self.manifest_text_changed?
+      m = Keep::Manifest.new(self.manifest_text)
+      self.file_size_total = m.files_size
+      self.file_count = m.files_count
+    # If the manifest didn't change but the attributes did, ignore the changes
+    elsif self.file_count_changed? || self.file_size_total_changed?
+      self.file_count = self.file_count_was
+      self.file_size_total = self.file_size_total_was
+    end
+    true
+  end
+
   def manifest_files
     return '' if !self.manifest_text
 
diff --git a/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb b/services/api/db/migrate/20190322174136_add_file_info_to_collection.rb
new file mode 100755 (executable)
index 0000000..146e105
--- /dev/null
@@ -0,0 +1,63 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+require "arvados/keep"
+require "group_pdhs"
+
+class AddFileInfoToCollection < ActiveRecord::Migration
+  def do_batch(pdhs)
+    pdhs_str = ''
+    pdhs.each do |pdh|
+      pdhs_str << "'" << pdh << "'" << ","
+    end
+
+    collections = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash, manifest_text FROM collections "\
+      "WHERE portable_data_hash IN (#{pdhs_str[0..-2]}) "
+    )
+
+    collections.rows.each do |row|
+      manifest = Keep::Manifest.new(row[1])
+      ActiveRecord::Base.connection.exec_query("BEGIN")
+      ActiveRecord::Base.connection.exec_query("UPDATE collections SET file_count=#{manifest.files_count}, "\
+                                               "file_size_total=#{manifest.files_size} "\
+                                               "WHERE portable_data_hash='#{row[0]}'")
+      ActiveRecord::Base.connection.exec_query("COMMIT")
+    end
+  end
+
+  def up
+    add_column :collections, :file_count, :integer, default: 0, null: false
+    add_column :collections, :file_size_total, :integer, limit: 8, default: 0, null: false
+
+    distinct_pdh_count = ActiveRecord::Base.connection.exec_query(
+      "SELECT DISTINCT portable_data_hash FROM collections"
+    ).rows.count
+
+    # Generator that queries for all the distinct pdhs greater than last_pdh
+    ordered_pdh_query = lambda { |last_pdh, &block|
+      pdhs = ActiveRecord::Base.connection.exec_query(
+        "SELECT DISTINCT portable_data_hash FROM collections "\
+        "WHERE portable_data_hash > '#{last_pdh}' "\
+        "ORDER BY portable_data_hash LIMIT 1000"
+      )
+      pdhs.rows.each do |row|
+        block.call(row[0])
+      end
+    }
+
+    batch_size_max = 1 << 28 # 256 MiB
+    GroupPdhs.group_pdhs_for_multiple_transactions(ordered_pdh_query,
+                                                   distinct_pdh_count,
+                                                   batch_size_max,
+                                                   "AddFileInfoToCollection") do |pdhs|
+      do_batch(pdhs)
+    end
+  end
+
+  def down
+    remove_column :collections, :file_count
+    remove_column :collections, :file_size_total
+  end
+end
index f766f33e1b35e1f85a64aa2c5d87bf85e2bb6d0f..4520e1bc04623e1a4a2f240ec2282c5308361785 100644 (file)
@@ -175,7 +175,9 @@ CREATE TABLE public.collections (
     storage_classes_confirmed_at timestamp without time zone,
     current_version_uuid character varying,
     version integer DEFAULT 1 NOT NULL,
-    preserve_version boolean DEFAULT false
+    preserve_version boolean DEFAULT false,
+    file_count integer DEFAULT 0 NOT NULL,
+    file_size_total bigint DEFAULT 0 NOT NULL
 );
 
 
@@ -3220,3 +3222,5 @@ INSERT INTO schema_migrations (version) VALUES ('20181213183234');
 
 INSERT INTO schema_migrations (version) VALUES ('20190214214814');
 
+INSERT INTO schema_migrations (version) VALUES ('20190322174136');
+
diff --git a/services/api/lib/group_pdhs.rb b/services/api/lib/group_pdhs.rb
new file mode 100644 (file)
index 0000000..0630ef8
--- /dev/null
@@ -0,0 +1,39 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+module GroupPdhs
+  # NOTE: Migration 20190322174136_add_file_info_to_collection.rb relies on this function.
+  #
+  # Change with caution!
+  #
+  # Correctly groups pdhs to use for batch database updates. Helps avoid
+  # updating too many database rows in a single transaction.
+  def self.group_pdhs_for_multiple_transactions(distinct_ordered_pdhs, distinct_pdh_count, batch_size_max, log_prefix)
+    batch_size = 0
+    batch_pdhs = {}
+    last_pdh = '0'
+    done = 0
+    any = true
+
+    while any
+      any = false
+      distinct_ordered_pdhs.call(last_pdh) do |pdh|
+        any = true
+        last_pdh = pdh
+        manifest_size = pdh.split('+')[1].to_i
+        if batch_size > 0 && batch_size + manifest_size > batch_size_max
+          yield batch_pdhs.keys
+          done += batch_pdhs.size
+          Rails.logger.info(log_prefix + ": #{done}/#{distinct_pdh_count}")
+          batch_pdhs = {}
+          batch_size = 0
+        end
+        batch_pdhs[pdh] = true
+        batch_size += manifest_size
+      end
+    end
+    yield batch_pdhs.keys
+    Rails.logger.info(log_prefix + ": finished")
+  end
+end
index 8763f3944471e9a5cad4f4565da2833f988bbdf8..c84e479e48fbe6118a1297a903013addf68e928e 100644 (file)
@@ -29,6 +29,22 @@ collection_owned_by_active:
   name: owned_by_active
   version: 2
 
+collection_owned_by_active_with_file_stats:
+  uuid: zzzzz-4zz18-fjeod4od92kfj5f
+  current_version_uuid: zzzzz-4zz18-fjeod4od92kfj5f
+  portable_data_hash: fa7aeb5140e2848d39b416daeef4ffc5+45
+  owner_uuid: zzzzz-tpzed-xurymjxw79nv3jz
+  created_at: 2014-02-03T17:22:54Z
+  modified_by_client_uuid: zzzzz-ozdt8-brczlopd8u8d0jr
+  modified_by_user_uuid: zzzzz-tpzed-d9tiejq69daie8f
+  modified_at: 2014-02-03T17:22:54Z
+  updated_at: 2014-02-03T17:22:54Z
+  manifest_text: ". 37b51d194a7513e45b56f6524f2d51f2+3 0:3:bar\n"
+  file_count: 1
+  file_size_total: 3
+  name: owned_by_active_with_file_stats
+  version: 2
+
 collection_owned_by_active_past_version_1:
   uuid: zzzzz-4zz18-znfnqtbbv4spast
   current_version_uuid: zzzzz-4zz18-bv31uwvy3neko21
index 997d89d5cd13d72c97dd3edcaa177bca36e1efed..ff581e44291d89a109f0228a2e4bf23399345c8d 100644 (file)
@@ -937,6 +937,89 @@ EOS
     assert_equal 'value_1', json_response['properties']['property_1']
   end
 
+  [
+    [". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n", 1, 34],
+    [". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt 0:30:foo.txt 0:30:foo1.txt 0:30:foo2.txt 0:30:foo3.txt 0:30:foo4.txt\n", 5, 184],
+    [". d41d8cd98f00b204e9800998ecf8427e 0:0:.\n", 0, 0]
+  ].each do |manifest, count, size|
+    test "create collection with valid manifest #{manifest} and expect file stats" do
+      authorize_with :active
+      post :create, {
+        collection: {
+          manifest_text: manifest
+        }
+      }
+      assert_response 200
+      assert_equal count, json_response['file_count']
+      assert_equal size, json_response['file_size_total']
+    end
+  end
+
+  test "update collection manifest and expect new file stats" do
+    authorize_with :active
+    post :update, {
+      id: collections(:collection_owned_by_active_with_file_stats).uuid,
+      collection: {
+        manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n"
+      }
+    }
+    assert_response 200
+    assert_equal 1, json_response['file_count']
+    assert_equal 34, json_response['file_size_total']
+  end
+
+  [
+    ['file_count', 1],
+    ['file_size_total', 34]
+  ].each do |attribute, val|
+    test "create collection with #{attribute} and expect overwrite" do
+      authorize_with :active
+      post :create, {
+        collection: {
+          manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n",
+          "#{attribute}": 10
+        }
+      }
+      assert_response 200
+      assert_equal val, json_response[attribute]
+    end
+  end
+
+  [
+    ['file_count', 1],
+    ['file_size_total', 3]
+  ].each do |attribute, val|
+    test "update collection with #{attribute} and expect ignore" do
+      authorize_with :active
+      post :update, {
+        id: collections(:collection_owned_by_active_with_file_stats).uuid,
+        collection: {
+          "#{attribute}": 10
+        }
+      }
+      assert_response 200
+      assert_equal val, json_response[attribute]
+    end
+  end
+
+  [
+    ['file_count', 1],
+    ['file_size_total', 34]
+  ].each do |attribute, val|
+    test "update collection with #{attribute} and manifest and expect manifest values" do
+      authorize_with :active
+      post :update, {
+        id: collections(:collection_owned_by_active_with_file_stats).uuid,
+        collection: {
+          manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n",
+          "#{attribute}": 10
+        }
+      }
+      assert_response 200
+      assert_equal val, json_response[attribute]
+    end
+  end
+
   [
     ". 0:0:foo.txt",
     ". d41d8cd98f00b204e9800998ecf8427e foo.txt",
index 9797ed63dc0d098898d38a4e0741ecd9fc7e0e4c..8deedee0186ea5bbd87ba6d219d7ef4d47f66314 100644 (file)
@@ -60,6 +60,56 @@ class CollectionTest < ActiveSupport::TestCase
     end
   end
 
+  [
+    [". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n", 1, 34],
+    [". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt 0:30:foo.txt 0:30:foo1.txt 0:30:foo2.txt 0:30:foo3.txt 0:30:foo4.txt\n", 5, 184],
+    [". d41d8cd98f00b204e9800998ecf8427e 0:0:.\n", 0, 0]
+  ].each do |manifest, count, size|
+    test "file stats on create collection with #{manifest}" do
+      act_as_system_user do
+        c = Collection.create(manifest_text: manifest)
+        assert_equal count, c.file_count
+        assert_equal size, c.file_size_total
+      end
+    end
+  end
+
+  test "file stats cannot be changed unless through manifest change" do
+    act_as_system_user do
+      # Direct changes to file stats should be ignored
+      c = Collection.create(manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n")
+      c.file_count = 6
+      c.file_size_total = 30
+      assert c.valid?
+      assert_equal 1, c.file_count
+      assert_equal 34, c.file_size_total
+
+      # File stats specified on create should be ignored and overwritten
+      c = Collection.create(manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n", file_count: 10, file_size_total: 10)
+      assert c.valid?
+      assert_equal 1, c.file_count
+      assert_equal 34, c.file_size_total
+
+      # Updating the manifest should change file stats
+      c.update_attributes(manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt 0:34:foo2.txt\n")
+      assert c.valid?
+      assert_equal 2, c.file_count
+      assert_equal 68, c.file_size_total
+
+      # Updating file stats and the manifest should use manifest values
+      c.update_attributes(manifest_text: ". d41d8cd98f00b204e9800998ecf8427e 0:34:foo.txt\n", file_count:10, file_size_total: 10)
+      assert c.valid?
+      assert_equal 1, c.file_count
+      assert_equal 34, c.file_size_total
+
+      # Updating just the file stats should be ignored
+      c.update_attributes(file_count: 10, file_size_total: 10)
+      assert c.valid?
+      assert_equal 1, c.file_count
+      assert_equal 34, c.file_size_total
+    end
+  end
+
   [
     nil,
     "",
diff --git a/services/api/test/unit/group_pdhs_test.rb b/services/api/test/unit/group_pdhs_test.rb
new file mode 100644 (file)
index 0000000..82256e6
--- /dev/null
@@ -0,0 +1,27 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+require 'test_helper'
+require 'group_pdhs'
+
+# NOTE: Migration 20190322174136_add_file_info_to_collection.rb
+# relies on this test. Change with caution!
+class GroupPdhsTest < ActiveSupport::TestCase
+  test "pdh_grouping_by_manifest_size" do
+    batch_size_max = 200
+    pdhs_in = ['x1+30', 'x2+30', 'x3+201', 'x4+100', 'x5+100']
+    pdh_lambda = lambda { |last_pdh, &block|
+      pdhs = pdhs_in.select{|pdh| pdh > last_pdh} 
+      pdhs.each do |p|
+        block.call(p)
+      end
+    }
+    batched_pdhs = []
+    GroupPdhs.group_pdhs_for_multiple_transactions(pdh_lambda, pdhs_in.size, batch_size_max, "") do |pdhs|
+      batched_pdhs << pdhs
+    end
+    expected = [['x1+30', 'x2+30'], ['x3+201'], ['x4+100', 'x5+100']]
+    assert_equal(batched_pdhs, expected)
+  end
+end