18004: Fixes a couple of race condition bugs related to caching remote users.
authorLucas Di Pentima <lucas.dipentima@curii.com>
Tue, 17 Aug 2021 18:20:59 +0000 (15:20 -0300)
committerLucas Di Pentima <lucas.dipentima@curii.com>
Tue, 17 Aug 2021 18:20:59 +0000 (15:20 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <lucas.dipentima@curii.com>

lib/controller/integration_test.go
services/api/app/models/api_client_authorization.rb

index 8a12f1d2fa2f02751f94cb91004e767425c8aa34..6851442054e1f49e8cde8c87dcced6d9eea0918a 100644 (file)
@@ -189,7 +189,7 @@ func (s *IntegrationSuite) TestGetCollectionByPDH(c *check.C) {
 }
 
 // Tests bug #18004
-func (s *IntegrationSuite) TestRemoteTokenCacheRace(c *check.C) {
+func (s *IntegrationSuite) TestRemoteUserAndTokenCacheRace(c *check.C) {
        conn1 := s.testClusters["z1111"].Conn()
        rootctx1, _, _ := s.testClusters["z1111"].RootClients()
        rootctx2, _, _ := s.testClusters["z2222"].RootClients()
@@ -208,7 +208,7 @@ func (s *IntegrationSuite) TestRemoteTokenCacheRace(c *check.C) {
                        defer wg2.Done()
                        wg1.Wait()
                        _, err := conn2.UserGetCurrent(rootctx2, arvados.GetOptions{})
-                       c.Check(err, check.IsNil)
+                       c.Check(err, check.IsNil, check.Commentf("warm up phase failed"))
                }()
        }
        wg1.Done()
@@ -224,7 +224,7 @@ func (s *IntegrationSuite) TestRemoteTokenCacheRace(c *check.C) {
                        wg1.Wait()
                        // Retrieve the remote collection from cluster z2222.
                        _, err := conn2.UserGetCurrent(userctx1, arvados.GetOptions{})
-                       c.Check(err, check.IsNil)
+                       c.Check(err, check.IsNil, check.Commentf("testing phase failed"))
                }()
        }
        wg1.Done()
index 52f2cee064905fd6a81e4e9e60a774dfc80bab55..7c7ed759c60058b5915ad1d56505dba6b56d84dd 100644 (file)
@@ -319,7 +319,17 @@ class ApiClientAuthorization < ArvadosModel
         user.last_name = "from cluster #{remote_user_prefix}"
       end
 
-      user.save!
+      begin
+        user.save!
+      rescue ActiveRecord::RecordInvalid, ActiveRecord::RecordNotUnique
+        Rails.logger.debug("remote user #{remote_user['uuid']} already exists, retrying...")
+        # Some other request won the race: retry fetching the user record.
+        user = User.find_by_uuid(remote_user['uuid'])
+        if !user
+          Rails.logger.warn("cannot find or create remote user #{remote_user['uuid']}")
+          return nil
+        end
+      end
 
       if user.is_invited && !remote_user['is_invited']
         # Remote user is not "invited" state, they should be unsetup, which
@@ -364,12 +374,24 @@ class ApiClientAuthorization < ArvadosModel
       exp = [db_current_time + Rails.configuration.Login.RemoteTokenRefresh,
              remote_token.andand['expires_at']].compact.min
       scopes = remote_token.andand['scopes'] || ['all']
-      auth = ApiClientAuthorization.find_or_create_by(uuid: token_uuid) do |auth|
-        auth.user = user
-        auth.api_token = stored_secret
-        auth.api_client_id = 0
-        auth.scopes = scopes
-        auth.expires_at = exp
+      begin
+        retries ||= 0
+        auth = ApiClientAuthorization.find_or_create_by(uuid: token_uuid) do |auth|
+          auth.user = user
+          auth.api_token = stored_secret
+          auth.api_client_id = 0
+          auth.scopes = scopes
+          auth.expires_at = exp
+        end
+      rescue ActiveRecord::RecordNotUnique
+        Rails.logger.debug("cached remote token #{token_uuid} already exists, retrying...")
+        # Some other request won the race: retry just once before erroring out
+        if (retries += 1) <= 1
+          retry
+        else
+          Rails.logger.warn("cannot find or create cached remote token #{token_uuid}")
+          return nil
+        end
       end
       auth.update_attributes!(user: user,
                               api_token: stored_secret,