Merge branch '18004-cached-token-race-condition' into main. Closes #18004
authorLucas Di Pentima <lucas.dipentima@curii.com>
Tue, 24 Aug 2021 15:31:11 +0000 (12:31 -0300)
committerLucas Di Pentima <lucas.dipentima@curii.com>
Tue, 24 Aug 2021 15:31:11 +0000 (12:31 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <lucas.dipentima@curii.com>

lib/controller/integration_test.go
services/api/app/models/api_client_authorization.rb

index 26f0dbb0d1388da1886cea726fc644648b4d57e3..6851442054e1f49e8cde8c87dcced6d9eea0918a 100644 (file)
@@ -20,6 +20,7 @@ import (
        "path/filepath"
        "strconv"
        "strings"
+       "sync"
 
        "git.arvados.org/arvados.git/lib/boot"
        "git.arvados.org/arvados.git/lib/config"
@@ -187,6 +188,49 @@ func (s *IntegrationSuite) TestGetCollectionByPDH(c *check.C) {
        c.Check(coll.PortableDataHash, check.Equals, pdh)
 }
 
+// Tests bug #18004
+func (s *IntegrationSuite) TestRemoteUserAndTokenCacheRace(c *check.C) {
+       conn1 := s.testClusters["z1111"].Conn()
+       rootctx1, _, _ := s.testClusters["z1111"].RootClients()
+       rootctx2, _, _ := s.testClusters["z2222"].RootClients()
+       conn2 := s.testClusters["z2222"].Conn()
+       userctx1, _, _, _ := s.testClusters["z1111"].UserClients(rootctx1, c, conn1, "user2@example.com", true)
+
+       var wg1, wg2 sync.WaitGroup
+       creqs := 100
+
+       // Make concurrent requests to z2222 with a local token to make sure more
+       // than one worker is listening.
+       wg1.Add(1)
+       for i := 0; i < creqs; i++ {
+               wg2.Add(1)
+               go func() {
+                       defer wg2.Done()
+                       wg1.Wait()
+                       _, err := conn2.UserGetCurrent(rootctx2, arvados.GetOptions{})
+                       c.Check(err, check.IsNil, check.Commentf("warm up phase failed"))
+               }()
+       }
+       wg1.Done()
+       wg2.Wait()
+
+       // Real test pass -- use a new remote token than the one used in the warm-up
+       // phase.
+       wg1.Add(1)
+       for i := 0; i < creqs; i++ {
+               wg2.Add(1)
+               go func() {
+                       defer wg2.Done()
+                       wg1.Wait()
+                       // Retrieve the remote collection from cluster z2222.
+                       _, err := conn2.UserGetCurrent(userctx1, arvados.GetOptions{})
+                       c.Check(err, check.IsNil, check.Commentf("testing phase failed"))
+               }()
+       }
+       wg1.Done()
+       wg2.Wait()
+}
+
 func (s *IntegrationSuite) TestS3WithFederatedToken(c *check.C) {
        if _, err := exec.LookPath("s3cmd"); err != nil {
                c.Skip("s3cmd not in PATH")
@@ -502,7 +546,7 @@ func (s *IntegrationSuite) TestRequestIDHeader(c *check.C) {
 }
 
 // We test the direct access to the database
-// normally an integration test would not have a database access, but  in this case we need
+// normally an integration test would not have a database access, but in this case we need
 // to test tokens that are secret, so there is no API response that will give them back
 func (s *IntegrationSuite) dbConn(c *check.C, clusterID string) (*sql.DB, *sql.Conn) {
        ctx := context.Background()
index 52f2cee064905fd6a81e4e9e60a774dfc80bab55..7c7ed759c60058b5915ad1d56505dba6b56d84dd 100644 (file)
@@ -319,7 +319,17 @@ class ApiClientAuthorization < ArvadosModel
         user.last_name = "from cluster #{remote_user_prefix}"
       end
 
-      user.save!
+      begin
+        user.save!
+      rescue ActiveRecord::RecordInvalid, ActiveRecord::RecordNotUnique
+        Rails.logger.debug("remote user #{remote_user['uuid']} already exists, retrying...")
+        # Some other request won the race: retry fetching the user record.
+        user = User.find_by_uuid(remote_user['uuid'])
+        if !user
+          Rails.logger.warn("cannot find or create remote user #{remote_user['uuid']}")
+          return nil
+        end
+      end
 
       if user.is_invited && !remote_user['is_invited']
         # Remote user is not "invited" state, they should be unsetup, which
@@ -364,12 +374,24 @@ class ApiClientAuthorization < ArvadosModel
       exp = [db_current_time + Rails.configuration.Login.RemoteTokenRefresh,
              remote_token.andand['expires_at']].compact.min
       scopes = remote_token.andand['scopes'] || ['all']
-      auth = ApiClientAuthorization.find_or_create_by(uuid: token_uuid) do |auth|
-        auth.user = user
-        auth.api_token = stored_secret
-        auth.api_client_id = 0
-        auth.scopes = scopes
-        auth.expires_at = exp
+      begin
+        retries ||= 0
+        auth = ApiClientAuthorization.find_or_create_by(uuid: token_uuid) do |auth|
+          auth.user = user
+          auth.api_token = stored_secret
+          auth.api_client_id = 0
+          auth.scopes = scopes
+          auth.expires_at = exp
+        end
+      rescue ActiveRecord::RecordNotUnique
+        Rails.logger.debug("cached remote token #{token_uuid} already exists, retrying...")
+        # Some other request won the race: retry just once before erroring out
+        if (retries += 1) <= 1
+          retry
+        else
+          Rails.logger.warn("cannot find or create cached remote token #{token_uuid}")
+          return nil
+        end
       end
       auth.update_attributes!(user: user,
                               api_token: stored_secret,