Merge branch '14870-retry-logs' refs #14870
author     Peter Amstutz <pamstutz@veritasgenetics.com>
           Thu, 7 Mar 2019 17:58:04 +0000 (12:58 -0500)
committer  Peter Amstutz <pamstutz@veritasgenetics.com>
           Thu, 7 Mar 2019 17:58:04 +0000 (12:58 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <pamstutz@veritasgenetics.com>

build/run-build-test-packages-one-target.sh
doc/admin/upgrading.html.textile.liquid
lib/cloud/azure/azure.go
lib/dispatchcloud/worker/pool.go
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/executor.py
sdk/cwl/tests/test_container.py
sdk/python/arvados/commands/keepdocker.py
services/fuse/arvados_fuse/fresh.py

index 7bdaacac52268a49b6fe929f0f467da061682662..b98a4c0a7e0dec9d3c8110712881d05b92f5ec03 100755 (executable)
@@ -14,6 +14,8 @@ Syntax:
 --upload
     If the build and test steps are successful, upload the packages
     to a remote apt repository (default: false)
+--rc
+    Optional parameter to build a release candidate and upload it to the
+    testing repository (default: false)
 --build-version <version>
     Version to build (default:
     \$ARVADOS_BUILDING_VERSION-\$ARVADOS_BUILDING_ITERATION or
@@ -40,7 +42,7 @@ if ! [[ -d "$WORKSPACE" ]]; then
 fi
 
 PARSEDOPTS=$(getopt --name "$0" --longoptions \
-    help,upload,target:,build-version: \
+    help,upload,rc,target:,build-version: \
     -- "" "$@")
 if [ $? -ne 0 ]; then
     exit 1
@@ -48,6 +50,7 @@ fi
 
 TARGET=debian8
 UPLOAD=0
+RC=0
 
 declare -a build_args=()
 
@@ -65,6 +68,9 @@ while [ $# -gt 0 ]; do
         --upload)
             UPLOAD=1
             ;;
+        --rc)
+            RC=1
+            ;;
         --build-version)
             build_args+=("$1" "$2")
             shift
@@ -115,8 +121,13 @@ if [[ "$UPLOAD" != 0 ]]; then
   timer_reset
 
   if [ ${#failures[@]} -eq 0 ]; then
-    echo "/usr/local/arvados-dev/jenkins/run_upload_packages.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET"
-    /usr/local/arvados-dev/jenkins/run_upload_packages.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET
+    if [[ "$RC" != 0 ]]; then
+      echo "/usr/local/arvados-dev/jenkins/run_upload_packages_testing.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET"
+      /usr/local/arvados-dev/jenkins/run_upload_packages_testing.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET
+    else
+      echo "/usr/local/arvados-dev/jenkins/run_upload_packages.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET"
+      /usr/local/arvados-dev/jenkins/run_upload_packages.py -H jenkinsapt@apt.arvados.org -o Port=2222 --workspace $WORKSPACE $TARGET
+    fi
   else
     echo "Skipping package upload, there were errors building and/or testing the packages"
   fi
@@ -124,4 +135,4 @@ if [[ "$UPLOAD" != 0 ]]; then
   title "End of upload packages (`timer`)"
 fi
 
-exit_cleanly
+exit_cleanly
\ No newline at end of file
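
With this change, passing --rc together with --upload routes the built packages to run_upload_packages_testing.py (the testing repository) instead of the production upload script. A hypothetical invocation (the target name is illustrative):

    ./run-build-test-packages-one-target.sh --target debian9 --upload --rc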
index eda25d5cc6678524ea1cd44b0500201f49e56e65..6e2e6cba6dfeb1873d8a58049e1cdb65ac801bdd 100644 (file)
@@ -32,6 +32,63 @@ TODO: extract this information based on git commit messages and generate changel
 
 h3. current master branch
 
+h4. Stricter collection manifest validation on the API server
+
+As a consequence of "#14482":https://dev.arvados.org/issues/14482, the Ruby SDK now performs more rigorous collection manifest validation. Collections created after May 2015 are unlikely to be invalid; however, you can check for invalid manifests using the script below.
+
+You could set up a new rvm gemset and install the specific arvados gem for testing, like so:
+
+<notextile>
+<pre><code>~$ <span class="userinput">rvm gemset create rubysdk-test</span>
+~$ <span class="userinput">rvm gemset use rubysdk-test</span>
+~$ <span class="userinput">gem install arvados -v 1.3.1.20190301212059</span>
+</code></pre>
+</notextile>
+
+Next, you can run the following script using admin credentials; it will scan the whole collection database and report any collection that doesn't pass the check:
+
+{% codeblock as ruby %}
+require 'arvados'
+require 'arvados/keep'
+
+api = Arvados.new
+offset = 0
+batch_size = 100
+invalid = []
+
+while true
+    begin
+        req = api.collection.index(
+            :select => [:uuid, :created_at, :manifest_text],
+            :include_trash => true, :include_old_versions => true,
+            :limit => batch_size, :offset => offset)
+    rescue
+        invalid.each {|c| puts "#{c[:uuid]} (Created at #{c[:created_at]}): #{c[:error]}" }
+        raise
+    end
+
+    req[:items].each do |col|
+        begin
+            Keep::Manifest.validate! col[:manifest_text]
+        rescue Exception => e
+            puts "Collection #{col[:uuid]} manifest not valid"
+            invalid << {uuid: col[:uuid], error: e, created_at: col[:created_at]}
+        end
+    end
+    puts "Checked #{offset} / #{req[:items_available]} - Invalid: #{invalid.size}"
+    offset += req[:limit]
+    break if offset > req[:items_available]
+end
+
+if invalid.empty?
+    puts "No invalid collection manifests found"
+else
+    invalid.each {|c| puts "#{c[:uuid]} (Created at #{c[:created_at]}): #{c[:error]}" }
+end
+{% endcodeblock %}
+
+The script will print a final report listing each invalid collection by UUID, with its creation date and error message, so you can take corrective measures if needed.
+
 h4. Python packaging change
 
 As part of story "#9945":https://dev.arvados.org/issues/9945, the distribution packaging (deb/rpm) of our Python packages has changed. These packages now include a built-in virtualenv to reduce dependencies on system packages. We have also stopped packaging and publishing backports for all the Python dependencies of our packages, as they are no longer needed.
index b88962714e709765f1c93e0a6a88dbcf860aabaa..d19e4bef2372ff87dfb13e26db1ffddb9726bdca 100644 (file)
@@ -220,12 +220,14 @@ func newAzureInstanceSet(config json.RawMessage, dispatcherID cloud.InstanceSetI
                return nil, err
        }
 
-       ap := azureInstanceSet{logger: logger}
-       err = ap.setup(azcfg, string(dispatcherID))
+       az := azureInstanceSet{logger: logger}
+       az.ctx, az.stopFunc = context.WithCancel(context.Background())
+       err = az.setup(azcfg, string(dispatcherID))
        if err != nil {
+               az.stopFunc()
                return nil, err
        }
-       return &ap, nil
+       return &az, nil
 }
 
 func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID string) (err error) {
@@ -276,7 +278,6 @@ func (az *azureInstanceSet) setup(azcfg azureInstanceSetConfig, dispatcherID str
        az.dispatcherID = dispatcherID
        az.namePrefix = fmt.Sprintf("compute-%s-", az.dispatcherID)
 
-       az.ctx, az.stopFunc = context.WithCancel(context.Background())
        go func() {
                az.stopWg.Add(1)
                defer az.stopWg.Done()
index 14f6a3efced3815f11b19b6e08612ead4326e4f6..fe1c6ecc0304f64345135f6016d29a8b1512fea1 100644 (file)
@@ -565,6 +565,7 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
 func (wp *Pool) runMetrics() {
        ch := wp.Subscribe()
        defer wp.Unsubscribe(ch)
+       wp.updateMetrics()
        for range ch {
                wp.updateMetrics()
        }
index 9f93f0a6a6895780b2aca6e5ed235aca64d6aac9..03b4e07c76f5849a97ae85b9bd179e897ec8fc33 100644 (file)
@@ -342,7 +342,7 @@ class ArvadosContainer(JobBase):
             if record["output_uuid"]:
                 if self.arvrunner.trash_intermediate or self.arvrunner.intermediate_output_ttl:
                     # Compute the trash time to avoid requesting the collection record.
-                    trash_at = ciso8601.parse_datetime(record["modified_at"]) + datetime.timedelta(0, self.arvrunner.intermediate_output_ttl)
+                    trash_at = ciso8601.parse_datetime_as_naive(record["modified_at"]) + datetime.timedelta(0, self.arvrunner.intermediate_output_ttl)
                     aftertime = " at %s" % trash_at.strftime("%Y-%m-%d %H:%M:%S UTC") if self.arvrunner.intermediate_output_ttl else ""
                     orpart = ", or" if self.arvrunner.trash_intermediate and self.arvrunner.intermediate_output_ttl else ""
                     oncomplete = " upon successful completion of the workflow" if self.arvrunner.trash_intermediate else ""
index c1f2b54744083a22a52b659797a720da478304b1..319e8a887114b88b55865ca673dbafb3e0b9a7dc 100644 (file)
@@ -59,6 +59,7 @@ class RuntimeStatusLoggingHandler(logging.Handler):
     def __init__(self, runtime_status_update_func):
         super(RuntimeStatusLoggingHandler, self).__init__()
         self.runtime_status_update = runtime_status_update_func
+        self.updatingRuntimeStatus = False
 
     def emit(self, record):
         kind = None
@@ -66,22 +67,27 @@ class RuntimeStatusLoggingHandler(logging.Handler):
             kind = 'error'
         elif record.levelno >= logging.WARNING:
             kind = 'warning'
-        if kind is not None:
-            log_msg = record.getMessage()
-            if '\n' in log_msg:
-                # If the logged message is multi-line, use its first line as status
-                # and the rest as detail.
-                status, detail = log_msg.split('\n', 1)
-                self.runtime_status_update(
-                    kind,
-                    "%s: %s" % (record.name, status),
-                    detail
-                )
-            else:
-                self.runtime_status_update(
-                    kind,
-                    "%s: %s" % (record.name, record.getMessage())
-                )
+        if kind is not None and not self.updatingRuntimeStatus:
+            self.updatingRuntimeStatus = True
+            try:
+                log_msg = record.getMessage()
+                if '\n' in log_msg:
+                    # If the logged message is multi-line, use its first line as status
+                    # and the rest as detail.
+                    status, detail = log_msg.split('\n', 1)
+                    self.runtime_status_update(
+                        kind,
+                        "%s: %s" % (record.name, status),
+                        detail
+                    )
+                else:
+                    self.runtime_status_update(
+                        kind,
+                        "%s: %s" % (record.name, record.getMessage())
+                    )
+            finally:
+                self.updatingRuntimeStatus = False
+
 
 class ArvCwlExecutor(object):
     """Execute a CWL tool or workflow, submit work (using either jobs or
index de21fc0b92d1c7437978be0a5018c3ea51fd76c3..1a57da3927a352e614f5a65ebb46887864ece07b 100644 (file)
@@ -80,6 +80,23 @@ class TestContainer(unittest.TestCase):
 
         return loadingContext, runtimeContext
 
+    # Helper function to set up the ArvCwlExecutor to use the containers API
+    # and check that the RuntimeStatusLoggingHandler is set up correctly.
+    def setup_and_test_container_executor_and_logging(self, gcc_mock):
+        api = mock.MagicMock()
+        api._rootDesc = copy.deepcopy(get_rootDesc())
+        del api._rootDesc.get('resources')['jobs']['methods']['create']
+
+        # Make sure ArvCwlExecutor thinks it's running inside a container so it
+        # adds the logging handler that will call runtime_status_update() mock
+        self.assertFalse(gcc_mock.called)
+        runner = arvados_cwl.ArvCwlExecutor(api)
+        self.assertEqual(runner.work_api, 'containers')
+        root_logger = logging.getLogger('')
+        handlerClasses = [h.__class__ for h in root_logger.handlers]
+        self.assertTrue(arvados_cwl.RuntimeStatusLoggingHandler in handlerClasses)
+        return runner
+
     # The test passes no builder.resources
     # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
     @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
@@ -500,28 +517,30 @@ class TestContainer(unittest.TestCase):
         arvjob.output_callback.assert_called_with({"out": "stuff"}, "success")
         runner.add_intermediate_output.assert_called_with("zzzzz-4zz18-zzzzzzzzzzzzzz2")
 
+    # Test to make sure we don't call runtime_status_update if we already did
+    # somewhere higher up in the call stack
     @mock.patch("arvados_cwl.util.get_current_container")
-    @mock.patch("arvados.collection.CollectionReader")
-    @mock.patch("arvados.collection.Collection")
-    def test_child_failure(self, col, reader, gcc_mock):
-        api = mock.MagicMock()
-        api._rootDesc = copy.deepcopy(get_rootDesc())
-        del api._rootDesc.get('resources')['jobs']['methods']['create']
+    def test_recursive_runtime_status_update(self, gcc_mock):
+        self.setup_and_test_container_executor_and_logging(gcc_mock)
+        root_logger = logging.getLogger('')
 
-        # Set up runner with mocked runtime_status_update()
-        self.assertFalse(gcc_mock.called)
-        runtime_status_update = mock.MagicMock()
-        arvados_cwl.ArvCwlExecutor.runtime_status_update = runtime_status_update
-        runner = arvados_cwl.ArvCwlExecutor(api)
-        self.assertEqual(runner.work_api, 'containers')
+        # get_current_container is invoked when we call runtime_status_update,
+        # so try to log again!
+        gcc_mock.side_effect = lambda *args: root_logger.error("Second Error")
+        try:
+            root_logger.error("First Error")
+        except RuntimeError:
+            self.fail("RuntimeStatusLoggingHandler should not be called recursively")
 
-        # Make sure ArvCwlExecutor thinks it's running inside a container so it
-        # adds the logging handler that will call runtime_status_update() mock
+    @mock.patch("arvados_cwl.ArvCwlExecutor.runtime_status_update")
+    @mock.patch("arvados_cwl.util.get_current_container")
+    @mock.patch("arvados.collection.CollectionReader")
+    @mock.patch("arvados.collection.Collection")
+    def test_child_failure(self, col, reader, gcc_mock, rts_mock):
+        runner = self.setup_and_test_container_executor_and_logging(gcc_mock)
+
         gcc_mock.return_value = {"uuid" : "zzzzz-dz642-zzzzzzzzzzzzzzz"}
         self.assertTrue(gcc_mock.called)
-        root_logger = logging.getLogger('')
-        handlerClasses = [h.__class__ for h in root_logger.handlers]
-        self.assertTrue(arvados_cwl.RuntimeStatusLoggingHandler in handlerClasses)
 
         runner.num_retries = 0
         runner.ignore_docker_for_reuse = False
@@ -565,7 +584,7 @@ class TestContainer(unittest.TestCase):
             "modified_at": "2017-05-26T12:01:22Z"
         })
 
-        runtime_status_update.assert_called_with(
+        rts_mock.assert_called_with(
             'error',
             'arvados.cwl-runner: [container testjob] (zzzzz-xvhdp-zzzzzzzzzzzzzzz) error log:',
             '  ** log is empty **'
index e596e669156f9014ce5b9ad66d05fc2ba561bb3a..ec2a9942a6794153ea69138ba467a20f8b1ae6a6 100644 (file)
@@ -230,12 +230,12 @@ def docker_link_sort_key(link):
     Docker metadata links to sort them from least to most preferred.
     """
     try:
-        image_timestamp = ciso8601.parse_datetime(
+        image_timestamp = ciso8601.parse_datetime_as_naive(
             link['properties']['image_timestamp'])
     except (KeyError, ValueError):
         image_timestamp = EARLIEST_DATETIME
     try:
-        created_timestamp = ciso8601.parse_datetime(link['created_at'])
+        created_timestamp = ciso8601.parse_datetime_as_naive(link['created_at'])
     except ValueError:
         created_timestamp = None
     return (image_timestamp, created_timestamp)
index acebe2b1b7eaaffdc642a93849be9d3ab03b2e02..2e7a2a8182b62e2ffe8aa474a75c1f35b2723a43 100644 (file)
@@ -12,7 +12,7 @@ def convertTime(t):
     if not t:
         return 0
     try:
-        return calendar.timegm(ciso8601.parse_datetime(t).timetuple())
+        return calendar.timegm(ciso8601.parse_datetime_as_naive(t).timetuple())
     except (TypeError, ValueError):
         return 0
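
Several hunks in this merge (arvcontainer.py, keepdocker.py, and fresh.py above) make the same substitution: ciso8601.parse_datetime() returns a timezone-aware datetime when the input string carries an offset (such as a trailing Z), and aware values cannot be compared with or subtracted from naive ones. parse_datetime_as_naive() discards the offset so the surrounding naive-datetime arithmetic keeps working. A quick sketch of the difference (assuming ciso8601 2.x, where parse_datetime_as_naive is available):

    import datetime
    import ciso8601

    aware = ciso8601.parse_datetime("2017-05-26T12:01:22Z")
    naive = ciso8601.parse_datetime_as_naive("2017-05-26T12:01:22Z")

    print(aware.tzinfo)  # a UTC tzinfo -- offset information is kept
    print(naive.tzinfo)  # None -- offset information is discarded

    naive + datetime.timedelta(0, 3600)  # fine: purely naive arithmetic
    # aware - datetime.datetime.utcnow() would raise TypeError:
    # can't subtract offset-naive and offset-aware datetimes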