Merge pull request #105 from gvwilson/analytics

[rnaseq-cwl-training.git] / bin / lesson_check.py
diff --git a/bin/lesson_check.py b/bin/lesson_check.py

index f858ebd14a513b13d800e8d3c6a851940028591f..311687e92ac851ba3c5c5af2720ce45d03490902 100755 (executable)
--- a/bin/lesson_check.py
+++ b/bin/lesson_check.py
@@ -4,15 +4,15 @@
  Check lesson files and their contents.
  """
  
+from __future__ import print_function
  import sys
  import os
  import glob
  import json
-import yaml
  import re
  from optparse import OptionParser
  
-from util import Reporter, read_markdown
+from util import Reporter, read_markdown, load_yaml, check_unwanted_files, require, IMAGE_FILE_SUFFIX
  
  __version__ = '0.2'
  
@@ -38,15 +38,18 @@ REQUIRED_FILES = {
      '%/setup.md': True,
  }
  
-# Required non-Markdown files.
-NON_MARKDOWN_FILES = {
-    "AUTHORS",
-    "CITATION"
-}
-
  # Episode filename pattern.
  P_EPISODE_FILENAME = re.compile(r'/_episodes/(\d\d)-[-\w]+.md$')
  
+# Pattern to match lines ending with whitespace.
+P_TRAILING_WHITESPACE = re.compile(r'\s+$')
+
+# Pattern to match figure references in HTML.
+P_FIGURE_REFS = re.compile(r'<img[^>]+src="([^"]+)"[^>]*>')
+
+# Pattern to match internally-defined Markdown links.
+P_INTERNALLY_DEFINED_LINK = re.compile(r'\[[^\]]+\]\[[^\]]+\]')
+
  # What kinds of blockquotes are allowed?
  KNOWN_BLOCKQUOTES = {
      'callout',
@@ -68,6 +71,7 @@ KNOWN_CODEBLOCKS = {
      'source',
      'bash',
      'make',
+    'matlab',
      'python',
      'r',
      'sql'
@@ -97,14 +101,15 @@ def main():
      """Main driver."""
  
      args = parse_args()
-    args.reporter = Reporter(args)
-    check_config(args)
-    check_non_markdown_files(args.source_dir, args.reporter)
-    docs = read_all_markdown(args, args.source_dir)
+    args.reporter = Reporter()
+    check_config(args.reporter, args.source_dir)
+    docs = read_all_markdown(args.source_dir, args.parser)
      check_fileset(args.source_dir, args.reporter, docs.keys())
+    check_unwanted_files(args.source_dir, args.reporter)
      for filename in docs.keys():
          checker = create_checker(args, filename, docs[filename])
          checker.check()
+    check_figures(args.source_dir, args.reporter)
      args.reporter.report()
  
  
@@ -114,7 +119,8 @@ def parse_args():
      parser = OptionParser()
      parser.add_option('-l', '--linelen',
                        default=False,
-                      dest='line_len',
+                      action="store_true",
+                      dest='line_lengths',
                        help='Check line lengths')
      parser.add_option('-p', '--parser',
                        default=None,
@@ -124,6 +130,11 @@ def parse_args():
                        default=os.curdir,
                        dest='source_dir',
                        help='source directory')
+    parser.add_option('-w', '--whitespace',
+                      default=False,
+                      action="store_true",
+                      dest='trailing_whitespace',
+                      help='Check for trailing whitespace')
  
      args, extras = parser.parse_args()
      require(args.parser is not None,
@@ -134,27 +145,22 @@ def parse_args():
      return args
  
  
-def check_config(args):
+def check_config(reporter, source_dir):
      """Check configuration file."""
  
-    config_file = os.path.join(args.source_dir, '_config.yml')
-    with open(config_file, 'r') as reader:
-        config = yaml.load(reader)
-
-    args.reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson')
-
+    config_file = os.path.join(source_dir, '_config.yml')
+    config = load_yaml(config_file)
+    reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson')
+    reporter.check_field(config_file, 'configuration', config, 'carpentry', ('swc', 'dc'))
+    reporter.check_field(config_file, 'configuration', config, 'title')
+    reporter.check_field(config_file, 'configuration', config, 'email')
  
-def check_non_markdown_files(source_dir, reporter):
-    """Check presence of non-Markdown files."""
+    reporter.check({'values': {'root': '..'}} in config.get('defaults', []),
+                   'configuration',
+                   '"root" not set to ".." in configuration')
  
-    for filename in NON_MARKDOWN_FILES:
-        path = os.path.join(source_dir, filename)
-        reporter.check(os.path.exists(path),
-                       filename,
-                       "File not found")
  
-
-def read_all_markdown(args, source_dir):
+def read_all_markdown(source_dir, parser):
      """Read source files, returning
      {path : {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}}
      """
@@ -164,7 +170,7 @@ def read_all_markdown(args, source_dir):
      result = {}
      for pat in all_patterns:
          for filename in glob.glob(pat):
-            data = read_markdown(args.parser, filename)
+            data = read_markdown(parser, filename)
              if data:
                  result[filename] = data
      return result
@@ -192,9 +198,9 @@ def check_fileset(source_dir, reporter, filenames_present):
  
      # Check for duplicate episode numbers.
      reporter.check(len(seen) == len(set(seen)),
-                   None,
-                   'Duplicate episode numbers {0} vs {1}',
-                   sorted(seen), sorted(set(seen)))
+                        None,
+                        'Duplicate episode numbers {0} vs {1}',
+                        sorted(seen), sorted(set(seen)))
  
      # Check that numbers are consecutive.
      seen = [int(s) for s in seen]
@@ -208,6 +214,38 @@ def check_fileset(source_dir, reporter, filenames_present):
                     seen)
  
  
+def check_figures(source_dir, reporter):
+    """Check that image files in fig/ and <img> references in all_figures.html agree."""
+
+    # Get references: basenames of <img> sources whose path contains '/fig/'.
+    try:
+        all_figures_html = os.path.join(source_dir, '_includes', 'all_figures.html')
+        with open(all_figures_html, 'r') as reader:
+            text = reader.read()
+        figures = P_FIGURE_REFS.findall(text)
+        referenced = [os.path.split(f)[1] for f in figures if '/fig/' in f]
+    except FileNotFoundError as e:  # NOTE(review): name is undefined on Python 2 - confirm Python 3 only
+        reporter.add(all_figures_html,
+                     'File not found')
+        return
+
+    # Get actual image files (ignore non-image files); os.listdir raises if fig/ is absent.
+    fig_dir_path = os.path.join(source_dir, 'fig')
+    actual = [f for f in os.listdir(fig_dir_path) if os.path.splitext(f)[1] in IMAGE_FILE_SUFFIX]
+
+    # Report differences in both directions: files never referenced, and references without a file.
+    unexpected = set(actual) - set(referenced)
+    reporter.check(not unexpected,
+                   None,
+                   'Unexpected image files: {0}',
+                   ', '.join(sorted(unexpected)))
+    missing = set(referenced) - set(actual)
+    reporter.check(not missing,
+                   None,
+                   'Missing image files: {0}',
+                   ', '.join(sorted(missing)))
+
+
  def create_checker(args, filename, info):
      """Create appropriate checker for file."""
  
@@ -216,14 +254,6 @@ def create_checker(args, filename, info):
              return cls(args, filename, **info)
  
  
-def require(condition, message):
-    """Fail if condition not met."""
-
-    if not condition:
-        print(message, file=sys.stderr)
-        sys.exit(1)
-
-
  class CheckBase(object):
      """Base class for checking Markdown files."""
  
@@ -239,6 +269,7 @@ class CheckBase(object):
          self.text = text
          self.lines = lines
          self.doc = doc
+
          self.layout = None
  
  
@@ -246,9 +277,11 @@ class CheckBase(object):
          """Run tests on metadata."""
  
          self.check_metadata()
-        self.check_text()
+        self.check_line_lengths()
+        self.check_trailing_whitespace()
          self.check_blockquote_classes()
          self.check_codeblock_classes()
+        self.check_defined_link_references()
  
  
      def check_metadata(self):
@@ -262,10 +295,10 @@ class CheckBase(object):
              self.reporter.check_field(self.filename, 'metadata', self.metadata, 'layout', self.layout)
  
  
-    def check_text(self):
+    def check_line_lengths(self):
          """Check the raw text of the lesson body."""
  
-        if self.args.line_len:
+        if self.args.line_lengths:
              over = [i for (i, l, n) in self.lines if (n > MAX_LINE_LEN) and (not l.startswith('!'))]
              self.reporter.check(not over,
                                  self.filename,
@@ -273,6 +306,17 @@ class CheckBase(object):
                                  ', '.join([str(i) for i in over]))
  
  
+    def check_trailing_whitespace(self):
+        """Report lines that end in whitespace (only when the -w/--whitespace option is set)."""
+        # Use search(), not match(): match() anchors at the start, so r'\s+$' only hit all-blank lines.
+        if self.args.trailing_whitespace:  # assumes stored lines carry no trailing newline - TODO confirm
+            trailing = [i for (i, l, n) in self.lines if P_TRAILING_WHITESPACE.search(l)]
+            self.reporter.check(not trailing,
+                                self.filename,
+                                'Line(s) end with whitespace: {0}',
+                                ', '.join([str(i) for i in trailing]))
+
+
      def check_blockquote_classes(self):
          """Check that all blockquotes have known classes."""
  
@@ -295,6 +339,26 @@ class CheckBase(object):
                                  cls)
  
  
+    def check_defined_link_references(self):
+        """Check that defined links resolve in the file.
+
+        Internally-defined links match the pattern [text][label].  Any
+        match found in a text node is reported unless it contains '{{',
+        in which case it is hopefully a reference to a configuration
+        value - we should check that, but don't right now.
+        """
+
+        result = set()
+        for node in self.find_all(self.doc, {'type' : 'text'}):
+            for match in P_INTERNALLY_DEFINED_LINK.findall(node['value']):
+                if '{{' not in match:
+                    result.add(match)
+        self.reporter.check(not result,
+                            self.filename,
+                            'Internally-defined links may be missing definitions: {0}',
+                            ', '.join(sorted(result)))
+
+
      def find_all(self, node, pattern, accum=None):
          """Find all matches for a pattern."""
  
@@ -364,6 +428,12 @@ class CheckIndex(CheckBase):
          super(CheckIndex, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
          self.layout = 'lesson'
  
+    def check_metadata(self):  # base-class checks, plus: index metadata 'root' must be '.'
+        super(CheckIndex, self).check_metadata()
+        self.reporter.check(self.metadata.get('root', '') == '.',
+                            self.filename,
+                            'Root not set to "."')
+
  
  class CheckEpisode(CheckBase):
      """Check an episode page."""
@@ -371,7 +441,6 @@ class CheckEpisode(CheckBase):
      def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
          super(CheckEpisode, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
  
-
      def check_metadata(self):
          super(CheckEpisode, self).check_metadata()
          if self.metadata: