X-Git-Url: https://git.arvados.org/rnaseq-cwl-training.git/blobdiff_plain/2a53ed7304d813edeab9a1cbbdf226413a5d6f53..7f8e13d704e49b625b25b7b655d4c21db70c7ccf:/bin/lesson_check.py diff --git a/bin/lesson_check.py b/bin/lesson_check.py index 4a621a1..311687e 100755 --- a/bin/lesson_check.py +++ b/bin/lesson_check.py @@ -4,15 +4,15 @@ Check lesson files and their contents. """ +from __future__ import print_function import sys import os import glob import json -import yaml import re from optparse import OptionParser -from util import Reporter, read_markdown +from util import Reporter, read_markdown, load_yaml, check_unwanted_files, require, IMAGE_FILE_SUFFIX __version__ = '0.2' @@ -41,6 +41,15 @@ REQUIRED_FILES = { # Episode filename pattern. P_EPISODE_FILENAME = re.compile(r'/_episodes/(\d\d)-[-\w]+.md$') +# Pattern to match lines ending with whitespace. +P_TRAILING_WHITESPACE = re.compile(r'\s+$') + +# Pattern to match figure references in HTML. +P_FIGURE_REFS = re.compile(r']+src="([^"]+)"[^>]*>') + +# Pattern to match internally-defined Markdown links. +P_INTERNALLY_DEFINED_LINK = re.compile(r'\[[^\]]+\]\[[^\]]+\]') + # What kinds of blockquotes are allowed? KNOWN_BLOCKQUOTES = { 'callout', @@ -62,13 +71,14 @@ KNOWN_CODEBLOCKS = { 'source', 'bash', 'make', + 'matlab', 'python', 'r', 'sql' } -# What fields are required in episode metadata? -EPISODE_METADATA_FIELDS = { +# What fields are required in teaching episode metadata? +TEACHING_METADATA_FIELDS = { ('title', str), ('teaching', int), ('exercises', int), @@ -77,6 +87,13 @@ EPISODE_METADATA_FIELDS = { ('keypoints', list) } +# What fields are required in break episode metadata? +BREAK_METADATA_FIELDS = { + ('layout', str), + ('title', str), + ('break', int) +} + # How long are lines allowed to be? MAX_LINE_LEN = 100 @@ -84,13 +101,15 @@ def main(): """Main driver.""" args = parse_args() - args.reporter = Reporter(args) - check_config(args) - docs = read_all_markdown(args, args.source_dir) + args.reporter = Reporter() + check_config(args.reporter, args.source_dir) + docs = read_all_markdown(args.source_dir, args.parser) check_fileset(args.source_dir, args.reporter, docs.keys()) + check_unwanted_files(args.source_dir, args.reporter) for filename in docs.keys(): checker = create_checker(args, filename, docs[filename]) checker.check() + check_figures(args.source_dir, args.reporter) args.reporter.report() @@ -100,7 +119,8 @@ def parse_args(): parser = OptionParser() parser.add_option('-l', '--linelen', default=False, - dest='line_len', + action="store_true", + dest='line_lengths', help='Check line lengths') parser.add_option('-p', '--parser', default=None, @@ -110,6 +130,11 @@ def parse_args(): default=os.curdir, dest='source_dir', help='source directory') + parser.add_option('-w', '--whitespace', + default=False, + action="store_true", + dest='trailing_whitespace', + help='Check for trailing whitespace') args, extras = parser.parse_args() require(args.parser is not None, @@ -120,17 +145,22 @@ def parse_args(): return args -def check_config(args): +def check_config(reporter, source_dir): """Check configuration file.""" - config_file = os.path.join(args.source_dir, '_config.yml') - with open(config_file, 'r') as reader: - config = yaml.load(reader) + config_file = os.path.join(source_dir, '_config.yml') + config = load_yaml(config_file) + reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson') + reporter.check_field(config_file, 'configuration', config, 'carpentry', ('swc', 'dc')) + reporter.check_field(config_file, 'configuration', config, 'title') + reporter.check_field(config_file, 'configuration', config, 'email') - args.reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson') + reporter.check({'values': {'root': '..'}} in config.get('defaults', []), + 'configuration', + '"root" not set to ".." in configuration') -def read_all_markdown(args, source_dir): +def read_all_markdown(source_dir, parser): """Read source files, returning {path : {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}} """ @@ -140,7 +170,7 @@ def read_all_markdown(args, source_dir): result = {} for pat in all_patterns: for filename in glob.glob(pat): - data = read_markdown(args.parser, filename) + data = read_markdown(parser, filename) if data: result[filename] = data return result @@ -166,17 +196,54 @@ def check_fileset(source_dir, reporter, filenames_present): else: reporter.add(None, 'Episode {0} has badly-formatted filename', filename) - # Check episode filename numbering. + # Check for duplicate episode numbers. reporter.check(len(seen) == len(set(seen)), None, 'Duplicate episode numbers {0} vs {1}', sorted(seen), sorted(set(seen))) + + # Check that numbers are consecutive. seen = [int(s) for s in seen] seen.sort() - reporter.check(all([i+1 == n for (i, n) in enumerate(seen)]), - None, - 'Missing or non-consecutive episode numbers {0}', - seen) + clean = True + for i in range(len(seen) - 1): + clean = clean and ((seen[i+1] - seen[i]) == 1) + reporter.check(clean, + None, + 'Missing or non-consecutive episode numbers {0}', + seen) + + +def check_figures(source_dir, reporter): + """Check that all figures are present and referenced.""" + + # Get references. + try: + all_figures_html = os.path.join(source_dir, '_includes', 'all_figures.html') + with open(all_figures_html, 'r') as reader: + text = reader.read() + figures = P_FIGURE_REFS.findall(text) + referenced = [os.path.split(f)[1] for f in figures if '/fig/' in f] + except FileNotFoundError as e: + reporter.add(all_figures_html, + 'File not found') + return + + # Get actual image files (ignore non-image files). + fig_dir_path = os.path.join(source_dir, 'fig') + actual = [f for f in os.listdir(fig_dir_path) if os.path.splitext(f)[1] in IMAGE_FILE_SUFFIX] + + # Report differences. + unexpected = set(actual) - set(referenced) + reporter.check(not unexpected, + None, + 'Unexpected image files: {0}', + ', '.join(sorted(unexpected))) + missing = set(referenced) - set(actual) + reporter.check(not missing, + None, + 'Missing image files: {0}', + ', '.join(sorted(missing))) def create_checker(args, filename, info): @@ -187,14 +254,6 @@ def create_checker(args, filename, info): return cls(args, filename, **info) -def require(condition, message): - """Fail if condition not met.""" - - if not condition: - print(message, file=sys.stderr) - sys.exit(1) - - class CheckBase(object): """Base class for checking Markdown files.""" @@ -218,9 +277,11 @@ class CheckBase(object): """Run tests on metadata.""" self.check_metadata() - self.check_text() + self.check_line_lengths() + self.check_trailing_whitespace() self.check_blockquote_classes() self.check_codeblock_classes() + self.check_defined_link_references() def check_metadata(self): @@ -234,10 +295,10 @@ class CheckBase(object): self.reporter.check_field(self.filename, 'metadata', self.metadata, 'layout', self.layout) - def check_text(self): + def check_line_lengths(self): """Check the raw text of the lesson body.""" - if self.args.line_len: + if self.args.line_lengths: over = [i for (i, l, n) in self.lines if (n > MAX_LINE_LEN) and (not l.startswith('!'))] self.reporter.check(not over, self.filename, @@ -245,6 +306,17 @@ class CheckBase(object): ', '.join([str(i) for i in over])) + def check_trailing_whitespace(self): + """Check for whitespace at the ends of lines.""" + + if self.args.trailing_whitespace: + trailing = [i for (i, l, n) in self.lines if P_TRAILING_WHITESPACE.match(l)] + self.reporter.check(not trailing, + self.filename, + 'Line(s) end with whitespace: {0}', + ', '.join([str(i) for i in trailing])) + + def check_blockquote_classes(self): """Check that all blockquotes have known classes.""" @@ -267,6 +339,26 @@ class CheckBase(object): cls) + def check_defined_link_references(self): + """Check that defined links resolve in the file. + + Internally-defined links match the pattern [text][label]. If + the label contains '{{...}}', it is hopefully a references to + a configuration value - we should check that, but don't right + now. + """ + + result = set() + for node in self.find_all(self.doc, {'type' : 'text'}): + for match in P_INTERNALLY_DEFINED_LINK.findall(node['value']): + if '{{' not in match: + result.add(match) + self.reporter.check(not result, + self.filename, + 'Internally-defined links may be missing definitions: {0}', + ', '.join(sorted(result))) + + def find_all(self, node, pattern, accum=None): """Find all matches for a pattern.""" @@ -336,6 +428,12 @@ class CheckIndex(CheckBase): super(CheckIndex, self).__init__(args, filename, metadata, metadata_len, text, lines, doc) self.layout = 'lesson' + def check_metadata(self): + super(CheckIndex, self).check_metadata() + self.reporter.check(self.metadata.get('root', '') == '.', + self.filename, + 'Root not set to "."') + class CheckEpisode(CheckBase): """Check an episode page.""" @@ -346,11 +444,27 @@ class CheckEpisode(CheckBase): def check_metadata(self): super(CheckEpisode, self).check_metadata() if self.metadata: - for (name, type_) in EPISODE_METADATA_FIELDS: - self.reporter.check(type(self.metadata.get(name, None)) == type_, - self.filename, - '"{0}" missing, empty, or has wrong type in metadata', - name) + if 'layout' in self.metadata: + if self.metadata['layout'] == 'break': + self.check_metadata_fields(BREAK_METADATA_FIELDS) + else: + self.reporter.add(self.filename, + 'Unknown episode layout "{0}"', + self.metadata['layout']) + else: + self.check_metadata_fields(TEACHING_METADATA_FIELDS) + + + def check_metadata_fields(self, expected): + for (name, type_) in expected: + if name not in self.metadata: + self.reporter.add(self.filename, + 'Missing metadata field {0}', + name) + elif type(self.metadata[name]) != type_: + self.reporter.add(self.filename, + '"{0}" has wrong type in metadata ({1} instead of {2})', + name, type(self.metadata[name]), type_) class CheckReference(CheckBase):