bin/lesson_check.py

   1 #!/usr/bin/env python
   2
   3 """
   4 Check lesson files and their contents.
   5 """
   6
   7 from __future__ import print_function
   8 import sys
   9 import os
  10 import glob
  11 import json
  12 import re
  13 from optparse import OptionParser
  14
  15 from util import Reporter, read_markdown, load_yaml, check_unwanted_files, require, IMAGE_FILE_SUFFIX
  16
  17 __version__ = '0.3'
  18
  19 # Where to look for source Markdown files.
  20 SOURCE_DIRS = ['', '_episodes', '_extras']
  21
  22 # Required files: each entry is ('path': YAML_required).
  23 # FIXME: We do not yet validate whether any files have the required
  24 #   YAML headers, but should in the future.
  25 # The '%' is replaced with the source directory path for checking.
  26 # Episodes are handled specially, and extra files in '_extras' are also handled specially.
  27 # This list must include all the Markdown files listed in the 'bin/initialize' script.
  28 REQUIRED_FILES = {
  29     '%/CONDUCT.md': True,
  30     '%/CONTRIBUTING.md': False,
  31     '%/LICENSE.md': True,
  32     '%/README.md': False,
  33     '%/_extras/discuss.md': True,
  34     '%/_extras/figures.md': True,
  35     '%/_extras/guide.md': True,
  36     '%/index.md': True,
  37     '%/reference.md': True,
  38     '%/setup.md': True,
  39 }
  40
  41 # Episode filename pattern.
  42 P_EPISODE_FILENAME = re.compile(r'/_episodes/(\d\d)-[-\w]+.md$')
  43
  44 # Pattern to match lines ending with whitespace.
  45 P_TRAILING_WHITESPACE = re.compile(r'\s+$')
  46
  47 # Pattern to match figure references in HTML.
  48 P_FIGURE_REFS = re.compile(r'<img[^>]+src="([^"]+)"[^>]*>')
  49
  50 # Pattern to match internally-defined Markdown links.
  51 P_INTERNAL_LINK_REF = re.compile(r'\[([^\]]+)\]\[([^\]]+)\]')
  52
  53 # Pattern to match reference links (to resolve internally-defined references).
  54 P_INTERNAL_LINK_DEF = re.compile(r'^\[([^\]]+)\]:\s*(.+)')
  55
  56 # What kinds of blockquotes are allowed?
  57 KNOWN_BLOCKQUOTES = {
  58     'callout',
  59     'challenge',
  60     'checklist',
  61     'discussion',
  62     'keypoints',
  63     'objectives',
  64     'prereq',
  65     'quotation',
  66     'solution',
  67     'testimonial'
  68 }
  69
  70 # What kinds of code fragments are allowed?
  71 KNOWN_CODEBLOCKS = {
  72     'error',
  73     'output',
  74     'source',
  75     'bash',
  76     'html',
  77     'make',
  78     'matlab',
  79     'python',
  80     'r',
  81     'shell',
  82     'sql'
  83 }
  84
  85 # What fields are required in teaching episode metadata?
  86 TEACHING_METADATA_FIELDS = {
  87     ('title', str),
  88     ('teaching', int),
  89     ('exercises', int),
  90     ('questions', list),
  91     ('objectives', list),
  92     ('keypoints', list)
  93 }
  94
  95 # What fields are required in break episode metadata?
  96 BREAK_METADATA_FIELDS = {
  97     ('layout', str),
  98     ('title', str),
  99     ('break', int)
 100 }
 101
 102 # How long are lines allowed to be?
 103 MAX_LINE_LEN = 100
 104
 105 def main():
 106     """Main driver."""
 107
 108     args = parse_args()
 109     args.reporter = Reporter()
 110     check_config(args.reporter, args.source_dir)
 111     args.references = read_references(args.reporter, args.reference_path)
 112
 113     docs = read_all_markdown(args.source_dir, args.parser)
 114     check_fileset(args.source_dir, args.reporter, docs.keys())
 115     check_unwanted_files(args.source_dir, args.reporter)
 116     for filename in docs.keys():
 117         checker = create_checker(args, filename, docs[filename])
 118         checker.check()
 119     check_figures(args.source_dir, args.reporter)
 120
 121     args.reporter.report()
 122
 123
 124 def parse_args():
 125     """Parse command-line arguments."""
 126
 127     parser = OptionParser()
 128     parser.add_option('-l', '--linelen',
 129                       default=False,
 130                       action="store_true",
 131                       dest='line_lengths',
 132                       help='Check line lengths')
 133     parser.add_option('-p', '--parser',
 134                       default=None,
 135                       dest='parser',
 136                       help='path to Markdown parser')
 137     parser.add_option('-r', '--references',
 138                       default=None,
 139                       dest='reference_path',
 140                       help='path to Markdown file of external references')
 141     parser.add_option('-s', '--source',
 142                       default=os.curdir,
 143                       dest='source_dir',
 144                       help='source directory')
 145     parser.add_option('-w', '--whitespace',
 146                       default=False,
 147                       action="store_true",
 148                       dest='trailing_whitespace',
 149                       help='Check for trailing whitespace')
 150
 151     args, extras = parser.parse_args()
 152     require(args.parser is not None,
 153             'Path to Markdown parser not provided')
 154     require(not extras,
 155             'Unexpected trailing command-line arguments "{0}"'.format(extras))
 156
 157     return args
 158
 159
 160 def check_config(reporter, source_dir):
 161     """Check configuration file."""
 162
 163     config_file = os.path.join(source_dir, '_config.yml')
 164     config = load_yaml(config_file)
 165     reporter.check_field(config_file, 'configuration', config, 'kind', 'lesson')
 166     reporter.check_field(config_file, 'configuration', config, 'carpentry', ('swc', 'dc', 'lc'))
 167     reporter.check_field(config_file, 'configuration', config, 'title')
 168     reporter.check_field(config_file, 'configuration', config, 'email')
 169
 170     reporter.check({'values': {'root': '..'}} in config.get('defaults', []),
 171                    'configuration',
 172                    '"root" not set to ".." in configuration')
 173
 174
 175 def read_references(reporter, ref_path):
 176     """Read shared file of reference links, returning dictionary of valid references
 177     {symbolic_name : URL}
 178     """
 179
 180     result = {}
 181     urls_seen = set()
 182     if ref_path:
 183         with open(ref_path, 'r') as reader:
 184             for (num, line) in enumerate(reader):
 185                 line_num = num + 1
 186                 m = P_INTERNAL_LINK_DEF.search(line)
 187                 require(m,
 188                         '{0}:{1} not valid reference:\n{2}'.format(ref_path, line_num, line.rstrip()))
 189                 name = m.group(1)
 190                 url = m.group(2)
 191                 require(name,
 192                         'Empty reference at {0}:{1}'.format(ref_path, line_num))
 193                 reporter.check(name not in result,
 194                                ref_path,
 195                                'Duplicate reference {0} at line {1}',
 196                                name, line_num)
 197                 reporter.check(url not in urls_seen,
 198                                ref_path,
 199                                'Duplicate definition of URL {0} at line {1}',
 200                                url, line_num)
 201                 result[name] = url
 202                 urls_seen.add(url)
 203     return result
 204
 205
 206 def read_all_markdown(source_dir, parser):
 207     """Read source files, returning
 208     {path : {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}}
 209     """
 210
 211     all_dirs = [os.path.join(source_dir, d) for d in SOURCE_DIRS]
 212     all_patterns = [os.path.join(d, '*.md') for d in all_dirs]
 213     result = {}
 214     for pat in all_patterns:
 215         for filename in glob.glob(pat):
 216             data = read_markdown(parser, filename)
 217             if data:
 218                 result[filename] = data
 219     return result
 220
 221
 222 def check_fileset(source_dir, reporter, filenames_present):
 223     """Are all required files present? Are extraneous files present?"""
 224
 225     # Check files with predictable names.
 226     required = [p.replace('%', source_dir) for p in REQUIRED_FILES]
 227     missing = set(required) - set(filenames_present)
 228     for m in missing:
 229         reporter.add(None, 'Missing required file {0}', m)
 230
 231     # Check episode files' names.
 232     seen = []
 233     for filename in filenames_present:
 234         if '_episodes' not in filename:
 235             continue
 236         m = P_EPISODE_FILENAME.search(filename)
 237         if m and m.group(1):
 238             seen.append(m.group(1))
 239         else:
 240             reporter.add(None, 'Episode {0} has badly-formatted filename', filename)
 241
 242     # Check for duplicate episode numbers.
 243     reporter.check(len(seen) == len(set(seen)),
 244                         None,
 245                         'Duplicate episode numbers {0} vs {1}',
 246                         sorted(seen), sorted(set(seen)))
 247
 248     # Check that numbers are consecutive.
 249     seen = [int(s) for s in seen]
 250     seen.sort()
 251     clean = True
 252     for i in range(len(seen) - 1):
 253         clean = clean and ((seen[i+1] - seen[i]) == 1)
 254     reporter.check(clean,
 255                    None,
 256                    'Missing or non-consecutive episode numbers {0}',
 257                    seen)
 258
 259
 260 def check_figures(source_dir, reporter):
 261     """Check that all figures are present and referenced."""
 262
 263     # Get references.
 264     try:
 265         all_figures_html = os.path.join(source_dir, '_includes', 'all_figures.html')
 266         with open(all_figures_html, 'r') as reader:
 267             text = reader.read()
 268         figures = P_FIGURE_REFS.findall(text)
 269         referenced = [os.path.split(f)[1] for f in figures if '/fig/' in f]
 270     except FileNotFoundError as e:
 271         reporter.add(all_figures_html,
 272                      'File not found')
 273         return
 274
 275     # Get actual image files (ignore non-image files).
 276     fig_dir_path = os.path.join(source_dir, 'fig')
 277     actual = [f for f in os.listdir(fig_dir_path) if os.path.splitext(f)[1] in IMAGE_FILE_SUFFIX]
 278
 279     # Report differences.
 280     unexpected = set(actual) - set(referenced)
 281     reporter.check(not unexpected,
 282                    None,
 283                    'Unexpected image files: {0}',
 284                    ', '.join(sorted(unexpected)))
 285     missing = set(referenced) - set(actual)
 286     reporter.check(not missing,
 287                    None,
 288                    'Missing image files: {0}',
 289                    ', '.join(sorted(missing)))
 290
 291
 292 def create_checker(args, filename, info):
 293     """Create appropriate checker for file."""
 294
 295     for (pat, cls) in CHECKERS:
 296         if pat.search(filename):
 297             return cls(args, filename, **info)
 298
 299
 300 class CheckBase(object):
 301     """Base class for checking Markdown files."""
 302
 303     def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
 304         """Cache arguments for checking."""
 305
 306         super(CheckBase, self).__init__()
 307         self.args = args
 308         self.reporter = self.args.reporter # for convenience
 309         self.filename = filename
 310         self.metadata = metadata
 311         self.metadata_len = metadata_len
 312         self.text = text
 313         self.lines = lines
 314         self.doc = doc
 315
 316         self.layout = None
 317
 318
 319     def check(self):
 320         """Run tests."""
 321
 322         self.check_metadata()
 323         self.check_line_lengths()
 324         self.check_trailing_whitespace()
 325         self.check_blockquote_classes()
 326         self.check_codeblock_classes()
 327         self.check_defined_link_references()
 328
 329
 330     def check_metadata(self):
 331         """Check the YAML metadata."""
 332
 333         self.reporter.check(self.metadata is not None,
 334                             self.filename,
 335                             'Missing metadata entirely')
 336
 337         if self.metadata and (self.layout is not None):
 338             self.reporter.check_field(self.filename, 'metadata', self.metadata, 'layout', self.layout)
 339
 340
 341     def check_line_lengths(self):
 342         """Check the raw text of the lesson body."""
 343
 344         if self.args.line_lengths:
 345             over = [i for (i, l, n) in self.lines if (n > MAX_LINE_LEN) and (not l.startswith('!'))]
 346             self.reporter.check(not over,
 347                                 self.filename,
 348                                 'Line(s) are too long: {0}',
 349                                 ', '.join([str(i) for i in over]))
 350
 351
 352     def check_trailing_whitespace(self):
 353         """Check for whitespace at the ends of lines."""
 354
 355         if self.args.trailing_whitespace:
 356             trailing = [i for (i, l, n) in self.lines if P_TRAILING_WHITESPACE.match(l)]
 357             self.reporter.check(not trailing,
 358                                 self.filename,
 359                                 'Line(s) end with whitespace: {0}',
 360                                 ', '.join([str(i) for i in trailing]))
 361
 362
 363     def check_blockquote_classes(self):
 364         """Check that all blockquotes have known classes."""
 365
 366         for node in self.find_all(self.doc, {'type' : 'blockquote'}):
 367             cls = self.get_val(node, 'attr', 'class')
 368             self.reporter.check(cls in KNOWN_BLOCKQUOTES,
 369                                 (self.filename, self.get_loc(node)),
 370                                 'Unknown or missing blockquote type {0}',
 371                                 cls)
 372
 373
 374     def check_codeblock_classes(self):
 375         """Check that all code blocks have known classes."""
 376
 377         for node in self.find_all(self.doc, {'type' : 'codeblock'}):
 378             cls = self.get_val(node, 'attr', 'class')
 379             self.reporter.check(cls in KNOWN_CODEBLOCKS,
 380                                 (self.filename, self.get_loc(node)),
 381                                 'Unknown or missing code block type {0}',
 382                                 cls)
 383
 384
 385     def check_defined_link_references(self):
 386         """Check that defined links resolve in the file.
 387
 388         Internally-defined links match the pattern [text][label].
 389         """
 390
 391         result = set()
 392         for node in self.find_all(self.doc, {'type' : 'text'}):
 393             for match in P_INTERNAL_LINK_REF.findall(node['value']):
 394                 text = match[0]
 395                 link = match[1]
 396                 if link not in self.args.references:
 397                     result.add('"{0}"=>"{1}"'.format(text, link))
 398         self.reporter.check(not result,
 399                             self.filename,
 400                             'Internally-defined links may be missing definitions: {0}',
 401                             ', '.join(sorted(result)))
 402
 403
 404     def find_all(self, node, pattern, accum=None):
 405         """Find all matches for a pattern."""
 406
 407         assert type(pattern) == dict, 'Patterns must be dictionaries'
 408         if accum is None:
 409             accum = []
 410         if self.match(node, pattern):
 411             accum.append(node)
 412         for child in node.get('children', []):
 413             self.find_all(child, pattern, accum)
 414         return accum
 415
 416
 417     def match(self, node, pattern):
 418         """Does this node match the given pattern?"""
 419
 420         for key in pattern:
 421             if key not in node:
 422                 return False
 423             val = pattern[key]
 424             if type(val) == str:
 425                 if node[key] != val:
 426                     return False
 427             elif type(val) == dict:
 428                 if not self.match(node[key], val):
 429                     return False
 430         return True
 431
 432
 433     def get_val(self, node, *chain):
 434         """Get value one or more levels down."""
 435
 436         curr = node
 437         for selector in chain:
 438             curr = curr.get(selector, None)
 439             if curr is None:
 440                 break
 441         return curr
 442
 443
 444     def get_loc(self, node):
 445         """Convenience method to get node's line number."""
 446
 447         result = self.get_val(node, 'options', 'location')
 448         if self.metadata_len is not None:
 449             result += self.metadata_len
 450         return result
 451
 452
 453 class CheckNonJekyll(CheckBase):
 454     """Check a file that isn't translated by Jekyll."""
 455
 456     def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
 457         super(CheckNonJekyll, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
 458
 459
 460     def check_metadata(self):
 461         self.reporter.check(self.metadata is None,
 462                             self.filename,
 463                             'Unexpected metadata')
 464
 465
 466 class CheckIndex(CheckBase):
 467     """Check the main index page."""
 468
 469     def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
 470         super(CheckIndex, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
 471         self.layout = 'lesson'
 472
 473     def check_metadata(self):
 474         super(CheckIndex, self).check_metadata()
 475         self.reporter.check(self.metadata.get('root', '') == '.',
 476                             self.filename,
 477                             'Root not set to "."')
 478
 479
 480 class CheckEpisode(CheckBase):
 481     """Check an episode page."""
 482
 483     def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
 484         super(CheckEpisode, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
 485
 486
 487     def check(self):
 488         """Run extra tests."""
 489
 490         super(CheckEpisode, self).check()
 491         self.check_reference_inclusion()
 492
 493
 494     def check_metadata(self):
 495         super(CheckEpisode, self).check_metadata()
 496         if self.metadata:
 497             if 'layout' in self.metadata:
 498                 if self.metadata['layout'] == 'break':
 499                     self.check_metadata_fields(BREAK_METADATA_FIELDS)
 500                 else:
 501                     self.reporter.add(self.filename,
 502                                       'Unknown episode layout "{0}"',
 503                                       self.metadata['layout'])
 504             else:
 505                 self.check_metadata_fields(TEACHING_METADATA_FIELDS)
 506
 507
 508     def check_metadata_fields(self, expected):
 509         for (name, type_) in expected:
 510             if name not in self.metadata:
 511                 self.reporter.add(self.filename,
 512                                   'Missing metadata field {0}',
 513                                   name)
 514             elif type(self.metadata[name]) != type_:
 515                 self.reporter.add(self.filename,
 516                                   '"{0}" has wrong type in metadata ({1} instead of {2})',
 517                                   name, type(self.metadata[name]), type_)
 518
 519
 520     def check_reference_inclusion(self):
 521         """Check that links file has been included."""
 522
 523         if not self.args.reference_path:
 524             return
 525
 526         for (i, last_line, line_len) in reversed(self.lines):
 527             if last_line:
 528                 break
 529
 530         require(last_line,
 531                 'No non-empty lines in {0}'.format(self.filename))
 532
 533         include_filename = os.path.split(self.args.reference_path)[-1]
 534         if include_filename not in last_line:
 535             self.reporter.add(self.filename,
 536                               'episode does not include "{0}"',
 537                               include_filename)
 538
 539
 540 class CheckReference(CheckBase):
 541     """Check the reference page."""
 542
 543     def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
 544         super(CheckReference, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
 545         self.layout = 'reference'
 546
 547
 548 class CheckGeneric(CheckBase):
 549     """Check a generic page."""
 550
 551     def __init__(self, args, filename, metadata, metadata_len, text, lines, doc):
 552         super(CheckGeneric, self).__init__(args, filename, metadata, metadata_len, text, lines, doc)
 553         self.layout = 'page'
 554
 555
 556 CHECKERS = [
 557     (re.compile(r'CONTRIBUTING\.md'), CheckNonJekyll),
 558     (re.compile(r'README\.md'), CheckNonJekyll),
 559     (re.compile(r'index\.md'), CheckIndex),
 560     (re.compile(r'reference\.md'), CheckReference),
 561     (re.compile(r'_episodes/.*\.md'), CheckEpisode),
 562     (re.compile(r'.*\.md'), CheckGeneric)
 563 ]
 564
 565
# Run the checks only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()