bin/extract_figures.py

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import os
   5 import glob
   6 from optparse import OptionParser
   7
   8 from util import Reporter, read_markdown
   9
  10
  11 # Things an image file's name can end with.
  12 PATH_SUFFICES = {
  13     '.gif',
  14     '.jpg',
  15     '.png',
  16     '.svg'
  17 }
  18
  19
  20 def main():
  21     """Main driver."""
  22
  23     args = parse_args()
  24     images = []
  25     for filename in get_filenames(args.source_dir):
  26         images += get_images(args.parser, filename)
  27     save(sys.stdout, images)
  28
  29
  30 def parse_args():
  31     """Parse command-line arguments."""
  32
  33     parser = OptionParser()
  34     parser.add_option('-p', '--parser',
  35                       default=None,
  36                       dest='parser',
  37                       help='path to Markdown parser')
  38     parser.add_option('-s', '--source',
  39                       default=None,
  40                       dest='source_dir',
  41                       help='source directory')
  42
  43     args, extras = parser.parse_args()
  44     require(args.parser is not None,
  45             'Path to Markdown parser not provided')
  46     require(args.source_dir is not None,
  47             'Source directory not provided')
  48     require(not extras,
  49             'Unexpected trailing command-line arguments "{0}"'.format(extras))
  50
  51     return args
  52
  53
  54 def get_filenames(source_dir):
  55     """Get all filenames to be searched for images."""
  56
  57     return glob.glob(os.path.join(source_dir, '*.md'))
  58
  59
  60 def get_images(parser, filename):
  61     """Extract all images from file."""
  62
  63     content = read_markdown(parser, filename)
  64     result = []
  65     find_image_nodes(content['doc'], result)
  66     find_image_links(content['doc'], result)
  67     return result
  68
  69
  70 def find_image_nodes(doc, result):
  71     """Find all nested nodes representing images."""
  72
  73     if (doc["type"] == "img") or \
  74        ((doc["type"] == "html_element") and (doc["value"] == "img")):
  75         result.append({'alt': doc['attr']['alt'], 'src': doc['attr']['src']})
  76     else:
  77         for child in doc.get("children", []):
  78             find_image_nodes(child, result)
  79
  80
  81 def find_image_links(doc, result):
  82     """Find all links to files in the 'fig' directory."""
  83
  84     if (doc['type'] == 'a') and ('attr' in doc) and ('href' in doc['attr']):
  85         path = doc['attr']['href']
  86         if os.path.splitext(path)[1].lower() in PATH_SUFFICES:
  87             result.append({'alt':'', 'src': doc['attr']['href']})
  88     else:
  89         for child in doc.get('children', []):
  90             find_image_links(child, result)
  91
  92
  93 def save(stream, images):
  94     """Save results as Markdown."""
  95
  96     text = '\n<hr/>\n'.join(['<p><img alt="{0}" src="{1}" /></p>'.format(img['alt'], img['src']) for img in images])
  97     print(text, file=stream)
  98
  99
 100 def require(condition, message):
 101     """Fail if condition not met."""
 102
 103     if not condition:
 104         print(message, file=sys.stderr)
 105         sys.exit(1)
 106
 107
# Run the driver only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()