#!/usr/bin/env python
"""arv-get: copy data from Arvados Keep to a local file or pipe."""

import argparse
import hashlib
import os
import re
import sys
import logging

logger = logging.getLogger(os.path.basename(sys.argv[0]))

# ---- Command-line interface -----------------------------------------------
parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='/dev/stdout',
                    help="""
Local file or directory where the data is to be written. Default:
/dev/stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty and stdout is not a tty.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known, total
data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224, sha256,
sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")

args = parser.parse_args()

# A trailing slash on the locator implies recursive retrieval.
if args.locator[-1] == os.sep:
    args.r = True
# Recursive retrieval needs an existing destination directory -- unless -n
# was given, in which case nothing is written anywhere.
if (args.r and
    not args.n and
    not (args.destination and
         os.path.isdir(args.destination))):
    parser.error('Destination is not a directory.')
# Conversely, a single file cannot be written on top of a directory.
if not args.r and (os.path.isdir(args.destination) or
                   args.destination[-1] == os.path.sep):
    parser.error('Destination is a directory.')

# Turn on --progress by default if stderr is a tty and stdout isn't.
if (not (args.batch_progress or args.no_progress)
    and os.isatty(sys.stderr.fileno())
    and not os.isatty(sys.stdout.fileno())):
    args.progress = True

if args.destination == '-':
    args.destination = '/dev/stdout'

args.destination = args.destination.rstrip(os.sep)

# Imported after argument handling so that `--help` and usage errors work
# without a configured Arvados environment.
import arvados

# Split the locator into the collection id and an optional /path-or-prefix.
r = re.search(r'^(.*?)(/.*)?$', args.locator)
collection = r.group(1)
get_prefix = r.group(2)
if args.r and not get_prefix:
    get_prefix = os.sep

todo = []
todo_bytes = 0
if not get_prefix:
    # No path given: fetch the raw collection data in a single request and
    # write it straight to the destination.
    if not args.n:
        with open(args.destination, 'wb') as f:
            f.write(arvados.Keep.get(collection))
    sys.exit(0)

reader = arvados.CollectionReader(collection)

# Scan the collection. Make an array of (stream, file, local
# destination filename) tuples, and add up total size to extract.
try:
    for s in reader.all_streams():
        for f in s.all_files():
            if get_prefix and get_prefix[-1] == os.sep:
                # Prefix mode: take every file whose stream-qualified name
                # falls under the prefix, recreating the layout below it.
                # BUGFIX: the original used string.find(), a function form
                # that was removed in Python 3; str.startswith() expresses
                # the same "match at position 0" test.
                if not os.path.join(s.name(), f.name()).startswith(
                        '.' + get_prefix):
                    continue
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.name(), f.name())[len(get_prefix)+1:])
            else:
                # Exact-path mode: take only the single matching file.
                if os.path.join(s.name(), f.name()) != '.' + get_prefix:
                    continue
                dest_path = args.destination
            todo += [(s, f, dest_path)]
            todo_bytes += f.size()
except arvados.errors.NotFoundError as e:
    logger.error(e)
    sys.exit(1)

# Read data, and (if not -n) write to local file(s) or pipe.
# ---- Download loop --------------------------------------------------------
# Stream each selected file out of Keep, optionally writing it locally and/or
# hashing it, while reporting progress in the requested style.
out_bytes = 0
for s, f, outfilename in todo:
    outfile = None
    digestor = None
    if not args.n:
        if args.r:
            # Create the destination's parent directories as needed.
            arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
        try:
            outfile = open(outfilename, 'wb')
        except Exception as e:
            # Best effort: log and keep going; the read below still updates
            # progress and hashes even with nowhere to write.
            logger.error('Open(%s) failed: %s' % (outfilename, e))
    if args.hash:
        digestor = hashlib.new(args.hash)
    try:
        for data in f.readall():
            if outfile:
                outfile.write(data)
            if digestor:
                digestor.update(data)
            out_bytes += len(data)
            if args.progress:
                sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                 (out_bytes >> 20,
                                  todo_bytes >> 20,
                                  (100
                                   if todo_bytes == 0
                                   else 100.0 * out_bytes / todo_bytes)))
            elif args.batch_progress:
                # BUGFIX: the original format string had three conversion
                # specifiers for four arguments, so --batch-progress raised
                # TypeError; the added final %d reports the known total.
                sys.stderr.write('%s %d read %d total %d\n' %
                                 (sys.argv[0], os.getpid(),
                                  out_bytes, todo_bytes))
        if digestor:
            sys.stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.name(), f.name()))
    except KeyboardInterrupt:
        if outfile:
            # BUGFIX: close the handle before unlinking the partial file
            # (the original unlinked while the file object was still open).
            outfile.close()
            os.unlink(outfilename)
        break
    finally:
        # BUGFIX: the original leaked every outfile; flush/close each file
        # before moving on to the next one.
        if outfile and not outfile.closed:
            outfile.close()
if args.progress:
    sys.stderr.write('\n')