#!/usr/bin/env python

import argparse
import hashlib
import os
import re
import string
import sys
import logging

import arvados
import arvados.commands._util as arv_cmd

logger = logging.getLogger('arvados.arv-get')

def abort(msg, code=1):
    """Report a fatal error on stderr and terminate the process.

    msg is the text printed after the "arv-get:" prefix; any object with
    a string form (for example an exception instance) is accepted.
    code is the process exit status.

    Raises SystemExit; this function never returns.
    """
    sys.stderr.write("arv-get: %s\n" % (msg,))
    # sys.exit is always available, whereas the bare exit() builtin is
    # injected by the site module and is missing under "python -S" or in
    # frozen/embedded interpreters.
    sys.exit(code)

# Command-line interface.  The parser inherits the shared option set
# arv_cmd.retry_opt (args.retries is read later in this script).
parser = argparse.ArgumentParser(
    parents=[arv_cmd.retry_opt],
    description='Copy data from Keep to a local file or pipe.')

# Positional arguments: what to fetch and where to put it.
parser.add_argument(
    'locator', type=str,
    help="\nCollection locator, optionally with a file path or prefix.\n")
parser.add_argument(
    'destination', nargs='?', default='-', type=str,
    help=("\n"
          "Local file or directory where the data is to be written."
          " Default: stdout.\n"))

# Progress reporting: at most one of these may be given.
progress_group = parser.add_mutually_exclusive_group()
progress_group.add_argument(
    '--progress', action='store_true',
    help=("\n"
          "Display human-readable progress on stderr (bytes and, if possible,\n"
          "percentage of total data size). This is the default behavior when it\n"
          "is not expected to interfere with the output: specifically, stderr is\n"
          "a tty _and_ either stdout is not a tty, or output is being written to\n"
          "named files rather than stdout.\n"))
progress_group.add_argument(
    '--no-progress', action='store_true',
    help="\nDo not display human-readable progress on stderr.\n")
progress_group.add_argument(
    '--batch-progress', action='store_true',
    help=("\n"
          "Display machine-readable progress on stderr (bytes and, if known,\n"
          "total data size).\n"))

# Checksum reporting: --md5sum is shorthand that stores 'md5' in args.hash.
hash_group = parser.add_mutually_exclusive_group()
hash_group.add_argument(
    '--hash',
    help=("\n"
          "Display the hash of each file as it is read from Keep, using the given\n"
          "hash algorithm. Supported algorithms include md5, sha1, sha224,\n"
          "sha256, sha384, and sha512.\n"))
hash_group.add_argument(
    '--md5sum', dest='hash', action='store_const', const='md5',
    help="\nDisplay the MD5 hash of each file as it is read from Keep.\n")

# Stand-alone switches.
parser.add_argument(
    '-n', action='store_true',
    help=("\n"
          "Do not write any data -- just read from Keep, and report md5sums if\n"
          "requested.\n"))
parser.add_argument(
    '-r', action='store_true',
    help=('\n'
          'Retrieve all files in the specified collection/prefix. This is the\n'
          'default behavior if the "locator" argument ends with a forward slash.\n'))

# Handling of files that already exist locally: overwrite or skip, not both.
overwrite_group = parser.add_mutually_exclusive_group()
overwrite_group.add_argument(
    '-f', action='store_true',
    help=("\n"
          "Overwrite existing files while writing. The default behavior is to\n"
          "refuse to write *anything* if any of the output files already\n"
          "exist. As a special case, -f is not needed to write to stdout.\n"))
overwrite_group.add_argument(
    '--skip-existing', action='store_true',
    help=("\n"
          "Skip files that already exist. The default behavior is to refuse to\n"
          "write *anything* if any files exist that would have to be\n"
          "overwritten. This option causes even devices, sockets, and fifos to be\n"
          "skipped.\n"))

args = parser.parse_args()

# A locator that ends with a separator implies -r.  endswith() is used
# instead of indexing [-1] so that an empty locator does not raise
# IndexError.
if args.locator.endswith(os.sep):
    args.r = True
# Recursive downloads need an existing directory to write into, unless
# -n says nothing will be written at all.
if (args.r and
    not args.n and
    not (args.destination and
         os.path.isdir(args.destination))):
    parser.error('Destination is not a directory.')
# Single-file download into a directory: keep the source file's name.
# endswith() also tolerates an empty destination string, which then fails
# later with an ordinary "can't open" error rather than IndexError here.
if not args.r and (os.path.isdir(args.destination) or
                   args.destination.endswith(os.path.sep)):
    args.destination = os.path.join(args.destination,
                                    os.path.basename(args.locator))
    logger.debug("Appended source file name to destination directory: %s",
                 args.destination)

if args.destination == '/dev/stdout':
    args.destination = "-"

if args.destination == '-':
    # Normally you have to use -f to write to a file (or device) that
    # already exists, but "-" and "/dev/stdout" are common enough to
    # merit a special exception.
    args.f = True
else:
    # Drop trailing separators.  A destination made up only of
    # separators (the filesystem root) must not collapse to '', because
    # os.path.join('', name) would silently turn it into a path relative
    # to the current directory; keep a single leading separator instead.
    args.destination = (args.destination.rstrip(os.sep) or
                        args.destination[:1])

# Turn on --progress by default if stderr is a tty and output is
# either going to a named file, or going (via stdout) to something
# that isn't a tty.
if (not (args.batch_progress or args.no_progress)
    and sys.stderr.isatty()
    and (args.destination != '-'
         or not sys.stdout.isatty())):
    args.progress = True


# Split the locator at its first slash: the part before it identifies
# the collection, and the remainder (slash included) is the file path or
# directory prefix to fetch.  get_prefix is None when there is no slash.
r = re.search(r'^(.*?)(/.*)?$', args.locator)
collection = r.group(1)
get_prefix = r.group(2)
if args.r and not get_prefix:
    get_prefix = os.sep
# NOTE(review): api_client is not referenced again in this script; it is
# kept because the call may matter for its side effects -- confirm.
api_client = arvados.api('v1')
reader = arvados.CollectionReader(collection, num_retries=args.retries)

if not get_prefix:
    # No file path or prefix was given: emit the collection's manifest
    # text instead of file contents, then stop.
    if not args.n:
        open_flags = os.O_CREAT | os.O_WRONLY
        if args.f:
            # Overwriting: truncate first, otherwise a previous, longer
            # file would keep its stale tail after the new manifest.
            open_flags |= os.O_TRUNC
        else:
            # Not overwriting: fail if the destination already exists.
            open_flags |= os.O_EXCL
        try:
            if args.destination == "-":
                sys.stdout.write(reader.manifest_text())
            else:
                out_fd = os.open(args.destination, open_flags)
                with os.fdopen(out_fd, 'wb') as out_file:
                    out_file.write(reader.manifest_text())
        except (IOError, OSError) as error:
            abort("can't write to '{}': {}".format(args.destination, error))
        except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
            abort("failed to download '{}': {}".format(collection, error))
    sys.exit(0)

reader.normalize()

# Scan the collection. Make an array of (stream, file, local
# destination filename) tuples, and add up total size to extract.
todo = []
todo_bytes = 0
# get_prefix is always a non-empty string here: the script has already
# exited above when no path or prefix was requested.  A prefix ending in
# a separator selects every file beneath it; any other prefix must equal
# one file's full path.  Paths are compared against '.' + get_prefix, so
# joined stream/file names are expected to begin with '.'.
match_prefix = '.' + get_prefix
is_dir_prefix = get_prefix.endswith(os.sep)
try:
    for s in reader.all_streams():
        for f in s.all_files():
            path = os.path.join(s.name(), f.name())
            if is_dir_prefix:
                if not path.startswith(match_prefix):
                    continue
                if args.destination == "-":
                    dest_path = "-"
                else:
                    # The local path mirrors the part of the collection
                    # path that follows the requested prefix.
                    dest_path = os.path.join(args.destination,
                                             path[len(match_prefix):])
                    # Refuse to start at all if anything would be
                    # clobbered, unless -n, -f or --skip-existing says
                    # that is acceptable.
                    if (not (args.n or args.f or args.skip_existing) and
                        os.path.exists(dest_path)):
                        abort('Local file %s already exists.' % (dest_path,))
            else:
                if path != match_prefix:
                    continue
                dest_path = args.destination
            todo.append((s, f, dest_path))
            todo_bytes += f.size()
except arvados.errors.NotFoundError as e:
    abort(e)

# Read data, and (if not -n) write to local file(s) or pipe.

out_bytes = 0
for s,f,outfilename in todo:
    outfile = None
    digestor = None
    if args.hash:
        # Build the digest object before touching the filesystem, so an
        # unsupported algorithm name is reported cleanly instead of
        # surfacing as a traceback after the output file has already
        # been created or truncated.
        try:
            digestor = hashlib.new(args.hash)
        except ValueError as error:
            abort('invalid --hash value: %s' % (error,))
    if not args.n:
        if outfilename == "-":
            outfile = sys.stdout
        else:
            if args.skip_existing and os.path.exists(outfilename):
                logger.debug('Local file %s exists. Skipping.', outfilename)
                continue
            elif not args.f and (os.path.isfile(outfilename) or
                               os.path.isdir(outfilename)):
                # Good thing we looked again: apparently this file wasn't
                # here yet when we checked earlier.
                abort('Local file %s already exists.' % (outfilename,))
            if args.r:
                arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
            try:
                outfile = open(outfilename, 'wb')
            except Exception as error:
                abort('Open(%s) failed: %s' % (outfilename, error))
    try:
        for data in f.readall():
            if outfile:
                outfile.write(data)
            if digestor:
                digestor.update(data)
            out_bytes += len(data)
            if args.progress:
                # '\r' redraws the same terminal line on every chunk.
                sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                 (out_bytes >> 20,
                                  todo_bytes >> 20,
                                  (100
                                   if todo_bytes==0
                                   else 100.0*out_bytes/todo_bytes)))
            elif args.batch_progress:
                sys.stderr.write('%s %d read %d total\n' %
                                 (sys.argv[0], os.getpid(),
                                  out_bytes, todo_bytes))
        if digestor:
            sys.stderr.write("%s  %s/%s\n"
                             % (digestor.hexdigest(), s.name(), f.name()))
    except KeyboardInterrupt:
        # Remove the partially written file, but never one of the
        # standard streams (file descriptors 0-2).
        if outfile and (outfile.fileno() > 2) and not outfile.closed:
            os.unlink(outfile.name)
        break
    finally:
        # Close every file this loop opened so descriptors are not
        # leaked across a large download and buffered data is flushed
        # promptly.  stdout stays open: it was not opened here.
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

if args.progress:
    # Finish the '\r'-redrawn progress line.
    sys.stderr.write('\n')