#!/usr/bin/env python

# TODO:
# --md5sum - display md5 of each file as read from disk

import argparse
import os
import sys

# Command-line interface. Options that differ only in flag name, destination
# and help text are listed as table rows and registered in a loop; row order
# is the order in which they are added to the parser.
parser = argparse.ArgumentParser(
    description='Copy data from the local filesystem to Keep.')

parser.add_argument(
    'paths', metavar='path', type=str, nargs='*',
    help="""
Local file or directory. Default: read from standard input.
""")

parser.add_argument(
    '--max-manifest-depth', type=int, metavar='N', default=-1,
    help="""
Maximum depth of directory tree to represent in the manifest
structure. A directory structure deeper than this will be represented
as a single stream in the manifest. If N=0, the manifest will contain
a single stream. Default: -1 (unlimited), i.e., exactly one manifest
stream per filesystem directory that contains files.
""")

# Output mode flags. Every flag in this group, synonyms included, is
# mutually exclusive with all the others; synonyms store into the same
# destination as the flag they stand for.
output_modes = parser.add_mutually_exclusive_group()
for flag, dest, help_text in [
        ('--as-stream', 'stream', """
Synonym for --stream.
"""),
        ('--stream', 'stream', """
Store the file content and display the resulting manifest on
stdout. Do not write the manifest to Keep or save a Collection object
in Arvados.
"""),
        ('--as-manifest', 'manifest', """
Synonym for --manifest.
"""),
        ('--in-manifest', 'manifest', """
Synonym for --manifest.
"""),
        ('--manifest', 'manifest', """
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior.
"""),
        ('--as-raw', 'raw', """
Synonym for --raw.
"""),
        ('--raw', 'raw', """
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest.
"""),
]:
    output_modes.add_argument(flag, action='store_true', dest=dest,
                              help=help_text)

# Filename override; both spellings store into args.filename.
for flag, help_text in [
        ('--use-filename', """
Synonym for --filename.
"""),
        ('--filename', """
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
"""),
]:
    parser.add_argument(flag, type=str, default=None, dest='filename',
                        help=help_text)

# Progress reporting flags, mutually exclusive with each other.
progress_modes = parser.add_mutually_exclusive_group()
for flag, help_text in [
        ('--progress', """
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
"""),
        ('--no-progress', """
Do not display human-readable progress on stderr, even if stderr is a
tty.
"""),
        ('--batch-progress', """
Display machine-readable progress on stderr (bytes and, if known,
total data size).
"""),
]:
    progress_modes.add_argument(flag, action='store_true', help=help_text)

args = parser.parse_args()

# With no paths on the command line, read from standard input.
if not args.paths:
    args.paths += ['/dev/stdin']

# --filename only makes sense for exactly one non-directory input.
single_file_input = (len(args.paths) == 1
                     and not os.path.isdir(args.paths[0]))
if args.filename and not single_file_input:
    parser.error("""
--filename argument cannot be used when storing a directory or
multiple files.
""")

# Turn on --progress by default if stderr is a tty, unless the user asked
# for machine-readable progress or for no progress at all.
progress_overridden = args.batch_progress or args.no_progress
if not progress_overridden and os.isatty(sys.stderr.fileno()):
    args.progress = True


import arvados
import re
import string

class CollectionWriterWithProgress(arvados.CollectionWriter):
    """CollectionWriter that reports flush progress on stderr.

    Reporting is controlled by attributes the caller sets on the instance
    after construction:

    display_type -- 'machine' writes one newline-terminated line per
        flush; any other true value (this script uses 'human') rewrites a
        single status line using a leading carriage return. Unset or
        None disables reporting.
    bytes_expected -- optional total number of bytes expected. When it is
        a positive number the human display includes a percentage; the
        machine display prints -1 when it is unset.

    bytes_flushed is maintained by flush_data() while reporting is
    enabled.
    """

    def flush_data(self, *args, **kwargs):
        """Flush via the parent class, then report progress if enabled.

        All arguments are passed through unchanged to the parent's
        flush_data(). Returns None.
        """
        parent = super(CollectionWriterWithProgress, self)
        if not getattr(self, 'display_type', None):
            # Reporting is off (never configured, or already finished by
            # manifest_text()). The data must still be flushed: returning
            # here without delegating would silently skip the parent's
            # flush.
            parent.flush_data(*args, **kwargs)
            return
        if not hasattr(self, 'bytes_flushed'):
            self.bytes_flushed = 0
        # Count the whole buffer as flushed up front, then subtract
        # whatever _data_buffer_len reports once the parent is done
        # (presumably the part of the buffer that was not flushed).
        self.bytes_flushed += self._data_buffer_len
        parent.flush_data(*args, **kwargs)
        self.bytes_flushed -= self._data_buffer_len
        if self.display_type == 'machine':
            sys.stderr.write('%s %d: %d written %d total\n' %
                             (sys.argv[0],
                              os.getpid(),
                              self.bytes_flushed,
                              getattr(self, 'bytes_expected', -1)))
        elif getattr(self, 'bytes_expected', 0) > 0:
            # Total size is known: show MiB counts and a percentage.
            pct = 100.0 * self.bytes_flushed / self.bytes_expected
            sys.stderr.write('\r%dM / %dM %.1f%% ' %
                             (self.bytes_flushed >> 20,
                              self.bytes_expected >> 20, pct))
        else:
            # Total size unknown: show the raw byte count only.
            sys.stderr.write('\r%d ' % self.bytes_flushed)

    def manifest_text(self, *args, **kwargs):
        """Return the parent's manifest text and stop progress reporting.

        On the first call with reporting enabled, the 'human' display is
        terminated with a newline on stderr and display_type is reset to
        None, so later flushes no longer report.
        """
        manifest_text = (super(CollectionWriterWithProgress, self)
                         .manifest_text(*args, **kwargs))
        if getattr(self, 'display_type', None):
            if self.display_type == 'human':
                # End the carriage-return-updated status line.
                sys.stderr.write('\n')
            self.display_type = None
        return manifest_text

# Pick the writer: the progress-reporting subclass when some form of
# progress output was requested, the plain SDK writer otherwise.
if args.progress or args.batch_progress:
    writer = CollectionWriterWithProgress()
    # --progress takes precedence over --batch-progress.
    writer.display_type = 'human' if args.progress else 'machine'
else:
    writer = arvados.CollectionWriter()

# A lone "-" means standard input; unless a name was given explicitly,
# the file is called "-" in the manifest.
if args.paths == ['-']:
    args.paths = ['/dev/stdin']
    args.filename = args.filename or '-'

# Add up the sizes of all input files (walking directory trees) so that
# progress can be shown as a percentage. If any path is neither a
# directory nor a regular file, the total is unknown and
# writer.bytes_expected is left unset.
expected_total = 0
for path in args.paths:
    if os.path.isdir(path):
        expected_total += sum(
            os.path.getsize(os.path.join(path, entry))
            for entry in arvados.util.listdir_recursive(path))
    elif os.path.isfile(path):
        expected_total += os.path.getsize(path)
    else:
        expected_total = None
        break

if expected_total is not None:
    writer.bytes_expected = expected_total

# Copy file data to Keep. Directories are handed to the writer's tree
# walker; every other path starts a new stream and a new file, then is
# copied through in fixed-size reads.
for path in args.paths:
    if os.path.isdir(path):
        writer.write_directory_tree(
            path, max_manifest_depth=args.max_manifest_depth)
        continue
    writer.start_new_stream()
    writer.start_new_file(args.filename or os.path.basename(path))
    with open(path, 'rb') as source:
        # Read 2**26 bytes (64 MiB) at a time; an empty read means EOF.
        for chunk in iter(lambda: source.read(2**26), b''):
            writer.write(chunk)

if args.stream:
    print writer.manifest_text(),
elif args.raw:
    writer.finish_current_stream()
    print string.join(writer.data_locators(), ',')
else:
    # Register the resulting collection in Arvados.
    arvados.api().collections().create(
        body={
            'uuid': writer.finish(),
            'manifest_text': writer.manifest_text(),
            },
        ).execute()

    # Print the locator (uuid) of the new collection.
    print writer.finish()