# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
"""Arvados utilities

This module provides functions and constants that are useful across a variety
of Arvados resource types, or extend the Arvados API client (see `arvados.api`).
"""

import dataclasses
import enum
import errno
import fcntl
import functools
import hashlib
import httplib2
import itertools
import logging
import os
import random
import re
import shlex
import stat
import subprocess
import sys
import warnings

import arvados.errors

from pathlib import Path, PurePath
from typing import (
    Any,
    Callable,
    Dict,
    Iterator,
    Mapping,
    Optional,
    TypeVar,
    Union,
)

T = TypeVar('T')

HEX_RE = re.compile(r'^[0-9a-fA-F]+$')
"""Regular expression to match a hexadecimal string (case-insensitive)"""
CR_UNCOMMITTED = 'Uncommitted'
"""Constant `state` value for uncommited container requests"""
CR_COMMITTED = 'Committed'
"""Constant `state` value for committed container requests"""
CR_FINAL = 'Final'
"""Constant `state` value for finalized container requests"""

keep_locator_pattern = re.compile(r'[0-9a-f]{32}\+[0-9]+(\+\S+)*')
"""Regular expression to match any Keep block locator"""
signed_locator_pattern = re.compile(r'[0-9a-f]{32}\+[0-9]+(\+\S+)*\+A\S+(\+\S+)*')
"""Regular expression to match any Keep block locator with an access token hint"""
portable_data_hash_pattern = re.compile(r'[0-9a-f]{32}\+[0-9]+')
"""Regular expression to match any collection portable data hash"""
manifest_pattern = re.compile(r'((\S+)( +[a-f0-9]{32}(\+[0-9]+)(\+\S+)*)+( +[0-9]+:[0-9]+:\S+)+$)+', flags=re.MULTILINE)
"""Regular expression to match an Arvados collection manifest text"""
keep_file_locator_pattern = re.compile(r'([0-9a-f]{32}\+[0-9]+)/(.*)')
"""Regular expression to match a file path from a collection identified by portable data hash"""
keepuri_pattern = re.compile(r'keep:([0-9a-f]{32}\+[0-9]+)/(.*)')
"""Regular expression to match a `keep:` URI with a collection identified by portable data hash"""

uuid_pattern = re.compile(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}')
"""Regular expression to match any Arvados object UUID"""
collection_uuid_pattern = re.compile(r'[a-z0-9]{5}-4zz18-[a-z0-9]{15}')
"""Regular expression to match any Arvados collection UUID"""
container_uuid_pattern = re.compile(r'[a-z0-9]{5}-dz642-[a-z0-9]{15}')
"""Regular expression to match any Arvados container UUID"""
group_uuid_pattern = re.compile(r'[a-z0-9]{5}-j7d0g-[a-z0-9]{15}')
"""Regular expression to match any Arvados group UUID"""
link_uuid_pattern = re.compile(r'[a-z0-9]{5}-o0j2j-[a-z0-9]{15}')
"""Regular expression to match any Arvados link UUID"""
user_uuid_pattern = re.compile(r'[a-z0-9]{5}-tpzed-[a-z0-9]{15}')
"""Regular expression to match any Arvados user UUID"""

logger = logging.getLogger('arvados')

def _deprecated(version=None, preferred=None):
    """Mark a callable as deprecated in the SDK

    This will wrap the callable to emit as a DeprecationWarning
    and add a deprecation notice to its docstring.

    If the following arguments are given, they'll be included in the
    notices:

    * preferred: str | None --- The name of an alternative that users should
      use instead.

    * version: str | None --- The version of Arvados when the callable is
      scheduled to be removed.
    """
    if version is None:
        version = ''
    else:
        version = f' and scheduled to be removed in Arvados {version}'
    if preferred is None:
        preferred = ''
    else:
        preferred = f' Prefer {preferred} instead.'
    def deprecated_decorator(func):
        fullname = f'{func.__module__}.{func.__qualname__}'
        parent, _, name = fullname.rpartition('.')
        if name == '__init__':
            fullname = parent
        warning_msg = f'{fullname} is deprecated{version}.{preferred}'
        @functools.wraps(func)
        def deprecated_wrapper(*args, **kwargs):
            warnings.warn(warning_msg, DeprecationWarning, 2)
            return func(*args, **kwargs)
        # Get func's docstring without any trailing newline or empty lines.
        func_doc = re.sub(r'\n\s*$', '', func.__doc__ or '')
        match = re.search(r'\n([ \t]+)\S', func_doc)
        indent = '' if match is None else match.group(1)
        warning_doc = f'\n\n{indent}.. WARNING:: Deprecated\n{indent}   {warning_msg}'
        # Make the deprecation notice the second "paragraph" of the
        # docstring if possible. Otherwise append it.
        docstring, count = re.subn(
            rf'\n[ \t]*\n{indent}',
            f'{warning_doc}\n\n{indent}',
            func_doc,
            count=1,
        )
        if not count:
            docstring = f'{func_doc.lstrip()}{warning_doc}'
        deprecated_wrapper.__doc__ = docstring
        return deprecated_wrapper
    return deprecated_decorator

@dataclasses.dataclass
class _BaseDirectorySpec:
    """Parse base directories

    A _BaseDirectorySpec defines all the environment variable keys and defaults
    related to a set of base directories (cache, config, state, etc.). It
    provides pure methods to parse environment settings into valid paths.
    """
    systemd_key: str
    xdg_home_key: str
    xdg_home_default: PurePath
    xdg_dirs_key: Optional[str] = None
    xdg_dirs_default: str = ''

    @staticmethod
    def _abspath_from_env(env: Mapping[str, str], key: str) -> Optional[Path]:
        try:
            path = Path(env[key])
        except (KeyError, ValueError):
            ok = False
        else:
            ok = path.is_absolute()
        return path if ok else None

    @staticmethod
    def _iter_abspaths(value: str) -> Iterator[Path]:
        for path_s in value.split(':'):
            path = Path(path_s)
            if path.is_absolute():
                yield path

    def iter_systemd(self, env: Mapping[str, str]) -> Iterator[Path]:
        return self._iter_abspaths(env.get(self.systemd_key, ''))

    def iter_xdg(self, env: Mapping[str, str], subdir: PurePath) -> Iterator[Path]:
        yield self.xdg_home(env, subdir)
        if self.xdg_dirs_key is not None:
            for path in self._iter_abspaths(env.get(self.xdg_dirs_key) or self.xdg_dirs_default):
                yield path / subdir

    def xdg_home(self, env: Mapping[str, str], subdir: PurePath) -> Path:
        return (
            self._abspath_from_env(env, self.xdg_home_key)
            or self.xdg_home_default_path(env)
        ) / subdir

    def xdg_home_default_path(self, env: Mapping[str, str]) -> Path:
        return (self._abspath_from_env(env, 'HOME') or Path.home()) / self.xdg_home_default

    def xdg_home_is_customized(self, env: Mapping[str, str]) -> bool:
        xdg_home = self._abspath_from_env(env, self.xdg_home_key)
        return xdg_home is not None and xdg_home != self.xdg_home_default_path(env)


class _BaseDirectorySpecs(enum.Enum):
    """Base directory specifications

    This enum provides easy access to the standard base directory settings.
    """
    CACHE = _BaseDirectorySpec(
        'CACHE_DIRECTORY',
        'XDG_CACHE_HOME',
        PurePath('.cache'),
    )
    CONFIG = _BaseDirectorySpec(
        'CONFIGURATION_DIRECTORY',
        'XDG_CONFIG_HOME',
        PurePath('.config'),
        'XDG_CONFIG_DIRS',
        '/etc/xdg',
    )
    STATE = _BaseDirectorySpec(
        'STATE_DIRECTORY',
        'XDG_STATE_HOME',
        PurePath('.local', 'state'),
    )


class _BaseDirectories:
    """Resolve paths from a base directory spec

    Given a _BaseDirectorySpec, this class provides stateful methods to find
    existing files and return the most-preferred directory for writing.
    """
    _STORE_MODE = stat.S_IFDIR | stat.S_IWUSR

    def __init__(
            self,
            spec: Union[_BaseDirectorySpec, _BaseDirectorySpecs, str],
            env: Mapping[str, str]=os.environ,
            xdg_subdir: Union[os.PathLike, str]='arvados',
    ) -> None:
        if isinstance(spec, str):
            spec = _BaseDirectorySpecs[spec].value
        elif isinstance(spec, _BaseDirectorySpecs):
            spec = spec.value
        self._spec = spec
        self._env = env
        self._xdg_subdir = PurePath(xdg_subdir)

    def search(self, name: str) -> Iterator[Path]:
        any_found = False
        for search_path in itertools.chain(
                self._spec.iter_systemd(self._env),
                self._spec.iter_xdg(self._env, self._xdg_subdir),
        ):
            path = search_path / name
            if path.exists():
                yield path
                any_found = True
        # The rest of this function is dedicated to warning the user if they
        # have a custom XDG_*_HOME value that prevented the search from
        # succeeding. This should be rare.
        if any_found or not self._spec.xdg_home_is_customized(self._env):
            return
        default_home = self._spec.xdg_home_default_path(self._env)
        default_path = Path(self._xdg_subdir / name)
        if not (default_home / default_path).exists():
            return
        if self._spec.xdg_dirs_key is None:
            suggest_key = self._spec.xdg_home_key
            suggest_value = default_home
        else:
            suggest_key = self._spec.xdg_dirs_key
            cur_value = self._env.get(suggest_key, '')
            value_sep = ':' if cur_value else ''
            suggest_value = f'{cur_value}{value_sep}{default_home}'
        logger.warning(
            "\
%s was not found under your configured $%s (%s), \
but does exist at the default location (%s) - \
consider running this program with the environment setting %s=%s\
",
            default_path,
            self._spec.xdg_home_key,
            self._spec.xdg_home(self._env, ''),
            default_home,
            suggest_key,
            shlex.quote(suggest_value),
        )

    def storage_path(
            self,
            subdir: Union[str, os.PathLike]=PurePath(),
            mode: int=0o700,
    ) -> Path:
        for path in self._spec.iter_systemd(self._env):
            try:
                mode = path.stat().st_mode
            except OSError:
                continue
            if (mode & self._STORE_MODE) == self._STORE_MODE:
                break
        else:
            path = self._spec.xdg_home(self._env, self._xdg_subdir)
        path /= subdir
        path.mkdir(parents=True, exist_ok=True, mode=mode)
        return path


def is_hex(s: str, *length_args: int) -> bool:
    """Indicate whether a string is a hexadecimal number

    This method returns true if all characters in the string are hexadecimal
    digits. It is case-insensitive.

    You can also pass optional length arguments to check that the string has
    the expected number of digits. If you pass one integer, the string must
    have that length exactly, otherwise the method returns False. If you
    pass two integers, the string's length must fall within that minimum and
    maximum (inclusive), otherwise the method returns False.

    Arguments:

    * s: str --- The string to check

    * length_args: int --- Optional length limit(s) for the string to check
    """
    num_length_args = len(length_args)
    if num_length_args > 2:
        raise arvados.errors.ArgumentError(
            "is_hex accepts up to 3 arguments ({} given)".format(1 + num_length_args))
    elif num_length_args == 2:
        good_len = (length_args[0] <= len(s) <= length_args[1])
    elif num_length_args == 1:
        good_len = (len(s) == length_args[0])
    else:
        good_len = True
    return bool(good_len and HEX_RE.match(s))

def keyset_list_all(
        fn: Callable[..., 'arvados.api_resources.ArvadosAPIRequest'],
        order_key: str="created_at",
        num_retries: int=0,
        ascending: bool=True,
        **kwargs: Any,
) -> Iterator[Dict[str, Any]]:
    """Iterate all Arvados resources from an API list call

    This method takes a method that represents an Arvados API list call, and
    iterates the objects returned by the API server. It can make multiple API
    calls to retrieve and iterate all objects available from the API server.

    Arguments:

    * fn: Callable[..., arvados.api_resources.ArvadosAPIRequest] --- A
      function that wraps an Arvados API method that returns a list of
      objects. If you have an Arvados API client named `arv`, examples
      include `arv.collections().list` and `arv.groups().contents`. Note
      that you should pass the function *without* calling it.

    * order_key: str --- The name of the primary object field that objects
      should be sorted by. This name is used to build an `order` argument
      for `fn`. Default `'created_at'`.

    * num_retries: int --- This argument is passed through to
      `arvados.api_resources.ArvadosAPIRequest.execute` for each API call. See
      that method's docstring for details. Default 0 (meaning API calls will
      use the `num_retries` value set when the Arvados API client was
      constructed).

    * ascending: bool --- Used to build an `order` argument for `fn`. If True,
      all fields will be sorted in `'asc'` (ascending) order. Otherwise, all
      fields will be sorted in `'desc'` (descending) order.

    Additional keyword arguments will be passed directly to `fn` for each API
    call. Note that this function sets `count`, `limit`, and `order` as part of
    its work.
    """
    pagesize = 1000
    kwargs["limit"] = pagesize
    kwargs["count"] = 'none'
    asc = "asc" if ascending else "desc"
    kwargs["order"] = ["%s %s" % (order_key, asc), "uuid %s" % asc]
    other_filters = kwargs.get("filters", [])

    try:
        select = set(kwargs['select'])
    except KeyError:
        pass
    else:
        select.add(order_key)
        select.add('uuid')
        kwargs['select'] = list(select)

    nextpage = []
    tot = 0
    expect_full_page = True
    seen_prevpage = set()
    seen_thispage = set()
    lastitem = None
    prev_page_all_same_order_key = False

    while True:
        kwargs["filters"] = nextpage+other_filters
        items = fn(**kwargs).execute(num_retries=num_retries)

        if len(items["items"]) == 0:
            if prev_page_all_same_order_key:
                nextpage = [[order_key, ">" if ascending else "<", lastitem[order_key]]]
                prev_page_all_same_order_key = False
                continue
            else:
                return

        seen_prevpage = seen_thispage
        seen_thispage = set()

        for i in items["items"]:
            # In cases where there's more than one record with the
            # same order key, the result could include records we
            # already saw in the last page.  Skip them.
            if i["uuid"] in seen_prevpage:
                continue
            seen_thispage.add(i["uuid"])
            yield i

        firstitem = items["items"][0]
        lastitem = items["items"][-1]

        if firstitem[order_key] == lastitem[order_key]:
            # Got a page where every item has the same order key.
            # Switch to using uuid for paging.
            nextpage = [[order_key, "=", lastitem[order_key]], ["uuid", ">" if ascending else "<", lastitem["uuid"]]]
            prev_page_all_same_order_key = True
        else:
            # Start from the last order key seen, but skip the last
            # known uuid to avoid retrieving the same row twice.  If
            # there are multiple rows with the same order key it is
            # still likely we'll end up retrieving duplicate rows.
            # That's handled by tracking the "seen" rows for each page
            # so they can be skipped if they show up on the next page.
            nextpage = [[order_key, ">=" if ascending else "<=", lastitem[order_key]], ["uuid", "!=", lastitem["uuid"]]]
            prev_page_all_same_order_key = False

def ca_certs_path(fallback: T=httplib2.CA_CERTS) -> Union[str, T]:
    """Return the path of the best available source of CA certificates

    This function checks various known paths that provide trusted CA
    certificates, and returns the first one that exists. It checks:

    * the path in the `SSL_CERT_FILE` environment variable (used by OpenSSL)
    * `/etc/arvados/ca-certificates.crt`, respected by all Arvados software
    * `/etc/ssl/certs/ca-certificates.crt`, the default store on Debian-based
      distributions
    * `/etc/pki/tls/certs/ca-bundle.crt`, the default store on Red Hat-based
      distributions

    If none of these paths exist, this function returns the value of `fallback`.

    Arguments:

    * fallback: T --- The value to return if none of the known paths exist.
      The default value is the certificate store of Mozilla's trusted CAs
      included with the Python [certifi][] package.

    [certifi]: https://pypi.org/project/certifi/
    """
    for ca_certs_path in [
        # SSL_CERT_FILE and SSL_CERT_DIR are openssl overrides - note
        # that httplib2 itself also supports HTTPLIB2_CA_CERTS.
        os.environ.get('SSL_CERT_FILE'),
        # Arvados specific:
        '/etc/arvados/ca-certificates.crt',
        # Debian:
        '/etc/ssl/certs/ca-certificates.crt',
        # Red Hat:
        '/etc/pki/tls/certs/ca-bundle.crt',
        ]:
        if ca_certs_path and os.path.exists(ca_certs_path):
            return ca_certs_path
    return fallback

def new_request_id() -> str:
    """Return a random request ID

    This function generates and returns a random string suitable for use as a
    `X-Request-Id` header value in the Arvados API.
    """
    rid = "req-"
    # 2**104 > 36**20 > 2**103
    n = random.getrandbits(104)
    for _ in range(20):
        c = n % 36
        if c < 10:
            rid += chr(c+ord('0'))
        else:
            rid += chr(c+ord('a')-10)
        n = n // 36
    return rid

def get_config_once(svc: 'arvados.api_resources.ArvadosAPIClient') -> Dict[str, Any]:
    """Return an Arvados cluster's configuration, with caching

    This function gets and returns the Arvados configuration from the API
    server. It caches the result on the client object and reuses it on any
    future calls.

    Arguments:

    * svc: arvados.api_resources.ArvadosAPIClient --- The Arvados API client
      object to use to retrieve and cache the Arvados cluster configuration.
    """
    if not svc._rootDesc.get('resources').get('configs', False):
        # Old API server version, no config export endpoint
        return {}
    if not hasattr(svc, '_cached_config'):
        svc._cached_config = svc.configs().get().execute()
    return svc._cached_config

def get_vocabulary_once(svc: 'arvados.api_resources.ArvadosAPIClient') -> Dict[str, Any]:
    """Return an Arvados cluster's vocabulary, with caching

    This function gets and returns the Arvados vocabulary from the API
    server. It caches the result on the client object and reuses it on any
    future calls.

    .. HINT:: Low-level method
       This is a relatively low-level wrapper around the Arvados API. Most
       users will prefer to use `arvados.vocabulary.load_vocabulary`.

    Arguments:

    * svc: arvados.api_resources.ArvadosAPIClient --- The Arvados API client
      object to use to retrieve and cache the Arvados cluster vocabulary.
    """
    if not svc._rootDesc.get('resources').get('vocabularies', False):
        # Old API server version, no vocabulary export endpoint
        return {}
    if not hasattr(svc, '_cached_vocabulary'):
        svc._cached_vocabulary = svc.vocabularies().get().execute()
    return svc._cached_vocabulary

def trim_name(collectionname: str) -> str:
    """Limit the length of a name to fit within Arvados API limits

    This function ensures that a string is short enough to use as an object
    name in the Arvados API, leaving room for text that may be added by the
    `ensure_unique_name` argument. If the source name is short enough, it is
    returned unchanged. Otherwise, this function returns a string with excess
    characters removed from the middle of the source string and replaced with
    an ellipsis.

    Arguments:

    * collectionname: str --- The desired source name
    """
    max_name_len = 254 - 28

    if len(collectionname) > max_name_len:
        over = len(collectionname) - max_name_len
        split = int(max_name_len/2)
        collectionname = collectionname[0:split] + "…" + collectionname[split+over:]

    return collectionname