#!/usr/bin/env python3
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0

import argparse
import sys

import arvados
import arvados.util
import datetime
import ciso8601
import csv
import os
from prometheus_api_client.utils import parse_datetime
from datetime import timedelta
import pandas
import base64

from prometheus_api_client import PrometheusConnect, MetricsList, Metric

def parse_arguments(arguments):
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--start', help='Start date for the report in YYYY-MM-DD format (UTC)')
    arg_parser.add_argument('--end', help='End date for the report in YYYY-MM-DD format (UTC), default "now"')
    arg_parser.add_argument('--days', type=int, help='Number of days before "end" to start the report')
    arg_parser.add_argument('--cluster', type=str, help='Cluster to query')
    arg_parser.add_argument('--cost-report-file', type=str, help='Export cost report to specified CSV file')
    args = arg_parser.parse_args(arguments)

    if args.days and args.start:
        arg_parser.print_help()
        print("Error: either specify --days or both --start and --end")
        exit(1)

    if not args.days and not args.start:
        arg_parser.print_help()
        print("\nError: either specify --days or both --start and --end")
        exit(1)

    if (args.start and not args.end):
        arg_parser.print_help()
        print("\nError: no start or end date found, either specify --days or both --start and --end")
        exit(1)

    if args.end:
        try:
            to = datetime.datetime.strptime(args.end,"%Y-%m-%d")
        except:
            arg_parser.print_help()
            print("\nError: end date must be in YYYY-MM-DD format")
            exit(1)
    else:
        to = datetime.datetime.utcnow()

    if args.days:
        since = to - datetime.timedelta(days=args.days)

    if args.start:
        try:
            since = datetime.datetime.strptime(args.start,"%Y-%m-%d")
        except:
            arg_parser.print_help()
            print("\nError: start date must be in YYYY-MM-DD format")
            exit(1)


    return args, since, to

def data_usage(prom, timestamp, cluster, label):
    metric_data = prom.get_current_metric_value(metric_name='arvados_keep_total_bytes',
                                                label_config={"cluster": cluster},
                                                params={"time": timestamp.timestamp()})

    metric_object_list = MetricsList(metric_data)

    if len(metric_data) == 0:
        return

    my_metric_object = metric_object_list[0] # one of the metrics from the list
    value = my_metric_object.metric_values.iloc[0]["y"]
    summary_value = value

    metric_data = prom.get_current_metric_value(metric_name='arvados_keep_dedup_byte_ratio',
                                                label_config={"cluster": cluster},
                                                params={"time": timestamp.timestamp()})

    if len(metric_data) == 0:
        return

    my_metric_object = MetricsList(metric_data)[0]
    dedup_ratio = my_metric_object.metric_values.iloc[0]["y"]

    value_gb = value / (1024*1024*1024)
    first_50tb = min(1024*50, value_gb)
    next_450tb = max(min(1024*450, value_gb-1024*50), 0)
    over_500tb = max(value_gb-1024*500, 0)

    monthly_cost = (first_50tb * 0.023) + (next_450tb * 0.022) + (over_500tb * 0.021)

    for scale in ["KiB", "MiB", "GiB", "TiB", "PiB"]:
        summary_value = summary_value / 1024
        if summary_value < 1024:
            print(label,
                  "%.3f %s apparent," % (summary_value*dedup_ratio, scale),
                  "%.3f %s actually stored," % (summary_value, scale),
                  "$%.2f monthly S3 storage cost" % monthly_cost)
            break


def container_usage(prom, start_time, end_time, metric, label, fn=None):
    start = start_time
    chunk_size = timedelta(days=1)
    cumulative = 0

    while start < end_time:
        if start + chunk_size > end_time:
            chunk_size = end_time - start

        metric_data = prom.custom_query_range(metric,
                                              start_time=start,
                                              end_time=(start + chunk_size),
                                              step=15
                                              )

        if len(metric_data) == 0:
            break

        if "__name__" not in metric_data[0]["metric"]:
            metric_data[0]["metric"]["__name__"] = metric

        metric_object_list = MetricsList(metric_data)
        my_metric_object = metric_object_list[0] # one of the metrics from the list

        series = my_metric_object.metric_values.set_index(pandas.DatetimeIndex(my_metric_object.metric_values['ds']))

        # Resample to 1 minute increments, fill in missing values
        rs = series.resample("min").mean(1).ffill()

        # Calculate the sum of values
        #print(rs.sum()["y"])
        cumulative += rs.sum()["y"]

        start += chunk_size

    if fn is not None:
        cumulative = fn(cumulative)

    print(label % cumulative)

def report_from_prometheus(cluster, since, to):
    prom_host = os.environ.get("PROMETHEUS_HOST")
    prom_token = os.environ.get("PROMETHEUS_APIKEY")
    prom_user = os.environ.get("PROMETHEUS_USER")
    prom_pw = os.environ.get("PROMETHEUS_PASSWORD")

    headers = {}
    if prom_token:
        headers["Authorization"] = "Bearer %s" % prom_token

    if prom_user:
        headers["Authorization"] = "Basic %s" % str(base64.b64encode(bytes("%s:%s" % (prom_user, prom_pw), 'utf-8')), 'utf-8')

    prom = PrometheusConnect(url=prom_host, headers=headers)

    print(cluster, "between", since, "and", to, "timespan", (to-since))

    try:
        data_usage(prom, since, cluster, "at start:")
    except:
        pass
    try:
        data_usage(prom, to - timedelta(minutes=240), cluster, "current :")
    except:
        pass

    container_usage(prom, since, to, "arvados_dispatchcloud_containers_running{cluster='%s'}" % cluster, '%.1f container hours', lambda x: x/60)
    container_usage(prom, since, to, "sum(arvados_dispatchcloud_instances_price{cluster='%s'})" % cluster, '$%.2f spent on compute', lambda x: x/60)
    print()

def flush_containers(arv_client, csvwriter, pending):
    containers = {}

    for container in arvados.util.keyset_list_all(
        arv_client.containers().list,
        filters=[
            ["uuid", "in", [c["container_uuid"] for c in pending]],
        ],
        select=["uuid", "started_at", "finished_at"]):

        containers[container["uuid"]] = container

    workflows = {}
    workflows["none"] = "workflow run from command line"

    for wf in arvados.util.keyset_list_all(
            arv_client.workflows().list,
            filters=[
                ["uuid", "in", [c["properties"]["template_uuid"] for c in pending if "template_uuid" in c["properties"]]],
            ],
            select=["uuid", "name"]):
        workflows[wf["uuid"]] = wf["name"]

    projects = {}

    for pr in arvados.util.keyset_list_all(
            arv_client.groups().list,
            filters=[
                ["uuid", "in", [c["owner_uuid"] for c in pending if c["owner_uuid"][6:11] == 'j7d0g']],
            ],
            select=["uuid", "name"]):
        projects[pr["uuid"]] = pr["name"]

    for pr in arvados.util.keyset_list_all(
            arv_client.users().list,
            filters=[
                ["uuid", "in", [c["owner_uuid"] for c in pending if c["owner_uuid"][6:11] == 'tpzed']],
            ],
            select=["uuid", "full_name", "first_name", "last_name"]):
        projects[pr["uuid"]] = pr["full_name"]

    for container_request in pending:
        length = ciso8601.parse_datetime(containers[container_request["container_uuid"]]["finished_at"]) - ciso8601.parse_datetime(containers[container_request["container_uuid"]]["started_at"])

        hours = length.seconds // 3600
        minutes = (length.seconds // 60) % 60
        seconds = length.seconds % 60

        csvwriter.writerow((
            projects.get(container_request["owner_uuid"], "unknown owner"),
            workflows.get(container_request["properties"].get("template_uuid", "none"), "workflow missing"),
            container_request["name"],
            containers[container_request["container_uuid"]]["started_at"],
            "%i:%02i:%02i:%02i" % (length.days, hours, minutes, seconds),
            container_request["cumulative_cost"],
            ))


def report_from_api(since, to, out):
    arv_client = arvados.api()

    csvwriter = csv.writer(out)
    csvwriter.writerow(("Project", "Workflow", "Sample", "Started", "Runtime", "Cost"))

    pending = []

    print(since.isoformat())
    for container_request in arvados.util.keyset_list_all(
            arv_client.container_requests().list,
            filters=[
                ["command", "like", "[\"arvados-cwl-runner%"],
                ["created_at", ">=", since.strftime("%Y%m%dT%H%M%SZ")],
            ],
            select=["uuid", "owner_uuid", "container_uuid", "name", "cumulative_cost", "properties"]):

        if len(pending) < 1000:
            pending.append(container_request)
        else:
            flush_containers(arv_client, csvwriter, pending)
            pending.clear()

    flush_containers(arv_client, csvwriter, pending)

def main(arguments=None):
    if arguments is None:
        arguments = sys.argv[1:]

    args, since, to = parse_arguments(arguments)

    if "PROMETHEUS_HOST" in os.environ:
        report_from_prometheus(args.cluster, since, to)

    if args.cost_report_file:
        with open(args.cost_report_file, "wt") as f:
            report_from_api(since, to, f)

if __name__ == "__main__":
    main()