# Source code for sentinelhub.data_utils

"""
Module with statistics to dataframe transformation.
"""

from __future__ import annotations

from typing import Any, Iterable

from .time_utils import parse_time
from .types import JsonDict

_PANDAS_IMPORT_MESSAGE = (
    "To use this function you need to install the `pandas` library, which is not a dependency of sentinelhub-py."
)
_FULL_TIME_RANGE = "full time range"


def _extract_hist(hist_data: list[dict[str, float]]) -> tuple[list[float], list[float]]:
    """Transform Statistical API histogram into sequences of bins and counts

    :param hist_data: An input representation of Statistical API histogram data in a form of the low edges,
        the high edges, and the counts for each bin.
    :return: Statistical histogram bins and counts value as sequences.
    """

    nbins = len(hist_data)
    bins, counts = [], []
    for idx in range(nbins):
        bins.append(hist_data[idx]["lowEdge"])
        if idx == nbins - 1:
            bins.append(hist_data[idx]["highEdge"])
        counts.append(hist_data[idx]["count"])

    return bins, counts


def _extract_stats(interval_output: JsonDict, exclude_stats: list[str]) -> dict[str, list[float] | float]:
    """Transform statistics into pandas.DataFrame entry

    :param interval_output: An input representation of statistics of an aggregation interval.
    :param exclude_stats: Statistics that will be excluded from output.
    :return: Statistics as a pandas.DataFrame entry.
    """
    stat_entry: dict[str, list[float] | float] = {}
    for output_name, output_data in interval_output.items():  # pylint: disable=too-many-nested-blocks
        for band_name, band_values in output_data["bands"].items():
            band_stats = band_values["stats"]
            # statistics are not valid when sample count equals to no data count
            if band_stats["sampleCount"] == band_stats["noDataCount"]:
                break

            for stat_name, value in band_stats.items():
                if stat_name not in exclude_stats:
                    col_name = f"{output_name}_{band_name}_{stat_name}"
                    if stat_name == "percentiles":
                        for percentile_name, percentile_value in value.items():
                            stat_entry[f"{col_name}_{percentile_name}"] = percentile_value
                    else:
                        stat_entry[col_name] = value

            if "histogram" in band_values:
                band_bins = band_values["histogram"]["bins"]
                hist_bins, hist_counts = _extract_hist(band_bins)
                stat_entry[f"{output_name}_{band_name}_bins"] = hist_bins
                stat_entry[f"{output_name}_{band_name}_counts"] = hist_counts

    return stat_entry


def _extract_response_data(response_data: list[JsonDict], exclude_stats: list[str]) -> list[JsonDict]:
    """Transform Statistical API response into a pandas.DataFrame

    :param response_data: An input representation of Statistical API response. The response is a list of JsonDict and
        each contains the statistics of an aggregation interval.
    :param exclude_stats: Statistics that will be excluded from output.
    :return: DataFrame entries of all aggregation intervals of a single geometry.
    """
    df_entries = []
    for interval in response_data:
        if "outputs" in interval:
            df_entry: dict[str, Any] = _extract_stats(interval["outputs"], exclude_stats)
            if df_entry:
                df_entry["interval_from"] = parse_time(interval["interval"]["from"])
                df_entry["interval_to"] = parse_time(interval["interval"]["to"])
                df_entries.append(df_entry)

    return df_entries


def _is_batch_stat(result_data: JsonDict) -> bool:
    """Identifies whether the resulting data belongs to a batch statistical request or not"""
    return "id" in result_data


def _is_valid_batch_response(result_data: JsonDict) -> bool:
    """Identifies whether there is a valid batch response"""
    return "error" not in result_data and result_data["response"]["status"] == "OK"


def statistical_to_dataframe(result_data: list[JsonDict], exclude_stats: list[str] | None = None) -> Any:
    """Transform (Batch) Statistical API results into a pandas.DataFrame

    This function has a dependency of the `pandas` library, which is not a requirement of sentinelhub-py and needs to
    be installed before using the function.

    :param result_data: An input representation of (Batch) Statistical API result returned from
        `AwsBatchStatisticalResults.get_data()`. Each JsonDict in the list is a Statistical API response of an input
        geometry.
    :param exclude_stats: The statistic names defined in this parameter will be excluded from the output DataFrame.
    :return: Statistical dataframe. An empty dataframe is returned when `result_data` contains no valid result.
    :raises ImportError: If the `pandas` library is not installed.
    """
    try:
        import pandas as pd  # pylint: disable=import-outside-toplevel
    except ImportError as exception:
        raise ImportError(_PANDAS_IMPORT_MESSAGE) from exception

    exclude_stats = exclude_stats or []

    dfs = []
    for idx, result in enumerate(result_data):
        # valid batch stat response
        if _is_batch_stat(result) and _is_valid_batch_response(result):
            identifier, response_data = result["identifier"], result["response"]["data"]
        # valid normal stat response, identified by its position in the input
        elif not _is_batch_stat(result) and "data" in result:
            identifier, response_data = str(idx), result["data"]
        else:
            continue

        result_entries = _extract_response_data(response_data, exclude_stats)
        result_df = pd.DataFrame(result_entries)
        result_df["identifier"] = identifier
        dfs.append(result_df)

    if not dfs:
        # pd.concat raises ValueError when there is nothing to concatenate
        return pd.DataFrame()
    return pd.concat(dfs)
def _get_failed_intervals(response_data: list[JsonDict]) -> list[tuple[str, str]]: """Collect failed intervals of a single geometry from the (Batch) Statistical result :param response_data: An input representation of the (Batch) Statistical API response of a geometry. :return: The failed intervals for a geometry. """ return [ (interval["interval"]["from"], interval["interval"]["to"]) for interval in response_data if "error" in interval ] def _get_failed_batch_response(result_data: JsonDict) -> str | list[tuple[str, str]]: """Collect failed responses :param result_data: An input representation of the (Batch) Statistical API result of a geometry. :return: Failed responses and responses with failed intervals """ if "error" in result_data or result_data["response"]["status"] == "FAILED": return _FULL_TIME_RANGE return _get_failed_intervals(result_data["response"]["data"])
def get_failed_statistical_requests(result_data: list[JsonDict]) -> list[JsonDict]:
    """Collect failed requests of (Batch) Statistical Results

    Whether the results are treated as batch results is decided from the first item of `result_data`.

    :param result_data: An input representation of (Batch) Statistical API result.
    :return: Failed requests of (Batch) Statistical Results. Each item holds the `identifier` of a request and its
        `failed_intervals`. Requests without failed intervals are left out. An empty input gives an empty list.
    """
    if not result_data:
        # nothing was requested, hence nothing failed; this also avoids indexing into an empty list
        return []

    failed_responses: Iterable[tuple[Any, str | list[tuple[str, str]]]]
    if _is_batch_stat(result_data[0]):
        failed_responses = ((result["identifier"], _get_failed_batch_response(result)) for result in result_data)
    else:
        # normal statistical results carry no identifier, so their position in the input is used instead
        failed_responses = enumerate(
            _FULL_TIME_RANGE if "error" in result else _get_failed_intervals(result["data"]) for result in result_data
        )

    return [
        {"identifier": idx, "failed_intervals": intervals} for idx, intervals in failed_responses if intervals != []
    ]