# encoding: utf-8
"""Provides the CubeSet and Cube classes.
CubeSet is the main API class for manipulating Crunch.io JSON cube responses.
"""
import copy
import json
from typing import Dict, FrozenSet, Iterator, List, Optional, Tuple, Union
import numpy as np
from cr.cube.cubepart import CubePartition
from cr.cube.dimension import Dimensions
from cr.cube.enums import CUBE_MEASURE, DIMENSION_TYPE as DT, NUMERIC_CUBE_MEASURES
from cr.cube.util import lazyproperty
class CubeSet:
    """Represents a multi-cube cube-response.

    Also works just fine for a single cube-response passed inside a sequence, allowing
    uniform handling of single and multi-cube responses.

    `cube_responses` is a sequence of cube-response dicts received from Crunch. The
    sequence can contain a single item, such as a cube-response for a slide, but it must
    be contained in a sequence. A tabbook cube-response sequence can be passed as it was
    received.

    `transforms` is a sequence of transforms dicts corresponding in order to the
    cube-responses; it is indexed by the position of each cube-response.
    `population` is the estimated target population and is used when
    a population-projection measure is requested. `min_base` is an integer representing
    the minimum sample-size used for indicating values that are unreliable by reason of
    insufficient sample (base).
    """

    def __init__(
        self,
        cube_responses: List[Dict],
        transforms: List[Dict],
        population: int,
        min_base: int,
    ):
        self._cube_responses = cube_responses
        self._transforms_dicts = transforms
        self._population = population
        self._min_base = min_base

    @lazyproperty
    def available_measures(self) -> FrozenSet[CUBE_MEASURE]:
        """frozenset of the measures available in any cube of this set.

        This is the union of the `.available_measures` of every cube, not only those
        of the first cube.
        """
        return frozenset(m for cube in self._cubes for m in cube.available_measures)

    @lazyproperty
    def can_show_pairwise(self) -> bool:
        """True if all 2D cubes in a multi-cube set can provide pairwise comparison.

        Always False when the set holds fewer than two cubes. Otherwise every cube
        after the first must have at least two dimensions and its last two dimension
        types must both be members of `DT.ALLOWED_PAIRWISE_TYPES`.
        """
        if len(self._cubes) < 2:
            return False

        return all(
            all(dt in DT.ALLOWED_PAIRWISE_TYPES for dt in cube.dimension_types[-2:])
            and cube.ndim >= 2
            for cube in self._cubes[1:]
        )

    @lazyproperty
    def description(self) -> str:
        """str description of first cube in this set."""
        return self._cubes[0].description

    @lazyproperty
    def has_weighted_counts(self) -> bool:
        """True if the first cube of this set includes a weighted-count measure."""
        return self._cubes[0].has_weighted_counts

    @lazyproperty
    def is_ca_as_0th(self) -> bool:
        """True for multi-cube when first cube represents a categorical-array.

        A "CA-as-0th" tabbook tab is "3D" in the sense it is "sliced" into one table
        (partition-set) for each of the CA subvariables.
        """
        # ---can only be true for multi-cube case---
        if not self._is_multi_cube:
            return False
        # ---the rest depends on the row-var cube, True if that cube is CA---
        rows_cube = self._cubes[0]
        return rows_cube.dimension_types[0] == DT.CA_SUBVAR

    @lazyproperty
    def missing_count(self) -> int:
        """The number of missing values from first cube in this set."""
        return self._cubes[0].missing

    @lazyproperty
    def name(self) -> str:
        """str name of first cube in this set."""
        return self._cubes[0].name

    @lazyproperty
    def partition_sets(self) -> Tuple[Tuple[CubePartition], ...]:
        """Sequence of cube-partition collections across all cubes of this cube-set.

        This value might look like the following for a ca-as-0th tabbook. For example::

            (
                (_Strand, _Slice, _Slice),
                (_Strand, _Slice, _Slice),
                (_Strand, _Slice, _Slice),
            )

        and might often look like this for a typical slide::

            ((_Slice,))

        Each partition set represents the partitions for a single "stacked" table. A 2D
        slide has a single partition-set of a single _Slice object, as in the second
        example above. A 3D slide would have multiple partition sets, each of a single
        _Slice. A tabbook will have multiple partitions in each set, the first being
        a _Strand and the rest being _Slice objects. Multiple partition sets only arise
        for a tabbook in the CA-as-0th case.
        """
        # ---zip() transposes "partitions per cube" into "one partition of each cube
        # ---per set"---
        return tuple(zip(*(cube.partitions for cube in self._cubes)))

    @lazyproperty
    def population_fraction(self) -> float:
        """The filtered/unfiltered ratio for this cube-set.

        This value is required for properly calculating population on a cube where
        a filter has been applied. Returns 1.0 for an unfiltered cube. Returns `np.nan`
        if the unfiltered count is zero, which would otherwise result in
        a divide-by-zero error. The value is taken from the first cube in this set.
        """
        return self._cubes[0].population_fraction

    @lazyproperty
    def n_responses(self) -> int:
        """Total number of responses considered from first cube in this set."""
        return self._cubes[0].n_responses

    @lazyproperty
    def valid_counts_summary_range(self) -> Optional[Tuple[float, float]]:
        """Optional (min, max) valid-count summary values from first cube in this set.

        Forwards the first cube's `.valid_counts_summary_range`, which is None when
        that cube has no unweighted valid-counts measure.
        """
        return self._cubes[0].valid_counts_summary_range

    @lazyproperty
    def _cubes(self) -> Tuple["Cube", ...]:
        """Sequence of Cube objects containing data for this analysis."""

        def iter_cubes() -> Iterator[Cube]:
            """Generate a Cube object for each of cube_responses.

            In the numeric-measure case every cube is "inflated" to add its missing
            rows dimension.
            """
            for idx, cube_response in enumerate(self._cube_responses):
                cube = Cube(
                    cube_response,
                    # ---a None index tells the Cube it is the only one in its set---
                    cube_idx=idx if self._is_multi_cube else None,
                    transforms=self._transforms_dicts[idx],
                    population=self._population,
                    mask_size=self._min_base,
                )
                # --- numeric-measures cubes require inflation to restore their
                # --- rows-dimension, others don't
                yield cube.inflate() if self._is_numeric_measure else cube

        return tuple(iter_cubes())

    @lazyproperty
    def _is_multi_cube(self) -> bool:
        """True if more than one cube-response was provided on construction."""
        return len(self._cube_responses) > 1

    @lazyproperty
    def _is_numeric_measure(self) -> bool:
        """True when CubeSet is special-case "numeric-measure" case requiring inflation.

        When a numeric variable with `mean`, `sum` or `std_dev` summary statistic
        expressed in its view, appears as the rows-dimension in a multitable analysis,
        its cube-result has been "reduced" to the mean-value of those numerics. This is
        in contrast to being "bucketized" into an arbitrary set of numeric-range
        categories like 0-5, 5-10, etc. In the process, as an artifact of the ZZ9 query
        response, that dimension is removed. As a result, the rows-dimension cube is 0D
        and the column-dimension cubes are 1D. These need to be "inflated" to restore
        the lost dimension such that they are uniform with other cube-results and can be
        processed without special-case code.

        "Inflation" is basically prefixing "1 x" to the dimensionality, for example a 1D
        of size 5 becomes a 1 x 5 2D result. Note this requires no mapping in the actual
        values because 5 = 1 x 5 = 5 (values).
        """
        # --- this case only arises in a multitable analysis ---
        if not self._is_multi_cube:
            return False
        # --- We need the cube to tell us the dimensionality. This redundant
        # --- construction is low-overhead because all Cube properties are lazy.
        return Cube(self._cube_responses[0]).ndim == 0
class Cube:
    """Provides access to individual slices on a cube-result.

    It also provides some attributes of the overall cube-result.

    `response` is the cube-response, either as a JSON string or as an already-parsed
    dict. `cube_idx` must be `None` (or omitted) for a single-cube CubeSet. This
    indicates the CubeSet contains only a single cube and influences behaviors like
    CA-as-0th. `transforms` defaults to an empty dict, `population` to 0 and
    `mask_size` to 0 when not provided.
    """

    def __init__(
        self,
        response: Union[str, Dict],
        cube_idx: Optional[int] = None,
        transforms: Optional[Dict] = None,
        population: Optional[int] = None,
        mask_size: int = 0,
    ):
        self._cube_response_arg = response
        self._transforms_dict = {} if transforms is None else transforms
        self._cube_idx_arg = cube_idx
        self._population = 0 if population is None else population
        self._mask_size = mask_size

    def __repr__(self) -> str:
        """Provide text representation suitable for working at console.

        Falls back to a default repr on exception, such as might occur in
        unit tests where object need not otherwise be provided with all
        instance variable values.
        """
        try:
            dimensionality = " x ".join(dt.name for dt in self.dimension_types)
            return (
                f"{type(self).__name__}(name='{self.name}', "
                f"dimension_types='{dimensionality}')"
            )
        except Exception:
            # ---deliberately broad: repr must never raise, whatever state we're in---
            return super().__repr__()

    @lazyproperty
    def available_measures(self) -> FrozenSet[CUBE_MEASURE]:
        """frozenset of available CUBE_MEASURE members in the cube response."""
        cube_measures = self._cube_response.get("result", {}).get("measures", {})
        return frozenset(CUBE_MEASURE(m) for m in cube_measures)

    @lazyproperty
    def counts(self) -> np.ndarray:
        """ndarray of `.counts_with_missings` restricted to the valid elements."""
        return self.counts_with_missings[self._valid_idxs]

    @lazyproperty
    def counts_with_missings(self) -> np.ndarray:
        """ndarray of weighted, unweighted or valid counts including missing values.

        The difference from .counts is that this property includes value for missing
        categories. The first available of these measures is used, in this order:
        weighted valid counts, unweighted valid counts, weighted counts, unweighted
        counts.
        """
        measures = self._measures
        if measures.weighted_valid_counts is not None:
            return measures.weighted_valid_counts.raw_cube_array
        if measures.unweighted_valid_counts is not None:
            return measures.unweighted_valid_counts.raw_cube_array
        if self.has_weighted_counts:
            return measures.weighted_counts.raw_cube_array
        return measures.unweighted_counts.raw_cube_array

    @lazyproperty
    def covariance(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of the cube_covariance if the measure exists."""
        covariance = self._measures.covariance
        if covariance is None:
            return None
        return covariance.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def cube_index(self) -> int:
        """Offset of this cube within its CubeSet, 0 when no index was provided."""
        return 0 if self._cube_idx_arg is None else self._cube_idx_arg

    @lazyproperty
    def description(self) -> Optional[str]:
        """Description of the first dimension of the cube, None for a 0D cube."""
        if not self.dimensions:
            return None
        return self.dimensions[0].description

    @lazyproperty
    def dimension_types(self) -> Tuple[DT, ...]:
        """Tuple of DIMENSION_TYPE member for each dimension of cube."""
        return tuple(d.dimension_type for d in self.dimensions)

    @lazyproperty
    def dimensions(self) -> list:
        """List of visible dimensions.

        A cube involving a multiple-response (MR) variable has two dimensions
        for that variable (subvariables and categories dimensions), but is
        "collapsed" into a single effective dimension for cube-user purposes
        (its categories dimension is suppressed). This collection will contain
        a single dimension for each MR variable and therefore may have fewer
        dimensions than appear in the cube response.
        """
        return self._all_dimensions.apparent_dimensions

    def inflate(self) -> "Cube":
        """Return new Cube object with rows-dimension added.

        A multi-cube (tabbook) response formed from a function (e.g. mean()) on
        a numeric variable arrives without a rows-dimension. The added dimension is
        a categorical dimension with a single category.

        This cube is left untouched; the new dimension is inserted into a private copy
        of its cube-dict, so calling this method more than once always produces the
        same result.
        """
        # ---`._cube_dict` is cached on this instance; work on a copy so inserting
        # ---the rows-dimension below does not change what this cube sees---
        cube_dict = copy.deepcopy(self._cube_dict)
        dimensions = cube_dict["result"]["dimensions"]
        # --- The default value in case of numeric variable is the combination of all
        # --- the measures expressed in the cube response.
        default_name = "-".join(m.value for m in self._available_numeric_measures)
        alias = self._numeric_measure_references.get("alias", default_name)
        name = self._numeric_measure_references.get("name", default_name).title()
        rows_dimension = {
            "references": {"alias": alias, "name": name},
            "type": {
                "categories": [{"id": 1, "name": name}],
                "class": "categorical",
            },
        }
        dimensions.insert(0, rows_dimension)
        return Cube(
            cube_dict,
            self._cube_idx_arg,
            self._transforms_dict,
            self._population,
            self._mask_size,
        )

    @lazyproperty
    def has_weighted_counts(self) -> bool:
        """True if cube response has weighted count data."""
        return self.weighted_counts is not None

    @lazyproperty
    def means(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of the cube_means if the measure exists."""
        means = self._measures.means
        if means is None:
            return None
        return means.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def missing(self) -> int:
        """Get missing count of a cube."""
        return self._measures.missing_count

    @lazyproperty
    def name(self) -> Optional[str]:
        """Name of the first dimension of the cube, None for a cube with no dimensions."""
        if not self.dimensions:
            return None
        return self.dimensions[0].name

    @lazyproperty
    def ndim(self) -> int:
        """int count of dimensions for this cube."""
        return len(self.dimensions)

    @lazyproperty
    def n_responses(self) -> int:
        """Total (int) number of responses considered, 0 if not in the response."""
        return self._cube_response["result"].get("n", 0)

    @lazyproperty
    def overlaps(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of cube_overlaps if the measure exists.

        The array has as many dimensions as there are defined in the cube query, plus
        the extra subvariables dimension as the last dimension.
        """
        overlaps = self._measures.overlaps
        if overlaps is None:
            return None
        return overlaps.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def partitions(self) -> Tuple[CubePartition, ...]:
        """Sequence of _Slice, _Strand, or _Nub objects from this cube-result."""
        return tuple(
            CubePartition.factory(
                self,
                slice_idx=slice_idx,
                transforms=self._transforms_dict,
                population=self._population,
                ca_as_0th=self._ca_as_0th,
                mask_size=self._mask_size,
            )
            for slice_idx in self._slice_idxs
        )

    @lazyproperty
    def population_fraction(self) -> float:
        """The filtered/unfiltered ratio for cube response.

        This value is required for properly calculating population on a cube
        where a filter has been applied. Returns 1.0 for an unfiltered cube.
        Returns `np.nan` if the unfiltered count is zero, which would
        otherwise result in a divide-by-zero error.
        """
        return self._measures.population_fraction

    @lazyproperty
    def stddev(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of the cube_stddev if the measure exists."""
        stddev = self._measures.stddev
        if stddev is None:
            return None
        return stddev.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def sums(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of the cube_sum if the measure exists."""
        sums = self._measures.sums
        if sums is None:
            return None
        return sums.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def title(self) -> str:
        """str alternate-name given to cube-result, "Untitled" when absent.

        This value is suitable for naming a Strand when displayed as a column. In this
        use-case it is a stand-in for the columns-dimension name since a strand has no
        columns dimension.
        """
        return self._cube_dict["result"].get("title", "Untitled")

    @lazyproperty
    def unweighted_counts(self) -> np.ndarray:
        """ndarray of unweighted counts, valid elements only.

        Unweighted counts are drawn from the `result.counts` field of the cube result.
        These counts are always present, even when the measure is numeric and there are
        no count measures. These counts are always unweighted, regardless of whether the
        cube is "weighted".

        In case of presence of valid counts in the cube response the counts are replaced
        with the valid counts measure.
        """
        measures = self._measures
        unweighted_counts = (
            measures.unweighted_valid_counts
            if measures.unweighted_valid_counts is not None
            else measures.unweighted_counts
        )
        return unweighted_counts.raw_cube_array[self._valid_idxs]

    @lazyproperty
    def unweighted_valid_counts(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of unweighted_valid_counts if the measure exists."""
        valid_counts = self._measures.unweighted_valid_counts
        if valid_counts is None:
            return None
        return valid_counts.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def valid_counts_summary_range(self) -> Optional[Tuple[float, float]]:
        """Optional (min, max) tuple of summary valid counts.

        None when the cube has no unweighted valid-counts measure.
        """
        valid_counts = self._measures.unweighted_valid_counts
        if valid_counts is None:
            return None
        # the axis where we have to sum the valid counts are all the nonarray dimensions
        # of the cube.
        axis = tuple(
            i
            for i, dim_type in enumerate(self.dimension_types)
            if dim_type not in DT.ARRAY_TYPES
        )
        valid_counts_summary = np.sum(
            valid_counts.raw_cube_array[self._valid_idxs], axis=axis
        )
        return np.min(valid_counts_summary), np.max(valid_counts_summary)

    @lazyproperty
    def valid_overlaps(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of cube_valid_overlaps if the measure exists.

        The array has as many dimensions as there are defined in the cube query, plus
        the extra subvariables dimension as the last dimension.
        """
        valid_overlaps = self._measures.valid_overlaps
        if valid_overlaps is None:
            return None  # pragma: no cover
        return valid_overlaps.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def weighted_counts(self) -> Optional[np.ndarray]:
        """Optional ndarray of weighted counts, valid elements only.

        In case of presence of valid counts in the cube response the weighted counts
        are replaced with the valid counts measure. None when neither measure is
        available.
        """
        measures = self._measures
        weighted_counts = (
            measures.weighted_valid_counts
            if measures.weighted_valid_counts is not None
            else measures.weighted_counts
        )
        if weighted_counts is None:
            return None
        return weighted_counts.raw_cube_array[self._valid_idxs]

    @lazyproperty
    def weighted_valid_counts(self) -> Optional[np.ndarray]:
        """Optional float64 ndarray of weighted_valid_counts if the measure exists."""
        valid_counts = self._measures.weighted_valid_counts
        if valid_counts is None:
            return None
        return valid_counts.raw_cube_array[self._valid_idxs].astype(np.float64)

    @lazyproperty
    def _all_dimensions(self) -> list:
        """List of all dimensions (not just user-apparent ones) for this cube."""
        return Dimensions.from_dicts(self._cube_dict["result"]["dimensions"])

    @lazyproperty
    def _available_numeric_measures(self) -> Tuple[CUBE_MEASURE, ...]:
        """tuple of available numeric measures expressed in the cube_response.

        Basically the numeric measures are the intersection between all the measures
        within the cube response and the defined NUMERIC_CUBE_MEASURES.
        """
        return tuple(self.available_measures.intersection(NUMERIC_CUBE_MEASURES))

    @lazyproperty
    def _ca_as_0th(self) -> bool:
        """True if slicing is to be performed in so-called "CA-as-0th" mode.

        In this mode, a categorical-array (CA) cube (2D) is sliced into a sequence of 1D
        slices, each of which represents one subvariable of the CA variable. Normally,
        a 2D cube-result becomes a single slice.
        """
        return (
            (self._cube_idx_arg == 0 or self._is_single_filter_col_cube)
            and len(self.dimension_types) > 0
            and self.dimension_types[0] == DT.CA
        )

    @lazyproperty
    def _cube_dict(self) -> Dict:
        """dict containing a private deep copy of the parsed cube response.

        When the numeric measure carries subvariables (numeric array), a rows dimension
        describing those subvariables is inserted as the first dimension.
        """
        cube_dict = copy.deepcopy(self._cube_response)
        if self._numeric_measure_subvariables:
            dimensions = cube_dict.get("result", {}).get("dimensions", [])
            # ---dim inflation---
            # ---In case of numeric arrays, we need to inflate the row dimension
            # ---according to the mean subvariables. For each subvar the row dimension
            # ---will have a new element related to the subvar metadata.
            dimensions.insert(0, self._numeric_array_dimension)
        return cube_dict

    @lazyproperty
    def _cube_response(self) -> Dict:
        """dict representing the parsed cube response arguments.

        Raises `TypeError` when the response argument is neither a dict nor something
        `json.loads()` accepts.
        """
        try:
            response = self._cube_response_arg
            # ---parse JSON to a dict when constructed with JSON---
            cube_response = (
                response if isinstance(response, dict) else json.loads(response)
            )
            # ---cube is 'value' item in a shoji response---
            return cube_response.get("value", cube_response)
        except TypeError:
            raise TypeError(
                f"Unsupported type <{type(self._cube_response_arg).__name__}> provided."
                " Cube response must be JSON (str) or dict."
            )

    @lazyproperty
    def _is_single_filter_col_cube(self) -> bool:
        """Value of the `is_single_col_cube` flag of the result, False when absent."""
        return self._cube_dict["result"].get("is_single_col_cube", False)

    @lazyproperty
    def _measures(self) -> "_Measures":
        """_Measures object for this cube.

        Provides access to count based measures and numeric measures (e.g. mean, sum)
        when available.
        """
        return _Measures(self._cube_dict, self._all_dimensions, self._cube_idx_arg)

    @lazyproperty
    def _numeric_measure_references(self) -> Dict:
        """Dict of references metadata of the first available numeric measure.

        Empty dict when the cube has no numeric measure or the metadata is absent.
        """
        if not self._available_numeric_measures:
            return {}
        cube_measures = self._cube_response.get("result", {}).get("measures", {})
        measure_key = self._available_numeric_measures[0].value
        metadata = cube_measures.get(measure_key, {}).get("metadata", {})
        return metadata.get("references", {})

    @lazyproperty
    def _numeric_measure_subvariables(self) -> List[str]:
        """List of subvariables of the first numeric measure, typically numeric arrays.

        Empty list when the cube has no numeric measure or the metadata is absent.
        """
        if not self._available_numeric_measures:
            return []
        cube_measures = self._cube_response.get("result", {}).get("measures", {})
        measure_key = self._available_numeric_measures[0].value
        metadata = cube_measures.get(measure_key, {}).get("metadata", {})
        return metadata.get("type", {}).get("subvariables", [])

    @lazyproperty
    def _numeric_array_dimension(self) -> Optional[Dict]:
        """Rows dimension object according to the numeric-measure subvariables.

        None when the numeric measure has no subvariables.
        """
        subvariables = self._numeric_measure_subvariables
        if not subvariables:
            return None
        references = self._numeric_measure_references
        subrefs = references.get("subreferences", [])
        # ---In case of numeric arrays the row dimension should contain additional
        # ---information related to the subreferences for each subvariable of the
        # ---array: each element carries the alias and the name of the matching
        # ---subreference, or None for both when there are no subreferences.
        elements = [
            {
                "id": i,
                "value": {
                    "references": {
                        "alias": subrefs[i].get("alias") if subrefs else None,
                        "name": subrefs[i].get("name") if subrefs else None,
                    },
                    "id": subvariable,
                },
            }
            for i, subvariable in enumerate(subvariables)
        ]
        return {
            "references": {
                "alias": references.get("alias"),
                "name": references.get("name"),
            },
            "type": {
                "elements": elements,
                "class": "enum",
                "subtype": {"class": "num_arr"},
            },
        }

    @lazyproperty
    def _slice_idxs(self) -> range:
        """Iterable of contiguous int indices for slices to be produced.

        This value is to help cube-section construction which does not by itself know
        how many slices are in a cube-result.
        """
        # ---fewer than three dimensions makes a single slice, unless CA-as-0th---
        if self.ndim < 3 and not self._ca_as_0th:
            return range(1)
        return range(len(self.dimensions[0].valid_elements))

    @lazyproperty
    def _valid_idxs(self) -> Tuple[np.ndarray, ...]:
        """Tuple of int64 ndarrays of the valid elements idx for each dimension."""
        valid_idxs = np.ix_(
            *tuple(d.valid_elements.element_idxs for d in self._all_dimensions)
        )
        # The dimension order can change in case of numeric array variable on
        # the row, and so valid indices needs to be returned in an ordered way.
        return tuple(valid_idxs[i] for i in self._all_dimensions.dimension_order)
class _Measures:
    """Provides access to measures contained in cube response.

    Each measure property builds the matching measure object from the same
    `cube_dict`, `all_dimensions` and `cube_idx_arg` this object was constructed with.
    Optional measures are None when their raw array is not available.
    """

    def __init__(
        self,
        cube_dict: Dict,
        all_dimensions: Dimensions,
        cube_idx_arg: Optional[int] = None,
    ):
        self._cube_dict = cube_dict
        self._all_dimensions = all_dimensions
        self._cube_idx_arg = cube_idx_arg

    @lazyproperty
    def covariance(self) -> "Optional[_CovarianceMeasure]":
        """Optional _CovarianceMeasure object providing access to covariance values.

        Will be None if covariance is not available in the cube response.
        """
        covariance = _CovarianceMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if covariance.raw_cube_array is None else covariance

    @lazyproperty
    def means(self) -> "Optional[_MeanMeasure]":
        """Optional _MeanMeasure object providing access to means values.

        Will be None if no means are available on the counts.
        """
        mean = _MeanMeasure(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
        return None if mean.raw_cube_array is None else mean

    @lazyproperty
    def missing_count(self) -> int:
        """numeric representing count of missing rows in cube response.

        Taken from the unweighted valid counts when present, else from the means
        measure when present, else from the `missing` field of the result (0 when that
        field is absent).
        """
        if self.unweighted_valid_counts is not None:
            return self.unweighted_valid_counts.missing_count
        # The check on the means measure is needed for retro-compatibility with the old
        # fixtures that don't have valid_counts.
        if self.means is not None:
            return self.means.missing_count
        return self._cube_dict["result"].get("missing", 0)

    @lazyproperty
    def overlaps(self) -> "Optional[_OverlapMeasure]":
        """Optional _OverlapMeasure object providing access to overlaps values.

        Will be None if no overlaps are available on the cube result.
        """
        overlap = _OverlapMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if overlap.raw_cube_array is None else overlap

    @lazyproperty
    def population_fraction(self) -> float:
        """The filtered/unfiltered ratio for cube response.

        The filtered counts are calculated for complete-cases. This means that only the
        non-missing entries are included in the filtered counts. Complete cases are
        used only if the corresponding cases are included in the cube response. If not,
        the old-style default calculation is used.

        This value is required for properly calculating population on a cube
        where a filter has been applied. Returns 1.0 for an unfiltered cube.
        Returns `np.nan` if the unfiltered count is zero, which would
        otherwise result in a divide-by-zero error.
        """
        result = self._cube_dict["result"]
        # Try and get the new-style complete-cases filtered counts
        filter_stats = result.get("filter_stats", {})
        weighted_filtered_complete = filter_stats.get("filtered_complete", {}).get(
            "weighted"
        )
        if weighted_filtered_complete:
            # ---If the filter consists of a single categorical date variable, we need
            # ---to treat the population fraction as 1, as it's constant across dates
            if filter_stats.get("is_cat_date"):
                return 1.0
            # ---If new format is present in response json, use that for pop fraction
            numerator = weighted_filtered_complete["selected"]
            denominator = numerator + weighted_filtered_complete["other"]
        else:
            # ---If new format is not available, default to old-style calculation
            numerator = result.get("filtered", {}).get("weighted_n")
            denominator = result.get("unfiltered", {}).get("weighted_n")

        try:
            return numerator / denominator
        except ZeroDivisionError:
            return np.nan
        except Exception:
            # ---deliberate best-effort: a response without usable filter counts
            # ---(e.g. either value is None) is treated as unfiltered---
            return 1.0

    @lazyproperty
    def stddev(self) -> "Optional[_StdDevMeasure]":
        """_StdDevMeasure object providing access to cube stddev values.

        None when the cube response does not contain a stddev measure.
        """
        stddev = _StdDevMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if stddev.raw_cube_array is None else stddev

    @lazyproperty
    def sums(self) -> "Optional[_SumMeasure]":
        """_SumMeasure object providing access to cube sum values.

        None when the cube response does not contain a sum measure.
        """
        sums = _SumMeasure(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
        return None if sums.raw_cube_array is None else sums

    @lazyproperty
    def unweighted_counts(self) -> "_UnweightedCountMeasure":
        """_UnweightedCountMeasure object for this cube.

        This object provides access to unweighted counts for this cube,
        whether or not the cube contains weighted counts.
        """
        return _UnweightedCountMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )

    @lazyproperty
    def unweighted_valid_counts(self) -> "Optional[_UnweightedValidCountsMeasure]":
        """Optional _UnweightedValidCountsMeasure object for this cube.

        Can be None when cube doesn't have unweighted valid counts.
        """
        valid_counts = _UnweightedValidCountsMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if valid_counts.raw_cube_array is None else valid_counts

    @lazyproperty
    def valid_overlaps(self) -> "Optional[_ValidOverlapMeasure]":
        """Optional _ValidOverlapMeasure object providing access to valid overlaps vals.

        Will be None if no valid overlaps are available on the cube result.
        """
        overlap = _ValidOverlapMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if overlap.raw_cube_array is None else overlap

    @lazyproperty
    def weighted_counts(self) -> "Optional[_WeightedCountMeasure]":
        """Optional _WeightedCountMeasure object for this cube.

        Can be None when the cube is unweighted.
        """
        weighted_counts = _WeightedCountMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if weighted_counts.raw_cube_array is None else weighted_counts

    @lazyproperty
    def weighted_valid_counts(self) -> "Optional[_WeightedValidCountsMeasure]":
        """Optional _WeightedValidCountsMeasure object for this cube.

        Can be None when cube doesn't have weighted valid counts.
        """
        valid_counts = _WeightedValidCountsMeasure(
            self._cube_dict, self._all_dimensions, self._cube_idx_arg
        )
        return None if valid_counts.raw_cube_array is None else valid_counts
class _BaseMeasure:
    """Base class for measure objects.

    A subclass provides `._flat_values`, the 1D values of its measure, and may override
    `._shape` when its measure does not have the shape of the cube dimensions.
    """

    def __init__(
        self,
        cube_dict: Dict,
        all_dimensions: Dimensions,
        cube_idx_arg: Optional[int] = None,
    ):
        self._cube_dict = cube_dict
        self._all_dimensions = all_dimensions
        self._cube_idx_arg = cube_idx_arg

    @lazyproperty
    def raw_cube_array(self) -> Optional[np.ndarray]:
        """Optional read-only ndarray of measure values from cube-response.

        The shape of the ndarray mirrors the shape of the (raw) cube
        response. Specifically, it includes values for missing elements, any
        MR_CAT dimensions, and any prunable rows and columns. Returns None
        if the measure is not available in cube, or when the number of values does
        not match the expected shape.
        """
        flat_values = self._flat_values
        if flat_values is None:
            return None
        shape = self._shape
        # ---in case it's impossible to reshape, return None---
        if len(flat_values) != np.prod(shape):
            return None
        raw_cube_array = flat_values.reshape(shape)
        # ---must be read-only to avoid hard-to-find bugs---
        raw_cube_array.flags.writeable = False
        return raw_cube_array

    @lazyproperty
    def _flat_values(self):  # pragma: no cover
        """Return ndarray of np.float64 values as found in cube response.

        This property must be implemented by each subclass.
        """
        raise NotImplementedError("must be implemented by each subclass")

    @lazyproperty
    def _shape(self) -> Tuple[int, ...]:
        """tuple(int) representing the shape of the raw-cube measure array.

        If needed, this property can be overridden, to accommodate different measure
        shapes even if the basic cube has the same original shape.
        """
        return self._all_dimensions.shape
class _CovarianceMeasure(_BaseMeasure):
    """Covariance values from a cube-response."""

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional 1D np.ndarray of np.float64 cov values as found in cube response.

        Covariance data may include missing items represented by a dict like
        {'?': -1} in the cube response. These are replaced by np.nan in the
        returned value. None when the response has no covariance measure.
        """
        payload = self._measure_payload
        if payload is None:
            return None
        return np.array(
            [np.nan if isinstance(x, dict) else x for x in payload["data"]],
            dtype=np.float64,
        ).flatten()

    @lazyproperty
    def _measure_payload(self) -> Optional[Dict]:
        """Optional dict, the covariance measure part of the cube response."""
        return self._cube_dict["result"].get("measures", {}).get("covariance")

    @lazyproperty
    def _numeric_measure_subvariables(self) -> List[str]:
        """List of subvariables, typically for numeric arrays.

        Only meaningful when the covariance payload is present; `.raw_cube_array`
        returns before reaching here when it is not.
        """
        metadata = self._measure_payload.get("metadata", {})
        return metadata.get("type", {}).get("subvariables", [])

    @lazyproperty
    def _shape(self) -> Tuple[int, ...]:
        """tuple(int) representing the shape of the covariance.

        This is the shape of the cube dimensions followed by one extra dimension, the
        length of which is the number of subvariables of the measure.
        """
        return self._all_dimensions.shape + (len(self._numeric_measure_subvariables),)
class _MeanMeasure(_BaseMeasure):
    """Statistical mean values from a cube-response."""

    @lazyproperty
    def missing_count(self) -> int:
        """Numeric value representing count of missing rows in response.

        Read from the `n_missing` field of the mean measure, 0 when that field is
        absent. The mean measure itself must be present in the response.
        """
        return self._cube_dict["result"]["measures"]["mean"].get("n_missing", 0)

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional 1D np.ndarray of np.float64 mean values as found in cube response.

        Mean data may include missing items represented by a dict like
        {'?': -1} in the cube response. These are replaced by np.nan in the
        returned value. None when the response has no mean measure.
        """
        payload = self._cube_dict["result"].get("measures", {}).get("mean")
        if payload is None:
            return None
        return np.array(
            [np.nan if isinstance(x, dict) else x for x in payload["data"]],
            dtype=np.float64,
        ).flatten()
class _OverlapMeasure(_BaseMeasure):
    """Overlap values from a cube-response."""

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional 1D np.ndarray of np.float64 overlap values as found in cube response.

        Overlap data may include missing items represented by a dict like
        {'?': -1} in the cube response. These are replaced by np.nan in the
        returned value. The value is None when the measure payload is absent.
        """
        payload = self._measure_payload
        if payload is None:
            return None
        return np.array(
            # isinstance() rather than an exact type comparison, so a dict
            # subclass used as a missing-value marker is also mapped to nan.
            tuple(np.nan if isinstance(x, dict) else x for x in payload["data"]),
            dtype=np.float64,
        ).flatten()

    @lazyproperty
    def _measure_payload(self) -> Optional[Dict]:
        """Optional dict representing the overlaps measure part of the cube response.

        None when the response has no "overlap" entry under "measures".
        """
        return self._cube_dict["result"].get("measures", {}).get("overlap")

    @lazyproperty
    def _shape(self) -> Tuple[int, ...]:
        """tuple(int) representing shape of the overlaps measure.

        The overlaps measure is characteristic in that it produces an additional
        dimension for the Multiple Response subvariables. That dimension is always
        found at the end of the result shape, because of how responses are
        generated by the backend (ZZ9) mechanism.
        """
        n_subvars = len(self._measure_payload["metadata"]["type"]["subvariables"])
        return self._all_dimensions.shape + (n_subvars,)
class _StdDevMeasure(_BaseMeasure):
    """Statistical stddev values from a cube-response."""

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional 1D float64 ndarray of stddev values as found in cube response.

        StdDev data may include missing items represented by a dict like
        {'?': -1} in the cube response. These are replaced by np.nan in the
        returned value. The value is None when the cube response contains no
        stddev measure.
        """
        measure_payload = self._cube_dict["result"].get("measures", {}).get("stddev")
        if measure_payload is None:
            return None
        return np.array(
            # isinstance() rather than an exact type comparison, so a dict
            # subclass used as a missing-value marker is also mapped to nan.
            tuple(
                np.nan if isinstance(x, dict) else x for x in measure_payload["data"]
            ),
            dtype=np.float64,
        ).flatten()
class _SumMeasure(_BaseMeasure):
    """Statistical sum values from a cube-response."""

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional 1D float64 ndarray of sum values as found in cube response.

        Sum data may include missing items represented by a dict like
        {'?': -1} in the cube response. These are replaced by np.nan in the
        returned value. The value is None when the cube response contains no
        sum measure.
        """
        measure_payload = self._cube_dict["result"].get("measures", {}).get("sum")
        if measure_payload is None:
            return None
        return np.array(
            # isinstance() rather than an exact type comparison, so a dict
            # subclass used as a missing-value marker is also mapped to nan.
            tuple(
                np.nan if isinstance(x, dict) else x for x in measure_payload["data"]
            ),
            dtype=np.float64,
        ).flatten()
class _UnweightedCountMeasure(_BaseMeasure):
    """Measure exposing the unweighted counts of a cube-response."""

    @lazyproperty
    def _flat_values(self) -> np.ndarray:
        """np.float64 ndarray built from the "counts" item of the cube result.

        float64 is chosen over an integer dtype to avoid int overflow bugs and
        so that nan can be represented.
        """
        raw_counts = self._cube_dict["result"]["counts"]
        return np.array(raw_counts, dtype=np.float64)
class _UnweightedValidCountsMeasure(_BaseMeasure):
    """Measure exposing the unweighted valid counts of a cube-response."""

    @lazyproperty
    def missing_count(self) -> int:
        """Count of missing rows reported by the unweighted valid-count measure.

        Taken from the measure's "n_missing" item, or 0 when that item is absent.
        """
        measure = self._cube_dict["result"]["measures"]["valid_count_unweighted"]
        return measure.get("n_missing", 0)

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional np.float64 ndarray of unweighted valid counts.

        None when the measure is absent from the response or its data is empty.
        """
        measures = self._cube_dict["result"]["measures"]
        data = measures.get("valid_count_unweighted", {}).get("data", [])
        if not data:
            return None
        return np.array(data, dtype=np.float64)
class _ValidOverlapMeasure(_OverlapMeasure):
    """Overlap measure backed by the "valid_overlap" part of a cube-response."""

    @lazyproperty
    def _measure_payload(self) -> Dict:
        """The "valid_overlap" item of the response measures.

        The lookup uses `.get()`, so the result is None when the response has no
        such item.
        """
        measures = self._cube_dict["result"].get("measures", {})
        return measures.get("valid_overlap")
class _WeightedCountMeasure(_BaseMeasure):
    """Measure exposing the weighted counts of a cube-response."""

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional np.float64 ndarray of counts after weighting.

        None when the response carries no "count" measure data, or when that
        data compares equal to the unweighted "counts" of the result.
        """
        result = self._cube_dict["result"]
        plain_counts = result["counts"]
        count_data = result["measures"].get("count", {}).get("data")
        # Operand order is kept as-is so the comparison is evaluated exactly as
        # before.
        if plain_counts == count_data or count_data is None:
            return None
        return np.array(count_data, dtype=np.float64)
class _WeightedValidCountsMeasure(_BaseMeasure):
    """Measure exposing the weighted valid counts of a cube-response."""

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional np.float64 ndarray of weighted valid counts.

        None when the measure is absent from the response or its data is empty.
        """
        measures = self._cube_dict["result"]["measures"]
        data = measures.get("valid_count_weighted", {}).get("data", [])
        if not data:
            return None
        return np.array(data, dtype=np.float64)