Source code for joseki.profiles.schema

"""Dataset schema for atmosphere thermophysical profiles.

The dataset schema defines the variables, coordinates and attributes that are
expected in a dataset representing an atmosphere thermophysical profile.
"""

import logging
import typing as t

import numpy as np
import numpy.typing as npt
import pint
import xarray as xr
from attrs import define

from .util import number_density, utcnow
from .._version import __version__
from ..units import ureg

# Logger named after this module; used by the schema validation and
# conversion routines below for debug/info messages.
logger = logging.getLogger(__name__)


def history() -> str:
    """Build a history entry describing the dataset formatting.

    Returns:
        A sentence starting with the value returned by `utcnow()` and
        ending with the joseki version string.
    """
    timestamp = utcnow()
    return f"{timestamp} dataset formatting by joseki version {__version__}."
def mole_fraction_sum(ds: xr.Dataset) -> pint.Quantity:
    """Add up every mole fraction data variable of a dataset.

    Mole fraction variables are the data variables whose name starts with
    ``x_``.

    Args:
        ds: Dataset.

    Returns:
        The sum of mole fractions, as a dimensionless quantity.

    Note:
        The dataset must hold at least one ``x_*`` data variable; otherwise
        the built-in ``sum`` returns the integer ``0``, which has no
        ``values`` attribute.
    """
    mole_fractions = (
        ds[name] for name in ds.data_vars if name.startswith("x_")
    )
    total = sum(mole_fractions)
    magnitude = total.values
    return magnitude * ureg.dimensionless
@define(frozen=True)
class Schema:
    """Dataset schema for atmosphere thermophysical profiles.

    The schema is described by three class-level tables:

    * ``data_vars``: required data variables (``p``, ``t`` and ``n``);
    * ``coords``: required coordinates (``z``);
    * ``attrs``: required dataset attributes and their expected types.

    In addition, a compliant dataset holds at least one mole fraction data
    variable, i.e. a data variable whose name starts with ``x_``.
    """

    # name: (dims, data type, units, standard name)
    # The units string is the conversion target in `convert` and the
    # reference for the dimensionality check in `validate`.
    data_vars = {
        "p": (
            ["z"],
            npt.NDArray[np.float64],
            "Pa",
            "air_pressure",
        ),
        "t": (
            ["z"],
            npt.NDArray[np.float64],
            "K",
            "air_temperature",
        ),
        "n": (
            ["z"],
            npt.NDArray[np.float64],
            "m ** -3",
            "air_number_density",
        ),
    }

    # name: (dims, data type, units, standard name)
    coords = {
        "z": ("z", npt.NDArray[np.float64], "km", "altitude"),
    }

    # name: expected Python type of the attribute value
    attrs = {
        "Conventions": str,
        "title": str,
        "institution": str,
        "source": str,
        "history": str,
        "references": str,
        "url": str,
        "urldate": str,
    }

    @staticmethod
    def _parse_units(ds: xr.Dataset, name: str) -> pint.Quantity:
        """Parse the units metadata of ``ds[name]`` with the unit registry.

        Args:
            ds: Dataset holding the variable or coordinate.
            name: Name of the variable or coordinate.

        Raises:
            ValueError: If ``ds[name]`` carries no units metadata.

        Returns:
            The result of calling the unit registry on the units string.
        """
        # getattr with a default performs the same lookup as
        # ``ds[name].units`` but reports a missing attribute as None instead
        # of letting an AttributeError escape from `validate`.
        units = getattr(ds[name], "units", None)
        if units is None:
            raise ValueError(f"missing units for {name}")
        return ureg(units)

    def validate(
        self,
        ds: xr.Dataset,
        check_x_sum: bool = False,
        ret_true_if_valid: bool = False,
    ) -> t.Optional[bool]:
        """Validate dataset.

        The checks run in a fixed order and the first failing check raises.

        Args:
            ds: Dataset to validate.
            check_x_sum: if True, check that mole fraction sums are never
                larger than one.
            ret_true_if_valid: make this method return True if the dataset is
                valid. Note that if the dataset is not valid, this method
                will raise an exception.

        Raises:
            ValueError: If the dataset does not match the schema. This
                includes variables or coordinates whose ``units`` or
                ``standard_name`` metadata is missing.

        Returns:
            None or bool: If `ret_true_if_valid` is True, returns True if the
                dataset is valid, otherwise returns None.
        """
        logger.debug("Validating dataset")

        logger.debug("Checking that all data variables are present")
        for var in self.data_vars:
            if var not in ds.data_vars:
                raise ValueError(
                    f"missing data variable: {var}"
                )  # pragma: no cover

        logger.debug("Checking that 'x_*' data variable(s) are present")
        if not any(name.startswith("x_") for name in ds.data_vars):
            raise ValueError(
                "missing data variable starting with x_"
            )  # pragma: no cover

        logger.debug("Checking that all coordinates are present")
        for coord in self.coords:
            if coord not in ds.coords:
                raise ValueError(
                    f"missing coordinate: {coord}"
                )  # pragma: no cover

        logger.debug("Checking that all attributes are present")
        for attr in self.attrs:
            if attr not in ds.attrs:
                raise ValueError(
                    f"missing attribute: {attr}"
                )  # pragma: no cover

        logger.debug("Checking that data variables have the correct dimensions")
        for var, (dims, _, _, _) in self.data_vars.items():
            # Sets are compared so that the order of dimensions is ignored.
            if set(ds[var].dims) != set(dims):
                raise ValueError(  # pragma: no cover
                    f"incorrect dimensions for {var}. Expected {dims}, "
                    f"got {ds[var].dims}"
                )

        logger.debug("Checking that coordinates have the correct dimensions")
        for coord, (dims, _, _, _) in self.coords.items():
            if set(ds[coord].dims) != set(dims):
                raise ValueError(  # pragma: no cover
                    f"incorrect dimensions for {coord}. Expected {dims}, "
                    f"got {ds[coord].dims}"
                )

        logger.debug(
            "Checking that data variables have the correct dimensionality"
        )
        for var, (_, _, dimensionality, _) in self.data_vars.items():
            units = self._parse_units(ds, var)
            if not units.check(dimensionality):
                raise ValueError(  # pragma: no cover
                    f"incorrect units for {var}. Expected {dimensionality}, "
                    f"got {units.dimensionality}"
                )

        logger.debug("Checking that coordinates have the correct dimensionality")
        for coord, (_, _, dimensionality, _) in self.coords.items():
            units = self._parse_units(ds, coord)
            if not units.check(dimensionality):
                raise ValueError(  # pragma: no cover
                    f"incorrect units for {coord}. Expected {dimensionality}, "
                    f"got {units.dimensionality}"
                )

        logger.debug("Checking that attributes have the correct types")
        for attr, typ in self.attrs.items():
            if not isinstance(ds.attrs[attr], typ):
                raise ValueError(  # pragma: no cover
                    f"incorrect type for {attr}. Expected {typ}, "
                    f"got {type(ds.attrs[attr])}"
                )

        logger.debug(
            "Checking that data variables have the correct standard names"
        )
        for var, (_, _, _, standard_name) in self.data_vars.items():
            # .get() makes a missing standard name fail this check with a
            # ValueError ("got None") rather than an unrelated KeyError.
            actual = ds[var].attrs.get("standard_name")
            if actual != standard_name:
                raise ValueError(  # pragma: no cover
                    f"incorrect standard name for {var}. Expected "
                    f"{standard_name}, got "
                    f"{actual}"
                )

        logger.debug(
            "Checking that all x_* data variables have the correct "
            "dimensionality and standard names"
        )
        for var in ds.data_vars:
            if not var.startswith("x_"):
                continue
            # The part after the "x_" prefix names the species.
            m = var[2:]
            units = self._parse_units(ds, var)
            if not units.check("[]"):
                raise ValueError(  # pragma: no cover
                    f"incorrect dimensionality for {var}. Expected "
                    f"dimensionless, got {units.dimensionality}"
                )
            actual = ds[var].attrs.get("standard_name")
            if actual != f"{m}_mole_fraction":
                raise ValueError(  # pragma: no cover
                    f"incorrect standard name for {var}. Expected "
                    f"{m}_mole_fraction, got "
                    f"{actual}"
                )

        if check_x_sum:
            logger.debug(
                "Checking that mole fraction sums are never larger than one"
            )
            vfs = mole_fraction_sum(ds)
            if np.any(vfs.m > 1):
                raise ValueError(  # pragma: no cover
                    "The rescaling factors lead to a profile where the mole "
                    "fraction sum is larger than 1."
                )

        logger.info("Dataset is valid")

        if ret_true_if_valid:  # pragma: no cover
            return True
        return None

    def convert(
        self,
        data_vars: t.Mapping[str, pint.Quantity],
        coords: t.Mapping[str, pint.Quantity],
        attrs: t.Mapping[str, str],
    ) -> xr.Dataset:
        """Convert input to schema-compliant dataset.

        The input mappings are left untouched: all intermediate results are
        stored in shallow copies.

        If ``data_vars`` has no ``"n"`` entry, the number density is computed
        from ``data_vars["p"]`` and ``data_vars["t"]`` with
        `number_density`.

        Args:
            data_vars: Mapping of data variable names to quantities.
            coords: Mapping of coordinate names to quantities.
            attrs: Mapping of attribute names to values.

        Raises:
            ValueError: If a required data variable, coordinate or attribute
                is missing, or if no data variable starts with ``x_``.

        Returns:
            Dataset with schema-compliant data variables, coordinates, and
            attributes.
        """
        logger.debug("converting input to schema-compliant dataset")

        # Shallow copies: the caller's mappings may be read-only and must not
        # receive the converted tuples or the computed number density.
        variables = dict(data_vars)
        coordinates = dict(coords)

        logger.debug("checking that all data variables are present")
        for var in self.data_vars:
            if var in variables:
                continue
            if var != "n":
                raise ValueError(
                    f"missing data variable: {var}"
                )  # pragma: no cover
            # "n" is the only optional input. It is listed after "p" and "t"
            # in the schema table, so both were checked by earlier iterations.
            variables["n"] = number_density(
                p=variables["p"],
                t=variables["t"],
            )

        logger.debug("checking that there is at least one x_ data variable")
        if not any(name.startswith("x_") for name in variables):
            raise ValueError(
                "missing data variable starting with x_"
            )  # pragma: no cover

        logger.debug("checking that all coordinates are present")
        for coord in self.coords:
            if coord not in coordinates:
                raise ValueError(
                    f"missing coordinate: {coord}"
                )  # pragma: no cover

        logger.debug("checking that all attributes are present")
        for attr in self.attrs:
            if attr not in attrs:
                raise ValueError(
                    f"missing attribute: {attr}"
                )  # pragma: no cover

        logger.debug("converting data variables to xarray data array tuples")
        for var, (dims, _, units, standard_name) in self.data_vars.items():
            variables[var] = (
                dims,
                variables[var].m_as(units),
                {
                    "standard_name": standard_name,
                    "long_name": standard_name.replace("_", " "),
                    "units": units,
                },
            )

        logger.debug("converting x_ data variables")
        # Only values of existing keys are replaced, so iterating over the
        # dictionary while assigning is safe.
        for var in variables:
            if var.startswith("x_"):
                m = var[2:]
                variables[var] = (
                    "z",
                    variables[var].m_as("dimensionless"),
                    {
                        "standard_name": f"{m}_mole_fraction",
                        "long_name": f"{m} mole fraction",
                        "units": "dimensionless",
                    },
                )

        logger.debug("converting coordinates")
        for coord, (_, _, units, standard_name) in self.coords.items():
            coordinates[coord] = (
                coord,
                coordinates[coord].m_as(units),
                {
                    "standard_name": standard_name,
                    "long_name": standard_name.replace("_", " "),
                    "units": units,
                },
            )

        logger.debug("creating dataset")
        return xr.Dataset(
            data_vars=variables,
            coords=coordinates,
            attrs=attrs,
        )
# Module-level Schema instance.
schema = Schema()