Source code for physical_validation.ensemble

###########################################################################
#                                                                         #
#    physical_validation,                                                 #
#    a python package to test the physical validity of MD results         #
#                                                                         #
#    Written by Pascal T. Merz <pascal.merz@me.com>                       #
#               Michael R. Shirts <michael.shirts@colorado.edu>           #
#                                                                         #
#    Copyright (c) 2017-2021 University of Colorado Boulder               #
#              (c) 2012      The University of Virginia                   #
#                                                                         #
###########################################################################
r"""
This software allows users to perform statistical test to determine if a
given molecular simulation is consistent with the thermodynamic ensemble
it is performed in.

Users should cite the JCTC paper: Shirts, M. R. "Simple Quantitative
Tests to Validate Sampling from Thermodynamic Ensembles",
J. Chem. Theory Comput., 2013, 9 (2), pp 909-926,
https://dx.doi.org/10.1021/ct300688p
"""
from typing import Dict, List, Optional

import numpy as np

from .data import SimulationData
from .util import ensemble
from .util import error as pv_error


def beta_warning() -> None:
    print(
        "###############################################################################"
    )
    print(
        "# WARNING: Support for muVT ensemble is an experimental feature under current #"
    )
    print(
        "#          development. You can help us to improve it by reporting errors     #"
    )
    print(
        "#          at https://github.com/shirtsgroup/physical_validation              #"
    )
    print(
        "#          Thank you!                                                         #"
    )
    print(
        "###############################################################################"
    )


[docs]def check(
    data_sim_one: SimulationData,
    data_sim_two: SimulationData,
    total_energy: bool = False,
    bootstrap_error: bool = False,
    bootstrap_repetitions: int = 200,
    bootstrap_seed: Optional[int] = None,
    screen: bool = False,
    filename: Optional[str] = None,
    verbosity: int = 1,
    data_is_uncorrelated: bool = False,
) -> List[float]:
    r"""
    Check the ensemble. The correct check is inferred from the
    simulation data given.

    Parameters
    ----------
    data_sim_one
        Simulation data object of first simulation
    data_sim_two
        Simulation data object of second simulation differing in
        its state point from the first
    total_energy
        Whether to use the total energy for the calculation
        Default: False, use potential energy only
    bootstrap_error
        Calculate the standard error via bootstrap resampling
        Default: False
    bootstrap_repetitions
        Number of bootstrap repetitions drawn
        Default: 200
    bootstrap_seed
        Sets the random number seed for bootstrapping.
        If set, bootstrapping will be reproducible.
        Default: None, bootstrapping is non-reproducible.
    screen
        Plot distributions on screen. Default: False.
    filename
        Plot distributions to `filename`.
        Default: `None`, no plotting to file.
    verbosity
        Level of verbosity, from 0 (quiet) to 3 (very verbose).
        Default: 1
    data_is_uncorrelated
        Whether the provided data is uncorrelated. If this option
        is set, the equilibration, decorrelation and tail pruning
        of the trajectory is skipped. This can speed up the analysis,
        but note that if the provided data is correlated, the results
        of the physical validation checks might be invalid.

    Returns
    -------
    quantiles
        The number of quantiles the computed result is off the analytical one.

    """
    data_sim_one.raise_if_units_are_none(
        test_name="ensemble.check",
        argument_name="data_sim_one",
    )
    data_sim_two.raise_if_units_are_none(
        test_name="ensemble.check",
        argument_name="data_sim_two",
    )
    if not SimulationData.compatible(data_sim_one, data_sim_two):
        raise pv_error.InputError(
            ["data_sim_one", "data_sim_two"], "Simulation data not compatible."
        )
    data_sim_one.raise_if_ensemble_is_invalid(
        test_name="ensemble.check",
        argument_name="data_sim_one",
        check_pressure=True,
        check_mu=True,
    )
    data_sim_two.raise_if_ensemble_is_invalid(
        test_name="ensemble.check",
        argument_name="data_sim_two",
        check_pressure=True,
        check_mu=True,
    )

    if data_sim_one.ensemble.ensemble != data_sim_two.ensemble.ensemble:
        raise pv_error.InputError(
            ["data_sim_one", "data_sim_two"],
            "The two simulations were sampling different ensembles. "
            "The simulations are expected to differ in state point "
            "(e.g. target temperature, target pressure), but not "
            "in their sampled ensemble (e.g. NVT, NPT).",
        )

    sampled_ensemble = data_sim_one.ensemble.ensemble

    if not (
        sampled_ensemble == "NVT"
        or sampled_ensemble == "NPT"
        or sampled_ensemble == "muVT"
    ):
        raise NotImplementedError(
            "Test of ensemble " + sampled_ensemble + " is not implemented (yet).",
        )

    labels = {
        "E": "Total Energy",
        "U": "Potential Energy",
        "H": "Enthalpy",
        "V": "Volume",
        "N": "Number of Species",
        r"E - \sum \mu N": r"$E - \sum \mu N$",
        r"U - \sum \mu N": r"$U - \sum \mu N$",
    }

    energy_observable_name = "total_energy" if total_energy else "potential_energy"
    energy_observable_abbreviation = "E" if total_energy else "U"

    energy_units = data_sim_one.units.energy_str

    quantiles = None

    num_bins_for_analysis = 40
    tail_cutoff = 0.001  # 0.1%

    if sampled_ensemble == "NVT":
        data_sim_one.raise_if_observable_data_is_invalid(
            required_observables=[energy_observable_name],
            test_name="ensemble.check",
            argument_name="data_sim_one",
        )
        data_sim_two.raise_if_observable_data_is_invalid(
            required_observables=[energy_observable_name],
            test_name="ensemble.check",
            argument_name="data_sim_two",
        )
        quantiles = ensemble.check_1d(
            traj1=data_sim_one.observables[energy_observable_name],
            traj2=data_sim_two.observables[energy_observable_name],
            param1=data_sim_one.ensemble.temperature,
            param2=data_sim_two.ensemble.temperature,
            kb=data_sim_one.units.kb,
            quantity=energy_observable_abbreviation,
            dtemp=True,
            dpress=False,
            dmu=False,
            temp=None,
            pvconvert=None,
            nbins=num_bins_for_analysis,
            cutoff=tail_cutoff,
            bootstrap_seed=bootstrap_seed,
            bootstrap_error=bootstrap_error,
            bootstrap_repetitions=bootstrap_repetitions,
            verbosity=verbosity,
            filename=filename,
            screen=screen,
            xlabel=labels[energy_observable_abbreviation],
            xunit=energy_units,
            data_is_uncorrelated=data_is_uncorrelated,
        )

    elif sampled_ensemble == "NPT":
        temperatures = np.array(
            [data_sim_one.ensemble.temperature, data_sim_two.ensemble.temperature]
        )
        pressures = np.array(
            [data_sim_one.ensemble.pressure, data_sim_two.ensemble.pressure]
        )
        equal_temps = temperatures[0] == temperatures[1]
        equal_press = pressures[0] == pressures[1]

        required_observables = ["volume"]
        if equal_press:
            required_observables.append(energy_observable_name)

        data_sim_one.raise_if_observable_data_is_invalid(
            required_observables=required_observables,
            test_name="ensemble.check",
            argument_name="data_sim_one",
        )
        data_sim_two.raise_if_observable_data_is_invalid(
            required_observables=required_observables,
            test_name="ensemble.check",
            argument_name="data_sim_two",
        )

        volume_units = data_sim_one.units.volume_str

        # Calculate conversion from p*V to energy units
        #
        # GROMACS standard units are
        #   energy: kJ/mol
        #   volume: nm^3
        #   pressure: bar
        #   => pV-term: bar * nm^3 == 1e-25 kJ == 6.022140857e-2 kJ/mol
        #   => pvconvert = 6.022140857e-2
        # UnitData stores conversion factors relative to GROMACS units
        #   energy: energy_conversion * kJ/mol
        #   volume: volume_conversion * nm^3
        #   pressure: pressure_conversion * bar
        #   => pV-term: [p]*[V] == pressure_conversion * volume_conversion bar * nm^3
        # Units were checked earlier, so we can use either simulation data structure
        pvconvert = 6.022140857e-2
        pvconvert *= (
            data_sim_one.units.pressure_conversion
            * data_sim_one.units.volume_conversion
        )
        pvconvert /= data_sim_one.units.energy_conversion

        if equal_press and not equal_temps:
            if energy_observable_abbreviation == "U":
                energy_observable_abbreviation = "H"
            quantiles = ensemble.check_1d(
                traj1=data_sim_one.observables[energy_observable_name]
                + pvconvert * pressures[0] * data_sim_one.observables.volume,
                traj2=data_sim_two.observables[energy_observable_name]
                + pvconvert * pressures[1] * data_sim_two.observables.volume,
                param1=temperatures[0],
                param2=temperatures[1],
                kb=data_sim_one.units.kb,
                quantity=energy_observable_abbreviation,
                dtemp=True,
                dpress=False,
                dmu=False,
                temp=None,
                pvconvert=None,
                nbins=num_bins_for_analysis,
                cutoff=tail_cutoff,
                bootstrap_seed=bootstrap_seed,
                bootstrap_error=bootstrap_error,
                bootstrap_repetitions=bootstrap_repetitions,
                verbosity=verbosity,
                filename=filename,
                screen=screen,
                xlabel=labels[energy_observable_abbreviation],
                xunit=energy_units,
                data_is_uncorrelated=data_is_uncorrelated,
            )
        elif equal_temps and not equal_press:
            quantiles = ensemble.check_1d(
                traj1=data_sim_one.observables.volume,
                traj2=data_sim_two.observables.volume,
                param1=pressures[0],
                param2=pressures[1],
                kb=data_sim_one.units.kb,
                quantity="V",
                dtemp=False,
                dpress=True,
                dmu=False,
                nbins=num_bins_for_analysis,
                cutoff=tail_cutoff,
                bootstrap_seed=bootstrap_seed,
                temp=temperatures[0],
                pvconvert=pvconvert,
                bootstrap_error=bootstrap_error,
                bootstrap_repetitions=bootstrap_repetitions,
                verbosity=verbosity,
                filename=filename,
                screen=screen,
                xlabel=labels["V"],
                xunit=volume_units,
                data_is_uncorrelated=data_is_uncorrelated,
            )
        else:
            param1 = np.array([temperatures[0], pressures[0]])
            param2 = np.array([temperatures[1], pressures[1]])
            quantiles = ensemble.check_2d(
                traj1=np.array(
                    [
                        data_sim_one.observables[energy_observable_name],
                        data_sim_one.observables.volume,
                    ]
                ),
                traj2=np.array(
                    [
                        data_sim_two.observables[energy_observable_name],
                        data_sim_two.observables.volume,
                    ]
                ),
                param1=param1,
                param2=param2,
                kb=data_sim_one.units.kb,
                pvconvert=pvconvert,
                quantity=[energy_observable_abbreviation, "V"],
                dtempdpress=True,
                dtempdmu=False,
                cutoff=tail_cutoff,
                bootstrap_seed=bootstrap_seed,
                bootstrap_error=bootstrap_error,
                bootstrap_repetitions=bootstrap_repetitions,
                verbosity=verbosity,
                filename=filename,
                screen=screen,
                data_is_uncorrelated=data_is_uncorrelated,
            )

    elif sampled_ensemble == "muVT":
        beta_warning()

        equal_temps = (
            data_sim_one.ensemble.temperature == data_sim_two.ensemble.temperature
        )
        num_differing_chemical_potentials = np.count_nonzero(
            data_sim_one.ensemble.mu != data_sim_two.ensemble.mu
        )
        num_identical_chemical_potentials = np.count_nonzero(
            data_sim_one.ensemble.mu == data_sim_two.ensemble.mu
        )

        if num_differing_chemical_potentials > 1:
            raise NotImplementedError(
                "For muVT, ensemble checking is only implemented for chemical potentials "
                "differing for at most one species."
            )

        if not equal_temps and num_differing_chemical_potentials == 0:
            # quantity = U - S_i mu_i*N_i
            # parameter = T
            energy_observable_abbreviation += r" - \sum \mu N"
            quantiles = ensemble.check_1d(
                traj1=data_sim_one.observables[energy_observable_name]
                - ensemble.chemical_potential_energy(
                    data_sim_one.ensemble.mu, data_sim_one.observables.number_of_species
                ),
                traj2=data_sim_two.observables[energy_observable_name]
                - ensemble.chemical_potential_energy(
                    data_sim_two.ensemble.mu, data_sim_two.observables.number_of_species
                ),
                param1=data_sim_one.ensemble.temperature,
                param2=data_sim_two.ensemble.temperature,
                kb=data_sim_one.units.kb,
                quantity=energy_observable_abbreviation,
                dtemp=True,
                dpress=False,
                dmu=False,
                temp=None,
                pvconvert=None,
                nbins=num_bins_for_analysis,
                cutoff=tail_cutoff,
                bootstrap_seed=bootstrap_seed,
                bootstrap_error=bootstrap_error,
                bootstrap_repetitions=bootstrap_repetitions,
                verbosity=verbosity,
                filename=filename,
                screen=screen,
                xlabel=labels[energy_observable_abbreviation],
                xunit=energy_units,
                data_is_uncorrelated=data_is_uncorrelated,
            )

        if equal_temps and num_differing_chemical_potentials == 1:
            # quantity = N_i
            # parameter = mu_i
            idx = np.flatnonzero(data_sim_one.ensemble.mu != data_sim_two.ensemble.mu)[
                0
            ]
            quantiles = ensemble.check_1d(
                traj1=data_sim_one.observables.number_of_species[:, idx],
                traj2=data_sim_two.observables.number_of_species[:, idx],
                param1=data_sim_one.ensemble.mu[idx],
                param2=data_sim_two.ensemble.mu[idx],
                kb=data_sim_one.units.kb,
                quantity="N",
                dtemp=False,
                dpress=False,
                dmu=True,
                nbins=num_bins_for_analysis,
                cutoff=tail_cutoff,
                bootstrap_seed=bootstrap_seed,
                temp=data_sim_one.ensemble.temperature,
                pvconvert=None,
                bootstrap_error=bootstrap_error,
                bootstrap_repetitions=bootstrap_repetitions,
                verbosity=verbosity,
                filename=filename,
                screen=screen,
                xlabel=labels["N"],
                xunit=None,
                data_is_uncorrelated=data_is_uncorrelated,
            )

        if not equal_temps and num_differing_chemical_potentials == 1:
            # quantity = U - S_i!=j mu_i*N_i | mu_j*N_j
            # parameter = beta, mu_j
            energy_observable_one = data_sim_one.observables[energy_observable_name]
            energy_observable_two = data_sim_two.observables[energy_observable_name]
            if num_identical_chemical_potentials > 0:
                idx_identical = np.flatnonzero(
                    data_sim_one.ensemble.mu == data_sim_two.ensemble.mu
                )
                energy_observable_one -= ensemble.chemical_potential_energy(
                    data_sim_one.ensemble.mu[idx_identical],
                    data_sim_one.observables.number_of_species[:, idx_identical],
                )
                energy_observable_two -= ensemble.chemical_potential_energy(
                    data_sim_two.ensemble.mu[idx_identical],
                    data_sim_two.observables.number_of_species[:, idx_identical],
                )
                energy_observable_abbreviation += r" - \sum mu*N"

            idx_differing = np.flatnonzero(
                data_sim_one.ensemble.mu != data_sim_two.ensemble.mu
            )[0]

            param1 = np.array(
                [
                    data_sim_one.ensemble.temperature,
                    data_sim_one.ensemble.mu[idx_differing],
                ]
            )
            param2 = np.array(
                [
                    data_sim_two.ensemble.temperature,
                    data_sim_two.ensemble.mu[idx_differing],
                ]
            )
            quantiles = ensemble.check_2d(
                traj1=np.array(
                    [
                        energy_observable_one,
                        data_sim_one.observables.number_of_species[:, idx_differing],
                    ]
                ),
                traj2=np.array(
                    [
                        energy_observable_two,
                        data_sim_two.observables.number_of_species[:, idx_differing],
                    ]
                ),
                param1=param1,
                param2=param2,
                kb=data_sim_one.units.kb,
                pvconvert=None,
                quantity=[energy_observable_abbreviation, "N"],
                dtempdpress=False,
                dtempdmu=True,
                cutoff=tail_cutoff,
                bootstrap_seed=bootstrap_seed,
                bootstrap_error=bootstrap_error,
                bootstrap_repetitions=bootstrap_repetitions,
                verbosity=verbosity,
                filename=filename,
                screen=screen,
                data_is_uncorrelated=data_is_uncorrelated,
            )

    return quantiles


[docs]def estimate_interval(
    data: SimulationData,
    verbosity: int = 1,
    total_energy: bool = False,
    data_is_uncorrelated: bool = False,
) -> Dict[str, float]:
    r"""
    In order to perform an ensemble check, two simulations at distinct state
    point are needed. Choosing two state points too far apart will result
    in poor or zero overlap between the distributions, leading to very noisy
    results (due to sample errors in the tails) or a breakdown of the method,
    respectively. Choosing two state points very close to each others, on the
    other hand, makes it difficult to distinguish the slope from statistical
    error in the samples.

    This function implements a rule of thumb based on the standard deviations
    of distributions. It takes a single simulation and suggests appropriate
    intervals for a second simulation to be used for ensemble checking.

    Parameters
    ----------
    data
        The performed simulation.
    verbosity
        If 0, no output is printed on screen. If 1, estimated intervals are
        printed. If larger, additional information during calculation are
        printed.
        Default: 1
    total_energy
        Use total energy instead of potential energy only.
        Default: False
    data_is_uncorrelated
        Whether the provided data is uncorrelated. If this option
        is set, the equilibration, decorrelation and tail pruning
        of the trajectory is skipped. This can speed up the analysis,
        but note that if the provided data is correlated, the results
        of the physical validation checks might be invalid.
        Default: False

    Returns
    -------
    intervals
        If `data` was performed under NVT conditions, `intervals` contains only
        one entry:

            * `'dT'`, containing the suggested temperature interval.

        If `data` was performed under NPT conditions, `intervals` contains three
        entries:

            * `'dT'`: Suggested temperature interval at constant pressure
            * `'dP'`: Suggested pressure interval at constant temperature
            * `'dTdP'`: Suggested combined temperature and pressure interval

    """
    data.raise_if_units_are_none(
        test_name="ensemble.estimate_interval",
        argument_name="data",
    )
    data.raise_if_ensemble_is_invalid(
        test_name="ensemble.estimate_interval",
        argument_name="data",
        check_pressure=True,
        check_mu=True,
    )
    if not (
        data.ensemble.ensemble == "NVT"
        or data.ensemble.ensemble == "NPT"
        or data.ensemble.ensemble == "muVT"
    ):
        raise NotImplementedError(
            "estimate_interval() not implemented for ensemble " + data.ensemble.ensemble
        )

    energy_observable_name = "total_energy" if total_energy else "potential_energy"
    required_observables = [energy_observable_name]
    if data.ensemble.ensemble == "NPT":
        required_observables.append("volume")
    if data.ensemble.ensemble == "muVT":
        required_observables.append("number_of_species")
    data.raise_if_observable_data_is_invalid(
        required_observables=required_observables,
        test_name="ensemble.estimate_interval",
        argument_name="data",
    )

    tail_cutoff = 0.001  # 0.1%

    if data.ensemble.ensemble == "NVT":
        result = ensemble.estimate_interval(
            ens_string="NVT",
            ens_temp=data.ensemble.temperature,
            energy=data.observables[energy_observable_name],
            kb=data.units.kb,
            ens_press=None,
            volume=None,
            pvconvert=None,
            ens_mu=None,
            species_number=None,
            verbosity=verbosity,
            cutoff=tail_cutoff,
            tunit=data.units.temperature_str,
            punit="",
            munit="",
            data_is_uncorrelated=data_is_uncorrelated,
        )
    elif data.ensemble.ensemble == "NPT":
        pvconvert = 6.022140857e-2
        pvconvert *= data.units.pressure_conversion * data.units.volume_conversion
        pvconvert /= data.units.energy_conversion
        result = ensemble.estimate_interval(
            ens_string="NPT",
            ens_temp=data.ensemble.temperature,
            energy=data.observables[energy_observable_name],
            kb=data.units.kb,
            ens_press=data.ensemble.pressure,
            volume=data.observables.volume,
            pvconvert=pvconvert,
            ens_mu=None,
            species_number=None,
            verbosity=verbosity,
            cutoff=tail_cutoff,
            tunit=data.units.temperature_str,
            punit=data.units.pressure_str,
            munit="",
            data_is_uncorrelated=data_is_uncorrelated,
        )
    else:
        beta_warning()

        result = ensemble.estimate_interval(
            ens_string="muVT",
            ens_temp=data.ensemble.temperature,
            energy=data.observables[energy_observable_name],
            kb=data.units.kb,
            ens_press=None,
            volume=None,
            pvconvert=None,
            ens_mu=data.ensemble.mu,
            species_number=data.observables.number_of_species,
            verbosity=verbosity,
            cutoff=tail_cutoff,
            tunit=data.units.temperature_str,
            punit="",
            munit=data.units.energy_str,
            data_is_uncorrelated=data_is_uncorrelated,
        )

    return result