Source code for physical_validation.ensemble

###########################################################################
#                                                                         #
#    physical_validation,                                                 #
#    a python package to test the physical validity of MD results         #
#                                                                         #
#    Written by Pascal T. Merz <pascal.merz@me.com>                       #
#               Michael R. Shirts <michael.shirts@colorado.edu>           #
#                                                                         #
#    Copyright (c) 2017-2021 University of Colorado Boulder               #
#              (c) 2012      The University of Virginia                   #
#                                                                         #
###########################################################################
r"""
This software allows users to perform statistical test to determine if a
given molecular simulation is consistent with the thermodynamic ensemble
it is performed in.

Users should cite the JCTC paper: Shirts, M. R. "Simple Quantitative
Tests to Validate Sampling from Thermodynamic Ensembles",
J. Chem. Theory Comput., 2013, 9 (2), pp 909-926,
https://dx.doi.org/10.1021/ct300688p
"""
from typing import Dict, List, Optional

import numpy as np

from .data import SimulationData
from .util import ensemble
from .util import error as pv_error


def beta_warning() -> None:
    print(
        "###############################################################################"
    )
    print(
        "# WARNING: Support for muVT ensemble is an experimental feature under current #"
    )
    print(
        "#          development. You can help us to improve it by reporting errors     #"
    )
    print(
        "#          at https://github.com/shirtsgroup/physical_validation              #"
    )
    print(
        "#          Thank you!                                                         #"
    )
    print(
        "###############################################################################"
    )


[docs]def check( data_sim_one: SimulationData, data_sim_two: SimulationData, total_energy: bool = False, bootstrap_error: bool = False, bootstrap_repetitions: int = 200, bootstrap_seed: Optional[int] = None, screen: bool = False, filename: Optional[str] = None, verbosity: int = 1, data_is_uncorrelated: bool = False, ) -> List[float]: r""" Check the ensemble. The correct check is inferred from the simulation data given. Parameters ---------- data_sim_one Simulation data object of first simulation data_sim_two Simulation data object of second simulation differing in its state point from the first total_energy Whether to use the total energy for the calculation Default: False, use potential energy only bootstrap_error Calculate the standard error via bootstrap resampling Default: False bootstrap_repetitions Number of bootstrap repetitions drawn Default: 200 bootstrap_seed Sets the random number seed for bootstrapping. If set, bootstrapping will be reproducible. Default: None, bootstrapping is non-reproducible. screen Plot distributions on screen. Default: False. filename Plot distributions to `filename`. Default: `None`, no plotting to file. verbosity Level of verbosity, from 0 (quiet) to 3 (very verbose). Default: 1 data_is_uncorrelated Whether the provided data is uncorrelated. If this option is set, the equilibration, decorrelation and tail pruning of the trajectory is skipped. This can speed up the analysis, but note that if the provided data is correlated, the results of the physical validation checks might be invalid. Returns ------- quantiles The number of quantiles the computed result is off the analytical one. """ data_sim_one.raise_if_units_are_none( test_name="ensemble.check", argument_name="data_sim_one", ) data_sim_two.raise_if_units_are_none( test_name="ensemble.check", argument_name="data_sim_two", ) if not SimulationData.compatible(data_sim_one, data_sim_two): raise pv_error.InputError( ["data_sim_one", "data_sim_two"], "Simulation data not compatible." ) data_sim_one.raise_if_ensemble_is_invalid( test_name="ensemble.check", argument_name="data_sim_one", check_pressure=True, check_mu=True, ) data_sim_two.raise_if_ensemble_is_invalid( test_name="ensemble.check", argument_name="data_sim_two", check_pressure=True, check_mu=True, ) if data_sim_one.ensemble.ensemble != data_sim_two.ensemble.ensemble: raise pv_error.InputError( ["data_sim_one", "data_sim_two"], "The two simulations were sampling different ensembles. " "The simulations are expected to differ in state point " "(e.g. target temperature, target pressure), but not " "in their sampled ensemble (e.g. NVT, NPT).", ) sampled_ensemble = data_sim_one.ensemble.ensemble if not ( sampled_ensemble == "NVT" or sampled_ensemble == "NPT" or sampled_ensemble == "muVT" ): raise NotImplementedError( "Test of ensemble " + sampled_ensemble + " is not implemented (yet).", ) labels = { "E": "Total Energy", "U": "Potential Energy", "H": "Enthalpy", "V": "Volume", "N": "Number of Species", r"E - \sum \mu N": r"$E - \sum \mu N$", r"U - \sum \mu N": r"$U - \sum \mu N$", } energy_observable_name = "total_energy" if total_energy else "potential_energy" energy_observable_abbreviation = "E" if total_energy else "U" energy_units = data_sim_one.units.energy_str quantiles = None num_bins_for_analysis = 40 tail_cutoff = 0.001 # 0.1% if sampled_ensemble == "NVT": data_sim_one.raise_if_observable_data_is_invalid( required_observables=[energy_observable_name], test_name="ensemble.check", argument_name="data_sim_one", ) data_sim_two.raise_if_observable_data_is_invalid( required_observables=[energy_observable_name], test_name="ensemble.check", argument_name="data_sim_two", ) quantiles = ensemble.check_1d( traj1=data_sim_one.observables[energy_observable_name], traj2=data_sim_two.observables[energy_observable_name], param1=data_sim_one.ensemble.temperature, param2=data_sim_two.ensemble.temperature, kb=data_sim_one.units.kb, quantity=energy_observable_abbreviation, dtemp=True, dpress=False, dmu=False, temp=None, pvconvert=None, nbins=num_bins_for_analysis, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, xlabel=labels[energy_observable_abbreviation], xunit=energy_units, data_is_uncorrelated=data_is_uncorrelated, ) elif sampled_ensemble == "NPT": temperatures = np.array( [data_sim_one.ensemble.temperature, data_sim_two.ensemble.temperature] ) pressures = np.array( [data_sim_one.ensemble.pressure, data_sim_two.ensemble.pressure] ) equal_temps = temperatures[0] == temperatures[1] equal_press = pressures[0] == pressures[1] required_observables = ["volume"] if equal_press: required_observables.append(energy_observable_name) data_sim_one.raise_if_observable_data_is_invalid( required_observables=required_observables, test_name="ensemble.check", argument_name="data_sim_one", ) data_sim_two.raise_if_observable_data_is_invalid( required_observables=required_observables, test_name="ensemble.check", argument_name="data_sim_two", ) volume_units = data_sim_one.units.volume_str # Calculate conversion from p*V to energy units # # GROMACS standard units are # energy: kJ/mol # volume: nm^3 # pressure: bar # => pV-term: bar * nm^3 == 1e-25 kJ == 6.022140857e-2 kJ/mol # => pvconvert = 6.022140857e-2 # UnitData stores conversion factors relative to GROMACS units # energy: energy_conversion * kJ/mol # volume: volume_conversion * nm^3 # pressure: pressure_conversion * bar # => pV-term: [p]*[V] == pressure_conversion * volume_conversion bar * nm^3 # Units were checked earlier, so we can use either simulation data structure pvconvert = 6.022140857e-2 pvconvert *= ( data_sim_one.units.pressure_conversion * data_sim_one.units.volume_conversion ) pvconvert /= data_sim_one.units.energy_conversion if equal_press and not equal_temps: if energy_observable_abbreviation == "U": energy_observable_abbreviation = "H" quantiles = ensemble.check_1d( traj1=data_sim_one.observables[energy_observable_name] + pvconvert * pressures[0] * data_sim_one.observables.volume, traj2=data_sim_two.observables[energy_observable_name] + pvconvert * pressures[1] * data_sim_two.observables.volume, param1=temperatures[0], param2=temperatures[1], kb=data_sim_one.units.kb, quantity=energy_observable_abbreviation, dtemp=True, dpress=False, dmu=False, temp=None, pvconvert=None, nbins=num_bins_for_analysis, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, xlabel=labels[energy_observable_abbreviation], xunit=energy_units, data_is_uncorrelated=data_is_uncorrelated, ) elif equal_temps and not equal_press: quantiles = ensemble.check_1d( traj1=data_sim_one.observables.volume, traj2=data_sim_two.observables.volume, param1=pressures[0], param2=pressures[1], kb=data_sim_one.units.kb, quantity="V", dtemp=False, dpress=True, dmu=False, nbins=num_bins_for_analysis, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, temp=temperatures[0], pvconvert=pvconvert, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, xlabel=labels["V"], xunit=volume_units, data_is_uncorrelated=data_is_uncorrelated, ) else: param1 = np.array([temperatures[0], pressures[0]]) param2 = np.array([temperatures[1], pressures[1]]) quantiles = ensemble.check_2d( traj1=np.array( [ data_sim_one.observables[energy_observable_name], data_sim_one.observables.volume, ] ), traj2=np.array( [ data_sim_two.observables[energy_observable_name], data_sim_two.observables.volume, ] ), param1=param1, param2=param2, kb=data_sim_one.units.kb, pvconvert=pvconvert, quantity=[energy_observable_abbreviation, "V"], dtempdpress=True, dtempdmu=False, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, data_is_uncorrelated=data_is_uncorrelated, ) elif sampled_ensemble == "muVT": beta_warning() equal_temps = ( data_sim_one.ensemble.temperature == data_sim_two.ensemble.temperature ) num_differing_chemical_potentials = np.count_nonzero( data_sim_one.ensemble.mu != data_sim_two.ensemble.mu ) num_identical_chemical_potentials = np.count_nonzero( data_sim_one.ensemble.mu == data_sim_two.ensemble.mu ) if num_differing_chemical_potentials > 1: raise NotImplementedError( "For muVT, ensemble checking is only implemented for chemical potentials " "differing for at most one species." ) if not equal_temps and num_differing_chemical_potentials == 0: # quantity = U - S_i mu_i*N_i # parameter = T energy_observable_abbreviation += r" - \sum \mu N" quantiles = ensemble.check_1d( traj1=data_sim_one.observables[energy_observable_name] - ensemble.chemical_potential_energy( data_sim_one.ensemble.mu, data_sim_one.observables.number_of_species ), traj2=data_sim_two.observables[energy_observable_name] - ensemble.chemical_potential_energy( data_sim_two.ensemble.mu, data_sim_two.observables.number_of_species ), param1=data_sim_one.ensemble.temperature, param2=data_sim_two.ensemble.temperature, kb=data_sim_one.units.kb, quantity=energy_observable_abbreviation, dtemp=True, dpress=False, dmu=False, temp=None, pvconvert=None, nbins=num_bins_for_analysis, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, xlabel=labels[energy_observable_abbreviation], xunit=energy_units, data_is_uncorrelated=data_is_uncorrelated, ) if equal_temps and num_differing_chemical_potentials == 1: # quantity = N_i # parameter = mu_i idx = np.flatnonzero(data_sim_one.ensemble.mu != data_sim_two.ensemble.mu)[ 0 ] quantiles = ensemble.check_1d( traj1=data_sim_one.observables.number_of_species[:, idx], traj2=data_sim_two.observables.number_of_species[:, idx], param1=data_sim_one.ensemble.mu[idx], param2=data_sim_two.ensemble.mu[idx], kb=data_sim_one.units.kb, quantity="N", dtemp=False, dpress=False, dmu=True, nbins=num_bins_for_analysis, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, temp=data_sim_one.ensemble.temperature, pvconvert=None, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, xlabel=labels["N"], xunit=None, data_is_uncorrelated=data_is_uncorrelated, ) if not equal_temps and num_differing_chemical_potentials == 1: # quantity = U - S_i!=j mu_i*N_i | mu_j*N_j # parameter = beta, mu_j energy_observable_one = data_sim_one.observables[energy_observable_name] energy_observable_two = data_sim_two.observables[energy_observable_name] if num_identical_chemical_potentials > 0: idx_identical = np.flatnonzero( data_sim_one.ensemble.mu == data_sim_two.ensemble.mu ) energy_observable_one -= ensemble.chemical_potential_energy( data_sim_one.ensemble.mu[idx_identical], data_sim_one.observables.number_of_species[:, idx_identical], ) energy_observable_two -= ensemble.chemical_potential_energy( data_sim_two.ensemble.mu[idx_identical], data_sim_two.observables.number_of_species[:, idx_identical], ) energy_observable_abbreviation += r" - \sum mu*N" idx_differing = np.flatnonzero( data_sim_one.ensemble.mu != data_sim_two.ensemble.mu )[0] param1 = np.array( [ data_sim_one.ensemble.temperature, data_sim_one.ensemble.mu[idx_differing], ] ) param2 = np.array( [ data_sim_two.ensemble.temperature, data_sim_two.ensemble.mu[idx_differing], ] ) quantiles = ensemble.check_2d( traj1=np.array( [ energy_observable_one, data_sim_one.observables.number_of_species[:, idx_differing], ] ), traj2=np.array( [ energy_observable_two, data_sim_two.observables.number_of_species[:, idx_differing], ] ), param1=param1, param2=param2, kb=data_sim_one.units.kb, pvconvert=None, quantity=[energy_observable_abbreviation, "N"], dtempdpress=False, dtempdmu=True, cutoff=tail_cutoff, bootstrap_seed=bootstrap_seed, bootstrap_error=bootstrap_error, bootstrap_repetitions=bootstrap_repetitions, verbosity=verbosity, filename=filename, screen=screen, data_is_uncorrelated=data_is_uncorrelated, ) return quantiles
[docs]def estimate_interval( data: SimulationData, verbosity: int = 1, total_energy: bool = False, data_is_uncorrelated: bool = False, ) -> Dict[str, float]: r""" In order to perform an ensemble check, two simulations at distinct state point are needed. Choosing two state points too far apart will result in poor or zero overlap between the distributions, leading to very noisy results (due to sample errors in the tails) or a breakdown of the method, respectively. Choosing two state points very close to each others, on the other hand, makes it difficult to distinguish the slope from statistical error in the samples. This function implements a rule of thumb based on the standard deviations of distributions. It takes a single simulation and suggests appropriate intervals for a second simulation to be used for ensemble checking. Parameters ---------- data The performed simulation. verbosity If 0, no output is printed on screen. If 1, estimated intervals are printed. If larger, additional information during calculation are printed. Default: 1 total_energy Use total energy instead of potential energy only. Default: False data_is_uncorrelated Whether the provided data is uncorrelated. If this option is set, the equilibration, decorrelation and tail pruning of the trajectory is skipped. This can speed up the analysis, but note that if the provided data is correlated, the results of the physical validation checks might be invalid. Default: False Returns ------- intervals If `data` was performed under NVT conditions, `intervals` contains only one entry: * `'dT'`, containing the suggested temperature interval. If `data` was performed under NPT conditions, `intervals` contains three entries: * `'dT'`: Suggested temperature interval at constant pressure * `'dP'`: Suggested pressure interval at constant temperature * `'dTdP'`: Suggested combined temperature and pressure interval """ data.raise_if_units_are_none( test_name="ensemble.estimate_interval", argument_name="data", ) data.raise_if_ensemble_is_invalid( test_name="ensemble.estimate_interval", argument_name="data", check_pressure=True, check_mu=True, ) if not ( data.ensemble.ensemble == "NVT" or data.ensemble.ensemble == "NPT" or data.ensemble.ensemble == "muVT" ): raise NotImplementedError( "estimate_interval() not implemented for ensemble " + data.ensemble.ensemble ) energy_observable_name = "total_energy" if total_energy else "potential_energy" required_observables = [energy_observable_name] if data.ensemble.ensemble == "NPT": required_observables.append("volume") if data.ensemble.ensemble == "muVT": required_observables.append("number_of_species") data.raise_if_observable_data_is_invalid( required_observables=required_observables, test_name="ensemble.estimate_interval", argument_name="data", ) tail_cutoff = 0.001 # 0.1% if data.ensemble.ensemble == "NVT": result = ensemble.estimate_interval( ens_string="NVT", ens_temp=data.ensemble.temperature, energy=data.observables[energy_observable_name], kb=data.units.kb, ens_press=None, volume=None, pvconvert=None, ens_mu=None, species_number=None, verbosity=verbosity, cutoff=tail_cutoff, tunit=data.units.temperature_str, punit="", munit="", data_is_uncorrelated=data_is_uncorrelated, ) elif data.ensemble.ensemble == "NPT": pvconvert = 6.022140857e-2 pvconvert *= data.units.pressure_conversion * data.units.volume_conversion pvconvert /= data.units.energy_conversion result = ensemble.estimate_interval( ens_string="NPT", ens_temp=data.ensemble.temperature, energy=data.observables[energy_observable_name], kb=data.units.kb, ens_press=data.ensemble.pressure, volume=data.observables.volume, pvconvert=pvconvert, ens_mu=None, species_number=None, verbosity=verbosity, cutoff=tail_cutoff, tunit=data.units.temperature_str, punit=data.units.pressure_str, munit="", data_is_uncorrelated=data_is_uncorrelated, ) else: beta_warning() result = ensemble.estimate_interval( ens_string="muVT", ens_temp=data.ensemble.temperature, energy=data.observables[energy_observable_name], kb=data.units.kb, ens_press=None, volume=None, pvconvert=None, ens_mu=data.ensemble.mu, species_number=data.observables.number_of_species, verbosity=verbosity, cutoff=tail_cutoff, tunit=data.units.temperature_str, punit="", munit=data.units.energy_str, data_is_uncorrelated=data_is_uncorrelated, ) return result