Source code for physical_validation.util.trajectory

###########################################################################
#                                                                         #
#    physical_validation,                                                 #
#    a python package to test the physical validity of MD results         #
#                                                                         #
#    Written by Pascal T. Merz <pascal.merz@me.com>                       #
#               Michael R. Shirts <michael.shirts@colorado.edu>           #
#                                                                         #
#    Copyright (c) 2017-2021 University of Colorado Boulder               #
#              (c) 2012      The University of Virginia                   #
#                                                                         #
###########################################################################

import warnings
from typing import Iterator, Optional, Tuple

import numpy as np

try:
    # pymbar >= 4
    from pymbar.timeseries import (
        detect_equilibration,
        statistical_inefficiency,
        subsample_correlated_data,
    )
except ImportError:
    # pymbar < 4
    from pymbar.timeseries import detectEquilibration as detect_equilibration
    from pymbar.timeseries import statisticalInefficiency as statistical_inefficiency
    from pymbar.timeseries import subsampleCorrelatedData as subsample_correlated_data

from scipy import stats

from . import error as pv_error


[docs]def equilibrate(traj: np.ndarray) -> np.ndarray:
    traj = np.array(traj)
    if traj.ndim == 1:
        t0, g, n_eff = detect_equilibration(traj)
        if t0 == 0 and traj.size > 10:
            # See https://github.com/choderalab/pymbar/issues/277
            t0x, gx, n_effx = detect_equilibration(traj[10:])
            if t0x != 0:
                t0 = t0x + 10
        res = traj[t0:]

    elif traj.ndim == 2 and traj.shape[0] == 2:
        t01, g1, n_eff1 = detect_equilibration(traj[0])
        t02, g2, n_eff2 = detect_equilibration(traj[1])
        t0 = max(t01, t02)
        if t0 == 0 and traj.shape[1] > 10:
            # See https://github.com/choderalab/pymbar/issues/277
            t01x, g1x, n_eff1x = detect_equilibration(traj[0, 10:])
            t02x, g2x, n_eff2x = detect_equilibration(traj[1, 10:])
            t0x = max(t01x, t02x)
            if t0x != 0:
                t0 = t0x + 10
        res = traj[:, t0:]
    elif traj.ndim == 2:
        raise NotImplementedError(
            "trajectory.equilibrate() in 2 dimensions is only "
            "implemented for exactly two timeseries."
        )
    else:
        raise NotImplementedError(
            "trajectory.equilibrate() is not implemented for "
            "trajectories with more than 2 dimensions."
        )

    return res


[docs]def decorrelate(traj: np.ndarray) -> np.ndarray:
    traj = np.array(traj)
    if traj.ndim == 1:
        idx = subsample_correlated_data(traj)
        res = traj[idx]
    elif traj.ndim == 2:
        # pymbar doesn't offer to decorrelate two samples, so let's do it ourselves
        # and just use the decorrelation of the sample more strongly correlated
        #
        # calculate (maximal) inefficiency
        g1 = statistical_inefficiency(traj[0])
        g2 = statistical_inefficiency(traj[1])
        g = np.max([g1, g2])
        # calculate index
        n0 = traj.shape[1]
        idx = np.unique(
            np.array(np.round(np.arange(0, int(n0 / g + 0.5)) * g), dtype=int)
        )
        idx = idx[idx < n0]
        res = traj[:, idx]
    else:
        raise NotImplementedError(
            "trajectory.decorrelate() is not implemented for "
            "trajectories with more than 1 dimension."
        )

    return res


[docs]def cut_tails(traj: np.ndarray, cut: float) -> np.ndarray:
    traj = np.array(traj)
    dc = 100 * cut
    if traj.ndim == 1:
        with warnings.catch_warnings():
            # With some combination of python version / scipy version,
            # scoreatpercentile throws a warning
            warnings.filterwarnings("ignore", category=FutureWarning)
            tmax = stats.scoreatpercentile(traj, 100 - dc)
            tmin = stats.scoreatpercentile(traj, dc)
        t = traj[(tmin <= traj) * (traj <= tmax)]
    elif traj.ndim == 2:
        with warnings.catch_warnings():
            # With some combination of python version / scipy version,
            # scoreatpercentile throws a warning
            warnings.filterwarnings("ignore", category=FutureWarning)
            tmax = stats.scoreatpercentile(traj, 100 - dc, axis=1)
            tmin = stats.scoreatpercentile(traj, dc, axis=1)
        t = traj[
            :,
            (tmin[0] <= traj[0])
            * (tmin[1] <= traj[1])
            * (tmax[0] >= traj[0])
            * (tmax[1] >= traj[1]),
        ]
    else:
        raise NotImplementedError(
            "trajectory.cut_tails() is not implemented for "
            "trajectories with more than 2 dimension."
        )

    return t


[docs]def prepare(
    traj: np.ndarray,
    cut: Optional[float] = None,
    verbosity: int = 1,
    name: Optional[str] = None,
    skip_preparation: bool = False,
) -> np.ndarray:
    traj = np.array(traj)
    if not name:
        name = "Traectory"

    def traj_length(t: np.ndarray) -> int:
        if t.ndim == 1:
            return t.size
        else:
            return t.shape[1]

    if skip_preparation:
        if verbosity > 0:
            print(
                "Equilibration, decorrelation and tail pruning was skipped on user "
                "request. Note that if the provided trajectory is statistically "
                "correlated, the results of the physical validation checks might "
                "be invalid."
            )
        return traj

    if traj.ndim > 2:
        raise NotImplementedError(
            "trajectory.prepare() is not implemented for "
            "trajectories with more than 2 dimensions."
        )

    # original length
    n0 = traj_length(traj)
    # equilibrate
    res = equilibrate(traj)
    n1 = traj_length(res)
    if verbosity > 2:
        print(
            "{:s} equilibration: First {:d} frames ({:.1%} of "
            "trajectory) discarded for burn-in.".format(name, n0 - n1, (n0 - n1) / n0)
        )
    # decorrelate
    res = decorrelate(res)
    n2 = traj_length(res)
    if verbosity > 2:
        print(
            "{:s} decorrelation: {:d} frames ({:.1%} of equilibrated "
            "trajectory) discarded for decorrelation.".format(
                name, n1 - n2, (n1 - n2) / n1
            )
        )
    # cut tails
    if cut is not None:
        res = cut_tails(res, cut)
        n3 = traj_length(res)
        if verbosity > 2:
            print(
                "{:s} tails (cut = {:.2%}): {:n} frames ({:.2%} of equilibrated and "
                "decorrelated trajectory) were cut".format(
                    name, cut, n2 - n3, (n2 - n3) / n2
                )
            )
    # end length
    nn = traj_length(res)

    if verbosity > 0:
        print(
            "After equilibration, decorrelation and tail pruning, {:.2%} ({:n} frames) "
            "of original {:s} remain.".format(nn / n0, nn, name)
        )

    return res


[docs]def overlap(
    traj1: np.ndarray, traj2: np.ndarray, cut=None
) -> Tuple[np.ndarray, np.ndarray, Optional[float], Optional[float]]:
    traj1 = np.array(traj1)
    traj2 = np.array(traj2)
    if traj1.ndim == traj2.ndim and traj2.ndim == 1:
        if cut:
            dc = 100 * cut
            max1 = stats.scoreatpercentile(traj1, 100 - dc)
            min1 = stats.scoreatpercentile(traj1, dc)
            max2 = stats.scoreatpercentile(traj2, 100 - dc)
            min2 = stats.scoreatpercentile(traj2, dc)
        else:
            max1 = np.max(traj1)
            min1 = np.min(traj1)
            max2 = np.max(traj2)
            min2 = np.min(traj2)

        tmin = max(min1, min2)
        tmax = min(max1, max2)

        t1 = traj1[(tmin <= traj1) * (traj1 <= tmax)]
        t2 = traj2[(tmin <= traj2) * (traj2 <= tmax)]
    elif traj1.ndim == traj2.ndim and traj2.ndim == 2:
        if traj1.shape[0] != 2 or traj2.shape[0] != 2:
            raise NotImplementedError(
                "trajectory.overlap() in 2 dimensions is only "
                "implemented for exactly two timeseries per trajectory."
            )
        if cut:
            dc = 100 * cut
            max1 = stats.scoreatpercentile(traj1, 100 - dc, axis=1)
            min1 = stats.scoreatpercentile(traj1, dc, axis=1)
            max2 = stats.scoreatpercentile(traj2, 100 - dc, axis=1)
            min2 = stats.scoreatpercentile(traj2, dc, axis=1)
        else:
            max1 = np.max(traj1, axis=1)
            min1 = np.min(traj1, axis=1)
            max2 = np.max(traj2, axis=1)
            min2 = np.min(traj2, axis=1)

        tmin = np.max([min1, min2], axis=0)
        tmax = np.min([max1, max2], axis=0)

        t1 = traj1[
            :,
            (tmin[0] <= traj1[0])
            * (tmin[1] <= traj1[1])
            * (tmax[0] >= traj1[0])
            * (tmax[1] >= traj1[1]),
        ]
        t2 = traj2[
            :,
            (tmin[0] <= traj2[0])
            * (tmin[1] <= traj2[1])
            * (tmax[0] >= traj2[0])
            * (tmax[1] >= traj2[1]),
        ]
    elif traj1.ndim != traj2.ndim:
        raise pv_error.InputError(
            ["traj1", "traj2"], "Trajectories don't have the same number of dimensions"
        )
    else:
        raise NotImplementedError(
            "trajectory.overlap() is not implemented for "
            "trajectories with more than 2 dimensions."
        )

    if np.any(max1 < min2) or np.any(max2 < min1):
        return np.array([]), np.array([]), None, None

    return t1, t2, tmin, tmax


[docs]def bootstrap(traj: np.ndarray, n_samples: int) -> Iterator[np.ndarray]:
    traj = np.array(traj)
    if traj.ndim == 1:
        n_traj = traj.size
    elif traj.ndim == 2:
        n_traj = traj.shape[1]
    else:
        raise NotImplementedError(
            "trajectory.bootstrap() is not implemented for "
            "trajectories with more than 2 dimensions."
        )
    for _ in range(n_samples):
        resample_idx = np.floor(np.random.rand(n_traj) * n_traj).astype(int)
        if traj.ndim == 1:
            yield traj[resample_idx]
        else:
            yield traj[:, resample_idx]