Source code for mics.samples

"""
.. module:: samples
   :platform: Unix, Windows
   :synopsis: a module for defining the class :class:`sample`.

.. moduleauthor:: Charlles R. A. Abreu <abreu@eq.ufrj.br>

"""

import numpy as np
from pymbar import timeseries

import mics
from mics.funcs import deltaMethod
from mics.funcs import func
from mics.utils import covariance
from mics.utils import info
from mics.utils import multimap
from mics.utils import propertyDict
from mics.utils import stdError


[docs]class sample: """ A sample of configurations distributed according to a :term:`PDF` proportional to ``exp(-u(x))``. Each configuration ``x`` is represented by a set of collective variables from which one can evaluate the reduced potential ``u(x)``, as well as other properties of interest. Parameters ---------- dataset : pandas.DataFrame A data frame whose column names are collective variables used to represent the sampled comfigurations. The rows must contain a time series of these variables, obtained by simulating the system at a state with known reduced potential. potential : str A mathematical expression defining the reduced potential of the simulated state. This must be a function of the column names in `dataset` and can also depend on external parameters passed as keyword arguments (see below). acfun : str, optional, default=potential A mathematical expression defining a property to be used for :term:`OBM` autocorrelation analysis and effective sample size calculation. It must be a function of the column names in `dataset` and can also depend on external parameters passed as keyword arguments (see below). batchsize : int, optional, default=sqrt(len(dataset)) The size of each batch (window) to be used in the :term:`OBM` analysis. If omitted, then the batch size will be the integer part of the square root of the sample size. **constants : keyword arguments A set of keyword arguments passed as name=value, aimed to define external parameter values for the evaluation of the mathematical expressions in `potential` and `acfun`. They can also be used as labels to distinguish samples from each other, in this case not necessary being present in the mentioned expressions. """ def __init__(self, dataset, potential, acfun=None, batchsize=None, **constants): names = dataset.columns.tolist() n = len(dataset) b = self.b = batchsize if batchsize else int(np.sqrt(n)) if mics.verbose: info("\n=== Setting up new sample ===") info("Properties:", ", ".join(names)) info("Constants:", constants) info("Reduced potential function:", potential) info("Autocorrelation analysis function:", acfun if acfun else potential) info("Sample size:", n) info("Batch size:", b) self.dataset = dataset self.potential = func(potential, names, constants) self.acfun = self.potential if acfun is None else func(acfun, names, constants) y = multimap([self.acfun.lambdify()], dataset) ym = np.mean(y, axis=1) S1 = covariance(y, ym, 1).item(0) Sb = covariance(y, ym, b).item(0) if not (np.isfinite(S1) and np.isfinite(Sb)): raise FloatingPointError("unable to determine effective sample size") self.neff = n*S1/Sb if mics.verbose: info("Variance disregarding autocorrelation:", S1) info("Variance via Overlapping Batch Means:", Sb) info("Effective sample size:", self.neff) def __add__(self, other): return mics.pooledsample([self]) + other def __radd__(self, other): return mics.pooledsample(other) + self
[docs] def subsampling(self, integratedACF=True): """ Performs inline subsampling based on the statistical inefficiency ``g`` of the specified attribute `acfun` of :class:`sample`, aiming at obtaining a sample of :term:`IID` configurations. Subsampling is done via jumps of varying sizes around ``g``, so that the sample size decays by a factor of approximately ``1/g``. Parameters ---------- integratedACF : bool, optional, default=True If true, the integrated :term:`ACF` method :cite:`Chodera_2007` will be used for computing the statistical inefficiency. Otherwise, the :term:`OBM` method will be used instead. Returns ------- :class:`sample` Although the subsampling is done inline, the new sample is returned for chaining purposes. """ n = len(self.dataset) if mics.verbose: info("\n=== Subsampling via %s ===" % ("integrated ACF" if integratedACF else "OBM")) info("Original sample size:", n) if integratedACF: y = multimap([self.acfun.lambdify()], self.dataset) g = timeseries.statisticalInefficiency(y[0]) else: g = n/self.neff new = timeseries.subsampleCorrelatedData(self.dataset.index, g) self.dataset = self.dataset.reindex(new) self.neff = len(new) if mics.verbose: info("Statistical inefficiency:", g) info("New sample size:", self.neff) return self
[docs] def averaging(self, properties, combinations={}, **constants): """ Computes averages and uncertainties of configurational properties. In addition, computes combinations among these averages while automatically handling uncertainty propagation. Parameters ---------- properties : dict(str: str) A dictionary associating names to mathematical expressions. This is used to define functions of the collective variables included in the samples. Then, averages of these functions will be evaluated at all sampled states, along with their uncertainties. The expressions might also depend on parameters passed as keyword arguments (see below). combinations : dict(str: str), optional, default={} A dictionary associating names to mathematical expressions. This is used to define functions of the names passed as keys in the `properties` dictionary. The expressions might also depend on parameters passed as keyword arguments (see below). **constants : optional keyword arguments A set of arguments passed as ``name=value``, used to define parameter values for evaluating the mathematical expressions in both `properties` and `combinations`. Returns ------- pandas.DataFrame A data frame containing the computed averages and combinations, as well as their estimated standard errors. """ variables = self.dataset.columns.tolist() functions = [func(f, variables, constants).lambdify() for f in properties.values()] y = multimap(functions, self.dataset) ym = np.mean(y, axis=1) Theta = covariance(y, ym, self.b) result = propertyDict(properties.keys(), ym, stdError(Theta)) if combinations: delta = deltaMethod(combinations.values(), properties.keys(), constants) (h, dh) = delta.evaluate(ym, Theta) result.update(propertyDict(combinations.keys(), h, dh)) return result.to_frame(0)