# Source code for mics.samples

"""
.. module:: samples
   :platform: Unix, Windows
   :synopsis: a module for defining the classes :class:`sample` and :class:`pool`.

.. moduleauthor:: Charlles R. A. Abreu <abreu@eq.ufrj.br>


"""

# TODO: save potential and autocorr as strings rather than lambda functions, so that
#       one can use pickle to save a sample or a mixture object.

from copy import deepcopy

import numpy as np
from pymbar import timeseries

from mics.utils import covariance
from mics.utils import genfunc
from mics.utils import info
from mics.utils import multimap


class sample:
    """
    A sample of configurations collected at a specific equilibrium state, aimed to be part
    of a mixture of independently collected samples (MICS).

        Args:
            dataset (pandas.DataFrame):
                a data frame whose rows represent configurations sampled according to a
                given probability distribution and whose columns contain a number of
                properties evaluated for such configurations.
            potential (function):
                the reduced potential that defines the equilibrium sample. This function
                might for instance receive **x** and return the result of an element-wise
                calculation involving **x["a"]**, **x["b"]**, etc, with **"a"**, **"b"**,
                etc being names of properties in **dataset**.
            autocorr (function, optional):
                a function similar to **potential**, but whose result is an autocorrelated
                property to be used for determining the effective sample size. If omitted,
                **potential** will be used for this purpose.
            label (optional):
                an identifier for the sample. It is stored as `str(label)`, so the default
                **None** is stored as the string `"None"`.
            batchsize (int, optional):
                the batch size used in the batch-based variance estimate of the
                autocorrelated property. If omitted (or zero), the integer part of the
                square root of the sample size is used.
            verbose (bool, optional):
                whether to report the setup steps and results through **info**.
            kwargs:
                additional keyword arguments, reported as constants and forwarded (together
                with the property names) to the helper that builds **potential** and
                **autocorr**.

        Raises:
            FloatingPointError:
                if the variance estimates are not finite or the batch-based variance is
                zero, so that the effective sample size cannot be determined.

        Note:
            Formally, functions **potential** and **autocorr** must receive **x** and
            return **y**, where `length(y) == nrow(x)`.

    """

    def __init__(self, dataset, potential, autocorr=None, label=None,
                 batchsize=None, verbose=False, **kwargs):

        if verbose:
            info("Setting up sample with label:", label)
            info("Reduced potential:", potential)
            info("Autocorrelated property:", (autocorr if autocorr else potential))
            info("Constants:", kwargs)

        names = list(dataset.columns)
        if verbose:
            info("Properties:", ", ".join(names))

        self.dataset = dataset
        self.potential = genfunc(potential, names, kwargs)
        self.label = str(label)
        n = self.n = dataset.shape[0]
        b = self.b = batchsize if batchsize else int(np.sqrt(n))

        if verbose:
            info("Sample size:", n)
            info("Batch size:", b)

        # Without an explicit autocorrelated property, the reduced potential plays that role.
        self.autocorr = genfunc(autocorr, names, kwargs) if autocorr else self.potential
        y = multimap([self.autocorr], dataset)
        ym = np.mean(y, axis=1)
        S1 = covariance(y, ym, 1).item(0)  # variance with batch size 1
        Sb = covariance(y, ym, b).item(0)  # variance with batch size b

        # `.item(0)` yields plain Python floats, so dividing by a zero `Sb` would raise a
        # bare ZeroDivisionError. Report that case with the same descriptive error used
        # for non-finite estimates.
        if not (np.isfinite(S1) and np.isfinite(Sb)) or Sb == 0:
            raise FloatingPointError("unable to determine effective sample size")
        self.neff = n*S1/Sb

        if verbose:
            info("Variance disregarding autocorrelation:", S1)
            info("Variance via Overlapping Batch Means:", Sb)
            info("Effective sample size via OBM:", self.neff)


class pool:
    """
    A pool of independently collected samples.

        Args:
            label (optional):
                an identifier for the pool, stored as `str(label)`.
            verbose (bool, optional):
                whether to report progress through **info**. The flag is also passed to
                every sample created via :meth:`add`.

    """

    # ======================================================================================
    def __init__(self, label="", verbose=False):
        self.samples = []
        self.label = str(label)
        self.verbose = verbose

    # ======================================================================================
    def add(self, *args, **kwargs):
        """
        Creates a new :class:`sample` from the given arguments and appends it to the pool.
        The pool's own **verbose** flag is passed on to the sample constructor.

        """
        self.samples.append(sample(*args, verbose=self.verbose, **kwargs))

    # ======================================================================================
    def copy(self):
        """
        Returns a deep copy of the pool.

        """
        return deepcopy(self)

    # ======================================================================================
    def subsample(self, compute_inefficiency=True):
        """
        Subsamples, in place, the dataset of every sample in the pool, then sets both the
        size and the effective size of each sample to the number of retained rows.

            Args:
                compute_inefficiency (bool, optional):
                    if True, the statistical inefficiency is computed anew from the
                    autocorrelated property with **timeseries.statisticalInefficiency**.
                    Otherwise, it is taken as the ratio between the size and the
                    effective size already stored in each sample.

            Returns:
                pool: the pool itself, so that calls can be chained.

        """
        if self.verbose:
            info("Performing subsampling...")
        for member in self.samples:
            if self.verbose:
                info("Original sample size:", member.n)
            if compute_inefficiency:
                y = multimap([member.autocorr], member.dataset)
                g = timeseries.statisticalInefficiency(y[0])
                if self.verbose:
                    info("Statistical inefficency via integrated ACF:", g)
            else:
                g = member.n/member.neff
                if self.verbose:
                    info("Statistical inefficency via Overlapping Batch Means:", g)
            # subsampleCorrelatedData returns row positions (0-based), not index labels.
            # Select with .iloc so that datasets whose index is not 0..n-1 (for instance,
            # a dataset that has already been subsampled) keep valid rows instead of
            # receiving the all-NaN rows that label-based reindexing would insert.
            positions = timeseries.subsampleCorrelatedData(member.dataset.index, g)
            member.dataset = member.dataset.iloc[positions]
            member.neff = member.n = len(positions)
            if self.verbose:
                info("New sample size:", member.n)
        return self

    # ======================================================================================
    def __getitem__(self, i):
        """
        Returns the i-th sample of the pool.

        """
        return self.samples[i]

    # ======================================================================================
    def __len__(self):
        """
        Returns the number of samples in the pool.

        """
        return len(self.samples)