# Source code for prob140.multi_variable

import ast
import itertools as it
import numpy as np
import pandas as pd
import warnings

from datascience import (
    make_array,
    Table,
)


def conditional(array):
    """
    Divides every entry of ``array`` by the sum of all entries but the last.

    Parameters
    ----------
    array : array-like supporting slicing and division
        Sequence of numbers. The final entry is left out of the normalising
        total (but is still divided by it).

    Returns
    -------
    Same type as ``array / number``
        ``array`` scaled by ``sum(array[0: -1])``.
    """
    return array / sum(array[0: -1])


def evaluate(name):
    """
    Deletes name of RV and outputs the correct datatype, like int, float, or
    string.

    Everything up to and including the first '=' is discarded and the
    remainder is parsed as a Python literal. If the remainder is not a valid
    literal it is returned unchanged as a string. If ``name`` contains no
    '=' at all, the whole of ``name`` is treated as the value.

    Parameters
    ----------
    name : String
        In the form 'rv=123123124'.

    Returns
    -------
    String, int, float
        The parsed value, or the raw text after '=' when it cannot be parsed.
    """
    _, separator, value = name.partition('=')
    if not separator:
        # No '=' present: keep the full text instead of dropping its first
        # character.
        value = name
    try:
        return ast.literal_eval(value)
    except Exception:
        # Deliberate best-effort: anything that is not a Python literal
        # (e.g. 'H' in 'Coin=H') is returned as plain text.
        return value


class JointDistribution(pd.DataFrame):
    """
    Joint distribution of two random variables, stored as a DataFrame.

    Column labels have the form '<X label>=<value>' and row labels the form
    '<Y label>=<value>'. The methods below read the variable names from the
    ``_X_column_label`` and ``_Y_column_label`` attributes, which therefore
    have to be set on the instance before any of them is called.
    """

    @classmethod
    def from_table(cls, table, reverse=True):
        """
        Constructs a JointDistribution from a Table.

        Parameters
        ----------
        table : Table
            3-column table with RV1, RV2, and joint probability
        reverse : bool (optional)
            If True, vertical random variables are reversed. (Default: True)

        Returns
        -------
        JointDistribution
        """
        return table.to_joint(reverse=reverse)

    def get_possible_values(self, label=''):
        """
        Returns the possible values. If a label is given, returns the values for
        that random variable. Automatically converts to float/int if relevant.

        Parameters
        ----------
        label : str
            Name of random variable. The default '' selects both variables.

        Returns
        -------
        List of values.
            A single list when one variable is selected, otherwise a list
            ``[x_values, y_values]``.

        Raises
        ------
        AssertionError
            If ``label`` is neither '' nor one of the two variable names.
        """
        values = []
        if label == self._X_column_label or label == '':
            values.append([evaluate(lab) for lab in self.columns])
        if label == self._Y_column_label or label == '':
            values.append([evaluate(lab) for lab in self.index])
        if not values:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                'Label does not correspond with existing variable name')
        if len(values) == 1:
            return values[0]
        return values

    def marginal(self, label):
        """
        Returns the marginal distribution of label.

        The joint table is copied and the marginal is appended to the copy:
        as an extra bottom row for the X variable, or as an extra right-hand
        column for the Y variable.

        Parameters
        ----------
        label : String
            The label of the variable of which we want to find the marginal
            distribution.

        Returns
        -------
        JointDistribution Table

        Raises
        ------
        AssertionError
            If ``label`` is not one of the two variable names.

        Examples
        --------
        >>> dist2 = Table().values('Coin1', ['H', 'T'], 'Coin2', ['H', 'T']).probability(np.array([0.24, 0.36, 0.16, 0.24])).to_joint()
        >>> dist2.marginal('Coin1')
                                Coin1=H  Coin1=T
        Coin2=T                    0.36     0.24
        Coin2=H                    0.24     0.16
        Sum: Marginal of Coin1     0.60     0.40
        >>> dist2.marginal('Coin2')
                 Coin1=H  Coin1=T  Sum: Marginal of Coin2
        Coin2=T     0.36     0.24                     0.6
        Coin2=H     0.24     0.16                     0.4
        """
        copy = JointDistribution(self, copy=True)
        if label == self._X_column_label:
            key = 'Sum: Marginal of {0}'.format(self._X_column_label)
            copy.loc[key] = copy.sum(axis=0)
        elif label == self._Y_column_label:
            key = 'Sum: Marginal of {0}'.format(self._Y_column_label)
            copy[key] = copy.sum(axis=1)
        else:
            raise AssertionError(
                'Label does not correspond with existing variable name')
        return copy

    def marginal_dist(self, label):
        """
        Finds the marginal distribution of label, returns as a single
        variable distribution.

        Parameters
        ----------
        label
            The label of the variable of which we want to find the marginal
            distribution.

        Returns
        -------
        Table
            Single variable distribution of label.

        Raises
        ------
        AssertionError
            If ``label`` is not one of the two variable names.
        """
        # ``.values`` replaces ``as_matrix()``, which newer pandas releases
        # no longer provide.
        marginal = self.marginal(label).values
        if label == self._X_column_label:
            # X marginal was appended as the last row.
            prob = marginal[-1, :]
        else:
            # Y marginal was appended as the last column.
            prob = marginal[:, -1]
        domain = self.get_possible_values(label)
        return Table().values(domain).probability(prob)

    def both_marginals(self):
        """
        Finds the marginal distribution of both variables.

        Returns
        -------
        JointDistribution Table.
            A copy of the joint table with the Y marginal as an extra column
            and the X marginal as an extra row.

        Examples
        --------
        >>> dist1 = Table().values([0, 1], [2, 3]).probability([0.1, 0.2, 0.3, 0.4]).to_joint()
        >>> dist1.both_marginals()
                            X=0  X=1  Sum: Marginal of Y
        Y=3                 0.2  0.4                 0.6
        Y=2                 0.1  0.3                 0.4
        Sum: Marginal of X  0.3  0.7                 1.0
        """
        copy = JointDistribution(self, copy=True)
        key_y = 'Sum: Marginal of {0}'.format(self._Y_column_label)
        key_x = 'Sum: Marginal of {0}'.format(self._X_column_label)
        # Column first, so the row of column sums also totals the new column.
        copy[key_y] = copy.sum(axis=1)
        copy.loc[key_x] = copy.sum(axis=0)
        return copy

    def conditional_dist(self, label, given='', show_ev=False):
        """
        Finds the conditional distribution of the random variable label given
        each value of the other variable, together with the marginal of label.

        Parameters
        ----------
        label : String
            Variable whose conditional distributions are computed.
        given : String (optional)
            Not used by the computation; the conditioning variable is always
            the other variable of the joint distribution.
        show_ev : bool (optional)
            If True, the expected value of each distribution is added, as an
            'EV' row when label is the Y variable and as an 'EV' column when
            label is the X variable. Requires numeric values. (Default: False)

        Returns
        -------
        JointDistribution Table

        Raises
        ------
        AssertionError
            If ``label`` is not one of the two variable names.

        Examples
        --------
        >>> coins = Table().values('Coin1', ['H', 'T'], 'Coin2', ['H','T']).probability(np.array([0.24, 0.36, 0.16,0.24])).to_joint()
        >>> coins.conditional_dist('Coin1', 'Coin2')
                                  Coin1=H  Coin1=T  Sum
        Dist. of Coin1 | Coin2=H      0.6      0.4  1.0
        Dist. of Coin1 | Coin2=T      0.6      0.4  1.0
        Marginal of Coin1             0.6      0.4  1.0
        >>> coins.conditional_dist('Coin2', 'Coin1')
                 Dist. of Coin2 | Coin1=H  Dist. of Coin2 | Coin1=T  Marginal of Coin2
        Coin2=H                       0.4                       0.4                0.4
        Coin2=T                       0.6                       0.6                0.6
        Sum                           1.0                       1.0                1.0
        """
        # TODO Refactor this function.
        if label == self._Y_column_label:
            both = self.both_marginals()
            # The last row of ``both`` holds column totals; after each column
            # is normalised by its total that row becomes the 'Sum' row.
            new_index = np.append(both.index[0: -1], 'Sum')
            y = both.apply(conditional, axis=0).set_index(new_index)

            new_columns = [
                'Dist. of {0} | {1}'.format(self._Y_column_label, name)
                for name in y.columns[:-1]
            ]
            new_columns.append(
                'Marginal of {0}'.format(self._Y_column_label))
            y.columns = new_columns

            if show_ev:
                # Computed only on request: multiplying probabilities by the
                # domain is meaningless (and fails) for non-numeric values.
                matrix = y.values[:-1, :]
                domain = np.array([evaluate(lab) for lab in self.index])
                y.loc['EV'] = [sum(column * domain) for column in matrix.T]
            return y

        elif label == self._X_column_label:
            both = self.both_marginals()

            # The last column of ``both`` holds row totals; after each row is
            # normalised by its total that column becomes the 'Sum' column.
            x = both.apply(conditional, axis=1).rename(columns={
                'Sum: Marginal of {0}'.format(self._Y_column_label): 'Sum'})

            new_labels = [
                'Dist. of {0} | {1}'.format(self._X_column_label, name)
                for name in both.index[:-1]
            ]
            new_labels.append(
                'Marginal of {0}'.format(self._X_column_label))
            # Passed as an array so set_index treats it as the new index
            # values rather than as a list of column names.
            new_df = x.set_index(np.array(new_labels))

            if show_ev:
                # See note in the Y branch: only evaluated when requested.
                matrix = x.values[:, :-1]
                domain = np.array([evaluate(lab) for lab in self.columns])
                new_df['EV'] = [sum(row * domain) for row in matrix]

            return new_df
        else:
            raise AssertionError(
                'Label does not correspond with existing variable name')


def multi_domain(table, *args):
    """
    Adds one column per variable holding the Cartesian product of the given
    domains.

    Parameters
    ----------
    table : Table
        Table to extend. It is copied with ``table.copy()`` before any column
        is added.
    *args
        Either alternating names and value lists
        (``'Coin1', ['H', 'T'], 'Coin2', ['H', 'T']``) or value lists only
        (``[0, 1], [2, 3]``). The named form is chosen when the first
        argument is a string. In the unnamed form the variables are named
        'X', 'Y', 'Z' and then by the characters that follow 'Z'. With no
        arguments the copy is returned without new columns.

    Returns
    -------
    Table
        The copy, with the new variable columns moved to the start in the
        order the variables were given.

    Raises
    ------
    AssertionError
        If the named form is used with an odd number of arguments.
    """
    # ``args and`` guards the no-argument call, which used to fail on args[0].
    if args and isinstance(args[0], str):
        if len(args) % 2 != 0:
            raise AssertionError('Must alternate between name and values')
        var_names = args[0::2]
        values = args[1::2]
        var_values = list(zip(*it.product(*values)))
    else:
        var_names = [chr(ord('X') + i) for i in range(len(args))]
        var_values = list(zip(*it.product(*args)))

    new_table = table.copy()
    # Insert in reverse so that, after each column is moved to the start, the
    # first variable ends up left-most.
    for column_name, column_value in reversed(list(zip(var_names, var_values))):
        new_table = new_table.with_column(column_name, column_value)
        new_table.move_to_start(column_name)

    return new_table


def to_joint(table, X_column_label=None, Y_column_label=None,
             probability_column_label=None, reverse=True):
    """
    Converts a table of probabilities associated with two variables into a
    JointDistribution object

    Probabilities of rows that share the same (X, Y) pair are added together.
    A warning is issued when the probabilities do not sum to 1 (after
    rounding to 6 decimal places).

    Parameters
    ----------
    table : Table
        You can either pass in a Table directly or call the to_joint()
        method of that Table. See examples.
    X_column_label (optional) : str
        Label for the first variable. Defaults to the same label as that of
        first variable of Table.
    Y_column_label (optional) : str
        Label for the second variable. Defaults to the same label as that of
        second variable of Table.
    probability_column_label (optional) : str
        Label for probabilities. Defaults to the label of the last column.
    reverse (optional) : bool
        If True, the vertical values will be reversed.

    Returns
    -------
    JointDistribution
        A JointDistribution object.

    Raises
    ------
    AssertionError
        If the table has fewer than three columns.

    Examples
    --------
    >>> dist1 = Table().values([0,1],[2,3])
    >>> dist1['Probability'] = make_array(0.1, 0.2, 0.3, 0.4)
    >>> dist1.to_joint()
         X=0  X=1
    Y=3  0.2  0.4
    Y=2  0.1  0.3
    >>> dist2 = Table().values('Coin1',['H','T'], 'Coin2', ['H','T'])
    >>> dist2['Probability'] = np.array([0.4*0.6, 0.6*0.6, 0.4*0.4, 0.6*0.4])
    >>> dist2.to_joint()
             Coin1=H  Coin1=T
    Coin2=T     0.36     0.24
    Coin2=H     0.24     0.16
    """
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O.
    if table.num_columns < 3:
        raise AssertionError(
            'You must have columns for your X variable, for your Y variable, '
            'and for your probabilities')
    if X_column_label is None:
        X_column_label = table.labels[0]
    if Y_column_label is None:
        Y_column_label = table.labels[1]
    if probability_column_label is None:
        probability_column_label = table.labels[table.num_columns - 1]

    total = sum(table[probability_column_label])

    if round(total, 6) != 1:
        warnings.warn('Your probabilities sum to {0}'.format(total))

    x_possibilities = sorted(set(table[X_column_label]))
    y_possibilities = sorted(set(table[Y_column_label]), reverse=reverse)
    # Row position of every Y value, so each table row is placed with one
    # dictionary lookup instead of a linear search.
    y_position = {poss: i for i, poss in enumerate(y_possibilities)}

    x_ind = table.column_index(X_column_label)
    y_ind = table.column_index(Y_column_label)
    p_ind = table.column_index(probability_column_label)

    # One probability column per X value, one slot per Y value.
    data = {poss: [0] * len(y_possibilities) for poss in x_possibilities}

    for row in table.rows:
        data[row[x_ind]][y_position[row[y_ind]]] += row[p_ind]

    x_order = ['{}={}'.format(X_column_label, poss) for poss in x_possibilities]
    index = ['{}={}'.format(Y_column_label, poss) for poss in y_possibilities]

    # Keyed by the very labels in x_order so the column selection below
    # always finds them.
    real_data = {column_label: data[poss]
                 for column_label, poss in zip(x_order, x_possibilities)}

    # Selecting df[x_order] fixes the column order to the sorted X values.
    df = pd.DataFrame(real_data, index=index)
    joint_dist = JointDistribution(df[x_order], index=index)

    joint_dist._X_column_label = X_column_label
    joint_dist._Y_column_label = Y_column_label

    return joint_dist