From d8dcb6892e11ebc8ceec0e73dee8ea3aed70018d Mon Sep 17 00:00:00 2001
From: Ian Goodfellow
Date: Sun, 16 Oct 2011 11:43:48 -0400
Subject: [PATCH] checked in some scripts I hadn't been tracking

---
 mean_field_experiments/hypercube1.py |   79 ++
 s3c/bug_report/s3c.py                | 1873 ++++++++++++++++++++++++++
 s3c/paper/hist_h.py                  |   64 +
 s3c/paper/learning_curve.py          |    2 +-
 s3c/paper/plot_em_functional.py      |   47 +
 s3c/paper/plot_inference.py          |   48 +
 6 files changed, 2112 insertions(+), 1 deletion(-)
 create mode 100644 mean_field_experiments/hypercube1.py
 create mode 100644 s3c/bug_report/s3c.py
 create mode 100644 s3c/paper/hist_h.py
 create mode 100644 s3c/paper/plot_em_functional.py
 create mode 100644 s3c/paper/plot_inference.py

diff --git a/mean_field_experiments/hypercube1.py b/mean_field_experiments/hypercube1.py
new file mode 100644
index 00000000..5032432f
--- /dev/null
+++ b/mean_field_experiments/hypercube1.py
@@ -0,0 +1,79 @@
+n = 2
+eps = 1e-6
+tol = 1e-3 #stop when the variational parameters change by less than this amount in 2-norm
+
+#True distribution:
+#  with probability 1-eps, exactly one bit is on, uniform over which bit is on
+#  with probability eps, some other number of bits is on
+#Mean field distribution:
+#  all bits are independent
+#
+#The mean field update is q( h_i ) propto exp( E_{h_-i \sim q} log P(h) )
+#Suppose h_i is 0. Then there is a
+#  sum_{j \neq i} q_j \Pi_{k \neq i,j} (1-q_k)
+#chance of log P(h) being log(1-eps), and one minus that chance of it being log(eps).
+#Suppose h_i is 1. Then there is a
+#  \Pi_{j \neq i} (1-q_j)
+#chance of log P(h) being log(1-eps).
+
+import numpy as np
+rng = np.random.RandomState([1,2,5])
+
+#q = rng.uniform(0.,1.,(n,))
+q = np.zeros(n)
+q[0] = 1.
+
+print q
+
+while True:
+    prev_q = q.copy()
+
+    order = range(n)
+    rng.shuffle(order)
+
+    for var_to_update in order:
+        high_prob = 0.
+
+        for i in xrange(n):
+            if i == var_to_update:
+                continue
+
+            rest_off_prob = 1.
+
+            for j in xrange(n):
+                if j in [var_to_update, i]:
+                    continue
+                rest_off_prob *= (1.-q[j])
+
+            high_prob += q[i] * rest_off_prob
+        #end for i
+
+        zero_mass = high_prob * np.log(1.-eps) + (1.-high_prob)*np.log(eps)
+
+        high_prob = 1.
+
+        for i in xrange(n):
+            if i == var_to_update:
+                continue
+
+            high_prob *= (1.-q[i])
+
+        one_mass = high_prob * np.log(1.-eps) + (1.-high_prob)*np.log(eps)
+
+        #zero_mass and one_mass are expected log probabilities, so normalize
+        #in the exp domain: q_i = exp(one_mass) / (exp(zero_mass) + exp(one_mass))
+        prob = 1. / (1. + np.exp(zero_mass - one_mass))
+
+        q[var_to_update] = prob
+
+    print q
+
+    if np.sqrt(np.sum(np.square(prev_q-q))) < tol:
+        break
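+
+#Sanity check of the update rule (a sketch, not part of the original script):
+#with n = 2 and q = [1, 0], updating q[1] gives
+#  zero_mass = 1 * log(1-eps) + 0 * log(eps) = log(1-eps)
+#  one_mass  = 0 * log(1-eps) + 1 * log(eps) = log(eps)
+#so q[1] <- sigmoid(log(eps) - log(1-eps)) ~= eps, i.e. the one-hot corner
+#q = [1, 0] is (approximately) a fixed point, as the comments above predict.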
diff --git a/s3c/bug_report/s3c.py b/s3c/bug_report/s3c.py
new file mode 100644
index 00000000..81f80b3c
--- /dev/null
+++ b/s3c/bug_report/s3c.py
@@ -0,0 +1,1873 @@
+from pylearn2.models.model import Model
+from theano import config, function, shared
+import theano.tensor as T
+import numpy as np
+from theano.sandbox.linalg.ops import alloc_diag
+#from theano.sandbox.linalg.ops import extract_diag
+from theano.sandbox.linalg.ops import matrix_inverse
+import warnings
+from theano.printing import Print
+from pylearn2.utils import make_name, sharedX, as_floatX
+from pylearn2.monitor import Monitor
+#import copy
+#config.compute_test_value = 'raise'
+
+class SufficientStatisticsHolder:
+    def __init__(self, nvis, nhid, needed_stats):
+        d = {
+                "mean_h"     : sharedX(np.zeros(nhid), "mean_h"),
+                "mean_v"     : sharedX(np.zeros(nvis), "mean_v"),
+                "mean_sq_v"  : sharedX(np.zeros(nvis), "mean_sq_v"),
+                "mean_s1"    : sharedX(np.zeros(nhid), "mean_s1"),
+                "mean_s"     : sharedX(np.zeros(nhid), "mean_s"),
+                "mean_sq_s"  : sharedX(np.zeros(nhid), "mean_sq_s"),
+                "mean_hs"    : sharedX(np.zeros(nhid), "mean_hs"),
+                "mean_sq_hs" : sharedX(np.zeros(nhid), "mean_sq_hs"),
+                #"mean_D_sq_mean_Q_hs" : sharedX(np.zeros(nhid), "mean_D_sq_mean_Q_hs"),
+                "cov_hs"     : sharedX(np.zeros((nhid,nhid)), 'cov_hs'),
+                "mean_hsv"   : sharedX(np.zeros((nhid,nvis)), 'mean_hsv'),
+                "u_stat_1"   : sharedX(np.zeros((nhid,nvis)), 'u_stat_1'),
+                "u_stat_2"   : sharedX(np.zeros((nvis,)), 'u_stat_2')
+            }
+
+        self.d = {}
+
+        for stat in needed_stats:
+            self.d[stat] = d[stat]
+
+    def __getstate__(self):
+        rval = {}
+
+        for name in self.d:
+            rval[name] = self.d[name].get_value(borrow=False)
+
+        return rval
+
+    def __setstate__(self,d):
+        self.d = {}
+
+        for name in d:
+            self.d[name] = shared(d[name])
+
+    def update(self, updates, updated_stats):
+        for key in updated_stats.d:
+            assert key in self.d
+        for key in self.d:
+            assert key in updated_stats.d
+            assert key not in updates
+            updates[self.d[key]] = updated_stats.d[key]
+
+class SufficientStatistics:
+    def __init__(self, d):
+        self.d = {}
+        for key in d:
+            self.d[key] = d[key]
+        #
+    #
+
+    @classmethod
+    def from_holder(self, holder):
+        return SufficientStatistics(holder.d)
+
+    @classmethod
+    def from_observations(self, needed_stats, X, H, mu0, Mu1, sigma0, Sigma1, \
+            U = None, N = None, B = None, W = None):
+
+        m = T.cast(X.shape[0],config.floatX)
+
+        H_name = make_name(H, 'anon_H')
+        Mu1_name = make_name(Mu1, 'anon_Mu1')
+
+        #mean_h
+        assert H.dtype == config.floatX
+        mean_h = T.mean(H, axis=0)
+        assert H.dtype == mean_h.dtype
+        assert mean_h.dtype == config.floatX
+        mean_h.name = 'mean_h('+H_name+')'
+
+        #mean_v
+        mean_v = T.mean(X,axis=0)
+
+        #mean_sq_v
+        mean_sq_v = T.mean(T.sqr(X),axis=0)
+
+        #mean_s
+        mean_S = H * Mu1 + (1.-H)*mu0
+        mean_s = T.mean(mean_S,axis=0)
+
+        #mean_s1
+        mean_s1 = T.mean(Mu1,axis=0)
+
+        #mean_sq_s
+        mean_sq_S = H * (Sigma1 + T.sqr(Mu1)) + (1. - H)*(sigma0+T.sqr(mu0))
+        mean_sq_s = T.mean(mean_sq_S,axis=0)
+
+        #mean_hs
+        mean_HS = H * Mu1
+        mean_hs = T.mean(mean_HS,axis=0)
+        mean_hs.name = 'mean_hs(%s,%s)' % (H_name, Mu1_name)
+        mean_D_sq_mean_Q_hs = T.mean(T.sqr(mean_HS), axis=0)
+
+        #mean_sq_hs
+        mean_sq_HS = H * (Sigma1 + T.sqr(Mu1))
+        mean_sq_hs = T.mean(mean_sq_HS, axis=0)
+        mean_sq_hs.name = 'mean_sq_hs(%s,%s)' % (H_name, Mu1_name)
+
+        #cov_hs
+        outer_prod = T.dot(mean_HS.T,mean_HS)
+        outer_prod.name = 'outer_prod'
+        outer = outer_prod/m
+        mask = T.identity_like(outer)
+        cov_hs = (1.-mask) * outer + alloc_diag(mean_sq_hs)
+        cov_hs.name = 'exp_outer_hs(%s,%s)' % (H_name, Mu1_name)
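+        #for i != j the factored posterior gives
+        #E[h_i s_i h_j s_j] = E[h_i s_i] E[h_j s_j], which is what outer holds;
+        #on the diagonal E[(h_i s_i)^2] = E[h_i s_i^2] is not the square of the
+        #mean, so the mask zeroes the diagonal and alloc_diag substitutes
+        #mean_sq_hs there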
+
+        #mean_hsv
+        sum_hsv = T.dot(mean_HS.T,X)
+        sum_hsv.name = 'sum_hsv'
+        mean_hsv = sum_hsv / m
+
+        u_stat_1 = None
+        u_stat_2 = None
+
+        if U is not None:
+            N = as_floatX(N)
+
+            #u_stat_1
+            two = np.cast[config.floatX](2.)
+            u_stat_1 = - two * T.mean( T.as_tensor_variable(mean_HS).dimshuffle(0,1,'x') * U, axis=0)
+
+            #u_stat_2
+            #B = Print('B',attrs=['mean'])(B)
+            #N = Print('N')(N)
+            coeff = two * T.sqr(N)
+            #coeff = Print('coeff')(coeff)
+            term1 = coeff/B
+            #term1 = Print('us2 term1',attrs=['mean'])(term1)
+            dotA = T.dot(T.sqr(mean_HS),T.sqr(W.T))
+            dotA.name = 'dotA'
+            term2 = two * N * dotA
+            #term2 = Print('us2 term2',attrs=['mean'])(term2)
+            dotB = T.dot(mean_HS, W.T)
+            dotB.name = 'dotB'
+            term3 = - two * T.sqr( dotB )
+            #term3 = Print('us2 term3',attrs=['mean'])(term3)
+
+            u_stat_2 = (term1+term2+term3).mean(axis=0)
+
+        d = {
+                "mean_h"     : mean_h,
+                "mean_v"     : mean_v,
+                "mean_sq_v"  : mean_sq_v,
+                "mean_s"     : mean_s,
+                "mean_s1"    : mean_s1,
+                "mean_sq_s"  : mean_sq_s,
+                "mean_hs"    : mean_hs,
+                "mean_sq_hs" : mean_sq_hs,
+                #"mean_D_sq_mean_Q_hs" : mean_D_sq_mean_Q_hs,
+                "cov_hs"     : cov_hs,
+                "mean_hsv"   : mean_hsv,
+                "u_stat_1"   : u_stat_1,
+                "u_stat_2"   : u_stat_2
+            }
+
+        final_d = {}
+
+        for stat in needed_stats:
+            final_d[stat] = d[stat]
+            final_d[stat].name = 'observed_'+stat
+
+        return SufficientStatistics(final_d)
+
+    def decay(self, coeff):
+        rval_d = {}
+
+        coeff = np.cast[config.floatX](coeff)
+
+        for key in self.d:
+            rval_d[key] = self.d[key] * coeff
+            rval_d[key].name = 'decayed_'+self.d[key].name
+        #
+
+        return SufficientStatistics(rval_d)
+
+    def accum(self, new_stat_coeff, new_stats):
+
+        if hasattr(new_stat_coeff,'dtype'):
+            assert new_stat_coeff.dtype == config.floatX
+        else:
+            assert isinstance(new_stat_coeff,float)
+            new_stat_coeff = np.cast[config.floatX](new_stat_coeff)
+
+        rval_d = {}
+
+        for key in self.d:
+            rval_d[key] = self.d[key] + new_stat_coeff * new_stats.d[key]
+            rval_d[key].name = 'blend_'+self.d[key].name+'_'+new_stats.d[key].name
+
+        return SufficientStatistics(rval_d)
+
+
+class DebugEnergy:
+    def __init__(self,
+                    h_term = True,
+                    s_term_1 = True,
+                    s_term_2 = True,
+                    s_term_3 = True,
+                    v_term = True):
+        self.h_term = h_term
+        self.s_term_1 = s_term_1
+        self.s_term_2 = s_term_2
+        self.s_term_3 = s_term_3
+        self.v_term = v_term
+
+        for field in dir(self):
+            if type(getattr(self, field)) == type(True) and not getattr(self, field):
+                print "HACK: some terms of energy / expected energy zeroed out"
+                break
+
+
+class S3C(Model):
+
+    def __init__(self, nvis, nhid, irange, init_bias_hid,
+                       init_B, min_B, max_B,
+                       init_alpha, min_alpha, max_alpha, init_mu,
+                       new_stat_coeff,
+                       e_step,
+                       m_step,
+                       W_eps = 1e-6, mu_eps = 1e-8, b_eps = 0.,
+                       min_bias_hid = -1e30,
+                       max_bias_hid = 1e30,
+                       min_mu = -1e30,
+                       max_mu = 1e30,
+                       tied_B = False,
+                       learn_after = None, hard_max_step = None,
+                       monitor_stats = None,
+                       monitor_functional = False,
+                       recycle_q = 0,
+                       seed = None,
+                       disable_W_update = False):
+        """
+        nvis: # of visible units
+        nhid: # of hidden units
+        irange: (scalar) weights are initialized ~U( [-irange,irange] )
+        init_bias_hid: initial value of hidden biases (scalar or vector)
+        init_B: initial value of B (scalar or vector)
+        min_B, max_B: (scalar) learning updates to B are clipped to [min_B, max_B]
+        init_alpha: initial value of alpha (scalar or vector)
+        min_alpha, max_alpha: (scalar) learning updates to alpha are clipped to [min_alpha, max_alpha]
+        init_mu: initial value of mu (scalar or vector)
+        new_stat_coeff: Exponential decay steps on a variable eta take the form
+            eta := new_stat_coeff * new_observation + (1-new_stat_coeff) * eta
+        e_step: An E_Step object that determines what kind of E-step to do
+        m_step: An M_Step object that determines what kind of M-step to do
+        W_eps: L2 regularization parameter for the linear regression problem for W
+        mu_eps: L2 regularization parameter for the linear regression problem for mu
+        b_eps: L2 regularization parameter for the linear regression problem for b
+        learn_after: only applicable when new_stat_coeff < 1.0;
+            begins learning parameters and decaying sufficient statistics
+            after seeing learn_after examples;
+            until this time, only accumulates sufficient statistics
+        hard_max_step: if set to None, has no effect;
+            otherwise, no element of any parameter is allowed to change
+            by more than this amount on each M-step. This is basically a hack
+            introduced to prevent explosion in gradient descent.
+        tied_B: if True, use a scalar times the identity for the precision on visible units;
+            otherwise use a diagonal matrix for the precision on visible units
+        monitor_stats: a list of sufficient statistics to monitor on the monitoring dataset
+        monitor_functional: if true, monitors the EM functional on the monitoring dataset
+        recycle_q: if nonzero, initializes the E-step with the output of the previous iteration's
+            E-step. obviously this should only be used if you are using the same data
+            in each batch. when recycle_q is nonzero, it should be set to the batch size.
+        disable_W_update: if true, doesn't update W (for debugging)
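+
+        Example (illustrative values only, not from any experiment):
+
+            model = S3C(nvis = 108, nhid = 400, irange = .02,
+                        init_bias_hid = -1., init_B = 1., min_B = .01,
+                        max_B = 10., init_alpha = 1., min_alpha = .01,
+                        max_alpha = 10., init_mu = 1., new_stat_coeff = 1.,
+                        e_step = VHS_E_Step(h_new_coeff_schedule = [.1] * 10),
+                        m_step = VHS_Grad_M_Step(learning_rate = .01))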
+        """
+
+        super(S3C,self).__init__()
+
+        if monitor_stats is None:
+            self.monitor_stats = []
+        else:
+            self.monitor_stats = [ elem for elem in monitor_stats ]
+
+        self.seed = seed
+
+        self.disable_W_update = disable_W_update
+        self.monitor_functional = monitor_functional
+        self.W_eps = np.cast[config.floatX](float(W_eps))
+        self.mu_eps = np.cast[config.floatX](float(mu_eps))
+        self.b_eps = np.cast[config.floatX](float(b_eps))
+        self.nvis = nvis
+        self.nhid = nhid
+        self.irange = irange
+        self.init_bias_hid = init_bias_hid
+        self.init_alpha = float(init_alpha)
+        self.min_alpha = float(min_alpha)
+        self.max_alpha = float(max_alpha)
+        self.init_B = float(init_B)
+        self.min_B = float(min_B)
+        self.max_B = float(max_B)
+        self.e_step = e_step
+        self.e_step.register_model(self)
+        self.m_step = m_step
+        self.init_mu = init_mu
+        self.min_mu = min_mu
+        self.max_mu = max_mu
+        self.min_bias_hid = min_bias_hid
+        self.max_bias_hid = max_bias_hid
+        self.recycle_q = recycle_q
+
+        self.tied_B = tied_B
+
+        self.hard_max_step = hard_max_step
+        if self.hard_max_step is not None:
+            self.hard_max_step = as_floatX(float(self.hard_max_step))
+
+        #this class always needs a monitor, since it is used to implement the learn_after feature
+        Monitor.get_monitor(self)
+
+        self.new_stat_coeff = np.cast[config.floatX](float(new_stat_coeff))
+        if self.new_stat_coeff < 1.0:
+            assert learn_after is not None
+        else:
+            assert learn_after is None
+        #
+
+        self.learn_after = learn_after
+
+        self.reset_rng()
+
+        self.redo_everything()
+
+    def reset_rng(self):
+        if self.seed is None:
+            self.rng = np.random.RandomState([1.,2.,3.])
+        else:
+            self.rng = np.random.RandomState(self.seed)
+
+    def redo_everything(self):
+        self.W = sharedX(self.rng.uniform(-self.irange, self.irange, (self.nvis, self.nhid)), name = 'W')
+        self.bias_hid = sharedX(np.zeros(self.nhid)+self.init_bias_hid, name='bias_hid')
+        self.alpha = sharedX(np.zeros(self.nhid)+self.init_alpha, name = 'alpha')
+        self.mu = sharedX(np.zeros(self.nhid)+self.init_mu, name='mu')
+        if self.tied_B:
+            self.B_driver = sharedX(0.0+self.init_B, name='B')
+        else:
+            self.B_driver = sharedX(np.zeros(self.nvis)+self.init_B, name='B')
+
+        if self.new_stat_coeff < 1.0:
+            self.suff_stat_holder = SufficientStatisticsHolder(nvis = self.nvis, nhid = self.nhid,
+                    needed_stats = self.m_step.needed_stats() )
+
+        self.test_batch_size = 5
+
+        if self.recycle_q:
+            self.prev_H = sharedX(np.zeros((self.test_batch_size,self.nhid)), name="prev_H")
+            self.prev_Mu1 = sharedX(np.zeros((self.test_batch_size,self.nhid)), name="prev_Mu1")
+
+        self.debug_m_step = True
+        if self.debug_m_step:
+            self.em_functional_diff = sharedX(0.)
+
+        self.redo_theano()
+
+        if self.recycle_q:
+            self.prev_H.set_value( np.cast[self.prev_H.dtype]( np.zeros((self.recycle_q, self.nhid)) + 1./(1.+np.exp(-self.bias_hid.get_value()))))
+            self.prev_Mu1.set_value( np.cast[self.prev_Mu1.dtype]( np.zeros((self.recycle_q, self.nhid)) + self.mu.get_value() ) )
+
+    def em_functional(self, H, sigma0, Sigma1, stats):
+        """ Returns the em_functional for a single batch of data.
+            stats is assumed to be computed from, and only from,
+            the same data points that yielded H """
+
+        assert self.new_stat_coeff == 1.0
+
+        warnings.warn('em functional hacked')
+
+        entropy_term = (self.entropy_hs(H = H, sigma0 = sigma0, Sigma1 = Sigma1)).mean()
+        likelihood_term = self.log_likelihood_vhs(stats)
+
+        em_functional = likelihood_term #+ entropy_term
+
+        return em_functional
+
+    def get_monitoring_channels(self, V):
+
+        rval = self.m_step.get_monitoring_channels(V, self)
+
+        from_e_step = self.e_step.get_monitoring_channels(V, self)
+
+        rval.update(from_e_step)
+
+        monitor_stats = len(self.monitor_stats) > 0
+
+        if monitor_stats or self.monitor_functional:
+
+            obs = self.e_step.mean_field(V)
+
+            needed_stats = set(self.monitor_stats)
+
+            if self.monitor_functional:
+                needed_stats = needed_stats.union(S3C.log_likelihood_vhs_needed_stats())
+
+            stats = SufficientStatistics.from_observations( needed_stats = needed_stats,
+                                                            X = V, ** obs )
+
+            H = obs['H']
+            sigma0 = obs['sigma0']
+            Sigma1 = obs['Sigma1']
+
+            if self.monitor_functional:
+                em_functional = self.em_functional(H = H, sigma0 = sigma0, Sigma1 = Sigma1, stats = stats)
+
+                rval['em_functional'] = em_functional
+
+            if monitor_stats:
+                for stat in self.monitor_stats:
+                    stat_val = stats.d[stat]
+
+                    rval[stat+'_min'] = T.min(stat_val)
+                    rval[stat+'_mean'] = T.mean(stat_val)
+                    rval[stat+'_max'] = T.max(stat_val)
+
+        return rval
+
+    def get_params(self):
+        return [self.W, self.bias_hid, self.alpha, self.mu, self.B_driver ]
+
+    @classmethod
+    def solve_vhs_needed_stats(cls):
+        return set([ 'cov_hs',
+                     'mean_hsv',
+                     'mean_v',
+                     'mean_sq_v',
+                     'mean_sq_s',
+                     'mean_sq_hs',
+                     'mean_hs',
+                     'mean_h'
+                     ])
+
+    def solve_vhs_from_stats(self, stats):
+        """Solve the multiple linear regression problem where
+        W is a matrix used to predict v from h*s
+
+        cov_hs[i,j] = E_D,Q h_i s_i h_j s_j (note that the diagonal has a different formula)
+        """
+
+        cov_hs = stats.d['cov_hs']
+        assert cov_hs.dtype == config.floatX
+        #mean_hsv[i,j] = E_D,Q h_i s_i v_j
+        mean_hsv = stats.d['mean_hsv']
+
+        regularized = cov_hs + alloc_diag(T.ones_like(self.mu) * self.W_eps)
+        assert regularized.dtype == config.floatX
+
+        inv = matrix_inverse(regularized)
+        assert inv.dtype == config.floatX
+
+        inv_prod = T.dot(inv,mean_hsv)
+        inv_prod.name = 'inv_prod'
+        new_W = inv_prod.T
+        assert new_W.dtype == config.floatX
+
+        #Solve for B by setting gradient of log likelihood to 0
+        mean_sq_v = stats.d['mean_sq_v']
+
+        one = as_floatX(1.)
+        two = as_floatX(2.)
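+
+        #setting the gradient of the log likelihood wrt B to 0 gives, per
+        #visible unit i:
+        #  1/B_i = E[ (v_i - W[i,:] (h*s))^2 ]
+        #        = E[v_i^2] - 2 W[i,:] E[(h*s) v_i] + W[i,:] cov_hs W[i,:]^T
+        #denom1, denom2 and denom3 below are these three terms (the last two
+        #are currently disabled by the hack warned about below)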
+        denom1 = mean_sq_v
+        denom2 = - two * (new_W * mean_hsv.T).sum(axis=1)
+        denom3 = (cov_hs.dimshuffle('x',0,1)*new_W.dimshuffle(0,1,'x')*new_W.dimshuffle(0,'x',1)).sum(axis=(1,2))
+
+        warnings.warn('zeroing out B denom terms')
+        denom = denom1 #+ denom2 + denom3
+
+        #denom = T.clip(denom1 + denom2 + denom3, 1e-10, 1e8)
+
+        new_B = one / denom
+
+        if self.tied_B:
+            new_B = new_B.mean()
+
+        mean_hs = stats.d['mean_hs']
+
+        # Now a linear regression problem where mu_i is used to predict
+        # s_i from h_i
+        #
+        # mu_i = ( h^T h + reg)^-1 h^T s_i
+
+        mean_h = stats.d['mean_h']
+        assert mean_h.dtype == config.floatX
+        reg = self.mu_eps
+        new_mu = mean_hs/(mean_h+reg)
+
+        mean_sq_s = stats.d['mean_sq_s']
+        mean_sq_hs = stats.d['mean_sq_hs']
+
+        s_denom1 = mean_sq_s
+        s_denom2 = - two * new_mu * mean_hs
+        s_denom3 = T.sqr(new_mu) * mean_h
+
+        s_denom = s_denom1 + s_denom2 + s_denom3
+
+        new_alpha = one / s_denom
+
+        #probability of hiddens just comes from sample counting
+        #to put it back in bias_hid space apply the sigmoid inverse
+
+        p = T.clip(mean_h,np.cast[config.floatX](1e-8),np.cast[config.floatX](1.-1e-8))
+        p.name = 'mean_h_clipped'
+
+        assert p.dtype == config.floatX
+
+        bias_hid = T.log( - p / (p-1.+self.b_eps) )
+
+        assert bias_hid.dtype == config.floatX
+
+        return new_W, bias_hid, new_alpha, new_mu, new_B
+
+    @classmethod
+    def solve_vhsu_needed_stats(cls):
+        return set(['mean_hsv',
+                    'mean_sq_s',
+                    'u_stat_1',
+                    'mean_sq_hs',
+                    'mean_hs',
+                    'mean_h',
+                    'u_stat_2',
+                    'mean_sq_v'])
+
+    def solve_vhsu_from_stats(self, stats):
+        #TODO: write unit test verifying that this results in zero gradient
+
+        #Solve for W
+        mean_hsv = stats.d['mean_hsv']
+        half = np.cast[config.floatX](0.5)
+        u_stat_1 = stats.d['u_stat_1']
+        mean_sq_hs = stats.d['mean_sq_hs']
+        N = np.cast[config.floatX](self.nhid)
+
+        numer1 = mean_hsv.T
+        numer2 = half * u_stat_1.T
+
+        numer = numer1 + numer2
+
+        #mean_sq_hs = Print('mean_sq_hs',attrs=['mean'])(mean_sq_hs)
+
+        denom = N * mean_sq_hs
+
+        new_W = numer / denom
+        new_W.name = 'new_W'
+
+        #Solve for mu
+        mean_hs = stats.d['mean_hs']
+        mean_h = stats.d['mean_h']
+        mean_h = Print('mean_h',attrs=['min','mean','max'])(mean_h)
+        new_mu = mean_hs / (mean_h + self.W_eps)
+        new_mu.name = 'new_mu'
+
+        #Solve for bias_hid
+        denom = T.clip(mean_h - 1., -1., -1e-10)
+
+        new_bias_hid = T.log( - mean_h / denom )
+        new_bias_hid.name = 'new_bias_hid'
+
+        #Solve for alpha
+        mean_sq_s = stats.d['mean_sq_s']
+        one = np.cast[config.floatX](1.)
+        two = np.cast[config.floatX](2.)
+        denom = mean_sq_s + mean_h * T.sqr(new_mu) - two * new_mu * mean_hs
+        new_alpha = one / denom
+        new_alpha.name = 'new_alpha'
+
+        #Solve for B
+        #new_W = Print('new_W',attrs=['mean'])(new_W)
+
+        numer = T.sqr(N)+one
+        numer = Print('numer')(numer)
+        assert numer.dtype == config.floatX
+        u_stat_2 = stats.d['u_stat_2']
+        #u_stat_2 = Print('u_stat_2',attrs=['mean'])(u_stat_2)
+
+        mean_sq_v = stats.d['mean_sq_v']
+        #mean_sq_v = Print('mean_sq_v',attrs=['mean'])(mean_sq_v)
+
+        mean_sq_hs = Print('mean_sq_hs',attrs=['mean'])(mean_sq_hs)
+        #mean_hsv = Print('mean_hsv',attrs=['mean'])(mean_hsv)
+
+        dotC = T.dot(T.sqr(new_W), mean_sq_hs)
+        dotC.name = 'dotC'
+        denom1 = N * dotC
+        denom2 = half * u_stat_2
+        denom3 = - (new_W.T * u_stat_1).sum(axis=0)
+        denom4 = - two * (new_W.T * mean_hsv).sum(axis=0)
+        denom5 = mean_sq_v
+
+        denom = T.clip(denom1 + denom2 + denom3 + denom4 + denom5, 1e-8, 1e12)
+        #denom = Print('denom', attrs=['min','max'])(denom)
+        assert denom.dtype == config.floatX
+
+        new_B = numer / denom
+        new_B.name = 'new_B'
+        assert new_B.dtype == config.floatX
+
+        return new_W, new_bias_hid, new_alpha, new_mu, new_B
+
+    def energy_vhs(self, V, H, S, debug_energy = None):
+        " H MUST be binary "
+
+        if debug_energy is None:
+            debug_energy = DebugEnergy()
+
+        h_term = - T.dot(H, self.bias_hid)
+        assert len(h_term.type.broadcastable) == 1
+
+        if not debug_energy.h_term:
+            h_term = 0.
+
+        s_term_1 = T.dot(T.sqr(S), self.alpha)/2.
+        s_term_2 = -T.dot(S * self.mu * H , self.alpha)
+        #s_term_3 = T.dot(T.sqr(self.mu * H), self.alpha)/2.
+        s_term_3 = T.dot(T.sqr(self.mu) * H, self.alpha) / 2.
+
+        if not debug_energy.s_term_1:
+            s_term_1 = 0.
+
+        if not debug_energy.s_term_2:
+            s_term_2 = 0.
+
+        if not debug_energy.s_term_3:
+            s_term_3 = 0.
+
+        s_term = s_term_1 + s_term_2 + s_term_3
+        #s_term = T.dot( T.sqr( S - self.mu * H) , self.alpha) / 2.
+        assert len(s_term.type.broadcastable) == 1
+
+        recons = T.dot(H*S, self.W.T)
+
+        v_term_1 = T.dot( T.sqr(V), self.B) / 2.
+        v_term_2 = T.dot( - V * recons, self.B)
+        v_term_3 = T.dot( T.sqr(recons), self.B) / 2.
+
+        v_term = v_term_1 + v_term_2 + v_term_3
+
+        #v_term = T.dot( T.sqr( V - recons), self.B) / 2.
+        assert len(v_term.type.broadcastable) == 1
+
+        if not debug_energy.v_term:
+            v_term = 0.
+
+        rval = h_term + s_term + v_term
+        assert len(rval.type.broadcastable) == 1
+
+        return rval
+
+    def expected_energy_vhs(self, V, H, mu0, Mu1, sigma0, Sigma1, debug_energy = None):
+
+        var_HS = H * Sigma1 + (1.-H) * sigma0
+
+        if debug_energy is None:
+            debug_energy = DebugEnergy()
+
+        half = as_floatX(.5)
+
+        HS = H * Mu1
+
+        sq_HS = H * ( Sigma1 + T.sqr(Mu1))
+
+        sq_S = sq_HS + (1.-H)*(sigma0 + T.sqr(mu0))
+
+        presign = T.dot(H, self.bias_hid)
+        presign.name = 'presign'
+        h_term = - presign
+        assert len(h_term.type.broadcastable) == 1
+
+        if not debug_energy.h_term:
+            h_term = 0.
+
+        precoeff = T.dot(sq_S, self.alpha)
+        precoeff.name = 'precoeff'
+        s_term_1 = half * precoeff
+        assert len(s_term_1.type.broadcastable) == 1
+
+        if not debug_energy.s_term_1:
+            s_term_1 = 0.
+
+        presign2 = T.dot(HS, self.alpha * self.mu)
+        presign2.name = 'presign2'
+        s_term_2 = - presign2
+        assert len(s_term_2.type.broadcastable) == 1
+
+        if not debug_energy.s_term_2:
+            s_term_2 = 0.
+
+        s_term_3 = half * T.dot(H, T.sqr(self.mu) * self.alpha)
+        assert len(s_term_3.type.broadcastable) == 1
+
+        if not debug_energy.s_term_3:
+            s_term_3 = 0.
+        s_term = s_term_1 + s_term_2 + s_term_3
+
+        v_term_1 = half * T.dot(T.sqr(V),self.B)
+        assert len(v_term_1.type.broadcastable) == 1
+
+        term6_factor1 = V * self.B
+        term6_factor2 = T.dot(HS, self.W.T)
+        v_term_2 = - (term6_factor1 * term6_factor2).sum(axis=1)
+        assert len(v_term_2.type.broadcastable) == 1
+
+        term7_subterm1 = T.dot(T.sqr(T.dot(HS, self.W.T)), self.B)
+        assert len(term7_subterm1.type.broadcastable) == 1
+        #term7_subterm2 = T.dot(var_HS, self.w)
+        term7_subterm2 = - T.dot( T.dot(T.sqr(HS), T.sqr(self.W.T)), self.B)
+        term7_subterm3 = T.dot( T.dot(sq_HS, T.sqr(self.W.T)), self.B )
+
+        #v_term_3 = half * (term7_subterm1 + term7_subterm2)
+        v_term_3 = half * (term7_subterm1 + term7_subterm2 + term7_subterm3)
+        assert len(v_term_3.type.broadcastable) == 1
+
+        v_term = v_term_1 + v_term_2 + v_term_3
+
+        if not debug_energy.v_term:
+            v_term = 0.0
+
+        rval = h_term + s_term + v_term
+
+        return rval
+
+    def entropy_h(self, H):
+
+        #TODO: replace with actually evaluating 0 log 0 as 0
+        #note: can't clip to 1e-8; 1.-1e-8 rounds to 1.0 in float32
+        H = T.clip(H, 1e-7, 1.-1e-7)
+
+        #H = Print('entropy_h',attrs=['min','max'])(H)
+
+        logH = T.log(H)
+
+        #logH = Print('logH',attrs=['min','max'])(logH)
+
+        logOneMinusH = T.log(1.-H)
+
+        #logOneMinusH = Print('logOneMinusH',attrs=['min','max'])(logOneMinusH)
+
+        term1 = - T.sum( H * logH , axis=1)
+        assert len(term1.type.broadcastable) == 1
+
+        term2 = - T.sum( (1.-H) * logOneMinusH , axis=1 )
+        assert len(term2.type.broadcastable) == 1
+
+        rval = term1 + term2
+
+        return rval
+
+    def entropy_hs(self, H, sigma0, Sigma1):
+
+        half = as_floatX(.5)
+
+        one = as_floatX(1.)
+
+        two = as_floatX(2.)
+
+        pi = as_floatX(np.pi)
+
+        term1_plus_term2 = self.entropy_h(H)
+        assert len(term1_plus_term2.type.broadcastable) == 1
+
+        #TODO: change Sigma1 back into a vector
+        #TODO: pick a new name for Sigma1; does the capitalization mean it's a covariance
+        #      matrix rather than a scalar, or that it's a minibatch rather than one example?
+        #
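+        #the differential entropy of N(mu, sigma^2) is
+        #0.5 * (log sigma^2 + log 2 pi + 1); term3 and term4 below apply this
+        #to the slab, weighted by q(h_i = 1) and q(h_i = 0) respectively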
+        term3 = T.sum( H * ( half * (T.log(Sigma1) + T.log(two*pi) + one ) ) , axis=1)
+        assert len(term3.type.broadcastable) == 1
+
+        term4 = T.dot( 1.-H, half * (T.log(sigma0) + T.log(two*pi) + one ))
+        assert len(term4.type.broadcastable) == 1
+
+        rval = term1_plus_term2 + term3 + term4
+
+        return rval
+
+    def make_learn_func(self, X, learn = None):
+        """
+        X: a symbolic design matrix
+        learn:
+            must be None unless using sufficient statistics decay
+            False: accumulate sufficient statistics
+            True: exponentially decay sufficient statistics, accumulate new ones, and learn new params
+        """
+
+        #E step
+        hidden_obs = self.e_step.mean_field(X)
+
+        m = T.cast(X.shape[0],dtype = config.floatX)
+        N = np.cast[config.floatX](self.nhid)
+        new_stats = SufficientStatistics.from_observations(needed_stats = self.m_step.needed_stats(),
+                X = X, N = N, B = self.B, W = self.W, **hidden_obs)
+
+        if self.new_stat_coeff == 1.0:
+            assert learn is None
+            updated_stats = new_stats
+            do_learn_updates = True
+            do_stats_updates = False
+        else:
+            do_stats_updates = True
+            do_learn_updates = learn
+
+            old_stats = SufficientStatistics.from_holder(self.suff_stat_holder)
+
+            if learn:
+                updated_stats = old_stats.decay(1.0-self.new_stat_coeff)
+                updated_stats = updated_stats.accum(new_stat_coeff = self.new_stat_coeff, new_stats = new_stats)
+            else:
+                updated_stats = old_stats.accum(new_stat_coeff = m / self.learn_after, new_stats = new_stats)
+            #
+
+        if do_learn_updates:
+            learning_updates = self.m_step.get_updates(self, updated_stats)
+        else:
+            learning_updates = {}
+
+        if do_stats_updates:
+            self.suff_stat_holder.update(learning_updates, updated_stats)
+
+        if self.recycle_q:
+            learning_updates[self.prev_H] = hidden_obs['H']
+            learning_updates[self.prev_Mu1] = hidden_obs['Mu1']
+
+        self.censor_updates(learning_updates)
+
+        if self.debug_m_step:
+            em_functional_before = self.em_functional(H = hidden_obs['H'],
+                                                      sigma0 = hidden_obs['sigma0'],
+                                                      Sigma1 = hidden_obs['Sigma1'],
+                                                      stats = updated_stats)
+
+            tmp_bias_hid = self.bias_hid
+            tmp_mu = self.mu
+            tmp_alpha = self.alpha
+            tmp_W = self.W
+            tmp_B_driver = self.B_driver
+
+            self.bias_hid = learning_updates[self.bias_hid]
+            self.mu = learning_updates[self.mu]
+            self.alpha = learning_updates[self.alpha]
+            if self.W in learning_updates:
+                self.W = learning_updates[self.W]
+            self.B_driver = learning_updates[self.B_driver]
+            self.make_B_and_w()
+
+            try:
+                em_functional_after = self.em_functional(H = hidden_obs['H'],
+                                                         sigma0 = hidden_obs['sigma0'],
+                                                         Sigma1 = hidden_obs['Sigma1'],
+                                                         stats = updated_stats)
+            finally:
+                self.bias_hid = tmp_bias_hid
+                self.mu = tmp_mu
+                self.alpha = tmp_alpha
+                self.W = tmp_W
+                self.B_driver = tmp_B_driver
+                self.make_B_and_w()
+
+            em_functional_diff = em_functional_after - em_functional_before
+
+            learning_updates[self.em_functional_diff] = em_functional_diff
+
+        return function([X], updates = learning_updates)
+    #
+
+    def censor_updates(self, updates):
+
+        if self.disable_W_update and self.W in updates:
+            del updates[self.W]
+
+        if self.alpha in updates:
+            updates[self.alpha] = T.clip(updates[self.alpha],self.min_alpha,self.max_alpha)
+
+        if self.mu in updates:
+            updates[self.mu] = T.clip(updates[self.mu],self.min_mu,self.max_mu)
+
+        if self.B_driver in updates:
+            updates[self.B_driver] = T.clip(updates[self.B_driver],self.min_B,self.max_B)
+
+        if self.bias_hid in updates:
+            updates[self.bias_hid] = T.clip(updates[self.bias_hid],self.min_bias_hid,self.max_bias_hid)
+
+        if self.hard_max_step is not None:
+            for param in updates:
+                updates[param] = T.clip(updates[param],param-self.hard_max_step,param+self.hard_max_step)
+
+    @classmethod
+    def log_likelihood_vhs_needed_stats(cls):
+        h = S3C.log_likelihood_h_needed_stats()
+        s = S3C.log_likelihood_s_given_h_needed_stats()
+        v = S3C.log_likelihood_v_given_hs_needed_stats()
+
+        union = h.union(s).union(v)
+
+        return union
+
+    def log_likelihood_vhs(self, stats):
+
+        log_likelihood_v_given_hs = self.log_likelihood_v_given_hs(stats)
+        #log_likelihood_v_given_hs = Print('log_likelihood_v_given_hs')(log_likelihood_v_given_hs)
+        log_likelihood_s_given_h = self.log_likelihood_s_given_h(stats)
+        #log_likelihood_s_given_h = Print('log_likelihood_s_given_h')(log_likelihood_s_given_h)
+        log_likelihood_h = self.log_likelihood_h(stats)
+        #log_likelihood_h = Print('log_likelihood_h')(log_likelihood_h)
+
+        rval = log_likelihood_v_given_hs + log_likelihood_s_given_h + log_likelihood_h
+
+        assert len(rval.type.broadcastable) == 0
+
+        return rval
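+    #log_likelihood_vhs above implements the factorization
+    #  log P(v,h,s) = log P(v|h,s) + log P(s|h) + log P(h),
+    #with each term computed in expectation under Q from the sufficient
+    #statistics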
+
+    def log_likelihood_vhsu(self, stats):
+
+        Z_b_term = - T.nnet.softplus(self.bias_hid).sum()
+        Z_alpha_term = 0.5 * T.log(self.alpha).sum()
+
+        N = np.cast[config.floatX]( self.nhid )
+        D = np.cast[config.floatX]( self.nvis )
+        half = np.cast[config.floatX]( 0.5 )
+        one = np.cast[config.floatX](1.)
+        two = np.cast[config.floatX](2.)
+        four = np.cast[config.floatX](4.)
+        pi = np.cast[config.floatX](np.pi)
+
+        Z_B_term = half * (np.square(N) + one) * T.log(self.B).sum()
+
+        Z_constant_term = - half * (N+D)*np.log(two*pi) - half * np.square(N)*D*np.log(four*pi)
+
+        negative_log_Z = Z_b_term + Z_alpha_term + Z_B_term + Z_constant_term
+        negative_log_Z.name = 'negative_log_Z'
+        assert len(negative_log_Z.type.broadcastable) == 0
+
+        u_stat_1 = stats.d['u_stat_1']
+
+        first_term = half * T.dot(self.B, (self.W.T * u_stat_1).sum(axis=0) )
+
+        assert len(first_term.type.broadcastable) == 0
+
+        mean_hsv = stats.d['mean_hsv']
+
+        second_term = T.sum(self.B * T.sum(self.W.T * mean_hsv,axis=0), axis=0)
+
+        assert len(second_term.type.broadcastable) == 0
+
+        mean_sq_hs = stats.d['mean_sq_hs']
+        third_term = - half * N * T.dot(self.B, T.dot(T.sqr(self.W),mean_sq_hs))
+
+        mean_hs = stats.d['mean_hs']
+
+        fourth_term = T.dot(self.mu, self.alpha * mean_hs)
+
+        mean_sq_v = stats.d['mean_sq_v']
+
+        fifth_term = - half * T.dot(self.B, mean_sq_v)
+
+        mean_sq_s = stats.d['mean_sq_s']
+
+        sixth_term = - half * T.dot(self.alpha, mean_sq_s)
+
+        mean_h = stats.d['mean_h']
+
+        seventh_term = T.dot(self.bias_hid, mean_h)
+
+        eighth_term = - half * T.dot(mean_h, self.alpha * T.sqr(self.mu))
+
+        u_stat_2 = stats.d['u_stat_2']
+
+        ninth_term = - (one / four ) * T.dot( self.B, u_stat_2)
+
+        ne_first_quarter = first_term + second_term
+        assert len(ne_first_quarter.type.broadcastable) == 0
+
+        ne_second_quarter = third_term + fourth_term
+        assert len(ne_second_quarter.type.broadcastable) == 0
+
+        ne_first_half = ne_first_quarter + ne_second_quarter
+        assert len(ne_first_half.type.broadcastable) == 0
+
+        ne_second_half = fifth_term + sixth_term + seventh_term + eighth_term + ninth_term
+        assert len(ne_second_half.type.broadcastable) == 0
+
+        negative_energy = ne_first_half + ne_second_half
+        negative_energy.name = 'negative_energy'
+        assert len(negative_energy.type.broadcastable) == 0
+
+        rval = negative_energy + negative_log_Z
+        assert len(rval.type.broadcastable) == 0
+        rval.name = 'log_likelihood_vhsu'
+
+        return rval
+
+    @classmethod
+    def log_likelihood_vhsu_needed_stats(cls):
+        #the statistics read by log_likelihood_vhsu above; the VHSU M-steps
+        #request this set
+        return set(['u_stat_1',
+                    'u_stat_2',
+                    'mean_hsv',
+                    'mean_sq_hs',
+                    'mean_hs',
+                    'mean_sq_v',
+                    'mean_sq_s',
+                    'mean_h'])
+
+    def log_likelihood_u_given_hs(self, stats):
+        """Note: drops some constant terms """
+        #note: mean_D_sq_mean_Q_hs is currently commented out of the stats
+        #dict, so this method is not usable as-is
+
+        NH = np.cast[config.floatX](self.nhid)
+
+        mean_sq_hs = stats.d['mean_sq_hs']
+        cov_hs = stats.d['cov_hs']
+        mean_D_sq_mean_Q_hs = stats.d['mean_D_sq_mean_Q_hs']
+
+        term1 = 0.5 * T.sqr(NH) * T.sum(T.log(self.B))
+        #term1 = Print('term1')(term1)
+        term2 = 0.5 * (NH + 1) * T.dot(self.B,T.dot(self.W,mean_sq_hs))
+        #term2 = Print('term2')(term2)
+        term3 = - (self.B * ( cov_hs.dimshuffle('x',0,1) * self.W.dimshuffle(0,1,'x') *
+                              self.W.dimshuffle(0,'x',1)).sum(axis=(1,2))).sum()
+        #term3 = Print('term3')(term3)
+        a = T.dot(T.sqr(self.W), mean_D_sq_mean_Q_hs)
+        term4 = -0.5 * T.dot(self.B, a)
+        #term4 = Print('term4')(term4)
+
+        rval = term1 + term2 + term3 + term4
+
+        return rval
+
+    @classmethod
+    def log_likelihood_v_given_hs_needed_stats(cls):
+        return set(['mean_sq_v','mean_hsv','cov_hs'])
+
+    def log_likelihood_v_given_hs(self, stats):
+        """
+        E_v,h,s \sim Q log P( v | h, s)
+        = E_v,h,s \sim Q log sqrt(B/2 pi) exp( - 0.5 B (v - W[v,:] (h*s) )^2)
+        = E_v,h,s \sim Q 0.5 log B - 0.5 log 2 pi - 0.5 B v^2 + v B W[v,:] (h*s) - 0.5 B sum_i sum_j W[v,i] W[v,j] h_i s_i h_j s_j
+        = 0.5 log B - 0.5 log 2 pi - 0.5 B mean_sq_v + B W[v,:] mean_hsv - 0.5 B sum_i,j W[v,i] W[v,j] cov_hs[i,j]
+        """
+
+        half = as_floatX(0.5)
+        two = as_floatX(2.)
+        pi = as_floatX(np.pi)
+        N = as_floatX(self.nhid)
+
+        mean_sq_v = stats.d['mean_sq_v']
+        mean_hsv = stats.d['mean_hsv']
+        cov_hs = stats.d['cov_hs']
+
+        term1 = half * T.sum(T.log(self.B))
+        term2 = - half * N * T.log(two * pi)
+        term3 = - half * T.dot(self.B, mean_sq_v)
+        term4 = T.dot(self.B , (self.W * mean_hsv.T).sum(axis=1))
+        term5 = - half * T.dot(self.B, ( cov_hs.dimshuffle('x',0,1) * self.W.dimshuffle(0,1,'x') *
+                                         self.W.dimshuffle(0,'x',1)).sum(axis=(1,2)))
+
+        rval = term1 + term2 + term3 + term4 + term5
+
+        assert len(rval.type.broadcastable) == 0
+
+        return rval
+
+    @classmethod
+    def log_likelihood_s_given_h_needed_stats(cls):
+        return set(['mean_h','mean_hs','mean_sq_s'])
+
+    def log_likelihood_s_given_h(self, stats):
+        """
+        E_h,s \sim Q log P(s|h)
+        = E_h,s \sim Q log sqrt( alpha / 2 pi) exp( - 0.5 alpha (s - mu h)^2 )
+        = E_h,s \sim Q 0.5 log alpha - 0.5 log 2 pi - 0.5 alpha s^2 + alpha s mu h - 0.5 alpha mu^2 h^2
+        = E_h,s \sim Q 0.5 log alpha - 0.5 log 2 pi - 0.5 alpha s^2 + alpha mu h s - 0.5 alpha mu^2 h   (h^2 = h for binary h)
+        = 0.5 log alpha - 0.5 log 2 pi - 0.5 alpha mean_sq_s + alpha mu mean_hs - 0.5 alpha mu^2 mean_h
+        """
+
+        mean_h = stats.d['mean_h']
+        mean_sq_s = stats.d['mean_sq_s']
+        mean_hs = stats.d['mean_hs']
+
+        half = as_floatX(0.5)
+        two = as_floatX(2.)
+        N = as_floatX(self.nhid)
+        pi = as_floatX(np.pi)
+
+        term1 = half * T.log( self.alpha ).sum()
+        term2 = - half * N * T.log(two*pi)
+        term3 = - half * T.dot( self.alpha , mean_sq_s )
+        term4 = T.dot(self.mu*self.alpha,mean_hs)
+        term5 = - half * T.dot(T.sqr(self.mu), self.alpha * mean_h)
+
+        rval = term1 + term2 + term3 + term4 + term5
+
+        assert len(rval.type.broadcastable) == 0
+
+        return rval
+
+    @classmethod
+    def log_likelihood_h_needed_stats(cls):
+        return set(['mean_h'])
+
+    def log_likelihood_h(self, stats):
+        """ Returns the expected log probability of the vector h
+            under the model when the data is drawn according to
+            stats:
+
+            E_h \sim Q log P(h)
+            = E_h \sim Q log [ exp(b h) / (1 + exp(b)) ]
+            = E_h \sim Q b h - softplus(b)
+        """
+
+        mean_h = stats.d['mean_h']
+
+        term1 = T.dot(self.bias_hid, mean_h)
+        term2 = - T.nnet.softplus(self.bias_hid).sum()
+
+        rval = term1 + term2
+
+        assert len(rval.type.broadcastable) == 0
+
+        return rval
+
+    def make_B_and_w(self):
+        if self.tied_B:
+            #can't just use a dimshuffle; dot products involving B won't work
+            self.B = self.B_driver + as_floatX(np.zeros(self.nvis))
+        else:
+            self.B = self.B_driver
+
+        self.w = T.dot(self.B, T.sqr(self.W))
+
+    def redo_theano(self):
+        init_names = dir(self)
+
+        self.make_B_and_w()
+
+        X = T.matrix()
+        X.tag.test_value = np.cast[config.floatX](self.rng.randn(self.test_batch_size,self.nvis))
+
+        if self.learn_after is not None:
+            self.learn_func = self.make_learn_func(X, learn = True )
+            self.accum_func = self.make_learn_func(X, learn = False )
+        else:
+            self.learn_func = self.make_learn_func(X)
+        #
+
+        final_names = dir(self)
+
+        self.register_names_to_del([name for name in final_names if name not in init_names])
+    #
+
+    def learn(self, dataset, batch_size):
+        self.learn_mini_batch(dataset.get_batch_design(batch_size))
+    #
+
+    def learn_mini_batch(self, X):
+
+        if hasattr(self, 'cached_X'):
+            assert np.all(X == self.cached_X)
+        self.cached_X = X.copy()
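+        #this check requires that every call sees the identical batch, the
+        #regime recycle_q assumes; it is a debugging aid for this bug-report
+        #snapshot rather than something a real training run would want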
+ print "diff of thing you're monitoring: " + print self.em_functional_diff.get_value() + #quit(-1) + + + """cov_hs = self.suff_stat_holder.d['cov_hs'].get_value(borrow=True) + a,b = np.linalg.eigh(cov_hs) + + assert not np.any(np.isnan(a)) + assert not np.any(np.isinf(a)) + print 'minimum eigenvalue: '+str(a.min()) + assert a.min() >= 0""" + + if self.monitor.examples_seen % 10000 == 0: + + print "" + b = self.bias_hid.get_value(borrow=True) + assert not np.any(np.isnan(b)) + p = 1./(1.+np.exp(-b)) + print 'p: ',(p.min(),p.mean(),p.max()) + B = self.B_driver.get_value(borrow=True) + assert not np.any(np.isnan(B)) + print 'B: ',(B.min(),B.mean(),B.max()) + mu = self.mu.get_value(borrow=True) + assert not np.any(np.isnan(mu)) + print 'mu: ',(mu.min(),mu.mean(),mu.max()) + alpha = self.alpha.get_value(borrow=True) + assert not np.any(np.isnan(alpha)) + print 'alpha: ',(alpha.min(),alpha.mean(),alpha.max()) + W = self.W.get_value(borrow=True) + assert not np.any(np.isnan(W)) + print 'W: ',(W.min(),W.mean(),W.max()) + # + + def get_weights_format(self): + return ['v','h'] + +class E_step(object): + def __init__(self): + self.model = None + + def get_monitoring_channels(self, V, model): + return {} + + def register_model(self, model): + self.model = model + + def mean_field(self, V): + raise NotImplementedError() + +class VHS_E_Step(E_step): + """ A variational E_step that works by running damped mean field + on the original model + + All variables are updated simultaneously, in parallel. The + spike variables are updated with a fixed damping. The slab + variables are updated with a unit-specific damping designed + to ensure stability. + + The update equations were derived based on updating h_i and + s_i simultaneously, but not updating all units simultaneously. + + The updates are not valid for updating h_i without also updating + h_i (i.e., doing this could increase the KL divergence). + + They are also not valid for updating all units simulataneously, + but we do this anyway. 
+ + """ + + def truncated_KL(self, V, model, obs): + """ KL divergence between variation and true posterior, dropping terms that don't + depend on the mean field parameters """ + + H = obs['H'] + sigma0 = obs['sigma0'] + Sigma1 = obs['Sigma1'] + Mu1 = obs['Mu1'] + mu0 = obs['mu0'] + + entropy_term = - model.entropy_hs(H = H, sigma0 = sigma0, Sigma1 = Sigma1) + energy_term = model.expected_energy_vhs(V, H, mu0, Mu1, sigma0, Sigma1) + + KL = entropy_term + energy_term + + return KL + + def em_functional(self, V, model, obs): + """ Return value is a scalar """ + + needed_stats = S3C.log_likelihood_vhs_needed_stats() + + stats = SufficientStatistics.from_observations( needed_stats = needed_stats, + X = V, ** obs ) + + H = obs['H'] + sigma0 = obs['sigma0'] + Sigma1 = obs['Sigma1'] + + entropy_term = (model.entropy_hs(H = H, sigma0 = sigma0, Sigma1 = Sigma1)).mean() + likelihood_term = model.log_likelihood_vhs(stats) + + em_functional = entropy_term + likelihood_term + + return em_functional + + + def get_monitoring_channels(self, V, model): + + rval = {} + + if self.monitor_kl or self.monitor_em_functional: + obs_history = self.mean_field(V, return_history = True) + + for i in xrange(1, 2 + len(self.h_new_coeff_schedule)): + obs = obs_history[i-1] + if self.monitor_kl: + rval['trunc_KL_'+str(i)] = self.truncated_KL(V, model, obs).mean() + if self.monitor_em_functional: + rval['em_functional_'+str(i)] = self.em_functional(V, model, obs).mean() + + return rval + + + def __init__(self, h_new_coeff_schedule, monitor_kl = False, monitor_em_functional = False): + """Parameters + -------------- + h_new_coeff_schedule: + list of coefficients to put on the new value of h on each damped mean field step + (coefficients on s are driven by a special formula) + length of this list determines the number of mean field steps + """ + + self.h_new_coeff_schedule = h_new_coeff_schedule + self.monitor_kl = monitor_kl + self.monitor_em_functional = monitor_em_functional + + super(VHS_E_Step, self).__init__() + + def init_mf_H(self, V): + if self.model.recycle_q: + rval = self.model.prev_H + else: + #just use the prior + value = T.nnet.sigmoid(self.model.bias_hid) + rval = T.alloc(value, V.shape[0], value.shape[0]) + + return rval + + def init_mf_Mu1(self, V): + if self.model.recycle_q: + rval = self.model.prev_Mu1 + else: + #just use the prior + value = self.model.mu + if config.compute_test_value != 'off': + assert value.tag.test_value != None + rval = T.alloc(value, V.shape[0], value.shape[0]) + + return rval + + + + def mean_field_A(self, V, H, Mu1): + + mu = self.model.mu + alpha = self.model.alpha + W = self.model.W + B = self.model.B + w = self.model.w + + BW = B.dimshuffle(0,'x') * W + + HS = H * Mu1 + + mean_term = mu * alpha + + data_term = T.dot(V, BW) + + iterm_part_1 = - T.dot(T.dot(HS, W.T), BW) + iterm_part_2 = w * HS + + interaction_term = iterm_part_1 + iterm_part_2 + + A = mean_term + data_term + interaction_term + + V_name = make_name(V, 'anon_V') + H_name = make_name(H, 'anon_H') + Mu1_name = make_name(Mu1, 'anon_Mu1') + + A.name = 'mean_field_A( %s, %s, %s ) ' % ( V_name, H_name, Mu1_name) + + return A + + def mean_field_Mu1(self, A): + + alpha = self.model.alpha + w = self.model.w + + denom = alpha + w + + Mu1 = A / denom + + A_name = make_name(A, 'anon_A') + + Mu1.name = 'mean_field_Mu1(%s)'%A_name + + return Mu1 + + def mean_field_Sigma1(self): + """TODO: this is a bad name, since in the univariate case we would + call this sigma^2 + I think what I was going for was covariance matrix Sigma 
+
+    def mean_field_Sigma1(self):
+        """TODO: this is a bad name, since in the univariate case we would
+           call this sigma^2.
+           I think what I was going for was a covariance matrix Sigma
+           constrained to be diagonal, but it is still confusing """
+
+        rval = 1./ (self.model.alpha + self.model.w )
+
+        rval.name = 'mean_field_Sigma1'
+
+        return rval
+
+    def mean_field_H(self, A):
+
+        half = as_floatX(.5)
+        alpha = self.model.alpha
+        w = self.model.w
+
+        term1 = half * T.sqr(A) / (alpha + w)
+
+        term2 = self.model.bias_hid
+
+        term3 = - half * T.sqr(self.model.mu) * self.model.alpha
+
+        term4 = - half * T.log(self.model.alpha + self.model.w)
+
+        term5 = half * T.log(self.model.alpha)
+
+        arg_to_sigmoid = term1 + term2 + term3 + term4 + term5
+
+        H = T.nnet.sigmoid(arg_to_sigmoid)
+
+        A_name = make_name(A, 'anon_A')
+
+        H.name = 'mean_field_H('+A_name+')'
+
+        return H
+
+    def damp_H(self, H, new_H, new_coeff):
+        return new_coeff * new_H + (1. - new_coeff) * H
+
+    def damp_Mu1(self, Mu1, new_Mu1):
+        rho = 0.5
+        ceiling = 1000.
+
+        positives = Mu1 > 0
+        non_positives = 1. - positives
+        negatives = Mu1 < 0
+        non_negatives = 1. - negatives
+
+        rval = T.clip(new_Mu1, - rho * positives * Mu1 - non_positives * ceiling, non_negatives * ceiling - rho * negatives * Mu1 )
+
+        return rval
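+    #damp_Mu1 is a sign-aware trust region rather than a convex combination:
+    #when Mu1[i] > 0 the new value is clipped to [-rho * Mu1[i], ceiling], and
+    #symmetrically when Mu1[i] < 0, so one update can shrink a unit or flip its
+    #sign, but never overshoot past rho times the old magnitude in the
+    #opposite direction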
+
+    def mean_field(self, V, return_history = False):
+        """
+        return_history: if True:
+                            returns a list of dictionaries showing the
+                            history of the mean field parameters
+                            throughout the fixed point updates
+                        if False:
+                            returns a dictionary containing the final
+                            mean field parameters
+        """
+
+        alpha = self.model.alpha
+
+        sigma0 = 1. / alpha
+        Sigma1 = self.mean_field_Sigma1()
+        mu0 = T.zeros_like(sigma0)
+
+        H = self.init_mf_H(V)
+        Mu1 = self.init_mf_Mu1(V)
+
+        def make_dict():
+
+            return {
+                    'H' : H,
+                    'mu0' : mu0,
+                    'Mu1' : Mu1,
+                    'sigma0' : sigma0,
+                    'Sigma1' : Sigma1,
+                    }
+
+        history = [ make_dict() ]
+
+        for new_coeff in self.h_new_coeff_schedule:
+
+            A = self.mean_field_A(V = V, H = H, Mu1 = Mu1)
+            new_Mu1 = self.mean_field_Mu1(A = A)
+            new_H = self.mean_field_H(A = A)
+
+            H = self.damp_H(H = H, new_H = new_H, new_coeff = new_coeff)
+            Mu1 = self.damp_Mu1(Mu1 = Mu1, new_Mu1 = new_Mu1)
+
+            history.append(make_dict())
+
+        if return_history:
+            return history
+        else:
+            return history[-1]
+
+
+class VHSU_E_Step(E_step):
+    """ A variational E-step that works by running mean field on
+        the auxiliary variable model """
+
+    def __init__(self, N_schedule):
+        """
+        parameters:
+            N_schedule: list of values to use for N throughout the mean field updates.
+                        len(N_schedule) determines the # of mean field steps
+        """
+        self.N_schedule = N_schedule
+
+        super(VHSU_E_Step, self).__init__()
+
+    def init_mf_Mu1(self, V):
+        #Mu1 = (self.alpha*self.mu + T.dot(V*self.B,self.W))/(self.alpha+self.w)
+        #Just use the prior
+        Mu1 = self.model.mu.dimshuffle('x',0)
+        assert Mu1.tag.test_value is not None
+
+        Mu1.name = "init_mf_Mu1"
+
+        return Mu1
+    #
+
+    def mean_field_H(self, U, V, NH):
+
+        BW = self.model.W * (self.model.B.dimshuffle(0,'x'))
+
+        filt = T.dot(V,BW)
+
+        u_contrib = (U * BW.dimshuffle('x',1,0)).sum(axis=2)
+
+        pre_sq = filt - u_contrib + self.model.alpha * self.model.mu
+
+        sq_term = T.sqr(pre_sq)
+
+        beta = self.model.alpha + NH * self.model.w
+
+        log_term = T.log(1.0 + NH * self.model.w / self.model.alpha )
+
+        H = T.nnet.sigmoid(-self.h_coeff() + 0.5 * sq_term / beta - 0.5 * log_term )
+
+        H.name = "mean_field_H"
+
+        return H
+    #
+
+    def mean_field_Mu1(self, U, V, NH):
+
+        beta = self.model.alpha + NH * self.model.w
+
+        BW = self.model.W * self.model.B.dimshuffle(0,'x')
+
+        filt = T.dot(V,BW)
+
+        u_mod = - (U * BW.dimshuffle('x',1,0)).sum(axis=2)
+
+        Mu1 = (filt + u_mod + self.model.alpha * self.model.mu) / beta
+
+        Mu1.name = "mean_field_Mu1"
+
+        return Mu1
+    #
+
+    def mean_field_Sigma1(self, NH):
+        Sigma1 = 1./(self.model.alpha + NH * self.model.w)
+
+        Sigma1.name = "mean_field_Sigma1"
+
+        return Sigma1
+    #
+
+    def mean_field(self, V):
+        alpha = self.model.alpha
+
+        sigma0 = 1. / alpha
+        mu0 = T.zeros_like(sigma0)
+
+        H = self.init_mf_H(V)
+        Mu1 = self.init_mf_Mu1(V)
+
+        for NH in self.N_schedule:
+            U = self.mean_field_U (H = H, Mu1 = Mu1, NH = NH)
+            H = self.mean_field_H (U = U, V = V, NH = NH)
+            Mu1 = self.mean_field_Mu1(U = U, V = V, NH = NH)
+
+        Sigma1 = self.mean_field_Sigma1(NH = np.cast[config.floatX](self.model.nhid))
+
+        return {
+                'H' : H,
+                'mu0' : mu0,
+                'Mu1' : Mu1,
+                'sigma0' : sigma0,
+                'Sigma1' : Sigma1,
+                'U' : U
+                }
+    #
+
+    def mean_field_U(self, H, Mu1, NH):
+
+        W = self.model.W
+
+        prod = Mu1 * H
+
+        first_term = T.dot(prod, W.T)
+        first_term_broadcast = first_term.dimshuffle(0,'x',1)
+
+        W_broadcast = W.dimshuffle('x',1,0)
+        prod_broadcast = prod.dimshuffle(0,1,'x')
+
+        second_term = NH * W_broadcast * prod_broadcast
+
+        U = first_term_broadcast - second_term
+
+        U.name = "mean_field_U"
+
+        return U
+    #
+
+    def h_coeff(self):
+        """ Returns the coefficient on h in the energy function """
+        return - self.model.bias_hid + 0.5 * T.sqr(self.model.mu) * self.model.alpha
+
+    def init_mf_H(self,V):
+        nhid = self.model.nhid
+        w = self.model.w
+        alpha = self.model.alpha
+        mu = self.model.mu
+        W = self.model.W
+        B = self.model.B
+
+        NH = np.cast[config.floatX]( nhid )
+        arg_to_log = 1.+(1./alpha) * NH * w
+
+        hid_vec = alpha * mu
+        #assert (hasattr(V,'__array__') or (V.tag.test_value is not None))
+        dotty_thing = T.dot(V*B, W)
+        pre_sq = hid_vec + dotty_thing
+        numer = T.sqr(pre_sq)
+        denom = alpha + w
+        frac = numer/ denom
+
+        first_term = 0.5 * frac
+
+        H = T.nnet.sigmoid( first_term - self.h_coeff() - 0.5 * T.log(arg_to_log) )
+
+        #just use the prior (note: this overrides the computation above)
+        H = T.nnet.sigmoid( self.model.bias_hid )
+
+        #H = Print('init_mf_H')(H)
+
+        return H
+    #
+
+
+class M_Step(object):
+
+    def needed_stats(self):
+        """ Return a set of string names of the sufficient statistics that will be needed.
+            TODO: do this automatically instead of requiring it to be hard-coded """
+        raise NotImplementedError()
+
+    def get_updates(self, model, stats):
+        raise NotImplementedError()
+
+    def get_monitoring_channels(self, V, model):
+        return {}
+
+class VHS_M_Step(M_Step):
+    """ An M-step based on learning using the distribution
+        over V, H, and S.
+
+        In conjunction with VHS_E_Step this does variational
+        EM in the original model. (Haven't run this yet as of
+        the time of writing this comment)
+
+        In conjunction with VHSU_E_Step: we have no theoretical
+        justification for this. In experiments on CIFAR it learns
+        a mixture of gabors and dead filters.
+    """
+
+    def get_monitoring_channels(self, V, model):
+
+        hid_observations = model.e_step.mean_field(V)
+
+        stats = SufficientStatistics.from_observations(needed_stats = S3C.log_likelihood_vhs_needed_stats(),
+                X = V, **hid_observations)
+
+        obj = model.log_likelihood_vhs(stats)
+
+        return { 'log_likelihood_vhs' : obj }
+
+class VHSU_M_Step(M_Step):
+    """ An M-step based on learning using the distribution over
+        V, H, S, and U -- i.e. good old-fashioned, theoretically
+        justified EM
+
+        This M-step has been unit tested and seems to work correctly
+        in unit tests. It has not been shown to work well in learning
+        experiments. That could mean the auxiliary variables are a bad
+        idea, or it could mean something is wrong with the VHSU E-step.
+    """
+
+    def get_monitoring_channels(self, V, model):
+
+        hidden_obs = model.e_step.mean_field(V)
+
+        stats = SufficientStatistics.from_observations(needed_stats = S3C.log_likelihood_vhsu_needed_stats(), X = V, \
+                N = np.cast[config.floatX](model.nhid),
+                B = model.B,
+                W = model.W,
+                **hidden_obs)
+
+        obj = model.log_likelihood_vhsu(stats)
+
+        return { 'log_likelihood_vhsu' : obj }
+
+
+def take_step(model, W, bias_hid, alpha, mu, B, new_coeff):
+    """
+    Returns a dictionary of learning updates of the form
+        model.param := new_coeff * new_param + (1-new_coeff) * model.param
+    """
+
+    new_coeff = np.cast[config.floatX](new_coeff)
+
+    def step(old, new):
+        if new_coeff == 1.0:
+            return new
+        else:
+            rval = new_coeff * new + (np.cast[config.floatX](1.)-new_coeff) * old
+
+            assert rval.dtype == config.floatX
+
+            return rval
+
+    learning_updates = \
+            {
+                model.W: step(model.W, W),
+                model.bias_hid: step(model.bias_hid,bias_hid),
+                model.alpha: step(model.alpha, alpha),
+                model.mu: step(model.mu, mu),
+                model.B_driver: step(model.B_driver, B)
+            }
+
+    return learning_updates
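+
+#With new_coeff < 1, take_step is an exponential moving average toward the
+#newly solved parameters: for example, new_coeff = 0.1 moves each parameter
+#10% of the way from its current value to the closed-form solution on every
+#M-step.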
+
+class VHS_Solve_M_Step(VHS_M_Step):
+
+    def __init__(self, new_coeff):
+        self.new_coeff = np.cast[config.floatX](float(new_coeff))
+
+    def needed_stats(self):
+        return S3C.solve_vhs_needed_stats()
+
+    def get_updates(self, model, stats):
+
+        W, bias_hid, alpha, mu, B = model.solve_vhs_from_stats(stats)
+
+        learning_updates = take_step(model, W, bias_hid, alpha, mu, B, self.new_coeff)
+
+        return learning_updates
+
+class VHSU_Solve_M_Step(VHSU_M_Step):
+
+    def __init__(self, new_coeff):
+        self.new_coeff = np.cast[config.floatX](float(new_coeff))
+
+    def needed_stats(self):
+        return S3C.solve_vhsu_needed_stats()
+
+    def get_updates(self, model, stats):
+
+        W, bias_hid, alpha, mu, B = model.solve_vhsu_from_stats(stats)
+
+        learning_updates = take_step(model, W, bias_hid, alpha, mu, B, self.new_coeff)
+
+        return learning_updates
+
+class VHS_Grad_M_Step(VHS_M_Step):
+
+    def __init__(self, learning_rate):
+        self.learning_rate = np.cast[config.floatX](float(learning_rate))
+
+    def get_updates(self, model, stats):
+
+        params = model.get_params()
+
+        obj = model.log_likelihood_vhs(stats)
+
+        grads = T.grad(obj, params, consider_constant = stats.d.values())
+
+        updates = {}
+
+        for param, grad in zip(params, grads):
+            updates[param] = param + self.learning_rate * grad
+
+        return updates
+
+    def needed_stats(self):
+        return S3C.log_likelihood_vhs_needed_stats()
+
+class VHSU_Grad_M_Step(VHSU_M_Step):
+
+    def __init__(self, learning_rate):
+        self.learning_rate = np.cast[config.floatX](float(learning_rate))
+
+    def get_updates(self, model, stats):
+
+        params = model.get_params()
+
+        obj = model.log_likelihood_vhsu(stats)
+
+        grads = T.grad(obj, params, consider_constant = stats.d.values())
+
+        updates = {}
+
+        for param, grad in zip(params, grads):
+            #if param is model.W:
+            #    grad = Print('grad_W',attrs=['min','mean','max'])(grad)
+
+            updates[param] = param + self.learning_rate * grad
+
+        return updates
+
+    def needed_stats(self):
+        return S3C.log_likelihood_vhsu_needed_stats()
+
diff --git a/s3c/paper/hist_h.py b/s3c/paper/hist_h.py
new file mode 100644
index 00000000..9d0cb661
--- /dev/null
+++ b/s3c/paper/hist_h.py
@@ -0,0 +1,64 @@
+import sys
+import theano.tensor as T
+from pylearn2.utils import serial
+
+model_path = 'rpla_p5_interm.pkl'
+
+shuffle = False
+if len(sys.argv) > 2:
+    assert sys.argv[2] == '--shuffle'
+    #the shuffle option gets random examples by skipping model.nhid examples ahead at the start;
+    #this way if using a random patches model we don't see the random patches that the
+    #model uses as weights
+    shuffle = True
+
+model = serial.load(model_path)
+
+model.make_Bwp()
+
+stl10 = model.dataset_yaml_src.find('stl10') != -1
+
+if not stl10:
+    raise NotImplementedError("Doesn't support CIFAR10 yet")
+
+if stl10:
+    dataset = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_patches/data.pkl")
+
+if shuffle:
+    dataset.get_batch_design(model.nhid)
+
+V_var = T.matrix()
+
+mean_field = model.e_step.mean_field(V = V_var)
+
+feature_type = 'exp_h'
+
+if feature_type == 'exp_h':
+    outputs = mean_field['H']
+else:
+    raise NotImplementedError()
+
+from theano import function
+f = function([V_var], outputs = outputs)
+
+import matplotlib.pyplot as plt
+
+V = dataset.get_batch_design(100)
+y = f(V)
+
+y = y.reshape( y.shape[0] * y.shape[1])
+
+plt.hist(y, bins = 20, log = True)
+
+#ax = plt.gca()
+#ax.set_yscale('symlog' )
+
+plt.title('Distribution of $\mathbb{E}[h_i] $')
+plt.xlabel('$ \mathbb{E}[h_i] $')
+plt.ylabel('log number of occurrences')
+
+plt.show()
+
+#fraction of units that are essentially off
+print (y < .01).mean()
diff --git a/s3c/paper/learning_curve.py b/s3c/paper/learning_curve.py
index b38ddc07..f5ab0880 100644
--- a/s3c/paper/learning_curve.py
+++ b/s3c/paper/learning_curve.py
@@ -27,7 +27,7 @@ plt.plot(train_size,s3c_acc-s3c_confidence, color="green", linestyle="dashed")
 plt.plot(train_size,s3c_acc+s3c_confidence, color="green", linestyle="dashed")
 
-plt.xlabel('Labeled Training Examples')
+plt.xlabel('Labeled Training Examples Per Class')
 plt.ylabel('Test Set Accuracy')
 
 plt.title('CIFAR-10 Learning Curve')
 
diff --git a/s3c/paper/plot_em_functional.py b/s3c/paper/plot_em_functional.py
new file mode 100644
index 00000000..4688bd69
--- /dev/null
+++ b/s3c/paper/plot_em_functional.py
@@ -0,0 +1,47 @@
+import sys
+import numpy as np
+
+model_path = 'H.pkl'
+
+if len(sys.argv) > 2:
+    tidx = int(sys.argv[2])
+else:
+    tidx = -1
+
+from pylearn2.utils.serial import load
+
+model = load(model_path)
+
+monitor = model.monitor
+
+em_functional_channels = {}
+
+for key in monitor.channels:
+    if key.startswith('em_functional_'):
+        em_functional_channels[key] = monitor.channels[key]
+
+vals = np.zeros(len(em_functional_channels.keys()))
+
+for key in em_functional_channels:
+    pieces = key.split('_')
+    assert len(pieces) == 3
+    idx = int(pieces[2]) - 1
+
+    val = em_functional_channels[key].val_record[tidx]
+
+    vals[idx] = val
+
+if len(vals) == 0:
+    print 'this model did not use monitoring of the em functional across the e step'
+    quit(-1)
+
+from matplotlib import pyplot as plt
+
+plt.plot(vals)
+
+plt.title('Inference by Optimization')
+plt.ylabel('Energy Functional')
+plt.xlabel('Damped parallel fixed point updates')
+
+plt.show()
diff --git a/s3c/paper/plot_inference.py b/s3c/paper/plot_inference.py
new file mode 100644
index 00000000..9dfc4428
--- /dev/null
+++ b/s3c/paper/plot_inference.py
@@ -0,0 +1,48 @@
+import theano.tensor as T
+from pylearn2.utils import serial
+
+model_path = 'rpla_p5_interm.pkl'
+
+model = serial.load(model_path)
+
+model.make_Bwp()
+
+stl10 = model.dataset_yaml_src.find('stl10') != -1
+
+if not stl10:
+    raise NotImplementedError("Doesn't support CIFAR10 yet")
+
+if stl10:
+    dataset = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_patches/data.pkl")
+
+V_var = T.matrix()
+
+history = model.e_step.mean_field(V = V_var, return_history = True)
+
+feature_type = 'exp_h'
+
+if feature_type == 'exp_h':
+    outputs = [ hist_elem['H'].mean() for hist_elem in history ]
+else:
+    raise NotImplementedError()
+
+from theano import function
+f = function([V_var], outputs = outputs)
+
+import matplotlib.pyplot as plt
+
+V = dataset.get_batch_design(1)
+y = f(V)
+
+plt.plot(y)
+
+ax = plt.gca()
+
+plt.title('Sparsification during inference')
+plt.xlabel('Damped parallel fixed point updates')
+plt.ylabel('Mean of $\mathbb{E}_Q [h]$')
+
+plt.show()
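+
+#(a reading note, not from the original script) the curve shows the mean of
+#E_Q[h] over all units for a single example after each damped update; the
+#'Sparsification during inference' title refers to this mean typically
+#decreasing as inference proceeds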