# Accumulate the grad_F_marginal contributions of every sample in the
# minibatch into dw, da and db.
# NOTE(review): the accumulators below hold per-sample sums; nothing in this
# span divides by the batch size -- confirm the averaging happens downstream.
# score = 0.0
dw = be.zeros_like(w)
da = be.zeros_like(a)
db = be.zeros_like(b)
# TODO: vectorize
for v in vdata:
    # score -= self.marginal_free_energy(v)
    # Evaluate the activation once per sample; dw and db both reuse it
    # (previously the same expression was computed twice per iteration).
    hidden = be.expit(be.dot(v, w) + b)
    dw += be.outer(v, hidden)
    da += v
    db += hidden
grad = gu.Gradient(
[None for l in self.layers],
After Change
# Minibatch size, read from the leading axis of vdata.
# NOTE(review): assumes vdata and intermediate are 2-D with samples along
# axis 0 -- confirm against the caller.
batch_size = be.shape(vdata)[0]
# One matrix product of the transposed batch with intermediate is the same as
# \sum_{i} vdata[i] \outer intermediate[i]; dividing by batch_size averages it.
# TODO: is this efficient?
vdata_t = be.transpose(vdata)
dw = be.dot(vdata_t, intermediate) / batch_size
grad = gu.Gradient(
[None for l in self.layers],
[None for w in self.weights]