import numpy as np
import tensorflow as tf


def logprob(logits, samples):
    """
    Calculate the log probability of Bernoulli samples under the constraint that at least one bit is 1.

    Args:
        logits: tensor, ntarget x ncontext
        samples: tensor, ntarget x nsample x ncontext

    Return:
        logprob: tensor, ntarget x nsample, log probability of each sample
    """

    # Shift the logits so that the maximum logit is at least -10.0, and clip so that the minimum logit is at least -25.0.

    # If ((-10) - max_logit) > 0, add ((-10) - max_logit) to all logits.
    # The shifted probability vector has only a negligibly larger probability of producing more than one bit with value one.
    max_logits = tf.reduce_max(logits, axis=-1, keep_dims=True)
    diff = tf.clip_by_value(-10.0 - max_logits, clip_value_min=0.0, clip_value_max=np.inf)
    logits = logits + diff

    # Clip the minimum logit value so that min_mu = sigmoid(min_logit) is not zero.
    # Since the largest logit is at least -10.0, the clipping only negligibly increases the probability of getting
    # value one at these bits.
    logits = tf.clip_by_value(logits, clip_value_min=-25.0, clip_value_max=np.inf)

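    # Numeric illustration (values chosen here, not from the original code): if logits = [-40.0, -43.0, -70.0],
    # the shift adds 30.0 to give [-10.0, -13.0, -40.0], and the clip then raises the last entry to -25.0.
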

    # log(1 - p_i) for every bit
    logp0 = - tf.nn.softplus(logits)

    # log probability without the constraint

    # \sum_i b_i * logit_i
    logits_sum = tf.reduce_sum(samples * tf.expand_dims(logits, 1), axis=-1)  # broadcast over samples

    # \sum_i log(1 - p_i)
    minusa_sum = tf.expand_dims(tf.reduce_sum(logp0, axis=-1), 1)
    # \sum_i [b_i * logit_i + log(1 - p_i)]
    logprob_unc = logits_sum + minusa_sum  # minusa_sum is broadcast over samples

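    # Derivation of the unconstrained term above, with p_i = sigmoid(logit_i):
    #   log p(b) = \sum_i [ b_i * log(p_i) + (1 - b_i) * log(1 - p_i) ]
    #            = \sum_i b_i * (log(p_i) - log(1 - p_i)) + \sum_i log(1 - p_i)
    #            = \sum_i b_i * logit_i + \sum_i log(1 - p_i)
    # which is exactly logits_sum + minusa_sum.
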
    # log probability that at least one bit is 1

    # The exact value is
    # log(1 - exp(\sum_i -log(1 + exp(logit_i)))) = log(1 - \prod_i (1 - p_i))
    accurate = tf.log(1 - tf.exp(tf.reduce_sum(logp0, axis=-1)))

    # When all logits are very negative, it can be approximated as
    # log(\sum_i exp(logit_i))
    approxim = tf.reduce_logsumexp(logits, axis=-1)

    # use the approximation when even the largest logit is very negative
    max_logit = tf.reduce_max(logits, axis=-1)
    logprob_non0 = tf.where(max_logit < -15.0, approxim, accurate)

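    # Why the switch is needed (added reasoning, not in the original comments): when every logit is below -15,
    # each p_i is below ~3e-7, so \prod_i (1 - p_i) is numerically very close to 1 in float32 and the exact
    # formula loses most of its precision or rounds to log(0). In that regime
    # 1 - \prod_i (1 - p_i) ~= \sum_i p_i ~= \sum_i exp(logit_i), so reduce_logsumexp gives a stable estimate.
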
    # constrained log probability: subtract the log normalizer
    logprob_cbs = logprob_unc - tf.expand_dims(logprob_non0, 1)  # expand over samples

    # sanity check: log probabilities should be non-positive (a small slack is allowed for numerical error)
    check_point = tf.assert_less(tf.reduce_mean(logprob_cbs), 0.001,
                                 data=[tf.reduce_mean(logprob_cbs), tf.reduce_mean(logits), tf.reduce_mean(samples)])
    with tf.control_dependencies([check_point]):
        logprob_cbs = tf.identity(logprob_cbs)

    return logprob_cbs


def sample(logits, nsample):
    """
    Sample arrays of Bernoulli random variables under the constraint that at least one bit is 1.

    Args:
        logits: tf float32 array, ntarget x ncontext; the last dimension is treated as a Bernoulli array
        nsample: int, number of samples to draw per target
    Return:
        samples: tf float32 array, ntarget x nsample x ncontext
    """

    # Shift the logits so that the maximum logit is at least -10.0, and clip so that the minimum logit is at least -25.0
    # (same preprocessing as in logprob above).

    # If ((-10) - max_logit) > 0, add ((-10) - max_logit) to all logits.
    # The shifted probability vector has only a negligibly larger probability of producing more than one bit with value one.
    max_logits = tf.reduce_max(logits, axis=-1, keep_dims=True)
    diff = tf.clip_by_value(-10.0 - max_logits, clip_value_min=0.0, clip_value_max=np.inf)
    logits = logits + diff

    # Clip the minimum logit value so that min_mu = sigmoid(min_logit) is not zero.
    # Since the largest logit is at least -10.0, the clipping only negligibly increases the probability of getting
    # value one at these bits.
    logits = tf.clip_by_value(logits, clip_value_min=-25.0, clip_value_max=np.inf)

    # sample Bernoulli bits freely
    logit_shape = tf.shape(logits)
    sample_shape = [logit_shape[0], nsample, logit_shape[1]]

    prob = tf.sigmoid(logits)
    # Bernoulli sampling with uniform dist
    samples = tf.ceil(tf.subtract(tf.expand_dims(prob, 1), tf.random_uniform(sample_shape)))

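    # Note on the line above (added explanation): with u ~ Uniform(0, 1), ceil(p - u) equals 1 exactly
    # when u < p and 0 otherwise, so each bit is 1 with probability p. For example, p = 0.3 with u = 0.7
    # gives ceil(-0.4) = 0, while u = 0.1 gives ceil(0.2) = 1.
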
    # calculate log p(b_{1:i-1} = 0, b_i = 1)

    # log(1 - mu_i)
    logp0 = - tf.nn.softplus(logits)
    # [0, log(1 - mu_1), log( (1 - mu_1)(1 - mu_2) ), ...]
    cum_logp0 = tf.cumsum(logp0, axis=-1, exclusive=True)
    # [log(mu_1), log( (1 - mu_1)mu_2 ), log( (1 - mu_1)(1 - mu_2)mu_3 ), ... ]
    log_p001 = cum_logp0 - tf.nn.softplus(- logits)

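    # Interpretation (added note): log(mu_i) = -softplus(-logit_i), so exp(log_p001[i]) is the probability
    # that the first one occurs exactly at position i, i.e. p(b_1 = 0, ..., b_{i-1} = 0, b_i = 1). These terms
    # are used below to sample the position of the first one under the constraint that at least one bit is 1.
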
    # calculate log p(at least one of b_{1:i-1} is 1)

    # subtract the max for numerical stability; the rescaling cancels when max_log is added back below
    max_log = tf.reduce_max(log_p001, axis=-1, keep_dims=True)
    # [mu_1, (1 - mu_1)mu_2, (1 - mu_1)(1 - mu_2)mu_3, ... ] up to the factor exp(-max_log)
    p001 = tf.exp(log_p001 - max_log)
    # [0, mu_1, 1 - (1 - mu_1)(1 - mu_2), ... ] up to the same factor
    pvalid = tf.cumsum(p001, axis=1, exclusive=True)  # a cumulative log-sum-exp would be more precise, but this is fine
    log_pvalid = tf.log(pvalid) + max_log  # log(0) = -Inf for the first position

    # sample Bernoulli flags from p001 and pvalid to build the sample mask

    # probability that the first one is at position i, given that it occurs at or before position i
    first_one_prob = tf.sigmoid(log_p001 - log_pvalid)
    first_one_bits = tf.cast(tf.greater(tf.expand_dims(first_one_prob, 1), tf.random_uniform(sample_shape)), tf.int32)

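    # Why this works (added explanation): sigmoid(log_p001 - log_pvalid) = p001 / (p001 + pvalid), i.e.
    # p(first one at i) / p(first one at position <= i). Flagging each position independently with this
    # probability and keeping only the last flagged position (the cumsum trick below) reproduces the
    # distribution p(first one at i | at least one bit is 1); position 1 is always flagged since pvalid_1 = 0.
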

    # cumsum twice in reverse, so the last flagged bit keeps the value one: bits preceding it become greater
    # than 1 and bits succeeding it become zero
    cum2_bits = tf.cumsum(tf.cumsum(first_one_bits, axis=2, reverse=True), axis=2, reverse=True)
    trunc_flag = tf.cast(tf.equal(cum2_bits, 1), tf.float32)
    trunc_mask = tf.cumsum(trunc_flag, exclusive=True, axis=2)  # mask selecting the bits after trunc_flag

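    # Worked example (illustrative values): first_one_bits = [0, 1, 0, 1, 0]
    #   -> reverse cumsum once:  [2, 2, 1, 1, 0]
    #   -> reverse cumsum twice: [6, 4, 2, 1, 0]
    # so trunc_flag = [0, 0, 0, 1, 0] marks the last flagged position and
    # trunc_mask = [0, 0, 0, 0, 1] keeps only the bits after it.
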
    # if the first one is placed at bit i, set all preceding bits to 0, set bit i to 1, and leave the
    # following bits as freely sampled
    samples = samples * trunc_mask + trunc_flag


    # sanity check: every sample must contain at least one bit set to one, so the mean is strictly positive
    check_point = tf.assert_greater(tf.reduce_mean(samples), 0.0, data=[tf.reduce_mean(logits), tf.reduce_mean(samples)])
    with tf.control_dependencies([check_point]):
        samples = tf.identity(samples)

    return samples
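

# Minimal usage sketch (not part of the original file): assumes TensorFlow 1.x graph mode with sessions;
# the shapes and values below are illustrative only.
if __name__ == "__main__":
    # ntarget = 4 targets, ncontext = 8 Bernoulli bits each, 16 samples per target
    logits_in = tf.constant(np.random.randn(4, 8).astype(np.float32))
    samples_out = sample(logits_in, nsample=16)      # shape [4, 16, 8]
    logprob_out = logprob(logits_in, samples_out)    # shape [4, 16]
    with tf.Session() as sess:
        s, lp = sess.run([samples_out, logprob_out])
        print(s.shape, lp.shape)
        # every sampled vector should have at least one bit set to one
        assert s.sum(axis=-1).min() >= 1.0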