Skip to content

Commit

Permalink
Merge pull request #3581 from ngeiswei/improve-miner
Browse files Browse the repository at this point in the history
Add Jensen-Divergence Distance code
  • Loading branch information
ngeiswei authored Jul 16, 2019
2 parents 54a434f + df3b37b commit 5a55975
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 13 deletions.
81 changes: 68 additions & 13 deletions opencog/miner/Surprisingness.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <opencog/atoms/core/FindUtils.h>
#include <opencog/atoms/core/LambdaLink.h>
#include <opencog/atoms/truthvalue/SimpleTruthValue.h>
#include <opencog/ure/BetaDistribution.h>

#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/transform.hpp>
Expand Down Expand Up @@ -666,19 +667,6 @@ double Surprisingness::eq_prob(const HandleSeqSeq& partition,
return p;
}

std::string oc_to_string(const HandleSeqSeqSeq& hsss, const std::string& indent)
{
std::stringstream ss;
ss << indent << "size = " << hsss.size() << std::endl;
size_t i = 0;
for (const HandleSeqSeq& hss : hsss) {
ss << indent << "atoms sets[" << i << "]:" << std::endl
<< oc_to_string(hss, indent + oc_to_string_indent);
i++;
}
return ss.str();
}

const Handle& Surprisingness::emp_prob_key()
{
static Handle epk(createNode(NODE, "*-EmpiricalProbabilityKey-*"));
Expand All @@ -700,4 +688,71 @@ void Surprisingness::set_emp_prob(const Handle& pattern,
pattern->setValue(emp_prob_key(), ValueCast(emp_prob_tv));
}

/**
 * Return the arithmetic mean of l and r.
 */
double avrg(double l, double r)
{
    const double sum = l + r;
    return sum / 2.0;
}

double Surprisingness::jsd(const TruthValuePtr l_tv, const TruthValuePtr r_tv)
{
static int bins = 100;
std::vector<double>
l_cdf = BetaDistribution(l_tv).cdf(bins),
r_cdf = BetaDistribution(r_tv).cdf(bins),
m_cdf = avrg_cdf(l_cdf, r_cdf);
double
ld = kld(l_cdf, m_cdf),
rd = kld(r_cdf, m_cdf);
return sqrt(avrg(ld, rd));
}

/**
 * Kullback-Leibler divergence, in bits, of the distribution described
 * by l_cdf relative to the one described by r_cdf.
 *
 * Each cdf is a vector of cumulative probabilities; the probability
 * of bin i is the difference between entry i and entry i-1 (0 for
 * the first bin). Both vectors must have the same size.
 *
 * Bins where the left probability is not positive contribute nothing
 * (convention 0 * log(0) = 0). Bins where the right probability does
 * not exceed epsilon are skipped as well to avoid dividing by zero.
 *
 * @param l_cdf  cdf of the left distribution
 * @param r_cdf  cdf of the right distribution
 * @return the sum over the bins of lp * log2(lp / rp)
 */
double Surprisingness::kld(const std::vector<double>& l_cdf,
                           const std::vector<double>& r_cdf)
{
    static const double epsilon = 1e-32;
    OC_ASSERT(l_cdf.size() == r_cdf.size());

    // Value of the previous data point in the left and right cdf
    // respectively
    double last_lv = 0.0;
    double last_rv = 0.0;

    // Integrate the relative entropy between the 2 cdfs for each data
    // point
    double kldi = 0.0;
    for (size_t i = 0; i < l_cdf.size(); i++) {
        // Probabilities of the left and right points
        const double lp = l_cdf[i] - last_lv;
        const double rp = r_cdf[i] - last_rv;

        // Their relative entropy. Without the guard on lp, a bin with
        // lp == 0 would evaluate 0 * log2(0) = 0 * -inf = NaN and
        // poison the whole sum, while its contribution must be 0.
        if (0.0 < lp && epsilon < rp)
            kldi += lp * std::log2(lp / rp);

        // Remember last cumulated probabilities
        last_lv = l_cdf[i];
        last_rv = r_cdf[i];
    }
    return kldi;
}

/**
 * Build the point-wise average of 2 cdfs, that is the vector whose
 * i-th entry is avrg(l_cdf[i], r_cdf[i]). Both cdfs must have the
 * same size.
 */
std::vector<double> Surprisingness::avrg_cdf(const std::vector<double>& l_cdf,
                                             const std::vector<double>& r_cdf)
{
    OC_ASSERT(l_cdf.size() == r_cdf.size());

    const size_t count = l_cdf.size();
    std::vector<double> mid(count);
    for (size_t i = 0; i < count; i++)
        mid[i] = avrg(l_cdf[i], r_cdf[i]);
    return mid;
}

std::string oc_to_string(const HandleSeqSeqSeq& hsss, const std::string& indent)
{
std::stringstream ss;
ss << indent << "size = " << hsss.size() << std::endl;
size_t i = 0;
for (const HandleSeqSeq& hss : hsss) {
ss << indent << "atoms sets[" << i << "]:" << std::endl
<< oc_to_string(hss, indent + oc_to_string_indent);
i++;
}
return ss.str();
}

} // namespace opencog
26 changes: 26 additions & 0 deletions opencog/miner/Surprisingness.h
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,32 @@ class Surprisingness {
*/
static TruthValuePtr get_emp_prob(const Handle& pattern);
static void set_emp_prob(const Handle& pattern, double emp_prob);

/**
* Given 2 TVs, typically representing the empirical probability
* and the probability estimate of a pattern, calculate the
* Jensen-Shannon distance between them.
*
* Each TV is converted into a beta distribution whose cdf is
* discretized into 100 bins. The distance is the square root of
* the average of the Kullback-Leibler divergences (see kld) of
* each cdf to their point-wise average (see avrg_cdf).
*/
static double jsd(const TruthValuePtr l_tv, const TruthValuePtr r_tv);

/**
* Given 2 cdfs (cumulative distribution functions) return the
* Kullback-Leibler divergence of l_cdf relative to r_cdf, using
* the base-2 logarithm.
*
* The cdfs are described as vectors of regularly spaced right-end
* points. The point at the origin is ignored because it is always
* 0, but the last one, which is always 1, is present for
* completeness.
*
* Both vectors must have the same size. Data points whose
* probability according to r_cdf does not exceed 1e-32 contribute
* nothing to the result.
*/
static double kld(const std::vector<double>& l_cdf,
const std::vector<double>& r_cdf);

/**
* Given 2 cdfs, return their average, that is (cdf1 + cdf2)/2,
* computed point by point. Both cdfs must have the same size.
*/
static std::vector<double> avrg_cdf(const std::vector<double>& l_cdf,
const std::vector<double>& r_cdf);

};

/**
Expand Down

0 comments on commit 5a55975

Please sign in to comment.