Program Listing for File entropy.hpp
↰ Return to documentation for file (rcppsw/algorithm/clustering/entropy.hpp
)
#pragma once
/*******************************************************************************
* Includes
******************************************************************************/
#include <vector>
#include <limits>
#include <map>
#include "rcppsw/rcppsw.hpp"
#include "rcppsw/er/client.hpp"
#include "rcppsw/algorithm/clustering/cluster.hpp"
#include "rcsw/utils/time.h"
#include "rcsw/al/clock.h"
#include "rcppsw/algorithm/clustering/eh_clustering_impl.hpp"
#include "rcppsw/math/range.hpp"
#include "rcppsw/math/ientropy.hpp"
/*******************************************************************************
* Namespaces/Decls
******************************************************************************/
namespace rcppsw::algorithm::clustering {
/*******************************************************************************
* Class Definitions
******************************************************************************/
template <typename T>
class entropy_balch2000 : public er::client<entropy_balch2000<T>> {
public:
using cluster_vector = typename base_clustering_impl<
T,
policy::EH>::cluster_vector;
using dist_calc_ftype = typename base_clustering_impl<
T,
policy::EH>::dist_calc_ftype;
using membership_map = std::map<double,
membership_type<policy::EH>>;
using membership_vector = std::vector<membership_type<policy::EH>>;
entropy_balch2000(std::unique_ptr<eh_clustering_impl<T>> impl,
const math::ranged& horizon,
double horizon_delta)
: ER_CLIENT_INIT(),
mc_horizon(horizon),
mc_horizon_delta(horizon_delta),
m_impl(std::move(impl)) {}
double run(const std::vector<T>& data, const dist_calc_ftype& dist_func) {
ER_INFO("Initialize");
m_data = data;
/*
* You can't use reserve() here, as that just allocates data without
* initialization, which is needed to mimic initializing membership arrays
* in the constructor.
*/
m_membership.assign(m_data.size(),
typename decltype(m_membership)::value_type());
m_membership_cp.assign(m_data.size(),
typename decltype(m_membership_cp)::value_type());
m_impl->initialize(&m_data, &m_membership);
m_clusters = clusters_init();
double t_accum = 0.0;
double e_accum = 0.0;
double entropy_h_1 = 0.0;
size_t n_iter = static_cast<size_t>((mc_horizon.span() / mc_horizon_delta)) + 1;
ER_INFO("Begin n_datapoints=%zu,horizon=%s,delta=%f,n_iter=%zu",
m_clusters.size(),
mc_horizon.to_str().c_str(),
mc_horizon_delta,
n_iter);
/* iterate through all horizons */
for (size_t i = 0; i < n_iter; ++i) {
struct timespec start = clock_monotime();
double horizon = mc_horizon.lb() + i* mc_horizon_delta;
double entropy_h = balch2000_iter(dist_func, horizon);
struct timespec end = clock_monotime();
if (std::fabs(entropy_h - entropy_h_1) <= rmath::kDOUBLE_EPSILON) {
ER_WARN("Redundant entropy %f: horizon=%f", entropy_h, horizon);
} else {
e_accum += entropy_h;
}
struct timespec diff;
time_ts_diff(&start, &end, &diff);
double diff2 = time_ts2monons(&diff);
ER_DEBUG("Horizon=%f: time=%.8fms,entropy=%f",
horizon,
diff2,
entropy_h);
entropy_h_1 = entropy_h;
t_accum += diff2;
} /* for(i..) */
ER_INFO("Finish: time=%0.04fs,entropy=%f", t_accum, e_accum);
return e_accum;
} /* run() */
private:
using cluster_type = typename base_clustering_impl<
T,
policy::EH>::cluster_type;
cluster_vector clusters_init(void) {
cluster_vector clusters;
for (size_t i = 0; i < m_data.size(); ++i) {
clusters.emplace_back(cluster_type(i, m_data, m_membership));
} /* for(i..) */
return clusters;
}
double balch2000_iter(const dist_calc_ftype& dist_func,
double horizon) {
m_impl->horizon(horizon);
m_impl->iterate(m_data, dist_func, &m_clusters);
m_impl->post_iter_update(&m_clusters);
/*
* Make a copy to prevent permanently deleting clusters that may be
* duplicates for THIS horizon, but may not for a future horizon value.
*/
m_membership_cp = m_membership;
balch2000_rm_dup_clusters(&m_membership_cp);
std::vector<double> proportions(m_membership_cp.size());
for (size_t i = 0; i < proportions.size(); ++i) {
double prop = static_cast<double>(m_membership_cp[i].size()) /
m_data.size();
ER_TRACE("cluster@%zu size=%zu, prop=%f",
i,
m_membership_cp[i].size(),
prop);
proportions[i] = prop;
} /* for(i..) */
return math::ientropy()(proportions);
}
void balch2000_rm_dup_clusters(membership_type<policy::EH>* const clusters) {
ER_TRACE("Removing duplicate clusters");
/* for each datapoint/cluster in a given iteration */
for (size_t i = 0; i < clusters->size(); ++i) {
ER_TRACE("cluster%zu: n_members=%zu", i, (*clusters)[i].size());
std::unordered_set<size_t>& cluster_i = (*clusters)[i];
auto comp = [&](const auto& clust) RCPPSW_PURE {
/*
* Duplicates are defined as two of the unordered membership sets
* containing the same points AND the sets not being in the same index
* in the vector of results (i.e. not self equality).
*/
return cluster_i == clust &&
static_cast<size_t>(&clust - &((*clusters)[0])) != i;
};
auto rm = std::remove_if(clusters->begin(), clusters->end(), comp);
clusters->erase(rm, clusters->end());
} /* for(i..) */
}
/* clang-format off */
const math::ranged mc_horizon;
const double mc_horizon_delta;
std::vector<T> m_data{};
membership_type<policy::EH> m_membership{};
membership_type<policy::EH> m_membership_cp{};
cluster_vector m_clusters{};
std::unique_ptr<eh_clustering_impl<T>> m_impl;
/* clang-format on */
};
} /* namespace rcppsw::algorithm::clustering */