Program Listing for File entropy.hpp

Return to documentation for file (rcppsw/algorithm/clustering/entropy.hpp)

#pragma once

/*******************************************************************************
 * Includes
 ******************************************************************************/
#include <vector>
#include <limits>
#include <map>

#include "rcppsw/rcppsw.hpp"
#include "rcppsw/er/client.hpp"
#include "rcppsw/algorithm/clustering/cluster.hpp"
#include "rcsw/utils/time.h"
#include "rcsw/al/clock.h"
#include "rcppsw/algorithm/clustering/eh_clustering_impl.hpp"
#include "rcppsw/math/range.hpp"
#include "rcppsw/math/ientropy.hpp"

/*******************************************************************************
 * Namespaces/Decls
 ******************************************************************************/
namespace rcppsw::algorithm::clustering {

/*******************************************************************************
 * Class Definitions
 ******************************************************************************/
template <typename T>
class entropy_balch2000 : public er::client<entropy_balch2000<T>> {
 public:
  using cluster_vector = typename base_clustering_impl<
   T,
   policy::EH>::cluster_vector;
  using dist_calc_ftype = typename base_clustering_impl<
    T,
    policy::EH>::dist_calc_ftype;
  using membership_map = std::map<double,
                                  membership_type<policy::EH>>;
  using membership_vector = std::vector<membership_type<policy::EH>>;

  entropy_balch2000(std::unique_ptr<eh_clustering_impl<T>> impl,
                    const math::ranged& horizon,
                    double horizon_delta)
      : ER_CLIENT_INIT(),
        mc_horizon(horizon),
        mc_horizon_delta(horizon_delta),
        m_impl(std::move(impl)) {}

  double run(const std::vector<T>& data, const dist_calc_ftype& dist_func) {
    ER_INFO("Initialize");
    m_data = data;

    /*
     * You can't use reserve() here, as that just allocates data without
     * initialization, which is needed to mimic initializing membership arrays
     * in the constructor.
     */
    m_membership.assign(m_data.size(),
                        typename decltype(m_membership)::value_type());
    m_membership_cp.assign(m_data.size(),
                           typename decltype(m_membership_cp)::value_type());

    m_impl->initialize(&m_data, &m_membership);
    m_clusters = clusters_init();

    double t_accum = 0.0;
    double e_accum = 0.0;
    double entropy_h_1 = 0.0;
    size_t n_iter = static_cast<size_t>((mc_horizon.span() / mc_horizon_delta)) + 1;

    ER_INFO("Begin n_datapoints=%zu,horizon=%s,delta=%f,n_iter=%zu",
            m_clusters.size(),
            mc_horizon.to_str().c_str(),
            mc_horizon_delta,
            n_iter);

    /* iterate through all horizons */
    for (size_t i = 0; i < n_iter; ++i) {
      struct timespec start = clock_monotime();
      double horizon = mc_horizon.lb() + i* mc_horizon_delta;
      double entropy_h = balch2000_iter(dist_func, horizon);
      struct timespec end = clock_monotime();

      if (std::fabs(entropy_h - entropy_h_1) <= rmath::kDOUBLE_EPSILON) {
        ER_WARN("Redundant entropy %f: horizon=%f", entropy_h, horizon);
      } else {
        e_accum += entropy_h;
      }
      struct timespec diff;
      time_ts_diff(&start, &end, &diff);
      double diff2 = time_ts2monons(&diff);
      ER_DEBUG("Horizon=%f: time=%.8fms,entropy=%f",
               horizon,
               diff2,
               entropy_h);
      entropy_h_1 = entropy_h;
      t_accum += diff2;
    } /* for(i..) */
    ER_INFO("Finish: time=%0.04fs,entropy=%f", t_accum, e_accum);
    return e_accum;
  } /* run() */

 private:
  using cluster_type = typename base_clustering_impl<
   T,
   policy::EH>::cluster_type;
  cluster_vector clusters_init(void) {
    cluster_vector clusters;
    for (size_t i = 0; i < m_data.size(); ++i) {
      clusters.emplace_back(cluster_type(i, m_data, m_membership));
    } /* for(i..) */
    return clusters;
  }

  double balch2000_iter(const dist_calc_ftype& dist_func,
                        double horizon) {
    m_impl->horizon(horizon);
    m_impl->iterate(m_data, dist_func, &m_clusters);
    m_impl->post_iter_update(&m_clusters);

    /*
     * Make a copy to prevent permanently deleting clusters that may be
     * duplicates for THIS horizon, but may not for a future horizon value.
     */
    m_membership_cp = m_membership;
    balch2000_rm_dup_clusters(&m_membership_cp);
    std::vector<double> proportions(m_membership_cp.size());
    for (size_t i = 0; i < proportions.size(); ++i) {
      double prop = static_cast<double>(m_membership_cp[i].size()) /
                    m_data.size();
      ER_TRACE("cluster@%zu size=%zu, prop=%f",
               i,
               m_membership_cp[i].size(),
               prop);
      proportions[i] = prop;
    } /* for(i..) */
    return math::ientropy()(proportions);
  }

  void balch2000_rm_dup_clusters(membership_type<policy::EH>* const clusters) {
    ER_TRACE("Removing duplicate clusters");
    /* for each datapoint/cluster in a given iteration */
    for (size_t i = 0; i < clusters->size(); ++i) {
      ER_TRACE("cluster%zu: n_members=%zu", i, (*clusters)[i].size());
      std::unordered_set<size_t>& cluster_i = (*clusters)[i];
      auto comp = [&](const auto& clust) RCPPSW_PURE {
        /*
         * Duplicates are defined as two of the unordered membership sets
         * containing the same points AND the sets not being in the same index
         * in the vector of results (i.e. not self equality).
         */
        return cluster_i == clust &&
        static_cast<size_t>(&clust - &((*clusters)[0])) != i;
      };
      auto rm = std::remove_if(clusters->begin(), clusters->end(), comp);
      clusters->erase(rm, clusters->end());
    } /* for(i..) */
  }

  /* clang-format off */
  const math::ranged                     mc_horizon;
  const double                           mc_horizon_delta;

  std::vector<T>                         m_data{};
  membership_type<policy::EH>            m_membership{};

  membership_type<policy::EH>            m_membership_cp{};
  cluster_vector                         m_clusters{};
  std::unique_ptr<eh_clustering_impl<T>> m_impl;
  /* clang-format on */
};

} /* namespace rcppsw::algorithm::clustering */