Source code for ctparse.nb_estimator

from typing import Sequence, Dict, Tuple, List
from math import log, exp


def _log_sum_exp(x: Sequence[float]) -> float:
    """Compute log(sum(exp(x_i))) without overflowing on large x_i."""
    max_value = max(x)
    sum_of_exp = sum(exp(x_i - max_value) for x_i in x)
    return max_value + log(sum_of_exp)
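
# Note (illustrative, not part of the module): this is the standard
# log-sum-exp trick. Subtracting the max before exponentiating keeps the
# intermediate exp() values in range:
#
#   _log_sum_exp([1000.0, 1000.0])          # 1000 + log(2), no overflow
#   log(sum(exp(x) for x in [1000.0] * 2))  # OverflowError: exp(1000)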


class MultinomialNaiveBayes:
    """Implements a multinomial naive Bayes classifier. For background
    information (and what has inspired this), see e.g.
    https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
    """

    def __init__(self, alpha: float = 1.0):
        """Create a new un-trained model

        Parameters
        ----------
        alpha : float
            Additive (Laplace/Lidstone) smoothing parameter
            (0 for no smoothing), defaults to 1.0
        """
        self.alpha = alpha
        self.class_prior = (0.0, 0.0)
        self.log_likelihood: Dict[str, List[float]] = {}

    @staticmethod
    def _construct_log_class_prior(y: Sequence[int]) -> Tuple[float, float]:
        # Input classes are -1 and 1
        neg_class_count = sum(1 if y_i == -1 else 0 for y_i in y)
        pos_class_count = len(y) - neg_class_count
        neg_log_prior = log(neg_class_count / (pos_class_count + neg_class_count))
        pos_log_prior = log(pos_class_count / (pos_class_count + neg_class_count))
        return (neg_log_prior, pos_log_prior)

    @staticmethod
    def _construct_log_likelihood(
        X: Sequence[Dict[int, int]], y: Sequence[int], alpha: float
    ) -> Dict[str, List[float]]:
        # Token counts. Implicit assumption from the vectorizer: the first
        # sample carries a count at the highest feature index, so the
        # vocabulary size can be recovered from X[0] alone.
        vocabulary_len = max(X[0].keys()) + 1
        # Start every token count at alpha (additive smoothing)
        token_counts_negative = [alpha] * vocabulary_len
        token_counts_positive = [alpha] * vocabulary_len
        for x, y_ in zip(X, y):
            for idx, cnt in x.items():
                if y_ == 1:
                    token_counts_positive[idx] += cnt
                else:
                    token_counts_negative[idx] += cnt
        token_pos_class_sum = sum(token_counts_positive)
        token_neg_class_sum = sum(token_counts_negative)
        log_likelihood_negative = []
        log_likelihood_positive = []
        for token_ind in range(vocabulary_len):
            log_likelihood_positive.append(
                log(token_counts_positive[token_ind]) - log(token_pos_class_sum)
            )
            log_likelihood_negative.append(
                log(token_counts_negative[token_ind]) - log(token_neg_class_sum)
            )
        return {
            "negative_class": log_likelihood_negative,
            "positive_class": log_likelihood_positive,
        }
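
    # Worked example (illustrative, not part of the module): with alpha=1.0
    # (Laplace smoothing) a token never observed in a class still gets a
    # non-zero probability. For a toy vocabulary of 3 tokens and a single
    # positive sample {0: 2, 2: 1}:
    #
    #   token_counts_positive = [1 + 2, 1 + 0, 1 + 1] = [3, 1, 2], sum = 6
    #   P(token 1 | positive) = 1 / 6, stored as log(1) - log(6)
    #
    # With alpha=0 the unseen token 1 would lead to log(0), which is undefined.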

    def fit(
        self, X: Sequence[Dict[int, int]], y: Sequence[int]
    ) -> "MultinomialNaiveBayes":
        """Fit a naive Bayes model from a matrix of feature counts

        Parameters
        ----------
        X : Sequence[Dict[int, int]]
            Sequence of sparse {feature_index: count} dictionaries
        y : Sequence[int]
            Labels +1/-1

        Returns
        -------
        MultinomialNaiveBayes
            The fitted model
        """
        self.class_prior = self._construct_log_class_prior(y)
        self.log_likelihood = self._construct_log_likelihood(X, y, self.alpha)
        return self
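
    # Usage sketch (illustrative, not part of the module): note the vectorizer
    # convention documented in _construct_log_likelihood, i.e. X[0] must carry
    # a count at the highest feature index so the vocabulary size can be
    # inferred.
    #
    #   model = MultinomialNaiveBayes(alpha=1.0)
    #   model.fit(X=[{0: 1, 2: 3}, {1: 2}], y=[1, -1])
    #   model.class_prior  # (log(1/2), log(1/2)) -- one sample per class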

    def predict_log_probability(
        self, X: Sequence[Dict[int, int]]
    ) -> Sequence[Tuple[float, float]]:
        """Calculate the posterior log probability of new samples X

        Parameters
        ----------
        X : Sequence[Dict[int, int]]
            Sequence of data to predict on as sparse
            {feature_index: count} dictionaries

        Returns
        -------
        Sequence[Tuple[float, float]]
            Tuples of (negative-class, positive-class) log probabilities
        """
        scores = []
        for x in X:
            # Initialise the scores with the priors of the negative and
            # positive class
            neg_score = self.class_prior[0]
            pos_score = self.class_prior[1]
            for idx, cnt in x.items():
                pos_score += self.log_likelihood["positive_class"][idx] * cnt
                neg_score += self.log_likelihood["negative_class"][idx] * cnt
            joint_log_likelihood = [neg_score, pos_score]
            # Normalize the scores so the pair forms a proper log
            # probability distribution
            log_prob_x = _log_sum_exp(joint_log_likelihood)
            scores.append((neg_score - log_prob_x, pos_score - log_prob_x))
        return scores
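

# End-to-end sketch (illustrative, not part of ctparse): fit on toy sparse
# counts and turn the returned log probabilities back into probabilities.
# Labels follow the +1/-1 convention expected by fit.
if __name__ == "__main__":
    model = MultinomialNaiveBayes(alpha=1.0)
    # X[0] contains the highest feature index (vectorizer convention above)
    X_train = [{0: 2, 3: 1}, {1: 1, 2: 2}, {0: 1, 1: 1}]
    y_train = [1, -1, 1]
    model.fit(X_train, y_train)
    (neg_lp, pos_lp), = model.predict_log_probability([{0: 1, 2: 1}])
    # The two entries are normalized, so their exponentials sum to 1.0
    print(exp(neg_lp) + exp(pos_lp))  # -> 1.0 (up to float rounding)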