from typing import Sequence, Dict, Tuple, List
from math import log, exp


def _log_sum_exp(x: Sequence[float]) -> float:
    # Numerically stable log(sum(exp(x_i))): subtracting the maximum before
    # exponentiating prevents overflow.
    max_value = max(x)
    sum_of_exp = sum(exp(x_i - max_value) for x_i in x)
    return max_value + log(sum_of_exp)
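
# Quick illustration (not part of the original module): the max-subtraction
# trick above is what keeps exp() from overflowing. For example,
# _log_sum_exp([1000.0, 1000.0]) returns 1000.0 + log(2) ~= 1000.693, while
# the naive log(sum(exp(x_i) for x_i in x)) raises OverflowError because
# exp(1000.0) does not fit in a float.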


class MultinomialNaiveBayes:
    """Implements a multinomial naive Bayes classifier.

    For background information (and what inspired this implementation), see e.g.
    https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
    """

    def __init__(self, alpha: float = 1.0):
        """Create a new, untrained model.

        Parameters
        ----------
        alpha : float, optional
            Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing),
            defaults to 1.0
        """
        self.alpha = alpha
        self.class_prior: Tuple[float, float] = (0.0, 0.0)
        self.log_likelihood: Dict[str, List[float]] = {}

    @staticmethod
    def _construct_log_class_prior(y: Sequence[int]) -> Tuple[float, float]:
        # Input classes are -1 and 1
        neg_class_count = sum(1 for y_i in y if y_i == -1)
        pos_class_count = len(y) - neg_class_count
        neg_log_prior = log(neg_class_count / (pos_class_count + neg_class_count))
        pos_log_prior = log(pos_class_count / (pos_class_count + neg_class_count))
        return (neg_log_prior, pos_log_prior)
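
    # Worked example (illustrative): for y = [1, 1, 1, -1] the class priors are
    # P(neg) = 1/4 and P(pos) = 3/4, so _construct_log_class_prior returns
    # approximately (log(0.25), log(0.75)) == (-1.3863, -0.2877).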

    @staticmethod
    def _construct_log_likelihood(
        X: Sequence[Dict[int, int]], y: Sequence[int], alpha: float
    ) -> Dict[str, List[float]]:
        # Token counts, initialised with the smoothing constant alpha.
        # Implicit assumption from the vectorizer: the first sample carries an
        # entry for the highest feature index, so it fixes the vocabulary size.
        vocabulary_len = max(X[0].keys()) + 1
        token_counts_negative = [alpha] * vocabulary_len
        token_counts_positive = [alpha] * vocabulary_len
        for x, y_ in zip(X, y):
            for idx, cnt in x.items():
                if y_ == 1:
                    token_counts_positive[idx] += cnt
                else:
                    token_counts_negative[idx] += cnt
        token_pos_class_sum = sum(token_counts_positive)
        token_neg_class_sum = sum(token_counts_negative)
        log_likelihood_negative = []
        log_likelihood_positive = []
        for token_ind in range(vocabulary_len):
            log_likelihood_positive.append(
                log(token_counts_positive[token_ind]) - log(token_pos_class_sum)
            )
            log_likelihood_negative.append(
                log(token_counts_negative[token_ind]) - log(token_neg_class_sum)
            )
        return {
            "negative_class": log_likelihood_negative,
            "positive_class": log_likelihood_positive,
        }
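
    # Worked example (illustrative): with X = [{0: 2, 1: 0}], y = [1] and
    # alpha = 1.0, the positive counts become [3, 1] (sum 4) while the negative
    # counts stay at the smoothing floor [1, 1] (sum 2), giving
    # positive_class == [log(3/4), log(1/4)] and
    # negative_class == [log(1/2), log(1/2)]. Smoothing keeps every log finite
    # even for tokens never seen in a class.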

    def fit(
        self, X: Sequence[Dict[int, int]], y: Sequence[int]
    ) -> "MultinomialNaiveBayes":
        """Fit a naive Bayes model from a matrix of feature counts.

        Parameters
        ----------
        X : Sequence[Dict[int, int]]
            Sequence of sparse {feature_index: count} dictionaries
        y : Sequence[int]
            Labels +1/-1

        Returns
        -------
        MultinomialNaiveBayes
            The fitted model
        """
        self.class_prior = self._construct_log_class_prior(y)
        self.log_likelihood = self._construct_log_likelihood(X, y, self.alpha)
        return self
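
    # Note (illustrative): because fit() returns self, construction and
    # training compose into one expression, e.g.
    #     model = MultinomialNaiveBayes(alpha=0.5).fit(train_X, train_y)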

    def predict_log_probability(
        self, X: Sequence[Dict[int, int]]
    ) -> Sequence[Tuple[float, float]]:
        """Calculate the posterior log probabilities of new samples X.

        Parameters
        ----------
        X : Sequence[Dict[int, int]]
            Sequence of data to predict on as sparse {feature_index: count} dictionaries

        Returns
        -------
        Sequence[Tuple[float, float]]
            One (negative-class, positive-class) tuple of posterior log
            probabilities per sample
        """
        scores = []
        for x in X:
            # Initialise the scores with the log priors of the negative and
            # positive class
            neg_score = self.class_prior[0]
            pos_score = self.class_prior[1]
            for idx, cnt in x.items():
                pos_score += self.log_likelihood["positive_class"][idx] * cnt
                neg_score += self.log_likelihood["negative_class"][idx] * cnt
            joint_log_likelihood = [neg_score, pos_score]
            # Normalise by the log evidence so the probabilities sum to one
            log_prob_x = _log_sum_exp(joint_log_likelihood)
            scores.append((neg_score - log_prob_x, pos_score - log_prob_x))
        return scores
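

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical toy data (not from the original
    # module). Note the vectorizer convention assumed above: the first sample
    # must carry an entry for the highest feature index, since it fixes the
    # vocabulary size.
    train_X = [
        {0: 2, 1: 1, 2: 0, 3: 0},  # first sample sets vocabulary size to 4
        {0: 1, 3: 2},
        {1: 3, 2: 1},
    ]
    train_y = [1, 1, -1]
    model = MultinomialNaiveBayes(alpha=1.0).fit(train_X, train_y)
    for neg_lp, pos_lp in model.predict_log_probability([{0: 1, 1: 1}]):
        print(f"P(neg) = {exp(neg_lp):.3f}, P(pos) = {exp(pos_lp):.3f}")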