"""This module cointains the implementation of the scorer based on naive bayes."""
import bz2
import math
import pickle
from datetime import datetime
from typing import Sequence
from ctparse.nb_estimator import MultinomialNaiveBayes
from ctparse.count_vectorizer import CountVectorizer
from ctparse.pipeline import CTParsePipeline
from ctparse.scorer import Scorer
from ctparse.partial_parse import PartialParse
from ctparse.types import Artifact
class NaiveBayesScorer(Scorer):
    """Scorer based on a naive bayes estimator.

    This scorer models the probability of having a correct parse, conditioned
    on the sequence of rules (expressed as a categorical feature) that led to
    that parse.

    The score is also modified by a "length" factor that penalizes parses that
    cover a smaller part of the text string.
    """

    def __init__(self, nb_model: CTParsePipeline) -> None:
        """Create a scorer wrapping a trained pipeline.

        :param nb_model:
            A scikit-learn style Estimator that was trained on a corpus that takes
            a Sequence[Sequence[str]] as X (each entry is a sequence of rule
            identifiers) and a Sequence[int] in the set {-1, 1} that indicates if
            the parse was correct or incorrect.
        """
        self._model = nb_model

    @classmethod
    def from_model_file(cls, fname: str) -> "NaiveBayesScorer":
        """Load a :class:`NaiveBayesScorer` from a bz2-compressed pickle file.

        :param fname: path to the pickled, bz2-compressed model.
        """
        # NOTE(review): unpickling can execute arbitrary code — only load
        # model files from trusted sources.
        with bz2.open(fname, "rb") as fd:
            return cls(pickle.load(fd))

    def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float:
        """Score a partial parse of *txt* (higher is better)."""
        # Penalty for partial matches: fraction of txt covered between the
        # start of the first and the end of the last production.
        # NOTE(review): math.log raises ValueError if the covered span is
        # empty and ZeroDivisionError for empty txt — presumably callers
        # guarantee a non-empty match; confirm upstream.
        max_covered_chars = partial_parse.prod[-1].mend - partial_parse.prod[0].mstart
        len_score = math.log(max_covered_chars / len(txt))
        X = _feature_extractor(txt, ts, partial_parse)
        pred = self._model.predict_log_proba([X])
        # NOTE: the prediction is log-odds, or logit
        model_score = pred[0][1] - pred[0][0]
        return model_score + len_score

    def score_final(
        self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact
    ) -> float:
        """Score the final production *prod* of a completed parse.

        The difference between the original score and final score is that in the
        final score, the len_score is calculated based on the length of the final
        production.
        """
        len_score = math.log(len(prod) / len(txt))
        X = _feature_extractor(txt, ts, partial_parse)
        pred = self._model.predict_log_proba([X])
        # NOTE: the prediction is log-odds, or logit
        model_score = pred[0][1] - pred[0][0]
        # We want the len_score to always take precedence. I believe a logit won't go up
        # more than 1000. A better way would be to return an ordering tuple instead,
        # but then we would need to change many interfaces.
        return model_score + 1000 * len_score
def _feature_extractor(
txt: str, ts: datetime, partial_parse: PartialParse
) -> Sequence[str]:
return [str(r) for r in partial_parse.rules]
def train_naive_bayes(X: Sequence[Sequence[str]], y: Sequence[bool]) -> CTParsePipeline:
    """Train a naive bayes model for :class:`NaiveBayesScorer`.

    :param X: one sequence of rule identifiers per training example.
    :param y: per-example labels; ``True`` marks a correct parse.
    :return: the fitted pipeline (count vectorizer + multinomial naive bayes).
    """
    # The estimator expects labels in {-1, 1}, not booleans.
    y_binary = [1 if y_i else -1 for y_i in y]
    # Create and train the pipeline
    pipeline = CTParsePipeline(
        CountVectorizer(ngram_range=(1, 3)), MultinomialNaiveBayes(alpha=1.0)
    )
    model = pipeline.fit(X, y_binary)
    return model
def save_naive_bayes(model: CTParsePipeline, fname: str) -> None:
    """Save a naive bayes model for :class:`NaiveBayesScorer`.

    The model is pickled and bz2-compressed so that
    :meth:`NaiveBayesScorer.from_model_file` can load it back.

    :param model: the trained pipeline to persist.
    :param fname: destination file path.
    """
    # TODO: version this model and dump metadata with lots of information
    with bz2.open(fname, "wb") as fd:
        pickle.dump(model, fd)