Source code for ctparse.nb_scorer

"""This module cointains the implementation of the scorer based on naive bayes."""
import bz2
import math
import pickle
from datetime import datetime
from typing import Sequence

from ctparse.nb_estimator import MultinomialNaiveBayes
from ctparse.count_vectorizer import CountVectorizer
from ctparse.pipeline import CTParsePipeline
from ctparse.scorer import Scorer
from ctparse.partial_parse import PartialParse
from ctparse.types import Artifact


class NaiveBayesScorer(Scorer):
    def __init__(self, nb_model: CTParsePipeline) -> None:
        """Scorer based on a naive Bayes estimator.

        This scorer models the probability that a parse is correct, conditioned
        on the sequence of rules (expressed as a categorical feature) that led
        to that parse. The score is also modified by a "length" factor that
        penalizes parses covering a smaller part of the text string.

        :param nb_model:
            A scikit-learn style estimator trained on a corpus, taking a
            Sequence[Sequence[str]] as X (each entry is a sequence of rule
            identifiers) and a Sequence[int] with values in {-1, 1} indicating
            whether the parse was incorrect or correct.
        """
        self._model = nb_model

    @classmethod
    def from_model_file(cls, fname: str) -> "NaiveBayesScorer":
        with bz2.open(fname, "rb") as fd:
            return cls(pickle.load(fd))

    def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float:
        # Penalty for partial matches: the fraction of the text covered by the
        # first through last production, in log space
        max_covered_chars = (
            partial_parse.prod[-1].mend - partial_parse.prod[0].mstart
        )
        len_score = math.log(max_covered_chars / len(txt))
        X = _feature_extractor(txt, ts, partial_parse)
        pred = self._model.predict_log_proba([X])
        # NOTE: the difference of the two log-probabilities is the
        # log-odds (logit) of a correct parse
        model_score = pred[0][1] - pred[0][0]
        return model_score + len_score

    def score_final(
        self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact
    ) -> float:
        # The difference between score and score_final is that here the
        # len_score is calculated based on the length of the final production
        len_score = math.log(len(prod) / len(txt))
        X = _feature_extractor(txt, ts, partial_parse)
        pred = self._model.predict_log_proba([X])
        # NOTE: the difference of the two log-probabilities is the
        # log-odds (logit) of a correct parse
        model_score = pred[0][1] - pred[0][0]
        # We want the len_score to always take precedence. I believe a logit
        # won't go up more than 1000. A better way would be to return an
        # ordering tuple instead, but then we would need to change many
        # interfaces.
        return model_score + 1000 * len_score
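
To make the combined score concrete, here is a worked example of the arithmetic
in score, with made-up probabilities standing in for the output of
predict_log_proba:

# Worked example of the scoring arithmetic with made-up numbers; in practice
# the two log-probabilities come from self._model.predict_log_proba.
import math

log_p_correct = math.log(0.9)    # model's log P(parse is correct)
log_p_incorrect = math.log(0.1)  # model's log P(parse is incorrect)
model_score = log_p_correct - log_p_incorrect  # log-odds, ~2.197

# A parse covering 10 of 20 characters gets a negative length penalty
len_score = math.log(10 / 20)  # ~-0.693

print(model_score + len_score)  # ~1.504: higher is better
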
def _feature_extractor(
    txt: str, ts: datetime, partial_parse: PartialParse
) -> Sequence[str]:
    # Only the sequence of rule names is used as features; txt and ts are
    # accepted to match the scorer's call signature but are ignored here.
    return [str(r) for r in partial_parse.rules]
def train_naive_bayes(
    X: Sequence[Sequence[str]], y: Sequence[bool]
) -> CTParsePipeline:
    """Train a naive Bayes model for NaiveBayesScorer"""
    y_binary = [1 if y_i else -1 for y_i in y]
    # Create and train the pipeline
    pipeline = CTParsePipeline(
        CountVectorizer(ngram_range=(1, 3)), MultinomialNaiveBayes(alpha=1.0)
    )
    model = pipeline.fit(X, y_binary)
    return model
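
A toy sketch of how train_naive_bayes might be called; the rule identifiers and
labels below are invented for illustration, while real training data comes from
a ctparse corpus:

# Toy training sketch: rule identifiers and labels are made up; each X entry
# is the sequence of rule names that produced one candidate parse.
X_toy = [
    ["rule_a", "rule_b", "rule_c"],  # rule sequence of a correct parse
    ["rule_b"],                      # rule sequence of an incorrect parse
]
y_toy = [True, False]

toy_model = train_naive_bayes(X_toy, y_toy)
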
def save_naive_bayes(model: CTParsePipeline, fname: str) -> None:
    """Save a naive Bayes model for NaiveBayesScorer"""
    # TODO: version this model and dump metadata with lots of information
    with bz2.open(fname, "wb") as fd:
        pickle.dump(model, fd)
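
Continuing the toy sketch above, the trained model can be persisted and
re-loaded into a scorer; the file name here is hypothetical:

# Round trip: save the toy model, then load it back into a NaiveBayesScorer
save_naive_bayes(toy_model, "toy_nb_model.pbz")
scorer = NaiveBayesScorer.from_model_file("toy_nb_model.pbz")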