Source code for ctparse.pipeline

from typing import Sequence, Tuple

from ctparse.nb_estimator import MultinomialNaiveBayes
from ctparse.count_vectorizer import CountVectorizer


class CTParsePipeline:
    """Two-step pipeline: a feature transformer followed by a naive bayes estimator."""

    def __init__(
        self, transformer: CountVectorizer, estimator: MultinomialNaiveBayes
    ):
        """Store the two pipeline stages.

        The wrapper is intentionally thin; it exists so that other
        feature extractors or models can be swapped in later.

        Parameters
        ----------
        transformer : CountVectorizer
            feature extraction step
        estimator : MultinomialNaiveBayes
            naive bayes model
        """
        self.transformer, self.estimator = transformer, estimator

    def fit(
        self, X: Sequence[Sequence[str]], y: Sequence[int]
    ) -> "CTParsePipeline":
        """Fit the transformer on ``X``, then fit the estimator on the result.

        Parameters
        ----------
        X : Sequence[Sequence[str]]
            Sequence of documents, each as sequence of tokens.
        y : Sequence[int]
            Targets passed through unchanged to the estimator's ``fit``.

        Returns
        -------
        CTParsePipeline
            This pipeline instance, after fitting
        """
        features = self.transformer.fit_transform(X)
        # The estimator's ``fit`` return value replaces the stored estimator.
        fitted_estimator = self.estimator.fit(features, y)
        self.estimator = fitted_estimator
        return self

    def predict_log_proba(
        self, X: Sequence[Sequence[str]]
    ) -> Sequence[Tuple[float, float]]:
        """Transform ``X`` and return the estimator's log probability predictions.

        Parameters
        ----------
        X : Sequence[Sequence[str]]
            Sequence of documents, each as sequence of tokens. In the
            ctparse case these are just the names of the regex matches
            and rules applied

        Returns
        -------
        Sequence[Tuple[float, float]]
            For each document the tuple of negative/positive log
            probability from the naive bayes model
        """
        return self.estimator.predict_log_probability(self.transformer.transform(X))