Source code for ctparse.count_vectorizer

from collections import defaultdict
from typing import Dict, Sequence, Tuple, Optional


class CountVectorizer:
    def __init__(self, ngram_range: Tuple[int, int]):
        """Create a new count vectorizer that also counts n-grams.

        A count vectorizer builds an internal vocabulary and embeds each
        input document by counting how often each vocabulary term appears
        in it. Here n-grams are also considered part of the vocabulary and
        of the document terms, respectively.

        Parameters
        ----------
        ngram_range : Tuple[int, int]
            n-gram range to consider
        """
        self.ngram_range = ngram_range
        self.vocabulary: Optional[Dict[str, int]] = None

    @staticmethod
    def _create_ngrams(
        ngram_range: Tuple[int, int], documents: Sequence[Sequence[str]]
    ) -> Sequence[Sequence[str]]:
        """For each document in documents, replace the original tokens by a
        list of all n-grams in that document within the range
        ``min_n:max_n = ngram_range``.

        Parameters
        ----------
        ngram_range : Tuple[int, int]
            Min and max length of the n-grams to generate
        documents : Sequence[Sequence[str]]
            A sequence of already tokenized documents

        Returns
        -------
        Sequence[Sequence[str]]
            For each document, all n-grams of tokens in the desired range
        """
        min_n, max_n = ngram_range
        space_join = " ".join

        def _create(document: Sequence[str]) -> Sequence[str]:
            doc_len = len(document)
            doc_max_n = min(max_n, doc_len) + 1
            if min_n == 1:
                # Unigrams are the tokens themselves; start joining at n=2
                ngrams = list(document)
                min_nn = min_n + 1
            else:
                ngrams = []
                min_nn = min_n
            for n in range(min_nn, doc_max_n):
                for i in range(0, doc_len - n + 1):
                    ngrams.append(space_join(document[i : i + n]))
            return ngrams

        return [_create(d) for d in documents]

    @staticmethod
    def _get_feature_counts(
        ngram_range: Tuple[int, int], documents: Sequence[Sequence[str]]
    ) -> Sequence[Dict[str, int]]:
        """Count the (n-gram) features appearing in each document.

        Parameters
        ----------
        ngram_range : Tuple[int, int]
            Min and max length of the n-grams to generate
        documents : Sequence[Sequence[str]]
            Sequence of documents, each tokenized as a sequence of strings

        Returns
        -------
        Sequence[Dict[str, int]]
            For each document, a dictionary counting how often each feature
            appeared in that document. Features are n-grams according to
            this vectorizer's n-gram settings.
        """
        documents = CountVectorizer._create_ngrams(ngram_range, documents)
        count_matrix = []
        for document in documents:
            # This is about 5x faster than using the built-in Counter
            feature_counts: Dict[str, int] = defaultdict(int)
            for feature in document:
                feature_counts[feature] += 1
            count_matrix.append(feature_counts)
        return count_matrix

    @staticmethod
    def _build_vocabulary(count_matrix: Sequence[Dict[str, int]]) -> Dict[str, int]:
        """Build the vocabulary from feature counts.

        Parameters
        ----------
        count_matrix : Sequence[Dict[str, int]]
            Sequence of dicts with counts (values) per feature (keys)

        Returns
        -------
        Dict[str, int]
            The vocabulary as {feature: index} pairs
        """
        all_features = set()
        for feature_counts in count_matrix:
            for feature in feature_counts.keys():
                all_features.add(feature)
        return {word: idx for idx, word in enumerate(sorted(all_features))}

    @staticmethod
    def _create_feature_matrix(
        vocabulary: Dict[str, int], count_matrix: Sequence[Dict[str, int]]
    ) -> Sequence[Dict[int, int]]:
        """Map counts of string features to numerical data (sparse maps of
        ``{feature_index: count}``). Here ``feature_index`` is relative to
        the vocabulary of this vectorizer.

        Parameters
        ----------
        vocabulary : Dict[str, int]
            Vocabulary with {feature: index} mappings
        count_matrix : Sequence[Dict[str, int]]
            Sequence of dictionaries with feature counts

        Returns
        -------
        Sequence[Dict[int, int]]
            For each document, a mapping of ``feature_index`` to a count of
            how often this feature appeared in the document.
        """
        len_vocab = len(vocabulary)
        count_vectors_matrix = []
        # Build document frequency matrix
        for count_dict in count_matrix:
            doc_vector: Dict[int, int] = defaultdict(int)
            for word, cnt in count_dict.items():
                idx = vocabulary.get(word, None)
                if idx is not None:
                    doc_vector[idx] = cnt
            count_vectors_matrix.append(doc_vector)
        # Ensure the highest feature index is present in the first document
        # vector: reading the defaultdict entry creates the key with a zero
        # count, so the vocabulary size is recoverable from the matrix
        count_vectors_matrix[0][len_vocab - 1] = count_vectors_matrix[0][len_vocab - 1]
        return count_vectors_matrix
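    # Illustrative walk-through of the helper pipeline (not part of the
    # original module), assuming ngram_range=(1, 2) and
    # documents=[["a", "b", "c"]]:
    #   _create_ngrams         -> [["a", "b", "c", "a b", "b c"]]
    #   _get_feature_counts    -> [{"a": 1, "b": 1, "c": 1, "a b": 1, "b c": 1}]
    #   _build_vocabulary      -> {"a": 0, "a b": 1, "b": 2, "b c": 3, "c": 4}
    #   _create_feature_matrix -> [{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}]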

    def fit(self, documents: Sequence[Sequence[str]]) -> "CountVectorizer":
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        documents : Sequence[Sequence[str]]
            Sequence of documents, each as a sequence of tokens

        Returns
        -------
        CountVectorizer
            The fitted vectorizer, i.e. this updates the internal vocabulary
        """
        self.fit_transform(documents)
        return self

    def fit_transform(
        self, documents: Sequence[Sequence[str]]
    ) -> Sequence[Dict[int, int]]:
        """Learn the vocabulary dictionary and return a term-document matrix.

        Updates the internal vocabulary state of the vectorizer.

        Parameters
        ----------
        documents : Sequence[Sequence[str]]
            Sequence of documents, each as a sequence of tokens

        Returns
        -------
        Sequence[Dict[int, int]]
            Document-term matrix.
        """
        count_matrix = CountVectorizer._get_feature_counts(self.ngram_range, documents)
        self.vocabulary = CountVectorizer._build_vocabulary(count_matrix)
        return CountVectorizer._create_feature_matrix(self.vocabulary, count_matrix)

    def transform(self, documents: Sequence[Sequence[str]]) -> Sequence[Dict[int, int]]:
        """Create a term-document matrix based on the pre-generated vocabulary.

        Does *not* update the internal state of the vocabulary.

        Parameters
        ----------
        documents : Sequence[Sequence[str]]
            Sequence of documents, each as a sequence of tokens

        Returns
        -------
        Sequence[Dict[int, int]]
            Document-term matrix.
        """
        if not self.vocabulary:
            raise ValueError("no vocabulary - vectorizer not fitted?")
        count_matrix = CountVectorizer._get_feature_counts(self.ngram_range, documents)
        return CountVectorizer._create_feature_matrix(self.vocabulary, count_matrix)
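

# Usage sketch (an illustrative example, not part of the original module):
if __name__ == "__main__":
    documents = [
        ["to", "be", "or", "not", "to", "be"],
        ["to", "do"],
    ]
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    # fit_transform learns the vocabulary of unigrams and bigrams and
    # returns one sparse {feature_index: count} map per document
    matrix = vectorizer.fit_transform(documents)
    # transform reuses the fitted vocabulary; tokens or n-grams not seen
    # during fitting are silently ignored
    new_matrix = vectorizer.transform([["be", "to", "unseen"]])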