"""Corpus utilities for ctparse: build rule datasets and run the test corpus."""

import json
import logging
from datetime import datetime
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    NamedTuple,
    Sequence,
    Tuple,
    TypeVar,
    Union,
)

from tqdm import tqdm

from ctparse.ctparse import ctparse_gen, CTParse
from ctparse.scorer import DummyScorer, Scorer
from ctparse.types import Artifact, Duration, Interval, Time

logger = logging.getLogger(__name__)

# A triplet of text, reference timestamp and correct parse.
# It can be used as raw data to build datasets for ctparse.
TimeParseEntry = NamedTuple(
    "TimeParseEntry",
    [("text", str), ("ts", datetime), ("gold", Artifact)],
)
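
# A minimal sketch of constructing a TimeParseEntry by hand. Values are
# illustrative, and the keyword arguments to ``Time`` assume the constructor
# of ctparse.types.Time:
#
#     entry = TimeParseEntry(
#         text="next Friday at noon",
#         ts=datetime(2019, 5, 1, 12, 0),
#         gold=Time(year=2019, month=5, day=3, hour=12, minute=0),
#     )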

T = TypeVar("T")


def make_partial_rule_dataset(
    entries: Sequence[TimeParseEntry],
    scorer: Scorer,
    timeout: Union[float, int],
    max_stack_depth: int,
    relative_match_len: float = 1.0,
    progress: bool = False,
) -> Iterable[Tuple[List[str], bool]]:
    """Build a dataset from an iterable of TimeParseEntry.

    The text is run through ctparse and all parses (within the specified
    timeout, max_stack_depth and scorer) are obtained. Each parse contains
    a sequence of rules (see ``CTParse.rules``) used to produce that parse.

    A dataset is generated by taking every possible partial rule sequence
    and assigning to it a boolean indicating whether that partial sequence
    led to a successful parse.

    If `progress` is ``True``, display a progress bar.

    Example:

        rule sequence: [r1, r2, r3]
        parse_is_correct: True

        [r1] -> True
        [r1, r2] -> True
        [r1, r2, r3] -> True
    """
    # Looking at the signature of a scorer, the score is obtained from
    # (text, reference_time, partial_parse) and optionally a production for
    # a partial parse. A general scorer for the dataset would need all of
    # these features, which could be achieved by tracking the list of
    # partial parses that led to a correct parse. We don't have the full
    # history with the current implementation, but a dataset of
    # (text, reference_time, rule_ids) is easy to obtain, because the rules
    # form a linear list.
    if progress:
        entries_it = _progress_bar(
            entries,
            total=len(entries),
            status_text=lambda entry: " {: <70}".format(entry.text),
        )
    else:
        entries_it = entries

    for entry in entries_it:
        for parse in ctparse_gen(
            entry.text,
            entry.ts,
            relative_match_len=relative_match_len,
            timeout=timeout,
            max_stack_depth=max_stack_depth,
            scorer=scorer,
            latent_time=False,
        ):
            # TODO: we should make sure ctparse_gen never returns None. If
            # there is no result it should return an empty list.
            if parse is None:
                continue
            y = parse.resolution == entry.gold
            # Build the dataset: one sample for each applied rule in the
            # sequence of rules applied in this production *after* the
            # matched regular expressions
            for i in range(1, len(parse.production) + 1):
                X = [str(p) for p in parse.production[:i]]
                yield X, y
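
# Usage sketch for ``make_partial_rule_dataset``: the corpus file name and
# parameter values below are assumptions, and ``DummyScorer`` merely stands
# in for a trained scorer:
#
#     entries = load_timeparse_corpus("corpus.json")
#     dataset = list(
#         make_partial_rule_dataset(
#             entries,
#             scorer=DummyScorer(),
#             timeout=10,
#             max_stack_depth=10,
#             progress=True,
#         )
#     )
#     # each element is a (partial_rule_sequence, parse_is_correct) pair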
def _progress_bar(
    it: Iterable[T], total: int, status_text: Callable[[T], str]
) -> Iterable[T]:
    # Progress bar that can update its description text per item
    pbar = tqdm(it, total=total)
    for val in pbar:
        pbar.set_description(status_text(val))
        yield val
def load_timeparse_corpus(fname: str) -> Sequence[TimeParseEntry]:
    """Load a corpus from disk.

    For more information about the format of the time parse corpus, refer
    to the documentation.
    """
    with open(fname, "r", encoding="utf-8") as fd:
        entries = json.load(fd)

    return [
        TimeParseEntry(
            text=e["text"],
            ts=datetime.strptime(e["ref_time"], "%Y-%m-%dT%H:%M:%S"),
            gold=parse_nb_string(e["gold_parse"]),
        )
        for e in entries
    ]
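
# ``load_timeparse_corpus`` expects a JSON list of objects with the keys
# used above. An illustrative entry (the gold_parse payload is an
# assumption; it must be a no-bound string as produced by
# ``Artifact.nb_str``):
#
#     [
#         {
#             "text": "next Friday at noon",
#             "ref_time": "2019-05-01T12:00:00",
#             "gold_parse": "Time[]{2019-05-03 12:00 (X/X)}"
#         }
#     ]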
def parse_nb_string(gold_parse: str) -> Union[Time, Interval, Duration]:
    """Parse a Time, Interval or Duration from its no-bound string
    representation.

    The no-bound string representations are generated by ``Artifact.nb_str``.
    """
    if gold_parse.startswith("Time"):
        return Time.from_str(gold_parse[7:-1])
    if gold_parse.startswith("Interval"):
        return Interval.from_str(gold_parse[11:-1])
    if gold_parse.startswith("Duration"):
        return Duration.from_str(gold_parse[11:-1])
    raise ValueError("'{}' has an invalid format".format(gold_parse))
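
# Example call for ``parse_nb_string``. The payload shown is illustrative;
# given the slicing above, the wrapper must look like "Time[]{...}",
# "Interval[]{...}" or "Duration[]{...}":
#
#     parse_nb_string("Time[]{2019-05-03 12:00 (X/X)}")
#     # -> a ctparse.types.Time instance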
def _run_corpus_one_test(
    target: str,
    ts_str: str,
    tests: List[str],
    ctparse_generator: Callable[[str, datetime], Iterator[Optional[CTParse]]],
) -> Tuple[List[List[str]], List[bool], int, int, int, int, int, bool]:
    ts = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M")
    all_tests_pass = True
    Xs = []
    ys = []
    pos_parses = neg_parses = pos_first_parses = pos_best_scored = total_tests = 0
    at_least_one_failed = False
    for test in tests:
        one_prod_passes = False
        first_prod = True
        y_score = []
        for parse in ctparse_generator(test, ts):
            assert parse is not None
            y = parse.resolution.nb_str() == target
            # Build the dataset: one sample for each applied rule in the
            # sequence of rules applied in this production *after* the
            # matched regular expressions
            for i in range(1, len(parse.production) + 1):
                Xs.append([str(p) for p in parse.production[:i]])
                ys.append(y)
            one_prod_passes |= y
            pos_parses += int(y)
            neg_parses += int(not y)
            pos_first_parses += int(y and first_prod)
            first_prod = False
            y_score.append((parse.score, y))
        if not one_prod_passes:
            logger.warning(
                'failure: target "{}" never produced in "{}"'.format(target, test)
            )
        pos_best_scored += int(max(y_score, key=lambda x: x[0])[1])
        # Count each test exactly once; incrementing by len(tests) here
        # would inflate the denominator used for the score-based metrics.
        total_tests += 1
        all_tests_pass &= one_prod_passes
    if not all_tests_pass:
        logger.warning('failure: "{}" not always produced'.format(target))
        at_least_one_failed = True
    return (
        Xs,
        ys,
        total_tests,
        pos_parses,
        neg_parses,
        pos_first_parses,
        pos_best_scored,
        at_least_one_failed,
    )
def run_single_test(target: str, ts: str, test: str) -> None:
    """Run a single test case and raise an exception if the target was
    never produced.

    If a test fails, consider increasing ``max_stack_depth`` below.

    Parameters
    ----------
    target : str
        Target to produce
    ts : str
        Reference time as *string*
    test : str
        Test case
    """

    def ctparse_generator(test: str, ts: datetime) -> Iterator[Optional[CTParse]]:
        return ctparse_gen(
            test,
            ts,
            relative_match_len=1.0,
            timeout=0,
            max_stack_depth=100,
            latent_time=False,
        )

    res = _run_corpus_one_test(target, ts, [test], ctparse_generator)
    if res[-1]:
        raise Exception(
            'failure: target "{}" never produced in "{}"'.format(target, test)
        )
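
# Usage sketch for ``run_single_test`` (values illustrative; note that the
# reference time must use the "%Y-%m-%dT%H:%M" format parsed by
# ``_run_corpus_one_test``):
#
#     run_single_test(
#         target="Time[]{2019-05-03 12:00 (X/X)}",
#         ts="2019-05-01T12:00",
#         test="next Friday at noon",
#     )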
def run_corpus(
    corpus: Sequence[Tuple[str, str, Sequence[str]]]
) -> Tuple[List[List[str]], List[bool]]:
    """Run the corpus through ctparse with no timeout and no limit on the
    stack depth.

    The corpus passes if ctparse generates the desired solution for each
    test at least once. Otherwise it fails.

    While testing this, a labeled data set (X, y) is generated based on
    *all* productions. A final production p, based on initial regular
    expression matches r_0, ..., r_n that are subsequently transformed
    using production rules p_0, ..., p_m, results in the samples

    [r_0, ..., r_n, p_0, 'step_0']
    [r_0, ..., r_n, p_0, p_1, 'step_1']
    ...
    [r_0, ..., r_n, p_0, ..., p_m, 'step_m']

    All samples from one production are given the same label, which
    indicates whether the production was correct.

    To build a similar dataset without the strict checking, use
    `make_partial_rule_dataset`.
    """
    at_least_one_failed = False
    # pos_parses: number of parses that are correct
    # neg_parses: number of parses that are wrong
    # pos_first_parses: number of first parses generated that are correct
    # pos_best_scored: number of correct parses that have the best score
    pos_parses = neg_parses = pos_first_parses = pos_best_scored = 0
    total_tests = 0
    Xs = []
    ys = []

    def ctparse_generator(test: str, ts: datetime) -> Iterator[Optional[CTParse]]:
        return ctparse_gen(
            test,
            ts,
            relative_match_len=1.0,
            timeout=0,
            max_stack_depth=0,
            scorer=DummyScorer(),
            latent_time=False,
        )

    for target, ts, tests in tqdm(corpus):
        (
            Xs_,
            ys_,
            total_tests_,
            pos_parses_,
            neg_parses_,
            pos_first_parses_,
            pos_best_scored_,
            at_least_one_failed_,
        ) = _run_corpus_one_test(target, ts, tests, ctparse_generator)
        Xs.extend(Xs_)
        ys.extend(ys_)
        total_tests += total_tests_
        pos_parses += pos_parses_
        neg_parses += neg_parses_
        pos_first_parses += pos_first_parses_
        pos_best_scored += pos_best_scored_
        at_least_one_failed = at_least_one_failed or at_least_one_failed_

    logger.info(
        "ran {} tests on {} targets with a total of "
        "{} positive and {} negative parses (={})".format(
            total_tests, len(corpus), pos_parses, neg_parses, pos_parses + neg_parses
        )
    )
    logger.info(
        "share of correct parses in all parses: {:.2%}".format(
            pos_parses / (pos_parses + neg_parses)
        )
    )
    logger.info(
        "share of correct parses being produced first: {:.2%}".format(
            pos_first_parses / (pos_parses + neg_parses)
        )
    )
    logger.info(
        "share of correct parses being scored highest: {:.2%}".format(
            pos_best_scored / total_tests
        )
    )
    if at_least_one_failed:
        raise Exception("ctparse corpus has errors")
    return Xs, ys
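
# Usage sketch for ``run_corpus`` with a minimal one-target corpus; the
# values are illustrative:
#
#     corpus = [
#         (
#             "Time[]{2019-05-03 12:00 (X/X)}",  # target nb-string
#             "2019-05-01T12:00",                # reference time
#             ["next Friday at noon"],           # test texts
#         )
#     ]
#     Xs, ys = run_corpus(corpus)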