# coding: utf-8
"""
Evaluation metrics
"""
from inspect import getfullargspec
from typing import Callable, List

from sacrebleu.metrics import BLEU, CHRF

from joeynmt.helpers_for_ddp import get_logger

logger = get_logger(__name__)

def chrf(hypotheses: List[str], references: List[str], **sacrebleu_cfg) -> float:
    """
    Character F-score from sacrebleu

    cf. https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: character f-score (0 <= chrf <= 1);
        see the breaking change in sacrebleu v2.0 (scores are reported on a
        0-100 scale, hence the division by 100 below)
    """
    kwargs = {}
    if sacrebleu_cfg:
        valid_keys = getfullargspec(CHRF).args
        for k, v in sacrebleu_cfg.items():
            if k in valid_keys:
                kwargs[k] = v

    metric = CHRF(**kwargs)
    score = metric.corpus_score(hypotheses=hypotheses, references=[references]).score

    # log sacrebleu signature
    logger.info(metric.get_signature())
    return score / 100
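

# Hedged usage sketch (illustrative, not part of the original module): kwargs
# that match CHRF's constructor signature are forwarded, anything else is
# silently dropped. ``word_order=2`` is a real CHRF argument (it selects
# chrF++); the toy sentences are made up.
def _example_chrf() -> float:
    hyps = ["the cat sat on the mat"]
    refs = ["the cat sat on a mat"]
    # chrf() divides sacrebleu's 0-100 score by 100, so this lies in [0, 1]
    return chrf(hyps, refs, word_order=2)
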
def bleu(hypotheses: List[str], references: List[str], **sacrebleu_cfg) -> float:
    """
    Raw corpus BLEU from sacrebleu (without tokenization)

    cf. https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/bleu.py

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: bleu score (float, on sacrebleu's 0-100 scale)
    """
    kwargs = {}
    if sacrebleu_cfg:
        valid_keys = getfullargspec(BLEU).args
        for k, v in sacrebleu_cfg.items():
            if k in valid_keys:
                kwargs[k] = v

    metric = BLEU(**kwargs)
    score = metric.corpus_score(hypotheses=hypotheses, references=[references]).score

    # log sacrebleu signature
    logger.info(metric.get_signature())
    return score
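

# Hedged usage sketch (illustrative): ``tokenize`` and ``lowercase`` are real
# sacrebleu BLEU constructor arguments; unknown keys would be filtered out by
# the signature check above.
def _example_bleu() -> float:
    hyps = ["the cat sat on the mat"]
    refs = ["the cat sat on a mat"]
    # bleu() returns sacrebleu's native 0-100 corpus BLEU unchanged
    return bleu(hyps, refs, tokenize="13a", lowercase=True)
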
def token_accuracy(
    hypotheses: List[str], references: List[str], tokenizer: Callable
) -> float:
    """
    Compute the accuracy of hypothesis tokens: correct tokens / all tokens
    Tokens are correct if they appear in the same position in the reference.
    We look up the references before one-hot encoding, that is, UNK generation
    in hypotheses is always evaluated as incorrect.

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param tokenizer: tokenization function that splits a string into tokens
    :return: token accuracy (float)
    """
    correct_tokens = 0
    all_tokens = 0
    assert len(hypotheses) == len(references)
    for hyp, ref in zip(hypotheses, references):
        hyp = tokenizer(hyp)
        ref = tokenizer(ref)
        all_tokens += len(hyp)
        # only the first min(len(hyp), len(ref)) positions are compared
        for h_i, r_i in zip(hyp, ref):
            if h_i == r_i:
                correct_tokens += 1
    return (correct_tokens / all_tokens) * 100 if all_tokens > 0 else 0.0
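

# Hedged usage sketch (illustrative): ``str.split`` stands in for a project
# tokenizer here; any Callable mapping a string to a list of tokens works.
def _example_token_accuracy() -> float:
    hyps = ["a b c", "d e"]
    refs = ["a b x", "d e"]
    # 4 of the 5 hypothesis tokens match their reference position -> 80.0
    return token_accuracy(hyps, refs, tokenizer=str.split)
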
def sequence_accuracy(hypotheses: List[str], references: List[str]) -> float:
    """
    Compute the accuracy of hypothesis sequences: correct sequences / all sequences
    A sequence is correct if it matches the reference exactly.
    We look up the references before one-hot encoding, that is, hypotheses
    containing UNK are always evaluated as incorrect.

    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return: sequence accuracy (float)
    """
    assert len(hypotheses) == len(references)
    correct_sequences = sum([
        1 for (hyp, ref) in zip(hypotheses, references) if hyp == ref
    ])
    return (correct_sequences / len(hypotheses)) * 100 if hypotheses else 0.0
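

# Hedged usage sketch (illustrative): whole-string exact match per pair,
# averaged over the corpus and scaled to 0-100.
def _example_sequence_accuracy() -> float:
    hyps = ["hello world", "good morning"]
    refs = ["hello world", "good evening"]
    # 1 of 2 sequences matches its reference exactly -> 50.0
    return sequence_accuracy(hyps, refs)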