# OpenKiwi: Open-Source Machine Translation Quality Estimation
# Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import logging
from pathlib import Path
from pprint import pformat

from kiwi.data.builders import build_test_dataset
from kiwi.data.utils import (
    deserialize_fields_from_vocabs,
    save_predicted_probabilities,
)
from kiwi.lib.utils import (
    configure_device,
    configure_logging,
    configure_seed,
    save_config_file,
    setup_output_directory,
)
from kiwi.models.linear_word_qe_classifier import LinearWordQEClassifier
from kiwi.models.model import Model
from kiwi.predictors.linear_tester import LinearTester
from kiwi.predictors.predictor import Predicter

logger = logging.getLogger(__name__)


def predict_from_options(options):
    """
    Uses the configuration options to run the prediction pipeline.

    Iteratively calls `setup`, `run` and `teardown`.

    Args:
        options (Namespace): Namespace containing all parsed options.
    """
    logger.debug("Setting up predict...")
    output_dir = setup(options.pipeline)
    logger.debug("Predict set up. Running...")
    run(options.model_api, output_dir, options.pipeline, options.model)
    logger.debug("Prediction finished. Tearing down...")
    teardown(options.pipeline)
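
# A minimal driving sketch: calling ``predict_from_options`` from code instead
# of the ``kiwi predict`` CLI. Building the nested namespaces by hand is an
# assumption for illustration; normally they come from OpenKiwi's own option
# parser. The attribute names mirror exactly what this module reads.
#
#     from argparse import Namespace
#     options = Namespace(
#         model_api=SomeModelClass,  # hypothetical Model subclass
#         pipeline=Namespace(...),   # fields read by `setup` and `run`
#         model=Namespace(...),      # model-specific fields
#     )
#     predict_from_options(options)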


def load_model(model_path):
    """Load a pretrained model into a `Predicter` object.

    Args:
        model_path (str): Path to the saved model file.

    Raises:
        Exception: If the path does not exist, or is not a valid model file.
    """
    model_path = Path(model_path)
    if not model_path.exists():
        raise Exception('Path "{}" does not exist!'.format(model_path))
    model = Model.create_from_file(model_path)
    if not model:
        raise Exception('No model found in "{}"'.format(model_path))
    fieldset = model.fieldset()
    fields = deserialize_fields_from_vocabs(fieldset.fields, model.vocabs)
    predicter = Predicter(model, fields=fields)
    return predicter
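
# Usage sketch for ``load_model``. The ``predict`` call and the example keys
# below are assumptions based on the ``Predicter`` interface; the exact keys
# depend on the fieldset the model was trained with.
#
#     predicter = load_model('runs/0/best_model.torch')  # hypothetical path
#     examples = {'source': ['a b c .'], 'target': ['q w e .']}
#     predictions = predicter.predict(examples)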


def run(ModelClass, output_dir, pipeline_opts, model_opts):
    """
    Runs the prediction pipeline. Loads the model and the necessary files
    and creates the model's predictions for all data received.

    Args:
        ModelClass (type): Python type of the model to predict with.
        output_dir: Directory in which to save predictions.
        pipeline_opts (Namespace): Generic predict options; ``batch_size``
            is the maximum batch size for predicting.
        model_opts (Namespace): Model-specific options.

    Returns:
        dict: Predictions in the format {'target': predictions}.
    """
    model_name = getattr(ModelClass, "title", ModelClass.__name__)
    logger.info("Predict with the {} model".format(model_name))
    if ModelClass == LinearWordQEClassifier:
        load_vocab = None
        model = LinearWordQEClassifier(
            evaluation_metric=model_opts.evaluation_metric
        )
        model.load(pipeline_opts.load_model)
        predicter = LinearTester(model)
    else:
        load_vocab = pipeline_opts.load_model
        model = Model.create_from_file(pipeline_opts.load_model)

        # Move the model to GPU or CPU before wrapping it in a Predicter.
        device_id = None
        if pipeline_opts.gpu_id is not None and pipeline_opts.gpu_id >= 0:
            device_id = pipeline_opts.gpu_id
        model.to(device_id)
        predicter = Predicter(model)

    test_dataset = build_test_dataset(
        fieldset=ModelClass.fieldset(
            wmt18_format=model_opts.__dict__.get("wmt18_format")
        ),
        load_vocab=load_vocab,
        **vars(model_opts),
    )
    predictions = predicter.run(
        test_dataset, batch_size=pipeline_opts.batch_size
    )
    save_predicted_probabilities(output_dir, predictions)
    return predictions
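
# Usage sketch for calling ``run`` directly. The attribute names on the two
# namespaces are the ones ``run`` itself reads; the model class, paths and
# extra ``model_opts`` fields (consumed by ``build_test_dataset``) are
# illustrative assumptions.
#
#     from argparse import Namespace
#     from kiwi.models.nuqe import NuQE  # any registered Model subclass
#     pipeline_opts = Namespace(
#         load_model='runs/0/best_model.torch',  # hypothetical path
#         gpu_id=None,  # None or a negative id means CPU
#         batch_size=32,
#     )
#     model_opts = Namespace(test_source='dev.src', test_target='dev.mt')
#     predictions = run(NuQE, 'predictions/', pipeline_opts, model_opts)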


def setup(options):
    """
    Analyzes pipeline options and sets up requirements for running the
    prediction pipeline. This includes setting up the output directory,
    random seeds and the device where predictions are run.

    Args:
        options (Namespace): Pipeline-specific options.

    Returns:
        output_dir (str): Path to the output directory.
    """
    output_dir = setup_output_directory(
        options.output_dir, options.run_uuid, experiment_id=None, create=True
    )
    configure_logging(
        output_dir=output_dir, debug=options.debug, quiet=options.quiet
    )
    configure_seed(options.seed)
    configure_device(options.gpu_id)
    logger.info(pformat(vars(options)))
    logger.info("Local output directory is: {}".format(output_dir))
    if options.save_config:
        save_config_file(options, options.save_config)

    # FIXME: remove this after making sure no other place uses it!  # noqa
    del options.output_dir

    return output_dir
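
# Sketch of the minimal options ``setup`` touches; all attribute names below
# are read by ``setup`` itself, and the values are illustrative assumptions.
#
#     from argparse import Namespace
#     opts = Namespace(output_dir='runs', run_uuid='0', debug=False,
#                      quiet=False, seed=42, gpu_id=None, save_config=None)
#     out_dir = setup(opts)  # note: deletes ``opts.output_dir`` as a side effect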


def teardown(options):
    """
    Tears down after executing the prediction pipeline.

    Args:
        options (Namespace): Pipeline-specific options.
    """
    pass