Source code for kiwi.predictors.predictor

#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

import logging
from collections import defaultdict

import torch
from torchtext.data import Example

from kiwi import constants as const
from kiwi.data.iterators import build_bucket_iterator
from kiwi.data.qe_dataset import QEDataset

logger = logging.getLogger(__name__)


class Predicter:
    def __init__(self, model, fields=None):
        """Class to load a model for inference.

        Args:
            model (kiwi.models.Model): A trained QE model.
            fields (dict[str: Field]): A dict mapping field names to Fields.
                Used for online prediction.
        """
        self.model = model
        self.fields = fields
        # Will break in Multi GPU mode
        self._device = next(model.parameters()).device
    def to(self, device):
        """Move the Predicter object to another device, e.g. "cuda".

        Args:
            device (str): Device to which the model should be moved.
        """
        self._device = device
        self.model.to(device)
    def predict(self, examples, batch_size=1):
        """Create predictions for a list of examples.

        Args:
            examples: A dict mapping field names to the list of raw examples
                (strings).
            batch_size: Batch size to use. Default 1.

        Returns:
            A dict mapping prediction levels (word, sentence, ...) to the
            model predictions for each example.

        Raises:
            Exception: If an example has an empty string as `source` or
                `target` field.

        Example:
            >>> import kiwi
            >>> predictor = kiwi.load_model('tests/toy-data/models/nuqe.torch')
            >>> src = ['a b c', 'd e f g']
            >>> tgt = ['q w e r', 't y']
            >>> align = ['0-0 1-1 1-2', '1-1 3-0']
            >>> examples = {kiwi.constants.SOURCE: src,
            ...             kiwi.constants.TARGET: tgt,
            ...             kiwi.constants.ALIGNMENTS: align}
            >>> predictor.predict(examples)
            {'tags': [[0.4760947525501251, 0.47569847106933594, 0.4948718547821045, 0.5305878520011902], [0.5105430483818054, 0.5252899527549744]]}
        """
        if not examples:
            return defaultdict(list)

        if self.fields is None:
            raise Exception('Missing fields object.')

        if not examples.get(const.SOURCE):
            raise KeyError('Missing required field "{}"'.format(const.SOURCE))
        if not examples.get(const.TARGET):
            raise KeyError('Missing required field "{}"'.format(const.TARGET))

        if not all(
            [s.strip() for s in examples[const.SOURCE] + examples[const.TARGET]]
        ):
            raise Exception(
                'Empty String in {} or {} field found!'.format(
                    const.SOURCE, const.TARGET
                )
            )

        fields = [(name, self.fields[name]) for name in examples]
        field_examples = [
            Example.fromlist(values, fields)
            for values in zip(*examples.values())
        ]

        dataset = QEDataset(field_examples, fields=fields)

        return self.run(dataset, batch_size)
    def run(self, dataset, batch_size=1):
        """Run the model over a dataset and collect its predictions.

        Args:
            dataset (QEDataset): Dataset of examples to predict on.
            batch_size: Batch size to use. Default 1.

        Returns:
            A dict mapping prediction levels to lists of predictions.
        """
        iterator = build_bucket_iterator(
            dataset, self._device, batch_size, is_train=False
        )
        self.model.eval()
        predictions = defaultdict(list)
        with torch.no_grad():
            for batch in iterator:
                model_pred = self.model.predict(batch)
                for key, values in model_pred.items():
                    if isinstance(values, list):
                        predictions[key] += values
                    else:
                        predictions[key].append(values)

        return dict(predictions)
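
A minimal usage sketch (not part of the module source), assuming a trained model saved at the toy path used in the docstring above and an optional GPU; `kiwi.load_model` and the field constants are taken from that docstring, the batch size and the CUDA check are illustrative assumptions.

    # Usage sketch: load a trained QE model and predict word-level quality tags.
    import torch

    import kiwi

    predictor = kiwi.load_model('tests/toy-data/models/nuqe.torch')
    if torch.cuda.is_available():
        predictor.to('cuda')  # move the model (and future batches) to the GPU

    examples = {
        kiwi.constants.SOURCE: ['a b c', 'd e f g'],
        kiwi.constants.TARGET: ['q w e r', 't y'],
        kiwi.constants.ALIGNMENTS: ['0-0 1-1 1-2', '1-1 3-0'],
    }
    predictions = predictor.predict(examples, batch_size=2)
    print(predictions['tags'])  # per-token quality scores, one list per target sentence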