# -*- coding: utf-8 -*-
# Copyright (C) 2020 Unbabel
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
ReferencelessRegression
========================
Referenceless Regression Metric that learns to predict a quality assessment by
looking at source and translation.
"""
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
import torch

from comet.models.regression.regression_metric import RegressionMetric
from comet.modules import FeedForward


class ReferencelessRegression(RegressionMetric):
"""ReferencelessRegression:
:param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen.
:param keep_embeddings_frozen: Keeps the encoder frozen during training.
:param optimizer: Optimizer used during training.
:param encoder_learning_rate: Learning rate used to fine-tune the encoder model.
:param learning_rate: Learning rate used to fine-tune the top layers.
:param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers.
:param encoder_model: Encoder model to be used.
:param pretrained_model: Pretrained model from Hugging Face.
:param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg'].
:param layer: Encoder layer to be used ('mix' for pooling info from all layers.)
:param dropout: Dropout used in the top-layers.
:param batch_size: Batch size used during training.
:param train_data: Path to a csv file containing the training data.
:param validation_data: Path to a csv file containing the validation data.
:param hidden_sizes: Hidden sizes for the Feed Forward regression.
:param activations: Feed Forward activation function.
:param load_weights_from_checkpoint: Path to a checkpoint file.
"""
    def __init__(
        self,
        nr_frozen_epochs: Union[float, int] = 0.3,
        keep_embeddings_frozen: bool = False,
        optimizer: str = "AdamW",
        encoder_learning_rate: float = 1e-05,
        learning_rate: float = 3e-05,
        layerwise_decay: float = 0.95,
        encoder_model: str = "XLM-RoBERTa",
        pretrained_model: str = "xlm-roberta-base",
        pool: str = "avg",
        layer: Union[str, int] = "mix",
        dropout: float = 0.1,
        batch_size: int = 4,
        train_data: Optional[str] = None,
        validation_data: Optional[str] = None,
        hidden_sizes: List[int] = [1024],
        activations: str = "Tanh",
        final_activation: Optional[str] = None,
        load_weights_from_checkpoint: Optional[str] = None,
    ) -> None:
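        # Deliberately skip RegressionMetric.__init__ in the MRO and call the
        # base model initializer instead: RegressionMetric would build a
        # reference-based estimator, while this class rebuilds the estimator
        # below for source-only (referenceless) features.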
        super(RegressionMetric, self).__init__(
            nr_frozen_epochs,
            keep_embeddings_frozen,
            optimizer,
            encoder_learning_rate,
            learning_rate,
            layerwise_decay,
            encoder_model,
            pretrained_model,
            pool,
            layer,
            dropout,
            batch_size,
            train_data,
            validation_data,
            load_weights_from_checkpoint,
            "referenceless_regression_metric",
        )
        self.save_hyperparameters()
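        # The estimator consumes the concatenation of four feature vectors
        # built in `forward` (mt, src, mt * src, |mt - src|), hence the
        # input dimension of `output_units * 4`.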
        self.estimator = FeedForward(
            in_dim=self.encoder.output_units * 4,
            hidden_sizes=self.hparams.hidden_sizes,
            activations=self.hparams.activations,
            dropout=self.hparams.dropout,
            final_activation=self.hparams.final_activation,
        )
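
    # A minimal usage sketch (illustrative, not part of the original module);
    # it assumes the default XLM-R checkpoint can be downloaded and the csv
    # paths are hypothetical:
    #
    #     model = ReferencelessRegression(
    #         pretrained_model="xlm-roberta-base",
    #         hidden_sizes=[1024],
    #         train_data="train.csv",        # hypothetical path
    #         validation_data="dev.csv",     # hypothetical path
    #     )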

    def prepare_sample(
        self, sample: List[Dict[str, Union[str, float]]], inference: bool = False
    ) -> Union[
        Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor]
    ]:
"""
Function that prepares a sample to input the model.
:param sample: list of dictionaries.
:param inference: If set to true prepares only the model inputs.
:returns: Tuple with 2 dictionaries (model inputs and targets).
If `inference=True` returns only the model inputs.
"""
        sample = {k: [dic[k] for dic in sample] for k in sample[0]}
        src_inputs = self.encoder.prepare_sample(sample["src"])
        mt_inputs = self.encoder.prepare_sample(sample["mt"])

        src_inputs = {"src_" + k: v for k, v in src_inputs.items()}
        mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()}
        inputs = {**src_inputs, **mt_inputs}

        if inference:
            return inputs

        targets = {"score": torch.tensor(sample["score"], dtype=torch.float)}
        return inputs, targets
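
    # A hedged sketch of the expected call, assuming `model` is an instance
    # of this class (field values are made up for illustration):
    #
    #     batch = [
    #         {"src": "Olá, mundo!", "mt": "Hello, world!", "score": 0.9},
    #         {"src": "Tudo bem?", "mt": "All good?", "score": 0.8},
    #     ]
    #     inputs, targets = model.prepare_sample(batch)               # training mode
    #     model_inputs = model.prepare_sample(batch, inference=True)  # inputs only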

    def forward(
        self,
        src_input_ids: torch.Tensor,
        src_attention_mask: torch.Tensor,
        mt_input_ids: torch.Tensor,
        mt_attention_mask: torch.Tensor,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
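        """Computes a quality score for a batch of (source, translation) pairs
        from their pooled sentence embeddings.
        """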
        src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask)
        mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask)
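        # Combine the two embeddings with their element-wise difference and
        # product before feeding the regression head.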
        diff_src = torch.abs(mt_sentemb - src_sentemb)
        prod_src = mt_sentemb * src_sentemb

        embedded_sequences = torch.cat(
            (mt_sentemb, src_sentemb, prod_src, diff_src), dim=1
        )
        return {"score": self.estimator(embedded_sequences)}

    def read_csv(self, path: str) -> List[dict]:
        """Reads a comma-separated values (CSV) file.

        :param path: path to a csv file with ``src``, ``mt`` and ``score`` columns.

        :return: List of records as dictionaries.
        """
        df = pd.read_csv(path)
        df = df[["src", "mt", "score"]]
        df["src"] = df["src"].astype(str)
        df["mt"] = df["mt"].astype(str)
        df["score"] = df["score"].astype(float)
        return df.to_dict("records")