# Source code for kiwi.data.fieldsets.quetch

#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

from torchtext import data

from kiwi import constants as const
from kiwi.data import utils
from kiwi.data.fields.alignment_field import AlignmentField
from kiwi.data.fields.qe_field import QEField
from kiwi.data.fields.sequence_labels_field import SequenceLabelsField
from kiwi.data.fieldsets.fieldset import Fieldset
from kiwi.data.tokenizers import align_tokenizer, tokenizer


def build_fieldset(wmt18_format=False):
    """Build the fieldset used by the QUETCH quality-estimation model.

    Registers the source and target text fields, the source-target
    alignments field, and the tag fields (gap tags, target tags, and
    source tags) on a :class:`Fieldset`.

    Args:
        wmt18_format (bool): if ``True``, the target tags file is assumed
            to be in WMT18 format, where gap tags are interleaved with
            token tags; the tags are then split via the
            ``utils.wmt18_to_gaps`` / ``utils.wmt18_to_target`` pipelines
            and an extra GAP_TAGS field is registered.

    Returns:
        Fieldset: the configured fieldset.
    """
    fs = Fieldset()

    fs.add(
        name=const.SOURCE,
        field=QEField(
            tokenize=tokenizer,
            init_token=None,
            eos_token=None,
            include_lengths=True,
        ),
        file_option_suffix='_source',
        required=Fieldset.ALL,
        vocab_options=dict(
            min_freq='source_vocab_min_frequency',
            max_size='source_vocab_size',
            rare_with_vectors='keep_rare_words_with_embeddings',
            add_vectors_vocab='add_embeddings_vocab',
        ),
        vocab_vectors='source_embeddings',
    )
    fs.add(
        name=const.TARGET,
        field=QEField(
            tokenize=tokenizer,
            init_token=None,
            eos_token=None,
            include_lengths=True,
        ),
        file_option_suffix='_target',
        required=Fieldset.ALL,
        vocab_options=dict(
            min_freq='target_vocab_min_frequency',
            max_size='target_vocab_size',
            rare_with_vectors='keep_rare_words_with_embeddings',
            add_vectors_vocab='add_embeddings_vocab',
        ),
        vocab_vectors='target_embeddings',
    )
    fs.add(
        name=const.ALIGNMENTS,
        field=AlignmentField(tokenize=align_tokenizer, use_vocab=False),
        file_option_suffix='_alignments',
        required=Fieldset.ALL,
    )

    # By default, target tags only need projection; in WMT18 format the
    # file interleaves gap and token tags, so both must be extracted.
    post_pipe_target = data.Pipeline(utils.project)
    if wmt18_format:
        post_pipe_gaps = data.Pipeline(utils.wmt18_to_gaps)
        post_pipe_target = data.Pipeline(utils.wmt18_to_target)
        # GAP_TAGS must be registered inside this branch: its
        # postprocessing pipeline (post_pipe_gaps) is only defined for
        # WMT18-formatted tag files. Registering it unconditionally
        # would raise NameError when wmt18_format is False.
        fs.add(
            name=const.GAP_TAGS,
            field=SequenceLabelsField(
                classes=const.LABELS,
                tokenize=tokenizer,
                pad_token=const.PAD,
                unk_token=None,
                batch_first=True,
                # eos_token=const.STOP,
                postprocessing=post_pipe_gaps,
            ),
            file_option_suffix='_target_tags',
            required=[Fieldset.TRAIN, Fieldset.VALID],
        )
    fs.add(
        name=const.TARGET_TAGS,
        field=SequenceLabelsField(
            classes=const.LABELS,
            tokenize=tokenizer,
            pad_token=const.PAD,
            unk_token=None,
            batch_first=True,
            postprocessing=post_pipe_target,
        ),
        file_option_suffix='_target_tags',
        required=[Fieldset.TRAIN, Fieldset.VALID],
    )
    # Source tags are optional (e.g. absent at prediction time).
    fs.add(
        name=const.SOURCE_TAGS,
        field=SequenceLabelsField(
            classes=const.LABELS,
            tokenize=tokenizer,
            pad_token=const.PAD,
            unk_token=None,
            batch_first=True,
        ),
        file_option_suffix='_source_tags',
        required=None,
    )
    return fs