Source code for kiwi.models.linear.linear_word_qe_features

"""A class for handling features for word-level quality estimation."""

#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

import numpy as np

from kiwi.models.linear.linear_word_qe_sentence import LinearWordQESentence

from .sparse_feature_vector import SparseFeatureVector


def quantize(value, bins_down):
    """Quantize a numeric feature into bins.

    Example: bins = [50, 40, 30, 25, 20, 18, 16, 14, 12, 10].
    """
    bin_up = np.inf
    for bin_down in bins_down:
        if bin_down < value <= bin_up:
            bin_value = bin_down
            return bin_value
        bin_up = bin_down
    return value
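
# Usage sketch for quantize() (illustrative, not part of the original
# module): a value is mapped to the largest bin strictly below it, and
# values at or below the smallest bin are returned unchanged.
#
#     >>> quantize(33, [40, 30, 20, 10])
#     30
#     >>> quantize(45, [40, 30, 20, 10])
#     40
#     >>> quantize(5, [40, 30, 20, 10])
#     5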

class LinearWordQEFeatures(SparseFeatureVector):
    """This class implements a feature vector for word-level quality
    estimation."""

    def __init__(
        self,
        use_basic_features_only=True,
        use_simple_bigram_features=True,
        use_parse_features=False,
        use_stacked_features=False,
        save_to_cache=False,
        load_from_cache=False,
        cached_features_file=None,
    ):
        SparseFeatureVector.__init__(
            self, save_to_cache, load_from_cache, cached_features_file
        )
        self.use_basic_features_only = use_basic_features_only
        # True for using only a single bigram indicator feature.
        self.use_simple_bigram_features = use_simple_bigram_features
        self.use_parse_features = use_parse_features
        self.use_stacked_features = use_stacked_features
        self.use_client_features = False
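
    # Construction sketch (illustrative only, using the keyword arguments of
    # __init__ above): the defaults keep the basic feature set and reduce
    # bigrams to a single indicator feature.
    #
    #     features = LinearWordQEFeatures(
    #         use_basic_features_only=True,
    #         use_simple_bigram_features=True,
    #     )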

    def get_siblings(self, sentence_word_features, index):
        """Return the token and POS tag of the closest left and right
        siblings of a word (the nearest words attached to the same head).
        Sentinel strings are used when a sibling does not exist
        (`__ROOT__`) or when the index is out of range (`__START__`)."""
        if index < 0 or index >= len(sentence_word_features):
            info = None
        else:
            info = sentence_word_features[index]
        if info is not None:
            siblings = [
                k
                for k in range(len(sentence_word_features))
                if sentence_word_features[k].target_head == info.target_head
            ]
            left_siblings = [k for k in siblings if k < index]
            right_siblings = [k for k in siblings if k > index]
            if len(left_siblings) > 0:
                left_sibling = max(left_siblings)
            else:
                left_sibling = -1
            if len(right_siblings) > 0:
                right_sibling = min(right_siblings)
            else:
                right_sibling = -1
        else:
            left_sibling = -2
            right_sibling = -2
        if left_sibling >= 0:
            left_sibling_info = sentence_word_features[left_sibling]
            left_sibling_token = left_sibling_info.token
            left_sibling_pos = left_sibling_info.target_pos
        elif left_sibling == -1:
            left_sibling_token = '__ROOT__'
            left_sibling_pos = '__ROOT__'
        else:
            left_sibling_info = None
            left_sibling_token = '__START__'
            left_sibling_pos = '__START__'
        if right_sibling >= 0:
            right_sibling_info = sentence_word_features[right_sibling]
            right_sibling_token = right_sibling_info.token
            right_sibling_pos = right_sibling_info.target_pos
        elif right_sibling == -1:
            right_sibling_info = None
            right_sibling_token = '__ROOT__'
            right_sibling_pos = '__ROOT__'
        else:
            right_sibling_info = None
            right_sibling_token = '__START__'
            right_sibling_pos = '__START__'
        return (
            left_sibling_token,
            left_sibling_pos,
            right_sibling_token,
            right_sibling_pos,
        )

    def get_head(self, sentence_word_features, index):
        """Return the index, token, POS tag, and morphology of a word's
        syntactic head. The root is signalled by `__ROOT__` sentinels and
        an out-of-range index by `__START__` sentinels."""
        if index < 0 or index >= len(sentence_word_features):
            info = None
        else:
            info = sentence_word_features[index]
        if info is not None:
            head_index = info.target_head - 1
        else:
            head_index = -2
        if head_index >= 0:
            head_info = sentence_word_features[head_index]
            head_token = head_info.token
            head_pos = head_info.target_pos
            head_morph = head_info.target_morph
        elif head_index == -1:
            head_info = None
            head_token = '__ROOT__'
            head_pos = '__ROOT__'
            head_morph = '__ROOT__'
        else:
            head_info = None
            head_token = '__START__'
            head_pos = '__START__'
            head_morph = '__START__'
        return head_index, head_token, head_pos, head_morph
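
    # Sentinel convention shared by get_head() and get_siblings()
    # (illustrative summary): index -1 denotes the dependency root and -2 an
    # out-of-range position. For example, with a hypothetical `features`
    # instance and a word whose target_head is 0:
    #
    #     head_index, head_token, head_pos, head_morph = features.get_head(
    #         sentence_word_features, index
    #     )
    #     # head_index == -1 and head_token == '__ROOT__'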

    def compute_unigram_features(self, sentence_word_features, part):
        """Compute unigram features (depending only on a single label)."""
        if self.load_from_cache:
            self.load_cached_features()
            return
        index = part.index
        ignore_source = False
        only_basic_features = self.use_basic_features_only
        use_client_features = self.use_client_features
        use_parse_features = self.use_parse_features
        use_stacked_features = self.use_stacked_features
        use_bias = True
        use_language_model = True
        use_binary_features = False
        if use_parse_features:
            use_split_morphs = False
            use_morph_features = False
            use_deprel_features = True
            use_head_features = True
            use_grandparent_features = True
            use_sibling_features = True
        else:
            use_split_morphs = False
            use_morph_features = False
            use_deprel_features = False
            use_head_features = False
            use_grandparent_features = False
            use_sibling_features = False
        use_unuseful_shared_task_features = False

        info = sentence_word_features[index]
        if use_client_features:
            labels = [
                str(part.label),
                info.client_name + '_' + str(part.label),
            ]
        else:
            labels = [str(part.label)]
        for label in labels:
            if use_bias:
                self.add_binary_feature('BIAS_%s' % label)
            if use_unuseful_shared_task_features:
                self.add_binary_feature(
                    'F0=%d_%s'
                    % (
                        quantize(info.source_token_count, [40, 30, 20, 10]),
                        label,
                    )
                )
                self.add_binary_feature(
                    'F1=%d_%s'
                    % (
                        quantize(info.target_token_count, [40, 30, 20, 10]),
                        label,
                    )
                )
                self.add_binary_feature(
                    'F2=%f_%s'
                    % (
                        quantize(
                            info.source_target_token_count_ratio, [5.0, 2.0]
                        ),
                        label,
                    )
                )
            self.add_binary_feature('F3=%s_%s' % (info.token, label))
            self.add_binary_feature('F4=%s_%s' % (info.left_context, label))
            self.add_binary_feature('F5=%s_%s' % (info.right_context, label))
            if not ignore_source:
                self.add_binary_feature(
                    'F6=%s_%s' % (info.first_aligned_token, label)
                )
                self.add_binary_feature(
                    'F7=%s_%s' % (info.left_alignment, label)
                )
                self.add_binary_feature(
                    'F8=%s_%s' % (info.right_alignment, label)
                )
            if use_binary_features and not only_basic_features:
                # Ablated for German WMT16 (the provided stoplist is wrong).
                # self.add_binary_feature(
                #     'F9=%d_%s' % (int(info.is_stopword), label))
                self.add_binary_feature(
                    'F10=%d_%s' % (int(info.is_punctuation), label)
                )
                # Ablated for German (capitalized words are nouns).
                # self.add_binary_feature(
                #     'F11=%d_%s' % (int(info.is_proper_noun), label))
                self.add_binary_feature(
                    'F12=%d_%s' % (int(info.is_digit), label)
                )
            if use_language_model and not only_basic_features:
                self.add_binary_feature(
                    'F13=%d_%s' % (info.highest_order_ngram_left, label)
                )
                self.add_binary_feature(
                    'F14=%d_%s' % (info.highest_order_ngram_right, label)
                )
            # if use_language_model and not only_basic_features:
            #     self.add_binary_feature(
            #         'F15=%d_%s' % (info.backoff_behavior_left, label))
            #     self.add_binary_feature(
            #         'F16=%d_%s' % (info.backoff_behavior_middle, label))
            #     self.add_binary_feature(
            #         'F17=%d_%s' % (info.backoff_behavior_right, label))
            if use_language_model and not only_basic_features:
                self.add_binary_feature(
                    'F18=%d_%s'
                    % (info.source_highest_order_ngram_left, label)
                )
                self.add_binary_feature(
                    'F19=%d_%s'
                    % (info.source_highest_order_ngram_right, label)
                )
                self.add_binary_feature(
                    'F20=%d_%s' % (int(info.pseudo_reference), label)
                )
            if not only_basic_features:
                self.add_binary_feature(
                    'F21=%s_%s' % (info.target_pos, label)
                )
                self.add_binary_feature(
                    'F22=%s_%s' % (info.aligned_source_pos_list, label)
                )
            if use_unuseful_shared_task_features:
                self.add_binary_feature(
                    'F23=%d_%s' % (info.polysemy_count_source, label)
                )
                self.add_binary_feature(
                    'F24=%d_%s' % (info.polysemy_count_target, label)
                )

            # QUETCH linear model conjoined features.
            self.add_binary_feature(
                'G0=%s_%s_%s' % (info.token, info.left_context, label)
            )
            self.add_binary_feature(
                'G1=%s_%s_%s' % (info.token, info.right_context, label)
            )
            if not ignore_source:
                self.add_binary_feature(
                    'G2=%s_%s_%s'
                    % (info.token, info.first_aligned_token, label)
                )
                if not only_basic_features:
                    self.add_binary_feature(
                        'G3=%s_%s_%s'
                        % (
                            info.target_pos,
                            info.aligned_source_pos_list,
                            label,
                        )
                    )

            # Parse features.
            if use_parse_features:
                head_index, head_token, head_pos, head_morph = self.get_head(
                    sentence_word_features, index
                )
                head_on_left = True  # (head_index <= index)
                if head_index >= 0:
                    _, grandparent_token, grandparent_pos, _ = self.get_head(
                        sentence_word_features, head_index
                    )
                else:
                    grandparent_token, grandparent_pos = head_token, head_pos
                grandparent_on_left = True  # (grandparent_index <= index)
                left_sibling_token, left_sibling_pos, right_sibling_token, right_sibling_pos = self.get_siblings(  # NOQA
                    sentence_word_features, index
                )
                if use_deprel_features:
                    self.add_binary_feature(
                        'H0=%s_%s' % (info.target_deprel, label)
                    )
                    self.add_binary_feature(
                        'H1=%s_%s_%s'
                        % (info.token, info.target_deprel, label)
                    )
                if use_head_features:
                    # self.add_binary_feature(
                    #     'H2=%s_%s_%s' % (info.target_pos, head_pos, label))
                    # self.add_binary_feature(
                    #     'H3=%s_%s_%s' % (info.token, head_token, label))
                    self.add_binary_feature(
                        'H2=%s_%s_%d_%s'
                        % (info.target_pos, head_pos, int(head_on_left), label)
                    )
                    self.add_binary_feature(
                        'H3=%s_%s_%d_%s'
                        % (info.token, head_token, int(head_on_left), label)
                    )
                    self.add_binary_feature(
                        'H3a=%s_%s_%d_%s'
                        % (info.token, head_pos, int(head_on_left), label)
                    )
                    self.add_binary_feature(
                        'H3b=%s_%s_%d_%s'
                        % (
                            info.target_pos,
                            head_token,
                            int(head_on_left),
                            label,
                        )
                    )
                if use_morph_features:
                    self.add_binary_feature(
                        'H4=%s_%s' % (info.target_morph, label)
                    )
                    self.add_binary_feature(
                        'H5=%s_%s_%s' % (info.target_morph, head_morph, label)
                    )
                    if use_split_morphs:
                        all_morphs = info.target_morph.split('|')
                        all_head_morphs = head_morph.split('|')
                        for m in all_morphs:
                            self.add_binary_feature('H6=%s_%s' % (m, label))
                            for hm in all_head_morphs:
                                self.add_binary_feature(
                                    'H7=%s_%s_%s' % (m, hm, label)
                                )
                if use_sibling_features:
                    self.add_binary_feature(
                        'H8=%s_%s_%s'
                        % (info.target_pos, left_sibling_pos, label)
                    )
                    self.add_binary_feature(
                        'H9=%s_%s_%s'
                        % (info.token, left_sibling_token, label)
                    )
                    self.add_binary_feature(
                        'H10=%s_%s_%s'
                        % (info.target_pos, right_sibling_pos, label)
                    )
                    self.add_binary_feature(
                        'H11=%s_%s_%s'
                        % (info.token, right_sibling_token, label)
                    )
                if use_grandparent_features:
                    self.add_binary_feature(
                        'H12=%s_%s_%d_%s'
                        % (
                            info.target_pos,
                            grandparent_pos,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H13=%s_%s_%d_%s'
                        % (
                            info.token,
                            grandparent_token,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H14=%s_%s_%s_%d_%s'
                        % (
                            info.target_pos,
                            head_pos,
                            grandparent_pos,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H15=%s_%s_%s_%d_%s'
                        % (
                            info.token,
                            head_pos,
                            grandparent_token,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H16=%s_%s_%s_%d_%s'
                        % (
                            info.token,
                            head_token,
                            grandparent_pos,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H17=%s_%s_%s_%d_%s'
                        % (
                            info.target_pos,
                            head_token,
                            grandparent_token,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H18=%s_%s_%s_%d_%s'
                        % (
                            info.target_pos,
                            head_pos,
                            grandparent_token,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H19=%s_%s_%s_%d_%s'
                        % (
                            info.target_pos,
                            head_token,
                            grandparent_pos,
                            int(grandparent_on_left),
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'H20=%s_%s_%s_%d_%s'
                        % (
                            info.token,
                            head_pos,
                            grandparent_pos,
                            int(grandparent_on_left),
                            label,
                        )
                    )
            if use_stacked_features:
                if len(info.stacked_features) > 0:
                    for i, value in enumerate(info.stacked_features):
                        self.add_numeric_feature('S%d_%s' % (i, label), value)
        if self.save_to_cache:
            self.save_cached_features()
        return
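
    # Hedged usage sketch (`part` is a hypothetical unigram part carrying the
    # `index` and `label` attributes consumed above): the call populates this
    # sparse vector with the F*/G* (and optionally H*) features for one word.
    #
    #     features = LinearWordQEFeatures(use_basic_features_only=True)
    #     features.compute_unigram_features(sentence_word_features, part)
    #     # adds e.g. 'BIAS_<label>', 'F3=<token>_<label>',
    #     # 'G0=<token>_<left_context>_<label>', ...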

    def compute_bigram_features(self, sentence_word_features, part):
        """Compute bigram features (that depend on consecutive labels)."""
        if self.load_from_cache:
            self.load_cached_features()
            return
        index = part.index
        label = part.label
        previous_label = part.previous_label
        ignore_source = False
        only_basic_features = self.use_basic_features_only
        use_client_features = self.use_client_features
        use_parse_features = self.use_parse_features
        use_stacked_features = self.use_stacked_features  # False
        use_bias = True
        # True for using only a single bigram indicator feature.
        use_only_bias = self.use_simple_bigram_features
        use_language_model = True
        use_binary_features = False
        use_trigram_features = True
        if use_parse_features:
            use_split_morphs = False
            use_morph_features = False
            use_deprel_features = True
            use_head_features = False
            use_sibling_features = False
        else:
            use_split_morphs = False
            use_morph_features = False
            use_deprel_features = False
            use_head_features = False
            use_sibling_features = False

        if index < len(sentence_word_features):
            info = sentence_word_features[index]
        else:
            info = LinearWordQESentence.create_stop_symbol()
        if index > 0:
            info_previous = sentence_word_features[index - 1]
        else:
            info_previous = LinearWordQESentence.create_stop_symbol()

        bigram_label = str(previous_label) + '_' + str(label)
        if use_client_features:
            labels = [bigram_label, info.client_name + '_' + bigram_label]
        else:
            labels = [bigram_label]
        for label in labels:
            if use_bias:
                self.add_binary_feature('B1=%s' % label)
            if use_only_bias:
                continue
            self.add_binary_feature('B2=%s_%s' % (info.token, label))
            self.add_binary_feature('B3=%s_%s' % (info_previous.token, label))
            self.add_binary_feature('B4=%s_%s' % (info.right_context, label))
            self.add_binary_feature(
                'B5=%s_%s' % (info_previous.left_context, label)
            )
            if not ignore_source:
                self.add_binary_feature(
                    'B6=%s_%s' % (info.first_aligned_token, label)
                )
                self.add_binary_feature(
                    'B7=%s_%s' % (info.left_alignment, label)
                )
                self.add_binary_feature(
                    'B8=%s_%s' % (info.right_alignment, label)
                )
                self.add_binary_feature(
                    'B9=%s_%s' % (info_previous.first_aligned_token, label)
                )
                self.add_binary_feature(
                    'B10=%s_%s' % (info_previous.left_alignment, label)
                )
                self.add_binary_feature(
                    'B11=%s_%s' % (info_previous.right_alignment, label)
                )
            if use_binary_features and not only_basic_features:
                # Ablated for German WMT16 (the provided stoplist is wrong).
                # self.add_binary_feature(
                #     'B12=%d_%s' % (int(info.is_stopword), label))
                # self.add_binary_feature(
                #     'B13=%d_%s' % (int(info_previous.is_stopword), label))
                self.add_binary_feature(
                    'B14=%d_%s' % (int(info.is_punctuation), label)
                )
                self.add_binary_feature(
                    'B15=%d_%s' % (int(info_previous.is_punctuation), label)
                )
                # Ablated for German (capitalized words are nouns).
                # self.add_binary_feature(
                #     'B16=%d_%s' % (int(info.is_proper_noun), label))
                # self.add_binary_feature(
                #     'B17=%d_%s' % (int(info_previous.is_proper_noun),
                #                    label))
                self.add_binary_feature(
                    'B18=%d_%s' % (int(info.is_digit), label)
                )
                self.add_binary_feature(
                    'B19=%d_%s' % (int(info_previous.is_digit), label)
                )
            if use_language_model and not only_basic_features:
                self.add_binary_feature(
                    'B20=%d_%s' % (info.highest_order_ngram_left, label)
                )
                self.add_binary_feature(
                    'B21=%d_%s' % (info.highest_order_ngram_right, label)
                )
                self.add_binary_feature(
                    'B22=%d_%s'
                    % (info_previous.highest_order_ngram_left, label)
                )
                self.add_binary_feature(
                    'B23=%d_%s'
                    % (info_previous.highest_order_ngram_right, label)
                )
            # if use_language_model and not only_basic_features:
            #     self.add_binary_feature(
            #         'B24=%d_%s' % (info.backoff_behavior_left, label))
            #     self.add_binary_feature(
            #         'B25=%d_%s' % (info.backoff_behavior_middle, label))
            #     self.add_binary_feature(
            #         'B26=%d_%s' % (info.backoff_behavior_right, label))
            #     self.add_binary_feature(
            #         'B27=%d_%s' % (info_previous.backoff_behavior_left,
            #                        label))
            #     self.add_binary_feature(
            #         'B28=%d_%s' % (info_previous.backoff_behavior_middle,
            #                        label))
            #     self.add_binary_feature(
            #         'B29=%d_%s' % (info_previous.backoff_behavior_right,
            #                        label))
            if use_language_model and not only_basic_features:
                self.add_binary_feature(
                    'B30=%d_%s'
                    % (info.source_highest_order_ngram_left, label)
                )
                self.add_binary_feature(
                    'B31=%d_%s'
                    % (info.source_highest_order_ngram_right, label)
                )
                self.add_binary_feature(
                    'B33=%d_%s'
                    % (info_previous.source_highest_order_ngram_left, label)
                )
                self.add_binary_feature(
                    'B34=%d_%s'
                    % (info_previous.source_highest_order_ngram_right, label)
                )
            if not only_basic_features:
                self.add_binary_feature(
                    'B35=%s_%s' % (info.target_pos, label)
                )
                self.add_binary_feature(
                    'B36=%s_%s' % (info.aligned_source_pos_list, label)
                )
                self.add_binary_feature(
                    'B37=%s_%s' % (info_previous.target_pos, label)
                )
                self.add_binary_feature(
                    'B38=%s_%s'
                    % (info_previous.aligned_source_pos_list, label)
                )

            # Conjoined features.
            self.add_binary_feature(
                'C0=%s_%s_%s' % (info.token, info.left_context, label)
            )
            self.add_binary_feature(
                'C1=%s_%s_%s' % (info.token, info.right_context, label)
            )
            self.add_binary_feature(
                'C2=%s_%s_%s'
                % (info_previous.token, info_previous.left_context, label)
            )
            self.add_binary_feature(
                'C3=%s_%s_%s'
                % (info_previous.token, info_previous.right_context, label)
            )
            if use_trigram_features:
                self.add_binary_feature(
                    'D1=%s_%s_%s_%s'
                    % (
                        info_previous.left_context,
                        info_previous.token,
                        info.token,
                        label,
                    )
                )
                self.add_binary_feature(
                    'D2=%s_%s_%s_%s'
                    % (
                        info_previous.token,
                        info.token,
                        info.right_context,
                        label,
                    )
                )
            if not ignore_source:
                self.add_binary_feature(
                    'C4=%s_%s_%s'
                    % (info.token, info.first_aligned_token, label)
                )
                self.add_binary_feature(
                    'C5=%s_%s_%s'
                    % (
                        info_previous.token,
                        info_previous.first_aligned_token,
                        label,
                    )
                )
                if not only_basic_features:
                    self.add_binary_feature(
                        'C6=%s_%s_%s'
                        % (
                            info.target_pos,
                            info.aligned_source_pos_list,
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'C7=%s_%s_%s'
                        % (
                            info_previous.target_pos,
                            info_previous.aligned_source_pos_list,
                            label,
                        )
                    )

            # Parse features.
            if use_parse_features:
                head_index = info.target_head - 1
                previous_head_index = info_previous.target_head - 1
                if head_index >= 0:
                    head_info = sentence_word_features[head_index]
                    head_token = head_info.token
                    head_pos = head_info.target_pos
                    head_morph = head_info.target_morph
                elif head_index == -1:
                    head_info = None
                    head_token = '__ROOT__'
                    head_pos = '__ROOT__'
                    head_morph = '__ROOT__'
                else:
                    head_info = None
                    head_token = '__START__'
                    head_pos = '__START__'
                    head_morph = '__START__'
                if previous_head_index >= 0:
                    previous_head_info = sentence_word_features[
                        previous_head_index
                    ]
                    previous_head_token = previous_head_info.token
                    previous_head_pos = previous_head_info.target_pos
                    previous_head_morph = previous_head_info.target_morph
                elif previous_head_index == -1:
                    previous_head_info = None
                    previous_head_token = '__ROOT__'
                    previous_head_pos = '__ROOT__'
                    previous_head_morph = '__ROOT__'
                else:
                    previous_head_info = None
                    previous_head_token = '__START__'
                    previous_head_pos = '__START__'
                    previous_head_morph = '__START__'
                left_sibling_token, left_sibling_pos, right_sibling_token, right_sibling_pos = self.get_siblings(  # NOQA
                    sentence_word_features, index
                )
                previous_left_sibling_token, previous_left_sibling_pos, previous_right_sibling_token, previous_right_sibling_pos = self.get_siblings(  # NOQA
                    sentence_word_features, index - 1
                )
                if use_deprel_features:
                    self.add_binary_feature(
                        'D0=%s_%s' % (info_previous.target_deprel, label)
                    )
                    self.add_binary_feature(
                        'D1=%s_%s_%s'
                        % (
                            info_previous.token,
                            info_previous.target_deprel,
                            label,
                        )
                    )
                if use_head_features:
                    self.add_binary_feature(
                        'D2=%s_%s_%s'
                        % (info_previous.target_pos, previous_head_pos, label)
                    )
                    self.add_binary_feature(
                        'D3=%s_%s_%s'
                        % (info_previous.token, previous_head_token, label)
                    )
                if use_morph_features:
                    self.add_binary_feature(
                        'D4=%s_%s' % (info_previous.target_morph, label)
                    )
                    self.add_binary_feature(
                        'D5=%s_%s_%s'
                        % (
                            info_previous.target_morph,
                            previous_head_morph,
                            label,
                        )
                    )
                    if use_split_morphs:
                        all_morphs = info_previous.target_morph.split('|')
                        all_head_morphs = previous_head_morph.split('|')
                        for m in all_morphs:
                            self.add_binary_feature('D6=%s_%s' % (m, label))
                            for hm in all_head_morphs:
                                self.add_binary_feature(
                                    'D7=%s_%s_%s' % (m, hm, label)
                                )
                if use_sibling_features:
                    self.add_binary_feature(
                        'D8=%s_%s_%s'
                        % (
                            info_previous.target_pos,
                            previous_left_sibling_pos,
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'D9=%s_%s_%s'
                        % (
                            info_previous.token,
                            previous_left_sibling_token,
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'D10=%s_%s_%s'
                        % (
                            info_previous.target_pos,
                            previous_right_sibling_pos,
                            label,
                        )
                    )
                    self.add_binary_feature(
                        'D11=%s_%s_%s'
                        % (
                            info_previous.token,
                            previous_right_sibling_token,
                            label,
                        )
                    )
                if use_deprel_features:
                    self.add_binary_feature(
                        'E0=%s_%s' % (info.target_deprel, label)
                    )
                    self.add_binary_feature(
                        'E1=%s_%s_%s'
                        % (info.token, info.target_deprel, label)
                    )
                if use_head_features:
                    self.add_binary_feature(
                        'E2=%s_%s_%s' % (info.target_pos, head_pos, label)
                    )
                    self.add_binary_feature(
                        'E3=%s_%s_%s' % (info.token, head_token, label)
                    )
                if use_morph_features:
                    self.add_binary_feature(
                        'E4=%s_%s' % (info.target_morph, label)
                    )
                    self.add_binary_feature(
                        'E5=%s_%s_%s' % (info.target_morph, head_morph, label)
                    )
                    if use_split_morphs:
                        all_morphs = info.target_morph.split('|')
                        all_head_morphs = head_morph.split('|')
                        for m in all_morphs:
                            self.add_binary_feature('E6=%s_%s' % (m, label))
                            for hm in all_head_morphs:
                                self.add_binary_feature(
                                    'E7=%s_%s_%s' % (m, hm, label)
                                )
                if use_sibling_features:
                    self.add_binary_feature(
                        'E8=%s_%s_%s'
                        % (info.target_pos, left_sibling_pos, label)
                    )
                    self.add_binary_feature(
                        'E9=%s_%s_%s'
                        % (info.token, left_sibling_token, label)
                    )
                    self.add_binary_feature(
                        'E10=%s_%s_%s'
                        % (info.target_pos, right_sibling_pos, label)
                    )
                    self.add_binary_feature(
                        'E11=%s_%s_%s'
                        % (info.token, right_sibling_token, label)
                    )
            if use_stacked_features:
                if len(info.stacked_features) > 0:
                    for i, value in enumerate(info.stacked_features):
                        self.add_numeric_feature('Z%d_%s' % (i, label), value)
                if len(info_previous.stacked_features) > 0:
                    for i, value in enumerate(info_previous.stacked_features):
                        self.add_numeric_feature(
                            'ZZ%d_%s' % (i, label), value
                        )
        if self.save_to_cache:
            self.save_cached_features()
        return
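
    # Hedged usage sketch (`bigram_part` is a hypothetical part carrying the
    # `index`, `label`, and `previous_label` attributes consumed above): with
    # the default use_simple_bigram_features=True, only the single indicator
    # 'B1=<previous_label>_<label>' is emitted per bigram; setting it to
    # False enables the full B*/C*/D*/E* set.
    #
    #     features = LinearWordQEFeatures(use_simple_bigram_features=False)
    #     features.compute_bigram_features(
    #         sentence_word_features, bigram_part
    #     )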