Source code for kiwi.models.linear.sparse_feature_vector

"""This defines the class for defining sparse features in linear models."""

#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

from .sparse_vector import SparseVector


class SparseFeatureVector(SparseVector):
    """A generic class for a sparse feature vector.

    Features are stored as ``name -> value`` entries using the item
    interface (``self[name] = value``, ``name in self``, ``len(self)``,
    iteration over names) inherited from ``SparseVector``.

    Args:
        save_to_cache: Flag stored on the instance as ``save_to_cache``.
            It is not consulted by this class itself; presumably the
            caller uses it to decide when to call
            ``save_cached_features`` (verify against callers).
        load_from_cache: Flag stored on the instance as
            ``load_from_cache``; likewise not consulted by this class.
        cached_features_file: Open text file-like object. It is written
            to with ``write(str)`` by ``save_cached_features`` and read
            line by line by ``load_cached_features``.
    """

    def __init__(
        self,
        save_to_cache=False,
        load_from_cache=False,
        cached_features_file=None,
    ):
        SparseVector.__init__(self)
        self.cached_features_file = cached_features_file
        self.save_to_cache = save_to_cache
        self.load_from_cache = load_from_cache

    def add_categorical_feature(self, name, value, allow_duplicates=False):
        """Add a categorical feature, represented internally as a binary
        feature.

        The feature is stored under the key ``name + "=" + value`` with
        value ``1.0``.

        Args:
            name: Feature name (string).
            value: Categorical value (string; it is concatenated to
                ``name``).
            allow_duplicates: If False (default), adding a feature whose
                combined key is already present trips an assertion.

        Raises:
            AssertionError: If the key already exists and
                ``allow_duplicates`` is False (only when assertions are
                enabled, i.e. not under ``python -O``).
        """
        fname = name + "=" + value
        assert allow_duplicates or fname not in self, (
            "duplicate categorical feature: " + fname
        )
        self[fname] = 1.0

    def add_binary_feature(self, name):
        """Add a binary feature.

        Sets ``self[name]`` to ``1.0`` unless ``name`` is already
        present, in which case the existing value is left untouched.
        """
        if name in self:
            return
        self[name] = 1.0

    def add_numeric_feature(self, name, value):
        """Add a numeric feature.

        Stores ``value`` under ``name``, overwriting any existing entry.
        """
        self[name] = value

    def save_cached_features(self):
        """Save features to file.

        Writes to ``self.cached_features_file`` one line holding the
        number of features, followed by one ``key<TAB>value`` line per
        feature. Keys must be strings.
        """
        out = self.cached_features_file
        out.write(str(len(self)) + '\n')
        for key in self:
            out.write(key + '\t' + str(self[key]) + '\n')

    def load_cached_features(self):
        """Load features from file.

        Reads the record format produced by ``save_cached_features``
        from ``self.cached_features_file``: a count line followed by
        that many ``key<TAB>value`` lines. Loaded entries are stored
        into this vector with values converted to ``float``.

        Raises:
            StopIteration: If the file ends before the announced number
                of features has been read.
            ValueError: If the count or a feature value is malformed.
        """
        source = self.cached_features_file
        # The built-in next() works on any iterator; file objects have
        # no ``.next()`` method in Python 3.
        num_features = int(next(source))
        for _ in range(num_features):
            # Split on the last tab only, so that a tab inside the
            # feature name does not break the record.
            key, value = next(source).rstrip('\n').rsplit('\t', 1)
            self[key] = float(value)
class SparseBinaryFeatureVector(list):
    """A generic class for a sparse binary feature vector.

    The vector is a list of integer feature indices; a feature is
    considered active when its index appears in the list.

    Args:
        feature_indices: Object mapping feature names to integer
            indices. It must provide ``get(name, default)`` and
            ``add(name)``, where ``add`` returns the index assigned to
            ``name``. Required by ``add_binary_feature`` and
            ``add_categorical_feature``.
        save_to_cache: Flag stored on the instance as ``save_to_cache``;
            not consulted by this class itself.
        load_from_cache: Flag stored on the instance as
            ``load_from_cache``; not consulted by this class itself.
        cached_features_file: Open text file-like object used by
            ``save_cached_features`` (``write``) and
            ``load_cached_features`` (line iteration).
    """

    def __init__(
        self,
        feature_indices=None,
        save_to_cache=False,
        load_from_cache=False,
        cached_features_file=None,
    ):
        list.__init__(self)
        self.feature_indices = feature_indices
        self.cached_features_file = cached_features_file
        self.save_to_cache = save_to_cache
        self.load_from_cache = load_from_cache

    def add_categorical_feature(self, name, value):
        """Add a categorical feature, represented internally as a binary
        feature.

        The binary feature name is ``name + "=" + value``; both
        arguments must be strings.
        """
        fname = name + "=" + value
        self.add_binary_feature(fname)

    def add_binary_feature(self, name):
        """Add a binary feature.

        Looks ``name`` up in ``self.feature_indices``, registering it
        through ``feature_indices.add`` when the lookup yields a
        negative index, and appends the resulting index to this vector.
        The index is appended even if it is already present.
        """
        index = self.feature_indices.get(name, -1)
        if index < 0:
            # Unknown feature: register it and use the returned index.
            index = self.feature_indices.add(name)
        self.append(index)

    def to_sparse_vector(self):
        """Convert to a SparseVector.

        Returns:
            A new ``SparseVector`` with value ``1.0`` for every index
            contained in this vector.
        """
        vector = SparseVector()
        for index in self:
            vector[index] = 1.0
        return vector

    def save_cached_features(self):
        """Save features to file.

        Writes all indices to ``self.cached_features_file`` as a single
        tab-separated line. An empty vector is written as an empty line.
        """
        self.cached_features_file.write(
            '\t'.join(str(key) for key in self) + '\n'
        )

    def load_cached_features(self):
        """Load features from file.

        Reads one line from ``self.cached_features_file`` and replaces
        the contents of this vector with the tab-separated integers it
        holds. An empty line yields an empty vector.

        Raises:
            StopIteration: If the file has no more lines.
            ValueError: If a field is not a valid integer.
        """
        # The built-in next() works on any iterator; file objects have
        # no ``.next()`` method in Python 3.
        line = next(self.cached_features_file).rstrip('\n')
        # ''.split('\t') would give [''], and int('') raises, so an
        # empty record has to be handled explicitly.
        self[:] = [int(key) for key in line.split('\t')] if line else []