Source code for kiwi.cli.models.linear

#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2019 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

import logging

from kiwi.cli.better_argparse import ModelParser
from kiwi.cli.opts import PathType
from kiwi.models.linear_word_qe_classifier import LinearWordQEClassifier

logger = logging.getLogger(__name__)

title = 'linear'


def _add_vocabulary_opts(parser):
    group = parser.add_argument_group('vocabulary options')
    group.add_argument(
        '--source-vocab-size',
        type=int,
        default=None,
        help='Size of the source vocabulary.',
    )
    group.add_argument(
        '--target-vocab-size',
        type=int,
        default=None,
        help='Size of the target vocabulary.',
    )
    group.add_argument(
        '--source-vocab-min-frequency',
        type=int,
        default=1,
        help='Min word frequency for source vocabulary.',
    )
    group.add_argument(
        '--target-vocab-min-frequency',
        type=int,
        default=1,
        help='Min word frequency for target vocabulary.',
    )


[docs]def add_training_data_file_opts(parser): # Data options group = parser.add_argument_group('data') group.add_argument( '--train-source', type=PathType(exists=True), help='Path to training source file', ) group.add_argument( '--train-target', type=PathType(exists=True), help='Path to training target file', ) group.add_argument( '--train-alignments', type=str, help='Path to train alignments between source and target.', ) group.add_argument( '--train-source-tags', type=PathType(exists=True), help='Path to validation label file for source (WMT18 format)', ) group.add_argument( '--train-target-tags', type=PathType(exists=True), help='Path to validation label file for target', ) group.add_argument( '--train-source-pos', type=PathType(exists=True), help='Path to training PoS tags file for source', ) group.add_argument( '--train-target-pos', type=PathType(exists=True), help='Path to training PoS tags file for target', ) group.add_argument( '--train-target-parse', type=PathType(exists=True), help='Path to training dependency parsing file for target (tabular ' 'format)', ) group.add_argument( '--train-target-ngram', type=PathType(exists=True), help='Path to training highest order ngram file for target (tabular ' 'format)', ) group.add_argument( '--train-target-stacked', type=PathType(exists=True), help='Path to training stacked predictions file for target (tabular ' 'format)', ) group = parser.add_argument_group('validation data') group.add_argument( '--valid-source', type=PathType(exists=True), # required=True, help='Path to validation source file', ) group.add_argument( '--valid-target', type=PathType(exists=True), # required=True, help='Path to validation target file', ) group.add_argument( '--valid-alignments', type=str, # required=True, help='Path to valid alignments between source and target.', ) group.add_argument( '--valid-source-tags', type=PathType(exists=True), help='Path to validation label file for source (WMT18 format)', ) group.add_argument( '--valid-target-tags', type=PathType(exists=True), help='Path to validation label file for target', ) group.add_argument( '--valid-source-pos', type=PathType(exists=True), help='Path to training PoS tags file for source', ) group.add_argument( '--valid-target-pos', type=PathType(exists=True), help='Path to training PoS tags file for target', ) group.add_argument( '--valid-target-parse', type=PathType(exists=True), help='Path to validation dependency parsing file for target (tabular ' 'format)', ) group.add_argument( '--valid-target-ngram', type=PathType(exists=True), help='Path to validation highest order ngram file for target (tabular ' 'format)', ) group.add_argument( '--valid-target-stacked', type=PathType(exists=True), help='Path to validation stacked predictions file for target (tabular ' 'format)', )
[docs]def add_predicting_data_file_opts(parser): # Data options group = parser.add_argument_group('data') group.add_argument( '--test-source', type=PathType(exists=True), required=True, help='Path to validation source file', ) group.add_argument( '--test-target', type=PathType(exists=True), required=True, help='Path to validation target file', ) group.add_argument( '--test-alignments', type=PathType(exists=True), help='Path to test alignments between source and target.', ) group.add_argument( '--test-source-pos', type=PathType(exists=True), help='Path to training PoS tags file for source', ) group.add_argument( '--test-target-pos', type=PathType(exists=True), help='Path to training PoS tags file for target', ) group.add_argument( '--test-target-parse', type=PathType(exists=True), help='Path to test dependency parsing file for target (tabular format)', ) group.add_argument( '--test-target-ngram', type=PathType(exists=True), help='Path to test highest order ngram file for target (tabular ' 'format)', ) # noqa group.add_argument( '--test-target-stacked', type=PathType(exists=True), help='Path to test stacked predictions file for target (tabular ' 'format)', ) # noqa return group
def _add_output_options(group): # Other options (used both at training and test time). group.add_argument( '--evaluation-metric', type=str, default='f1_mult', help='Evaluation metric (f1_mult or f1_bad).', )
[docs]def add_training_options(training_parser): add_training_data_file_opts(training_parser) _add_vocabulary_opts(training_parser) group = training_parser.add_argument_group( 'linear', description='Linear Quality Estimation' ) # Model options (training time). group.add_argument( '--use-basic-features-only', type=int, default=0, help='1 for using only basic features (words).', ) group.add_argument( '--use-bigrams', type=int, default=1, help='1 for using bigram features (i.e. a CRF-like model).', ) group.add_argument( '--use-simple-bigram-features', type=int, default=0, help='1 for using only label indicators as bigram features.', ) # Training options. group.add_argument( '--training-algorithm', type=str, default='svm_mira', help='Algorithm for training the model (svm_mira, svm_sgd, ' 'perceptron).', ) group.add_argument( '--regularization-constant', type=float, default=0.001, help='L2 regularization constant.', ) group.add_argument( '--cost-false-positives', type=float, default=0.2, help='Cost for false positives (svm_mira and svm_sgd only).', ) group.add_argument( '--cost-false-negatives', type=float, default=0.8, help='Cost for false negatives (svm_mira and svm_sgd only).', ) _add_output_options(group)
[docs]def add_predicting_options(predicting_parser): add_predicting_data_file_opts(predicting_parser) _add_output_options(predicting_parser)
[docs]def parser_for_pipeline(pipeline): if pipeline == 'train': return ModelParser( 'linear', 'train', title=LinearWordQEClassifier.title, options_fn=add_training_options, api_module=LinearWordQEClassifier, ) if pipeline == 'predict': return ModelParser( 'linear', 'predict', title=LinearWordQEClassifier.title, options_fn=add_predicting_options, api_module=LinearWordQEClassifier, ) return None