Logo Search packages:      
Sourcecode: apertium version File versions

tagger.cc

/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
/** PoS tagger main program.
 *
 *  @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
 */

#include <apertium/tagger.h>

#include <apertium/hmm.h>
#include <apertium/tagger_utils.h>
#include <apertium/tsx_reader.h>

#include <cstdio>
#include <fstream>
#include <string>
#include <libgen.h>
#include <locale.h>

#include <sys/types.h>
#include <unistd.h>
#include <getopt.h>
#include <apertium/string_utils.h>

using namespace Apertium;
using namespace std;

int
Tagger::getMode(int argc, char *argv[])
{
  int mode=UNKNOWN_MODE;
   
  int c;
  int option_index=0;

  while (true) {
    static struct option long_options[] =
    {
      {"train",      required_argument, 0, 't'},
      {"supervised", required_argument, 0, 's'},
      {"retrain",    required_argument, 0, 'r'},
      {"tagger",     no_argument,       0, 'g'},
      {"eval",       no_argument,       0, 'e'},
      {"first",      no_argument,       0, 'f'},
      {"help",       no_argument,       0, 'h'}, 
      {"debug",      no_argument,       0, 'd'}, 
      {0, 0, 0, 0}
    };

    c=getopt_long(argc, argv, "dt:s:r:gefh",long_options, &option_index);
    if (c==-1)
      break;
      
    switch (c)
    {
      case 'd':
        debug=true;
        break;

      case 't':  //Training
        if(!isNumber(optarg))
        {
        wcerr <<L"Error: mandatory --train argument <n> must be zero or a positive integer\n";
        help();
        }
        else 
      {
          nit = atoi(optarg); //Number of iterations
        }
        if(mode==UNKNOWN_MODE) 
        {
          mode=TRAIN_MODE;
        }
        else
        {
        wcerr<<L"Error: --train <n> argument cannot be mixed with --retrain or --tagger arguments\n";
        help();
      }
      break;
      
      case 's':
        if(!isNumber(optarg))
        {
        wcerr<<L"Error: mandatory --supervised argument <n> must be zero or a positive integer\n";
        help();
        }
        else 
      {
          nit = atoi(optarg); //Number of iterations
        }

        if(mode==UNKNOWN_MODE) 
        {
          mode=TRAIN_SUPERVISED_MODE;
        }
        else
        {
        wcerr<<L"Error: --supervised optional argument should only appear after --train <n> argument\n";
        help();
      }
      break;
      
      case 'r':
        if(!isNumber(optarg))
        {
        wcerr<<L"Error: mandatory --train argument <n> must be zero or a positive integer\n";
          help();
        } 
        else
        { 
        nit = atoi(optarg); //Number of iterations
        }
        if(mode==UNKNOWN_MODE) 
        {
          mode=RETRAIN_MODE; 
        }
        else
        {
        wcerr<<L"Error: --retrain argument cannot be mixed with --train or --tagger arguments\n";
        help();
      }
        break;
        
      case 'g': 
        if(mode==UNKNOWN_MODE)
        {
        mode=TAGGER_MODE;
        }
        else 
        {
          wcerr<<L"Error: --tagger argument cannot be mixed with --train or --retrain arguments\n";
          help();
        }
        break;
         
      case 'e':
        if(mode==TAGGER_MODE)
        {
        mode=TAGGER_EVAL_MODE;
        }
        else
        {
          wcerr<<L"Error: --eval optional argument should only appear after --tagger argument\n";
        help();
      }
      break;
        
      case 'f': 
        if(mode==TAGGER_MODE)
        {
          mode=TAGGER_FIRST_MODE;
        }
        else
        {
          wcerr<<L"Error: --first optional argument should only appear after --tagger argument\n";
        help();
      } 
      break;
        
      case 'h':
        help(); 
        break;
     
      default:
        //wcerr<<L"Error: getopt() returned the char code '"<<c<<L"'\n";
        help();
        break;
    }    
  }

  if(mode==UNKNOWN_MODE)
  {
    wcerr<<L"Error: Arguments missing\n";
    help();
  }   
  
  switch(argc-optind)
  {
    case 6:
      if(mode != TRAIN_SUPERVISED_MODE)
      {
        help();
      }
      break;
    
    case 4:
      if(mode != TRAIN_MODE)
      {
        help();
      }
      break;
    case 3:
      if ((mode != TAGGER_MODE) && (mode != TAGGER_FIRST_MODE))
      {
        help();
      }
      break;
      
    case 2:
      if(mode != RETRAIN_MODE && mode != TAGGER_MODE)
      {
        help();
      }
      break;
    
    case 1:
      if ((mode != TAGGER_MODE) && (mode != TAGGER_FIRST_MODE))
      {
        help();
      }
      break;
    
    default:
      help();
      break;
  }

  for(int i = optind; i != argc; i++)
  {
    filenames.push_back(argv[i]);
  }
  
  return mode;
}

/**
 * Creates a tagger front-end with debugging turned off; it stays off
 * unless getMode() sees the -d/--debug option.
 */
Tagger::Tagger()
{
  this->debug = false;
}

/**
 * Runs the program: remembers the program name, parses the command
 * line through getMode() and dispatches to the routine implementing
 * the selected working mode.
 *
 * @param argc argument count, as received by the program
 * @param argv argument vector, as received by the program
 */
void
Tagger::main(int argc, char *argv[])
{
  name = argv[0];
  const int selected = getMode(argc, argv);

  if(selected == TRAIN_MODE)
  {
    train();
  }
  else if(selected == TRAIN_SUPERVISED_MODE)
  {
    trainSupervised();
  }
  else if(selected == RETRAIN_MODE)
  {
    retrain();
  }
  else if(selected == TAGGER_MODE)
  {
    tagger();
  }
  else if(selected == TAGGER_FIRST_MODE)
  {
    tagger(true);
  }
  else
  {
    // Any other value has no routine associated with it.
    wcerr<<L"Error: Unknown working mode mode\n";
    help();
  }
}

void
Tagger::tagger(bool mode_first)
{
  FILE *ftdata = fopen(filenames[0].c_str(), "rb");
  if (!ftdata) {
    filerror(filenames[0]);
  }

  TaggerData td;
  td.read(ftdata);
  fclose(ftdata);
  
  HMM hmm(&td);
  
  if(filenames.size() == 1)
  {
    hmm.tagger(stdin, stdout, mode_first);
  }
  else
  {
    FILE *finput = fopen(filenames[1].c_str(), "r");
    if (!finput) {
      filerror(filenames[1]);
    }
    if(filenames.size() == 2)
    {
      hmm.tagger(finput, stdout, mode_first);
    }
    else
    {
      FILE *foutput = fopen(filenames[2].c_str(), "w");
      if (!foutput) {
        filerror(filenames[2]);
      }

      hmm.tagger(finput, foutput, mode_first);
      fclose(foutput);
    }
    fclose(finput);
  }
}

/**
 * Reports that a file could not be opened and then calls help(), which
 * prints the usage text and terminates the program.
 *
 * @param filename text placed between the quotes of the error message;
 *                 normally the name of the file that failed to open
 */
void
Tagger::filerror(string const &filename)
{
  // Print the name that was passed in; the previous code always printed
  // filenames[0], whichever file had actually failed to open.
  cerr << "Error: cannot open file '" << filename << "'\n\n";
  help();
}

void
Tagger::train()
{
  TSXReader treader;
  treader.read(filenames[2]);
  HMM hmm(&(treader.getTaggerData()));
  hmm.set_debug(debug);
  hmm.set_eos((treader.getTaggerData().getTagIndex())[L"TAG_SENT"]);
  TaggerWord::setArrayTags(treader.getTaggerData().getArrayTags());
  
  wcerr << L"Calculating ambiguity classes...\n";
  FILE *fdic = fopen(filenames[0].c_str(), "r");
  if(fdic)
  {
    hmm.read_dictionary(fdic);
  }
  else
  {
    filerror(filenames[0]);
  }
  wcerr << L"Kupiec's initialization of transition and emission probabilities...\n";
  FILE *fcrp = fopen(filenames[1].c_str(), "r");
  if(fcrp)
  {
    hmm.init_probabilities_kupiec(fcrp);               
  }
  else
  {
    filerror(filenames[1]);
  }
  
  wcerr << L"Applying forbid and enforce rules...\n";
  hmm.apply_rules();
  
  wcerr << L"Training (Baum-Welch)...\n";
  for(int i=0; i != nit; i++)
  {
    fseek(fcrp, 0, SEEK_SET);
    hmm.train(fcrp);
  }
  wcerr << L"Applying forbid and enforce rules...\n";
  hmm.apply_rules();

  fclose(fdic);
  fclose(fcrp);
  treader.write(filenames[3]);
}

void
Tagger::trainSupervised()
{
  TSXReader treader;
  treader.read(filenames[2]);
  HMM hmm(&(treader.getTaggerData()));
  hmm.set_debug(debug);
  hmm.set_eos((treader.getTaggerData().getTagIndex())[L"TAG_SENT"]);
  TaggerWord::setArrayTags(treader.getTaggerData().getArrayTags());
  
  wcerr << L"Calculating ambiguity classes...\n";
  FILE *fdic = fopen(filenames[0].c_str(), "r");
  if(fdic)
  {
    hmm.read_dictionary(fdic);
  }
  else
  {
    filerror(filenames[0]);
  }
  wcerr << L"Kupiec's initialization of transition and emission probabilities...\n";
  FILE *ftagged = fopen(filenames[4].c_str(), "r");
  FILE *funtagged = fopen(filenames[5].c_str(), "r");
  if(ftagged && funtagged)
  {
    wcerr << L"Initializing transition and emission probabilities from a hand-tagged corpus...\n";
    hmm.init_probabilities_from_tagged_text(ftagged, funtagged);
  }
  else
  {
    filerror(filenames[4]+ "' or '" + filenames[5]);
  }
  fclose(ftagged);
  fclose(funtagged);
  
  wcerr << L"Applying forbid and enforce rules...\n";
  hmm.apply_rules();
  
  wcerr << L"Training (Baum-Welch)...\n";
  FILE *fcrp = fopen(filenames[1].c_str(), "r");
  if(fcrp)
  {
    for(int i=0; i != nit; i++)
    {
      fseek(fcrp, 0, SEEK_SET);
      hmm.train(fcrp);
    }
    wcerr << L"Applying forbid and enforce rules...\n";
    hmm.apply_rules();
  }
  else
  {
    filerror(filenames[1]);
  }

  fclose(fdic);
  fclose(fcrp);
  treader.write(filenames[3]);
}

void
Tagger::retrain()
{
  TaggerData td;
  FILE *ftdata = fopen(filenames[1].c_str(), "rb");
  if(!ftdata)
  {
    filerror(filenames[1]);
  }
  td.read(ftdata);
  fclose(ftdata);

  HMM hmm(&td);
  hmm.set_debug(debug);
  hmm.set_eos((td.getTagIndex())[L"TAG_SENT"]);
  TaggerWord::setArrayTags(td.getArrayTags());

  FILE *fcrp = fopen(filenames[0].c_str(), "r");
  if(!fcrp)
  {
    filerror(filenames[0]);
  }
  wcerr << L"Training (Baum-Welch)...\n";
  for(int i=0; i != nit; i++)
  {
    fseek(fcrp, 0, SEEK_SET);
    hmm.train(fcrp);
  }
  wcerr << L"Applying forbid and enforce rules...\n";
  hmm.apply_rules();
  fclose(fcrp);

  ftdata = fopen(filenames[1].c_str(), "wb");
  if(!ftdata)
  {
    filerror(filenames[1]);
  }
  td.write(ftdata);
  fclose(ftdata);
}

void
Tagger::help()
{
  ostream &out = cerr;
  char localname[name.size()+1];
  strcpy(localname, name.c_str());
  out << basename(localname) << ": HMM part-of-speech tagging and training program" << endl;
  out << "GENERIC USAGE: " << basename(localname) << "[-d] <OPTION>=[PARAM] [FILES]" << endl;
  out << "USAGE: " << basename(localname) << "[-d] -t=n DIC CRP TSX TAGGER_DATA" << endl;
  out << "       " << basename(localname) << "[-d] -s=n DIC CRP TSX TAGGER_DATA HTAG UNTAG" << endl;
  out << "       " << basename(localname) << "[-d] -r=n CRP TAGGER_DATA" << endl;
  out << "       " << basename(localname) << "[-d] -g [-f] TAGGER_DATA [INPUT [OUTPUT]]" << endl;
  out << endl;
  out << "Where OPTIONS are:" << endl;
  out << "  -t, --train=n:       performs n iterations of the Baum-Welch training" << endl;
  out << "                       algorithm (unsupervised)" << endl;
  out << "  -s, --supervised=n:  initializes parameters against a hand-tagged text" << endl;
  out << "                       (supervised), and trains it with n iterations" << endl;
  out << "  -r, --retrain=n:     retrains the model with n aditional Baum-Welch" << endl;
  out << "                       iterations (unsupervised)" << endl;
  out << "  -g, --tagger:        tags input text by means of Viterbi algorithm" << endl;
  out << "  -f, --first:         used if conjuntion with -g (--tagger) makes the tagger"<< endl;
  out << "                       to give all lexical forms of each word, being the choosen" << endl;
  out << "                       one in the first place (after the lemma)"<<endl;
  out << "  -d, --debug:         print error mesages when tagging input text" << endl;
  out << endl;
  out << "And FILES are:" << endl;          
  out << "  DIC:         full expanded dictionary file" << endl;
  out << "  CRP:         training text corpus file" << endl;
  out << "  TSX:         tagger specification file, in XML format" << endl;
  out << "  TAGGER_DATA: tagger data file, built in the training and used while" << endl;
  out << "               tagging" << endl;
  out << "  HTAG:        hand-tagged text corpus" << endl;
  out << "  UNTAG:       untagged text corpus, morphological analysis of HTAG" << endl;
  out << "               corpus to use both jointly with -s option" << endl; 
  out << "  INPUT:       input file, stdin by default" << endl;
  out << "  OUTPUT:      output file, stdout by default" << endl;
  exit(EXIT_FAILURE);
}

/**
 * Tells whether a C string is a non-negative decimal integer, i.e. it
 * is not empty and consists only of digits.
 *
 * @param str NUL-terminated string to check; a null pointer is treated
 *            as "not a number"
 * @return true if str holds at least one character and every character
 *         is a decimal digit, false otherwise
 */
bool
Tagger::isNumber(const char *str)
{
  // An absent or empty argument is not a number (it used to be accepted
  // and was then read as 0 by atoi()).
  if(str == 0 || *str == '\0')
  {
    return false;
  }

  for(const char *p = str; *p != '\0'; p++)
  {
    // isdigit() requires a value representable as unsigned char; a plain
    // char may be negative for non-ASCII bytes.
    if(!isdigit(static_cast<unsigned char>(*p)))
    {
      return false;
    }
  }

  return true;
}

Generated by  Doxygen 1.6.0   Back to index