// Source code: apertium
//
// MorphoStream.C

/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
/** 
 *  Word class and MorphoStream class definitions
 *
 *  @author Felipe Sánchez-Martínez 
 */

#include <apertium/MorphoStream.H>
#include <vector>


00029 MorphoStream::MorphoStream(FILE *ftxt, bool d, ConstantManager const &cm,
                           map<string, int> const &ti, vector<string> const &pr) {
  flexx = new LexWrapper(ftxt, stdout);
  foundEOF = false;
  debug=d;
  tags_index = ti;
  constants = cm;
  prefer_rules = pr;
}

00039 MorphoStream::~MorphoStream() 
{
//  delete flexx;
}

/**
 * Returns the next word of the input.
 *
 * Words left in the vwords buffer by a previous call are handed out first.
 * Otherwise tokens are pulled from the lexer until either the word-closing
 * token (kDOLLAR) or the end-of-file tag is seen. When '+' is found while a
 * word is being read, one TaggerWord per component is buffered in vwords;
 * this call returns the first of them and later calls return the rest.
 *
 * Every TaggerWord is allocated with new in this function and is never
 * deleted here.
 *
 * @return the first word of vwords (removed from the buffer). The word
 *         returned when the end of file is reached carries the TAG_kEOF tag.
 *         NULL when the buffer is empty and the end of file was already
 *         found by an earlier call.
 */
TaggerWord*
MorphoStream::get_next_word() {
  // If there are words in the vwords buffer, the next word of the buffer is
  // returned without reading any input.
  if (!vwords.empty()) {
    TaggerWord* buffered = vwords.front();
    vwords.erase(vwords.begin());
    return buffered;
  }

  if (foundEOF)
    return NULL;

  // NOTE: Constants k???? are defined in tags.h (generated by tag2flex.awk).
  // They are looked up once per call instead of once per token.
  // NOTE(review): assumes getConstant() is a plain lookup without side
  // effects -- confirm against ConstantManager.
  const TTag tag_begin   = constants.getConstant("kBEGIN");
  const TTag tag_dollar  = constants.getConstant("kDOLLAR");
  const TTag tag_mot     = constants.getConstant("kMOT");
  const TTag tag_barra   = constants.getConstant("kBARRA");
  const TTag tag_mas     = constants.getConstant("kMAS");
  const TTag tag_ignorar = constants.getConstant("kIGNORAR");
  const TTag tag_unknown = constants.getConstant("kUNKNOWN");
  const TTag tag_eof     = tags_index["TAG_kEOF"];
  const TTag tag_undef   = tags_index["TAG_kUNDEF"];

  TTag tag = flexx->yylex();

  vwords.push_back(new TaggerWord());
  int ivwords = 0;         // Index of the word of vwords being filled
  bool read_word = false;  // True between kBEGIN and kDOLLAR

  for (;;) {
    // The pointer stays valid even if vwords grows below: only the vector's
    // own storage is reallocated, not the TaggerWord objects.
    TaggerWord* const current = vwords[ivwords];

    if (tag == tag_begin) {
      // A new word has begun
      read_word = true;
    }
    else if (tag == tag_dollar) {
      // The word is finished
      break;
    }
    else if (tag == tag_mot) {
      if (read_word)
        current->set_superficial_form(flexx->yytext());
      else {
        cerr<<"Warning (internal): kMOT tag was returned while not reading a word\n";
        cerr<<"String read: "<<flexx->yytext()<<"\n";
        cerr<<"Debug: "<<flexx->yytext()<<"\n";
        current->add_ignored_string(flexx->yytext());
      }
    }
    else if (tag == tag_barra) {
      if (!read_word)  // The '/' was in the text, it is not a delimiter
        current->add_ignored_string(flexx->yytext());
      else             // Delimiter: go back to the first word of vwords
        ivwords = 0;
    }
    else if (tag == tag_mas) {
      if (!read_word)  // The '+' was in the text, it is not a delimiter
        current->add_ignored_string(flexx->yytext());
      else {
        // This is a multiword unit, we read the multiword completely.
        // A new buffered word is created only if vwords does not already
        // hold one at the next position.
        current->set_plus_cut(true);
        if (((int)vwords.size()) <= (ivwords+1))
          vwords.push_back(new TaggerWord(true));
        ivwords++;
      }
    }
    else if (tag == tag_ignorar) {
      // This text must be ignored because it is a "[" whatever "]"
      current->add_ignored_string(flexx->yytext());
      if (read_word) {
        cerr<<"Warning (internal): kIGNORE was returned while reading a word\n";
        cerr<<"Word being read: "<<current->get_superficial_form()<<"\n";
        cerr<<"Debug: "<<flexx->yytext()<<"\n";
      }
    }
    else if (tag == tag_eof) {
      // End of input: the word being built is tagged as EOF and later calls
      // return NULL once the buffer is drained.
      current->add_tag(tag, "", prefer_rules);
      foundEOF = true;
      break;
    }
    else if (tag == tag_unknown) {
      if (!read_word) {
        cerr<<"Warning (internal): An unknown tag was returned while not reading a word\n";
        cerr<<"Word being read: "<<current->get_superficial_form()<<"\n";
        cerr<<"Unknown tag: "<< flexx->yytext() <<"\n";
      }
    }
    else if (tag == tag_undef) {
      if (read_word) {
        if (debug) {
          cerr<<"Warning: There is not coarse tag for the fine tag '"<< flexx->yytext() <<"'\n";
          cerr<<"         This is because of an incomplete tagset definition or a dictionary error\n";
        }
        current->add_tag(tag, flexx->yytext(), prefer_rules);
      } else {
        cerr<<"Warning (internal): An undef tag was returned while not reading a word\n";
        current->add_ignored_string(flexx->yytext());
      }
    }
    else {
      // Any other value is a regular tag
      if (read_word)
        current->add_tag(tag, flexx->yytext(), prefer_rules);
      else {
        cerr<<"Warning (internal): A common tag was returned while not reading a word\n";
        current->add_ignored_string(flexx->yytext());
      }
    }

    tag = flexx->yylex();
  }

  /* Test of consistency in case of multiword units.
     There can only be one ambiguous word between all words of the
     multiword unit
   */
  if (vwords.size() > 1) {
    int nambiguous = 0;
    vector<TaggerWord*>::iterator it;
    for (it = vwords.begin(); it != vwords.end(); ++it) {
      if ((*it)->get_tags().size() > 1)
        nambiguous++;
    }

    if (nambiguous > 1) {
      string errors;
      errors = "A multiword unit with more than one ambiguous word was found.\n";
      errors+= "This is a very big problem not considered when implementing the tagger.\n\n";
      errors+= "Word: "+vwords.front()->get_superficial_form()+"\n";
      cerr<<"Error: "<<errors;
    }
  }
  /* End of test of consistency
   */

  // Return the first word of the vwords vector
  TaggerWord* word = vwords.front();
  vwords.erase(vwords.begin());
  return word;
}

/**
 * Reads tokens from the lexer until one is found that is not kIGNORAR,
 * kMAS or kBARRA, and returns it.
 *
 * The text of every token read, skipped ones included, is stored in
 * last_string_tag, so after the call it holds the text of the returned tag.
 *
 * @return the first tag read that is not one of the three skipped constants
 */
TTag
MorphoStream::get_next_tag() {
  for (;;) {
    TTag const current = flexx->yylex();
    last_string_tag = string(flexx->yytext());

    bool const skipped = current == constants.getConstant("kIGNORAR") ||
                         current == constants.getConstant("kMAS") ||
                         current == constants.getConstant("kBARRA");
    if (!skipped)
      return current;
  }
}

/**
 * Returns a copy of last_string_tag, the token text stored by the most
 * recent get_next_tag() read.
 */
string
MorphoStream::get_string_last_tag()
{
  return last_string_tag;
}

// Generated by Doxygen 1.6.0