Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM2::tagger ( FILE *  in,
FILE *  out 
)

Tagging algorithm (Viterbi implementation).

Parameters:
in  — the input stream with the untagged text to tag
out — the output stream to which the tagged text is written

Definition at line 691 of file HMM2.C.

References find_similar_ambiguity_class(), MorphoStream2::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), and Collection::has_not().

{
  // Viterbi decoding of the word stream read from `in`; the chosen lexical
  // form of every word is written to `out`.
  //
  // Words are buffered in `wpend` while they are ambiguous. As soon as a word
  // with exactly one candidate tag arrives, the best tag path ending in that
  // tag is emitted for all buffered words and the search restarts from that
  // tag with probability 1.
  //
  // Only two "time steps" of the trellis are kept: the slot for the current
  // word is (number of buffered words) % 2, the other slot is the previous
  // word's.
  const int N = td->getN();

  // Rolling trellis buffers. std::vector is used instead of runtime-sized
  // built-in arrays: variable-length arrays are not ISO C++, and a VLA of a
  // class type such as vector<TTag> is rejected by some compilers.
  vector<vector<double> > alpha(2, vector<double>(N, 0.0));   // path probabilities
  vector<vector<vector<TTag> > > best(2, vector<vector<TTag> >(N)); // best tag path per tag

  vector<TaggerWord> wpend; // words read but not yet written

  MorphoStream2 morpho_stream(in, debug, td);

  Collection &output = td->getOutput();

  set<TTag> tags;    // candidate tags of the current word
  set<TTag> pretags; // candidate tags of the previous word

  // Initialization: the search starts from the `eos` tag with probability 1.
  tags.insert(eos);
  alpha[0][eos] = 1;

  TaggerWord *word = morpho_stream.get_next_word();

  while (word) {
    wpend.push_back(*word);
    const int nwpend = wpend.size();
    const int cur = nwpend % 2; // trellis slot of the current word
    const int prev = 1 - cur;   // trellis slot of the previous word

    pretags = tags; // Tags from the previous word

    tags = word->get_tags();

    if (tags.size() == 0) // This is an unknown word
      tags = td->getOpenClass();

    if (output.has_not(tags)) { // Found an ambiguity class unknown so far
      if (debug) {
        string errors;
        errors = "A new ambiguity class was found. \n";
        errors += "Retraining the tagger is necessary so as to take it into account.\n";
        errors += "Word '" + word->get_superficial_form() + "'.\n";
        errors += "New ambiguity class: " + word->get_string_tags() + "\n";
        cerr << "Error: " << errors;
      }
      tags = find_similar_ambiguity_class(tags);
    }

    const int k = output[tags]; // Ambiguity class the word belongs to

    // Reset the current slot through the same helpers as before; they receive
    // a pointer to the first of N contiguous elements.
    clear_array_double(&alpha[cur][0], N);
    clear_array_vector(&best[cur][0], N);

    // Induction
    for (set<TTag>::iterator itag = tags.begin(); itag != tags.end(); ++itag) { // For every tag of the current word
      const int i = *itag;
      for (set<TTag>::iterator jtag = pretags.begin(); jtag != pretags.end(); ++jtag) { // For every tag of the previous word
        const int j = *jtag;
        const double x = alpha[prev][j] * (td->getA())[j][i] * (td->getB())[i][k];
        // "<=" (not "<"): a path is recorded even when every candidate has
        // probability 0, and on ties the later predecessor wins.
        if (alpha[cur][i] <= x) {
          if (nwpend > 1)
            best[cur][i] = best[prev][j];
          best[cur][i].push_back(i);
          alpha[cur][i] = x;
        }
      }
    }

    // Backtracking: an unambiguous word fixes the path for everything buffered.
    if (tags.size() == 1) {
      TTag tag = *tags.begin();

      // Written as !(p > 0) so that a NaN probability is reported as well.
      if (!(alpha[cur][tag] > 0) && debug)
        cerr << "Problem with word '" << word->get_superficial_form() << "' " << word->get_string_tags() << "\n";

      vector<TTag> &path = best[cur][tag];
      for (unsigned t = 0; t < path.size(); t++) {
        string const &micad = wpend[t].get_lexical_form(path[t], (td->getTagIndex())["TAG_kEOF"]);
        fwrite_unlocked(micad.c_str(), sizeof(char), micad.size(), out); // For efficiency
      }

      // Return to the initial state: after the clear, the next word is word
      // number 1 again and therefore reads slot 0 as its "previous" slot.
      wpend.clear();
      alpha[0][tag] = 1;
    }

    delete word;
    word = morpho_stream.get_next_word();
  }

  // Words still buffered here were never written to `out`.
  if ((tags.size() > 1) && (debug)) {
    string errors;
    errors = "The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n";
    errors += "This message should never appears. If you are reading this ..... these are very bad news.\n";
    cerr << "Error: " << errors;
  }
}


Generated by  Doxygen 1.6.0   Back to index