Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::tagger ( FILE *  in,
FILE *  out,
bool  show_all_good_first = false 
)

Tagging algorithm (Viterbi implementation).

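Parameters:
in – the input stream with the untagged text to tag
out – the output stream to which the tagged text is written
show_all_good_first – optional, defaults to false; when true each word is written using TaggerWord::get_all_chosen_tag_first() instead of TaggerWord::get_lexical_form()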

Definition at line 719 of file hmm.cc.

References find_similar_ambiguity_class(), MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), and Collection::has_not().

                                                         {
  // Viterbi decoding of the word stream read from `in`; the chosen analysis of
  // every word is written to `out`.
  //
  // Only two trellis columns are kept. They are addressed through the parity
  // of the number of pending words: column `cur` belongs to the word being
  // processed and column `prev` to the word before it. alpha[c][t] is the
  // score of the best path ending in tag t, best[c][t] is that path.
  int N = td->getN();
  double alpha[2][N];
  vector<TTag> best[2][N];

  // Words read since the last unambiguous word; they are printed (and the
  // list emptied) as soon as a word with a single tag is reached.
  vector<TaggerWord> wpend;
  int nwpend;

  MorphoStream morpho_stream(in, debug, td);

  Collection &output = td->getOutput();

  double loli = 0;  // running sum of -log(prob) taken at each unambiguous word
  int nw = 0;

  // Initial state: the only "previous" tag is end-of-sentence, with score 1.
  set<TTag> tags;
  set<TTag> pretags;
  tags.insert(eos);
  alpha[0][eos] = 1;

  for (TaggerWord *word = morpho_stream.get_next_word(); word != NULL;
       word = morpho_stream.get_next_word()) {
    wpend.push_back(*word);
    nwpend = wpend.size();
    const int cur = nwpend % 2;
    const int prev = 1 - cur;

    // The tag set of the previous word becomes the set of source states.
    pretags = tags;
    tags = word->get_tags();

    // A word without tags is unknown: fall back to the open class.
    if (tags.empty())
      tags = td->getOpenClass();

    // An ambiguity class not seen so far has been found: report it when
    // debugging and substitute a similar known class.
    if (output.has_not(tags)) {
      if (debug) {
        wstring report(L"A new ambiguity class was found. \n");
        report.append(L"Retraining the tagger is necessary so as to take it into account.\n");
        report.append(L"Word '").append(word->get_superficial_form()).append(L"'.\n");
        report.append(L"New ambiguity class: ").append(word->get_string_tags()).append(L"\n");
        wcerr << L"Error: " << report;
      }
      tags = find_similar_ambiguity_class(tags);
    }

    // Index of the ambiguity class the word belongs to.
    int ambClass = output[tags];

    // Reset the column about to be filled (helpers are defined elsewhere;
    // presumably they zero the scores and empty the paths).
    clear_array_double(alpha[cur], N);
    clear_array_vector(best[cur], N);

    // Induction: for each tag of the current word keep the best-scoring
    // transition from any tag of the previous word. Ties replace the stored
    // candidate because the comparison is "<=".
    for (set<TTag>::iterator toIt = tags.begin(); toIt != tags.end(); ++toIt) {
      int i = *toIt;
      for (set<TTag>::iterator fromIt = pretags.begin(); fromIt != pretags.end(); ++fromIt) {
        int j = *fromIt;
        double score = alpha[prev][j] * (td->getA())[j][i] * (td->getB())[i][ambClass];
        if (alpha[cur][i] <= score) {
          // With a single pending word there is no earlier path to extend.
          if (nwpend > 1)
            best[cur][i] = best[prev][j];
          best[cur][i].push_back(i);
          alpha[cur][i] = score;
        }
      }
    }

    // Backtracking: an unambiguous word fixes the path for all pending words.
    if (tags.size() == 1) {
      TTag tag = *tags.begin();
      double prob = alpha[cur][tag];

      if (prob > 0) {
        loli -= log(prob);
      } else if (debug) {
        wcerr << L"Problem with word '" << word->get_superficial_form() << L"' "
              << word->get_string_tags() << L"\n";
      }

      // Emit every pending word with the tag selected for it.
      vector<TTag> &path = best[cur][tag];
      for (unsigned t = 0; t < path.size(); t++) {
        TaggerWord &pending = wpend[t];
        wstring text;
        if (show_all_good_first) {
          text = pending.get_all_chosen_tag_first(path[t], (td->getTagIndex())[L"TAG_kEOF"]);
        } else {
          pending.set_show_sf(show_sf);
          text = pending.get_lexical_form(path[t], (td->getTagIndex())[L"TAG_kEOF"]);
        }
        fputws_unlocked(text.c_str(), out);
      }

      // Return to the initial state, now anchored on the tag just emitted.
      wpend.clear();
      alpha[0][tag] = 1;
    }

    delete word;
  }

  // The input ended while the last word read was still ambiguous.
  if (debug && tags.size() > 1) {
    wcerr << L"Error: "
          << L"The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n"
          << L"This message should never appears. If you are reading this ..... these are very bad news.\n";
  }
}


Generated by  Doxygen 1.6.0   Back to index