Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::tagger ( FILE *  is,
FILE *  iseval = NULL,
bool  for_hand_tagging = false 
)

Tagging algorithm (Viterbi implementation).

Parameters:
is: the input stream with the untagged text to tag.
iseval: the input stream with the tagged text to be used for evaluation (defaults to NULL).
for_hand_tagging: flag that tells the method whether the output tagged text will be used as input to a hand-tagging program (defaults to false).

Definition at line 800 of file HMM.C.

References evaltagger(), find_similar_ambiguity_class(), MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), and Collection::has_not().

                                                          {
  // Viterbi tagging loop. Words are read one at a time from morpho_stream and
  // queued in wpend. For every candidate tag of the current word the best
  // score (alpha) and the best tag sequence leading to it (best) are kept in
  // one of two rows, selected by the parity of the number of pending words.
  // Whenever a word with exactly one candidate tag is reached, the queued
  // words are emitted (or evaluated) and the state is reset.
  int i, j, k, nw;          // i: current tag, j: previous tag, k: ambiguity class; nw is only zeroed here
  TaggerWord *word=NULL;    // word currently being processed; deleted at the end of each iteration
  TTag tag;
  
  set <TTag> tags, pretags; // candidate tags of the current word and of the previous word
  set <TTag>::iterator itag, jtag;
  
  double prob, loli, x;     // loli accumulates -log(prob); it is not read again within this block
  
  // Two rows each, indexed by [nwpend%2] (current) and [1-nwpend%2] (previous).
  // N is declared outside this block; presumably the number of tags -- TODO confirm.
  double alpha[2][N];
  vector<TTag> best[2][N];
  
  vector <TaggerWord> wpend; // words read but not yet emitted
  int nwpend;
  // Set from evaltagger()'s return value; while it is non-zero the call to
  // evaltagger() is skipped and the counter is decremented instead.
  int nleer_sin_evaluar=0;
  
  MorphoStream morpho_stream(is, debug, constants, tags_index, prefer_rules);
  MorphoStream morpho_stream_eval(iseval, debug, constants, tags_index, prefer_rules);
  
  loli = nw = 0;
  
  //Initialization: the "previous word" starts as the single tag eos with score 1
  tags.insert(eos);
  alpha[0][eos] = 1;
   
  word = morpho_stream.get_next_word();
  nword++;

  while (word) {
    wpend.push_back(*word);             
    nwpend = wpend.size();
    
    pretags = tags; // Tags from the previous word

    tags = word->get_tags();
  
    if (tags.size()==0) // This is an unknown word
      tags = open_class;
                       
    if (output.has_not(tags)) {  // An ambiguity class not seen so far has been found
      if (debug) {
        string errors;
      errors = "A new ambiguity class was found. \n";
      errors+= "Retraining the tagger is necessary so as to take it into account.\n";
      errors+= "Word '"+word->get_superficial_form()+"'.\n";
      errors+= "New ambiguity class: "+word->get_string_tags()+"\n";
      cerr<<"Error: "<<errors;
      }
      // Fall back to the class returned by find_similar_ambiguity_class()
      tags = find_similar_ambiguity_class(tags);
    } 
         
    k = output[tags];  //Ambiguity class the word belongs to
    
    // Reset the row that will hold the results for the current word
    clear_array_double(alpha[nwpend%2], N);    
    clear_array_vector(best[nwpend%2], N);
    
    //Induction
    for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word
      i=*itag;
      for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {   //For all tags from the previous word
      j=*jtag;
      // Score of reaching tag i from tag j: previous score * a[j][i] * b[i][k]
      x = alpha[1-nwpend%2][j]*a[j][i]*b[i][k];
      // "<=" (not "<") means a path is recorded even when x is 0, since the row was just cleared
      if (alpha[nwpend%2][i]<=x) {
        // For the first pending word there is no earlier path to extend
        if (nwpend>1) 
          best[nwpend%2][i] = best[1-nwpend%2][j];
        best[nwpend%2][i].push_back(i);
        alpha[nwpend%2][i] = x;
      }
      }
    }
    
    //Backtracking: only done when the current word has a single candidate tag
    if (tags.size()==1) {       
      tag = *tags.begin();      
      
      prob = alpha[nwpend%2][tag];
      
      if (prob>0) 
      loli -= log(prob);
      else {
        if (debug)
        cerr<<"Problem with word '"<<word->get_superficial_form()<<"' "<<word->get_string_tags()<<"\n";
      }
      // Walk the best path ending in tag; entry t is paired with pending word wpend[t]
      for (unsigned t=0; t<best[nwpend%2][tag].size(); t++) {
        if (iseval) {
        // Evaluation mode: hand the word and its chosen tag to evaltagger()
        if (nleer_sin_evaluar==0)
          nleer_sin_evaluar=evaltagger(wpend[t], best[nwpend%2][tag][t], morpho_stream_eval);
        else
          nleer_sin_evaluar--;   
      } else if (for_hand_tagging) {
        // Hand-tagging mode: write the superficial form followed by the lexical
        // form of every tag of the word, not only the chosen one
        string cad="";
        if (best[nwpend%2][tag][t]!=tags_index["TAG_kEOF"]) {
          set<TTag> tags_aux;
          set<TTag>::iterator it_tags_aux;
          tags_aux=wpend[t].get_tags();
            
          //if((cad.length()>0)&&(cad[cad.length()-1]!='+'))
          //  cad+=wpend[t].get_superficial_form()+"/";
            
          // A word without tags is written as "*" + superficial form + "/"
          if (tags_aux.size()==0)
            cad+="*"+wpend[t].get_superficial_form()+"/";
               
          for(it_tags_aux=tags_aux.begin(); it_tags_aux!=tags_aux.end(); it_tags_aux++) {
            // "@" is put before the chosen tag, but only when the word has more than one tag
            if ((best[nwpend%2][tag][t]==(*it_tags_aux))&&(tags_aux.size()>1))
            cad+="@";
            cad+=wpend[t].get_lexical_form_without_ignored_string((TTag&)*it_tags_aux, tags_index["TAG_kEOF"]);
          }
            
          // Append "$ " unless the text built so far is empty or ends in '+'
          if((cad.length()>0)&&(cad[cad.length()-1]!='+'))
            cad+="$ ";
        }
        if (wpend[t].get_superficial_form().length()>0)
          cad=wpend[t].get_superficial_form()+"/"+cad;
        fwrite_unlocked(cad.c_str(), sizeof(char), cad.size(), stdout);
      } else {
        // Default mode: write the lexical form for the chosen tag to stdout
        string const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], tags_index["TAG_kEOF"]);
        fwrite_unlocked(micad.c_str(), sizeof(char), micad.size(), stdout); // For efficiency
        //fflush_unlocked(stdout);
      }
      }
      
      //Return to the initial state: with wpend empty the next word has nwpend==1,
      //so its "previous" row is alpha[0], seeded here with score 1 for this tag
      wpend.clear();   
      alpha[0][tag] = 1;
    }
    
    delete word;
    word = morpho_stream.get_next_word();    
    nword++;
  }
  
  // Input ended while the last word read still had more than one candidate tag
  if ((tags.size()>1)&&(debug)) {
    string errors;
    errors = "The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n";
    errors+= "This message should never appears. If you are reading this ..... these are very bad news.\n";
    cerr<<"Error: "<<errors;
  }  
}


Generated by  Doxygen 1.6.0   Back to index