Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::tagger ( FILE *  in,
FILE *  out,
bool  show_all_good_first = false )

Tagging algorithm (Viterbi implementation).

in: the input stream with the untagged text to tag
out: the output stream with the tagged text

Definition at line 719 of file hmm.cc.

References MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), and Collection::has_not().

  // Body of HMM::tagger (Viterbi tagging of the words read from `in`).
  // NOTE(review): this listing contains no closing braces, so the block
  // nesting described in the comments below is inferred from indentation
  // only -- confirm against hmm.cc.
  // NOTE(review): i and j are read below but never assigned in this listing;
  // presumably they hold *itag and *jtag -- confirm against hmm.cc.
  int i, j, k, nw;
  TaggerWord *word=NULL;
  TTag tag;
  set <TTag> tags, pretags;
  set <TTag>::iterator itag, jtag;
  // loli accumulates -log(prob) for single-tag words whose prob is > 0.
  double prob, loli, x;
  int N = td->getN();  
  // Both arrays have two rows: row nwpend%2 is cleared and written for the
  // current word, row 1-nwpend%2 is read. N is a runtime value, so these are
  // variable-length arrays.
  double alpha[2][N];
  vector<TTag> best[2][N];
  // NOTE(review): nothing in this listing appends to wpend -- confirm against hmm.cc.
  vector <TaggerWord> wpend; 
  int nwpend;
  MorphoStream morpho_stream(in, debug, td);                             

  Collection &output = td->getOutput();
  loli = nw = 0;
  // eos is not declared in this block; presumably a tag index defined
  // elsewhere -- TODO confirm.
  alpha[0][eos] = 1;
  word = morpho_stream.get_next_word();
  // Process words until get_next_word() returns a null pointer.
  while (word) {
    nwpend = wpend.size();
    pretags = tags; // Tags from the previous word

    tags = word->get_tags();
    if (tags.size()==0) // This is an unknown word
      tags = td->getOpenClass();
    if (output.has_not(tags)) {  // An ambiguity class not seen so far was found
      // The diagnostic is only built and printed when debug is set.
      if (debug) {
        wstring errors;
      errors = L"A new ambiguity class was found. \n";
      errors+= L"Retraining the tagger is necessary so as to take it into account.\n";
      errors+= L"Word '"+word->get_superficial_form()+L"'.\n";
      errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n";
      wcerr<<L"Error: "<<errors;
      // Replace the unseen tag set with the result of find_similar_ambiguity_class().
      tags = find_similar_ambiguity_class(tags);
    k = output[tags];  //Ambiguity class the word belongs to
    // Reset the row that will receive the values for the current word.
    clear_array_double(alpha[nwpend%2], N);    
    clear_array_vector(best[nwpend%2], N);
    for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word
      for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {   //For all tags from the previous word
      // Candidate score: value from the other row times A[j][i] times B[i][k].
      x = alpha[1-nwpend%2][j]*(td->getA())[j][i]*(td->getB())[i][k];
      // Keep x when it is at least as large as the value stored so far.
      if (alpha[nwpend%2][i]<=x) {
        // Copy the tag sequence stored for j only when more than one word is pending.
        if (nwpend>1) 
          best[nwpend%2][i] = best[1-nwpend%2][j];
        alpha[nwpend%2][i] = x;
    // The current word has exactly one tag: write out the stored tag sequence.
    if (tags.size()==1) {       
      tag = *tags.begin();      
      prob = alpha[nwpend%2][tag];
      if (prob>0) 
      loli -= log(prob);
      else {
        // A non-positive probability is only reported in debug mode.
        if (debug)
        wcerr<<L"Problem with word '"<<word->get_superficial_form()<<L"' "<<word->get_string_tags()<<L"\n";
      // Emit one pending word per entry of the tag sequence stored for `tag`.
      for (unsigned t=0; t<best[nwpend%2][tag].size(); t++) {
      // show_all_good_first selects get_all_chosen_tag_first() over get_lexical_form().
      if (show_all_good_first) {
        wstring const &micad = wpend[t].get_all_chosen_tag_first(best[nwpend%2][tag][t], (td->getTagIndex())[L"TAG_kEOF"]);
        fputws_unlocked(micad.c_str(), out); 
      } else {
        // print out
        wstring const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], (td->getTagIndex())[L"TAG_kEOF"]);
        fputws_unlocked(micad.c_str(), out); 
      //Return to the initial state
      alpha[0][tag] = 1;
    // The word returned by get_next_word() is deleted here before the next one is fetched.
    delete word;
    word = morpho_stream.get_next_word();    
  // After the loop: in debug mode, report when the last tag set seen still
  // holds more than one tag.
  if ((tags.size()>1)&&(debug)) {
    wstring errors;
    errors = L"The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n";
    errors+= L"This message should never appears. If you are reading this ..... these are very bad news.\n";
    wcerr<<L"Error: "<<errors;

Here is the call graph for this function:

Generated by  Doxygen 1.6.0   Back to index