Logo Search packages:      
Sourcecode: apertium version File versions  Download package

int HMM::evaltagger ( TaggerWord word,
TTag &  tag,
MorphoStream l 
)

It updates the data structure that holds the tagger evaluation.

Parameters:
word the word that has been tagged
tag the tag assigned as the correct one
l the MorphoStream from which the correct tag must be read
Returns:
the number of words that must be ignored because the superficial form of the word and the superficial form read from the MorphoStream are not the same

Definition at line 1007 of file HMM.C.

References TaggerWord::get_lexical_form(), MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), data_tagger_eval::nambiguous, data_tagger_eval::nerrors_amb, data_tagger_eval::nerrors_noamb, data_tagger_eval::nerrors_unk, data_tagger_eval::nignored, data_tagger_eval::nunknown, and data_tagger_eval::nwords.

Referenced by tagger().

                                                                            {
  // Compares the tag chosen for `word` with the reference tag read from the
  // evaluation corpus (morpho_streameval) and updates the counters held in
  // eval_data.
  //
  // Returns 0 when the word was evaluated, when the reference entry was
  // malformed, or when a spurious end-of-sentence was skipped. When the
  // superficial forms of the two corpora differ, the word is counted as
  // ignored and the remaining token distance between both corpora is returned.
  //
  // The reference word is kept in function-local statics because one reference
  // word may have to be compared against two consecutive calls: when the
  // tagger emits an end-of-sentence tag that the reference does not have,
  // read_word_ok is cleared so the next call reuses the word already read
  // instead of fetching a new one.
  static string fsok;              // superficial form of the reference word
  static TTag tagok;               // reference (correct) tag
  static bool read_word_ok=true;   // whether a new reference word must be read
  static TaggerWord *wordok;       // reference word, owned by this function

  string fstag=word.get_superficial_form();

  if(read_word_ok) {
    // NOTE(review): get_next_word() is dereferenced unchecked; confirm it
    // cannot return a null pointer here (e.g. at the end of the .eval stream).
    wordok = morpho_streameval.get_next_word();
    fsok = wordok->get_superficial_form();
    const set<TTag>& tagsok = wordok->get_tags();
    nword_eval++;

    if(tagsok.size()!=1) {
      // The evaluation corpus must be fully disambiguated: exactly one tag.
      if(tagsok.size()>1)
        cerr<<"Error in tagged corpus (.eval) used for evaluation. A word with more than one tag was found\n"<<*wordok<<"\n";
      else
        cerr<<"Error in tagged corpus (.eval) used for evaluation. An unknown word was found\n"<<*wordok<<"\n";
      // Release the reference word: the next call reads a fresh one, so
      // keeping this one would leak it.
      delete wordok;
      return 0;
    }

    tagok = *(tagsok.begin());

    eval_data.nwords+=1.0;
  }
  else
    read_word_ok=true;

  if ((tagok!=tag)&&(tag==tags_index["TAG_SENT"])) {
    // An end-of-sentence that needs to be skipped: keep the current reference
    // word (no delete) so that it is compared against the next tagged word.
    read_word_ok=false;
    return 0;
  }

  cerr<<"("<<nword<<") "<<word.get_string_tags()<<" "<<word.get_superficial_form()<<" ("<<TaggerWord::array_tags[tag]<<") \t--\t ("<<nword_eval<<") "<<wordok->get_superficial_form()<<" ("<<TaggerWord::array_tags[tagok]<<")  ===> ";

  if (fstag!=fsok) {
    // Superficial forms differ: first try to explain it by a different number
    // of multiword tokens, then by a different number of hyphens.
    int ntokens_ok = ntokens_multiword(wordok->get_lexical_form(tagok, tags_index["TAG_kEOF"]));
    int ntokens_tag = ntokens_multiword(word.get_lexical_form(tag, tags_index["TAG_kEOF"]));
    int words_distance = abs(ntokens_ok - ntokens_tag);
    if(ntokens_ok<ntokens_tag) {
      // We need to read more words from the tagged corpus used for evaluation
      // so as to align it with the corpus being tagged
      while(words_distance>0) {
        delete wordok;
        wordok = morpho_streameval.get_next_word();
        nword_eval++;
        words_distance--;
      }
      words_distance=0;
    }

    if (words_distance>0) {
      delete wordok;
      eval_data.nignored+=1.0;
      cerr<<" IGNORED (multiword): "<<words_distance<<"\n";
      return words_distance;
    }

    int nguiones_ok = nguiones_fs(fsok);
    int nguiones_tag = nguiones_fs(fstag);
    words_distance = abs(nguiones_ok-nguiones_tag);
    if(nguiones_ok<nguiones_tag) {
      // We need to read more words from the tagged corpus used for evaluation
      // so as to align it with the corpus being tagged
      while(words_distance>0) {
        delete wordok;
        wordok = morpho_streameval.get_next_word();
        nword_eval++;
        words_distance--;
      }
      words_distance=0;
    }
    delete wordok;
    eval_data.nignored+=1.0;
    cerr<<" IGNORED (hyphen): "<<words_distance<<"\n";
    return words_distance;
  }

  // Both superficial forms are equal.
  // Fetch the candidate tags once: find() and end() must come from the same
  // set object, which separate get_tags() calls would not guarantee if the
  // accessor returned by value.
  const set<TTag>& word_tags = word.get_tags();

  if (word_tags.size()>1) // Ambiguous
    eval_data.nambiguous+=1.0;
  else if (word_tags.size()==0) { // Unknown
    eval_data.nunknown+=1.0;
  }

  if (tag!=tagok) {
    if (word_tags.size()==0) {
      eval_data.nerrors_unk+=1.0;
      cerr<<"ERROR UNKNOWN";
      if (open_class.find(tagok)==open_class.end())
        cerr<<", TAG NOT AVAILABLE IN THE OPEN CLASS";
      cerr<<"\n";
    } else  if (word_tags.size()==1) {
      eval_data.nerrors_noamb+=1.0;
      cerr<<"ERROR TAG OK NOT AVAILABLE (NO AMBIGUOUS)\n";
    } else {
      if (word_tags.find(tagok)==word_tags.end()) {
        // The correct tag was not even among the candidates.
        eval_data.nerrors_noamb+=1.0;
        cerr<<"ERROR TAG OK NOT AVAILABLE (AMBIGUOUS)\n";
      }
      else {
        // The correct tag was available but the tagger chose another one.
        eval_data.nerrors_amb+=1.0;
        cerr<<"ERROR\n";
      }
    }
  }
  else
    cerr<<"OK\n";

  delete wordok;
  return 0;
}


Generated by  Doxygen 1.6.0   Back to index