Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::tagger ( FILE *  is,
FILE *  iseval = NULL,
bool  for_hand_tagging = false )

Tagging algorithm (Viterbi implementation).

is: the input stream with the untagged text to tag.
iseval: the input stream with the tagged text to be used for evaluation (optional; defaults to NULL).
for_hand_tagging: flag that tells the method whether the output tagged text will be used as input to a hand-tagging program (defaults to false).

Definition at line 800 of file HMM.C.

References evaltagger(), find_similar_ambiguity_class(), MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), and Collection::has_not().

  // Body of HMM::tagger: reads words from `is` one at a time, updates the
  // alpha/best tables for each word, and whenever a word with exactly one tag
  // is reached, emits the stored tag sequence for the pending words.
  //
  // NOTE(review): this listing looks extraction-damaged -- closing braces are
  // absent and some statements seem to be missing (see the notes below).
  // The comments here describe only what is visible; confirm against HMM.C.
  int i, j, k, nw;
  TaggerWord *word=NULL;
  TTag tag;
  set <TTag> tags, pretags;
  set <TTag>::iterator itag, jtag;
  // loli accumulates -log(prob) at each flush point (see below).
  double prob, loli, x;
  // Two rows each: row nwpend%2 is the one being filled for the current word,
  // row 1-nwpend%2 is the one read for the previous word.
  double alpha[2][N];
  vector<TTag> best[2][N];
  // Words waiting to be emitted; its size is read into nwpend on each iteration.
  // NOTE(review): no statement that appends to wpend is visible here -- confirm.
  vector <TaggerWord> wpend; 
  int nwpend;
  // Holds the value returned by evaltagger(); evaltagger() is only called
  // again while this is 0.
  int nleer_sin_evaluar=0;
  MorphoStream morpho_stream(is, debug, constants, tags_index, prefer_rules);
  MorphoStream morpho_stream_eval(iseval, debug, constants, tags_index, prefer_rules);
  loli = nw = 0;
  // Initial state: start from the eos tag with value 1.
  alpha[0][eos] = 1;
  word = morpho_stream.get_next_word();

  // Main loop: runs until get_next_word() returns a null pointer.
  while (word) {
    nwpend = wpend.size();
    pretags = tags; // Tags from the previous word

    tags = word->get_tags();
    if (tags.size()==0) // This is an unknown word
      tags = open_class;
    if (output.has_not(tags)) {  // Found an ambiguity class not seen until now
      if (debug) {
        string errors;
      errors = "A new ambiguity class was found. \n";
      errors+= "Retraining the tagger is necessary so as to take it into account.\n";
      errors+= "Word '"+word->get_superficial_form()+"'.\n";
      errors+= "New ambiguity class: "+word->get_string_tags()+"\n";
      cerr<<"Error: "<<errors;
      // Replace the unknown class with the result of find_similar_ambiguity_class().
      // NOTE(review): with the braces missing it is unclear whether this runs
      // only under `debug` or for every unknown class -- confirm.
      tags = find_similar_ambiguity_class(tags);
    k = output[tags];  //Ambiguity class the word belongs to
    // Reset the row that will receive the values for the current word.
    clear_array_double(alpha[nwpend%2], N);    
    clear_array_vector(best[nwpend%2], N);
    for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word
      for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {   //For all tags from the previous word
      // NOTE(review): i and j are never assigned in the visible listing;
      // presumably i = *itag and j = *jtag -- confirm.
      // Candidate value: previous-row alpha times a[j][i] times b[i][k].
      x = alpha[1-nwpend%2][j]*a[j][i]*b[i][k];
      // Keep the candidate when it is at least as large as the stored value.
      if (alpha[nwpend%2][i]<=x) {
        if (nwpend>1) 
          best[nwpend%2][i] = best[1-nwpend%2][j];
        alpha[nwpend%2][i] = x;
    // Flush point: the current word has exactly one tag.
    if (tags.size()==1) {       
      tag = *tags.begin();      
      prob = alpha[nwpend%2][tag];
      if (prob>0) 
      loli -= log(prob);
      else {
        if (debug)
        cerr<<"Problem with word '"<<word->get_superficial_form()<<"' "<<word->get_string_tags()<<"\n";
      // Walk the tag sequence stored for `tag`; entry t is paired with wpend[t].
      for (unsigned t=0; t<best[nwpend%2][tag].size(); t++) {
        // Evaluation mode: hand the word and its chosen tag to evaltagger().
        if (iseval) {
        if (nleer_sin_evaluar==0)
          nleer_sin_evaluar=evaltagger(wpend[t], best[nwpend%2][tag][t], morpho_stream_eval);
      // Hand-tagging mode: build the output string in `cad`.
      } else if (for_hand_tagging) {
        string cad="";
        if (best[nwpend%2][tag][t]!=tags_index["TAG_kEOF"]) {
          set<TTag> tags_aux;
          set<TTag>::iterator it_tags_aux;
          //  cad+=wpend[t].get_superficial_form()+"/";
          // NOTE(review): tags_aux is declared but never filled in the visible
          // listing, and this `if` has no visible body of its own -- confirm.
          if (tags_aux.size()==0)
          for(it_tags_aux=tags_aux.begin(); it_tags_aux!=tags_aux.end(); it_tags_aux++) {
            if ((best[nwpend%2][tag][t]==(*it_tags_aux))&&(tags_aux.size()>1))
            cad+=wpend[t].get_lexical_form_without_ignored_string((TTag&)*it_tags_aux, tags_index["TAG_kEOF"]);
            cad+="$ ";
        if (wpend[t].get_superficial_form().length()>0)
        fwrite_unlocked(cad.c_str(), sizeof(char), cad.size(), stdout);
      // Default mode: write the lexical form for the chosen tag to stdout.
      } else {
        string const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], tags_index["TAG_kEOF"]);
        fwrite_unlocked(micad.c_str(), sizeof(char), micad.size(), stdout); //For efficiency
      //Return to the initial state
      alpha[0][tag] = 1;
    // Release the current word and fetch the next one.
    delete word;
    word = morpho_stream.get_next_word();    
  // After the input ends: report (debug only) if the last tag set read still
  // has more than one tag.
  if ((tags.size()>1)&&(debug)) {
    string errors;
    errors = "The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n";
    errors+= "This message should never appears. If you are reading this ..... these are very bad news.\n";
    cerr<<"Error: "<<errors;

Generated by  Doxygen 1.6.0   Back to index