Logo Search packages:      
Sourcecode: apertium version File versions  Download package

HMM.C

/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
/*
 *  First order hidden Markov model (HMM) implementation (source)
 *
 *  @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
 */

#include <apertium/HMM.H>
#include <apertium/TaggerUtils.H>
#include <lttoolbox/EndianUtil.H>

#include <stdio.h>
#include <unistd.h>
#include <vector>
#include <algorithm>

00034 HMM::HMM(set<TTag> const &oc, map<string, int> const &ti, 
         ConstantManager const &cm, vector<string> const &pr) {
  open_class = oc; 
  tags_index = ti;
  constants = cm;
  prefer_rules = pr;

  a=NULL;
  b=NULL; 

  eval_data.nwords=0.0;
  eval_data.nunknown=0.0;
  eval_data.nignored=0.0;
  eval_data.nambiguous=0.0;
  eval_data.nerrors_noamb=0.0;
  eval_data.nerrors_amb=0.0;
  eval_data.nerrors_unk=0.0;

  debug=false;
  
  nword=0;
  nword_eval=0;
}

00058 HMM::HMM(TaggerData &td)
{
  open_class = td.getOpenClass();
  tags_index = td.getTagIndex();
  constants = td.getConstants();
  prefer_rules = td.getPreferRules();
  a = td.getA();
  b = td.getB();
  N = td.getN();
  M = td.getM();
  output = td.getOutput();

  eval_data.nwords=0.0;
  eval_data.nunknown=0.0;
  eval_data.nignored=0.0;
  eval_data.nambiguous=0.0;
  eval_data.nerrors_noamb=0.0;
  eval_data.nerrors_amb=0.0;
  eval_data.nerrors_unk=0.0;
    
  debug=false;
}


00082 HMM::~HMM(){

  if (a!=NULL) {
    for(short int i=0; i<N; i++)
      delete [] a[i];
    delete [] a;  
  }
  
  if (b!=NULL) {
    for(short int i=0; i<N; i++)
      delete [] b[i];
    delete [] b;  
  }
}

void
00098 HMM::init() {
  int i;
  
  //Matrix NxN
  a = new double* [N];
  for(i=0; i<N; i++)
    a[i] = new double[N];
  
  //Matrix NxM 
  b = new double* [N];
  for(i=0; i<N; i++)
    b[i] = new double[M];
}

double** 
00113 HMM::get_a() {
  return a;
}

double** 
00118 HMM::get_b() {
  return b;
}

int 
00123 HMM::get_number_states() {
  return N;
}

int 
HMM::get_number_ambiguity_classes() {
  return M;
}

/**
 * @return a copy of the collection of ambiguity classes (member output)
 */
Collection
HMM::get_ambiguity_classes() {
  return this->output;
}

void
00138 HMM::set_eos(TTag t) { 
  eos = t; 
} 

void
00143 HMM::set_debug(bool d) { 
  debug = d; 
} 

void 
00148 HMM::read_ambiguity_classes(istream& is) {
  int ntags;
  set<TTag> ambiguity_class;
  TTag tag;
  
  while (!is.eof()) {
    ntags = EndianUtil<int>::read(is);
    if(!is)
    {
      break;
    }
    ambiguity_class.clear();
    while (ntags>0) {
      tag = EndianUtil<TTag>::read(is);
      ntags--;          
      ambiguity_class.insert(tag);
    }
    
    if (ambiguity_class.size()>0) {
      output.add(ambiguity_class);
    }     
  }
  
  N = tags_index.size();
  M = output.size();  

  init();
}

void 
00178 HMM::write_ambiguity_classes(ostream& os) {
  set<TTag> ambiguity_class;
  set<TTag>::iterator itag;
  int size;
     
  //Ambiguity classes
  for(int i=0; i<output.size(); i++) {
    ambiguity_class = output[i];
    size = ambiguity_class.size();
    EndianUtil<int>::write(os, size);
    for (itag=ambiguity_class.begin(); itag!=ambiguity_class.end(); itag++) {
      EndianUtil<TTag>::write(os, *itag);
    }
  } 
}  

void 
00195 HMM::read_probabilities(istream& is) {
  int i, j, k;
  
  //Matrix A
  for(i=0;i<N; i++)
    for(j=0; j<N; j++)
      a[i][j] = EndianUtil<double>::read(is);

  //Matrix B (only useful values)
  while (!is.eof()) {
    i = EndianUtil<int>::read(is);
    if(!is)
    {
      break;
    }
    k = EndianUtil<int>::read(is);
    b[i][k] = EndianUtil<double>::read(is);
  }
}

void 
00216 HMM::write_probabilities(ostream& os) {
  int i, j, k;
  
  //Matrix A  
  for(i=0;i<N; i++)
    for(j=0; j<N; j++)
      EndianUtil<double>::write(os, a[i][j]);

  //Matrix B  (not the full matrix, only useful values)
  for(i=0; i<N; i++) {
    for(k=0; k<M; k++) {
      if(output[k].find(i)!=output[k].end()) {
        EndianUtil<int>::write(os, i);
        EndianUtil<int>::write(os, k);
        EndianUtil<double>::write(os, b[i][k]);
      }
    }
  }  
}  

void 
00237 HMM::init_probabilities_kupiec (FILE *is) {
  int i, j, k, k1, k2, nw=0;
  double classes_ocurrences[M]; //M = Number of ambiguity classes
  double classes_pair_ocurrences[M][M];
  double tags_estimate[N]; //N = Number of tags (states)
  double tags_pair_estimate[N][N];
   
  MorphoStream lexmorfo(is, true, constants, tags_index, prefer_rules);
  TaggerWord *word=NULL;

  for(k=0; k<M; k++) {
    classes_ocurrences[k]=1; 
    for (k2=0; k2<M; k2++)
      classes_pair_ocurrences[k][k2]=1;
  }

  set<TTag> tags;
  tags.clear();
  tags.insert(eos);  
  k1=output[tags]; //The first tag (ambiguity class) seen is the end-of-sentence
  classes_ocurrences[k]++;
  
  //We count for each ambiguity class the number of ocurrences
  word = lexmorfo.get_next_word();
  while((word)) {
    if (++nw%10000==0) cerr<<'.'<<flush; 
    
    tags=word->get_tags();

    if (tags.size()==0) { //This is an unknown word
      tags = open_class;
    }
    else if (output.has_not(tags)) { 
      string errors;
      errors = "A new ambiguity class was found. I cannot continue.\n";
      errors+= "Word '"+word->get_superficial_form()+"' not found in the dictionary.\n";
      errors+= "New ambiguity class: "+word->get_string_tags()+"\n";
      errors+= "Take a look at the dictionary and at the training corpus. Then, retrain.";      
      fatal_error(errors);      
    }    


    k2=output[tags];

    classes_ocurrences[k1]++;
    classes_pair_ocurrences[k1][k2]++;  //k1 followed by k2
    delete word;
    word=lexmorfo.get_next_word();

    k1=k2;

  }  

  //Estimation of the number of time each tags occurs in the training text
  for(i=0; i<N; i++) {  
    tags_estimate[i]=0;
    for(k=0; k<M;  k++) { 
      if(output[k].find(i)!=output[k].end())
        tags_estimate[i] += classes_ocurrences[k]/output[k].size();     
    }
  }
  

  //Estimation of the number of times each tag pair occurs
  for(i=0; i<N; i++)
    for(j=0; j<N; j++)
      tags_pair_estimate[i][j]=0;

  set<TTag> tags1, tags2;
  set<TTag>::iterator itag1, itag2;
  for(k1=0; k1<M; k1++) {
    tags1=output[k1];
    for(k2=0; k2<M; k2++) {
      tags2=output[k2];
      double nocurrences=classes_pair_ocurrences[k1][k2]/((double)(tags1.size()*tags2.size()));
      for (itag1=tags1.begin(); itag1!=tags1.end(); itag1++) {
        for (itag2=tags2.begin(); itag2!=tags2.end(); itag2++)
          tags_pair_estimate[*itag1][*itag2]+=nocurrences;
      }
    }
  }

   //a[i][j] estimation.
  double sum;
  for(i=0; i<N; i++) {
    sum=0;
    for(j=0; j<N; j++)
      sum+=tags_pair_estimate[i][j];

    for(j=0; j<N; j++) {  
      if (sum>0)
        a[i][j] = tags_pair_estimate[i][j]/sum;
      else {
        a[i][j] = 0;
      }
    }
  }

  //b[i][k] estimation
  for(i=0; i<N; i++) {
    for(k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end()) {
        if (tags_estimate[i]>0)
          b[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i];
        else 
        b[i][k] = 0;
      }
    }
  }
  cerr<<"\n";
}




void 
00353 HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
  int i, j, k, nw=0;
  double tags_pair[N][N];
  double emission[N][M];
  
  MorphoStream stream_tagged(ftagged, true, constants, tags_index, prefer_rules);
  MorphoStream stream_untagged(funtagged, true, constants, tags_index, prefer_rules);
  
  TaggerWord *word_tagged=NULL, *word_untagged=NULL;
  
  set<TTag> tags;
 
  // Init counters - each event appears at least once. 
  // Espected likelihood estimate (ELE) with a fixed initial count of 1
  for(i=0; i<N; i++) {
    for(j=0; j<N; j++)
      tags_pair[i][j]=0;
  }
  for(k=0; k<M; k++) {
    for(i=0; i<N; i++) {
      if (output[k].find(i)!=output[k].end())
        emission[i][k] = 0;
    }  
  }
 
  TTag tag1, tag2;  
  tag1 = eos; // The first seen tag is the end-of-sentence tag
  
  word_tagged = stream_tagged.get_next_word();
  word_untagged = stream_untagged.get_next_word();
  while(word_tagged) {
    cerr<<*word_tagged<<" -- "<<*word_untagged<<"\n"; 

    if (word_tagged->get_superficial_form()!=word_untagged->get_superficial_form()) {              
      cerr<<"\nTagged text (.tagged) and analyzed text (.untagged) streams are not aligned.\n";
      cerr<<"Take a look at tagged text (.tagged).\n";
      cerr<<"Perhaps this is caused by a multiword unit that is not a multiword unit in one of the two files.\n";
      cerr<<*word_tagged<<" -- "<<*word_untagged<<"\n"; 
      exit(1);
    }

    if (++nw%100==0) cerr<<'.'<<flush; 
    
    tag2 = tag1;
   
    if (word_untagged==NULL) {
      cerr<<"word_untagged==NULL\n";
      exit(1);
    }

    if (word_tagged->get_tags().size()==0) // Unknown word
      tag1 = -1;
    else if (word_tagged->get_tags().size()>1) // Ambiguous word
      cerr<<"Error in tagged text. An ambiguous word was found: "<<word_tagged->get_superficial_form()<<"\n";
    else
      tag1 = *(word_tagged->get_tags()).begin();


    if ((tag1>=0) && (tag2>=0))
      tags_pair[tag2][tag1]++;
    

    if (word_untagged->get_tags().size()==0) { // Unknown word
      tags = open_class;
    }
    else if (output.has_not(word_untagged->get_tags())) { //We are training, there is no problem
      string errors;
      errors = "A new ambiguity class was found. I cannot continue.\n";
      errors+= "Word '"+word_untagged->get_superficial_form()+"' not found in the dictionary.\n";
      errors+= "New ambiguity class: "+word_untagged->get_string_tags()+"\n";
      errors+= "Take a look at the dictionary, then retrain.";
      fatal_error(errors);      
    }    
    else {
      tags = word_untagged->get_tags();
    }

    k=output[tags];
    if(tag1>=0)
      emission[tag1][k]++;
                   
    delete word_tagged;
    word_tagged=stream_tagged.get_next_word();
    delete word_untagged;
    word_untagged=stream_untagged.get_next_word();       
  }
  
  
  //Estimate of a[i][j]
  for(i=0; i<N; i++) {
    double sum=0;
    for(j=0; j<N; j++)  
      sum += tags_pair[i][j]+1.0;
    for(j=0; j<N; j++)  
      a[i][j] = (tags_pair[i][j]+1.0)/sum;
  }
    
  
  //Estimate of b[i][k]
  for(i=0; i<N; i++) {
    int nclasses_appear=0;
    double times_appear=0.0;
    for(k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end())  {
      nclasses_appear++;      
      times_appear+=emission[i][k];
      }
    }       
    for(k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end())
      b[i][k] = (emission[i][k]+(((double)1.0)/((double)nclasses_appear)))/(times_appear+((double)1.0));
    }
  }
  
  cerr<<"\n";  
}
  
void
00471 HMM::apply_rules(vector<TForbidRule> const &forbid_rules,
                 vector<TEnforceAfterRule> const &enforce_rules) {
//  TForbidRule forbid_rules[] = FORBID_RULES;         // Automatically generated by etq2flex.awk
//  TEnforceAfterRule enforce_rules[] = ENFORCE_RULES; // Automatically generated by etq2flex.awk
  int i, j, j2;
  bool found;
   
  for(i=0; i<(int) forbid_rules.size(); i++) {
    a[forbid_rules[i].tagi][forbid_rules[i].tagj] = ZERO;
  }

  for(i=0; i<(int) enforce_rules.size(); i++) {
    for(j=0; j<N; j++) {
      found = false;
      for (j2=0; j2<(int) enforce_rules[i].tagsj.size(); j2++) {
           if (enforce_rules[i].tagsj[j2]==j) {
             found = true;
             break;
           }        
      }
      if (!found)
        a[enforce_rules[i].tagi][j] = ZERO;
    }
  }
    
  // Normalize probabilities
  for(i=0; i<N; i++) {
    double sum=0;
    for(j=0; j<N; j++) 
      sum += a[i][j];
    for(j=0; j<N; j++)
      a[i][j] = a[i][j]/sum;
  }
}

void 
00507 HMM::read_dictionary (FILE *fdic) {
  int i, k, nw=0;
  TaggerWord *word=NULL;
  set <TTag> tags;

  MorphoStream morpho_stream(fdic, true, constants, tags_index, prefer_rules);
  
  // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark
   
  word = morpho_stream.get_next_word();
  
  while (word) {
    if (++nw%10000==0) cerr<<'.'<<flush;
    
    tags = word->get_tags();

    if (tags.size()>0)
      k = output[tags];

    delete word;
    word = morpho_stream.get_next_word();
  }
  cerr<<"\n";
  
  // OPEN AMBIGUITY CLASS
  // It contains all tags that are not closed.
  // Unknown words are assigned the open ambiguity class
  k=output[open_class];
  

  N = tags_index.size();  

  // Create ambiguity class holding one single tag for each tag.
  // If not created yet
  for(i=0; i<N; i++) {
    set<TTag> amb_class;
    amb_class.clear();
    amb_class.insert(i);
    k=output[amb_class];
  }

  M = output.size();
  
  /*
  //We check that all tags appears at least in one ambiguity class.
  //If not a warning message is reported and an ambiguity class holding 
  //the tag is created
  for (int i=0; i<N; i++) {
  bool appear=false;
  for (int k=0; k<M; k++) {
  if (output[k].find(i)!=output[k].end()) {
  appear=true;
  break;
  }
  }
  if (!appear) {
  char* tags_array[]=TAGS_ARRAY; 
  cerr<<"Warning: Tag '"<<tags_array[i]<<"' does not appear in any ambiguity class.\n";
  cerr<<"Warning: An ambiguity class for tag '"<<tags_array[i]<<"' was created.\n";
  set<TTag> amb_class;
  amb_class.insert(i);
  output.add(amb_class);  
  M++; //One omer ambiguity class
  }
  }
  */

  cerr<<N<<" states and "<<M<<" ambiguity classes\n";
  init();
}

/**
 * Performs one re-estimation pass of the matrices a and b over an untagged
 * corpus (the error messages below label it "BW"; presumably Baum-Welch).
 *
 * The corpus is processed in segments delimited by unambiguous words. While
 * ambiguous words are read, forward probabilities (alpha) are accumulated and
 * the tag sets are stacked in `pending`. When an unambiguous word arrives,
 * the backward probabilities (beta) are computed over the stacked segment and
 * the expected counts xsi (tag pairs), gamma (tags) and phi (tag emitted from
 * an ambiguity class) are updated. Once the whole corpus has been read, a and
 * b are reset to ZERO, refilled from xsi/gamma and phi/gamma, and normalised.
 *
 * The process is terminated with exit(1) if a NaN, infinite or zero value is
 * detected where noted below. The accumulated -log(prob) of all segments is
 * printed to cerr as "Log=...".
 *
 * @param ftxt untagged (analysed) corpus to train with
 */
void 
00579 HMM::train (FILE *ftxt) {
  int i, j, k, t, len, nw = 0;
  TaggerWord *word=NULL;
  TTag tag; 
  set<TTag> tags, pretags;
  set<TTag>::iterator itag, jtag;
  map <int, double> gamma;
  map <int, double>::iterator jt, kt;
  map < int, map <int, double> > alpha, beta, xsi, phi;
  map < int, map <int, double> >::iterator it;
  double prob, loli;              
  vector < set<TTag> > pending;
  
  // Number of unknown words seen (only counted, never reported)
  int ndesconocidas=0;
  // alpha => forward probabilities
  // beta  => backward probabilities

  
  MorphoStream morpho_stream(ftxt, true, constants, tags_index, prefer_rules);

  // The first segment starts at an implicit end-of-sentence tag
  loli = 0;
  tag = eos;
  tags.clear();
  tags.insert(tag);
  pending.push_back(tags);

  alpha[0].clear();      
  alpha[0][tag] = 1;

  word = morpho_stream.get_next_word();

  while (word) {   

    //cerr<<"Enter para continuar\n";
    //getchar();

    if (++nw%10000==0) cerr<<'.'<<flush;

    //cerr<<*word<<"\n";

    // Tag set of the previous word in the current segment
    pretags = pending.back();

    tags = word->get_tags();    
    
    if (tags.size()==0) { // This is an unknown word
      tags = open_class;
      ndesconocidas++;
    }
    
    if (output.has_not(tags)) {
      string errors;
      errors = "A new ambiguity class was found. I cannot continue.\n";
      errors+= "Word '"+word->get_superficial_form()+"' not found in the dictionary.\n";
      errors+= "New ambiguity class: "+word->get_string_tags()+"\n";
      errors+= "Take a look at the dictionary, then retrain.";
      fatal_error(errors);      
    }
    
    // k: ambiguity class of the word; len: position of the word in the segment
    k = output[tags];    
    len = pending.size();
    alpha[len].clear();     
      
    //Forward probabilities
    for (itag=tags.begin(); itag!=tags.end(); itag++) {
      i=*itag;
      for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
         j=*jtag;
         //cerr<<"previous alpha["<<len<<"]["<<i<<"]="<<alpha[len][i]<<"\n";
       //cerr<<"alpha["<<len-1<<"]["<<j<<"]="<<alpha[len-1][j]<<"\n";
         //cerr<<"a["<<j<<"]["<<i<<"]="<<a[j][i]<<"\n";
         //cerr<<"b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\n";
       alpha[len][i] += alpha[len-1][j]*a[j][i]*b[i][k];
      }
      // A forward probability of exactly zero is replaced by the smallest
      // positive double
      if (alpha[len][i]==0)
        alpha[len][i]=DBL_MIN;
      //cerr<<"alpha["<<len<<"]["<<i<<"]="<<alpha[len][i]<<"\n--------\n";
    }

    if (tags.size()>1) {
      // Ambiguous word: the segment goes on
      pending.push_back(tags);
    } else {  // word is unambiguous
      tag = *tags.begin(); 
      beta[0].clear();
      beta[0][tag] = 1;   
      
      // Probability of the whole segment; used to normalise the counts
      prob = alpha[len][tag];
      //cerr<<"prob="<<prob<<"\n";
      //cerr<<"alpha["<<len<<"]["<<tag<<"]="<<alpha[len][tag]<<"\n";
      loli -= log(prob);  
      
      // Backward pass. Only two beta rows are kept: beta[t%2] holds the
      // values for the current word and beta[1-t%2] receives the values for
      // the previous one.
      for (t=0; t<len; t++) {  // loop from T-1 to 0  
        pretags = pending.back();
        pending.pop_back();
        k = output[tags];
           beta[1-t%2].clear();
           for (itag=tags.begin(); itag!=tags.end(); itag++) {
             i=*itag;
             for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
               j = *jtag;           
               beta[1-t%2][j] += a[j][i]*b[i][k]*beta[t%2][i];
               xsi[j][i] += alpha[len-t-1][j]*a[j][i]*b[i][k]*beta[t%2][i]/prob;
             }
             // Kept only to be shown in the diagnostic messages below
             double previous_value = gamma[i];
       
             gamma[i] +=  alpha[len-t][i]*beta[t%2][i]/prob;                   
             if (isnan(gamma[i])) {
                cout<<"NAN(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
                    <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
                exit(1);                     
             }
             if (isinf(gamma[i])) {
                cout<<"INF(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
                    <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
                exit(1);                     
             }
             if (gamma[i]==0) {
                //cout<<"ZERO(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
                //    <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
                gamma[i]=DBL_MIN;
                //exit(1);                   
             }
              phi[i][k] += alpha[len-t][i]*beta[t%2][i]/prob;
           }
           tags=pretags;
      }
      
      // Start a new segment at the unambiguous word just processed
      tags.clear();
      tags.insert(tag);
      pending.push_back(tags);
      alpha[0].clear();
      alpha[0][tag] = 1;
    }
    
    delete word; 
    word = morpho_stream.get_next_word();
  }  
  // More than one pending entry means the corpus ended inside a segment
  if ((pending.size()>1) || ((tag!=eos)&&(tag != tags_index["TAG_kEOF"]))) 
    cerr<<"Warning: Thee las tag is not the end-of-sentence-tag\n";
  
  //Clean previous values  
  for(i=0; i<N; i++) {
     for(j=0; j<N; j++)
        a[i][j]=ZERO;
     for(k=0; k<M; k++)
        b[i][k]=ZERO;
  }
  
  // new parameters
  // a[i][j] = xsi[i][j]/gamma[i], only for the pairs with a positive count
  for (it=xsi.begin(); it!=xsi.end(); it++) {
    i = it->first;
    for (jt=xsi[i].begin(); jt!=xsi[i].end(); jt++) {
      j = jt->first;
      if (xsi[i][j]>0) {        
        a[i][j] = xsi[i][j]/gamma[i];
      
        if (isnan(a[i][j])) {
          cerr <<"Error: BW - NAN(1) a["<<i<<"]["<<j<<"]="<<a[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (isinf(a[i][j])) {
          cerr <<"Error: BW - INF(1) a["<<i<<"]["<<j<<"]="<<a[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (a[i][j]==0) {
          cerr <<"Error: BW - ZERO(1) a["<<i<<"]["<<j<<"]="<<a[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      }
    }
  }

  // b[i][k] = phi[i][k]/gamma[i], only for the entries with a positive count
  for (it=phi.begin(); it!=phi.end(); it++) {
    i = it->first;
    for (kt=phi[i].begin(); kt!=phi[i].end(); kt++) {
      k = kt->first;
      if (phi[i][k]>0) {
        b[i][k] = phi[i][k]/gamma[i];     
        
      if (isnan(b[i][k])) {
          cerr <<"Error: BW - NAN(2) b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (isinf(b[i][k])) {
          cerr <<"Error: BW - INF(2) b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (b[i][k]==0) {
          cerr <<"Error: BW - ZERO(2) b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      }
    }
  }

    
  //It can be possible that a probability is not updated
  //We normalize the probabilitites
  // NOTE(review): the divisions below assume that every row sum is non-zero,
  // which holds as long as ZERO is a positive constant - confirm.
  for(i=0; i<N; i++) {
    double sum=0;
    for(j=0; j<N; j++)
      sum+=a[i][j];
    for(j=0; j<N; j++)
      a[i][j]=a[i][j]/sum;
  }

  // For b only the cells whose tag belongs to the ambiguity class take part
  for(i=0; i<N; i++) {
    double sum=0;
    for(k=0; k<M; k++) {
      if(output[k].find(i)!=output[k].end())
        sum+=b[i][k];
    }
    for(k=0; k<M; k++) {
      if(output[k].find(i)!=output[k].end())
        b[i][k]=b[i][k]/sum;
    }
  }

  cerr<<"Log="<<loli<<"\n";
}

/**
 * Disambiguates the text read from `is` using the matrices a and b.
 *
 * Words are accumulated in `wpend` until an unambiguous word is found. For
 * every tag of the current word the best-scoring tag sequence leading to it
 * is kept (best[...]) together with its score (alpha[...]); only two rows are
 * stored, indexed by the parity of the number of pending words. When an
 * unambiguous word arrives, the sequence stored for its tag is the result for
 * all pending words, which are then emitted:
 *  - if iseval is not NULL, each word/tag is passed to evaltagger();
 *  - else if for_hand_tagging is true, the superficial form and all the
 *    analyses are written to stdout, the chosen one being marked with '@'
 *    when the word is ambiguous;
 *  - otherwise the lexical form for the chosen tag is written to stdout.
 *
 * @param is               text to be tagged
 * @param iseval           tagged corpus used for evaluation, or NULL
 * @param for_hand_tagging selects the hand-tagging output format
 */
void 
00800 HMM::tagger (FILE *is, FILE *iseval, bool for_hand_tagging) {
  int i, j, k, nw;
  TaggerWord *word=NULL;
  TTag tag;
  
  set <TTag> tags, pretags;
  set <TTag>::iterator itag, jtag;
  
  double prob, loli, x;
  
  // Two rows only: current word and previous word (index = nwpend%2)
  double alpha[2][N];
  vector<TTag> best[2][N];
  
  // Words read since the last unambiguous word
  vector <TaggerWord> wpend; 
  int nwpend;
  // Number of words still to be skipped before evaluating again; set from the
  // value returned by evaltagger()
  int nleer_sin_evaluar=0;
  
  MorphoStream morpho_stream(is, debug, constants, tags_index, prefer_rules);
  // NOTE(review): this stream is built even when iseval is NULL; it is only
  // read from inside the `if (iseval)` branch below.
  MorphoStream morpho_stream_eval(iseval, debug, constants, tags_index, prefer_rules);
  
  loli = nw = 0;
  
  //Initialization
  // Only alpha[0][eos] is set: the first word's previous tag set is {eos}
  tags.insert(eos);
  alpha[0][eos] = 1;
   
  word = morpho_stream.get_next_word();
  nword++;

  while (word) {
    wpend.push_back(*word);             
    nwpend = wpend.size();
    
    pretags = tags; // Tags from the previous word

    tags = word->get_tags();
  
    if (tags.size()==0) // This is an unknown word
      tags = open_class;
                       
    if (output.has_not(tags)) {  // An ambiguity class unknown so far was found
      if (debug) {
        string errors;
      errors = "A new ambiguity class was found. \n";
      errors+= "Retraining the tagger is necessary so as to take it into account.\n";
      errors+= "Word '"+word->get_superficial_form()+"'.\n";
      errors+= "New ambiguity class: "+word->get_string_tags()+"\n";
      cerr<<"Error: "<<errors;
      }
      // Fall back to a known class (see find_similar_ambiguity_class())
      tags = find_similar_ambiguity_class(tags);
    } 
         
    k = output[tags];  //Ambiguity class the word belongs to
    
    clear_array_double(alpha[nwpend%2], N);    
    clear_array_vector(best[nwpend%2], N);
    
    //Induction
    for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word
      i=*itag;
      for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {   //For all tags from the previous word
      j=*jtag;
      x = alpha[1-nwpend%2][j]*a[j][i]*b[i][k];
      // Keep the best-scoring predecessor (ties go to the later one)
      if (alpha[nwpend%2][i]<=x) {
        // For the first pending word there is no previous path to extend
        if (nwpend>1) 
          best[nwpend%2][i] = best[1-nwpend%2][j];
        best[nwpend%2][i].push_back(i);
        alpha[nwpend%2][i] = x;
      }
      }
    }
    
    //Backtracking
    if (tags.size()==1) {       
      tag = *tags.begin();      
      
      prob = alpha[nwpend%2][tag];
      
      if (prob>0) 
      loli -= log(prob);
      else {
        if (debug)
        cerr<<"Problem with word '"<<word->get_superficial_form()<<"' "<<word->get_string_tags()<<"\n";
      }
      // best[nwpend%2][tag][t] is the tag chosen for pending word t
      for (unsigned t=0; t<best[nwpend%2][tag].size(); t++) {
        if (iseval) {
        if (nleer_sin_evaluar==0)
          nleer_sin_evaluar=evaltagger(wpend[t], best[nwpend%2][tag][t], morpho_stream_eval);
        else
          nleer_sin_evaluar--;   
      } else if (for_hand_tagging) {
        string cad="";
        if (best[nwpend%2][tag][t]!=tags_index["TAG_kEOF"]) {
          set<TTag> tags_aux;
          set<TTag>::iterator it_tags_aux;
          tags_aux=wpend[t].get_tags();
            
          //if((cad.length()>0)&&(cad[cad.length()-1]!='+'))
          //  cad+=wpend[t].get_superficial_form()+"/";
            
          // Unknown word: marked with a leading '*'
          if (tags_aux.size()==0)
            cad+="*"+wpend[t].get_superficial_form()+"/";
               
          // All analyses are written; '@' marks the chosen one when the word
          // is ambiguous
          for(it_tags_aux=tags_aux.begin(); it_tags_aux!=tags_aux.end(); it_tags_aux++) {
            if ((best[nwpend%2][tag][t]==(*it_tags_aux))&&(tags_aux.size()>1))
            cad+="@";
            cad+=wpend[t].get_lexical_form_without_ignored_string((TTag&)*it_tags_aux, tags_index["TAG_kEOF"]);
          }
            
          if((cad.length()>0)&&(cad[cad.length()-1]!='+'))
            cad+="$ ";
        }
        if (wpend[t].get_superficial_form().length()>0)
          cad=wpend[t].get_superficial_form()+"/"+cad;
        fwrite_unlocked(cad.c_str(), sizeof(char), cad.size(), stdout);
      } else {
        string const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], tags_index["TAG_kEOF"]);
        fwrite_unlocked(micad.c_str(), sizeof(char), micad.size(), stdout); //For eficiency
        //fflush_unlocked(stdout);
      }
      }
      
      //Return to the initial state
      wpend.clear();   
      alpha[0][tag] = 1;
    }
    
    delete word;
    word = morpho_stream.get_next_word();    
    nword++;
  }
  
  // The input ended on an ambiguous word: the pending words were never emitted
  if ((tags.size()>1)&&(debug)) {
    string errors;
    errors = "The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n";
    errors+= "This message should never appears. If you are reading this ..... these are very bad news.\n";
    cerr<<"Error: "<<errors;
  }  
}


void
00942 HMM::print_A() {
  int i,j;
    
  cout<<"TRANSITION MATRIX (A)\n------------------------------\n";  
  for(i=0; i<N; i++)
    for(j=0; j<N; j++) {
      cout<<"A["<<i<<"]["<<j<<"] = "<<a[i][j]<<"\n";
    }    
}

void
00953 HMM::print_B() {
  int i,k;  

  cout<<"EMISSION MATRIX (B)\n-------------------------------\n";
  for(i=0; i<N; i++)
    for(k=0; k<M; k++) {
      if(output[k].find(i)!=output[k].end())
        cout<<"B["<<i<<"]["<<k<<"] = "<<b[i][k]<<"\n";
    }
}

00964 void HMM::print_ambiguity_classes() {
  set<TTag> ambiguity_class;
  set<TTag>::iterator itag;
  cout<<"AMBIGUITY CLASSES\n-------------------------------\n";
  for(int i=0; i<M; i++) {
    ambiguity_class = output[i];
    cout <<i<<": ";
    for (itag=ambiguity_class.begin(); itag!=ambiguity_class.end(); itag++) {
      cout << *itag <<" ";
    }
    cout << "\n";
  }
}   

void
00979 HMM::print_evaluation() {
  cout<<"# of words.......................................... "<<
    eval_data.nwords<<"\n"
      <<"# of unknown words.................................. "<<
    eval_data.nunknown<<"\n" 
      <<"# of ignored words.................................. "<<
    eval_data.nignored<<"\n"
      <<"# of ambiguous words................................ "<<
    eval_data.nambiguous<<"\n"
      <<"# of errors NOT due to ambiguous words.............. "<<
    eval_data.nerrors_noamb<<"\n"
      <<"# of errors due to ambiguous words.................. "<<
    eval_data.nerrors_amb<<"\n"
      <<"# of errors due to unknown words.................... "<<
    eval_data.nerrors_unk<<"\n"
      <<"% of unknown words.................................. "<<
    (eval_data.nunknown/eval_data.nwords)*100.0<<" %\n"
      <<"% of ambiguous words................................ "<<
    (eval_data.nambiguous/eval_data.nwords)*100.0<<" %\n"
      <<"% of unknown and ambiguous words.................... "<<
    ((eval_data.nambiguous+eval_data.nunknown)/eval_data.nwords)*100.0<<" %\n"
      <<"% of error over ambiguous words .................... "<<
    (eval_data.nerrors_amb/eval_data.nambiguous)*100.0<<" %\n"
      <<"% of error over ambiguous and unknown words......... "<<
    ((eval_data.nerrors_amb+eval_data.nerrors_unk)/(eval_data.nambiguous+eval_data.nunknown))*100.0<<" %\n";
}

int
01007 HMM::evaltagger(TaggerWord& word, TTag& tag, MorphoStream& morpho_streameval) {
  static string fsok;
  static TTag tagok;   
  static bool read_word_ok=true;
  static TaggerWord *wordok;    
   
  string fstag; 
  set<TTag> tagsok;   
   
  fstag=word.get_superficial_form();
   
  if(read_word_ok) {   
    wordok = morpho_streameval.get_next_word();
    fsok = wordok->get_superficial_form();
    tagsok = wordok->get_tags();   
    nword_eval++;

    if(tagsok.size()>1) {
      cerr<<"Error in tagged corpus (.eval) used for evaluation. A word with more than one tag was found\n"<<*wordok<<"\n";
      return 0;
    }
    else if(tagsok.size()==0) {
      cerr<<"Error in tagged corpus (.eval) used for evaluation. An unknown word was found\n"<<*wordok<<"\n";
      return 0;
    }
   
    tagok = *(tagsok.begin());
  
    eval_data.nwords+=1.0;
  }
  else
    read_word_ok=true;
   
  if ((tagok!=tag)&&(tag==tags_index["TAG_SENT"])) { 
    //An end-of-sentence that needs to be skeeped
    read_word_ok=false;
    return 0;
  }
         
  cerr<<"("<<nword<<") "<<word.get_string_tags()<<" "<<word.get_superficial_form()<<" ("<<TaggerWord::array_tags[tag]<<") \t--\t ("<<nword_eval<<") "<<wordok->get_superficial_form()<<" ("<<TaggerWord::array_tags[tagok]<<")  ===> ";
   
  if (fstag!=fsok) {        
    int ntokens_ok = ntokens_multiword(wordok->get_lexical_form(tagok, tags_index["TAG_kEOF"]));
    int ntokens_tag = ntokens_multiword(word.get_lexical_form(tag, tags_index["TAG_kEOF"]));
    int words_distance = abs(ntokens_ok - ntokens_tag);
    if(ntokens_ok<ntokens_tag) { 
      //We need to read more words from the tagged corpus used for evaluation
      //so as to align it witrh the corpus being tagged
      while(words_distance>0) {
      delete wordok;
      wordok = morpho_streameval.get_next_word();
      nword_eval++;
      words_distance--;
      }
      words_distance=0;
    }
      
    if (words_distance>0) {
      delete wordok;
      eval_data.nignored+=1.0;
      cerr<<" IGNORED (multiword): "<<words_distance<<"\n";
      return words_distance;
    }
      
    int nguiones_ok = nguiones_fs(fsok); 
    int nguiones_tag = nguiones_fs(fstag);
    words_distance = abs(nguiones_ok-nguiones_tag);
    if(nguiones_ok<nguiones_tag) { 
      //We need to read more words from the tagged corpus used for evaluation
      //so as to align it witrh the corpus being tagged
      while(words_distance>0) {
      delete wordok;
      wordok = morpho_streameval.get_next_word();
      nword_eval++;
      words_distance--;
      }
      words_distance=0;
    }
    delete wordok;
    eval_data.nignored+=1.0;
    cerr<<" IGNORED (hyphen): "<<words_distance<<"\n";      
    return words_distance;      
  }
 
  //Both superficial forms are equal
  if (word.get_tags().size()>1) // Ambiguous
    eval_data.nambiguous+=1.0;
  else if (word.get_tags().size()==0) { // Unknown
    eval_data.nunknown+=1.0;
  }

  if (tag!=tagok) {
    if (word.get_tags().size()==0) {
      eval_data.nerrors_unk+=1.0;
      cerr<<"ERROR UNKNOWN";
      if (open_class.find(tagok)==open_class.end())
      cerr<<", TAG NOT AVAILABLE IN THE OPEN CLASS";
      cerr<<"\n";
    } else  if (word.get_tags().size()==1) {
      eval_data.nerrors_noamb+=1.0;
      cerr<<"ERROR TAG OK NOT AVAILABLE (NO AMBIGUOUS)\n";
    } else {
      if (word.get_tags().find(tagok)==word.get_tags().end()) {
      eval_data.nerrors_noamb+=1.0;
      cerr<<"ERROR TAG OK NOT AVAILABLE (AMBIGUOUS)\n";
      }
      else { 
      eval_data.nerrors_amb+=1.0;
      cerr<<"ERROR\n";
      }
    }
  }
  else
    cerr<<"OK\n";
   
  delete wordok;   
  return 0;
}

/**
 * Looks for a known ambiguity class to stand in for an unknown one.
 *
 * Among the known classes that have fewer tags than c and are subsets of c,
 * the first one with the largest number of tags is returned.
 *
 * @param c the ambiguity class that is not in the collection
 * @return the chosen class, or the open class if no class qualifies
 */
set<TTag>
HMM::find_similar_ambiguity_class(set<TTag> c) {
  set<TTag> chosen = open_class; // returned if no better class is found
  int chosen_size = -1;
  const int wanted_size = (int) c.size();

  for (int k = 0; k < M; ++k) {
    const set<TTag> &candidate = output[k];
    const int candidate_size = (int) candidate.size();

    // Must improve on what we already have and be smaller than c
    if (candidate_size <= chosen_size || candidate_size >= wanted_size)
      continue;

    // Must be a subset of c (both ranges are sorted, as sets are)
    if (!std::includes(c.begin(), c.end(), candidate.begin(), candidate.end()))
      continue;

    chosen_size = candidate_size;
    chosen = candidate;
  }
  return chosen;
}

Generated by  Doxygen 1.6.0   Back to index