Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::train ( FILE *  is  ) 

Unsupervised training algorithm (Baum-Welch implementation).

Parameters:
is the input stream with the untagged corpus to process

Definition at line 579 of file HMM.C.

References MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), Collection::has_not(), and Collection::size().

                      {
  int i, j, k, t, len, nw = 0;
  TaggerWord *word=NULL;
  TTag tag; 
  set<TTag> tags, pretags;
  set<TTag>::iterator itag, jtag;
  map <int, double> gamma;
  map <int, double>::iterator jt, kt;
  map < int, map <int, double> > alpha, beta, xsi, phi;
  map < int, map <int, double> >::iterator it;
  double prob, loli;              
  vector < set<TTag> > pending;
  
  int ndesconocidas=0;
  // alpha => forward probabilities
  // beta  => backward probabilities

  
  MorphoStream morpho_stream(ftxt, true, constants, tags_index, prefer_rules);

  loli = 0;
  tag = eos;
  tags.clear();
  tags.insert(tag);
  pending.push_back(tags);

  alpha[0].clear();      
  alpha[0][tag] = 1;

  word = morpho_stream.get_next_word();

  while (word) {   

    //cerr<<"Enter para continuar\n";
    //getchar();

    if (++nw%10000==0) cerr<<'.'<<flush;

    //cerr<<*word<<"\n";

    pretags = pending.back();

    tags = word->get_tags();    
    
    if (tags.size()==0) { // This is an unknown word
      tags = open_class;
      ndesconocidas++;
    }
    
    if (output.has_not(tags)) {
      string errors;
      errors = "A new ambiguity class was found. I cannot continue.\n";
      errors+= "Word '"+word->get_superficial_form()+"' not found in the dictionary.\n";
      errors+= "New ambiguity class: "+word->get_string_tags()+"\n";
      errors+= "Take a look at the dictionary, then retrain.";
      fatal_error(errors);      
    }
    
    k = output[tags];    
    len = pending.size();
    alpha[len].clear();     
      
    //Forward probabilities
    for (itag=tags.begin(); itag!=tags.end(); itag++) {
      i=*itag;
      for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
         j=*jtag;
         //cerr<<"previous alpha["<<len<<"]["<<i<<"]="<<alpha[len][i]<<"\n";
       //cerr<<"alpha["<<len-1<<"]["<<j<<"]="<<alpha[len-1][j]<<"\n";
         //cerr<<"a["<<j<<"]["<<i<<"]="<<a[j][i]<<"\n";
         //cerr<<"b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\n";
       alpha[len][i] += alpha[len-1][j]*a[j][i]*b[i][k];
      }
      if (alpha[len][i]==0)
        alpha[len][i]=DBL_MIN;
      //cerr<<"alpha["<<len<<"]["<<i<<"]="<<alpha[len][i]<<"\n--------\n";
    }

    if (tags.size()>1) {
      pending.push_back(tags);
    } else {  // word is unambiguous
      tag = *tags.begin(); 
      beta[0].clear();
      beta[0][tag] = 1;   
      
      prob = alpha[len][tag];
      //cerr<<"prob="<<prob<<"\n";
      //cerr<<"alpha["<<len<<"]["<<tag<<"]="<<alpha[len][tag]<<"\n";
      loli -= log(prob);  
      
      for (t=0; t<len; t++) {  // loop from T-1 to 0  
        pretags = pending.back();
        pending.pop_back();
        k = output[tags];
           beta[1-t%2].clear();
           for (itag=tags.begin(); itag!=tags.end(); itag++) {
             i=*itag;
             for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
               j = *jtag;           
               beta[1-t%2][j] += a[j][i]*b[i][k]*beta[t%2][i];
               xsi[j][i] += alpha[len-t-1][j]*a[j][i]*b[i][k]*beta[t%2][i]/prob;
             }
             double previous_value = gamma[i];
       
             gamma[i] +=  alpha[len-t][i]*beta[t%2][i]/prob;                   
             if (isnan(gamma[i])) {
                cout<<"NAN(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
                    <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
                exit(1);                     
             }
             if (isinf(gamma[i])) {
                cout<<"INF(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
                    <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
                exit(1);                     
             }
             if (gamma[i]==0) {
                //cout<<"ZERO(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
                //    <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
                gamma[i]=DBL_MIN;
                //exit(1);                   
             }
              phi[i][k] += alpha[len-t][i]*beta[t%2][i]/prob;
           }
           tags=pretags;
      }
      
      tags.clear();
      tags.insert(tag);
      pending.push_back(tags);
      alpha[0].clear();
      alpha[0][tag] = 1;
    }
    
    delete word; 
    word = morpho_stream.get_next_word();
  }  
  if ((pending.size()>1) || ((tag!=eos)&&(tag != tags_index["TAG_kEOF"]))) 
    cerr<<"Warning: Thee las tag is not the end-of-sentence-tag\n";
  
  //Clean previous values  
  for(i=0; i<N; i++) {
     for(j=0; j<N; j++)
        a[i][j]=ZERO;
     for(k=0; k<M; k++)
        b[i][k]=ZERO;
  }
  
  // new parameters
  for (it=xsi.begin(); it!=xsi.end(); it++) {
    i = it->first;
    for (jt=xsi[i].begin(); jt!=xsi[i].end(); jt++) {
      j = jt->first;
      if (xsi[i][j]>0) {        
        a[i][j] = xsi[i][j]/gamma[i];
      
        if (isnan(a[i][j])) {
          cerr <<"Error: BW - NAN(1) a["<<i<<"]["<<j<<"]="<<a[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (isinf(a[i][j])) {
          cerr <<"Error: BW - INF(1) a["<<i<<"]["<<j<<"]="<<a[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (a[i][j]==0) {
          cerr <<"Error: BW - ZERO(1) a["<<i<<"]["<<j<<"]="<<a[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      }
    }
  }

  for (it=phi.begin(); it!=phi.end(); it++) {
    i = it->first;
    for (kt=phi[i].begin(); kt!=phi[i].end(); kt++) {
      k = kt->first;
      if (phi[i][k]>0) {
        b[i][k] = phi[i][k]/gamma[i];     
        
      if (isnan(b[i][k])) {
          cerr <<"Error: BW - NAN(2) b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (isinf(b[i][k])) {
          cerr <<"Error: BW - INF(2) b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      if (b[i][k]==0) {
          cerr <<"Error: BW - ZERO(2) b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
             exit(1);
        }
      }
    }
  }

    
  //It can be possible that a probability is not updated
  //We normalize the probabilitites
  for(i=0; i<N; i++) {
    double sum=0;
    for(j=0; j<N; j++)
      sum+=a[i][j];
    for(j=0; j<N; j++)
      a[i][j]=a[i][j]/sum;
  }

  for(i=0; i<N; i++) {
    double sum=0;
    for(k=0; k<M; k++) {
      if(output[k].find(i)!=output[k].end())
        sum+=b[i][k];
    }
    for(k=0; k<M; k++) {
      if(output[k].find(i)!=output[k].end())
        b[i][k]=b[i][k]/sum;
    }
  }

  cerr<<"Log="<<loli<<"\n";
}


Generated by  Doxygen 1.6.0   Back to index