Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM2::init_probabilities_from_tagged_text ( FILE *  ftagged,
FILE *  funtagged 
)

It initializes the transition (a) and emission (b) probabilities from a tagged input text by means of the expected-likelihood estimate (ELE) method.

Parameters:
ftagged the input stream with the tagged corpus to process
funtagged the same corpus to process, but untagged

Definition at line 234 of file HMM2.C.

References MorphoStream2::get_next_word(), and Collection::has_not().

                                                                        {
  // Estimates the transition matrix A and the emission matrix B stored in
  // the tagger data (td) from a hand-tagged corpus.
  //
  // ftagged and funtagged are read word by word, in lockstep. The tagged
  // word supplies the tag that was chosen; the untagged word supplies the
  // set of tags (ambiguity class) the word can receive. The two streams
  // must yield the same superficial forms in the same order, otherwise the
  // process is aborted.
  //
  // Counts start at zero; the smoothing is applied when the probabilities
  // are computed at the end of the function.
  int i, j, k, nw=0;
  int N = td->getN();
  int M = td->getM();
  // NOTE(review): variable-length arrays are a compiler extension in C++
  // and are allocated on the stack; large N or M could overflow it.
  double tags_pair[N][N];  // tags_pair[prev][cur]: times tag cur followed tag prev
  double emission[N][M];   // emission[tag][k]: times tag was seen with class k

  MorphoStream2 stream_tagged(ftagged, true, td);
  MorphoStream2 stream_untagged(funtagged, true, td);

  TaggerWord *word_tagged=NULL, *word_untagged=NULL;
  Collection &output = td->getOutput();

  set<TTag> tags;

  // Clear every counter. The whole emission matrix is cleared (not only the
  // cells where the tag belongs to the class) because the counting loop
  // below increments emission[tag1][k] without checking membership, and
  // incrementing a cell that was never written is undefined behaviour.
  for(i=0; i<N; i++) {
    for(j=0; j<N; j++)
      tags_pair[i][j]=0;
    for(k=0; k<M; k++)
      emission[i][k]=0;
  }

  TTag tag1, tag2;
  tag1 = eos; // The first seen tag is the end-of-sentence tag

  word_tagged = stream_tagged.get_next_word();
  word_untagged = stream_untagged.get_next_word();
  while(word_tagged) {
    // The untagged stream ran out before the tagged one. This has to be
    // detected before word_untagged is dereferenced below.
    if (word_untagged==NULL) {
      cerr<<"word_untagged==NULL\n";
      exit(1);
    }

    cerr<<*word_tagged;
    cerr<<" -- "<<*word_untagged<<"\n";

    if (word_tagged->get_superficial_form()!=word_untagged->get_superficial_form()) {
      cerr<<"\nTagged text (.tagged) and analyzed text (.untagged) streams are not aligned.\n";
      cerr<<"Take a look at tagged text (.tagged).\n";
      cerr<<"Perhaps this is caused by a multiword unit that is not a multiword unit in one of the two files.\n";
      cerr<<*word_tagged<<" -- "<<*word_untagged<<"\n";
      exit(1);
    }

    if (++nw%100==0) cerr<<'.'<<flush; // progress mark every 100 words

    tag2 = tag1; // tag2 is now the tag of the previous word

    if (word_tagged->get_tags().size()==0) // Unknown word
      tag1 = -1;
    else if (word_tagged->get_tags().size()>1) // Ambiguous word
      // NOTE(review): tag1 is left unchanged here, so the previous word's
      // tag is counted again for this word. Confirm whether the word should
      // be skipped (tag1 = -1) instead.
      cerr<<"Error in tagged text. An ambiguous word was found: "<<word_tagged->get_superficial_form()<<"\n";
    else
      tag1 = *(word_tagged->get_tags()).begin();

    // Negative tags mark unknown words; they do not contribute to the counts.
    if ((tag1>=0) && (tag2>=0))
      tags_pair[tag2][tag1]++;

    if (word_untagged->get_tags().size()==0) { // Unknown word
      tags = td->getOpenClass();
    }
    else if (output.has_not(word_untagged->get_tags())) { //We are training, there is no problem
      string errors;
      errors = "A new ambiguity class was found. I cannot continue.\n";
      errors+= "Word '"+word_untagged->get_superficial_form()+"' not found in the dictionary.\n";
      errors+= "New ambiguity class: "+word_untagged->get_string_tags()+"\n";
      errors+= "Take a look at the dictionary, then retrain.";
      fatal_error(errors);
    }
    else {
      tags = word_untagged->get_tags();
    }

    k=output[tags]; // index of the ambiguity class of the current word
    if(tag1>=0)
      emission[tag1][k]++;

    delete word_tagged;
    word_tagged=stream_tagged.get_next_word();
    delete word_untagged;
    word_untagged=stream_untagged.get_next_word();
  }

  // If the tagged stream finished first, one untagged word is still
  // pending; release it (deleting NULL is a no-op).
  delete word_untagged;

  //Estimate of a[i][j]: (count + 1) normalised over each row
  for(i=0; i<N; i++) {
    double sum=0;
    for(j=0; j<N; j++)
      sum += tags_pair[i][j]+1.0;
    for(j=0; j<N; j++)
      (td->getA())[i][j] = (tags_pair[i][j]+1.0)/sum;
  }

  //Estimate of b[i][k], only for the classes k that contain tag i.
  //One extra count is shared evenly among those classes.
  for(i=0; i<N; i++) {
    int nclasses_appear=0;
    double times_appear=0.0;
    for(k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end())  {
        nclasses_appear++;
        times_appear+=emission[i][k];
      }
    }
    for(k=0; k<M; k++)  {
      // nclasses_appear is at least 1 whenever this condition holds, so the
      // division below is safe.
      if (output[k].find(i)!=output[k].end())
        (td->getB())[i][k] = (emission[i][k]+(((double)1.0)/((double)nclasses_appear)))/(times_appear+((double)1.0));
    }
  }

  cerr<<"\n";
}


Generated by  Doxygen 1.6.0   Back to index