Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::init_probabilities_from_tagged_text ( FILE *  ftagged,
FILE *  funtagged 
)

It initializes the transition (a) and emission (b) probabilities from a tagged input text by means of the expected-likelihood estimate (ELE) method.

Parameters:
ftagged: the input stream with the tagged corpus to process
funtagged: the same corpus to process but untagged

Definition at line 253 of file hmm.cc.

References MorphoStream::get_next_word(), and Collection::has_not().

                                                                       {
  // Supervised initialization of the HMM parameters.
  //
  // Reads the tagged and the untagged (analyzed) versions of the same corpus
  // in lockstep, counts tag bigrams and (tag, ambiguity class) co-occurrences,
  // and then stores smoothed relative frequencies in td->getA() (transitions)
  // and td->getB() (emissions).
  //
  // The process is terminated (exit(1) or fatal_error) when the two streams
  // are not aligned, when the untagged stream ends before the tagged one, or
  // when an ambiguity class unknown to the tagger data is found.
  int nw=0;
  int N = td->getN();
  int M = td->getM();
  // NOTE(review): runtime-sized arrays are a compiler extension in C++ and
  // live on the stack; consider heap storage if N or M can be large.
  double tags_pair[N][N];
  double emission[N][M];

  MorphoStream stream_tagged(ftagged, true, td);
  MorphoStream stream_untagged(funtagged, true, td);

  TaggerWord *word_tagged=NULL, *word_untagged=NULL;
  Collection &output = td->getOutput();

  set<TTag> tags;

  // Init counters. Raw counts start at 0; the expected-likelihood estimate
  // (ELE) smoothing is added below, when the probabilities are computed.
  for(int i=0; i<N; i++) {
    for(int j=0; j<N; j++)
      tags_pair[i][j]=0;
  }
  // Every cell is cleared (not only those where the tag belongs to the
  // ambiguity class), because the counting loop may increment any cell.
  for(int i=0; i<N; i++) {
    for(int k=0; k<M; k++)
      emission[i][k]=0;
  }

  TTag tag1, tag2;
  tag1 = eos; // The first seen tag is the end-of-sentence tag

  word_tagged = stream_tagged.get_next_word();
  word_untagged = stream_untagged.get_next_word();
  while(word_tagged) {
    // The untagged stream must provide a word for every tagged word. This is
    // checked before word_untagged is dereferenced for the first time.
    if (word_untagged==NULL) {
      wcerr<<L"word_untagged==NULL\n";
      exit(1);
    }

    wcerr<<*word_tagged;
    wcerr<<L" -- "<<*word_untagged<<L"\n";

    if (word_tagged->get_superficial_form()!=word_untagged->get_superficial_form()) {
      wcerr<<L"\nTagged text (.tagged) and analyzed text (.untagged) streams are not aligned.\n";
      wcerr<<L"Take a look at tagged text (.tagged).\n";
      wcerr<<L"Perhaps this is caused by a multiword unit that is not a multiword unit in one of the two files.\n";
      wcerr<<*word_tagged<<L" -- "<<*word_untagged<<L"\n";
      exit(1);
    }

    // Progress indicator: one dot every 100 words.
    if (++nw%100==0) wcerr<<L'.'<<flush;

    tag2 = tag1; // tag2 is the tag of the previous word

    if (word_tagged->get_tags().size()==0) // Unknown word
      tag1 = -1;
    else if (word_tagged->get_tags().size()>1) // Ambiguous word
      // NOTE(review): only a message is printed here; tag1 keeps the value it
      // had for the previous word and is counted again below.
      wcerr<<L"Error in tagged text. An ambiguous word was found: "<<word_tagged->get_superficial_form()<<L"\n";
    else
      tag1 = *(word_tagged->get_tags()).begin();

    // Tag bigrams involving an unknown word (negative tag) are not counted.
    if ((tag1>=0) && (tag2>=0))
      tags_pair[tag2][tag1]++;

    if (word_untagged->get_tags().size()==0) { // Unknown word
      tags = td->getOpenClass();
    }
    else if (output.has_not(word_untagged->get_tags())) { //We are training, there is no problem
      wstring errors;
      errors = L"A new ambiguity class was found. I cannot continue.\n";
      errors+= L"Word '"+word_untagged->get_superficial_form()+L"' not found in the dictionary.\n";
      errors+= L"New ambiguity class: "+word_untagged->get_string_tags()+L"\n";
      errors+= L"Take a look at the dictionary, then retrain.";
      fatal_error(errors);
    }
    else {
      tags = word_untagged->get_tags();
    }

    // k is the index of the ambiguity class of the current word.
    int k=output[tags];
    if(tag1>=0)
      emission[tag1][k]++;

    delete word_tagged;
    word_tagged=stream_tagged.get_next_word();
    delete word_untagged;
    word_untagged=stream_untagged.get_next_word();
  }


  //Estimate of a[i][j]: (count + 1) normalized over each row i
  for(int i=0; i<N; i++) {
    double sum=0;
    for(int j=0; j<N; j++)
      sum += tags_pair[i][j]+1.0;
    for(int j=0; j<N; j++)
      (td->getA())[i][j] = (tags_pair[i][j]+1.0)/sum;
  }


  //Estimate of b[i][k], only for the ambiguity classes k that contain tag i.
  //One extra count is shared equally among those classes.
  for(int i=0; i<N; i++) {
    int nclasses_appear=0;
    double times_appear=0.0;
    for(int k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end())  {
        nclasses_appear++;
        times_appear+=emission[i][k];
      }
    }
    for(int k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end())
        (td->getB())[i][k] = (emission[i][k]+(((double)1.0)/((double)nclasses_appear)))/(times_appear+((double)1.0));
    }
  }

  wcerr<<L"\n";
}

Here is the call graph for this function:


Generated by  Doxygen 1.6.0   Back to index