Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void HMM::init_probabilities_kupiec ( FILE *  is  ) 

It initializes the transition (a) and emission (b) probabilities from an untagged input text by means of Kupiec's method.

Parameters:
is the input stream with the untagged corpus to process

Definition at line 137 of file hmm.cc.

References MorphoStream::get_next_word(), TaggerWord::get_string_tags(), TaggerWord::get_superficial_form(), TaggerWord::get_tags(), Collection::has_not(), and Collection::size().

{
  // Estimates the transition matrix td->getA() and the emission matrix
  // td->getB() from the untagged corpus read from `is`:
  //   1. count how often each ambiguity class, and each ordered pair of
  //      consecutive ambiguity classes, appears in the corpus;
  //   2. spread every class count evenly over the tags the class contains to
  //      obtain per-tag and per-tag-pair estimates;
  //   3. normalise those estimates into a[i][j] and b[i][k].
  int N = td->getN();
  int M = td->getM();
  int i, j, k, k1, k2, nw=0;
  // NOTE(review): these are variable-length arrays (a compiler extension in
  // C++) and the two matrices take M*M and N*N doubles of stack space.
  double classes_ocurrences[M]; //M = Number of ambiguity classes
  double classes_pair_ocurrences[M][M];
  double tags_estimate[N]; //N = Number of tags (states)
  double tags_pair_estimate[N][N];
  Collection &output = td->getOutput();
 
  MorphoStream lexmorfo(is, true, td);
  
  TaggerWord *word=NULL;

  // Every counter starts at 1, so no class and no class pair ends up with a
  // zero count even if it never appears in the corpus.
  for(k=0; k<M; k++) {
    classes_ocurrences[k]=1; 
    for (k2=0; k2<M; k2++)
      classes_pair_ocurrences[k][k2]=1;
  }

  set<TTag> tags;
  tags.insert(eos);  
  k1=output[tags]; //The first tag (ambiguity class) seen is the end-of-sentence
  // The code used to execute `classes_ocurrences[k]++` here. At this point
  // k == M (the loop above has just finished), so that was a write one
  // element past the end of the array. The statement has been removed: the
  // initial end-of-sentence class is counted as k1 by the first iteration of
  // the loop below.
  
  //We count for each ambiguity class the number of ocurrences
  word = lexmorfo.get_next_word();
  while((word)) {
    // Progress indicator: one dot on stderr every 10000 words.
    if (++nw%10000==0) wcerr<<L'.'<<flush; 
    
    tags=word->get_tags();

    if (tags.size()==0) { //This is an unknown word
      tags = td->getOpenClass();
    }
    else if (output.has_not(tags)) { 
      wstring errors;
      errors = L"A new ambiguity class was found. I cannot continue.\n";
      errors+= L"Word '"+word->get_superficial_form()+L"' not found in the dictionary.\n";
      errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n";
      errors+= L"Take a look at the dictionary and at the training corpus. Then, retrain.";      
      // NOTE(review): presumably fatal_error() does not return; otherwise the
      // lookup below would run with an ambiguity class unknown to `output`.
      fatal_error(errors);      
    }    

    k2=output[tags];

    // k1 is the class of the previous word (or the initial end-of-sentence),
    // k2 the class of the current one.
    classes_ocurrences[k1]++;
    classes_pair_ocurrences[k1][k2]++;  //k1 followed by k2
    delete word;
    word=lexmorfo.get_next_word();

    k1=k2;

  }  

  //Estimation of the number of time each tags occurs in the training text
  for(i=0; i<N; i++) {  
    tags_estimate[i]=0;
    for(k=0; k<M;  k++) { 
  
      // Each class contributes an equal share of its count to every tag it
      // contains.
      if(output[k].find(i) != output[k].end())
        tags_estimate[i] += classes_ocurrences[k]/output[k].size();     
    }
  }
  
  //Estimation of the number of times each tag pair occurs
  for(i=0; i<N; i++)
    for(j=0; j<N; j++)
      tags_pair_estimate[i][j]=0;

  set<TTag> tags1, tags2;
  set<TTag>::iterator itag1, itag2;
  for(k1=0; k1<M; k1++) {
    tags1=output[k1];
    for(k2=0; k2<M; k2++) {
      tags2=output[k2];
      // The count of the class pair (k1,k2) is split evenly among all the
      // tag pairs (t1,t2) with t1 in class k1 and t2 in class k2.
      double nocurrences=classes_pair_ocurrences[k1][k2]/((double)(tags1.size()*tags2.size()));
      for (itag1=tags1.begin(); itag1!=tags1.end(); itag1++) {
        for (itag2=tags2.begin(); itag2!=tags2.end(); itag2++)
          tags_pair_estimate[*itag1][*itag2]+=nocurrences;
      }
    }
  }

   //a[i][j] estimation.
  // Row i of A is row i of tags_pair_estimate divided by its sum; a row whose
  // sum is not positive is set to all zeros.
  double sum;
  for(i=0; i<N; i++) {
    sum=0;
    for(j=0; j<N; j++)
      sum+=tags_pair_estimate[i][j];

    for(j=0; j<N; j++) {  
      if (sum>0)
        (td->getA())[i][j] = tags_pair_estimate[i][j]/sum;
      else {
        (td->getA())[i][j] = 0;
      }
    }
  }

  //b[i][k] estimation
  // Only the entries b[i][k] where tag i belongs to class k are written here;
  // the other entries of B are left as they were.
  for(i=0; i<N; i++) {
    for(k=0; k<M; k++)  {
      if (output[k].find(i)!=output[k].end()) {
        if (tags_estimate[i]>0)
          (td->getB())[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i];
        else 
          (td->getB())[i][k] = 0;
      }
    }
  }
  wcerr<<L"\n";
}


Generated by  Doxygen 1.6.0   Back to index