Logo Search packages:      
Sourcecode: apertium version File versions  Download package

void LexTor::trainlch ( wistream &  wis,
int  left,
int  right,
LexTorData wordmodel,
FSTProcessor &  dic,
FSTProcessor &  bildic,
double  weigth_exponent = 0 
)

Note: to make training ignore the polysemous words of the context, uncomment the
line `translation_weighs[j]=0;` in the marked (`!!!!`) block inside the body below.

Definition at line 197 of file lextor.cc.

References LexTorData::ensure_stopwords_ok(), LexTorData::get_lexchoice_sum(), LexTorData::get_lexical_choices(), LexTorWord::get_word_string(), LexTorData::get_wordcount(), LexTorData::get_words(), LexTorData::is_stopword(), LexTorWord::next_word(), LexTorData::reduce(), LexTorData::set_cooccurrence_context(), LexTorData::set_lexchoice_sum(), LexTorData::set_wordcount(), and LexTorData::vote_from_word().

                                                                                  {
  // Trains the source-language (SL) lexical-choice model from the corpus in
  // stream `is`: co-occurrence counts are collected from a sliding context
  // window and transferred from the target-language (TL) co-occurrence model
  // (`tlwordmodel`) through the bilingual dictionary `bildic`.
  // NOTE(review): the Doxygen prototype names the stream parameter `wis`,
  // but the body reads `is` -- presumably the definition uses another name;
  // confirm against lextor.cc / lextor.h.

  // set_lextor_data() must have been called first; the training-data object
  // is used throughout.
  if (lextor_data==NULL) {
    wcerr<<L"Error in LexTor::trainlch, you must call set_lextor_data before training\n";
    exit(EXIT_FAILURE);
  }

  lextor_data->ensure_stopwords_ok();

  wcerr<<L"Number of words to take into account on the left side: "<<left<<L"\n";
  wcerr<<L"Number of words to take into account on the right side: "<<right<<L"\n";

  // Words whose lexical choices are to be trained.
  set<wstring> words2workwith=lextor_data->get_words();
  set<wstring>::iterator itword;

  // wordsum: occurrences of each (reduced, non-stopword) corpus word.
  // lechsum: estimated number of occurrences of each lexical choice.
  map<wstring, COUNT_DATA_TYPE> wordsum;
  map<wstring, COUNT_DATA_TYPE> lechsum;

  wcerr<<L"Words to work with:\n";
  for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) {
    wcerr<<*itword<<L"\n";
  }
  wcerr<<L"\n";

  //For a given lexical choice it stores its translation
  map<wstring, wstring> lexchoice_translation;
  map<wstring, set<wstring> > lexical_choices_of_word;

  // Precompute, for every lexical choice of every word of interest, its TL
  // translation (via the bilingual dictionary, reduced by the TL model).
  wcerr<<L"Lexical choices:\n";
  for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) {
    set<wstring> lexical_choices=lextor_data->get_lexical_choices(*itword);
    lexical_choices_of_word[*itword]=lexical_choices;
    set<wstring>::iterator itlch;
    for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) {
      lexchoice_translation[*itlch]=tlwordmodel.reduce(bildic.biltrans(*itlch,false));
      wcerr<<*itlch<<L", tr:"<<lexchoice_translation[*itlch]<<L"\n";
    }
  }
  wcerr<<L"\n";


  // Rewind the corpus stream before the training pass.
  // NOTE(review): single-argument seekg(ios::beg) relies on the seekdir
  // constant converting to an absolute position of 0; seekg(0, ios::beg)
  // would state the intent explicitly.
  is.clear();
  is.seekg(ios::beg);

  int nw=0;

  // context[lexchoice][context_word] accumulates the distance-weighted
  // co-occurrence counts over the whole corpus.
  map<wstring, map<wstring, COUNT_DATA_TYPE> > context;
  // Sliding window of the last left+right+1 non-stopword words; the word
  // being trained sits at index `word_index` once the window is full.
  deque<LexTorWord> buffer;

  int word_index=left;
  unsigned buffer_max_size=left+right+1;

  LexTorWord *ltword;
  ltword=LexTorWord::next_word(is,&dic);
  while(ltword!=NULL) {
    if (debug) {
      wcerr<<L"Word read from corpus: "<<ltword->get_word_string()<<L", reduce: "<<lextor_data->reduce(ltword->get_word_string());
      getchar();
    }
    if ((++nw%250000)==0)
      wcerr<<nw<<L" words processed\n";

    wstring reduced_word=lextor_data->reduce(ltword->get_word_string());

    // Stopwords are skipped entirely: they neither enter the window nor get
    // counted in wordsum.
    if (!lextor_data->is_stopword(reduced_word)) {    
      if (buffer.size()>=buffer_max_size) {
      buffer.pop_front();
      }
      buffer.push_back(*ltword);

      wordsum[reduced_word]+=1.0;

      //The buffer is already full
      if (buffer.size()==buffer_max_size) {

      wstring reduced_buffer_word=lextor_data->reduce(buffer[word_index].get_word_string());

        // Only windows whose centre word is one of the words of interest
        // contribute counts.
        for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) {
        if (reduced_buffer_word==(*itword)) {
          //We translate each word in the context
          //Note: Words in the context can also be ambiguous (with more than one lexical choice)
          //In that case the count will come from all the possible
          //translations 
          // NOTE(review): runtime-sized arrays are a GCC extension, not
          // standard C++; std::vector would be the portable choice.
          vector<wstring> translation_buffer[buffer_max_size];
          wstring reduced_buffer[buffer_max_size];

          for (int i=0; i<(int)buffer_max_size; i++) {
            reduced_buffer[i]=lextor_data->reduce(buffer[i].get_word_string());           
          }

          if(debug) {
            wcerr<<L"WINDOW: ";
            for (unsigned i=0; i<buffer.size(); i++) {
            if(i==(unsigned)word_index)
              wcerr<<L"[>>>>"<<reduced_buffer[i]<<L"<<<<] ";
            else
              wcerr<<L"["<<reduced_buffer[i]<<L"] ";
            }
            wcerr<<L"\n";
            wcerr<<L"TRANSLATED: ";
          }

          // Translate every window word; each of its lexical choices yields
          // one (reduced) TL form in translation_buffer[i].
          for (int i=0; i<(int)buffer_max_size; i++) {
            wstring str_translations=L"";
            for(int j=0; j<buffer[i].n_lexical_choices(); j++) {
            wstring aux_tr=buffer[i].translate(bildic,j);
            if (aux_tr.length()>0) {
              wstring tr=tlwordmodel.reduce(aux_tr);
              translation_buffer[i].push_back(tr);
              str_translations+=tr+L"/";
            } else {
              wcerr<<L"Warning in LexTor::trainlch: translation of ["<<buffer[i].get_word_string()
                  <<L"] is empty\n";
            }
            }
            if (debug) {
            if (i==word_index)
              wcerr<<L"[>>>>"<<str_translations<<L"<<<<] ";
            else
              wcerr<<L"["<<str_translations<<L"] ";
            }
          }

          if(debug)
            wcerr<<L"\n";

          set<wstring> lexical_choices=lexical_choices_of_word[*itword];
          set<wstring>::iterator itlch;

          // Per-window vote accumulators; normalized below before being
          // folded into the global `context` map.
          map<wstring, map<wstring, COUNT_DATA_TYPE> > local_context;
          map<wstring, COUNT_DATA_TYPE> sumvotes_context;

          //For each lexical choice the counts from the TL are collected
          for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) {
            // Skip the centre position and any context occurrence of the
            // word itself.
            for (int i=0; i<(int)buffer_max_size; i++) {
            if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) {
              COUNT_DATA_TYPE target_vote=0;

              //The counts of the TL co-occurrence model are transferred to the SL. If the SL word is ambiguous
              //it will have more than one translation into TL, so we need to normalize using the frequency of words
              //in the TL
              double translation_weighs[translation_buffer[i].size()];
              double sum=0.0;
              if (translation_buffer[i].size()>1) {
                for(int j=0; j<(int)translation_buffer[i].size(); j++) {
                  translation_weighs[j]=tlwordmodel.get_lexchoice_sum(translation_buffer[i][j]);
                  sum+=translation_weighs[j];

                  //!!!!! To make it ignore the polysemous words of the context
                  ///////translation_weighs[j]=0;
                  //!!!!!

                  if (debug) {
                  wcerr<<L"Frequency of translation ["<<translation_buffer[i][j]<<L"] ="
                      <<translation_weighs[j]<<L"\n";
                  }
                }
              } else {
                // Unambiguous context word: full weight.
                translation_weighs[0]=1;
                sum=1;
              }

              // Normalize the weights so they sum to 1 over the possible
              // translations of this context word.
              for(int j=0; j<(int)translation_buffer[i].size(); j++) {
                translation_weighs[j]=translation_weighs[j]/sum;
                if (debug) {
                  wcerr<<L"Weight of translation ["<<translation_buffer[i][j]<<L"] ="
                    <<translation_weighs[j]<<L"\n";
                }
              }

              // Vote = TL co-occurrence count of (choice translation,
              // context translation), relative to the choice translation's
              // word count, scaled by the translation weight.
              for(int j=0; j<(int)translation_buffer[i].size(); j++) {
                if (lexchoice_translation[*itlch].length()==0) {
                  wcerr<<L"Error: Translation of lexical choice '"<<*itlch<<L"' is empty\n";
                }

                double aux_vote=0;
                //aux_vote=tlwordmodel.vote_from_word(lexchoice_translation[*itlch], 
                //                            translation_buffer[i][j])*translation_weighs[j];
                if (tlwordmodel.get_wordcount(lexchoice_translation[*itlch])>0) {
                  aux_vote=(tlwordmodel.vote_from_word(lexchoice_translation[*itlch],translation_buffer[i][j])/
                        tlwordmodel.get_wordcount(lexchoice_translation[*itlch]))*translation_weighs[j];
                  if (debug) {
                  wcerr<<L"C("<<lexchoice_translation[*itlch]<<L", "<<translation_buffer[i][j]<<L") = "
                      <<tlwordmodel.vote_from_word(lexchoice_translation[*itlch],translation_buffer[i][j])<<L"\n";
                  wcerr<<L"C("<<lexchoice_translation[*itlch]<<L") = "<<tlwordmodel.get_wordcount(lexchoice_translation[*itlch])<<L"\n";
                  }
                } else {
                  // A positive vote with a zero word count would be an
                  // inconsistency in the TL model; report it.
                  if (tlwordmodel.vote_from_word(lexchoice_translation[*itlch],translation_buffer[i][j])>0) {
                  wcerr<<L"Error in LexTor::trainlch: TL vote is not null, but its word count is null.\n";
                  wcerr<<L"lexchoice_translation: "<<lexchoice_translation[*itlch]<<L"\n";
                  wcerr<<L"translation_buffer: "<<translation_buffer[i][j]<<L"\n";
                  }
                }
                target_vote+=aux_vote;

                if(debug) {
                  wcerr<<L"Target vote for ["<<lexchoice_translation[*itlch]
                    <<L"] from ["<<translation_buffer[i][j]<<L"] = "<<aux_vote<<L"\n";
                }
              }

              if (target_vote>0) {
                local_context[*itlch][reduced_buffer[i]]+=target_vote;
                sumvotes_context[reduced_buffer[i]]+=target_vote;
              }
            }
            }
          }

          if (debug) {
            wcerr<<L"COUNTS NORMALIZATION\n";
          }

          //Now we normalize the counts and estimate the number of
          //times each lexical choice has been seen.
          map<wstring, COUNT_DATA_TYPE> local_lexsum;
          double local_lexsumsum=0.0;
          for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) {
            // `distance` runs from -left to +right across the window; counts
            // are decayed by |distance|^weigth_exponent from the centre word.
            int distance=(-1)*left;
            for (int i=0; i<(int)buffer_max_size; i++) { 
            if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) {
              if (local_context[*itlch][reduced_buffer[i]]>0) {
                double cc=local_context[*itlch][reduced_buffer[i]]/sumvotes_context[reduced_buffer[i]];
                double count_to_apply=cc/pow(fabs((double)distance),weigth_exponent);
                context[*itlch][reduced_buffer[i]]+=count_to_apply;
                if (debug) {
                  wcerr<<L"Lexical choice: ["<<*itlch
                          <<L"], context word: ["<<reduced_buffer[i]<<L"], "
                    <<L"normalize count: "<<cc<<L"\n";
                  wcerr<<L"Distance: "<<distance<<L", count to apply: "
                    <<count_to_apply<<L"\n";

                }

                local_lexsum[*itlch]+=count_to_apply;
                local_lexsumsum+=count_to_apply;

                if (debug) {
                  wcerr<<L"local_lexsum["<<*itlch<<L"] = "<<local_lexsum[*itlch]<<L"\n";
                  wcerr<<L"local_lexsumsum = "<<local_lexsumsum<<L"\n";
                }

              }
            }
            distance++;
            }
          }

          // Each window contributes fractional occurrence counts to lechsum;
          // the fractions over this word's lexical choices sum to 1.
          for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) {
            if ((local_lexsum[*itlch]>0) && (local_lexsumsum>0))
            lechsum[*itlch]+=local_lexsum[*itlch]/local_lexsumsum;
            if (debug) {
            wcerr<<L"lechsum["<<*itlch<<L"] = "<<lechsum[*itlch]<<L"\n";
            }
          }
          

          if(debug) {
            wcerr<<L"\n";
            getchar();
          }

          // The centre word matched a word of interest; no other entry of
          // words2workwith can also match it.
          break;
        }
      }
      }
    } 

    delete ltword;
    ltword=LexTorWord::next_word(is,&dic);
  }
  
  wcerr<<L"Corpus has "<<nw<<L" words\n";

  //Set the count of each word
  map<wstring, COUNT_DATA_TYPE>::iterator itws;
  for(itws=wordsum.begin(); itws!=wordsum.end(); itws++) {
    lextor_data->set_wordcount(itws->first,itws->second);
    //if(debug) {
    wcerr<<L"wordcount("<<itws->first<<L") = "<<itws->second<<L"\n";
    //}
  }

  //All co-occurrences have been collected. We need to filter them
  //so as to take into account only the n most frequent
  for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) {
    set<wstring> lexical_choices=lexical_choices_of_word[*itword];
    set<wstring>::iterator itlch;
    for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) {
      PairStringCountComparer comparer;
      vector<pair<wstring, COUNT_DATA_TYPE> > context_v;
      map<wstring, COUNT_DATA_TYPE>::iterator itm;

      // Drain the accumulated context map into a vector (emptying the map
      // as we go, which also frees its memory early).
      while(context[*itlch].size()>0) {
      itm=context[*itlch].begin();
      //wcerr<<itm->first<<L" "<<itm->second<<L"\n";
      context_v.push_back(*itm);
      context[*itlch].erase(itm);
      }
    
      // Sort with PairStringCountComparer -- presumably ordering by count so
      // the most frequent co-occurrences come first; confirm against the
      // comparer's definition.
      sort(context_v.begin(), context_v.end(), comparer);
      wstring lch=*itlch;
      lextor_data->set_cooccurrence_context(lch, context_v);
      //lextor_data->set_lexchoice_sum(lch, tlwordmodel.get_lexchoice_sum(lexchoice_translation[lch]));

      //wcerr<<L"lexchoice_sum("<<lch<<L") = lexchoice_sum_target("<<lexchoice_translation[lch]<<L") ="
      //    <<tlwordmodel.get_lexchoice_sum(lexchoice_translation[lch])<<L"\n";
    }
  } 

  //Set the count of each lexical choice
  map<wstring, COUNT_DATA_TYPE>::iterator itlcs;
  for(itlcs=lechsum.begin(); itlcs!=lechsum.end(); itlcs++) {
    lextor_data->set_lexchoice_sum(itlcs->first,itlcs->second);
    //if(debug) {
    wcerr<<L"lexchoice_sum("<<itlcs->first<<L") = "<<itlcs->second<<L"\n";
    //}
  }


  wcerr<<L"Training done\n"; 
}


Generated by  Doxygen 1.6.0   Back to index