Logo Search packages:      
Sourcecode: apertium version File versions  Download package

lextor_data.cc

/*
 * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante
 * author: Felipe Sánchez-Martínez
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */

#include <apertium/lextor_data.h>
#include <apertium/string_utils.h>
#include <apertium/lextor_word.h>
#include <apertium/lextor.h>

#include <lttoolbox/compression.h>
#include <lttoolbox/endian_util.h>
#include <apertium/string_utils.h>

using namespace Apertium;
// Builds an empty model: all counters start at zero and the only word
// registered in the word<->index mapping is NULLWORD, at index 0.
LexTorData::LexTorData() {
  n_stopwords=0;
  n_words_per_set=0;
  n_set=0;

  // NULLWORD is the single word known so far, hence n_words is 1
  word2index[NULLWORD]=0;
  index2word.push_back(NULLWORD);
  n_words=1;
}
  
// Copy constructor: duplicates the counters, the word<->index mapping, the
// per-word counts, the co-occurrence data and the word/stopword sets of ltd.
LexTorData::LexTorData(const LexTorData& ltd) {
  n_stopwords=ltd.n_stopwords;
  n_words=ltd.n_words;
  n_words_per_set=ltd.n_words_per_set;
  n_set=ltd.n_set;

  word2index=ltd.word2index;
  index2word=ltd.index2word;

  //Per-word counts must travel with the copy: get_wordcount() and write()
  //read them, and without this line the copy would report 0 for every word
  wordcount=ltd.wordcount;

  lexchoice_set=ltd.lexchoice_set;
  lexchoice_sum=ltd.lexchoice_sum;
  //lexchoice_prob=ltd.lexchoice_prob;

  stopwords=ltd.stopwords;
  words=ltd.words;
  lexical_choices=ltd.lexical_choices;
  reduced_lexical_choices=ltd.reduced_lexical_choices;
}
  
// Nothing to release explicitly here; the body is intentionally empty.
LexTorData::~LexTorData() {}

// Returns the count stored for `word` inside the co-occurrence set of
// `lexical_choice`, or 0 when no such count exists. Both arguments are
// lower-cased before being looked up.
//
// The lookups are done with find() so that querying never inserts anything:
// an operator[] lookup would add the unknown word to word2index (mapped to
// index 0, which then stops new_word_register() from ever giving it a real
// index) and would add an empty set to lexchoice_set (which write() would
// then emit although n_set does not count it).
COUNT_DATA_TYPE
LexTorData::vote_from_word(const wstring& lexical_choice, const wstring& word) {
  const wstring lexchoice_key=StringUtils::tolower(lexical_choice);
  const wstring word_key=StringUtils::tolower(word);

  //Unknown words fall back to index 0 (the index given to NULLWORD by the
  //constructor), which is the value a map lookup used to yield for them
  WORD_DATA_TYPE ind_lexchoice=0;
  if (word2index.find(lexchoice_key)!=word2index.end())
    ind_lexchoice=word2index[lexchoice_key];

  WORD_DATA_TYPE ind_word=0;
  if (word2index.find(word_key)!=word2index.end())
    ind_word=word2index[word_key];

  map<WORD_DATA_TYPE, map<WORD_DATA_TYPE, COUNT_DATA_TYPE> >::iterator it_set=lexchoice_set.find(ind_lexchoice);
  if (it_set==lexchoice_set.end())
    return 0;

  map<WORD_DATA_TYPE, COUNT_DATA_TYPE>::iterator it_count=it_set->second.find(ind_word);
  if (it_count==it_set->second.end())
    return 0;

  return it_count->second;
}

//double 
//LexTorData::get_lexchoice_prob(const string& lexical_choice) {
//  return lexchoice_prob[word2index[lexical_choice]];
//}


void 
LexTorData::set_wordcount(const wstring& word, COUNT_DATA_TYPE c) {
  WORD_DATA_TYPE ind_word=word2index[StringUtils::tolower(word)];
  wordcount[ind_word]=c;
}

// Returns the count stored for `word` (lower-cased), or 0 when there is none.
//
// The word is resolved with find() rather than operator[] so that asking for
// an unknown word does not add it to word2index: such a phantom entry would
// map the word to index 0 and stop new_word_register() from registering it.
COUNT_DATA_TYPE 
LexTorData::get_wordcount(const wstring& word) {
  const wstring key=StringUtils::tolower(word);

  //Unknown words fall back to index 0, as a map lookup used to yield
  WORD_DATA_TYPE ind_word=0;
  if (word2index.find(key)!=word2index.end())
    ind_word=word2index[key];

  //Check first so that no null entry is created in wordcount
  if (wordcount.find(ind_word)==wordcount.end())
    return 0;

  return wordcount[ind_word];
}

// Returns the sum stored for `lexical_choice` (lower-cased), or 0 when no sum
// has been stored for it.
//
// Lookups use find() so that this accessor never inserts entries into
// word2index or lexchoice_sum; a phantom word2index entry (word -> 0) would
// stop new_word_register() from later registering the word.
COUNT_DATA_TYPE
LexTorData::get_lexchoice_sum(const wstring& lexical_choice) {
  const wstring key=StringUtils::tolower(lexical_choice);

  //Unknown lexical choices fall back to index 0, as a map lookup used to yield
  WORD_DATA_TYPE ind_lexchoice=0;
  if (word2index.find(key)!=word2index.end())
    ind_lexchoice=word2index[key];

  if (lexchoice_sum.find(ind_lexchoice)==lexchoice_sum.end())
    return 0;

  return lexchoice_sum[ind_lexchoice];
}

void 
LexTorData::set_lexchoice_sum(const wstring& lexical_choice, COUNT_DATA_TYPE sum) {
  lexchoice_sum[word2index[StringUtils::tolower(lexical_choice)]]=sum;
}

bool
LexTorData::is_stopword(const wstring& word) {
  return (stopwords.find(StringUtils::tolower(word))!=stopwords.end());
}

void 
LexTorData::read(FILE *is) {
  //cerr<<"LexTorData::read------------------------------------\n";
  n_stopwords=(WORD_DATA_TYPE)Compression::multibyte_read(is);
  n_words=(WORD_DATA_TYPE)Compression::multibyte_read(is);
  n_words_per_set=(WORD_DATA_TYPE)Compression::multibyte_read(is);
  n_set=(WORD_DATA_TYPE)Compression::multibyte_read(is);

  //cerr<<n_stopwords<<"\n";
  //cerr<<n_words<<"\n";
  //cerr<<n_words_per_set<<"\n";
  //cerr<<n_set<<"\n";


  //Read the set of stopwords
  //cerr<<"stopwords--------------------------------------------\n";
  for (unsigned int i=0; i<n_stopwords; i++) {
    stopwords.insert(Compression::wstring_read(is));
    //cerr<<"len: "<<len<<" str: "<<str<<"\n";
  }

  //Read the list of words
  //cerr<<"list of words----------------------------------------\n";
  for(unsigned int i=1; i<n_words; i++) {
    wstring str = Compression::wstring_read(is);
    index2word.push_back(str);
    word2index[str]=i;
    wordcount[i]=EndianUtil<COUNT_DATA_TYPE>::read(is);
    //cerr<<"len: "<<len<<" str: "<<str<<" index: "<<i<<" word_count: "<<wordcount[i]<<"\n";
  }

  //Read data of each set associate to each lexical choice (or word)
  for(unsigned int i=0; i<n_set; i++) {
    WORD_DATA_TYPE lexchoice;
    COUNT_DATA_TYPE sum;
    //double prob;

    lexchoice=(WORD_DATA_TYPE)Compression::multibyte_read(is);
    sum=EndianUtil<COUNT_DATA_TYPE>::read(is);

    //cerr<<"lexchoice: "<<lexchoice<<" sum: "<<sum<<" "<<index2word[lexchoice]<<"\n";

    reduced_lexical_choices.insert(index2word[lexchoice]);

    lexchoice_sum[lexchoice]=sum;
    //lexchoice_prob[lexchoice]=prob;

    /////lexical_choices.insert(index2word[lexchoice]);

    for(unsigned int j=0; j<n_words_per_set; j++) {
      WORD_DATA_TYPE word;
      COUNT_DATA_TYPE count;

      word=(WORD_DATA_TYPE)Compression::multibyte_read(is);
      count=EndianUtil<COUNT_DATA_TYPE>::read(is);
      //cerr<<"     word: "<<word<<" count: "<<count<<"\n";
      lexchoice_set[lexchoice][word]=count;
    }
  }

  //First we read the number of words to take into account
  WORD_DATA_TYPE nwords2workwith;

  nwords2workwith=(WORD_DATA_TYPE)Compression::multibyte_read(is);
  for (unsigned int i=0; i<nwords2workwith; i++) {
    WORD_DATA_TYPE word;

    word=(WORD_DATA_TYPE)Compression::multibyte_read(is);
    words.insert(index2word[word]);
    //cerr<<"word: "<<index2word[word]<<"\n";
  }
}

void 
LexTorData::write(FILE *os) {
  //cerr<<"LexTorData::write------------------------------------\n";
  //cerr<<n_stopwords<<"\n";
  //cerr<<n_words<<"\n";
  //cerr<<n_words_per_set<<"\n";
  //cerr<<n_set<<"\n";
  Compression::multibyte_write(n_stopwords, os);
  Compression::multibyte_write(n_words, os);
  Compression::multibyte_write(n_words_per_set, os);
  Compression::multibyte_write(n_set, os);

  //Write the set of stopwords
  //cerr<<"stopwords--------------------------------------------\n";
  set<wstring>::iterator it;
  for (it=stopwords.begin(); it!=stopwords.end(); it++) {
    Compression::wstring_write(*it, os);
  }

  //Write the list of words
  //cerr<<"list of words----------------------------------------\n";
  for(unsigned int i=1; i<index2word.size(); i++) {
    Compression::wstring_write(index2word[i], os);
    EndianUtil<COUNT_DATA_TYPE>::write(os, wordcount[i]);
  }

  //Write data of each set associate to each lexical choice (or word)
  map<WORD_DATA_TYPE, map<WORD_DATA_TYPE, COUNT_DATA_TYPE> >::iterator it_lch_set;
  map<WORD_DATA_TYPE, COUNT_DATA_TYPE>::iterator it_w_lch_set;
  //map<WORD_DATA_TYPE, double>::iterator it_lch_prob;

  for(it_lch_set=lexchoice_set.begin(); it_lch_set!=lexchoice_set.end(); it_lch_set++) {
    WORD_DATA_TYPE lexchoice=it_lch_set->first;
    COUNT_DATA_TYPE sum=lexchoice_sum[lexchoice];
    //double prob=lexchoice_prob[lexchoice];

    //cerr<<"lexchoice: "<<lexchoice<<" sum: "<<sum<<" "<<index2word[lexchoice]<<"\n";
    Compression::multibyte_write(lexchoice, os);    
    //os.write(reinterpret_cast<char * const> (&prob), sizeof(double));
    EndianUtil<COUNT_DATA_TYPE>::write(os, sum);

    int nwritten_words=0;
    for(it_w_lch_set=it_lch_set->second.begin(); 
        it_w_lch_set!=it_lch_set->second.end(); 
        it_w_lch_set++) {
      WORD_DATA_TYPE word=it_w_lch_set->first;
      COUNT_DATA_TYPE count=it_w_lch_set->second;
      //cerr<<"     word: "<<word<<" count: "<<count<<"\n";
      Compression::multibyte_write(word, os);
      EndianUtil<COUNT_DATA_TYPE>::write(os, count);
      nwritten_words++;
    }

    //If there were less written words than expected
    while (nwritten_words<n_words_per_set){
      WORD_DATA_TYPE word=word2index[NULLWORD];
      COUNT_DATA_TYPE count=0;
      //cerr<<"     word: "<<word<<" count: "<<count<<"\n";
      Compression::multibyte_write(word, os);
      EndianUtil<COUNT_DATA_TYPE>::write(os, count);
      nwritten_words++;
    }
  }

  //First we write the number of words to take into account
  WORD_DATA_TYPE nwords2workwith=words.size();
  Compression::multibyte_write(nwords2workwith, os);

  set<wstring>::iterator sit;
  for(sit=words.begin(); sit!=words.end(); sit++) {
    WORD_DATA_TYPE word=word2index[*sit];
    Compression::multibyte_write(word, os);
    //cerr<<"word: "<<*sit<<"\n";
  }
}

// Reads stopwords from `is`, one per line. Each line is lower-cased, empty
// lines are ignored, and every stopword read is reported on wcerr. When the
// input is exhausted n_stopwords is set to the size of the stopword set.
void 
LexTorData::read_stopwords(wistream& is) {
  wstring w;

  //Looping on the result of getline() (instead of testing eof() up front)
  //also stops when the stream is in a failed state without being at end of
  //file, e.g. a file that could not be opened; testing only eof() would
  //loop forever in that case
  while (getline(is,w)) {
    w=StringUtils::tolower(w);
    if (w.length()>0) {
      stopwords.insert(w);
      wcerr<<L"stopword: "<<w<<L"\n";
    }
  }
  n_stopwords=stopwords.size();
  wcerr<<L"# stopwords read from file: "<<n_stopwords<<L"\n";
}  

// Reads from `is`, one per line, the words to work with. Each line is
// lower-cased and empty lines are ignored; every word is added to `words`
// and registered through new_word_register(). When the input is exhausted
// n_set is set to the size of `words` and reported on wcerr.
void 
LexTorData::read_words(wistream& is) {
  wstring w;

  //Looping on the result of getline() (instead of testing eof() up front)
  //also stops when the stream is in a failed state without being at end of
  //file, e.g. a file that could not be opened; testing only eof() would
  //loop forever in that case
  while (getline(is,w)) {
    w=StringUtils::tolower(w);
    if (w.length()>0) {
      words.insert(w);
      new_word_register(w);
    }
  }
  n_set=words.size();
  wcerr<<L"# words: "<<n_set<<L"\n";
}

// For every word in `words`, builds a LexTorWord from it and `fstp` and
// stores each lexical choice it reports under that word in lexical_choices.
// n_set is then overwritten with the total number of lexical choices
// reported, and that total is printed on wcerr.
void 
LexTorData::read_lexical_choices(FSTProcessor& fstp) {
  int total_choices=0;

  for (set<wstring>::iterator w=words.begin(); w!=words.end(); ++w) {
    LexTorWord ambiguousword(*w, &fstp);
    total_choices+=ambiguousword.n_lexical_choices();

    for (int k=0; k<ambiguousword.n_lexical_choices(); ++k)
      lexical_choices[*w].insert(ambiguousword.get_lexical_choice(k,false));
  }

  n_set=total_choices;

  wcerr<<L"# lexical choices: "<<n_set<<L"\n";
}

// Returns a copy of the set of words to work with.
set<wstring>
LexTorData::get_words() {
  set<wstring> words_copy(words);
  return words_copy;
}

// Returns a copy of the set of lexical choices stored for `word`, which is
// lower-cased before the lookup. The lookup goes through operator[], so a
// word without an entry yields an empty set.
set<wstring> 
LexTorData::get_lexical_choices(const wstring& word) {
  const wstring key=StringUtils::tolower(word);
  return lexical_choices[key];
}

// Sets to `i` the number of words kept in each co-occurrence model and
// reports the stored value on wcerr.
void 
LexTorData::set_nwords_per_set(int i){
  n_words_per_set=i;

  //The stored member is printed, not the argument
  wcerr<<L"# words per co-ocurrence model: "
       <<n_words_per_set
       <<L"\n";
}

void 
LexTorData::set_cooccurrence_context(const wstring& lexical_choice, 
                                     const vector<pair<wstring, COUNT_DATA_TYPE> >& context) {
  wcerr<<L"Co-occurrence model for lexical_choice/word: "<<lexical_choice<<L"\n";

  if (context.size()==0) {
    wcerr<<L"Warning: co-occurrence model for lexical_choice/word: "<<lexical_choice<<L" is empty\n";
    wcerr<<L"It seems that training corpus is too small or thematically homogeneous\n";
    n_set--;
  }

  new_word_register(lexical_choice);

  for (unsigned int i=0; ((i<n_words_per_set)&&(i<context.size())); i++) {
    wcerr<<context[i].first<<L" "<<context[i].second<<L"\n";

    new_word_register(context[i].first);

    lexchoice_set[word2index[StringUtils::tolower(lexical_choice)]][word2index[StringUtils::tolower(context[i].first)]]=context[i].second;

    //////wordcount[word2index[StringUtils::tolower(context[i].first)]]+=context[i].second;
  }
}

void
LexTorData::ensure_stopwords_ok() {
  set<wstring>::iterator its, itw;
  set<wstring> swaux;

  //Notice that stopwords consist of lemma and first tag while words
  //consist of lemma and one (the first one) or more tags

  for(its=stopwords.begin(); its!=stopwords.end(); its++) {
    bool is_ok=true;
    for(itw=words.begin(); itw!=words.end(); itw++) {
      //cerr<<"sw: "<<*its<<" w: "<<*itw<<"\n";
      if (itw->find(*its)==0) {
      wcerr<<L"Warning: Word '"<<*itw<<L"' for which co-ocurrence models will"
          <<L" be estimated is also a stopword. ";
      wcerr<<L"Removing it from the stopwords list\n";
      is_ok=false;
      break;
      }
    }
    if(is_ok)
      swaux.insert(*its);
  }

  stopwords=swaux;

  wcerr<<n_stopwords-stopwords.size()<<L" stopwords were discarded\n";

  n_stopwords=stopwords.size();

  wcerr<<L"# stopwords finally taken into account: "<<n_stopwords<<L"\n";
}

// Reduces `s` to the form under which it is looked up in the model:
//   - `s` is lower-cased and, when it is wrapped as ^...$, both delimiters
//     are removed;
//   - if the result begins with one of the words in `words`, that word is
//     returned;
//   - otherwise the text up to and including the first '>' is returned;
//   - if there is no '>' (s could be an unknown word), the whole string is
//     returned without its leading '*' mark, if any.
// A warning is printed on wcerr when the reduction would leave nothing.
wstring 
LexTorData::reduce(const wstring& s) {
  wstring str;

  //substr() takes a length as second argument: removing both the leading
  //'^' and the trailing '$' leaves length()-2 characters
  if ((s.length()>=2) && (s[0]==L'^') && (s[s.length()-1]==L'$'))
    str=StringUtils::tolower(s.substr(1, s.length()-2));
  else
    str=StringUtils::tolower(s);

  //compare() checks only the beginning of str, whereas find() would keep
  //scanning the rest of the string when the prefix does not match
  set<wstring>::iterator it;
  for(it=words.begin(); it!=words.end(); it++) {
    if (str.compare(0, it->length(), *it)==0) {
      return (*it);
    }
  }

  wstring::size_type end=str.find(L'>');
  wstring::size_type begin=0;
  if (end==wstring::npos) { //s could correspond to an unknown word
    end=str.length();
    if ((str.length()>0) && (str[0]==L'*'))
      begin=1; // to remove the star (unknown word mark)
  }
  else
    end++; // to keep the '>' itself

  if (begin>=end) {
    wcerr<<L"Warning in LexTorData::reduce: input string: '"<<s<<L"', string after operation: '"<<str<<L"'\n";
    wcerr<<L"begin index: "<<begin<<L", end index: "<<end<<L"\n";
    begin=0;
  }

  //[begin, end) expressed as position and length
  return str.substr(begin, end-begin);
}

// Reduces the lexical choice `s`: it is lower-cased and, when it is wrapped
// as ^...$, both delimiters are removed. If the result begins with one of
// the entries of reduced_lexical_choices that entry is returned; otherwise
// the lower-cased, unwrapped string itself is returned.
wstring 
LexTorData::reduce_lexical_choice(const wstring& s) {
  wstring str;

  //substr() takes a length as second argument: removing both the leading
  //'^' and the trailing '$' leaves length()-2 characters
  if ((s.length()>=2) && (s[0]==L'^') && (s[s.length()-1]==L'$'))
    str=StringUtils::tolower(s.substr(1, s.length()-2));
  else
    str=StringUtils::tolower(s);

  //compare() checks only the beginning of str, whereas find() would keep
  //scanning the rest of the string when the prefix does not match
  set<wstring>::iterator it;
  for(it=reduced_lexical_choices.begin(); it!=reduced_lexical_choices.end(); it++) {
    if (str.compare(0, it->length(), *it)==0) {
      return (*it);
    }
  }

  //return StringUtils::substitute(str," d<", " D<");

  return str;
}

void 
LexTorData::new_word_register(const wstring& word) {
  wstring w=StringUtils::tolower(word);

  if (word2index.find(w)==word2index.end()) {
    index2word.push_back(w);
    int ind=index2word.size()-1;
    if (ind>MAX_WORD_INDEX) {
      wcerr<<L"Error: The number of words to be considered is greater that the maximum allowed by\n";
      wcerr<<L"the data type used to store words\n";
      wcerr<<L"Edit source file LexTorData.H and change the WORD_DATA_TYPE define\n";
      exit(EXIT_FAILURE);
    }
    word2index[w]=(WORD_DATA_TYPE)ind;
    n_words=index2word.size();
    wordcount[(WORD_DATA_TYPE)ind]=0;
  }
}

/*
vector<pair<WORD_DATA_TYPE, double> > 
LexTorData::get_cooccurrence_vector(const string& lexical_choice) {
  vector<pair<WORD_DATA_TYPE, double> > v;
  WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)];
  map<WORD_DATA_TYPE, COUNT_DATA_TYPE>::iterator it;

  for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++) 
    v.push_back(*it);
  
  return v;
}
*/


double 
LexTorData::get_module_lexchoice_vector(const wstring& lexical_choice) {
  WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)];
  map<WORD_DATA_TYPE, COUNT_DATA_TYPE>::iterator it;

  double module=0;

  for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++) 
    module+=(it->second)*(it->second);

  module=sqrt(module);

  return module;
}

double 
LexTorData::cosine(const wstring& reduced_lexch1, const wstring& reduced_lexch2) {
  WORD_DATA_TYPE ind_lexchoice1=word2index[StringUtils::tolower(reduced_lexch1)];
  WORD_DATA_TYPE ind_lexchoice2=word2index[StringUtils::tolower(reduced_lexch2)];
  map<WORD_DATA_TYPE, COUNT_DATA_TYPE>::iterator it;

  //We calculate the scalar product
  double scalar_product=0;
  for(it=lexchoice_set[ind_lexchoice1].begin(); it!= lexchoice_set[ind_lexchoice1].end(); it++) {
    if (lexchoice_set[ind_lexchoice2].find(it->first)!=
        lexchoice_set[ind_lexchoice2].end()) {
      scalar_product+=(it->second)*lexchoice_set[ind_lexchoice2][it->first];
    }
  }

  //We get the module of the lexchoice vectors, ||lexchoice vector||
  double module_lexch1_vector=get_module_lexchoice_vector(reduced_lexch1);
  double module_lexch2_vector=get_module_lexchoice_vector(reduced_lexch2);


  if (module_lexch1_vector==0) {
    if (LexTor::debug) {
      wcerr<<L"Warning in LexTorData::cosine: module_lexch1_vector is equal zero.\n"
        <<L"The cosine cannot be compute\n";
      wcerr<<L"reduced lexical choice: "<<reduced_lexch1<<L"\n";
    }
    return -2;
  }

  if (module_lexch2_vector==0) {
    if (LexTor::debug) {
      wcerr<<L"Warning in LexTorData::cosine: module_lexch2_vector is equal zero.\n"
        <<L"The cosine cannot be compute\n";
      wcerr<<L"reduced lexical choice: "<<reduced_lexch2<<L"\n";
    }
    return -2;
  }

  return scalar_product/(module_lexch1_vector*module_lexch2_vector);
}

Generated by  Doxygen 1.6.0   Back to index