// HMM.H -- source listing from the apertium package

/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
/**
 *  First order hidden Markov model (HMM) implementation (header)
 *
 *  @author   Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
 */

#ifndef __HMM_H
#define __HMM_H

#include <cstdio>
#include <fstream>
#include <math.h>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <cfloat>

#include <apertium/Collection.H>
#include <apertium/ConstantManager.H>
#include <apertium/MorphoStream.H>
#include <apertium/TaggerData.H>
#include <apertium/TaggerUtils.H>
#include <apertium/TaggerWord.H>

// The declarations in this header use unqualified standard-library names
// (string, vector, set, map, istream, ostream), so they rely on this
// using-directive being in effect.
using namespace std;

// Small positive constant (1e-10). Its use is not visible in this header.
// NOTE(review): presumably a floor/threshold for probabilities -- confirm
// against the implementation file.
#define ZERO 1e-10

//#define AMBIGUOUS 'ř'
//#define UNKNOWN   '*'

/** Counters in which the data for the evaluation of the PoS tagger
 *  performance is stored.
 *
 *  All counters are kept as doubles. The per-field notes below are inferred
 *  from the field names only.
 *  NOTE(review): confirm them against the code that updates this structure.
 */
struct data_tagger_eval {
  double nwords;        // presumably the number of words evaluated
  double nunknown;      // presumably the number of unknown words
  double nignored;      // presumably the number of ignored words
  double nambiguous;    // presumably the number of ambiguous words
  double nerrors_noamb; // presumably tagging errors on non-ambiguous words
  double nerrors_amb;   // presumably tagging errors on ambiguous words
  double nerrors_unk;   // presumably tagging errors on unknown words
};

/** HMM
 *  first-order hidden Markov Model
 */
00067 class HMM {
private:
  int N; //Number of states (tags)
  int M; //Number of observable outputs (ambiguity clases)
  TTag eos; // end-of-sentence tag
  set <TTag>   open_class; //Open class (unknown words are asigned this ambiguity class)
  Collection output; // Collection of ambiguity classes
  map<string, int> tags_index;
  ConstantManager constants;
  vector<string> prefer_rules;

  int nword;
  int nword_eval;
   
  double **a, **b;  // Transition (a) and emission (b) matrices  
  bool debug;  //If true, print error messages when tagging input text

  struct data_tagger_eval eval_data;
  
  /** It allocs memory for the transition (a) and the emission (b) matrices.
   *  Before calling this method the number of ambiguity classes must be known.
   *  This methos is called within read_ambiguity_classes and read_dictionary.
   *  @see: read_ambiguity_classes, read_dictionary
   */
  void init(); 
   
  /** This method returns a knwon ambiguity class that is a subset of
   *  the one received as a parameter. This is useful when a new
   *  ambiguty class is found because of changes in the morphological
   *  dictionary used by the MT system.
   *  @param c set of tags (ambiguity class)
   *  @return a known ambiguity class 
   */
  set<TTag> find_similar_ambiguity_class(set<TTag> c);
   
public:  
 
  /** Constructor
   */
  HMM(set<TTag> const &oc, map<string, int> const &ti, ConstantManager const &cm,
      vector<string> const &pr);

  /**
   *  TaggerData constructor
   */
  HMM(TaggerData &td);
  
  /** Destructor
   */
  ~HMM();
  
  /** Get the transition probabilities matrix */
  double** get_a();

  /** Get the emission probabilities matrix */
  double** get_b();

  /** Get the number of HMM states N */
  int get_number_states();

  /* Get the number of HMM ambiguity classes M */
  int get_number_ambiguity_classes();

  /* Get the collection of ambiguity classes */
  Collection get_ambiguity_classes();


  /** Used to set the end-of-sentence tag
   *  @param t the end-of-sentence tag
   */
  void set_eos(TTag t);
   
  /** Used to set the debug flag
   *
   */
  void set_debug(bool d);

  /** It reads the ambiguity classes from the stream received as
   *  input
   *  @param is the input stream
   */  
  void read_ambiguity_classes(istream& is);
  
  /** It writes the ambiguity classes to the stream received as
   *  a parameter
   *  @param iosthe output stream
   */
  void write_ambiguity_classes(ostream& os);
  
  /** It reads the probabilities (matrices a and b) from the stream 
   *  received as a parameter
   *  @param is the input stream
   */
  void read_probabilities(istream& is);

  /** It writes the probabilities (matrices a and b) to the stream 
   *  received as a parameter
   *  @param os the output stream
   */ 
  void write_probabilities(ostream& os);
  
  /** It reads the expanded dictionary received as a parameter and calculates
   *  the set of ambiguity classes that the tagger will manage.
   *  @param is the input stream with the expanded dictionary to read
   */
  void read_dictionary(FILE *is);  
           
  /** It initializes the transtion (a) and emission (b) probabilities
   *  from an untagged inputr text by means of Kupiec's method
   *  @param is the input stream with the untagged corpus to process
   */
  void init_probabilities_kupiec (FILE *is);
  
  /** It initializes the transtion (a) and emission (b) probabilities
   *  from a tagged input text by means of the expected-likelihood 
   *  estimate (ELE) method
   *  @param ftagged the input stream with the tagged corpus to process
   *  @param funtagged the same corpus to porcesss but untagged
   */   
  void init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged);

  /** It applies the forbid and enforce rules found in tagger specification.
   *  To to so the transition matrix is modified by introducing null probabilitites
   *  in the involded transitions.
   */
  void apply_rules(vector<TForbidRule> const &forbid_rules,
               vector<TEnforceAfterRule> const &enforce_rules);
   
  /** Unsupervised training algorithm (Baum-Welch implementation).
   *  @param is the input stream with the untagged corpus to process
   */  
  void train (FILE *is);  
  
  /** Tagging algorithm (Viterbi implementation).
   *  @param is the input stream with the untagged text to tag
   *  @param iseval the input stream with the tagged text to be used for evaluation
   *  @param for_hand_tagging flag that tells the method whether the output tagged text
   *         will be used as an input to a hand tagging program.
   */
  void tagger (FILE *is, FILE *iseval=NULL, bool for_hand_tagging=false);
        
  /** Prints the A matrix.
   */
  void print_A();

  /** Prints the B matrix.
   */ 
  void print_B();

  /** Prints the ambiguity classes.
   */
  void print_ambiguity_classes();

  /** Prints the evaluation results.
   */
  void print_evaluation();
  
  /** It updates the data structure that holds the tagger evaluation.
   *  @param word the word that has been tagged 
   *  @param tag the tag assigned as the correct one 
   *  @param l the MorphoStream from wich the correct tag must be read 
   *  @return the number of word that must be ignored because of the 
   *          superficial forms from the word and the MorphoStream are 
   *          not the same
   */
  int evaltagger(TaggerWord& word, TTag& tag, MorphoStream& l);    
};

#endif

// Generated by Doxygen 1.6.0