Tagger
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/tagger.properties
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/tagger.properties (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/tagger.properties (working copy)
@@ -1,23 +1,34 @@
## This is the default tagger.properties file
-MODEL = models/english/BrownModel.dat
-# MODEL = TigerKomplettModel_Mod.dat
-#MODEL = TuebaModel.dat
-# MODEL = TuebaModelKomplett_new.dat
-#MODEL = PennKomplett.sdat
-#MODEL = TigerTrainingModelKomplett.dat
+## This file is used for tagging, training and testing
-# MODEL = TuebaModel.dat
-## uncomment the following two lines if training is desirable, and change the corpus directory destination path,
-## e.g. T_CORPUS='brown' for all the files of the brown corpus which are in the 'brown/' -directory
+## uncomment for English
+#MODEL = resources/english/BrownModel.dat
-# TRAINING = false
+## uncomment for German
+MODEL = resources/german/TuebaModel.dat
## N =2 or N= 3 are supported, default N=3
N = 3
-## not yet integrated into the program at this level
-END_OF_SENT_TAG = .
+## If mapping of tags is desired, keep the following line uncommented
+DO_MAPPING = true
-## not yet integrated into the program at this level
-#MAPPING = false
\ No newline at end of file
+## Basic mapping for the Brown corpus (nltk distribution) tagset: to get 93 tags out of 473
+MAPPING = org.apache.uima.examples.tagger.TagMapping
+
+
+## Basic mapping for STTS tagset: from 54 tags onto the basic ca. 15 classes plus punctuation
+#MAPPING = org.apache.uima.examples.tagger.GrobMapping
+
+### THIS SECTION SHOULD BE MODIFIED ONLY IN CASE OF OWN MODEL TRAINING
+#FILE = D:/Jane/IBM/Tagger_komplett/tueba_tigerFormat.txt
+FILE = ../brown/
+#FILE_OUTPUT = resources/german/TuebaModel.dat
+FILE_OUTPUT = resources/english/BrownModel.dat
+#CORPUS_READER = org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+CORPUS_READER = org.apache.uima.examples.tagger.trainAndTest.BrownReader
+
+
+### FOR EVALUATION IF GOLD STANDARD CORPUS IS PRESENT
+GOLD_STANDARD = C:/TreeTagger/src/corpora/WacGold_NormalizedLength30012008
\ No newline at end of file
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/stylesheet.css
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/stylesheet.css (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/stylesheet.css (revision 0)
@@ -0,0 +1,29 @@
+/* Javadoc style sheet */
+
+/* Define colors, fonts and other style attributes here to override the defaults */
+
+/* Page background color */
+body { background-color: #FFFFFF }
+
+/* Headings */
+h1 { font-size: 145% }
+
+/* Table colors */
+.TableHeadingColor { background: #CCCCFF } /* Dark mauve */
+.TableSubHeadingColor { background: #EEEEFF } /* Light mauve */
+.TableRowColor { background: #FFFFFF } /* White */
+
+/* Font used in left-hand frame lists */
+.FrameTitleFont { font-size: 100%; font-family: Helvetica, Arial, sans-serif }
+.FrameHeadingFont { font-size: 90%; font-family: Helvetica, Arial, sans-serif }
+.FrameItemFont { font-size: 90%; font-family: Helvetica, Arial, sans-serif }
+
+/* Navigation bar fonts and colors */
+.NavBarCell1 { background-color:#EEEEFF;} /* Light mauve */
+.NavBarCell1Rev { background-color:#00008B;} /* Dark Blue */
+.NavBarFont1 { font-family: Arial, Helvetica, sans-serif; color:#000000;}
+.NavBarFont1Rev { font-family: Arial, Helvetica, sans-serif; color:#FFFFFF;}
+
+.NavBarCell2 { font-family: Arial, Helvetica, sans-serif; background-color:#FFFFFF;}
+.NavBarCell3 { font-family: Arial, Helvetica, sans-serif; background-color:#FFFFFF;}
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/index-all.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/index-all.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/index-all.html (revision 0)
@@ -0,0 +1,603 @@
+
+
+
+
+
+
+Index
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+A B C D E F G H I L M N O P R S T U V W
+
+A
+
+active_point -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ add_prefix(int, SuffixTree.Suffix) -
+Method in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+
+
+
+B
+
+bigrams -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ BrownReader - Class in org.apache.uima.examples.tagger.trainAndTest Reader for Brown Corpus from NLTK Distribution (nltk.sourceforge.net) BrownReader() -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.BrownReader
+
+
+
+
+C
+
+canonize() -
+Method in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+ capitalized(String) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Check if the token is capitalized
+ casFeat_posTag -
+Variable in class org.apache.uima.TokenAnnotation_Type
+
+ casFeat_tokenType -
+Variable in class org.apache.uima.TokenAnnotation_Type
+
+ casFeatCode_posTag -
+Variable in class org.apache.uima.TokenAnnotation_Type
+
+ casFeatCode_tokenType -
+Variable in class org.apache.uima.TokenAnnotation_Type
+
+ chars -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ corpus -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ CorpusReader - Interface in org.apache.uima.examples.tagger.trainAndTest Reads (annotated) text file(s) and transforms every word into a Token-object
+
+
+D
+
+DO_MAPPING -
+Variable in class org.apache.uima.examples.tagger.HMMTagger
+
+
+
+
+E
+
+edges -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ end_node -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+
+
+
+F
+
+featOkTst -
+Static variable in class org.apache.uima.SentenceAnnotation_Type
+
+ featOkTst -
+Static variable in class org.apache.uima.TokenAnnotation_Type
+
+ first_char_index -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+ first_char_index -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+
+
+
+G
+
+get_eval(ModelGeneration, List<String>, List<String>, List<String>) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+ get_lexicon(List<Token>) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+ get_max(double, double, double) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ get_model(String) -
+Static method in class org.apache.uima.examples.tagger.HMMTagger
+Reads a saved MODEL object from a file
+ get_ngrams(int) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Computes N-gram frequencies
+ get_transition_probs(int) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ get_word_probs(Map<String, Map<String, Double>>) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ getFSGenerator() -
+Method in class org.apache.uima.SentenceAnnotation_Type
+
+ getFSGenerator() -
+Method in class org.apache.uima.TokenAnnotation_Type
+
+ getPosTag() -
+Method in class org.apache.uima.TokenAnnotation
+getter for posTag - gets the part-of-speech of the corresponding token
+ getPosTag(int) -
+Method in class org.apache.uima.TokenAnnotation_Type
+
+ getTokenType() -
+Method in class org.apache.uima.TokenAnnotation
+getter for tokenType - gets token type
+ getTokenType(int) -
+Method in class org.apache.uima.TokenAnnotation_Type
+
+ getTypeIndexID() -
+Method in class org.apache.uima.SentenceAnnotation
+
+ getTypeIndexID() -
+Method in class org.apache.uima.TokenAnnotation
+
+ GrobMapping - Class in org.apache.uima.examples.tagger GrobMapping() -
+Constructor for class org.apache.uima.examples.tagger.GrobMapping
+
+
+
+
+H
+
+HMMTagger - Class in org.apache.uima.examples.tagger UIMA Analysis Engine that invokes HMM POS tagger. HMMTagger() -
+Constructor for class org.apache.uima.examples.tagger.HMMTagger
+
+
+
+
+I
+
+init_probs(String, Map<String, Double>) -
+Static method in class org.apache.uima.examples.tagger.Viterbi
+
+ initialize(UimaContext) -
+Method in class org.apache.uima.examples.tagger.HMMTagger
+Initialize the Annotator.
+ initialize(UimaContext) -
+Method in interface org.apache.uima.examples.tagger.Tagger
+Instantiates MODEL for current tagger
+ insert_edge(SuffixTree.Edge) -
+Method in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ isExplicit() -
+Method in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+ isImplicit() -
+Method in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+
+
+
+L
+
+lambdas2 -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ lambdas3 -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ last_char_index -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+ last_char_index -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+ logify_probs(Map<String, Map<String, Double>>) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Converts the probabilities to logarithms; this is deliberately done separately from the get_word_probs method at the initial step
+
+
+
+M
+
+main(String[]) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ main(String[]) -
+Static method in class org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+ map_tags(List) -
+Method in class org.apache.uima.examples.tagger.GrobMapping
+Defines mapping for List<Token >
+ E.g. if we need to map tags, given a list of Tokens, we need to map the
+ pos field of every Token to a different pos.
+ map_tags(List) -
+Method in interface org.apache.uima.examples.tagger.MappingInterface
+
+ map_tags(List) -
+Method in class org.apache.uima.examples.tagger.TagMapping
+Defines mapping for List<Token >
+ E.g. if we need to map tags, given a list of Tokens, we need to map the
+ pos field of every Token to a different pos.
+ MAPPING -
+Variable in class org.apache.uima.examples.tagger.HMMTagger
+
+ MappingInterface - Interface in org.apache.uima.examples.tagger Defines mapping for a tagset. MODEL -
+Variable in class org.apache.uima.examples.tagger.HMMTagger
+Model file name
+ ModelGeneration - Class in org.apache.uima.examples.tagger.trainAndTest Trains an N-gram model for the tagger, iterating over the files from some predefined training directory. ModelGeneration(List<Token>, String) -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ my_model -
+Variable in class org.apache.uima.examples.tagger.HMMTagger
+
+
+
+
+N
+
+N -
+Variable in class org.apache.uima.examples.tagger.HMMTagger
+for a bigram model N = 2, for a trigram model N = 3; N is defined in the parameter file
+ N -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ nodes -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+
+
+
+O
+
+org.apache.uima - package org.apache.uima org.apache.uima.examples.tagger - package org.apache.uima.examples.tagger org.apache.uima.examples.tagger.trainAndTest - package org.apache.uima.examples.tagger.trainAndTest origin_node -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+ OutputFile -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+
+
+
+P
+
+pos -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.Token
+
+ posList -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ process(JCas) -
+Method in class org.apache.uima.examples.tagger.HMMTagger
+Process a CAS.
+ process(JCas) -
+Method in interface org.apache.uima.examples.tagger.Tagger
+Trains a new model for the tagger if training is defined in the tagger.properties file
+ process(int, List<String>, String, Map<String, Map<String, Double>>, Map<String, Map<String, Double>>, Map<String, Double>, Map<String, Map<String, Double>>, double[], double[], double) -
+Static method in class org.apache.uima.examples.tagger.Viterbi
+
+
+
+
+R
+
+read_corpus(String, MappingInterface) -
+Method in class org.apache.uima.examples.tagger.trainAndTest.BrownReader
+Reads Brown Corpus from NLTK Distribution Format.
+ read_corpus(String, MappingInterface) -
+Method in interface org.apache.uima.examples.tagger.trainAndTest.CorpusReader
+
+ read_corpus(String, MappingInterface) -
+Method in class org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+
+
+
+
+S
+
+SentenceAnnotation - Class in org.apache.uima sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml SentenceAnnotation() -
+Constructor for class org.apache.uima.SentenceAnnotation
+Never called.
+ SentenceAnnotation(int, TOP_Type) -
+Constructor for class org.apache.uima.SentenceAnnotation
+Internal - constructor used by generator
+ SentenceAnnotation(JCas) -
+Constructor for class org.apache.uima.SentenceAnnotation
+
+ SentenceAnnotation(JCas, int, int) -
+Constructor for class org.apache.uima.SentenceAnnotation
+
+ SentenceAnnotation_Type - Class in org.apache.uima sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 SentenceAnnotation_Type(JCas, Type) -
+Constructor for class org.apache.uima.SentenceAnnotation_Type
+initialize variables to correspond with Cas Type and Features
+ setPosTag(String) -
+Method in class org.apache.uima.TokenAnnotation
+setter for posTag - sets the part-of-speech of the corresponding token
+ setPosTag(int, String) -
+Method in class org.apache.uima.TokenAnnotation_Type
+
+ setTokenType(String) -
+Method in class org.apache.uima.TokenAnnotation
+setter for tokenType - sets token type
+ setTokenType(int, String) -
+Method in class org.apache.uima.TokenAnnotation_Type
+
+ setUp() -
+Method in class unittests.TaggerTest
+Set up the test fixture
+ sm -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Computes word_probs using ModelGeneration.get_lexicon(List) frequency counts for known words.
+ sm2 -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ split_edge(SuffixTree.Suffix) -
+Method in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+ start_node -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+ suffix_node -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Node
+
+ suffix_tree -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ suffix_tree_capitalized -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ SuffixTree - Class in org.apache.uima.examples.tagger.trainAndTest Java implementation of Ukkonen's suffix tree, inspired by Mark Nelson's tutorial:
+ http://marknelson.us/1996/08/01/suffix-trees/ SuffixTree() -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ SuffixTree(String) -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ SuffixTree.Edge - Class in org.apache.uima.examples.tagger.trainAndTest Internal Class EDGE SuffixTree.Edge(int, int, int, int) -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+ SuffixTree.Node - Class in org.apache.uima.examples.tagger.trainAndTest Internal Class NODE SuffixTree.Node() -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Node
+
+ SuffixTree.Suffix - Class in org.apache.uima.examples.tagger.trainAndTest Internal Class SUFFIX SuffixTree.Suffix(int, int, int) -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+
+
+
+T
+
+Tagger - Interface in org.apache.uima.examples.tagger General tagger interface in case one would want to define further types of taggers. TaggerEvaluation - Class in org.apache.uima.examples.tagger.trainAndTest Evaluation of Tagger
+ NB. TaggerEvaluation() -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+ TaggerTest - Class in unittests TaggerTest() -
+Constructor for class unittests.TaggerTest
+
+ TagMapping - Class in org.apache.uima.examples.tagger TagMapping() -
+Constructor for class org.apache.uima.examples.tagger.TagMapping
+
+ testEnglishTagger() -
+Method in class unittests.TaggerTest
+Tests English trigram tagger
+ testGermanTagger() -
+Method in class unittests.TaggerTest
+Tests tagging for German.
+ text -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+ theta -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ Token - Class in org.apache.uima.examples.tagger.trainAndTest Defines token features. Token() -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.Token
+
+ Token(String) -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.Token
+
+ Token(String, String) -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.Token
+
+ TokenAnnotation - Class in org.apache.uima Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml TokenAnnotation() -
+Constructor for class org.apache.uima.TokenAnnotation
+Never called.
+ TokenAnnotation(int, TOP_Type) -
+Constructor for class org.apache.uima.TokenAnnotation
+Internal - constructor used by generator
+ TokenAnnotation(JCas) -
+Constructor for class org.apache.uima.TokenAnnotation
+
+ TokenAnnotation(JCas, int, int) -
+Constructor for class org.apache.uima.TokenAnnotation
+
+ TokenAnnotation_Type - Class in org.apache.uima Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 TokenAnnotation_Type(JCas, Type) -
+Constructor for class org.apache.uima.TokenAnnotation_Type
+initialize variables to correspond with Cas Type and Features
+ tokens_count_all_corpus -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ transition_probs -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Map containing N-gram probabilities
+ trigrams -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+ TT_FormatReader - Class in org.apache.uima.examples.tagger.trainAndTest TT_FormatReader() -
+Constructor for class org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+
+ type -
+Static variable in class org.apache.uima.SentenceAnnotation
+
+ type -
+Static variable in class org.apache.uima.TokenAnnotation
+
+ typeIndexID -
+Static variable in class org.apache.uima.SentenceAnnotation
+
+ typeIndexID -
+Static variable in class org.apache.uima.SentenceAnnotation_Type
+
+ typeIndexID -
+Static variable in class org.apache.uima.TokenAnnotation
+
+ typeIndexID -
+Static variable in class org.apache.uima.TokenAnnotation_Type
+
+
+
+
+U
+
+unigrams -
+Static variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Computes transition_probs using ModelGeneration.get_ngrams(int) frequency counts for N-grams.
+ unittests - package unittests
+
+
+V
+
+Viterbi - Class in org.apache.uima.examples.tagger Viterbi Algorithm: Given a model and a sequence of observations, what is the most likely
+ sequence of states in the model that produces the observations? Viterbi() -
+Constructor for class org.apache.uima.examples.tagger.Viterbi
+
+
+
+
+W
+
+word -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.Token
+
+ word_probs -
+Variable in class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+Map containing <word,tag> probabilities, that is, the probability of a certain word given a certain tag at time t: P(word_t | tag_t)
+
+
+A B C D E F G H I L M N O P R S T U V W
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/allclasses-frame.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/allclasses-frame.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/allclasses-frame.html (revision 0)
@@ -0,0 +1,64 @@
+
+
+
+
+
+
+All Classes
+
+
+
+
+
+
+
+
+
+
+All Classes
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/TokenAnnotation_Type.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/TokenAnnotation_Type.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/TokenAnnotation_Type.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.TokenAnnotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.TokenAnnotation_Type
+
+No usage of org.apache.uima.TokenAnnotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/SentenceAnnotation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/SentenceAnnotation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/SentenceAnnotation.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.SentenceAnnotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.SentenceAnnotation
+
+No usage of org.apache.uima.SentenceAnnotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/SentenceAnnotation_Type.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/SentenceAnnotation_Type.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/SentenceAnnotation_Type.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.SentenceAnnotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.SentenceAnnotation_Type
+
+No usage of org.apache.uima.SentenceAnnotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/TokenAnnotation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/TokenAnnotation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/class-use/TokenAnnotation.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.TokenAnnotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.TokenAnnotation
+
+No usage of org.apache.uima.TokenAnnotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/TokenAnnotation_Type.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/TokenAnnotation_Type.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/TokenAnnotation_Type.html (revision 0)
@@ -0,0 +1,506 @@
+
+
+
+
+
+
+TokenAnnotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima
+
+Class TokenAnnotation_Type
+
+java.lang.Object
+ org.apache.uima.jcas.cas.TOP_Type
+ org.apache.uima.jcas.cas.AnnotationBase_Type
+ org.apache.uima.jcas.tcas.Annotation_Type
+ org.apache.uima.TokenAnnotation_Type
+
+
+
+public class TokenAnnotation_Type extends org.apache.uima.jcas.tcas.Annotation_Type
+
+
+
+Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
+
+
+
+
+
+
+
+
+
+
+
+
+
+Fields inherited from class org.apache.uima.jcas.cas.TOP_Type
+
+
+casImpl, casType, casTypeCode, instanceOf_Type, jcas, ll_cas, lowLevelArrayBoundChecks, lowLevelTypeChecks, useExistingInstance
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+TokenAnnotation_Type (org.apache.uima.jcas.JCas jcas,
+ org.apache.uima.cas.Type casType)
+
+
+ initialize variables to correspond with Cas Type and Features
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+protected org.apache.uima.cas.impl.FSGenerator
+getFSGenerator ()
+
+
+
+
+
+
+ java.lang.String
+getPosTag (int addr)
+
+
+
+
+
+
+ java.lang.String
+getTokenType (int addr)
+
+
+
+
+
+
+ void
+setPosTag (int addr,
+ java.lang.String v)
+
+
+
+
+
+
+ void
+setTokenType (int addr,
+ java.lang.String v)
+
+
+
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.tcas.Annotation_Type
+
+
+getBegin, getCoveredText, getEnd, setBegin, setEnd
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.AnnotationBase_Type
+
+
+getSofa, getView
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.TOP_Type
+
+
+addToIndexes, checkType, getTypeIndexID, invalidTypeArg, noObjCreate, removeFromIndexes
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+typeIndexID
+
+public static final int typeIndexID
+
+
+
+
+
+
+
+featOkTst
+
+public static final boolean featOkTst
+
+
+
+
+
+
+
+casFeat_tokenType
+
+final org.apache.uima.cas.Feature casFeat_tokenType
+
+
+
+
+
+
+
+casFeatCode_tokenType
+
+final int casFeatCode_tokenType
+
+
+
+
+
+
+
+casFeat_posTag
+
+final org.apache.uima.cas.Feature casFeat_posTag
+
+
+
+
+
+
+
+casFeatCode_posTag
+
+final int casFeatCode_posTag
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TokenAnnotation_Type
+
+public TokenAnnotation_Type (org.apache.uima.jcas.JCas jcas,
+ org.apache.uima.cas.Type casType)
+
+initialize variables to correspond with Cas Type and Features
+
+
+
+
+
+
+
+
+
+getFSGenerator
+
+protected org.apache.uima.cas.impl.FSGenerator getFSGenerator ()
+
+
+Overrides: getFSGenerator in class org.apache.uima.jcas.tcas.Annotation_Type
+
+
+
+
+
+
+
+
+getTokenType
+
+public java.lang.String getTokenType (int addr)
+
+
+
+
+
+
+
+
+setTokenType
+
+public void setTokenType (int addr,
+ java.lang.String v)
+
+
+
+
+
+
+
+
+getPosTag
+
+public java.lang.String getPosTag (int addr)
+
+
+
+
+
+
+
+
+setPosTag
+
+public void setPosTag (int addr,
+ java.lang.String v)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/SentenceAnnotation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/SentenceAnnotation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/SentenceAnnotation.html (revision 0)
@@ -0,0 +1,442 @@
+
+
+
+
+
+
+SentenceAnnotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima
+
+Class SentenceAnnotation
+
+java.lang.Object
+ org.apache.uima.cas.impl.FeatureStructureImpl
+ org.apache.uima.jcas.cas.TOP
+ org.apache.uima.jcas.cas.AnnotationBase
+ org.apache.uima.jcas.tcas.Annotation
+ org.apache.uima.SentenceAnnotation
+
+
+All Implemented Interfaces: java.lang.Cloneable, org.apache.uima.cas.AnnotationBaseFS, org.apache.uima.cas.FeatureStructure, org.apache.uima.cas.text.AnnotationFS
+
+
+
+public class SentenceAnnotation extends org.apache.uima.jcas.tcas.Annotation
+
+
+
+sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
+
+
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+static int
+type
+
+
+
+
+
+
+static int
+typeIndexID
+
+
+
+
+
+
+
+
+Fields inherited from class org.apache.uima.jcas.cas.TOP
+
+
+addr, jcasType
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+
+protected
+SentenceAnnotation ()
+
+
+ Never called.
+
+
+
+
+SentenceAnnotation (int addr,
+ org.apache.uima.jcas.cas.TOP_Type type)
+
+
+ Internal - constructor used by generator
+
+
+
+
+SentenceAnnotation (org.apache.uima.jcas.JCas jcas)
+
+
+
+
+
+
+
+SentenceAnnotation (org.apache.uima.jcas.JCas jcas,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.tcas.Annotation
+
+
+getBegin, getCoveredText, getEnd, getStart, setBegin, setEnd
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.AnnotationBase
+
+
+getSofa, getView
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.TOP
+
+
+addToIndexes, addToIndexes, equals, getAddress, getCAS, getCASImpl, getLowLevelCas, hashCode, removeFromIndexes, removeFromIndexes
+
+
+
+
+
+Methods inherited from class org.apache.uima.cas.impl.FeatureStructureImpl
+
+
+clone, getBooleanValue, getByteValue, getDoubleValue, getFeatureValue, getFeatureValueAsString, getFloatValue, getIntValue, getLongValue, getShortValue, getStringValue, getType, prettyPrint, prettyPrint, prettyPrint, setBooleanValue, setByteValue, setDoubleValue, setFeatureValue, setFeatureValueFromString, setFloatValue, setIntValue, setLongValue, setShortValue, setStringValue, toString, toString
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+finalize, getClass, notify, notifyAll, wait, wait, wait
+
+
+
+
+
+Methods inherited from interface org.apache.uima.cas.AnnotationBaseFS
+
+
+getView
+
+
+
+
+
+Methods inherited from interface org.apache.uima.cas.FeatureStructure
+
+
+clone, equals, getBooleanValue, getByteValue, getCAS, getDoubleValue, getFeatureValue, getFeatureValueAsString, getFloatValue, getIntValue, getLongValue, getShortValue, getStringValue, getType, hashCode, setBooleanValue, setByteValue, setDoubleValue, setFeatureValue, setFeatureValueFromString, setFloatValue, setIntValue, setLongValue, setShortValue, setStringValue
+
+
+
+
+
+
+
+
+
+
+
+typeIndexID
+
+public static final int typeIndexID
+
+
+
+
+
+
+
+type
+
+public static final int type
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SentenceAnnotation
+
+protected SentenceAnnotation ()
+
+Never called. Disable default constructor
+
+
+
+
+
+SentenceAnnotation
+
+public SentenceAnnotation (int addr,
+ org.apache.uima.jcas.cas.TOP_Type type)
+
+Internal - constructor used by generator
+
+
+
+
+
+SentenceAnnotation
+
+public SentenceAnnotation (org.apache.uima.jcas.JCas jcas)
+
+
+
+
+
+SentenceAnnotation
+
+public SentenceAnnotation (org.apache.uima.jcas.JCas jcas,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+getTypeIndexID
+
+public int getTypeIndexID ()
+
+
+Overrides: getTypeIndexID in class org.apache.uima.jcas.tcas.Annotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/SentenceAnnotation_Type.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/SentenceAnnotation_Type.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/SentenceAnnotation_Type.html (revision 0)
@@ -0,0 +1,354 @@
+
+
+
+
+
+
+SentenceAnnotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima
+
+Class SentenceAnnotation_Type
+
+java.lang.Object
+ org.apache.uima.jcas.cas.TOP_Type
+ org.apache.uima.jcas.cas.AnnotationBase_Type
+ org.apache.uima.jcas.tcas.Annotation_Type
+ org.apache.uima.SentenceAnnotation_Type
+
+
+
+public class SentenceAnnotation_Type extends org.apache.uima.jcas.tcas.Annotation_Type
+
+
+
+sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
+
+
+
+
+
+
+
+
+
+
+
+
+
+Fields inherited from class org.apache.uima.jcas.cas.TOP_Type
+
+
+casImpl, casType, casTypeCode, instanceOf_Type, jcas, ll_cas, lowLevelArrayBoundChecks, lowLevelTypeChecks, useExistingInstance
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+SentenceAnnotation_Type (org.apache.uima.jcas.JCas jcas,
+ org.apache.uima.cas.Type casType)
+
+
+ initialize variables to correspond with Cas Type and Features
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+protected org.apache.uima.cas.impl.FSGenerator
+getFSGenerator ()
+
+
+
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.tcas.Annotation_Type
+
+
+getBegin, getCoveredText, getEnd, setBegin, setEnd
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.AnnotationBase_Type
+
+
+getSofa, getView
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.TOP_Type
+
+
+addToIndexes, checkType, getTypeIndexID, invalidTypeArg, noObjCreate, removeFromIndexes
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+typeIndexID
+
+public static final int typeIndexID
+
+
+
+
+
+
+
+featOkTst
+
+public static final boolean featOkTst
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SentenceAnnotation_Type
+
+public SentenceAnnotation_Type (org.apache.uima.jcas.JCas jcas,
+ org.apache.uima.cas.Type casType)
+
+initialize variables to correspond with Cas Type and Features
+
+
+
+
+
+
+
+
+
+getFSGenerator
+
+protected org.apache.uima.cas.impl.FSGenerator getFSGenerator ()
+
+
+Overrides: getFSGenerator in class org.apache.uima.jcas.tcas.Annotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-frame.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-frame.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-frame.html (revision 0)
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+org.apache.uima
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-use.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-use.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-use.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Package org.apache.uima
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Package org.apache.uima
+
+No usage of org.apache.uima
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-summary.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-summary.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-summary.html (revision 0)
@@ -0,0 +1,169 @@
+
+
+
+
+
+
+org.apache.uima
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Package org.apache.uima
+
+
+
+
+
+Class Summary
+
+
+SentenceAnnotation
+sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
+
+
+SentenceAnnotation_Type
+sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
+
+
+TokenAnnotation
+Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
+
+
+TokenAnnotation_Type
+Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-tree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-tree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/package-tree.html (revision 0)
@@ -0,0 +1,167 @@
+
+
+
+
+
+
+org.apache.uima Class Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hierarchy For Package org.apache.uima
+
+
+
+Package Hierarchies: All Packages
+
+
+Class Hierarchy
+
+
+java.lang.Object
+org.apache.uima.cas.impl.FeatureStructureImpl (implements java.lang.Cloneable, org.apache.uima.cas.FeatureStructure)
+
+org.apache.uima.jcas.cas.TOP
+org.apache.uima.jcas.cas.AnnotationBase (implements org.apache.uima.cas.AnnotationBaseFS)
+
+org.apache.uima.jcas.tcas.Annotation (implements org.apache.uima.cas.text.AnnotationFS)
+
+
+
+
+ org.apache.uima.jcas.cas.TOP_Type
+org.apache.uima.jcas.cas.AnnotationBase_Type
+org.apache.uima.jcas.tcas.Annotation_Type
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/MappingInterface.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/MappingInterface.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/MappingInterface.html (revision 0)
@@ -0,0 +1,213 @@
+
+
+
+
+
+
+MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Interface MappingInterface
+
+All Known Implementing Classes: GrobMapping , TagMapping
+
+
+
+public interface MappingInterface
+
+
+
+Defines mapping for a tagset. For example, one may wish to map a more detailed
+ tagset to a less distinctive one (e.g. tell a program to tag all verbs as just
+ VERB instead of differentiating between verb infinitive, verb imperative, etc.).
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ java.util.List
+map_tags (java.util.List pos)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+map_tags
+
+java.util.List map_tags (java.util.List pos)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/ModelGeneration.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/ModelGeneration.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/ModelGeneration.html (revision 0)
@@ -0,0 +1,778 @@
+
+
+
+
+
+
+ModelGeneration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class ModelGeneration
+
+java.lang.Object
+ org.apache.uima.examples.tagger.ModelGeneration
+
+
+All Implemented Interfaces: java.io.Serializable
+
+
+
+public class ModelGeneration extends java.lang.Object implements java.io.Serializable
+
+
+
+Trains an N-gram model for the tagger, iterating over the files from some predefined training directory.
+ Writes the resulting model to a binary file.
+
+
+ NB: At the moment, both bi- and trigram statistics are saved in one model file.
+
+
+
+
+See Also: Serialized Form
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+bigrams
+
+
+
+
+
+
+(package private) java.util.List
+corpus
+
+
+
+
+
+
+ double[]
+lambdas2
+
+
+
+
+
+
+ double[]
+lambdas3
+
+
+
+
+
+
+(package private) int
+N
+
+
+
+
+
+
+(package private) java.lang.String
+OutputFile
+
+
+
+
+
+
+(package private) static java.util.List<java.lang.String>
+posList
+
+
+
+
+
+
+(package private) static java.util.Map
+sm
+
+
+ Computes word_probs using get_lexicon(List) frequency counts for known words..
+
+
+
+(package private) static java.util.Map
+sm2
+
+
+
+
+
+
+ java.util.Map
+suffix_tree
+
+
+
+
+
+
+ java.util.Map
+suffix_tree_capitalized
+
+
+
+
+
+
+ double
+theta
+
+
+
+
+
+
+(package private) static int
+tokens_count_all_corpus
+
+
+
+
+
+
+ java.util.Map<java.lang.String,java.lang.Double>
+transition_probs
+
+
+ Map containing N-gram probabilities
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+trigrams
+
+
+
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+unigrams
+
+
+ Computes transition_probs using get_ngrams(int) frequency counts for N-grams..
+
+
+
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+word_probs
+
+
+ Map containing <word,tag> probabilities, i.e. the probability of a certain word given a certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+ModelGeneration (java.util.List<Token > corpus,
+ java.lang.String OutputFile)
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+(package private) static boolean
+capitalized (java.lang.String word)
+
+
+ Checks whether the token is capitalized
+
+
+
+(package private) static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+get_lexicon (java.util.List<Token > corpus)
+
+
+ Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+
+
+
+(package private) static double
+get_max (double a,
+ double b,
+ double c)
+
+
+
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+get_ngrams (int N)
+
+
+ Computes N-gram frequencies
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+get_transition_probs (int N)
+
+
+
+
+
+
+(package private) static java.util.List<java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>>
+get_word_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> corpus)
+
+
+
+
+
+
+(package private) static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+logify_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> probs)
+
+
+ Make LOGs out of probabilities.. there was a reason to separate it from the get_word_probs method at the initial step
+
+
+
+static void
+main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+suffix_tree
+
+public java.util.Map suffix_tree
+
+
+
+
+
+
+
+suffix_tree_capitalized
+
+public java.util.Map suffix_tree_capitalized
+
+
+
+
+
+
+
+word_probs
+
+public java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> word_probs
+
+Map containing <word,tag> probabilities, i.e. the probability of a certain word given a certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+
+transition_probs
+
+public java.util.Map<java.lang.String,java.lang.Double> transition_probs
+
+Map containing N-gram probabilities
+
+
+
+
+
+
+
+posList
+
+static java.util.List<java.lang.String> posList
+
+
+
+
+
+
+
+N
+
+int N
+
+
+
+
+
+
+
+lambdas2
+
+public double[] lambdas2
+
+
+
+
+
+
+
+lambdas3
+
+public double[] lambdas3
+
+
+
+
+
+
+
+theta
+
+public double theta
+
+
+
+
+
+
+
+OutputFile
+
+transient java.lang.String OutputFile
+
+
+
+
+
+
+
+corpus
+
+transient java.util.List corpus
+
+
+
+
+
+
+
+tokens_count_all_corpus
+
+static int tokens_count_all_corpus
+
+
+
+
+
+
+
+sm
+
+static java.util.Map sm
+
+Computes word_probs using get_lexicon(List) frequency counts for known words..
+ TO_DO: ADD SMOOTHING FOR UNKNOWNS?? OR add smoothing directly when come across unknown..
+
+
+
+
+
+
+
+sm2
+
+static java.util.Map sm2
+
+
+
+
+
+
+
+unigrams
+
+static java.util.Map<java.lang.String,java.lang.Double> unigrams
+
+Computes transition_probs using get_ngrams(int) frequency counts for N-grams..
+
+
+
+
+
+
+
+bigrams
+
+static java.util.Map<java.lang.String,java.lang.Double> bigrams
+
+
+
+
+
+
+
+trigrams
+
+static java.util.Map<java.lang.String,java.lang.Double> trigrams
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+ModelGeneration
+
+public ModelGeneration (java.util.List<Token > corpus,
+ java.lang.String OutputFile)
+
+
+Parameters: N - N=1, 2 or 3; InputDir - input directory name; OutputFile - output file name
+ MapBrownToPenn TagMapping
+
+
+
+
+
+
+
+
+capitalized
+
+static boolean capitalized (java.lang.String word)
+
+Checks whether the token is capitalized
+
+
+
+
+
+
+
+
+
+
+
+get_lexicon
+
+static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> get_lexicon (java.util.List<Token > corpus)
+
+Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+
+
+
+
+
+Parameters: corpus - list containing all tokens of the training corpus of the type Token
+Returns: map containing frequency counts for <word, its pos>
+
+
+
+
+
+get_word_probs
+
+static java.util.List<java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>> get_word_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> corpus)
+
+
+
+
+
+
+
+
+
+
+
+logify_probs
+
+static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> logify_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> probs)
+
+Make LOGs out of probabilities.. there was a reason to separate it from the get_word_probs method at the initial step
+
+
+
+
+
+
+
+
+
+
+
+get_ngrams
+
+static java.util.Map<java.lang.String,java.lang.Double> get_ngrams (int N)
+ throws java.lang.IllegalArgumentException
+
+Computes N-gram frequencies
+
+
+
+
+
+Parameters: N -
+Returns: Map N-grams of parts-of-speech, where N = 1, 2 or 3
+Throws:
+java.lang.IllegalArgumentException
+
+
+
+
+
+get_transition_probs
+
+static java.util.Map<java.lang.String,java.lang.Double> get_transition_probs (int N)
+ throws java.lang.IllegalArgumentException
+
+
+
+
+
+
+Throws:
+java.lang.IllegalArgumentException
+
+
+
+
+
+get_max
+
+static double get_max (double a,
+ double b,
+ double c)
+
+
+
+
+
+
+
+
+
+
+
+main
+
+public static void main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Token.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Token.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Token.html (revision 0)
@@ -0,0 +1,306 @@
+
+
+
+
+
+
+Token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class Token
+
+java.lang.Object
+ org.apache.uima.examples.tagger.Token
+
+
+
+public class Token extends java.lang.Object
+
+
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+ java.lang.String
+pos
+
+
+
+
+
+
+ java.lang.String
+word
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+Token ()
+
+
+
+
+
+Token (java.lang.String word)
+
+
+
+
+
+Token (java.lang.String word,
+ java.lang.String pos)
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+pos
+
+public java.lang.String pos
+
+
+
+
+
+
+
+word
+
+public java.lang.String word
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+Token
+
+public Token ()
+
+
+
+
+
+Token
+
+public Token (java.lang.String word)
+
+
+
+
+
+Token
+
+public Token (java.lang.String word,
+ java.lang.String pos)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TT_FormatReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TT_FormatReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TT_FormatReader.html (revision 0)
@@ -0,0 +1,280 @@
+
+
+
+
+
+
+TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class TT_FormatReader
+
+java.lang.Object
+ org.apache.uima.examples.tagger.TT_FormatReader
+
+
+All Implemented Interfaces: CorpusReader
+
+
+
+public class TT_FormatReader extends java.lang.Object implements CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TT_FormatReader
+
+public TT_FormatReader ()
+
+
+
+
+
+
+
+
+
+read_corpus
+
+public java.util.List<Token > read_corpus (java.lang.String file,
+ MappingInterface Mapping)
+
+
+Specified by: read_corpus in interface CorpusReader
+
+
+
+
+
+
+
+
+read_corpus
+
+public java.util.List<Token > read_corpus (java.lang.String file)
+
+
+Specified by: read_corpus in interface CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/ModelGenerationBytes.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/ModelGenerationBytes.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/ModelGenerationBytes.html (revision 0)
@@ -0,0 +1,754 @@
+
+
+
+
+
+
+ModelGenerationBytes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class ModelGenerationBytes
+
+java.lang.Object
+ org.apache.uima.examples.tagger.ModelGenerationBytes
+
+
+All Implemented Interfaces: java.io.Serializable
+
+
+
+public class ModelGenerationBytes extends java.lang.Object implements java.io.Serializable
+
+
+
+Trains an N-gram model for the tagger, iterating over the files from some predefined training
+ directory. Writes the resulting model to a binary file.
+
+
+ NB: At the moment, both bi- and trigram statistics are saved in one model file.
+
+
+
+
+See Also: Serialized Form
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+(package private) java.util.Map<java.lang.String,java.lang.Double>
+bigrams
+
+
+
+
+
+
+(package private) java.util.List
+corpus
+
+
+
+
+
+
+(package private) java.util.Map<java.lang.String,java.lang.Integer>
+counts
+
+
+
+
+
+
+(package private) java.lang.String
+InputDir
+
+
+
+
+
+
+(package private) double[]
+lambdas2
+
+
+
+
+
+
+(package private) double[]
+lambdas3
+
+
+
+
+
+
+(package private) int
+N
+
+
+
+
+
+
+(package private) java.lang.String
+OutputFile
+
+
+
+
+
+
+(package private) java.util.List<java.lang.String>
+posList
+
+
+
+
+
+
+ java.util.Map
+posMap
+
+
+
+
+
+
+(package private) java.util.Map
+sm
+
+
+ Computes word_probs using get_lexicon(List) frequency counts for known words..
+
+
+
+(package private) java.util.Map
+sm2
+
+
+
+
+
+
+ java.util.Map
+suffix_tree
+
+
+
+
+
+
+ java.util.Map
+suffix_tree_capitalized
+
+
+
+
+
+
+ double
+theta
+
+
+
+
+
+
+(package private) int
+tokens_count_all_corpus
+
+
+
+
+
+
+ java.util.Map
+transition_probs
+
+
+ Map containing N-gram probabilities
+
+
+
+(package private) java.util.Map<java.lang.String,java.lang.Double>
+trigrams
+
+
+
+
+
+
+(package private) java.util.Map<java.lang.String,java.lang.Double>
+unigrams
+
+
+ Computes transition_probs using get_ngrams(int) frequency counts for N-grams..
+
+
+
+ java.util.Map
+word_probs
+
+
+ Map containing <word,tag> probabilities, i.e. the probability of a certain word given a
+ certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ double[]
+calculate_lambda (int N)
+
+
+ Computes alphas for linear interpolation smoothing of unknown n-grams
+
+
+
+(package private) static boolean
+capitalized (java.lang.String word)
+
+
+ Checks whether the token is capitalized
+
+
+
+ java.util.Map
+get_transition_probs (int N)
+
+
+
+
+
+
+ java.util.List
+get_word_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.Byte,java.lang.Double>> corpus)
+
+
+
+
+
+
+static void
+main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+posMap
+
+public java.util.Map posMap
+
+
+
+
+
+
+
+suffix_tree
+
+public java.util.Map suffix_tree
+
+
+
+
+
+
+
+suffix_tree_capitalized
+
+public java.util.Map suffix_tree_capitalized
+
+
+
+
+
+
+
+word_probs
+
+public java.util.Map word_probs
+
+Map containing <word,tag> probabilities, i.e. the probability of a certain word given a
+ certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+
+transition_probs
+
+public java.util.Map transition_probs
+
+Map containing N-gram probabilities
+
+
+
+
+
+
+
+posList
+
+transient java.util.List<java.lang.String> posList
+
+
+
+
+
+
+
+N
+
+int N
+
+
+
+
+
+
+
+lambdas2
+
+double[] lambdas2
+
+
+
+
+
+
+
+lambdas3
+
+double[] lambdas3
+
+
+
+
+
+
+
+theta
+
+public double theta
+
+
+
+
+
+
+
+counts
+
+java.util.Map<java.lang.String,java.lang.Integer> counts
+
+
+
+
+
+
+
+InputDir
+
+transient java.lang.String InputDir
+
+
+
+
+
+
+
+OutputFile
+
+transient java.lang.String OutputFile
+
+
+
+
+
+
+
+corpus
+
+transient java.util.List corpus
+
+
+
+
+
+
+
+tokens_count_all_corpus
+
+int tokens_count_all_corpus
+
+
+
+
+
+
+
+sm
+
+transient java.util.Map sm
+
+Computes word_probs using get_lexicon(List) frequency counts for known words..
+ TO_DO: ADD SMOOTHING FOR UNKNOWNS?? OR add smoothing directly when come across unknown..
+
+
+
+
+
+
+
+sm2
+
+transient java.util.Map sm2
+
+
+
+
+
+
+
+unigrams
+
+java.util.Map<java.lang.String,java.lang.Double> unigrams
+
+Computes transition_probs using get_ngrams(int) frequency counts for N-grams..
+
+
+
+
+
+
+
+bigrams
+
+java.util.Map<java.lang.String,java.lang.Double> bigrams
+
+
+
+
+
+
+
+trigrams
+
+java.util.Map<java.lang.String,java.lang.Double> trigrams
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+ModelGenerationBytes
+
+public ModelGenerationBytes (java.util.List<Token > corpus,
+ java.lang.String OutputFile)
+
+
+Parameters: N - N=1, 2 or 3; InputDir - input directory name; OutputFile - output file name MapBrownToPenn TagMapping
+
+
+
+
+
+
+
+
+capitalized
+
+static boolean capitalized (java.lang.String word)
+
+Checks whether the token is capitalized
+
+
+
+
+
+
+
+
+
+
+
+get_word_probs
+
+public java.util.List get_word_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.Byte,java.lang.Double>> corpus)
+
+
+
+
+
+
+
+
+
+
+
+get_transition_probs
+
+public java.util.Map get_transition_probs (int N)
+ throws java.lang.IllegalArgumentException
+
+
+
+
+
+
+Throws:
+java.lang.IllegalArgumentException
+
+
+
+
+
+calculate_lambda
+
+public double[] calculate_lambda (int N)
+
+Computes alphas for linear interpolation smoothing of unknown n-grams
+
+
+
+
+
+Parameters: N - N-gram; currently lambdas are calculated as in (Brants, 2000)
+
+
+
+
+
+main
+
+public static void main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-frame.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-frame.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-frame.html (revision 0)
@@ -0,0 +1,51 @@
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.html (revision 0)
@@ -0,0 +1,431 @@
+
+
+
+
+
+
+SuffixTree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class SuffixTree
+
+java.lang.Object
+ org.apache.uima.examples.tagger.SuffixTree
+
+
+
+public class SuffixTree extends java.lang.Object
+
+
+
+Java implementation of Ukkonen's suffix tree, inspired by Mark Nelson's tutorial:
+ http://marknelson.us/1996/08/01/suffix-trees/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+text
+
+public java.lang.String text
+
+
+
+
+
+
+
+nodes
+
+public java.util.List<SuffixTree.Node > nodes
+
+
+
+
+
+
+
+edges
+
+public java.util.Map edges
+
+
+
+
+
+
+
+chars
+
+char[] chars
+
+
+
+
+
+
+
+active_point
+
+SuffixTree.Suffix active_point
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree
+
+public SuffixTree ()
+
+
+
+
+
+SuffixTree
+
+public SuffixTree (java.lang.String text)
+
+
+
+
+
+
+
+
+
+insert_edge
+
+public void insert_edge (SuffixTree.Edge edge)
+
+
+
+
+
+
+
+
+add_prefix
+
+public void add_prefix (int last_char,
+ SuffixTree.Suffix active_point)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-use.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-use.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-use.html (revision 0)
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+Uses of Package org.apache.uima.examples.tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Package org.apache.uima.examples.tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-summary.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-summary.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-summary.html (revision 0)
@@ -0,0 +1,186 @@
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Package org.apache.uima.examples.tagger
+
+
+
+
+
+Interface Summary
+
+
+MappingInterface
+Defines mapping for a tagset.
+
+
+Tagger
+General tagger interface in case one would want to define further types of taggers.
+
+
+
+
+
+
+
+
+
+Class Summary
+
+
+GrobMapping
+
+
+
+HMMTagger
+UIMA Analysis Engine that invokes HMM POS tagger.
+
+
+TagMapping
+
+
+
+Viterbi
+Viterbi Algorithm: Given a model and a sequence of observations, what is the most likely
+ sequence of states in the model that produces the observations?
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Suffix.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Suffix.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Suffix.html (revision 0)
@@ -0,0 +1,371 @@
+
+
+
+
+
+
+SuffixTree.Suffix
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class SuffixTree.Suffix
+
+java.lang.Object
+ org.apache.uima.examples.tagger.SuffixTree.Suffix
+
+
+Enclosing class: SuffixTree
+
+
+
+ class SuffixTree.Suffix extends java.lang.Object
+
+
+
+Internal Class SUFFIX
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+SuffixTree.Suffix (int node,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+(package private) void
+canonize ()
+
+
+
+
+
+
+(package private) boolean
+isExplicit ()
+
+
+
+
+
+
+(package private) boolean
+isImplicit ()
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+origin_node
+
+int origin_node
+
+
+
+
+
+
+
+first_char_index
+
+int first_char_index
+
+
+
+
+
+
+
+last_char_index
+
+int last_char_index
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree.Suffix
+
+public SuffixTree.Suffix (int node,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+isExplicit
+
+boolean isExplicit ()
+
+
+
+
+
+
+
+
+isImplicit
+
+boolean isImplicit ()
+
+
+
+
+
+
+
+
+canonize
+
+void canonize ()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TagMapping.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TagMapping.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TagMapping.html (revision 0)
@@ -0,0 +1,264 @@
+
+
+
+
+
+
+TagMapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class TagMapping
+
+java.lang.Object
+ org.apache.uima.examples.tagger.TagMapping
+
+
+All Implemented Interfaces: MappingInterface
+
+
+
+ class TagMapping extends java.lang.Object implements MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+TagMapping ()
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ java.util.List
+map_tags (java.util.List tokens)
+
+
+ Defines mapping for List<Token >
+ E.g. if we need to map tags, given a list of Tokens, we need to map the
+ pos field of every Token to a different pos.
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TagMapping
+
+TagMapping ()
+
+
+
+
+
+
+
+
+
+map_tags
+
+public java.util.List map_tags (java.util.List tokens)
+
+Defines mapping for List<Token >
+ E.g. if we need to map tags, given a list of Tokens, we need to map the
+ pos field of every Token to a different pos.
+ Basically the mapping performed in this class is just a case of simple "normalization",
+ we just discard compound tags of the Brown corpus here.
+
+
+Specified by: map_tags in interface MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Edge.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Edge.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Edge.html (revision 0)
@@ -0,0 +1,353 @@
+
+
+
+
+
+
+SuffixTree.Edge
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class SuffixTree.Edge
+
+java.lang.Object
+ org.apache.uima.examples.tagger.SuffixTree.Edge
+
+
+Enclosing class: SuffixTree
+
+
+
+public class SuffixTree.Edge extends java.lang.Object
+
+
+
+Internal Class EDGE
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+SuffixTree.Edge (int parent_node,
+ int end_node,
+ int first_char_index,
+ int last_char_index)
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+first_char_index
+
+public int first_char_index
+
+
+
+
+
+
+
+last_char_index
+
+public int last_char_index
+
+
+
+
+
+
+
+start_node
+
+int start_node
+
+
+
+
+
+
+
+end_node
+
+public int end_node
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree.Edge
+
+public SuffixTree.Edge (int parent_node,
+ int end_node,
+ int first_char_index,
+ int last_char_index)
+
+
+
+
+
+
+
+
+
+split_edge
+
+public int split_edge (SuffixTree.Suffix suffix)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-tree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-tree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/package-tree.html (revision 0)
@@ -0,0 +1,165 @@
+
+
+
+
+
+
+org.apache.uima.examples.tagger Class Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hierarchy For Package org.apache.uima.examples.tagger
+
+
+
+Package Hierarchies: All Packages
+
+
+Class Hierarchy
+
+
+java.lang.Object
+org.apache.uima.analysis_component.AnalysisComponent_ImplBase (implements org.apache.uima.analysis_component.AnalysisComponent)
+
+org.apache.uima.analysis_component.Annotator_ImplBase
+org.apache.uima.analysis_component.JCasAnnotator_ImplBase
+org.apache.uima.examples.tagger.HMMTagger (implements org.apache.uima.examples.tagger.Tagger )
+
+
+
+ org.apache.uima.examples.tagger.GrobMapping (implements org.apache.uima.examples.tagger.MappingInterface )
+ org.apache.uima.examples.tagger.TagMapping (implements org.apache.uima.examples.tagger.MappingInterface )
+ org.apache.uima.examples.tagger.Viterbi
+
+
+Interface Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/ModelGeneration.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/ModelGeneration.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/ModelGeneration.html (revision 0)
@@ -0,0 +1,212 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.ModelGeneration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.ModelGeneration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/MappingInterface.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/MappingInterface.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/MappingInterface.html (revision 0)
@@ -0,0 +1,249 @@
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Token.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Token.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Token.html (revision 0)
@@ -0,0 +1,227 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.Token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.Token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method parameters in org.apache.uima.examples.tagger.trainAndTest with type arguments of type Token
+
+
+
+(package private) static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+ModelGeneration. get_lexicon (java.util.List<Token > corpus)
+
+
+ Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TT_FormatReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TT_FormatReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TT_FormatReader.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TT_FormatReader
+
+No usage of org.apache.uima.examples.tagger.TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/ModelGenerationBytes.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/ModelGenerationBytes.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/ModelGenerationBytes.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.ModelGenerationBytes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.ModelGenerationBytes
+
+No usage of org.apache.uima.examples.tagger.ModelGenerationBytes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree
+
+No usage of org.apache.uima.examples.tagger.SuffixTree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TagMapping.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TagMapping.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TagMapping.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TagMapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TagMapping
+
+No usage of org.apache.uima.examples.tagger.TagMapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Suffix.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Suffix.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Suffix.html (revision 0)
@@ -0,0 +1,202 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree.Suffix
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree.Suffix
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Edge.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Edge.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Edge.html (revision 0)
@@ -0,0 +1,177 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree.Edge
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree.Edge
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/CorpusReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/CorpusReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/CorpusReader.html (revision 0)
@@ -0,0 +1,185 @@
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TaggerEvaluation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TaggerEvaluation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TaggerEvaluation.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TaggerEvaluation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TaggerEvaluation
+
+No usage of org.apache.uima.examples.tagger.TaggerEvaluation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TestExperiments.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TestExperiments.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/TestExperiments.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TestExperiments
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.TestExperiments
+
+No usage of org.apache.uima.examples.tagger.TestExperiments
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Viterbi.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Viterbi.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Viterbi.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.Viterbi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.Viterbi
+
+No usage of org.apache.uima.examples.tagger.Viterbi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/GrobMapping.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/GrobMapping.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/GrobMapping.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.GrobMapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.GrobMapping
+
+No usage of org.apache.uima.examples.tagger.GrobMapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Node.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Node.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/SuffixTree.Node.html (revision 0)
@@ -0,0 +1,177 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree.Node
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.SuffixTree.Node
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/MapBrownToPenn.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/MapBrownToPenn.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/MapBrownToPenn.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.MapBrownToPenn
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.MapBrownToPenn
+
+No usage of org.apache.uima.examples.tagger.MapBrownToPenn
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Tagger.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Tagger.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/Tagger.html (revision 0)
@@ -0,0 +1,177 @@
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.Tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.Tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/BrownReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/BrownReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/BrownReader.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.BrownReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.BrownReader
+
+No usage of org.apache.uima.examples.tagger.BrownReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/HMMTagger.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/HMMTagger.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/class-use/HMMTagger.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.HMMTagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.HMMTagger
+
+No usage of org.apache.uima.examples.tagger.HMMTagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.html (revision 0)
@@ -0,0 +1,778 @@
+
+
+
+
+
+
+ModelGeneration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class ModelGeneration
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+
+All Implemented Interfaces: java.io.Serializable
+
+
+
+public class ModelGeneration extends java.lang.Object implements java.io.Serializable
+
+
+
+Trains an N-gram model for the tagger, iterating over the files from some predefined training directory.
+
+ Writes the resulting model to a binary file.
+
+ NB: At the moment, both bigram and trigram statistics are saved in one model file.
+
+
+
+
+See Also: Serialized Form
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+bigrams
+
+
+
+
+
+
+(package private) java.util.List
+corpus
+
+
+
+
+
+
+ double[]
+lambdas2
+
+
+
+
+
+
+ double[]
+lambdas3
+
+
+
+
+
+
+(package private) int
+N
+
+
+
+
+
+
+(package private) java.lang.String
+OutputFile
+
+
+
+
+
+
+(package private) static java.util.List<java.lang.String>
+posList
+
+
+
+
+
+
+(package private) static java.util.Map
+sm
+
+
+ Computes word_probs using get_lexicon(List) frequency counts for known words.
+
+
+
+(package private) static java.util.Map
+sm2
+
+
+
+
+
+
+ java.util.Map
+suffix_tree
+
+
+
+
+
+
+ java.util.Map
+suffix_tree_capitalized
+
+
+
+
+
+
+ double
+theta
+
+
+
+
+
+
+(package private) static int
+tokens_count_all_corpus
+
+
+
+
+
+
+ java.util.Map<java.lang.String,java.lang.Double>
+transition_probs
+
+
+ Map containing N-gram probabilities
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+trigrams
+
+
+
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+unigrams
+
+
+ Computes transition_probs using get_ngrams(int) frequency counts for N-grams.
+
+
+
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+word_probs
+
+
+ Map containing <word,tag> probabilities, that is, the probability of a certain word given a certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+ModelGeneration (java.util.List<Token > corpus,
+ java.lang.String OutputFile)
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+static boolean
+capitalized (java.lang.String word)
+
+
+ Checks whether the token is capitalized
+
+
+
+(package private) static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+get_lexicon (java.util.List<Token > corpus)
+
+
+ Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+
+
+
+(package private) static double
+get_max (double a,
+ double b,
+ double c)
+
+
+
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+get_ngrams (int N)
+
+
+ Computes N-gram frequencies
+
+
+
+(package private) static java.util.Map<java.lang.String,java.lang.Double>
+get_transition_probs (int N)
+
+
+
+
+
+
+(package private) static java.util.List<java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>>
+get_word_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> corpus)
+
+
+
+
+
+
+(package private) static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+logify_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> probs)
+
+
+ Converts probabilities to their logarithms; there was a reason to separate this from the get_word_probs method at the initial step
+
+
+
+static void
+main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+suffix_tree
+
+public java.util.Map suffix_tree
+
+
+
+
+
+
+
+suffix_tree_capitalized
+
+public java.util.Map suffix_tree_capitalized
+
+
+
+
+
+
+
+word_probs
+
+public java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> word_probs
+
+Map containing <word,tag> probabilities, that is, the probability of a certain word given a certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+
+transition_probs
+
+public java.util.Map<java.lang.String,java.lang.Double> transition_probs
+
+Map containing N-gram probabilities
+
+
+
+
+
+
+
+posList
+
+static java.util.List<java.lang.String> posList
+
+
+
+
+
+
+
+N
+
+int N
+
+
+
+
+
+
+
+lambdas2
+
+public double[] lambdas2
+
+
+
+
+
+
+
+lambdas3
+
+public double[] lambdas3
+
+
+
+
+
+
+
+theta
+
+public double theta
+
+
+
+
+
+
+
+OutputFile
+
+transient java.lang.String OutputFile
+
+
+
+
+
+
+
+corpus
+
+transient java.util.List corpus
+
+
+
+
+
+
+
+tokens_count_all_corpus
+
+static int tokens_count_all_corpus
+
+
+
+
+
+
+
+sm
+
+static java.util.Map sm
+
+Computes word_probs using get_lexicon(List) frequency counts for known words.
+ TODO: add smoothing for unknown words, or apply smoothing directly when an unknown word is encountered?
+
+
+
+
+
+
+
+sm2
+
+static java.util.Map sm2
+
+
+
+
+
+
+
+unigrams
+
+static java.util.Map<java.lang.String,java.lang.Double> unigrams
+
+Computes transition_probs using get_ngrams(int) frequency counts for N-grams.
+
+
+
+
+
+
+
+bigrams
+
+static java.util.Map<java.lang.String,java.lang.Double> bigrams
+
+
+
+
+
+
+
+trigrams
+
+static java.util.Map<java.lang.String,java.lang.Double> trigrams
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+ModelGeneration
+
+public ModelGeneration (java.util.List<Token > corpus,
+ java.lang.String OutputFile)
+
+
+Parameters: N - N=1, 2 or 3; InputDir - input directory name; OutputFile - output file name
+ MapBrownToPenn TagMapping
+
+
+
+
+
+
+
+
+capitalized
+
+public static boolean capitalized (java.lang.String word)
+
+Checks whether the token is capitalized
+
+
+
+
+
+
+
+
+
+
+
+get_lexicon
+
+static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> get_lexicon (java.util.List<Token > corpus)
+
+Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+
+
+
+
+
+Parameters: corpus - list containing all tokens of the training corpus, each of type Token
+Returns: map containing frequency counts for <word, its pos>
+
+
+
+
+
+get_word_probs
+
+static java.util.List<java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>> get_word_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> corpus)
+
+
+
+
+
+
+
+
+
+
+
+logify_probs
+
+static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> logify_probs (java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> probs)
+
+Converts probabilities to their logarithms; there was a reason to separate this from the get_word_probs method at the initial step
+
+
+
+
+
+
+
+
+
+
+
+get_ngrams
+
+static java.util.Map<java.lang.String,java.lang.Double> get_ngrams (int N)
+ throws java.lang.IllegalArgumentException
+
+Computes N-gram frequencies
+
+
+
+
+
+Parameters: N -
+Returns: Map N-grams of parts-of-speech, where N = 1, 2 or 3
+Throws:
+java.lang.IllegalArgumentException
+
+
+
+
+
+get_transition_probs
+
+static java.util.Map<java.lang.String,java.lang.Double> get_transition_probs (int N)
+ throws java.lang.IllegalArgumentException
+
+
+
+
+
+
+Throws:
+java.lang.IllegalArgumentException
+
+
+
+
+
+get_max
+
+static double get_max (double a,
+ double b,
+ double c)
+
+
+
+
+
+
+
+
+
+
+
+main
+
+public static void main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/Token.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/Token.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/Token.html (revision 0)
@@ -0,0 +1,311 @@
+
+
+
+
+
+
+Token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class Token
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.Token
+
+
+
+public class Token extends java.lang.Object
+
+
+
+Defines token features.
+ Helpful as an intermediate layer between a text and the tagger.
+
+
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+ java.lang.String
+pos
+
+
+
+
+
+
+ java.lang.String
+word
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+Token ()
+
+
+
+
+
+Token (java.lang.String word)
+
+
+
+
+
+Token (java.lang.String word,
+ java.lang.String pos)
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+pos
+
+public java.lang.String pos
+
+
+
+
+
+
+
+word
+
+public java.lang.String word
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+Token
+
+public Token ()
+
+
+
+
+
+Token
+
+public Token (java.lang.String word)
+
+
+
+
+
+Token
+
+public Token (java.lang.String word,
+ java.lang.String pos)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/TT_FormatReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/TT_FormatReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/TT_FormatReader.html (revision 0)
@@ -0,0 +1,258 @@
+
+
+
+
+
+
+TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class TT_FormatReader
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+
+
+All Implemented Interfaces: CorpusReader
+
+
+
+public class TT_FormatReader extends java.lang.Object implements CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TT_FormatReader
+
+public TT_FormatReader ()
+
+
+
+
+
+
+
+
+
+read_corpus
+
+public java.util.List<Token > read_corpus (java.lang.String file,
+ MappingInterface Mapping)
+
+
+Specified by: read_corpus in interface CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-frame.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-frame.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-frame.html (revision 0)
@@ -0,0 +1,53 @@
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.html (revision 0)
@@ -0,0 +1,431 @@
+
+
+
+
+
+
+SuffixTree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class SuffixTree
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+
+
+public class SuffixTree extends java.lang.Object
+
+
+
+Java implementation of Ukkonen's suffix tree, inspired by Mark Nelson's tutorial:
+ http://marknelson.us/1996/08/01/suffix-trees/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+text
+
+public java.lang.String text
+
+
+
+
+
+
+
+nodes
+
+public java.util.List<SuffixTree.Node > nodes
+
+
+
+
+
+
+
+edges
+
+public java.util.Map edges
+
+
+
+
+
+
+
+chars
+
+char[] chars
+
+
+
+
+
+
+
+active_point
+
+SuffixTree.Suffix active_point
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree
+
+public SuffixTree ()
+
+
+
+
+
+SuffixTree
+
+public SuffixTree (java.lang.String text)
+
+
+
+
+
+
+
+
+
+insert_edge
+
+public void insert_edge (SuffixTree.Edge edge)
+
+
+
+
+
+
+
+
+add_prefix
+
+public void add_prefix (int last_char,
+ SuffixTree.Suffix active_point)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-use.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-use.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-use.html (revision 0)
@@ -0,0 +1,216 @@
+
+
+
+
+
+
+Uses of Package org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Package org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-summary.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-summary.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-summary.html (revision 0)
@@ -0,0 +1,191 @@
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Package org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+Interface Summary
+
+
+CorpusReader
+Reads (annotated) text file(s) and transforms every word into a Token-object
+
+
+
+
+
+
+
+
+
+Class Summary
+
+
+BrownReader
+Reader for Brown Corpus from NLTK Distribution (nltk.sourceforge.net)
+
+
+ModelGeneration
+Trains an N-gram model for the tagger, iterating over the files from some predefined training directory.
+
+
+SuffixTree
+Java implementation of Ukkonen's suffix tree, inspired by Mark Nelson's tutorial:
+ http://marknelson.us/1996/08/01/suffix-trees/
+
+
+TaggerEvaluation
+Evaluation of Tagger
+ NB.
+
+
+Token
+Defines token features.
+
+
+TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Edge.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Edge.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Edge.html (revision 0)
@@ -0,0 +1,353 @@
+
+
+
+
+
+
+SuffixTree.Edge
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class SuffixTree.Edge
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+
+Enclosing class: SuffixTree
+
+
+
+public class SuffixTree.Edge extends java.lang.Object
+
+
+
+Internal Class EDGE
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+SuffixTree.Edge (int parent_node,
+ int end_node,
+ int first_char_index,
+ int last_char_index)
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+first_char_index
+
+public int first_char_index
+
+
+
+
+
+
+
+last_char_index
+
+public int last_char_index
+
+
+
+
+
+
+
+start_node
+
+int start_node
+
+
+
+
+
+
+
+end_node
+
+public int end_node
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree.Edge
+
+public SuffixTree.Edge (int parent_node,
+ int end_node,
+ int first_char_index,
+ int last_char_index)
+
+
+
+
+
+
+
+
+
+split_edge
+
+public int split_edge (SuffixTree.Suffix suffix)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Suffix.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Suffix.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Suffix.html (revision 0)
@@ -0,0 +1,371 @@
+
+
+
+
+
+
+SuffixTree.Suffix
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class SuffixTree.Suffix
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+
+Enclosing class: SuffixTree
+
+
+
+ class SuffixTree.Suffix extends java.lang.Object
+
+
+
+Internal Class SUFFIX
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+SuffixTree.Suffix (int node,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+(package private) void
+canonize ()
+
+
+
+
+
+
+(package private) boolean
+isExplicit ()
+
+
+
+
+
+
+(package private) boolean
+isImplicit ()
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+origin_node
+
+int origin_node
+
+
+
+
+
+
+
+first_char_index
+
+int first_char_index
+
+
+
+
+
+
+
+last_char_index
+
+int last_char_index
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree.Suffix
+
+public SuffixTree.Suffix (int node,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+isExplicit
+
+boolean isExplicit ()
+
+
+
+
+
+
+
+
+isImplicit
+
+boolean isImplicit ()
+
+
+
+
+
+
+
+
+canonize
+
+void canonize ()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-tree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-tree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/package-tree.html (revision 0)
@@ -0,0 +1,158 @@
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest Class Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hierarchy For Package org.apache.uima.examples.tagger.trainAndTest
+
+
+
+Package Hierarchies: All Packages
+
+
+Class Hierarchy
+
+
+
+Interface Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/ModelGeneration.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/ModelGeneration.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/ModelGeneration.html (revision 0)
@@ -0,0 +1,225 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.ModelGeneration
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/CorpusReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/CorpusReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/CorpusReader.html (revision 0)
@@ -0,0 +1,185 @@
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.trainAndTest.CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Interface org.apache.uima.examples.tagger.trainAndTest.CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/Token.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/Token.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/Token.html (revision 0)
@@ -0,0 +1,227 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.Token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.Token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method parameters in org.apache.uima.examples.tagger.trainAndTest with type arguments of type Token
+
+
+
+(package private) static java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>>
+ModelGeneration. get_lexicon (java.util.List<Token > corpus)
+
+
+ Reads sentences, extracts <word, possible parts-of-speech> frequency patterns
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/TaggerEvaluation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/TaggerEvaluation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/TaggerEvaluation.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+No usage of org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/TT_FormatReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/TT_FormatReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/TT_FormatReader.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+
+No usage of org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+No usage of org.apache.uima.examples.tagger.trainAndTest.SuffixTree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Edge.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Edge.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Edge.html (revision 0)
@@ -0,0 +1,177 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Suffix.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Suffix.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Suffix.html (revision 0)
@@ -0,0 +1,202 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Node.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Node.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/SuffixTree.Node.html (revision 0)
@@ -0,0 +1,177 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Node
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Node
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/BrownReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/BrownReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/class-use/BrownReader.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.BrownReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class org.apache.uima.examples.tagger.trainAndTest.BrownReader
+
+No usage of org.apache.uima.examples.tagger.trainAndTest.BrownReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/CorpusReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/CorpusReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/CorpusReader.html (revision 0)
@@ -0,0 +1,213 @@
+
+
+
+
+
+
+CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Interface CorpusReader
+
+All Known Implementing Classes: BrownReader , TT_FormatReader
+
+
+
+public interface CorpusReader
+
+
+
+Reads (annotated) text file(s) and transforms every word into a Token-object
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+read_corpus
+
+java.util.List<Token > read_corpus (java.lang.String file,
+ MappingInterface mapping)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/TaggerEvaluation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/TaggerEvaluation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/TaggerEvaluation.html (revision 0)
@@ -0,0 +1,281 @@
+
+
+
+
+
+
+TaggerEvaluation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class TaggerEvaluation
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation
+
+
+
+public class TaggerEvaluation extends java.lang.Object
+
+
+
+Evaluation of Tagger
+ NB. As it is implemented at the moment, to be used just for small tests with small files ..
+ (very naive and takes quite a long time with big files..)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+(package private) static void
+get_eval (ModelGeneration my_model,
+ java.util.List<java.lang.String> wordList,
+ java.util.List<java.lang.String> posList,
+ java.util.List<java.lang.String> TagList)
+
+
+
+
+
+
+static void
+main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TaggerEvaluation
+
+public TaggerEvaluation ()
+
+
+
+
+
+
+
+
+
+get_eval
+
+static void get_eval (ModelGeneration my_model,
+ java.util.List<java.lang.String> wordList,
+ java.util.List<java.lang.String> posList,
+ java.util.List<java.lang.String> TagList)
+
+
+
+
+
+
+
+
+main
+
+public static void main (java.lang.String[] args)
+
+
+Parameters: args -
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Node.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Node.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.Node.html (revision 0)
@@ -0,0 +1,265 @@
+
+
+
+
+
+
+SuffixTree.Node
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class SuffixTree.Node
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Node
+
+
+Enclosing class: SuffixTree
+
+
+
+public class SuffixTree.Node extends java.lang.Object
+
+
+
+Internal Class NODE
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+suffix_node
+
+public int suffix_node
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree.Node
+
+public SuffixTree.Node ()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/BrownReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/BrownReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/trainAndTest/BrownReader.html (revision 0)
@@ -0,0 +1,267 @@
+
+
+
+
+
+
+BrownReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger.trainAndTest
+
+Class BrownReader
+
+java.lang.Object
+ org.apache.uima.examples.tagger.trainAndTest.BrownReader
+
+
+All Implemented Interfaces: CorpusReader
+
+
+
+public class BrownReader extends java.lang.Object implements CorpusReader
+
+
+
+Reader for Brown Corpus from NLTK Distribution (nltk.sourceforge.net)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ java.util.List<Token >
+read_corpus (java.lang.String directory,
+ MappingInterface Mapping)
+
+
+ Reads Brown Corpus from NLTK Distribution Format.
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+BrownReader
+
+public BrownReader ()
+
+
+
+
+
+
+
+
+
+read_corpus
+
+public java.util.List<Token > read_corpus (java.lang.String directory,
+ MappingInterface Mapping)
+
+Reads Brown Corpus from NLTK Distribution Format. Iterates over all files in the directory,
+ which are in a sentence per line format, and returns all tokens in the collection in a List of
+ Tokens.
+
+
+Specified by: read_corpus in interface CorpusReader
+
+
+Parameters: directory - an array of file names
+Returns: a list of tokens from all files
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/CorpusReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/CorpusReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/CorpusReader.html (revision 0)
@@ -0,0 +1,228 @@
+
+
+
+
+
+
+CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Interface CorpusReader
+
+All Known Implementing Classes: BrownReader , TT_FormatReader
+
+
+
+public interface CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+read_corpus
+
+java.util.List<Token > read_corpus (java.lang.String file)
+
+
+
+
+
+
+
+
+read_corpus
+
+java.util.List<Token > read_corpus (java.lang.String file,
+ MappingInterface mapping)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TaggerEvaluation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TaggerEvaluation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TaggerEvaluation.html (revision 0)
@@ -0,0 +1,281 @@
+
+
+
+
+
+
+TaggerEvaluation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class TaggerEvaluation
+
+java.lang.Object
+ org.apache.uima.examples.tagger.TaggerEvaluation
+
+
+
+public class TaggerEvaluation extends java.lang.Object
+
+
+
+Evaluation of Tagger
+ NB. As it is implemented at the moment, to be used just for small tests with small files ..
+ (very naive and takes quite a long time with big files..)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+(package private) static void
+get_eval (ModelGeneration my_model,
+ java.util.List<java.lang.String> wordList,
+ java.util.List<java.lang.String> posList,
+ java.util.List<java.lang.String> TagList)
+
+
+
+
+
+
+static void
+main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TaggerEvaluation
+
+public TaggerEvaluation ()
+
+
+
+
+
+
+
+
+
+get_eval
+
+static void get_eval (ModelGeneration my_model,
+ java.util.List<java.lang.String> wordList,
+ java.util.List<java.lang.String> posList,
+ java.util.List<java.lang.String> TagList)
+
+
+
+
+
+
+
+
+main
+
+public static void main (java.lang.String[] args)
+
+
+Parameters: args -
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TestExperiments.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TestExperiments.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/TestExperiments.html (revision 0)
@@ -0,0 +1,389 @@
+
+
+
+
+
+
+TestExperiments
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class TestExperiments
+
+java.lang.Object
+ org.apache.uima.examples.tagger.TestExperiments
+
+
+
+public class TestExperiments extends java.lang.Object
+
+
+
+
+Author:
+ Eugenie Giesbrecht
+ Evaluation of Tagger
+ NB. As it is implemented at the moment, to be used just for small tests with small files ..
+ (very naive and takes quite a long time with big files..)
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+(package private) boolean
+DO_MAPPING
+
+
+
+
+
+
+(package private) java.lang.String
+END_OF_SENT_TAG
+
+
+
+
+
+
+(package private) MappingInterface
+MAPPING
+
+
+
+
+
+
+(package private) java.lang.String
+MODEL
+
+
+
+
+
+
+(package private) ModelGeneration
+my_model
+
+
+ Model used for current tagging
+
+
+
+(package private) int
+N
+
+
+ for a bigram model: N = 2, for a trigram model N=3
+ N is defined in parameter file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+static void
+main (java.lang.String[] args)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+MODEL
+
+java.lang.String MODEL
+
+
+
+
+
+
+
+N
+
+int N
+
+for a bigram model: N = 2, for a trigram model N=3
+ N is defined in parameter file
+
+
+
+
+
+
+
+END_OF_SENT_TAG
+
+java.lang.String END_OF_SENT_TAG
+
+
+
+
+
+
+
+MAPPING
+
+MappingInterface MAPPING
+
+
+
+
+
+
+
+DO_MAPPING
+
+boolean DO_MAPPING
+
+
+
+
+
+
+
+my_model
+
+ModelGeneration my_model
+
+Model used for current tagging
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TestExperiments
+
+public TestExperiments ()
+
+
+
+
+
+
+
+
+
+main
+
+public static void main (java.lang.String[] args)
+
+
+Parameters: args -
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Viterbi.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Viterbi.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Viterbi.html (revision 0)
@@ -0,0 +1,295 @@
+
+
+
+
+
+
+Viterbi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class Viterbi
+
+java.lang.Object
+ org.apache.uima.examples.tagger.Viterbi
+
+
+
+public class Viterbi extends java.lang.Object
+
+
+
+Viterbi Algorithm: Given a model and a sequence of observations, what is the most likely
+ sequence of states in the model that produces the observations?
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+Viterbi ()
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+static java.util.Map<java.lang.String,java.util.List>
+init_probs (java.lang.String END_OF_SENT_TAG,
+ java.util.Map<java.lang.String,java.lang.Double> pos_s)
+
+
+
+
+
+
+static java.util.List
+process (int N,
+ java.util.List<java.lang.String> sentence,
+ java.lang.String END_OF_SENT_TAG,
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> suffix_tree,
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> suffix_tree_cap,
+ java.util.Map<java.lang.String,java.lang.Double> transition_probs,
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> word_probs,
+ double[] lambdas2,
+ double[] lambdas3,
+ double theta)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+Viterbi
+
+public Viterbi ()
+
+
+
+
+
+
+
+
+
+init_probs
+
+public static java.util.Map<java.lang.String,java.util.List> init_probs (java.lang.String END_OF_SENT_TAG,
+ java.util.Map<java.lang.String,java.lang.Double> pos_s)
+
+
+
+
+
+
+
+
+process
+
+public static java.util.List process (int N,
+ java.util.List<java.lang.String> sentence,
+ java.lang.String END_OF_SENT_TAG,
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> suffix_tree,
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> suffix_tree_cap,
+ java.util.Map<java.lang.String,java.lang.Double> transition_probs,
+ java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.lang.Double>> word_probs,
+ double[] lambdas2,
+ double[] lambdas3,
+ double theta)
+
+
+Parameters: N - sentence - END_OF_SENT_TAG - pos_s - transition_probs - word_probs - lambdas2, - lambdas3
+Returns:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/GrobMapping.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/GrobMapping.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/GrobMapping.html (revision 0)
@@ -0,0 +1,262 @@
+
+
+
+
+
+
+GrobMapping
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class GrobMapping
+
+java.lang.Object
+ org.apache.uima.examples.tagger.GrobMapping
+
+
+All Implemented Interfaces: MappingInterface
+
+
+
+public class GrobMapping extends java.lang.Object implements MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ java.util.List
+map_tags (java.util.List tokens)
+
+
+ Defines mapping for List<Token >
+ E.g. if we need to map tags, given a list of Tokens, we need to map the
+ pos field of every Token to a different pos.
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+GrobMapping
+
+public GrobMapping ()
+
+
+
+
+
+
+
+
+
+map_tags
+
+public java.util.List map_tags (java.util.List tokens)
+
+Defines mapping for List<Token >
+ E.g. if we need to map tags, given a list of Tokens, we need to map the
+ pos field of every Token to a different pos.
+
+
+Specified by: map_tags in interface MappingInterface
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Node.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Node.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/SuffixTree.Node.html (revision 0)
@@ -0,0 +1,265 @@
+
+
+
+
+
+
+SuffixTree.Node
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class SuffixTree.Node
+
+java.lang.Object
+ org.apache.uima.examples.tagger.SuffixTree.Node
+
+
+Enclosing class: SuffixTree
+
+
+
+public class SuffixTree.Node extends java.lang.Object
+
+
+
+Internal Class NODE
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+suffix_node
+
+public int suffix_node
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+SuffixTree.Node
+
+public SuffixTree.Node ()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Tagger.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Tagger.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/Tagger.html (revision 0)
@@ -0,0 +1,245 @@
+
+
+
+
+
+
+Tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Interface Tagger
+
+All Known Implementing Classes: HMMTagger
+
+
+
+public interface Tagger
+
+
+
+General tagger interface in case one would want to define further types of taggers.
+
+ Known implementations: HMMTagger using Viterbi algorithm
+ to compute the most probable path of parts of speech for a given sequence of tokens
+
+
+
+
+See Also: Viterbi, HMMTagger
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ void
+initialize (org.apache.uima.UimaContext aContext)
+
+
+ Instantiates MODEL for current tagger
+
+
+
+ void
+process (org.apache.uima.jcas.JCas aJCas)
+
+
+ Trains a new model for tagger, if a training is defined in tagger.properties file
+
+
+
+
+
+
+
+
+
+
+
+initialize
+
+void initialize (org.apache.uima.UimaContext aContext)
+ throws org.apache.uima.resource.ResourceInitializationException
+
+Instantiates MODEL for current tagger
+
+
+
+Throws:
+org.apache.uima.resource.ResourceInitializationException
+
+
+
+
+
+process
+
+void process (org.apache.uima.jcas.JCas aJCas)
+ throws org.apache.uima.analysis_engine.AnalysisEngineProcessException
+
+Trains a new model for tagger, if a training is defined in tagger.properties file
+
+
+
+Throws:
+org.apache.uima.analysis_engine.AnalysisEngineProcessException See Also: ModelGenerator
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/MapBrownToPenn.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/MapBrownToPenn.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/MapBrownToPenn.html (revision 0)
@@ -0,0 +1,254 @@
+
+
+
+
+
+
+MapBrownToPenn
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class MapBrownToPenn
+
+java.lang.Object
+ org.apache.uima.examples.tagger.MapBrownToPenn
+
+
+
+public class MapBrownToPenn extends java.lang.Object
+
+
+
+just a "game"-mapping at the moment..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+static java.util.List<Token >
+map_tags (java.util.List<Token > tokens)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+MapBrownToPenn
+
+public MapBrownToPenn ()
+
+
+
+
+
+
+
+
+
+map_tags
+
+public static java.util.List<Token > map_tags (java.util.List<Token > tokens)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/BrownReader.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/BrownReader.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/BrownReader.html (revision 0)
@@ -0,0 +1,341 @@
+
+
+
+
+
+
+BrownReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class BrownReader
+
+java.lang.Object
+ org.apache.uima.examples.tagger.BrownReader
+
+
+All Implemented Interfaces: CorpusReader
+
+
+
+public class BrownReader extends java.lang.Object implements CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+(package private) java.util.List<java.lang.String>
+all_words
+
+
+ Reads file names from Directory
+
+
+
+(package private) java.util.List<Token >
+corpus
+
+
+
+
+
+
+(package private) java.lang.String
+InputDir
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ java.util.List<Token >
+read_corpus (java.lang.String directory)
+
+
+
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+
+
+corpus
+
+java.util.List<Token > corpus
+
+
+
+
+
+
+
+InputDir
+
+java.lang.String InputDir
+
+
+
+
+
+
+
+all_words
+
+java.util.List<java.lang.String> all_words
+
+Reads file names from Directory
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+BrownReader
+
+public BrownReader (java.lang.String InputDir,
+ MappingInterface Mapping)
+
+
+
+
+
+
+
+
+
+read_corpus
+
+public java.util.List<Token > read_corpus (java.lang.String directory)
+
+
+Specified by: read_corpus in interface CorpusReader
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/HMMTagger.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/HMMTagger.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/examples/tagger/HMMTagger.html (revision 0)
@@ -0,0 +1,464 @@
+
+
+
+
+
+
+HMMTagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima.examples.tagger
+
+Class HMMTagger
+
+java.lang.Object
+ org.apache.uima.analysis_component.AnalysisComponent_ImplBase
+ org.apache.uima.analysis_component.Annotator_ImplBase
+ org.apache.uima.analysis_component.JCasAnnotator_ImplBase
+ org.apache.uima.examples.tagger.HMMTagger
+
+
+All Implemented Interfaces: org.apache.uima.analysis_component.AnalysisComponent, Tagger
+
+
+
+public class HMMTagger extends org.apache.uima.analysis_component.JCasAnnotator_ImplBase implements Tagger
+
+
+
+UIMA Analysis Engine that invokes HMM POS tagger. HMM POS tagger generates part-of-speech tags
+ for every token. This annotator assumes that sentences and tokens have already been annotated in the CAS
+ with Sentence and Token annotations, respectively. We iterate over sentences, then iterate over
+ tokens in the current sentence to accumulate a list of words, then invoke the HMM POS tagger on
+ the list of words. For each Token we then update the posTag field with the POS tag. The model
+ file for the HMM POS tagger is specified as a parameter (MODEL_FILE_PARAM).
+
+
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+(package private) boolean
+DO_MAPPING
+
+
+
+
+
+
+(package private) MappingInterface
+MAPPING
+
+
+
+
+
+
+(package private) java.lang.String
+MODEL
+
+
+ Model file name
+
+
+
+ ModelGeneration
+my_model
+
+
+
+
+
+
+ int
+N
+
+
+ For a bigram model N = 2, for a trigram model N = 3. N is defined in the parameter file.
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+HMMTagger ()
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+static ModelGeneration
+get_model (java.lang.String filename)
+
+
+ Reads a saved MODEL object from a file
+
+
+
+ void
+initialize (org.apache.uima.UimaContext aContext)
+
+
+ Initialize the Annotator.
+
+
+
+ void
+process (org.apache.uima.jcas.JCas aJCas)
+
+
+ Process a CAS.
+
+
+
+
+
+Methods inherited from class org.apache.uima.analysis_component.JCasAnnotator_ImplBase
+
+
+getRequiredCasInterface, process
+
+
+
+
+
+Methods inherited from class org.apache.uima.analysis_component.Annotator_ImplBase
+
+
+getCasInstancesRequired, hasNext, next
+
+
+
+
+
+Methods inherited from class org.apache.uima.analysis_component.AnalysisComponent_ImplBase
+
+
+batchProcessComplete, collectionProcessComplete, destroy, getContext, getResultSpecification, reconfigure, setResultSpecification
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+MODEL
+
+java.lang.String MODEL
+
+Model file name
+
+
+
+
+
+
+
+N
+
+public int N
+
+For a bigram model N = 2, for a trigram model N = 3. N is defined in the parameter file.
+
+
+
+
+
+
+
+my_model
+
+public ModelGeneration my_model
+
+
+
+
+
+
+
+MAPPING
+
+MappingInterface MAPPING
+
+
+
+
+
+
+
+DO_MAPPING
+
+boolean DO_MAPPING
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+HMMTagger
+
+public HMMTagger ()
+
+
+
+
+
+
+
+
+
+initialize
+
+public void initialize (org.apache.uima.UimaContext aContext)
+ throws org.apache.uima.resource.ResourceInitializationException
+
+Initialize the Annotator.
+
+
+Specified by: initialize in interface org.apache.uima.analysis_component.AnalysisComponent Specified by: initialize in interface Tagger Overrides: initialize in class org.apache.uima.analysis_component.AnalysisComponent_ImplBase
+
+
+
+Throws:
+org.apache.uima.resource.ResourceInitializationException See Also: AnalysisComponent_ImplBase.initialize(UimaContext)
+
+
+
+
+
+get_model
+
+public static ModelGeneration get_model (java.lang.String filename)
+
+Reads a saved MODEL object from a file
+
+
+
+
+
+Parameters: filename - model file
+Returns: ModelGeneration
+
+
+
+
+
+process
+
+public void process (org.apache.uima.jcas.JCas aJCas)
+ throws org.apache.uima.analysis_engine.AnalysisEngineProcessException
+
+Process a CAS.
+
+
+Specified by: process in interface Tagger Specified by: process in class org.apache.uima.analysis_component.JCasAnnotator_ImplBase
+
+
+
+Throws:
+org.apache.uima.analysis_engine.AnalysisEngineProcessException See Also: JCasAnnotator_ImplBase.process(JCas)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/TokenAnnotation.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/TokenAnnotation.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/org/apache/uima/TokenAnnotation.html (revision 0)
@@ -0,0 +1,526 @@
+
+
+
+
+
+
+TokenAnnotation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+org.apache.uima
+
+Class TokenAnnotation
+
+java.lang.Object
+ org.apache.uima.cas.impl.FeatureStructureImpl
+ org.apache.uima.jcas.cas.TOP
+ org.apache.uima.jcas.cas.AnnotationBase
+ org.apache.uima.jcas.tcas.Annotation
+ org.apache.uima.TokenAnnotation
+
+
+All Implemented Interfaces: java.lang.Cloneable, org.apache.uima.cas.AnnotationBaseFS, org.apache.uima.cas.FeatureStructure, org.apache.uima.cas.text.AnnotationFS
+
+
+
+public class TokenAnnotation extends org.apache.uima.jcas.tcas.Annotation
+
+
+
+Single token annotation. Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007. XML source:
+ C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
+
+
+
+
+
+
+
+
+
+
+
+
+Field Summary
+
+
+
+static int
+type
+
+
+
+
+
+
+static int
+typeIndexID
+
+
+
+
+
+
+
+
+Fields inherited from class org.apache.uima.jcas.cas.TOP
+
+
+addr, jcasType
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+
+protected
+TokenAnnotation ()
+
+
+ Never called.
+
+
+
+
+TokenAnnotation (int addr,
+ org.apache.uima.jcas.cas.TOP_Type type)
+
+
+ Internal - constructor used by generator
+
+
+
+
+TokenAnnotation (org.apache.uima.jcas.JCas jcas)
+
+
+
+
+
+
+
+TokenAnnotation (org.apache.uima.jcas.JCas jcas,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+ java.lang.String
+getPosTag ()
+
+
+ getter for posTag - gets the part-of-speech tag of the corresponding token
+
+
+
+ java.lang.String
+getTokenType ()
+
+
+ getter for tokenType - gets token type
+
+
+
+ int
+getTypeIndexID ()
+
+
+
+
+
+
+ void
+setPosTag (java.lang.String v)
+
+
+ setter for posTag - sets the part-of-speech tag of the corresponding token
+
+
+
+ void
+setTokenType (java.lang.String v)
+
+
+ setter for tokenType - sets token type
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.tcas.Annotation
+
+
+getBegin, getCoveredText, getEnd, getStart, setBegin, setEnd
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.AnnotationBase
+
+
+getSofa, getView
+
+
+
+
+
+Methods inherited from class org.apache.uima.jcas.cas.TOP
+
+
+addToIndexes, addToIndexes, equals, getAddress, getCAS, getCASImpl, getLowLevelCas, hashCode, removeFromIndexes, removeFromIndexes
+
+
+
+
+
+Methods inherited from class org.apache.uima.cas.impl.FeatureStructureImpl
+
+
+clone, getBooleanValue, getByteValue, getDoubleValue, getFeatureValue, getFeatureValueAsString, getFloatValue, getIntValue, getLongValue, getShortValue, getStringValue, getType, prettyPrint, prettyPrint, prettyPrint, setBooleanValue, setByteValue, setDoubleValue, setFeatureValue, setFeatureValueFromString, setFloatValue, setIntValue, setLongValue, setShortValue, setStringValue, toString, toString
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+finalize, getClass, notify, notifyAll, wait, wait, wait
+
+
+
+
+
+Methods inherited from interface org.apache.uima.cas.AnnotationBaseFS
+
+
+getView
+
+
+
+
+
+Methods inherited from interface org.apache.uima.cas.FeatureStructure
+
+
+clone, equals, getBooleanValue, getByteValue, getCAS, getDoubleValue, getFeatureValue, getFeatureValueAsString, getFloatValue, getIntValue, getLongValue, getShortValue, getStringValue, getType, hashCode, setBooleanValue, setByteValue, setDoubleValue, setFeatureValue, setFeatureValueFromString, setFloatValue, setIntValue, setLongValue, setShortValue, setStringValue
+
+
+
+
+
+
+
+
+
+
+
+typeIndexID
+
+public static final int typeIndexID
+
+
+
+
+
+
+
+type
+
+public static final int type
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TokenAnnotation
+
+protected TokenAnnotation ()
+
+Never called. Disable default constructor
+
+
+
+
+
+TokenAnnotation
+
+public TokenAnnotation (int addr,
+ org.apache.uima.jcas.cas.TOP_Type type)
+
+Internal - constructor used by generator
+
+
+
+
+
+TokenAnnotation
+
+public TokenAnnotation (org.apache.uima.jcas.JCas jcas)
+
+
+
+
+
+TokenAnnotation
+
+public TokenAnnotation (org.apache.uima.jcas.JCas jcas,
+ int begin,
+ int end)
+
+
+
+
+
+
+
+
+
+getTypeIndexID
+
+public int getTypeIndexID ()
+
+
+Overrides: getTypeIndexID in class org.apache.uima.jcas.tcas.Annotation
+
+
+
+
+
+
+
+
+getTokenType
+
+public java.lang.String getTokenType ()
+
+getter for tokenType - gets token type
+
+
+
+
+
+
+
+
+setTokenType
+
+public void setTokenType (java.lang.String v)
+
+setter for tokenType - sets token type
+
+
+
+
+
+
+
+
+getPosTag
+
+public java.lang.String getPosTag ()
+
+getter for posTag - gets the part-of-speech tag of the corresponding token
+
+
+
+
+
+
+
+
+setPosTag
+
+public void setPosTag (java.lang.String v)
+
+setter for posTag - sets the part-of-speech tag of the corresponding token
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/constant-values.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/constant-values.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/constant-values.html (revision 0)
@@ -0,0 +1,142 @@
+
+
+
+
+
+
+Constant Field Values
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constant Field Values
+
+
+Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/help-doc.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/help-doc.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/help-doc.html (revision 0)
@@ -0,0 +1,220 @@
+
+
+
+
+
+
+API Help
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+How This API Document Is Organized
+
+This API (Application Programming Interface) document has pages corresponding to the items in the navigation bar, described as follows.
+Overview
+
+
+
+The Overview page is the front page of this API document and provides a list of all packages with a summary for each. This page can also contain an overall description of the set of packages.
+
+Package
+
+
+
+Each package has a page that contains a list of its classes and interfaces, with a summary for each. This page can contain six categories:
+Interfaces (italic) Classes Enums Exceptions Errors Annotation Types
+
+
+Class/Interface
+
+
+
+Each class, interface, nested class and nested interface has its own separate page. Each of these pages has three sections consisting of a class/interface description, summary tables, and detailed member descriptions:
+Class inheritance diagram Direct Subclasses All Known Subinterfaces All Known Implementing Classes Class/interface declaration Class/interface description
+
+
+Nested Class Summary Field Summary Constructor Summary Method Summary
+
+
+Field Detail Constructor Detail Method Detail
+Each summary entry contains the first sentence from the detailed description for that item. The summary entries are alphabetical, while the detailed descriptions are in the order they appear in the source code. This preserves the logical groupings established by the programmer.
+
+
+Annotation Type
+
+
+
+Each annotation type has its own separate page with the following sections:
+Annotation Type declaration Annotation Type description Required Element Summary Optional Element Summary Element Detail
+
+
+
+Enum
+
+
+
+Each enum has its own separate page with the following sections:
+Enum declaration Enum description Enum Constant Summary Enum Constant Detail
+
+
+Use
+
+Each documented package, class and interface has its own Use page. This page describes what packages, classes, methods, constructors and fields use any part of the given class or package. Given a class or interface A, its Use page includes subclasses of A, fields declared as A, methods that return A, and methods and constructors with parameters of type A. You can access this page by first going to the package, class or interface, then clicking on the "Use" link in the navigation bar.
+
+Tree (Class Hierarchy)
+
+There is a Class Hierarchy page for all packages, plus a hierarchy for each package. Each hierarchy page contains a list of classes and a list of interfaces. The classes are organized by inheritance structure starting with java.lang.Object. The interfaces do not inherit from java.lang.Object.
+When viewing the Overview page, clicking on "Tree" displays the hierarchy for all packages. When viewing a particular package, class or interface page, clicking "Tree" displays the hierarchy for only that package.
+
+
+Deprecated API
+
+The Deprecated API page lists all of the API that have been deprecated. A deprecated API is not recommended for use, generally due to improvements, and a replacement API is usually given. Deprecated APIs may be removed in future implementations.
+
+Index
+
+The Index contains an alphabetic list of all classes, interfaces, constructors, methods, and fields.
+
+Prev/Next
+These links take you to the next or previous class, interface, package, or related page.
+Frames/No Frames
+These links show and hide the HTML frames. All pages are available with or without frames.
+
+
+Serialized Form
+Each serializable or externalizable class has a description of its serialization fields and methods. This information is of interest to re-implementors, not to developers using the API. While there is no link in the navigation bar, you can get to this information by going to any serialized class and clicking "Serialized Form" in the "See also" section of the class description.
+
+
+Constant Field Values
+The Constant Field Values page lists the static final fields and their values.
+
+
+
+This help file applies to API documentation generated using the standard doclet.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/serialized-form.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/serialized-form.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/serialized-form.html (revision 0)
@@ -0,0 +1,437 @@
+
+
+
+
+
+
+Serialized Form
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Serialized Form
+
+
+
+
+
+
+Package org.apache.uima
+
+
+
+
+
+
+
+
+Class org.apache.uima.InternationalizedException extends java.lang.Exception implements Serializable
+
+
+
+
+serialVersionUID: 2306587442280738385L
+
+
+
+
+
+
+Serialized Fields
+
+
+
+
+mResourceBundleName
+
+java.lang.String mResourceBundleName
+
+
+
+
+
+
+mMessageKey
+
+java.lang.String mMessageKey
+
+
+
+
+
+
+mArguments
+
+java.lang.Object[] mArguments
+
+
+
+
+
+
+mCause
+
+java.lang.Throwable mCause
+
+
+
+
+
+
+
+
+
+
+Class org.apache.uima.InternationalizedRuntimeException extends java.lang.RuntimeException implements Serializable
+
+
+
+
+serialVersionUID: 6387360855459370559L
+
+
+
+
+
+
+Serialized Fields
+
+
+
+
+mResourceBundleName
+
+java.lang.String mResourceBundleName
+
+
+
+
+
+
+mMessageKey
+
+java.lang.String mMessageKey
+
+
+
+
+
+
+mArguments
+
+java.lang.Object[] mArguments
+
+
+
+
+
+
+mCause
+
+java.lang.Throwable mCause
+
+
+
+
+
+
+
+
+
+
+Class org.apache.uima.UIMA_IllegalArgumentException extends org.apache.uima.UIMARuntimeException implements Serializable
+
+
+
+
+serialVersionUID: -4820565402946868828L
+
+
+
+
+
+
+
+
+Class org.apache.uima.UIMA_IllegalStateException extends org.apache.uima.UIMARuntimeException implements Serializable
+
+
+
+
+serialVersionUID: -8081807814100358556L
+
+
+
+
+
+
+
+
+Class org.apache.uima.UIMA_UnsupportedOperationException extends org.apache.uima.UIMARuntimeException implements Serializable
+
+
+
+
+serialVersionUID: 9056907160021698405L
+
+
+
+
+
+
+
+
+Class org.apache.uima.UIMAException extends org.apache.uima.InternationalizedException implements Serializable
+
+
+
+
+serialVersionUID: 7521732353239537026L
+
+
+
+
+
+
+
+
+Class org.apache.uima.UIMARuntimeException extends org.apache.uima.InternationalizedRuntimeException implements Serializable
+
+
+
+
+serialVersionUID: 6738051692628592989L
+
+
+
+
+
+
+
+Package org.apache.uima.examples.tagger.trainAndTest
+
+
+
+
+
+
+
+
+serialVersionUID: 1L
+
+
+
+
+
+
+Serialized Fields
+
+
+
+
+suffix_tree
+
+java.util.Map<K,V> suffix_tree
+
+
+
+
+
+
+suffix_tree_capitalized
+
+java.util.Map<K,V> suffix_tree_capitalized
+
+
+
+
+
+
+word_probs
+
+java.util.Map<K,V> word_probs
+
+Map containing <word,tag> probabilities, that is, the probability of a certain word given a certain tag at time t: P(word_t | tag_t)
+
+
+
+
+
+
+transition_probs
+
+java.util.Map<K,V> transition_probs
+
+Map containing N-gram probabilities
+
+
+
+
+
+
+N
+
+int N
+
+
+
+
+
+
+lambdas2
+
+double[] lambdas2
+
+
+
+
+
+
+lambdas3
+
+double[] lambdas3
+
+
+
+
+
+
+theta
+
+double theta
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-frame.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-frame.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-frame.html (revision 0)
@@ -0,0 +1,48 @@
+
+
+
+
+
+
+Overview
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/class-use/TestCase1.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/class-use/TestCase1.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/class-use/TestCase1.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class unittests.TestCase1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class unittests.TestCase1
+
+No usage of unittests.TestCase1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/class-use/TaggerTest.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/class-use/TaggerTest.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/class-use/TaggerTest.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Class unittests.TaggerTest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Class unittests.TaggerTest
+
+No usage of unittests.TaggerTest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/TestCase1.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/TestCase1.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/TestCase1.html (revision 0)
@@ -0,0 +1,320 @@
+
+
+
+
+
+
+TestCase1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+unittests
+
+Class TestCase1
+
+java.lang.Object
+ junit.framework.Assert
+ junit.framework.TestCase
+ unittests.TestCase1
+
+
+All Implemented Interfaces: junit.framework.Test
+
+
+
+public class TestCase1 extends junit.framework.TestCase
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+TestCase1 ()
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+protected void
+setUp ()
+
+
+ Set up the test fixture
+
+
+
+ void
+testEnglishTagger ()
+
+
+ Tests English trigram tagger
+
+
+
+ void
+testGermanTagger ()
+
+
+ Tests tagging for German.
+
+
+
+
+
+Methods inherited from class junit.framework.TestCase
+
+
+countTestCases, createResult, getName, run, run, runBare, runTest, setName, tearDown, toString
+
+
+
+
+
+Methods inherited from class junit.framework.Assert
+
+
+assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertFalse, assertFalse, assertNotNull, assertNotNull, assertNotSame, assertNotSame, assertNull, assertNull, assertSame, assertSame, assertTrue, assertTrue, fail, fail
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TestCase1
+
+public TestCase1 ()
+
+
+
+
+
+
+
+
+
+setUp
+
+protected void setUp ()
+
+Set up the test fixture
+
+
+Overrides: setUp in class junit.framework.TestCase
+
+
+
+
+
+
+
+
+testGermanTagger
+
+public void testGermanTagger ()
+
+Tests tagging for German.
+
+
+
+
+
+
+
+
+testEnglishTagger
+
+public void testEnglishTagger ()
+
+Tests English trigram tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-frame.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-frame.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-frame.html (revision 0)
@@ -0,0 +1,32 @@
+
+
+
+
+
+
+unittests
+
+
+
+
+
+
+
+
+
+
+
+unittests
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-use.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-use.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-use.html (revision 0)
@@ -0,0 +1,141 @@
+
+
+
+
+
+
+Uses of Package unittests
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Uses of Package unittests
+
+No usage of unittests
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-summary.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-summary.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-summary.html (revision 0)
@@ -0,0 +1,155 @@
+
+
+
+
+
+
+unittests
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Package unittests
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-tree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-tree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/package-tree.html (revision 0)
@@ -0,0 +1,155 @@
+
+
+
+
+
+
+unittests Class Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hierarchy For Package unittests
+
+
+
+Package Hierarchies: All Packages
+
+
+Class Hierarchy
+
+
+java.lang.Object
+junit.framework.Assert
+junit.framework.TestCase (implements junit.framework.Test)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/TaggerTest.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/TaggerTest.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/unittests/TaggerTest.html (revision 0)
@@ -0,0 +1,320 @@
+
+
+
+
+
+
+TaggerTest
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+unittests
+
+Class TaggerTest
+
+java.lang.Object
+ junit.framework.Assert
+ junit.framework.TestCase
+ unittests.TaggerTest
+
+
+All Implemented Interfaces: junit.framework.Test
+
+
+
+public class TaggerTest extends junit.framework.TestCase
+
+
+
+
+
+
+
+
+
+
+
+
+
+Constructor Summary
+
+
+TaggerTest ()
+
+
+
+
+
+
+
+
+
+
+
+
+Method Summary
+
+
+
+protected void
+setUp ()
+
+
+ Set up the test fixture
+
+
+
+ void
+testEnglishTagger ()
+
+
+ Tests English trigram tagger
+
+
+
+ void
+testGermanTagger ()
+
+
+ Tests tagging for German.
+
+
+
+
+
+Methods inherited from class junit.framework.TestCase
+
+
+countTestCases, createResult, getName, run, run, runBare, runTest, setName, tearDown, toString
+
+
+
+
+
+Methods inherited from class junit.framework.Assert
+
+
+assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertEquals, assertFalse, assertFalse, assertNotNull, assertNotNull, assertNotSame, assertNotSame, assertNull, assertNull, assertSame, assertSame, assertTrue, assertTrue, fail, fail
+
+
+
+
+
+Methods inherited from class java.lang.Object
+
+
+clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
+
+
+
+
+
+
+
+
+
+
+
+Constructor Detail
+
+
+
+
+TaggerTest
+
+public TaggerTest ()
+
+
+
+
+
+
+
+
+
+setUp
+
+protected void setUp ()
+
+Set up the test fixture
+
+
+Overrides: setUp in class junit.framework.TestCase
+
+
+
+
+
+
+
+
+testGermanTagger
+
+public void testGermanTagger ()
+
+Tests tagging for German.
+
+
+
+
+
+
+
+
+testEnglishTagger
+
+public void testEnglishTagger ()
+
+Tests English trigram tagger
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-summary.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-summary.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-summary.html (revision 0)
@@ -0,0 +1,166 @@
+
+
+
+
+
+
+Overview
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hidden Markov Model Tagger @ Apache
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/package-list
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/package-list (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/package-list (revision 0)
@@ -0,0 +1,4 @@
+org.apache.uima
+org.apache.uima.examples.tagger
+org.apache.uima.examples.tagger.trainAndTest
+unittests
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-tree.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-tree.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/overview-tree.html (revision 0)
@@ -0,0 +1,189 @@
+
+
+
+
+
+
+Class Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Hierarchy For All Packages
+
+
+Package Hierarchies: org.apache.uima , org.apache.uima.examples.tagger , org.apache.uima.examples.tagger.trainAndTest , unittests
+
+
+Class Hierarchy
+
+
+java.lang.Object
+org.apache.uima.analysis_component.AnalysisComponent_ImplBase (implements org.apache.uima.analysis_component.AnalysisComponent)
+
+org.apache.uima.analysis_component.Annotator_ImplBase
+org.apache.uima.analysis_component.JCasAnnotator_ImplBase
+org.apache.uima.examples.tagger.HMMTagger (implements org.apache.uima.examples.tagger.Tagger )
+
+
+
+ junit.framework.Assert
+junit.framework.TestCase (implements junit.framework.Test)
+
+
+ org.apache.uima.examples.tagger.trainAndTest.BrownReader (implements org.apache.uima.examples.tagger.trainAndTest.CorpusReader )
+ org.apache.uima.cas.impl.FeatureStructureImpl (implements java.lang.Cloneable, org.apache.uima.cas.FeatureStructure)
+
+org.apache.uima.jcas.cas.TOP
+org.apache.uima.jcas.cas.AnnotationBase (implements org.apache.uima.cas.AnnotationBaseFS)
+
+org.apache.uima.jcas.tcas.Annotation (implements org.apache.uima.cas.text.AnnotationFS)
+
+
+
+
+ org.apache.uima.examples.tagger.GrobMapping (implements org.apache.uima.examples.tagger.MappingInterface )
+ org.apache.uima.examples.tagger.trainAndTest.ModelGeneration (implements java.io.Serializable)
+ org.apache.uima.examples.tagger.trainAndTest.SuffixTree org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Edge org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Node org.apache.uima.examples.tagger.trainAndTest.SuffixTree.Suffix org.apache.uima.examples.tagger.trainAndTest.TaggerEvaluation org.apache.uima.examples.tagger.TagMapping (implements org.apache.uima.examples.tagger.MappingInterface )
+ org.apache.uima.examples.tagger.trainAndTest.Token org.apache.uima.jcas.cas.TOP_Type
+org.apache.uima.jcas.cas.AnnotationBase_Type
+org.apache.uima.jcas.tcas.Annotation_Type
+
+
+ org.apache.uima.examples.tagger.trainAndTest.TT_FormatReader (implements org.apache.uima.examples.tagger.trainAndTest.CorpusReader )
+ org.apache.uima.examples.tagger.Viterbi
+
+
+Interface Hierarchy
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/index.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/index.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/index.html (revision 0)
@@ -0,0 +1,39 @@
+
+
+
+
+
+
+Generated Documentation (Untitled)
+
+
+
+
+
+
+
+
+
+
+
+
+
+Frame Alert
+
+
+This document is designed to be viewed using the frames feature. If you see this message, you are using a non-frame-capable web client.
+
+Link to Non-frame version.
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/resources/inherit.gif
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: D:\Jane\IBM\ECLIPSE_Workspaces_maven\Tagger\doc\resources\inherit.gif
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/deprecated-list.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/deprecated-list.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/deprecated-list.html (revision 0)
@@ -0,0 +1,143 @@
+
+
+
+
+
+
+Deprecated List
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Deprecated API
+
+
+Contents
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/allclasses-noframe.html
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/allclasses-noframe.html (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/doc/allclasses-noframe.html (revision 0)
@@ -0,0 +1,64 @@
+
+
+
+
+
+
+All Classes
+
+
+
+
+
+
+
+
+
+
+All Classes
+
+
+
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/unittests/TaggerTest.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/unittests/TaggerTest.java (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/unittests/TaggerTest.java (revision 0)
@@ -0,0 +1,192 @@
+/*
+ *Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package unittests;
+
+import junit.framework.TestCase;
+import java.util.*;
+import java.util.Map.Entry;
+
+import org.apache.uima.examples.tagger.HMMTagger;
+import org.apache.uima.examples.tagger.Viterbi;
+
+
+public class TaggerTest extends TestCase {
+
+ private HMMTagger hmm;
+
+ private List sent; // sentence
+
+ private List gold_standard;
+
+ private List tagger_output;
+
+ /**
+ * Set up the test fixture
+ */
+
+ protected void setUp() {
+
+ hmm = new HMMTagger();
+ gold_standard = new ArrayList();
+ sent = new ArrayList();
+ tagger_output = new ArrayList();
+
+ }
+
+ /**
+ * Tests tagging for German.
+ *
+ */
+ @SuppressWarnings("unchecked")
+ public void testGermanTagger() {
+
+ System.out.println("Tesing German Model... ");
+ List POS = new ArrayList();
+
+ try {
+ hmm.my_model = HMMTagger.get_model("resources/german/TuebaModel.dat");
+ } catch (Exception e) {
+ System.out.println("Model which is supposed to be used for testing does not exist");
+ }
+ System.out.println(hmm.my_model.word_probs.size() + " distinct words in the model");
+
+ Iterator>> keyValuePairs = hmm.my_model.word_probs.entrySet()
+ .iterator(); // iterate over words
+
+ for (int i = 0; i < hmm.my_model.word_probs.size(); i++) {
+ Map.Entry> entry = (Map.Entry>) keyValuePairs
+ .next();
+ Object key = entry.getKey();
+ Map pos = (Map) hmm.my_model.word_probs.get(key); // map of possible pos-s of
+ // the word
+ Object[] pos_s = pos.entrySet().toArray(); // for iteration over possible pos_s
+
+ for (int u = 0; u < pos_s.length; u++) {
+
+ Map.Entry> entry2 = (Map.Entry>) pos_s[u];
+ Object key2 = entry2.getKey(); // pos of a word
+ if (POS.contains(key2)) {
+ continue;
+
+ } else {
+ POS.add(key2);
+ }
+ }
+
+ }
+ Collections.sort(POS);
+ System.out.println("Number of part-of-speech tags used: " + POS.size());
+ System.out.println("These are: " + POS);
+
+ System.out.println("Testing German trigram tagger..");
+
+ sent.add("Jerry");
+ sent.add("liebt");
+ sent.add("Wansley");
+ sent.add(".");
+
+ System.out.println(sent);
+
+ hmm.N = 3;
+ // hmm.END_OF_SENT_TAG = "$.";
+
+ String[] out = new String[] { "NE", "VVFIN", "NE", "$." };
+ gold_standard.addAll(Arrays.asList(out));
+ tagger_output = Viterbi.process(hmm.N, sent, "$.", hmm.my_model.suffix_tree,
+ hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
+ hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3,
+ hmm.my_model.theta);
+ System.out.println("expected: " + gold_standard);
+ System.out.println("tagger output: " + tagger_output);
+ assertEquals(gold_standard, tagger_output);
+ System.out.println("Very Good!");
+ System.out.println("==========================================================");
+ }
+
+ /**
+ * Tests English trigram tagger
+ *
+ */
+ @SuppressWarnings("unchecked")
+ public void testEnglishTagger() {
+
+ System.out.println("Tesing English Model... ");
+ List POS = new ArrayList();
+
+ try {
+ hmm.my_model = HMMTagger.get_model("resources/english/BrownModel.dat");
+ } catch (Exception e) {
+ System.out.println("Model which is supposed to be used for testing does not exist");
+ }
+ System.out.println(hmm.my_model.word_probs.size() + " distinct words in the model");
+
+ Iterator>> keyValuePairs = hmm.my_model.word_probs.entrySet()
+ .iterator(); // iterate over words
+
+ for (int i = 0; i < hmm.my_model.word_probs.size(); i++) {
+ Map.Entry> entry = (Map.Entry>) keyValuePairs
+ .next();
+ Object key = entry.getKey();
+ Map pos = (Map) hmm.my_model.word_probs.get(key); // map of possible pos-s of
+ // the word
+ Object[] pos_s = pos.entrySet().toArray(); // for iteration over possible pos_s
+
+ for (int u = 0; u < pos_s.length; u++) {
+
+ Map.Entry> entry2 = (Map.Entry>) pos_s[u];
+ Object key2 = entry2.getKey(); // pos of a word
+ if (POS.contains(key2)) {
+ continue;
+
+ } else {
+ POS.add(key2);
+ }
+ }
+
+ }
+ Collections.sort(POS);
+ System.out.println("Number of part-of-speech tags used: " + POS.size());
+ System.out.println("These are: " + POS);
+
+ System.out.println("Testing English trigram tagger...");
+
+ sent.add("Jerry");
+ sent.add("loves");
+ sent.add("Wansley");
+ sent.add(".");
+
+ System.out.println(sent);
+
+ hmm.N = 3;
+ // hmm.END_OF_SENT_TAG = "$.";
+
+ String[] out = new String[] { "np", "vbz", "np", "." };
+ gold_standard.addAll(Arrays.asList(out));
+ tagger_output = Viterbi.process(hmm.N, sent, ".", hmm.my_model.suffix_tree,
+ hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
+ hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3,
+ hmm.my_model.theta);
+ System.out.println("expected: " + gold_standard);
+ System.out.println("tagger output: " + tagger_output);
+ assertEquals(gold_standard, tagger_output);
+ System.out.println("Very Good!");
+ }
+
+}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/SentenceAnnotation.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/SentenceAnnotation.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/SentenceAnnotation.java (working copy)
@@ -1,62 +1,75 @@
-
/* First created by JCasGen Thu Oct 25 11:28:37 CEST 2007 */
package org.apache.uima;
-import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JCasRegistry;
import org.apache.uima.jcas.cas.TOP_Type;
import org.apache.uima.jcas.tcas.Annotation;
-
-/** sentence annotation
- * Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
- * XML source: C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
- * @generated */
+/**
+ * sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ * C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
+ *
+ * @generated
+ */
public class SentenceAnnotation extends Annotation {
- /** @generated
- * @ordered
+ /**
+ * @generated
+ * @ordered
*/
public final static int typeIndexID = JCasRegistry.register(SentenceAnnotation.class);
- /** @generated
- * @ordered
+
+ /**
+ * @generated
+ * @ordered
*/
public final static int type = typeIndexID;
- /** @generated */
- public int getTypeIndexID() {return typeIndexID;}
-
- /** Never called. Disable default constructor
- * @generated */
- protected SentenceAnnotation() {}
-
- /** Internal - constructor used by generator
- * @generated */
+
+ /** @generated */
+ public int getTypeIndexID() {
+ return typeIndexID;
+ }
+
+ /**
+ * Never called. Disable default constructor
+ *
+ * @generated
+ */
+ protected SentenceAnnotation() {
+ }
+
+ /**
+ * Internal - constructor used by generator
+ *
+ * @generated
+ */
public SentenceAnnotation(int addr, TOP_Type type) {
super(addr, type);
readObject();
}
-
+
/** @generated */
public SentenceAnnotation(JCas jcas) {
super(jcas);
- readObject();
- }
+ readObject();
+ }
- /** @generated */
+ /** @generated */
public SentenceAnnotation(JCas jcas, int begin, int end) {
super(jcas);
setBegin(begin);
setEnd(end);
readObject();
- }
+ }
- /**
- * Write your own initialization here
- *
- @generated modifiable */
- private void readObject() {}
-
+ /**
+ * Write your own initialization here
+ *
+ * @generated modifiable
+ */
+ private void readObject() {
+ }
+
}
-
-
\ No newline at end of file
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/SentenceAnnotation_Type.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/SentenceAnnotation_Type.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/SentenceAnnotation_Type.java (working copy)
@@ -1,4 +1,3 @@
-
/* First created by JCasGen Thu Oct 25 11:28:37 CEST 2007 */
package org.apache.uima;
@@ -11,45 +10,52 @@
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.tcas.Annotation_Type;
-/** sentence annotation
- * Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
- * @generated */
+/**
+ * sentence annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
+ *
+ * @generated
+ */
public class SentenceAnnotation_Type extends Annotation_Type {
/** @generated */
- protected FSGenerator getFSGenerator() {return fsGenerator;}
+ protected FSGenerator getFSGenerator() {
+ return fsGenerator;
+ }
+
/** @generated */
- private final FSGenerator fsGenerator =
- new FSGenerator() {
- public FeatureStructure createFS(int addr, CASImpl cas) {
- if (SentenceAnnotation_Type.this.useExistingInstance) {
- // Return eq fs instance if already created
- FeatureStructure fs = SentenceAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
- if (null == fs) {
- fs = new SentenceAnnotation(addr, SentenceAnnotation_Type.this);
- SentenceAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
- return fs;
- }
- return fs;
- } else return new SentenceAnnotation(addr, SentenceAnnotation_Type.this);
- }
- };
+ private final FSGenerator fsGenerator = new FSGenerator() {
+ public FeatureStructure createFS(int addr, CASImpl cas) {
+ if (SentenceAnnotation_Type.this.useExistingInstance) {
+ // Return eq fs instance if already created
+ FeatureStructure fs = SentenceAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
+ if (null == fs) {
+ fs = new SentenceAnnotation(addr, SentenceAnnotation_Type.this);
+ SentenceAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
+ return fs;
+ }
+ return fs;
+ } else
+ return new SentenceAnnotation(addr, SentenceAnnotation_Type.this);
+ }
+ };
+
/** @generated */
public final static int typeIndexID = SentenceAnnotation.typeIndexID;
- /** @generated
- @modifiable */
- public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.SentenceAnnotation");
+ /**
+ * @generated
+ * @modifiable
+ */
+ public final static boolean featOkTst = JCasRegistry
+ .getFeatOkTst("org.apache.uima.SentenceAnnotation");
-
- /** initialize variables to correspond with Cas Type and Features
- * @generated */
+ /**
+ * initialize variables to correspond with Cas Type and Features
+ *
+ * @generated
+ */
public SentenceAnnotation_Type(JCas jcas, Type casType) {
super(jcas, casType);
- casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+ casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl) this.casType, getFSGenerator());
}
}
-
-
-
-
\ No newline at end of file
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/TokenAnnotation.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/TokenAnnotation.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/TokenAnnotation.java (working copy)
@@ -1,98 +1,130 @@
-
/* First created by JCasGen Thu Oct 25 11:28:37 CEST 2007 */
package org.apache.uima;
-import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JCasRegistry;
import org.apache.uima.jcas.cas.TOP_Type;
import org.apache.uima.jcas.tcas.Annotation;
-
-/** Single token annotation
- * Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
- * XML source: C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
- * @generated */
+/**
+ * Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007 XML source:
+ * C:/code/ApacheUIMA/Tagger/desc/HmmTaggerTAE.xml
+ *
+ * @generated
+ */
public class TokenAnnotation extends Annotation {
- /** @generated
- * @ordered
+ /**
+ * @generated
+ * @ordered
*/
public final static int typeIndexID = JCasRegistry.register(TokenAnnotation.class);
- /** @generated
- * @ordered
+
+ /**
+ * @generated
+ * @ordered
*/
public final static int type = typeIndexID;
- /** @generated */
- public int getTypeIndexID() {return typeIndexID;}
-
- /** Never called. Disable default constructor
- * @generated */
- protected TokenAnnotation() {}
-
- /** Internal - constructor used by generator
- * @generated */
+
+ /** @generated */
+ public int getTypeIndexID() {
+ return typeIndexID;
+ }
+
+ /**
+ * Never called. Disable default constructor
+ *
+ * @generated
+ */
+ protected TokenAnnotation() {
+ }
+
+ /**
+ * Internal - constructor used by generator
+ *
+ * @generated
+ */
public TokenAnnotation(int addr, TOP_Type type) {
super(addr, type);
readObject();
}
-
+
/** @generated */
public TokenAnnotation(JCas jcas) {
super(jcas);
- readObject();
- }
+ readObject();
+ }
- /** @generated */
+ /** @generated */
public TokenAnnotation(JCas jcas, int begin, int end) {
super(jcas);
setBegin(begin);
setEnd(end);
readObject();
- }
+ }
- /**
- * Write your own initialization here
- *
- @generated modifiable */
- private void readObject() {}
-
-
-
- //*--------------*
- //* Feature: tokenType
+ /**
+ * Write your own initialization here
+ *
+ * @generated modifiable
+ */
+ private void readObject() {
+ }
- /** getter for tokenType - gets token type
- * @generated */
+ // *--------------*
+ // * Feature: tokenType
+
+ /**
+ * getter for tokenType - gets token type
+ *
+ * @generated
+ */
public String getTokenType() {
- if (TokenAnnotation_Type.featOkTst && ((TokenAnnotation_Type)jcasType).casFeat_tokenType == null)
+ if (TokenAnnotation_Type.featOkTst
+ && ((TokenAnnotation_Type) jcasType).casFeat_tokenType == null)
jcasType.jcas.throwFeatMissing("tokenType", "org.apache.uima.TokenAnnotation");
- return jcasType.ll_cas.ll_getStringValue(addr, ((TokenAnnotation_Type)jcasType).casFeatCode_tokenType);}
-
- /** setter for tokenType - sets token type
- * @generated */
+ return jcasType.ll_cas.ll_getStringValue(addr,
+ ((TokenAnnotation_Type) jcasType).casFeatCode_tokenType);
+ }
+
+ /**
+ * setter for tokenType - sets token type
+ *
+ * @generated
+ */
public void setTokenType(String v) {
- if (TokenAnnotation_Type.featOkTst && ((TokenAnnotation_Type)jcasType).casFeat_tokenType == null)
+ if (TokenAnnotation_Type.featOkTst
+ && ((TokenAnnotation_Type) jcasType).casFeat_tokenType == null)
jcasType.jcas.throwFeatMissing("tokenType", "org.apache.uima.TokenAnnotation");
- jcasType.ll_cas.ll_setStringValue(addr, ((TokenAnnotation_Type)jcasType).casFeatCode_tokenType, v);}
-
-
- //*--------------*
- //* Feature: posTag
+ jcasType.ll_cas.ll_setStringValue(addr,
+ ((TokenAnnotation_Type) jcasType).casFeatCode_tokenType, v);
+ }
- /** getter for posTag - gets contains part-of-speech of a corresponding token
- * @generated */
+ // *--------------*
+ // * Feature: posTag
+
+ /**
+ * getter for posTag - gets contains part-of-speech of a corresponding token
+ *
+ * @generated
+ */
public String getPosTag() {
- if (TokenAnnotation_Type.featOkTst && ((TokenAnnotation_Type)jcasType).casFeat_posTag == null)
+ if (TokenAnnotation_Type.featOkTst && ((TokenAnnotation_Type) jcasType).casFeat_posTag == null)
jcasType.jcas.throwFeatMissing("posTag", "org.apache.uima.TokenAnnotation");
- return jcasType.ll_cas.ll_getStringValue(addr, ((TokenAnnotation_Type)jcasType).casFeatCode_posTag);}
-
- /** setter for posTag - sets contains part-of-speech of a corresponding token
- * @generated */
+ return jcasType.ll_cas.ll_getStringValue(addr,
+ ((TokenAnnotation_Type) jcasType).casFeatCode_posTag);
+ }
+
+ /**
+ * setter for posTag - sets contains part-of-speech of a corresponding token
+ *
+ * @generated
+ */
public void setPosTag(String v) {
- if (TokenAnnotation_Type.featOkTst && ((TokenAnnotation_Type)jcasType).casFeat_posTag == null)
+ if (TokenAnnotation_Type.featOkTst && ((TokenAnnotation_Type) jcasType).casFeat_posTag == null)
jcasType.jcas.throwFeatMissing("posTag", "org.apache.uima.TokenAnnotation");
- jcasType.ll_cas.ll_setStringValue(addr, ((TokenAnnotation_Type)jcasType).casFeatCode_posTag, v);}
+ jcasType.ll_cas
+ .ll_setStringValue(addr, ((TokenAnnotation_Type) jcasType).casFeatCode_posTag, v);
}
-
-
\ No newline at end of file
+}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/TokenAnnotation_Type.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/TokenAnnotation_Type.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/TokenAnnotation_Type.java (working copy)
@@ -1,4 +1,3 @@
-
/* First created by JCasGen Thu Oct 25 11:28:37 CEST 2007 */
package org.apache.uima;
@@ -13,89 +12,101 @@
import org.apache.uima.cas.Feature;
import org.apache.uima.jcas.tcas.Annotation_Type;
-/** Single token annotation
- * Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
- * @generated */
+/**
+ * Single token annotation Updated by JCasGen Thu Oct 25 11:28:37 CEST 2007
+ *
+ * @generated
+ */
public class TokenAnnotation_Type extends Annotation_Type {
/** @generated */
- protected FSGenerator getFSGenerator() {return fsGenerator;}
+ protected FSGenerator getFSGenerator() {
+ return fsGenerator;
+ }
+
/** @generated */
- private final FSGenerator fsGenerator =
- new FSGenerator() {
- public FeatureStructure createFS(int addr, CASImpl cas) {
- if (TokenAnnotation_Type.this.useExistingInstance) {
- // Return eq fs instance if already created
- FeatureStructure fs = TokenAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
- if (null == fs) {
- fs = new TokenAnnotation(addr, TokenAnnotation_Type.this);
- TokenAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
- return fs;
- }
- return fs;
- } else return new TokenAnnotation(addr, TokenAnnotation_Type.this);
- }
- };
+ private final FSGenerator fsGenerator = new FSGenerator() {
+ public FeatureStructure createFS(int addr, CASImpl cas) {
+ if (TokenAnnotation_Type.this.useExistingInstance) {
+ // Return eq fs instance if already created
+ FeatureStructure fs = TokenAnnotation_Type.this.jcas.getJfsFromCaddr(addr);
+ if (null == fs) {
+ fs = new TokenAnnotation(addr, TokenAnnotation_Type.this);
+ TokenAnnotation_Type.this.jcas.putJfsFromCaddr(addr, fs);
+ return fs;
+ }
+ return fs;
+ } else
+ return new TokenAnnotation(addr, TokenAnnotation_Type.this);
+ }
+ };
+
/** @generated */
public final static int typeIndexID = TokenAnnotation.typeIndexID;
- /** @generated
- @modifiable */
- public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.TokenAnnotation");
-
+
+ /**
+ * @generated
+ * @modifiable
+ */
+ public final static boolean featOkTst = JCasRegistry
+ .getFeatOkTst("org.apache.uima.TokenAnnotation");
+
/** @generated */
final Feature casFeat_tokenType;
+
/** @generated */
- final int casFeatCode_tokenType;
- /** @generated */
+ final int casFeatCode_tokenType;
+
+ /** @generated */
public String getTokenType(int addr) {
- if (featOkTst && casFeat_tokenType == null)
+ if (featOkTst && casFeat_tokenType == null)
jcas.throwFeatMissing("tokenType", "org.apache.uima.TokenAnnotation");
return ll_cas.ll_getStringValue(addr, casFeatCode_tokenType);
}
- /** @generated */
+
+ /** @generated */
public void setTokenType(int addr, String v) {
- if (featOkTst && casFeat_tokenType == null)
+ if (featOkTst && casFeat_tokenType == null)
jcas.throwFeatMissing("tokenType", "org.apache.uima.TokenAnnotation");
- ll_cas.ll_setStringValue(addr, casFeatCode_tokenType, v);}
-
-
-
+ ll_cas.ll_setStringValue(addr, casFeatCode_tokenType, v);
+ }
+
/** @generated */
final Feature casFeat_posTag;
+
/** @generated */
- final int casFeatCode_posTag;
- /** @generated */
+ final int casFeatCode_posTag;
+
+ /** @generated */
public String getPosTag(int addr) {
- if (featOkTst && casFeat_posTag == null)
+ if (featOkTst && casFeat_posTag == null)
jcas.throwFeatMissing("posTag", "org.apache.uima.TokenAnnotation");
return ll_cas.ll_getStringValue(addr, casFeatCode_posTag);
}
- /** @generated */
+
+ /** @generated */
public void setPosTag(int addr, String v) {
- if (featOkTst && casFeat_posTag == null)
+ if (featOkTst && casFeat_posTag == null)
jcas.throwFeatMissing("posTag", "org.apache.uima.TokenAnnotation");
- ll_cas.ll_setStringValue(addr, casFeatCode_posTag, v);}
-
-
+ ll_cas.ll_setStringValue(addr, casFeatCode_posTag, v);
+ }
-
-
- /** initialize variables to correspond with Cas Type and Features
- * @generated */
+ /**
+ * initialize variables to correspond with Cas Type and Features
+ *
+ * @generated
+ */
public TokenAnnotation_Type(JCas jcas, Type casType) {
super(jcas, casType);
- casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+ casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl) this.casType, getFSGenerator());
-
- casFeat_tokenType = jcas.getRequiredFeatureDE(casType, "tokenType", "uima.cas.String", featOkTst);
- casFeatCode_tokenType = (null == casFeat_tokenType) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_tokenType).getCode();
+ casFeat_tokenType = jcas.getRequiredFeatureDE(casType, "tokenType", "uima.cas.String",
+ featOkTst);
+ casFeatCode_tokenType = (null == casFeat_tokenType) ? JCas.INVALID_FEATURE_CODE
+ : ((FeatureImpl) casFeat_tokenType).getCode();
-
casFeat_posTag = jcas.getRequiredFeatureDE(casType, "posTag", "uima.cas.String", featOkTst);
- casFeatCode_posTag = (null == casFeat_posTag) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_posTag).getCode();
+ casFeatCode_posTag = (null == casFeat_posTag) ? JCas.INVALID_FEATURE_CODE
+ : ((FeatureImpl) casFeat_posTag).getCode();
}
}
-
-
-
-
\ No newline at end of file
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/Token.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/Token.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/Token.java (working copy)
@@ -1,44 +0,0 @@
-/*
- *Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-package org.apache.uima.examples.tagger;
-
-/**
- *
- */
-public class Token {
-
- String pos;
-
- String word;
-
- public Token() {
- this(null, null);
- }
-
- public Token(String word) {
- this(word, null);
- }
-
- public Token(String word, String pos) {
- this.word = word;
- this.pos = pos;
- }
-}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/Token.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/Token.java (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/Token.java (working copy)
@@ -18,16 +18,17 @@
*
*/
-package org.apache.uima.examples.tagger;
+package org.apache.uima.examples.tagger.trainAndTest;
/**
- *
+ * Defines token features.
+ * Helpful as an intermediate layer between a text and the tagger.
*/
public class Token {
- String pos;
+ public String pos;
- String word;
+ public String word;
public Token() {
this(null, null);
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/TaggerEvaluation.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/TaggerEvaluation.java (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/TaggerEvaluation.java (revision 0)
@@ -0,0 +1,193 @@
+/*
+ *Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+
+package org.apache.uima.examples.tagger.trainAndTest;
+
+import java.io.FileInputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.uima.examples.tagger.HMMTagger;
+import org.apache.uima.examples.tagger.MappingInterface;
+import org.apache.uima.examples.tagger.Token;
+import org.apache.uima.examples.tagger.Viterbi;
+
+
+/**
+ * Evaluation of Tagger
+ * NB: As currently implemented, this is intended only for small tests on small files
+ * (the approach is very naive and takes quite a long time with big files).
+ */
+public class TaggerEvaluation {
+
+
+ @SuppressWarnings("unchecked")
+ static void get_eval(ModelGeneration my_model, List wordList, List posList, List TagList){
+
+ int wrong_tag =0;
+ int right_tags = 0;
+ List unknown_list = new ArrayList();
+ int unknown_count = 0; // counter for erroneously tagged unknown words
+ int unknown_all = 0; // counter for all unknown words
+ List wrong_tags = new ArrayList();
+ Map wrong_tag_counts = new HashMap();
+ Map wrong_tag_map = new HashMap();
+ int words = wordList.size();
+ for (int u = 0; u < posList.size(); u++){
+
+ if (! posList.get(u).equalsIgnoreCase((String)TagList.get(u))){
+ String test = TagList.get(u).toString().toLowerCase();
+
+ wrong_tags.add(posList.get(u));
+
+ Integer freq = (Integer) wrong_tag_counts.get(posList.get(u));
+ wrong_tag_counts.put(posList.get(u), (freq == null) ? 1 : freq + 1);
+
+ if (wrong_tag_map.containsKey(posList.get(u))){
+ // if the token is already in the dictionary, then get its erroneous tags
+ Map tags = wrong_tag_map.get(posList.get(u));
+ Integer freq2= (Integer) tags.get(TagList.get(u));
+ // if a given POS is already in its values, then add its corresponding count, otherwise add a POS value with a count of 1
+ tags.put((String)TagList.get(u), (freq2 == null) ? 1 : freq2 + 1);
+ } else {
+ Map tags = new HashMap();
+ tags.put((String)TagList.get(u), 1);
+ wrong_tag_map.put(posList.get(u), tags);
+ }
+
+ wrong_tag+=1;
+ if (! my_model.word_probs.containsKey(wordList.get(u))){
+ unknown_list.add(wordList.get(u));
+ // System.out.println(wordList.get(u));
+ unknown_count+=1;
+ }
+ } else {
+ right_tags +=1;
+ }
+ // count unknown words
+ if (! my_model.word_probs.containsKey(wordList.get(u))){
+ unknown_all+=1;
+ }
+ }
+
+
+
+ System.out.println(wordList.size() + " tokens in the corpus");
+ double percent_unknown_errors = (double)unknown_count/(double)wrong_tag;
+ System.out.println("percent of unknown words among erronously tagged: "+unknown_count+" ("+percent_unknown_errors*100+"%"+")");
+ System.out.println("percent of correctly tagged unknown words from all unknown: "+(((double)unknown_all-unknown_count)/(double)unknown_all)*100+"%");
+ System.out.println("total words: "+words);
+ System.out.println("total unknown words: "+unknown_all+" == "+(double)unknown_all/(double)words + "%");
+ double percent_errors = (double)wrong_tag/(double)words;
+ System.out.println("total errors"+ wrong_tag+" which makes up "+percent_errors+" of tokens");
+
+ double accuracy = (double)right_tags/(double)words;
+ System.out.println("accuracy: "+ accuracy);
+
+
+ ArrayList sortedValues = new ArrayList(wrong_tag_counts.values());
+ Collections.sort(sortedValues);
+
+ // System.out.println(sortedValues.toString());
+ Object [] keys = wrong_tag_counts.keySet().toArray();
+ Object [] keys2 = wrong_tag_map.keySet().toArray();
+ for (int i =0; i < keys.length; i++){
+
+ System.out.print(keys[i]+" ");
+ System.out.println(wrong_tag_counts.get(keys[i]));
+
+ System.out.print(keys2[i]);
+ System.out.print(":");
+ System.out.println(wrong_tag_map.get(keys2[i]));
+
+ }
+ }
+
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+
+
+ TaggerEvaluation test = new TaggerEvaluation();
+ ModelGeneration my_model;
+ CorpusReader reader;
+ MappingInterface MAPPING;
+ String file;
+
+ try {
+ // Get configuration parameter values
+ String paramFile = "tagger.properties";
+
+
+ // create and load default properties
+ Properties defaultProps = new Properties();
+ FileInputStream in = new FileInputStream(paramFile);
+ defaultProps.load(in);
+ in.close();
+
+ String MODEL = defaultProps.getProperty("MODEL");
+ String n = defaultProps.getProperty("N");
+
+ my_model = HMMTagger.get_model(MODEL);
+ String t = defaultProps.getProperty("DO_MAPPING");
+
+ my_model.N = Integer.parseInt(n);
+
+ boolean DO_MAPPING = Boolean.valueOf(t);
+ System.out.println("DO_MAPPING = "+DO_MAPPING);
+
+ if (DO_MAPPING){
+ String m = defaultProps.getProperty("MAPPING");
+ MAPPING = (MappingInterface)(Class.forName(m)).newInstance();
+ } else {
+ MAPPING = null;
+ }
+ String r = defaultProps.getProperty("CORPUS_READER");
+ reader = (CorpusReader)(Class.forName(r)).newInstance();
+ file = defaultProps.getProperty("GOLD_STANDARD");
+
+ List corpus = reader.read_corpus(file, MAPPING);
+ List wordList = new ArrayList();
+ // List posSent = new ArrayList(); // for pos-s on the level of sentence
+
+ List posList = new ArrayList();
+
+ for (int x=0; x corpus;
- public TT_FormatReader(String InputFile) {
+ public TT_FormatReader(String InputFile, MappingInterface Mapping) {
corpus = read_corpus(InputFile);
-
+ if (Mapping == null) {
+ this.corpus = read_corpus(InputFile);
+ } else {
+ this.corpus = Mapping.map_tags(read_corpus(InputFile)); // in case we need to map
+ }
}
+*/
+
- public static List read_corpus(String file) {
+ public List read_corpus(String file, MappingInterface Mapping) {
// Text is already tokenized
@@ -66,7 +76,10 @@
System.out.println(e);
return null;
}
-
+ if (Mapping != null) {
+ text = Mapping.map_tags(text); // in case we need to map
+ }
return text;
}
+
}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.java (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/SuffixTree.java (working copy)
@@ -18,7 +18,7 @@
*
*/
-package org.apache.uima.examples.tagger;
+package org.apache.uima.examples.tagger.trainAndTest;
import java.util.ArrayList;
import java.util.HashMap;
@@ -26,18 +26,19 @@
import java.util.Map;
/**
- * Java implementation of the Ukkonen's suffix tree (the fastest known suffix tree, O(n)) inspired
- * by Mark Nelson's http://marknelson.us/1996/08/01/suffix-trees/
+ * Java implementation of Ukkonen's suffix tree, inspired by Mark Nelson's tutorial:
+ * http://marknelson.us/1996/08/01/suffix-trees/
*/
public class SuffixTree {
- String text = "";
+ public String text = "";
- List nodes = new ArrayList();
+ public List nodes = new ArrayList();
- Map edges = new HashMap(); // Map to store the starting node of the edges and their corresponding
- // first characters
+ public Map edges = new HashMap(); // Map to store the starting node of the edges and their corresponding
+ // first characters
+
char[] chars;
public SuffixTree() {
@@ -45,8 +46,9 @@
}
Suffix active_point = new Suffix(0, 0, -1); // initial active point is the first non-leaf suffix
- // in the tree
+ // in the tree
+
public SuffixTree(String text) {
// int token_begin = words.length();
@@ -135,14 +137,14 @@
* Internal Class EDGE
*/
- class Edge {
- int first_char_index;
+ public class Edge {
+ public int first_char_index;
- int last_char_index;
+ public int last_char_index;
int start_node;
- int end_node;
+ public int end_node;
public Edge(int parent_node, int end_node, int first_char_index, int last_char_index) {
this.first_char_index = first_char_index;
@@ -181,7 +183,7 @@
int suf_span = suffix.last_char_index - suffix.first_char_index + 1;
nodes.get(new_node_index - 1).suffix_node = suffix.origin_node;
Edge new_edge = new Edge(new_node_index, this.end_node, this.first_char_index + suf_span,
- this.last_char_index);
+ this.last_char_index);
insert_edge(new_edge);
// SuffixTree.remove_edge(this.start_node, this.first_char_index);
@@ -190,7 +192,7 @@
// this.end_node = new_node_index;
insert_edge(new Edge(this.start_node, new_node_index, this.first_char_index,
- this.last_char_index));
+ this.last_char_index));
return new_node_index; // return the new origin node index of the last edge
}
@@ -206,8 +208,8 @@
// at a particular node can find the next smaller suffix
// by following the suffix_node link to a new node. Nodes
// are stored in a simple array.
- class Node {
- int suffix_node;
+ public class Node {
+ public int suffix_node;
// static int count=0;
@@ -235,7 +237,7 @@
break;
}
} else if (active_point.isImplicit()) { // if suffix is implicit, i.e. it does not end in a
- // leaf node $
+ // leaf node $
List keys2 = new ArrayList();
keys2.add(active_point.origin_node);
keys2.add(chars[active_point.first_char_index]);
@@ -282,44 +284,4 @@
active_point.canonize();
}
- // ///////// TEST
-
- /*
- * public void walk_tree(SuffixTree suffix_tree, int current_node_index, Suffix current_suffix,
- * int current_suffix_len){ int edge_count = 0; Set alphabet = new HashSet(); char []
- * chars = text.toCharArray(); for (char c : chars){ if(!alphabet.add(c)){ // }
- * System.out.println(alphabet.toString()); Character [] letters = (Character [])
- * alphabet.toArray(); for (char ch : letters){ try { List keys = new ArrayList();
- * keys.add(current_node_index); keys.add(ch); Edge edge = (Edge) suffix_tree.edges.get(keys); if
- * (current_node_index != edge.start_node){ System.err.println("ERROR"); }
- * System.out.println(current_node_index+" "+edge.start_node); edge_count+=1; int l =
- * current_suffix_len; for (int j : Range((Comparable)edge.last_char_index, (Comparable)
- * (edge.first_char_index+1))){ current_suffix. = suffix_tree.text.charAt(j); } } } }
- * }
- *
- *
- *
- *
- *
- *
- * public static void main(String[] args) {
- *
- * String test_str="pace"; SuffixTree suffix_tree = new SuffixTree(test_str);
- *
- *
- * System.out.println(edges.keySet().toString());
- *
- * Iterator kv = edges.entrySet().iterator(); for (int u= 0; u corpus;
-
- String InputDir;
-
- public BrownReader(String InputDir) {
- this.InputDir = InputDir;
- if (MAPPING) {
- this.corpus = TagMapping.map_tags(read_corpus(read_dir(InputDir))); // in case we need to map
- // tags, TODO: trasfer to
- // parameter file
- } else {
- this.corpus = read_corpus(read_dir(InputDir));
- }
-
- }
-
+
/**
- * Reads file names from Directory
+ * Reads the Brown Corpus in the NLTK distribution format. Iterates over all files in the directory,
+ * which are in a sentence-per-line format, and returns all tokens in the collection as a List of
+ * {@link Token}s.
*
* @param directory
- * name
- * @return an array of file names in the directory
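+ * path of the directory that contains the corpus files
+ * @param Mapping tag mapping applied to the tokens that were read, or null to keep the original tags
+ * @return a list of tokens from all files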
*/
- public static String[] read_dir(String directory) {
+
+
+ public List read_corpus(String directory, MappingInterface Mapping) {
+
+
+ // Build the path of every entry in the directory by prefixing its name with the directory path
File dir = new File(directory);
String[] list = dir.list();
- String[] new_list = dir.list();
+ String[] new_list = new String[list.length];
for (int i = 0; i < list.length; i++) {
String dir_list = directory + "/" + list[i];
new_list[i] = dir_list;
}
- return new_list;
- }
-
- /**
- * Reads Brown Corpus from NLTK Distribution Format. Iterates over all files in the directory,
- * which are in a sentence per line format, and returns all tokens in the collection in a List of
- * Tokens {@link Token}}
- *
- * @param files
- * an array of file names
- * @return a list of tokens from all files
- *
- */
-
- List all_words = new ArrayList();
-
- public static List read_corpus(String[] files) {
-
+
String line;
List text = new ArrayList();
@@ -96,8 +73,8 @@
int line_count = 0;
- for (int i = 0; i < files.length; i++) {
- String file = files[i];
+ for (int i = 0; i < new_list.length; i++) {
+ String file = new_list[i];
try {
BufferedReader in = new BufferedReader(new FileReader(file));
@@ -107,7 +84,7 @@
String[] tokens = delimiters.split(line);
for (int x = 0; x < tokens.length; x++) { // iterate over tokens with their
- // corresponding POS
+ // corresponding POS
tokens[x] = tokens[x].replaceAll("[\\n\\t]+", "");
// for cases in Brown corpus like "//in" :(
@@ -160,7 +137,9 @@
}
}
System.out.println(line_count + " sentences in the corpus");
-
+ if (Mapping != null) {
+ text = Mapping.map_tags(text); // in case we need to map
+ }
return text;
}
/*
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java (working copy)
@@ -18,9 +18,10 @@
*/
-package org.apache.uima.examples.tagger;
+package org.apache.uima.examples.tagger.trainAndTest;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
@@ -30,16 +31,20 @@
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Properties;
import java.util.Set;
import java.util.Map.Entry;
+import org.apache.uima.examples.tagger.*;
+
+
/**
- * Trains an N-gram model for the tagger, iterating over the files from some predefined training directory
- * Writes the resulting model to a binary fileSystem
+ * Trains an N-gram model for the tagger, iterating over the files from some predefined training directory.
+ *
+ * Writes the resulting model to a binary file.
+ *
+ * NB: at the moment, both bi- and trigram statistics are saved in one model file.
*
- *
- * NB. At the moment- both bi-and trigram statistics are saved in one model file..
- *
*/
@@ -67,8 +72,8 @@
int N; // for the N-gram model
- double [] lambdas2 = new double[2];
- double [] lambdas3 = new double[3];
+ public double [] lambdas2 = new double[2];
+ public double [] lambdas3 = new double[3];
public double theta; // for suffix probabiliites smoothing
// transient String InputDir;
@@ -83,63 +88,63 @@
*/
@SuppressWarnings("unchecked")
public ModelGeneration(List corpus, String OutputFile) {
- this.OutputFile = OutputFile;
- this.corpus = corpus;
+ this.OutputFile = OutputFile;
+ this.corpus = corpus;
}
private void init(){
-
- List>> l = get_word_probs(get_lexicon(corpus));
-
- this.word_probs = (Map) l.get(0);
- this.suffix_tree = (Map) l.get(1);
- this.suffix_tree_capitalized = (Map) l.get(2);
- /* Map test2 = get_ngrams(2);
- Map test3 = get_ngrams(3);
- Map test1 = get_ngrams(1);
- Iterator> it = test2.entrySet().iterator(); // iterate over words
-
- for (int g = 0; g>> l = get_word_probs(get_lexicon(corpus));
+
+ this.word_probs = (Map) l.get(0);
+ this.suffix_tree = (Map) l.get(1);
+ this.suffix_tree_capitalized = (Map) l.get(2);
+ /* Map test2 = get_ngrams(2);
+ Map test3 = get_ngrams(3);
+ Map test1 = get_ngrams(1);
+ Iterator> it = test2.entrySet().iterator(); // iterate over words
+
+ for (int g = 0; g pos = lexicon.get(current_token.word);
@@ -189,7 +193,7 @@
static Map sm2 = new HashMap ( ); // suffix map for capitalized;
@SuppressWarnings("unchecked")
static List>> get_word_probs(Map> corpus){
-
+
Map> word_counts=corpus;
Map> word_probs = new HashMap>();
@@ -203,25 +207,25 @@
{
Map.Entry> entry = (Map.Entry>) keyValuePairs.next();
Object key = entry.getKey();
-
- Map pos2= word_counts.get(key); // map of possible pos-s of the word
-
- Iterator> keyValuePairs_pos = pos2.entrySet().iterator(); // iterate over words
- Map lokal = new HashMap();
+
+ Map pos2= word_counts.get(key); // map of possible pos-s of the word
+
+ Iterator> keyValuePairs_pos = pos2.entrySet().iterator(); // iterate over words
+ Map lokal = new HashMap();
for (int u = 0; u < pos2.size(); u++)
{
- Map.Entry entry_pos = (Map.Entry) keyValuePairs_pos.next();
- Object key2 = entry_pos.getKey(); // pos of a word
+ Map.Entry entry_pos = (Map.Entry) keyValuePairs_pos.next();
+ Object key2 = entry_pos.getKey(); // pos of a word
- if (key2 != "count") {
-
- Object value2 = entry_pos.getValue(); // its count
- double freq_pos=pos_counts.get(key2);
- Double val2 = (Double)value2 / freq_pos; // Prob(w|t) = freq(w,t)/freq(t)
- lokal.put((String)key2, val2); // save probability as a log2
+ if (key2 != "count") {
+
+ Object value2 = entry_pos.getValue(); // its count
+ double freq_pos=pos_counts.get(key2);
+ Double val2 = (Double)value2 / freq_pos; // Prob(w|t) = freq(w,t)/freq(t)
+ lokal.put((String)key2, val2); // save the plain probability; no logarithm is taken here
} else {
- lokal.remove("count");
+ lokal.remove("count");
}
}
// insert the word and its corresponding tags as well as their common probabilities into the words maps
@@ -238,99 +242,106 @@
// here we get words with counts under 10 for suffix probabilities.. the condition can be changed upt o you..
if (word_counts.get(key).get("count")<10){
-
- SuffixTree st = new SuffixTree((String) key);
-
- Iterator kv = st.edges.entrySet().iterator();
-
- for (int f= 0; f 9){
+ word_end = key.toString().substring(key.toString().length()-9, key.toString().length());
+ } else {
+ word_end = (String) key;
+ }
+
+ SuffixTree st = new SuffixTree(word_end);
+
+ Iterator kv = st.edges.entrySet().iterator();
+
+ for (int f= 0; f etwas = new HashMap();
+ while (it.hasNext()){
+ Object element = it.next();
+
+ //for (int u = 0; u< local_suffixes.size(); u++){
+ Map etwas = new HashMap();
- if(local_sm.containsKey(element)){
- Map pos_suffix = new HashMap();
+ if(local_sm.containsKey(element)){
+ Map pos_suffix = new HashMap();
- // get map of possible pos-s of the suffix
- pos_suffix= (Map) local_sm.get(element);
- Iterator> pos_suf = pos_suffix.entrySet().iterator(); // iterate over words
+ // get map of possible pos-s of the suffix
+ pos_suffix= (Map) local_sm.get(element);
+ Iterator> pos_suf = pos_suffix.entrySet().iterator(); // iterate over words
- for (int k = 0; k < pos_suffix.size(); k++)
- {
- Map.Entry entry3 = (Map.Entry) pos_suf.next();
-
- Object key_pos = entry3.getKey(); // pos of a suffix
- Object value_pos = entry3.getValue(); // its probability count
-
-
- // If a given pos of a suffix is also present in the pos-s of a corresponding word
- // then add both up
-
- if (word_probs.get(key).containsKey(key_pos)){
- Double val_suffix = (Double)value_pos + word_probs.get(key).get(key_pos); // Prob(w|t) = freq(w,t)/freq(t)
- etwas.put((String)key_pos, val_suffix);
- } else {
- etwas.put((String)key_pos, (Double)value_pos);
-
- }
- // add the pos of a corresponding word with its probability into the suffix map,
- // (which is not yet present in the poss of the suffix)
-
- Set smth2 = word_probs.get(key).keySet();
- Object [] smth = smth2.toArray();
-
- for (int r=0; r entry3 = (Map.Entry) pos_suf.next();
+
+ Object key_pos = entry3.getKey(); // pos of a suffix
+ Object value_pos = entry3.getValue(); // its probability count
+
+
+ // If a given pos of a suffix is also present in the pos-s of a corresponding word
+ // then add both up
+
+ if (word_probs.get(key).containsKey(key_pos)){
+ Double val_suffix = (Double)value_pos + word_probs.get(key).get(key_pos); // Prob(w|t) = freq(w,t)/freq(t)
+ etwas.put((String)key_pos, val_suffix);
+ } else {
+ etwas.put((String)key_pos, (Double)value_pos);
+
+ }
+ // add the pos of a corresponding word with its probability into the suffix map,
+ // (which is not yet present in the pos-s of the suffix)
+
+ Set smth2 = word_probs.get(key).keySet();
+ Object [] smth = smth2.toArray();
+
+ for (int r=0; r();
@@ -348,36 +359,36 @@
@SuppressWarnings("unchecked")
static Map> logify_probs(Map> probs){
-
- Map> logs = new HashMap>();
+
+ Map> logs = new HashMap>();
- Iterator>> keyValuePairs = probs.entrySet().iterator(); // iterate over words
-
- for (int i = 0; i < probs.size(); i++)
- {
- Map.Entry> entry = (Map.Entry>) keyValuePairs.next();
- Object key = entry.getKey();
- Map poss= probs.get(key); // map of possible pos-s of the word
-
- Object [] pos_s = poss.entrySet().toArray(); // for iteration over possible pos_s
-
- for (int u = 0; u < pos_s.length; u++)
- {
-
- Map.Entry entry2 = (Map.Entry) pos_s[u];
- // System.out.println(entry);
- Object key2 = entry2.getKey(); // pos of a word
-
- Double value2 = (Double) entry2.getValue(); // its count
- poss.put((String)key2, Math.log(value2)); // save probability as a log2
- }
- // insert the word and its corresponding tags as well as their common probabilities into the words maps
- logs.put((String)key, poss);
-
- }
- return logs;
+ Iterator>> keyValuePairs = probs.entrySet().iterator(); // iterate over words
+
+ for (int i = 0; i < probs.size(); i++)
+ {
+ Map.Entry> entry = (Map.Entry>) keyValuePairs.next();
+ Object key = entry.getKey();
+ Map poss= probs.get(key); // map of possible pos-s of the word
+
+ Object [] pos_s = poss.entrySet().toArray(); // for iteration over possible pos_s
+
+ for (int u = 0; u < pos_s.length; u++)
+ {
+
+ Map.Entry entry2 = (Map.Entry) pos_s[u];
+ // System.out.println(entry);
+ Object key2 = entry2.getKey(); // pos of a word
+
+ Double value2 = (Double) entry2.getValue(); // its count
+ poss.put((String)key2, Math.log(value2)); // save the natural logarithm (base e) of the probability
+ }
+ // insert the word and its corresponding tags as well as their common probabilities into the words maps
+ logs.put((String)key, poss);
+
+ }
+ return logs;
}
-
+
/**
* Computes N-gram frequencies
@@ -426,8 +437,9 @@
static Map bigrams;
static Map trigrams;
+ @SuppressWarnings("unchecked")
static Map get_transition_probs(int N) throws IllegalArgumentException{
- Map probs1= new HashMap();
+ Map probs1= new HashMap();
Map probs2= new HashMap();
Map probs3= new HashMap();
unigrams = get_ngrams(1);
@@ -435,7 +447,7 @@
trigrams = get_ngrams(3);
if (N==1) {
- Iterator keyValuePairs = unigrams.entrySet().iterator();
+ Iterator keyValuePairs = unigrams.entrySet().iterator();
for (int i = 0; i < unigrams.size(); i++) // for all bigrams
{
Map.Entry entry = (Map.Entry) keyValuePairs.next();
@@ -496,90 +508,92 @@
* currently lambdas are calculated as in (Brants, 2000)
*/
+ @SuppressWarnings("unchecked")
private double [] calculate_lambda(int N){
- double lambda1 = 0;
- double lambda2 = 0;
- double lambda3 = 0;
-
- double count2 = 0;
- double count3 = 0;
-
- if (N ==2) {
- Iterator keyValuePairs = bigrams.entrySet().iterator();
- for (int i = 0; i < bigrams.size(); i++) // for all bigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- Object key = entry.getKey(); // get a bigram
- String [] t = ((String)key).split("_");
- double freq1 = unigrams.get(t[0]); // get a count of a preceding unigram
- Double freq2 = (Double) entry.getValue(); // get a count of a bigram
-
- double f2 = (freq2-1)/(freq1-1);
- double f1 = (freq1-1)/(tokens_count_all_corpus-1);
-
- double freq = get_max(f2, f1, 0);
- if (freq == f2){lambda2+=freq2;count2+=freq2;}
- else {lambda1+=freq2;count2+=freq2;}
- } lambdas2[0] = lambda1/count2; lambdas2[1]= lambda2/count2;
- }
-
- if (N == 3) {
-
- Iterator keyValuePairs = trigrams.entrySet().iterator();
- for (int i = 0; i < trigrams.size(); i++) // for all trigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- Object key = entry.getKey(); // get a trigram
- String [] t = ((String)key).split("_");
- String tt = t[0]+"_"+t[1];
+ double lambda1 = 0;
+ double lambda2 = 0;
+ double lambda3 = 0;
+
+ double count2 = 0;
+ double count3 = 0;
+
+ if (N ==2) {
+ Iterator keyValuePairs = bigrams.entrySet().iterator();
+ for (int i = 0; i < bigrams.size(); i++) // for all bigrams
+ {
+ Map.Entry entry = (Map.Entry) keyValuePairs.next();
+ Object key = entry.getKey(); // get a bigram
+ String [] t = ((String)key).split("_");
+ double freq1 = unigrams.get(t[0]); // get a count of a preceding unigram
+ Double freq2 = (Double) entry.getValue(); // get a count of a bigram
+
+ double f2 = (freq2-1)/(freq1-1);
+ double f1 = (freq1-1)/(tokens_count_all_corpus-1);
+
+ double freq = get_max(f2, f1, 0);
+ if (freq == f2){lambda2+=freq2;count2+=freq2;}
+ else {lambda1+=freq2;count2+=freq2;}
+ } lambdas2[0] = lambda1/count2; lambdas2[1]= lambda2/count2;
+ }
+
+ if (N == 3) {
+
+ Iterator keyValuePairs = trigrams.entrySet().iterator();
+ for (int i = 0; i < trigrams.size(); i++) // for all trigrams
+ {
+ Map.Entry entry = (Map.Entry) keyValuePairs.next();
+ Object key = entry.getKey(); // get a trigram
+ String [] t = ((String)key).split("_");
+ String tt = t[0]+"_"+t[1];
- Double freq2 = bigrams.containsKey(tt) ? bigrams.get(tt) : 0.0; // get a count of a preceding bigram
- Double freq3 = (Double) entry.getValue(); // get a count of a trigram
- Double freq1 = unigrams.containsKey(t[0]) ? unigrams.get(t[0]) : 0.0;
-
- double f3 = (freq3-1)/(freq2-1);
- double f2 = (freq2-1)/(freq1-1);
- double f1 = (freq1-1)/(tokens_count_all_corpus-1);
- double freq = get_max(f3, f2, f1);
- if (freq == f3) {lambda3+= freq3; count3+=freq3;} // or just real frequency?
- else if (freq == f2){lambda2+=freq3; count3+=freq3;}
- else {lambda1+=freq3;count3+=freq3;}
- } lambdas3[0] = lambda1/count3; lambdas3[1]=lambda2/count3; lambdas3[2]=lambda3/count3;
- }
- System.out.println("lambdas for 2-grams = "+lambdas2[0]+" "+lambdas2[1]+"\n"+"lambdas for 3-grams"+lambdas3[0]+" "+lambdas3[1]+" "+lambdas3[2]);
- return ((N == 2) ? lambdas2 : lambdas3);
+ Double freq2 = bigrams.containsKey(tt) ? bigrams.get(tt) : 0.0; // get a count of a preceding bigram
+ Double freq3 = (Double) entry.getValue(); // get a count of a trigram
+ Double freq1 = unigrams.containsKey(t[0]) ? unigrams.get(t[0]) : 0.0;
+
+ double f3 = (freq3-1)/(freq2-1);
+ double f2 = (freq2-1)/(freq1-1);
+ double f1 = (freq1-1)/(tokens_count_all_corpus-1);
+ double freq = get_max(f3, f2, f1);
+ if (freq == f3) {lambda3+= freq3; count3+=freq3;} // or just real frequency?
+ else if (freq == f2){lambda2+=freq3; count3+=freq3;}
+ else {lambda1+=freq3;count3+=freq3;}
+ } lambdas3[0] = lambda1/count3; lambdas3[1]=lambda2/count3; lambdas3[2]=lambda3/count3;
+ }
+ System.out.println("lambdas for 2-grams = "+lambdas2[0]+" "+lambdas2[1]+"\n"+"lambdas for 3-grams"+lambdas3[0]+" "+lambdas3[1]+" "+lambdas3[2]);
+ return ((N == 2) ? lambdas2 : lambdas3);
}
+ @SuppressWarnings("unchecked")
private double get_theta (Map m) {
- double d = 0;
- double sum1 = 0;
- double tagset_size = m.size();
- Iterator tags_probs = m.entrySet().iterator();
- for (int h = 0; hmax){ max = b; }
- if (c>max) {max = c; }
- return max;
+ double max = a;
+ if (b>max){ max = b; }
+ if (c>max) {max = c; }
+ return max;
}
/**
@@ -612,15 +626,50 @@
}
public static void main(String[] args) {
-
- // PennReader penn = new PennReader("F:/pos/wsj/training");
- // BrownReader brown = new BrownReader("../brown");
- TT_FormatReader tiger = new TT_FormatReader("D:/Jane/IBM/Tagger_komplett/tueba_tigerFormat.txt");
-
- ModelGeneration md = new ModelGeneration(tiger.corpus, "TuebaModel.dat");
- md.init();
+ ModelGeneration md;
+ CorpusReader reader;
+ MappingInterface MAPPING;
+ String file;
+ String fileOutput;
+
+ try {
+ String paramFile = "tagger.properties";
+
+ // create and load default properties
+ Properties defaultProps = new Properties();
+ FileInputStream in = new FileInputStream(paramFile);
+ defaultProps.load(in);
+ in.close();
+
+ file = defaultProps.getProperty("FILE");
+ fileOutput = defaultProps.getProperty("FILE_OUTPUT");
+
+ boolean DO_MAPPING;
+ String b = defaultProps.getProperty("DO_MAPPING");
+ DO_MAPPING = Boolean.valueOf(b);
+ System.out.println("HALLO");
+ if (DO_MAPPING){
+ String m = defaultProps.getProperty("MAPPING");
+ MAPPING = (MappingInterface)(Class.forName(m)).newInstance();
+ System.out.println("HALLO2");
+
+ } else {
+ MAPPING = null;
+ }
+
+
+ String r = defaultProps.getProperty("CORPUS_READER");
+ reader = (CorpusReader)(Class.forName(r)).newInstance();
+
+ // md = new ModelGeneration(new BrownReader().read_corpus("../brown",m),fileOutput);
+
+ System.out.println(fileOutput);
+ md = new ModelGeneration(reader.read_corpus(file, MAPPING),fileOutput);
+ md.init();
+ } catch (Exception e) {
+ System.err.println(e);
+ }
}
-
+
}
-
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/CorpusReader.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/CorpusReader.java (revision 0)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/CorpusReader.java (revision 0)
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.examples.tagger.trainAndTest;
+
+
+import java.util.List;
+import org.apache.uima.examples.tagger.MappingInterface;
+
+
+/**
+ * Reads (annotated) text file(s) and transforms every word into a {@code Token} object.
+ *
+ */
+public interface CorpusReader {
+
+ // public List corpus = new ArrayList();
+ // public List read_corpus(String file);
+ public List read_corpus(String file, MappingInterface mapping);
+}
+
+
+
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/TT_FormatReader.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/TT_FormatReader.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/TT_FormatReader.java (working copy)
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/**
- * Tree Tagger Format Reader: a word per line with possibly html-tags,
- * which are ignored and 2/3 columns separated by tabs.
- * Columns are: "word \t tag \t lemma"
- *
- */
-
-package org.apache.uima.examples.tagger;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-public class TT_FormatReader {
-
- List corpus;
-
- public TT_FormatReader(String InputFile) {
- corpus = read_corpus(InputFile);
-
- }
-
- public static List read_corpus(String file) {
-
- // Text is already tokenized
-
- int line_count = 0;
- String line;
- List text = new ArrayList();
- try {
- BufferedReader in = new BufferedReader(new FileReader(file));
-
- while ((line = in.readLine()) != null) {
- if (line.trim().length() > 0) {
- line_count += 1;
- String[] t = line.split("\t");
- if (t.length >= 2) {
- Token token = new Token(t[0], t[1]);
- text.add(token);
- }
- }
- }
- in.close();
- } catch (IOException e) {
- System.out.println(e);
- return null;
- }
-
- return text;
- }
-}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/ModelGenerationBytes.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/ModelGenerationBytes.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/ModelGenerationBytes.java (working copy)
@@ -1,810 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-package org.apache.uima.examples.tagger;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.Map.Entry;
-
-/**
- * Trains an N-gram model for the tagger, iterating over the files from some predefined training directory
- * Writes the resulting model to a binary fileSystem
- *
- *
- * NB. At the moment- both bi-and trigram statistics are saved in one model file..
- *
- */
-
-
-public class ModelGenerationBytes implements java.io.Serializable{
-
- private static final long serialVersionUID = 1L;
-
- public Map posMap = new HashMap();
- public Map suffix_tree= new HashMap ( ) ;
-
- public Map suffix_tree_capitalized = new HashMap();
- /**
- * Map containing {@code } probabilities, that is probability of a certain word given a certain tag at a time t: P(wordt |tagt ))
- *
- */
- public Map word_probs = new HashMap();
-
- /**
- * Map containing N-gram probabilities
- */
-
- public Map transition_probs = new HashMap() ;
-
-
- transient List posList = new ArrayList();
-
- int N; // for the N-gram model
-
- double [] lambdas2 = new double[2];
- double [] lambdas3 = new double[3];
- public double theta; // for suffix probabiliites smoothing
-
- // Word Counts...
- Map counts = new HashMap();
- transient String InputDir;
- transient String OutputFile;
- transient List corpus;
-
- /**
- * @param N N=1, 2 or 3
- * @param InputDir input directory name
- * @param OutputFile output file name
- * MapBrownToPenn TagMapping
- */
- @SuppressWarnings("unchecked")
-public ModelGenerationBytes(List corpus, String OutputFile) {
-
- this.OutputFile = OutputFile;
- this.corpus = corpus;
-
- }
-
- /**
- * Check is the token is capitalized
- */
- static boolean capitalized(String word){
- boolean b;
- char first_letter = word.charAt(0);
- char capitalized = word.toUpperCase().charAt(0);
- if (first_letter == capitalized) {
- b = true;
- } else {
- b = false;
- }
- return b;
- }
-
-
- int tokens_count_all_corpus = 0;
-
-
-
-
- /**
- * Reads sentences, extracts {@code } frequency patterns
- * @param corpus list containing all tokens of the training corpus of the type {@link Token}}
- * @return map containing frequency counts for {@code }
- */
- private Map> get_lexicon(List corpus){
-
- /*
- * Extracts all available POS from the text and puts them into the map of POS - string and corresponding ID
- */
-
- for (int x=0; x> lexicon= new HashMap>();
-
- // Map counts = new HashMap();
- for (int x=0; x pos = new HashMap();
- tokens_count_all_corpus +=1;
- Token current_token = corpus.get(x);
- // System.out.println(current_token.word);
- // posList.add(current_token.pos); // Filling of POS tags list with available in the training corpus POSs.. (TODO: admissable POSs should probably be trasmitted with parameter file..to avoid noise)
-
- /*
- // a try to get a separate entry for numbers, decreased the accuracy, so discarded at the moment..
- Pattern p = Pattern.compile("[0-9]*");
- Matcher m = p.matcher(current_token.word);
- boolean b = m.matches();
- // if a current tokren is a number
- if (b) {
- System.out.println(current_token.word+" "+current_token.pos);
- current_token.word = "@card";
- }
- */
- if (lexicon.containsKey(current_token.word)){
- // System.out.println(current_token.pos);
-
- // if the token is already in a dictionary, then get its POS-s
- // Map pos = lexicon.get(current_token.word);
- pos = lexicon.get(current_token.word);
-
- // System.out.println(current_token.word+" "+current_token.pos);
- Double freq=lexicon.get(current_token.word).get((Byte)posMap.get(current_token.pos));
- // if a given POS is already in its values, then add its corresponding count, otherwise add a POS value with a count of 1
- pos.put((Byte)posMap.get(current_token.pos), (freq == null) ? 1 : freq + 1);
- // System.out.println(pos.get(posMap.get(current_token.pos)));
- counts.put(current_token.word, counts.get(current_token.word)+1);
- } // if a token is unknown yet
- else {
- // Map pos = new HashMap();
- pos.put((Byte)posMap.get(current_token.pos),new Double(1));
- counts.put(current_token.word, new Integer(1));
- lexicon.put(current_token.word, pos);
- }
- }
- System.out.println(tokens_count_all_corpus);
- return lexicon;
- }
-
- /**
- * Convert posList to a byte array, and operate further on bytes.
- * Preserver original mapping in a hashmap
-
- byte [] b;
- private void getBytes(List pos){
- String [] classes = (String []) pos.toArray(new String[pos.size()]);
- for(int i = 0; i();
- // static byte b = 0;
-
- private Map getHashmap(List posList){
- byte b = 0;
- // String [] classes = (String []) posList.toArray(new String[posList.size()]);
- for(int i = 0; i> convertLexicon(Map> words) {
- Map>convertedWords = new HashMap>();
- Iterator>> keyValuePairs = words.entrySet().iterator(); // iterate over words
- for (int i = 0; i < words.size(); i++)
- {
- Map.Entry> entry = (Map.Entry>) keyValuePairs.next();
- Object key = entry.getKey();
-
- Map pos2= words.get(key); // map of possible pos-s of the word
-
- Iterator> keyValuePairs_pos = pos2.entrySet().iterator(); // iterate over words
- Map lokal = new HashMap();
- for (int u = 0; u < pos2.size(); u++)
- {
-
- return convertedWords;
- }
- */
-
-
- /**
- * Computes {@code word_probs} using {@link #get_lexicon(List)} frequency counts for known words..
- * TO_DO: ADD SMOOTHING FOR UNKNOWNS?? OR add smoothing directly when come across unknown..
- */
-
- transient Map sm = new HashMap ( ) ; // suffix map for non-capitalized words
- transient Map sm2 = new HashMap ( ); // suffix map for capitalized;
- @SuppressWarnings("unchecked")
- public List get_word_probs(Map> corpus){
-
- Map word_counts=corpus;
- //System.out.println(word_counts.entrySet().toString());
- Map word_probs = new HashMap>();
-
- int mapsize = word_counts.size();
-
- Iterator> keyValuePairs = word_counts.entrySet().iterator(); // iterate over words
- Map pos_counts = get_ngrams(1);
-
- for (int i = 0; i < mapsize; i++)
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- Object key = entry.getKey();
-
- Map pos2= (Map) word_counts.get(key); // map of possible pos-s of the word
-
- Iterator> keyValuePairs_pos = pos2.entrySet().iterator(); // iterate over words
- Map lokal = new HashMap();
- for (int u = 0; u < pos2.size(); u++)
- {
- Map.Entry entry_pos = (Map.Entry) keyValuePairs_pos.next();
- Object key2 = entry_pos.getKey(); // pos of a word
- // System.out.println(key2);
-
- // if (!key2.equals("count")) {
- // byte b = (Byte)posMap.get(key2);
- Object value2 = entry_pos.getValue(); // its count
- // System.out.println(pos_counts);
- double freq_pos=pos_counts.get(key2);
- Double val2 = (Double)value2 / freq_pos; // Prob(w|t) = freq(w,t)/freq(t)
- // lokal.put((String)key2, val2); // save probability as a log2
- lokal.put(key2, val2); //
- // } else {
- // lokal.remove("count");
- // }
- }
- // insert the word and its corresponding tags as well as their common probabilities into the words maps
- word_probs.put((String)key, lokal);
-
- /**
- * Get suffix probabilities from the words with frequency lower than 10 in the corpus
- */
- Set local_suffixes = new HashSet();
-
- // We maintain two different suffix counts for capitalized and non-capitalized words
-
- Set local_suffixes_capitalized = new HashSet();
-
- // here we get words with counts under 10 for suffix probabilities.. the condition can be changed upt o you..
- if ((Integer)(counts.get(key))<10){
-
- SuffixTree st = new SuffixTree((String) key);
-
- Iterator kv = st.edges.entrySet().iterator();
-
- for (int f= 0; f pos_suf = pos_suffix.entrySet().iterator(); // iterate over words
-
-
- for (int k = 0; k < pos_suffix.size(); k++)
- {
- Map.Entry entry3 = (Map.Entry) pos_suf.next();
-
- Object key_pos = entry3.getKey(); // pos of a suffix
- // System.out.println(key_pos);
- // System.out.println(key_pos);
- // byte b = (Byte)posMap.get(key_pos);
-
- Object value_pos = entry3.getValue(); // its probability count
-
-
- // If a given pos of a suffix is also present in the pos-s of a corresponding word
- // then add both up
-
- if (((Map)word_probs.get(key)).containsKey(key_pos)){
- Double val_suffix = (Double)value_pos + (Double)((Map)word_probs.get(key)).get(key_pos); // Prob(w|t) = freq(w,t)/freq(t)
- // etwas.put((String)key_pos, val_suffix);
- etwas.put(key_pos, val_suffix);
- } else {
- // etwas.put((String)key_pos, (Double)value_pos);
- etwas.put(key_pos, value_pos);
- }
- // add the pos of a corresponding word with its probability into the suffix map,
- // (which is not yet present in the poss of the suffix)
-
- Set smth2 = ((Map)word_probs.get(key)).keySet();
- Object [] smth = smth2.toArray();
-
- for (int r=0; r();
- l.add(word_probs); //add word probabilities
- l.add(sm); // add suffixes of non-capitalized words
- l.add(sm2); // add suffixes of capitalized words
-
- return l;
- }
-
- /**
- * Make LOGs out of probabilities.. there was a reason to separate it from the get_word_probs method at the initial step
- *
- */
- /*
- @SuppressWarnings("unchecked")
-private Map> logify_probs(Map> probs){
-
- Map> logs = new HashMap>();
-
- Iterator>> keyValuePairs = probs.entrySet().iterator(); // iterate over words
-
- for (int i = 0; i < probs.size(); i++)
- {
- Map.Entry> entry = (Map.Entry>) keyValuePairs.next();
- Object key = entry.getKey();
- Map poss= probs.get(key); // map of possible pos-s of the word
-
- Object [] pos_s = poss.entrySet().toArray(); // for iteration over possible pos_s
-
- for (int u = 0; u < pos_s.length; u++)
- {
-
- Map.Entry entry2 = (Map.Entry) pos_s[u];
- // System.out.println(entry);
- Object key2 = entry2.getKey(); // pos of a word
-
- Double value2 = (Double) entry2.getValue(); // its count
- poss.put((String)key2, Math.log(value2)); // save probability as a log2
- }
- // insert the word and its corresponding tags as well as their common probabilities into the words maps
- logs.put((String)key, poss);
-
- }
- return logs;
- }*/
-
- /* Helper class
- *
-
-
- class CompoundKey{
-
- private byte [] keys;
-
- private CompoundKey(byte [] vals){
- initArray(vals);
- }
- private void initArray(byte [] numbers){
- this.keys = numbers;
- }
-
- /**
- * @see java.lang.Object#equals(java.lang.Object)
-
- @Override
- public boolean equals(Object object) {
-
- CompoundKey key = (CompoundKey) object;
-
- String s1 = hardwareClassname;
- String s2 = key.hardwareClassname;
- if (s1 != null && s1.equals(s2)) {
- String t1 = hardwareDriverType;
- String t2 = key.hardwareDriverType;
-
- if (t1 != null && t1.equals(t2)) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * @see java.lang.Object#hashCode()
-
- @Override
- public int hashCode() {
- int hashCode = (hardwareClassname + hardwareDriverType).hashCode();
- //int hashCode = 1;
-
- return hashCode;
- }
-
- /**
- * @see java.lang.Object#toString()
-
- @Override
- public String toString() {
- return hardwareClassname + ", " + hardwareDriverType;
- }
-
-
- }*/
-
- /**
- * Computes N-gram frequencies
- * @param N
- * @return Map N-grams of parts-of-speech, where {@code N = 1, 2 or 3}
- * @throws IllegalArgumentException
- */
- private Map get_ngrams(int N) throws IllegalArgumentException{
-
- Map ngrams1= new HashMap();
- Map ngrams2= new HashMap();
- Map ngrams3= new HashMap();
-
- if (N==1){
- for (int y=0; y(2);
- Byte b2 = (Byte) posMap.get(posList.get(y+1));
- Byte b1 = (Byte) posMap.get(posList.get(y));
-
- // String s2 = posList.get(y)+"_"+posList.get(y+1);
-
- b_key.add(b1);
- b_key.add(b2);
- // Double freq= (Double)ngrams2.get(s2);
- Double freq= (Double)ngrams2.get(b_key);
-
- // System.out.println(b_key.toString());
- // System.out.println(ngrams2.containsKey(b_key));
-
- ngrams2.put(b_key, (freq == null) ? 1 : freq + 1);
- }
- }
- else if (N==3){
- for (int y=0; y<(posList.size()-2); y++){
- // String s3 = posList.get(y)+"_"+posList.get(y+1)+"_"+posList.get(y+2);
- List b_key = new ArrayList(2);
-
- byte b1 = (Byte)posMap.get(posList.get(y));
- byte b2 = (Byte)posMap.get(posList.get(y+1));
- byte b3 = (Byte)posMap.get(posList.get(y+2));
- // String s2 = posList.get(y)+"_"+posList.get(y+1);
-
- b_key.add(b1);
- b_key.add(b2);
- b_key.add(b3);
-
- Double freq= (Double)ngrams2.get(b_key);
- // Double freq= (Double)ngrams3.get(s3);
- ngrams3.put(b_key, (freq == null) ? 1 : freq + 1);
- }
- } else{
- throw new IllegalArgumentException ("N=1, N=2 or N=3, no further N-grams are supported at the moment");
- }
- return ((N==1)? ngrams1: (N == 2) ? ngrams2 : ngrams3);
- }
-
- // @SuppressWarnings("unchecked")
- /**
- * Computes {@code transition_probs} using {@link #get_ngrams(int)} frequency counts for N-grams..
- */
-
- Map unigrams;
- Map bigrams;
- Map trigrams;
-
- @SuppressWarnings("unchecked")
-public Map get_transition_probs(int N) throws IllegalArgumentException{
- Map probs1= new HashMap();
- Map probs2= new HashMap();
- Map probs3= new HashMap();
- unigrams = get_ngrams(1);
- bigrams = get_ngrams(2);
- trigrams = get_ngrams(3);
-
- if (N==1) {
- Iterator keyValuePairs = unigrams.entrySet().iterator();
- for (int i = 0; i < unigrams.size(); i++) // for all bigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- Object key = entry.getKey(); // get a bigram
- double freq1 = unigrams.get(key); // get a count of a unigram
- double prob1 = (Double)freq1 / tokens_count_all_corpus; // Prob(key) = freq(tag)/freq(all_tags)?
- probs1.put(key, prob1); // save probability as a log2 : Math.log(prob1)
-
- }
- }
-
- else if (N==2){
-
- Iterator keyValuePairs = bigrams.entrySet().iterator();
- for (int i = 0; i < bigrams.size(); i++) // for all bigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- ArrayList key = (ArrayList) entry.getKey(); // get a bigram
- // String [] t = ((String)key).split("_");
- // double freq1 = unigrams.get(t[0]); // get a count of a preceding unigram
- double freq1 = unigrams.get(key.get(0));
- Object freq2 = entry.getValue(); // get a count of a bigram
-
- double prob2 = (Double)freq2 / freq1; // Prob(t2|t1) = freq(t1,t2)/freq(t1)
- probs2.put(key, prob2); // save probability as a log2: Math.log(prob2)
- }
- } else if (N==3){ // for trigram models
-
- Iterator keyValuePairs = trigrams.entrySet().iterator();
- for (int i = 0; i < trigrams.size(); i++) // for all trigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- ArrayList key = (ArrayList) entry.getKey(); // get a trigram
- // String [] t = ((String)key).split("_");
- // String tt = t[0]+"_"+t[1];
-
- Double freq1 = bigrams.get(key.subList(0, 2)); // get a count of a preceding bigram
-
- Object freq2 = entry.getValue(); // get a count of a trigram
-
- double prob3 = (Double)freq2/freq1; // Prob(t3|(t1_t2)) = freq(t1_t2_t3)/freq(t1_t2)
- probs3.put(key, prob3); // save probability as a log2: Math.log(prob3)
- }
- } else{
- throw new IllegalArgumentException ("only uni-, bi-, and trigramms are supported at the moment");
- }
- return ((N == 2) ? probs2 : (N==3)? probs3 : probs1);
- }
-
-
-
- /**
- * Computes alphas for linear interpolation smoothing of unknown n-grams
- * @param N N-gram
- * currently lambdas are calculated as in (Brants, 2000)
- */
-
- public double [] calculate_lambda(int N){
- double lambda1 = 0;
- double lambda2 = 0;
- double lambda3 = 0;
-
- double count2 = 0;
- double count3 = 0;
-
- if (N ==2) {
- Iterator keyValuePairs = bigrams.entrySet().iterator();
- for (int i = 0; i < bigrams.size(); i++) // for all bigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- ArrayList key = (ArrayList) entry.getKey(); // get a bigram
- // String [] t = ((String)key).split("_");
- double freq1 = unigrams.get(key.get(0)); // get a count of a preceding unigram
- Double freq2 = (Double) entry.getValue(); // get a count of a bigram
-
- double f2 = (freq2-1)/(freq1-1);
- double f1 = (freq1-1)/(tokens_count_all_corpus-1);
-
- double freq = get_max(f2, f1, 0);
-
- if (freq == f2){lambda2+=freq2;count2+=freq2;}
- else {lambda1+=freq2;count2+=freq2;}
- } lambdas2[0] = lambda1/count2; lambdas2[1]= lambda2/count2;
- }
-
- if (N == 3) {
-
- Iterator keyValuePairs = trigrams.entrySet().iterator();
- for (int i = 0; i < trigrams.size(); i++) // for all trigrams
- {
- Map.Entry entry = (Map.Entry) keyValuePairs.next();
- ArrayList key = (ArrayList) entry.getKey(); // get a trigram
- // String [] t = ((String)key).split("_");
- // String tt = t[0]+"_"+t[1];
-
- Double freq2 = bigrams.containsKey(key.subList(0, 3)) ? bigrams.get(key.subList(0, 3)) : 0.0; // get a count of a preceding bigram
- Double freq3 = (Double) entry.getValue(); // get a count of a trigram
- Double freq1 = unigrams.containsKey(key.get(0)) ? unigrams.get(key.get(0)) : 0.0;
-
- double f3 = (freq3-1)/(freq2-1);
- double f2 = (freq2-1)/(freq1-1);
- double f1 = (freq1-1)/(tokens_count_all_corpus-1);
- double freq = get_max(f3, f2, f1);
- if (freq == f3) {lambda3+= freq3; count3+=freq3;} // or just real frequency?
- else if (freq == f2){lambda2+=freq3; count3+=freq3;}
- else {lambda1+=freq3;count3+=freq3;}
- } lambdas3[0] = lambda1/count3; lambdas3[1]=lambda2/count3; lambdas3[2]=lambda3/count3;
- }
- System.out.println("lambdas for 2-grams = "+lambdas2[0]+" "+lambdas2[1]+"\n"+"lambdas for 3-grams"+lambdas3[0]+" "+lambdas3[1]+" "+lambdas3[2]);
- return ((N == 2) ? lambdas2 : lambdas3);
- }
-
- private double get_theta (Map m) {
- double d = 0;
- double sum1 = 0;
- double tagset_size = m.size();
- Iterator tags_probs = m.entrySet().iterator();
- for (int h = 0; hmax){ max = b; }
- if (c>max) {max = c; }
- return max;
- }
-
- /**
- * Writes the model to a binary file
- * @param filename output file name
- */
-
- private void write_to_file(String filename){
-
- File file = null;
- if (filename!=null) { file = new File (filename);}
- // or use a default file name
- if (file == null) {
- System.out.println ("Default: model.dat");
- file = new File ("model.dat");
- }
-
- try {
- // Create an output stream to the file.
- FileOutputStream file_output = new FileOutputStream (file);
- ObjectOutputStream o = new ObjectOutputStream( file_output );
- o.writeObject(this);
-
- file_output.close ();
- }
- catch (IOException e) {
- System.err.println ("IO exception = " + e );
- }
-
- }
-
- private void init(){
-
- List>> l = get_word_probs(get_lexicon(corpus));
-
- this.word_probs = (Map) l.get(0);
- this.suffix_tree = (Map) l.get(1);
- this.suffix_tree_capitalized = (Map) l.get(2);
- this.counts.entrySet().toString();
- Map test2 = get_ngrams(2);
- Map test3 = get_ngrams(3);
- Map test1 = get_ngrams(1);
- /* Iterator> it = test2.entrySet().iterator(); // iterate over words
-
- for (int g = 0; g sentence, String END_OF_SENT_TAG,
- Map> suffix_tree,Map> suffix_tree_cap, Map transition_probs,
+ Map> suffix_tree,Map> suffix_tree_cap, Map transition_probs,
Map> word_probs, double[] lambdas2, double[] lambdas3, double theta) {
sentence.add(0, END_OF_SENT_TAG);
@@ -104,87 +105,88 @@
if (i == 0) {
- // lookup for the non-capitalized variant of the word for the first word in a sentence and
- // weight by relative frequencies of the corresponding forms and sum them
- token = sentence.get(1);
- String non_cap = token.toLowerCase();
- /* Matcher m = p.matcher(token);
- boolean b = m.matches();
-
- if (b) {
- System.out.println(token);
- cardinals+=1;
- available_pos = word_probs.get("@card");
- //available_pos.put("CARD", 1.00);
- }
- else */
- if (word_probs.containsKey(token)| word_probs.containsKey(non_cap)) {
- if (ModelGeneration.capitalized(sentence.get(1)) & word_probs.containsKey(sentence.get(1))){
- available_pos = word_probs.get(sentence.get(1)); // here we get available states of the
- } else{
- // if a lexicon contains a non-capitalized variant of a word
- if (word_probs.containsKey(non_cap)) {
- available_pos = word_probs.get(non_cap);
- }}
+ // lookup for the non-capitalized variant of the word for the first word in a sentence and
+ // weight by relative frequencies of the corresponding forms and sum them
+ token = sentence.get(1);
+ String non_cap = token.toLowerCase();
+ /* Matcher m = p.matcher(token);
+ boolean b = m.matches();
+
+ if (b) {
+ System.out.println(token);
+ cardinals+=1;
+ available_pos = word_probs.get("@card");
+ //available_pos.put("CARD", 1.00);
+ }
+ else */
+ if (word_probs.containsKey(token)| word_probs.containsKey(non_cap)) {
+ if (ModelGeneration.capitalized(sentence.get(1)) & word_probs.containsKey(sentence.get(1))){
+ available_pos = word_probs.get(sentence.get(1)); // here we get available states of the
+ } else{
+ // if a lexicon contains a non-capitalized variant of a word
+ if (word_probs.containsKey(non_cap)) {
+ available_pos = word_probs.get(non_cap);
+ }}
} else
- // 2. smoothed suffix- the strategy described in (Brants, 2000)
+ // 2. smoothed suffix- the strategy described in (Brants, 2000)
{
- Map> suffix_tree_local;
- // if a word is capitalized ...
- if (ModelGeneration.capitalized(sentence.get(1))){
- suffix_tree_local = suffix_tree_cap;
- } else {
- suffix_tree_local = suffix_tree;
- }
-
- char [] unknown = sentence.get(1).toCharArray();
- for (int j=0;j entry = (Map.Entry) posValuePairs.next();
- Object key = entry.getKey();
- Double value = (Double) entry.getValue();
- if (available_pos_zwischen.containsKey(key)) {
- double zwischen_prob = (value + theta*(Double)available_pos_zwischen.get(key))/(1+theta);
- available_pos_zwischen.put(key, zwischen_prob);
- }/* else {
- available_pos_zwischen.put(key, value);
- }*/
- }
- } else {
- Iterator posValuePairs2 = available_pos_zwischen.entrySet().iterator();
- while(posValuePairs2.hasNext()){
- Map.Entry entry = (Map.Entry) posValuePairs2.next();
- Object key = entry.getKey();
- Double value = (Double) entry.getValue();
- // smooth suffix probability P(suffix|tag)
- double zwischen_prob = (0 + theta*value)/(1+theta);
- pos.put(key, zwischen_prob);
-
- } available_pos_zwischen = pos;
- }
-
- }
- available_pos = available_pos_zwischen;
- break;
- }
- //
- else if (j==unknown.length-1){
- available_pos = word_probs.get("Clinton");
- }
- }
+ Map> suffix_tree_local;
+ // if a word is capitalized ...
+ if (ModelGeneration.capitalized(sentence.get(1))){
+ suffix_tree_local = suffix_tree_cap;
+ } else {
+ suffix_tree_local = suffix_tree;
+ }
+
+ char [] unknown = sentence.get(1).toCharArray();
+ for (int j=0;j entry = (Map.Entry) posValuePairs.next();
+ Object key = entry.getKey();
+ Double value = (Double) entry.getValue();
+ if (available_pos_zwischen.containsKey(key)) {
+ double zwischen_prob = (value + theta*(Double)available_pos_zwischen.get(key))/(1+theta);
+ available_pos_zwischen.put(key, zwischen_prob);
+ }/* else {
+ available_pos_zwischen.put(key, value);
+ }*/
+ }
+ } else {
+ Iterator posValuePairs2 = available_pos_zwischen.entrySet().iterator();
+ while(posValuePairs2.hasNext()){
+ Map.Entry entry = (Map.Entry) posValuePairs2.next();
+ Object key = entry.getKey();
+ Double value = (Double) entry.getValue();
+ // smooth suffix probability P(suffix|tag)
+ double zwischen_prob = (0 + theta*value)/(1+theta);
+ pos.put(key, zwischen_prob);
+
+ } available_pos_zwischen = pos;
+ }
+
+ }
+ available_pos = available_pos_zwischen;
+ break;
+ }
+ //
+ else if (j==unknown.length-1){
+ // available_pos = word_probs.get("Clinton");
+ available_pos = word_probs.get("(");
+ }
+ }
}
all.putAll(init_probs(END_OF_SENT_TAG, available_pos));
continue; // go over to the next token
@@ -207,81 +209,81 @@
// next token
/* Matcher m2 = p.matcher(token);
- boolean b = m2.matches();
-
- if (b) {
- cardinals+=1;
- possible_pos_next = word_probs.get("@card");
- // possible_pos_next.put("CARD", 1.00);
- }
- else */
+ boolean b = m2.matches();
+
+ if (b) {
+ cardinals+=1;
+ possible_pos_next = word_probs.get("@card");
+ // possible_pos_next.put("CARD", 1.00);
+ }
+ else */
if (word_probs.containsKey(sentence.get(i + 1))) { // if the next token is known
possible_pos_next = word_probs.get(sentence.get(i + 1)); // get possible POS of the next
} else
{
- Map> suffix_tree_local;
- // if a word is capitalized ...
- if (ModelGeneration.capitalized(sentence.get(i+1))){
- suffix_tree_local = suffix_tree_cap;
- } else {
- suffix_tree_local = suffix_tree;
- }
-
- char [] unknown = sentence.get(i+1).toCharArray();
- for (int j=0;j entry = (Map.Entry) posValuePairs.next();
- Object key = entry.getKey();
- Double value = (Double) entry.getValue();
- if (available_pos_zwischen.containsKey(key)) {
- // smooth suffix probability P(suffix|tag)
- double zwischen_prob = (value + theta*(Double)available_pos_zwischen.get(key))/(1+theta);
- available_pos_zwischen.put(key, zwischen_prob);
- }/* else {
- available_pos_zwischen.put(key, value);
- }*/
- }
- } else {
- Iterator posValuePairs2 = available_pos_zwischen.entrySet().iterator(); // iterate over words
- while(posValuePairs2.hasNext()){
- Map.Entry entry = (Map.Entry) posValuePairs2.next();
- Object key = entry.getKey();
- Double value = (Double) entry.getValue();
- // smooth suffix probability P(suffix|tag)
- double zwischen_prob = (0 + theta*value)/(1+theta);
- pos.put(key, zwischen_prob);
-
- } available_pos_zwischen = pos;
- }
-
- }
- possible_pos_next = available_pos_zwischen;
- break;
- }
- else if (j==unknown.length-1){
- possible_pos_next = word_probs.get("Clinton");
- }
- }
+ Map> suffix_tree_local;
+ // if a word is capitalized ...
+ if (ModelGeneration.capitalized(sentence.get(i+1))){
+ suffix_tree_local = suffix_tree_cap;
+ } else {
+ suffix_tree_local = suffix_tree;
+ }
+
+ char [] unknown = sentence.get(i+1).toCharArray();
+ for (int j=0;j entry = (Map.Entry) posValuePairs.next();
+ Object key = entry.getKey();
+ Double value = (Double) entry.getValue();
+ if (available_pos_zwischen.containsKey(key)) {
+ // smooth suffix probability P(suffix|tag)
+ double zwischen_prob = (value + theta*(Double)available_pos_zwischen.get(key))/(1+theta);
+ available_pos_zwischen.put(key, zwischen_prob);
+ }/* else {
+ available_pos_zwischen.put(key, value);
+ }*/
+ }
+ } else {
+ Iterator posValuePairs2 = available_pos_zwischen.entrySet().iterator(); // iterate over words
+ while(posValuePairs2.hasNext()){
+ Map.Entry entry = (Map.Entry) posValuePairs2.next();
+ Object key = entry.getKey();
+ Double value = (Double) entry.getValue();
+ // smooth suffix probability P(suffix|tag)
+ double zwischen_prob = (0 + theta*value)/(1+theta);
+ pos.put(key, zwischen_prob);
+
+ } available_pos_zwischen = pos;
+ }
+
+ }
+ possible_pos_next = available_pos_zwischen;
+ break;
+ }
+ else if (j==unknown.length-1){
+ possible_pos_next = word_probs.get("(");
+ }
+ }
}
-
+
Iterator keyValuePairs_next = possible_pos_next.entrySet().iterator();
for (int u = 0; u < possible_pos_next.size(); u++) // for every possible tag of the next
// token, if the token is known..
{
- Map.Entry entry_next = (Map.Entry) keyValuePairs_next.next();
+ Map.Entry entry_next = (Map.Entry) keyValuePairs_next.next();
String key_next = (String) entry_next.getKey();
Double value_next = (Double) entry_next.getValue(); // get
double total_prob = 0.0; // just for fun, for forward algorithm
@@ -324,7 +326,7 @@
ngram = path_local.get(path_local.size() - 1) + "_" + key_next;
} else if (N == 2) {
-
+
ngram = path_local.get(path_local.size() - 1) + "_" + key_next;
} else if (N == 3 && i != 1) {
@@ -339,35 +341,55 @@
/* -- till here -- */
double pp = 0;
-
- // If an n-gram is known
- if (transition_probs.containsKey(ngram)) {
+
+ // smoothing only unknown n-grams strategy
+ /* if (transition_probs.containsKey(ngram)) {
// P(t2|t1) || use logs because of small numbers: log(pq) = log(p) +
// log(q); if model parameters are stored logged then only addition
// is performed at runtime
pp = value_next + transition_probs.get(ngram);
- } else {
- // System.err.println("UNKNOWN NGRAM");
+ } else { */
+ // System.err.println("UNKNOWN NGRAM");
- // TODO add unknown ngram handler
- pp = value_next + 0.001; // vorl�ufig, zum testen
- //
- }
+ // At the moment we smooth both - known and unknown n-grams (seems to perform better)
+ double ppp;
+ if (N ==3 && i==1){
+ double lambda1 = lambdas2[0];
+ double lambda2 = lambdas2[1];
+
+ ppp = (transition_probs.containsKey(ngram)) ? ((lambda2*transition_probs.get(ngram))+(lambda1*transition_probs.get(key_next))) : (lambda1*transition_probs.get(key_next));
+ pp = Math.log(value_next) + Math.log(ppp); // P(t|w) * P(t1,t2,t3)
+ }
+ else if (N==3){
+ double lambda1 = lambdas3[0];
+ double lambda2 = lambdas3[1];
+ double lambda3 = lambdas3[2];
+
+ if(transition_probs.containsKey(ngram)){
+
+
+ ppp = (lambda3*transition_probs.get(ngram))+(lambda2*transition_probs.get(ngram2))+(lambda1*transition_probs.get(key_next));
+ } else {
+ // System.out.println(ngram2);
+ ppp = (transition_probs.containsKey(ngram2)) ? ((lambda2*transition_probs.get(ngram2))+(lambda1*transition_probs.get(key_next))) : (lambda1*transition_probs.get(key_next));
+ }pp = Math.log(value_next) + Math.log(ppp);
+ }
+ if (N ==2){
+ double lambda1 = lambdas2[0];
+ double lambda2 = lambdas2[1];
+
+ ppp = (transition_probs.containsKey(ngram)) ? ((lambda2*transition_probs.get(ngram))+(lambda1*transition_probs.get(key_next))) : (lambda1*transition_probs.get(key_next));
+ pp = Math.log(value_next) + Math.log(ppp);
+ }
+
vprob_local += pp;
if (y == 0) {
max_viterbi_prob = vprob_local;
- // System.out.println("initial max_viterbi_prob"+max_viterbi_prob);
}
-
- // total_prob += prob_local; // sum is no more good as we changed to
- // logarithms
- // System.out.println("P("+sentence.get(i+1)+"|"+key_next+")*P("+ngram+")="+vprob_local);
- if (vprob_local >= max_viterbi_prob) { // HIER ENTSCHEIDET WELCHE von
- // den m�glichen states wird
- // �bernehmen
+ if (vprob_local >= max_viterbi_prob) {
+
max_viterbi_prob = vprob_local;
-
max_viterbi_path = new ArrayList(path_local);
max_viterbi_path.add(key_next);
}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/SuffixTree.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/SuffixTree.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/SuffixTree.java (working copy)
@@ -1,325 +0,0 @@
-/*
- *Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-package org.apache.uima.examples.tagger;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * Java implementation of the Ukkonen's suffix tree (the fastest known suffix tree, O(n)) inspired
- * by Mark Nelson's http://marknelson.us/1996/08/01/suffix-trees/
- */
-public class SuffixTree {
-
- String text = "";
-
- List nodes = new ArrayList();
-
- Map edges = new HashMap(); // Map to store the starting node of the edges and their corresponding
- // first characters
-
- char[] chars;
-
- public SuffixTree() {
- this("");
- }
-
- Suffix active_point = new Suffix(0, 0, -1); // initial active point is the first non-leaf suffix
- // in the tree
-
- public SuffixTree(String text) {
- // int token_begin = words.length();
-
- nodes.removeAll(nodes);
-
- this.text = text;
- chars = text.toCharArray();
-
- for (int i = 0; i < chars.length; i++) {
- add_prefix(i, active_point); //
- }
- }
-
- //
- public void insert_edge(Edge edge) {
- List keys = new ArrayList();
- keys.add(edge.start_node);
- keys.add(chars[edge.first_char_index]);
- edges.put(keys, edge);
- }
-
- /**
- *
- * Internal Class SUFFIX
- */
-
- class Suffix {
-
- int origin_node;
-
- int first_char_index;
-
- int last_char_index;
-
- public Suffix(int node, int begin, int end) {
- this.origin_node = node;
- this.first_char_index = begin;
- this.last_char_index = end;
- }
-
- boolean isExplicit() {
- return first_char_index > last_char_index;
- }
-
- boolean isImplicit() {
- return last_char_index >= first_char_index;
- }
-
- // "The canonical representation of the suffix simply requires that the origin_node
- // in the Suffix object be the closest parent to the end point of the string.
- // This means that the suffix string represented by the pair (0, "ABABABC"),
- // would be canonized by moving first to (1, "ABABC"), then (4, "ABC"), and finally (8,"").
- // When a suffix string ends on an explicit node, the canonical representation will use an empty
- // string
- // to define the remaining characters in the string.
- //
- // An empty string is defined by setting first_char_index to be greater than last_char_index.
- // When this is the case, we know that the suffix ends on an explicit node.
- // If first_char_index is less than or equal to last_char_index,
- // it means that the suffix string ends on an implicit node."
- // (Mark Nelson. Fast String Searching With Suffix Trees.
- // http://marknelson.us/1996/08/01/suffix-trees/)
-
- void canonize() {
- if (!this.isExplicit()) {
- List keys = new ArrayList();
- keys.add(this.origin_node);
- keys.add(chars[this.first_char_index]);
- Edge edge = (Edge) edges.get(keys);
-
- int edge_span = edge.last_char_index - edge.first_char_index + 1;
-
- int suffix_span = this.last_char_index - this.first_char_index + 1;
-
- if (edge_span <= suffix_span) {
- this.first_char_index = this.first_char_index + edge_span;
- this.origin_node = edge.end_node;
- this.canonize();
-
- }
- }
- }
- }
-
- /**
- * Internal Class EDGE
- */
-
- class Edge {
- int first_char_index;
-
- int last_char_index;
-
- int start_node;
-
- int end_node;
-
- public Edge(int parent_node, int end_node, int first_char_index, int last_char_index) {
- this.first_char_index = first_char_index;
- this.last_char_index = last_char_index;
- this.start_node = parent_node;
- this.end_node = end_node;
- }
-
- // When a suffix ends on an implicit node, adding a new character
- // means I have to split an existing edge.
- // The existing edge loses its parent, as well as some of its leading
- // characters. The newly created edge descends from the original
- // parent, and now has the existing edge as a child.
- //
- // Since the existing edge is getting a new parent and starting
- // character, it is re-inserted.
- //
- // The number of characters stolen from the original node and given
- // to the new node is equal to the number of characters in the suffix
- // argument, which is last - first + 1;
-
- // Comment to the suffix pointers: The suffix pointers are built at the same time the update to
- // the tree is taking place.
- // As I move from the active point to the end point, I keep track of the parent node of each of
- // the new leaves I create.
- // Each time I create a new edge, I also create a suffix pointer
- // from the parent node of the last leaf edge I created to the current parent edge.
-
- public int split_edge(Suffix suffix) {
-
- Node next = new Node();
- nodes.add(next);
-
- int new_node_index = nodes.size();
-
- int suf_span = suffix.last_char_index - suffix.first_char_index + 1;
- nodes.get(new_node_index - 1).suffix_node = suffix.origin_node;
- Edge new_edge = new Edge(new_node_index, this.end_node, this.first_char_index + suf_span,
- this.last_char_index);
- insert_edge(new_edge);
-
- // SuffixTree.remove_edge(this.start_node, this.first_char_index);
- // shorten existing edge
- this.last_char_index = this.first_char_index + suf_span - 1;
- // this.end_node = new_node_index;
-
- insert_edge(new Edge(this.start_node, new_node_index, this.first_char_index,
- this.last_char_index));
-
- return new_node_index; // return the new origin node index of the last edge
- }
-
- }
-
- /**
- * Internal Class NODE
- */
-
- // The only information contained in a node is the
- // suffix link. Each suffix in the tree that ends
- // at a particular node can find the next smaller suffix
- // by following the suffix_node link to a new node. Nodes
- // are stored in a simple array.
- class Node {
- int suffix_node;
-
- // static int count=0;
-
- // Nodes with suffix link of (-1) are leaf nodes
- public Node() {
- suffix_node = -1;
- }
-
- }
-
- public void add_prefix(int last_char, Suffix active_point) {
- int last_parent_node = -1;
- int parent_node = 0;
-
- for (;;) {
- parent_node = active_point.origin_node;
-
- Edge edge;
- if (active_point.isExplicit()) {
- List keys = new ArrayList();
- keys.add(active_point.origin_node);
- keys.add(chars[last_char]);
- if (edges.containsKey(keys)) {
- edge = (Edge) edges.get(keys);
- break;
- }
- } else if (active_point.isImplicit()) { // if suffix is implicit, i.e. it does not end in a
- // leaf node $
- List keys2 = new ArrayList();
- keys2.add(active_point.origin_node);
- keys2.add(chars[active_point.first_char_index]);
- edge = (Edge) edges.get(keys2);
-
- int span = active_point.last_char_index - active_point.first_char_index;
- // if the given prefix is already in the tree, do nothing
- if (chars[edge.first_char_index + span + 1] == chars[last_char]) {
- break;
- } else {
- parent_node = edge.split_edge(active_point);
- }
-
- }
-
- Node next = new Node();
- nodes.add(next);
- int new_node_index = nodes.size();
- edge = new Edge(parent_node, new_node_index, last_char, chars.length - 1);
- insert_edge(edge);
-
- // When we create a new node, it also means we need to create a suffix link to the new node
- // from
- // the last node we visited.
- if (last_parent_node > 0) {
- nodes.get(last_parent_node - 1).suffix_node = parent_node;
- }
- last_parent_node = parent_node;
-
- // here we move to the next smaller suffix
- if (active_point.origin_node == 0) {
- active_point.first_char_index += 1;
- } else {
-
- active_point.origin_node = nodes.get(active_point.origin_node - 1).suffix_node;
-
- }
- active_point.canonize();
- }
- if (last_parent_node > 0) {
- nodes.get(last_parent_node - 1).suffix_node = parent_node;
- }
- active_point.last_char_index += 1;
- active_point.canonize();
- }
-
- // ///////// TEST
-
- /*
- * public void walk_tree(SuffixTree suffix_tree, int current_node_index, Suffix current_suffix,
- * int current_suffix_len){ int edge_count = 0; Set alphabet = new HashSet(); char []
- * chars = text.toCharArray(); for (char c : chars){ if(!alphabet.add(c)){ // }
- * System.out.println(alphabet.toString()); Character [] letters = (Character [])
- * alphabet.toArray(); for (char ch : letters){ try { List keys = new ArrayList();
- * keys.add(current_node_index); keys.add(ch); Edge edge = (Edge) suffix_tree.edges.get(keys); if
- * (current_node_index != edge.start_node){ System.err.println("ERROR"); }
- * System.out.println(current_node_index+" "+edge.start_node); edge_count+=1; int l =
- * current_suffix_len; for (int j : Range((Comparable)edge.last_char_index, (Comparable)
- * (edge.first_char_index+1))){ current_suffix. = suffix_tree.text.charAt(j); } } } }
- * }
- *
- *
- *
- *
- *
- *
- * public static void main(String[] args) {
- *
- * String test_str="pace"; SuffixTree suffix_tree = new SuffixTree(test_str);
- *
- *
- * System.out.println(edges.keySet().toString());
- *
- * Iterator kv = edges.entrySet().iterator(); for (int u= 0; u
+ * E.g. if we need to map tags, given a list of {@code Tokens}, we need to map the
+ * {@code pos} field of every {@code Token} to a different {@code pos}.
+ */
+ public List map_tags(List tokens){
+
+
+ // for (int i=0; i tokens2 = new ArrayList(tokens.size());
+
+ for (int x=0; x}.
+ * E.g. if we need to map pos-tags given simply as {@code Strings} in a {@code List}.
+
+
+ public List map_pos(List pos){
+
+
+ // for (int i=0; i pos2 = new ArrayList(pos.size());
+
+ for (int x=0; x map_tags(List tokens) {
+public class TagMapping implements MappingInterface{
+ /**
+ * Defines mapping for List<{@link Token}>
+ * E.g. if we need to map tags, given a list of {@code Tokens}, we need to map the
+ * {@code pos} field of every {@code Token} to a different {@code pos}.
+ * The mapping performed in this class is a simple "normalization":
+ * it just discards the compound tags of the Brown corpus.
+ */
+ @SuppressWarnings("unchecked")
+ public List map_tags(List tokens) {
+
// for (int i=0; i tokens2 = new ArrayList(tokens.size());
for (int x = 0; x < tokens.size(); x++) { // iterate over tokens of the sentence with their
- // corresponding POS
- Token current_token = tokens.get(x);
+ // corresponding POS
+ Token current_token = (Token)tokens.get(x);
String[] z = new String[2];
if (current_token.pos.contains("+")) {
z = current_token.pos.split("[+]");
@@ -69,4 +77,5 @@
}
+
}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/MapBrownToPenn.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/MapBrownToPenn.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/MapBrownToPenn.java (working copy)
@@ -1,339 +0,0 @@
-/*
- *Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-package org.apache.uima.examples.tagger;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * just a "game"-mapping at the moment..
- */
-public class MapBrownToPenn {
-
- public MapBrownToPenn() {
- }
-
- public static List map_tags(List tokens) {
-
- List tokens2 = new ArrayList(tokens.size());
-
- for (int x = 0; x < tokens.size(); x++) { // iterate over tokens of the sentence with their
- // corresponding POS
- Token current_token = tokens.get(x);
- String[] z = new String[2];
-
- // First we eliminate compound tags in the corpus //
-
- if (current_token.pos.contains("+")) {
- z = current_token.pos.split("[+]");
- current_token.pos = z[0];
- }
- // for cases like : BEZ*
- if (current_token.pos.contains("*") && !(current_token.pos.startsWith("*"))) {
- z[0] = current_token.pos.replace("*", "");
- current_token.pos = z[0];
- }
-
- // for: *-h1
- if (current_token.pos.startsWith("*")) {
- z[0] = "*";
- current_token.pos = z[0];
- }
- if (current_token.pos.contains("-") && !(current_token.pos.startsWith("--"))) {
- z = current_token.pos.split("[-]");
- current_token.pos = z[0];
- }
- if (current_token.pos.startsWith("--")) {
- z[0] = "--";
- current_token.pos = z[0];
- }
-
- // ******** TILL HERE ************//
-
- // last but not least, we map the rest of the tags to the Penn tree bank notation
- // first come straightforward mappings
-
- // ******** FROM HERE ************//
-
- if (current_token.pos.equalsIgnoreCase("od")) {
- z[0] = "jj";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("jjt")) {
- z[0] = "jjs";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("jjs")) {
- z[0] = "jj";
- current_token.pos = z[0];
- }
- if (current_token.pos.equalsIgnoreCase("*")) {
- z[0] = "rb";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("rbt")) {
- z[0] = "rbs";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("wql")) {
- z[0] = "wrb";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ql")) {
- z[0] = "rb";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("rn")) {
- z[0] = "rb";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("cs")) {
- z[0] = "in";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("dti")) {
- z[0] = "dt";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("dts")) {
- z[0] = "dt";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("abl")) {
- z[0] = "pdt";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("abn")) {
- z[0] = "pdt";
- current_token.pos = z[0];
- }
-
- // both ??
- if (current_token.pos.equalsIgnoreCase("abx")) {
- z[0] = "dt|cc";
- current_token.pos = z[0];
- }
-
- // either/neither ???
-
- if (current_token.pos.equalsIgnoreCase("dtx")) {
- z[0] = "dt|cc";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("at")) {
- z[0] = "dt";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ap")) {
- z[0] = "jj";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("pp$")) {
- z[0] = "prp$";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("pp$$")) {
- z[0] = "prp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("np")) {
- z[0] = "nnp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("nps")) {
- z[0] = "nnps";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("nr")) {
- z[0] = "nn";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("nrs")) {
- z[0] = "nns";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("pn")) {
- z[0] = "nn";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ppss")) {
- z[0] = "prp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("pps")) {
- z[0] = "prp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ppo")) {
- z[0] = "prp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ppl")) {
- z[0] = "prp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ppls")) {
- z[0] = "prp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("wps")) {
- z[0] = "wp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("wpo")) {
- z[0] = "wp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("dod")) {
- z[0] = "vbd";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("doz")) {
- z[0] = "vbz";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("hvd")) {
- z[0] = "vbd";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("hvg")) {
- z[0] = "vbg";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("hvn")) {
- z[0] = "vbn";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("hvz")) {
- z[0] = "vbz";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("bed")) {
- z[0] = "vbd";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("bedz")) {
- z[0] = "vbd";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("beg")) {
- z[0] = "vbg";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ben")) {
- z[0] = "vbn";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("bez")) {
- z[0] = "vbz";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("bem")) {
- z[0] = "vbp";
- current_token.pos = z[0];
- }
-
- if (current_token.pos.equalsIgnoreCase("ber")) {
- z[0] = "vbp";
- current_token.pos = z[0];
- }
-
- // ?? preposition "to" should be changed from TO to IN
-
- if (current_token.pos.equalsIgnoreCase("$")) {
- z[0] = "pos";
- current_token.pos = z[0];
- }
-
- // in Penn Treebank: extra sign for DOLLAR
- if (current_token.word.equals("$")) {
- z[1] = "$";
- current_token.pos = z[1];
- }
-
- // // *** Here an ***attempt**** to map some of the syntactic function differences ***, not
- // all are that easily possible////
- /*
- * if (t[0].equals("one") && tokens[x-1].startsWith("the")) { z[0] = "nn";
- * current_token.pos=z[0]; }
- */
-
- // ?? Verb base present form , non - infinitive
- // if (current_token.pos.equalsIgnoreCase("vb")) { // the same for "do", base non-infinitive
- // z[0] = "vbp"; // -||- "have"
- // current_token.pos=z[0];
- // }
- //
- // if (current_token.pos.equalsIgnoreCase("do")) { // infinitive be, do, have, any verb
- // z[0] = "vb";
- // current_token.pos=z[0];
- // }
- // extra symbols for list items LS and SYM for different non-identifable symbols..
- // not present in Brown Corpus
- // ******** TILL HERE ************//
- Token zwischen = new Token(current_token.word, current_token.pos);
-
- tokens2.add(zwischen);
- }
- return tokens2;
-
- }
-
-}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/Tagger.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/Tagger.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/Tagger.java (working copy)
@@ -23,47 +23,35 @@
package org.apache.uima.examples.tagger;
-import java.util.List;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
/**
- * known implementations: - HMMTagger using Viterbi algorithm
+ * General tagger interface in case one would want to define further types of taggers.
+ *
+ * Known implementations: {@link HMMTagger} using {@link Viterbi} algorithm
* to compute the most probable path of parts of speech for a given sequence of tokens
- * @see Viterbi
+ * See also {@link Viterbi}, {@link HMMTagger}.
*/
-// AT THE MOMENT IS USELESS TODO: check if we need it at all..and integrate if :)
public interface Tagger {
- /**
- * Initiates smoothing procedure for unknown words and N-grams
- * @return true or false, depending on whether {@code smoothing} is set in a {@code param.txt} file
- */
- public boolean set_smoothing();
/**
* Instantiates {@code MODEL} for current tagger
*/
- public void init();
+ public void initialize(UimaContext aContext) throws ResourceInitializationException;
/**
* Trains a new model for tagger, if a training is defined in {@code tagger.properties} file
* @see ModelGenerator
*/
- public void train();
+
+ public void process(JCas aJCas) throws AnalysisEngineProcessException;
- /**
- * Tags a sequence
- * @param wordList
- */
-
- public List process(List wordList);
-
- /**
- * Tests tagging accuracy if reference corpus is available
- */
- public void test();
-
}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/BrownReader.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/BrownReader.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/BrownReader.java (working copy)
@@ -1,169 +0,0 @@
-/*
- *Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-package org.apache.uima.examples.tagger;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/**
- */
-public class BrownReader {
-
- static boolean MAPPING = true;
-
- List corpus;
-
- String InputDir;
-
- public BrownReader(String InputDir) {
- this.InputDir = InputDir;
- if (MAPPING) {
- this.corpus = TagMapping.map_tags(read_corpus(read_dir(InputDir))); // in case we need to map
- // tags, TODO: trasfer to
- // parameter file
- } else {
- this.corpus = read_corpus(read_dir(InputDir));
- }
-
- }
-
- /**
- * Reads file names from Directory
- *
- * @param directory
- * name
- * @return an array of file names in the directory
- */
-
- public static String[] read_dir(String directory) {
- File dir = new File(directory);
- String[] list = dir.list();
- String[] new_list = dir.list();
- for (int i = 0; i < list.length; i++) {
- String dir_list = directory + "/" + list[i];
- new_list[i] = dir_list;
- }
-
- return new_list;
- }
-
- /**
- * Reads Brown Corpus from NLTK Distribution Format. Iterates over all files in the directory,
- * which are in a sentence per line format, and returns all tokens in the collection in a List of
- * Tokens {@link Token}}
- *
- * @param files
- * an array of file names
- * @return a list of tokens from all files
- *
- */
-
- List all_words = new ArrayList();
-
- public static List read_corpus(String[] files) {
-
- String line;
- List text = new ArrayList();
-
- // simple tokenizer: match one or more spaces
- // String delimiters = " +";
-
- Pattern delimiters = Pattern.compile("[ ]+");
- // Split input with the pattern
-
- int line_count = 0;
-
- for (int i = 0; i < files.length; i++) {
- String file = files[i];
- try {
- BufferedReader in = new BufferedReader(new FileReader(file));
-
- while ((line = in.readLine()) != null) {
- if (line.trim().length() > 0) {
- line_count += 1;
- String[] tokens = delimiters.split(line);
-
- for (int x = 0; x < tokens.length; x++) { // iterate over tokens with their
- // corresponding POS
- tokens[x] = tokens[x].replaceAll("[\\n\\t]+", "");
-
- // for cases in Brown corpus like "//in" :(
- if (tokens[x].startsWith("//")) {
- String t = tokens[x].replace("//", "per/");
- tokens[x] = t;
- }
-
- // and that was not all, further for cases like:
- // "before/in /l//nn and/cc AAb//nn or/cc /r//nn ./. " (text j in NLTK distribution)
- if (tokens[x].startsWith("/", 0)) {
- String t = tokens[x].substring(1);
- tokens[x] = t;
- }
- // for cases like : "AAb//nn" (s. above)
- if (tokens[x].contains("//")) {
- int j = tokens[x].indexOf("//");
-
- String t = tokens[x].substring(0, j) + tokens[x].substring(j + 1);
- tokens[x] = t;
- }
-
- // for cases in brown like: "lb/day/nn" (text 'J', sentence N. 8940)
- int first = tokens[x].indexOf("/");
- int last = tokens[x].lastIndexOf("/");
- if (first != last) {
- String[] zw = tokens[x].split("/");
- String t = "";
- for (int w = 0; w < zw.length - 1; w++) {
-
- t = t + zw[w];
- }
-
- t = t + "/" + zw[zw.length - 1];
- tokens[x] = t;
- }
-
- String[] t = tokens[x].split("/");
-
- Token token = new Token(t[0], t[1]);
-
- text.add(token);
- }
- }
- }
- in.close();
- } catch (IOException e) {
- System.out.println(e);
- return null;
- }
- }
- System.out.println(line_count + " sentences in the corpus");
-
- return text;
- }
- /*
- * public static void main(String[] args) { // BrownReader b = new BrownReader("Brown_test"); }
- */
-}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMTagger.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMTagger.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMTagger.java (working copy)
@@ -17,7 +17,6 @@
* under the License.
*/
-
package org.apache.uima.examples.tagger;
import java.io.FileInputStream;
@@ -35,45 +34,39 @@
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.examples.tagger.trainAndTest.ModelGeneration;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
/**
- * UIMA Analysis Engine that invokes HMM POS tagger. HMM POS tagger generates a
- * Brown Corpus style POS tags. This annotator assumes that sentences and tokens have already been
- * annotated in the CAS with Sentence and Token annotations, respectively. We iterate over
- * sentences, then iterate over tokens in the current sentence to accumlate a list of words, then
- * invoke the HMM POS tagger on the list of words. For each Token we then update the posTag
- * field with the POS tag. The model file for the HMM POS tagger is specified as a parameter
- * (MODEL_FILE_PARAM).
- * Implements {@link Tagger}
- *
- */
+ * UIMA Analysis Engine that invokes HMM POS tagger. HMM POS tagger generates part-of-speech tags
+ * for every token. This annotator assumes that sentences and tokens have already been annotated in the CAS
+ * with Sentence and Token annotations, respectively. We iterate over sentences, then iterate over
+ * tokens in the current sentence to accumulate a list of words, then invoke the HMM POS tagger on
+ * the list of words. For each Token we then update the posTag field with the POS tag. The model
+ * file for the HMM POS tagger is specified as a parameter (MODEL_FILE_PARAM).
+ *
+ */
-public class HMMTagger extends JCasAnnotator_ImplBase{
+public class HMMTagger extends JCasAnnotator_ImplBase implements Tagger{
-
/**
* Model file name
*/
- String MODEL;
-
+ String MODEL;
/**
- * for a bigram model: N = 2, for a trigram model N=3
- * N is defined in parameter file
+ * For a bigram model N = 2, for a trigram model N = 3. N is defined in the parameter file.
*/
- int N;
+ public int N;
+
+ // public String END_OF_SENT_TAG;
+
+ public ModelGeneration my_model;
- String END_OF_SENT_TAG;
-
- /**
- * Model used for current tagging
- */
+ MappingInterface MAPPING;
+ boolean DO_MAPPING;
- ModelGeneration my_model;
- HMMTagger tagger;
-
/**
* Initialize the Annotator.
*
@@ -85,22 +78,30 @@
try {
// Get configuration parameter values
String paramFile = (String) aContext.getConfigParameterValue("PARAM_FILE");
-
-
- // create and load default properties
+
+ // create and load default properties
Properties defaultProps = new Properties();
FileInputStream in = new FileInputStream(paramFile);
defaultProps.load(in);
in.close();
-
+
MODEL = defaultProps.getProperty("MODEL");
-
+
String n = defaultProps.getProperty("N");
N = Integer.parseInt(n);
+
+ String b = defaultProps.getProperty("DO_MAPPING");
+ DO_MAPPING = Boolean.valueOf(b);
+
+ if (DO_MAPPING){
+ String m = defaultProps.getProperty("MAPPING");
+ MappingInterface klasse = (MappingInterface)(Class.forName(m)).newInstance();
+ MAPPING = klasse;
+ } else {
+ MAPPING = null;
+ }
+ my_model = get_model(MODEL);
- END_OF_SENT_TAG = defaultProps.getProperty("END_OF_SENT_TAG");
- my_model = get_model(MODEL);
-
} catch (Exception e) {
throw new ResourceInitializationException(e);
}
@@ -108,63 +109,76 @@
/**
* Reads a saved {@code MODEL} object from a file
- * @param filename model file
+ *
+ * @param filename
+ * model file
* @return {@link ModelGeneration}
*/
- public static ModelGeneration get_model(String filename){
-
+ public static ModelGeneration get_model(String filename) {
+
System.out.println("The used model is:" + filename);
- InputStream model = null;
+ InputStream model = null;
ModelGeneration oRead = null;
-
- try
- {
- model = new FileInputStream( filename );
- ObjectInputStream p = new ObjectInputStream( model);
+
+ try {
+ model = new FileInputStream(filename);
+ ObjectInputStream p = new ObjectInputStream(model);
oRead = (ModelGeneration) p.readObject();
- }
-
- catch ( IOException e ) { System.err.println( e ); }
- catch ( ClassNotFoundException e ) { System.err.println( e ); }
- finally { try {model.close(); } catch ( Exception e ) { } }
+ }
+
+ catch (IOException e) {
+ System.err.println(e);
+ } catch (ClassNotFoundException e) {
+ System.err.println(e);
+ } finally {
+ try {
+ model.close();
+ } catch (Exception e) {
+ }
+ }
return oRead;
}
-
-
+
/**
* Process a CAS.
*
* @see JCasAnnotator_ImplBase#process(JCas)
*/
+ @SuppressWarnings("unchecked") // raw ArrayList/List collections are used throughout this method
public void process(JCas aJCas) throws AnalysisEngineProcessException {
ArrayList tokenList = new ArrayList();
ArrayList wordList = new ArrayList();
-
AnnotationIndex sentenceIndex = aJCas.getAnnotationIndex(SentenceAnnotation.type);
AnnotationIndex tokenIndex = aJCas.getAnnotationIndex(TokenAnnotation.type);
-
-
+
// iterate over Sentences
FSIterator sentenceIterator = sentenceIndex.iterator();
-
+
while (sentenceIterator.hasNext()) {
SentenceAnnotation sentence = (SentenceAnnotation) sentenceIterator.next();
-
+
tokenList.clear();
wordList.clear();
-
+
FSIterator tokenIterator = tokenIndex.subiterator(sentence);
while (tokenIterator.hasNext()) {
TokenAnnotation token = (TokenAnnotation) tokenIterator.next();
-
+
tokenList.add(token);
wordList.add(token.getCoveredText());
}
-
- List wordTagList = Viterbi.process(N, wordList, END_OF_SENT_TAG, my_model.suffix_tree , my_model.suffix_tree_capitalized, my_model.transition_probs, my_model.word_probs, my_model.lambdas2, my_model.lambdas3, my_model.theta);
-
+ // Run Viterbi over this sentence's covered texts; "." now fills the argument slot that END_OF_SENT_TAG used to fill.
+ List wordTagList = Viterbi.process(N, wordList, ".",
+ my_model.suffix_tree, my_model.suffix_tree_capitalized, my_model.transition_probs,
+ my_model.word_probs, my_model.lambdas2, my_model.lambdas3, my_model.theta);
+
+ // When MAPPING is non-null, the Viterbi result is replaced by MAPPING.map_tags(...) applied to it.
+ if (MAPPING != null){
+ wordTagList = MAPPING.map_tags(wordTagList);
+ }
+
try {
for (int i = 0; i < tokenList.size(); i++) {
TokenAnnotation token = (TokenAnnotation) tokenList.get(i);
@@ -173,8 +187,8 @@
}
} catch (IndexOutOfBoundsException e) {
System.err.println("POS tagger error - list of tags shorter than list of words");
- }
- }
+ }
+ }
}
}
Index: D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/ModelGeneration.java
===================================================================
--- D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/ModelGeneration.java (revision 617479)
+++ D:/Jane/IBM/ECLIPSE_Workspaces_maven/Tagger/src/main/java/org/apache/uima/examples/tagger/ModelGeneration.java (working copy)
@@ -1,626 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-package org.apache.uima.examples.tagger;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.Map.Entry;
-
-/**
- * Trains an N-gram model for the tagger, iterating over the files from some predefined training directory
- * Writes the resulting model to a binary fileSystem
- *
- *
- * NB. At the moment- both bi-and trigram statistics are saved in one model file..
- *
- */
-
-
-public class ModelGeneration implements java.io.Serializable{
-
- private static final long serialVersionUID = 1L;
-
- public Map suffix_tree= new HashMap ( ) ;
-
- public Map suffix_tree_capitalized = new HashMap();
- /**
- * Map containing {@code } probabilities, that is probability of a certain word given a certain tag at a time t: P(wordt |tagt ))
- *
- */
- public Map> word_probs = new HashMap>();
-
- /**
- * Map containing N-gram probabilities
- */
-
- public Map transition_probs = new HashMap() ;
-
-
- static List posList = new ArrayList();
-
- int N; // for the N-gram model
-
- double [] lambdas2 = new double[2];
- double [] lambdas3 = new double[3];
- public double theta; // for suffix probabiliites smoothing
-
- // transient String InputDir;
- transient String OutputFile;
- transient List corpus;
-
- /**
- * @param N N=1, 2 or 3
- * @param InputDir input directory name
- * @param OutputFile output file name
- * MapBrownToPenn TagMapping
- */
- @SuppressWarnings("unchecked")
- public ModelGeneration(List corpus, String OutputFile) {
- this.OutputFile = OutputFile;
- this.corpus = corpus;
- }
-
-
- private void init(){
-
- List