/* =========================================================================
 * KoreanFilter is changed as follows
 * 1. modification of the keyword extraction logic
 * 2. the properties relating to the keyword extraction have been removed
 * ========================================================================= */
package org.apache.lucene.analysis.kr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.kr.IndexWord; import org.apache.lucene.analysis.kr.KoreanTokenizer; import org.apache.lucene.analysis.kr.morph.AnalysisOutput; import org.apache.lucene.analysis.kr.morph.CompoundEntry; import org.apache.lucene.analysis.kr.morph.CompoundNounAnalyzer; import org.apache.lucene.analysis.kr.morph.MorphAnalyzer; import org.apache.lucene.analysis.kr.morph.MorphException; import org.apache.lucene.analysis.kr.morph.WordEntry; import org.apache.lucene.analysis.kr.morph.WordSpaceAnalyzer; import org.apache.lucene.analysis.kr.utils.DictionaryUtil; import org.apache.lucene.analysis.kr.utils.HanjaUtils; import org.apache.lucene.analysis.standard.ClassicTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public final class KoreanFilter extends TokenFilter { private LinkedList morphQueue; private MorphAnalyzer morph; private WordSpaceAnalyzer wsAnal; private char[] curTermBuffer; private String curType; private String prevType; private int tokStart; private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer(); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final 
TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE]; private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM]; public KoreanFilter(TokenStream input) { super(input); morphQueue = new LinkedList(); morph = new MorphAnalyzer(); wsAnal = new WordSpaceAnalyzer(); cnAnalyzer.setExactMach(false); } // this methos is changed for the modification of keyword extraction logic public boolean incrementToken() throws IOException { if(curTermBuffer!=null&&morphQueue.size()>0) { setTermBufferByQueue(false); return true; } if(!input.incrementToken()) return false; curTermBuffer = termAtt.buffer().clone(); tokStart = offsetAtt.startOffset(); curType = typeAtt.type(); String source = new String(curTermBuffer,0,termAtt.length()); // if a chinese letter + josa is. if(isChinesePrevTerm(source)) { prevType = curType; return incrementToken(); } try { if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType)) { analysisKorean(source); } else if(KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(curType)) { analysisChinese(source); } else { analysisETC(source); } }catch(MorphException e) { throw new IOException("Korean Filter MorphException\n"+e.getMessage()); } if(morphQueue!=null&&morphQueue.size()>0) { setTermBufferByQueue(true); } else { return incrementToken(); } return true; } /** * return if previous term is chinese letters. 
* @param source source text * @return return if true or not */ private boolean isChinesePrevTerm(String source) { boolean exist = false; if(prevType!=null && KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.CHINESE].equals(prevType) && KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN].equals(curType) ) { try { exist = DictionaryUtil.existJosa(source); } catch (MorphException e) {} } return exist; } /** * queue¿¡ ÀúÀåµÈ °ªÀ¸·Î bufferÀÇ °ªÀ» º¹»çÇÑ´Ù. * @throws MorphException */ private void setTermBufferByQueue(boolean isFirst) { clearAttributes(); IndexWord iw = morphQueue.removeFirst(); int pos = iw.getOffset(); termAtt.copyBuffer(iw.getWord().toCharArray(), 0, iw.getWord().length()); offsetAtt.setOffset(tokStart+pos, tokStart + pos + iw.getWord().length()); if(iw.getType()==null) typeAtt.setType(curType); else typeAtt.setType(iw.getType()); posIncrAtt.setPositionIncrement(iw.getIncrement()); prevType = curType; // store to check the type at next term } /** * ÇѱÛÀ» ºÐ¼®ÇÑ´Ù. * @param token * @param skipinc * @return * @throws MorphException */ private void analysisKorean(String input) throws MorphException { List outputs = morph.analyze(input); if(outputs.size()==0) return; Map map = new LinkedHashMap(); if(outputs.get(0).getScore()>=AnalysisOutput.SCORE_COMPOUNDS) { morphQueue.addAll( extractKeyword(outputs, input)); } else { List list = Collections.EMPTY_LIST; try { list = wsAnal.analyze(input); } catch(Exception e) { } if(list.size()>1) { // sucess for(AnalysisOutput o : list) { List subs = morph.analyze(o.getSource()); morphQueue.addAll( extractKeyword(subs, input)); } } else { // fail morphQueue.addAll( extractKeyword(outputs, input)); } } } private List extractKeyword(List outputs, String input) throws MorphException { if(outputs.size()==0) return Collections.EMPTY_LIST; List segments = new ArrayList(); int offset = 0; for(AnalysisOutput output : outputs) { if(output.getStem().length()<=offset) continue; if(offset==0) { splitCompoundNoun(output, segments, 
offset); } else { segments.add(new IndexWord(output.getStem().substring(offset),offset)); } offset = output.getStem().length(); } return segments; } private void splitCompoundNoun(AnalysisOutput output, List segments, int offset) { if(output.getCNounList().size()==0) { segments.add( new IndexWord(output.getStem().substring(offset),offset)); return; } for(CompoundEntry ce : output.getCNounList()) { segments.add( new IndexWord(ce.getWord(),offset)); offset += ce.getWord().length(); } } /** * extracting the keyword from chinese text * the korean sound text is also extracted from the chinese text * @param term * @throws MorphException */ private void analysisChinese(String term) throws MorphException { List candiList = new ArrayList(); candiList.add(new StringBuffer()); for(int i=0;i removeList = new ArrayList(); // store the removal candidate term int caniSize = candiList.size(); for(int j=0;j0) sb = new StringBuffer(origin); sb.append(chs[k]); if(k>0) candiList.add(sb); Iterator iter = DictionaryUtil.findWithPrefix(sb.toString()); if(!iter.hasNext()) // if a word doesn't exist in the dictionary, it is a removal candidate. 
removeList.add(sb); } } if(removeList.size()==candiList.size()) { // no word founded in the dictionary candiList = candiList.subList(0, 1); // choose the first candidate } for(StringBuffer rsb : removeList) { if(candiList.size()>1) candiList.remove(rsb); } } List results = confirmCNoun(candiList.get(0).toString()); if(results.size()==0) { // no compound noun founded morphQueue.add(new IndexWord(term,0)); if(candiList.size()>0) morphQueue.add( new IndexWord(candiList.get(0).toString(),0, 0, KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN])); return; } Map cnounMap = new HashMap(); int pos = 0; int offset = 0; for(CompoundEntry entry : results) { pos += entry.getWord().length(); if(cnounMap.get(entry.getWord())!=null) continue; // segment chinese letters into words that has the same length to the korean sound text morphQueue.add(new IndexWord(term.substring(offset,pos),offset)); cnounMap.put(entry.getWord(), entry.getWord()); // ºÐ¸®µÈ ÇѱÛÀ» Å¥¿¡ ÀúÀåÇÑ´Ù. morphQueue.add( new IndexWord(entry.getWord(),offset, 0, KoreanTokenizer.TOKEN_TYPES[KoreanTokenizer.KOREAN])); offset = pos; } } // this method is not changed private List confirmCNoun(String input) throws MorphException { WordEntry cnoun = DictionaryUtil.getCNoun(input); if(cnoun!=null && cnoun.getFeature(WordEntry.IDX_NOUN)=='2') { return cnoun.getCompounds(); } return cnAnalyzer.analyze(input); } // this method is not changed private void analysisETC(String term) throws MorphException { final char[] buffer = termAtt.buffer(); final int bufferLength = termAtt.length(); final String type = typeAtt.type(); if (type == APOSTROPHE_TYPE && // remove 's bufferLength >= 2 && buffer[bufferLength-2] == '\'' && (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { // Strip last 2 characters off morphQueue.add(new IndexWord(term.substring(0,bufferLength - 2),0)); } else if (type == ACRONYM_TYPE) { // remove dots int upto = 0; for(int i=0;i type // this properties is added private String type; public 
IndexWord() { } public IndexWord(String word, int pos) { this.word = word; this.offset = pos; } public IndexWord(String word, int pos, int inc) { this(word, pos); this.increment = inc; } public IndexWord(String word, int pos, int inc, String t) { this(word, pos, inc); this.type = t; } public String getWord() { return word; } public void setWord(String word) { this.word = word; } public int getOffset() { return offset; } public void setOffset(int offset) { this.offset = offset; } public int getIncrement() { return increment; } public void setIncrement(int increment) { this.increment = increment; } public String getType() { return type; } public void setType(String type) { this.type = type; } } ========================================================================= * KoreanAnalyzer is changed as follows 1. the propertes relating to keyword extraction has been removed. ========================================================================= package org.apache.lucene.analysis.kr; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.File; import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; /** * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words. */ public class KoreanAnalyzer extends StopwordAnalyzerBase { /** Default maximum allowed token length */ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; /** * Specifies whether deprecated acronyms should be replaced with HOST type. * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"} */ private final boolean replaceInvalidAcronym; private Set stopSet; public static final String DIC_ENCODING = "UTF-8"; /** An unmodifiable set containing some common English words that are usually not useful for searching. 
*/ public static final CharArraySet STOP_WORDS_SET; static { List stopWords = Arrays.asList(new String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "ÀÌ","±×","Àú","°Í","¼ö","µî","µé","¹×","¿¡¼­","±×¸®°í","±×·¡¼­","¶Ç","¶Ç´Â"} ); CharArraySet stopSet = new CharArraySet(Version.LUCENE_42, stopWords.size(), false); stopSet.addAll(stopWords); STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } public KoreanAnalyzer() { this(Version.LUCENE_42, STOP_WORDS_SET); } /** * °Ë»öÀ» À§ÇÑ ÇüżҺм® */ public KoreanAnalyzer(boolean exactMatch) { this(Version.LUCENE_42, STOP_WORDS_SET); } public KoreanAnalyzer(Version matchVersion, String[] stopWords) throws IOException { this(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords)); } public KoreanAnalyzer(Version matchVersion) throws IOException { this(matchVersion, STOP_WORDS_SET); } public KoreanAnalyzer(Version matchVersion, File stopwords) throws IOException { this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } public KoreanAnalyzer(Version matchVersion, File stopwords, String encoding) throws IOException { this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } public KoreanAnalyzer(Version matchVersion, Reader stopwords) throws IOException { this(matchVersion, loadStopwordSet(stopwords, matchVersion)); } public KoreanAnalyzer(Version matchVersion, CharArraySet stopWords) { super(matchVersion, stopWords); replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_42); } @Override protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { final KoreanTokenizer src = new KoreanTokenizer(matchVersion, reader); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new KoreanFilter(src); tok = new LowerCaseFilter(matchVersion, tok); tok = new StopFilter(matchVersion, tok, 
stopwords); return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) throws IOException { src.setMaxTokenLength(KoreanAnalyzer.this.maxTokenLength); super.setReader(reader); } }; } } =========================================================================== * KoreanFilterFactory is changed as follows 1. the properties relating to the keyword extraction has been removed. ========================================================================== package org.apache.lucene.analysis.kr; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.TokenFilterFactory; public class KoreanFilterFactory extends TokenFilterFactory { public void init(Map args) { super.init(args); } /** * create a korean filter */ public TokenStream create(TokenStream tokenstream) { return new KoreanFilter(tokenstream); } } ====================================================================== KoreanAnalyzerTest is added. ====================================================================== package org.apache.lucene.analysis.kr; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.Version; import org.apache.lucene.util.LuceneTestCase.Slow; /** * Test class for Arirang korean morphological analyzer. * @author S.M. Lee * */ @Slow public class KoreanAnalyzerTest extends BaseTokenStreamTestCase { private Version TEST_VERSION = Version.LUCENE_42; /** * a basic example sentence * @throws Exception */ public void testBasicSentence() throws Exception { assertAnalyzesTo(new KoreanAnalyzer(TEST_VERSION), "¹ýÁ¤°øÈÞÀÏ¿¡ Ä¡¸¥ ½ÃÇè¿¡¼­ ¸¹Àº ÇлýÀÌ ¶³¾îÁ³´Ù", new String[] { "¹ýÁ¤", "°øÈÞÀÏ", "Ä¡¸£", "½ÃÇè", "¸¹", "Çлý", "¶³¾îÁö"}, new int[] {0, 2, 7, 10, 15, 18, 22}, // start-offset new int[] {2, 5, 9, 12, 16, 20, 25}, // end-offset new int[] {1, 1, 1, 1, 1, 1, 1} // position increment ); } /** * a example that include some chinese letters. 
* @throws Exception */ public void testChineseSentence() throws Exception { assertAnalyzesTo(new KoreanAnalyzer(TEST_VERSION), "ÛöïÒÍëýÌìí¿¡ Ä¡¸¥ ãËúп¡¼­ ¸¹Àº ÇлýÀÌ ¶³¾îÁ³´Ù", new String[] { "ÛöïÒ", "¹ýÁ¤", "ÍëýÌìí", "°øÈÞÀÏ", "Ä¡¸£", "ãËúÐ", "½ÃÇè", "¸¹", "Çлý", "¶³¾îÁö"}, new int[] {0, 0, 2, 2, 7, 10, 10, 15, 18, 22}, // start-offset new int[] {2, 2, 5, 5, 9, 12, 12, 16, 20, 25}, // end-offset new int[] {1, 0, 1, 0, 1, 1, 0, 1, 1, 1} // position increment ); } /** * * @throws Exception */ public void testOutputTokenStream() throws Exception { String source = "ÛöïÒÍëýÌìí¿¡ Ä¡¸¥ ãËúп¡¼­ ¸¹Àº ÇлýÀÌ ¶³¾îÁ³´Ù"; KoreanAnalyzer analyzer = new KoreanAnalyzer(); // StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); TokenStream stream = analyzer.tokenStream("s", new StringReader(source)); while(stream.incrementToken()) { CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); OffsetAttribute offAttr = stream.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posAttr = stream.getAttribute(PositionIncrementAttribute.class); TypeAttribute typeAttr = stream.getAttribute(TypeAttribute.class); System.out.print(posAttr.getPositionIncrement()+"/"); System.out.print(offAttr.startOffset()+"~"+offAttr.endOffset()+"/"); System.out.print(typeAttr.type()+"/"); System.out.println(new String(termAttr.buffer(),0,termAttr.length())); } } }