+ * <fieldType name="text_kr" class="solr.TextField"> + * <analyzer> + * <tokenizer class="solr.KoreanTokenizerFilterFactory"/> + * <filter class="solr.KoreanFilter" + * bigrammable="true" + * hasOrigin="true" + * hasCNoun="true" + * exactMatch="false" + * /> + * </filter> + * </fieldType> + *+ */ + +public class KoreanFilterFactory extends TokenFilterFactory { + + private static final String BIGRAMMABLE_PARAM = "bigrammable"; + + private static final String HAS_ORIGIN_PARAM = "hasOrigin"; + + private static final String HAS_COMPOUND_NOUN_PARAM = "hasCNoun"; + + // Decides whether the original compound noun is returned or not if analyzed morphologically + private static final String EXACT_MATCH_PARAM = "exactMatch"; + + private boolean bigrammable; + + private boolean hasOrigin; + + private boolean hasCNoun; + + private boolean exactMatch; + + /** + * Initialize this factory via a set of key-value pairs. + */ + public KoreanFilterFactory(Map
input to the newly created JFlex scanner.
+ *
+ * @param input The input reader
+ *
+ * See http://issues.apache.org/jira/browse/LUCENE-1068
+ */
+ public KoreanTokenizer(Version matchVersion, Reader input) {
+ super(input);
+ this.scanner = new KoreanTokenizerImpl(input); // the scanner reads from the same Reader that was handed to the superclass
+ init(input, matchVersion); // derives replaceInvalidAcronym from matchVersion and stores the reader
+ }
+
+ /**
+ * Creates a new KoreanTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
+ *
+ * @param matchVersion version handed to init(Reader, Version), which derives replaceInvalidAcronym from it
+ * @param factory the attribute factory passed to the superclass constructor
+ * @param input the reader that the newly created scanner consumes
+ */
+ public KoreanTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
+ super(factory, input);
+ this.scanner = new KoreanTokenizerImpl(input);
+ init(input, matchVersion);
+ }
+
+  private final void init(Reader input, Version matchVersion) {
+    // The flag is on exactly when the requested version is LUCENE_42 or later.
+    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_42);
+    this.input = input;
+  }
+
+ // this tokenizer generates four attributes:
+ // term, offset, positionIncrement and type
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+
+    // Starts at 1 and grows by one for every over-long token that is skipped.
+    int positionGap = 1;
+
+    for (;;) {
+      final int type = scanner.getNextToken();
+      if (type == KoreanTokenizerImpl.YYEOF) {
+        return false;
+      }
+
+      // A too-long term is dropped, but it still widens the position increment
+      // reported with the next emitted token.
+      if (scanner.yylength() > maxTokenLength) {
+        positionGap++;
+        continue;
+      }
+
+      posIncrAtt.setPositionIncrement(positionGap);
+      scanner.getText(termAtt);
+      final int startOffset = scanner.yychar();
+      final int endOffset = startOffset + termAtt.length();
+      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
+      typeAtt.setType(KoreanTokenizer.TOKEN_TYPES[type]);
+      return true;
+    }
+  }
+
+  @Override
+  public final void end() {
+    // The final offset is the position just past the text last matched by the scanner,
+    // run through correctOffset; start and end of the offset attribute are both set to it.
+    final int scannedEnd = scanner.yychar() + scanner.yylength();
+    final int correctedEnd = correctOffset(scannedEnd);
+    offsetAtt.setOffset(correctedEnd, correctedEnd);
+  }
+
+ @Override
+ public void reset() throws IOException {
+ scanner.yyreset(input); // re-point the scanner at the current reader; yyreset clears its buffer positions and counters
+ }
+}
diff -N -u -r lucene-trunk/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java lucene4956-branch/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java
--- lucene-trunk/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java 1970-01-01 09:00:00.000000000 +0900
+++ lucene4956-branch/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerFactory.java 2013-05-23 01:27:56.000000000 +0900
@@ -0,0 +1,43 @@
+package org.apache.lucene.analysis.ko;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
+
+public class KoreanTokenizerFactory extends TokenizerFactory {
+
+ private Version version;
+
+ /**
+ * Initialize this factory via a set of key-value pairs.
+ */
+ public KoreanTokenizerFactory(MapaState
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\0\1\11\12\1\6\0\5\1\3\0\11\1\3\0"+
+ "\7\1\2\0\2\1\2\0\7\1\1\0\3\1\4\0"+
+ "\3\1\1\0\2\1\2\0\4\1\1\0\23\1\1\0"+
+ "\2\1\1\0\5\1\2\0\4\1\1\0\1\1\1\0"+
+ "\2\1\1\0\1\1\1\0\1\1\1\0\12\1\2\0"+
+ "\2\1\1\0\2\1\1\0\5\1\2\0\3\1\1\0"+
+ "\2\1\2\0\1\1\1\0\3\1\2\0\2\1\1\0"+
+ "\1\1\1\0\2\1\1\0\3\1";
+
+  private static int [] zzUnpackAttribute() {
+    // Expand the packed attribute string into a fixed-size table, starting at index 0.
+    final int [] unpacked = new int[174];
+    zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, unpacked);
+    return unpacked;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    // The packed string is read as consecutive (count, value) character pairs;
+    // the return value is the index just past the last slot written.
+    final int packedLength = packed.length();
+    int out = offset;
+    int in = 0;
+    while (in < packedLength) {
+      int repeat = packed.charAt(in++);
+      final int attr = packed.charAt(in++);
+      // Each value is written at least once, even when its count is zero.
+      do {
+        result[out++] = attr;
+      } while (--repeat > 0);
+    }
+    return out;
+  }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the textposition at the last state to be included in yytext */
+ private int zzPushbackPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /* user code: */
+
+public static final int ALPHANUM = 0;
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ * as ACRONYMs. It is deprecated and will be removed in the next
+ * release.
+ */
+public static final int ACRONYM_DEP = 8;
+public static final int KOREAN = 9;
+public static final int CHINESE = 10;
+
+public static final String [] TOKEN_TYPES = new String [] {
+ "false, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+  private boolean zzRefill() throws java.io.IOException {
+    // Returns true when the reader is exhausted and false when at least one
+    // new character was appended to zzBuffer.
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzPushbackPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                zzBuffer.length-zzEndRead);
+
+    if (numRead > 0) {
+      zzEndRead+= numRead;
+      return false;
+    }
+
+    if (numRead == 0) {
+      // Unlikely but possible: the reader delivered no characters without being
+      // at end of stream. Reporting "new input" here would make the caller read
+      // a buffer slot that was never filled, so fall back to a single-character
+      // read to find out whether more input exists.
+      int c = zzReader.read();
+      if (c == -1) {
+        return true;
+      }
+      zzBuffer[zzEndRead++] = (char) c;
+      return false;
+    }
+
+    // numRead < 0: end of stream
+    return true;
+  }
+
+
+  /**
+   * Marks the scanner as being at end of input, invalidates the buffered
+   * text and closes the underlying reader if one is set.
+   *
+   * @exception java.io.IOException if closing the reader fails
+   */
+  public final void yyclose() throws java.io.IOException {
+    // Flag EOF and drop whatever is still buffered before touching the reader.
+    zzAtEOF = true;
+    zzEndRead = zzStartRead;
+
+    if (zzReader == null) {
+      return;
+    }
+    zzReader.close();
+  }
+
+
+  /**
+   * Points the scanner at a new input stream without closing the old reader.
+   *
+   * Every buffer position and the line/char/column counters are set back to
+   * zero, so text still buffered from the previous stream is no longer
+   * reachable. The lexical state is set to YYINITIAL.
+   *
+   * @param reader the new input stream
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzLexicalState = YYINITIAL;
+
+    // flags
+    zzAtEOF = false;
+    zzAtBOL = true;
+
+    // buffer positions
+    zzStartRead = 0;
+    zzEndRead = 0;
+    zzPushbackPos = 0;
+    zzMarkedPos = 0;
+    zzCurrentPos = 0;
+
+    // position counters
+    yycolumn = 0;
+    yychar = 0;
+    yyline = 0;
+  }
+
+
+ /**
+ * Returns the current lexical state.
+ *
+ * @return the current value of zzLexicalState
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * The value is stored as given; this method does not validate it.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+  /**
+   * Returns a new String holding the text matched by the current regular
+   * expression, copied out of the internal buffer.
+   */
+  public final String yytext() {
+    // The match spans zzStartRead (inclusive) to zzMarkedPos (exclusive).
+    final int matchedLength = yylength();
+    return new String(zzBuffer, zzStartRead, matchedLength);
+  }
+
+
+ /**
+ * Returns the character at position pos from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * The index is applied directly to the internal buffer, relative to the
+ * start of the match; this method does not check pos against yylength().
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ *
+ * @return the distance between zzStartRead and zzMarkedPos
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a wellformed scanner (no or only correct usage of
+   * yypushback(int) and a match-all fallback rule) this method
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param errorCode the code of the errormessage to display; a code outside
+   *                  the message table is reported with the "unknown error" message
+   */
+  private void zzScanError(int errorCode) {
+    final boolean knownCode = errorCode >= 0 && errorCode < ZZ_ERROR_MSG.length;
+    final String message = knownCode
+        ? ZZ_ERROR_MSG[errorCode]
+        : ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+
+    throw new Error(message);
+  }
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method.
+   *
+   * @param number the number of characters to be read again.
+   *               This number must not be greater than yylength()!
+   */
+  public void yypushback(int number) {
+    final int matchedLength = yylength();
+    if (matchedLength < number) {
+      zzScanError(ZZ_PUSHBACK_2BIG);
+    }
+
+    // Moving the marked position back shortens the current match.
+    zzMarkedPos -= number;
+  }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * Each pass of the outer loop starts a new match at the previous marked
+ * position, runs the table-driven inner loop until no transition is left
+ * (or input ends), and then dispatches on the action of the last accepting
+ * state seen. Action 1 returns nothing, so the loop scans again.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public int getNextToken() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar+= zzMarkedPosL-zzStartRead; // advance the character count past the previously matched text
+
+ zzAction = -1; // no accepting state recorded yet for this match
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = zzLexicalState;
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; // table lookup: row of the current state plus the mapped class of the input char
+ if (zzNext == -1) break zzForAction; // -1 means there is no transition for this input
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // bit 0 set: record this state as the latest action and mark the position
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 3 set as well: stop scanning at this state
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 8:
+ { return HOST;
+ }
+ case 13: break;
+ case 11:
+ { return ACRONYM_DEP;
+ }
+ case 14: break;
+ case 10:
+ { return ACRONYM;
+ }
+ case 15: break;
+ case 1:
+ { /* ignore */
+ } // no return here: control falls through to the break below and the outer loop scans again
+ case 16: break;
+ case 6:
+ { return NUM;
+ }
+ case 17: break;
+ case 5:
+ { return CJ;
+ }
+ case 18: break;
+ case 2:
+ { return ALPHANUM;
+ }
+ case 19: break;
+ case 9:
+ { return COMPANY;
+ }
+ case 20: break;
+ case 7:
+ { return APOSTROPHE;
+ }
+ case 21: break;
+ case 4:
+ { return CHINESE;
+ }
+ case 22: break;
+ case 3:
+ { return KOREAN;
+ }
+ case 23: break;
+ case 12:
+ { return EMAIL;
+ }
+ case 24: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // end of input reached with nothing consumed for this match
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
diff -N -u -r lucene-trunk/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerImpl.jflex lucene4956-branch/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerImpl.jflex
--- lucene-trunk/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerImpl.jflex 1970-01-01 09:00:00.000000000 +0900
+++ lucene4956-branch/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanTokenizerImpl.jflex 2013-05-23 01:27:56.000000000 +0900
@@ -0,0 +1,156 @@
+package org.apache.lucene.analysis.ko;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+%%
+
+%class KoreanTokenizerImpl
+%unicode
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+public static final int ALPHANUM = 0;
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ * as ACRONYMs. It is deprecated and will be removed in the next
+ * release.
+ */
+public static final int ACRONYM_DEP = 8;
+public static final int KOREAN = 9;
+public static final int CHINESE = 10;
+
+public static final String [] TOKEN_TYPES = new String [] {
+ "null
+ * @param encoding the encoding to use, null means platform default
+ * @return the list of Strings representing each line in the file, never null
+ * @throws IOException in case of an I/O error
+ * @throws java.io.UnsupportedEncodingException if the encoding is not supported by the VM
+ * @since Commons IO 1.1
+ */
+ public static Listnull
+ * @param encoding the encoding to use, null means platform default
+ * @return the list of Strings representing each line in the file, never null
+ * @throws MorphException
+ * @throws IOException
+ * @throws Exception
+ * @throws java.io.UnsupportedEncodingException if the encoding is not supported by the VM
+ * @since Commons IO 1.1
+ */
+ public static Listnew FileInputStream(file).
+ * + * At the end of the method either the stream will be successfully opened, + * or an exception will have been thrown. + *
+ * An exception is thrown if the file does not exist.
+ * An exception is thrown if the file object exists but is a directory.
+ * An exception is thrown if the file exists but cannot be read.
+ *
+ * @param file the file to open for input, must not be null
+ * @return a new {@link FileInputStream} for the specified file
+ * @throws FileNotFoundException if the file does not exist
+ * @throws IOException if the file object is a directory
+ * @throws IOException if the file cannot be read
+ * @since Commons IO 1.3
+ */
+  public static FileInputStream openInputStream(File file) throws IOException {
+    // Checks run in a fixed order: missing file, then directory, then unreadable.
+    if (!file.exists()) {
+      throw new FileNotFoundException("File '" + file + "' does not exist");
+    }
+    if (file.isDirectory()) {
+      throw new IOException("File '" + file + "' exists but is a directory");
+    }
+    if (!file.canRead()) {
+      throw new IOException("File '" + file + "' cannot be read");
+    }
+    return new FileInputStream(file);
+  }
+
+ // readLines
+ //-----------------------------------------------------------------------
+ /**
+ * Get the contents of an InputStream as a list of Strings,
+ * one entry per line, using the default character encoding of the platform.
+ *
+ * This method buffers the input internally, so there is no need to use a
+ *
+ * Character encoding names can be found at
+ * IANA.
+ *
+ * This method buffers the input internally, so there is no need to use a
+ *
+ * This method buffers the input internally, so there is no need to use a
+ * BufferedInputStream.
+ *
+ * @param input the InputStream to read from, not null
+ * @return the list of Strings, never null
+ * @throws NullPointerException if the input is null
+ * @throws IOException if an I/O error occurs
+ * @since Commons IO 1.1
+ */
+ public static ListInputStream as a list of Strings,
+ * one entry per line, using the specified character encoding.
+ * BufferedInputStream.
+ *
+ * @param input the InputStream to read from, not null
+ * @param encoding the encoding to use, null means platform default
+ * @return the list of Strings, never null
+ * @throws NullPointerException if the input is null
+ * @throws IOException if an I/O error occurs
+ * @since Commons IO 1.1
+ */
+ public static ListReader as a list of Strings,
+ * one entry per line.
+ * BufferedReader.
+ *
+ * @param input the Reader to read from, not null
+ * @return the list of Strings, never null
+ * @throws NullPointerException if the input is null
+ * @throws IOException if an I/O error occurs
+ * @since Commons IO 1.1
+ */
+ public static List