Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/NGramSynonymTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/NGramSynonymTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/NGramSynonymTokenizerFactory.java (revision 0) @@ -0,0 +1,99 @@ +package org.apache.lucene.analysis.synonym; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.text.ParseException; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.util.AttributeSource.AttributeFactory; + +/** + * Factory for {@link NGramSynonymTokenizer}. + *
+ * <fieldType name="text_2gs" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ *   <analyzer type="index">
+ *     <tokenizer class="org.apache.lucene.analysis.synonym.NGramSynonymTokenizerFactory"
+ *                expand="true" indexMode="true" synonyms="synonyms.txt"/>
+ *   </analyzer>
+ *   <analyzer type="query">
+ *     <tokenizer class="org.apache.lucene.analysis.synonym.NGramSynonymTokenizerFactory"
+ *                expand="false" indexMode="false" synonyms="synonyms.txt"/>
+ *   </analyzer>
+ * </fieldType>
+ */
+public final class NGramSynonymTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
+
+  private final String synonymFiles;
+  private final boolean ignoreCase;
+  private final int n;
+  private final String delimiters;
+  private final boolean indexMode;
+  private final String format;
+  private final boolean expand;
+
+  private SynonymMap map;
+
+  public NGramSynonymTokenizerFactory(Map<String,String> args) { // FIX: raw Map -> Map<String,String>; TokenizerFactory's ctor and get()/getBoolean()/getInt() helpers all take Map<String,String>
+    super(args);
+    synonymFiles = get(args, "synonyms");
+    ignoreCase = getBoolean(args, "ignoreCase", true);
+    n = getInt( args, "n", NGramSynonymTokenizer.DEFAULT_N_SIZE );
+    delimiters = get( args, "delimiters", NGramSynonymTokenizer.DEFAULT_DELIMITERS );
+    indexMode = getBoolean(args, "indexMode", false);
+    format = get(args, "format");
+    expand = getBoolean(args, "expand", true);
+    if (!args.isEmpty()){
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override public Tokenizer create(AttributeFactory factory, Reader input) { // FIX: mark the TokenizerFactory.create override
+    return new NGramSynonymTokenizer(input, n, delimiters, indexMode, ignoreCase, map);
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if(synonymFiles == null){
+      map = null;
+      return;
+    }
+
+    Analyzer analyzer = new KeywordAnalyzer();
+
+    try {
+      String formatClass = format;
+      if (format == null || format.equals("solr")) {
+        formatClass = SolrSynonymParser.class.getName();
+      } else if (format.equals("wordnet")) {
+        formatClass = WordnetSynonymParser.class.getName();
+      }
+      // TODO: expose dedup as a parameter?
+      map = FSTSynonymFilterFactory.loadSynonyms(loader, formatClass, true, analyzer, expand, synonymFiles);
+    } catch (ParseException e) {
+      throw new IOException("Error parsing synonyms file:", e);
+    }
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilterFactory.java	(revision 1527509)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FSTSynonymFilterFactory.java	(working copy)
@@ -109,7 +109,7 @@
         formatClass = WordnetSynonymParser.class.getName();
       }
       // TODO: expose dedup as a parameter?
-      map = loadSynonyms(loader, formatClass, true, analyzer);
+      map = loadSynonyms(loader, formatClass, true, analyzer, expand, synonyms);
     } catch (ParseException e) {
       throw new IOException("Error parsing synonyms file:", e);
     }
@@ -118,7 +118,8 @@
   /**
    * Load synonyms with the given {@link SynonymMap.Parser} class.
*/ - private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + public static SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer, + boolean expand, String synonyms) throws IOException, ParseException { CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/NGramSynonymTokenizer.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/NGramSynonymTokenizer.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/NGramSynonymTokenizer.java (revision 0) @@ -0,0 +1,405 @@ +package org.apache.lucene.analysis.synonym; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.FST;
+
+public final class NGramSynonymTokenizer extends Tokenizer {
+
+  public static final int DEFAULT_N_SIZE = 2;
+  public static final String DEFAULT_DELIMITERS = "  \t\n\r"; // NOTE(review): two space characters — confirm whether one was originally U+3000 (ideographic space) before commit
+  private final int n;
+  private final String delimiters;
+  private final boolean indexMode;
+  private final boolean ignoreCase;
+  private final SynonymMap map;
+
+  private FST.Arc<BytesRef> scratchArc; // FIX: restore type parameters; raw FST types force unchecked use throughout
+  private FST<BytesRef> fst;
+  private FST.BytesReader fstReader;
+  private BytesRef scratchBytes = new BytesRef();
+  private CharsRef scratchChars = new CharsRef();
+  private int longestMatchEndOffset;
+
+  private int ch;
+  static final int BUFFER_SIZE = 4096;
+  private char[] readBuffer;
+  private int readBufferIndex;
+  private int readBufferLen;
+  StringBuilder block;
+  int blkStart, nextBlkStart, finalOffset;
+  final PriorityQueue<MyToken> queue; // FIX: a raw PriorityQueue does not compile against 'MyToken token = que.poll();' below
+  MyToken prevToken;
+  final List<MyToken> synonyms; // FIX: a raw List does not compile against 'MyToken synonym = synonyms.get(idx);' in tokenizeWholeBlock()
+
+  private CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
+  private PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
+
+  protected NGramSynonymTokenizer(Reader input, int n, String delimiters, boolean indexMode, boolean ignoreCase, SynonymMap map) {
+    super(input);
+    this.n = n;
+    this.delimiters = delimiters;
+    this.indexMode = indexMode;
+    this.ignoreCase = ignoreCase;
+    this.map = map;
+    if(map != null){
+      this.fst = map.fst;
+      if (fst == null) {
+        throw new IllegalArgumentException("fst must be non-null");
+      }
+      this.fstReader = fst.getBytesReader();
+      scratchArc = new FST.Arc<BytesRef>();
+    }
+
+    ch = 0;
+    readBuffer = new char[BUFFER_SIZE];
+    readBufferIndex = BUFFER_SIZE;
+    readBufferLen = 0;
+    block = new StringBuilder();
+    nextBlkStart = 0;
+    queue = new PriorityQueue<MyToken>(100, new MyTokensComparator());
+    this.synonyms = new ArrayList<MyToken>();
+  }
+
+  @Override public boolean incrementToken() throws IOException { // FIX: mark the TokenStream override
+    while(true){
+      MyToken nextToken = getNextUniqueToken(queue, prevToken);
+      if(nextToken == null){
+        getNextBlock();
+        if(block.length() == 0) return false;
+        consultDictionary();
+        tokenizeWholeBlock();
+      }
+      else{
+        prevToken = nextToken;
+        clearAttributes();
+        termAttr.append(nextToken.word);
+        finalOffset = correctOffset(blkStart + nextToken.endOffset);
+        offsetAttr.setOffset(correctOffset(blkStart + nextToken.startOffset), finalOffset);
+        posIncAttr.setPositionIncrement(nextToken.posInc);
+        return true;
+      }
+    }
+  }
+
+  static MyToken getNextUniqueToken(PriorityQueue<MyToken> que, MyToken prev){ // FIX: parameterized queue type (required for the unchecked-free poll())
+    while(true){
+      MyToken token = que.poll();
+      if(token == null) return null;
+      if(prev == null || !prev.identical(token)){
+        return token;
+      }
+    }
+  }
+
+  void consultDictionary() throws IOException {
+    if(map == null) return;
+    synonyms.clear();
+    char[] key = block.toString().toCharArray();
+    for(int start = 0; start < block.length();){
+      BytesRef matchOutput = getLongestMatchOutput(key, start);
+      if(matchOutput == null){
+        start++;
+        continue;
+      }
+
+      synonyms.add(new MyToken(key, start, longestMatchEndOffset, 1, matchOutput.clone()));
+      start = longestMatchEndOffset;
+    }
+  }
+
+  BytesRef getLongestMatchOutput(char[] src, int start) throws IOException {
+    BytesRef pendingOutput = fst.outputs.getNoOutput();
+    fst.getFirstArc(scratchArc);
+    assert scratchArc.output == fst.outputs.getNoOutput();
+    BytesRef matchOutput = null;
+
+    int index = 0;
+    while(start + index < src.length){ // FIX: bound by start+index — the original 'index < src.length' let start+index run past the array end when a match reached the end of the block with start > 0, making Character.codePointAt(src, start+index, src.length) throw IndexOutOfBoundsException
+      final int codePoint = Character.codePointAt(src, start + index, src.length);
+      if(fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
+          scratchArc, scratchArc, fstReader) == null){
+        return matchOutput;
+      }
+
+      pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+
+      if(scratchArc.isFinal()){
+        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+        longestMatchEndOffset = start + index + Character.charCount(codePoint);
+      }
+
+      index += Character.charCount(codePoint);
+    }
+
+    return matchOutput;
+  }
+
+  void tokenizeWholeBlock(){
+    queue.clear();
+    int nextStart = 0;
+    int end = block.length();
+    boolean afterSynonymProduced = false;
+    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+    for(int idx = 0; idx < synonyms.size(); idx++){
+      MyToken synonym = synonyms.get(idx);
+      tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);
+
+      // enqueue prev-synonym
+      if(indexMode){
+        int limitOffset = 0;
+        if(idx > 0)
+          limitOffset = synonyms.get(idx - 1).endOffset;
+        processPrevSynonym(synonym.startOffset, limitOffset);
+      }
+
+      // enqueue synonyms
+      if(indexMode){
+        bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
+        final int code = bytesReader.readVInt();
+        final boolean keepOrig = (code & 0x1) == 0; // not used
+        final int count = code >>> 1;
+        for(int i = 0; i < count; i++){
+          map.words.get(bytesReader.readVInt(), scratchBytes);
+          UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
+          String word = scratchChars.toString();
+          int posInc = 0, seq = i + 1;
+          if(synonym.word.equals(word)){
+            posInc = 1;
+            seq = 0;
+          }
+          queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
+        }
+      }
+      else{
+        queue.add(synonym);
+      }
+
+      // enqueue after-synonym
+      if(indexMode){
+        int limitOffset = block.length();
+        if(idx < synonyms.size() - 1)
+          limitOffset = synonyms.get(idx +
1).startOffset;
+        afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
+      }
+
+      nextStart = synonym.endOffset;
+    }
+    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
+  }
+
+  void tokenizePartialBlock(int startOffset, int endOffset, boolean afterSynonymProduced){ // n-grams a synonym-free region [startOffset,endOffset); a region shorter than n is emitted as one whole token
+    if(startOffset >= endOffset) return;
+
+    int posInc = afterSynonymProduced ? 0 : 1; // first gram stacks (posInc 0) on any after-synonym grams already queued
+    if(endOffset - startOffset < n){
+      queue.add(new MyToken(block.substring(startOffset, endOffset), startOffset, endOffset, posInc));
+      return;
+    }
+
+    for(int i = startOffset; i + n <= endOffset; i++){
+      queue.add(new MyToken(block.substring(i, i + n), i, i + n, posInc));
+      posInc = 1;
+    }
+  }
+
+  void processPrevSynonym(final int endOffset, final int limitOffset){ // index mode: queue the length-1..n-1 grams that end where a synonym starts (all posInc 0), down to limitOffset
+    int startOffset = endOffset - 1;
+    for(int len = 1; len < n && startOffset >= limitOffset; len++){
+      queue.add(new MyToken(block.substring(startOffset, endOffset), startOffset, endOffset, 0));
+      startOffset--;
+    }
+  }
+
+  boolean processAfterSynonym(final int startOffset, final int limitOffset){ // index mode: queue the length-1..n-1 grams starting at a synonym's end; returns true if anything was queued
+    int qSize = queue.size();
+    int endOffset = startOffset + 1;
+    int posInc = 1;
+    for(int len = 1; len < n && endOffset <= limitOffset; len++){
+      queue.add(new MyToken(block.substring(startOffset, endOffset), startOffset, endOffset, posInc));
+      endOffset++;
+      posInc = 0; // remaining grams stack at the same position
+    }
+    return queue.size() > qSize;
+  }
+
+  @Override
+  public void end(){ // NOTE(review): does not call super.end() — confirm against the Tokenizer.end() contract of the targeted Lucene version
+    offsetAttr.setOffset(finalOffset, finalOffset);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    block.setLength( 0 );
+    readBufferIndex = BUFFER_SIZE;
+    readBufferLen = 0;
+    ch = 0;
+    nextBlkStart = 0;
+  }
+
+  boolean getNextBlock() throws IOException { // reads the next delimiter-separated chunk of input into 'block'; blkStart becomes its absolute start offset
+    blkStart = nextBlkStart;
+    block.setLength(0);
+    prevToken = null;
+    while( true ){
+      if( ch != -1 )
+        ch = readCharFromBuffer();
+      if( ch == -1 ) break;
+      else if( !isDelimiter( ch ) )
+        block.append( (char)ch );
+      else if(block.length() > 0)
+        break;
+      else
+        blkStart++; // leading delimiters advance the block start offset
+    }
+    if( block.length() == 0 )
+      return false;
+    return true;
+  }
+
+  int readCharFromBuffer() throws IOException { // refills from the Reader in BUFFER_SIZE chunks; returns -1 at EOF; advances nextBlkStart per char consumed
+    if( readBufferIndex >= readBufferLen ){
+      readBufferLen = input.read( readBuffer );
+      if( readBufferLen == -1 ){
+        return -1;
+      }
+      readBufferIndex = 0;
+    }
+    int c = (int)readBuffer[readBufferIndex++];
+    nextBlkStart++;
+    return c;
+  }
+
+  boolean isDelimiter( int c ){
+    return delimiters.indexOf( c ) >= 0;
+  }
+
+  static class MyToken { // one queued output token; 'seq' orders synonyms sharing a position, 'output' carries the raw FST payload for index mode
+    final String word;
+    final int startOffset, endOffset, posInc, seq;
+    final BytesRef output;
+
+    public MyToken(char[] key, int startOffset, int endOffset, int posInc, BytesRef output){
+      this.word = new String(key, startOffset, endOffset - startOffset);
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+      this.posInc = posInc;
+      this.output = output;
+      this.seq = 0; // zero for seq means that this token is the original of synonyms
+    }
+
+    public MyToken(String word, int startOffset, int endOffset, int posInc){
+      this(word, startOffset, endOffset, posInc, Integer.MAX_VALUE); // Integer.MAX_VALUE for seq means unused
+    }
+
+    public MyToken(String word, int startOffset, int endOffset, int posInc, int seq){
+      this.word = word;
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+      this.posInc = posInc;
+      this.output = null; // means unused
+      this.seq = seq;
+    }
+
+    public boolean identical(MyToken o){ // dedup test used by getNextUniqueToken: only stacked (posInc==0) duplicates of the same span/word are dropped
+      if(o.posInc != 0) return false;
+      if(!word.equals(o.word)) return false;
+      if(startOffset != o.startOffset) return false;
+      if(endOffset != o.endOffset) return false;
+      return true;
+    }
+
+    @Override
+    public String toString(){
+      StringBuilder sb = new StringBuilder();
+      sb.append(word).append(',').append(startOffset).append(',').append(endOffset).append(',').append(posInc);
+      return sb.toString();
+    }
+
+    @Override
+    public boolean equals(Object other){
+      if(other == null || !(other instanceof MyToken)) return false;
+      MyToken o = (MyToken)other;
+      if(!word.equals(o.word)) return false;
+      if(startOffset != o.startOffset)
return false;
+      if(endOffset != o.endOffset) return false;
+      if(posInc != o.posInc) return false;
+      return true;
+    }
+
+    @Override
+    public int hashCode(){
+      return word.hashCode() + (posInc << 30) + (startOffset << 15) + endOffset; // FIX: '+' binds tighter than '<<' in Java, so the unparenthesized original parsed as ((hash+posInc) << (30+startOffset)) << (15+endOffset); parenthesize to mix exactly the fields equals() compares
+    }
+  }
+
+/*
+  static class SynInfo {
+    final String src;
+    final int offset, length;
+    final String[] synonyms;
+    Mode mode;
+    int count;
+    SynInfo(String src, int offset, int length, String[] synonyms){
+      this.src = src;
+      this.offset = offset;
+      this.length = length;
+      this.synonyms = synonyms;
+    }
+
+    static enum Mode {
+      PREV, SYN, AFTER;
+    }
+  }
+  */
+
+  static class MyTokensComparator implements Comparator<MyToken> { // FIX: raw Comparator would not compile — compare(MyToken,MyToken) does not implement compare(Object,Object)
+    public int compare(MyToken t1, MyToken t2) {
+      if(t1.startOffset < t2.startOffset) return -1;
+      else if(t1.startOffset > t2.startOffset) return 1;
+
+      if(t1.endOffset < t2.endOffset) return -1;
+      else if(t1.endOffset > t2.endOffset) return 1;
+
+      if(t1.posInc > t2.posInc) return -1;
+      else if(t1.posInc < t2.posInc) return 1;
+
+      if(t1.seq < t2.seq) return -1;
+      else if(t1.seq > t2.seq) return 1;
+
+      return 0; // FIX: was -1, violating the Comparator contract (sgn(compare(x,y)) must equal -sgn(compare(y,x))) and risking corrupt PriorityQueue ordering; ties here are field-identical tokens, so 0 is safe
+    }
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java	(revision 1527509)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AbstractAnalysisFactory.java	(working copy)
@@ -292,7 +292,7 @@
    * @param fileNames the string containing file names
    * @return a list of file names with the escaping backslashed removed
    */
-  protected final List<String> splitFileNames(String fileNames) {
+  protected static final List<String> splitFileNames(String fileNames) {
     if (fileNames == null)
       return Collections.emptyList();