Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java	(revision 637499)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java	(working copy)
@@ -18,9 +18,22 @@
  */
 
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.queryParser.QueryParser;
 
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.io.Reader;
 
 import junit.framework.TestCase;
@@ -59,63 +72,82 @@
   public void testUnigrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-    ArrayList tokens = new ArrayList();
+    // ArrayList<Token> tokens = new ArrayList<Token>();
+    ArrayList tokens = new ArrayList();
     Token token = null;
     do {
       token = tokenizer.next();
       if (token != null) {
-        tokens.add(token.toString());
-//        System.out.println(token.termText());
-//        System.out.println(token);
-//        Thread.sleep(1000);
+        tokens.add(token);
       }
     } while (token != null);
 
     assertEquals(5, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
-    assertEquals(exp, tokens);
+    String[] texts = new String[]{"a","b","c","d","e"};
+    for(int i=0; i<tokens.size(); i++){
+      Token tk = (Token)tokens.get(i);
+      assertEquals(tk.termText(), texts[i]);
+      assertEquals(tk.startOffset(), i);
+      assertEquals(tk.endOffset(), i+1);
+    }
   }
 
   public void testBigrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
-    ArrayList tokens = new ArrayList();
+    // ArrayList<Token> tokens = new ArrayList<Token>();
+    ArrayList tokens = new ArrayList();
     Token token = null;
     do {
       token = tokenizer.next();
       if (token != null) {
-        tokens.add(token.toString());
-//        System.out.println(token.termText());
-//        System.out.println(token);
-//        Thread.sleep(1000);
+        tokens.add(token);
      }
     } while (token != null);
 
     assertEquals(4, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
-    assertEquals(exp, tokens);
+    String[] texts = new String[]{"ab","bc","cd","de"};
+    for(int i=0; i<tokens.size(); i++){
+      Token tk = (Token)tokens.get(i);
+      assertEquals(tk.termText(), texts[i]);
+      assertEquals(tk.startOffset(), i);
+      assertEquals(tk.endOffset(), i+2);
+    }
   }
 
   public void testNgrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
-    ArrayList tokens = new ArrayList();
+    // Map<String,Token> tokens = new HashMap<String,Token>();
+    Map tokens = new HashMap();
     Token token = null;
     do {
       token = tokenizer.next();
      if (token != null) {
-        tokens.add(token.toString());
-//        System.out.println(token.termText());
-//        System.out.println(token);
-//        Thread.sleep(1000);
+        tokens.put(token.termText(), token);
      }
     } while (token != null);
 
-    assertEquals(12, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
-    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
-    exp.add("(abc,0,3)"); exp.add("(bcd,1,4)"); exp.add("(cde,2,5)");
-    assertEquals(exp, tokens);
+    assertEquals(12, tokens.keySet().size());
+
+    String[] grams = new String[]{
+      "a","b","c","d","e",
+      "ab","bc","cd","de",
+      "abc","bcd","cde"};
+    int[] starts = new int[]{
+      0,1,2,3,4,
+      0,1,2,3,
+      0,1,2};
+    int[] ends = new int[]{
+      1,2,3,4,5,
+      2,3,4,5,
+      3,4,5
+    };
+    for(int i=0; i<12; i++){
+      Token tk = (Token)tokens.get(grams[i]);
+      assertEquals(tk.startOffset(), starts[i]);
+      assertEquals(tk.endOffset(), ends[i]);
+    }
   }
 
   public void testOversizedNgrams() throws Exception {
@@ -126,12 +158,62 @@
       token = tokenizer.next();
       if (token != null) {
        tokens.add(token.toString());
-//        System.out.println(token.termText());
-//        System.out.println(token);
-//        Thread.sleep(1000);
       }
     } while (token != null);
 
     assertTrue(tokens.isEmpty());
   }
+
+  public void testIndexAndQueryLong() throws Exception {
+    RAMDirectory ramDir = new RAMDirectory();
+
+    IndexWriter writer = new IndexWriter(ramDir, new Analyzer(){
+      public TokenStream tokenStream(String fieldName, Reader reader){
+        return new NGramTokenizer(reader, 2, 2);
+      }
+    }, true, IndexWriter.MaxFieldLength.LIMITED);
+
+    StringBuffer bf = new StringBuffer();
+    for(int i=0; i<1024; i++){ bf.append("A"); }
+    bf.append("B");
+
+    Document d = new Document();
+    d.add(new Field("content", bf.toString(), Field.Store.YES, Field.Index.TOKENIZED));
+    writer.addDocument(d);
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(ramDir);
+    QueryParser parser = new QueryParser("content", new Analyzer(){
+      public TokenStream tokenStream(String fieldName, Reader reader){
+        return new NGramTokenizer(reader, 2, 2);
+      }
+    });
+
+    Query query = parser.parse("AB");
+    Hits hits = searcher.search(query);
+    assertEquals(1, hits.length());
+    searcher.close();
+  }
+
+  public void testIndexAndQuery() throws Exception {
+    RAMDirectory ramDir = new RAMDirectory();
+
+    IndexWriter writer = new IndexWriter(ramDir, new Analyzer(){
+      public TokenStream tokenStream(String fieldName, Reader reader){
+        return new NGramTokenizer(reader, 2, 3);
+      }
+    }, true, IndexWriter.MaxFieldLength.LIMITED);
+
+    Document d = new Document();
+    d.add(new Field("content", "ABCDEF", Field.Store.YES, Field.Index.TOKENIZED));
+    writer.addDocument(d);
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(ramDir);
+    QueryParser parser = new QueryParser("content", new Analyzer(){
+      public TokenStream tokenStream(String fieldName, Reader reader){
+        return new NGramTokenizer(reader, 2, 3);
+      }
+    });
+
+    String[] queryConds = new String[]{"AB","ABC","ABCD"};
+    for(int i=0; i<queryConds.length; i++){
+      Query query = parser.parse(queryConds[i]);
+      Hits hits = searcher.search(query);
+      assertEquals(1, hits.length());
+    }
+    searcher.close();
+  }
 }

Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(revision 637499)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java	(working copy)
@@ ... @@
   /** Returns the next token in the stream, or null at EOS. */
   public final Token next() throws IOException {
-    if (!started) {
-      started = true;
-      gramSize = minGram;
-      char[] chars = new char[1024];
-      input.read(chars);
-      inStr = new String(chars).trim();  // remove any leading or trailing spaces
-      inLen = inStr.length();
-    }
-
-    if (pos+gramSize > inLen) {          // if we hit the end of the string
-      pos = 0;                           // reset to beginning of string
-      gramSize++;                        // increase n-gram size
-      if (gramSize > maxGram)            // we are done
-        return null;
-      if (pos+gramSize > inLen)
-        return null;
+
+    int increment = 0;
+    if(gramSize < minGram){
+      int insertPos = buffpos;
+      increment = 1;
+      if (pos==0 && gramSize==-1) {
+        char[] firstRead = new char[maxGram];
+        bufflen = input.read(firstRead);
+        if(bufflen < 0) return null;
+
+        for(int i=0; i<bufflen; i++){
+          buff[i] = firstRead[i];
+        }
+      } else {
+        int charReadLen = input.read(charRead);
+        if(charReadLen > 0){
+          bufflen++;
+          for(int i=0; i<2; i++){
+            if((insertPos+maxGram*i) >= buff.length) break;
+            buff[insertPos+maxGram*i] = charRead[0];
+          }
+        }
+      }
+      while(bufflen < minGram){ // catch for charReadLen==0 case
+        int charReadLen = input.read(charRead);
+        if(charReadLen > 0){
+          bufflen++;
+          for(int i=0; i<2; i++){
+            if((insertPos+maxGram*i) >= buff.length) break;
+            buff[insertPos+maxGram*i] = charRead[0];
+          }
+        }
+        if(charReadLen < 0) return null;
+      }
+      gramSize = bufflen;
     }
-    String gram = inStr.substring(pos, pos+gramSize);
-    int oldPos = pos;
-    pos++;
-    return new Token(gram, oldPos, oldPos+gramSize);
+
+    Token tk = new Token(new String(buff, buffpos, gramSize), pos, pos+gramSize);
+    tk.setPositionIncrement(increment);
+    gramSize--;
+
+    return tk;
   }
+
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    this.pos = 0;
+    this.buffpos = 0;
+    this.gramSize = -1;
+  }
+
+  /**
+   * Set optimization criteria
+   *
+   * <li>NO_OPTIMIZE generates all available tokens
+   * <li>QUERY_OPTIMIZE skips tokens that are not needed in the query phase
+   */
+  public final void setOptimize(int flag){
+    this.optimize = flag;
+  }
 }
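
Note on the buffering scheme: the subtle part of the new next() is the pair of writes buff[insertPos+maxGram*i] for i in {0,1}. Each incoming char is stored twice, maxGram slots apart, so new String(buff, buffpos, gramSize) can always take a contiguous slice even after the logical window wraps around the start of the array. The standalone Java sketch below illustrates just that idea; the names (MirroredRingBuffer, append, window) are illustrative and do not appear in the patch, and the patch's own bookkeeping (the buff, buffpos, bufflen, charRead, and optimize fields, plus the NO_OPTIMIZE/QUERY_OPTIMIZE constants) is declared in parts of the file that are not reproduced in this excerpt.

// Minimal sketch of a mirrored ("doubled") ring buffer, assuming a fixed
// capacity equal to the patch's maxGram. Every char is written at insertPos
// and again at insertPos + capacity, so any window of up to `capacity` chars
// that starts in the first half is contiguous in memory.
public class MirroredRingBuffer {
  private final int capacity;   // plays the role of maxGram
  private final char[] buff;    // doubled backing array, like the patch's buff
  private int insertPos = 0;    // next physical write slot, like buffpos

  public MirroredRingBuffer(int capacity) {
    this.capacity = capacity;
    this.buff = new char[capacity * 2];
  }

  /** Write one char into both mirror slots, like the patch's for(i<2) loop. */
  public void append(char c) {
    for (int i = 0; i < 2; i++) {
      buff[insertPos + capacity * i] = c;
    }
    insertPos = (insertPos + 1) % capacity;
  }

  /** Read a contiguous window; no wrap-around handling is ever needed. */
  public String window(int start, int len) {
    return new String(buff, start, len);
  }

  public static void main(String[] args) {
    MirroredRingBuffer ring = new MirroredRingBuffer(3);
    for (char c : "abcde".toCharArray()) {
      ring.append(c);
    }
    // Physical array is now {d,e,c,d,e,c}; the logical last-three window
    // "cde" starts at slot 2 and reads straight through the mirrored half.
    System.out.println(ring.window(2, 3)); // prints "cde"
  }
}

Writing each char twice trades a small amount of memory (2*maxGram chars) for wrap-free reads, which matters here because next() builds a new String for every emitted gram. This also explains why reset(Reader) in the patch only has to zero pos and buffpos and set gramSize back to its -1 sentinel.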