Index: contrib/CHANGES.txt
===================================================================
--- contrib/CHANGES.txt (revision 688494)
+++ contrib/CHANGES.txt (arbetskopia)
@@ -32,6 +32,9 @@
     might not be compatible with these updated classes as some algorithms have changed.
     (Karl Wettin)
 
+ 3. LUCENE-1016: TermVectorAccessor, transparent vector space access via stored vectors
+    or resolving the inverted index. (Karl Wettin)
+
 Documentation
 
  (None)
Index: contrib/miscellaneous/src/test/org/apache/lucene/index/TestTermVectorAccessor.java
===================================================================
--- contrib/miscellaneous/src/test/org/apache/lucene/index/TestTermVectorAccessor.java (revision 0)
+++ contrib/miscellaneous/src/test/org/apache/lucene/index/TestTermVectorAccessor.java (revision 0)
@@ -0,0 +1,97 @@
+package org.apache.lucene.index;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.util.Collections;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Checks that TermVectorAccessor feeds a mapper the same terms and frequencies
+ * for every document, whichever term vector setting the fields were indexed with.
+ */
+public class TestTermVectorAccessor extends TestCase {
+
+  /** One row per indexed field: field name, then field text. */
+  private static final String[][] FIELDS = {
+    {"a", "a b a c a d a e a f a g a h a"},
+    {"b", "a b c b d b e b f b g b h b"},
+    {"c", "a c b c d c e c f c g c h c"},
+  };
+
+  /** One row per document: term vector setting of fields a, b and c, in that order. */
+  private static final Field.TermVector[][] VECTOR_SETTINGS = {
+    {Field.TermVector.WITH_POSITIONS_OFFSETS, Field.TermVector.WITH_POSITIONS_OFFSETS, Field.TermVector.WITH_POSITIONS_OFFSETS},
+    {Field.TermVector.WITH_POSITIONS, Field.TermVector.WITH_POSITIONS, Field.TermVector.WITH_POSITIONS},
+    {Field.TermVector.YES, Field.TermVector.YES, Field.TermVector.YES},
+    {Field.TermVector.NO, Field.TermVector.NO, Field.TermVector.NO},
+    {Field.TermVector.WITH_POSITIONS_OFFSETS, Field.TermVector.NO, Field.TermVector.YES},
+  };
+
+  public void test() throws Exception {
+
+    Directory dir = new RAMDirectory();
+    IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Collections.EMPTY_SET), true);
+    for (int d = 0; d < VECTOR_SETTINGS.length; d++) {
+      Document doc = new Document();
+      for (int f = 0; f < FIELDS.length; f++) {
+        doc.add(new Field(FIELDS[f][0], FIELDS[f][1], Field.Store.NO, Field.Index.TOKENIZED, VECTOR_SETTINGS[d][f]));
+      }
+      iw.addDocument(doc);
+    }
+    iw.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    TermVectorAccessor accessor = new TermVectorAccessor();
+
+    for (int i = 0; i < ir.maxDoc(); i++) {
+      String message = "doc " + i;
+
+      TermFreqVector tfv = resolve(accessor, ir, i, "a");
+      assertEquals(message, "a", tfv.getTerms()[0]);
+      assertEquals(message, 8, tfv.getTermFrequencies()[0]);
+
+      tfv = resolve(accessor, ir, i, "b");
+      assertEquals(message, 8, tfv.getTermFrequencies().length);
+      assertEquals(message, "b", tfv.getTerms()[1]);
+      assertEquals(message, 7, tfv.getTermFrequencies()[1]);
+
+      tfv = resolve(accessor, ir, i, "c");
+      assertEquals(message, 8, tfv.getTermFrequencies().length);
+      assertEquals(message, "c", tfv.getTerms()[2]);
+      assertEquals(message, 7, tfv.getTermFrequencies()[2]);
+
+      // No document contains a field named "q", so nothing is materialized for it.
+      assertNull(message, resolve(accessor, ir, i, "q"));
+    }
+
+    ir.close();
+    dir.close();
+  }
+
+  /** Runs the accessor for one document and field and returns the vector the mapper materializes. */
+  private static TermFreqVector resolve(TermVectorAccessor accessor, IndexReader reader, int documentNumber, String field) throws Exception {
+    ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+    accessor.accept(reader, documentNumber, field, mapper);
+    return mapper.materializeVector();
+  }
+
+}
Index: contrib/miscellaneous/src/java/org/apache/lucene/index/TermVectorAccessor.java
===================================================================
--- contrib/miscellaneous/src/java/org/apache/lucene/index/TermVectorAccessor.java (revision 0)
+++ 
contrib/miscellaneous/src/java/org/apache/lucene/index/TermVectorAccessor.java (revision 0)
@@ -0,0 +1,198 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collection;
+import java.util.Iterator;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Transparent access to the vector space model,
+ * either via TermFreqVector or by resolving it from the inverted index.
+ * <p>
+ * Resolving a term vector from a large index can be a time consuming process.
+ * <p>
+ * Warning! This class is not thread safe!
+ */
+public class TermVectorAccessor {
+
+  public TermVectorAccessor() {
+  }
+
+  /**
+   * Instance reused to save garbage collector some time
+   */
+  private TermVectorMapperDecorator decoratedMapper = new TermVectorMapperDecorator();
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<String>*/ tokens;
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<int[]>*/ positions;
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<Integer>*/ frequencies;
+
+  /**
+   * Visits the TermVectorMapper and populates it with terms available for a given document,
+   * either via a vector created at index time or by resolving them from the inverted index.
+   * <p>
+   * When the document has no terms in the field, the inverted index path calls
+   * neither setExpectations nor map on the mapper.
+   *
+   * @param indexReader Index source
+   * @param documentNumber Source document to access
+   * @param fieldName Field to resolve
+   * @param mapper Mapper to be mapped with data
+   * @throws IOException if the index reader fails
+   */
+  public void accept(IndexReader indexReader, int documentNumber, String fieldName, TermVectorMapper mapper) throws IOException {
+
+    fieldName = fieldName.intern();
+
+    decoratedMapper.decorated = mapper;
+    decoratedMapper.termVectorStored = false;
+    try {
+      indexReader.getTermFreqVector(documentNumber, fieldName, decoratedMapper);
+    } finally {
+      // The decorator outlives this call; do not keep the caller's mapper reachable through it.
+      decoratedMapper.decorated = null;
+    }
+
+    if (!decoratedMapper.termVectorStored) {
+      // No stored vector reported for this document and field: fall back to the inverted index.
+      mapper.setDocumentNumber(documentNumber);
+      build(indexReader, fieldName, mapper, documentNumber);
+    }
+  }
+
+  /**
+   * Populates the mapper with terms available for the given field in a document
+   * by resolving the inverted index.
+   *
+   * @param indexReader Index source
+   * @param field Field to resolve
+   * @param mapper Mapper to populate; its document number has already been set by the caller
+   * @param documentNumber Source document to access
+   * @throws IOException if the index reader fails
+   */
+  private void build(IndexReader indexReader, String field, TermVectorMapper mapper, int documentNumber) throws IOException {
+
+    if (tokens == null) {
+      tokens = new ArrayList/*<String>*/(500);
+      positions = new ArrayList/*<int[]>*/(500);
+      frequencies = new ArrayList/*<Integer>*/(500);
+    } else {
+      tokens.clear();
+      frequencies.clear();
+      positions.clear();
+    }
+
+    final boolean storePositions = !mapper.isIgnoringPositions();
+
+    // Seek straight to the first term of the field rather than scanning the term dictionary from its start.
+    TermEnum termEnum = indexReader.terms(new Term(field, ""));
+    try {
+      Term term = termEnum.term();
+      while (term != null && field.equals(term.field())) {
+        collect(indexReader, term, documentNumber, storePositions);
+        term = termEnum.next() ? termEnum.term() : null;
+      }
+    } finally {
+      termEnum.close();
+    }
+
+    if (tokens.isEmpty()) {
+      // Nothing indexed for this document in this field: leave the mapper unpopulated.
+      return;
+    }
+
+    mapper.setExpectations(field, tokens.size(), false, storePositions);
+    for (int i = 0; i < tokens.size(); i++) {
+      mapper.map((String) tokens.get(i), ((Integer) frequencies.get(i)).intValue(), (TermVectorOffsetInfo[]) null, (int[]) positions.get(i));
+    }
+  }
+
+  /**
+   * Records the frequency and, if requested, the positions of a term
+   * when that term occurs in the given document.
+   *
+   * @param indexReader Index source
+   * @param term Term to look up
+   * @param documentNumber Source document to access
+   * @param storePositions Whether positions are collected
+   * @throws IOException if the index reader fails
+   */
+  private void collect(IndexReader indexReader, Term term, int documentNumber, boolean storePositions) throws IOException {
+    TermPositions termPositions = indexReader.termPositions(term);
+    try {
+      // skipTo() stops at the first document at or beyond the target,
+      // so the term belongs to this document only if doc() is the target itself.
+      if (termPositions.skipTo(documentNumber) && termPositions.doc() == documentNumber) {
+        int frequency = termPositions.freq();
+        frequencies.add(new Integer(frequency));
+        tokens.add(term.text());
+
+        if (storePositions) {
+          int[] termPositionArray = new int[frequency];
+          for (int i = 0; i < frequency; i++) {
+            termPositionArray[i] = termPositions.nextPosition();
+          }
+          positions.add(termPositionArray);
+        } else {
+          positions.add(null);
+        }
+      }
+    } finally {
+      termPositions.close();
+    }
+  }
+
+  /**
+   * Forwards every call to the wrapped mapper and remembers whether
+   * setExpectations was invoked, which accept() reads as "a stored vector was supplied".
+   */
+  private static class TermVectorMapperDecorator extends TermVectorMapper {
+
+    private TermVectorMapper decorated;
+
+    private boolean termVectorStored = false;
+
+    public boolean isIgnoringPositions() {
+      return decorated.isIgnoringPositions();
+    }
+
+    public boolean isIgnoringOffsets() {
+      return decorated.isIgnoringOffsets();
+    }
+
+    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+      decorated.setExpectations(field, numTerms, storeOffsets, storePositions);
+      termVectorStored = true;
+    }
+
+    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+      decorated.map(term, frequency, offsets, positions);
+    }
+
+    public void setDocumentNumber(int documentNumber) {
+      decorated.setDocumentNumber(documentNumber);
+    }
+  }
+
+}