Index: dev-tools/idea/.idea/ant.xml =================================================================== --- dev-tools/idea/.idea/ant.xml (revision 1632428) +++ dev-tools/idea/.idea/ant.xml (working copy) @@ -18,6 +18,7 @@ + Index: dev-tools/idea/.idea/modules.xml =================================================================== --- dev-tools/idea/.idea/modules.xml (revision 1632428) +++ dev-tools/idea/.idea/modules.xml (working copy) @@ -23,6 +23,7 @@ + Index: dev-tools/idea/.idea/workspace.xml =================================================================== --- dev-tools/idea/.idea/workspace.xml (revision 1632428) +++ dev-tools/idea/.idea/workspace.xml (working copy) @@ -108,6 +108,14 @@ + + + + + - + @@ -339,32 +347,33 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: dev-tools/idea/lucene/concordance/concordance.iml =================================================================== --- dev-tools/idea/lucene/concordance/concordance.iml (revision 0) +++ dev-tools/idea/lucene/concordance/concordance.iml (working copy) @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + Property changes on: dev-tools/idea/lucene/concordance/concordance.iml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: dev-tools/maven/lucene/concordance/pom.xml.template =================================================================== --- dev-tools/maven/lucene/concordance/pom.xml.template (revision 0) +++ dev-tools/maven/lucene/concordance/pom.xml.template (working copy) @@ -0,0 +1,68 @@ + + + 4.0.0 + + org.apache.lucene + lucene-parent + @version@ + ../pom.xml + + org.apache.lucene + lucene-concordance + jar + Lucene Concordance + Lucene Concordance Module + + lucene/concordance + ../../.. 
+ ${relative-top-level}/${module-directory} + + + scm:svn:${vc-anonymous-base-url}/${module-directory} + scm:svn:${vc-dev-base-url}/${module-directory} + ${vc-browse-base-url}/${module-directory} + + + + + org.apache.lucene + lucene-test-framework + test + + @lucene-concordance.internal.dependencies@ + @lucene-concordance.external.dependencies@ + @lucene-concordance.internal.test.dependencies@ + @lucene-concordance.external.test.dependencies@ + + + ${module-path}/src/java + ${module-path}/src/test + + + ${project.build.testSourceDirectory} + + **/*.java + + + + + Property changes on: dev-tools/maven/lucene/concordance/pom.xml.template ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: dev-tools/maven/lucene/pom.xml.template =================================================================== --- dev-tools/maven/lucene/pom.xml.template (revision 1632428) +++ dev-tools/maven/lucene/pom.xml.template (working copy) @@ -47,6 +47,7 @@ analysis benchmark classification + concordance demo expressions facet Index: lucene/build.xml =================================================================== --- lucene/build.xml (revision 1632428) +++ lucene/build.xml (working copy) @@ -173,6 +173,7 @@ + Index: lucene/concordance =================================================================== --- lucene/concordance (revision 1632428) +++ lucene/concordance (working copy) Property changes on: lucene/concordance ___________________________________________________________________ Added: svn:ignore ## -0,0 +1 ## +*.iml Index: lucene/concordance/build.xml =================================================================== --- lucene/concordance/build.xml (revision 0) +++ lucene/concordance/build.xml (working copy) @@ -0,0 +1,40 @@ + + + + + + Executes concordance search + + + + + + + + + + + + + + + + + + Index: lucene/concordance/ivy.xml 
=================================================================== --- lucene/concordance/ivy.xml (revision 0) +++ lucene/concordance/ivy.xml (working copy) @@ -0,0 +1,21 @@ + + + + Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/BasicMetadataExtractor.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/BasicMetadataExtractor.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/BasicMetadataExtractor.java (working copy) @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.concordance; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; + +import org.apache.lucene.index.StoredDocument; + + +public class BasicMetadataExtractor implements DocumentMetadataExtractor { + + private Set fields = new HashSet<>(); + + public void setFieldSelector(Set f) { + fields.clear(); + for (String s : f) { + fields.add(s); + } + } + + @Override + public Set getFieldSelector() { + return Collections.unmodifiableSet(fields); + } + + @Override + public Map extract(StoredDocument d) { + //only takes the first value in a multi-valued field!!! + Map map = new HashMap<>(); + for (String fieldName : getFieldSelector()) { + String[] fieldValues = d.getValues(fieldName); + + if (fieldValues != null && fieldValues.length > 0) { + map.put(fieldName, fieldValues[0]); + } + } + return map; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceConfig.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceConfig.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceConfig.java (working copy) @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance; + +public class ConcordanceConfig { + + private final static int defaultTokensBefore = 10; + private final static int defaultTokensAfter = 10; + private final static int defaultMaxWindows = 100000; + private final static int defaultMaxTargetDisplaySizeChars = 1000; + private final static int defaultMaxContextDisplaySizeChars = 10000; + + private final static ConcordanceSortOrder defaultSortOrder = ConcordanceSortOrder.PRE; + + /** + * Number of tokens to capture before the target + */ + private int tokensBefore = defaultTokensBefore; + + /** + * Number of tokens to capture after the target + */ + private int tokensAfter = defaultTokensAfter; + + /** + * Maximum number of windows to retrieve + */ + private int maxWindows = defaultMaxWindows; + + /** + * Maximum target length in characters. + */ + private int maxTargetDisplaySizeChars = defaultMaxTargetDisplaySizeChars; + + /** + * Dual purpose: + * 1) Maximum length in characters for the string before the target {@see #ConcordanceWindow.pre}. + * 2) Maximum length in characters for the string after the target {@see #ConcordanceWindow.post}. + */ + private int maxContextDisplaySizeChars = defaultMaxContextDisplaySizeChars; + + /** + * field to search + */ + private final String fieldName; + + /** + * The results of a SpanQuery in some versions of Lucene allow + * for target overlaps. 
+ */ + private boolean allowTargetOverlaps = false; + + /** + * Sort order for the windows + */ + private ConcordanceSortOrder sortOrder = defaultSortOrder; + + public ConcordanceConfig(String fieldName) { + this.fieldName = fieldName; + } + + public int getTokensBefore() { + return tokensBefore; + } + + public void setTokensBefore(int tokensBefore) { + this.tokensBefore = tokensBefore; + } + + public int getTokensAfter() { + return tokensAfter; + } + + public void setTokensAfter(int tokensAfter) { + this.tokensAfter = tokensAfter; + } + + public int getMaxWindows() { + return maxWindows; + } + + public void setMaxWindows(int maxWindows) { + this.maxWindows = maxWindows; + } + + public int getMaxTargetDisplaySizeChars() { + return maxTargetDisplaySizeChars; + } + + public void setMaxTargetDisplaySizeChars(int maxTargetDisplaySizeChars) { + this.maxTargetDisplaySizeChars = maxTargetDisplaySizeChars; + } + + public int getMaxContextDisplaySizeChars() { + return maxContextDisplaySizeChars; + } + + public void setMaxContextDisplaySizeChars(int maxContextDisplaySizeChars) { + this.maxContextDisplaySizeChars = maxContextDisplaySizeChars; + } + + public String getFieldName() { + return fieldName; + } + + public boolean isAllowTargetOverlaps() { + return allowTargetOverlaps; + } + + public void setAllowTargetOverlaps(boolean allowTargetOverlaps) { + this.allowTargetOverlaps = allowTargetOverlaps; + } + + public ConcordanceSortOrder getSortOrder() { + return sortOrder; + } + + public void setSortOrder(ConcordanceSortOrder sortOrder) { + this.sortOrder = sortOrder; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("sort order: ").append(sortOrder.toString()).append("\n"); + sb.append("tokens before: ").append(tokensBefore).append("\n"); + sb.append("tokens after: ").append(tokensAfter).append("\n"); + sb.append("max results: ").append(maxWindows).append("\n"); + sb.append("maxTargetDisplaySizeChars: 
").append(maxTargetDisplaySizeChars).append("\n"); + sb.append("maxContextDisplaySizeChars: ").append(maxContextDisplaySizeChars).append("\n"); + + return sb.toString(); + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceResults.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceResults.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceResults.java (working copy) @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance; + +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.util.FixedBitSet; + +/** + * Results of a concordance search. This includes windows and information about the search. + * + */ + +public class ConcordanceResults { + private final List windows; + private final boolean hitMax; //did the search hit the maximum number of windows + + private int numTotalWindows; + private int numTotalDocs; + private FixedBitSet docIDs; //underlying Lucene document ids that had a hit. 
+ + /* nocommit: remove this constructor if it's not used anywhere + public ConcordanceResults(){ + windows = new LinkedList(); + hitMax = false; + numTotalWindows = 0; + numTotalDocs = 0; + docIDs = new FixedBitSet(); + } + */ + + public ConcordanceResults(List windows, FixedBitSet docIDs2, int numTotalDocs, int numTotalWindows, boolean hitMax){ + this.windows = windows; + this.hitMax = hitMax; + this.docIDs = docIDs2.clone(); + this.numTotalWindows = numTotalWindows; + this.numTotalDocs = numTotalDocs; + } + + /** + * Sorts the windows according to the windows' sortKey and returns the list. + * Does not perform defensive copying of list, and the underlying list's order is changed + * by this call. + */ + public List getSortedWindows(){ + Collections.sort(windows, new ConcordanceSorter()); + return windows; + } + + /** + * Gets the windows in whatever order they are currently in... + * could be insertion order or could be sorted order depending on whether + * {@link #getSortedWindows()} has been called. + * + * {@see #getSortedWindows()} + */ + public List getWindows(){ + return windows; + } + + public boolean getHitMax(){ + return hitMax; + } + + public int getNumWindows(){ + return windows.size(); + } + + public long getNumDocs(){ + return docIDs.cardinality(); + } + + public int getNumTotalDocs(){ + return numTotalDocs; + } + + public int getNumTotalWindows(){ + return numTotalWindows; + } + + /** + * The caller must beware not to add duplicate windows. This call does not check + * for duplicates. + * + * The purpose of this is to allow a union of concordance results from multiple + * concordance searches (e.g. concordance results + * across different fields). This assumes that the underlying Lucene document id + * has not changed across the multiple searches!!! 
+ */ + public void addAll(ConcordanceResults results){ + windows.addAll(results.getWindows()); + docIDs.or(results.getDocIDs()); + + numTotalWindows = windows.size(); + numTotalDocs += results.numTotalDocs; + } + + public FixedBitSet getDocIDs(){ + return docIDs; + } + + public void setDocIDs(FixedBitSet docIDs){ + this.docIDs = docIDs; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java (working copy) @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.concordance; + +import java.io.IOException; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.queries.BooleanFilter; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsets; +import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsetsIterator; +import org.apache.lucene.search.concordance.charoffsets.OffsetLengthStartComparator; +import org.apache.lucene.search.concordance.charoffsets.OffsetUtil; +import org.apache.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader; +import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetsReader; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetResults; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.util.FixedBitSet; + +/** + * Searches an IndexReader and returns concordance windows via ConcordanceResults. + */ +public class ConcordanceSearcher { + /** + * + * @param query if SpanQuery, this gets passed through. If a regular Query, the Query is first + * converted to a SpanQuery and the filter is modified to include the original Query. + * @param metadataExtractor a simple extractor that a user can implement to pull out custom + * metadata from the document for each window. 
+ */ + public ConcordanceResults search(IndexReader reader, Query query, Filter filter, Analyzer analyzer, + ConcordanceConfig config, DocumentMetadataExtractor metadataExtractor) + throws TargetTokenNotFoundException, IllegalArgumentException, IOException { + + if (query instanceof SpanQuery) { + //pass through + return searchSpan(reader, (SpanQuery)query, filter, analyzer, config, metadataExtractor); + } else { + //convert regular query to a SpanQuery. + SpanQueryConverter converter = new SpanQueryConverter(); + SpanQuery spanQuery = converter.convert(config.getFieldName(), query); + + Filter origQueryFilter = new QueryWrapperFilter(query); + Filter updatedFilter = origQueryFilter; + + if (filter != null) { + BooleanFilter combinedFilter = new BooleanFilter(); + combinedFilter.add(origQueryFilter, Occur.MUST); + combinedFilter.add(filter, Occur.MUST); + updatedFilter = combinedFilter; + } + return searchSpan(reader, spanQuery, updatedFilter, analyzer, config, metadataExtractor); + } + } + + /** + * Like {@link #search(IndexReader, Query, Filter, Analyzer, ConcordanceConfig, DocumentMetadataExtractor)}, + * but this takes an actual SpanQuery. 
+ */ + public ConcordanceResults searchSpan(IndexReader reader, SpanQuery spanQuery, Filter filter, Analyzer analyzer, + ConcordanceConfig config, DocumentMetadataExtractor metadataExtractor) + throws TargetTokenNotFoundException, IllegalArgumentException, IOException { + + spanQuery = (SpanQuery)spanQuery.rewrite(reader); + DocTokenOffsetsIterator itr = new DocTokenOffsetsIterator(); + Set fields = new HashSet<>(metadataExtractor.getFieldSelector()); + fields.add(config.getFieldName()); + itr.reset(spanQuery, filter, reader, fields); + return buildResults(itr, reader, analyzer, config, metadataExtractor); + } + + + private ConcordanceResults buildResults(DocTokenOffsetsIterator itr, IndexReader reader, Analyzer analyzer, + ConcordanceConfig config, DocumentMetadataExtractor metadataExtractor) + throws IllegalArgumentException, TargetTokenNotFoundException, IOException { + List windows = new LinkedList<>(); + + boolean stop = false; + int totalNumDocs = reader.numDocs(); + + int numTotalWindows = 0; + + TokenCharOffsetRequests requests; + WindowBuilder windowBuilder = new WindowBuilder(); + TokenCharOffsetsReader tokenOffsetsRecordReader = new ReanalyzingTokenCharOffsetsReader(analyzer); + + TokenCharOffsetResults offsetResults = new TokenCharOffsetResults(); + FixedBitSet docIDs = new FixedBitSet(reader.maxDoc()); + DocTokenOffsets result = null; + OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator(); + while (itr.next() && ! stop) { + result = itr.getDocTokenOffsets(); + StoredDocument document = result.getDocument(); + + docIDs.set(result.getUniqueDocId()); + + String[] fieldValues = document.getValues(config.getFieldName()); + if (fieldValues.length == 0) { + throw new IllegalArgumentException("did you forget to load or specify the correct content field?!"); + } + + Map metadata = metadataExtractor.extract(document); + List offsets = result.getOffsets(); + if (! 
config.isAllowTargetOverlaps()) { + //remove overlapping hits!!! + offsets = OffsetUtil.removeOverlapsAndSort(offsets, offsetLengthStartComparator, null); + } + //get the required character offsets + requests = ConcordanceSearcherUtil.getCharOffsetRequests(offsets, config); + offsetResults.clear(); + + offsetResults = tokenOffsetsRecordReader.getTokenCharOffsetResults + (document, config.getFieldName(), requests, offsetResults); + + for (OffsetAttribute offset : offsets) { + + ConcordanceWindow w = windowBuilder.buildConcordanceWindow + (result.getUniqueDocId(), offset.startOffset(), + offset.endOffset()-1, metadata, config, fieldValues, offsetResults); + + windows.add(w); + numTotalWindows++; + if (config.getMaxWindows() > -1 && windows.size() >= config.getMaxWindows()) { + stop = true; + break; + } + } + } + + return new ConcordanceResults(windows, docIDs, totalNumDocs, numTotalWindows, stop); + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java (working copy) @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance; + +import java.util.List; + +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * In other applications with variations on the ConcordanceSearcher, it has been useful + * to factor out the getCharOffsetRequests. + * + * This class should be used for functionality that is generally useful for concordance searching. + * + */ +public class ConcordanceSearcherUtil { + + + public static TokenCharOffsetRequests getCharOffsetRequests(List offsets, ConcordanceConfig config) { + return getCharOffsetRequests(offsets, config.getTokensBefore(), config.getTokensAfter() + 1); + } + + public static TokenCharOffsetRequests getCharOffsetRequests(List offsets) { + return getCharOffsetRequests(offsets, 0, 1); + } + + /** + */ + public static TokenCharOffsetRequests getCharOffsetRequests(List offsets, int tokensBefore, int tokensAfter) { + TokenCharOffsetRequests requests = new TokenCharOffsetRequests(); + + for (OffsetAttribute offset : offsets) { + int start = offset.startOffset()-tokensBefore; + start = (start < 0) ? 
0 : start; + int end = offset.endOffset()+tokensAfter; + for (int i = start; i < end; i++) { + requests.add(i); + } + } + return requests; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java (working copy) @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance; + +public enum ConcordanceSortOrder { + PRE, //sort on the first token before the target, then the second word, etc. 
+ POST, //sort on words after the target + TARGET_PRE, //sort on the target and then words before the target + TARGET_POST, //sort on the target and then words after the target + DOC, //sort on the Lucene document id + NONE //no sort +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java (working copy) @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.concordance; + +import java.io.Serializable; +import java.util.Comparator; + + +public class ConcordanceSorter implements Comparator, Serializable { + private static final long serialVersionUID = 7526472295622776147L; + @Override + public int compare(ConcordanceWindow w1, ConcordanceWindow w2) { + + return w1.getSortKey().compareTo(w2.getSortKey()); + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java (working copy) @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.concordance; + +import java.util.Map; + +/** + * Key element in a concordance view of data. + * A window consists of the words before a target term (pre), the target term and then the words + * after the target term (post). A window also has a sort key to allow for various methods + * of sorting. 
+ * + * For various applications, it has also been useful to store the (admittedly ephemeral) + * Lucene document id, character offset (start and end) of the full window + * as well as metadata from the document for the given window. + * + * This class is experimental and may change in incompatible ways in the future. + * + * Areas for improvement: + * 1) convert sortKey to an array of Comparables + * 2) ... + */ +public class ConcordanceWindow { + + private final String sortKey; + private final String pre; + private final String target; + private final String post; + private final Map metadata; + private final int charStart; + private final int charEnd; + private final long docID; + + public ConcordanceWindow(long docID, int charStart, int charEnd, + String pre, String target, String post, String sortKey, Map metadata) { + this.pre = pre; + this.target = target; + this.post = post; + this.docID = docID; + this.charStart = charStart; + this.charEnd = charEnd; + this.metadata = metadata; + this.sortKey = sortKey; + } + public long getDocID() { + return docID; + } + + public int getStart() { + return charStart; + } + public int getEnd() { + return charEnd; + } + public Map getMetadata() { + return metadata; + } + + public String getPre() { + return pre; + } + public String getPost() { + return post; + } + public String getTarget() { + return target; + } + public int getSize() { + int size = 0; + if (pre != null) { + size += pre.length(); + } + if (target != null) { + size += target.length(); + } + if (post != null) { + size += post.length(); + } + return size; + } + public String getSortKey() { + return sortKey; + } + + public String toString() { + //this assumes left to right language + StringBuilder sb = new StringBuilder(); + sb.append(pre).append(":").append(target).append(":").append(post); + return sb.toString(); + } + + @Override + public int hashCode() + { + final int prime = 31; + int result = 1; + result = prime * result + charEnd; + result = prime * result 
+ charStart; + result = prime * result + (int) (docID ^ (docID >>> 32)); + result = prime * result + ((metadata == null) ? 0 : metadata.hashCode()); + result = prime * result + ((post == null) ? 0 : post.hashCode()); + result = prime * result + ((pre == null) ? 0 : pre.hashCode()); + result = prime * result + ((sortKey == null) ? 0 : sortKey.hashCode()); + result = prime * result + ((target == null) ? 0 : target.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) + return true; + if (obj == null) + return false; + if ( ! (obj instanceof ConcordanceWindow)) + return false; + + ConcordanceWindow other = (ConcordanceWindow)obj; + + if (charEnd != other.charEnd) + return false; + if (charStart != other.charStart) + return false; + if (docID != other.docID) + return false; + + if (metadata == null) { + if (other.metadata != null) + return false; + } else if (!metadata.equals(other.metadata)) { + return false; + } + + if (post == null) { + if (other.post != null) + return false; + } else if (!post.equals(other.post)) { + return false; + } + + if (pre == null) { + if (other.pre != null) + return false; + } else if (!pre.equals(other.pre)) { + return false; + } + + if (sortKey == null) { + if (other.sortKey != null) + return false; + } else if (!sortKey.equals(other.sortKey)) { + return false; + } + + if (target == null) { + if (other.target != null) + return false; + } else if (!target.equals(other.target)) { + return false; + } + + return true; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentMetadataExtractor.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentMetadataExtractor.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentMetadataExtractor.java (working copy) @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.concordance; + +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.StoredDocument; + +/** + * Lightweight interface that extracts document metadata to be stored + * with each window that is extracted. + * + * For now, it can only extract key-value pairs of type String, String. + */ + +public interface DocumentMetadataExtractor { + public Set getFieldSelector(); + public Map extract(StoredDocument document); +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/SpanQueryConverter.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/SpanQueryConverter.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/SpanQueryConverter.java (working copy) @@ -0,0 +1,262 @@ +package org.apache.lucene.search.concordance; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +public class SpanQueryConverter { + /** + * Converts a regular query to a {@link SpanQuery} for use in a highlighter. + * Because of subtle differences in {@link SpanQuery} and {@link Query}, this {@link SpanQuery} will + * not necessarily return the same documents as the initial Query. For example, + * the generated SpanQuery will not include clauses of type {@link BooleanClause.Occur#MUST_NOT}. 
+ * Also, the {@link SpanQuery} will only cover a single field, whereas the {@link Query} might contain + * multiple fields. + * + * Returns an empty SpanQuery if it can't convert from a {@link Query} to a {@link SpanQuery}. + * This can happen for many reasons: e.g. if the Query contains no terms in the requested "field". + * + * This class does not rewrite the SpanQuery. Consumers are required to rewrite if necessary. + * + * Much of this code is copied directly from oal.search.highlight.WeightedSpanTermExtractor. + * There are some subtle differences. + * + * @return SpanQuery for use in highlighting; can return empty SpanQuery + */ + public SpanQuery convert(String field, Query query) throws IOException { + /* copied nearly verbatim from org.apache.lucene.search.highlight.WeightedSpanTermExtractor + * TODO: refactor to avoid duplication of code if possible. Beware: there are some subtle differences. + */ + if (query instanceof SpanQuery) { + SpanQuery sq = (SpanQuery)query; + if (sq.getField().equals(field)) { + return (SpanQuery)query; + } else { + return getEmptySpanQuery(); + } + } else if (query instanceof BooleanQuery) { + BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); + List spanQs = new ArrayList<>(); + for (BooleanClause queryClause : queryClauses) { + if (!queryClause.isProhibited()) { + tryToAdd(field, convert(field, queryClause.getQuery()), spanQs); + } + } + if (spanQs.size() == 0) { + return getEmptySpanQuery(); + } else if (spanQs.size() == 1) { + return spanQs.get(0); + } else { + return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])); + } + } else if (query instanceof PhraseQuery) { + PhraseQuery phraseQuery = ((PhraseQuery) query); + + Term[] phraseQueryTerms = phraseQuery.getTerms(); + if (phraseQueryTerms.length == 0) { + return getEmptySpanQuery(); + } else if (! 
phraseQueryTerms[0].field().equals(field)) { + return getEmptySpanQuery(); + } + SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length]; + for (int i = 0; i < phraseQueryTerms.length; i++) { + clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); + } + int slop = phraseQuery.getSlop(); + int[] positions = phraseQuery.getPositions(); + // add largest position increment to slop + if (positions.length > 0) { + int lastPos = positions[0]; + int largestInc = 0; + int sz = positions.length; + for (int i = 1; i < sz; i++) { + int pos = positions[i]; + int inc = pos - lastPos; + if (inc > largestInc) { + largestInc = inc; + } + lastPos = pos; + } + if (largestInc > 1) { + slop += largestInc; + } + } + + boolean inorder = false; + + if (slop == 0) { + inorder = true; + } + + SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); + sp.setBoost(query.getBoost()); + return sp; + } else if (query instanceof TermQuery) { + TermQuery tq = (TermQuery)query; + if (tq.getTerm().field().equals(field)) { + return new SpanTermQuery(tq.getTerm()); + } else { + return getEmptySpanQuery(); + } + } else if (query instanceof FilteredQuery) { + return convert(field, ((FilteredQuery)query).getQuery()); + } else if (query instanceof ConstantScoreQuery) { + return convert(field, ((ConstantScoreQuery) query).getQuery()); + } else if (query instanceof CommonTermsQuery) { + // specialized since rewriting would change the result query + // this query is TermContext sensitive. 
+ CommonTermsQuery ctq = (CommonTermsQuery)query; + + Set terms = new HashSet<>(); + ctq.extractTerms(terms); + List spanQs = new LinkedList<>(); + + for (Term term : terms) { + if (term.field().equals(field)) { + spanQs.add(new SpanTermQuery(term)); + } + } + if (spanQs.size() == 0) { + return getEmptySpanQuery(); + } else if (spanQs.size() == 1) { + return spanQs.get(0); + } else { + return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])); + } + } else if (query instanceof DisjunctionMaxQuery) { + List spanQs = new ArrayList<>(); + for (Query q : ((DisjunctionMaxQuery)query)) { + tryToAdd(field, convert(field, q), spanQs); + } + if (spanQs.size() == 0) { + return getEmptySpanQuery(); + } else if (spanQs.size() == 1) { + return spanQs.get(0); + } else { + return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])); + } + } else if (query instanceof MultiPhraseQuery) { + final MultiPhraseQuery mpq = (MultiPhraseQuery) query; + + final List termArrays = mpq.getTermArrays(); + if (termArrays.size() == 0) { + return getEmptySpanQuery(); + } else if (termArrays.size() > 1) { + Term[] ts = termArrays.get(0); + if (ts.length > 0) { + Term t = ts[0]; + if (! 
t.field().equals(field)) { + return getEmptySpanQuery(); + } + } + } + final int[] positions = mpq.getPositions(); + if (positions.length > 0) { + + int maxPosition = positions[positions.length - 1]; + for (int i = 0; i < positions.length - 1; ++i) { + if (positions[i] > maxPosition) { + maxPosition = positions[i]; + } + } + + @SuppressWarnings("unchecked") + final List[] disjunctLists = new List[maxPosition + 1]; + int distinctPositions = 0; + + for (int i = 0; i < termArrays.size(); ++i) { + final Term[] termArray = termArrays.get(i); + List disjuncts = disjunctLists[positions[i]]; + if (disjuncts == null) { + disjuncts = (disjunctLists[positions[i]] = new ArrayList<>(termArray.length)); + ++distinctPositions; + } + for (Term aTermArray : termArray) { + disjuncts.add(new SpanTermQuery(aTermArray)); + } + } + + int positionGaps = 0; + int position = 0; + final SpanQuery[] clauses = new SpanQuery[distinctPositions]; + for (List disjuncts : disjunctLists) { + if (disjuncts != null) { + clauses[position++] = new SpanOrQuery(disjuncts.toArray(new SpanQuery[disjuncts.size()])); + } else { + ++positionGaps; + } + } + + final int slop = mpq.getSlop(); + final boolean inorder = (slop == 0); + + SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); + sp.setBoost(query.getBoost()); + return sp; + } + + } + return convertUnknownQuery(query); + } + + private void tryToAdd(String field, SpanQuery q, List qs) { + if (q == null || isEmptyQuery(q) || ! 
q.getField().equals(field)) { + return; + } + qs.add(q); + } + + protected SpanQuery convertUnknownQuery(Query query) throws IOException { + // for sub-classing to extract custom queries + return getEmptySpanQuery(); + } + + private SpanQuery getEmptySpanQuery() { + SpanQuery q = new SpanOrQuery(new SpanTermQuery[0]); + return q; + } + + private boolean isEmptyQuery(SpanQuery q) { + if (q instanceof SpanOrQuery + && ((SpanOrQuery)q).getClauses().length == 0) { + return true; + } + return false; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java (working copy) @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.concordance; + +import java.util.Map; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl; +import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetResults; + +/** + * Builds a {@link ConcordanceWindow} + */ +public class WindowBuilder { + + private final static String SPACE = " "; + private final static String EMPTY_STRING = ""; + public static String INTER_MULTIVALUE_FIELD_PADDING = SPACE; + + + /** + * Makes the assumption that the target token start and target token end + * can be found. If not, this throws a TargetTokenNotFoundException. + * + * @param docID + * Lucene internal docid, used only if sort type is DOC + * @param targetTokenStart + * Target's start token + * @param targetTokenEnd + * Target's end token + * @param metadata + * Metadata to be stored with the window + * @param config + * ConcordanceConfig + * @param fieldValues + * values of the stored (potentially multi-valued) field from which the window text is built + * @param offsets + * TokenCharOffsetResults mapping token offsets to character offsets within fieldValues 
+ * @return + * the built ConcordanceWindow; a TargetTokenNotFoundException is thrown if character offset information cannot be found for + * both the targetTokenStart and the targetTokenEnd + */ + public ConcordanceWindow buildConcordanceWindow(long docID, int targetTokenStart, int targetTokenEnd, + Map metadata, ConcordanceConfig config, + String[] fieldValues, TokenCharOffsetResults offsets) + throws TargetTokenNotFoundException, IllegalArgumentException { + + if (targetTokenStart < 0 || targetTokenEnd < 0) { + throw new IllegalArgumentException("targetTokenStart and targetTokenEnd must be >= 0"); + } + if (targetTokenEnd < targetTokenStart) { + throw new IllegalArgumentException("targetTokenEnd must be >= targetTokenStart"); + } + int startFieldIndex = offsets.getFieldIndex(targetTokenStart); + int endFieldIndex = offsets.getFieldIndex(targetTokenEnd); + /*if (fieldIndex != offsets.getFieldIndex(targetTokenEnd)) { + //you're asking for a window across different entries in a field. + //no soup for you. + throw new IllegalArgumentException("Can't request a window across different field indices in a multi-valued field"); + }*/ + if (startFieldIndex < 0 || endFieldIndex < 0) { + //target not found + throw new IllegalArgumentException("field index must be >= 0"); + } + if (startFieldIndex >= fieldValues.length || endFieldIndex >= fieldValues.length) { + //something went horribly wrong. + //can't ask for a window from array index out of bounds exception + throw new IllegalArgumentException("fieldIndex out of bounds exception"); + } + String startS = fieldValues[startFieldIndex]; + String endS = (startFieldIndex == endFieldIndex) ? startS : fieldValues[endFieldIndex]; + if (startS == null || endS == null) { + //something went horribly wrong. 
+ throw new IllegalArgumentException("field value is null"); + } + int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart); + int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd); + + if (targetCharStart < 0 || targetCharEnd < 0) { + throw new TargetTokenNotFoundException("couldn't find character offsets for a target token.\n"+ + "Check that your analyzers are configured properly.\n"); + } + + OffsetAttribute preOffset = getPreOffset(startFieldIndex, targetTokenStart, targetCharStart, config, offsets); + String preString = silentlySafeSubstring(startS, preOffset); + + OffsetAttribute postOffset = getPostOffset(endFieldIndex, targetTokenEnd, targetCharEnd, config, offsets); + String postString = silentlySafeSubstring(endS, postOffset); + + String targ = getTargetString(targetTokenStart, targetTokenEnd, targetCharStart, + targetCharEnd, fieldValues, offsets); + + String sortKey = getSortKey(docID, startFieldIndex, endFieldIndex, targetTokenStart, targetTokenEnd, config, offsets); + int charStart = (preOffset == null) ? targetCharStart : preOffset.startOffset(); + int charEnd = (postOffset == null) ? 
targetCharEnd : postOffset.endOffset(); + return new ConcordanceWindow(docID, charStart, charEnd, preString, targ, postString, + sortKey, metadata); + } + + private String getTargetString(int targetTokenStart, int targetTokenEnd, + int targetCharStart, int targetCharEnd, String[] fieldValues, + TokenCharOffsetResults offsets) { + + int startIndex = offsets.getFieldIndex(targetTokenStart); + int endIndex = offsets.getFieldIndex(targetTokenEnd); + + if (startIndex == endIndex) { + String s = fieldValues[startIndex]; + return silentlySafeSubstring(s, targetCharStart, targetCharEnd); + } + StringBuilder sb = new StringBuilder(); + String fStart = fieldValues[startIndex]; + sb.append(fStart.substring(targetCharStart)); + for (int i = startIndex+1; i < endIndex; i++) { + sb.append(INTER_MULTIVALUE_FIELD_PADDING); + sb.append(fieldValues[i]); + } + sb.append(INTER_MULTIVALUE_FIELD_PADDING); + sb.append(fieldValues[endIndex].substring(0,targetCharEnd)); + return sb.toString(); + } + + private String getSortKey(long docID, int startFieldIndex, int endFieldIndex, int start, int end, + ConcordanceConfig config, TokenCharOffsetResults charOffsets) { + //TODO: Create interface for sort key generator + //for room to grow. Hard coded for now. 
+ + StringBuilder sb = new StringBuilder(); + ConcordanceSortOrder sortOrder = config.getSortOrder(); + if (sortOrder == ConcordanceSortOrder.NONE) { + return EMPTY_STRING; + } + //hack zero left pad the tokenoffset with 10 0's + if (sortOrder == ConcordanceSortOrder.DOC) { + String docIDString = padLeft(10, "0", Long.toString(docID)); + String startOffsetString = padLeft(10, "0", + Integer.toString(start)); + sb.append(docIDString).append(SPACE).append(startOffsetString); + } + + if (sortOrder == ConcordanceSortOrder.TARGET_POST || + sortOrder == ConcordanceSortOrder.TARGET_PRE) { + + for (int i = start; i <= end; i++) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) + sb.append(tmp).append(" "); + } + } + if (sortOrder == ConcordanceSortOrder.PRE || + sortOrder == ConcordanceSortOrder.TARGET_PRE) { + int tmpStart = start-1; + int tmpEnd = Math.max(0, start-config.getTokensBefore()); + if (tmpStart < 0) { + sb.append(" "); + } + + for (int i = tmpStart; i >= tmpEnd; i--) { + if (charOffsets.getFieldIndex(i) == startFieldIndex) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(" "); + } + } else { + break; + } + } + + } else if (sortOrder == ConcordanceSortOrder.POST || + sortOrder == ConcordanceSortOrder.TARGET_POST) { + + int tmpStart = end+1; + int tmpEnd = Math.min(end+config.getTokensAfter(), charOffsets.getLast()); + + if (tmpStart > charOffsets.getLast()) { + sb.append(" "); + } + for (int i = tmpStart; i <= tmpEnd; i++) { + if (charOffsets.getFieldIndex(i) == endFieldIndex) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } + } else { + break; + } + } + } + return sb.toString().trim(); + } + + private OffsetAttribute getPreOffset(int fieldIndex, int targetTokenStart, int targetCharStart, + ConcordanceConfig config, TokenCharOffsetResults charOffsets) { + if (config.getTokensBefore() == 0) + return null; + 
+ if (targetTokenStart == 0) { + return null; + } + int startTokenOffset = Math.max(0, targetTokenStart-config.getTokensBefore()); + + int windowStartChar = charOffsets.getClosestCharStart(fieldIndex, startTokenOffset, targetTokenStart); + + int windowEndChar = Math.max(windowStartChar, targetCharStart - 1); + + return buildOffsetAttribute(windowStartChar, windowEndChar); + } + + + private OffsetAttribute getPostOffset(int fieldIndex, int targetTokenEnd, int targetCharEnd, + ConcordanceConfig config, TokenCharOffsetResults charOffsets) { + if (config.getTokensAfter() == 0) + return null; + int windowTokenEnd = targetTokenEnd+config.getTokensAfter(); + int windowCharStart = targetCharEnd; + int windowCharEnd = charOffsets.getClosestCharEnd(fieldIndex, windowTokenEnd, targetTokenEnd+1); + if (windowCharStart >= windowCharEnd) { + return null; + } + return buildOffsetAttribute(windowCharStart, windowCharEnd); + } + + private OffsetAttribute buildOffsetAttribute(int start, int end) { + OffsetAttribute off = new OffsetAttributeImpl(); + off.setOffset(start, end); + return off; + } + + private String padLeft(int number, String add, String s) { + if (s.length() >= number) + return s; + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < number- s.length(); i++) { + sb.append(add); + } + sb.append(s); + return sb.toString(); + } + + private String silentlySafeSubstring(String s, OffsetAttribute offset) { + if (offset == null) + return EMPTY_STRING; + + return silentlySafeSubstring(s, offset.startOffset(), offset.endOffset()); + } + + private String silentlySafeSubstring(String s, int startOffset, int endOffset) { + + if (startOffset >= endOffset || startOffset < 0 || + startOffset >= s.length() || endOffset > s.length()) { + return EMPTY_STRING; + } + return s.substring(startOffset, endOffset); + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java 
=================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java (working copy) @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.concordance.charoffsets; + +import java.util.ArrayList; + +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl; +import org.apache.lucene.index.StoredDocument; + +/** + * Simple class to store a document id (leaf and unique), a StoredDocument, and the offsets + * for a SpanQuery hit + * + */ + +public class DocTokenOffsets { + private int leafDocId = -1; + private int uniqueId = -1; + private StoredDocument document = null; + private List offsets = new ArrayList<>(); + + public void setDocument(StoredDocument d) { + this.document = d; + } + public void addOffset(int start, int end) { + OffsetAttributeImpl offset = new OffsetAttributeImpl(); + offset.setOffset(start, end); + offsets.add(offset); + } + + public void reset(int base, int leafDocId, StoredDocument d, int start, int end) { + this.leafDocId = leafDocId; + this.uniqueId = base+leafDocId; + setDocument(d); + offsets.clear(); + addOffset(start,end); + } + + public List getOffsets() { + return offsets; + } + + public StoredDocument getDocument() { + return document; + } + + public int getLeafDocId() { + return leafDocId; + } + + public int getUniqueDocId() { + return uniqueId; + } + + public DocTokenOffsets deepishCopy() { + DocTokenOffsets copy = new DocTokenOffsets(); + copy.leafDocId = leafDocId; + copy.uniqueId = uniqueId; + copy.document = document; + List copyOffsets = new ArrayList<>(); + copyOffsets.addAll(offsets); + copy.offsets = copyOffsets; + return copy; + } + + public boolean isEmpty() { + if (leafDocId < 0) + return true; + return false; + } + + public void pseudoEmpty() { + leafDocId = -1; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java =================================================================== --- 
lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java (working copy) @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.concordance.charoffsets; + +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; + +/** + * Scaffolding/Sugar class around SpanQuery.getSpans(...). 
This allows the client + * to iterate on an IndexReader (not necessarily a leaf) by document (DocTokenOffsets). + */ +public class DocTokenOffsetsIterator { + /* + * NOT THREAD SAFE!!! + */ + private SpanQuery spanQuery; + private Filter filter; + private LinkedList leafReaders = new LinkedList<>(); + private LeafReader currReader = null; + private Set fields; + private Spans spans = null; + private DocTokenOffsets docTokenOffsets = new DocTokenOffsets(); + private DocTokenOffsets docTokenOffsetsBuffer = new DocTokenOffsets(); + private int currentBase = -1; + + private Map termMap = new HashMap<>(); + + public DocTokenOffsetsIterator() { + } + + public void reset(SpanQuery q, Filter f, IndexReader reader, Set fields) throws IOException { + + this.spanQuery = q; + this.filter = f; + + this.fields = fields; + leafReaders.addAll(reader.leaves()); + if (leafReaders.size() > 0) { + reinitSpans(); + } + } + public boolean next() throws IOException { + + if (spans == null || docTokenOffsetsBuffer.isEmpty()) { + if (leafReaders.size()==0) { + return false; + } else if (! reinitSpans()) { + return false; + } + + } + boolean currSpansHasMore = false; + while (spans.next()) { + if (spans.doc() == docTokenOffsetsBuffer.getLeafDocId()) { + docTokenOffsetsBuffer.addOffset(spans.start(), spans.end()); + } else { + currSpansHasMore = true; + break; + } + } + docTokenOffsets = docTokenOffsetsBuffer.deepishCopy(); + + if (currSpansHasMore) { + StoredDocument d = currReader.document(spans.doc(), fields); + docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end()); + } else { + docTokenOffsetsBuffer.pseudoEmpty(); + } + return true; + } + + public DocTokenOffsets getDocTokenOffsets() { + return docTokenOffsets; + } + + private boolean reinitSpans() throws IOException { + //must check that leafReaders.size() > 0 before running this!!! 
+ LeafReaderContext ctx = leafReaders.pop(); + currentBase = ctx.docBase; + currReader = ctx.reader(); + Bits bits = null; + Bits liveBits = currReader.getLiveDocs(); + //liveBits can be null if all of the docs are live!!! + if (filter == null) { + bits = liveBits; + } else { + DocIdSet idSet = filter.getDocIdSet(ctx,liveBits); + if (idSet instanceof FixedBitSet) { + bits = (FixedBitSet)idSet; + } else { + DocIdSetIterator itr = idSet.iterator(); + if (itr != null) { + FixedBitSet tmpBits = new FixedBitSet(currReader.maxDoc()); + while (itr.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + tmpBits.set(itr.docID()); + } + bits = tmpBits; + } + } + } + /*bits() is optional; this doesn't work!!!! + bits = idSet.bits(); + */ + + //bits can be null if all the docs are live + //or if the filter returned an empty docidset. + if (filter != null && bits == null) { + if (leafReaders.size() > 0) { + return reinitSpans(); + } else { + return false; + } + } + + spans = spanQuery.getSpans(ctx, bits, termMap); + //can getSpans return null? + if (spans != null && spans.next()) { + StoredDocument d = currReader.document(spans.doc(), fields); + + docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end()); + return true; + } else if (leafReaders.size() > 0) { + return reinitSpans(); + } else { + return false; + } + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/FieldIndexCharacterOffsetPair.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/FieldIndexCharacterOffsetPair.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/FieldIndexCharacterOffsetPair.java (working copy) @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance.charoffsets; + +/** + * Pair of field index and character offset. + * The fieldIndex records the index in a potentially multi-valued field (array). + * The charOffset records the character offset within that field within that value in the potentially + * multi-valued field. + */ +public class FieldIndexCharacterOffsetPair { + private final int fieldIndex; + private final int charOffset; + + public FieldIndexCharacterOffsetPair(int fieldIndex, int charOffset) { + this.fieldIndex = fieldIndex; + this.charOffset = charOffset; + } + + public int getFieldIndex() { + return fieldIndex; + } + + public int getCharOffset() { + return charOffset; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java (working copy) @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance.charoffsets; + +import java.util.Comparator; +import java.io.Serializable; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Sorts length desc, start offset asc + * + */ + +public class OffsetLengthStartComparator implements Comparator, Serializable { + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) { + + int lenA = offsetA.endOffset()-offsetA.startOffset(); + int lenB = offsetB.endOffset()-offsetB.startOffset(); + if (lenA < lenB) { + return 1; + } else if (lenA > lenB) { + return -1; + //by here, the length is the same + } else if (offsetA.startOffset() < offsetB.startOffset()) { + return -1; + } else if (offsetA.startOffset() > offsetB.startOffset()) { + return 1; + } + return 0; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java 
(working copy) @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance.charoffsets; + +import java.util.Comparator; +import java.io.Serializable; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * sort on offset start + */ +public class OffsetStartComparator implements Comparator, Serializable{ + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) { + + if (offsetA.startOffset() < offsetB.startOffset()){ + return -1; + } else if (offsetA.startOffset() > offsetB.startOffset()){ + return 1; + } + return 0; + } + +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java (working copy) @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.concordance.charoffsets; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * In some versions of Lucene, getSpans returned overlapping spans. + * This class can remove the overlapping spans and will sort them + * if startComparator is not null. + * + * + */ +public class OffsetUtil { + + + public static List removeOverlapsAndSort(List offsets, + OffsetLengthStartComparator comparator, OffsetStartComparator startComparator) { + if (offsets == null || offsets.size() < 2) + return offsets; + + Collections.sort(offsets, comparator); + Set seen = new HashSet<>(); + List filtered = new ArrayList<>(); + for (OffsetAttribute offset : offsets) { + if (! 
alreadySeen(offset, seen)) { + filtered.add(offset); + for (int i = offset.startOffset(); i < offset.endOffset(); i++) { + seen.add(i); + } + } + } + if (startComparator != null) { + Collections.sort(filtered, startComparator); + } + return filtered; + } + + private static boolean alreadySeen(OffsetAttribute offset, Set seen) { + for (int i = offset.startOffset(); i <= offset.endOffset(); i++) { + if (seen.contains(i)) + return true; + } + return false; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java (working copy) @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.concordance.charoffsets; + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.index.StoredDocument; + +/** + * TokenCharOffsetsReader that captures character offsets by + * reanalyzing a field. + */ +public class ReanalyzingTokenCharOffsetsReader implements TokenCharOffsetsReader { + + //TODO: figure out how to stop analysis after "getLast()" request is hit + private final static int GOT_ALL_REQUESTS = -2; + private Analyzer baseAnalyzer; + + public ReanalyzingTokenCharOffsetsReader(Analyzer analyzer) { + this.baseAnalyzer = analyzer; + } + + @Override + public TokenCharOffsetResults getTokenCharOffsetResults + (StoredDocument d, String fieldName, TokenCharOffsetRequests requests, TokenCharOffsetResults results) + throws IOException { + int fieldIndex = 0; + int currInd = -1; + int gap = baseAnalyzer.getPositionIncrementGap(fieldName); + + for (String fieldValue : d.getValues(fieldName)) { + + currInd = addFieldValue(fieldIndex, currInd, fieldValue, requests, results); + if (currInd == GOT_ALL_REQUESTS) { + break; + } + currInd += gap; + fieldIndex++; + } + return results; + } + + private int addFieldValue(int fieldIndex, int currInd, + String fieldValue, TokenCharOffsetRequests requests, + TokenCharOffsetResults results) throws IOException { + TokenStream stream = baseAnalyzer.tokenStream("", fieldValue); + //stream = new LimitTokenCountFilter(stream, requests.getLast()+1-currInd); + stream.reset(); + + int defaultInc = 1; + + CharTermAttribute termAtt = + stream.getAttribute( + org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + OffsetAttribute offsetAtt = + stream.getAttribute( + 
org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class); + PositionIncrementAttribute incAtt = null; + if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) { + + incAtt = stream.getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class); + } + + while (stream.incrementToken()) { + + currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc; + if (requests.contains(currInd)) { + results.add(currInd, fieldIndex, offsetAtt.startOffset(), offsetAtt.endOffset(), termAtt.toString()); + } + if (currInd > requests.getLast()) { + //TODO: Is there a way to avoid this? Or, is this an imaginary performance hit? + while (stream.incrementToken()) { + //NOOP: clear stream + } + stream.end(); + stream.close(); + return GOT_ALL_REQUESTS; + } + } + stream.end(); + stream.close(); + return currInd; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java (working copy) @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.concordance.charoffsets; + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * Simple util class to get List of String from reader+analyzer + */ +public class SimpleAnalyzerUtil { + private final static String DEFAULT_FIELD = "FIELD"; + + + public static List getTermStrings(Reader reader, Analyzer analyzer) throws IOException { + List terms = new ArrayList<>(); + return getTermStrings(reader, analyzer, terms); + } + /** + * allows reuse of terms, this method calls terms.clear() before adding new terms + */ + public static List getTermStrings(Reader reader, Analyzer analyzer, List terms) throws IOException { + if (terms == null) { + terms = new ArrayList<>(); + } + terms.clear(); + TokenStream stream = analyzer.tokenStream(DEFAULT_FIELD, reader); + stream.reset(); + CharTermAttribute termAtt = stream.getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + + while (stream.incrementToken()) { + terms.add(termAtt.toString()); + } + stream.end(); + stream.close(); + return terms; + } + +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java =================================================================== --- 
/**
 * Thrown when a token offset identified by {@code .getSpans()} is not found
 * in the TokenCharOffsetResults.
 *
 * The typical cause is a mismatch between the analyzers used at index time
 * and at search time. Because that indicates something has gone badly wrong,
 * it warrants its own exception type.
 */
public class TargetTokenNotFoundException extends Exception {

  private static final long serialVersionUID = 1L;

  /** @param message detail describing the missing token offset */
  public TargetTokenNotFoundException(String message) {
    super(message);
  }
}
import java.util.BitSet;

/**
 * Util class used to specify the tokens for which character offsets are
 * requested. Backed by a BitSet; also tracks the highest requested token
 * offset so that a reader can stop analysis once it passes the last request.
 */
public class TokenCharOffsetRequests {

  private final BitSet set = new BitSet();
  // highest token offset requested so far; -1 when there are no requests
  private int last = -1;

  /** @return true if character offsets were requested for token offset i */
  public boolean contains(int i) {
    return set.get(i);
  }

  /** Requests every token offset from start through end, inclusive. */
  public void add(int start, int end) {
    for (int i = start; i <= end; i++) {
      add(i);
    }
  }

  /** Requests a single token offset. */
  public void add(int i) {
    set.set(i);
    last = Math.max(last, i);
  }

  /** Removes all requests and resets {@link #getLast()} to -1. */
  public void clear() {
    set.clear();
    // Reset the high-water mark as well; previously it kept the value from
    // the cleared requests, inconsistent with TokenCharOffsetResults.clear().
    last = -1;
  }

  /** @return the highest requested token offset, or -1 if none */
  public int getLast() {
    return last;
  }

  protected BitSet getSet() {
    return set;
  }
}
This + * is useful for when a concordance window ends in a stop word (no term/offset info). + */ +public class TokenCharOffsetResults { + + public final static String NULL_TERM = ""; + public final static int NULL_OFFSET = -1; + // nocommit: is this needed? + // public final static FieldIndexCharacterOffsetPair NULL_FIELDINDEXCHAROFFSETPAIR = new FieldIndexCharacterOffsetPair(-1,-1); + + private BitSet set = new BitSet(); + private int last = -1; + private Map terms = new HashMap<>(); + private Map starts = + new HashMap<>(); + private Map ends = + new HashMap<>(); + + public void add(int tokenOffset, int fieldIndex, int startCharOffset, int endCharOffset, String term) { + addStart(tokenOffset, fieldIndex, startCharOffset); + addEnd(tokenOffset, fieldIndex, endCharOffset); + addTerm(tokenOffset, term); + set.set(tokenOffset); + } + + private void addTerm(int tokenOffset, String term) { + if (term != null) { + terms.put(tokenOffset, term); + } + last = (tokenOffset > last) ? tokenOffset : last; + } + + private void addStart(int tokenOffset, int fieldIndex, int charOffset) { + starts.put(tokenOffset, new FieldIndexCharacterOffsetPair(fieldIndex, charOffset)); + last = (tokenOffset > last) ? tokenOffset : last; + } + private void addEnd(int tokenOffset, int fieldIndex, int charOffset) { + ends.put(tokenOffset, new FieldIndexCharacterOffsetPair(fieldIndex,charOffset)); + last = (tokenOffset > last) ? 
tokenOffset : last; + } + + public int getCharacterOffsetStart(int tokenOffset) { + FieldIndexCharacterOffsetPair cand = starts.get(tokenOffset); + if (cand == null) + return NULL_OFFSET; + + return cand.getCharOffset(); + } + + public int getCharacterOffsetEnd(int tokenOffset) { + FieldIndexCharacterOffsetPair cand = ends.get(tokenOffset); + if (cand == null) + return NULL_OFFSET; + + return cand.getCharOffset(); + + } + + public String getTerm(int tokenOffset) { + String s = terms.get(tokenOffset); + if (s == null) { + return NULL_TERM; + } + return s; + } + + + public int getLast() { + return last; + } + + public void clear() { + terms.clear(); + starts.clear(); + ends.clear(); + last = -1; + set.clear(); + } + protected boolean isEmpty() { + return set.isEmpty(); + } + + private int getClosestToken(int fieldIndex, int start, int stop, Map map) { + if (start < 0 || stop < 0) { + return NULL_OFFSET; + } + if (start == stop) { + return start; + } + if (start < stop) { + for (int i = start ; i <= stop; i++) { + FieldIndexCharacterOffsetPair p = map.get(i); + if (p != null && p.getFieldIndex() == fieldIndex) { + return i; + } + } + } else if (start > stop) { + for (int i = start; i >= stop; i--) { + FieldIndexCharacterOffsetPair p = map.get(i); + if (p != null && p.getFieldIndex() == fieldIndex) { + return i; + } + } + } + return NULL_OFFSET; + } + + public int getClosestCharStart(int fieldIndex, int start, int stop) { + + int i = getClosestToken(fieldIndex, start, stop, starts); + return getCharacterOffsetStart(i); + } + + public int getClosestCharEnd(int fieldIndex, int start, int stop) { + int i = getClosestToken(fieldIndex, start, stop, ends); + + return getCharacterOffsetEnd(i); + } + + protected String getClosestTerm(int fieldIndex, int start, int stop) { + int i = getClosestToken(fieldIndex, start, stop, starts); + return getTerm(i); + } + + /* + * return: -1 if + */ + public int getFieldIndex(int tokenOffset) { + FieldIndexCharacterOffsetPair p = 
starts.get(tokenOffset); + if (p == null) { + return NULL_OFFSET; + } + return p.getFieldIndex(); + } + protected String debugToString() { + StringBuilder sb = new StringBuilder(); + for (Integer i : terms.keySet()) { + sb.append(i).append(" : ").append(terms.get(i)).append(" : "); + sb.append(starts.get(i)).append(" : ").append(ends.get(i)).append("\n"); + } + return sb.toString(); + } + + protected BitSet getSet() { + return set; + } +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java (working copy) @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.search.concordance.charoffsets; + +import java.io.IOException; + +import org.apache.lucene.index.StoredDocument; + +/** + * Interface to allow flexibility/optimizations in returning character offsets for + * tokens + */ +public interface TokenCharOffsetsReader { + + TokenCharOffsetResults getTokenCharOffsetResults + (StoredDocument document, String fieldName, TokenCharOffsetRequests requests, TokenCharOffsetResults results) + throws IOException; +} Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html =================================================================== --- lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html (revision 0) +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html (working copy) @@ -0,0 +1,22 @@ + + + + +ConcordanceSearcher performs a search on an index and returns concordance windows. + + Index: lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java =================================================================== --- lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java (revision 0) +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java (working copy) @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; + +public class ConcordanceTestUtils extends LuceneTestCase { + public final static String FIELD = "content"; + + + public static Directory getDirectory(Analyzer analyzer, String[] vals) throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs + (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); + + for (String s : vals) { + Document d = new Document(); + d.add(newTextField(FIELD, s, Field.Store.YES)); + writer.addDocument(d); + + } + writer.close(); + return directory; + } + + public static Directory getDirectory(Analyzer analyzer, List input) throws IOException { + + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(random(), 
analyzer).setMaxBufferedDocs + (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); + for (String[] vals : input) { + Document d = new Document(); + for (String s : vals) { + d.add(newTextField(FIELD, s, Field.Store.YES)); + } + writer.addDocument(d); + + } + writer.close(); + return directory; + } + + public static Analyzer getAnalyzer(final CharacterRunAutomaton stops, final int posIncGap) { + //stops will usually be either: + //MockTokenFilter.EMPTY_STOPSET; + //MockTokenFilter.ENGLISH_STOPSET + return new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); + + TokenFilter filter = new MockTokenFilter(tokenizer, stops); + return new TokenStreamComponents(tokenizer, filter); + } + @Override + public int getPositionIncrementGap(String fieldName) { + return posIncGap; + } + }; + } +} Index: lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java =================================================================== --- lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java (revision 0) +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java (working copy) @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.concordance; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + + +public class TestConcordanceSearcher extends LuceneTestCase { + + private final static DocumentMetadataExtractor metadataExtractor = new DocumentMetadataExtractor() { + private final Set fields = new HashSet<>(); + private final Map data = new HashMap<>(); + @Override + public Set getFieldSelector() { + return fields; + } + + @Override + public Map extract(StoredDocument d) { + return data; + } + + }; + @BeforeClass + public static void beforeClass() throws Exception { + 
//NOOP for now + } + + @AfterClass + public static void afterClass() throws Exception { + //NOOP for now + } + + @Test + public void testSimple() throws Exception { + String[] docs = new String[] { + "a b c a b c", + "c b a c b a" + }; + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + ConcordanceSearcher searcher = new ConcordanceSearcher(); + SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "a")); + + config.setMaxWindows(3); + ConcordanceResults results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + + assertEquals(3, results.getWindows().size()); + + config.setMaxWindows(Integer.MAX_VALUE); + results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + + //test result size + assertEquals(4, results.getWindows().size()); + + //test result with sort order = pre + List windows = results.getSortedWindows(); + String[] pres = new String[] { + "", + "c b", + "c b a c b", + "a b c" + }; + String[] posts = new String[] { + " b c a b c", + " c b a", + "", + " b c" + }; + + for (int i = 0; i < windows.size(); i++) { + ConcordanceWindow w = windows.get(i); + + assertEquals(pres[i], w.getPre()); + assertEquals(posts[i], w.getPost()); + } + + //test sort order post + //sort key is built at search time, so must re-search + config.setSortOrder(ConcordanceSortOrder.POST); + results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + + windows = results.getSortedWindows(); + + posts = new String[] { + "", + " b c", + " b c a b c", + " c b a", + }; + for (int i = 0; i < windows.size(); i++) { + ConcordanceWindow w = windows.get(i); + assertEquals(posts[i], w.getPost()); + } + reader.close(); + directory.close(); + } + + @Test + public 
void testSimpleMultiValuedField() throws Exception { + String[] doc = new String[] { + "a b c a b c", + "c b a c b a" + }; + List docs = new ArrayList<>(); + docs.add(doc); + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + ConcordanceSearcher searcher = new ConcordanceSearcher(); + SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "a")); + + + ConcordanceResults results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + + //test result size + assertEquals(4, results.getWindows().size()); + + //test result with sort order = pre + List windows = results.getSortedWindows(); + String[] pres = new String[] { + "", + "c b", + "c b a c b", + "a b c" + }; + String[] posts = new String[] { + " b c a b c", + " c b a", + "", + " b c" + }; + + for (int i = 0; i < windows.size(); i++) { + ConcordanceWindow w = windows.get(i); + + assertEquals(pres[i], w.getPre()); + + assertEquals(posts[i], w.getPost()); + } + + //test sort order post + //sort key is built at search time, so must re-search + config.setSortOrder(ConcordanceSortOrder.POST); + results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + + windows = results.getSortedWindows(); + + posts = new String[] { + "", + " b c", + " b c a b c", + " c b a", + }; + for (int i = 0; i < windows.size(); i++) { + ConcordanceWindow w = windows.get(i); + assertEquals(posts[i], w.getPost()); + } + reader.close(); + directory.close(); + } + + @Test + public void testWindowLengths() throws Exception { + String[] doc = new String[] { + "a b c d e f g", + }; + List docs = new ArrayList<>(); + docs.add(doc); + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + Directory directory = 
ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + + ConcordanceSearcher searcher = new ConcordanceSearcher(); + SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "d")); + + String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"}; + String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"}; + + for (int preLen = 0; preLen < pres.length; preLen++) { + for (int postLen = 0; postLen < posts.length; postLen++) { + config.setTokensBefore(preLen); + config.setTokensAfter(postLen); + ConcordanceResults results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + //ConcordanceWindow w = results.getWindows().get(0); + ConcordanceWindow w = results.getWindows().iterator().next();//java 1.7 difference? + assertEquals(preLen+" : "+postLen, w.getPre(), pres[preLen]); + assertEquals(preLen+" : "+postLen, w.getPost(), posts[postLen]); + } + } + + reader.close(); + directory.close(); + + } + + @Test + public void testClockworkOrangMultiValuedFieldProblem() throws Exception { + /* test handling of target spread out over several + * indices in a multivalued field array + */ + String[] doc = new String[] { + "a b c a b the", + "clockwork", + "orange b a c b a" + }; + List docs = new ArrayList<>(); + docs.add(doc); + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0); + Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + config.setTokensBefore(3); + config.setTokensAfter(3); + + ConcordanceSearcher searcher = new ConcordanceSearcher(); + SpanQuery q1 = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "the")); + SpanQuery q2 = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "clockwork")); + 
SpanQuery q3 = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "orange")); + SpanQuery q = new SpanNearQuery(new SpanQuery[] {q1, q2, q3}, 3, true); + ConcordanceResults results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + assertEquals(1, results.getWindows().size()); + //ConcordanceWindow w = results.getWindows().get(0); + ConcordanceWindow w = results.getWindows().iterator().next(); + assertEquals("target", "the clockwork orange", w.getTarget()); + assertEquals("pre", "c a b", w.getPre()); + assertEquals("post", " b a c", w.getPost()); + + reader.close(); + directory.close(); + + //test hit even over long intra-field gap + analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + reader = DirectoryReader.open(directory); + config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + config.setTokensBefore(3); + config.setTokensAfter(3); + + searcher = new ConcordanceSearcher(); + q = new SpanNearQuery(new SpanQuery[] {q1, q2, q3}, 120, true); + results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + + assertEquals(1, results.getWindows().size()); + //w = results.getWindows().get(0); + w = results.getWindows().iterator().next(); + assertEquals("target", "the clockwork orange", w.getTarget()); + assertEquals("pre", "c a b", w.getPre()); + assertEquals("post", " b a c", w.getPost()); + + reader.close(); + directory.close(); + //test miss + analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + reader = DirectoryReader.open(directory); + config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + + searcher = new ConcordanceSearcher(); + q = new SpanNearQuery(new SpanQuery[] {q1, q2, q3}, 5, true); + results = searcher.search(reader, q, null, analyzer, config, metadataExtractor); + + assertEquals(0, 
results.getWindows().size()); + + reader.close(); + directory.close(); + } + + @Test + public void testWithStops() throws Exception { + String[] docs = new String[] { + "a b the d e the f", + "g h the d the j" + }; + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.ENGLISH_STOPSET, 50); + Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + config.setTokensBefore(2); + config.setTokensAfter(2); + + ConcordanceSearcher searcher = new ConcordanceSearcher(); + SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "d")); + ConcordanceResults results = searcher.search(reader, q, null, analyzer, + config, metadataExtractor); + List windows = results.getSortedWindows(); + assertEquals(2, windows.size()); + + //the second word after the target is a stop word + //this post-component of this window should only go to the first word after the target + assertEquals("b the:d: e", windows.get(0).toString()); + + assertEquals("h the:d: the j", windows.get(1).toString()); + + reader.close(); + directory.close(); + } + + @Test + public void testBasicStandardQueryConversion() throws Exception { + String[] docs = new String[] { + "a b c a b c", + "c b a c b a d e a", + "c b a c b a e a b c a" + }; + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + ConcordanceSearcher searcher = new ConcordanceSearcher(); + BooleanQuery q = new BooleanQuery(); + q.add(new TermQuery(new Term(ConcordanceTestUtils.FIELD, "a")), Occur.MUST); + q.add(new TermQuery(new Term(ConcordanceTestUtils.FIELD, "d")), Occur.MUST_NOT); + + config.setMaxWindows(10); + 
ConcordanceResults results = searcher.search(reader, q, null, analyzer, config, metadataExtractor); + //shouldn't include document with "d" + assertEquals(6, results.getWindows().size()); + + //should only include document with "e" and not "d" + Filter filter = new QueryWrapperFilter(new TermQuery(new Term(ConcordanceTestUtils.FIELD, "e"))); + results = searcher.search(reader, q, filter, analyzer, config, metadataExtractor); + assertEquals(4, results.getWindows().size()); + + reader.close(); + directory.close(); + } + + @Test + public void testMismatchingFieldsInStandardQueryConversion() throws Exception { + //tests what happens if a Query doesn't contain a term in the "span" field + //in the searcher...should be no documents returned. + + String[] docs = new String[] { + "a b c a b c", + }; + Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50); + Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD); + ConcordanceSearcher searcher = new ConcordanceSearcher(); + + Query q = new TermQuery(new Term("_"+ConcordanceTestUtils.FIELD, "a")); + + int windowCount = -1; + + ConcordanceResults results = searcher.search(reader, q, null, analyzer, config, metadataExtractor); + windowCount = results.getWindows().size(); + assertEquals(0, windowCount); + reader.close(); + directory.close(); + } +} Index: lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java =================================================================== --- lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java (revision 0) +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java (working copy) @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.concordance; + +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + + +public class TestSpanQueryConverter extends LuceneTestCase { + private static IndexReader reader; + private static Directory directory; + private static Analyzer analyzer; + private final static String FIELD = "field"; + + @BeforeClass + public static void beforeClass() throws Exception { + analyzer = new Analyzer() { + @Override + public TokenStreamComponents 
createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, tokenizer); + } + }; + + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs + (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { + "a b c a b c", + "c b a c b a" + }; + for (String val : docs) { + Document doc = new Document(); + doc.add(newTextField(FIELD, val, Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + directory = null; + analyzer = null; + } + + @Test + public void testBooleanTwoFields() throws Exception { + + Query q1 = new TermQuery(new Term(FIELD, "a")); + Query q2 = new TermQuery(new Term("another_field", "b")); + BooleanQuery q = new BooleanQuery(); + q.add(q1, Occur.SHOULD); + q.add(q2, Occur.SHOULD); + SpanQueryConverter converter = new SpanQueryConverter(); + boolean success = true; + try { + SpanQuery span = converter.convert(FIELD, q); + } catch (IllegalArgumentException e) { + success = false; + } + assertEquals(true, success); + Query q3 = new TermQuery(new Term("another_field", "c")); + BooleanQuery bq2 = new BooleanQuery(); + bq2.add(q, Occur.MUST); + bq2.add(q3, Occur.SHOULD); + try { + SpanQuery span = converter.convert(FIELD, bq2); + } catch (IllegalArgumentException e) { + success = false; + } + assertEquals(true, success); + } +} Index: lucene/module-build.xml =================================================================== --- lucene/module-build.xml (revision 1632428) +++ lucene/module-build.xml (working copy) @@ -628,4 +628,27 @@ + + + + + + + + + + + + + + + + + + + + + + +