Index: dev-tools/idea/.idea/ant.xml
===================================================================
--- dev-tools/idea/.idea/ant.xml (revision 1632428)
+++ dev-tools/idea/.idea/ant.xml (working copy)
@@ -18,6 +18,7 @@
+
Index: dev-tools/idea/.idea/modules.xml
===================================================================
--- dev-tools/idea/.idea/modules.xml (revision 1632428)
+++ dev-tools/idea/.idea/modules.xml (working copy)
@@ -23,6 +23,7 @@
+
Index: dev-tools/idea/.idea/workspace.xml
===================================================================
--- dev-tools/idea/.idea/workspace.xml (revision 1632428)
+++ dev-tools/idea/.idea/workspace.xml (working copy)
@@ -108,6 +108,14 @@
+
+
+
+
+
+
+
+
@@ -325,7 +333,7 @@
-
+
@@ -339,32 +347,33 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: dev-tools/idea/lucene/concordance/concordance.iml
===================================================================
--- dev-tools/idea/lucene/concordance/concordance.iml (revision 0)
+++ dev-tools/idea/lucene/concordance/concordance.iml (working copy)
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Property changes on: dev-tools/idea/lucene/concordance/concordance.iml
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: dev-tools/maven/lucene/concordance/pom.xml.template
===================================================================
--- dev-tools/maven/lucene/concordance/pom.xml.template (revision 0)
+++ dev-tools/maven/lucene/concordance/pom.xml.template (working copy)
@@ -0,0 +1,68 @@
+
+
+ 4.0.0
+
+ org.apache.lucene
+ lucene-parent
+ @version@
+ ../pom.xml
+
+ org.apache.lucene
+ lucene-concordance
+ jar
+ Lucene Concordance
+ Lucene Concordance Module
+
+ lucene/concordance
+ ../../..
+ ${relative-top-level}/${module-directory}
+
+
+ scm:svn:${vc-anonymous-base-url}/${module-directory}
+ scm:svn:${vc-dev-base-url}/${module-directory}
+ ${vc-browse-base-url}/${module-directory}
+
+
+
+
+ org.apache.lucene
+ lucene-test-framework
+ test
+
+ @lucene-concordance.internal.dependencies@
+ @lucene-concordance.external.dependencies@
+ @lucene-concordance.internal.test.dependencies@
+ @lucene-concordance.external.test.dependencies@
+
+
+ ${module-path}/src/java
+ ${module-path}/src/test
+
+
+ ${project.build.testSourceDirectory}
+
+ **/*.java
+
+
+
+
+
Property changes on: dev-tools/maven/lucene/concordance/pom.xml.template
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: dev-tools/maven/lucene/pom.xml.template
===================================================================
--- dev-tools/maven/lucene/pom.xml.template (revision 1632428)
+++ dev-tools/maven/lucene/pom.xml.template (working copy)
@@ -47,6 +47,7 @@
analysisbenchmarkclassification
+ concordancedemoexpressionsfacet
Index: lucene/build.xml
===================================================================
--- lucene/build.xml (revision 1632428)
+++ lucene/build.xml (working copy)
@@ -173,6 +173,7 @@
+
Index: lucene/concordance
===================================================================
--- lucene/concordance (revision 1632428)
+++ lucene/concordance (working copy)
Property changes on: lucene/concordance
___________________________________________________________________
Added: svn:ignore
## -0,0 +1 ##
+*.iml
Index: lucene/concordance/build.xml
===================================================================
--- lucene/concordance/build.xml (revision 0)
+++ lucene/concordance/build.xml (working copy)
@@ -0,0 +1,40 @@
+
+
+
+
+
+ Executes concordance search
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: lucene/concordance/ivy.xml
===================================================================
--- lucene/concordance/ivy.xml (revision 0)
+++ lucene/concordance/ivy.xml (working copy)
@@ -0,0 +1,21 @@
+
+
+
+
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/BasicMetadataExtractor.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/BasicMetadataExtractor.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/BasicMetadataExtractor.java (working copy)
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Set;
+
+import org.apache.lucene.index.StoredDocument;
+
+
+/**
+ * Basic implementation of {@link DocumentMetadataExtractor} that copies the
+ * first value of each selected stored field into the metadata map.
+ */
+public class BasicMetadataExtractor implements DocumentMetadataExtractor {
+
+  private final Set<String> fields = new HashSet<>();
+
+  /** Replaces the current set of fields to extract with {@code f}. */
+  public void setFieldSelector(Set<String> f) {
+    fields.clear();
+    fields.addAll(f);
+  }
+
+  @Override
+  public Set<String> getFieldSelector() {
+    return Collections.unmodifiableSet(fields);
+  }
+
+  @Override
+  public Map<String, String> extract(StoredDocument d) {
+    //only takes the first value in a multi-valued field!!!
+    Map<String, String> map = new HashMap<>();
+    for (String fieldName : getFieldSelector()) {
+      String[] fieldValues = d.getValues(fieldName);
+      if (fieldValues != null && fieldValues.length > 0) {
+        map.put(fieldName, fieldValues[0]);
+      }
+    }
+    return map;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceConfig.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceConfig.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceConfig.java (working copy)
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+/**
+ * Configuration for a concordance search: how many tokens of context to
+ * capture around each target, display-size limits, the field to search
+ * and the sort order for the returned windows.
+ */
+public class ConcordanceConfig {
+
+  private final static int DEFAULT_TOKENS_BEFORE = 10;
+  private final static int DEFAULT_TOKENS_AFTER = 10;
+  private final static int DEFAULT_MAX_WINDOWS = 100000;
+  private final static int DEFAULT_MAX_TARGET_DISPLAY_SIZE_CHARS = 1000;
+  private final static int DEFAULT_MAX_CONTEXT_DISPLAY_SIZE_CHARS = 10000;
+
+  private final static ConcordanceSortOrder DEFAULT_SORT_ORDER = ConcordanceSortOrder.PRE;
+
+  /**
+   * Number of tokens to capture before the target
+   */
+  private int tokensBefore = DEFAULT_TOKENS_BEFORE;
+
+  /**
+   * Number of tokens to capture after the target
+   */
+  private int tokensAfter = DEFAULT_TOKENS_AFTER;
+
+  /**
+   * Maximum number of windows to retrieve
+   */
+  private int maxWindows = DEFAULT_MAX_WINDOWS;
+
+  /**
+   * Maximum target length in characters.
+   */
+  private int maxTargetDisplaySizeChars = DEFAULT_MAX_TARGET_DISPLAY_SIZE_CHARS;
+
+  /**
+   * Dual purpose:
+   * 1) Maximum length in characters for the string before the target {@see #ConcordanceWindow.pre}.
+   * 2) Maximum length in characters for the string after the target {@see #ConcordanceWindow.post}.
+   */
+  private int maxContextDisplaySizeChars = DEFAULT_MAX_CONTEXT_DISPLAY_SIZE_CHARS;
+
+  /**
+   * field to search
+   */
+  private final String fieldName;
+
+  /**
+   * The results of a SpanQuery in some versions of Lucene allow
+   * for target overlaps.
+   */
+  private boolean allowTargetOverlaps = false;
+
+  /**
+   * Sort order for the windows
+   */
+  private ConcordanceSortOrder sortOrder = DEFAULT_SORT_ORDER;
+
+  /**
+   * @param fieldName field to search; immutable for the life of this config
+   */
+  public ConcordanceConfig(String fieldName) {
+    this.fieldName = fieldName;
+  }
+
+  public int getTokensBefore() {
+    return tokensBefore;
+  }
+
+  public void setTokensBefore(int tokensBefore) {
+    this.tokensBefore = tokensBefore;
+  }
+
+  public int getTokensAfter() {
+    return tokensAfter;
+  }
+
+  public void setTokensAfter(int tokensAfter) {
+    this.tokensAfter = tokensAfter;
+  }
+
+  public int getMaxWindows() {
+    return maxWindows;
+  }
+
+  public void setMaxWindows(int maxWindows) {
+    this.maxWindows = maxWindows;
+  }
+
+  public int getMaxTargetDisplaySizeChars() {
+    return maxTargetDisplaySizeChars;
+  }
+
+  public void setMaxTargetDisplaySizeChars(int maxTargetDisplaySizeChars) {
+    this.maxTargetDisplaySizeChars = maxTargetDisplaySizeChars;
+  }
+
+  public int getMaxContextDisplaySizeChars() {
+    return maxContextDisplaySizeChars;
+  }
+
+  public void setMaxContextDisplaySizeChars(int maxContextDisplaySizeChars) {
+    this.maxContextDisplaySizeChars = maxContextDisplaySizeChars;
+  }
+
+  public String getFieldName() {
+    return fieldName;
+  }
+
+  public boolean isAllowTargetOverlaps() {
+    return allowTargetOverlaps;
+  }
+
+  public void setAllowTargetOverlaps(boolean allowTargetOverlaps) {
+    this.allowTargetOverlaps = allowTargetOverlaps;
+  }
+
+  public ConcordanceSortOrder getSortOrder() {
+    return sortOrder;
+  }
+
+  public void setSortOrder(ConcordanceSortOrder sortOrder) {
+    this.sortOrder = sortOrder;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("sort order: ").append(sortOrder.toString()).append("\n");
+    sb.append("tokens before: ").append(tokensBefore).append("\n");
+    sb.append("tokens after: ").append(tokensAfter).append("\n");
+    sb.append("max results: ").append(maxWindows).append("\n");
+    sb.append("maxTargetDisplaySizeChars: ").append(maxTargetDisplaySizeChars).append("\n");
+    sb.append("maxContextDisplaySizeChars: ").append(maxContextDisplaySizeChars).append("\n");
+
+    return sb.toString();
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceResults.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceResults.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceResults.java (working copy)
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Results of a concordance search.  This includes the windows and summary
+ * information about the search: document counts and whether the maximum
+ * number of windows was hit.
+ */
+public class ConcordanceResults {
+  private final List<ConcordanceWindow> windows;
+  private final boolean hitMax; //did the search hit the maximum number of windows
+
+  private int numTotalWindows;
+  private int numTotalDocs;
+  private FixedBitSet docIDs; //underlying Lucene document ids that had a hit.
+
+  /**
+   * @param docIDs bit set of Lucene document ids that had a hit; defensively cloned
+   */
+  public ConcordanceResults(List<ConcordanceWindow> windows, FixedBitSet docIDs, int numTotalDocs, int numTotalWindows, boolean hitMax) {
+    this.windows = windows;
+    this.hitMax = hitMax;
+    this.docIDs = docIDs.clone();
+    this.numTotalWindows = numTotalWindows;
+    this.numTotalDocs = numTotalDocs;
+  }
+
+  /**
+   * Sorts the windows according to the windows' sortKey and returns the list.
+   * Does not perform defensive copying of list, and the underlying list's order is changed
+   * by this call.
+   */
+  public List<ConcordanceWindow> getSortedWindows() {
+    Collections.sort(windows, new ConcordanceSorter());
+    return windows;
+  }
+
+  /**
+   * Gets the windows in whatever order they are currently in...
+   * could be insertion order or could be sorted order depending on whether
+   * {@link #getSortedWindows()} has been called.
+   *
+   * {@see #getSortedWindows()}
+   */
+  public List<ConcordanceWindow> getWindows() {
+    return windows;
+  }
+
+  public boolean getHitMax() {
+    return hitMax;
+  }
+
+  public int getNumWindows() {
+    return windows.size();
+  }
+
+  public long getNumDocs() {
+    return docIDs.cardinality();
+  }
+
+  public int getNumTotalDocs() {
+    return numTotalDocs;
+  }
+
+  public int getNumTotalWindows() {
+    return numTotalWindows;
+  }
+
+  /**
+   * The caller must beware not to add duplicate windows. This call does not check
+   * for duplicates.
+   *
+   * The purpose of this is to allow a union of concordance results from multiple
+   * concordance searches (e.g. concordance results
+   * across different fields). This assumes that the underlying Lucene document id
+   * has not changed across the multiple searches!!!
+   */
+  public void addAll(ConcordanceResults results) {
+    windows.addAll(results.getWindows());
+    docIDs.or(results.getDocIDs());
+
+    numTotalWindows = windows.size();
+    numTotalDocs += results.numTotalDocs;
+  }
+
+  public FixedBitSet getDocIDs() {
+    return docIDs;
+  }
+
+  public void setDocIDs(FixedBitSet docIDs) {
+    this.docIDs = docIDs;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java (working copy)
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.queries.BooleanFilter;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsets;
+import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsetsIterator;
+import org.apache.lucene.search.concordance.charoffsets.OffsetLengthStartComparator;
+import org.apache.lucene.search.concordance.charoffsets.OffsetUtil;
+import org.apache.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader;
+import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetsReader;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetResults;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Searches an IndexReader and returns concordance windows via ConcordanceResults.
+ */
+public class ConcordanceSearcher {
+ /**
+ *
+ * @param query if SpanQuery, this gets passed through. If a regular Query, the Query is first
+ * converted to a SpanQuery and the filter is modified to include the original Query.
+ * @param metadataExtractor a simple extractor that a user can implement to pull out custom
+ * metadata from the document for each window.
+ */
+ public ConcordanceResults search(IndexReader reader, Query query, Filter filter, Analyzer analyzer,
+ ConcordanceConfig config, DocumentMetadataExtractor metadataExtractor)
+ throws TargetTokenNotFoundException, IllegalArgumentException, IOException {
+
+ if (query instanceof SpanQuery) {
+ //pass through
+ return searchSpan(reader, (SpanQuery)query, filter, analyzer, config, metadataExtractor);
+ } else {
+ //convert regular query to a SpanQuery.
+ SpanQueryConverter converter = new SpanQueryConverter();
+ SpanQuery spanQuery = converter.convert(config.getFieldName(), query);
+
+ Filter origQueryFilter = new QueryWrapperFilter(query);
+ Filter updatedFilter = origQueryFilter;
+
+ if (filter != null) {
+ BooleanFilter combinedFilter = new BooleanFilter();
+ combinedFilter.add(origQueryFilter, Occur.MUST);
+ combinedFilter.add(filter, Occur.MUST);
+ updatedFilter = combinedFilter;
+ }
+ return searchSpan(reader, spanQuery, updatedFilter, analyzer, config, metadataExtractor);
+ }
+ }
+
+ /**
+ * Like {@link #search(IndexReader, Query, Filter, Analyzer, ConcordanceConfig, DocumentMetadataExtractor)},
+ * but this takes an actual SpanQuery.
+ */
+ public ConcordanceResults searchSpan(IndexReader reader, SpanQuery spanQuery, Filter filter, Analyzer analyzer,
+ ConcordanceConfig config, DocumentMetadataExtractor metadataExtractor)
+ throws TargetTokenNotFoundException, IllegalArgumentException, IOException {
+
+ spanQuery = (SpanQuery)spanQuery.rewrite(reader);
+ DocTokenOffsetsIterator itr = new DocTokenOffsetsIterator();
+ Set fields = new HashSet<>(metadataExtractor.getFieldSelector());
+ fields.add(config.getFieldName());
+ itr.reset(spanQuery, filter, reader, fields);
+ return buildResults(itr, reader, analyzer, config, metadataExtractor);
+ }
+
+
+ private ConcordanceResults buildResults(DocTokenOffsetsIterator itr, IndexReader reader, Analyzer analyzer,
+ ConcordanceConfig config, DocumentMetadataExtractor metadataExtractor)
+ throws IllegalArgumentException, TargetTokenNotFoundException, IOException {
+ List windows = new LinkedList<>();
+
+ boolean stop = false;
+ int totalNumDocs = reader.numDocs();
+
+ int numTotalWindows = 0;
+
+ TokenCharOffsetRequests requests;
+ WindowBuilder windowBuilder = new WindowBuilder();
+ TokenCharOffsetsReader tokenOffsetsRecordReader = new ReanalyzingTokenCharOffsetsReader(analyzer);
+
+ TokenCharOffsetResults offsetResults = new TokenCharOffsetResults();
+ FixedBitSet docIDs = new FixedBitSet(reader.maxDoc());
+ DocTokenOffsets result = null;
+ OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator();
+ while (itr.next() && ! stop) {
+ result = itr.getDocTokenOffsets();
+ StoredDocument document = result.getDocument();
+
+ docIDs.set(result.getUniqueDocId());
+
+ String[] fieldValues = document.getValues(config.getFieldName());
+ if (fieldValues.length == 0) {
+ throw new IllegalArgumentException("did you forget to load or specify the correct content field?!");
+ }
+
+ Map metadata = metadataExtractor.extract(document);
+ List offsets = result.getOffsets();
+ if (! config.isAllowTargetOverlaps()) {
+ //remove overlapping hits!!!
+ offsets = OffsetUtil.removeOverlapsAndSort(offsets, offsetLengthStartComparator, null);
+ }
+ //get the required character offsets
+ requests = ConcordanceSearcherUtil.getCharOffsetRequests(offsets, config);
+ offsetResults.clear();
+
+ offsetResults = tokenOffsetsRecordReader.getTokenCharOffsetResults
+ (document, config.getFieldName(), requests, offsetResults);
+
+ for (OffsetAttribute offset : offsets) {
+
+ ConcordanceWindow w = windowBuilder.buildConcordanceWindow
+ (result.getUniqueDocId(), offset.startOffset(),
+ offset.endOffset()-1, metadata, config, fieldValues, offsetResults);
+
+ windows.add(w);
+ numTotalWindows++;
+ if (config.getMaxWindows() > -1 && windows.size() >= config.getMaxWindows()) {
+ stop = true;
+ break;
+ }
+ }
+ }
+
+ return new ConcordanceResults(windows, docIDs, totalNumDocs, numTotalWindows, stop);
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java (working copy)
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.util.List;
+
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Static helpers that are generally useful for concordance searching.
+ *
+ * In other applications with variations on the ConcordanceSearcher, it has been useful
+ * to factor out the getCharOffsetRequests.
+ */
+public class ConcordanceSearcherUtil {
+
+  private ConcordanceSearcherUtil() {
+    //utility class; no instances
+  }
+
+  /** Builds requests using the context sizes configured in {@code config}. */
+  public static TokenCharOffsetRequests getCharOffsetRequests(List<OffsetAttribute> offsets, ConcordanceConfig config) {
+    return getCharOffsetRequests(offsets, config.getTokensBefore(), config.getTokensAfter() + 1);
+  }
+
+  /** Builds requests for the target tokens only, with no surrounding context. */
+  public static TokenCharOffsetRequests getCharOffsetRequests(List<OffsetAttribute> offsets) {
+    return getCharOffsetRequests(offsets, 0, 1);
+  }
+
+  /**
+   * For each hit, requests the character offsets of every token from
+   * {@code startOffset - tokensBefore} (floored at 0) up to, but not
+   * including, {@code endOffset + tokensAfter}.
+   */
+  public static TokenCharOffsetRequests getCharOffsetRequests(List<OffsetAttribute> offsets, int tokensBefore, int tokensAfter) {
+    TokenCharOffsetRequests requests = new TokenCharOffsetRequests();
+
+    for (OffsetAttribute offset : offsets) {
+      int start = Math.max(0, offset.startOffset() - tokensBefore);
+      int end = offset.endOffset() + tokensAfter;
+      for (int i = start; i < end; i++) {
+        requests.add(i);
+      }
+    }
+    return requests;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java (working copy)
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+/**
+ * Sort orders available for concordance windows.
+ * The window's sort key is built according to one of these orders.
+ */
+public enum ConcordanceSortOrder {
+  PRE, //sort on the first token before the target, then the second word, etc.
+  POST, //sort on words after the target
+  TARGET_PRE, //sort on the target and then words before the target
+  TARGET_POST, //sort on the target and then words after the target
+  DOC, //sort on the Lucene document id
+  NONE //no sort
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java (working copy)
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+
+/**
+ * Compares {@link ConcordanceWindow}s by their precomputed sort key.
+ */
+public class ConcordanceSorter implements Comparator<ConcordanceWindow>, Serializable {
+
+  private static final long serialVersionUID = 7526472295622776147L;
+
+  @Override
+  public int compare(ConcordanceWindow w1, ConcordanceWindow w2) {
+    return w1.getSortKey().compareTo(w2.getSortKey());
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java (working copy)
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance;
+
+import java.util.Map;
+
+/**
+ * Key element in a concordance view of data.
+ * A window consists of the words before a target term (pre), the target term and then the words
+ * after the target term (post). A window also has a sort key to allow for various methods
+ * of sorting.
+ *
+ * For various applications, it has also been useful to store the (admittedly ephemeral)
+ * Lucene document id, character offset (start and end) of the full window
+ * as well as metadata from the document for the given window.
+ *
+ * This class is experimental and may change in incompatible ways in the future.
+ *
+ * Areas for improvement:
+ * 1) convert sortKey to an array of Comparables
+ * 2) ...
+ */
+public class ConcordanceWindow {
+
+ private final String sortKey;
+ private final String pre;
+ private final String target;
+ private final String post;
+ private final Map metadata;
+ private final int charStart;
+ private final int charEnd;
+ private final long docID;
+
+ public ConcordanceWindow(long docID, int charStart, int charEnd,
+ String pre, String target, String post, String sortKey, Map metadata) {
+ this.pre = pre;
+ this.target = target;
+ this.post = post;
+ this.docID = docID;
+ this.charStart = charStart;
+ this.charEnd = charEnd;
+ this.metadata = metadata;
+ this.sortKey = sortKey;
+ }
+ public long getDocID() {
+ return docID;
+ }
+
+ public int getStart() {
+ return charStart;
+ }
+ public int getEnd() {
+ return charEnd;
+ }
+ public Map getMetadata() {
+ return metadata;
+ }
+
+ public String getPre() {
+ return pre;
+ }
+ public String getPost() {
+ return post;
+ }
+ public String getTarget() {
+ return target;
+ }
+ public int getSize() {
+ int size = 0;
+ if (pre != null) {
+ size += pre.length();
+ }
+ if (target != null) {
+ size += target.length();
+ }
+ if (post != null) {
+ size += post.length();
+ }
+ return size;
+ }
+ public String getSortKey() {
+ return sortKey;
+ }
+
+ public String toString() {
+ //this assumes left to right language
+ StringBuilder sb = new StringBuilder();
+ sb.append(pre).append(":").append(target).append(":").append(post);
+ return sb.toString();
+ }
+
+ @Override
+ public int hashCode()
+ {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + charEnd;
+ result = prime * result + charStart;
+ result = prime * result + (int) (docID ^ (docID >>> 32));
+ result = prime * result + ((metadata == null) ? 0 : metadata.hashCode());
+ result = prime * result + ((post == null) ? 0 : post.hashCode());
+ result = prime * result + ((pre == null) ? 0 : pre.hashCode());
+ result = prime * result + ((sortKey == null) ? 0 : sortKey.hashCode());
+ result = prime * result + ((target == null) ? 0 : target.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj)
+ {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if ( ! (obj instanceof ConcordanceWindow))
+ return false;
+
+ ConcordanceWindow other = (ConcordanceWindow)obj;
+
+ if (charEnd != other.charEnd)
+ return false;
+ if (charStart != other.charStart)
+ return false;
+ if (docID != other.docID)
+ return false;
+
+ if (metadata == null) {
+ if (other.metadata != null)
+ return false;
+ } else if (!metadata.equals(other.metadata)) {
+ return false;
+ }
+
+ if (post == null) {
+ if (other.post != null)
+ return false;
+ } else if (!post.equals(other.post)) {
+ return false;
+ }
+
+ if (pre == null) {
+ if (other.pre != null)
+ return false;
+ } else if (!pre.equals(other.pre)) {
+ return false;
+ }
+
+ if (sortKey == null) {
+ if (other.sortKey != null)
+ return false;
+ } else if (!sortKey.equals(other.sortKey)) {
+ return false;
+ }
+
+ if (target == null) {
+ if (other.target != null)
+ return false;
+ } else if (!target.equals(other.target)) {
+ return false;
+ }
+
+ return true;
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentMetadataExtractor.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentMetadataExtractor.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentMetadataExtractor.java (working copy)
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance;
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Lightweight interface that extracts document metadata to be stored
+ * with each window that is extracted.
+ *
+ * For now, it can only extract key-value pairs of type String, String.
+ */
+
+public interface DocumentMetadataExtractor {
+ public Set getFieldSelector();
+ public Map extract(StoredDocument document);
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/SpanQueryConverter.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/SpanQueryConverter.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/SpanQueryConverter.java (working copy)
@@ -0,0 +1,262 @@
+package org.apache.lucene.search.concordance;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+public class SpanQueryConverter {
+ /**
+ * Converts a regular query to a {@link SpanQuery} for use in a highlighter.
+ * Because of subtle differences in {@link SpanQuery} and {@link Query}, this {@link SpanQuery} will
+ * not necessarily return the same documents as the initial Query. For example,
+ * the generated SpanQuery will not include clauses of type {@link BooleanClause.Occur#MUST_NOT}.
+ * Also, the {@link SpanQuery} will only cover a single field, whereas the {@link Query} might contain
+ * multiple fields.
+ *
+ * Returns an empty SpanQuery if it can't convert from a {@link Query} to a {@link SpanQuery}.
+ * This can happen for many reasons: e.g. if the Query contains no terms in the requested "field".
+ *
+ * This class does not rewrite the SpanQuery. Consumers are required to rewrite if necessary.
+ *
+ * Much of this code is copied directly from oal.search.highlight.WeightedSpanTermExtractor.
+ * There are some subtle differences.
+ *
+ * @return SpanQuery for use in highlighting; can return empty SpanQuery
+ */
+ public SpanQuery convert(String field, Query query) throws IOException {
+ /* copied nearly verbatim from org.apache.lucene.search.highlight.WeightedSpanTermExtractor
+ * TODO: refactor to avoid duplication of code if possible. Beware: there are some subtle differences.
+ */
+ if (query instanceof SpanQuery) {
+ SpanQuery sq = (SpanQuery)query;
+ if (sq.getField().equals(field)) {
+ return (SpanQuery)query;
+ } else {
+ return getEmptySpanQuery();
+ }
+ } else if (query instanceof BooleanQuery) {
+ BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
+ List spanQs = new ArrayList<>();
+ for (BooleanClause queryClause : queryClauses) {
+ if (!queryClause.isProhibited()) {
+ tryToAdd(field, convert(field, queryClause.getQuery()), spanQs);
+ }
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+ } else if (query instanceof PhraseQuery) {
+ PhraseQuery phraseQuery = ((PhraseQuery) query);
+
+ Term[] phraseQueryTerms = phraseQuery.getTerms();
+ if (phraseQueryTerms.length == 0) {
+ return getEmptySpanQuery();
+ } else if (! phraseQueryTerms[0].field().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
+ for (int i = 0; i < phraseQueryTerms.length; i++) {
+ clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
+ }
+ int slop = phraseQuery.getSlop();
+ int[] positions = phraseQuery.getPositions();
+ // add largest position increment to slop
+ if (positions.length > 0) {
+ int lastPos = positions[0];
+ int largestInc = 0;
+ int sz = positions.length;
+ for (int i = 1; i < sz; i++) {
+ int pos = positions[i];
+ int inc = pos - lastPos;
+ if (inc > largestInc) {
+ largestInc = inc;
+ }
+ lastPos = pos;
+ }
+ if (largestInc > 1) {
+ slop += largestInc;
+ }
+ }
+
+ boolean inorder = false;
+
+ if (slop == 0) {
+ inorder = true;
+ }
+
+ SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
+ sp.setBoost(query.getBoost());
+ return sp;
+ } else if (query instanceof TermQuery) {
+ TermQuery tq = (TermQuery)query;
+ if (tq.getTerm().field().equals(field)) {
+ return new SpanTermQuery(tq.getTerm());
+ } else {
+ return getEmptySpanQuery();
+ }
+ } else if (query instanceof FilteredQuery) {
+ return convert(field, ((FilteredQuery)query).getQuery());
+ } else if (query instanceof ConstantScoreQuery) {
+ return convert(field, ((ConstantScoreQuery) query).getQuery());
+ } else if (query instanceof CommonTermsQuery) {
+ // specialized since rewriting would change the result query
+ // this query is TermContext sensitive.
+ CommonTermsQuery ctq = (CommonTermsQuery)query;
+
+ Set terms = new HashSet<>();
+ ctq.extractTerms(terms);
+ List spanQs = new LinkedList<>();
+
+ for (Term term : terms) {
+ if (term.field().equals(field)) {
+ spanQs.add(new SpanTermQuery(term));
+ }
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+ } else if (query instanceof DisjunctionMaxQuery) {
+ List spanQs = new ArrayList<>();
+ for (Query q : ((DisjunctionMaxQuery)query)) {
+ tryToAdd(field, convert(field, q), spanQs);
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+ } else if (query instanceof MultiPhraseQuery) {
+ final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
+
+ final List termArrays = mpq.getTermArrays();
+ if (termArrays.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (termArrays.size() > 1) {
+ Term[] ts = termArrays.get(0);
+ if (ts.length > 0) {
+ Term t = ts[0];
+ if (! t.field().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ }
+ }
+ final int[] positions = mpq.getPositions();
+ if (positions.length > 0) {
+
+ int maxPosition = positions[positions.length - 1];
+ for (int i = 0; i < positions.length - 1; ++i) {
+ if (positions[i] > maxPosition) {
+ maxPosition = positions[i];
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ final List[] disjunctLists = new List[maxPosition + 1];
+ int distinctPositions = 0;
+
+ for (int i = 0; i < termArrays.size(); ++i) {
+ final Term[] termArray = termArrays.get(i);
+ List disjuncts = disjunctLists[positions[i]];
+ if (disjuncts == null) {
+ disjuncts = (disjunctLists[positions[i]] = new ArrayList<>(termArray.length));
+ ++distinctPositions;
+ }
+ for (Term aTermArray : termArray) {
+ disjuncts.add(new SpanTermQuery(aTermArray));
+ }
+ }
+
+ int positionGaps = 0;
+ int position = 0;
+ final SpanQuery[] clauses = new SpanQuery[distinctPositions];
+ for (List disjuncts : disjunctLists) {
+ if (disjuncts != null) {
+ clauses[position++] = new SpanOrQuery(disjuncts.toArray(new SpanQuery[disjuncts.size()]));
+ } else {
+ ++positionGaps;
+ }
+ }
+
+ final int slop = mpq.getSlop();
+ final boolean inorder = (slop == 0);
+
+ SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
+ sp.setBoost(query.getBoost());
+ return sp;
+ }
+
+ }
+ return convertUnknownQuery(query);
+ }
+
+ private void tryToAdd(String field, SpanQuery q, List qs) {
+ if (q == null || isEmptyQuery(q) || ! q.getField().equals(field)) {
+ return;
+ }
+ qs.add(q);
+ }
+
+ protected SpanQuery convertUnknownQuery(Query query) throws IOException {
+ // for sub-classing to extract custom queries
+ return getEmptySpanQuery();
+ }
+
+ private SpanQuery getEmptySpanQuery() {
+ SpanQuery q = new SpanOrQuery(new SpanTermQuery[0]);
+ return q;
+ }
+
+ private boolean isEmptyQuery(SpanQuery q) {
+ if (q instanceof SpanOrQuery
+ && ((SpanOrQuery)q).getClauses().length == 0) {
+ return true;
+ }
+ return false;
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java (working copy)
@@ -0,0 +1,272 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetResults;
+
+/**
+ * Builds a {@link ConcordanceWindow}
+ */
+public class WindowBuilder {
+
+ private final static String SPACE = " ";
+ private final static String EMPTY_STRING = "";
+ public static String INTER_MULTIVALUE_FIELD_PADDING = SPACE;
+
+
+ /**
+ * Makes the assumption that the target token start and target token end
+ * can be found. If not, this returns a null.
+ *
+ * @param docID
+ * Lucene internal docid, used only if sort type is DOC
+ * @param targetTokenStart
+ * Target's start token
+ * @param targetTokenEnd
+ * Target's end token
+ * @param metadata
+ * Metadata to be stored with the window
+ * @param config
+ * ConcordanceConfig
+ * @param fieldValues
+ * nocommit: describe me
+ * @param offsets
+ * TokenOffsetResults from nocommit: where are they from???
+ * @return
+ * ConcordanceWindow or null if character offset information cannot be found for
+ * both the targetTokenStart and the targetTokenEnd
+ */
+ public ConcordanceWindow buildConcordanceWindow(long docID, int targetTokenStart, int targetTokenEnd,
+ Map metadata, ConcordanceConfig config,
+ String[] fieldValues, TokenCharOffsetResults offsets)
+ throws TargetTokenNotFoundException, IllegalArgumentException {
+
+ if (targetTokenStart < 0 || targetTokenEnd < 0) {
+ throw new IllegalArgumentException("targetTokenStart and targetTokenEnd must be >= 0");
+ }
+ if (targetTokenEnd < targetTokenStart) {
+ throw new IllegalArgumentException("targetTokenEnd must be >= targetTokenStart");
+ }
+ int startFieldIndex = offsets.getFieldIndex(targetTokenStart);
+ int endFieldIndex = offsets.getFieldIndex(targetTokenEnd);
+ /*if (fieldIndex != offsets.getFieldIndex(targetTokenEnd)) {
+ //you're asking for a window across different entries in a field.
+ //no soup for you.
+ throw new IllegalArgumentException("Can't request a window across different field indices in a multi-valued field");
+ }*/
+ if (startFieldIndex < 0 || endFieldIndex < 0) {
+ //target not found
+ throw new IllegalArgumentException("field index must be >= 0");
+ }
+ if (startFieldIndex >= fieldValues.length || endFieldIndex >= fieldValues.length) {
+ //something went horribly wrong.
+ //can't ask for a window from array index out of bounds exception
+ throw new IllegalArgumentException("fieldIndex out of bounds exception");
+ }
+ String startS = fieldValues[startFieldIndex];
+ String endS = (startFieldIndex == endFieldIndex) ? startS : fieldValues[endFieldIndex];
+ if (startS == null || endS == null) {
+ //something went horribly wrong.
+ throw new IllegalArgumentException("field value is null");
+ }
+ int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart);
+ int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd);
+
+ if (targetCharStart < 0 || targetCharEnd < 0) {
+ throw new TargetTokenNotFoundException("couldn't find character offsets for a target token.\n"+
+ "Check that your analyzers are configured properly.\n");
+ }
+
+ OffsetAttribute preOffset = getPreOffset(startFieldIndex, targetTokenStart, targetCharStart, config, offsets);
+ String preString = silentlySafeSubstring(startS, preOffset);
+
+ OffsetAttribute postOffset = getPostOffset(endFieldIndex, targetTokenEnd, targetCharEnd, config, offsets);
+ String postString = silentlySafeSubstring(endS, postOffset);
+
+ String targ = getTargetString(targetTokenStart, targetTokenEnd, targetCharStart,
+ targetCharEnd, fieldValues, offsets);
+
+ String sortKey = getSortKey(docID, startFieldIndex, endFieldIndex, targetTokenStart, targetTokenEnd, config, offsets);
+ int charStart = (preOffset == null) ? targetCharStart : preOffset.startOffset();
+ int charEnd = (postOffset == null) ? targetCharEnd : postOffset.endOffset();
+ return new ConcordanceWindow(docID, charStart, charEnd, preString, targ, postString,
+ sortKey, metadata);
+ }
+
+ private String getTargetString(int targetTokenStart, int targetTokenEnd,
+ int targetCharStart, int targetCharEnd, String[] fieldValues,
+ TokenCharOffsetResults offsets) {
+
+ int startIndex = offsets.getFieldIndex(targetTokenStart);
+ int endIndex = offsets.getFieldIndex(targetTokenEnd);
+
+ if (startIndex == endIndex) {
+ String s = fieldValues[startIndex];
+ return silentlySafeSubstring(s, targetCharStart, targetCharEnd);
+ }
+ StringBuilder sb = new StringBuilder();
+ String fStart = fieldValues[startIndex];
+ sb.append(fStart.substring(targetCharStart));
+ for (int i = startIndex+1; i < endIndex; i++) {
+ sb.append(INTER_MULTIVALUE_FIELD_PADDING);
+ sb.append(fieldValues[i]);
+ }
+ sb.append(INTER_MULTIVALUE_FIELD_PADDING);
+ sb.append(fieldValues[endIndex].substring(0,targetCharEnd));
+ return sb.toString();
+ }
+
+ private String getSortKey(long docID, int startFieldIndex, int endFieldIndex, int start, int end,
+ ConcordanceConfig config, TokenCharOffsetResults charOffsets) {
+ //TODO: Create interface for sort key generator
+ //for room to grow. Hard coded for now.
+
+ StringBuilder sb = new StringBuilder();
+ ConcordanceSortOrder sortOrder = config.getSortOrder();
+ if (sortOrder == ConcordanceSortOrder.NONE) {
+ return EMPTY_STRING;
+ }
+ //hack zero left pad the tokenoffset with 10 0's
+ if (sortOrder == ConcordanceSortOrder.DOC) {
+ String docIDString = padLeft(10, "0", Long.toString(docID));
+ String startOffsetString = padLeft(10, "0",
+ Integer.toString(start));
+ sb.append(docIDString).append(SPACE).append(startOffsetString);
+ }
+
+ if (sortOrder == ConcordanceSortOrder.TARGET_POST ||
+ sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+
+ for (int i = start; i <= end; i++) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0)
+ sb.append(tmp).append(" ");
+ }
+ }
+ if (sortOrder == ConcordanceSortOrder.PRE ||
+ sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+ int tmpStart = start-1;
+ int tmpEnd = Math.max(0, start-config.getTokensBefore());
+ if (tmpStart < 0) {
+ sb.append(" ");
+ }
+
+ for (int i = tmpStart; i >= tmpEnd; i--) {
+ if (charOffsets.getFieldIndex(i) == startFieldIndex) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(" ");
+ }
+ } else {
+ break;
+ }
+ }
+
+ } else if (sortOrder == ConcordanceSortOrder.POST ||
+ sortOrder == ConcordanceSortOrder.TARGET_POST) {
+
+ int tmpStart = end+1;
+ int tmpEnd = Math.min(end+config.getTokensAfter(), charOffsets.getLast());
+
+ if (tmpStart > charOffsets.getLast()) {
+ sb.append(" ");
+ }
+ for (int i = tmpStart; i <= tmpEnd; i++) {
+ if (charOffsets.getFieldIndex(i) == endFieldIndex) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ return sb.toString().trim();
+ }
+
+ private OffsetAttribute getPreOffset(int fieldIndex, int targetTokenStart, int targetCharStart,
+ ConcordanceConfig config, TokenCharOffsetResults charOffsets) {
+ if (config.getTokensBefore() == 0)
+ return null;
+
+ if (targetTokenStart == 0) {
+ return null;
+ }
+ int startTokenOffset = Math.max(0, targetTokenStart-config.getTokensBefore());
+
+ int windowStartChar = charOffsets.getClosestCharStart(fieldIndex, startTokenOffset, targetTokenStart);
+
+ int windowEndChar = Math.max(windowStartChar, targetCharStart - 1);
+
+ return buildOffsetAttribute(windowStartChar, windowEndChar);
+ }
+
+
+ private OffsetAttribute getPostOffset(int fieldIndex, int targetTokenEnd, int targetCharEnd,
+ ConcordanceConfig config, TokenCharOffsetResults charOffsets) {
+ if (config.getTokensAfter() == 0)
+ return null;
+ int windowTokenEnd = targetTokenEnd+config.getTokensAfter();
+ int windowCharStart = targetCharEnd;
+ int windowCharEnd = charOffsets.getClosestCharEnd(fieldIndex, windowTokenEnd, targetTokenEnd+1);
+ if (windowCharStart >= windowCharEnd) {
+ return null;
+ }
+ return buildOffsetAttribute(windowCharStart, windowCharEnd);
+ }
+
+ private OffsetAttribute buildOffsetAttribute(int start, int end) {
+ OffsetAttribute off = new OffsetAttributeImpl();
+ off.setOffset(start, end);
+ return off;
+ }
+
+ private String padLeft(int number, String add, String s) {
+ if (s.length() >= number)
+ return s;
+
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < number- s.length(); i++) {
+ sb.append(add);
+ }
+ sb.append(s);
+ return sb.toString();
+ }
+
+ private String silentlySafeSubstring(String s, OffsetAttribute offset) {
+ if (offset == null)
+ return EMPTY_STRING;
+
+ return silentlySafeSubstring(s, offset.startOffset(), offset.endOffset());
+ }
+
+ private String silentlySafeSubstring(String s, int startOffset, int endOffset) {
+
+ if (startOffset >= endOffset || startOffset < 0 ||
+ startOffset >= s.length() || endOffset > s.length()) {
+ return EMPTY_STRING;
+ }
+ return s.substring(startOffset, endOffset);
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java (working copy)
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.util.ArrayList;
+
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Simple class to store a document id (leaf and unique), a StoredDocument, and the offsets
+ * for a SpanQuery hit
+ *
+ */
+
+public class DocTokenOffsets {
+ private int leafDocId = -1;
+ private int uniqueId = -1;
+ private StoredDocument document = null;
+ private List offsets = new ArrayList<>();
+
+ public void setDocument(StoredDocument d) {
+ this.document = d;
+ }
+ public void addOffset(int start, int end) {
+ OffsetAttributeImpl offset = new OffsetAttributeImpl();
+ offset.setOffset(start, end);
+ offsets.add(offset);
+ }
+
+ public void reset(int base, int leafDocId, StoredDocument d, int start, int end) {
+ this.leafDocId = leafDocId;
+ this.uniqueId = base+leafDocId;
+ setDocument(d);
+ offsets.clear();
+ addOffset(start,end);
+ }
+
+ public List getOffsets() {
+ return offsets;
+ }
+
+ public StoredDocument getDocument() {
+ return document;
+ }
+
+ public int getLeafDocId() {
+ return leafDocId;
+ }
+
+ public int getUniqueDocId() {
+ return uniqueId;
+ }
+
+ public DocTokenOffsets deepishCopy() {
+ DocTokenOffsets copy = new DocTokenOffsets();
+ copy.leafDocId = leafDocId;
+ copy.uniqueId = uniqueId;
+ copy.document = document;
+ List copyOffsets = new ArrayList<>();
+ copyOffsets.addAll(offsets);
+ copy.offsets = copyOffsets;
+ return copy;
+ }
+
+ public boolean isEmpty() {
+ if (leafDocId < 0)
+ return true;
+ return false;
+ }
+
+ public void pseudoEmpty() {
+ leafDocId = -1;
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java (working copy)
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Scaffolding/Sugar class around SpanQuery.getSpans(...). This allows the client
+ * to iterate on an IndexReader (not necessarily a leaf) by document (DocTokenOffsets).
+ */
+public class DocTokenOffsetsIterator {
+ /*
+ * NOT THREAD SAFE!!!
+ */
+ private SpanQuery spanQuery;
+ private Filter filter;
+ private LinkedList leafReaders = new LinkedList<>();
+ private LeafReader currReader = null;
+ private Set fields;
+ private Spans spans = null;
+ private DocTokenOffsets docTokenOffsets = new DocTokenOffsets();
+ private DocTokenOffsets docTokenOffsetsBuffer = new DocTokenOffsets();
+ private int currentBase = -1;
+
+ private Map termMap = new HashMap<>();
+
+ public DocTokenOffsetsIterator() {
+ }
+
+ public void reset(SpanQuery q, Filter f, IndexReader reader, Set fields) throws IOException {
+
+ this.spanQuery = q;
+ this.filter = f;
+
+ this.fields = fields;
+ leafReaders.addAll(reader.leaves());
+ if (leafReaders.size() > 0) {
+ reinitSpans();
+ }
+ }
+ public boolean next() throws IOException {
+
+ if (spans == null || docTokenOffsetsBuffer.isEmpty()) {
+ if (leafReaders.size()==0) {
+ return false;
+ } else if (! reinitSpans()) {
+ return false;
+ }
+
+ }
+ boolean currSpansHasMore = false;
+ while (spans.next()) {
+ if (spans.doc() == docTokenOffsetsBuffer.getLeafDocId()) {
+ docTokenOffsetsBuffer.addOffset(spans.start(), spans.end());
+ } else {
+ currSpansHasMore = true;
+ break;
+ }
+ }
+ docTokenOffsets = docTokenOffsetsBuffer.deepishCopy();
+
+ if (currSpansHasMore) {
+ StoredDocument d = currReader.document(spans.doc(), fields);
+ docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end());
+ } else {
+ docTokenOffsetsBuffer.pseudoEmpty();
+ }
+ return true;
+ }
+
+ public DocTokenOffsets getDocTokenOffsets() {
+ return docTokenOffsets;
+ }
+
+ private boolean reinitSpans() throws IOException {
+ //must check that leafReaders.size() > 0 before running this!!!
+ LeafReaderContext ctx = leafReaders.pop();
+ currentBase = ctx.docBase;
+ currReader = ctx.reader();
+ Bits bits = null;
+ Bits liveBits = currReader.getLiveDocs();
+ //liveBits can be null if all of the docs are live!!!
+ if (filter == null) {
+ bits = liveBits;
+ } else {
+ DocIdSet idSet = filter.getDocIdSet(ctx,liveBits);
+ if (idSet instanceof FixedBitSet) {
+ bits = (FixedBitSet)idSet;
+ } else {
+ DocIdSetIterator itr = idSet.iterator();
+ if (itr != null) {
+ FixedBitSet tmpBits = new FixedBitSet(currReader.maxDoc());
+ while (itr.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ tmpBits.set(itr.docID());
+ }
+ bits = tmpBits;
+ }
+ }
+ }
+ /*bits() is optional; this doesn't work!!!!
+ bits = idSet.bits();
+ */
+
+ //bits can be null if all the docs are live
+ //or if the filter returned an empty docidset.
+ if (filter != null && bits == null) {
+ if (leafReaders.size() > 0) {
+ return reinitSpans();
+ } else {
+ return false;
+ }
+ }
+
+ spans = spanQuery.getSpans(ctx, bits, termMap);
+ //can getSpans return null?
+ if (spans != null && spans.next()) {
+ StoredDocument d = currReader.document(spans.doc(), fields);
+
+ docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end());
+ return true;
+ } else if (leafReaders.size() > 0) {
+ return reinitSpans();
+ } else {
+ return false;
+ }
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/FieldIndexCharacterOffsetPair.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/FieldIndexCharacterOffsetPair.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/FieldIndexCharacterOffsetPair.java (working copy)
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+/**
+ * Pair of field index and character offset.
+ * The fieldIndex records the index in a potentially multi-valued field (array).
+ * The charOffset records the character offset within that field within that value in the potentially
+ * multi-valued field.
+ */
/**
 * Immutable pair locating a character position within a potentially
 * multi-valued field: which value of the field ({@code fieldIndex}) and the
 * character offset within that value ({@code charOffset}).
 */
public class FieldIndexCharacterOffsetPair {

  private final int fieldIndex;
  private final int charOffset;

  /**
   * @param fieldIndex index of the value within the multi-valued field
   * @param charOffset character offset within that value
   */
  public FieldIndexCharacterOffsetPair(int fieldIndex, int charOffset) {
    this.fieldIndex = fieldIndex;
    this.charOffset = charOffset;
  }

  /** @return character offset within the value identified by {@link #getFieldIndex()} */
  public int getCharOffset() {
    return charOffset;
  }

  /** @return index of the value within the multi-valued field */
  public int getFieldIndex() {
    return fieldIndex;
  }
}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java (working copy)
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.util.Comparator;
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Sorts length desc, start offset asc
+ *
+ */
+
+public class OffsetLengthStartComparator implements Comparator, Serializable {
+ private static final long serialVersionUID = 7526472295622776147L;
+
+ @Override
+ public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
+
+ int lenA = offsetA.endOffset()-offsetA.startOffset();
+ int lenB = offsetB.endOffset()-offsetB.startOffset();
+ if (lenA < lenB) {
+ return 1;
+ } else if (lenA > lenB) {
+ return -1;
+ //by here, the length is the same
+ } else if (offsetA.startOffset() < offsetB.startOffset()) {
+ return -1;
+ } else if (offsetA.startOffset() > offsetB.startOffset()) {
+ return 1;
+ }
+ return 0;
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java (working copy)
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.util.Comparator;
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * sort on offset start
+ */
+public class OffsetStartComparator implements Comparator, Serializable{
+ private static final long serialVersionUID = 7526472295622776147L;
+
+ @Override
+ public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
+
+ if (offsetA.startOffset() < offsetB.startOffset()){
+ return -1;
+ } else if (offsetA.startOffset() > offsetB.startOffset()){
+ return 1;
+ }
+ return 0;
+ }
+
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java (working copy)
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * In some versions of Lucene, getSpans returned overlapping spans. This class
+ * can remove the overlapping spans and will sort them if startComparator is
+ * not null.
+ */
+public class OffsetUtil {
+
+  /**
+   * Removes overlapping offsets, preferring longer spans, and optionally
+   * re-sorts the survivors by start offset.
+   *
+   * @param offsets offsets to filter; may be null
+   * @param comparator sorts length desc then start asc, so longer spans win
+   * @param startComparator if non-null, used to sort the filtered offsets
+   * @return the filtered (and possibly sorted) offsets
+   */
+  public static List<OffsetAttribute> removeOverlapsAndSort(List<OffsetAttribute> offsets,
+      OffsetLengthStartComparator comparator, OffsetStartComparator startComparator) {
+    if (offsets == null || offsets.size() < 2) {
+      return offsets;
+    }
+    // Longest spans first so that, on overlap, the longer span is kept.
+    Collections.sort(offsets, comparator);
+    Set<Integer> seen = new HashSet<>();
+    List<OffsetAttribute> filtered = new ArrayList<>();
+    for (OffsetAttribute offset : offsets) {
+      if (!alreadySeen(offset, seen)) {
+        filtered.add(offset);
+        for (int i = offset.startOffset(); i < offset.endOffset(); i++) {
+          seen.add(i);
+        }
+      }
+    }
+    if (startComparator != null) {
+      Collections.sort(filtered, startComparator);
+    }
+    return filtered;
+  }
+
+  /**
+   * @return true if any character position in [startOffset, endOffset) has
+   *         already been claimed. endOffset() is treated as exclusive here to
+   *         match the marking loop above; a &lt;= bound would wrongly flag
+   *         merely adjacent spans as overlapping.
+   */
+  private static boolean alreadySeen(OffsetAttribute offset, Set<Integer> seen) {
+    for (int i = offset.startOffset(); i < offset.endOffset(); i++) {
+      if (seen.contains(i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java (working copy)
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * TokenCharOffsetsReader that captures character offsets by reanalyzing a
+ * field with the supplied Analyzer.
+ */
+public class ReanalyzingTokenCharOffsetsReader implements TokenCharOffsetsReader {
+
+  //TODO: figure out how to stop analysis after "getLast()" request is hit
+  private final static int GOT_ALL_REQUESTS = -2;
+  private final Analyzer baseAnalyzer;
+
+  /**
+   * @param analyzer analyzer used to reanalyze the stored field values;
+   *                 assumed to match the analyzer used at index time
+   */
+  public ReanalyzingTokenCharOffsetsReader(Analyzer analyzer) {
+    this.baseAnalyzer = analyzer;
+  }
+
+  @Override
+  public TokenCharOffsetResults getTokenCharOffsetResults
+      (StoredDocument d, String fieldName, TokenCharOffsetRequests requests, TokenCharOffsetResults results)
+      throws IOException {
+    int fieldIndex = 0;
+    int currInd = -1;
+    // Token positions across values of a multi-valued field are separated by
+    // the analyzer's position increment gap.
+    int gap = baseAnalyzer.getPositionIncrementGap(fieldName);
+
+    for (String fieldValue : d.getValues(fieldName)) {
+      currInd = addFieldValue(fieldIndex, currInd, fieldValue, requests, results);
+      if (currInd == GOT_ALL_REQUESTS) {
+        // Every requested token has been found; skip the remaining values.
+        break;
+      }
+      currInd += gap;
+      fieldIndex++;
+    }
+    return results;
+  }
+
+  /**
+   * Reanalyzes a single field value, recording term and offset information
+   * for every requested token position encountered.
+   *
+   * @return the last token position seen, or GOT_ALL_REQUESTS once every
+   *         requested position has been satisfied
+   */
+  private int addFieldValue(int fieldIndex, int currInd,
+      String fieldValue, TokenCharOffsetRequests requests,
+      TokenCharOffsetResults results) throws IOException {
+    TokenStream stream = baseAnalyzer.tokenStream("", fieldValue);
+    try {
+      stream.reset();
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
+      PositionIncrementAttribute incAtt = null;
+      if (stream.hasAttribute(PositionIncrementAttribute.class)) {
+        incAtt = stream.getAttribute(PositionIncrementAttribute.class);
+      }
+
+      while (stream.incrementToken()) {
+        // Fall back to an increment of 1 if the stream lacks increments.
+        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : 1;
+        if (requests.contains(currInd)) {
+          results.add(currInd, fieldIndex, offsetAtt.startOffset(),
+              offsetAtt.endOffset(), termAtt.toString());
+        }
+        if (currInd > requests.getLast()) {
+          //TODO: Is there a way to avoid this? Or, is this an imaginary performance hit?
+          while (stream.incrementToken()) {
+            //NOOP: drain the stream so end() sees it fully consumed
+          }
+          currInd = GOT_ALL_REQUESTS;
+          break;
+        }
+      }
+      stream.end();
+    } finally {
+      // Close even if reset()/incrementToken() throws: fixes a stream leak.
+      stream.close();
+    }
+    return currInd;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java (working copy)
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Simple util class to get a List of term Strings from a Reader + Analyzer.
+ */
+public class SimpleAnalyzerUtil {
+  private final static String DEFAULT_FIELD = "FIELD";
+
+  /**
+   * @return terms produced by analyzing the reader's contents
+   */
+  public static List<String> getTermStrings(Reader reader, Analyzer analyzer) throws IOException {
+    List<String> terms = new ArrayList<>();
+    return getTermStrings(reader, analyzer, terms);
+  }
+
+  /**
+   * Allows reuse of a terms list; this method calls terms.clear() before
+   * adding new terms.
+   *
+   * @param terms list to fill; a new list is created if this is null
+   * @return the filled list
+   */
+  public static List<String> getTermStrings(Reader reader, Analyzer analyzer, List<String> terms)
+      throws IOException {
+    if (terms == null) {
+      terms = new ArrayList<>();
+    }
+    terms.clear();
+    TokenStream stream = analyzer.tokenStream(DEFAULT_FIELD, reader);
+    try {
+      stream.reset();
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      while (stream.incrementToken()) {
+        terms.add(termAtt.toString());
+      }
+      stream.end();
+    } finally {
+      // Ensure the stream is released even if analysis throws.
+      stream.close();
+    }
+    return terms;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java (working copy)
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+/**
+ * Thrown when a token offset identified by .getSpans() is not found in the
+ * TokenCharOffsetResults.
+ * <p>
+ * The typical cause is a mismatch between the analyzers used at index and
+ * search time. When this happens, something very bad has happened, and it
+ * deserves its own exception.
+ */
+public class TargetTokenNotFoundException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  public TargetTokenNotFoundException(String message) {
+    super(message);
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetRequests.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetRequests.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetRequests.java (working copy)
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.util.BitSet;
+
+/**
+ * Util class used to specify the token offsets for which character offsets
+ * are requested.
+ */
+public class TokenCharOffsetRequests {
+
+  private final BitSet set = new BitSet();
+  // Highest token offset requested so far; -1 when no requests recorded.
+  private int last = -1;
+
+  /** @return true if the given token offset has been requested */
+  public boolean contains(int i) {
+    return set.get(i);
+  }
+
+  /** Requests every token offset in [start, end], inclusive. */
+  public void add(int start, int end) {
+    for (int i = start; i <= end; i++) {
+      add(i);
+    }
+  }
+
+  /** Requests a single token offset. */
+  public void add(int i) {
+    set.set(i);
+    last = (i > last) ? i : last;
+  }
+
+  /** Resets this object for reuse. */
+  public void clear() {
+    set.clear();
+    // Also reset the high-water mark; leaving it stale would make getLast()
+    // report offsets from a previous request after reuse.
+    last = -1;
+  }
+
+  /** @return the highest requested token offset, or -1 if none */
+  public int getLast() {
+    return last;
+  }
+
+  protected BitSet getSet() {
+    return set;
+  }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetResults.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetResults.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetResults.java (working copy)
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Class to record results for looking up normalized terms (String)
+ * and character offsets for specified tokens.
+ * Will return NULL_TERM/NULL_OFFSET if a token offset was not found.
+ *
+ * Has utility methods for safely getting the closest found token. This
+ * is useful for when a concordance window ends in a stop word (no term/offset info).
+ */
+public class TokenCharOffsetResults {
+
+ public final static String NULL_TERM = "";
+ public final static int NULL_OFFSET = -1;
+ // nocommit: is this needed?
+ // public final static FieldIndexCharacterOffsetPair NULL_FIELDINDEXCHAROFFSETPAIR = new FieldIndexCharacterOffsetPair(-1,-1);
+
+ private BitSet set = new BitSet();
+ private int last = -1;
+ private Map terms = new HashMap<>();
+ private Map starts =
+ new HashMap<>();
+ private Map ends =
+ new HashMap<>();
+
+ public void add(int tokenOffset, int fieldIndex, int startCharOffset, int endCharOffset, String term) {
+ addStart(tokenOffset, fieldIndex, startCharOffset);
+ addEnd(tokenOffset, fieldIndex, endCharOffset);
+ addTerm(tokenOffset, term);
+ set.set(tokenOffset);
+ }
+
+ private void addTerm(int tokenOffset, String term) {
+ if (term != null) {
+ terms.put(tokenOffset, term);
+ }
+ last = (tokenOffset > last) ? tokenOffset : last;
+ }
+
+ private void addStart(int tokenOffset, int fieldIndex, int charOffset) {
+ starts.put(tokenOffset, new FieldIndexCharacterOffsetPair(fieldIndex, charOffset));
+ last = (tokenOffset > last) ? tokenOffset : last;
+ }
+ private void addEnd(int tokenOffset, int fieldIndex, int charOffset) {
+ ends.put(tokenOffset, new FieldIndexCharacterOffsetPair(fieldIndex,charOffset));
+ last = (tokenOffset > last) ? tokenOffset : last;
+ }
+
+ public int getCharacterOffsetStart(int tokenOffset) {
+ FieldIndexCharacterOffsetPair cand = starts.get(tokenOffset);
+ if (cand == null)
+ return NULL_OFFSET;
+
+ return cand.getCharOffset();
+ }
+
+ public int getCharacterOffsetEnd(int tokenOffset) {
+ FieldIndexCharacterOffsetPair cand = ends.get(tokenOffset);
+ if (cand == null)
+ return NULL_OFFSET;
+
+ return cand.getCharOffset();
+
+ }
+
+ public String getTerm(int tokenOffset) {
+ String s = terms.get(tokenOffset);
+ if (s == null) {
+ return NULL_TERM;
+ }
+ return s;
+ }
+
+
+ public int getLast() {
+ return last;
+ }
+
+ public void clear() {
+ terms.clear();
+ starts.clear();
+ ends.clear();
+ last = -1;
+ set.clear();
+ }
+ protected boolean isEmpty() {
+ return set.isEmpty();
+ }
+
+ private int getClosestToken(int fieldIndex, int start, int stop, Map map) {
+ if (start < 0 || stop < 0) {
+ return NULL_OFFSET;
+ }
+ if (start == stop) {
+ return start;
+ }
+ if (start < stop) {
+ for (int i = start ; i <= stop; i++) {
+ FieldIndexCharacterOffsetPair p = map.get(i);
+ if (p != null && p.getFieldIndex() == fieldIndex) {
+ return i;
+ }
+ }
+ } else if (start > stop) {
+ for (int i = start; i >= stop; i--) {
+ FieldIndexCharacterOffsetPair p = map.get(i);
+ if (p != null && p.getFieldIndex() == fieldIndex) {
+ return i;
+ }
+ }
+ }
+ return NULL_OFFSET;
+ }
+
+ public int getClosestCharStart(int fieldIndex, int start, int stop) {
+
+ int i = getClosestToken(fieldIndex, start, stop, starts);
+ return getCharacterOffsetStart(i);
+ }
+
+ public int getClosestCharEnd(int fieldIndex, int start, int stop) {
+ int i = getClosestToken(fieldIndex, start, stop, ends);
+
+ return getCharacterOffsetEnd(i);
+ }
+
+ protected String getClosestTerm(int fieldIndex, int start, int stop) {
+ int i = getClosestToken(fieldIndex, start, stop, starts);
+ return getTerm(i);
+ }
+
+ /*
+ * return: -1 if
+ */
+ public int getFieldIndex(int tokenOffset) {
+ FieldIndexCharacterOffsetPair p = starts.get(tokenOffset);
+ if (p == null) {
+ return NULL_OFFSET;
+ }
+ return p.getFieldIndex();
+ }
+ protected String debugToString() {
+ StringBuilder sb = new StringBuilder();
+ for (Integer i : terms.keySet()) {
+ sb.append(i).append(" : ").append(terms.get(i)).append(" : ");
+ sb.append(starts.get(i)).append(" : ").append(ends.get(i)).append("\n");
+ }
+ return sb.toString();
+ }
+
+ protected BitSet getSet() {
+ return set;
+ }
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java (working copy)
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance.charoffsets;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Interface to allow flexibility/optimizations in returning character
+ * offsets for tokens.
+ */
+public interface TokenCharOffsetsReader {
+
+  /**
+   * Looks up term/offset information for the token offsets named in
+   * {@code requests}, reading from the given field of the document.
+   *
+   * @return the populated results object
+   */
+  TokenCharOffsetResults getTokenCharOffsetResults
+      (StoredDocument document, String fieldName, TokenCharOffsetRequests requests, TokenCharOffsetResults results)
+      throws IOException;
+}
Index: lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html
===================================================================
--- lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html (revision 0)
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html (working copy)
@@ -0,0 +1,22 @@
+
+
+
+
+ConcordanceSearcher performs a search on an index and returns concordance windows.
+
+
Index: lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java
===================================================================
--- lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java (revision 0)
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java (working copy)
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Helpers shared by the concordance tests: index builders and a simple
+ * configurable mock analyzer.
+ */
+public class ConcordanceTestUtils extends LuceneTestCase {
+  public final static String FIELD = "content";
+
+  /** Builds an index with one single-valued document per input string. */
+  public static Directory getDirectory(Analyzer analyzer, String[] vals) throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs
+            (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
+    for (String s : vals) {
+      Document d = new Document();
+      d.add(newTextField(FIELD, s, Field.Store.YES));
+      writer.addDocument(d);
+    }
+    writer.close();
+    return directory;
+  }
+
+  /** Builds an index with one (potentially multi-valued) document per entry. */
+  public static Directory getDirectory(Analyzer analyzer, List<String[]> input) throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs
+            (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
+    for (String[] vals : input) {
+      Document d = new Document();
+      for (String s : vals) {
+        d.add(newTextField(FIELD, s, Field.Store.YES));
+      }
+      writer.addDocument(d);
+    }
+    writer.close();
+    return directory;
+  }
+
+  /**
+   * @param stops usually MockTokenFilter.EMPTY_STOPSET or
+   *              MockTokenFilter.ENGLISH_STOPSET
+   * @param posIncGap position increment gap between values of a
+   *                  multi-valued field
+   */
+  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops, final int posIncGap) {
+    return new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+        TokenFilter filter = new MockTokenFilter(tokenizer, stops);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      public int getPositionIncrementGap(String fieldName) {
+        return posIncGap;
+      }
+    };
+  }
+}
Index: lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java
===================================================================
--- lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java (revision 0)
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java (working copy)
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.concordance;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ * Tests for ConcordanceSearcher: window extraction, pre/post sorting,
+ * multi-valued fields, stop words, and standard-query conversion.
+ *
+ * NOTE(review): generic type parameters throughout this class were
+ * reconstructed (the patch text had angle-bracketed spans stripped);
+ * confirm Set/Map parameters against DocumentMetadataExtractor.
+ */
+public class TestConcordanceSearcher extends LuceneTestCase {
+
+  //no-op extractor: selects no fields and attaches no metadata to windows
+  private final static DocumentMetadataExtractor metadataExtractor = new DocumentMetadataExtractor() {
+    private final Set<String> fields = new HashSet<>();
+    private final Map<String, String> data = new HashMap<>();
+
+    @Override
+    public Set<String> getFieldSelector() {
+      return fields;
+    }
+
+    @Override
+    public Map<String, String> extract(StoredDocument d) {
+      return data;
+    }
+  };
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    //NOOP for now
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    //NOOP for now
+  }
+
+  @Test
+  public void testSimple() throws Exception {
+    String[] docs = new String[] {
+        "a b c a b c",
+        "c b a c b a"
+    };
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+    SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "a"));
+
+    //maxWindows caps how many windows are collected
+    config.setMaxWindows(3);
+    ConcordanceResults results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+
+    assertEquals(3, results.getWindows().size());
+
+    config.setMaxWindows(Integer.MAX_VALUE);
+    results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+
+    //test result size
+    assertEquals(4, results.getWindows().size());
+
+    //test result with sort order = pre
+    List<ConcordanceWindow> windows = results.getSortedWindows();
+    String[] pres = new String[] {
+        "",
+        "c b",
+        "c b a c b",
+        "a b c"
+    };
+    String[] posts = new String[] {
+        " b c a b c",
+        " c b a",
+        "",
+        " b c"
+    };
+
+    for (int i = 0; i < windows.size(); i++) {
+      ConcordanceWindow w = windows.get(i);
+      assertEquals(pres[i], w.getPre());
+      assertEquals(posts[i], w.getPost());
+    }
+
+    //test sort order post
+    //sort key is built at search time, so must re-search
+    config.setSortOrder(ConcordanceSortOrder.POST);
+    results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+
+    windows = results.getSortedWindows();
+
+    posts = new String[] {
+        "",
+        " b c",
+        " b c a b c",
+        " c b a",
+    };
+    for (int i = 0; i < windows.size(); i++) {
+      ConcordanceWindow w = windows.get(i);
+      assertEquals(posts[i], w.getPost());
+    }
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testSimpleMultiValuedField() throws Exception {
+    //same content as testSimple, but both strings go into one document's
+    //multi-valued field; results should be identical
+    String[] doc = new String[] {
+        "a b c a b c",
+        "c b a c b a"
+    };
+    List<String[]> docs = new ArrayList<>();
+    docs.add(doc);
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+    SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "a"));
+
+    ConcordanceResults results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+
+    //test result size
+    assertEquals(4, results.getWindows().size());
+
+    //test result with sort order = pre
+    List<ConcordanceWindow> windows = results.getSortedWindows();
+    String[] pres = new String[] {
+        "",
+        "c b",
+        "c b a c b",
+        "a b c"
+    };
+    String[] posts = new String[] {
+        " b c a b c",
+        " c b a",
+        "",
+        " b c"
+    };
+
+    for (int i = 0; i < windows.size(); i++) {
+      ConcordanceWindow w = windows.get(i);
+      assertEquals(pres[i], w.getPre());
+      assertEquals(posts[i], w.getPost());
+    }
+
+    //test sort order post
+    //sort key is built at search time, so must re-search
+    config.setSortOrder(ConcordanceSortOrder.POST);
+    results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+
+    windows = results.getSortedWindows();
+
+    posts = new String[] {
+        "",
+        " b c",
+        " b c a b c",
+        " c b a",
+    };
+    for (int i = 0; i < windows.size(); i++) {
+      ConcordanceWindow w = windows.get(i);
+      assertEquals(posts[i], w.getPost());
+    }
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testWindowLengths() throws Exception {
+    //every combination of tokensBefore/tokensAfter from 0..5 around "d"
+    String[] doc = new String[] {
+        "a b c d e f g",
+    };
+    List<String[]> docs = new ArrayList<>();
+    docs.add(doc);
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+    SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "d"));
+
+    //windows saturate once the document boundary is reached
+    String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"};
+    String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"};
+
+    for (int preLen = 0; preLen < pres.length; preLen++) {
+      for (int postLen = 0; postLen < posts.length; postLen++) {
+        config.setTokensBefore(preLen);
+        config.setTokensAfter(postLen);
+        ConcordanceResults results = searcher.search(reader, q, null, analyzer,
+            config, metadataExtractor);
+        //getWindows() returns a Collection, not a List; take the single window via iterator
+        ConcordanceWindow w = results.getWindows().iterator().next();
+        assertEquals(preLen+" : "+postLen, w.getPre(), pres[preLen]);
+        assertEquals(preLen+" : "+postLen, w.getPost(), posts[postLen]);
+      }
+    }
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testClockworkOrangMultiValuedFieldProblem() throws Exception {
+    /* test handling of target spread out over several
+     * indices in a multivalued field array
+     */
+    String[] doc = new String[] {
+        "a b c a b the",
+        "clockwork",
+        "orange b a c b a"
+    };
+    List<String[]> docs = new ArrayList<>();
+    docs.add(doc);
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    config.setTokensBefore(3);
+    config.setTokensAfter(3);
+
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+    SpanQuery q1 = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "the"));
+    SpanQuery q2 = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "clockwork"));
+    SpanQuery q3 = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "orange"));
+    SpanQuery q = new SpanNearQuery(new SpanQuery[] {q1, q2, q3}, 3, true);
+    ConcordanceResults results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+    assertEquals(1, results.getWindows().size());
+    ConcordanceWindow w = results.getWindows().iterator().next();
+    assertEquals("target", "the clockwork orange", w.getTarget());
+    assertEquals("pre", "c a b", w.getPre());
+    assertEquals("post", " b a c", w.getPost());
+
+    reader.close();
+    directory.close();
+
+    //test hit even over long intra-field gap
+    analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    reader = DirectoryReader.open(directory);
+    config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    config.setTokensBefore(3);
+    config.setTokensAfter(3);
+
+    searcher = new ConcordanceSearcher();
+    q = new SpanNearQuery(new SpanQuery[] {q1, q2, q3}, 120, true);
+    results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+
+    assertEquals(1, results.getWindows().size());
+    w = results.getWindows().iterator().next();
+    assertEquals("target", "the clockwork orange", w.getTarget());
+    assertEquals("pre", "c a b", w.getPre());
+    assertEquals("post", " b a c", w.getPost());
+
+    reader.close();
+    directory.close();
+
+    //test miss: slop too small to bridge the posIncGap of 50
+    analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    reader = DirectoryReader.open(directory);
+    config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+
+    searcher = new ConcordanceSearcher();
+    q = new SpanNearQuery(new SpanQuery[] {q1, q2, q3}, 5, true);
+    results = searcher.search(reader, q, null, analyzer, config, metadataExtractor);
+
+    assertEquals(0, results.getWindows().size());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testWithStops() throws Exception {
+    String[] docs = new String[] {
+        "a b the d e the f",
+        "g h the d the j"
+    };
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.ENGLISH_STOPSET, 50);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    config.setTokensBefore(2);
+    config.setTokensAfter(2);
+
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+    SpanQuery q = new SpanTermQuery(new Term(ConcordanceTestUtils.FIELD, "d"));
+    ConcordanceResults results = searcher.search(reader, q, null, analyzer,
+        config, metadataExtractor);
+    List<ConcordanceWindow> windows = results.getSortedWindows();
+    assertEquals(2, windows.size());
+
+    //the second word after the target is a stop word
+    //this post-component of this window should only go to the first word after the target
+    assertEquals("b the:d: e", windows.get(0).toString());
+
+    assertEquals("h the:d: the j", windows.get(1).toString());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testBasicStandardQueryConversion() throws Exception {
+    String[] docs = new String[] {
+        "a b c a b c",
+        "c b a c b a d e a",
+        "c b a c b a e a b c a"
+    };
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+    BooleanQuery q = new BooleanQuery();
+    q.add(new TermQuery(new Term(ConcordanceTestUtils.FIELD, "a")), Occur.MUST);
+    q.add(new TermQuery(new Term(ConcordanceTestUtils.FIELD, "d")), Occur.MUST_NOT);
+
+    config.setMaxWindows(10);
+    ConcordanceResults results = searcher.search(reader, q, null, analyzer, config, metadataExtractor);
+    //shouldn't include document with "d"
+    assertEquals(6, results.getWindows().size());
+
+    //should only include document with "e" and not "d"
+    Filter filter = new QueryWrapperFilter(new TermQuery(new Term(ConcordanceTestUtils.FIELD, "e")));
+    results = searcher.search(reader, q, filter, analyzer, config, metadataExtractor);
+    assertEquals(4, results.getWindows().size());
+
+    reader.close();
+    directory.close();
+  }
+
+  @Test
+  public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
+    //tests what happens if a Query doesn't contain a term in the "span" field
+    //in the searcher...should be no documents returned.
+
+    String[] docs = new String[] {
+        "a b c a b c",
+    };
+    Analyzer analyzer = ConcordanceTestUtils.getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50);
+    Directory directory = ConcordanceTestUtils.getDirectory(analyzer, docs);
+    IndexReader reader = DirectoryReader.open(directory);
+    ConcordanceConfig config = new ConcordanceConfig(ConcordanceTestUtils.FIELD);
+    ConcordanceSearcher searcher = new ConcordanceSearcher();
+
+    Query q = new TermQuery(new Term("_"+ConcordanceTestUtils.FIELD, "a"));
+
+    ConcordanceResults results = searcher.search(reader, q, null, analyzer, config, metadataExtractor);
+    assertEquals(0, results.getWindows().size());
+    reader.close();
+    directory.close();
+  }
+}
Index: lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java
===================================================================
--- lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java (revision 0)
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java (working copy)
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.concordance;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ * Tests for SpanQueryConverter: conversion of standard Queries to
+ * SpanQueries must tolerate BooleanQueries that mix in terms from
+ * other fields (no IllegalArgumentException).
+ */
+public class TestSpanQueryConverter extends LuceneTestCase {
+  private static IndexReader reader;
+  private static Directory directory;
+  private static Analyzer analyzer;
+  private final static String FIELD = "field";
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    //simple lowercasing tokenizer, no stop words
+    analyzer = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+        newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs
+            (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
+    String[] docs = new String[] {
+        "a b c a b c",
+        "c b a c b a"
+    };
+    for (String val : docs) {
+      Document doc = new Document();
+      doc.add(newTextField(FIELD, val, Field.Store.YES));
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    writer.close();
+  }
+
+  @AfterClass
+  public static void afterClass() throws Exception {
+    reader.close();
+    directory.close();
+    reader = null;
+    directory = null;
+    analyzer = null;
+  }
+
+  @Test
+  public void testBooleanTwoFields() throws Exception {
+    //a BooleanQuery mixing FIELD with another field must convert without
+    //throwing; the off-field clause is expected to be ignored
+    Query q1 = new TermQuery(new Term(FIELD, "a"));
+    Query q2 = new TermQuery(new Term("another_field", "b"));
+    BooleanQuery q = new BooleanQuery();
+    q.add(q1, Occur.SHOULD);
+    q.add(q2, Occur.SHOULD);
+    SpanQueryConverter converter = new SpanQueryConverter();
+    try {
+      converter.convert(FIELD, q);
+    } catch (IllegalArgumentException e) {
+      fail("conversion of mixed-field BooleanQuery should not throw: " + e.getMessage());
+    }
+
+    //same expectation one nesting level deeper
+    Query q3 = new TermQuery(new Term("another_field", "c"));
+    BooleanQuery bq2 = new BooleanQuery();
+    bq2.add(q, Occur.MUST);
+    bq2.add(q3, Occur.SHOULD);
+    try {
+      converter.convert(FIELD, bq2);
+    } catch (IllegalArgumentException e) {
+      fail("conversion of nested mixed-field BooleanQuery should not throw: " + e.getMessage());
+    }
+  }
+}
Index: lucene/module-build.xml
===================================================================
--- lucene/module-build.xml (revision 1632428)
+++ lucene/module-build.xml (working copy)
@@ -628,4 +628,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+