diff --git dev-tools/idea/.idea/ant.xml dev-tools/idea/.idea/ant.xml
index 2cd14fd..b0ebf58 100644
--- dev-tools/idea/.idea/ant.xml
+++ dev-tools/idea/.idea/ant.xml
@@ -18,6 +18,7 @@
+
diff --git dev-tools/idea/.idea/modules.xml dev-tools/idea/.idea/modules.xml
index 5c096a6..cfdf28e 100644
--- dev-tools/idea/.idea/modules.xml
+++ dev-tools/idea/.idea/modules.xml
@@ -23,6 +23,7 @@
+
diff --git dev-tools/idea/.idea/workspace.xml dev-tools/idea/.idea/workspace.xml
index 2db9014..4fe00be 100644
--- dev-tools/idea/.idea/workspace.xml
+++ dev-tools/idea/.idea/workspace.xml
@@ -108,6 +108,14 @@
+
+
+
+
+
+
+
+
@@ -325,7 +333,7 @@
-
+
@@ -339,32 +347,33 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git dev-tools/idea/lucene/concordance/concordance.iml dev-tools/idea/lucene/concordance/concordance.iml
new file mode 100644
index 0000000..141f1ad
--- /dev/null
+++ dev-tools/idea/lucene/concordance/concordance.iml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git dev-tools/maven/lucene/concordance/pom.xml.template dev-tools/maven/lucene/concordance/pom.xml.template
new file mode 100644
index 0000000..dd4382e
--- /dev/null
+++ dev-tools/maven/lucene/concordance/pom.xml.template
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-parent</artifactId>
+    <version>@version@</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-concordance</artifactId>
+  <packaging>jar</packaging>
+  <name>Lucene Concordance</name>
+  <description>Lucene Concordance Module</description>
+  <properties>
+    <module-directory>lucene/concordance</module-directory>
+    <relative-top-level>../../..</relative-top-level>
+    <module-path>${relative-top-level}/${module-directory}</module-path>
+  </properties>
+  <scm>
+    <connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
+    <developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
+    <url>${vc-browse-base-url}/${module-directory}</url>
+  </scm>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-test-framework</artifactId>
+      <scope>test</scope>
+    </dependency>
+    @lucene-concordance.internal.dependencies@
+    @lucene-concordance.external.dependencies@
+    @lucene-concordance.internal.test.dependencies@
+    @lucene-concordance.external.test.dependencies@
+  </dependencies>
+  <build>
+    <sourceDirectory>${module-path}/src/java</sourceDirectory>
+    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
+    <testResources>
+      <testResource>
+        <directory>${project.build.testSourceDirectory}</directory>
+        <excludes>
+          <exclude>**/*.java</exclude>
+        </excludes>
+      </testResource>
+    </testResources>
+  </build>
+</project>
diff --git dev-tools/maven/lucene/pom.xml.template dev-tools/maven/lucene/pom.xml.template
index e7551c4..580fec6 100644
--- dev-tools/maven/lucene/pom.xml.template
+++ dev-tools/maven/lucene/pom.xml.template
@@ -47,6 +47,7 @@
    <module>analysis</module>
    <module>benchmark</module>
    <module>classification</module>
+    <module>concordance</module>
    <module>demo</module>
    <module>expressions</module>
    <module>facet</module>
diff --git lucene/build.xml lucene/build.xml
index 0b98bb6..5d1adb8 100644
--- lucene/build.xml
+++ lucene/build.xml
@@ -173,6 +173,7 @@
+
diff --git lucene/concordance/build.xml lucene/concordance/build.xml
new file mode 100644
index 0000000..20d955d
--- /dev/null
+++ lucene/concordance/build.xml
@@ -0,0 +1,40 @@
+
+
+
+
+
+ Executes concordance search
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git lucene/concordance/ivy.xml lucene/concordance/ivy.xml
new file mode 100644
index 0000000..3ad64e3
--- /dev/null
+++ lucene/concordance/ivy.xml
@@ -0,0 +1,21 @@
+
+
+
+
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/AbstractConcordanceWindowCollector.java lucene/concordance/src/java/org/apache/lucene/search/concordance/AbstractConcordanceWindowCollector.java
new file mode 100644
index 0000000..37ed3b3
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/AbstractConcordanceWindowCollector.java
@@ -0,0 +1,134 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Abstract class to handle basic information for the window collectors
+ * used by a ConcordanceSearcher.
+ *
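+ * <p>See ConcordanceWindowCollector for a basic implementation and
+ * DedupingConcordanceWindowCollector for a version that collapses
+ * duplicate windows.
+ *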
+ */
+public abstract class AbstractConcordanceWindowCollector {
+ //value to use if all windows should be collected
+ public static final int COLLECT_ALL = -1;
+
+ private final ConcordanceSorter sorter = new ConcordanceSorter();
+
+ private Set<String> docIds = new HashSet<>();
+ private final int maxWindows;
+ private boolean hitMax = false;
+ private long totalDocs = 0;
+
+ /**
+ *
+ * @param maxWindows maximum windows to collect
+ */
+ public AbstractConcordanceWindowCollector(int maxWindows) {
+ this.maxWindows = maxWindows;
+ }
+
+ /**
+ * Collect/process this window
+ * @param w window to be processed
+ */
+ public abstract void collect(ConcordanceWindow w);
+
+ /**
+ *
+ * @return number of windows collected
+ */
+ public abstract int size();
+
+ /**
+ *
+ * @return collected windows (unsorted)
+ */
+ public abstract List<ConcordanceWindow> getWindows();
+
+ /**
+ *
+ * @param hitMax did the searcher collect the maximum number of windows
+ * and stop early
+ */
+ public void setHitMax(boolean hitMax) {
+ this.hitMax = hitMax;
+ }
+
+ /**
+ *
+ * @param docId unique key for a document
+ */
+ public void addDocId(String docId) {
+ docIds.add(docId);
+ }
+
+ /**
+ *
+ * Sort according to {@link #sorter} and return windows
+ * @return sorted list of windows
+ */
+ public List<ConcordanceWindow> getSortedWindows() {
+ List<ConcordanceWindow> windows = getWindows();
+ Collections.sort(windows, sorter);
+ return windows;
+ }
+
+ /**
+ *
+ * @return whether or not the searcher collected the maximum number of
+ * windows and stopped early.
+ */
+ public boolean getHitMax() {
+ return hitMax;
+ }
+
+ /**
+ *
+ * @return the maximum number of windows to collect.
+ * Can be equal to {@link #COLLECT_ALL}
+ */
+ public int getMaxWindows() {
+ return maxWindows;
+ }
+
+ /**
+ *
+ * @param totalDocs see {@link #getTotalDocs()}
+ */
+ public void setTotalDocs(long totalDocs) {
+ this.totalDocs = totalDocs;
+ }
+
+ /**
+ *
+ * @param totalDocs add this value to {@link #totalDocs}
+ */
+ public void incrementTotalDocs(long totalDocs) {
+ this.totalDocs += totalDocs;
+ }
+
+ /**
+ * @return total number of documents in all indices
+ */
+ public long getTotalDocs() {
+ return totalDocs;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java
new file mode 100644
index 0000000..d1b0540
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java
@@ -0,0 +1,253 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.queries.BooleanFilter;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsets;
+import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsetsIterator;
+import org.apache.lucene.search.concordance.charoffsets.OffsetLengthStartComparator;
+import org.apache.lucene.search.concordance.charoffsets.OffsetUtil;
+import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
+import org.apache.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader;
+import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetsReader;
+import org.apache.lucene.search.spans.SimpleSpanQueryConverter;
+import org.apache.lucene.search.spans.SpanQuery;
+
+
+/**
+ * Searches an IndexReader and collects concordance windows via an
+ * AbstractConcordanceWindowCollector.
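+ *
+ * <p>A typical usage might look like this (the field name "content" and
+ * the maximum window count are illustrative):
+ * <pre class="prettyprint">
+ *   ConcordanceSearcher searcher = new ConcordanceSearcher();
+ *   ConcordanceWindowCollector collector = new ConcordanceWindowCollector(1000);
+ *   searcher.search(reader, "content", query, null, analyzer, collector);
+ *   for (ConcordanceWindow window : collector.getSortedWindows()) {
+ *     // render window.getPre(), window.getTarget(), window.getPost()
+ *   }
+ * </pre>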
+ */
+public class ConcordanceSearcher {
+
+ /**
+ * Allow overlapping targets in hits, default = false
+ */
+ private boolean allowTargetOverlaps = false;
+
+ private WindowBuilder windowBuilder;
+
+ private SimpleSpanQueryConverter spanQueryConverter;
+
+ /**
+ * Constructor with default WindowBuilder and SimpleSpanQueryConverter
+ */
+ public ConcordanceSearcher() {
+ this(new WindowBuilder(), new SimpleSpanQueryConverter());
+ }
+
+ /**
+ * Constructor with the given WindowBuilder and a default SimpleSpanQueryConverter
+ * @param windowBuilder builder to use for windows
+ */
+ public ConcordanceSearcher(WindowBuilder windowBuilder) {
+ this(windowBuilder, new SimpleSpanQueryConverter());
+ }
+
+ /**
+ * Constructor for windowBuilder and converter
+ * @param windowBuilder windowBuilder to use to build windows
+ * @param converter converter to use to convert Query to SpanQuery
+ */
+ public ConcordanceSearcher(WindowBuilder windowBuilder,
+ SimpleSpanQueryConverter converter) {
+ this.windowBuilder = windowBuilder;
+ this.spanQueryConverter = converter;
+ }
+
+
+ /**
+ *
+ * @param reader reader to search
+ * @param fieldName field to build the windows on
+ * @param query if SpanQuery, this gets passed through as is. If a regular Query, the
+ * Query is first converted to a SpanQuery and the filter is modified
+ * to include the original Query.
+ * @param filter filter to apply during document retrieval; value can be null
+ * @param analyzer analyzer to use for (re)calculating character offsets and for normalizing
+ * the sort keys
+ * @param collector collector to process (and store) the windows
+ * @throws org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException
+ * @throws IllegalArgumentException
+ * @throws java.io.IOException
+ */
+ public void search(IndexReader reader, String fieldName, Query query,
+ Filter filter, Analyzer analyzer, AbstractConcordanceWindowCollector collector)
+ throws TargetTokenNotFoundException, IllegalArgumentException,
+ IOException {
+ if (query == null) {
+ return;
+ }
+ if (query instanceof SpanQuery) {
+ // pass through
+ searchSpan(reader, (SpanQuery)query, filter, analyzer, collector);
+ } else {
+ // convert regular query to a SpanQuery.
+ SpanQuery spanQuery = spanQueryConverter.convert(fieldName, query);
+
+ Filter origQueryFilter = new QueryWrapperFilter(query);
+ Filter updatedFilter = origQueryFilter;
+
+ if (filter != null) {
+ BooleanFilter combinedFilter = new BooleanFilter();
+ combinedFilter.add(origQueryFilter, BooleanClause.Occur.MUST);
+ combinedFilter.add(filter, BooleanClause.Occur.MUST);
+ updatedFilter = combinedFilter;
+ }
+ searchSpan(reader, spanQuery, updatedFilter, analyzer, collector);
+ }
+ }
+
+ /**
+ * Like
+ * {@link #search(org.apache.lucene.index.IndexReader, String, org.apache.lucene.search.Query, org.apache.lucene.search.Filter, org.apache.lucene.analysis.Analyzer, AbstractConcordanceWindowCollector)}
+ * but this takes a SpanQuery
+ *
+ * @param reader reader to search
+ * @param spanQuery query to use to identify the targets
+ * @param filter filter for document retrieval
+ * @param analyzer to re-analyze terms for window calculations and sort key building
+ * @param collector to process (and store) the results
+ * @throws org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException
+ * @throws IllegalArgumentException
+ * @throws java.io.IOException
+ */
+ public void searchSpan(IndexReader reader,
+ SpanQuery spanQuery,
+ Filter filter, Analyzer analyzer, AbstractConcordanceWindowCollector collector)
+ throws TargetTokenNotFoundException, IllegalArgumentException,
+ IOException {
+
+ spanQuery = (SpanQuery)spanQuery.rewrite(reader);
+ DocTokenOffsetsIterator itr = new DocTokenOffsetsIterator();
+ Set<String> fields = new HashSet<>(
+ windowBuilder.getFieldSelector());
+ fields.add(spanQuery.getField());
+ itr.reset(spanQuery, filter, reader, fields);
+ buildResults(itr, reader, spanQuery.getField(), analyzer, collector);
+
+ }
+
+ private void buildResults(DocTokenOffsetsIterator itr,
+ IndexReader reader, String fieldName, Analyzer analyzer, AbstractConcordanceWindowCollector collector)
+ throws IllegalArgumentException, TargetTokenNotFoundException,
+ IOException {
+
+ collector.setTotalDocs(reader.numDocs());
+ TokenCharOffsetRequests requests = new TokenCharOffsetRequests();
+
+ TokenCharOffsetsReader tokenOffsetsRecordReader =
+ new ReanalyzingTokenCharOffsetsReader(analyzer);
+
+ RandomAccessCharOffsetContainer offsetResults = new RandomAccessCharOffsetContainer();
+ DocTokenOffsets result = null;
+ OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator();
+ boolean stop = false;
+ while (itr.next() && !stop) {
+ result = itr.getDocTokenOffsets();
+ StoredDocument document = result.getDocument();
+
+ String[] fieldValues = document.getValues(fieldName);
+
+ if (fieldValues == null || fieldValues.length == 0) {
+ throwMissingField(document);
+ }
+ Map<String, String> metadata = windowBuilder.extractMetadata(document);
+ String docId = windowBuilder.getUniqueDocumentId(document, result.getUniqueDocId());
+
+ List<OffsetAttribute> tokenOffsets = result.getOffsets();
+ if (! allowTargetOverlaps) {
+ // remove overlapping hits!!!
+ tokenOffsets = OffsetUtil.removeOverlapsAndSort(tokenOffsets,
+ offsetLengthStartComparator, null);
+ }
+
+ //clear then get new requests
+ requests.clear();
+ ConcordanceSearcherUtil.getCharOffsetRequests(tokenOffsets,
+ windowBuilder.getTokensBefore(), windowBuilder.getTokensAfter(), requests);
+
+ offsetResults.clear();
+
+ tokenOffsetsRecordReader.getTokenCharOffsetResults(
+ document, fieldName, requests, offsetResults);
+
+ for (OffsetAttribute offset : tokenOffsets) {
+
+ ConcordanceWindow w = windowBuilder.buildConcordanceWindow(
+ docId, offset.startOffset(),
+ offset.endOffset() - 1, fieldValues,
+ offsetResults, metadata);
+
+ collector.collect(w);
+ if (collector.getHitMax()) {
+ stop = true;
+ break;
+ }
+ }
+ }
+ }
+
+ /**
+ * Spans can overlap: a search for ["ab cd" "ab"] would have
+ * two spans on the string "ab cd" if this is set to true.
+ * If this is set to false, this will return the longest span
+ * that appears earliest in the string if there is overlap.
+ *
+ * @param allowTargetOverlaps are targets allowed to overlap.
+ */
+ public void setAllowTargetOverlaps(boolean allowTargetOverlaps) {
+ this.allowTargetOverlaps = allowTargetOverlaps;
+ }
+
+ private void throwMissingField(StoredDocument document) throws IllegalArgumentException {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Did you forget to load or specify the correct content field?!");
+ sb.append("\n");
+ sb.append("I only see these fields:\n");
+
+ for (StorableField f : document.getFields()) {
+ sb.append(f.name()).append("\n");
+ }
+ throw new IllegalArgumentException(sb.toString());
+ }
+
+ /**
+ * Set the converter to use to convert a Query to a SpanQuery.
+ * The need for this will go away when LUCENE-2878 is completed.
+ * @param converter converter from Query to SpanQuery
+ */
+ public void setSpanQueryConverter(SimpleSpanQueryConverter converter){
+ this.spanQueryConverter = converter;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java
new file mode 100644
index 0000000..83a331d
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java
@@ -0,0 +1,59 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests;
+
+/**
+ * In other applications with variations on the ConcordanceSearcher, it has been
+ * useful to factor out the getCharOffsetRequests.
+ *
+ * This class should be used for functionality that is generally useful for
+ * concordance searching.
+ *
+ */
+public class ConcordanceSearcherUtil {
+
+
+ /**
+ * Simple utility method to build a TokenCharOffsetRequests object
+ * from a list of desired tokenOffsets, the number of tokensBefore
+ * and the number of tokensAfter.
+ *
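+ * <p>For example (illustrative): a single-token target at token offset 5,
+ * with tokensBefore = 2 and tokensAfter = 2, results in requests for
+ * token offsets 3 through 7, inclusive.
+ *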
+ * @param tokenOffsets the tokenOffsets that are desired
+ * @param tokensBefore the number of tokens before a desired tokenOffset
+ * @param tokensAfter the number of tokens after a desired tokenOffset
+ * @param requests the TokenCharOffsetRequests object to populate
+ */
+ public static void getCharOffsetRequests(
+ List<OffsetAttribute> tokenOffsets,
+ int tokensBefore, int tokensAfter,
+ TokenCharOffsetRequests requests) {
+
+ for (OffsetAttribute tokenOffset : tokenOffsets) {
+ int start = tokenOffset.startOffset() - tokensBefore;
+ start = (start < 0) ? 0 : start;
+ int end = tokenOffset.endOffset() + tokensAfter+1;
+ for (int i = start; i < end; i++) {
+ requests.add(i);
+ }
+ }
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortKey.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortKey.java
new file mode 100644
index 0000000..a599a7f
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortKey.java
@@ -0,0 +1,61 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Simple comparable class to allow for subclassing.
+ *
+ */
+public class ConcordanceSortKey implements Comparable<ConcordanceSortKey> {
+
+ private final String concSortString;
+
+ public ConcordanceSortKey(String s) {
+ this.concSortString = s;
+ }
+
+ @Override
+ public int compareTo(ConcordanceSortKey other) {
+ return concSortString.compareTo(other.concSortString);
+ }
+
+ @Override
+ public int hashCode() {
+ return concSortString.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (!(obj instanceof ConcordanceSortKey))
+ return false;
+ ConcordanceSortKey other = (ConcordanceSortKey) obj;
+ if (concSortString == null) {
+ if (other.concSortString != null)
+ return false;
+ } else if (!concSortString.equals(other.concSortString))
+ return false;
+ return true;
+ }
+
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java
new file mode 100644
index 0000000..85d3c15
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java
@@ -0,0 +1,32 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Options for sorting ConcordanceWindows
+ *
+ */
+public enum ConcordanceSortOrder {
+ PRE, //sort on the first token before the target, then the second word, etc.
+ POST, //sort on words after the target
+ TARGET_PRE, //sort on the target and then words before the target
+ TARGET_POST, //sort on the target and then words after the target
+ DOC, //sort on the Lucene document id
+ NONE //no sort
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java
new file mode 100644
index 0000000..6732e68
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java
@@ -0,0 +1,32 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Comparator;
+
+
+/**
+ * Sorts ConcordanceWindows by their sort keys.
+ */
+public class ConcordanceSorter implements Comparator<ConcordanceWindow> {
+ private static final long serialVersionUID = 7526472295622776147L;
+
+ @Override
+ public int compare(ConcordanceWindow w1, ConcordanceWindow w2) {
+
+ return w1.getSortKey().compareTo(w2.getSortKey());
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java
new file mode 100644
index 0000000..0cebcb6
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java
@@ -0,0 +1,181 @@
+package org.apache.lucene.search.concordance;
+
+import java.util.Map;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Key element in a concordance view of data. A window consists of the words
+ * before a target term (pre), the target term and then the words after the
+ * target term (post). A window also has a sort key to allow for various methods
+ * of sorting.
+ *
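+ * For example, in the text "the quick brown fox" with target "brown",
+ * pre is "the quick" and post is "fox".
+ *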
+ * For various applications, it has also been useful to store a unique document key,
+ * character offset (start and end) of the full
+ * window as well as metadata from the document for the given window.
+ *
+ * This class is experimental and may change in incompatible ways in the future.
+ *
+ * Areas for improvement:
+ * 1) convert sortKey to an array of Comparables
+ * 2) ...
+ */
+public class ConcordanceWindow {
+
+ private final ConcordanceSortKey sortKey;
+ private final String pre;
+ private final String target;
+ private final String post;
+ private final int charStart;
+ private final int charEnd;
+ private final String uniqueDocID;
+ //used by hide duplicates to count more than one occurrence of a window
+ private int count = 1;
+ private Map<String, String> metadata;
+
+ /**
+ *
+ * @param uniqueDocID string representing what should be a unique document identifier
+ * @param charStart character offset start for the window
+ * @param charEnd character offset end for the window
+ * @param pre words before the target in reading order and unanalyzed
+ * @param target target string
+ * @param post string after the target in reading order and unanalyzed
+ * @param sortKey key to use for sorting this window
+ * @param metadata metadata to store with this window
+ */
+ public ConcordanceWindow(String uniqueDocID, int charStart, int charEnd, String pre,
+ String target, String post, ConcordanceSortKey sortKey, Map<String, String> metadata) {
+ this.pre = pre;
+ this.target = target;
+ this.post = post;
+ this.uniqueDocID = uniqueDocID;
+ this.charStart = charStart;
+ this.charEnd = charEnd;
+ this.metadata = metadata;
+ this.sortKey = sortKey;
+ }
+
+ public String getUniqueDocID() {
+ return uniqueDocID;
+ }
+
+ public int getStart() {
+ return charStart;
+ }
+
+ public int getEnd() {
+ return charEnd;
+ }
+
+ public Map<String, String> getMetadata() {
+ return metadata;
+ }
+
+ public String getPre() {
+ return pre;
+ }
+
+ public String getPost() {
+ return post;
+ }
+
+ public String getTarget() {
+ return target;
+ }
+
+ public int getCount() {
+ return count;
+ }
+
+ public void incrementCount() {
+ count++;
+ }
+
+ public void setCount(int count) {
+ this.count = count;
+ }
+
+ public int getSize() {
+ int size = 0;
+ if (pre != null) {
+ size += pre.length();
+ }
+ if (target != null) {
+ size += target.length();
+ }
+ if (post != null) {
+ size += post.length();
+ }
+ return size;
+ }
+
+ public ConcordanceSortKey getSortKey() {
+ return sortKey;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ ConcordanceWindow that = (ConcordanceWindow) o;
+
+ if (charEnd != that.charEnd) return false;
+ if (charStart != that.charStart) return false;
+ if (count != that.count) return false;
+ if (metadata != null ? !metadata.equals(that.metadata) : that.metadata != null) return false;
+ if (post != null ? !post.equals(that.post) : that.post != null) return false;
+ if (pre != null ? !pre.equals(that.pre) : that.pre != null) return false;
+ if (sortKey != null ? !sortKey.equals(that.sortKey) : that.sortKey != null) return false;
+ if (target != null ? !target.equals(that.target) : that.target != null) return false;
+ if (uniqueDocID != null ? !uniqueDocID.equals(that.uniqueDocID) : that.uniqueDocID != null) return false;
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = sortKey != null ? sortKey.hashCode() : 0;
+ result = 31 * result + (pre != null ? pre.hashCode() : 0);
+ result = 31 * result + (target != null ? target.hashCode() : 0);
+ result = 31 * result + (post != null ? post.hashCode() : 0);
+ result = 31 * result + charStart;
+ result = 31 * result + charEnd;
+ result = 31 * result + (uniqueDocID != null ? uniqueDocID.hashCode() : 0);
+ result = 31 * result + count;
+ result = 31 * result + (metadata != null ? metadata.hashCode() : 0);
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "ConcordanceWindow{" +
+ "sortKey=" + sortKey +
+ ", pre='" + pre + '\'' +
+ ", target='" + target + '\'' +
+ ", post='" + post + '\'' +
+ ", charStart=" + charStart +
+ ", charEnd=" + charEnd +
+ ", uniqueDocID='" + uniqueDocID + '\'' +
+ ", count=" + count +
+ ", metadata=" + metadata +
+ '}';
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindowCollector.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindowCollector.java
new file mode 100644
index 0000000..66d9d32
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindowCollector.java
@@ -0,0 +1,53 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Basic collector that stores each collected window in a list.
+ */
+public class ConcordanceWindowCollector extends AbstractConcordanceWindowCollector {
+
+ private List<ConcordanceWindow> windows = new ArrayList<>();
+
+ public ConcordanceWindowCollector(int maxWindows) {
+ super(maxWindows);
+ }
+
+ @Override
+ public void collect(ConcordanceWindow w) {
+ if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL
+ && windows.size() >= getMaxWindows()) {
+ setHitMax(true);
+ return;
+ }
+ windows.add(w);
+ addDocId(w.getUniqueDocID());
+ }
+
+ @Override
+ public int size() {
+ return windows.size();
+ }
+
+ @Override
+ public List<ConcordanceWindow> getWindows() {
+ return windows;
+ }
+
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DedupingConcordanceWindowCollector.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DedupingConcordanceWindowCollector.java
new file mode 100644
index 0000000..660c3f7
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DedupingConcordanceWindowCollector.java
@@ -0,0 +1,103 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Like ConcordanceWindowCollector, but this collector
+ * doesn't store duplicate windows. Windows are defined as duplicates by
+ * {@link #buildEqualityKey(ConcordanceWindow, StringBuilder)}.
+ *
+ */
+public class DedupingConcordanceWindowCollector extends AbstractConcordanceWindowCollector {
+
+ Map<String, ConcordanceWindow> map = new HashMap<>();
+ private StringBuilder sb = new StringBuilder();
+
+ /**
+ *
+ * @param maxHits maximum number of windows to store. The collector could
+ * potentially visit many more windows than maxHits.
+ */
+ public DedupingConcordanceWindowCollector(int maxHits) {
+ super(maxHits);
+ }
+
+ @Override
+ public void collect(ConcordanceWindow w) {
+ if (getHitMax()) {
+ return;
+ }
+ buildEqualityKey(w, sb);
+ String key = sb.toString();
+ ConcordanceWindow oldWindow = map.get(key);
+ if (oldWindow == null) {
+ //we would have added a new window here
+ if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL &&
+ map.size() >= getMaxWindows()) {
+ setHitMax(true);
+ return;
+ }
+ oldWindow = w;
+ } else {
+ //if the old window existed (i.e. new window is a duplicate)
+ //keep incrementing the count
+ oldWindow.incrementCount();
+ }
+
+ map.put(key, oldWindow);
+ }
+
+
+ /**
+ * number of windows collected
+ */
+ @Override
+ public int size() {
+ return map.size();
+ }
+
+ @Override
+ public List<ConcordanceWindow> getWindows() {
+ List<ConcordanceWindow> windows = new ArrayList<>();
+ windows.addAll(map.values());
+ return windows;
+ }
+
+ /**
+ * Public for easy overriding. Generate a key to be used to determine
+ * whether two windows are the same. Some implementations
+ * might want to lowercase, some might want genuine case folding,
+ * some might want to strip non-alphanumerics, etc.
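+ * <p>For example (a sketch), a subclass that also wanted to ignore
+ * runs of whitespace could normalize each component with
+ * toLowerCase().replaceAll("\\s+", " ") before appending it.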
+ *
+ * @param w ConcordanceWindow
+ * @param sb reusable StringBuilder; sb.setLength(0) is called before use!
+ */
+ public void buildEqualityKey(ConcordanceWindow w, StringBuilder sb) {
+ sb.setLength(0);
+ sb.append(w.getPre().toLowerCase());
+ sb.append(">>>");
+ sb.append(w.getTarget().toLowerCase());
+ sb.append("<<<");
+ sb.append(w.getPost().toLowerCase());
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DefaultSortKeyBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DefaultSortKeyBuilder.java
new file mode 100644
index 0000000..275a55c
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DefaultSortKeyBuilder.java
@@ -0,0 +1,149 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
+
+/**
+ * Builds basic sort key for the values available in ConcordanceSortOrder
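+ *
+ * <p>For example (illustrative), for the window "the quick [brown fox] jumped over",
+ * ConcordanceSortOrder.PRE builds a key from the tokens before the target in
+ * right-to-left order ("quick the"), while POST builds it from the tokens
+ * after the target in reading order ("jumped over").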
+ */
+public class DefaultSortKeyBuilder implements SortKeyBuilder {
+
+ private final static String SPACE = " ";
+ private final static String EMPTY_STRING = "";
+ //what filler to use when a "term" comes back as null from the
+ //TokenCharOffsetResults
+ private static String NULL_FILLER = "";
+ private final ConcordanceSortOrder sortOrder;
+
+ /**
+ * Calls {@link #DefaultSortKeyBuilder(ConcordanceSortOrder)}
+ * with value of: ConcordanceSortOrder.PRE
+ */
+ public DefaultSortKeyBuilder() {
+ this(ConcordanceSortOrder.PRE);
+ }
+
+ /**
+ *
+ * @param sortOrder sort order to use
+ */
+ public DefaultSortKeyBuilder(ConcordanceSortOrder sortOrder) {
+ this.sortOrder = sortOrder;
+ }
+
+ @Override
+ public ConcordanceSortKey buildKey(String docKey,
+ int startTargetTokenOffset,
+ int endTargetTokenOffset,
+ RandomAccessCharOffsetContainer charOffsets,
+ int tokensBefore, int tokensAfter,
+ Map<String, String> metadata) {
+
+ if (sortOrder == ConcordanceSortOrder.NONE) {
+ return new ConcordanceSortKey(EMPTY_STRING);
+ }
+
+ if (sortOrder == ConcordanceSortOrder.DOC) {
+ int targCharStart = charOffsets.getCharacterOffsetStart(startTargetTokenOffset);
+ return new DocumentOrderSortKey(docKey, targCharStart);
+ }
+
+ StringBuilder sb = new StringBuilder();
+ //order is important for appending to sb, target must come before pre/post
+ if (sortOrder == ConcordanceSortOrder.TARGET_POST
+ || sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+
+ for (int i = startTargetTokenOffset; i <= endTargetTokenOffset; i++) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ } else {
+ sb.append(NULL_FILLER);
+ }
+ }
+ }
+
+ if (sortOrder == ConcordanceSortOrder.PRE
+ || sortOrder == ConcordanceSortOrder.TARGET_PRE) {
+ int tmpStart = startTargetTokenOffset - 1;
+ int tmpEnd = Math.max(0, startTargetTokenOffset - tokensBefore);
+ if (tmpStart < 0) {
+ sb.append(SPACE);
+ }
+
+ for (int i = tmpStart; i >= tmpEnd; i--) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ } else {
+ sb.append(NULL_FILLER);
+ }
+ }
+
+ } else if (sortOrder == ConcordanceSortOrder.POST
+ || sortOrder == ConcordanceSortOrder.TARGET_POST) {
+
+ int tmpStart = endTargetTokenOffset + 1;
+ int tmpEnd = Math.min(charOffsets.getLast(), endTargetTokenOffset+tokensAfter);
+
+ if (tmpStart > charOffsets.getLast()) {
+ sb.append(SPACE);
+ }
+ for (int i = tmpStart; i <= tmpEnd; i++) {
+ String tmp = charOffsets.getTerm(i);
+ if (tmp != null && tmp.length() > 0) {
+ sb.append(tmp).append(SPACE);
+ } else {
+ sb.append(NULL_FILLER);
+ }
+ }
+ }
+ return new ConcordanceSortKey(sb.toString().trim());
+ }
+
+ @Override
+ public boolean requiresAnalysisOfPre() {
+ return sortOrder == ConcordanceSortOrder.PRE
+ || sortOrder == ConcordanceSortOrder.TARGET_PRE;
+ }
+
+ @Override
+ public boolean requiresAnalysisOfPost() {
+ return sortOrder == ConcordanceSortOrder.POST
+ || sortOrder == ConcordanceSortOrder.TARGET_POST;
+ }
+
+ @Override
+ public boolean requiresAnalysisOfTarget() {
+ return sortOrder == ConcordanceSortOrder.TARGET_PRE
+ || sortOrder == ConcordanceSortOrder.TARGET_POST;
+ }
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DocIdBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DocIdBuilder.java
new file mode 100644
index 0000000..8772b2a
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocIdBuilder.java
@@ -0,0 +1,30 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Builds a unique String key for a document from the StoredDocument
+ * and its Lucene doc id.
+ */
+public interface DocIdBuilder {
+ public String build(StoredDocument document, long docId);
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DocMetadataExtractor.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DocMetadataExtractor.java
new file mode 100644
index 0000000..6f6216f
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocMetadataExtractor.java
@@ -0,0 +1,47 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Simple interface for a component that extracts metadata from
+ * a document to be stored with a ConcordanceWindow
+ *
+ */
+public interface DocMetadataExtractor {
+ /**
+ *
+ * @return the fields that need to be retrieved for the document
+ * for proper processing
+ */
+ public Set<String> getFieldSelector();
+
+ /**
+ *
+ * @param document to be processed for metadata. Only those fields
+ * that were returned by {@link #getFieldSelector()} will be loaded
+ * in the document
+ * @return document metadata to be stored with each window
+ */
+ public Map<String, String> extract(StoredDocument document);
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentOrderSortKey.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentOrderSortKey.java
new file mode 100644
index 0000000..ed16cbc
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentOrderSortKey.java
@@ -0,0 +1,47 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This sorts alphabetically on the document key
+ * and then numerically on the targetCharStart
+ *
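+ * For example, windows from docKey "doc1" sort before those from "doc2",
+ * and within "doc1" a window whose target starts at character 10 sorts
+ * before one whose target starts at character 40.
+ *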
+ */
+public class DocumentOrderSortKey extends ConcordanceSortKey{
+
+ protected final int targetCharStart;
+
+ public DocumentOrderSortKey(String docKey, int targetCharStart) {
+ super(docKey);
+ this.targetCharStart = targetCharStart;
+ }
+
+ @Override
+ public int compareTo(ConcordanceSortKey o) {
+ if (o instanceof DocumentOrderSortKey) {
+ DocumentOrderSortKey other = (DocumentOrderSortKey)o;
+ int cmp = super.compareTo(o);
+ if (cmp == 0) {
+ return Integer.compare(targetCharStart, other.targetCharStart);
+ }
+ return cmp;
+ } else {
+ return super.compareTo(o);
+ }
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/FieldBasedDocIdBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/FieldBasedDocIdBuilder.java
new file mode 100644
index 0000000..75d70fd
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/FieldBasedDocIdBuilder.java
@@ -0,0 +1,69 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Simple class that grabs the stringValue() of a specified
+ * field to use as a document's unique key for the ConcordanceWindow
+ * building process.
+ *
+ * Note that this takes only the first value of the field.
+ * If a multi-valued field is selected, surprises might happen.
+ *
+ * Also, note that if the field is not found, this returns
+ * a string representation of the ephemeral Lucene docId.
+ *
+ * Some users might want to throw an exception instead of this behavior.
+ *
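+ * <p>For example (illustrative), new FieldBasedDocIdBuilder("id") keys
+ * each window on the stored "id" field of its source document.
+ *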
+ */
+public class FieldBasedDocIdBuilder implements DocIdBuilder {
+
+ private final String fieldName;
+
+ /**
+ *
+ * @param fieldName name of field to be used as a document's unique key
+ */
+ public FieldBasedDocIdBuilder(String fieldName) {
+ this.fieldName = fieldName;
+ }
+
+ @Override
+ public String build(StoredDocument d, long docId) {
+ StorableField field = d.getField(fieldName);
+ if (field == null) {
+ return Long.toString(docId);
+ }
+ return field.stringValue();
+ }
+ /**
+ * Returns the fields to load from the document; factored out
+ * (rather than hard-coding getField(fieldName)) to allow for extension.
+ * @return fields to grab from the document
+ */
+ public Set<String> getFields() {
+ Set<String> fields = new HashSet<>();
+ fields.add(fieldName);
+ return fields;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/IndexIdDocIdBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/IndexIdDocIdBuilder.java
new file mode 100644
index 0000000..ae8e490
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/IndexIdDocIdBuilder.java
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Simple id builder based on ephemeral Lucene doc ids.
+ * Use this only if your documents do not have a unique key.
+ * Then, use only with great care.
+ *
+ */
+public class IndexIdDocIdBuilder implements DocIdBuilder {
+
+ @Override
+ public String build(StoredDocument d, long docId) {
+ return Long.toString(docId);
+ }
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/SimpleDocMetadataExtractor.java lucene/concordance/src/java/org/apache/lucene/search/concordance/SimpleDocMetadataExtractor.java
new file mode 100644
index 0000000..6b42410
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/SimpleDocMetadataExtractor.java
@@ -0,0 +1,65 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Simple class that returns a map of key value pairs
+ * for the fields specified by {@link #setFieldSelector(java.util.Set)}.
+ *
+ * For multi-valued fields, this will take only the first value.
+ *
+ */
+public class SimpleDocMetadataExtractor implements DocMetadataExtractor {
+
+ private Set<String> fields = new HashSet<>();
+
+ public void setFieldSelector(Set<String> f) {
+ fields.clear();
+ for (String s : f) {
+ fields.add(s);
+ }
+ }
+
+ @Override
+ public Set<String> getFieldSelector() {
+ return Collections.unmodifiableSet(fields);
+ }
+
+ @Override
+ public Map<String, String> extract(StoredDocument d) {
+ Map<String, String> map = new HashMap<>();
+ // only takes the first value in a multi-valued field!!!
+ for (String fieldName : getFieldSelector()) {
+ String[] fieldValues = d.getValues(fieldName);
+
+ if (fieldValues != null && fieldValues.length > 0) {
+ map.put(fieldName, fieldValues[0]);
+ }
+ }
+ return map;
+ }
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/SortKeyBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/SortKeyBuilder.java
new file mode 100644
index 0000000..d8ff324
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/SortKeyBuilder.java
@@ -0,0 +1,46 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Map;
+
+import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
+
+public interface SortKeyBuilder {
+
+ /**
+ * Builds a sort key from the RandomAccessCharOffsetContainer's offset information
+ * @param docKey to be used if sorting by document key
+ * @param startTargetTokenOffset target start offset
+ * @param endTargetTokenOffset target end offset
+ * @param charOffsets charOffsets to use for lookup
+ * @param numTokensPre number of tokens to include before target
+ * @param numTokensPost number of tokens to include after target
+ * @param metadata metadata to use
+ * @return ConcordanceSortKey
+ */
+ ConcordanceSortKey buildKey(String docKey,
+ int startTargetTokenOffset, int endTargetTokenOffset,
+ RandomAccessCharOffsetContainer charOffsets,
+ int numTokensPre, int numTokensPost, Map<String, String> metadata);
+
+ public boolean requiresAnalysisOfPre();
+ public boolean requiresAnalysisOfPost();
+ public boolean requiresAnalysisOfTarget();
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java
new file mode 100644
index 0000000..ac59bd5
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java
@@ -0,0 +1,241 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
+import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;
+
+
+/**
+ *
+ * Builds a ConcordanceWindow.
+ *
+ * This class includes basic functionality for building a window from token offsets.
+ *
+ * It also calls three other components:
+ *
+ * <ul>
+ * <li>DocIdBuilder - extracts or builds a unique key for each document</li>
+ * <li>DocMetadataExtractor - extracts metadata from a document to be stored with each window</li>
+ * <li>SortKeyBuilder - builds a window's sort key</li>
+ * </ul>
+ *
+ *
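+ * <p>A hypothetical configuration (names illustrative):
+ * <pre class="prettyprint">
+ *   WindowBuilder wb = new WindowBuilder(5, 5,
+ *       analyzer.getOffsetGap("content"),
+ *       new DefaultSortKeyBuilder(ConcordanceSortOrder.TARGET_PRE),
+ *       new SimpleDocMetadataExtractor(),
+ *       new FieldBasedDocIdBuilder("id"));
+ *   ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ * </pre>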
+ */
+public class WindowBuilder {
+
+ private final static String INTER_MULTIVALUE_FIELD_PADDING = " | ";
+ private final static String EMPTY_STRING = "";
+
+ private final int tokensBefore;
+ private final int tokensAfter;
+ private final SortKeyBuilder sortKeyBuilder;
+ private final DocMetadataExtractor metadataExtractor;
+ private final DocIdBuilder docIdBuilder;
+ private final int offsetGap;
+
+ public WindowBuilder() {
+ this(
+ 10, //tokens before
+ 10, //tokens after
+ 0,
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE),
+ new SimpleDocMetadataExtractor(),
+ new IndexIdDocIdBuilder()
+ );
+ }
+
+ public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap) {
+ this(
+ tokensBefore,
+ tokensAfter,
+ offsetGap,
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE),
+ new SimpleDocMetadataExtractor(),
+ new IndexIdDocIdBuilder()
+ );
+
+ }
+
+ public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap, SortKeyBuilder sortKeyBuilder,
+ DocMetadataExtractor metadataExtractor, DocIdBuilder docIdBuilder) {
+ this.tokensBefore = tokensBefore;
+ this.tokensAfter = tokensAfter;
+ this.offsetGap = offsetGap;
+ this.sortKeyBuilder = sortKeyBuilder;
+ this.metadataExtractor = metadataExtractor;
+ this.docIdBuilder = docIdBuilder;
+ }
+
+ /**
+ * Makes the assumption that the target token start and target token end can
+ * be found. If not, this throws a TargetTokenNotFoundException.
+ * @param uniqueDocID ephemeral internal lucene unique document id
+ * @param targetTokenStart
+ * Target's start token
+ *
+ * @param targetTokenEnd
+ * Target's end token
+ * @param fieldValues field values
+ * @param offsets
+ * token/character offset lookup results for this document
+ * @param metadata
+ * Metadata to be stored with the window
+ * @return ConcordanceWindow
+ * @throws TargetTokenNotFoundException if character offset information cannot be
+ * found for the targetTokenStart or the targetTokenEnd
+ */
+ public ConcordanceWindow buildConcordanceWindow(String uniqueDocID,
+ int targetTokenStart, int targetTokenEnd,
+ String[] fieldValues,
+ RandomAccessCharOffsetContainer offsets, Map<String, String> metadata)
+ throws TargetTokenNotFoundException,
+ IllegalArgumentException {
+
+ if (targetTokenStart < 0 || targetTokenEnd < 0) {
+ throw new IllegalArgumentException(
+ "targetTokenStart and targetTokenEnd must be >= 0");
+ }
+ if (targetTokenEnd < targetTokenStart) {
+ throw new IllegalArgumentException(
+ "targetTokenEnd must be >= targetTokenStart");
+ }
+
+ int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart);
+ int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd);
+
+ if (targetCharStart < 0 ||
+ targetCharEnd < 0) {
+ throw new TargetTokenNotFoundException(
+ "couldn't find character offsets for a target token.\n"
+ + "Check that your analyzers are configured properly.\n");
+ }
+
+ OffsetAttribute preCharOffset = getPreCharOffset(targetTokenStart,
+ targetCharStart, offsets);
+ String preString = (preCharOffset == null) ? EMPTY_STRING :
+ SimpleAnalyzerUtil.substringFromMultiValuedFields(
+ preCharOffset.startOffset(), preCharOffset.endOffset(), fieldValues,
+ offsetGap, INTER_MULTIVALUE_FIELD_PADDING);
+
+ OffsetAttribute postCharOffset = getPostCharOffset(targetTokenEnd,
+ targetCharEnd, offsets);
+
+ String postString = (postCharOffset == null) ? EMPTY_STRING :
+ SimpleAnalyzerUtil.substringFromMultiValuedFields(
+ postCharOffset.startOffset(), postCharOffset.endOffset(), fieldValues,
+ offsetGap, INTER_MULTIVALUE_FIELD_PADDING);
+
+ String targString = SimpleAnalyzerUtil.substringFromMultiValuedFields(
+ targetCharStart, targetCharEnd, fieldValues,
+ offsetGap, INTER_MULTIVALUE_FIELD_PADDING);
+ ConcordanceSortKey sortKey = sortKeyBuilder.buildKey(uniqueDocID,
+ targetTokenStart, targetTokenEnd, offsets, tokensBefore, tokensAfter, metadata);
+
+ int charStart = (preCharOffset == null) ? targetCharStart :
+ preCharOffset.startOffset();
+
+ int charEnd = (postCharOffset == null) ? targetCharEnd : postCharOffset.endOffset();
+ return new ConcordanceWindow(uniqueDocID, charStart, charEnd, preString, targString,
+ postString, sortKey, metadata);
+
+ }
+
+
+ private OffsetAttribute getPreCharOffset(int targetTokenStart,
+ int targetCharStart,
+ RandomAccessCharOffsetContainer charOffsets) {
+ if (tokensBefore == 0)
+ return null;
+
+ if (targetTokenStart == 0) {
+ return null;
+ }
+ int contextTokenStart = Math.max(0,
+ targetTokenStart - tokensBefore);
+
+ int contextCharStart = charOffsets.getClosestCharStart(contextTokenStart, targetTokenStart);
+ //closest start wasn't actually found
+ //this can happen if there is a large posInc and the target
+ //lands at the start of a field index
+ if (contextCharStart < 0) {
+ return null;
+ }
+ int contextCharEnd = Math.max(contextCharStart, targetCharStart - 1);
+
+ return buildOffsetAttribute(contextCharStart, contextCharEnd);
+ }
+
+ private OffsetAttribute getPostCharOffset(int targetTokenEnd,
+ int targetCharEnd,
+ RandomAccessCharOffsetContainer charOffsets) {
+
+ if (tokensAfter == 0)
+ return null;
+
+ int contextTokenEnd = targetTokenEnd + tokensAfter;
+ int contextCharStart = targetCharEnd;
+ int contextCharEnd = charOffsets.getClosestCharEnd(
+ contextTokenEnd, targetTokenEnd + 1);
+
+ if (contextCharStart >= contextCharEnd) {
+ return null;
+ }
+ return buildOffsetAttribute(contextCharStart, contextCharEnd);
+ }
+
+ private OffsetAttribute buildOffsetAttribute(int start, int end) {
+ OffsetAttribute off = new OffsetAttributeImpl();
+ off.setOffset(start, end);
+ return off;
+ }
+
+
+ public Set<String> getFieldSelector() {
+ Set<String> set = metadataExtractor.getFieldSelector();
+ if (docIdBuilder instanceof FieldBasedDocIdBuilder) {
+ set.addAll(((FieldBasedDocIdBuilder)docIdBuilder).getFields());
+ }
+ return set;
+ }
+
+ /**
+ * Simple wrapper around metadataExtractor
+ * @param document from which to extract metadata
+ * @return map of metadata
+ */
+ public Map<String, String> extractMetadata(StoredDocument document) {
+ return metadataExtractor.extract(document);
+ }
+
+ public String getUniqueDocumentId(StoredDocument document, long docId) {
+ return docIdBuilder.build(document, docId);
+ }
+
+ public int getTokensBefore() {
+ return tokensBefore;
+ }
+
+ public int getTokensAfter() {
+ return tokensAfter;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java
new file mode 100644
index 0000000..82f6f03
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java
@@ -0,0 +1,91 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * Simple class to store a document id (leaf and unique), a StoredDocument, and the offsets
+ * for a SpanQuery hit
+ *
+ */
+
+public class DocTokenOffsets {
+ private int leafDocId = -1;
+ private int uniqueId = -1;
+ private StoredDocument document = null;
+ private List<OffsetAttribute> offsets = new ArrayList<>();
+
+ public void setDocument(StoredDocument d) {
+ this.document = d;
+ }
+ public void addOffset(int start, int end) {
+ OffsetAttributeImpl offset = new OffsetAttributeImpl();
+ offset.setOffset(start, end);
+ offsets.add(offset);
+ }
+
+ public void reset(int base, int leafDocId, StoredDocument d, int start, int end) {
+ this.leafDocId = leafDocId;
+ this.uniqueId = base+leafDocId;
+ setDocument(d);
+ offsets.clear();
+ addOffset(start,end);
+ }
+
+ public List<OffsetAttribute> getOffsets() {
+ return offsets;
+ }
+
+ public StoredDocument getDocument() {
+ return document;
+ }
+
+ public int getLeafDocId() {
+ return leafDocId;
+ }
+
+ public int getUniqueDocId() {
+ return uniqueId;
+ }
+
+ public DocTokenOffsets deepishCopy() {
+ DocTokenOffsets copy = new DocTokenOffsets();
+ copy.leafDocId = leafDocId;
+ copy.uniqueId = uniqueId;
+ copy.document = document;
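+ // the StoredDocument and the OffsetAttribute elements are shared rather
+ // than cloned -- hence "deepish" rather than "deep"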
+ List<OffsetAttribute> copyOffsets = new ArrayList<>();
+ copyOffsets.addAll(offsets);
+ copy.offsets = copyOffsets;
+ return copy;
+ }
+
+ public boolean isEmpty() {
+ return leafDocId < 0;
+ }
+
+ public void pseudoEmpty() {
+ leafDocId = -1;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java
new file mode 100644
index 0000000..690b0e6
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java
@@ -0,0 +1,163 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Scaffolding/Sugar class around SpanQuery.getSpans(...). This allows the client
+ * to iterate on an IndexReader (not necessarily a leaf) by document (DocTokenOffsets).
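+ * <p>
+ * Typical use (illustrative; spanQuery, filter, reader and fields are assumed
+ * to be supplied by the caller):
+ * <pre>
+ * DocTokenOffsetsIterator it = new DocTokenOffsetsIterator();
+ * it.reset(spanQuery, filter, reader, fields);
+ * while (it.next()) {
+ * DocTokenOffsets offsets = it.getDocTokenOffsets();
+ * }
+ * </pre>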
+ */
+public class DocTokenOffsetsIterator {
+ /*
+ * NOT THREAD SAFE!!!
+ */
+ private SpanQuery spanQuery;
+ private Filter filter;
+ private LinkedList<LeafReaderContext> leafReaders = new LinkedList<>();
+ private LeafReader currReader = null;
+ private Set<String> fields;
+ private Spans spans = null;
+ private DocTokenOffsets docTokenOffsets = new DocTokenOffsets();
+ private DocTokenOffsets docTokenOffsetsBuffer = new DocTokenOffsets();
+ private int currentBase = -1;
+
+ private Map<Term, TermContext> termMap = new HashMap<>();
+
+ public DocTokenOffsetsIterator() {
+ }
+
+ public void reset(SpanQuery q, Filter f, IndexReader reader, Set<String> fields) throws IOException {
+
+ this.spanQuery = q;
+ this.filter = f;
+
+ this.fields = fields;
+ leafReaders.addAll(reader.leaves());
+ if (leafReaders.size() > 0) {
+ reinitSpans();
+ }
+ }
+
+ public boolean next() throws IOException {
+
+ if (spans == null || docTokenOffsetsBuffer.isEmpty()) {
+ if (leafReaders.size() == 0) {
+ return false;
+ } else if (!reinitSpans()) {
+ return false;
+ }
+
+ }
+ boolean currSpansHasMore = false;
+ while (spans.next()) {
+ if (spans.doc() == docTokenOffsetsBuffer.getLeafDocId()) {
+ docTokenOffsetsBuffer.addOffset(spans.start(), spans.end());
+ } else {
+ currSpansHasMore = true;
+ break;
+ }
+ }
+ docTokenOffsets = docTokenOffsetsBuffer.deepishCopy();
+
+ if (currSpansHasMore) {
+ StoredDocument d = currReader.document(spans.doc(), fields);
+ docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end());
+ } else {
+ docTokenOffsetsBuffer.pseudoEmpty();
+ }
+ return true;
+ }
+
+ public DocTokenOffsets getDocTokenOffsets() {
+ return docTokenOffsets;
+ }
+
+ private boolean reinitSpans() throws IOException {
+ //must check that leafReaders.size() > 0 before running reinitSpans!!!
+ LeafReaderContext ctx = leafReaders.pop();
+ currentBase = ctx.docBase;
+ currReader = ctx.reader();
+ Bits bits = null;
+ Bits liveBits = currReader.getLiveDocs();
+ //liveBits can be null if all of the docs are live!!!
+ if (filter == null) {
+ bits = liveBits;
+ } else {
+ DocIdSet idSet = filter.getDocIdSet(ctx, liveBits);
+
+/* only works in 5.x. branch, not trunk
+ if (idSet instanceof FixedBitSet) {
+ bits = (FixedBitSet)idSet;
+ } else {*/
+ DocIdSetIterator itr = idSet.iterator();
+ if (itr != null) {
+ FixedBitSet tmpBits = new FixedBitSet(currReader.maxDoc());
+ while (itr.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ tmpBits.set(itr.docID());
+ }
+ bits = tmpBits;
+ }
+ }
+
+ /*bits() is optional; this doesn't work!!!!
+ bits = idSet.bits();
+ */
+
+ //bits can be null if all the docs are live
+ //or if the filter returned an empty docidset.
+ if (filter != null && bits == null) {
+ if (leafReaders.size() > 0) {
+ return reinitSpans();
+ } else {
+ return false;
+ }
+ }
+
+ spans = spanQuery.getSpans(ctx, bits, termMap);
+ //can getSpans return null?
+ if (spans != null && spans.next()) {
+ StoredDocument d = currReader.document(spans.doc(), fields);
+
+ docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end());
+ return true;
+ } else if (leafReaders.size() > 0) {
+ return reinitSpans();
+ } else {
+ return false;
+ }
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java
new file mode 100644
index 0000000..c19f61c
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java
@@ -0,0 +1,50 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Comparator;
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * Sorts length desc, start offset asc
+ *
+ */
+
+public class OffsetLengthStartComparator implements Comparator<OffsetAttribute>, Serializable {
+ private static final long serialVersionUID = 7526472295622776147L;
+
+ @Override
+ public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
+
+ int lenA = offsetA.endOffset()-offsetA.startOffset();
+ int lenB = offsetB.endOffset()-offsetB.startOffset();
+ if (lenA < lenB) {
+ return 1;
+ } else if (lenA > lenB) {
+ return -1;
+ //by here, the length is the same
+ } else if (offsetA.startOffset() < offsetB.startOffset()) {
+ return -1;
+ } else if (offsetA.startOffset() > offsetB.startOffset()) {
+ return 1;
+ }
+ return 0;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java
new file mode 100644
index 0000000..685dbb2
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java
@@ -0,0 +1,42 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Comparator;
+import java.io.Serializable;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * sort on offset start
+ */
+public class OffsetStartComparator implements Comparator<OffsetAttribute>, Serializable {
+ private static final long serialVersionUID = 7526472295622776147L;
+
+ @Override
+ public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
+
+ if (offsetA.startOffset() < offsetB.startOffset()) {
+ return -1;
+ } else if (offsetA.startOffset() > offsetB.startOffset()) {
+ return 1;
+ }
+ return 0;
+ }
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java
new file mode 100644
index 0000000..369925d
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * In some versions of Lucene, getSpans returned overlapping spans.
+ * This class can remove the overlapping spans and will sort them
+ * if startComparator is not null.
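+ * <p>
+ * For example (illustrative):
+ * <pre>
+ * List&lt;OffsetAttribute&gt; cleaned = OffsetUtil.removeOverlapsAndSort(
+ * offsets, new OffsetLengthStartComparator(), new OffsetStartComparator());
+ * </pre>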
+ *
+ *
+ */
+public class OffsetUtil {
+
+
+ public static List<OffsetAttribute> removeOverlapsAndSort(List<OffsetAttribute> offsets,
+ OffsetLengthStartComparator comparator,
+ OffsetStartComparator startComparator) {
+ if (offsets == null || offsets.size() < 2)
+ return offsets;
+
+ Collections.sort(offsets, comparator);
+ Set<Integer> seen = new HashSet<>();
+ List<OffsetAttribute> filtered = new ArrayList<>();
+ for (OffsetAttribute offset : offsets) {
+ if (! alreadySeen(offset, seen)) {
+ filtered.add(offset);
+ for (int i = offset.startOffset(); i < offset.endOffset(); i++) {
+ seen.add(i);
+ }
+ }
+ }
+ if (startComparator != null) {
+ Collections.sort(filtered, startComparator);
+ }
+ return filtered;
+ }
+
+ private static boolean alreadySeen(OffsetAttribute offset, Set<Integer> seen) {
+ for (int i = offset.startOffset(); i <= offset.endOffset(); i++) {
+ if (seen.contains(i))
+ return true;
+ }
+ return false;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/RandomAccessCharOffsetContainer.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/RandomAccessCharOffsetContainer.java
new file mode 100644
index 0000000..a1857f4
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/RandomAccessCharOffsetContainer.java
@@ -0,0 +1,225 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.BitSet;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * Class to record results for looking up normalized terms (String) and
+ * character offsets for specified tokens. Will return NULL_TERM/NULL_OFFSET if
+ * a token offset was not found.
+ *
+ * Has utility methods for safely getting the closest found token. This is
+ * useful for when a concordance window ends in a stop word (no term/offset
+ * info).
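+ * <p>
+ * Illustrative use (variable names are placeholders):
+ * <pre>
+ * RandomAccessCharOffsetContainer offsets = new RandomAccessCharOffsetContainer();
+ * offsets.add(tokenOffset, startChar, endChar, term);
+ * int start = offsets.getCharacterOffsetStart(tokenOffset);
+ * </pre>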
+ */
+
+public class RandomAccessCharOffsetContainer {
+
+ public final static String NULL_TERM = "";
+ public final static int NULL_OFFSET = -1;
+
+ private BitSet set = new BitSet();
+ private int last = -1;
+ private Map<Integer, String> terms = new HashMap<>();
+ private Map<Integer, Integer> starts = new HashMap<>();
+ private Map<Integer, Integer> ends = new HashMap<>();
+
+ /**
+ *
+ * @param tokenOffset token of interest
+ * @param startCharOffset start character offset within the string stored in StoredField[fieldIndex]
+ * @param endCharOffset end character offset within the string stored in StoredField[fieldIndex]
+ * @param term string term at that position
+ */
+ public void add(int tokenOffset, int startCharOffset,
+ int endCharOffset, String term) {
+ addStart(tokenOffset, startCharOffset);
+ addEnd(tokenOffset, endCharOffset);
+ addTerm(tokenOffset, term);
+ set.set(tokenOffset);
+ }
+
+ private void addTerm(int tokenOffset, String term) {
+ if (term != null) {
+ terms.put(tokenOffset, term);
+ }
+ last = (tokenOffset > last) ? tokenOffset : last;
+ }
+
+ private void addStart(int tokenOffset, int charOffset) {
+ starts.put(tokenOffset, charOffset);
+ last = (tokenOffset > last) ? tokenOffset : last;
+ }
+
+ private void addEnd(int tokenOffset, int charOffset) {
+ ends.put(tokenOffset, charOffset);
+ last = (tokenOffset > last) ? tokenOffset : last;
+ }
+
+ /**
+ * @param tokenOffset target token
+ * @return the character offset for the first character of the tokenOffset.
+ * returns {@link #NULL_OFFSET} if tokenOffset wasn't found
+ */
+ public int getCharacterOffsetStart(int tokenOffset) {
+ Integer start = starts.get(tokenOffset);
+ if (start == null) {
+ return NULL_OFFSET;
+ }
+ return start;
+ }
+
+ /**
+ * @param tokenOffset target token
+ * @return the character offset for the final character of the tokenOffset.
+ */
+ public int getCharacterOffsetEnd(int tokenOffset) {
+ Integer end = ends.get(tokenOffset);
+ if (end == null) {
+ return NULL_OFFSET;
+ }
+ return end;
+ }
+
+ /**
+ *
+ * @param tokenOffset tokenOffset
+ * @return term stored at this tokenOffset; can return {@link #NULL_TERM}
+ */
+ public String getTerm(int tokenOffset) {
+ String s = terms.get(tokenOffset);
+ if (s == null) {
+ return NULL_TERM;
+ }
+ return s;
+ }
+
+ /**
+ *
+ * @return last/largest token offset
+ */
+ public int getLast() {
+ return last;
+ }
+
+ /**
+ * reset state
+ */
+ public void clear() {
+ terms.clear();
+ starts.clear();
+ ends.clear();
+ last = -1;
+ set.clear();
+ }
+
+ protected boolean isEmpty() {
+ return set.isEmpty();
+ }
+
+ /**
+ * Find the closest non-null token starting from startToken
+ * and ending with stopToken (inclusive).
+ *
+ * @param startToken token at which to start the search
+ * @param stopToken token at which to end
+ * @param map map to search
+ * @return closest non-null token offset to the startToken; can return
+ * {@link #NULL_OFFSET} if no non-null offset was found
+ */
+ private int getClosestToken(int startToken, int stopToken,
+ Map<Integer, Integer> map) {
+
+ if (startToken < 0 || stopToken < 0) {
+ return NULL_OFFSET;
+ }
+ if (startToken == stopToken) {
+ return startToken;
+ }
+ if (startToken < stopToken) {
+ for (int i = startToken; i <= stopToken; i++) {
+ Integer charOffset = map.get(i);
+ if (charOffset != null && charOffset != NULL_OFFSET) {
+ return i;
+ }
+ }
+ } else if (startToken > stopToken) {
+ for (int i = startToken; i >= stopToken; i--) {
+ Integer charOffset = map.get(i);
+ if (charOffset != null && charOffset != NULL_OFFSET) {
+ return i;
+ }
+ }
+ }
+ return NULL_OFFSET;
+ }
+
+ public int getClosestCharStart(int startToken, int stopToken) {
+ int i = getClosestToken(startToken, stopToken, starts);
+ //if no token was found, i is NULL_OFFSET and the lookup returns NULL_OFFSET
+ return getCharacterOffsetStart(i);
+ }
+
+ public int getClosestCharEnd(int startToken, int stopToken) {
+ int i = getClosestToken(startToken, stopToken, ends);
+ //if no token was found, i is NULL_OFFSET and the lookup returns NULL_OFFSET
+ return getCharacterOffsetEnd(i);
+ }
+
+ protected String getClosestTerm(int startToken, int stopToken) {
+ int i = getClosestToken(startToken, stopToken, starts);
+ return getTerm(i);
+ }
+
+ /*
+ * return: -1 if
+
+ public int getFieldIndex(int tokenOffset) {
+ CharCoordinate p = starts.get(tokenOffset);
+ if (p == null) {
+ return NULL_OFFSET;
+ }
+ return p.getFieldIndex();
+ }
+*/
+
+ protected BitSet getSet() {
+ return set;
+ }
+
+ public void remove(int token) {
+ if (token == last) {
+ last = getClosestToken(last-1, 0, starts);
+ }
+ set.clear(token);
+ terms.remove(token);
+ starts.remove(token);
+ ends.remove(token);
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java
new file mode 100644
index 0000000..79d035a
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/ReanalyzingTokenCharOffsetsReader.java
@@ -0,0 +1,116 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.StoredDocument;
+
+/**
+ * TokenCharOffsetsReader that captures character offsets by reanalyzing a
+ * field.
+ *
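+ * <p>
+ * Sketch of typical use (variable names are illustrative):
+ * <pre>
+ * TokenCharOffsetsReader reader = new ReanalyzingTokenCharOffsetsReader(analyzer);
+ * reader.getTokenCharOffsetResults(doc, fieldName, requests, results);
+ * </pre>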
+ */
+public class ReanalyzingTokenCharOffsetsReader implements
+ TokenCharOffsetsReader {
+
+ private final static int GOT_ALL_REQUESTS = -2;
+ private Analyzer baseAnalyzer;
+
+ /**
+ * Constructor
+ * @param analyzer to use to get character offsets
+ */
+ public ReanalyzingTokenCharOffsetsReader(Analyzer analyzer) {
+ this.baseAnalyzer = analyzer;
+ }
+
+ @Override
+ public void getTokenCharOffsetResults(final StoredDocument d,
+ final String fieldName, final TokenCharOffsetRequests requests,
+ final RandomAccessCharOffsetContainer results) throws IOException {
+
+ int fieldIndex = 0;
+ int currPosInc = -1;
+ int posIncrementGap = baseAnalyzer.getPositionIncrementGap(fieldName);
+ int charOffsetGap = baseAnalyzer.getOffsetGap(fieldName);
+ int charBase = 0;
+ for (String fieldValue : d.getValues(fieldName)) {
+
+ currPosInc = addFieldValue(fieldIndex, currPosInc, charBase, fieldValue, requests,
+ results);
+
+ if (currPosInc == GOT_ALL_REQUESTS) {
+ break;
+ }
+ charBase += fieldValue.length()+charOffsetGap;
+ currPosInc += posIncrementGap;
+ fieldIndex++;
+ }
+
+ }
+
+ private int addFieldValue(int fieldIndex, int currInd, int charBase, String fieldValue,
+ TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results)
+ throws IOException {
+ //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
+ TokenStream stream = baseAnalyzer.tokenStream("", fieldValue);
+ stream.reset();
+
+ int defaultInc = 1;
+
+ CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+ OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute incAtt = null;
+ if (stream.hasAttribute(PositionIncrementAttribute.class)) {
+ incAtt = stream.getAttribute(PositionIncrementAttribute.class);
+ }
+
+ while (stream.incrementToken()) {
+
+ currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
+ if (requests.contains(currInd)) {
+ results.add(currInd, offsetAtt.startOffset()+charBase,
+ offsetAtt.endOffset()+charBase, termAtt.toString());
+ }
+ if (currInd > requests.getLast()) {
+ // TODO: Is there a way to avoid this? Or, is this
+ // an imaginary performance hit?
+ while (stream.incrementToken()) {
+ //NO-OP
+ }
+ stream.end();
+ stream.close();
+ return GOT_ALL_REQUESTS;
+ }
+ }
+ stream.end();
+ stream.close();
+ return currInd;
+ }
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java
new file mode 100644
index 0000000..10579bb
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java
@@ -0,0 +1,156 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+/**
+ * Simple util class for Analyzers
+ */
+public class SimpleAnalyzerUtil {
+ private final static String DEFAULT_FIELD = "FIELD";
+
+ /**
+ * Returns simple list of analyzed strings
+ *
+ * @param s string to analyze
+ * @param analyzer analyzer
+ * @return list of string tokens
+ * @throws java.io.IOException
+ */
+ public static List<String> getTermStrings(String s, Analyzer analyzer)
+ throws IOException {
+ List<String> terms = new ArrayList<>();
+ return getTermStrings(s, analyzer, terms);
+ }
+
+ /**
+ * allows reuse of terms, this method calls terms.clear() before adding new
+ * terms
+ *
+ * @param s string to analyze
+ * @param analyzer analyzer
+ * @param terms list for reuse
+ * @return list of strings
+ * @throws java.io.IOException
+ */
+ public static List<String> getTermStrings(String s, Analyzer analyzer,
+ List<String> terms) throws IOException {
+ if (terms == null) {
+ terms = new ArrayList<>();
+ }
+ terms.clear();
+ TokenStream stream = analyzer.tokenStream(DEFAULT_FIELD, s);
+ stream.reset();
+ CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+
+
+ while (stream.incrementToken()) {
+ terms.add(termAtt.toString());
+ }
+ stream.end();
+ stream.close();
+
+ return terms;
+ }
+
+ /**
+ * This calculates a substring from an array of stored field values (Strings).
+ *
+ * This attempts to do the best job possible, and at worst will
+ * return an empty string. If the start or end is within a gap,
+ * or before 0 or after the total number of characters, this will
+ * gracefully (blithely?) handle those cases.
+ *
+ *
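+ * <p>
+ * Worked example (illustrative): with fieldValues {"ab", "cd"}, offsetGap 1
+ * and interFieldJoiner " | ", a request for start=0, end=5 returns "ab | cd".
+ *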
+ * @param start character offset to start
+ * @param end character offset to end
+ * @param fieldValues array of Strings to process
+ * @param offsetGap offsetGap as typically returned by Analyzer's .getOffsetGap()
+ * @param interFieldJoiner string to use to mark that a substring goes beyond a single
+ * field entry
+ * @return substring, potentially empty, never null.
+ */
+ public static String substringFromMultiValuedFields(int start,
+ int end, String[] fieldValues, int offsetGap, String interFieldJoiner) {
+ start = (start < 0) ? 0 : start;
+ end = (end < 0) ? 0: end;
+
+ if (start > end) {
+ start = end;
+ }
+
+ int charBase = 0;
+ StringBuilder sb = new StringBuilder();
+ int lastFieldIndex = 0;
+ int localStart = 0;
+ boolean foundStart = false;
+ //get start
+ for (int fieldIndex = 0; fieldIndex < fieldValues.length; fieldIndex++) {
+ String fString = fieldValues[fieldIndex];
+ if (start < charBase+fString.length()) {
+ localStart = start-charBase;
+ lastFieldIndex = fieldIndex;
+ foundStart = true;
+ break;
+ }
+ charBase += fString.length()+offsetGap;
+ }
+ if (!foundStart) {
+ return "";
+ }
+ //if start occurred in a gap, reset localStart to 0
+ if (localStart < 0) {
+ sb.append(interFieldJoiner);
+ localStart = 0;
+ }
+ //now append and look for end
+ for (int fieldIndex = lastFieldIndex; fieldIndex < fieldValues.length; fieldIndex++) {
+ String fString = fieldValues[fieldIndex];
+
+ if (end <= charBase+fString.length()) {
+ int localEnd = end-charBase;
+ //must be in gap
+ if (charBase > end) {
+ return sb.toString();
+ }
+ if (fieldIndex != lastFieldIndex) {
+ sb.append(interFieldJoiner);
+ }
+ sb.append(fString.substring(localStart, localEnd));
+ break;
+ } else {
+ if (fieldIndex != lastFieldIndex) {
+ sb.append(interFieldJoiner);
+ }
+ sb.append(fString.substring(localStart));
+ localStart = 0;
+ }
+ charBase += fString.length()+offsetGap;
+ }
+ return sb.toString();
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java
new file mode 100644
index 0000000..8ffc82b
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java
@@ -0,0 +1,31 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Thrown when a token offset identified by .getSpans() cannot be found in the
+ * character offset results. The typical cause is a mismatch between the
+ * analyzers used at index time and at search time. Because this signals a
+ * serious configuration problem, it warrants its own exception.
+ */
+public class TargetTokenNotFoundException extends Exception {
+
+ private static final long serialVersionUID = 1L;
+ public TargetTokenNotFoundException(String message) {
+ super(message);
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetRequests.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetRequests.java
new file mode 100644
index 0000000..f1954dc
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetRequests.java
@@ -0,0 +1,81 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.BitSet;
+
+/**
+ * Util class used to specify the tokens for which character offsets are requested.
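+ * <p>
+ * Illustrative use, requesting offsets for a window around a target:
+ * <pre>
+ * TokenCharOffsetRequests requests = new TokenCharOffsetRequests();
+ * requests.add(targetStart - tokensBefore, targetEnd + tokensAfter);
+ * </pre>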
+ */
+public class TokenCharOffsetRequests {
+ private BitSet set = new BitSet();
+ private int last = -1;
+
+ /**
+ * Is a specific token requested?
+ * @param i token number to test
+ * @return whether or not this token is requested
+ */
+ public boolean contains(int i) {
+ return set.get(i);
+ }
+
+ /**
+ * add a request from start to end inclusive
+ * @param start range of token offsets to request (inclusive)
+ * @param end end range of token offsets to request (inclusive)
+ */
+ public void add(int start, int end) {
+ for (int i = start; i <= end; i++) {
+ add(i);
+ }
+ }
+
+ /**
+ * add a request for a specific token
+ * @param i token offset to request the character offsets for
+ */
+ public void add(int i) {
+ set.set(i);
+ last = (i > last) ? i : last;
+ }
+
+ /**
+ * clear the state of this request object for reuse
+ */
+ public void clear() {
+ set.clear();
+ last = -1;
+ }
+
+ /**
+ *
+ * @return greatest/last token offset in the request
+ */
+ public int getLast() {
+ return last;
+ }
+
+ /**
+ *
+ * @return the set of tokens whose character offsets are requested
+ */
+ protected BitSet getSet() {
+ return set;
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java
new file mode 100644
index 0000000..481087c
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TokenCharOffsetsReader.java
@@ -0,0 +1,34 @@
+package org.apache.lucene.search.concordance.charoffsets;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.StoredDocument;
+
+
+/**
+ * Interface to allow flexibility/optimizations in returning character offsets
+ * for tokens
+ */
+public interface TokenCharOffsetsReader {
+
+ public void getTokenCharOffsetResults(final StoredDocument document,
+ final String fieldName, final TokenCharOffsetRequests requests,
+ final RandomAccessCharOffsetContainer results) throws IOException;
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html
new file mode 100644
index 0000000..28bd921
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html
@@ -0,0 +1,23 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+ConcordanceSearcher performs a search on an index and returns concordance windows.
+</body>
+</html>
diff --git lucene/concordance/src/java/org/apache/lucene/search/queries/SpanQueryConverter.java lucene/concordance/src/java/org/apache/lucene/search/queries/SpanQueryConverter.java
new file mode 100644
index 0000000..7fe0bcc
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/queries/SpanQueryConverter.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.search.queries;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SimpleSpanQueryConverter;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+/**
+ * This adds CommonTermsQuery to SimpleSpanQueryConverter.
+ * This had to be broken into a separate class to maintain
+ * clean compilation units (core vs. queries).
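+ * <p>
+ * Illustrative use (field and query are supplied by the caller):
+ * <pre>
+ * SpanQuery sq = new SpanQueryConverter().convert(field, commonTermsQuery);
+ * </pre>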
+ */
+public class SpanQueryConverter extends SimpleSpanQueryConverter {
+
+ @Override
+ protected SpanQuery convertUnknownQuery(String field, Query query) {
+ if (query instanceof CommonTermsQuery) {
+
+ // specialized since rewriting would change the result query
+ // this query is TermContext sensitive.
+ CommonTermsQuery ctq = (CommonTermsQuery) query;
+
+ Set<Term> terms = new HashSet<>();
+ ctq.extractTerms(terms);
+ List<SpanQuery> spanQs = new LinkedList<>();
+
+ for (Term term : terms) {
+ if (term.field().equals(field)) {
+ spanQs.add(new SpanTermQuery(term));
+ }
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+ }
+ return super.convertUnknownQuery(field, query);
+ }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java
new file mode 100644
index 0000000..a945b0f
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java
@@ -0,0 +1,285 @@
+package org.apache.lucene.search.spans;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+
+public class SimpleSpanQueryConverter {
+ /**
+ * Converts a regular query to a {@link org.apache.lucene.search.spans.SpanQuery} for use in a highlighter.
+ * Because of subtle differences in {@link org.apache.lucene.search.spans.SpanQuery} and {@link org.apache.lucene.search.Query}, this
+ * {@link org.apache.lucene.search.spans.SpanQuery} will not necessarily return the same documents as the
+ * initial Query. For example, the generated SpanQuery will not include
+ * clauses of type BooleanClause.Occur.MUST_NOT. Also, the
+ * {@link org.apache.lucene.search.spans.SpanQuery} will only cover a single field, whereas the {@link org.apache.lucene.search.Query}
+ * might contain multiple fields.
+ *
+ * Returns an empty SpanQuery if the {@link org.apache.lucene.search.Query} is a class that
+ * is handled, but for some reason can't be converted from a {@link org.apache.lucene.search.Query} to a
+ * {@link org.apache.lucene.search.spans.SpanQuery}. This can happen for many reasons: e.g. if the Query
+ * contains no terms in the requested "field" or the Query is a MatchAllDocsQuery.
+ *
+ * Throws IllegalArgumentException if the Query is a class that
+ * is not yet handled.
+ *
+ * This class does not rewrite the SpanQuery before returning it.
+ * Clients are required to rewrite if necessary.
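+ * <p>
+ * Illustrative use (the field name "f1" is an assumption):
+ * <pre>
+ * SpanQuery sq = new SimpleSpanQueryConverter().convert("f1", query);
+ * </pre>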
+ *
+ * Much of this code is copied directly from
+ * oal.search.highlight.WeightedSpanTermExtractor. There are some subtle
+ * differences.
+ *
+ * @param field single field to extract SpanQueries for
+ * @param query query to convert
+ * @return SpanQuery for use in highlighting; can return empty SpanQuery
+ * @throws java.io.IOException if thrown while converting nested queries
+ * @throws IllegalArgumentException if the Query is a class that is not yet handled
+ */
+ public SpanQuery convert(String field, Query query) throws IOException {
+ /*
+ * copied nearly verbatim from
+ * org.apache.lucene.search.highlight.WeightedSpanTermExtractor
+ * TODO:refactor to avoid duplication of code if possible.
+ * Beware: there are some subtle differences.
+ */
+ if (query instanceof SpanQuery) {
+ SpanQuery sq = (SpanQuery) query;
+ if (sq.getField().equals(field)) {
+ return (SpanQuery) query;
+ } else {
+ return getEmptySpanQuery();
+ }
+ } else if (query instanceof BooleanQuery) {
+ BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
+ List<SpanQuery> spanQs = new ArrayList<>();
+ for (int i = 0; i < queryClauses.length; i++) {
+ if (!queryClauses[i].isProhibited()) {
+ tryToAdd(field, convert(field, queryClauses[i].getQuery()), spanQs);
+ }
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+ } else if (query instanceof PhraseQuery) {
+ PhraseQuery phraseQuery = ((PhraseQuery) query);
+
+ Term[] phraseQueryTerms = phraseQuery.getTerms();
+ if (phraseQueryTerms.length == 0) {
+ return getEmptySpanQuery();
+ } else if (!phraseQueryTerms[0].field().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
+ for (int i = 0; i < phraseQueryTerms.length; i++) {
+ clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
+ }
+ int slop = phraseQuery.getSlop();
+ int[] positions = phraseQuery.getPositions();
+ // sum position increments (>1) and add to slop
+ if (positions.length > 0) {
+ int lastPos = positions[0];
+ int sz = positions.length;
+ for (int i = 1; i < sz; i++) {
+ int pos = positions[i];
+ int inc = pos - lastPos-1;
+ slop += inc;
+ lastPos = pos;
+ }
+ }
+
+ boolean inorder = false;
+
+ if (phraseQuery.getSlop() == 0) {
+ inorder = true;
+ }
+
+ SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
+ sp.setBoost(query.getBoost());
+ return sp;
+ } else if (query instanceof TermQuery) {
+ TermQuery tq = (TermQuery) query;
+ if (tq.getTerm().field().equals(field)) {
+ return new SpanTermQuery(tq.getTerm());
+ } else {
+ return getEmptySpanQuery();
+ }
+ } else if (query instanceof FilteredQuery) {
+ return convert(field, ((FilteredQuery) query).getQuery());
+ } else if (query instanceof ConstantScoreQuery) {
+ return convert(field, ((ConstantScoreQuery) query).getQuery());
+ } else if (query instanceof DisjunctionMaxQuery) {
+ List<SpanQuery> spanQs = new ArrayList<>();
+ for (Iterator<Query> iterator = ((DisjunctionMaxQuery) query).iterator(); iterator
+ .hasNext();) {
+ tryToAdd(field, convert(field, iterator.next()), spanQs);
+ }
+ if (spanQs.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (spanQs.size() == 1) {
+ return spanQs.get(0);
+ } else {
+ return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()]));
+ }
+ } else if (query instanceof MatchAllDocsQuery) {
+ return getEmptySpanQuery();
+ } else if (query instanceof MultiPhraseQuery) {
+
+ final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
+ final List<Term[]> termArrays = mpq.getTermArrays();
+ //test for empty or wrong field
+ if (termArrays.size() == 0) {
+ return getEmptySpanQuery();
+ } else if (termArrays.size() > 1) {
+ Term[] ts = termArrays.get(0);
+ if (ts.length > 0) {
+ Term t = ts[0];
+ if (!t.field().equals(field)) {
+ return getEmptySpanQuery();
+ }
+ }
+ }
+ final int[] positions = mpq.getPositions();
+ if (positions.length > 0) {
+
+ int maxPosition = positions[positions.length - 1];
+ for (int i = 0; i < positions.length - 1; ++i) {
+ if (positions[i] > maxPosition) {
+ maxPosition = positions[i];
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ final List<SpanQuery>[] disjunctLists = new List[maxPosition + 1];
+ int distinctPositions = 0;
+
+ for (int i = 0; i < termArrays.size(); ++i) {
+ final Term[] termArray = termArrays.get(i);
+ List<SpanQuery> disjuncts = disjunctLists[positions[i]];
+ if (disjuncts == null) {
+ disjuncts = (disjunctLists[positions[i]] = new ArrayList<>(
+ termArray.length));
+ ++distinctPositions;
+ }
+ for (int j = 0; j < termArray.length; ++j) {
+ disjuncts.add(new SpanTermQuery(termArray[j]));
+ }
+ }
+
+ int positionGaps = 0;
+ int position = 0;
+ final SpanQuery[] clauses = new SpanQuery[distinctPositions];
+ for (int i = 0; i < disjunctLists.length; ++i) {
+ List<SpanQuery> disjuncts = disjunctLists[i];
+ if (disjuncts != null) {
+ if (disjuncts.size() == 1) {
+ clauses[position++] = disjuncts.get(0);
+ } else {
+ clauses[position++] = new SpanOrQuery(
+ disjuncts.toArray(new SpanQuery[disjuncts.size()]));
+ }
+ } else {
+ ++positionGaps;
+ }
+ }
+
+ final int slop = mpq.getSlop();
+ final boolean inorder = (slop == 0);
+
+ SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps,
+ inorder);
+ sp.setBoost(query.getBoost());
+ return sp;
+ }
+
+ } else if (query instanceof MultiTermQuery) {
+ return new SpanMultiTermQueryWrapper<>((MultiTermQuery)query);
+ }
+ return convertUnknownQuery(field, query);
+ }
+
+ private void tryToAdd(String field, SpanQuery q, List<SpanQuery> qs) {
+ if (q == null || isEmptyQuery(q) || !q.getField().equals(field)) {
+ return;
+ }
+ qs.add(q);
+ }
+
+ /**
+ * Extend this to handle queries that are not currently handled.
+ * Consider extending SpanQueryConverter in the queries compilation unit
+ * instead; it also handles CommonTermsQuery.
+ *
+ * In this class, this always throws an IllegalArgumentException.
+ * @param field field to convert
+ * @param query query to convert
+ * @return never returns normally; always throws IllegalArgumentException
+ */
+ protected SpanQuery convertUnknownQuery(String field, Query query) {
+ throw new IllegalArgumentException("SpanQueryConverter is unable to convert this class "+
+ query.getClass().toString());
+ }
+
+ /**
+ *
+ * @return an empty SpanQuery (SpanOrQuery with no clauses)
+ */
+ protected SpanQuery getEmptySpanQuery() {
+ SpanQuery q = new SpanOrQuery(new SpanTermQuery[0]);
+ return q;
+ }
+
+ /**
+ * Is this a null or empty SpanQuery?
+ * @param q query to test
+ * @return whether a null or empty SpanQuery
+ */
+ protected boolean isEmptyQuery(SpanQuery q) {
+ if (q == null) {
+ return true;
+ }
+ if (q instanceof SpanOrQuery) {
+ SpanOrQuery soq = (SpanOrQuery)q;
+ for (SpanQuery sq : soq.getClauses()) {
+ if (! isEmptyQuery(sq)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ return false;
+ }
+}
diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestBase.java lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestBase.java
new file mode 100644
index 0000000..50f412f
--- /dev/null
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestBase.java
@@ -0,0 +1,207 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+public class ConcordanceTestBase extends LuceneTestCase {
+
+ protected final static String FIELD = "f1";
+
+ public Directory getDirectory(Analyzer analyzer, String[] vals)
+ throws IOException {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(analyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+
+ for (String s : vals) {
+ Document d = new Document();
+ d.add(newTextField(FIELD, s, Field.Store.YES));
+ writer.addDocument(d);
+ }
+ writer.close();
+ return directory;
+ }
+
+ public Directory getDirectory(Analyzer analyzer, List<String[]> input)
+ throws IOException {
+
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(analyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy()));
+
+ for (String[] vals : input) {
+ Document d = new Document();
+ for (String s : vals) {
+ d.add(newTextField(FIELD, s, Field.Store.YES));
+ }
+ writer.addDocument(d);
+ }
+ writer.close();
+ return directory;
+ }
+
+ public static Analyzer getAnalyzer(final CharacterRunAutomaton stops) {
+ return getAnalyzer(stops, random().nextInt(10000), random().nextInt(10000));
+ }
+
+ public static Analyzer getAnalyzer(final CharacterRunAutomaton stops,
+ final int posIncGap, final int charOffsetGap) {
+
+ Analyzer analyzer = new Analyzer() {
+
+ @Override
+ public TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+ TokenFilter filter = new MockTokenFilter(tokenizer, stops);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+
+ @Override
+ public int getPositionIncrementGap(String fieldName) {
+ return posIncGap;
+ }
+
+ @Override
+ public int getOffsetGap(String fieldName) {
+ return charOffsetGap;
+ }
+ };
+ return analyzer;
+ }
+
+ protected Directory buildNeedleIndex(String needle,
+ Analyzer analyzer, int numFieldValues) throws Exception {
+
+ IndexWriterConfig config = newIndexWriterConfig(random(), analyzer)
+ .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
+ .setMergePolicy(newLogMergePolicy());
+
+ Directory directory = newDirectory();
+ /*
+ This no longer appears to be needed:
+ String pf = TestUtil.getPostingsFormat(FIELD);
+ if (doesntSupportOffsets.contains(pf)) {
+ //just use Asserting
+ Codec codec = new AssertingCodec();
+ config.setCodec(codec);
+ }*/
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);
+ //create document with multivalued field
+ String[] fs = new String[numFieldValues];
+ for (int i = 0; i < numFieldValues; i++) {
+ float r = random().nextFloat();
+ String doc = "";
+ if (r <= 0.33) {
+ doc = needle+" "+getRandomWords(29, needle, analyzer);
+ } else if (r <= 0.66) {
+ doc = getRandomWords(13, needle, analyzer)+" "+needle+" "+getRandomWords(17, needle, analyzer);
+ } else {
+ doc = getRandomWords(31, needle, analyzer)+" "+needle;
+ }
+ fs[i] = doc;
+ }
+
+ Document d = new Document();
+ FieldType type = new FieldType();
+ type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ type.setStored(true);
+
+ for (String s : fs) {
+ d.add(newField(FIELD, s, type));
+ }
+ writer.addDocument(d);
+ writer.close();
+ return directory;
+ }
+
+
+
+ /**
+ * This assumes that the analyzer has no stop filter.
+ * Best to use a whitespace tokenizer.
+ */
+ private String getRandomWords(int numWords, String needle, Analyzer analyzer) throws Exception {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < numWords; i++ ) {
+ sb.append(TestUtil.randomUnicodeString(random(), 31));
+ sb.append(" ");
+ }
+ List<String> terms = SimpleAnalyzerUtil.getTermStrings(sb.toString(), analyzer);
+ StringBuilder rsb = new StringBuilder();
+ int words = -1;
+ while (words++ < numWords && words < terms.size()) {
+ String cand = terms.get(words);
+ if (!needle.equals(cand)) {
+ if (words > 0) {
+ rsb.append(" ");
+ }
+ rsb.append(cand);
+ }
+ }
+ return rsb.toString();
+ }
+
+
+ protected String getNeedle(Analyzer analyzer) {
+ //try to get a term that would come out of the analyzer
+ for (int i = 0; i < 10; i++) {
+ //start with a random base string
+ String baseString = TestUtil.randomUnicodeString(random(), random().nextInt(10) + 2);
+
+ try{
+ //run it through the analyzer, and take the first thing
+ //that comes out of it if the length > 0
+ List<String> terms = SimpleAnalyzerUtil.getTermStrings(baseString, analyzer);
+ for (String t : terms) {
+ if (t.length() > 0) {
+ return t;
+ }
+ }
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ //if nothing is found in 10 tries,
+ //return literal string "needle"
+
+ return "needle";
+ }
+}
diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java
new file mode 100644
index 0000000..a2d9bc6
--- /dev/null
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java
@@ -0,0 +1,92 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+public class ConcordanceTestUtils extends LuceneTestCase {
+ public final static String FIELD = "content";
+
+
+ public static Directory getDirectory(Analyzer analyzer, String[] vals) throws IOException {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs
+ (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
+
+ for (String s : vals) {
+ Document d = new Document();
+ d.add(newTextField(FIELD, s, Field.Store.YES));
+ writer.addDocument(d);
+
+ }
+ writer.close();
+ return directory;
+ }
+
+ public static Directory getDirectory(Analyzer analyzer, List<String[]> input) throws IOException {
+
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs
+ (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
+ for (String[] vals : input) {
+ Document d = new Document();
+ for (String s : vals) {
+ d.add(newTextField(FIELD, s, Field.Store.YES));
+ }
+ writer.addDocument(d);
+
+ }
+ writer.close();
+ return directory;
+ }
+
+ public static Analyzer getAnalyzer(final CharacterRunAutomaton stops, final int posIncGap) {
+ //stops will usually be either
+ //MockTokenFilter.EMPTY_STOPSET or
+ //MockTokenFilter.ENGLISH_STOPSET
+ return new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+
+ TokenFilter filter = new MockTokenFilter(tokenizer, stops);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ @Override
+ public int getPositionIncrementGap(String fieldName) {
+ return posIncGap;
+ }
+ };
+ }
+}
diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java
new file mode 100644
index 0000000..04cf7a9
--- /dev/null
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java
@@ -0,0 +1,498 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestConcordanceSearcher extends ConcordanceTestBase {
+
+ private final static DocMetadataExtractor metadataExtractor =
+ new DocMetadataExtractor() {
+ private final Set<String> fields = new HashSet<>();
+ private final Map<String, String> data = new HashMap<>();
+
+ @Override
+ public Set<String> getFieldSelector() {
+ return fields;
+ }
+
+ @Override
+ public Map<String, String> extract(StoredDocument d) {
+ return data;
+ }
+ };
+
+ private final static DocIdBuilder docIdBuilder = new IndexIdDocIdBuilder();
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ // NOOP for now
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ // NOOP for now
+ }
+
+ @Test
+ public void testSimple() throws Exception {
+ String[] docs = new String[]{"a b c a b c", "c b a c b a"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ WindowBuilder wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+
+ assertEquals(3, collector.size());
+
+ collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
+ searcher.search(reader, FIELD, q, null, analyzer, collector);
+
+ // test result size
+ assertEquals(4, collector.size());
+
+ // test result with sort order = pre
+ List<ConcordanceWindow> windows = collector.getSortedWindows();
+ String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
+ String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
+
+ for (int i = 0; i < windows.size(); i++) {
+ ConcordanceWindow w = windows.get(i);
+
+ assertEquals(pres[i], w.getPre());
+ assertEquals(posts[i], w.getPost());
+ }
+
+ // test sort order post
+ // sort key is built at search time, so must re-search
+ wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
+ searcher = new ConcordanceSearcher(wb);
+
+ collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL);
+ searcher.search(reader, FIELD, q,
+ null, analyzer, collector);
+
+ windows = collector.getSortedWindows();
+
+ posts = new String[]{"", " b c", " b c a b c", " c b a",};
+ for (int i = 0; i < windows.size(); i++) {
+ ConcordanceWindow w = windows.get(i);
+ assertEquals(posts[i], w.getPost());
+ }
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testSimpleMultiValuedField() throws Exception {
+ String[] doc = new String[]{"a b c a b c", "c b a c b a"};
+ List<String[]> docs = new ArrayList<>();
+ docs.add(doc);
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "a"));
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+
+ // test result size
+ assertEquals(4, collector.size());
+
+ // test result with sort order = pre
+ List<ConcordanceWindow> windows = collector.getSortedWindows();
+ String[] pres = new String[]{"", "c b", "c b a c b", "a b c"};
+ String[] posts = new String[]{" b c a b c", " c b a", "", " b c"};
+
+ for (int i = 0; i < pres.length; i++) {
+ ConcordanceWindow w = windows.get(i);
+
+ assertEquals("pres: " + i, pres[i], w.getPre());
+
+ assertEquals("posts: " + i, posts[i], w.getPost());
+ }
+
+ // test sort order post
+ // sort key is built at search time, so must re-search
+ WindowBuilder wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder);
+ searcher = new ConcordanceSearcher(wb);
+
+ collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(reader, FIELD, q, null, analyzer, collector);
+
+ windows = collector.getSortedWindows();
+
+ posts = new String[]{"", " b c", " b c a b c", " c b a",};
+ for (int i = 0; i < posts.length; i++) {
+ ConcordanceWindow w = windows.get(i);
+ assertEquals(posts[i], w.getPost());
+ }
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testWindowLengths() throws Exception {
+ String[] doc = new String[]{"a b c d e f g"};
+ List<String[]> docs = new ArrayList<>();
+ docs.add(doc);
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+ String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"};
+ String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"};
+
+ for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) {
+ for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) {
+ WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter,
+ analyzer.getOffsetGap(FIELD));
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100);
+ searcher.search(reader, FIELD, q, null, analyzer, collector);
+ ConcordanceWindow w = collector.getSortedWindows().get(0);
+ assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre());
+ assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost());
+ }
+ }
+
+ reader.close();
+ directory.close();
+
+ }
+
+ @Test
+ public void testClockworkOrangeMultiValuedFieldProblem() throws Exception {
+ /*
+ * Test handling of a target that matches (or not) across different indices
+ * into a multivalued field array.
+ */
+ String[] doc = new String[]{"a b c a b the", "clockwork",
+ "orange b a c b a"};
+ List<String[]> docs = new ArrayList<>();
+ docs.add(doc);
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
+
+
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery q1 = new SpanTermQuery(
+ new Term(FIELD, "the"));
+ SpanQuery q2 = new SpanTermQuery(new Term(FIELD,
+ "clockwork"));
+ SpanQuery q3 = new SpanTermQuery(new Term(FIELD,
+ "orange"));
+ SpanQuery q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 3, true);
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+ assertEquals(1, collector.size());
+
+ ConcordanceWindow w = collector.getSortedWindows().iterator().next();
+ assertEquals("target", "the | clockwork | orange", w.getTarget());
+ assertEquals("pre", "c a b", w.getPre());
+ assertEquals("post", " b a c", w.getPost());
+
+ reader.close();
+ directory.close();
+
+ // test hit even over long inter-field gap
+ analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50);
+ directory = getDirectory(analyzer, docs);
+ reader = DirectoryReader.open(directory);
+
+ wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD));
+
+ searcher = new ConcordanceSearcher(wb);
+ q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 120, true);
+ collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(reader, FIELD, q, null, analyzer, collector);
+
+ assertEquals(1, collector.size());
+ w = collector.getSortedWindows().iterator().next();
+ assertEquals("target", "the | clockwork | orange", w.getTarget());
+ assertEquals("pre", "c a b", w.getPre());
+ assertEquals("post", " b a c", w.getPost());
+
+ reader.close();
+ directory.close();
+ // test miss
+ analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100);
+ directory = getDirectory(analyzer, docs);
+ reader = DirectoryReader.open(directory);
+
+ wb = new WindowBuilder();
+ searcher = new ConcordanceSearcher(wb);
+ q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 5, true);
+ collector = new ConcordanceWindowCollector(100);
+
+ searcher.search(reader, FIELD, q, null, analyzer, collector);
+
+ assertEquals(0, collector.size());
+
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testWithStops() throws Exception {
+ String[] docs = new String[]{"a b the d e the f", "g h the d the j"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+
+ WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD));
+
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3);
+
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+ List<ConcordanceWindow> windows = collector.getSortedWindows();
+ assertEquals(2, windows.size());
+
+ // the second word after the target is a stop word, so
+ // the post-component of this window should only extend to the first word
+ // after the target
+ assertEquals("b the", windows.get(0).getPre());
+ assertEquals("d", windows.get(0).getTarget());
+ assertEquals(" e", windows.get(0).getPost());
+
+ assertEquals("h the", windows.get(1).getPre());
+ assertEquals("d", windows.get(1).getTarget());
+ assertEquals(" the j", windows.get(1).getPost());
+
+
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testBasicStandardQueryConversion() throws Exception {
+ String[] docs = new String[]{"a b c a b c", "c b a c b a d e a",
+ "c b a c b a e a b c a"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ BooleanQuery q = new BooleanQuery();
+ q.add(new TermQuery(new Term(FIELD, "a")), Occur.MUST);
+ q.add(new TermQuery(new Term(FIELD, "d")),
+ Occur.MUST_NOT);
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+ searcher.search(reader,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ // shouldn't include document with "d"
+ assertEquals(6, collector.size());
+
+ // should only include document with "e" and not "d"
+ Filter filter = new QueryWrapperFilter(new TermQuery(new Term(
+ FIELD, "e")));
+ collector = new ConcordanceWindowCollector(10);
+
+ searcher.search(reader, FIELD, (Query) q, filter, analyzer, collector);
+ assertEquals(4, collector.size());
+
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testMismatchingFieldsInStandardQueryConversion() throws Exception {
+ // tests what happens if a Query doesn't contain a term in the "span" field
+ // in the searcher...should be no exception and zero documents returned.
+
+ String[] docs = new String[]{"a b c a b c",};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+
+ Query q = new TermQuery(new Term("_" + FIELD, "a"));
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+ int windowCount = collector.size();
+ assertEquals(0, windowCount);
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testUniqueCollector() throws Exception {
+ String[] docs = new String[]{"a b c d c b a",
+ "a B C d c b a",
+ "a b C d C B a",
+ "a b c d C B A",
+ "e f g d g f e",
+ "h i j d j i h"
+ };
+
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+ DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2);
+ searcher.search(reader,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ assertEquals(2, collector.size());
+
+
+ collector =
+ new DedupingConcordanceWindowCollector(AbstractConcordanceWindowCollector.COLLECT_ALL);
+ searcher.search(reader,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ assertEquals(3, collector.size());
+
+
+ reader.close();
+ directory.close();
+
+ }
+
+
+ @Test
+ public void testUniqueCollectorWithSameWindowOverflow() throws Exception {
+ String[] docs = new String[]{"a b c d c b a",
+ "a b c d c b a",
+ "a b c d c b a",
+ "a b c d c b a",
+ "e f g d g f e",
+ "h i j d j i h"
+ };
+
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(
+ new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD)));
+
+ SpanQuery q = new SpanTermQuery(new Term(FIELD, "d"));
+
+ DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3);
+ searcher.search(reader,
+ FIELD, (Query) q, null,
+ analyzer, collector);
+ assertEquals(3, collector.size());
+ assertEquals(4, collector.getSortedWindows().get(0).getCount());
+ reader.close();
+ directory.close();
+ }
+
+ @Test
+ public void testAllowTargetOverlaps() throws Exception {
+ String[] docs = new String[]{"a b c"};
+ Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET);
+
+ Directory directory = getDirectory(analyzer, docs);
+ IndexReader reader = DirectoryReader.open(directory);
+ WindowBuilder wb = new WindowBuilder(10, 10,
+ analyzer.getOffsetGap(FIELD),
+ new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder);
+ ConcordanceSearcher searcher = new ConcordanceSearcher(wb);
+ SpanQuery term = new SpanTermQuery(new Term(FIELD, "a"));
+ SpanQuery phrase = new SpanNearQuery(
+ new SpanQuery[]{
+ new SpanTermQuery(new Term(FIELD, "a")),
+ new SpanTermQuery(new Term(FIELD, "b"))
+ }, 0, true);
+ SpanOrQuery q = new SpanOrQuery();
+ q.addClause(term);
+ q.addClause(phrase);
+
+ ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+
+ //default should be: don't allow target overlaps
+ assertEquals(1, collector.size());
+
+ searcher.setAllowTargetOverlaps(true);
+ collector = new ConcordanceWindowCollector(10);
+ searcher.search(reader, FIELD,
+ q, null, analyzer, collector);
+
+ //now there should be two windows with allowTargetOverlaps = true
+ assertEquals(2, collector.size());
+ reader.close();
+ directory.close();
+ }
+}
diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSimpleAnalyzerUtil.java lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSimpleAnalyzerUtil.java
new file mode 100644
index 0000000..8bd1c44
--- /dev/null
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSimpleAnalyzerUtil.java
@@ -0,0 +1,158 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
+import org.apache.lucene.store.Directory;
+import org.junit.BeforeClass;
+
+public class TestSimpleAnalyzerUtil extends ConcordanceTestBase {
+
+ private static Analyzer defaultCharOffsetGapAnalyzer;
+
+ private static Analyzer customCharOffsetGapAnalyzer;
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ defaultCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 1);
+ customCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50, 213);
+ }
+ /*
+ public void testDebug() throws Exception {
+ String[] values = new String[]{
+ "the quick brown fox jumped over the lazy dog",
+ "the fast green toad slid under the slothful rabbit",
+ "the happy blue wolverine devoured the lazy moose",
+ "the depressed purple aardvark the the the the the the the devoured the energetic komodo",
+ "the exasperated lavender lion",
+ "the excited orange tiger the the the the the",
+ "the colorless green idea slept furiously the"
+ };
+ System.out.println(values[0].length());
+ List<String[]> docs = new ArrayList<>();
+ docs.add(values);
+
+ Directory directory = getDirectory(defaultCharOffsetGapAnalyzer, docs);
+
+ String joiner = " | ";
+ int gap = defaultCharOffsetGapAnalyzer.getOffsetGap(FIELD);
+ IndexReader reader = DirectoryReader.open(directory);
+ Document d = reader.document(0);
+ String[] fieldValues = d.getValues(FIELD);
+ //69, 103
+ assertEquals("basic", "", testSimple(42, 45, fieldValues, gap, joiner));
+ reader.close();
+ directory.close();
+ }*/
+
+ public void testHitInGaps() throws Exception {
+ String[] values = new String[]{
+ "abc",
+ "def",
+ "ghi",
+ "jkl"
+ };
+ List<String[]> docs = new ArrayList<>();
+ docs.add(values);
+
+ Directory directory = getDirectory(customCharOffsetGapAnalyzer, docs);
+
+ String joiner = " | ";
+ int gap = customCharOffsetGapAnalyzer.getOffsetGap(FIELD);
+ IndexReader reader = DirectoryReader.open(directory);
+ StoredDocument d = reader.document(0);
+ String[] fieldValues = d.getValues(FIELD);
+
+ assertEquals("two negs", "", testSimple(-10, -1, fieldValues, gap, joiner));
+
+ assertEquals("two way beyonds", "", testSimple(1000, 1020, fieldValues, gap, joiner));
+
+ assertEquals("two in betweens", " | ", testSimple(100, 110, fieldValues, gap, joiner));
+
+
+ assertEquals("one neg", "abc", testSimple(-20, 3, fieldValues, gap, joiner));
+ assertEquals("end < start 1", "", testSimple(3, -20, fieldValues, gap, joiner));
+ assertEquals("end < start 2", "", testSimple(3, 2, fieldValues, gap, joiner));
+ assertEquals("end in between", "abc", testSimple(0, 50, fieldValues, gap, joiner));
+ //TODO: these used to be "def"; need to fix
+ assertEquals("start in between", " | def", testSimple(5, 219, fieldValues, gap, joiner));
+ assertEquals("start in between and end in between1", " | def", testSimple(5, 300, fieldValues, gap, joiner));
+ assertEquals("start in between and end in between2", " | def | ghi", testSimple(5, 600, fieldValues, gap, joiner));
+ assertEquals("", "def | ghi | jkl", testSimple(216, 10000, fieldValues, gap, joiner));
+
+ reader.close();
+ directory.close();
+
+ }
+
+ public void testRandomWithNeedleOnGaps() throws Exception {
+ executeNeedleTests(defaultCharOffsetGapAnalyzer);
+ executeNeedleTests(customCharOffsetGapAnalyzer);
+ }
+
+ private void executeNeedleTests(Analyzer analyzer) throws Exception {
+
+ String needle = getNeedle(analyzer);
+ int numFieldValues = 23;
+
+ Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues);
+
+ IndexReader reader = DirectoryReader.open(directory);
+
+ LeafReaderContext ctx = reader.leaves().get(0);
+ LeafReader r = ctx.reader();
+ DocsAndPositionsEnum dpe = r.termPositionsEnum(new Term(FIELD, needle));
+ int docId = dpe.nextDoc();
+ int numTests = 0;
+ while (docId != DocIdSetIterator.NO_MORE_DOCS) {
+ int frq = dpe.freq();
+ int advanced = 1;
+ dpe.nextPosition();
+ String[] fieldValues = r.document(docId).getValues(FIELD);
+ while (advanced++ < frq) {
+ String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(),
+ dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | ");
+ assertEquals(needle, rebuilt);
+ numTests++;
+ dpe.nextPosition();
+ }
+ docId = dpe.nextDoc();
+ }
+ reader.close();
+ directory.close();
+ assertEquals("number of tests", numFieldValues-1, numTests);
+ }
+
+ private String testSimple(int start, int end, String[] fieldValues, int gap, String joiner) {
+ return SimpleAnalyzerUtil.substringFromMultiValuedFields(start, end, fieldValues, gap, joiner);
+ }
+}
diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java
new file mode 100644
index 0000000..7121f9c
--- /dev/null
+++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java
@@ -0,0 +1,111 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.queries.SpanQueryConverter;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+public class TestSpanQueryConverter extends LuceneTestCase {
+ private static IndexReader reader;
+ private static Directory directory;
+ private static Analyzer analyzer;
+ private final static String FIELD = "field";
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ analyzer = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ };
+
+ directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
+ newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs
+ (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
+ String[] docs = new String[] {
+ "a b c a b c",
+ "c b a c b a"
+ };
+ for (String val : docs) {
+ Document doc = new Document();
+ doc.add(newTextField(FIELD, val, Field.Store.YES));
+ writer.addDocument(doc);
+ }
+ reader = writer.getReader();
+ writer.close();
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ reader.close();
+ directory.close();
+ reader = null;
+ directory = null;
+ analyzer = null;
+ }
+
+ @Test
+ public void testBooleanTwoFields() throws Exception {
+
+ Query q1 = new TermQuery(new Term(FIELD, "a"));
+ Query q2 = new TermQuery(new Term("another_field", "b"));
+ BooleanQuery q = new BooleanQuery();
+ q.add(q1, Occur.SHOULD);
+ q.add(q2, Occur.SHOULD);
+ SpanQueryConverter converter = new SpanQueryConverter();
+ boolean success = true;
+ try {
+ converter.convert(FIELD, q);
+ } catch (IllegalArgumentException e) {
+ success = false;
+ }
+ assertTrue(success);
+ Query q3 = new TermQuery(new Term("another_field", "c"));
+ BooleanQuery bq2 = new BooleanQuery();
+ bq2.add(q, Occur.MUST);
+ bq2.add(q3, Occur.SHOULD);
+ try {
+ converter.convert(FIELD, bq2);
+ } catch (IllegalArgumentException e) {
+ success = false;
+ }
+ assertTrue(success);
+ }
+}
diff --git lucene/module-build.xml lucene/module-build.xml
index c68900a..027338e 100644
--- lucene/module-build.xml
+++ lucene/module-build.xml
@@ -628,4 +628,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+