diff --git dev-tools/idea/.idea/ant.xml dev-tools/idea/.idea/ant.xml index 2cd14fd..b0ebf58 100644 --- dev-tools/idea/.idea/ant.xml +++ dev-tools/idea/.idea/ant.xml @@ -18,6 +18,7 @@ + diff --git dev-tools/idea/.idea/modules.xml dev-tools/idea/.idea/modules.xml index 5c096a6..cfdf28e 100644 --- dev-tools/idea/.idea/modules.xml +++ dev-tools/idea/.idea/modules.xml @@ -23,6 +23,7 @@ + diff --git dev-tools/idea/.idea/workspace.xml dev-tools/idea/.idea/workspace.xml index 2db9014..4fe00be 100644 --- dev-tools/idea/.idea/workspace.xml +++ dev-tools/idea/.idea/workspace.xml @@ -108,6 +108,14 @@ + + + + + - + @@ -339,32 +347,33 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git dev-tools/idea/lucene/concordance/concordance.iml dev-tools/idea/lucene/concordance/concordance.iml new file mode 100644 index 0000000..141f1ad --- /dev/null +++ dev-tools/idea/lucene/concordance/concordance.iml @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git dev-tools/maven/lucene/concordance/pom.xml.template dev-tools/maven/lucene/concordance/pom.xml.template new file mode 100644 index 0000000..dd4382e --- /dev/null +++ dev-tools/maven/lucene/concordance/pom.xml.template @@ -0,0 +1,69 @@ + + + 4.0.0 + + org.apache.lucene + lucene-parent + @version@ + ../pom.xml + + org.apache.lucene + lucene-concordance + jar + Lucene Concordance + Lucene Concordance Module + + lucene/concordance + ../../.. 
+ ${relative-top-level}/${module-directory} + + + scm:svn:${vc-anonymous-base-url}/${module-directory} + scm:svn:${vc-dev-base-url}/${module-directory} + ${vc-browse-base-url}/${module-directory} + + + + + org.apache.lucene + lucene-test-framework + test + + @lucene-concordance.internal.dependencies@ + @lucene-concordance.external.dependencies@ + @lucene-concordance.internal.test.dependencies@ + @lucene-concordance.external.test.dependencies@ + + + ${module-path}/src/java + ${module-path}/src/test + + + ${project.build.testSourceDirectory} + + **/*.java + + + + + + diff --git dev-tools/maven/lucene/pom.xml.template dev-tools/maven/lucene/pom.xml.template index e7551c4..580fec6 100644 --- dev-tools/maven/lucene/pom.xml.template +++ dev-tools/maven/lucene/pom.xml.template @@ -47,6 +47,7 @@ analysis benchmark classification + concordance demo expressions facet diff --git lucene/build.xml lucene/build.xml index 0b98bb6..5d1adb8 100644 --- lucene/build.xml +++ lucene/build.xml @@ -173,6 +173,7 @@ + diff --git lucene/concordance/build.xml lucene/concordance/build.xml new file mode 100644 index 0000000..20d955d --- /dev/null +++ lucene/concordance/build.xml @@ -0,0 +1,40 @@ + + + + + + Executes concordance search + + + + + + + + + + + + + + + + + + diff --git lucene/concordance/ivy.xml lucene/concordance/ivy.xml new file mode 100644 index 0000000..3ad64e3 --- /dev/null +++ lucene/concordance/ivy.xml @@ -0,0 +1,21 @@ + + + + diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/AbstractConcordanceWindowCollector.java lucene/concordance/src/java/org/apache/lucene/search/concordance/AbstractConcordanceWindowCollector.java new file mode 100644 index 0000000..37ed3b3 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/AbstractConcordanceWindowCollector.java @@ -0,0 +1,134 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Abstract class to handle basic information for a ConcordanceWindowSearcher + * + */ +public abstract class AbstractConcordanceWindowCollector { + //value to use if all windows should be collected + public static final int COLLECT_ALL = -1; + + private final ConcordanceSorter sorter = new ConcordanceSorter(); + + private Set docIds = new HashSet<>(); + private final int maxWindows; + private boolean hitMax = false; + private long totalDocs = 0; + + /** + * + * @param maxWindows maximum windows to collect + */ + public AbstractConcordanceWindowCollector(int maxWindows) { + this.maxWindows = maxWindows; + } + + /** + * Collect/process this window + * @param w window to be processed + */ + public abstract void collect(ConcordanceWindow w); + + /** + * + * @return number of windows collected + */ + public abstract int size(); + + /** + * + * @return collected windows (unsorted) + */ + public abstract List getWindows(); + + /** + * + * @param hitMax did the searcher collect the maximum number of windows + * and stop early + */ + public void setHitMax(boolean hitMax) { + this.hitMax = hitMax; + } + + /** + * + * @param docId unique key for a document + */ + 
public void addDocId(String docId) { + docIds.add(docId); + } + + /** + * + * Sort according to {@link #sorter} and return windows + * @return sorted list of windows + */ + public List getSortedWindows() { + List windows = getWindows(); + Collections.sort(windows, sorter); + return windows; + } + + /** + * + * @return whether or not the searcher collected the maximum number of + * windows and stopped early. + */ + public boolean getHitMax() { + return hitMax; + } + + /** + * + * @return the maximum number of windows to collect. + * Can be equal to {@link #COLLECT_ALL} + */ + public int getMaxWindows() { + return maxWindows; + } + + /** + * + * @param totalDocs see {@link #getTotalDocs()} + */ + public void setTotalDocs(long totalDocs) { + this.totalDocs = totalDocs; + } + + /** + * + * @param totalDocs add this value to {@link #totalDocs} + */ + public void incrementTotalDocs(long totalDocs) { + this.totalDocs += totalDocs; + } + + /** + * @return total number of documents in all indices + */ + public long getTotalDocs() { + return totalDocs; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java new file mode 100644 index 0000000..d1b0540 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcher.java @@ -0,0 +1,253 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StorableField; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.queries.BooleanFilter; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsets; +import org.apache.lucene.search.concordance.charoffsets.DocTokenOffsetsIterator; +import org.apache.lucene.search.concordance.charoffsets.OffsetLengthStartComparator; +import org.apache.lucene.search.concordance.charoffsets.OffsetUtil; +import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer; +import org.apache.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader; +import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetsReader; +import org.apache.lucene.search.spans.SimpleSpanQueryConverter; +import org.apache.lucene.search.spans.SpanQuery; + + +/** + * Searches an IndexReader and returns concordance windows via ConcordanceResults. 
+ */ +public class ConcordanceSearcher { + + /** + * Allow overlapping targets in hits, default = false + */ + private boolean allowTargetOverlaps = false; + + private WindowBuilder windowBuilder; + + private SimpleSpanQueryConverter spanQueryConverter; + + /** + * Constructor with default WindowBuilder and SimpleSpanQueryConverter + */ + public ConcordanceSearcher() { + this(new WindowBuilder(), new SimpleSpanQueryConverter()); + } + + /** + * Constructor for windowbuilder and SimpleSpanQueryConverter + * @param windowBuilder builder to use for windows + */ + public ConcordanceSearcher(WindowBuilder windowBuilder) { + this(windowBuilder, new SimpleSpanQueryConverter()); + } + + /** + * Constructor for windowBuilder and converter + * @param windowBuilder windowBuilder to use to build windows + * @param converter converter to use to convert Query to SpanQuery + */ + public ConcordanceSearcher(WindowBuilder windowBuilder, + SimpleSpanQueryConverter converter) { + this.windowBuilder = windowBuilder; + this.spanQueryConverter = converter; + } + + + /** + * + * @param reader reader to search + * @param fieldName field to build the windows on + * @param query if SpanQuery, this gets passed through as is. If a regular Query, the + * Query is first converted to a SpanQuery and the filter is modified + * to include the original Query. + * @param filter include a filter query. 
Value can be null + * @param analyzer analyzer to use for (re)calculating character offsets and for normalizing + * the sort keys + * @throws org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException + * @throws IllegalArgumentException + * @throws java.io.IOException + */ + public void search(IndexReader reader, String fieldName, Query query, + Filter filter, Analyzer analyzer, AbstractConcordanceWindowCollector collector) + throws TargetTokenNotFoundException, IllegalArgumentException, + IOException { + if (query == null) { + return; + } + if (query instanceof SpanQuery) { + // pass through + searchSpan(reader, (SpanQuery)query, filter, analyzer, collector); + } else { + // convert regular query to a SpanQuery. + SpanQuery spanQuery = spanQueryConverter.convert(fieldName, query); + + Filter origQueryFilter = new QueryWrapperFilter(query); + Filter updatedFilter = origQueryFilter; + + if (filter != null) { + BooleanFilter combinedFilter = new BooleanFilter(); + combinedFilter.add(origQueryFilter, BooleanClause.Occur.MUST); + combinedFilter.add(filter, BooleanClause.Occur.MUST); + updatedFilter = combinedFilter; + } + searchSpan(reader, spanQuery, updatedFilter, analyzer, collector); + } + } + + /** + * Like + * {@link #search(org.apache.lucene.index.IndexReader, String, org.apache.lucene.search.Query, org.apache.lucene.search.Filter, org.apache.lucene.analysis.Analyzer, AbstractConcordanceWindowCollector)} + * but this takes a SpanQuery + * + * @param reader reader to search + * @param spanQuery query to use to identify the targets + * @param filter filter for document retrieval + * @param analyzer to re-analyze terms for window calculations and sort key building + * @param collector to process (and store) the results + * @throws org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException + * @throws IllegalArgumentException + * @throws java.io.IOException + */ + public void searchSpan(IndexReader reader, + SpanQuery 
spanQuery, + Filter filter, Analyzer analyzer, AbstractConcordanceWindowCollector collector) + throws TargetTokenNotFoundException, IllegalArgumentException, + IOException { + + spanQuery = (SpanQuery)spanQuery.rewrite(reader); + DocTokenOffsetsIterator itr = new DocTokenOffsetsIterator(); + Set fields = new HashSet<>( + windowBuilder.getFieldSelector()); + fields.add(spanQuery.getField()); + itr.reset(spanQuery, filter, reader, fields); + buildResults(itr, reader, spanQuery.getField(), analyzer, collector); + + } + + private void buildResults(DocTokenOffsetsIterator itr, + IndexReader reader, String fieldName, Analyzer analyzer, AbstractConcordanceWindowCollector collector) + throws IllegalArgumentException, TargetTokenNotFoundException, + IOException { + + collector.setTotalDocs(reader.numDocs()); + TokenCharOffsetRequests requests = new TokenCharOffsetRequests(); + + TokenCharOffsetsReader tokenOffsetsRecordReader = + new ReanalyzingTokenCharOffsetsReader(analyzer); + + RandomAccessCharOffsetContainer offsetResults = new RandomAccessCharOffsetContainer(); + DocTokenOffsets result = null; + OffsetLengthStartComparator offsetLengthStartComparator = new OffsetLengthStartComparator(); + boolean stop = false; + while (itr.next() && !stop) { + result = itr.getDocTokenOffsets(); + StoredDocument document = result.getDocument(); + + String[] fieldValues = document.getValues(fieldName); + + if (fieldValues == null || fieldValues.length == 0) { + throwMissingField(document); + } + Map metadata = windowBuilder.extractMetadata(document); + String docId = windowBuilder.getUniqueDocumentId(document, result.getUniqueDocId()); + + List tokenOffsets = result.getOffsets(); + if (! allowTargetOverlaps) { + // remove overlapping hits!!! 
+ tokenOffsets = OffsetUtil.removeOverlapsAndSort(tokenOffsets, + offsetLengthStartComparator, null); + } + + //clear then get new requests + requests.clear(); + ConcordanceSearcherUtil.getCharOffsetRequests(tokenOffsets, + windowBuilder.getTokensBefore(), windowBuilder.getTokensAfter(), requests); + + offsetResults.clear(); + + tokenOffsetsRecordReader.getTokenCharOffsetResults( + document, fieldName, requests, offsetResults); + + for (OffsetAttribute offset : tokenOffsets) { + + ConcordanceWindow w = windowBuilder.buildConcordanceWindow( + docId, offset.startOffset(), + offset.endOffset() - 1, fieldValues, + offsetResults, metadata); + + collector.collect(w); + if (collector.getHitMax()) { + stop = true; + break; + } + } + } + } + + /** + * Spans can overlap: a search for ["ab cd" "ab"] would have + * two spans on the string "ab cd" if this is set to true. + * If this is set to false, this will return the longest span + * that appears earliest in the string if there is overlap. + * + * @param allowTargetOverlaps are targets allowed to overlap. + */ + public void setAllowTargetOverlaps(boolean allowTargetOverlaps) { + this.allowTargetOverlaps = allowTargetOverlaps; + } + + private void throwMissingField(StoredDocument document) throws IllegalArgumentException { + StringBuilder sb = new StringBuilder(); + sb.append("Did you forget to load or specify the correct content field?!"); + sb.append("\n"); + sb.append("I only see these fields:\n"); + + for (StorableField f : document.getFields()) { + sb.append(f.name()).append("\n"); + } + throw new IllegalArgumentException(sb.toString()); + } + + /** + * Set the converter to use to convert a Query to a SpanQuery. + * The need for this will go away when LUCENE-2878 is completed. 
+ * @param converter converter from Query to SpanQuery + */ + public void setSpanQueryConverter(SimpleSpanQueryConverter converter){ + this.spanQueryConverter = converter; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java new file mode 100644 index 0000000..83a331d --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSearcherUtil.java @@ -0,0 +1,59 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.search.concordance.charoffsets.TokenCharOffsetRequests; + +/** + * In other applications with variations on the ConcordanceSearcher, it has been + * useful to factor out the getCharOffsetRequests. + * + * This class should be used for functionality that is generally useful for + * concordance searching. 
+ *
+ */
+public class ConcordanceSearcherUtil {
+
+
+  /**
+   * Simple utility method to build a TokenCharOffsetRequests object
+   * from a list of desired tokenOffsets, the number of tokensBefore
+   * and the number of tokensAfter.
+   *
+   * @param tokenOffsets the tokenOffsets that are desired
+   * @param tokensBefore the number of tokens before a desired tokenOffset
+   * @param tokensAfter the number of tokens after a desired tokenOffset
+   * @param requests request object filled in place; note that the visible
+   *                 caller (ConcordanceSearcher.buildResults) clear()s it
+   *                 before each call — contents are appended, not replaced
+   */
+  public static void getCharOffsetRequests(
+      List tokenOffsets,
+      int tokensBefore, int tokensAfter,
+      TokenCharOffsetRequests requests) {
+
+    for (OffsetAttribute tokenOffset : tokenOffsets) {
+      //clamp the window start so requests never go below offset 0
+      int start = tokenOffset.startOffset() - tokensBefore;
+      start = (start < 0) ? 0 : start;
+      //the +1 makes the exclusive loop bound include endOffset() + tokensAfter
+      int end = tokenOffset.endOffset() + tokensAfter+1;
+      for (int i = start; i < end; i++) {
+        requests.add(i);
+      }
+    }
+  }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortKey.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortKey.java
new file mode 100644
index 0000000..a599a7f
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortKey.java
@@ -0,0 +1,61 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Simple comparable class to allow for subclassing.
+ *
+ */
+public class ConcordanceSortKey implements Comparable {
+
+  private final String concSortString;
+
+  public ConcordanceSortKey(String s) {
+    this.concSortString = s;
+  }
+
+  @Override
+  public int compareTo(ConcordanceSortKey other) {
+    //NOTE(review): NPEs if either concSortString is null; equals() below
+    //null-guards the same field — confirm whether null keys can be built.
+    return concSortString.compareTo(other.concSortString);
+  }
+
+  @Override
+  public int hashCode() {
+    //NOTE(review): also NPEs on a null concSortString, unlike equals().
+    return concSortString.hashCode();
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (!(obj instanceof ConcordanceSortKey))
+      return false;
+    ConcordanceSortKey other = (ConcordanceSortKey) obj;
+    if (concSortString == null) {
+      if (other.concSortString != null)
+        return false;
+    } else if (!concSortString.equals(other.concSortString))
+      return false;
+    return true;
+  }
+
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java
new file mode 100644
index 0000000..85d3c15
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSortOrder.java
@@ -0,0 +1,32 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Options for sorting ConcordanceWindows
+ *
+ */
+public enum ConcordanceSortOrder {
+  PRE, //sort on the first token before the target, then the second word, etc.
+  POST, //sort on words after the target
+  TARGET_PRE, //sort on the target and then words before the target
+  TARGET_POST, //sort on the target and then words after the target
+  DOC, //sort on the Lucene document id
+  NONE //no sort
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java
new file mode 100644
index 0000000..6732e68
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceSorter.java
@@ -0,0 +1,32 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Comparator;
+
+
+/**
+ * Comparator that orders ConcordanceWindows by delegating to the
+ * compareTo of their sort keys (ConcordanceWindow#getSortKey()).
+ */
+public class ConcordanceSorter implements Comparator {
+  //NOTE(review): serialVersionUID is declared but this class does not
+  //implement java.io.Serializable — confirm whether Serializable was intended.
+  private static final long serialVersionUID = 7526472295622776147L;
+
+  @Override
+  public int compare(ConcordanceWindow w1, ConcordanceWindow w2) {
+
+    return w1.getSortKey().compareTo(w2.getSortKey());
+  }
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java
new file mode 100644
index 0000000..0cebcb6
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindow.java
@@ -0,0 +1,181 @@
+package org.apache.lucene.search.concordance;
+
+import java.util.Map;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Key element in a concordance view of data. A window consists of the words
+ * before a target term (pre), the target term and then the words after the
+ * target term (post). A window also has a sort key to allow for various methods
+ * of sorting.
+ * + * For various applications, it has also been useful to store a unique document key, + * character offset (start and end) of the full + * window as well as metadata from the document for the given window. + * + * This class is experimental and may change in incompatible ways in the future. + * + * Areas for improvement: + * 1) convert sortKey to an array of Comparables + * 2) ... + */ +public class ConcordanceWindow { + + private final ConcordanceSortKey sortKey; + private final String pre; + private final String target; + private final String post; + private final int charStart; + private final int charEnd; + private final String uniqueDocID; + //used by hide duplicates to count more than one occurrence of a window + private int count = 1; + private Map metadata; + + /** + * + * @param uniqueDocID string representing what should be a unique document identifier + * @param charStart character offset start for the window + * @param charEnd character offset end for the window + * @param pre words before the target in reading order and unanalyzed + * @param target target string + * @param post string after the target in reading order and unanalyzed + * @param sortKey key to use for sorting this window + * @param metadata metadata to store with this window + */ + public ConcordanceWindow(String uniqueDocID, int charStart, int charEnd, String pre, + String target, String post, ConcordanceSortKey sortKey, Map metadata) { + this.pre = pre; + this.target = target; + this.post = post; + this.uniqueDocID = uniqueDocID; + this.charStart = charStart; + this.charEnd = charEnd; + this.metadata = metadata; + this.sortKey = sortKey; + } + + public String getUniqueDocID() { + return uniqueDocID; + } + + public int getStart() { + return charStart; + } + + public int getEnd() { + return charEnd; + } + + public Map getMetadata() { + return metadata; + } + + public String getPre() { + return pre; + } + + public String getPost() { + return post; + } + + public String getTarget() { + 
return target; + } + + public int getCount() { + return count; + } + + public void incrementCount() { + count++; + } + + public void setCount(int count) { + this.count = count; + } + + public int getSize() { + int size = 0; + if (pre != null) { + size += pre.length(); + } + if (target != null) { + size += target.length(); + } + if (post != null) { + size += post.length(); + } + return size; + } + + public ConcordanceSortKey getSortKey() { + return sortKey; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ConcordanceWindow that = (ConcordanceWindow) o; + + if (charEnd != that.charEnd) return false; + if (charStart != that.charStart) return false; + if (count != that.count) return false; + if (metadata != null ? !metadata.equals(that.metadata) : that.metadata != null) return false; + if (post != null ? !post.equals(that.post) : that.post != null) return false; + if (pre != null ? !pre.equals(that.pre) : that.pre != null) return false; + if (sortKey != null ? !sortKey.equals(that.sortKey) : that.sortKey != null) return false; + if (target != null ? !target.equals(that.target) : that.target != null) return false; + if (uniqueDocID != null ? !uniqueDocID.equals(that.uniqueDocID) : that.uniqueDocID != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = sortKey != null ? sortKey.hashCode() : 0; + result = 31 * result + (pre != null ? pre.hashCode() : 0); + result = 31 * result + (target != null ? target.hashCode() : 0); + result = 31 * result + (post != null ? post.hashCode() : 0); + result = 31 * result + charStart; + result = 31 * result + charEnd; + result = 31 * result + (uniqueDocID != null ? uniqueDocID.hashCode() : 0); + result = 31 * result + count; + result = 31 * result + (metadata != null ? 
metadata.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return "ConcordanceWindow{" + + "sortKey=" + sortKey + + ", pre='" + pre + '\'' + + ", target='" + target + '\'' + + ", post='" + post + '\'' + + ", charStart=" + charStart + + ", charEnd=" + charEnd + + ", uniqueDocID='" + uniqueDocID + '\'' + + ", count=" + count + + ", metadata=" + metadata + + '}'; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindowCollector.java lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindowCollector.java new file mode 100644 index 0000000..66d9d32 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/ConcordanceWindowCollector.java @@ -0,0 +1,53 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Collector that stores every window it is handed, up to getMaxWindows()
+ * (unbounded when constructed with COLLECT_ALL).
+ *
+ * NOTE(review): once capacity is reached, the window that triggered the
+ * overflow is silently discarded and hitMax is set — confirm callers
+ * expect drop-on-overflow rather than collect-then-stop.
+ */
+public class ConcordanceWindowCollector extends AbstractConcordanceWindowCollector{
+
+  //windows in collection (i.e. insertion) order
+  private List windows = new ArrayList<>();
+
+  /**
+   * @param maxWindows maximum number of windows to collect, or
+   *                   AbstractConcordanceWindowCollector.COLLECT_ALL
+   */
+  public ConcordanceWindowCollector(int maxWindows) {
+    super(maxWindows);
+  }
+
+  /**
+   * Store the window and record its document id, unless the cap is reached.
+   */
+  @Override
+  public void collect(ConcordanceWindow w) {
+    if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL
+        && windows.size() >= getMaxWindows()) {
+      setHitMax(true);
+      return;
+    }
+    windows.add(w);
+    addDocId(w.getUniqueDocID());
+  }
+
+  /** @return number of windows collected so far */
+  @Override
+  public int size() {
+    return windows.size();
+  }
+
+  /** @return the live internal list of collected windows (unsorted) */
+  @Override
+  public List getWindows() {
+    return windows;
+  }
+
+
+}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DedupingConcordanceWindowCollector.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DedupingConcordanceWindowCollector.java
new file mode 100644
index 0000000..660c3f7
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DedupingConcordanceWindowCollector.java
@@ -0,0 +1,103 @@
+package org.apache.lucene.search.concordance;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Like ConcordanceWindowCollector, but this collector + * doesn't store duplicate windows. Windows are defined as duplicates by + * {@link #buildEqualityKey(ConcordanceWindow, StringBuilder)}. + * + */ +public class DedupingConcordanceWindowCollector extends AbstractConcordanceWindowCollector { + + Map map = new HashMap<>(); + private StringBuilder sb = new StringBuilder(); + + /** + * + * @param maxHits maximum number of windows to store. This could potentially + * visit lots more windows than maxHits. + */ + public DedupingConcordanceWindowCollector(int maxHits) { + super(maxHits); + } + + @Override + public void collect(ConcordanceWindow w) { + if (getHitMax() == true) { + return; + } + buildEqualityKey(w, sb); + String key = sb.toString(); + ConcordanceWindow oldWindow = map.get(key); + if (oldWindow == null) { + //we would have added a new window here + if (getMaxWindows() != AbstractConcordanceWindowCollector.COLLECT_ALL && + map.size() >= getMaxWindows()) { + setHitMax(true); + return; + } + oldWindow = w; + } else { + //if the old window existed (i.e. new window is a duplicate) + //keep incrementing the count + oldWindow.incrementCount(); + } + + map.put(key, oldWindow); + } + + + /** + * number of windows collected + */ + @Override + public int size() { + return map.size(); + } + + @Override + public List getWindows() { + List windows = new ArrayList<>(); + windows.addAll(map.values()); + return windows; + } + + /** + * Public for easy overriding. Generate a key to be used to determine + * whether two windows are the same. Some implementations + * might want to lowercase, some might want genuine case folding, + * some might want to strip non-alphanumerics, etc. + + * @param w ConcordanceWindow + * @param sb reuseable StringBuilder; sb.setLength(0) is called before use! 
+ */ + public void buildEqualityKey(ConcordanceWindow w, StringBuilder sb) { + sb.setLength(0); + sb.append(w.getPre().toLowerCase()); + sb.append(">>>"); + sb.append(w.getTarget().toLowerCase()); + sb.append("<<<"); + sb.append(w.getPost().toLowerCase()); + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DefaultSortKeyBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DefaultSortKeyBuilder.java new file mode 100644 index 0000000..275a55c --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DefaultSortKeyBuilder.java @@ -0,0 +1,149 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Map; + +import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer; + +/** + * Builds basic sort key for the values available in ConcordanceSortOrder + */ +public class DefaultSortKeyBuilder implements SortKeyBuilder { + + private final static String SPACE = " "; + private final static String EMPTY_STRING = ""; + //what filler to use when a "term" comes back as null from the + //TokenCharOffsetResults + private static String NULL_FILLER = ""; + private final ConcordanceSortOrder sortOrder; + + /** + * Calls {@link #DefaultSortKeyBuilder(ConcordanceSortOrder)} + * with value of: ConcordanceSortOrder.PRE + */ + public DefaultSortKeyBuilder() { + this.sortOrder = ConcordanceSortOrder.PRE; + } + + /** + * + * @param sortOrder sort order to use + */ + public DefaultSortKeyBuilder(ConcordanceSortOrder sortOrder) { + this.sortOrder = sortOrder; + } + + @Override + public ConcordanceSortKey buildKey(String docKey, + int startTargetTokenOffset, + int endTargetTokenOffset, + RandomAccessCharOffsetContainer charOffsets, + int tokensBefore, int tokensAfter, + Map metadata) { + + if (sortOrder == ConcordanceSortOrder.NONE) { + return new ConcordanceSortKey(EMPTY_STRING); + } + + if (sortOrder == ConcordanceSortOrder.DOC) { + int targCharStart = charOffsets.getCharacterOffsetStart(startTargetTokenOffset); + return new DocumentOrderSortKey(docKey, targCharStart); + } + + StringBuilder sb = new StringBuilder(); + //order is important for appending to sb, target must come before pre/post + if (sortOrder == ConcordanceSortOrder.TARGET_POST + || sortOrder == ConcordanceSortOrder.TARGET_PRE) { + + for (int i = startTargetTokenOffset; i <= endTargetTokenOffset; i++) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } else { + sb.append(NULL_FILLER); + } + } + } + + if (sortOrder == ConcordanceSortOrder.PRE + || sortOrder == ConcordanceSortOrder.TARGET_PRE) { + int 
tmpStart = startTargetTokenOffset - 1; + int tmpEnd = Math.max(0, startTargetTokenOffset - tokensBefore); + if (tmpStart < 0) { + sb.append(SPACE); + } + + for (int i = tmpStart; i >= tmpEnd; i--) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } else { + sb.append(NULL_FILLER); + } + } + + } else if (sortOrder == ConcordanceSortOrder.POST + || sortOrder == ConcordanceSortOrder.TARGET_POST) { + + int tmpStart = endTargetTokenOffset + 1; + int tmpEnd = Math.min(charOffsets.getLast(), endTargetTokenOffset+tokensAfter); + + if (tmpStart > charOffsets.getLast()) { + sb.append(SPACE); + } + for (int i = tmpStart; i <= tmpEnd; i++) { + String tmp = charOffsets.getTerm(i); + if (tmp != null && tmp.length() > 0) { + sb.append(tmp).append(SPACE); + } else { + sb.append(NULL_FILLER); + } + } + } + return new ConcordanceSortKey(sb.toString().trim()); + } + + @Override + public boolean requiresAnalysisOfPre() { + if (sortOrder == ConcordanceSortOrder.PRE + || sortOrder == ConcordanceSortOrder.TARGET_PRE) { + return true; + } + return false; + } + + @Override + public boolean requiresAnalysisOfPost() { + if (sortOrder == ConcordanceSortOrder.POST + || sortOrder == ConcordanceSortOrder.TARGET_POST) { + return true; + } + return false; + } + + @Override + public boolean requiresAnalysisOfTarget() { + if (sortOrder == ConcordanceSortOrder.TARGET_PRE + || sortOrder == ConcordanceSortOrder.TARGET_POST) { + return true; + } + return false; + } + +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DocIdBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DocIdBuilder.java new file mode 100644 index 0000000..8772b2a --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocIdBuilder.java @@ -0,0 +1,30 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.StoredDocument;

/**
 * Builds a unique String identifier for a document, used as the key during
 * ConcordanceWindow building.
 *
 * NOTE(review): the original javadoc here ("Pair of field index and character
 * offset...") appears to be copy-pasted from another class and does not
 * describe this interface.
 */
public interface DocIdBuilder {
  /**
   * @param document stored document to derive the key from
   * @param docId    ephemeral Lucene document id, available as a fallback
   * @return unique key for the document
   */
  public String build(StoredDocument document, long docId);
}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DocMetadataExtractor.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DocMetadataExtractor.java
new file mode 100644
index 0000000..6f6216f
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocMetadataExtractor.java
package org.apache.lucene.search.concordance;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.StoredDocument;

/**
 * Simple interface for a component that extracts metadata from
 * a document to be stored with a ConcordanceWindow.
 */
public interface DocMetadataExtractor {
  /**
   * @return the fields that need to be retrieved for the document
   *         for proper processing
   */
  public Set<String> getFieldSelector();

  /**
   * @param document document to be processed for metadata. Only those fields
   *                 that were returned by {@link #getFieldSelector()} will be
   *                 loaded in the document
   * @return document metadata to be stored with each window
   */
  public Map<String, String> extract(StoredDocument document);
}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentOrderSortKey.java lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentOrderSortKey.java
new file mode 100644
index 0000000..ed16cbc
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/DocumentOrderSortKey.java
package org.apache.lucene.search.concordance;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This sorts based alphabetically on the document key + * and then numerically on the targetCharStart + * + */ +public class DocumentOrderSortKey extends ConcordanceSortKey{ + + protected final int targetCharStart; + + public DocumentOrderSortKey(String docKey, int targetCharStart) { + super(docKey); + this.targetCharStart = targetCharStart; + } + + @Override + public int compareTo(ConcordanceSortKey o) { + if (o instanceof DocumentOrderSortKey) { + DocumentOrderSortKey other = (DocumentOrderSortKey)o; + int cmp = super.compareTo(o); + if (cmp == 0) { + return Integer.compare(targetCharStart, other.targetCharStart); + } + return cmp; + } else { + return super.compareTo(o); + } + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/FieldBasedDocIdBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/FieldBasedDocIdBuilder.java new file mode 100644 index 0000000..75d70fd --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/FieldBasedDocIdBuilder.java @@ -0,0 +1,69 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.index.StorableField; +import org.apache.lucene.index.StoredDocument; + +/** + * Simple class that grabs the stringValue() of a specified + * field to use as a document's unique key for the ConcordanceWindow + * building process. + * + * Note that this takes only the first value of the field. + * If a multi-valued field is selected, surprises might happen. + * + * Also, note that if the field is not found, this returns + * a string representation of the ephemeral Lucene docId. + * + * Some users might want to throw an exception instead of this behavior. 
+ * + */ +public class FieldBasedDocIdBuilder implements DocIdBuilder { + + private final String fieldName; + + /** + * + * @param fieldName, name of field to be used as a document's unique key + */ + public FieldBasedDocIdBuilder(String fieldName) { + this.fieldName = fieldName; + } + + @Override + public String build(StoredDocument d, long docId) { + StorableField field = d.getField(fieldName); + if (field == null) { + return Long.toString(docId); + } + return field.toString(); + } + /** + * Instead of getField(String fieldName), this allows for extension + * @return single field to grab from doc + */ + public Set getFields() { + Set fields = new HashSet<>(); + fields.add(fieldName); + return fields; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/IndexIdDocIdBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/IndexIdDocIdBuilder.java new file mode 100644 index 0000000..ae8e490 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/IndexIdDocIdBuilder.java @@ -0,0 +1,35 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.StoredDocument; + +/** + * Simple id builder based on ephemeral Lucene doc ids. + * Use this only if your documents do not have a unique key. + * Then, use only with great care. + * + */ +public class IndexIdDocIdBuilder implements DocIdBuilder { + + @Override + public String build(StoredDocument d, long docId) { + return Long.toString(docId); + } + +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/SimpleDocMetadataExtractor.java lucene/concordance/src/java/org/apache/lucene/search/concordance/SimpleDocMetadataExtractor.java new file mode 100644 index 0000000..6b42410 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/SimpleDocMetadataExtractor.java @@ -0,0 +1,65 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.StoredDocument; + +/** + * Simple class that returns a map of key value pairs + * for the fields specified by {@link #setFieldSelector(java.util.Set)}. + *

+ * For multi-valued fields, this will take only the first value. + * + */ +public class SimpleDocMetadataExtractor implements DocMetadataExtractor { + + private Set fields = new HashSet<>(); + + public void setFieldSelector(Set f) { + fields.clear(); + for (String s : f) { + fields.add(s); + } + } + + @Override + public Set getFieldSelector() { + return Collections.unmodifiableSet(fields); + } + + @Override + public Map extract(StoredDocument d) { + Map map = new HashMap<>(); + // only takes the first value in a multi-valued field!!! + for (String fieldName : getFieldSelector()) { + String[] fieldValues = d.getValues(fieldName); + + if (fieldValues != null && fieldValues.length > 0) { + map.put(fieldName, fieldValues[0]); + } + } + return map; + } + +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/SortKeyBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/SortKeyBuilder.java new file mode 100644 index 0000000..d8ff324 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/SortKeyBuilder.java @@ -0,0 +1,46 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */


import java.util.Map;

import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;

/**
 * Builds a ConcordanceSortKey for a window from token/char offset data.
 */
public interface SortKeyBuilder {

  /**
   * Builds a sort key from the char-offset container.
   * NOTE(review): the original comment said "TokenCharOffsetResults", which
   * appears to be a stale name for RandomAccessCharOffsetContainer — confirm.
   *
   * @param docKey to be used if sorting by document key
   * @param startTargetTokenOffset target start offset
   * @param endTargetTokenOffset target end offset
   * @param charOffsets charOffsets to use for lookup
   * @param numTokensPre number of tokens to include before target
   * @param numTokensPost number of tokens to include after target
   * @param metadata metadata to use
   * @return ConcordanceSortKey
   */
  ConcordanceSortKey buildKey(String docKey,
      int startTargetTokenOffset, int endTargetTokenOffset,
      RandomAccessCharOffsetContainer charOffsets,
      int numTokensPre, int numTokensPost, Map<String, String> metadata);

  /** @return whether building the key requires analyzing the pre-target text */
  public boolean requiresAnalysisOfPre();

  /** @return whether building the key requires analyzing the post-target text */
  public boolean requiresAnalysisOfPost();

  /** @return whether building the key requires analyzing the target text */
  public boolean requiresAnalysisOfTarget();
}
diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java
new file mode 100644
index 0000000..ac59bd5
--- /dev/null
+++ lucene/concordance/src/java/org/apache/lucene/search/concordance/WindowBuilder.java
package org.apache.lucene.search.concordance;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.concordance.charoffsets.RandomAccessCharOffsetContainer;
import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
import org.apache.lucene.search.concordance.charoffsets.TargetTokenNotFoundException;


/**
 *
 * Builds a ConcordanceWindow.
 *
 * This class includes basic functionality for building a window from token offsets.
 *
 * It also calls three other components:
 *

    + *
 *   1. DocIdBuilder - extracts or builds a unique key for each document
 *   2. DocMetadataExtractor - extracts metadata from a document to be stored with each window
 *   3. SortKeyBuilder - builds a window's sort key
+ * + */ +public class WindowBuilder { + + private static String INTER_MULTIVALUE_FIELD_PADDING = " | "; + private final static String EMPTY_STRING = ""; + + private final int tokensBefore; + private final int tokensAfter; + private final SortKeyBuilder sortKeyBuilder; + private final DocMetadataExtractor metadataExtractor; + private final DocIdBuilder docIdBuilder; + private final int offsetGap; + + public WindowBuilder() { + this( + 10, //tokens before + 10, //tokens after + 0, + new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), + new SimpleDocMetadataExtractor(), + new IndexIdDocIdBuilder() + ); + } + + public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap) { + this( + tokensBefore, + tokensAfter, + offsetGap, + new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), + new SimpleDocMetadataExtractor(), + new IndexIdDocIdBuilder() + ); + + } + + public WindowBuilder(int tokensBefore, int tokensAfter, int offsetGap, SortKeyBuilder sortKeyBuilder, + DocMetadataExtractor metadataExtractor, DocIdBuilder docIdBuilder) { + this.tokensBefore = tokensBefore; + this.tokensAfter = tokensAfter; + this.offsetGap = offsetGap; + this.sortKeyBuilder = sortKeyBuilder; + this.metadataExtractor = metadataExtractor; + this.docIdBuilder = docIdBuilder; + } + + /** + * Makes the assumption that the target token start and target token end can + * be found. If not, this returns a null. 
+ * @param uniqueDocID ephemeral internal lucene unique document id + * @param targetTokenStart + * Target's start token + * + * @param targetTokenEnd + * Target's end token + * @param fieldValues field values + * @param metadata + * Metadata to be stored with the window + * @param offsets + * TokenOffsetResults from + * @return ConcordanceWindow or null if character offset information cannot be + * found for both the targetTokenStart and the targetTokenEnd + */ + public ConcordanceWindow buildConcordanceWindow(String uniqueDocID, + int targetTokenStart, int targetTokenEnd, + String[] fieldValues, + RandomAccessCharOffsetContainer offsets, Map metadata) + throws TargetTokenNotFoundException, + IllegalArgumentException { + + if (targetTokenStart < 0 || targetTokenEnd < 0) { + throw new IllegalArgumentException( + "targetTokenStart and targetTokenEnd must be >= 0"); + } + if (targetTokenEnd < targetTokenStart) { + throw new IllegalArgumentException( + "targetTokenEnd must be >= targetTokenStart"); + } + + int targetCharStart = offsets.getCharacterOffsetStart(targetTokenStart); + int targetCharEnd = offsets.getCharacterOffsetEnd(targetTokenEnd); + + if (targetCharStart < 0 || + targetCharEnd < 0) { + throw new TargetTokenNotFoundException( + "couldn't find character offsets for a target token.\n" + + "Check that your analyzers are configured properly.\n"); + } + + OffsetAttribute preCharOffset = getPreCharOffset(targetTokenStart, + targetCharStart, offsets); + String preString = (preCharOffset == null) ? EMPTY_STRING : + SimpleAnalyzerUtil.substringFromMultiValuedFields( + preCharOffset.startOffset(), preCharOffset.endOffset(), fieldValues, + offsetGap, INTER_MULTIVALUE_FIELD_PADDING); + + OffsetAttribute postCharOffset = getPostCharOffset(targetTokenEnd, + targetCharEnd, offsets); + + String postString = (postCharOffset == null) ? 
EMPTY_STRING : + SimpleAnalyzerUtil.substringFromMultiValuedFields( + postCharOffset.startOffset(), postCharOffset.endOffset(), fieldValues, + offsetGap, INTER_MULTIVALUE_FIELD_PADDING); + + String targString = SimpleAnalyzerUtil.substringFromMultiValuedFields( + targetCharStart, targetCharEnd, fieldValues, + offsetGap, INTER_MULTIVALUE_FIELD_PADDING); + ConcordanceSortKey sortKey = sortKeyBuilder.buildKey(uniqueDocID, + targetTokenStart, targetTokenEnd, offsets, tokensBefore, tokensAfter, metadata); + + int charStart = (preCharOffset == null) ? targetCharStart : + preCharOffset.startOffset(); + + int charEnd = (postCharOffset == null) ? targetCharEnd : postCharOffset.endOffset(); + return new ConcordanceWindow(uniqueDocID, charStart, charEnd, preString, targString, + postString, sortKey, metadata); + + } + + + private OffsetAttribute getPreCharOffset(int targetTokenStart, + int targetCharStart, + RandomAccessCharOffsetContainer charOffsets) { + if (tokensBefore == 0) + return null; + + if (targetTokenStart == 0) { + return null; + } + int contextTokenStart = Math.max(0, + targetTokenStart - tokensBefore); + + int contextCharStart = charOffsets.getClosestCharStart(contextTokenStart, targetTokenStart); + //closest start wasn't actually found + //this can happen if there is a large posInc and the target + //lands at the start of a field index + if (contextCharStart < 0) { + return null; + } + int contextCharEnd = Math.max(contextCharStart, targetCharStart - 1); + + return buildOffsetAttribute(contextCharStart, contextCharEnd); + } + + private OffsetAttribute getPostCharOffset(int targetTokenEnd, + int targetCharEnd, + RandomAccessCharOffsetContainer charOffsets) { + + if (tokensAfter == 0) + return null; + + int contextTokenEnd = targetTokenEnd + tokensAfter; + int contextCharStart = targetCharEnd; + int contextCharEnd = charOffsets.getClosestCharEnd( + contextTokenEnd, targetTokenEnd + 1); + + if (contextCharStart >= contextCharEnd) { + return null; + } + return 
buildOffsetAttribute(contextCharStart, contextCharEnd); + } + + private OffsetAttribute buildOffsetAttribute(int start, int end) { + OffsetAttribute off = new OffsetAttributeImpl(); + off.setOffset(start, end); + return off; + } + + + public Set getFieldSelector() { + Set set = metadataExtractor.getFieldSelector(); + if (docIdBuilder instanceof FieldBasedDocIdBuilder) { + set.addAll(((FieldBasedDocIdBuilder)docIdBuilder).getFields()); + } + return set; + } + + /** + * Simple wrapper around metadataExtractor + * @param document from which to extract metadata + * @return map of metadata + */ + public Map extractMetadata(StoredDocument document) { + return metadataExtractor.extract(document); + } + + public String getUniqueDocumentId(StoredDocument document, long docId) { + return docIdBuilder.build(document, docId); + } + + public int getTokensBefore() { + return tokensBefore; + } + + public int getTokensAfter() { + return tokensAfter; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java new file mode 100644 index 0000000..82f6f03 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsets.java @@ -0,0 +1,91 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; + +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttributeImpl; +import org.apache.lucene.index.StoredDocument; + +/** + * Simple class to store a document id (leaf and unique), a StoredDocument, and the offsets + * for a SpanQuery hit + * + */ + +public class DocTokenOffsets { + private int leafDocId = -1; + private int uniqueId = -1; + private StoredDocument document = null; + private List offsets = new ArrayList<>(); + + public void setDocument(StoredDocument d) { + this.document = d; + } + public void addOffset(int start, int end) { + OffsetAttributeImpl offset = new OffsetAttributeImpl(); + offset.setOffset(start, end); + offsets.add(offset); + } + + public void reset(int base, int leafDocId, StoredDocument d, int start, int end) { + this.leafDocId = leafDocId; + this.uniqueId = base+leafDocId; + setDocument(d); + offsets.clear(); + addOffset(start,end); + } + + public List getOffsets() { + return offsets; + } + + public StoredDocument getDocument() { + return document; + } + + public int getLeafDocId() { + return leafDocId; + } + + public int getUniqueDocId() { + return uniqueId; + } + + public DocTokenOffsets deepishCopy() { + DocTokenOffsets copy = new DocTokenOffsets(); + copy.leafDocId = leafDocId; + copy.uniqueId = uniqueId; + copy.document = document; + List copyOffsets = new ArrayList<>(); + copyOffsets.addAll(offsets); + copy.offsets = copyOffsets; + return copy; + } + + public boolean isEmpty() { + 
return leafDocId < 0; + } + + public void pseudoEmpty() { + leafDocId = -1; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java new file mode 100644 index 0000000..690b0e6 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/DocTokenOffsetsIterator.java @@ -0,0 +1,163 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; + +/** + * Scaffolding/Sugar class around SpanQuery.getSpans(...). This allows the client + * to iterate on an IndexReader (not necessarily a leaf) by document (DocTokenOffsets). + */ +public class DocTokenOffsetsIterator { + /* + * NOT THREAD SAFE!!! + */ + private SpanQuery spanQuery; + private Filter filter; + private LinkedList leafReaders = new LinkedList<>(); + private LeafReader currReader = null; + private Set fields; + private Spans spans = null; + private DocTokenOffsets docTokenOffsets = new DocTokenOffsets(); + private DocTokenOffsets docTokenOffsetsBuffer = new DocTokenOffsets(); + private int currentBase = -1; + + private Map termMap = new HashMap<>(); + + public DocTokenOffsetsIterator() { + } + + public void reset(SpanQuery q, Filter f, IndexReader reader, Set fields) throws IOException { + + this.spanQuery = q; + this.filter = f; + + this.fields = fields; + leafReaders.addAll(reader.leaves()); + if (leafReaders.size() > 0) { + reinitSpans(); + } + } + + public boolean next() throws IOException { + + if (spans == null || docTokenOffsetsBuffer.isEmpty()) { + if (leafReaders.size() == 0) { + return false; + } else if (!reinitSpans()) { + return false; + } + + } + boolean currSpansHasMore = false; + while (spans.next()) { + if (spans.doc() == 
docTokenOffsetsBuffer.getLeafDocId()) { + docTokenOffsetsBuffer.addOffset(spans.start(), spans.end()); + } else { + currSpansHasMore = true; + break; + } + } + docTokenOffsets = docTokenOffsetsBuffer.deepishCopy(); + + if (currSpansHasMore) { + StoredDocument d = currReader.document(spans.doc(), fields); + docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end()); + } else { + docTokenOffsetsBuffer.pseudoEmpty(); + } + return true; + } + + public DocTokenOffsets getDocTokenOffsets() { + return docTokenOffsets; + } + + private boolean reinitSpans() throws IOException { + //must check that leafReaders.size() > 0 before running reinitSpans!!! + LeafReaderContext ctx = leafReaders.pop(); + currentBase = ctx.docBase; + currReader = ctx.reader(); + Bits bits = null; + Bits liveBits = currReader.getLiveDocs(); + //liveBits can be null if all of the docs are live!!! + if (filter == null) { + bits = liveBits; + } else { + DocIdSet idSet = filter.getDocIdSet(ctx, liveBits); + +/* only works in 5.x. branch, not trunk + if (idSet instanceof FixedBitSet) { + bits = (FixedBitSet)idSet; + } else {*/ + DocIdSetIterator itr = idSet.iterator(); + if (itr != null) { + FixedBitSet tmpBits = new FixedBitSet(currReader.maxDoc()); + while (itr.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + tmpBits.set(itr.docID()); + } + bits = tmpBits; + } + } + + /*bits() is optional; this doesn't work!!!! + bits = idSet.bits(); + */ + + //bits can be null if all the docs are live + //or if the filter returned an empty docidset. + if (filter != null && bits == null) { + if (leafReaders.size() > 0) { + return reinitSpans(); + } else { + return false; + } + } + + spans = spanQuery.getSpans(ctx, bits, termMap); + //can getSpans return null? 
+ if (spans != null && spans.next()) { + StoredDocument d = currReader.document(spans.doc(), fields); + + docTokenOffsetsBuffer.reset(currentBase, spans.doc(), d, spans.start(), spans.end()); + return true; + } else if (leafReaders.size() > 0) { + return reinitSpans(); + } else { + return false; + } + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java new file mode 100644 index 0000000..c19f61c --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetLengthStartComparator.java @@ -0,0 +1,50 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Comparator; +import java.io.Serializable; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Sorts length desc, start offset asc + * + */ + +public class OffsetLengthStartComparator implements Comparator, Serializable { + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) { + + int lenA = offsetA.endOffset()-offsetA.startOffset(); + int lenB = offsetB.endOffset()-offsetB.startOffset(); + if (lenA < lenB) { + return 1; + } else if (lenA > lenB) { + return -1; + //by here, the length is the same + } else if (offsetA.startOffset() < offsetB.startOffset()) { + return -1; + } else if (offsetA.startOffset() > offsetB.startOffset()) { + return 1; + } + return 0; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java new file mode 100644 index 0000000..685dbb2 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetStartComparator.java @@ -0,0 +1,42 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Comparator; +import java.io.Serializable; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * sort on offset start + */ +public class OffsetStartComparator implements Comparator, Serializable{ + private static final long serialVersionUID = 7526472295622776147L; + + @Override + public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) { + + if (offsetA.startOffset() < offsetB.startOffset()) { + return -1; + } else if (offsetA.startOffset() > offsetB.startOffset()) { + return 1; + } + return 0; + } + +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java new file mode 100644 index 0000000..369925d --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/OffsetUtil.java @@ -0,0 +1,68 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * In some versions of Lucene, getSpans returned overlapping spans. + * This class can remove the overlapping spans and will sort them + * if startComparator is not null. + * + * + */ +public class OffsetUtil { + + + public static List removeOverlapsAndSort(List offsets, + OffsetLengthStartComparator comparator, + OffsetStartComparator startComparator) { + if (offsets == null || offsets.size() < 2) + return offsets; + + Collections.sort(offsets, comparator); + Set seen = new HashSet<>(); + List filtered = new ArrayList<>(); + for (OffsetAttribute offset : offsets) { + if (! alreadySeen(offset, seen)) { + filtered.add(offset); + for (int i = offset.startOffset(); i < offset.endOffset(); i++) { + seen.add(i); + } + } + } + if (startComparator != null) { + Collections.sort(filtered, startComparator); + } + return filtered; + } + + private static boolean alreadySeen(OffsetAttribute offset, Set seen) { + for (int i = offset.startOffset(); i <= offset.endOffset(); i++) { + if (seen.contains(i)) + return true; + } + return false; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/RandomAccessCharOffsetContainer.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/RandomAccessCharOffsetContainer.java new file mode 100644 index 0000000..a1857f4 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/RandomAccessCharOffsetContainer.java @@ -0,0 +1,225 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;

/**
 * Class to record results for looking up normalized terms (String) and
 * character offsets for specified tokens. Will return NULL_TERM/NULL_OFFSET if
 * a token offset was not found.
 * <p>
 * Has utility methods for safely getting the closest found token. This is
 * useful for when a concordance window ends in a stop word (no term/offset
 * info).
 */
public class RandomAccessCharOffsetContainer {

  public final static String NULL_TERM = "";
  public final static int NULL_OFFSET = -1;

  // token offsets that have been recorded via add()
  private BitSet set = new BitSet();
  // largest token offset seen so far; -1 when empty
  private int last = -1;
  // type parameters restored; patch rendering had stripped the generics
  private Map<Integer, String> terms = new HashMap<>();
  private Map<Integer, Integer> starts = new HashMap<>();
  private Map<Integer, Integer> ends = new HashMap<>();

  /**
   * Record the character offsets and term for one token.
   *
   * @param tokenOffset token of interest
   * @param startCharOffset start character offset
   * @param endCharOffset end character offset
   * @param term string term at that position
   */
  public void add(int tokenOffset, int startCharOffset,
      int endCharOffset, String term) {
    addStart(tokenOffset, startCharOffset);
    addEnd(tokenOffset, endCharOffset);
    addTerm(tokenOffset, term);
    set.set(tokenOffset);
  }

  private void addTerm(int tokenOffset, String term) {
    if (term != null) {
      terms.put(tokenOffset, term);
    }
    last = (tokenOffset > last) ? tokenOffset : last;
  }

  private void addStart(int tokenOffset, int charOffset) {
    starts.put(tokenOffset, charOffset);
    last = (tokenOffset > last) ? tokenOffset : last;
  }

  private void addEnd(int tokenOffset, int charOffset) {
    ends.put(tokenOffset, charOffset);
    last = (tokenOffset > last) ? tokenOffset : last;
  }

  /**
   * @param tokenOffset target token
   * @return the character offset of the first character of the token, or
   *         {@link #NULL_OFFSET} if tokenOffset wasn't found
   */
  public int getCharacterOffsetStart(int tokenOffset) {
    Integer start = starts.get(tokenOffset);
    if (start == null) {
      return NULL_OFFSET;
    }
    return start;
  }

  /**
   * @param tokenOffset target token
   * @return the character offset of the final character of the token, or
   *         {@link #NULL_OFFSET} if tokenOffset wasn't found
   */
  public int getCharacterOffsetEnd(int tokenOffset) {
    Integer end = ends.get(tokenOffset);
    if (end == null) {
      return NULL_OFFSET;
    }
    return end;
  }

  /**
   * @param tokenOffset tokenOffset
   * @return term stored at this tokenOffset; can return {@link #NULL_TERM}
   */
  public String getTerm(int tokenOffset) {
    String s = terms.get(tokenOffset);
    if (s == null) {
      return NULL_TERM;
    }
    return s;
  }

  /**
   * @return last/largest token offset, or -1 when empty
   */
  public int getLast() {
    return last;
  }

  /**
   * reset state for reuse
   */
  public void clear() {
    terms.clear();
    starts.clear();
    ends.clear();
    last = -1;
    set.clear();
  }

  protected boolean isEmpty() {
    return set.isEmpty();
  }

  /**
   * Find the closest token with a recorded value, scanning from startToken
   * toward stopToken (inclusive, either direction).
   *
   * @param startToken token at which to start the search
   * @param stopToken token at which to end
   * @param map map to search
   * @return closest recorded token offset to startToken; {@link #NULL_OFFSET}
   *         if none was found
   */
  private int getClosestToken(int startToken, int stopToken,
      Map<Integer, Integer> map) {
    if (startToken < 0 || stopToken < 0) {
      return NULL_OFFSET;
    }
    if (startToken == stopToken) {
      return startToken;
    }
    if (startToken < stopToken) {
      for (int i = startToken; i <= stopToken; i++) {
        Integer charOffset = map.get(i);
        if (charOffset != null && charOffset != NULL_OFFSET) {
          return i;
        }
      }
    } else if (startToken > stopToken) {
      for (int i = startToken; i >= stopToken; i--) {
        Integer charOffset = map.get(i);
        if (charOffset != null && charOffset != NULL_OFFSET) {
          return i;
        }
      }
    }
    return NULL_OFFSET;
  }

  public int getClosestCharStart(int startToken, int stopToken) {
    int i = getClosestToken(startToken, stopToken, starts);
    // getCharacterOffsetStart already yields NULL_OFFSET for a missing
    // token (including i == NULL_OFFSET); the old boxed-null check was dead
    return getCharacterOffsetStart(i);
  }

  public int getClosestCharEnd(int startToken, int stopToken) {
    int i = getClosestToken(startToken, stopToken, ends);
    return getCharacterOffsetEnd(i);
  }

  protected String getClosestTerm(int startToken, int stopToken) {
    int i = getClosestToken(startToken, stopToken, starts);
    return getTerm(i);
  }

  protected BitSet getSet() {
    return set;
  }

  /**
   * Remove all data recorded for a token; recomputes {@code last} when the
   * removed token was the last one.
   *
   * @param token token offset to remove
   */
  public void remove(int token) {
    if (token == last) {
      last = getClosestToken(last - 1, 0, starts);
    }
    set.clear(token);
    terms.remove(token);
    starts.remove(token);
    ends.remove(token);
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.index.StoredDocument; + +/** + * TokenCharOffsetsReader that captures character offsets by reanalyzing a + * field. + * + */ +public class ReanalyzingTokenCharOffsetsReader implements + TokenCharOffsetsReader { + + private final static int GOT_ALL_REQUESTS = -2; + private Analyzer baseAnalyzer; + + /** + * Constructor + * @param analyzer to use to get character offsets + */ + public ReanalyzingTokenCharOffsetsReader(Analyzer analyzer) { + this.baseAnalyzer = analyzer; + } + + @Override + public void getTokenCharOffsetResults(final StoredDocument d, + final String fieldName, final TokenCharOffsetRequests requests, + final RandomAccessCharOffsetContainer results) throws IOException { + + int fieldIndex = 0; + int currPosInc = -1; + int posIncrementGap = baseAnalyzer.getPositionIncrementGap(fieldName); + int charOffsetGap = baseAnalyzer.getOffsetGap(fieldName); + int charBase = 0; + for (String fieldValue : d.getValues(fieldName)) { + + currPosInc = addFieldValue(fieldIndex, currPosInc, charBase, fieldValue, requests, + results); + + if (currPosInc == GOT_ALL_REQUESTS) { + break; + } + charBase += fieldValue.length()+charOffsetGap; + currPosInc += posIncrementGap; + fieldIndex++; + } + + 
} + + private int addFieldValue(int fieldIndex, int currInd, int charBase, String fieldValue, + TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) + throws IOException { + //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true); + TokenStream stream = baseAnalyzer.tokenStream("", fieldValue); + stream.reset(); + + int defaultInc = 1; + + CharTermAttribute termAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + OffsetAttribute offsetAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class); + PositionIncrementAttribute incAtt = null; + if (stream + .hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) { + incAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class); + } + + while (stream.incrementToken()) { + + currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc; + if (requests.contains(currInd)) { + results.add(currInd, offsetAtt.startOffset()+charBase, + offsetAtt.endOffset()+charBase, termAtt.toString()); + } + if (currInd > requests.getLast()) { + // TODO: Is there a way to avoid this? Or, is this + // an imaginary performance hit? 
+ while (stream.incrementToken()) { + //NO-OP + } + stream.end(); + stream.close(); + return GOT_ALL_REQUESTS; + } + } + stream.end(); + stream.close(); + return currInd; + } + +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java new file mode 100644 index 0000000..10579bb --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/SimpleAnalyzerUtil.java @@ -0,0 +1,156 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + + +/** + * Simple util class for Analyzers + */ +public class SimpleAnalyzerUtil { + private final static String DEFAULT_FIELD = "FIELD"; + + /** + * Returns simple list of analyzed strings + * + * @param s string to analyze + * @param analyzer analyzer + * @return list of string tokens + * @throws java.io.IOException + */ + public static List getTermStrings(String s, Analyzer analyzer) + throws IOException { + List terms = new ArrayList<>(); + return getTermStrings(s, analyzer, terms); + } + + /** + * allows reuse of terms, this method calls terms.clear() before adding new + * terms + * + * @param s string to analyze + * @param analyzer analyzer + * @param terms list for reuse + * @return list of strings + * @throws java.io.IOException + */ + public static List getTermStrings(String s, Analyzer analyzer, + List terms) throws IOException { + if (terms == null) { + terms = new ArrayList<>(); + } + terms.clear(); + TokenStream stream = analyzer.tokenStream(DEFAULT_FIELD, s); + stream.reset(); + CharTermAttribute termAtt = stream + .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class); + + + while (stream.incrementToken()) { + terms.add(termAtt.toString()); + } + stream.end(); + stream.close(); + + return terms; + } + + /** + * This calculates a substring from an array of StorableFields. + * + * This attempts to do the best job possible, and at worst will + * return an empty string. If the start or end is within a gap, + * or before 0 or after the total number of characters, this will + * gracefully (blithely?) handle those cases. 
+ * + * + * @param start character offset to start + * @param end character offset to end + * @param fieldValues array of Strings to process + * @param offsetGap offsetGap as typically returned by Analyzer's .getOffsetGap() + * @param interFieldJoiner string to use to mark that a substring goes beyond a single + * field entry + * @return substring, potentially empty, never null. + */ + public static String substringFromMultiValuedFields(int start, + int end, String[] fieldValues, int offsetGap, String interFieldJoiner) { + start = (start < 0) ? 0 : start; + end = (end < 0) ? 0: end; + + if (start > end) { + start = end; + } + + int charBase = 0; + StringBuilder sb = new StringBuilder(); + int lastFieldIndex = 0; + int localStart = 0; + boolean foundStart = false; + //get start + for (int fieldIndex = 0; fieldIndex < fieldValues.length; fieldIndex++) { + String fString = fieldValues[fieldIndex]; + if (start < charBase+fString.length()) { + localStart = start-charBase; + lastFieldIndex = fieldIndex; + foundStart = true; + break; + } + charBase += fString.length()+offsetGap; + } + if (!foundStart) { + return ""; + } + //if start occurred in a gap, reset localStart to 0 + if (localStart < 0) { + sb.append(interFieldJoiner); + localStart = 0; + } + //now append and look for end + for (int fieldIndex = lastFieldIndex; fieldIndex < fieldValues.length; fieldIndex++) { + String fString = fieldValues[fieldIndex]; + + if (end <= charBase+fString.length()) { + int localEnd = end-charBase; + //must be in gap + if (charBase > end) { + return sb.toString(); + } + if (fieldIndex != lastFieldIndex) { + sb.append(interFieldJoiner); + } + sb.append(fString.substring(localStart, localEnd)); + break; + } else { + if (fieldIndex != lastFieldIndex) { + sb.append(interFieldJoiner); + } + sb.append(fString.substring(localStart)); + localStart = 0; + } + charBase += fString.length()+offsetGap; + } + return sb.toString(); + } +} diff --git 
lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java new file mode 100644 index 0000000..8ffc82b --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/charoffsets/TargetTokenNotFoundException.java @@ -0,0 +1,31 @@ +package org.apache.lucene.search.concordance.charoffsets; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Token offset identified by .getSpans() is not found in the TokenCharOffsetResults. + * Typical cause is a mismatch between analyzers at index and search times. + * When this happens, something very bad has happened and this should be its own exception. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.BitSet;

/**
 * Util class used to specify the token offsets whose character offsets
 * are requested.  Backed by a BitSet plus a running maximum.
 */
public class TokenCharOffsetRequests {
  private BitSet set = new BitSet();
  private int last = -1;

  /**
   * Is a specific token requested?
   *
   * @param i token number to test
   * @return whether or not this token is requested
   */
  public boolean contains(int i) {
    return set.get(i);
  }

  /**
   * Request every token offset from start to end, both inclusive.
   *
   * @param start first token offset to request (inclusive)
   * @param end last token offset to request (inclusive)
   */
  public void add(int start, int end) {
    for (int tokenOffset = start; tokenOffset <= end; tokenOffset++) {
      add(tokenOffset);
    }
  }

  /**
   * Request a specific token.
   *
   * @param i token offset to request the character offsets for
   */
  public void add(int i) {
    set.set(i);
    last = Math.max(last, i);
  }

  /**
   * Clear the state of this request object for reuse.
   */
  public void clear() {
    set.clear();
    last = -1;
  }

  /**
   * @return greatest/last token offset in the request, -1 when empty
   */
  public int getLast() {
    return last;
  }

  /**
   * @return the set of tokens whose character offsets are requested
   */
  protected BitSet getSet() {
    return set;
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.StoredDocument; + + +/** + * Interface to allow flexibility/optimizations in returning character offsets + * for tokens + */ +public interface TokenCharOffsetsReader { + + public void getTokenCharOffsetResults(final StoredDocument document, + final String fieldName, final TokenCharOffsetRequests requests, + final RandomAccessCharOffsetContainer results) throws IOException; +} diff --git lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html new file mode 100644 index 0000000..28bd921 --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/concordance/package.html @@ -0,0 +1,23 @@ + + + + + +ConcordanceSearcher performs a search on an index and returns concordance windows. + + diff --git lucene/concordance/src/java/org/apache/lucene/search/queries/SpanQueryConverter.java lucene/concordance/src/java/org/apache/lucene/search/queries/SpanQueryConverter.java new file mode 100644 index 0000000..7fe0bcc --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/queries/SpanQueryConverter.java @@ -0,0 +1,68 @@ +package org.apache.lucene.search.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.spans.SimpleSpanQueryConverter; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +/** + * This adds CommonTermsQuery to SimpleSpanQueryConverter. + * This had to be broken into a separate class to maintain + * clean compilation units (core vs. queries). + */ +public class SpanQueryConverter extends SimpleSpanQueryConverter { + + @Override + protected SpanQuery convertUnknownQuery(String field, Query query) { + if (query instanceof CommonTermsQuery) { + + // specialized since rewriting would change the result query + // this query is TermContext sensitive. 
+ CommonTermsQuery ctq = (CommonTermsQuery) query; + + Set terms = new HashSet<>(); + ctq.extractTerms(terms); + List spanQs = new LinkedList<>(); + + for (Term term : terms) { + if (term.field().equals(field)) { + spanQs.add(new SpanTermQuery(term)); + } + } + if (spanQs.size() == 0) { + return getEmptySpanQuery(); + } else if (spanQs.size() == 1) { + return spanQs.get(0); + } else { + return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])); + } + } + super.convertUnknownQuery(field, query); + return null; + } +} diff --git lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java new file mode 100644 index 0000000..a945b0f --- /dev/null +++ lucene/concordance/src/java/org/apache/lucene/search/spans/SimpleSpanQueryConverter.java @@ -0,0 +1,285 @@ +package org.apache.lucene.search.spans; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +public class SimpleSpanQueryConverter { + /** + * Converts a regular query to a {@link org.apache.lucene.search.spans.SpanQuery} for use in a highlighter. + * Because of subtle differences in {@link org.apache.lucene.search.spans.SpanQuery} and {@link org.apache.lucene.search.Query}, this + * {@link org.apache.lucene.search.spans.SpanQuery} will not necessarily return the same documents as the + * initial Query. For example, the generated SpanQuery will not include + * clauses of type BooleanClause.Occur.MUST_NOT. Also, the + * {@link org.apache.lucene.search.spans.SpanQuery} will only cover a single field, whereas the {@link org.apache.lucene.search.Query} + * might contain multiple fields. + *

+ * Returns an empty SpanQuery if the {@link org.apache.lucene.search.Query} is a class that + * is handled, but for some reason can't be converted from a {@link org.apache.lucene.search.Query} to a + * {@link org.apache.lucene.search.spans.SpanQuery}. This can happen for many reasons: e.g. if the Query + * contains no terms in the requested "field" or the Query is a MatchAllDocsQuery. + *

+ * Throws IllegalArgumentException if the Query is a class that is + * is not yet handled. + *

+ * This class does not rewrite the SpanQuery before returning it. + * Clients are required to rewrite if necessary. + *

+ * Much of this code is copied directly from + * oal.search.highlight.WeightedSpanTermExtractor. There are some subtle + * differences. + * + * @param field single field to extract SpanQueries for + * @param query query to convert + * @return SpanQuery for use in highlighting; can return empty SpanQuery + * @throws java.io.IOException, IllegalArgumentException + */ + public SpanQuery convert(String field, Query query) throws IOException { + /* + * copied nearly verbatim from + * org.apache.lucene.search.highlight.WeightedSpanTermExtractor + * TODO:refactor to avoid duplication of code if possible. + * Beware: there are some subtle differences. + */ + if (query instanceof SpanQuery) { + SpanQuery sq = (SpanQuery) query; + if (sq.getField().equals(field)) { + return (SpanQuery) query; + } else { + return getEmptySpanQuery(); + } + } else if (query instanceof BooleanQuery) { + BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); + List spanQs = new ArrayList<>(); + for (int i = 0; i < queryClauses.length; i++) { + if (!queryClauses[i].isProhibited()) { + tryToAdd(field, convert(field, queryClauses[i].getQuery()), spanQs); + } + } + if (spanQs.size() == 0) { + return getEmptySpanQuery(); + } else if (spanQs.size() == 1) { + return spanQs.get(0); + } else { + return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])); + } + } else if (query instanceof PhraseQuery) { + PhraseQuery phraseQuery = ((PhraseQuery) query); + + Term[] phraseQueryTerms = phraseQuery.getTerms(); + if (phraseQueryTerms.length == 0) { + return getEmptySpanQuery(); + } else if (!phraseQueryTerms[0].field().equals(field)) { + return getEmptySpanQuery(); + } + SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length]; + for (int i = 0; i < phraseQueryTerms.length; i++) { + clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); + } + int slop = phraseQuery.getSlop(); + int[] positions = phraseQuery.getPositions(); + // sum position increments (>1) and add to slop + if 
(positions.length > 0) { + int lastPos = positions[0]; + int sz = positions.length; + for (int i = 1; i < sz; i++) { + int pos = positions[i]; + int inc = pos - lastPos-1; + slop += inc; + lastPos = pos; + } + } + + boolean inorder = false; + + if (phraseQuery.getSlop() == 0) { + inorder = true; + } + + SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); + sp.setBoost(query.getBoost()); + return sp; + } else if (query instanceof TermQuery) { + TermQuery tq = (TermQuery) query; + if (tq.getTerm().field().equals(field)) { + return new SpanTermQuery(tq.getTerm()); + } else { + return getEmptySpanQuery(); + } + } else if (query instanceof FilteredQuery) { + return convert(field, ((FilteredQuery) query).getQuery()); + } else if (query instanceof ConstantScoreQuery) { + return convert(field, ((ConstantScoreQuery) query).getQuery()); + } else if (query instanceof DisjunctionMaxQuery) { + List spanQs = new ArrayList<>(); + for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator + .hasNext();) { + tryToAdd(field, convert(field, iterator.next()), spanQs); + } + if (spanQs.size() == 0) { + return getEmptySpanQuery(); + } else if (spanQs.size() == 1) { + return spanQs.get(0); + } else { + return new SpanOrQuery(spanQs.toArray(new SpanQuery[spanQs.size()])); + } + } else if (query instanceof MatchAllDocsQuery) { + return getEmptySpanQuery(); + } else if (query instanceof MultiPhraseQuery) { + + final MultiPhraseQuery mpq = (MultiPhraseQuery) query; + final List termArrays = mpq.getTermArrays(); + //test for empty or wrong field + if (termArrays.size() == 0) { + return getEmptySpanQuery(); + } else if (termArrays.size() > 1) { + Term[] ts = termArrays.get(0); + if (ts.length > 0) { + Term t = ts[0]; + if (!t.field().equals(field)) { + return getEmptySpanQuery(); + } + } + } + final int[] positions = mpq.getPositions(); + if (positions.length > 0) { + + int maxPosition = positions[positions.length - 1]; + for (int i = 0; i < positions.length - 1; 
++i) { + if (positions[i] > maxPosition) { + maxPosition = positions[i]; + } + } + + @SuppressWarnings("unchecked") + final List[] disjunctLists = new List[maxPosition + 1]; + int distinctPositions = 0; + + for (int i = 0; i < termArrays.size(); ++i) { + final Term[] termArray = termArrays.get(i); + List disjuncts = disjunctLists[positions[i]]; + if (disjuncts == null) { + disjuncts = (disjunctLists[positions[i]] = new ArrayList<>( + termArray.length)); + ++distinctPositions; + } + for (int j = 0; j < termArray.length; ++j) { + disjuncts.add(new SpanTermQuery(termArray[j])); + } + } + + int positionGaps = 0; + int position = 0; + final SpanQuery[] clauses = new SpanQuery[distinctPositions]; + for (int i = 0; i < disjunctLists.length; ++i) { + List disjuncts = disjunctLists[i]; + if (disjuncts != null) { + if (disjuncts.size() == 1) { + clauses[position++] = disjuncts.get(0); + } else { + clauses[position++] = new SpanOrQuery( + disjuncts.toArray(new SpanQuery[disjuncts.size()])); + } + } else { + ++positionGaps; + } + } + + final int slop = mpq.getSlop(); + final boolean inorder = (slop == 0); + + SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, + inorder); + sp.setBoost(query.getBoost()); + return sp; + } + + } else if (query instanceof MultiTermQuery) { + return new SpanMultiTermQueryWrapper<>((MultiTermQuery)query); + } + return convertUnknownQuery(field, query); + } + + private void tryToAdd(String field, SpanQuery q, List qs) { + if (q == null || isEmptyQuery(q) || !q.getField().equals(field)) { + return; + } + qs.add(q); + } + + /** + * Extend this to handle queries that are not currently handled. + * Might consider extending SpanQueryConverter in the queries compilation unit; + * that includes CommonTermsQuery. + * + * In this class, this always throws an IllegalArgumentException + * @param field field to convert + * @param query query to convert + * @return nothing. 
Throws IllegalArgumentException + */ + protected SpanQuery convertUnknownQuery(String field, Query query) { + throw new IllegalArgumentException("SpanQueryConverter is unable to convert this class "+ + query.getClass().toString()); + } + + /** + * + * @return an empty SpanQuery (SpanOrQuery with no cluases) + */ + protected SpanQuery getEmptySpanQuery() { + SpanQuery q = new SpanOrQuery(new SpanTermQuery[0]); + return q; + } + + /** + * Is this a null or empty SpanQuery + * @param q query to test + * @return whether a null or empty SpanQuery + */ + protected boolean isEmptyQuery(SpanQuery q) { + if (q == null) { + return true; + } + if (q instanceof SpanOrQuery) { + SpanOrQuery soq = (SpanOrQuery)q; + for (SpanQuery sq : soq.getClauses()) { + if (! isEmptyQuery(sq)) { + return false; + } + } + return true; + } + return false; + } +} diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestBase.java lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestBase.java new file mode 100644 index 0000000..50f412f --- /dev/null +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestBase.java @@ -0,0 +1,207 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

// Base class for the concordance tests: builds randomized in-memory indexes
// and analyzers shared by the concrete test classes.
public class ConcordanceTestBase extends LuceneTestCase {

  // field name every helper indexes into
  protected final static String FIELD = "f1";

  // Builds a directory with one single-valued document per input string.
  // Caller is responsible for closing the returned Directory.
  public Directory getDirectory(Analyzer analyzer, String[] vals)
      throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(analyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));

    for (String s : vals) {
      Document d = new Document();
      d.add(newTextField(FIELD, s, Field.Store.YES));
      writer.addDocument(d);
    }
    writer.close();
    return directory;
  }

  // Builds a directory with one document per element; each element's values
  // become a multivalued field.
  // NOTE(review): the parameter reads as raw List here, but the enhanced for
  // below iterates String[] elements — the original was presumably
  // List<String[]> with generics stripped in transit; confirm against VCS.
  public Directory getDirectory(Analyzer analyzer, List input)
      throws IOException {

    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(analyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
            .setMergePolicy(newLogMergePolicy()));

    for (String[] vals : input) {
      Document d = new Document();
      for (String s : vals) {
        d.add(newTextField(FIELD, s, Field.Store.YES));
      }
      writer.addDocument(d);
    }
    writer.close();
    return directory;
  }

  // Whitespace analyzer with the given stopset and randomized gaps.
  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops) {
    return getAnalyzer(stops, random().nextInt(10000), random().nextInt(10000));
  }

  // Whitespace analyzer with the given stopset and explicit position
  // increment / character offset gaps between field values.
  public static Analyzer getAnalyzer(final CharacterRunAutomaton stops,
      final int posIncGap, final int charOffsetGap) {

    Analyzer analyzer = new Analyzer() {

      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
        TokenFilter filter = new MockTokenFilter(tokenizer, stops);
        return new TokenStreamComponents(tokenizer, filter);
      }

      @Override
      public int getPositionIncrementGap(String fieldName) {
        return posIncGap;
      }

      @Override
      public int getOffsetGap(String fieldName) {
        return charOffsetGap;
      }
    };
    return analyzer;
  }

  // Builds an index whose single document has numFieldValues values, each
  // containing the needle at a random position among random filler words.
  // Offsets are stored so concordance windows can be rebuilt.
  protected Directory buildNeedleIndex(String needle,
      Analyzer analyzer, int numFieldValues) throws Exception {

    IndexWriterConfig config = newIndexWriterConfig(random(), analyzer)
        .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000))
        .setMergePolicy(newLogMergePolicy());

    Directory directory = newDirectory();
    /*
    Don't think we still need this
    String pf = TestUtil.getPostingsFormat(FIELD);
    if (doesntSupportOffsets.contains(pf)) {
      //just use Asserting
      Codec codec = new AssertingCodec();
      config.setCodec(codec);
    }*/
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);
    //create document with multivalued field
    String[] fs = new String[numFieldValues];
    for (int i = 0; i < numFieldValues; i++) {
      // place the needle at the start, middle, or end with equal-ish odds
      float r = random().nextFloat();
      String doc = "";
      if (r <= 0.33) {
        doc = needle+" "+getRandomWords(29, needle, analyzer);
      } else if (r <= 0.66) {
        doc = getRandomWords(13, needle, analyzer)+" "+needle+" "+getRandomWords(17, needle, analyzer);
      } else {
        doc = getRandomWords(31, needle, analyzer)+" "+needle;
      }
      fs[i] = doc;
    }

    Document d = new Document();
    FieldType type = new FieldType();
    // offsets are required downstream for window char-offset lookups
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    type.setStored(true);

    for (String s : fs) {
      d.add(newField(FIELD, s, type));
    }
    writer.addDocument(d);
    writer.close();
    return directory;
  }



  /**
   * this assumes no stop filter in the analyzer.
   * Best to use whitespace tokenizer.
   */
  private String getRandomWords(int numWords, String needle, Analyzer analyzer) throws Exception {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < numWords; i++ ) {
      sb.append(TestUtil.randomUnicodeString(random(), 31));
      sb.append(" ");
    }
    // re-analyze and keep only terms that are not the needle, so the needle
    // appears exactly where buildNeedleIndex put it
    List terms = SimpleAnalyzerUtil.getTermStrings(sb.toString(), analyzer);
    StringBuilder rsb = new StringBuilder();
    int words = -1;
    while (words++ < numWords && words < terms.size()) {
      String cand = terms.get(words);
      if (!needle.equals(cand)) {
        if (words > 0) {
          rsb.append(" ");
        }
        rsb.append(cand);
      }
    }
    return rsb.toString();
  }


  // Picks a term that survives the analyzer so searches can hit it; falls
  // back to the literal "needle" after 10 attempts.
  protected String getNeedle(Analyzer analyzer) {
    //try to get a term that would come out of the analyzer
    for (int i = 0; i < 10; i++) {
      //start with a random base string
      String baseString = TestUtil.randomUnicodeString(random(), random().nextInt(10) + 2);

      try{
        //run it through the analyzer, and take the first thing
        //that comes out of it if the length > 0
        List terms = SimpleAnalyzerUtil.getTermStrings(baseString, analyzer);
        for (String t : terms) {
          if (t.length() > 0) {
            return t;
          }
        }
      } catch (IOException e) {
        //swallow
      }
    }
    //if nothing is found in 10 tries,
    //return literal string "needle"

    return "needle";
  }
}
diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java
new file mode 100644
index
0000000..a2d9bc6 --- /dev/null +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/ConcordanceTestUtils.java @@ -0,0 +1,92 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; + +public class ConcordanceTestUtils extends LuceneTestCase { + public final static String FIELD = "content"; + + + public static Directory getDirectory(Analyzer analyzer, String[] vals) throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(random(), 
analyzer).setMaxBufferedDocs + (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); + + for (String s : vals) { + Document d = new Document(); + d.add(newTextField(FIELD, s, Field.Store.YES)); + writer.addDocument(d); + + } + writer.close(); + return directory; + } + + public static Directory getDirectory(Analyzer analyzer, List input) throws IOException { + + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs + (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); + for (String[] vals : input) { + Document d = new Document(); + for (String s : vals) { + d.add(newTextField(FIELD, s, Field.Store.YES)); + } + writer.addDocument(d); + + } + writer.close(); + return directory; + } + + public static Analyzer getAnalyzer(final CharacterRunAutomaton stops, final int posIncGap) { + //stops will usually be either: + //MockTokenFilter.EMPTY_STOPSET; + //MockTokenFilter.ENGLISH_STOPSET + return new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); + + TokenFilter filter = new MockTokenFilter(tokenizer, stops); + return new TokenStreamComponents(tokenizer, filter); + } + @Override + public int getPositionIncrementGap(String fieldName) { + return posIncGap; + } + }; + } +} diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java new file mode 100644 index 0000000..04cf7a9 --- /dev/null +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestConcordanceSearcher.java @@ -0,0 +1,498 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.Directory; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TestConcordanceSearcher extends ConcordanceTestBase { + + private final static DocMetadataExtractor metadataExtractor = + new DocMetadataExtractor() { + private final Set fields = new HashSet<>(); + private final Map data = new 
HashMap<>(); + + @Override + public Set getFieldSelector() { + return fields; + } + + @Override + public Map extract(StoredDocument d) { + return data; + } + }; + + private final static DocIdBuilder docIdBuilder = new IndexIdDocIdBuilder(); + + @BeforeClass + public static void beforeClass() throws Exception { + // NOOP for now + } + + @AfterClass + public static void afterClass() throws Exception { + // NOOP for now + } + + @Test + public void testSimple() throws Exception { + String[] docs = new String[]{"a b c a b c", "c b a c b a"}; + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + WindowBuilder wb = new WindowBuilder(10, 10, + analyzer.getOffsetGap(FIELD), + new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder); + ConcordanceSearcher searcher = new ConcordanceSearcher(wb); + SpanQuery q = new SpanTermQuery(new Term(FIELD, "a")); + + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3); + searcher.search(reader, FIELD, + q, null, analyzer, collector); + + assertEquals(3, collector.size()); + + collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL); + searcher.search(reader, FIELD, q, null, analyzer, collector); + + // test result size + assertEquals(4, collector.size()); + + // test result with sort order = pre + List windows = collector.getSortedWindows(); + String[] pres = new String[]{"", "c b", "c b a c b", "a b c"}; + String[] posts = new String[]{" b c a b c", " c b a", "", " b c"}; + + for (int i = 0; i < windows.size(); i++) { + ConcordanceWindow w = windows.get(i); + + assertEquals(pres[i], w.getPre()); + assertEquals(posts[i], w.getPost()); + } + + // test sort order post + // sort key is built at search time, so must re-search + wb = new WindowBuilder(10, 10, + analyzer.getOffsetGap(FIELD), + new 
DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder); + searcher = new ConcordanceSearcher(wb); + + collector = new ConcordanceWindowCollector(ConcordanceWindowCollector.COLLECT_ALL); + searcher.search(reader, FIELD, q, + null, analyzer, collector); + + windows = collector.getSortedWindows(); + + posts = new String[]{"", " b c", " b c a b c", " c b a",}; + for (int i = 0; i < windows.size(); i++) { + ConcordanceWindow w = windows.get(i); + assertEquals(posts[i], w.getPost()); + } + reader.close(); + directory.close(); + } + + @Test + public void testSimpleMultiValuedField() throws Exception { + String[] doc = new String[]{"a b c a b c", "c b a c b a"}; + List docs = new ArrayList<>(); + docs.add(doc); + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceSearcher searcher = new ConcordanceSearcher( + new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); + SpanQuery q = new SpanTermQuery(new Term(FIELD, "a")); + + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100); + + searcher.search(reader, FIELD, + q, null, analyzer, collector); + + // test result size + assertEquals(4, collector.size()); + + // test result with sort order = pre + List windows = collector.getSortedWindows(); + String[] pres = new String[]{"", "c b", "c b a c b", "a b c"}; + String[] posts = new String[]{" b c a b c", " c b a", "", " b c"}; + + for (int i = 0; i < pres.length; i++) { + ConcordanceWindow w = windows.get(i); + + assertEquals("pres: " + i, pres[i], w.getPre()); + + assertEquals("posts: " + i, posts[i], w.getPost()); + } + + // test sort order post + // sort key is built at search time, so must re-search + WindowBuilder wb = new WindowBuilder(10, 10, + analyzer.getOffsetGap(FIELD), + new DefaultSortKeyBuilder(ConcordanceSortOrder.POST), metadataExtractor, docIdBuilder); + searcher = new 
ConcordanceSearcher(wb); + + collector = new ConcordanceWindowCollector(100); + + searcher.search(reader, FIELD, q, null, analyzer, collector); + + windows = collector.getSortedWindows(); + + posts = new String[]{"", " b c", " b c a b c", " c b a",}; + for (int i = 0; i < posts.length; i++) { + ConcordanceWindow w = windows.get(i); + assertEquals(posts[i], w.getPost()); + } + reader.close(); + directory.close(); + } + + @Test + public void testWindowLengths() throws Exception { + String[] doc = new String[]{"a b c d e f g"}; + List docs = new ArrayList<>(); + docs.add(doc); + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + + SpanQuery q = new SpanTermQuery(new Term(FIELD, "d")); + + String[] pres = {"", "c", "b c", "a b c", "a b c", "a b c"}; + String[] posts = {"", " e", " e f", " e f g", " e f g", " e f g"}; + + for (int tokensBefore = 0; tokensBefore < pres.length; tokensBefore++) { + for (int tokensAfter = 0; tokensAfter < posts.length; tokensAfter++) { + WindowBuilder wb = new WindowBuilder(tokensBefore, tokensAfter, + analyzer.getOffsetGap(FIELD)); + ConcordanceSearcher searcher = new ConcordanceSearcher(wb); + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(100); + searcher.search(reader, FIELD, q, null, analyzer, collector); + ConcordanceWindow w = collector.getSortedWindows().get(0); + assertEquals(tokensBefore + " : " + tokensAfter, pres[tokensBefore], w.getPre()); + assertEquals(tokensBefore + " : " + tokensAfter, posts[tokensAfter], w.getPost()); + } + } + + reader.close(); + directory.close(); + + } + + @Test + public void testClockworkOrangeMultiValuedFieldProblem() throws Exception { + /* + * test handling of target match (or not) over different indices into multivalued + * field array + */ + String[] doc = new String[]{"a b c a b the", "clockwork", + "orange b a c b a"}; + List docs = new 
ArrayList<>(); + docs.add(doc); + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 10); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + WindowBuilder wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD)); + + + ConcordanceSearcher searcher = new ConcordanceSearcher(wb); + SpanQuery q1 = new SpanTermQuery( + new Term(FIELD, "the")); + SpanQuery q2 = new SpanTermQuery(new Term(FIELD, + "clockwork")); + SpanQuery q3 = new SpanTermQuery(new Term(FIELD, + "orange")); + SpanQuery q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 3, true); + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3); + + searcher.search(reader, FIELD, + q, null, analyzer, collector); + assertEquals(1, collector.size()); + + ConcordanceWindow w = collector.getSortedWindows().iterator().next(); + assertEquals("target", "the | clockwork | orange", w.getTarget()); + assertEquals("pre", "c a b", w.getPre()); + assertEquals("post", " b a c", w.getPost()); + + reader.close(); + directory.close(); + + // test hit even over long inter-field gap + analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 20, 50); + directory = getDirectory(analyzer, docs); + reader = DirectoryReader.open(directory); + + wb = new WindowBuilder(3, 3, analyzer.getOffsetGap(FIELD)); + + searcher = new ConcordanceSearcher(wb); + q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 120, true); + collector = new ConcordanceWindowCollector(100); + + searcher.search(reader, FIELD, q, null, analyzer, collector); + + assertEquals(1, collector.size()); + w = collector.getSortedWindows().iterator().next(); + assertEquals("target", "the | clockwork | orange", w.getTarget()); + assertEquals("pre", "c a b", w.getPre()); + assertEquals("post", " b a c", w.getPost()); + + reader.close(); + directory.close(); + // test miss + analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 100, 100); + directory = getDirectory(analyzer, docs); + 
reader = DirectoryReader.open(directory); + + wb = new WindowBuilder(); + searcher = new ConcordanceSearcher(wb); + q = new SpanNearQuery(new SpanQuery[]{q1, q2, q3}, 5, true); + collector = new ConcordanceWindowCollector(100); + + searcher.search(reader, FIELD, q, null, analyzer, collector); + + assertEquals(0, collector.size()); + + reader.close(); + directory.close(); + } + + @Test + public void testWithStops() throws Exception { + String[] docs = new String[]{"a b the d e the f", "g h the d the j"}; + Analyzer analyzer = getAnalyzer(MockTokenFilter.ENGLISH_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + + WindowBuilder wb = new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)); + + ConcordanceSearcher searcher = new ConcordanceSearcher(wb); + SpanQuery q = new SpanTermQuery(new Term(FIELD, "d")); + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(3); + + searcher.search(reader, FIELD, + q, null, analyzer, collector); + List windows = collector.getSortedWindows(); + assertEquals(2, windows.size()); + + // the second word after the target is a stop word + // this post-component of this window should only go to the first word after + // the target + assertEquals("b the", windows.get(0).getPre()); + assertEquals("d", windows.get(0).getTarget()); + assertEquals(" e", windows.get(0).getPost()); + + assertEquals("h the", windows.get(1).getPre()); + assertEquals("d", windows.get(1).getTarget()); + assertEquals(" the j", windows.get(1).getPost()); + + + reader.close(); + directory.close(); + } + + @Test + public void testBasicStandardQueryConversion() throws Exception { + String[] docs = new String[]{"a b c a b c", "c b a c b a d e a", + "c b a c b a e a b c a"}; + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceSearcher searcher = new 
ConcordanceSearcher( + new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); + BooleanQuery q = new BooleanQuery(); + q.add(new TermQuery(new Term(FIELD, "a")), Occur.MUST); + q.add(new TermQuery(new Term(FIELD, "d")), + Occur.MUST_NOT); + + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); + searcher.search(reader, + FIELD, (Query) q, null, + analyzer, collector); + // shouldn't include document with "d" + assertEquals(6, collector.size()); + + // should only include document with "e" and not "d" + Filter filter = new QueryWrapperFilter(new TermQuery(new Term( + FIELD, "e"))); + collector = new ConcordanceWindowCollector(10); + + searcher.search(reader, FIELD, (Query) q, filter, analyzer, collector); + assertEquals(4, collector.size()); + + reader.close(); + directory.close(); + } + + @Test + public void testMismatchingFieldsInStandardQueryConversion() throws Exception { + // tests what happens if a Query doesn't contain a term in the "span" field + // in the searcher...should be no exception and zero documents returned. 
+ + String[] docs = new String[]{"a b c a b c",}; + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + + ConcordanceSearcher searcher = new ConcordanceSearcher( + new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); + + Query q = new TermQuery(new Term("_" + FIELD, "a")); + + int windowCount = -1; + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); + + searcher.search(reader, FIELD, + q, null, analyzer, collector); + windowCount = collector.size(); + assertEquals(0, windowCount); + reader.close(); + directory.close(); + } + + @Test + public void testUniqueCollector() throws Exception { + String[] docs = new String[]{"a b c d c b a", + "a B C d c b a", + "a b C d C B a", + "a b c d C B A", + "e f g d g f e", + "h i j d j i h" + }; + + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceSearcher searcher = new ConcordanceSearcher( + new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); + SpanQuery q = new SpanTermQuery(new Term(FIELD, "d")); + + DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(2); + searcher.search(reader, + FIELD, (Query) q, null, + analyzer, collector); + assertEquals(2, collector.size()); + + + collector = + new DedupingConcordanceWindowCollector(AbstractConcordanceWindowCollector.COLLECT_ALL); + searcher.search(reader, + FIELD, (Query) q, null, + analyzer, collector); + assertEquals(3, collector.size()); + + + reader.close(); + directory.close(); + + } + + + @Test + public void testUniqueCollectorWithSameWindowOverflow() throws Exception { + String[] docs = new String[]{"a b c d c b a", + "a b c d c b a", + "a b c d c b a", + "a b c d c b a", + "e f g d g f e", + "h i j d j i h" + }; + + Analyzer analyzer = 
getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + ConcordanceSearcher searcher = new ConcordanceSearcher( + new WindowBuilder(10, 10, analyzer.getOffsetGap(FIELD))); + + SpanQuery q = new SpanTermQuery(new Term(FIELD, "d")); + + DedupingConcordanceWindowCollector collector = new DedupingConcordanceWindowCollector(3); + searcher.search(reader, + FIELD, (Query) q, null, + analyzer, collector); + assertEquals(3, collector.size()); + assertEquals(4, collector.getSortedWindows().get(0).getCount()); + reader.close(); + directory.close(); + } + + @Test + public void testAllowTargetOverlaps() throws Exception { + String[] docs = new String[]{"a b c"}; + Analyzer analyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET); + + Directory directory = getDirectory(analyzer, docs); + IndexReader reader = DirectoryReader.open(directory); + WindowBuilder wb = new WindowBuilder(10, 10, + analyzer.getOffsetGap(FIELD), + new DefaultSortKeyBuilder(ConcordanceSortOrder.PRE), metadataExtractor, docIdBuilder); + ConcordanceSearcher searcher = new ConcordanceSearcher(wb); + SpanQuery term = new SpanTermQuery(new Term(FIELD, "a")); + SpanQuery phrase = new SpanNearQuery( + new SpanQuery[]{ + new SpanTermQuery(new Term(FIELD, "a")), + new SpanTermQuery(new Term(FIELD, "b")) + }, 0, true); + SpanOrQuery q = new SpanOrQuery(); + q.addClause(term); + q.addClause(phrase); + + ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10); + searcher.search(reader, FIELD, + q, null, analyzer, collector); + + //default should be: don't allow target overlaps + assertEquals(1, collector.size()); + + searcher.setAllowTargetOverlaps(true); + collector = new ConcordanceWindowCollector(10); + searcher.search(reader, FIELD, + q, null, analyzer, collector); + + //now there should be two windows with allowTargetOverlaps = true + assertEquals(2, collector.size()); + reader.close(); + 
directory.close(); + } +} diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSimpleAnalyzerUtil.java lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSimpleAnalyzerUtil.java new file mode 100644 index 0000000..8bd1c44 --- /dev/null +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSimpleAnalyzerUtil.java @@ -0,0 +1,158 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil; +import org.apache.lucene.store.Directory; +import org.junit.BeforeClass; + +public class TestSimpleAnalyzerUtil extends ConcordanceTestBase { + + private static Analyzer defaultCharOffsetGapAnalyzer; + + private static Analyzer customCharOffsetGapAnalyzer; + + @BeforeClass + public static void beforeClass() throws Exception { + defaultCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 0, 1); + //customCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50, 213); + customCharOffsetGapAnalyzer = getAnalyzer(MockTokenFilter.EMPTY_STOPSET, 50, 213); + } + /* + public void testDebug() throws Exception { + String[] values = new String[]{ + "the quick brown fox jumped over the lazy dog", + "the fast green toad slid under the slothful rabbit", + "the happy blue wolverine devoured the lazy moose", + "the depressed purple aardvark the the the the the the the devoured the energetic komodo", + "the exasperated lavender lion", + "the excited orange tiger the the the the the", + "the colorless green idea slept furiously the" + }; + System.out.println(values[0].length()); + List docs = new ArrayList<>(); + docs.add(values); + + Directory directory = getDirectory(defaultCharOffsetGapAnalyzer, docs); + + String joiner = " | "; + int gap = defaultCharOffsetGapAnalyzer.getOffsetGap(FIELD); + IndexReader reader = DirectoryReader.open(directory); + 
Document d = reader.document(0); + String[] fieldValues = d.getValues(FIELD); + //69, 103 + assertEquals("basic", "", testSimple(42, 45, fieldValues, gap, joiner)); + reader.close(); + directory.close(); + }*/ + + public void testHitInGaps() throws Exception { + String[] values = new String[]{ + "abc", + "def", + "ghi", + "jkl" + }; + List docs = new ArrayList<>(); + docs.add(values); + + Directory directory = getDirectory(customCharOffsetGapAnalyzer, docs); + + String joiner = " | "; + int gap = customCharOffsetGapAnalyzer.getOffsetGap(FIELD); + IndexReader reader = DirectoryReader.open(directory); + StoredDocument d = reader.document(0); + String[] fieldValues = d.getValues(FIELD); + + assertEquals("two negs", "", testSimple(-10, -1, fieldValues, gap, joiner)); + + assertEquals("two way beyonds", "", testSimple(1000, 1020, fieldValues, gap, joiner)); + + assertEquals("two in betweens", " | ", testSimple(100, 110, fieldValues, gap, joiner)); + + + assertEquals("one neg", "abc", testSimple(-20, 3, fieldValues, gap, joiner)); + assertEquals("end < start 1", "", testSimple(3, -20, fieldValues, gap, joiner)); + assertEquals("end < start 2", "", testSimple(3, 2, fieldValues, gap, joiner)); + assertEquals("end in between", "abc", testSimple(0, 50, fieldValues, gap, joiner)); + //TODO: these used to be "def"; need to fix + assertEquals("start in between", " | def", testSimple(5, 219, fieldValues, gap, joiner)); + assertEquals("start in between and end in between1", " | def", testSimple(5, 300, fieldValues, gap, joiner)); + assertEquals("start in between and end in between2", " | def | ghi", testSimple(5, 600, fieldValues, gap, joiner)); + assertEquals("", "def | ghi | jkl", testSimple(216, 10000, fieldValues, gap, joiner)); + + reader.close(); + directory.close(); + + } + + public void testRandomWithNeedleOnGaps() throws Exception { + executeNeedleTests(defaultCharOffsetGapAnalyzer); + executeNeedleTests(customCharOffsetGapAnalyzer); + } + + private void 
executeNeedleTests(Analyzer analyzer) throws Exception { + + String needle = getNeedle(analyzer); + int numFieldValues = 23; + + Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues); + + IndexReader reader = DirectoryReader.open(directory); + + LeafReaderContext ctx = reader.leaves().get(0); + LeafReader r = ctx.reader(); + DocsAndPositionsEnum dpe = r.termPositionsEnum(new Term(FIELD, needle)); + int docId = dpe.nextDoc(); + int numTests = 0; + while (docId != DocIdSetIterator.NO_MORE_DOCS) { + int frq = dpe.freq(); + int advanced = 1; + dpe.nextPosition(); + String[] fieldValues = r.document(docId).getValues(FIELD); + while (advanced++ < frq) { + String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(), + dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | "); + assertEquals(needle, rebuilt); + numTests++; + dpe.nextPosition(); + } + docId = dpe.nextDoc(); + } + reader.close(); + directory.close(); + assertEquals("number of tests", numFieldValues-1, numTests); + } + + private String testSimple(int start, int end, String[] fieldValues, int gap, String joiner) { + return SimpleAnalyzerUtil.substringFromMultiValuedFields(start, end, fieldValues, gap, joiner); + } +} diff --git lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java new file mode 100644 index 0000000..7121f9c --- /dev/null +++ lucene/concordance/src/test/org/apache/lucene/search/concordance/TestSpanQueryConverter.java @@ -0,0 +1,111 @@ +package org.apache.lucene.search.concordance; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.queries.SpanQueryConverter; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + + +public class TestSpanQueryConverter extends LuceneTestCase { + private static IndexReader reader; + private static Directory directory; + private static Analyzer analyzer; + private final static String FIELD = "field"; + + @BeforeClass + public static void beforeClass() throws Exception { + analyzer = new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); + return new TokenStreamComponents(tokenizer, 
tokenizer); + } + }; + + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(random(), analyzer).setMaxBufferedDocs + (TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy())); + String[] docs = new String[] { + "a b c a b c", + "c b a c b a" + }; + for (String val : docs) { + Document doc = new Document(); + doc.add(newTextField(FIELD, val, Field.Store.YES)); + writer.addDocument(doc); + } + reader = writer.getReader(); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + reader = null; + directory = null; + analyzer = null; + } + + @Test + public void testBooleanTwoFields() throws Exception { + + Query q1 = new TermQuery(new Term(FIELD, "a")); + Query q2 = new TermQuery(new Term("another_field", "b")); + BooleanQuery q = new BooleanQuery(); + q.add(q1, Occur.SHOULD); + q.add(q2, Occur.SHOULD); + SpanQueryConverter converter = new SpanQueryConverter(); + boolean success = true; + try { + SpanQuery span = converter.convert(FIELD, q); + } catch (IllegalArgumentException e) { + success = false; + } + assertEquals(true, success); + Query q3 = new TermQuery(new Term("another_field", "c")); + BooleanQuery bq2 = new BooleanQuery(); + bq2.add(q, Occur.MUST); + bq2.add(q3, Occur.SHOULD); + try { + SpanQuery span = converter.convert(FIELD, bq2); + } catch (IllegalArgumentException e) { + success = false; + } + assertEquals(true, success); + } +} diff --git lucene/module-build.xml lucene/module-build.xml index c68900a..027338e 100644 --- lucene/module-build.xml +++ lucene/module-build.xml @@ -628,4 +628,27 @@ + + + + + + + + + + + + + + + + + + + + + + +