Index: src/main/java/org/apache/jackrabbit/core/query/TextFilter.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/TextFilter.java (revision 421461)
+++ src/main/java/org/apache/jackrabbit/core/query/TextFilter.java (working copy)
@@ -29,6 +29,9 @@
  * mime type ({@link #canFilter(String)} and if one of them returns
  * true the text representation is created with
  * {@link #doFilter(PropertyState, String)}
+ *
+ * @deprecated use the {@link org.apache.jackrabbit.extractor.TextExtractor}
+ *             interface
  */
 public interface TextFilter {
Index: src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (revision 421461)
+++ src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (working copy)
@@ -17,13 +17,14 @@
 package org.apache.jackrabbit.core.query.lucene;
 
 import org.apache.jackrabbit.core.PropertyId;
-import org.apache.jackrabbit.core.query.TextFilter;
 import org.apache.jackrabbit.core.state.ItemStateException;
 import org.apache.jackrabbit.core.state.ItemStateManager;
 import org.apache.jackrabbit.core.state.NoSuchItemStateException;
 import org.apache.jackrabbit.core.state.NodeState;
 import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
 import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
 import org.apache.jackrabbit.name.NoPrefixDeclaredException;
 import org.apache.jackrabbit.name.Path;
 import org.apache.jackrabbit.name.QName;
@@ -37,12 +38,11 @@
 import javax.jcr.NamespaceException;
 import javax.jcr.PropertyType;
 import javax.jcr.RepositoryException;
+
+import java.io.InputStream;
 import java.io.Reader;
 import java.util.Calendar;
-import java.util.Collections;
 import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
 import java.util.Set;
 
 /**
@@ -72,9 +72,9 @@
     protected final NamespaceMappings mappings;
 
     /**
-     * List of text filters in use.
+     * Content extractor.
      */
-    protected final List textFilters;
+    protected final TextExtractor extractor;
 
     /**
      * Creates a new node indexer.
@@ -82,16 +82,16 @@
      * @param node the node state to index.
      * @param stateProvider the persistent item state manager to retrieve properties.
      * @param mappings internal namespace mappings.
-     * @param textFilters List of {@link org.apache.jackrabbit.core.query.TextFilter}s.
+     * @param extractor content extractor
      */
     protected NodeIndexer(NodeState node,
                           ItemStateManager stateProvider,
                           NamespaceMappings mappings,
-                          List textFilters) {
+                          TextExtractor extractor) {
         this.node = node;
         this.stateProvider = stateProvider;
         this.mappings = mappings;
-        this.textFilters = textFilters;
+        this.extractor = extractor;
     }
 
     /**
@@ -100,8 +100,7 @@
      * @param node the node state to index.
      * @param stateProvider the state provider to retrieve property values.
      * @param mappings internal namespace mappings.
-     * @param textFilters list of text filters to use for indexing binary
-     *                    properties.
+     * @param extractor text extractor
      * @return the lucene Document.
      * @throws RepositoryException if an error occurs while reading property
      *                             values from the ItemStateProvider.
@@ -109,9 +108,9 @@
     public static Document createDocument(NodeState node,
                                           ItemStateManager stateProvider,
                                           NamespaceMappings mappings,
-                                          List textFilters)
+                                          TextExtractor extractor)
             throws RepositoryException {
-        NodeIndexer indexer = new NodeIndexer(node, stateProvider, mappings, textFilters);
+        NodeIndexer indexer = new NodeIndexer(node, stateProvider, mappings, extractor);
         return indexer.createDoc();
     }
 
@@ -269,38 +268,23 @@
                 // don't know how to index
                 return;
             }
-            if (node.hasPropertyName(QName.JCR_MIMETYPE)) {
-                PropertyState dataProp = (PropertyState) stateProvider.getItemState(
-                        new PropertyId(node.getNodeId(), QName.JCR_DATA));
-                PropertyState mimeTypeProp =
-                        (PropertyState) stateProvider.getItemState(
-                                new PropertyId(node.getNodeId(), QName.JCR_MIMETYPE));
+            InternalValue typeValue = getValue(QName.JCR_MIMETYPE);
+            if (typeValue != null) {
+                String type = typeValue.internalValue().toString();
+
+                // jcr:encoding is not mandatory
                 String encoding = null;
-                if (node.hasPropertyName(QName.JCR_ENCODING)) {
-                    PropertyState encodingProp =
-                            (PropertyState) stateProvider.getItemState(
-                                    new PropertyId(node.getNodeId(), QName.JCR_ENCODING));
-                    encoding = encodingProp.getValues()[0].internalValue().toString();
+                InternalValue encodingValue = getValue(QName.JCR_ENCODING);
+                if (encodingValue != null) {
+                    encoding = encodingValue.internalValue().toString();
                 }
-                String mimeType = mimeTypeProp.getValues()[0].internalValue().toString();
-                Map fields = Collections.EMPTY_MAP;
-                for (Iterator it = textFilters.iterator(); it.hasNext();) {
-                    TextFilter filter = (TextFilter) it.next();
-                    // use the first filter that can handle the mimeType
-                    if (filter.canFilter(mimeType)) {
-                        fields = filter.doFilter(dataProp, encoding);
-                        break;
-                    }
-                }
-
-                for (Iterator it = fields.keySet().iterator(); it.hasNext();) {
-                    String field = (String) it.next();
-                    Reader r = (Reader) fields.get(field);
-                    doc.add(Field.Text(field, r));
-                }
+
+                InputStream stream =
+                        ((BLOBFileValue) internalValue).getStream();
+                Reader reader =
+                        new TextExtractorReader(extractor, stream, type, encoding);
+                doc.add(Field.Text(FieldNames.FULLTEXT, reader));
             }
         } catch (ItemStateException e) {
             log.warn("Exception while indexing binary property: " + e.toString());
@@ -312,6 +296,31 @@
     }
 
     /**
+     * Utility method that extracts the first value of the named property
+     * of the current node. Returns null if the property does
+     * not exist or contains no values.
+     *
+     * @param name property name
+     * @return value of the named property, or null
+     * @throws ItemStateException if the property can not be accessed
+     */
+    protected InternalValue getValue(QName name) throws ItemStateException {
+        try {
+            PropertyId id = new PropertyId(node.getNodeId(), name);
+            PropertyState property =
+                    (PropertyState) stateProvider.getItemState(id);
+            InternalValue[] values = property.getValues();
+            if (values.length > 0) {
+                return values[0];
+            } else {
+                return null;
+            }
+        } catch (NoSuchItemStateException e) {
+            return null;
+        }
+    }
+
+    /**
      * Adds the string representation of the boolean value to the document as
      * the named field.
      *
Index: src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java (revision 421461)
+++ src/main/java/org/apache/jackrabbit/core/query/lucene/TextPlainTextFilter.java (working copy)
@@ -16,65 +16,21 @@
  */
 package org.apache.jackrabbit.core.query.lucene;
 
-import org.apache.jackrabbit.core.query.TextFilter;
-import org.apache.jackrabbit.core.state.PropertyState;
-import org.apache.jackrabbit.core.value.BLOBFileValue;
-import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.PlainTextExtractor;
 
-import javax.jcr.RepositoryException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.UnsupportedEncodingException;
-import java.util.HashMap;
-import java.util.Map;
-
 /**
- * Implements a {@link org.apache.jackrabbit.core.query.TextFilter} that handles binary properties of mime-type
- * text/plain.
+ * Text filter for text/plain content.
+ *
+ * @deprecated use {@link PlainTextExtractor}, this class is kept for
+ *             backwards compatibility with existing configuration files
 */
-public class TextPlainTextFilter implements TextFilter {
+public class TextPlainTextFilter extends TextExtractorFilter {
 
     /**
-     * Returns true for text/plain; false
-     * in all other cases.
-     * @param mimeType the mime-type.
-     * @return true for text/plain; false
-     * in all other cases.
+     * Creates a text filter for text/plain content.
      */
-    public boolean canFilter(String mimeType) {
-        return "text/plain".equalsIgnoreCase(mimeType);
+    public TextPlainTextFilter() {
+        super(new PlainTextExtractor());
     }
 
-    /**
-     * Returns a map with a single entry for field {@link FieldNames#FULLTEXT}.
-     * @param data the data property.
-     * @param encoding the encoding
-     * @return a map with a single Reader value for field
-     * {@link FieldNames#FULLTEXT}.
-     * @throws RepositoryException if encoding is not supported or data is a
-     * multi-value property.
-     */
-    public Map doFilter(PropertyState data, String encoding) throws RepositoryException {
-        InternalValue[] values = data.getValues();
-        if (values.length == 1) {
-            BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
-            try {
-                Reader reader;
-                if (encoding == null) {
-                    // use platform default
-                    reader = new InputStreamReader(blob.getStream());
-                } else {
-                    reader = new InputStreamReader(blob.getStream(), encoding);
-                }
-                Map result = new HashMap();
-                result.put(FieldNames.FULLTEXT, reader);
-                return result;
-            } catch (UnsupportedEncodingException e) {
-                throw new RepositoryException(e);
-            }
-        } else {
-            // multi value not supported
-            throw new RepositoryException("Multi-valued binary properties not supported.");
-        }
-    }
 }
Index: src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (revision 421461)
+++ src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (working copy)
@@ -23,10 +23,11 @@
 import org.apache.jackrabbit.core.query.AbstractQueryHandler;
 import org.apache.jackrabbit.core.query.ExecutableQuery;
 import org.apache.jackrabbit.core.query.QueryHandlerContext;
-import org.apache.jackrabbit.core.query.TextFilter;
 import org.apache.jackrabbit.core.query.QueryHandler;
 import org.apache.jackrabbit.core.state.NodeState;
 import org.apache.jackrabbit.core.state.NodeStateIterator;
+import org.apache.jackrabbit.extractor.DefaultTextExtractor;
+import org.apache.jackrabbit.extractor.TextExtractor;
 import org.apache.jackrabbit.name.NoPrefixDeclaredException;
 import org.apache.jackrabbit.name.QName;
 import org.apache.jackrabbit.name.NameFormat;
@@ -51,9 +52,7 @@
 import java.io.File;
 import java.util.Iterator;
 import java.util.List;
-import java.util.StringTokenizer;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -92,11 +91,6 @@
     public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
 
     /**
-     * Default text filters.
-     */
-    public static final String DEFAULT_TEXT_FILTERS = TextPlainTextFilter.class.getName();
-
-    /**
      * The actual index
      */
     private MultiIndex index;
@@ -107,11 +101,19 @@
     private Analyzer analyzer;
 
     /**
-     * List of {@link org.apache.jackrabbit.core.query.TextFilter} instance.
+     * List of text extractor and text filter class names. The configured
+     * classes will be instantiated and used to extract text content from
+     * binary properties.
      */
-    private List textFilters;
+    private String textFilterClasses =
+            DefaultTextExtractor.class.getName();
 
     /**
+     * Text extractor for extracting text content of binary properties.
+     */
+    private TextExtractor extractor;
+
+    /**
      * The location of the search index.
      *
      * Note: This is a mandatory parameter!
@@ -193,7 +195,6 @@
     */
    public SearchIndex() {
        this.analyzer = new StandardAnalyzer(new String[]{});
-        setTextFilterClasses(DEFAULT_TEXT_FILTERS);
    }
 
    /**
@@ -227,6 +228,8 @@
            nsMappings = new NamespaceMappings(mapFile);
        }
 
+        extractor = new JackrabbitTextExtractor(textFilterClasses);
+
        index = new MultiIndex(indexDir, this, context.getItemStateManager(),
                context.getRootId(), excludedIDs, nsMappings);
        if (index.getRedoLogApplied() || forceConsistencyCheck) {
@@ -399,16 +402,6 @@
    }
 
    /**
-     * Returns an unmodifiable list of {@link TextFilter} configured for
-     * this search index.
-     *
-     * @return unmodifiable list of text filters.
-     */
-    protected List getTextFilters() {
-        return textFilters;
-    }
-
-    /**
     * Returns the namespace mappings for the internal representation.
     * @return the namespace mappings for the internal representation.
     */
@@ -459,7 +452,7 @@
    protected Document createDocument(NodeState node, NamespaceMappings nsMappings)
            throws RepositoryException {
        return NodeIndexer.createDocument(node, getContext().getItemStateManager(),
-                nsMappings, textFilters);
+                nsMappings, extractor);
    }
 
    /**
@@ -739,53 +732,19 @@
    }
 
    /**
-     * Sets a new set of text filter classes that are in use for indexing
-     * binary properties. The filterClasses must be a comma
-     * separated String of fully qualified class names implementing
-     * {@link org.apache.jackrabbit.core.query.TextFilter}. Each class must
-     * provide a default constructor.
-     *
-     * Filter class names that cannot be resolved are skipped and a warn message
-     * is logged.
+     * Sets the list of text extractors (and text filters) to use for
+     * extracting text content from binary properties. The list must be
+     * comma (or whitespace) separated, and contain fully qualified class
+     * names of the {@link TextExtractor} (and {@link TextFilter}) classes
+     * to be used. The configured classes must all have a public default
+     * constructor.
      *
-     * @param filterClasses comma separated list of filter class names
+     * @param filterClasses comma separated list of class names
      */
     public void setTextFilterClasses(String filterClasses) {
-        List filters = new ArrayList();
-        StringTokenizer tokenizer = new StringTokenizer(filterClasses, ", \t\n\r\f");
-        while (tokenizer.hasMoreTokens()) {
-            String className = tokenizer.nextToken();
-            try {
-                Class filterClass = Class.forName(className);
-                TextFilter filter = (TextFilter) filterClass.newInstance();
-                filters.add(filter);
-            } catch (Exception e) {
-                log.warn("Invalid TextFilter class: " + className, e);
-            } catch (LinkageError e) {
-                log.warn("Missing dependency for text filter: " + className);
-                log.warn(e.toString());
-            }
-        }
-        textFilters = Collections.unmodifiableList(filters);
+        this.textFilterClasses = filterClasses;
     }
 
-    /**
-     * Returns the fully qualified class names of the text filter instances
-     * currently in use. The names are comma separated.
-     *
-     * @return class names of the text filters in use.
-     */
-    public String getTextFilterClasses() {
-        StringBuffer names = new StringBuffer();
-        String delim = "";
-        for (Iterator it = textFilters.iterator(); it.hasNext();) {
-            names.append(delim);
-            names.append(it.next().getClass().getName());
-            delim = ",";
-        }
-        return names.toString();
-    }
-
     //----------------------------< internal >----------------------------------
 
     /**
Index: src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java (revision 0)
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.core.query.TextFilterExtractor;
+import org.apache.jackrabbit.extractor.CompositeTextExtractor;
+import org.apache.jackrabbit.extractor.DelegatingTextExtractor;
+import org.apache.jackrabbit.extractor.EmptyTextExtractor;
+import org.apache.jackrabbit.extractor.TextExtractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Backwards-compatible Jackrabbit text extractor component. This class
+ * implements the following functionality:
+ * <ul>
+ *   <li>instantiates the configured {@link TextExtractor} and
+ *       {@link TextFilter} classes,</li>
+ *   <li>acts as the delegate extractor for any configured
+ *       {@link DelegatingTextExtractor} instances,</li>
+ *   <li>adapts configured {@link TextFilter} instances to the
+ *       {@link TextExtractor} interface,</li>
+ *   <li>logs a warning and falls back to an {@link EmptyTextExtractor}
+ *       for content types that none of the configured classes support.</li>
+ * </ul>
+ */
+public class JackrabbitTextExtractor implements TextExtractor {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(JackrabbitTextExtractor.class);
+
+    /**
+     * Set of content types that are known to be supported by the
+     * composite extractor.
+     */
+    private final Set types = new HashSet();
+
+    /**
+     * Composite extractor used for all text extraction tasks. Contains
+     * all the {@link TextExtractor} instances for directly supported content
+     * types, the {@link TextFilterExtractor} adapters for backwards
+     * compatibility with configured {@link TextFilter} instances that have
+     * already been used, and the dummy {@link EmptyTextExtractor} instances
+     * created for unsupported content types.
+     */
+    private final CompositeTextExtractor extractor =
+        new CompositeTextExtractor();
+
+    /**
+     * Configured {@link TextFilter} instances. Used for backwards
+     * compatibility with existing configuration files and {@link TextFilter}
+     * implementations.
+     */
+    private final Collection filters = new ArrayList();
+
+    /**
+     * Creates a Jackrabbit text extractor containing the configured component
+     * classes.
+     *
+     * @param classes configured {@link TextExtractor} (and {@link TextFilter})
+     *                class names (space- or comma-separated)
+     */
+    public JackrabbitTextExtractor(String classes) {
+        logger.debug("JackrabbitTextExtractor({})", classes);
+        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
+        while (tokenizer.hasMoreTokens()) {
+            String name = tokenizer.nextToken();
+            try {
+                Object object = Class.forName(name).newInstance();
+                if (object instanceof DelegatingTextExtractor) {
+                    ((DelegatingTextExtractor) object)
+                        .setDelegateTextExtractor(this);
+                }
+                if (object instanceof TextExtractor) {
+                    extractor.addTextExtractor((TextExtractor) object);
+                } else if (object instanceof TextFilter) {
+                    filters.add(object);
+                } else {
+                    logger.warn("Unknown text extractor class: {}", name);
+                }
+            } catch (ClassNotFoundException e) {
+                logger.warn("Extractor class not found: " + name, e);
+            } catch (LinkageError e) {
+                logger.warn("Extractor dependency not found: " + name, e);
+            } catch (IllegalAccessException e) {
+                logger.warn("Extractor constructor not accessible: " + name, e);
+            } catch (InstantiationException e) {
+                logger.warn("Extractor instantiation failed: " + name, e);
+            }
+        }
+
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the content types that the component extractors are known
+     * to support.
+     *
+     * @return supported content types
+     */
+    public String[] getContentTypes() {
+        return extractor.getContentTypes(); // and then some
+    }
+
+    /**
+     * Extracts the text content from the given binary stream. The given
+     * content type is used to look up a configured text extractor to which
+     * to delegate the request.
+     *
+     * If a matching extractor is not found, then the configured text filters
+     * are searched for an instance that claims to support the given content
+     * type. A text extractor adapter is created for that filter and added to
+     * the composite extractor for future use before delegating the request
+     * to the adapter.
+     *
+     * If not even a text filter is found for the given content type, a warning
+     * is logged and an empty text extractor is created for that content type
+     * and added to the composite extractor for future use before delegating
+     * the request to the empty extractor.
+     *
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding character encoding, or null
+     * @return reader for the text content of the binary stream
+     * @throws IOException if the binary stream can not be read
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        logger.debug("extractText(stream, {}, {})", type, encoding);
+        if (!types.contains(type)) {
+            Iterator iterator = filters.iterator();
+            while (iterator.hasNext()) {
+                TextFilter filter = (TextFilter) iterator.next();
+                if (filter.canFilter(type)) {
+                    types.add(type);
+                    extractor.addTextExtractor(
+                            new TextFilterExtractor(type, filter));
+                    break;
+                }
+            }
+        }
+
+        if (!types.contains(type)) {
+            logger.warn("Full text indexing of {} is not supported", type);
+            types.add(type);
+            extractor.addTextExtractor(new EmptyTextExtractor(type));
+        }
+
+        return extractor.extractText(stream, type, encoding);
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java (revision 0)
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Reader that extracts the text content of a binary stream for reading
+ * only when the first character is requested. This class is used by the
+ * {@link NodeIndexer} class to postpone text extraction until the
+ * content is actually needed.
+ *
+ * @see http://issues.apache.org/jira/browse/JCR-264
+ */
+public class TextExtractorReader extends Reader {
+
+    /**
+     * Text extractor to use in extracting text content from the binary stream.
+     */
+    private final TextExtractor extractor;
+
+    /**
+     * Binary stream from which to extract the content for reading.
+     */
+    private final InputStream stream;
+
+    /**
+     * Content type of the binary stream.
+     */
+    private final String type;
+
+    /**
+     * Character encoding of the binary stream, or null.
+     */
+    private final String encoding;
+
+    /**
+     * Reader for the extracted text content. Set to null until
+     * the first character request triggers the text extraction.
+     */
+    private Reader reader;
+
+    /**
+     * Creates a reader that extracts the text content from the given binary
+     * stream.
+     *
+     * @param extractor text extractor
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding character encoding, or null
+     */
+    public TextExtractorReader(
+            TextExtractor extractor, InputStream stream,
+            String type, String encoding) {
+        this.extractor = extractor;
+        this.stream = stream;
+        this.type = type;
+        this.encoding = encoding;
+        this.reader = null;
+    }
+
+    //--------------------------------------------------------------< Reader >
+
+    /**
+     * Reads up to the given number of characters to the given buffer position
+     * from the extracted text content reader. Uses the text extractor to
+     * create the text content reader when first invoked.
+     *
+     * @param buffer buffer to place characters in
+     * @param offset buffer offset
+     * @param length maximum number of characters to read
+     * @return number of read characters
+     * @throws IOException if text extraction fails
+     */
+    public int read(char[] buffer, int offset, int length) throws IOException {
+        if (reader == null) {
+            reader = extractor.extractText(stream, type, encoding);
+        }
+        return reader.read(buffer, offset, length);
+    }
+
+    /**
+     * Closes the reader of the extracted text, or the binary stream if the
+     * text content was never extracted.
+     *
+     * @throws IOException if the reader or stream can not be closed
+     */
+    public void close() throws IOException {
+        if (reader != null) {
+            reader.close();
+        } else {
+            stream.close();
+        }
+    }
+
+}

Property changes on: src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorFilter.java (revision 0)
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.query.TextFilter;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Utility base class for migrating functionality from existing implementations
+ * of the deprecated {@link TextFilter} interface to the new
+ * {@link TextExtractor} interface. Once the functionality of an existing
+ * TextFilter has been copied to a new TextExtractor, the original class can
+ * be replaced with the following template to keep backwards compatibility
+ * while avoiding the burden of maintaining duplicate code:
+ *
+ * public class SomeTextFilter extends TextExtractorFilter {
+ *     public SomeTextFilter() {
+ *         super(new SomeTextExtractor());
+ *     }
+ * }
+ * 
+ */
+public class TextExtractorFilter implements TextFilter {
+
+    /**
+     * The adapted text extractor.
+     */
+    private final TextExtractor extractor;
+
+    /**
+     * Creates a text filter adapter for the given text extractor.
+     *
+     * @param extractor adapted text extractor
+     */
+    public TextExtractorFilter(TextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * Returns true if the adapted text extractor supports the given
+     * content type.
+     *
+     * @param mimeType content type
+     * @return true if the content type is supported,
+     *         false otherwise
+     */
+    public boolean canFilter(String mimeType) {
+        mimeType = mimeType.toLowerCase();
+        String[] types = extractor.getContentTypes();
+        for (int i = 0; i < types.length; i++) {
+            if (types[i].equals(mimeType)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Extracts text content of the given binary property using the adapted
+     * text extractor.
+     *
+     * @param data binary property
+     * @param encoding character encoding, or null
+     * @return map that contains a reader for the extracted text as
+     *         the {@link FieldNames#FULLTEXT} entry
+     * @throws RepositoryException if the binary property can not be read
+     */
+    public Map doFilter(PropertyState data, String encoding)
+            throws RepositoryException {
+        InternalValue[] values = data.getValues();
+        if (values.length == 1) {
+            try {
+                String type = "application/octet-stream";
+                String[] types = extractor.getContentTypes();
+                if (types.length > 0) {
+                    type = types[0];
+                }
+
+                BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
+                Reader reader =
+                    extractor.extractText(blob.getStream(), type, encoding);
+
+                Map result = new HashMap();
+                result.put(FieldNames.FULLTEXT, reader);
+                return result;
+            } catch (IOException e) {
+                throw new RepositoryException("Text extraction error", e);
+            }
+        } else {
+            // multi value not supported
+            throw new RepositoryException(
+                    "Multi-valued binary properties not supported.");
+        }
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/core/query/TextFilterExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/core/query/TextFilterExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/core/query/TextFilterExtractor.java (revision 0)
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query;
+
+import java.io.FilterReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Map;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.PropertyId;
+import org.apache.jackrabbit.core.query.lucene.FieldNames;
+import org.apache.jackrabbit.core.state.ItemState;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.jackrabbit.extractor.TextExtractor;
+
+/**
+ * Adapter class for achieving backwards compatibility with classes
+ * implementing the deprecated {@link TextFilter} interface. This class
+ * implements the {@link TextExtractor} interface through calls to an
+ * underlying {@link TextFilter} instance.
+ */
+public class TextFilterExtractor implements TextExtractor {
+
+    /**
+     * Supported content types.
+     */
+    private final String[] types;
+
+    /**
+     * The adapted text filter.
+     */
+    private final TextFilter filter;
+
+    /**
+     * Creates a text extractor adapter that supports the given content
+     * types using the given text filter.
+     *
+     * @param types supported content types
+     * @param filter text filter to be adapted
+     */
+    public TextFilterExtractor(String[] types, TextFilter filter) {
+        this.types = types;
+        this.filter = filter;
+    }
+
+    /**
+     * Creates a text extractor adapter that supports the given content
+     * type using the given text filter.
+     *
+     * @param type supported content type
+     * @param filter text filter to be adapted
+     */
+    public TextFilterExtractor(String type, TextFilter filter) {
+        this(new String[] { type }, filter);
+    }
+
+    /**
+     * Returns the supported content types.
+     *
+     * @return supported content types
+     */
+    public String[] getContentTypes() {
+        return types;
+    }
+
+    /**
+     * Extracts the text content of the given binary stream by calling the
+     * underlying {@link TextFilter} instance. A dummy {@link PropertyState}
+     * instance is created to comply with the
+     * {@link TextFilter#doFilter(PropertyState, String)} method signature.
+     *
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding character encoding, or null
+     * @return reader for the extracted text content
+     * @throws IOException if the adapted call fails
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        final InternalValue value = InternalValue.create(stream);
+        try {
+            PropertyState state = new PropertyState(
+                    (PropertyId) null, ItemState.STATUS_EXISTING, true);
+            state.setValues(new InternalValue[] { value });
+            Map fields = filter.doFilter(state, encoding);
+            Object fulltext = fields.get(FieldNames.FULLTEXT);
+            if (fulltext instanceof Reader) {
+                return new FilterReader((Reader) fulltext) {
+                    public void close() throws IOException {
+                        super.close();
+                        ((BLOBFileValue) value.internalValue()).discard();
+                    }
+                };
+            } else {
+                ((BLOBFileValue) value.internalValue()).discard();
+                return new StringReader("");
+            }
+        } catch (RepositoryException e) {
+            ((BLOBFileValue) value.internalValue()).discard();
+            return new StringReader("");
+        }
+    }
+
+}
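
Editor's note (not part of the patch): the sketch below shows how the pieces introduced above are meant to fit together, mirroring what SearchIndex.doInit() and NodeIndexer.addBinaryValue() do. A JackrabbitTextExtractor is built from the comma- or whitespace-separated textFilterClasses value, and a TextExtractorReader defers the actual extraction until the first read() call. The class list and the sample input stream are illustrative assumptions; the constructors and method signatures used are the ones defined in the patch.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.Reader;

import org.apache.jackrabbit.core.query.lucene.JackrabbitTextExtractor;
import org.apache.jackrabbit.core.query.lucene.TextExtractorReader;

public class TextExtractionSketch {

    public static void main(String[] args) throws Exception {
        // Illustrative configuration value; mirrors the textFilterClasses
        // parameter of SearchIndex (comma- or whitespace-separated names).
        String textFilterClasses =
                "org.apache.jackrabbit.extractor.PlainTextExtractor,"
                + " org.apache.jackrabbit.core.query.lucene.TextPlainTextFilter";

        // SearchIndex.doInit() creates the extractor the same way.
        JackrabbitTextExtractor extractor =
                new JackrabbitTextExtractor(textFilterClasses);

        // Stand-in for the jcr:data binary of an nt:resource node.
        InputStream stream = new ByteArrayInputStream(
                "full text to be indexed".getBytes("UTF-8"));

        // NodeIndexer wraps the stream like this; extraction only happens
        // when the index writer first reads from the reader.
        Reader reader = new TextExtractorReader(
                extractor, stream, "text/plain", "UTF-8");

        char[] buffer = new char[128];
        int n = reader.read(buffer, 0, buffer.length);   // triggers extraction
        System.out.println(n > 0 ? new String(buffer, 0, n) : "");
        reader.close();
    }
}

Note that TextExtractorReader.close() closes the extracted-text reader once extraction has happened, and falls back to closing the raw binary stream otherwise, as implemented above.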