Index: src/test/java/org/apache/jackrabbit/extractor/TestHelper.java
===================================================================
--- src/test/java/org/apache/jackrabbit/extractor/TestHelper.java (revision 0)
+++ src/test/java/org/apache/jackrabbit/extractor/TestHelper.java (revision 0)
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Helper class for text extractor unit tests; static methods only.
+ */
+final class TestHelper {
+
+    /**
+     * Private constructor to prevent instantiation.
+     */
+    private TestHelper() {
+    }
+
+    /**
+     * Returns the entire content of the given reader as a string.
+     * The reader is always closed, even if reading fails.
+     *
+     * @param reader reader to be read and closed
+     * @return entire content of the reader
+     * @throws IOException on IO errors
+     */
+    public static String read(Reader reader) throws IOException {
+        try {
+            // CharArrayWriter is purely in-memory, so it needs no close()
+            CharArrayWriter writer = new CharArrayWriter();
+            char[] buffer = new char[4096];
+            // read() signals end of stream with -1; a zero-length read
+            // must not be mistaken for the end of the content
+            int n = reader.read(buffer);
+            while (n != -1) {
+                writer.write(buffer, 0, n);
+                n = reader.read(buffer);
+            }
+            return writer.toString();
+        } finally {
+            reader.close();
+        }
+    }
+
+}
Index: src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java
===================================================================
--- src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (revision 0)
+++ src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (revision 0)
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Test case for {@link EmptyTextExtractor}.
+ */
+public class EmptyTextExtractorTest extends TestCase {
+
+    /**
+     * The extractor instance under test.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Sets up an extractor registered for a single content type.
+     */
+    protected void setUp() {
+        extractor = new EmptyTextExtractor("test/type");
+    }
+
+    /**
+     * Checks that the extractor reports exactly the configured type.
+     */
+    public void testContentTypes() {
+        String[] reported = extractor.getContentTypes();
+        Set types = new HashSet(Arrays.asList(reported));
+        assertTrue(
+            "EmptyTextExtractor does not support the given content type",
+            types.contains("test/type"));
+        assertEquals(
+            "EmptyTextExtractor supports unknown content types",
+            1, types.size());
+    }
+
+    /**
+     * Checks that nothing is extracted from a stream that has content.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        byte[] content = "some test content".getBytes();
+        Reader reader = extractor.extractText(
+            new ByteArrayInputStream(content), "text/plain", null);
+        assertEquals("", TestHelper.read(reader));
+    }
+
+}
Index: src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java
===================================================================
--- src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (revision 0)
+++ src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (revision 0)
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link XMLTextExtractor} class.
+ */
+public class XMLTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() {
+        extractor = new XMLTextExtractor();
+    }
+
+    /**
+     * Tests that the extractor supports <code>text/xml</code> and
+     * <code>application/xml</code>, and nothing else.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+            "XMLTextExtractor does not support text/xml",
+            types.contains("text/xml"));
+        assertTrue(
+            "XMLTextExtractor does not support application/xml",
+            types.contains("application/xml"));
+        assertEquals(
+            "XMLTextExtractor supports unknown content types",
+            2, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     */
+    public void testEmptyStream() {
+        try {
+            Reader reader = extractor.extractText(
+                new ByteArrayInputStream(new byte[0]), "text/xml", null);
+            assertEquals("", TestHelper.read(reader));
+        } catch (IOException e) {
+            fail("XMLTextExtractor does not handle empty streams");
+        }
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String xml = "<root attr=\"attribute value\">text content</root>";
+        Reader reader = extractor.extractText(
+            new ByteArrayInputStream(xml.getBytes()), "text/xml", null);
+        assertEquals("attribute value text content", TestHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles XML parse errors.
+     */
+    public void testInvalidStream() {
+        try {
+            // The mismatched end tag makes the document not well-formed
+            String xml = "<root attr=\"attribute value\">text content</other>";
+            Reader reader = extractor.extractText(
+                new ByteArrayInputStream(xml.getBytes()), "text/xml", null);
+            assertEquals("", TestHelper.read(reader));
+        } catch (IOException e) {
+            fail("XMLTextExtractor does not handle XML parse errors");
+        }
+    }
+
+    /**
+     * Tests that the extractor correctly handles unsupported encodings.
+     */
+    public void testUnsupportedEncoding() {
+        try {
+            String xml = "<root attr=\"attribute value\">text content</root>";
+            Reader reader = extractor.extractText(
+                new ByteArrayInputStream(xml.getBytes()),
+                "text/xml", "unsupported");
+            assertEquals("", TestHelper.read(reader));
+        } catch (UnsupportedEncodingException e) {
+            fail("XMLTextExtractor does not handle unsupported encodings");
+        } catch (IOException e) {
+            fail("XMLTextExtractor does not handle unsupported encodings");
+        }
+    }
+
+}
Index: src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java
===================================================================
--- src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (revision 0)
+++ src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (revision 0)
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Test case for {@link CompositeTextExtractor}.
+ */
+public class CompositeTextExtractorTest extends TestCase {
+
+    /**
+     * The composite extractor under test.
+     */
+    private CompositeTextExtractor extractor;
+
+    /**
+     * Sets up a composite of the plain text and XML extractors.
+     */
+    protected void setUp() {
+        CompositeTextExtractor composite = new CompositeTextExtractor();
+        composite.addTextExtractor(new PlainTextExtractor());
+        composite.addTextExtractor(new XMLTextExtractor());
+        extractor = composite;
+    }
+
+    /**
+     * Checks that the composite reports exactly the content types of
+     * its component extractors.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+            "CompositeTextExtractor does not support component types",
+            types.contains("text/plain"));
+        assertTrue(
+            "CompositeTextExtractor does not support component types",
+            types.contains("text/xml"));
+        assertTrue(
+            "CompositeTextExtractor does not support component types",
+            types.contains("application/xml"));
+        assertEquals(
+            "CompositeTextExtractor supports unknown content types",
+            3, types.size());
+    }
+
+    /**
+     * Checks that an empty stream yields empty text.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testEmptyStream() throws IOException {
+        ByteArrayInputStream empty = new ByteArrayInputStream(new byte[0]);
+        Reader reader = extractor.extractText(empty, "text/plain", null);
+        assertEquals("", TestHelper.read(reader));
+    }
+
+    /**
+     * Checks that plain text content is passed through unchanged.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String text = "some test content";
+        ByteArrayInputStream in = new ByteArrayInputStream(text.getBytes());
+        Reader reader = extractor.extractText(in, "text/plain", null);
+        assertEquals(text, TestHelper.read(reader));
+    }
+
+    /**
+     * Checks that an unsupported content type results in empty text.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testUnsupportedEncoding() throws IOException {
+        byte[] content = "some test content".getBytes();
+        // No component extractor is registered for this content type
+        Reader reader = extractor.extractText(
+            new ByteArrayInputStream(content), "unsupported", null);
+        assertEquals("", TestHelper.read(reader));
+    }
+
+}
Index: src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java
===================================================================
--- src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (revision 0)
+++ src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (revision 0)
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Test case for {@link PlainTextExtractor}.
+ */
+public class PlainTextExtractorTest extends TestCase {
+
+    /**
+     * The extractor instance under test.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Sets up a fresh plain text extractor for each test.
+     */
+    protected void setUp() {
+        extractor = new PlainTextExtractor();
+    }
+
+    /**
+     * Checks that text/plain is the only supported content type.
+     */
+    public void testContentTypes() {
+        String[] reported = extractor.getContentTypes();
+        Set types = new HashSet(Arrays.asList(reported));
+        assertTrue(
+            "PlainTextExtractor does not support text/plain",
+            types.contains("text/plain"));
+        assertEquals(
+            "PlainTextExtractor supports unknown content types",
+            1, types.size());
+    }
+
+    /**
+     * Checks that an empty stream yields empty text.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testEmptyStream() throws IOException {
+        ByteArrayInputStream empty = new ByteArrayInputStream(new byte[0]);
+        Reader reader = extractor.extractText(empty, "text/plain", null);
+        assertEquals("", TestHelper.read(reader));
+    }
+
+    /**
+     * Checks that the content of a stream is returned unchanged.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        String text = "some test content";
+        ByteArrayInputStream in = new ByteArrayInputStream(text.getBytes());
+        Reader reader = extractor.extractText(in, "text/plain", null);
+        assertEquals(text, TestHelper.read(reader));
+    }
+
+    /**
+     * Checks that an unsupported encoding results in empty text.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testUnsupportedEncoding() throws IOException {
+        try {
+            byte[] content = "some test content".getBytes();
+            ByteArrayInputStream in = new ByteArrayInputStream(content);
+            Reader reader = extractor.extractText(
+                in, "text/plain", "unsupported");
+            assertEquals("", TestHelper.read(reader));
+        } catch (UnsupportedEncodingException e) {
+            fail("PlainTextExtractor does not handle unsupported encodings");
+        }
+    }
+
+}
Index: src/test/java/org/apache/jackrabbit/extractor/TestAll.java
===================================================================
--- src/test/java/org/apache/jackrabbit/extractor/TestAll.java (revision 0)
+++ src/test/java/org/apache/jackrabbit/extractor/TestAll.java (revision 0)
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Aggregate suite of the text extractor test cases.
+ */
+public class TestAll extends TestCase {
+
+    /**
+     * Builds a {@link Test} suite that runs the four text extractor
+     * test case classes of this package.
+     *
+     * @return suite of the text extractor tests
+     */
+    public static Test suite() {
+        TestSuite all = new TestSuite("Text extractor test cases");
+        all.addTestSuite(CompositeTextExtractorTest.class);
+        all.addTestSuite(EmptyTextExtractorTest.class);
+        all.addTestSuite(PlainTextExtractorTest.class);
+        all.addTestSuite(XMLTextExtractorTest.class);
+        return all;
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (revision 0)
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Dummy text extractor that always returns an empty reader for all documents.
+ * Useful as a placeholder handler for unsupported content types.
+ */
+public class EmptyTextExtractor implements TextExtractor {
+
+    /**
+     * Content types this extractor claims to support.
+     */
+    private final String[] contentTypes;
+
+    /**
+     * Creates a dummy text extractor for the given content types.
+     * The array is stored as-is (no copy is made), so it must not be
+     * modified after it has been passed to this constructor.
+     *
+     * @param types supported content types
+     */
+    public EmptyTextExtractor(String[] types) {
+        contentTypes = types;
+    }
+
+    /**
+     * Creates a dummy text extractor for the given content type.
+     *
+     * @param type supported content type
+     */
+    public EmptyTextExtractor(String type) {
+        this(new String[] { type });
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the supported content types.
+     *
+     * @return the array given at construction time
+     */
+    public String[] getContentTypes() {
+        return contentTypes;
+    }
+
+    /**
+     * Discards the document: closes the stream and returns an empty reader.
+     *
+     * @param stream binary stream that simply gets closed
+     * @param type ignored
+     * @param encoding ignored
+     * @return empty reader
+     * @throws IOException if the binary stream can not be closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        stream.close();
+        return new StringReader("");
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (revision 0)
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+/**
+ * Interface for extracting text content from binary streams.
+ */
+public interface TextExtractor {
+
+    /**
+     * Returns the MIME types supported by this extractor. The returned
+     * strings must be in lower case, and the returned array must not be empty.
+     * <p>
+     * The returned array must not be modified.
+     *
+     * @return supported MIME types, lower case
+     */
+    String[] getContentTypes();
+
+    /**
+     * Returns a reader for the text content of the given binary document.
+     * The content type and character encoding (if available and applicable)
+     * are given as arguments. The given content type is guaranteed to be
+     * one of the types reported by {@link #getContentTypes()} unless the
+     * implementation explicitly permits other content types.
+     * <p>
+     * The implementation can choose either to read and parse the given
+     * document immediately or to return a reader that does it incrementally.
+     * The only constraint is that the implementation must close the given
+     * stream latest when the returned reader is closed. The caller on the
+     * other hand is responsible for closing the returned reader.
+     * <p>
+     * The implementation should only throw an exception on transient
+     * errors, i.e. when it can expect to be able to successfully extract
+     * the text content of the same binary at another time. An effort
+     * should be made to recover from syntax errors and other similar problems.
+     * <p>
+     * This method should be thread-safe, i.e. it is possible that this
+     * method is invoked simultaneously by different threads to extract the
+     * text content of different documents. On the other hand the returned
+     * reader does not need to be thread-safe.
+     *
+     * @param stream binary document from which to extract text
+     * @param type MIME type of the given document, lower case
+     * @param encoding the character encoding of the binary data,
+     *                 or <code>null</code> if not available
+     * @return reader for the extracted text content
+     * @throws IOException on transient errors
+     */
+    Reader extractText(InputStream stream, String type, String encoding)
+        throws IOException;
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (revision 0)
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.CharArrayReader;
+import java.io.CharArrayWriter;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+/**
+ * Text extractor for XML documents. This class extracts the text content
+ * and attribute values from XML documents.
+ * <p>
+ * This class can handle any XML-based format
+ * (<code>application/xml+something</code>), not just the base XML content
+ * types reported by {@link #getContentTypes()}. However, it often makes
+ * sense to use more specialized extractors that better understand the
+ * specific content type.
+ */
+public class XMLTextExtractor implements TextExtractor {
+
+    /**
+     * Supported content types.
+     */
+    private static final String[] CONTENT_TYPES =
+        new String[] { "text/xml", "application/xml" };
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the supported content types.
+     *
+     * @return <code>text/xml</code> and <code>application/xml</code>
+     */
+    public String[] getContentTypes() {
+        return CONTENT_TYPES;
+    }
+
+    /**
+     * Returns a reader for the text content of the given XML document.
+     * Returns an empty reader if the given encoding is not supported or
+     * if the XML document could not be parsed.
+     *
+     * @param stream XML document, always closed by this method
+     * @param type XML content type
+     * @param encoding character encoding, or <code>null</code>
+     * @return reader for the extracted text, empty if extraction failed
+     * @throws IOException if the XML document can not be read or closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            CharArrayWriter writer = new CharArrayWriter();
+            ExtractorHandler handler = new ExtractorHandler(writer);
+            // TODO: Use a pull parser to avoid the memory overhead
+            // NOTE(review): no features are set; DTD/entity handling is default
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            SAXParser parser = factory.newSAXParser();
+            XMLReader reader = parser.getXMLReader();
+            reader.setContentHandler(handler);
+            reader.setErrorHandler(handler);
+
+            // Whether the parser closes the stream is unspecified, so close()
+            // calls from the parser are swallowed here and the stream is
+            // closed exactly once in the finally block below.
+            InputSource source = new InputSource(new FilterInputStream(stream) {
+                public void close() {
+                }
+            });
+            if (encoding != null) {
+                source.setEncoding(encoding);
+            }
+            reader.parse(source);
+            return new CharArrayReader(writer.toCharArray());
+        } catch (ParserConfigurationException e) {
+            return new StringReader("");
+        } catch (SAXException e) {
+            return new StringReader("");
+        } catch (java.io.UnsupportedEncodingException e) {
+            // Some parsers report an unknown encoding as an IOException
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (revision 0)
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Composite text extractor. Combines a set of {@link TextExtractor}
+ * instances behind a single interface: it reports every content type
+ * registered by its components and forwards each extraction call to
+ * the component registered for the requested content type.
+ */
+public class CompositeTextExtractor implements TextExtractor {
+
+    /**
+     * Component {@link TextExtractor} instances, keyed by content type.
+     */
+    private final Map extractors = new HashMap();
+
+    /**
+     * Registers a component extractor for every content type it reports,
+     * replacing any extractor previously registered for the same type.
+     *
+     * @param extractor component extractor
+     */
+    public void addTextExtractor(TextExtractor extractor) {
+        String[] supported = extractor.getContentTypes();
+        for (int index = 0; index < supported.length; index++) {
+            extractors.put(supported[index], extractor);
+        }
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns all the content types registered by the component extractors.
+     *
+     * @return supported content types, as a newly allocated array
+     */
+    public String[] getContentTypes() {
+        Set keys = extractors.keySet();
+        String[] types = new String[keys.size()];
+        return (String[]) keys.toArray(types);
+    }
+
+    /**
+     * Delegates to the extractor registered for the given content type.
+     * Without one, the stream is just closed and an empty reader returned.
+     *
+     * @param stream binary stream
+     * @param type content type
+     * @param encoding optional character encoding
+     * @return reader for the text content of the binary stream
+     * @throws IOException if the binary stream can not be read
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        TextExtractor delegate = (TextExtractor) extractors.get(type);
+        if (delegate == null) {
+            // Unknown content type: discard the document
+            stream.close();
+            return new StringReader("");
+        }
+        return delegate.extractText(stream, type, encoding);
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (revision 0)
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Text extractor for plain text.
+ */
+public class PlainTextExtractor implements TextExtractor {
+
+    /**
+     * Supported content types. Shared by all instances of this class.
+     */
+    private static final String[] CONTENT_TYPES = new String[] { "text/plain" };
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns the supported content types as a copy that is safe to modify.
+     *
+     * @return text/plain
+     */
+    public String[] getContentTypes() {
+        // Arrays are mutable, so never hand out the shared constant itself.
+        return (String[]) CONTENT_TYPES.clone();
+    }
+
+    /**
+     * Wraps the given input stream to an {@link InputStreamReader} using
+     * the given encoding, or the platform default encoding if the encoding
+     * is not given. Closes the stream and returns an empty reader if the
+     * given encoding is not supported.
+     *
+     * @param stream binary stream
+     * @param type ignored
+     * @param encoding character encoding, optional
+     * @return reader for the plain text content
+     * @throws IOException if the binary stream can not be closed in case
+     *                     of an encoding issue
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        if (encoding == null) {
+            return new InputStreamReader(stream);
+        }
+        try {
+            return new InputStreamReader(stream, encoding);
+        } catch (UnsupportedEncodingException e) {
+            stream.close();
+            return new StringReader("");
+        }
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (revision 0)
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * Interface for text extractors that need to delegate the extraction
+ * of parts of content documents to another text extractor. This interface
+ * is usually implemented by extractors of composite multimedia or archive
+ * file formats.
+ * <p>
+ * The configured delegate text extractor is usually a composite extractor
+ * that may contain also the delegating extractor, thus it is possible for
+ * the extractor to be invoked recursively within a single thread. An
+ * implementation should never pass the full content document to the
+ * delegate extractor to avoid infinite loops.
+ */
+public interface DelegatingTextExtractor extends TextExtractor {
+
+    /**
+     * Sets the text extractor to which this extractor should delegate
+     * any partial text extraction tasks. The given delegate extractor
+     * is expected to be able to handle any content types passed to it.
+     *
+     * @param extractor delegate text extractor
+     */
+    void setDelegateTextExtractor(TextExtractor extractor);
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (revision 0)
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * Composite text extractor that by default contains the standard
+ * text extractors found in this package.
+ */
+public class DefaultTextExtractor extends CompositeTextExtractor {
+
+    /**
+     * Creates the default text extractor and registers a
+     * {@link PlainTextExtractor} and an {@link XMLTextExtractor} with it.
+     */
+    public DefaultTextExtractor() {
+        addTextExtractor(new PlainTextExtractor());
+        addTextExtractor(new XMLTextExtractor());
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (revision 0)
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Utility class for extracting text content from an XML document.
+ * An instance of this class is a SAX event handler that extracts
+ * character data and attribute values from the SAX events and writes
+ * the extracted content to a given {@link Writer}.
+ * <p>
+ * Any whitespace sequences are imploded into a single space character
+ * and consecutive attribute values and character data are delimited
+ * using spaces.
+ * <p>
+ * This class also implements the {@link ErrorHandler} interface by
+ * ignoring all errors and warnings. This is useful in avoiding the
+ * default console output or other error logging of many XML parsers.
+ *
+ * @see XMLTextExtractor
+ */
+class ExtractorHandler extends DefaultHandler implements ErrorHandler {
+
+    /**
+     * Separator that is written between consecutive text and attribute values.
+     */
+    private static final char SPACE = ' ';
+
+    /**
+     * The writer to which the selected text content is written.
+     */
+    private final Writer writer;
+
+    /**
+     * Flag for outputting a space before the next character to be outputted.
+     * Used to implode all whitespace sequences and to separate consecutive
+     * attribute values and text elements.
+     */
+    private boolean space;
+
+    /**
+     * Creates a handler that writes the extracted text to the given writer.
+     *
+     * @param writer writer to which the XML text content is written
+     */
+    public ExtractorHandler(Writer writer) {
+        this.writer = writer;
+        this.space = false;
+    }
+
+    //------------------------------------------------------< DefaultHandler >
+
+    /**
+     * Writes attribute values to the underlying writer.
+     *
+     * @param uri ignored
+     * @param local ignored
+     * @param name ignored
+     * @param attributes attributes, whose values to extract
+     * @throws SAXException on IO errors
+     */
+    public void startElement(
+            String uri, String local, String name, Attributes attributes)
+            throws SAXException {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            String value = attributes.getValue(i);
+            characters(value.toCharArray(), 0, value.length());
+        }
+    }
+
+    /**
+     * Writes the given characters, collapsing whitespace runs into one space.
+     *
+     * @param ch character array that contains the characters to be written
+     * @param start start index within the array
+     * @param length number of characters to write
+     * @throws SAXException on IO errors, with the IOException as its cause
+     */
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        try {
+            for (int i = 0; i < length; i++) {
+                char c = ch[start + i];
+                if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
+                    space = true;
+                } else {
+                    if (space) {
+                        writer.write(SPACE);
+                        space = false;
+                    }
+                    writer.write(c);
+                }
+            }
+            space = true;
+        } catch (IOException e) {
+            throw new SAXException(e.getMessage(), e);
+        }
+    }
+
+    //--------------------------------------------------------< ErrorHandler >
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void warning(SAXParseException exception) {
+    }
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void error(SAXParseException exception) {
+    }
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void fatalError(SAXParseException exception) {
+    }
+
+}