Index: src/test/java/org/apache/jackrabbit/extractor/TestHelper.java =================================================================== --- src/test/java/org/apache/jackrabbit/extractor/TestHelper.java (revision 0) +++ src/test/java/org/apache/jackrabbit/extractor/TestHelper.java (revision 0) @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.io.Reader; + +/** + * Helper class for text extractor unit tests. + */ +class TestHelper { + + /** + * Private constructor to prevent instantiation. + */ + private TestHelper() { + }; + + /** + * Reads the given reader to its end, closes it and returns the content.
+ * + * @param reader reader to be read and closed + * @return entire content of the reader as a string + * @throws IOException if the reader can not be read or closed + */ + public static String read(Reader reader) throws IOException { + try { + CharArrayWriter writer = new CharArrayWriter(); + try { + char[] buffer = new char[4096]; + int n = reader.read(buffer); + while (n > 0) { + writer.write(buffer, 0, n); + n = reader.read(buffer); + } + } finally { + writer.close(); + } + return new String(writer.toCharArray()); + } finally { + reader.close(); + } + } + +} Index: src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java =================================================================== --- src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (revision 0) +++ src/test/java/org/apache/jackrabbit/extractor/EmptyTextExtractorTest.java (revision 0) @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link EmptyTextExtractor} class. + */ +public class EmptyTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private TextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() { + extractor = new EmptyTextExtractor("test/type"); + } + + /** + * Tests that the extractor reports only the configured content type. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "EmptyTextExtractor does not support the given content type", + types.contains("test/type")); + assertEquals( + "EmptyTextExtractor supports unknown content types", + 1, types.size()); + } + + /** + * Tests that the extractor returns an empty reader for a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), "text/plain", null); + assertEquals("", TestHelper.read(reader)); + } + +} Index: src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java =================================================================== --- src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (revision 0) +++ src/test/java/org/apache/jackrabbit/extractor/XMLTextExtractorTest.java (revision 0) @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link XMLTextExtractor} class. + */ +public class XMLTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private TextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() { + extractor = new XMLTextExtractor(); + } + + /** + * Tests that the extractor supports exactly text/xml and + * application/xml. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "XMLTextExtractor does not support text/xml", + types.contains("text/xml")); + assertTrue( + "XMLTextExtractor does not support application/xml", + types.contains("application/xml")); + assertEquals( + "XMLTextExtractor supports unknown content types", + 2, types.size()); + } + + /** + * Tests that the extractor returns an empty reader for an empty stream.
+ */ + public void testEmptyStream() { + try { + Reader reader = extractor.extractText( + new ByteArrayInputStream(new byte[0]), "text/xml", null); + assertEquals("", TestHelper.read(reader)); + } catch (IOException e) { + fail("XMLTextExtractor does not handle empty streams"); + } + } + + /** + * Tests that the extractor correctly handles a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String xml = "text content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(xml.getBytes()), "text/xml", null); + assertEquals("attribute value text content", TestHelper.read(reader)); + } + + /** + * Tests that the extractor returns an empty reader on XML parse errors. + */ + public void testInvalidStream() { + try { + String xml = "text content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(xml.getBytes()), "text/xml", null); + assertEquals("", TestHelper.read(reader)); + } catch (IOException e) { + fail("XMLTextExtractor does not handle XML parse errors"); + } + } + + /** + * Tests that an unsupported encoding yields an empty reader.
+ */ + public void testUnsupportedEncoding() { + try { + String xml = "text content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(xml.getBytes()), + "text/xml", "unsupported"); + assertEquals("", TestHelper.read(reader)); + } catch (UnsupportedEncodingException e) { + fail("XMLTextExtractor does not handle unsupported encodings"); + } catch (IOException e) { + fail("XMLTextExtractor does not handle unsupported encodings"); + } + } + +} Index: src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java =================================================================== --- src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (revision 0) +++ src/test/java/org/apache/jackrabbit/extractor/CompositeTextExtractorTest.java (revision 0) @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link CompositeTextExtractor} class.
+ */ +public class CompositeTextExtractorTest extends TestCase { + + /** + * Text extractor being tested. + */ + private CompositeTextExtractor extractor; + + /** + * Creates the composite extractor with plain text and XML components. + */ + protected void setUp() { + extractor = new CompositeTextExtractor(); + extractor.addTextExtractor(new PlainTextExtractor()); + extractor.addTextExtractor(new XMLTextExtractor()); + } + + /** + * Tests that the extractor supports exactly the content types of the + * component extractors. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "CompositeTextExtractor does not support component types", + types.contains("text/plain")); + assertTrue( + "CompositeTextExtractor does not support component types", + types.contains("text/xml")); + assertTrue( + "CompositeTextExtractor does not support component types", + types.contains("application/xml")); + assertEquals( + "CompositeTextExtractor supports unknown content types", + 3, types.size()); + } + + /** + * Tests that the extractor correctly handles an empty stream. + * + * @throws IOException on IO errors + */ + public void testEmptyStream() throws IOException { + Reader reader = extractor.extractText( + new ByteArrayInputStream(new byte[0]), "text/plain", null); + assertEquals("", TestHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), "text/plain", null); + assertEquals(text, TestHelper.read(reader)); + } + + /** + * Tests that an unsupported content type yields an empty reader.
+ * + * @throws IOException on IO errors + */ + public void testUnsupportedEncoding() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), + "unsupported", null); + assertEquals("", TestHelper.read(reader)); + } + +} Index: src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java =================================================================== --- src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (revision 0) +++ src/test/java/org/apache/jackrabbit/extractor/PlainTextExtractorTest.java (revision 0) @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import junit.framework.TestCase; + +/** + * Unit tests for the {@link PlainTextExtractor} class. + */ +public class PlainTextExtractorTest extends TestCase { + + /** + * Text extractor being tested.
+ */ + private TextExtractor extractor; + + /** + * Creates the text extractor to be tested. + */ + protected void setUp() { + extractor = new PlainTextExtractor(); + } + + /** + * Tests that the extractor supports exactly text/plain. + */ + public void testContentTypes() { + Set types = new HashSet(); + types.addAll(Arrays.asList(extractor.getContentTypes())); + assertTrue( + "PlainTextExtractor does not support text/plain", + types.contains("text/plain")); + assertEquals( + "PlainTextExtractor supports unknown content types", + 1, types.size()); + } + + /** + * Tests that the extractor correctly handles an empty stream. + * + * @throws IOException on IO errors + */ + public void testEmptyStream() throws IOException { + Reader reader = extractor.extractText( + new ByteArrayInputStream(new byte[0]), "text/plain", null); + assertEquals("", TestHelper.read(reader)); + } + + /** + * Tests that the extractor correctly handles a normal stream. + * + * @throws IOException on IO errors + */ + public void testNormalStream() throws IOException { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), "text/plain", null); + assertEquals(text, TestHelper.read(reader)); + } + + /** + * Tests that an unsupported encoding yields an empty reader.
+ * + * @throws IOException on IO errors + */ + public void testUnsupportedEncoding() throws IOException { + try { + String text = "some test content"; + Reader reader = extractor.extractText( + new ByteArrayInputStream(text.getBytes()), + "text/plain", "unsupported"); + assertEquals("", TestHelper.read(reader)); + } catch (UnsupportedEncodingException e) { + fail("PlainTextExtractor does not handle unsupported encodings"); + } + } + +} Index: src/test/java/org/apache/jackrabbit/extractor/TestAll.java =================================================================== --- src/test/java/org/apache/jackrabbit/extractor/TestAll.java (revision 0) +++ src/test/java/org/apache/jackrabbit/extractor/TestAll.java (revision 0) @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * Test suite for text extractor test cases. + */ +public class TestAll extends TestCase { + + /** + * Returns a {@link Test} suite that executes all tests inside this + * package.
+ * + * @return test suite + */ + public static Test suite() { + TestSuite suite = new TestSuite("Text extractor test cases"); + suite.addTestSuite(CompositeTextExtractorTest.class); + suite.addTestSuite(EmptyTextExtractorTest.class); + suite.addTestSuite(PlainTextExtractorTest.class); + suite.addTestSuite(XMLTextExtractorTest.class); + return suite; + } + +} Index: src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java =================================================================== --- src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (revision 0) +++ src/main/java/org/apache/jackrabbit/extractor/EmptyTextExtractor.java (revision 0) @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; + +/** + * Dummy text extractor that always returns an empty reader for all documents. + * Useful as a dummy handler for unsupported content types. + */ +public class EmptyTextExtractor implements TextExtractor { + + /** + * Supported content types.
+ */ + private final String[] types; + + /** + * Creates a dummy text extractor for the given content types. + * The given array must not be modified after it has been passed + * to this constructor. + * + * @param types supported content types + */ + public EmptyTextExtractor(String[] types) { + this.types = types; + } + + /** + * Creates a dummy text extractor for the given content type. + * + * @param type supported content type + */ + public EmptyTextExtractor(String type) { + this(new String[] { type }); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * Returns the supported content types. + * + * @return supported content types + */ + public String[] getContentTypes() { + return types; + } + + /** + * Closes the given stream and returns an empty reader. + * + * @param stream binary stream that simply gets closed + * @param type ignored + * @param encoding ignored + * @return empty reader + * @throws IOException if the binary stream can not be closed + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + stream.close(); + return new StringReader(""); + } + +} Index: src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java =================================================================== --- src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (revision 0) +++ src/main/java/org/apache/jackrabbit/extractor/TextExtractor.java (revision 0) @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; + +/** + * Interface for extracting text content from binary streams. + */ +public interface TextExtractor { + + /** + * Returns the MIME types supported by this extractor. The returned + * strings must be in lower case, and the returned array must not be empty. + *

+ * The returned array must not be modified. + * + * @return supported MIME types, lower case + */ + String[] getContentTypes(); + + /** + * Returns a reader for the text content of the given binary document. + * The content type and character encoding (if available and applicable) + * are given as arguments. The given content type is guaranteed to be + * one of the types reported by {@link #getContentTypes()} unless the + * implementation explicitly permits other content types. + *

+ * The implementation can choose either to read and parse the given + * document immediately or to return a reader that does it incrementally. + * The only constraint is that the implementation must close the given + * stream by the time the returned reader is closed. The caller, on the + * other hand, is responsible for closing the returned reader. + *

+ * The implementation should only throw an exception on transient + * errors, i.e. when it can expect to be able to successfully extract + * the text content of the same binary at another time. An effort + * should be made to recover from syntax errors and other similar problems. + *

+ * This method should be thread-safe, i.e. it is possible that this + * method is invoked simultaneously by different threads to extract the + * text content of different documents. On the other hand the returned + * reader does not need to be thread-safe. + * + * @param stream binary document from which to extract text + * @param type MIME type of the given document, lower case + * @param encoding the character encoding of the binary data, + * or null if not available + * @return reader for the extracted text content + * @throws IOException on transient errors + */ + Reader extractText(InputStream stream, String type, String encoding) + throws IOException; + +} Index: src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java =================================================================== --- src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (revision 0) +++ src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (revision 0) @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.jackrabbit.extractor; + +import java.io.CharArrayReader; +import java.io.CharArrayWriter; +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +/** + * Text extractor for XML documents. This class extracts the text content + * and attribute values from XML documents. + *

+ * This class can handle any XML-based format + * (application/something+xml), not just the base XML content + * types reported by {@link #getContentTypes()}. However, it often makes + * sense to use more specialized extractors that better understand the + * specific content type. + */ +public class XMLTextExtractor implements TextExtractor { + + /** + * Supported content types. + */ + private static final String[] CONTENT_TYPES = + new String[] { "text/xml", "application/xml" }; + + //-------------------------------------------------------< TextExtractor > + + /** + * Returns the supported content types. + * + * @return text/xml and application/xml + */ + public String[] getContentTypes() { + return CONTENT_TYPES; + } + + /** + * Returns a reader for the text content of the given XML document. + * Returns an empty reader if the given encoding is not supported or + * if the XML document could not be parsed. + * + * @param stream XML document + * @param type XML content type + * @param encoding character encoding, or null + * @return reader for the text content of the given XML document, + * or an empty reader if the document could not be parsed + * @throws IOException if the XML document stream can not be read or closed + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + try { + CharArrayWriter writer = new CharArrayWriter(); + ExtractorHandler handler = new ExtractorHandler(writer); + + // TODO: Use a pull parser to avoid the memory overhead + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser parser = factory.newSAXParser(); + XMLReader reader = parser.getXMLReader(); + reader.setContentHandler(handler); + reader.setErrorHandler(handler); + + // It is unspecified whether the XML parser closes the stream when + // done parsing.
To ensure that the stream gets closed just once, + // we prevent the parser from closing it by catching the close() + // call and explicitly close the stream in a finally block. + InputSource source = new InputSource(new FilterInputStream(stream) { + public void close() { + } + }); + if (encoding != null) { + source.setEncoding(encoding); + } + reader.parse(source); + + return new CharArrayReader(writer.toCharArray()); + } catch (ParserConfigurationException e) { + return new StringReader(""); + } catch (SAXException e) { + return new StringReader(""); + } finally { + stream.close(); + } + } + +} Index: src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java =================================================================== --- src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (revision 0) +++ src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (revision 0) @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * Composite text extractor. This class presents a unified interface + * for a set of {@link TextExtractor} instances. The composite extractor + * supports all the content types supported by the component extractors, + * and delegates text extraction calls to the appropriate components. + */ +public class CompositeTextExtractor implements TextExtractor { + + /** + * Configured {@link TextExtractor} instances, keyed by content type. + */ + private final Map extractors = new HashMap(); + + /** + * Adds a component text extractor. The given extractor is registered + * for all the content types it reports, replacing earlier registrations. + * + * @param extractor component extractor + */ + public void addTextExtractor(TextExtractor extractor) { + String[] types = extractor.getContentTypes(); + for (int i = 0; i < types.length; i++) { + extractors.put(types[i], extractor); + } + } + + //-------------------------------------------------------< TextExtractor > + + /** + * Returns all the content types supported by the component extractors. + * + * @return supported content types + */ + public String[] getContentTypes() { + Set types = extractors.keySet(); + return (String[]) types.toArray(new String[types.size()]); + } + + /** + * Extracts text content using one of the component extractors. If an + * extractor for the given content type does not exist, then the binary + * stream is just closed and an empty reader is returned.
+ * + * @param stream binary stream + * @param type content type + * @param encoding optional character encoding + * @return reader for the text content of the binary stream + * @throws IOException if the binary stream can not be read + */ + public Reader extractText(InputStream stream, String type, String encoding) + throws IOException { + TextExtractor extractor = (TextExtractor) extractors.get(type); + if (extractor != null) { + return extractor.extractText(stream, type, encoding); + } else { + stream.close(); + return new StringReader(""); + } + } + +} Index: src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java =================================================================== --- src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (revision 0) +++ src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (revision 0) @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; + +/** + * Text extractor for plain text.
+ */
+public class PlainTextExtractor implements TextExtractor {
+
+    /**
+     * Supported content types; kept private, callers only ever get a copy.
+     */
+    private static final String[] CONTENT_TYPES = new String[] { "text/plain" };
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * Returns a fresh copy of the supported content types.
+     *
+     * @return new array containing only text/plain
+     */
+    public String[] getContentTypes() {
+        return (String[]) CONTENT_TYPES.clone();
+    }
+
+    /**
+     * Wraps the given input stream to an {@link InputStreamReader} using
+     * the given encoding, or the platform default encoding if the encoding
+     * is not given. Closes the stream and returns an empty reader if the
+     * given encoding is not supported.
+     *
+     * @param stream binary stream
+     * @param type ignored
+     * @param encoding character encoding, optional
+     * @return reader for the plain text content
+     * @throws IOException if the binary stream can not be closed in case
+     *                     of an encoding issue
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            if (encoding != null) {
+                return new InputStreamReader(stream, encoding);
+            } else {
+                return new InputStreamReader(stream);
+            }
+        } catch (UnsupportedEncodingException e) {
+            stream.close();
+            return new StringReader("");
+        }
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/DelegatingTextExtractor.java (revision 0)
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * Interface for text extractors that need to delegate the extraction
+ * of parts of content documents to another text extractor. This interface
+ * is usually implemented by extractors of composite multimedia or archive
+ * file formats.
+ * <p>
+ * The configured delegate text extractor is usually a composite extractor
+ * that may contain also the delegating extractor, thus it is possible for
+ * the extractor to be invoked recursively within a single thread. An
+ * implementation should never pass the full content document to the
+ * delegate extractor to avoid infinite loops.
+ */
+public interface DelegatingTextExtractor extends TextExtractor {
+
+    /**
+     * Sets the text extractor to which this extractor should delegate
+     * any partial text extraction tasks. The given delegate extractor
+     * is expected to be able to handle any content types passed to it.
+     *
+     * @param extractor delegate text extractor
+     */
+    void setDelegateTextExtractor(TextExtractor extractor);
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java (revision 0)
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+/**
+ * {@link CompositeTextExtractor} that comes preloaded with the
+ * {@link PlainTextExtractor} and {@link XMLTextExtractor} components.
+ */
+public class DefaultTextExtractor extends CompositeTextExtractor {
+
+    /**
+     * Builds the default extractor by registering one instance of each
+     * standard component: plain text first, then XML.
+     */
+    public DefaultTextExtractor() {
+        this.addTextExtractor(new PlainTextExtractor());
+        this.addTextExtractor(new XMLTextExtractor());
+    }
+
+}
Index: src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java
===================================================================
--- src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (revision 0)
+++ src/main/java/org/apache/jackrabbit/extractor/ExtractorHandler.java (revision 0)
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Utility class for extracting text content from an XML document.
+ * An instance of this class is a SAX event handler that extracts
+ * character data and attribute values from the SAX events and writes
+ * the extracted content to a given {@link Writer}.
+ * <p>
+ * Any whitespace sequences are imploded into a single space character
+ * and consecutive attribute values and character data are delimited
+ * using spaces.
+ * <p>
+ * This class also implements the {@link ErrorHandler} interface by
+ * ignoring all errors and warnings. This is useful in avoiding the
+ * default console output or other error logging of many XML parsers.
+ *
+ * @see XMLTextExtractor
+ */
+class ExtractorHandler extends DefaultHandler implements ErrorHandler {
+
+    /**
+     * Separator that is written between consecutive text and attribute values.
+     */
+    private static final char SPACE = ' ';
+
+    /**
+     * The writer to which the selected text content is written.
+     */
+    private final Writer writer;
+
+    /**
+     * Whether a space is owed before the next non-whitespace character.
+     * Implodes whitespace runs and separates attribute values and text.
+     */
+    private boolean space;
+
+    /**
+     * Creates a handler that writes the extracted text to the given writer.
+     *
+     * @param writer writer to which the XML text content is written
+     */
+    public ExtractorHandler(Writer writer) {
+        this.writer = writer;
+        this.space = false;
+    }
+
+    //------------------------------------------------------< DefaultHandler >
+
+    /**
+     * Writes attribute values to the underlying writer.
+     *
+     * @param uri ignored
+     * @param local ignored
+     * @param name ignored
+     * @param attributes attributes, whose values to extract
+     * @throws SAXException on IO errors
+     */
+    public void startElement(
+            String uri, String local, String name, Attributes attributes)
+            throws SAXException {
+        for (int i = 0; i < attributes.getLength(); i++) {
+            String value = attributes.getValue(i);
+            characters(value.toCharArray(), 0, value.length());
+        }
+    }
+
+    /**
+     * Writes the given characters, imploding each whitespace run to one space.
+     *
+     * @param ch character array that contains the characters to be written
+     * @param start start index within the array
+     * @param length number of characters to write
+     * @throws SAXException wrapping the IOException raised by the writer
+     */
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        try {
+            for (int i = 0; i < length; i++) {
+                char c = ch[start + i];
+                // isWhitespace: tab/CR/LF; isSpaceChar: no-break spaces
+                if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
+                    space = true;
+                } else {
+                    if (space) {
+                        writer.write(SPACE);
+                        space = false;
+                    }
+                    writer.write(c);
+                }
+            }
+            space = true; // always separate this chunk from whatever follows
+        } catch (IOException e) {
+            throw new SAXException(e.getMessage(), e);
+        }
+    }
+
+    //--------------------------------------------------------< ErrorHandler >
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void warning(SAXParseException exception) {
+    }
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void error(SAXParseException exception) {
+    }
+
+    /**
+     * Ignored.
+     *
+     * @param exception ignored
+     */
+    public void fatalError(SAXParseException exception) {
+    }
+
+}