Index: jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java
===================================================================
--- jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java	(revision 0)
+++ jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java	(revision 0)
@@ -0,0 +1,278 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.CharArrayReader;
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Text extractor for png/apng/mng images. This class extracts the text content
+ * from tEXt chunks and renders every chunk as "keyword: text", one chunk per
+ * line.
+ * <p>
+ * Can handle images with the mime types image/png, image/apng and image/mng.
+ */
+public class PngTextExtractor extends AbstractTextExtractor {
+
+    /** Signature (first eight bytes) of a PNG datastream. */
+    private static final byte[] PNG_HEADER = {-119, 80, 78, 71, 13, 10, 26, 10};
+
+    /** Signature (first eight bytes) of an MNG datastream. */
+    private static final byte[] MNG_HEADER = {-119, 77, 78, 71, 13, 10, 26, 10};
+
+    /** Type of the chunk that ends a PNG datastream ("IEND"). */
+    private static final byte[] IEND_CHUNK = {73, 69, 78, 68};
+
+    /** Type of the chunk that ends an MNG datastream ("MEND"). */
+    private static final byte[] MEND_CHUNK = {77, 69, 78, 68};
+
+    /** Type of the textual data chunk ("tEXt"). */
+    private static final byte[] TEXT_CHUNK = {116, 69, 88, 116};
+
+    /** Number of CRC bytes that follow the data of every chunk. */
+    private static final int CRC_LENGTH = 4;
+
+    /** Size of the buffer used to read chunk data piece by piece. */
+    private static final int BUFFER_SIZE = 4096;
+
+    /** Separator written between two extracted entries. */
+    private static final String SEPARATOR =
+        System.getProperty("line.separator");
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(PngTextExtractor.class);
+
+    /**
+     * Creates a new <code>PngTextExtractor</code> instance.
+     */
+    public PngTextExtractor() {
+        super(new String[]{"image/png", "image/apng", "image/mng"});
+    }
+
+    /**
+     * Returns a reader for the text content of the given png image.
+     * Returns an empty reader if the stream does not start with a png or mng
+     * signature, or if it is truncated or otherwise can not be parsed.
+     *
+     * @param stream png image
+     * @param type ignored
+     * @param encoding ignored
+     * @return reader for the text content of the given png image,
+     *         or an empty reader if the image could not be parsed
+     * @throws IOException if the png image stream can not be closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            byte[] signature = new byte[8];
+            if (readFully(stream, signature) < signature.length) {
+                return new StringReader("");
+            }
+            boolean mng = Arrays.equals(MNG_HEADER, signature);
+            if (!mng && !Arrays.equals(PNG_HEADER, signature)) {
+                return new StringReader("");
+            }
+            // An MNG stream embeds complete images, each closed by its own
+            // IEND chunk; only MEND marks the end of the MNG stream itself.
+            byte[] endChunk = mng ? MEND_CHUNK : IEND_CHUNK;
+
+            CharArrayWriter writer = new CharArrayWriter();
+            byte[] length = new byte[4];
+            byte[] chunkType = new byte[4];
+            String sep = "";
+
+            readChunkHeader(stream, length, chunkType);
+            while (!Arrays.equals(chunkType, endChunk)) {
+                int dataLength = calcLen(length);
+                if (dataLength < 0) {
+                    throw new IOException(
+                            "Invalid chunk length: " + dataLength);
+                }
+
+                if (Arrays.equals(chunkType, TEXT_CHUNK)) {
+                    byte[] txtBytes = readChunkData(stream, dataLength);
+                    int nullPos = findOffset(txtBytes, (byte) 0);
+                    // A tEXt chunk without the null separator between keyword
+                    // and text is malformed and contributes nothing.
+                    if (nullPos >= 0) {
+                        String key = new String(
+                                txtBytes, 0, nullPos, "ISO-8859-1");
+                        String value = new String(
+                                txtBytes, nullPos + 1,
+                                txtBytes.length - (nullPos + 1), "ISO-8859-1");
+
+                        // The separator goes between entries, never after
+                        // the last one.
+                        writer.write(sep);
+                        writer.write(key);
+                        writer.write(": ");
+                        writer.write(value);
+                        sep = SEPARATOR;
+                    }
+                    skipFully(stream, CRC_LENGTH);
+                } else {
+                    skipFully(stream, (long) dataLength + CRC_LENGTH);
+                }
+
+                readChunkHeader(stream, length, chunkType);
+            }
+            return new CharArrayReader(writer.toCharArray());
+        } catch (IOException e) {
+            logger.warn("Failed to extract png text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Reads the length and type fields that start every chunk.
+     *
+     * @param stream stream positioned at the start of a chunk
+     * @param length receives the four length bytes
+     * @param chunkType receives the four chunk type bytes
+     * @throws IOException if the stream ends before both fields were read
+     */
+    private static void readChunkHeader(
+            InputStream stream, byte[] length, byte[] chunkType)
+            throws IOException {
+        if (readFully(stream, length) < length.length
+                || readFully(stream, chunkType) < chunkType.length) {
+            throw new IOException("Unexpected end of stream in chunk header");
+        }
+    }
+
+    /**
+     * Reads the data of a chunk. The data is collected piece by piece so
+     * that a bogus length field can not trigger a huge up-front allocation.
+     *
+     * @param stream stream positioned at the start of the chunk data
+     * @param len number of data bytes, must not be negative
+     * @return the chunk data
+     * @throws IOException if the stream ends before all data was read
+     */
+    private static byte[] readChunkData(InputStream stream, int len)
+            throws IOException {
+        ByteArrayOutputStream data =
+            new ByteArrayOutputStream(Math.min(len, BUFFER_SIZE));
+        byte[] buffer = new byte[BUFFER_SIZE];
+        int remaining = len;
+        while (remaining > 0) {
+            int count = stream.read(
+                    buffer, 0, Math.min(remaining, buffer.length));
+            if (count < 0) {
+                throw new IOException("Unexpected end of stream in chunk data");
+            }
+            data.write(buffer, 0, count);
+            remaining -= count;
+        }
+        return data.toByteArray();
+    }
+
+    /**
+     * Fills the given buffer from the stream, reading repeatedly because a
+     * single read may return fewer bytes than requested.
+     *
+     * @param stream stream to read from
+     * @param buffer buffer to fill
+     * @return number of bytes read; less than the buffer length only if
+     *         the end of the stream was reached
+     * @throws IOException on IO errors
+     */
+    private static int readFully(InputStream stream, byte[] buffer)
+            throws IOException {
+        int total = 0;
+        while (total < buffer.length) {
+            int count = stream.read(buffer, total, buffer.length - total);
+            if (count < 0) {
+                break;
+            }
+            total += count;
+        }
+        return total;
+    }
+
+    /**
+     * Skips exactly the given number of bytes.
+     *
+     * @param stream stream to skip in
+     * @param count number of bytes to skip
+     * @throws IOException if the stream ends before all bytes were skipped
+     */
+    private static void skipFully(InputStream stream, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if (skipped <= 0) {
+                // skip() may return 0 without being at the end of the
+                // stream, only read() can tell the two cases apart.
+                if (stream.read() < 0) {
+                    throw new IOException("Unexpected end of stream");
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+    }
+
+    /**
+     * Decodes the four byte big-endian length field of a chunk.
+     *
+     * @param length the four length bytes
+     * @return decoded length; negative if the most significant bit is set
+     */
+    private int calcLen(byte[] length) {
+        int len = 0x00FF & length[0];
+        len <<= 8;
+        len |= 0x00FF & length[1];
+        len <<= 8;
+        len |= 0x00FF & length[2];
+        len <<= 8;
+        len |= 0x00FF & length[3];
+        return len;
+    }
+
+    /**
+     * Returns the index of the first occurrence of a byte value.
+     *
+     * @param data bytes to search
+     * @param val value to look for
+     * @return index of the first match, or -1 if there is none
+     */
+    int findOffset(byte[] data, byte val) {
+        for (int i = 0; i < data.length; i++) {
+            if (data[i] == val) {
+                return i;
+            }
+        }
+
+        return -1;
+    }
+}
Index: jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
===================================================================
--- jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java	(revision 0)
+++ jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java	(revision 0)
@@ -0,0 +1,122 @@
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+public class PngTextExtractorTest extends TestCase {
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new PngTextExtractor();
+    }
+
+    /**
+     * Tests that the extractor supports image/png, image/apng and
+     * image/mng.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet();
+        types.addAll(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+                "PngTextExtractor does not support image/png",
+                types.contains("image/png"));
+        assertTrue(
+                "PngTextExtractor does not support image/apng",
+                types.contains("image/apng"));
+        assertTrue(
+                "PngTextExtractor does not support image/mng",
+                types.contains("image/mng"));
+        assertEquals(
+                "PngTextExtractor supports unknown content types",
+                3, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     */
+    public void testEmptyStream() {
+        try {
+            Reader reader = extractor.extractText(
+                    new ByteArrayInputStream(new byte[0]), "image/png", null);
+            assertEquals("", ExtractorHelper.read(reader));
+        } catch (IOException e) {
+            fail("PngTextExtractor does not handle empty streams");
+        }
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        byte[] png = {-119, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82,
+                0, 0, 0, 1, 0, 0, 0, 1, 8, 6, 0, 0, 0, 31, 21, -60,
+                -119, 0, 0, 0, 6, 98, 75, 71, 68, 0, -1, 0, -1, 0, -1, -96,
+                -67, -89, -109, 0, 0, 0, 9, 112, 72, 89, 115, 0, 0, 11, 19, 0,
+                0, 11, 19, 1, 0, -102, -100, 24, 0, 0, 0, 7, 116, 73, 77, 69,
+                7, -40, 4, 6, 5, 59, 15, 72, -108, -3, -68, 0, 0, 0, 52, 116,
+                69, 88, 116, 67, 111, 109, 109, 101, 110, 116, 0, 84, 104, 101, 32, 113,
+                117, 105, 99, 107, 32, 98, 114, 111, 119, 110, 32, 102, 111, 120, 32, 106,
+                117, 109, 112, 115, 32, 111, 118, 101, 114, 32, 116, 104, 101, 32, 108, 97,
+                122, 121, 32, 100, 111, 103, 46, 55, 79, -28, -66, 0, 0, 0, 13, 73,
+                68, 65, 84, 8, -41, 99, -8, -33, -64, -16, 31, 0, 6, -128, 2, 127,
+                -21, 73, 116, -101, 0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126};
+
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(png), "image/png", null);
+        assertEquals("Comment: The quick brown fox jumps over the lazy dog.", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the entries of several tEXt chunks are separated by the
+     * line separator and that no separator follows the last entry.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testMultipleTextChunks() throws IOException {
+        byte[] png = {-119, 80, 78, 71, 13, 10, 26, 10,
+                0, 0, 0, 3, 116, 69, 88, 116, 65, 0, 49, 0, 0, 0, 0,
+                0, 0, 0, 3, 116, 69, 88, 116, 66, 0, 50, 0, 0, 0, 0,
+                0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126};
+
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(png), "image/png", null);
+        StringBuffer text = new StringBuffer();
+        for (int c = reader.read(); c != -1; c = reader.read()) {
+            text.append((char) c);
+        }
+        assertEquals(
+                "A: 1" + System.getProperty("line.separator") + "B: 2",
+                text.toString());
+    }
+
+    /**
+     * Tests that a stream that ends in the middle of a chunk yields an
+     * empty reader instead of blocking the extractor.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testTruncatedStream() throws IOException {
+        byte[] png = {-119, 80, 78, 71, 13, 10, 26, 10,
+                0, 0, 0, 13, 73, 72, 68, 82, 0, 0, 0, 1};
+
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(png), "image/png", null);
+        assertEquals("", ExtractorHelper.read(reader));
+    }
+
+}