Index: jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java =================================================================== --- jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java (revision 0) +++ jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PngTextExtractor.java (revision 0) @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import java.io.CharArrayReader; +import java.io.CharArrayWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.util.Arrays; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Text extractor for png/apng/mng images. This class extracts the text content + * from tEXt chunks. + *
+ * Handles images with the MIME types
+ * image/png, image/apng and image/mng.
+ */
+public class PngTextExtractor extends AbstractTextExtractor {
+
+    /** Signature bytes that open a PNG stream. */
+    private static final byte[] PNG_HEADER = {-119, 80, 78, 71, 13, 10, 26, 10};
+
+    /** Signature bytes that open an MNG stream. */
+    private static final byte[] MNG_HEADER = {-119, 77, 78, 71, 13, 10, 26, 10};
+
+    /** Chunk type "IEND"; reading stops when this chunk is reached. */
+    private static final byte[] IEND_CHUNK = {73, 69, 78, 68};
+
+    /** Chunk type "tEXt"; the only chunk type whose data is extracted. */
+    private static final byte[] TEXT_CHUNK = {116, 69, 88, 116};
+
+    /** Number of CRC bytes that follow the data of every chunk. */
+    private static final int CRC_LENGTH = 4;
+
+    /** Separator written between the entries of consecutive tEXt chunks. */
+    private static final String LINE_SEPARATOR =
+            System.getProperty("line.separator");
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+            LoggerFactory.getLogger(PngTextExtractor.class);
+
+    /**
+     * Creates a new PngTextExtractor instance.
+     */
+    public PngTextExtractor() {
+        super(new String[]{"image/png", "image/apng", "image/mng"});
+    }
+
+    /**
+     * Returns a reader for the text content of the given png image.
+     * Every tEXt chunk found before the IEND chunk is rendered as
+     * "keyword: text"; entries are separated by the platform line separator.
+     * <p>
+     * An empty reader is returned if the stream does not start with a PNG
+     * or MNG signature or if reading fails with an IOException. If the
+     * stream ends before an IEND chunk, or a chunk declares a length that
+     * does not fit in a non-negative int, the text collected so far is
+     * returned. tEXt chunks without a null separator between keyword and
+     * text are skipped.
+     *
+     * @param stream png image
+     * @param type ignored
+     * @param encoding ignored
+     * @return reader for the text content of the given png image,
+     *         or an empty reader if the image could not be parsed
+     * @throws IOException if the png image stream can not be closed
+     */
+    public Reader extractText(InputStream stream, String type, String encoding)
+            throws IOException {
+        try {
+            byte[] header = new byte[8];
+            boolean recognized = readFully(stream, header)
+                    && (Arrays.equals(PNG_HEADER, header)
+                            || Arrays.equals(MNG_HEADER, header));
+            if (!recognized) {
+                return new StringReader("");
+            }
+
+            CharArrayWriter writer = new CharArrayWriter();
+            byte[] length = new byte[4];
+            byte[] chunkType = new byte[4];
+            String sep = "";
+
+            // Each chunk is: 4 length bytes, 4 type bytes, data, 4 CRC bytes.
+            // The loop also ends when the stream runs out, so a truncated
+            // image cannot make it spin on stale buffer contents.
+            while (readFully(stream, length)
+                    && readFully(stream, chunkType)
+                    && !Arrays.equals(chunkType, IEND_CHUNK)) {
+                int chunkLength = calcLen(length);
+                if (chunkLength < 0) {
+                    // High bit set in the length field: not usable as a size.
+                    break;
+                }
+
+                long toSkip = CRC_LENGTH;
+                if (Arrays.equals(chunkType, TEXT_CHUNK)) {
+                    // NOTE(review): the buffer is sized from the declared
+                    // length, so a forged length can still force a large
+                    // allocation; consider capping it.
+                    byte[] txtBytes = new byte[chunkLength];
+                    if (!readFully(stream, txtBytes)) {
+                        break;
+                    }
+                    int nullPos = findOffset(txtBytes, (byte) 0);
+                    if (nullPos >= 0) {
+                        String key =
+                                new String(txtBytes, 0, nullPos, "ISO-8859-1");
+                        String value = new String(txtBytes, nullPos + 1,
+                                txtBytes.length - (nullPos + 1), "ISO-8859-1");
+                        // The separator goes in front of every entry but the
+                        // first, so entries never run together and no
+                        // trailing separator is produced.
+                        writer.write(sep);
+                        writer.write(key);
+                        writer.write(": ");
+                        writer.write(value);
+                        sep = LINE_SEPARATOR;
+                    }
+                } else {
+                    toSkip += chunkLength;
+                }
+
+                if (!skipFully(stream, toSkip)) {
+                    break;
+                }
+            }
+            return new CharArrayReader(writer.toCharArray());
+        } catch (IOException e) {
+            logger.warn("Failed to extract png text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Fills the given buffer completely from the stream, repeating the read
+     * when the stream delivers fewer bytes than requested.
+     *
+     * @param stream stream to read from
+     * @param buffer buffer to fill
+     * @return <code>true</code> if the buffer was filled,
+     *         <code>false</code> if the stream ended first
+     * @throws IOException if reading fails
+     */
+    private static boolean readFully(InputStream stream, byte[] buffer)
+            throws IOException {
+        int offset = 0;
+        while (offset < buffer.length) {
+            int count = stream.read(buffer, offset, buffer.length - offset);
+            if (count < 0) {
+                return false;
+            }
+            offset += count;
+        }
+        return true;
+    }
+
+    /**
+     * Skips exactly the given number of bytes, repeating the skip when the
+     * stream skips fewer bytes than requested.
+     *
+     * @param stream stream to skip in
+     * @param count number of bytes to skip
+     * @return <code>true</code> if all bytes were skipped,
+     *         <code>false</code> if the stream ended first
+     * @throws IOException if skipping fails
+     */
+    private static boolean skipFully(InputStream stream, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else if (stream.read() < 0) {
+                // skip() made no progress and a single read confirms the end.
+                return false;
+            } else {
+                remaining--;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Combines the four bytes of a chunk length field, most significant
+     * byte first, into an int.
+     *
+     * @param length the four length bytes
+     * @return the combined value; negative if the high bit of the first
+     *         byte is set
+     */
+    private int calcLen(byte[] length) {
+        return ((length[0] & 0xFF) << 24)
+                | ((length[1] & 0xFF) << 16)
+                | ((length[2] & 0xFF) << 8)
+                | (length[3] & 0xFF);
+    }
+
+    /**
+     * Returns the index of the first occurrence of a byte value.
+     *
+     * @param data bytes to search
+     * @param val value to look for
+     * @return index of the first match, or -1 if the value does not occur
+     */
+    int findOffset(byte[] data, byte val) {
+        for (int i = 0; i < data.length; i++) {
+            if (data[i] == val) {
+                return i;
+            }
+        }
+        return -1;
+    }
+}
Index: jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java
===================================================================
--- jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java (revision 0)
+++ jackrabbit-text-extractors/src/test/java/org/apache/jackrabbit/extractor/PngTextExtractorTest.java (revision 0)
@@ -0,0 +1,84 @@
+package org.apache.jackrabbit.extractor;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the PngTextExtractor class.
+ */
+public class PngTextExtractorTest extends TestCase {
+
+    /**
+     * Text extractor being tested.
+     */
+    private TextExtractor extractor;
+
+    /**
+     * Creates the text extractor to be tested.
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+        extractor = new PngTextExtractor();
+    }
+
+    /**
+     * Tests that the extractor supports image/png, image/apng and
+     * image/mng, and no other content types.
+     */
+    public void testContentTypes() {
+        Set types = new HashSet(Arrays.asList(extractor.getContentTypes()));
+        assertTrue(
+                "PngTextExtractor does not support image/png",
+                types.contains("image/png"));
+        assertTrue(
+                "PngTextExtractor does not support image/apng",
+                types.contains("image/apng"));
+        assertTrue(
+                "PngTextExtractor does not support image/mng",
+                types.contains("image/mng"));
+        assertEquals(
+                "PngTextExtractor supports unknown content types",
+                3, types.size());
+    }
+
+    /**
+     * Tests that the extractor correctly handles an empty stream.
+     *
+     * @throws IOException on IO errors; declared rather than caught so
+     *         that the original exception and its stack trace are reported
+     */
+    public void testEmptyStream() throws IOException {
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(new byte[0]), "image/png", null);
+        assertEquals(
+                "PngTextExtractor does not handle empty streams",
+                "", ExtractorHelper.read(reader));
+    }
+
+    /**
+     * Tests that the extractor correctly handles a normal stream.
+     *
+     * @throws IOException on IO errors
+     */
+    public void testNormalStream() throws IOException {
+        // PNG fixture: the signature followed by the chunks IHDR, bKGD,
+        // pHYs, tIME, tEXt, IDAT and IEND. The tEXt chunk carries the
+        // keyword "Comment", a null byte and the sentence asserted below.
+        byte[] png = {
+            -119, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82,
+            0, 0, 0, 1, 0, 0, 0, 1, 8, 6, 0, 0, 0, 31, 21, -60,
+            -119, 0, 0, 0, 6, 98, 75, 71, 68, 0, -1, 0, -1, 0, -1, -96,
+            -67, -89, -109, 0, 0, 0, 9, 112, 72, 89, 115, 0, 0, 11, 19, 0,
+            0, 11, 19, 1, 0, -102, -100, 24, 0, 0, 0, 7, 116, 73, 77, 69,
+            7, -40, 4, 6, 5, 59, 15, 72, -108, -3, -68, 0, 0, 0, 52, 116,
+            69, 88, 116, 67, 111, 109, 109, 101, 110, 116, 0, 84, 104, 101, 32, 113,
+            117, 105, 99, 107, 32, 98, 114, 111, 119, 110, 32, 102, 111, 120, 32, 106,
+            117, 109, 112, 115, 32, 111, 118, 101, 114, 32, 116, 104, 101, 32, 108, 97,
+            122, 121, 32, 100, 111, 103, 46, 55, 79, -28, -66, 0, 0, 0, 13, 73,
+            68, 65, 84, 8, -41, 99, -8, -33, -64, -16, 31, 0, 6, -128, 2, 127,
+            -21, 73, 116, -101, 0, 0, 0, 0, 73, 69, 78, 68, -82, 66, 96, -126};
+
+        Reader reader = extractor.extractText(
+                new ByteArrayInputStream(png), "image/png", null);
+        assertEquals(
+                "Comment: The quick brown fox jumps over the lazy dog.",
+                ExtractorHelper.read(reader));
+    }
+
+}