Index: jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties
===================================================================
--- jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties (revision 719600)
+++ jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties (working copy)
@@ -35,6 +35,7 @@
dir=application/x-director
dms=application/octet-stream
doc=application/msword
+docx=application/vnd.openxmlformats-officedocument.wordprocessingml.document
dvi=application/x-dvi
dxr=application/x-director
ecma=text/qhtml
@@ -93,6 +94,7 @@
pnm=image/x-portable-anymap
ppm=image/x-portable-pixmap
ppt=application/vnd.ms-powerpoint
+pptx=application/vnd.openxmlformats-officedocument.presentationml.presentation
ps=application/postscript
qhtml=text/qhtml
qt=video/quicktime
@@ -142,6 +144,7 @@
wrl=model/vrml
xbm=image/x-xbitmap
xls=application/vnd.ms-excel
+xlsx=application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
xml=text/xml
xpm=image/x-xpixmap
xwd=image/x-xwindowdump
Index: jackrabbit-ocm/pom.xml
===================================================================
--- jackrabbit-ocm/pom.xml (revision 719600)
+++ jackrabbit-ocm/pom.xml (working copy)
@@ -154,4 +154,8 @@
+
+ true
+
+
Index: jackrabbit-text-extractors/pom.xml
===================================================================
--- jackrabbit-text-extractors/pom.xml (revision 719600)
+++ jackrabbit-text-extractors/pom.xml (working copy)
@@ -47,13 +47,28 @@
-
+
org.apache.poi
poi
+ 3.5-beta3
org.apache.poi
+ openxml4j
+ 1.0-beta
+
+
+
+
+ org.apache.poi
poi-scratchpad
+ 3.5-beta3
pdfbox
Index: jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
===================================================================
--- jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (revision 0)
+++ jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (revision 0)
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.apache.poi.extractor.ExtractorFactory;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Text extractor for Microsoft Office documents (Word, PowerPoint, Excel and OOXML).
+ */
+public class MsTextExtractor extends AbstractTextExtractor {
+
+ /**
+ * Logger used to report extraction failures.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(MsTextExtractor.class);
+
+ /**
+ * Force loading of the dependent POI class when this class is initialized.
+ */
+ static {
+ ExtractorFactory.class.getName();
+ }
+
+ /**
+ * Creates a new MsTextExtractor instance for the supported MIME types.
+ */
+ public MsTextExtractor() {
+ super(new String[]{"application/vnd.ms-word",
+ "application/msword",
+ "application/vnd.ms-powerpoint",
+ "application/mspowerpoint",
+ "application/vnd.ms-excel",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"});
+ }
+
+ //-------------------------------------------------------< TextExtractor >
+
+ /**
+ * {@inheritDoc}
+ * Returns an empty reader if an error occurred while extracting text
+ * from the document; the given stream is closed in either case.
+ */
+ public Reader extractText(InputStream stream,
+ String type,
+ String encoding) throws IOException {
+ try {
+ String text = ExtractorFactory.createExtractor(stream).getText();
+ return new StringReader(text);
+ } catch (Exception e) {
+ logger.warn("Failed to extract Microsoft Document text content", e);
+ return new StringReader("");
+ } finally {
+ stream.close();
+ }
+ }
+
+}
Property changes on: jackrabbit-text-extractors\src\main\java\org\apache\jackrabbit\extractor\MsTextExtractor.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision Rev URL
Added: svn:eol-style
+ native