Index: jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties =================================================================== --- jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties (revision 719600) +++ jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties (working copy) @@ -35,6 +35,7 @@ dir=application/x-director dms=application/octet-stream doc=application/msword +docx=application/vnd.openxmlformats-officedocument.wordprocessingml.document dvi=application/x-dvi dxr=application/x-director ecma=text/qhtml @@ -93,6 +94,7 @@ pnm=image/x-portable-anymap ppm=image/x-portable-pixmap ppt=application/vnd.ms-powerpoint +pptx=application/vnd.openxmlformats-officedocument.presentationml.presentation ps=application/postscript qhtml=text/qhtml qt=video/quicktime @@ -142,6 +144,7 @@ wrl=model/vrml xbm=image/x-xbitmap xls=application/vnd.ms-excel +xlsx=application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xml=text/xml xpm=image/x-xpixmap xwd=image/x-xwindowdump Index: jackrabbit-ocm/pom.xml =================================================================== --- jackrabbit-ocm/pom.xml (revision 719600) +++ jackrabbit-ocm/pom.xml (working copy) @@ -154,4 +154,8 @@ + + true + + Index: jackrabbit-text-extractors/pom.xml =================================================================== --- jackrabbit-text-extractors/pom.xml (revision 719600) +++ jackrabbit-text-extractors/pom.xml (working copy) @@ -47,13 +47,28 @@ - + org.apache.poi poi + 3.5-beta3 org.apache.poi + openxml4j + 1.0-beta + + + + + org.apache.poi poi-scratchpad + 3.5-beta3 pdfbox Index: jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java =================================================================== --- jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (revision 0) +++ 
jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (revision 0) @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.extractor; + +import org.apache.poi.extractor.ExtractorFactory; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Reader; +import java.io.InputStream; +import java.io.IOException; +import java.io.StringReader; + +/** + * Text extractor for Microsoft Office documents (Word, PowerPoint, Excel and their OOXML variants). + */ +public class MsTextExtractor extends AbstractTextExtractor { + + /** + * Logger instance. + */ + private static final Logger logger = + LoggerFactory.getLogger(MsTextExtractor.class); + + /** + * Force loading of dependent class. + */ + static { + ExtractorFactory.class.getName(); + } + + /** + * Creates a new MsTextExtractor instance registered for the MIME types listed below. 
+ */ + public MsTextExtractor() { + super(new String[]{"application/vnd.ms-word", + "application/msword", + "application/vnd.ms-powerpoint", + "application/mspowerpoint", + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}); + } + + //-------------------------------------------------------< TextExtractor > + + /** + * {@inheritDoc} + * Returns an empty reader if an error occurred extracting text from + * the document (the failure is logged). The stream is always closed. + */ + public Reader extractText(InputStream stream, + String type, + String encoding) throws IOException { + try { + String text = ExtractorFactory.createExtractor(stream).getText(); + return new StringReader(text); + } catch (Exception e) { + logger.warn("Failed to extract Microsoft Document text content", e); + return new StringReader(""); + } finally { + stream.close(); + } + } + +} Property changes on: jackrabbit-text-extractors\src\main\java\org\apache\jackrabbit\extractor\MsTextExtractor.java ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Rev URL Added: svn:eol-style + native