Uploaded image for project: 'Tika'
  1. Tika
  2. TIKA-3194

TikaException: RESERVED_NONZERO when parsing some OneNote files

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • 1.24.1
    • None
    • parser
    • None
    • Currently testing in a Windows 10 environment, but the issue can also be replicated in a Linux environment.

    Description

      Some OneNote files with the .one extension cannot be parsed. I am able to parse some OneNote section files, but not others; the failure may be related to the data content. The following sample code is used to parse the files. An org.apache.tika.exception.TikaException: RESERVED_NONZERO exception is thrown for each failed file.

      import java.io.ByteArrayInputStream;
      import java.io.DataInputStream;
      import java.io.File;
      import java.io.FileInputStream;
      import java.io.FileOutputStream;
      import java.io.IOException;
      import java.io.InputStream;
      import org.apache.tika.exception.TikaException;
      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.Parser;
      import org.apache.tika.parser.ParseContext;
      import org.apache.tika.parser.microsoft.onenote.OneNoteParser;
      import org.slf4j.Logger;
      import org.slf4j.LoggerFactory;
      import org.xml.sax.ContentHandler;
      import org.xml.sax.SAXException;

      /**
       * Command-line reproducer that runs Tika's {@link OneNoteParser} over a single file and
       * writes the content handler's text to {@code <inputFileName>.txt}.
       */
      public class DocumentParser {
          private static final Logger logger = LoggerFactory.getLogger(DocumentParser.class);

          /**
           * Parses the file named by the single command-line argument and writes the result of
           * {@code contentHandler.toString()} next to it with a {@code .txt} suffix.
           *
           * @param args exactly one element: the path of the file to parse
           * @throws IOException if the input cannot be read or the output cannot be written
           * @throws SAXException if the content handler rejects a SAX event
           * @throws TikaException if the parser cannot process the document
           */
          public static void main(String[] args) throws IOException, SAXException, TikaException {
              logger.info("DocumentParser.main Begin");

              if (args.length != 1) {
                  System.out.println("syntax: DocumentParser <inputFileName>");
                  System.exit(-1);
              }

              String inputFileName = args[0];
              String outputFileName = inputFileName + ".txt";
              logger.info("Processing {}", inputFileName);

              ContentHandler contentHandler = new CustomBodyContentHandler(-1);
              Metadata metadata = new Metadata();
              OneNoteParser oneNoteParser = new OneNoteParser();
              ParseContext parseContext = new ParseContext();
              File file = new File(inputFileName);

              // try-with-resources closes the stream even when parse() throws, which is the
              // failure path this reproducer is meant to exercise.
              try (InputStream inputStream = new FileInputStream(file)) {
                  logger.debug("metadata size: {}", metadata.size());
                  logger.debug("metadata: {}", metadata);
                  logger.debug("Implementing Parser class: {}", parseContext.get(Parser.class));
                  logger.debug("SAXParser class: {}",
                          parseContext.getSAXParser().getClass().getCanonicalName());
                  oneNoteParser.parse(inputStream, contentHandler, metadata, parseContext);
              }

              // Encode explicitly as UTF-8 so the output file does not depend on the platform
              // default charset; the bytes are already in memory, so write them in one call.
              byte[] contents =
                      contentHandler.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
              try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
                  fileOutputStream.write(contents);
              }

              logger.info("Finished Writing to {}", outputFileName);
              logger.info("DocumentParser.main End");
          }
      }

      20-09-10 20:50:44:372 INFO main parser.DocumentParser:24 - DocumentParser.main Begin
      20-09-10 20:50:44:375 INFO main parser.DocumentParser:33 - Processing D:\\temp
      100603_N150316 - Notes.one
      20-09-10 20:50:45:118 INFO main parser.CustomBodyContentHandler:14 - CustomBodyContentHandler constructed
      20-09-10 20:50:45:134 DEBUG main parser.DocumentParser:40 - metadata size: 0
      20-09-10 20:50:45:135 DEBUG main parser.DocumentParser:41 - metadata:
      20-09-10 20:50:45:136 DEBUG main parser.DocumentParser:42 - Implementing Parser class: null
      20-09-10 20:50:45:326 DEBUG main parser.DocumentParser:43 - SAXParser class: org.apache.xerces.jaxp.SAXParserImpl
      log4j:WARN No appenders could be found for logger (org.apache.tika.parser.microsoft.onenote.OneNoteParser).
      log4j:WARN Please initialize the log4j system properly.
      log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
      Exception in thread "main" org.apache.tika.exception.TikaException: RESERVED_NONZERO
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:687)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:680)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:680)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:680)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
      at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
      at org.apache.tika.parser.microsoft.onenote.OneNoteParser.createOneNoteDocumentFromDirectFileResource(OneNoteParser.java:172)
      at org.apache.tika.parser.microsoft.onenote.OneNoteParser.parse(OneNoteParser.java:80)
      at com.gm.bigdata.training.parser.DocumentParser.main(DocumentParser.java:44)

      Attachments

        Activity

          People

            Unassigned Unassigned
            nikhilkumar Nikhil Kumar
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated: