Details
-
Bug
-
Status: Open
-
Major
-
Resolution: Unresolved
-
1.24.1
-
None
-
None
-
Currently testing on a Windows 10 environment, but the issue can be replicated in a Linux environment as well.
Description
Unable to parse some OneNote files with the .one extension. I am able to parse some OneNote section files, but not others; the failure may be related to the data content. I am using the following sample code to parse the files. An org.apache.tika.exception.TikaException: RESERVED_NONZERO exception is thrown for each failed file.
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.onenote.OneNoteParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class DocumentParser {
private static Logger logger = LoggerFactory.getLogger(DocumentParser.class);
public static void main(String[] args) throws IOException, SAXException, TikaException {
logger.info("DocumentParser.main Begin");
if (args.length != 1)
{ System.out.println("syntax: DocumentParser <inputFileName>"); System.exit(-1); } String inputFileName = args[0];
String outputFileName = inputFileName + ".txt";
logger.info("Processing " + inputFileName);
ContentHandler contentHandler = new CustomBodyContentHandler(-1);
Metadata metadata = new Metadata();
OneNoteParser oneNoteParser = new OneNoteParser();
ParseContext parseContext = new ParseContext();
File file = new File(inputFileName);
InputStream inputStream = new DataInputStream(new FileInputStream(file));
logger.debug("metadata size: " + metadata.size());
logger.debug("metadata: " + metadata);
logger.debug("Implementing Parser class: " + parseContext.get(Parser.class));
logger.debug("SAXParser class: " + parseContext.getSAXParser().getClass().getCanonicalName());
oneNoteParser.parse(inputStream, contentHandler, metadata, parseContext);
inputStream.close();
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
try (InputStream contentsStream = new ByteArrayInputStream(contentHandler.toString().getBytes())) {
byte[] buffer = new byte[10240];
for (int length; (length = contentsStream.read(buffer)) > 0; )
}
}
logger.info("Finished Writing to " + outputFileName);
logger.info("DocumentParser.main End");
}
}
20-09-10 20:50:44:372 INFO main parser.DocumentParser:24 - DocumentParser.main Begin
20-09-10 20:50:44:375 INFO main parser.DocumentParser:33 - Processing D:\\temp
100603_N150316 - Notes.one
20-09-10 20:50:45:118 INFO main parser.CustomBodyContentHandler:14 - CustomBodyContentHandler constructed
20-09-10 20:50:45:134 DEBUG main parser.DocumentParser:40 - metadata size: 0
20-09-10 20:50:45:135 DEBUG main parser.DocumentParser:41 - metadata:
20-09-10 20:50:45:136 DEBUG main parser.DocumentParser:42 - Implementing Parser class: null
20-09-10 20:50:45:326 DEBUG main parser.DocumentParser:43 - SAXParser class: org.apache.xerces.jaxp.SAXParserImpl
log4j:WARN No appenders could be found for logger (org.apache.tika.parser.microsoft.onenote.OneNoteParser).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" org.apache.tika.exception.TikaException: RESERVED_NONZERO
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:687)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:680)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:680)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNode(OneNotePtr.java:680)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeListFragment(OneNotePtr.java:341)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.internalDeserializeFileNodeList(OneNotePtr.java:301)
at org.apache.tika.parser.microsoft.onenote.OneNotePtr.deserializeFileNodeList(OneNotePtr.java:316)
at org.apache.tika.parser.microsoft.onenote.OneNoteParser.createOneNoteDocumentFromDirectFileResource(OneNoteParser.java:172)
at org.apache.tika.parser.microsoft.onenote.OneNoteParser.parse(OneNoteParser.java:80)
at com.gm.bigdata.training.parser.DocumentParser.main(DocumentParser.java:44)