Details
Description
Java Code:
/**
 * Reproduction snippet for the reported failure: prepares a Tika
 * {@code AutoDetectParser}, a {@code Metadata} object and a
 * {@code BodyContentHandler} that wraps a {@code ToXMLContentHandler},
 * then opens the file at {@code filePath}.
 *
 * NOTE(review): the snippet is truncated as pasted - the try statement below
 * has no body and the method has no return statement. The stack trace in this
 * report shows this method calling {@code AutoDetectParser.parse}
 * (TTRAnalysis.java:39), so presumably the missing body is
 * {@code parser.parse(stream, handler, metadata)} followed by returning
 * {@code handler.toString()} - confirm with the reporter.
 *
 * @param filePath path of the file to open and parse
 * @throws SAXException the failure reported here is a SAXException with the
 *         message "Namespace http://www.w3.org/1999/xhtml not declared"
 */
public static String parseBodyToHTML(String filePath) throws IOException, SAXException, TikaException
{
// BodyContentHandler layered over ToXMLContentHandler: in the reported trace
// the failing startElement call reaches ToXMLContentHandler via
// MatchingContentHandler and throws when it looks up the element's namespace prefix.
ContentHandler handler = new BodyContentHandler(new ToXMLContentHandler());
AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
// try-with-resources so the input stream is closed when the (missing) body completes.
try (FileInputStream stream =new FileInputStream(new File(filePath)))
}
When I call this function on some files (the stack trace below is from a PDF), I get the following error:
Exception in thread "main" org.xml.sax.SAXException: Namespace http://www.w3.org/1999/xhtml not declared
at org.apache.tika.sax.ToXMLContentHandler$ElementInfo.getPrefix(ToXMLContentHandler.java:62)
at org.apache.tika.sax.ToXMLContentHandler$ElementInfo.getQName(ToXMLContentHandler.java:68)
at org.apache.tika.sax.ToXMLContentHandler.startElement(ToXMLContentHandler.java:148)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.xpath.MatchingContentHandler.startElement(MatchingContentHandler.java:60)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.SecureContentHandler.startElement(SecureContentHandler.java:250)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at org.apache.tika.sax.SafeContentHandler.startElement(SafeContentHandler.java:264)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:254)
at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:291)
at org.apache.tika.parser.pdf.PDF2XHTML.startPage(PDF2XHTML.java:225)
at org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:437)
at org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:383)
at org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:342)
at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:148)
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:148)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:136)
at TTR.TTRAnalysis.parseBodyToHTML(TTRAnalysis.java:39)