Description
If I use Apache POI's HWPF component to create an MS Word (.doc) document and pass it to tika.parseToString(is), it returns just a newline ("\n"). I tested this with many different input texts, and adding paragraphs doesn't help.
/**
 * Builds a Word (.doc) file from the bundled {@code /poi/template.doc} template: inserts
 * {@code from} via {@code Range.insertBefore}, sets the inserted run's font size to 11, adds a
 * custom document property {@code myProperty}, and writes the result to {@code file}.
 *
 * @param from text inserted before the template's document range
 * @param file destination file the generated document is written to
 * @throws IllegalStateException if the template resource cannot be found on the classpath
 * @throws Exception if reading the template or writing the output file fails
 */
private void createDOCDocument(String from, File file) throws Exception {
    // Look the template up through DOCGenerator's own class loader. The expression
    // DOCGenerator.class.getClass() is java.lang.Class, which is loaded by the bootstrap
    // loader, so the lookup bypassed the loader that actually holds this project's resources.
    java.io.InputStream template = DOCGenerator.class.getResourceAsStream("/poi/template.doc");
    if (template == null) {
        throw new IllegalStateException("Template not found on classpath: /poi/template.doc");
    }

    HWPFDocument doc;
    try {
        doc = new HWPFDocument(new POIFSFileSystem(template));
    } finally {
        // Release the template stream whether or not parsing succeeded.
        template.close();
    }

    Range range = doc.getRange();
    CharacterRun run1 = range.insertBefore(from);
    run1.setFontSize(11);

    DocumentSummaryInformation dsi = doc.getDocumentSummaryInformation();
    CustomProperties cp = dsi.getCustomProperties();
    if (cp == null) {
        // The template carries no custom properties yet; start with an empty set.
        cp = new CustomProperties();
    }
    cp.put("myProperty", "foo bar baz");
    dsi.setCustomProperties(cp);

    FileOutputStream out = new FileOutputStream(file);
    try {
        doc.write(out);
    } finally {
        // Always close the output so the file handle is not leaked on a failed write.
        out.close();
    }
}
/**
 * Extracts the text content of the given document stream with a fresh {@code Tika} facade,
 * limiting the returned string length to {@code maxCharCount} (narrowed to {@code int}).
 *
 * @param is stream containing the document to parse
 * @return the text returned by {@code Tika.parseToString}
 * @throws SystemException if Tika reports an {@code IOException} or {@code TikaException};
 *         the original exception is preserved as the cause
 */
protected String extractText(InputStream is) throws SystemException {
    Tika tika = new Tika();
    // Long.valueOf replaces the deprecated new Long(...) boxing constructor; the narrowing
    // intValue() conversion is kept exactly as before.
    tika.setMaxStringLength(Long.valueOf(maxCharCount).intValue());
    try {
        return tika.parseToString(is);
    } catch (IOException ioe) {
        throw new SystemException(ioe.getMessage(), ioe);
    } catch (TikaException te) {
        throw new SystemException(te.getMessage(), te);
    }
}