Details
-
Bug
-
Status: Closed
-
Major
-
Resolution: Fixed
-
1.2.1
-
None
-
None
Description
Extracting text from the attached PDF with PDFBox 1.2.1 fails with the following error:
dylan@dylan-laptop:~/desktop/pdfbox$ java org.apache.pdfbox.ExtractText -console Document2.v1.20100630.pdf
Jul 12, 2010 9:00:31 AM org.apache.pdfbox.util.PDFStreamEngine processOperator
WARNING: java.io.IOException: Error: expected hex character and not :32
java.io.IOException: Error: expected hex character and not :32
at org.apache.fontbox.cmap.CMapParser.parseNextToken(CMapParser.java:336)
at org.apache.fontbox.cmap.CMapParser.parse(CMapParser.java:139)
at org.apache.pdfbox.pdmodel.font.PDFont.parseCmap(PDFont.java:556)
at org.apache.pdfbox.pdmodel.font.PDFont.encode(PDFont.java:390)
at org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEngine.java:386)
at org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.java:61)
at org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:567)
at org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:250)
at org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:208)
at org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:378)
at org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:302)
at org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:258)
at org.apache.pdfbox.ExtractText.main(ExtractText.java:236)
Jul 12, 2010 9:00:31 AM org.apache.pdfbox.util.PDFStreamEngine processOperator
WARNING: java.io.IOException: Error: expected hex character and not :32
java.io.IOException: Error: expected hex character and not :32
at org.apache.fontbox.cmap.CMapParser.parseNextToken(CMapParser.java:336)
at org.apache.fontbox.cmap.CMapParser.parse(CMapParser.java:139)
at org.apache.pdfbox.pdmodel.font.PDFont.parseCmap(PDFont.java:556)
at org.apache.pdfbox.pdmodel.font.PDFont.encode(PDFont.java:390)
at org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEngine.java:386)
at org.apache.pdfbox.util.operator.ShowText.process(ShowText.java:45)
at org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:567)
at org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:250)
at org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:208)
at org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:378)
at org.apache.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:302)
at org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:258)
at org.apache.pdfbox.ExtractText.main(ExtractText.java:236)
Jul 12, 2010 9:00:31 AM org.apache.pdfbox.util.PDFStreamEngine processOperator