Details
-
Bug
-
Status: Closed
-
Major
-
Resolution: Not A Bug
-
2.0.7
-
None
Description
I'm trying to extract text from a PDF file and save it to an XML file.
The PDF file includes italic and strikethrough text, but I cannot detect either style through the PDFont class.
Below are the code and the resulting output.
public class TextExtractor extends PDFTextStripper { private static final Log LOG = LogFactory.getLog(PDFTextStripper.class); private final HashMap<TextPosition, String> colors = new HashMap<>(); public TextExtractor() throws IOException { addOperator(new SetNonStrokingColorSpace()); addOperator(new SetNonStrokingDeviceCMYKColor()); addOperator(new SetNonStrokingDeviceRGBColor()); addOperator(new SetNonStrokingDeviceGrayColor()); addOperator(new SetNonStrokingColor()); addOperator(new SetNonStrokingColorN()); } @Override protected void startDocument(PDDocument document) throws IOException { super.startDocument(document); super.writeString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<document>\n"); } @Override protected void endDocument(PDDocument document) throws IOException { super.endDocument(document); super.writeString("</document>\n"); } @Override protected void startPage(PDPage page) throws IOException { super.startPage(page); super.writeString(String.format(" <page width=\"%f\" height=\"%f\">\n", page.getBBox().getWidth(), page.getBBox().getHeight())); } @Override protected void endPage(PDPage page) throws IOException { super.endPage(page); super.writeString(" </page>\n"); } @Override protected void processTextPosition(TextPosition text) { super.processTextPosition(text); PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor(); try { String hex = Integer.toHexString(nonStrokingColor.toRGB() & 0xffffff); while (hex.length() < 6) { hex = "0" + hex; } colors.put(text, "#" + hex); } catch (IOException e) { e.printStackTrace(); } } @Override protected void writeString(String string, List<TextPosition> textPositions) throws IOException { StringBuilder builder = new StringBuilder(" <line>\n"); String[] words = string.split(this.getWordSeparator()); int startIndex = 0; for (String word : words) { if(Strings.isNullOrEmpty(word)){ continue; } TextPosition startPosition = textPositions.get(startIndex); String color = colors.get(startPosition); String font 
= startPosition.getFont().getName(); float fontSize = startPosition.getFontSize(); float x = startPosition.getX(); float y = startPosition.getY(); TextPosition endPosition = textPositions.get(startIndex + word.length() - 1); float width = endPosition.getEndX() - startPosition.getX(); float height = startPosition.getHeight(); String template =" <word x=\"%f\" y=\"%f\" width=\"%f\" height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n"; builder.append(String.format(template, x, y, width, height, font, fontSize, color, escape(word))); startIndex += word.length() + 1; } builder.append(" </line>"); super.writeString(builder.toString()); } /** * Escape some HTML characters. * * @param chars String to be escaped * @return returns escaped String. */ private static String escape(String chars) { StringBuilder builder = new StringBuilder(chars.length()); for (int i = 0; i < chars.length(); i++) { appendEscaped(builder, chars.charAt(i)); } return builder.toString(); } private static void appendEscaped(StringBuilder builder, char character) { // write non-ASCII as named entities if ((character < 32) || (character > 126)) { int charAsInt = character; builder.append("&#").append(charAsInt).append(";"); } else { switch (character) { case 34: builder.append("""); break; case 38: builder.append("&"); break; case 60: builder.append("<"); break; case 62: builder.append(">"); break; default: builder.append(String.valueOf(character)); } } } }
<document> <page width="595.000000" height="842.000000"> <line> <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" font="LucidaGrande" font-size="28" color="#000000">Title</word> </line> <line> <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" font="LucidaGrande" font-size="14" color="#000000">Italic</word> </line> <line> <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word> <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">and</word> <word x="115.453125" y="175.000000" width="39.458496" height="10.562654" font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word> </line> <line> <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word> </line> <line> <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word> </line> <line> <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">some</word> <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">text</word> </line> <line> <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">new</word> <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">line</word> <word x="108.687500" y="319.000000" width="25.350250" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">test</word> </line> </page> </document>