Uploaded image for project: 'PDFBox'
  1. PDFBox
  2. PDFBOX-3879

Not able to get font styles such as italic and strikethrough

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Closed
    • Major
    • Resolution: Not A Bug
    • 2.0.7
    • None
    • Text extraction

    Description

      I'm trying to extract text from a PDF file and save it to an XML file.
      The PDF file includes italic and strikethrough text, but I cannot detect these styles with the PDFont class.
      Below are the code and the result.

      /**
       * Text stripper that writes the extracted text as a small XML document:
       * {@code <document>} / {@code <page>} / {@code <line>} / {@code <word>}.
       * Each word element carries the position, size, font name, font size and
       * non-stroking (fill) color of the word's first glyph.
       */
      public class TextExtractor extends PDFTextStripper {
          private static final Log LOG = LogFactory.getLog(TextExtractor.class);

          /** Numeric reference for U+FFFD, written in place of characters XML 1.0 forbids. */
          private static final String REPLACEMENT_ENTITY = "&#65533;";

          /** Fill color ("#rrggbb") recorded for every text position seen while processing. */
          private final HashMap<TextPosition, String> colors = new HashMap<>();

          /**
           * Registers the non-stroking color operators so the graphics state
           * holds the current fill color when text positions are processed.
           *
           * @throws IOException declared by the superclass constructor
           */
          public TextExtractor() throws IOException {
              addOperator(new SetNonStrokingColorSpace());
              addOperator(new SetNonStrokingDeviceCMYKColor());
              addOperator(new SetNonStrokingDeviceRGBColor());
              addOperator(new SetNonStrokingDeviceGrayColor());
              addOperator(new SetNonStrokingColor());
              addOperator(new SetNonStrokingColorN());
          }

          /** Writes the XML declaration and opens the root element. */
          @Override
          protected void startDocument(PDDocument document) throws IOException {
              super.startDocument(document);
              super.writeString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<document>\n");
          }

          /** Closes the root element. */
          @Override
          protected void endDocument(PDDocument document) throws IOException {
              super.endDocument(document);
              super.writeString("</document>\n");
          }

          /** Opens a page element carrying the width and height of the page's bounding box. */
          @Override
          protected void startPage(PDPage page) throws IOException {
              super.startPage(page);
              // Locale.ROOT keeps the decimal separator a '.' regardless of the default locale.
              super.writeString(String.format(java.util.Locale.ROOT,
                      "  <page width=\"%f\" height=\"%f\">\n",
                      page.getBBox().getWidth(), page.getBBox().getHeight()));
          }

          /** Closes the page element. */
          @Override
          protected void endPage(PDPage page) throws IOException {
              super.endPage(page);
              super.writeString("  </page>\n");
          }

          /**
           * Records the current non-stroking color of the graphics state for the
           * given text position. If the color cannot be converted to RGB the
           * failure is logged and no color is recorded for that position.
           */
          @Override
          protected void processTextPosition(TextPosition text) {
              super.processTextPosition(text);
              PDColor nonStrokingColor = getGraphicsState().getNonStrokingColor();
              try {
                  int rgb = nonStrokingColor.toRGB() & 0xffffff;
                  colors.put(text, String.format(java.util.Locale.ROOT, "#%06x", rgb));
              } catch (IOException e) {
                  LOG.warn("Could not convert the non-stroking color to RGB", e);
              }
          }

          /**
           * Writes one {@code <line>} element containing a {@code <word>} element
           * for every non-empty token obtained by splitting {@code string} at the
           * word separator.
           *
           * @param string        text of the line
           * @param textPositions text positions belonging to {@code string}; this
           *                      method assumes one entry per character of
           *                      {@code string} (NOTE(review): confirm for ligatures
           *                      and other multi-character glyphs)
           * @throws IOException if writing to the output fails
           */
          @Override
          protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
              StringBuilder builder = new StringBuilder("    <line>\n");
              String separator = this.getWordSeparator();
              // The separator is literal text, not a regular expression.
              String[] words = string.split(java.util.regex.Pattern.quote(separator));
              String template ="      <word x=\"%f\" y=\"%f\" width=\"%f\" height=\"%f\" font=\"%s\" font-size=\"%.0f\" color=\"%s\">%s</word>\n";
              int startIndex = 0;
              for (String word : words) {
                  if (!word.isEmpty()) {
                      if (startIndex >= textPositions.size()) {
                          // Fewer positions than characters: nothing left to describe.
                          break;
                      }
                      // Clamp so a short position list cannot cause an out-of-bounds access.
                      int endIndex = Math.min(startIndex + word.length(), textPositions.size()) - 1;
                      TextPosition startPosition = textPositions.get(startIndex);
                      TextPosition endPosition = textPositions.get(endIndex);
                      String color = colors.get(startPosition);
                      String font = escape(String.valueOf(startPosition.getFont().getName()));
                      float fontSize = startPosition.getFontSize();
                      float x = startPosition.getX();
                      float y = startPosition.getY();
                      float width = endPosition.getEndX() - x;
                      float height = startPosition.getHeight();
                      builder.append(String.format(java.util.Locale.ROOT, template,
                              x, y, width, height, font, fontSize, color, escape(word)));
                  }
                  // Advance past the token and the separator that followed it, also for
                  // empty tokens, so later words stay aligned with their positions.
                  startIndex += word.length() + separator.length();
              }
              builder.append("    </line>");
              super.writeString(builder.toString());
          }

          /**
           * Escapes a string for use as XML text or as a double-quoted attribute value.
           *
           * @param chars String to be escaped
           * @return returns escaped String.
           */
          private static String escape(String chars)
          {
              StringBuilder builder = new StringBuilder(chars.length());
              // Walk code points, not chars, so surrogate pairs become a single reference.
              for (int i = 0; i < chars.length(); )
              {
                  int codePoint = chars.codePointAt(i);
                  appendEscaped(builder, codePoint);
                  i += Character.charCount(codePoint);
              }
              return builder.toString();
          }

          /**
           * Appends one code point: printable ASCII is written as is except for
           * the four markup characters, everything else allowed by XML 1.0 is
           * written as a decimal character reference, and code points XML 1.0
           * forbids are replaced by U+FFFD.
           */
          private static void appendEscaped(StringBuilder builder, int codePoint)
          {
              if (!isXmlChar(codePoint))
              {
                  builder.append(REPLACEMENT_ENTITY);
              }
              else if ((codePoint < 32) || (codePoint > 126))
              {
                  // write non-ASCII (and tab/newline/carriage return) as numeric references
                  builder.append("&#").append(codePoint).append(";");
              }
              else
              {
                  switch (codePoint)
                  {
                      case '"':
                          builder.append("&quot;");
                          break;
                      case '&':
                          builder.append("&amp;");
                          break;
                      case '<':
                          builder.append("&lt;");
                          break;
                      case '>':
                          builder.append("&gt;");
                          break;
                      default:
                          builder.appendCodePoint(codePoint);
                  }
              }
          }

          /** Returns whether the code point matches the XML 1.0 {@code Char} production. */
          private static boolean isXmlChar(int codePoint)
          {
              return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
                      || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                      || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                      || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
          }
      }
      
      <document>
        <page width="595.000000" height="842.000000">
          <line>
            <word x="48.000000" y="89.000000" width="59.843376" height="20.234375" font="LucidaGrande" font-size="28" color="#000000">Title</word>
          </line>
          <line>
            <word x="48.000000" y="139.000000" width="32.190125" height="10.562654" font="LucidaGrande" font-size="14" color="#000000">Italic</word>
          </line>
          <line>
            <word x="48.000000" y="175.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
            <word x="84.171875" y="175.000000" width="26.590248" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">and</word>
            <word x="115.453125" y="175.000000" width="39.458496" height="10.562654" font="LucidaGrande-Bold" font-size="14" color="#000000">Italic.</word>
          </line>
          <line>
            <word x="48.000000" y="211.000000" width="31.480873" height="10.117188" font="LucidaGrande-Bold" font-size="14" color="#000000">Bold</word>
          </line>
          <line>
            <word x="48.000000" y="247.000000" width="92.764618" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">Strikethrough</word>
          </line>
          <line>
            <word x="48.000000" y="283.000000" width="36.523254" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">some</word>
            <word x="89.000000" y="283.000000" width="26.803375" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">text</word>
          </line>
          <line>
            <word x="48.000000" y="319.000000" width="27.180374" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">new</word>
            <word x="79.687500" y="319.000000" width="24.523247" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">line</word>
            <word x="108.687500" y="319.000000" width="25.350250" height="10.117188" font="LucidaGrande" font-size="14" color="#000000">test</word>
          </line>
        </page>
      </document>
      

      Attachments

        1. src.pdf
          10 kB
          Sun Peter

        Activity

          People

            Unassigned Unassigned
            perfectspr Sun Peter
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: