Uploaded image for project: 'PDFBox'
  1. PDFBox
  2. PDFBOX-3581

PDFTextStripper not working with multiple threads

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Closed
    • Major
    • Resolution: Not A Problem
    • 2.0.3
    • None
    • Text extraction
    • Ubuntu 15.1

    Description

      Hi, I am trying to use PDFBox to extract text from a list of files. The problem is that PDFTextStripper does not work in threaded mode: when I run it from multiple threads, nothing happens. Is this a bug or a limitation?

      Could you help me?
      Thanks.

      I have added the full class

       /**
        * Recursively walks {@code scanDirectory} and starts one
        * {@code PDFExtractThread} for every non-directory entry found.
        *
        * @param scanDirectory directory to scan; if its contents cannot be listed
        *                      (not a directory, or an I/O error) nothing is done
        */
       private void scanFolderFiles(File scanDirectory) {
           // File.listFiles() returns null, not an empty array, when the path is not
           // a directory or an I/O error occurs, so guard before iterating.
           File[] filesScan = scanDirectory.listFiles();
           if (filesScan == null) {
               return;
           }

           for (final File fileEntry : filesScan) {
               if (fileEntry.isDirectory()) {
                   scanFolderFiles(fileEntry);
                   continue;
               }
               try {
                   // NOTE(review): this starts one thread per file with no upper bound
                   // and never joins them; a bounded executor may be preferable.
                   new PDFExtractThread(fileEntry).start();
               } catch (RuntimeException e) {
                   // Best effort: a worker that fails to start must not abort the scan.
                   e.printStackTrace();
               }
           }
       }
      
      
      ////////////////////////////////////////////////////////////////////////////////////////////
      package org.pdfextractor;
      
      
      import org.apache.pdfbox.pdmodel.PDDocument;
      import org.apache.pdfbox.text.PDFTextStripper;
      import java.io.*;
      
      /**
       * Created by dru on 21.11.2016.
       */
      /**
       * Worker thread that extracts the text of one PDF file and writes it to
       * {@code <lower-cased base name>.txt}, resolved against the current working
       * directory.
       *
       * <p>Each instance loads its own {@link PDDocument} and creates its own
       * {@link PDFTextStripper}; no PDFBox object is shared between threads.
       *
       * <p>NOTE(review): the output name is derived from the file name only, so two
       * PDFs with the same name in different folders write to the same output file.
       */
      class PDFExtractThread extends Thread {

          private static final String PDF_SUFFIX = ".pdf";

          private final File readFile;
          private String fileName;
          private PDDocument document;

          /**
           * @param readFile the PDF file this thread will process
           */
          public PDFExtractThread(File readFile) {
              this.readFile = readFile;
          }

          /**
           * Loads the PDF, extracts its text to the output file and closes the
           * document. Failures are reported on standard error and end this thread
           * only; they are not propagated.
           */
          @Override
          public void run() {
              // try-with-resources closes the document even when extraction fails.
              try (PDDocument loaded = PDDocument.load(this.readFile)) {
                  this.document = loaded;
                  this.fileName = baseName(this.readFile.getName());
                  pdfBoxExtractText();
              } catch (Exception e) {
                  System.err.println("Text extraction failed for " + this.readFile);
                  e.printStackTrace();
              } finally {
                  // Do not keep a reference to a document that has been closed.
                  this.document = null;
              }
          }

          /**
           * Extracts the text of the currently loaded document and writes it to
           * {@code fileName + ".txt"}.
           *
           * @throws IllegalStateException if no document is currently loaded
           * @throws Exception if the text cannot be extracted or the output file
           *                   cannot be written
           */
          public void pdfBoxExtractText() throws Exception {
              if (this.document == null) {
                  throw new IllegalStateException("No document loaded; run() must load it first");
              }

              PDFTextStripper pdfStripper = new PDFTextStripper();
              System.out.println(this.fileName);

              // Extract before opening the output so a failed extraction does not
              // leave behind an empty or truncated file.
              String text = pdfStripper.getText(this.document);

              try (BufferedWriter writer = new BufferedWriter(new FileWriter(this.fileName + ".txt"))) {
                  writer.write(text);
              }
          }

          /**
           * Returns the lower-cased file name with a trailing {@code .pdf} removed.
           * Only the suffix is stripped; occurrences elsewhere in the name are kept.
           */
          private static String baseName(String name) {
              // Locale.ROOT keeps the result independent of the default locale.
              String lower = name.toLowerCase(java.util.Locale.ROOT);
              if (lower.endsWith(PDF_SUFFIX)) {
                  return lower.substring(0, lower.length() - PDF_SUFFIX.length());
              }
              return lower;
          }
      }
      
      

      Attachments

        Activity

          People

            Unassigned Unassigned
            dmitri_russu Dmitri Russu
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: