Details
-
Bug
-
Status: Closed
-
Major
-
Resolution: Not A Problem
-
2.0.3
-
None
-
Ubuntu 15.1
Description
Hi, I am trying to use PDFBox to extract text from a list of files. The problem is that PDFTextStripper does not seem to work in threaded mode: when I try to use it from multiple threads, nothing happens. Is this a bug or a limitation?
Could you help me?
Thanks.
I have added the full class below.
private void scanFolderFiles(File scanDirectory) { File[] filesScan = scanDirectory.listFiles(); if ( filesScan.length > 0 ) { int iterator=0; for (final File fileEntry : filesScan) { if (fileEntry.isDirectory()) { scanFolderFiles(fileEntry); } else { try { new PDFExtractThread(fileEntry).start(); } catch (Exception e) { e.printStackTrace(); } } iterator++; } } } //////////////////////////////////////////////////////////////////////////////////////////// package org.pdfextractor; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import java.io.*; /** * Created by dru on 21.11.2016. */ class PDFExtractThread extends Thread { private String fileName; private File readFile; private PDDocument document; public PDFExtractThread(File readFile) { try { this.readFile = readFile; } catch (Exception e) { e.printStackTrace(); System.exit(1); } } public void run() { try { //get FileName this.readFile.setWritable(true); this.document = PDDocument.load(this.readFile); this.fileName = (new String(this.readFile.getName()).toLowerCase().replace(".pdf", "")); pdfBoxExtractText(); //Closing the document this.document.close(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } public void pdfBoxExtractText() throws Exception { //Retrieving text from PDF document PDFTextStripper pdfStripper = new PDFTextStripper(); System.out.println(this.fileName); FileWriter fileWriter = new FileWriter(this.fileName+".txt"); BufferedWriter writer = new BufferedWriter(fileWriter); String text = pdfStripper.getText(this.document); writer.write(text); writer.close(); } }