Details
-
Bug
-
Status: Closed
-
Major
-
Resolution: Not A Problem
-
1.13
-
None
-
None
-
Windows 8.1, jdk1.8.0_102
Description
I'm trying to use Tika to parse PDF files that contain Japanese and Chinese characters, but for some reason it does not parse them correctly. Every character that is extracted is changed to the first character in the line. For example, if the document contains 早上好, the extracted text correctly has 3 characters, but all 3 characters are 早早早: the last two characters are replaced by the first character. The same string is parsed correctly from a Word document. The following is the Java sample code I am using (don't forget to change the filename):
package kaleb;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.parser.pdf.PDFParser;
import org.xml.sax.SAXException;
/**
 * Sample program from the bug report: loads a Tika configuration, builds a
 * {@code CompositeParser}, and parses a hard-coded input file through a
 * {@code ContentHandlerDecorator}, then reads the static {@code sb} buffer.
 *
 * NOTE(review): this listing is incomplete. Several method bodies and
 * try-block bodies are missing (a bare {@code try} is followed directly by
 * {@code catch}, and some method signatures have no body), and two string
 * literals are split across lines. It does not compile as shown.
 */
public class TestTika {
/** Character limit compared against {@code sb.length()} in {@code ensureLimit}; initially 10 * 1024 * 1024. */
private static int parserCharLimit = 10 * 1024 * 1024;
// Getter for the character limit; the setter's signature starts on the same line as the getter's body.
public static int getParserCharLimit()
{ return parserCharLimit; }public static void setParserCharLimit(int l)
// The line below also declares sb: the buffer checked by ensureLimit and read back in myTika.
// It starts as null and is assigned a fresh StringBuilder in myTika just before parsing.
{ parserCharLimit = l; }private static StringBuilder sb = null;
// Handler instance passed to tikaParser.parse(...) in myTika.
private static ContentHandlerDecorator handler = new ContentHandlerDecorator() {
// Compares the current length of sb with parserCharLimit.
// NOTE(review): the statement guarded by the if is missing from this listing;
// presumably it throws MaxContentExceededException - confirm against the original code.
private void ensureLimit() throws SAXException {
if (sb.length() > parserCharLimit)
}
// NOTE(review): the body of this override is missing from the listing;
// presumably it appends the characters to sb and calls ensureLimit - confirm.
@Override
public void characters(char[] ch, int start, int length) throws SAXException
// NOTE(review): the body of this override is missing from the listing as well.
@Override
public void ignorableWhitespace(char[] ch, int start,
int length) throws SAXException
};
/**
 * SAXException subclass. The constructors taking arguments delegate to the
 * matching {@code super(...)} constructor.
 * NOTE(review): the body of the no-arg constructor is missing from this listing.
 */
public static class MaxContentExceededException extends SAXException {
public MaxContentExceededException()
public MaxContentExceededException(Exception e)
{ super(e); }public MaxContentExceededException(String message, Exception e)
{ super(message, e); }public MaxContentExceededException(String message)
{super(message);}}
/**
 * Obtains a TikaConfig, builds a CompositeParser from its media type registry
 * and parser, opens a hard-coded input file, parses it with the static
 * {@code handler}, and converts {@code sb} to a String.
 *
 * @throws Exception declared broadly; anything not caught inside propagates to the caller
 */
public static void myTika() throws Exception{
TikaConfig tikaConfig = null;
try{
// Opens a hard-coded tika-config.xml path.
// NOTE(review): the string literal is split across two lines in this listing.
InputStream stream = new FileInputStream(new File(("C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb
tika-config.xml")));
// NOTE(review): the try body is missing; presumably it builds tikaConfig from stream - confirm.
// On IOException, SAXException or TikaException the default configuration is used instead.
try
catch (IOException | SAXException | TikaException e)
{ tikaConfig = TikaConfig.getDefaultConfig(); } finally {
// NOTE(review): the try body is missing; presumably it closes stream - confirm.
try
catch (IOException e) { }
}
// Any exception from the outer try (including a failure to open the config file above)
// is swallowed here, leaving tikaConfig null; the getDetector() call below would then
// throw NullPointerException.
}catch(Exception e){}
/** default Tika detector */
// tikaDetector is not used anywhere else in the visible code.
Detector tikaDetector = tikaConfig.getDetector();
/** default Tika parser */
CompositeParser tikaParser = new CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());
TemporaryResources tmp = new TemporaryResources();
// Hard-coded input file. The path below names a .docx file, although the report
// describes a problem with PDF input.
// NOTE(review): this string literal is also split across two lines in this listing.
InputStream stream = new FileInputStream(new File("C:\\Users\\kaleba\\Desktop
Chin.docx"));
TikaInputStream tis = TikaInputStream.get(stream, tmp);
// type is not read anywhere in the visible code; it may have been assigned in the
// try block below, whose body is missing.
String type ="";
// TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead??
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
// Registers the composite parser in the parse context under Parser.class.
context.set(org.apache.tika.parser.Parser.class, tikaParser);
// NOTE(review): the try body is missing; whatever it did, any exception is swallowed.
try
catch(Exception e){}
// Fresh buffer for this run, then parse into the shared handler.
sb = new StringBuilder();
tikaParser.parse(tis, handler, metadata, context);
String s = sb.toString();
// i is never read; presumably a line to set a debugger breakpoint on - confirm.
int i= 1;
}
/**
 * Entry point. Any exception caught is reported by printing its message to standard output.
 * NOTE(review): the body of the active try block is missing from this listing;
 * presumably it calls myTika() - confirm.
 */
public static void main(String[] args) {
// TODO Auto-generated method stub
// The two block comments below are earlier experiments left commented out by the reporter.
/*try
{ File initialFile = new File("C:\\Users\\kaleba\\Desktop\\UnicodeTest.pdf"); InputStream targetStream = new FileInputStream(initialFile); String s = parse(targetStream,null, null); int i=1; } catch (Exception e){}*/
/* TestTika tk = new TestTika();
tk.setFilePath("C:\\Users\\kaleba\\Desktop
Rus3.pdf");
try
catch(Exception e){}*/
try
catch (Exception e)
{ System.out.print(e.getMessage()); }}
}
Attachments
Attachments
Issue Links
- Blocked
-
PDFBOX-3499 PDFBox 2.0.2 not parsing Japanese and Chinese Characters correctly from PDF
- Closed