Tika / TIKA-2080

PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly


Details

    • Type: Bug
    • Status: Closed
    • Priority: Major
    • Resolution: Not A Problem
    • Affects Version/s: 1.13
    • Fix Version/s: None
    • Component/s: parser
    • Labels: None
    • Environment: Windows 8.1, jdk1.8.0_102

    Description

      I'm trying to use Tika to parse PDF files that contain Japanese and Chinese characters, but it does not parse them correctly. Every extracted character is replaced by the first character on its line. For example, if the document contains 早上好, the extracted text correctly reports 3 characters, but all 3 come out as 早早早: the last two characters are replaced by the first. The same string is parsed correctly from a Word document. The following is the Java sample code I am using (don't forget to change the filename):

      package kaleb;

      import java.io.File;
      import java.io.FileInputStream;
      import java.io.IOException;
      import java.io.InputStream;

      import org.apache.tika.config.TikaConfig;
      import org.apache.tika.detect.Detector;
      import org.apache.tika.exception.TikaException;
      import org.apache.tika.io.TemporaryResources;
      import org.apache.tika.io.TikaInputStream;
      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.CompositeParser;
      import org.apache.tika.parser.ParseContext;
      import org.apache.tika.sax.ContentHandlerDecorator;
      import org.xml.sax.SAXException;

      public class TestTika {

          /** Maximum number of characters to extract. */
          private static int parserCharLimit = 10 * 1024 * 1024;

          public static int getParserCharLimit() {
              return parserCharLimit;
          }

          public static void setParserCharLimit(int l) {
              parserCharLimit = l;
          }

          /** Buffer that accumulates the extracted text. */
          private static StringBuilder sb = null;

          /** Collects character events and enforces the configured limit. */
          private static ContentHandlerDecorator handler = new ContentHandlerDecorator() {
              private void ensureLimit() throws SAXException {
                  if (sb.length() > parserCharLimit) {
                      throw new MaxContentExceededException("Your document contained more than "
                              + parserCharLimit + " characters: " + sb.length());
                  }
              }

              @Override
              public void characters(char[] ch, int start, int length) throws SAXException {
                  sb.append(ch, start, length);
                  ensureLimit();
              }

              @Override
              public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
                  if (sb.length() > 0) {
                      sb.append(ch, start, length);
                  }
                  ensureLimit();
              }
          };

          public static class MaxContentExceededException extends SAXException {
              public MaxContentExceededException() {
                  super();
              }

              public MaxContentExceededException(Exception e) {
                  super(e);
              }

              public MaxContentExceededException(String message, Exception e) {
                  super(message, e);
              }

              public MaxContentExceededException(String message) {
                  super(message);
              }
          }

          public static void myTika() throws Exception {
              // Load a custom Tika configuration; fall back to the default config on failure.
              TikaConfig tikaConfig;
              try (InputStream configStream = new FileInputStream(new File(
                      "C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml"))) {
                  tikaConfig = new TikaConfig(configStream);
              } catch (IOException | SAXException | TikaException e) {
                  tikaConfig = TikaConfig.getDefaultConfig();
              }

              // Default Tika detector and composite parser from the configuration.
              Detector tikaDetector = tikaConfig.getDetector();
              CompositeParser tikaParser =
                      new CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());

              TemporaryResources tmp = new TemporaryResources();
              InputStream stream = new FileInputStream(new File("C:\\Users\\kaleba\\Desktop\\Chin.docx"));
              TikaInputStream tis = TikaInputStream.get(stream, tmp);

              // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead?
              Metadata metadata = new Metadata();
              ParseContext context = new ParseContext();
              context.set(org.apache.tika.parser.Parser.class, tikaParser);

              // Detect the content type before parsing.
              // TODO: limit by content type to reduce dependencies?
              // https://tika.apache.org/1.10/parser_guide.html
              String type = tikaDetector.detect(tis, metadata).toString();
              metadata.set(Metadata.CONTENT_TYPE, type);

              sb = new StringBuilder();
              tikaParser.parse(tis, handler, metadata, context);

              System.out.println(sb.toString());
          }
          public static void main(String[] args) {
              try {
                  myTika();
              } catch (Exception e) {
                  System.out.print(e.getMessage());
              }
          }
      }
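
      For comparison, the same extraction can be reproduced with far less code through Tika's AutoDetectParser facade. The sketch below is not part of the original report: the file name nihao2.pdf is assumed to refer to the attached sample, and the -1 passed to BodyContentHandler simply disables its default 100,000-character write limit.

      import java.io.FileInputStream;
      import java.io.InputStream;

      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.AutoDetectParser;
      import org.apache.tika.sax.BodyContentHandler;

      public class MinimalRepro {
          public static void main(String[] args) throws Exception {
              // AutoDetectParser delegates to PDFParser once the stream is detected as a PDF.
              AutoDetectParser parser = new AutoDetectParser();
              BodyContentHandler handler = new BodyContentHandler(-1);
              Metadata metadata = new Metadata();
              try (InputStream stream = new FileInputStream("nihao2.pdf")) {
                  parser.parse(stream, handler, metadata);
              }
              // With the affected PDF, each CJK character prints as a repeat of the first one.
              System.out.println(handler.toString());
          }
      }

      If the character cap implemented by the custom decorator above is needed, Tika's own WriteOutContentHandler(int writeLimit) together with isWriteLimitReached(Throwable) provides the same guard without a hand-written handler.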

      Attachments

        nihao2.pdf (11 kB), uploaded by Kaleb Akalework


      People

        Assignee: Unassigned
        Reporter: Kaleb Akalework (kalebAkale)
        Votes: 0
        Watchers: 2
