Tika / TIKA-2080

PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly


Details

    • Type: Bug
    • Status: Closed
    • Priority: Major
    • Resolution: Not A Problem
    • Affects Version/s: 1.13
    • Fix Version/s: None
    • Component/s: parser
    • Labels: None
    • Environment: Windows 8.1, jdk1.8.0_102

    Description

      I'm trying to use Tika to parse PDF files that contain Japanese and Chinese characters, but it does not parse them correctly. Every extracted character is replaced by the first character on its line. For example, if the document contains 早上好, the extracted text correctly reports 3 characters, but all 3 come out as 早早早: the last two characters are replaced by the first. The same string is parsed correctly from a Word document. The following is the Java sample code I am using (don't forget to change the filename):

      package kaleb;

      import java.io.File;
      import java.io.FileInputStream;
      import java.io.IOException;
      import java.io.InputStream;

      import org.apache.tika.config.TikaConfig;
      import org.apache.tika.detect.Detector;
      import org.apache.tika.exception.TikaException;
      import org.apache.tika.io.TemporaryResources;
      import org.apache.tika.io.TikaInputStream;
      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.CompositeParser;
      import org.apache.tika.parser.ParseContext;
      import org.apache.tika.sax.ContentHandlerDecorator;
      import org.xml.sax.SAXException;

      public class TestTika {

          /** Maximum number of characters to extract. */
          private static int parserCharLimit = 10 * 1024 * 1024;

          public static int getParserCharLimit() {
              return parserCharLimit;
          }

          public static void setParserCharLimit(int l) {
              parserCharLimit = l;
          }

          /** Buffer that accumulates the extracted text. */
          private static StringBuilder sb = null;

          /** Collects character events and enforces the configured limit. */
          private static ContentHandlerDecorator handler = new ContentHandlerDecorator() {
              private void ensureLimit() throws SAXException {
                  if (sb.length() > parserCharLimit) {
                      throw new MaxContentExceededException("Your document contained more than "
                              + parserCharLimit + " characters: " + sb.length());
                  }
              }

              @Override
              public void characters(char[] ch, int start, int length) throws SAXException {
                  sb.append(ch, start, length);
                  ensureLimit();
              }

              @Override
              public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
                  if (sb.length() > 0) {
                      sb.append(ch, start, length);
                  }
                  ensureLimit();
              }
          };

          public static class MaxContentExceededException extends SAXException {
              public MaxContentExceededException() {
                  super();
              }

              public MaxContentExceededException(Exception e) {
                  super(e);
              }

              public MaxContentExceededException(String message, Exception e) {
                  super(message, e);
              }

              public MaxContentExceededException(String message) {
                  super(message);
              }
          }

          public static void myTika() throws Exception {
              // Load a custom Tika configuration; fall back to the default config on failure.
              TikaConfig tikaConfig;
              try (InputStream configStream = new FileInputStream(new File(
                      "C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml"))) {
                  tikaConfig = new TikaConfig(configStream);
              } catch (IOException | SAXException | TikaException e) {
                  tikaConfig = TikaConfig.getDefaultConfig();
              }

              // Default Tika detector and composite parser from the configuration.
              Detector tikaDetector = tikaConfig.getDetector();
              CompositeParser tikaParser =
                      new CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());

              TemporaryResources tmp = new TemporaryResources();
              InputStream stream = new FileInputStream(new File("C:\\Users\\kaleba\\Desktop\\Chin.docx"));
              TikaInputStream tis = TikaInputStream.get(stream, tmp);

              // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead?
              Metadata metadata = new Metadata();
              ParseContext context = new ParseContext();
              context.set(org.apache.tika.parser.Parser.class, tikaParser);

              // Detect the content type before parsing.
              // TODO: limit by content type to reduce dependencies?
              // https://tika.apache.org/1.10/parser_guide.html
              String type = tikaDetector.detect(tis, metadata).toString();
              metadata.set(Metadata.CONTENT_TYPE, type);

              sb = new StringBuilder();
              tikaParser.parse(tis, handler, metadata, context);

              System.out.println(sb.toString());
          }
          public static void main(String[] args) {
              try {
                  myTika();
              } catch (Exception e) {
                  System.out.print(e.getMessage());
              }
          }
      }
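
      For comparison, the same extraction can be reproduced with far less code through Tika's AutoDetectParser facade. The sketch below is not part of the original report: the file name nihao2.pdf is assumed to refer to the attached sample, and the -1 passed to BodyContentHandler simply disables its default 100,000-character write limit.

      import java.io.FileInputStream;
      import java.io.InputStream;

      import org.apache.tika.metadata.Metadata;
      import org.apache.tika.parser.AutoDetectParser;
      import org.apache.tika.sax.BodyContentHandler;

      public class MinimalRepro {
          public static void main(String[] args) throws Exception {
              // AutoDetectParser delegates to PDFParser once the stream is detected as a PDF.
              AutoDetectParser parser = new AutoDetectParser();
              BodyContentHandler handler = new BodyContentHandler(-1);
              Metadata metadata = new Metadata();
              try (InputStream stream = new FileInputStream("nihao2.pdf")) {
                  parser.parse(stream, handler, metadata);
              }
              // With the affected PDF, each CJK character prints as a repeat of the first one.
              System.out.println(handler.toString());
          }
      }

      If the character cap implemented by the custom decorator above is needed, Tika's own WriteOutContentHandler(int writeLimit) together with isWriteLimitReached(Throwable) provides the same guard without a hand-written handler.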

      Attachments

        nihao2.pdf (11 kB), uploaded by Kaleb Akalework


      People

        Assignee: Unassigned
        Reporter: Kaleb Akalework (kalebAkale)
        Votes: 0
        Watchers: 2
