Uploaded image for project: 'Tika'
  1. Tika
  2. TIKA-3187

Tika cannot parse the characters which appear from the symbol font (Microsoft word)

    XMLWordPrintableJSON

Details

    Description

      I am trying to parse a Microsoft Word document (.doc) that contains characters from the Symbol font. Symbol is a special font in Microsoft Word that contains math and Greek characters.

       the code I am using for parsing the doc is below

         import org.apache.commons.io.FileUtils;
            import org.apache.tika.metadata.Metadata;
            import org.apache.tika.parser.AutoDetectParser;
            import org.apache.tika.parser.ParseContext;
            import org.apache.tika.parser.Parser;
            import org.apache.tika.sax.BodyContentHandler;
            import java.io.File;
            import java.io.FileInputStream;
            import java.nio.charset.StandardCharsets;
      
      
          /**
           * Command-line reproduction for extracting the body text of a document with
           * Apache Tika and writing it to a UTF-8 text file.
           *
           * <p>Usage: {@code Tika <inputPath> <outputDir>}. The extracted text is printed to
           * standard output and saved as {@code <outputDir>/<inputFileName>_content.txt}.
           */
          public class Tika {

              /**
               * Parses the file named by {@code args[0]} and writes its text content into the
               * directory named by {@code args[1]}.
               *
               * @param args {@code args[0]} is the input document path, {@code args[1]} is the
               *             output directory
               */
              public static void main(String[] args) {
                  // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
                  if (args == null || args.length < 2) {
                      System.err.println("usage: Tika <inputPath> <outputDir>");
                      return;
                  }
                  try {
                      String inputPath = args[0];
                      String outputPath = args[1];
                      File f = new File(inputPath);
                      System.out.println("path is : " + f.getAbsoluteFile());

                      Parser parser = new AutoDetectParser();
                      // A write limit of -1 asks BodyContentHandler not to cap the extracted text.
                      BodyContentHandler handler = new BodyContentHandler(-1);
                      ParseContext parseContext = new ParseContext();
                      // Register the parser in the context under Parser.class.
                      parseContext.set(Parser.class, parser);
                      Metadata metadatafromtika = new Metadata();
                      metadatafromtika.add(Metadata.CONTENT_ENCODING, "UTF-8");

                      // try-with-resources guarantees the input stream is closed even when
                      // parsing throws; the original code never closed it.
                      try (FileInputStream fileInputStream = new FileInputStream(f)) {
                          parser.parse(fileInputStream, handler, metadatafromtika, parseContext);
                      }
                      String text = handler.toString();

                      System.out.println("done parsing for file : " + f.getAbsolutePath());
                      System.out.println("text is : \n" + text);

                      // Encode and decode with the same explicit charset. The original used the
                      // platform default for getBytes(), which corrupts non-ASCII characters
                      // whenever that default is not UTF-8.
                      byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
                      String encodedText = new String(bytes, StandardCharsets.UTF_8);
                      System.out.println("encoded text is : " + encodedText);

                      FileUtils.writeStringToFile(
                          new File(outputPath + File.separator + f.getName() + "_content.txt"),
                          text, "UTF-8");
                  } catch (Exception e) {
                      // Top-level boundary of a throwaway reproduction: report and exit.
                      e.printStackTrace();
                  }
              }
          }
      
      

       

      the dependencies I am using are

      <dependencies>
      <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-parsers</artifactId>
        <version>1.18</version>
      </dependency>
      
      <dependency>
        <groupId>commons-collections</groupId>
        <artifactId>commons-collections</artifactId>
        <version>3.2.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.9.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.antlr</groupId>
        <artifactId>ST4</artifactId>
        <version>4.0.8</version>
      </dependency>
      
      <dependency>
        <groupId>org.postgresql</groupId>
        <artifactId>postgresql</artifactId>
        <version>42.1.4</version>
      </dependency>
      
      <dependency>
        <groupId>com.zaxxer</groupId>
        <artifactId>HikariCP</artifactId>
        <version>2.7.2</version>
      </dependency>
      
      <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.6</version>
      </dependency>
      
      <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.5</version>
      </dependency>
      
      <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20171018</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>1.1.0-cdh5.10.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.10.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0-cdh5.10.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.6.0-cdh5.10.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-tools</artifactId>
        <version>2.6.0-mr1-cdh5.10.1</version>
      </dependency>
      
      <dependency>
        <groupId>org.apache.htrace</groupId>
        <artifactId>htrace-core4</artifactId>
        <version>4.0.1-incubating</version>
      </dependency>
      
      <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.1</version>
      </dependency>
      
      <dependency>
        <groupId>com.levigo.jbig2</groupId>
        <artifactId>levigo-jbig2-imageio</artifactId>
        <version>1.6.5</version>
      </dependency>
      
      <dependency>
        <groupId>com.github.jai-imageio</groupId>
        <artifactId>jai-imageio-core</artifactId>
        <version>1.3.1</version>
      </dependency>
      
      <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-core</artifactId>
        <version>2.9.5</version>
      </dependency>
      </dependencies>
      

      I have attached the input and output files

      Attachments

        1. greek_characters_input.PNG
          1 kB
          Akhil Poshetty
        2. greek_characters_output.PNG
          0.8 kB
          Akhil Poshetty

        Activity

          People

            Unassigned Unassigned
            Akhil997766 Akhil Poshetty
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated: