Uploaded image for project: 'Solr'
  1. Solr
  2. SOLR-4809

OpenOffice document body is not indexed by SolrCell

    XML | Word | Printable | JSON

Details

    Description

      As reported on the solr user mailing list, SolrCell is not indexing document body content for OpenOffice documents.

      I tested with Apache OpenOffice 3.4.1 on Solr 4.3 and 3.6.1, for both OpenWriter (.ODT) and Impress (.ODP) documents.

      The extractOnly option does return the document body text, but Solr does not index it. In my test cases (.ODP and .ODT), all I see for the "content" field in Solr is a few spaces.

      Using the example schema, I indexed HelloWorld.odt using:

       curl "http://localhost:8983/solr/update/extract?literal.id=doc-1&uprefix=attr_&commit=true" -F "myfile=@HelloWorld.odt"
      

      It queries as:

      <?xml version="1.0" encoding="UTF-8"?>
      <response>
      
      <lst name="responseHeader">
        <int name="status">0</int>
        <int name="QTime">2</int>
        <lst name="params">
          <str name="indent">true</str>
          <str name="q">id:doc-1</str>
        </lst>
      </lst>
      <result name="response" numFound="1" start="0">
        <doc>
          <str name="id">doc-1</str>
          <arr name="attr_image_count">
            <str>0</str>
          </arr>
          <arr name="attr_editing_cycles">
            <str>1</str>
          </arr>
          <arr name="attr_stream_source_info">
            <str>myfile</str>
          </arr>
          <arr name="attr_meta_save_date">
            <str>2013-05-10T17:15:40.99</str>
          </arr>
          <arr name="attr_dc_subject">
            <str>Hello, World</str>
          </arr>
          <str name="subject">Hello World - subject</str>
          <arr name="attr_dcterms_created">
            <str>2013-05-10T17:11:58.88</str>
          </arr>
          <arr name="attr_date">
            <str>2013-05-10T17:15:40.99</str>
          </arr>
          <arr name="attr_dc_description">
            <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
          </arr>
          <arr name="attr_nbobject">
            <str>0</str>
          </arr>
          <arr name="attr_word_count">
            <str>10</str>
          </arr>
          <arr name="attr_edit_time">
            <str>PT3M44S</str>
          </arr>
          <arr name="attr_meta_paragraph_count">
            <str>4</str>
          </arr>
          <arr name="attr_creation_date">
            <str>2013-05-10T17:11:58.88</str>
          </arr>
          <arr name="title">
            <str>Hello World SolrCell Test - title</str>
          </arr>
          <arr name="attr_object_count">
            <str>0</str>
          </arr>
          <arr name="attr_stream_content_type">
            <str>application/octet-stream</str>
          </arr>
          <arr name="attr_nbimg">
            <str>0</str>
          </arr>
          <str name="description">This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
          <arr name="attr_stream_size">
            <str>8960</str>
          </arr>
          <arr name="attr_meta_object_count">
            <str>0</str>
          </arr>
          <arr name="attr_cp_subject">
            <str>Hello World - subject</str>
          </arr>
          <arr name="attr_stream_name">
            <str>HelloWorld.odt</str>
          </arr>
          <arr name="attr_generator">
            <str>OpenOffice.org/3.4.1$Win32 OpenOffice.org_project/341m1$Build-9593</str>
          </arr>
          <str name="keywords">Hello, World</str>
          <arr name="attr_last_save_date">
            <str>2013-05-10T17:15:40.99</str>
          </arr>
          <arr name="attr_paragraph_count">
            <str>4</str>
          </arr>
          <arr name="attr_dc_title">
            <str>Hello World SolrCell Test - title</str>
          </arr>
          <arr name="attr_dcterms_modified">
            <str>2013-05-10T17:15:40.99</str>
          </arr>
          <arr name="attr_meta_creation_date">
            <str>2013-05-10T17:11:58.88</str>
          </arr>
          <arr name="attr_page_count">
            <str>1</str>
          </arr>
          <arr name="attr_meta_character_count">
            <str>60</str>
          </arr>
          <date name="last_modified">2013-05-10T17:15:40Z</date>
          <arr name="attr_nbtab">
            <str>0</str>
          </arr>
          <arr name="attr_meta_word_count">
            <str>10</str>
          </arr>
          <arr name="attr_meta_table_count">
            <str>0</str>
          </arr>
          <arr name="attr_modified">
            <str>2013-05-10T17:15:40.99</str>
          </arr>
          <arr name="attr_meta_image_count">
            <str>0</str>
          </arr>
          <arr name="attr_xmptpg_npages">
            <str>1</str>
          </arr>
          <arr name="attr_table_count">
            <str>0</str>
          </arr>
          <arr name="attr_nbpara">
            <str>4</str>
          </arr>
          <arr name="attr_character_count">
            <str>60</str>
          </arr>
          <arr name="attr_meta_page_count">
            <str>1</str>
          </arr>
          <arr name="attr_nbword">
            <str>10</str>
          </arr>
          <arr name="attr_nbpage">
            <str>1</str>
          </arr>
          <arr name="content_type">
            <str>application/vnd.oasis.opendocument.text</str>
          </arr>
          <arr name="attr_nbcharacter">
            <str>60</str>
          </arr>
          <arr name="content">
            <str>  </str>
          </arr>
          <long name="_version_">1434688567598120960</long></doc>
      </result>
      </response>
      

      Command to extract as text:

      curl "http://localhost:8983/solr/update/extract?literal.id=doc-1&indent=true&extractOnly=true&extractFormat=text&commit=true" -F "myfile=@HelloWorld.odt"
      

      The response:

      <?xml version="1.0" encoding="UTF-8"?>
      <response>
      
      <lst name="responseHeader">
        <int name="status">0</int>
        <int name="QTime">124</int>
      </lst>
      <str name="HelloWorld.odt">
      
      
      
      
      
      
      
      
      Hello World, from OpenOffice!
      
      Third line.
      Fourth line.
      The end.
      
      
      </str>
      <lst name="HelloWorld.odt_metadata">
        <arr name="Image-Count">
          <str>0</str>
        </arr>
        <arr name="editing-cycles">
          <str>1</str>
        </arr>
        <arr name="stream_source_info">
          <str>myfile</str>
        </arr>
        <arr name="meta:save-date">
          <str>2013-05-10T17:15:40.99</str>
        </arr>
        <arr name="dc:subject">
          <str>Hello, World</str>
        </arr>
        <arr name="subject">
          <str>Hello World - subject</str>
        </arr>
        <arr name="dcterms:created">
          <str>2013-05-10T17:11:58.88</str>
        </arr>
        <arr name="date">
          <str>2013-05-10T17:15:40.99</str>
        </arr>
        <arr name="dc:description">
          <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
        </arr>
        <arr name="nbObject">
          <str>0</str>
        </arr>
        <arr name="Word-Count">
          <str>10</str>
        </arr>
        <arr name="Edit-Time">
          <str>PT3M44S</str>
        </arr>
        <arr name="meta:paragraph-count">
          <str>4</str>
        </arr>
        <arr name="Creation-Date">
          <str>2013-05-10T17:11:58.88</str>
        </arr>
        <arr name="title">
          <str>Hello World SolrCell Test - title</str>
        </arr>
        <arr name="Object-Count">
          <str>0</str>
        </arr>
        <arr name="stream_content_type">
          <str>application/octet-stream</str>
        </arr>
        <arr name="nbImg">
          <str>0</str>
        </arr>
        <arr name="description">
          <str>This is a test of SolrCell using OpenOffice 3.4.1 - OpenWriter.</str>
        </arr>
        <arr name="stream_size">
          <str>8960</str>
        </arr>
        <arr name="meta:object-count">
          <str>0</str>
        </arr>
        <arr name="cp:subject">
          <str>Hello World - subject</str>
        </arr>
        <arr name="stream_name">
          <str>HelloWorld.odt</str>
        </arr>
        <arr name="generator">
          <str>OpenOffice.org/3.4.1$Win32 OpenOffice.org_project/341m1$Build-9593</str>
        </arr>
        <arr name="Keywords">
          <str>Hello, World</str>
        </arr>
        <arr name="Last-Save-Date">
          <str>2013-05-10T17:15:40.99</str>
        </arr>
        <arr name="Paragraph-Count">
          <str>4</str>
        </arr>
        <arr name="dc:title">
          <str>Hello World SolrCell Test - title</str>
        </arr>
        <arr name="dcterms:modified">
          <str>2013-05-10T17:15:40.99</str>
        </arr>
        <arr name="meta:creation-date">
          <str>2013-05-10T17:11:58.88</str>
        </arr>
        <arr name="Page-Count">
          <str>1</str>
        </arr>
        <arr name="meta:character-count">
          <str>60</str>
        </arr>
        <arr name="Last-Modified">
          <str>2013-05-10T17:15:40.99</str>
        </arr>
        <arr name="nbTab">
          <str>0</str>
        </arr>
        <arr name="meta:word-count">
          <str>10</str>
        </arr>
        <arr name="meta:table-count">
          <str>0</str>
        </arr>
        <arr name="modified">
          <str>2013-05-10T17:15:40.99</str>
        </arr>
        <arr name="meta:image-count">
          <str>0</str>
        </arr>
        <arr name="xmpTPg:NPages">
          <str>1</str>
        </arr>
        <arr name="Table-Count">
          <str>0</str>
        </arr>
        <arr name="nbPara">
          <str>4</str>
        </arr>
        <arr name="Character Count">
          <str>60</str>
        </arr>
        <arr name="meta:page-count">
          <str>1</str>
        </arr>
        <arr name="nbWord">
          <str>10</str>
        </arr>
        <arr name="nbPage">
          <str>1</str>
        </arr>
        <arr name="Content-Type">
          <str>application/vnd.oasis.opendocument.text</str>
        </arr>
        <arr name="nbCharacter">
          <str>60</str>
        </arr>
      </lst>
      </response>
      

      Attachments

        1. SOLR-4809.patch
          0.7 kB
          Doug Wegscheid
        2. HelloWorld.txt
          0.0 kB
          Jack Krupansky
        3. HelloWorld.odt
          9 kB
          Jack Krupansky
        4. HelloWorld.odp
          11 kB
          Jack Krupansky
        5. HelloWorld.docx
          10 kB
          Jack Krupansky

        Issue Links

          Activity

            People

              Assignee: Unassigned
              Reporter: Jack Krupansky (jkrupan)
              Votes:
              3 Vote for this issue
              Watchers:
              7 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: