Solr
  1. Solr
  2. SOLR-6187

facet.mincount ignored in range faceting using distributed search

    Details

    • Type: Bug Bug
    • Status: Closed
    • Priority: Major Major
    • Resolution: Fixed
    • Affects Version/s: 4.8, 4.8.1
    • Fix Version/s: 5.0, 6.0
    • Component/s: search
    • Labels:
      None

      Description

      While I was trying to do a range faceting with gap +1YEAR using shards, I noticed that facet.mincount parameter seems to be ignored.

      Issue can be reproduced in this way:
      Create 2 cores "testshard1" and "testshard2" with:

      solrconfig.xml
      <?xml version="1.0" encoding="UTF-8" ?>
      <config>
      <luceneMatchVersion>LUCENE_41</luceneMatchVersion>
      <lib dir="/opt/solr/dist" regex="solr-cell-.*\.jar"/>
      <directoryFactory name="DirectoryFactory" class="$

      {solr.directoryFactory:solr.NRTCachingDirectoryFactory}

      "/>
      <updateHandler class="solr.DirectUpdateHandler2" />
      <requestHandler name="/select" class="solr.SearchHandler">
      <lst name="defaults">
      <str name="echoParams">explicit</str>
      <int name="rows">10</int>
      <str name="df">id</str>
      </lst>
      </requestHandler>
      <requestHandler name="/update" class="solr.UpdateRequestHandler" />
      <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
      <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
      <lst name="invariants">
      <str name="q">solrpingquery</str>
      </lst>
      <lst name="defaults">
      <str name="echoParams">all</str>
      </lst>
      </requestHandler>
      </config>

      schema.xml
      <?xml version="1.0" ?>
      <schema name="$

      {solr.core.name}

      " version="1.5" xmlns:xi="http://www.w3.org/2001/XInclude">
      <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
      <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
      <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
      <field name="version" type="long" indexed="true" stored="true"/>
      <field name="id" type="int" indexed="true" stored="true" multiValued="false" />
      <field name="date" type="date" indexed="true" stored="true" multiValued="false" />
      <uniqueKey>id</uniqueKey>
      <defaultSearchField>id</defaultSearchField>
      </schema>

      Insert in testshard1:
      <add>
      <doc>
      <field name="id">1</field>
      <field name="date">2014-06-20T12:51:00Z</field>
      </doc>
      </add>

      Insert into testshard2:
      <add>
      <doc>
      <field name="id">2</field>
      <field name="date">2013-06-20T12:51:00Z</field>
      </doc>
      </add>

      Now if I execute:

      curl "http://localhost:8983/solr/testshard1/select?q=id:1&facet=true&facet.mincount=1&facet.range=date&f.date.facet.range.start=1900-01-01T00:00:00Z&f.date.facet.range.end=NOW&f.date.facet.range.gap=%2B1YEAR&shards=localhost%3A8983%2Fsolr%2Ftestshard1%2Clocalhost%3A8983%2Fsolr%2Ftestshard2&shards.info=true&wt=json"

      I obtain:
      {"responseHeader":{"status":0,"QTime":88,"params":{"f.date.facet.range.gap":"+1YEAR","f.date.facet.range.start":"1900-01-01T00:00:00Z","facet":"true","shards":"localhost:8983/solr/testshard1,localhost:8983/solr/testshard2","facet.mincount":"1","q":"id:1","shards.info":"true","facet.range":"date","wt":"json","f.date.facet.range.end":"NOW"}},"shards.info":{"localhost:8983/solr/testshard2":

      {"numFound":0,"maxScore":0.0,"shardAddress":"http://localhost:8983/solr/testshard2","time":76}

      ,"localhost:8983/solr/testshard1":{"numFound":1,"maxScore":0.30685282,"shardAddress":"http://localhost:8983/solr/testshard1","time":79}},"response":{"numFound":1,"start":0,"maxScore":0.30685282,"docs":[

      {"id":1,"date":"2014-06-20T12:51:00Z"}

      ]},"facet_counts":{"facet_queries":{},"facet_fields":{},"facet_dates":{},"facet_ranges":{"date":{"counts":["1900-01-01T00:00:00Z",0,"1901-01-01T00:00:00Z",0,"1902-01-01T00:00:00Z",0,"1903-01-01T00:00:00Z",0,"1904-01-01T00:00:00Z",0,"1905-01-01T00:00:00Z",0,"1906-01-01T00:00:00Z",0,"1907-01-01T00:00:00Z",0,"1908-01-01T00:00:00Z",0,"1909-01-01T00:00:00Z",0,"1910-01-01T00:00:00Z",0,"1911-01-01T00:00:00Z",0,"1912-01-01T00:00:00Z",0,"1913-01-01T00:00:00Z",0,"1914-01-01T00:00:00Z",0,"1915-01-01T00:00:00Z",0,"1916-01-01T00:00:00Z",0,"1917-01-01T00:00:00Z",0,"1918-01-01T00:00:00Z",0,"1919-01-01T00:00:00Z",0,"1920-01-01T00:00:00Z",0,"1921-01-01T00:00:00Z",0,"1922-01-01T00:00:00Z",0,"1923-01-01T00:00:00Z",0,"1924-01-01T00:00:00Z",0,"1925-01-01T00:00:00Z",0,"1926-01-01T00:00:00Z",0,"1927-01-01T00:00:00Z",0,"1928-01-01T00:00:00Z",0,"1929-01-01T00:00:00Z",0,"1930-01-01T00:00:00Z",0,"1931-01-01T00:00:00Z",0,"1932-01-01T00:00:00Z",0,"1933-01-01T00:00:00Z",0,"1934-01-01T00:00:00Z",0,"1935-01-01T00:00:00Z",0,"1936-01-01T00:00:00Z",0,"1937-01-01T00:00:00Z",0,"1938-01-01T00:00:00Z",0,"1939-01-01T00:00:00Z",0,"1940-01-01T00:00:00Z",0,"1941-01-01T00:00:00Z",0,"1942-01-01T00:00:00Z",0,"1943-01-01T00:00:00Z",0,"1944-01-01T00:00:00Z",0,"1945-01-01T00:00:00Z",0,"1946-01-01T00:00:00Z",0,"1947-01-01T00:00:00Z",0,"1948-01-01T00:00:00Z",0,"1949-01-01T00:00:00Z",0,"1950-01-01T00:00:00Z",0,"1951-01-01T00:00:00Z",0,"1952-01-01T00:00:00Z",0,"1953-01-01T00:00:00Z",0,"1954-01-01T00:00:00Z",0,"1955-01-01T00:00:00Z",0,"1956-01-01T00:00:00Z",0,"1957-01-01T00:00:00Z",0,"1958-01-01T00:00:00Z",0,"1959-01-01T00:00:00Z",0,"1960-01-01T00:00:00Z",0,"1961-01-01T00:00:00Z",0,"1962-01-01T00:00:00Z",0,"1963-01-01T00:00:00Z",0,"1964-01-01T00:00:00Z",0,"1965-01-01T00:00:00Z",0,"1966-01-01T00:00:00Z",0,"1967-01-01T00:00:00Z",0,"1968-01-01T00:00:00Z",0,"1969-01-01T00:00:00Z",0,"1970-01-01T00:00:00Z",0,"1971-01-01T00:00:00Z",0,"1972-01-01T00:00:00Z",0,"1973-01-01T00:00:00Z",0,"1974-01-01T00:00:00Z",0,"1975-01-01T00:00:00Z",0,"1976-01-01T00:00:00Z",0,"1977-01-01T00:00:00Z",0,"1978-01-01T00:00:00Z",0,"1979-01-01T00:00:00Z",0,"1980-01-01T00:00:00Z",0,"1981-01-01T00:00:00Z",0,"1982-01-01T00:00:00Z",0,"1983-01-01T00:00:00Z",0,"1984-01-01T00:00:00Z",0,"1985-01-01T00:00:00Z",0,"1986-01-01T00:00:00Z",0,"1987-01-01T00:00:00Z",0,"1988-01-01T00:00:00Z",0,"1989-01-01T00:00:00Z",0,"1990-01-01T00:00:00Z",0,"1991-01-01T00:00:00Z",0,"1992-01-01T00:00:00Z",0,"1993-01-01T00:00:00Z",0,"1994-01-01T00:00:00Z",0,"1995-01-01T00:00:00Z",0,"1996-01-01T00:00:00Z",0,"1997-01-01T00:00:00Z",0,"1998-01-01T00:00:00Z",0,"1999-01-01T00:00:00Z",0,"2000-01-01T00:00:00Z",0,"2001-01-01T00:00:00Z",0,"2002-01-01T00:00:00Z",0,"2003-01-01T00:00:00Z",0,"2004-01-01T00:00:00Z",0,"2005-01-01T00:00:00Z",0,"2006-01-01T00:00:00Z",0,"2007-01-01T00:00:00Z",0,"2008-01-01T00:00:00Z",0,"2009-01-01T00:00:00Z",0,"2010-01-01T00:00:00Z",0,"2011-01-01T00:00:00Z",0,"2012-01-01T00:00:00Z",0,"2013-01-01T00:00:00Z",0,"2014-01-01T00:00:00Z",1],"gap":"+1YEAR","start":"1900-01-01T00:00:00Z","end":"2015-01-01T00:00:00Z"}}}}

      though facet.mincount is set to 1.

      1. SOLR-6187.patch
        20 kB
        Erick Erickson
      2. SOLR-6187.patch
        19 kB
        Erick Erickson
      3. SOLR-6187.patch
        17 kB
        Erick Erickson
      4. SOLR-6187.patch
        17 kB
        Erick Erickson
      5. SOLR-6187.patch
        18 kB
        Erick Erickson
      6. SOLR-6187.patch
        14 kB
        Erick Erickson

        Issue Links

          Activity

          Hide
          Erick Erickson added a comment - - edited

          OK, a patch with tests. It's not ready to commit, but I'd like some comments.

          The approach just doesn't feel very good. Unfortunately I don't see any good way to apply mincounts to distributed facets except to post-process the fully-counted list. I hope I did it in the right place. Suggestions for other approaches welcome!

          I've got some code in there that processes (deprecated) date facets that is really questionable. Problem is that there's no good way to tell where the counts are for an old-style date facet. You get pairs like
          "2001-01-01T00:00:00Z"
          34 (as an integer)

          which is fine, but pretty soon you get things like
          "gap"
          "+1YEAR" (as a String)

          The current code says "if the value is an integer, it must be a count so test it" which is bogus at best. I propose to rip it out and just not support date facets for this issue, but wanted people to see it. I'd rather have people who want mincounts for dates use the modern range faceting rather than introduce this kind of fragility and, perhaps, unintended consequences.

          This patch doesn't handle Pivot Facets, Interval Facets or Query Facets. The first one scares me, the second one I just haven't figured out how to test yet and the third doesn't strike me as very important since there aren't likely to be very many individual Query Facets. I may tackle Query and Interval if this approach isn't shot down.

          I'm not very willing to try to put this in 4.10, even if the approach is OK. I'd like to get this more in the 4.11 time-frame if at all.

          Probably handles SOLR-6154 and SOLR-6300 although I haven't tested those yet.

          All tests pass, although I haven't run precommit yet.

          Show
          Erick Erickson added a comment - - edited OK, a patch with tests. It's not ready to commit, but I'd like some comments. The approach just doesn't feel very good. Unfortunately I don't see any good way to apply mincounts to distributed facets except to post-process the fully-counted list. I hope I did it in the right place. Suggestions for other approaches welcome! I've got some code in there that processes (deprecated) date facets that is really questionable. Problem is that there's no good way to tell where the counts are for an old-style date facet. You get pairs like "2001-01-01T00:00:00Z" 34 (as an integer) which is fine, but pretty soon you get things like "gap" "+1YEAR" (as a String) The current code says "if the value is an integer, it must be a count so test it" which is bogus at best. I propose to rip it out and just not support date facets for this issue, but wanted people to see it. I'd rather have people who want mincounts for dates use the modern range faceting rather than introduce this kind of fragility and, perhaps, unintended consequences. This patch doesn't handle Pivot Facets, Interval Facets or Query Facets. The first one scares me, the second one I just haven't figured out how to test yet and the third doesn't strike me as very important since there aren't likely to be very many individual Query Facets. I may tackle Query and Interval if this approach isn't shot down. I'm not very willing to try to put this in 4.10, even if the approach is OK. I'd like to get this more in the 4.11 time-frame if at all. Probably handles SOLR-6154 and SOLR-6300 although I haven't tested those yet. All tests pass, although I haven't run precommit yet.
          Hide
          Erick Erickson added a comment -

          Another version of the patch. If testing is OK I'll commit this tomorrow unless there are objections:

          This patch has:
          1> the date stuff ripped out
          2> not doing the purging of facets < mincount until STAGE_DONE. I'd really like anyone who knows this code to comment if they can! My understanding is that we're ready to return the final-final packet to the original requester at this point, so it's an appropriate place to prune the response.

          On the plus side, I now know a bit more about the distributed facet code. On the minus side, I spent far too much time figuring it out...

          The bug I introduced with the previous patch (and I thank my lucky stars that there was a lot of unit testing in place) was that I was pruning the facets at each stage. So during the first stage there's some faceting done with some heuristics in place to trigger whether particular terms need to get fetched "for real". By pruning the list before STAGE_DONE, the second round-trip wasn't being made and some facets were incorrectly pruned. Why am I going through all this? So anyone looking at the bug before I commit can tell me I'm totally off base.

          I'll look at the other couple of JIRAs that might be associated with this one before committing.

          Show
          Erick Erickson added a comment - Another version of the patch. If testing is OK I'll commit this tomorrow unless there are objections: This patch has: 1> the date stuff ripped out 2> not doing the purging of facets < mincount until STAGE_DONE. I'd really like anyone who knows this code to comment if they can! My understanding is that we're ready to return the final-final packet to the original requester at this point, so it's an appropriate place to prune the response. On the plus side, I now know a bit more about the distributed facet code. On the minus side, I spent far too much time figuring it out... The bug I introduced with the previous patch (and I thank my lucky stars that there was a lot of unit testing in place) was that I was pruning the facets at each stage. So during the first stage there's some faceting done with some heuristics in place to trigger whether particular terms need to get fetched "for real". By pruning the list before STAGE_DONE, the second round-trip wasn't being made and some facets were incorrectly pruned. Why am I going through all this? So anyone looking at the bug before I commit can tell me I'm totally off base. I'll look at the other couple of JIRAs that might be associated with this one before committing.
          Hide
          Erick Erickson added a comment -

          OK, this one removes the tests associated with the date facets.

          But it also makes me even more nervous. To get query facets and range facets to work, I had to use STAGE_EXECUTEQUERY rather than STAGE_DONE. Which just feels really, really wrong. I'll try some other stuff tomorrow unless someone has some wisdom to offer here.

          Show
          Erick Erickson added a comment - OK, this one removes the tests associated with the date facets. But it also makes me even more nervous. To get query facets and range facets to work, I had to use STAGE_EXECUTEQUERY rather than STAGE_DONE. Which just feels really, really wrong. I'll try some other stuff tomorrow unless someone has some wisdom to offer here.
          Hide
          Erick Erickson added a comment -

          Final patch I hope. I'll commit tomorrow or Thursday, I want to give people a chance to object to using STAGE_EXECUTEQUERY in all situations.

          I ran this through 300 iterations last night and didn't get a failure, I want to run the full test suite a few times today as well as think about it a bit more.

          Show
          Erick Erickson added a comment - Final patch I hope. I'll commit tomorrow or Thursday, I want to give people a chance to object to using STAGE_EXECUTEQUERY in all situations. I ran this through 300 iterations last night and didn't get a failure, I want to run the full test suite a few times today as well as think about it a bit more.
          Hide
          Erick Erickson added a comment -

          About date ranges. Since they're deprecated and the same functionality can be had with range facets and the fix would be fragile, I'm not going to include fixing date ranges for mincount and distributed processing.

          The problem is this: The response packet is jumbled together, so you might have something like
          2001-01-01T00:00:00Z 78
          2002-01-01T00:00:00Z 33
          gap: +1YEAR
          start: "2000-01-01T00:00:00Z"

          So recognizing which parts of the response are numbers associated with dates is fragile. They are pairs, but even trying to recognize the date format regex will fail if the labels are changed with the "key" trick.

          And the objects are different in the response. In the above, 78 and 33 are integers, but not, of course, "+1YEAR". I did try a hack that said, essentially, "if it's a date facet and if the second member of the pair is an integer, it must be a count so see if it exceeds mincount and remove it if not". Yuuuuck.

          None of this applies to non-distributed modes since the things being examined for mincount are numerics before they're put into a response packet. For distributed, we have to do things after they're put in a response packet and collated.

          Show
          Erick Erickson added a comment - About date ranges. Since they're deprecated and the same functionality can be had with range facets and the fix would be fragile, I'm not going to include fixing date ranges for mincount and distributed processing. The problem is this: The response packet is jumbled together, so you might have something like 2001-01-01T00:00:00Z 78 2002-01-01T00:00:00Z 33 gap: +1YEAR start: "2000-01-01T00:00:00Z" So recognizing which parts of the response are numbers associated with dates is fragile. They are pairs, but even trying to recognize the date format regex will fail if the labels are changed with the "key" trick. And the objects are different in the response. In the above, 78 and 33 are integers, but not, of course, "+1YEAR". I did try a hack that said, essentially, "if it's a date facet and if the second member of the pair is an integer, it must be a count so see if it exceeds mincount and remove it if not". Yuuuuck. None of this applies to non-distributed modes since the things being examined for mincount are numerics before they're put into a response packet. For distributed, we have to do things after they're put in a response packet and collated.
          Hide
          Erick Erickson added a comment -

          A significantly cleaned-up patch. Seems to fix this issue up, as well as SOLR-6157.

          Still want to see what's up with SOLR-6386 before committing.

          Show
          Erick Erickson added a comment - A significantly cleaned-up patch. Seems to fix this issue up, as well as SOLR-6157 . Still want to see what's up with SOLR-6386 before committing.
          Hide
          ASF subversion and git services added a comment -

          Commit 1623429 from Erick Erickson in branch 'dev/trunk'
          [ https://svn.apache.org/r1623429 ]

          SOLR-6187: facet.mincount ignored in range faceting using distributed search

          Show
          ASF subversion and git services added a comment - Commit 1623429 from Erick Erickson in branch 'dev/trunk' [ https://svn.apache.org/r1623429 ] SOLR-6187 : facet.mincount ignored in range faceting using distributed search
          Hide
          Erick Erickson added a comment -

          With CHANGES.txt entries

          Show
          Erick Erickson added a comment - With CHANGES.txt entries
          Hide
          ASF subversion and git services added a comment -

          Commit 1623447 from Erick Erickson in branch 'dev/branches/branch_4x'
          [ https://svn.apache.org/r1623447 ]

          SOLR-6187: facet.mincount ignored in range faceting using distributed search

          Show
          ASF subversion and git services added a comment - Commit 1623447 from Erick Erickson in branch 'dev/branches/branch_4x' [ https://svn.apache.org/r1623447 ] SOLR-6187 : facet.mincount ignored in range faceting using distributed search
          Hide
          Zaccheo Bagnati added a comment -

          Sorry for not giving feedback but I',m not familiar with SOLR source code.
          Can you please correct my name in CHANGES.txt: it's misspelled. Thank you.

          Show
          Zaccheo Bagnati added a comment - Sorry for not giving feedback but I',m not familiar with SOLR source code. Can you please correct my name in CHANGES.txt: it's misspelled. Thank you.
          Hide
          ASF subversion and git services added a comment -

          Commit 1634915 from hossman@apache.org in branch 'dev/trunk'
          [ https://svn.apache.org/r1634915 ]

          SOLR-6187: fix Zaccheo's name in CHANGES.txt

          Show
          ASF subversion and git services added a comment - Commit 1634915 from hossman@apache.org in branch 'dev/trunk' [ https://svn.apache.org/r1634915 ] SOLR-6187 : fix Zaccheo's name in CHANGES.txt
          Hide
          ASF subversion and git services added a comment -

          Commit 1634916 from hossman@apache.org in branch 'dev/branches/branch_5x'
          [ https://svn.apache.org/r1634916 ]

          SOLR-6187: fix Zaccheo's name in CHANGES.txt (merge r1634915)

          Show
          ASF subversion and git services added a comment - Commit 1634916 from hossman@apache.org in branch 'dev/branches/branch_5x' [ https://svn.apache.org/r1634916 ] SOLR-6187 : fix Zaccheo's name in CHANGES.txt (merge r1634915)
          Hide
          Hoss Man added a comment -

          Can you please correct my name in CHANGES.txt: it's misspelled.

          Zaccheo: sorry about that, all fixed now.

          Show
          Hoss Man added a comment - Can you please correct my name in CHANGES.txt: it's misspelled. Zaccheo: sorry about that, all fixed now.
          Hide
          Erick Erickson added a comment -

          Hoss:

          Thanks, slipped off my radar..

          Apologies Zaccheo...

          Show
          Erick Erickson added a comment - Hoss: Thanks, slipped off my radar.. Apologies Zaccheo...
          Hide
          Anshum Gupta added a comment -

          Bulk close after 5.0 release.

          Show
          Anshum Gupta added a comment - Bulk close after 5.0 release.

            People

            • Assignee:
              Erick Erickson
              Reporter:
              Zaccheo Bagnati
            • Votes:
              0 Vote for this issue
              Watchers:
              4 Start watching this issue

              Dates

              • Created:
                Updated:
                Resolved:

                Development