Index: common-build.xml =================================================================== --- common-build.xml (revision 551794) +++ common-build.xml (working copy) @@ -280,6 +280,8 @@ + + ################################################################## @@ -295,6 +297,10 @@ + + + + Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt (revision 0) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt (revision 0) @@ -0,0 +1,723 @@ +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------- + +# ------------------------------------------------------------ +# Format: +# +# qnum 0 doc-name is-relevant +# +# +# This file was originally created using +# utils.QualityQueriesFinder, so all queries +# would score a perfect 1.0 on all measures. 
+# +# To make it suitable for testing it was modified +# for some queries, depending on m = qnum % 8 +# m==0: recall is hurt, by marking fake docs as relevant +# m==1: precision is hurt, by unmarking relevant docs +# m==2: both precision and recall are hurt. +# m>=3: these queries remain perfect +# ------------------------------------------------------------ + +# ----- m==0: recall is hurt, by adding fake docs as relevant + +0 0 fakedoc1 1 +0 0 fakedoc2 1 +0 0 fakedoc3 1 +0 0 fakedoc4 1 + +0 0 doc18211 1 +0 0 doc20192 1 +0 0 doc7401 1 +0 0 doc11285 1 +0 0 doc20647 1 +0 0 doc3057 1 +0 0 doc12431 1 +0 0 doc4989 1 +0 0 doc17324 1 +0 0 doc4030 1 +0 0 doc4290 1 +0 0 doc3462 1 +0 0 doc15313 1 +0 0 doc10303 1 +0 0 doc1893 1 +0 0 doc5008 1 +0 0 doc14634 1 +0 0 doc5471 1 +0 0 doc17904 1 +0 0 doc7168 1 +0 0 doc21275 1 +0 0 doc9011 1 +0 0 doc17546 1 +0 0 doc9102 1 +0 0 doc13199 1 + +# --- m==1: precision is hurt, by unmarking relevant docs + +1 0 doc9857 0 +1 0 doc16846 1 +1 0 doc4320 1 +1 0 doc9501 0 +1 0 doc10159 1 +1 0 doc16642 1 +1 0 doc17536 0 +1 0 doc17571 1 +1 0 doc18728 1 +1 0 doc18828 1 +1 0 doc19108 0 +1 0 doc9940 1 +1 0 doc11852 1 +1 0 doc7430 0 +1 0 doc19162 1 +1 0 doc1743 1 +1 0 doc2137 1 +1 0 doc7611 1 +1 0 doc8072 1 +1 0 doc12764 1 +1 0 doc2593 1 +1 0 doc11088 1 +1 0 doc931 1 +1 0 doc7673 1 +1 0 doc12941 1 +1 0 doc11797 1 +1 0 doc11831 1 +1 0 doc13162 1 +1 0 doc4423 1 +1 0 doc5217 1 + +# ---- m==2: both precision and recall are hurt. 
+ +2 0 fakedoc1 1 +2 0 fakedoc2 1 +2 0 fakedoc3 1 +2 0 fakedoc4 1 + +2 0 doc3137 0 +2 0 doc7142 0 +2 0 doc13667 0 +2 0 doc13171 0 +2 0 doc13372 1 +2 0 doc21415 1 +2 0 doc16298 1 +2 0 doc14957 1 +2 0 doc153 1 +2 0 doc16092 1 +2 0 doc16096 1 +2 0 doc21303 1 +2 0 doc18681 1 +2 0 doc20756 1 +2 0 doc355 1 +2 0 doc13395 1 +2 0 doc5009 1 +2 0 doc17164 1 +2 0 doc13162 1 +2 0 doc11757 1 +2 0 doc9637 1 +2 0 doc18087 1 +2 0 doc4593 1 +2 0 doc4677 1 +2 0 doc20865 1 +2 0 doc8556 1 +2 0 doc2578 1 +2 0 doc1163 1 +2 0 doc3797 1 +2 0 doc11094 1 + + +3 0 doc19578 1 +3 0 doc14860 1 +3 0 doc7235 1 +3 0 doc20590 1 +3 0 doc17933 1 +3 0 doc9384 1 +3 0 doc10783 1 +3 0 doc1963 1 +3 0 doc18356 1 +3 0 doc13254 1 +3 0 doc18402 1 +3 0 doc15241 1 +3 0 doc3303 1 +3 0 doc8868 1 +3 0 doc18520 1 +3 0 doc4650 1 +3 0 doc4727 1 +3 0 doc21518 1 +3 0 doc5060 1 +3 0 doc7587 1 +3 0 doc2990 1 +3 0 doc8042 1 +3 0 doc6304 1 +3 0 doc13223 1 +3 0 doc1964 1 +3 0 doc10597 1 +3 0 doc21023 1 +3 0 doc19057 1 +3 0 doc14948 1 +3 0 doc9692 1 + + +4 0 doc2534 1 +4 0 doc21388 1 +4 0 doc20923 1 +4 0 doc11547 1 +4 0 doc19755 1 +4 0 doc3793 1 +4 0 doc6714 1 +4 0 doc12722 1 +4 0 doc5552 1 +4 0 doc6810 1 +4 0 doc16953 1 +4 0 doc2527 1 +4 0 doc5361 1 +4 0 doc12353 1 +4 0 doc7308 1 +4 0 doc3836 1 +4 0 doc2293 1 +4 0 doc7348 1 +4 0 doc17119 1 +4 0 doc19331 1 +4 0 doc3411 1 +4 0 doc14643 1 +4 0 doc9058 1 +4 0 doc11099 1 +4 0 doc12485 1 +4 0 doc16432 1 +4 0 doc10047 1 +4 0 doc13788 1 +4 0 doc117 1 +4 0 doc638 1 + + + +5 0 doc169 1 +5 0 doc13181 1 +5 0 doc4350 1 +5 0 doc10242 1 +5 0 doc955 1 +5 0 doc5389 1 +5 0 doc17122 1 +5 0 doc17417 1 +5 0 doc12199 1 +5 0 doc6918 1 +5 0 doc3857 1 +5 0 doc2981 1 +5 0 doc10639 1 +5 0 doc10478 1 +5 0 doc8573 1 +5 0 doc9197 1 +5 0 doc9298 1 +5 0 doc2492 1 +5 0 doc10262 1 +5 0 doc5180 1 +5 0 doc11758 1 +5 0 doc4065 1 +5 0 doc9124 1 +5 0 doc11528 1 +5 0 doc18879 1 +5 0 doc17864 1 +5 0 doc3204 1 +5 0 doc12157 1 +5 0 doc4496 1 +5 0 doc20190 1 + + + +6 0 doc9507 1 +6 0 doc15630 1 +6 0 doc8469 1 +6 0 
doc11918 1 +6 0 doc20482 1 +6 0 doc20158 1 +6 0 doc19831 1 +6 0 doc8296 1 +6 0 doc8930 1 +6 0 doc16460 1 +6 0 doc2577 1 +6 0 doc15476 1 +6 0 doc1767 1 +6 0 doc689 1 +6 0 doc16606 1 +6 0 doc6149 1 +6 0 doc18691 1 +6 0 doc2208 1 +6 0 doc3592 1 +6 0 doc11199 1 +6 0 doc16329 1 +6 0 doc6007 1 +6 0 doc15231 1 +6 0 doc20622 1 +6 0 doc21468 1 +6 0 doc12230 1 +6 0 doc5723 1 +6 0 doc8120 1 +6 0 doc8668 1 +6 0 doc303 1 + + + + +7 0 doc7728 1 +7 0 doc7693 1 +7 0 doc21088 1 +7 0 doc5017 1 +7 0 doc10807 1 +7 0 doc16204 1 +7 0 doc2233 1 +7 0 doc3632 1 +7 0 doc4719 1 +7 0 doc6477 1 +7 0 doc6502 1 +7 0 doc6709 1 +7 0 doc7710 1 +7 0 doc9193 1 +7 0 doc9309 1 +7 0 doc9789 1 +7 0 doc10971 1 +7 0 doc18059 1 +7 0 doc19906 1 +7 0 doc20089 1 +7 0 doc20102 1 +7 0 doc21040 1 +7 0 doc21153 1 +7 0 doc9147 1 +7 0 doc9930 1 +7 0 doc19763 1 +7 0 doc1559 1 +7 0 doc21248 1 +7 0 doc17945 1 +7 0 doc526 1 + + +# ----- m==0: recall is hurt, by adding fake docs as relevant + +8 0 fakedoc1 1 +8 0 fakedoc2 1 +8 0 fakedoc3 1 +8 0 fakedoc4 1 + +8 0 doc16299 1 +8 0 doc1662 1 +8 0 doc4585 1 +8 0 doc12315 1 +8 0 doc16266 1 +8 0 doc13136 1 +8 0 doc19212 1 +8 0 doc7086 1 +8 0 doc7062 1 +8 0 doc6134 1 +8 0 doc13953 1 +8 0 doc16264 1 +8 0 doc2494 1 +8 0 doc10636 1 +8 0 doc10894 1 +8 0 doc6844 1 +8 0 doc674 1 +8 0 doc13520 1 +8 0 doc344 1 +8 0 doc2896 1 +8 0 doc11871 1 +8 0 doc1862 1 +8 0 doc16728 1 +8 0 doc10308 1 +8 0 doc2227 1 +8 0 doc13167 1 +8 0 doc20607 1 +8 0 doc9670 1 +8 0 doc1566 1 +8 0 doc17885 1 + + +# ---- m==1: precision is hurt, by unmarking relevant docs + + +9 0 doc1990 0 +9 0 doc9342 1 +9 0 doc19427 1 +9 0 doc12432 0 +9 0 doc13480 1 +9 0 doc3322 1 +9 0 doc16044 1 +9 0 doc266 0 +9 0 doc3437 1 +9 0 doc5370 1 +9 0 doc10314 1 +9 0 doc4892 1 +9 0 doc5763 0 +9 0 doc14045 1 +9 0 doc1090 1 +9 0 doc7437 1 +9 0 doc5822 1 +9 0 doc4285 1 +9 0 doc17119 1 +9 0 doc21001 1 +9 0 doc4337 1 +9 0 doc5967 1 +9 0 doc10214 1 +9 0 doc12001 1 +9 0 doc18553 1 +9 0 doc12116 1 +9 0 doc5064 1 +9 0 doc5018 1 +9 0 doc5037 1 +9 0 
doc8025 1 + + +# ---- m==2: both precision and recall are hurt. + +10 0 fakedoc1 1 +10 0 fakedoc2 1 +10 0 fakedoc3 1 +10 0 fakedoc4 1 + +10 0 doc17218 0 +10 0 doc10270 0 +10 0 doc5958 0 +10 0 doc19943 0 +10 0 doc6510 1 +10 0 doc16087 1 +10 0 doc14893 1 +10 0 doc8933 1 +10 0 doc4354 1 +10 0 doc16729 1 +10 0 doc16761 1 +10 0 doc6964 1 +10 0 doc16743 1 +10 0 doc7357 1 +10 0 doc2534 1 +10 0 doc18321 1 +10 0 doc18497 1 +10 0 doc11214 1 +10 0 doc11819 1 +10 0 doc10818 1 +10 0 doc15769 1 +10 0 doc5348 1 +10 0 doc14948 1 +10 0 doc7891 1 +10 0 doc9897 1 +10 0 doc15559 1 +10 0 doc14935 1 +10 0 doc14954 1 +10 0 doc6621 1 +10 0 doc6930 1 + + +11 0 doc11943 1 +11 0 doc286 1 +11 0 doc1574 1 +11 0 doc17916 1 +11 0 doc17918 1 +11 0 doc19213 1 +11 0 doc9337 1 +11 0 doc8593 1 +11 0 doc8800 1 +11 0 doc18580 1 +11 0 doc209 1 +11 0 doc1893 1 +11 0 doc11189 1 +11 0 doc17702 1 +11 0 doc10180 1 +11 0 doc11869 1 +11 0 doc9705 1 +11 0 doc8715 1 +11 0 doc12753 1 +11 0 doc10195 1 +11 0 doc3552 1 +11 0 doc16030 1 +11 0 doc4623 1 +11 0 doc3188 1 +11 0 doc8735 1 +11 0 doc151 1 +11 0 doc5792 1 +11 0 doc5194 1 +11 0 doc3393 1 +11 0 doc19027 1 + + + +12 0 doc18198 1 +12 0 doc2444 1 +12 0 doc4305 1 +12 0 doc6544 1 +12 0 doc11639 1 +12 0 doc10640 1 +12 0 doc12192 1 +12 0 doc128 1 +12 0 doc10760 1 +12 0 doc10881 1 +12 0 doc2698 1 +12 0 doc3552 1 +12 0 doc20524 1 +12 0 doc1884 1 +12 0 doc9187 1 +12 0 doc3131 1 +12 0 doc2911 1 +12 0 doc2589 1 +12 0 doc3747 1 +12 0 doc3813 1 +12 0 doc5222 1 +12 0 doc6023 1 +12 0 doc6624 1 +12 0 doc7655 1 +12 0 doc9205 1 +12 0 doc12062 1 +12 0 doc15504 1 +12 0 doc13625 1 +12 0 doc18704 1 +12 0 doc2277 1 + + + +13 0 doc4948 1 +13 0 doc21565 1 +13 0 doc17135 1 +13 0 doc1866 1 +13 0 doc13989 1 +13 0 doc5605 1 +13 0 doc13431 1 +13 0 doc2100 1 +13 0 doc16347 1 +13 0 doc16894 1 +13 0 doc6764 1 +13 0 doc8554 1 +13 0 doc8695 1 +13 0 doc8977 1 +13 0 doc19478 1 +13 0 doc14595 1 +13 0 doc2408 1 +13 0 doc2592 1 +13 0 doc10947 1 +13 0 doc15794 1 +13 0 doc5236 1 +13 0 doc14847 1 +13 0 
doc3980 1 +13 0 doc1844 1 +13 0 doc42 1 +13 0 doc7783 1 +13 0 doc4557 1 +13 0 doc16423 1 +13 0 doc17170 1 +13 0 doc5822 1 + + + +14 0 doc17172 1 +14 0 doc17210 1 +14 0 doc5044 1 +14 0 doc4627 1 +14 0 doc4683 1 +14 0 doc15126 1 +14 0 doc4538 1 +14 0 doc273 1 +14 0 doc19585 1 +14 0 doc16078 1 +14 0 doc4529 1 +14 0 doc4186 1 +14 0 doc12961 1 +14 0 doc19217 1 +14 0 doc5670 1 +14 0 doc1699 1 +14 0 doc4716 1 +14 0 doc12644 1 +14 0 doc18387 1 +14 0 doc336 1 +14 0 doc16130 1 +14 0 doc18718 1 +14 0 doc12527 1 +14 0 doc11797 1 +14 0 doc11831 1 +14 0 doc7538 1 +14 0 doc17259 1 +14 0 doc18724 1 +14 0 doc19330 1 +14 0 doc19206 1 + + + +15 0 doc12198 1 +15 0 doc20371 1 +15 0 doc2947 1 +15 0 doc10750 1 +15 0 doc7239 1 +15 0 doc14189 1 +15 0 doc19474 1 +15 0 doc14776 1 +15 0 doc21270 1 +15 0 doc6387 1 +15 0 doc12908 1 +15 0 doc9573 1 +15 0 doc17102 1 +15 0 doc21482 1 +15 0 doc6524 1 +15 0 doc18034 1 +15 0 doc1358 1 +15 0 doc13147 1 +15 0 doc17731 1 +15 0 doc12890 1 +15 0 doc20887 1 +15 0 doc19508 1 +15 0 doc18498 1 +15 0 doc20642 1 +15 0 doc19878 1 +15 0 doc6556 1 +15 0 doc10272 1 +15 0 doc5720 1 +15 0 doc17578 1 +15 0 doc17164 1 + + +# ----- m==0: recall is hurt, by adding fake docs as relevant + +16 0 fakedoc1 1 +16 0 fakedoc2 1 +16 0 fakedoc3 1 +16 0 fakedoc4 1 + +16 0 doc4043 1 +16 0 doc14985 1 +16 0 doc15370 1 +16 0 doc15426 1 +16 0 doc1702 1 +16 0 doc3062 1 +16 0 doc16134 1 +16 0 doc15037 1 +16 0 doc8224 1 +16 0 doc5044 1 +16 0 doc8545 1 +16 0 doc7228 1 +16 0 doc12686 1 +16 0 doc16609 1 +16 0 doc13161 1 +16 0 doc3446 1 +16 0 doc16493 1 +16 0 doc19297 1 +16 0 doc13619 1 +16 0 doc3281 1 +16 0 doc15499 1 +16 0 doc7373 1 +16 0 doc9064 1 +16 0 doc1710 1 +16 0 doc15411 1 +16 0 doc10890 1 +16 0 doc3166 1 +16 0 doc17894 1 +16 0 doc4560 1 +16 0 doc12766 1 + + +# --- m==1: precision is hurt, by unmarking relevant docs + +17 0 doc3117 0 +17 0 doc7477 0 +17 0 doc7569 0 +17 0 doc20667 0 +17 0 doc20260 1 +17 0 doc17355 1 +17 0 doc11021 1 +17 0 doc20934 1 +17 0 doc552 1 +17 0 doc20856 1 
+17 0 doc3524 1 +17 0 doc17343 1 +17 0 doc21055 1 +17 0 doc19032 1 +17 0 doc19786 1 +17 0 doc9281 1 +17 0 doc1695 1 +17 0 doc15940 1 +17 0 doc9215 1 +17 0 doc8335 1 +17 0 doc20936 1 +17 0 doc6914 1 +17 0 doc12122 1 +17 0 doc6618 1 +17 0 doc5049 1 +17 0 doc450 1 +17 0 doc19206 1 +17 0 doc18823 1 +17 0 doc5307 1 +17 0 doc17295 1 + + +# ---- m==2: both precision and recall are hurt. + +18 0 fakedoc1 1 +18 0 fakedoc2 1 +18 0 fakedoc3 1 +18 0 fakedoc4 1 + +18 0 doc8064 0 +18 0 doc18142 0 +18 0 doc19383 0 +18 0 doc21151 0 +18 0 doc4665 1 +18 0 doc2897 1 +18 0 doc6878 1 +18 0 doc14507 1 +18 0 doc2976 1 +18 0 doc11757 1 +18 0 doc12625 1 +18 0 doc14908 1 +18 0 doc12790 1 +18 0 doc17915 1 +18 0 doc11804 1 +18 0 doc12935 1 +18 0 doc8225 1 +18 0 doc18011 1 +18 0 doc10493 1 +18 0 doc17922 1 +18 0 doc1902 1 +18 0 doc14049 1 +18 0 doc1334 1 +18 0 doc1168 1 +18 0 doc4859 1 +18 0 doc7124 1 +18 0 doc9692 1 +18 0 doc18402 1 +18 0 doc9089 1 +18 0 doc15375 1 + + +19 0 doc5267 1 +19 0 doc2310 1 +19 0 doc11435 1 +19 0 doc15666 1 +19 0 doc12733 1 +19 0 doc7925 1 +19 0 doc2444 1 +19 0 doc4900 1 +19 0 doc10803 1 +19 0 doc8869 1 +19 0 doc5051 1 +19 0 doc9163 1 +19 0 doc529 1 +19 0 doc19546 1 +19 0 doc18561 1 +19 0 doc10634 1 +19 0 doc3979 1 +19 0 doc8833 1 +19 0 doc7652 1 +19 0 doc4804 1 +19 0 doc12616 1 +19 0 doc8419 1 +19 0 doc9431 1 +19 0 doc16235 1 +19 0 doc732 1 +19 0 doc2515 1 +19 0 doc7194 1 +19 0 doc16301 1 +19 0 doc4494 1 +19 0 doc4496 1 Property changes on: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java (revision 0) +++ 
contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java (revision 0) @@ -0,0 +1,178 @@ +package org.apache.lucene.benchmark.quality; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.PrintWriter; + +import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic; +import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; +import org.apache.lucene.benchmark.quality.Judge; +import org.apache.lucene.benchmark.quality.QualityQueriesReader; +import org.apache.lucene.benchmark.quality.QualityQuery; +import org.apache.lucene.benchmark.quality.QualityQueryParser; +import org.apache.lucene.benchmark.quality.QualityRun; +import org.apache.lucene.benchmark.quality.trec.JudgeTrec; +import org.apache.lucene.benchmark.quality.trec.TopicsReader; +import org.apache.lucene.benchmark.quality.utils.QualityStats; +import org.apache.lucene.benchmark.quality.utils.SimpleQQParser; +import org.apache.lucene.benchmark.quality.utils.SubmissionLogger; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.FSDirectory; + +import junit.framework.TestCase; + +/** + * Test that quality run 
does its job. + */ +public class TestQualityRun extends TestCase { + + private static boolean DEBUG = Boolean.getBoolean("tests.verbose"); + + /** + * @param arg0 + */ + public TestQualityRun(String name) { + super(name); + } + + public void testTrecQuality() throws Exception { + // first create the complete reuters index + createReutersIndex(); + + File workDir = new File(System.getProperty("benchmark.work.dir","work")); + assertTrue("Bad workDir: "+workDir, workDir.exists()&& workDir.isDirectory()); + + int maxResults = 1000; + String docNameField = "docid"; + + PrintWriter logger = DEBUG ? new PrintWriter(System.out,true) : null; + + // for topics/qrels files - src\test\org\apache\lucene\benchmark/quality + File srcTestDir = new File(new File(new File(new File(new File( + new File(new File(workDir.getAbsoluteFile().getParentFile(), + "src"),"test"),"org"),"apache"),"lucene"),"benchmark"),"quality"); + + // prepare topics + File topicsFile = new File(srcTestDir, "trecTopics.txt"); + assertTrue("Bad topicsFile: "+topicsFile, topicsFile.exists()&& topicsFile.isFile()); + QualityQueriesReader qReader = new TopicsReader(); + QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); + + // prepare judge + File qrelsFile = new File(srcTestDir, "trecQRels.txt"); + assertTrue("Bad qrelsFile: "+qrelsFile, qrelsFile.exists()&& qrelsFile.isFile()); + Judge judge = new JudgeTrec(new BufferedReader(new FileReader(qrelsFile))); + + // validate topics & judgments match each other + judge.validateData(qqs, logger); + + IndexSearcher searcher = new IndexSearcher(FSDirectory.getDirectory(new File(workDir,"index"))); + + QualityQueryParser qqParser = new SimpleQQParser("title","body"); + QualityRun qrun = new QualityRun(qqs, qqParser, searcher, docNameField); + + SubmissionLogger submitLog = DEBUG ? 
new SubmissionLogger(logger) : null; + QualityStats stats[] = qrun.runQualityTest(maxResults, judge, submitLog, logger); + + // --------- verify by the way judgments were altered for this test: + // for some queries, depending on m = qnum % 8 + // m==0: recall is hurt, by marking fake docs as relevant + // m==1: precision is hurt, by unmarking relevant docs + // m==2: both precision and recall are hurt. + // m>=3: these queries remain perfect + for (int i = 0; i < stats.length; i++) { + QualityStats s = stats[i]; + switch (i%8) { + + case 0: + assertEquals("avg-p should be perfect: "+s.getAvp(), 1.0, s.getAvp(), 1E-9); + assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall()); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9); + } + break; + + case 1: + assertTrue("avg-p should be hurt", 1.0 > s.getAvp()); + assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j)); + } + break; + + case 2: + assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp()); + assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall()); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j)); + } + break; + + default: { + assertEquals("avg-p should be perfect: "+s.getAvp(), 1.0, s.getAvp(), 1E-9); + assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9); + } + } + + } + } + + QualityStats avg = QualityStats.average(stats); + if (logger!=null) { + logger.println(); + logger.println("Average statistis:"); + 
avg.log(logger," "); + } + + assertTrue("mean avg-p should be hurt: "+avg.getAvp(), 1.0 > avg.getAvp()); + assertTrue("avg recall should be hurt: "+avg.getRecall(), 1.0 > avg.getRecall()); + for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { + assertTrue("avg p_at_"+j+" should be hurt: "+avg.getPrecisionAt(j), 1.0 > avg.getPrecisionAt(j)); + } + + + } + + // use benchmark logic to create the full Reuters index + private void createReutersIndex() throws Exception { + // 1. alg definition + String algLines[] = { + "# ----- properties ", + "doc.maker="+ReutersDocMaker.class.getName(), + "doc.add.log.step=2500", + "doc.term.vector=false", + "doc.maker.forever=false", + "directory=FSDirectory", + "doc.stored=false", + "doc.tokenized=true", + "# ----- alg ", + "ResetSystemErase", + "CreateIndex", + "{ AddDoc } : *", + "CloseIndex", + }; + + // 2. execute the algorithm (required in every "logic" test) + TestPerfTasksLogic.execBenchmark(algLines); + } +} Property changes on: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java ___________________________________________________________________ Name: svn:eol-style + native Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecTopics.txt =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecTopics.txt (revision 0) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecTopics.txt (revision 0) @@ -0,0 +1,276 @@ +# ----------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ----------------------------------------------------------------------- + +<top> +<num> Number: 0 + +<title> statement months total 1987 + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 1 + +<title> agreed 15 against five + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 2 + +<title> nine only month international + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 3 + +<title> finance any 10 government + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 4 + +<title> issue next years all + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 5 + +<title> who major ltd today + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 6 + +<title> business revs securities per + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 7 + +<title> quarter time note sales + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 8 + +<title> february earlier loss group + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 9 + +<title> out end made some + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 10 + +<title> spokesman financial 30 expected + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 11 + +<title> 1985 now prices due + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 12 + +<title> before board record could + +<desc> Description: 
+ + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 13 + +<title> pay debt because trade + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 14 + +<title> meeting increase four price + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 15 + +<title> chairman rate six interest + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 16 + +<title> since current between agreement + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 17 + +<title> oil we when president + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 18 + +<title> capital through foreign added + +<desc> Description: + + +<narr> Narrative: + + +</top> + +<top> +<num> Number: 19 + +<title> 20 while common week + +<desc> Description: + + +<narr> Narrative: + + +</top> Property changes on: contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecTopics.txt ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java =================================================================== --- contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 551794) +++ contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy) @@ -20,6 +20,9 @@ import java.io.StringReader; import org.apache.lucene.benchmark.byTask.Benchmark; +import org.apache.lucene.benchmark.byTask.feeds.DocData; +import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; +import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; @@ -130,8 +133,8 @@ // 1. 
alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", - "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker", - "doc.add.log.step=2697", + "doc.maker="+Reuters20DocMaker.class.getName(), + "doc.add.log.step=3", "doc.term.vector=false", "doc.maker.forever=false", "directory=FSDirectory", @@ -148,12 +151,12 @@ // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); - int ndocsExpected = 21578; // that's how many docs there are in the Reuters collecton. + int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); } // create the benchmark and execute it. - private Benchmark execBenchmark(String[] algLines) throws Exception { + public static Benchmark execBenchmark(String[] algLines) throws Exception { String algText = algLinesToText(algLines); logTstLogic(algText); Benchmark benchmark = new Benchmark(new StringReader(algText)); @@ -162,7 +165,7 @@ } // catenate alg lines to make the alg text - private String algLinesToText(String[] algLines) { + private static String algLinesToText(String[] algLines) { String indent = " "; StringBuffer sb = new StringBuffer(); for (int i = 0; i < propLines.length; i++) { @@ -174,11 +177,22 @@ return sb.toString(); } - private void logTstLogic (String txt) { + private static void logTstLogic (String txt) { if (!DEBUG) return; System.out.println("Test logic of:"); System.out.println(txt); } + /** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... 
*/ + public static class Reuters20DocMaker extends ReutersDocMaker { + private int nDocs=0; + protected DocData getNextDocData() throws Exception { + if (nDocs>=20 && !forever) { + throw new NoMoreDataException(); + } + nDocs++; + return super.getNextDocData(); + } + } } Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java (revision 0) @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.quality.utils; + +import java.io.IOException; + +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.search.Searcher; + +/** + * Utility: extract doc names from an index + */ +public class DocNameExtractor { + + private FieldSelector fldSel; + private String docNameField; + + public DocNameExtractor (final String docNameField) { + this.docNameField = docNameField; + fldSel = new FieldSelector() { + public FieldSelectorResult accept(String fieldName) { + return fieldName.equals(docNameField) ? + FieldSelectorResult.LOAD_AND_BREAK : + FieldSelectorResult.NO_LOAD; + } + }; + } + + public String docName(Searcher searcher, int docid) throws IOException { + return searcher.doc(docid,fldSel).get(docNameField); + } + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityStats.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityStats.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityStats.java (revision 0) @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Quality statistics of a single query, or (see {@link #average(QualityStats[])})
 * an aggregate over several queries.
 * <p>
 * Statistics are built by reporting, in increasing rank order, the 0-based
 * rank of each relevant document found in the results. From these the class
 * maintains recall, the sum of precisions at the relevant ranks, and the
 * precision at each of the first {@link #MAX_POINTS} ranks.
 */
public class QualityStats {

  /** Number of top ranks for which precision-at-rank is tracked. */
  public static final int MAX_POINTS = 20;

  // value that recall is measured against; 0 means "no positive judgments"
  private int maxRecall;
  private double recall;
  // pAt[i] is the precision over the top (i+1) results
  private double pAt[];
  // sum of the precisions measured at each relevant doc
  private double pSum;
  // number of relevant docs reported so far (double, to force real division)
  private double nPoints;

  private ArrayList recallPoints;
  // rank of the most recently reported relevant doc, -1 when there is none
  private int lastPoint = -1;

  /**
   * @param maxRecall the number that the count of relevant docs found is
   *        divided by to obtain recall; with 0, recall is always 0.
   */
  public QualityStats(int maxRecall) {
    this.maxRecall = maxRecall;
    this.recallPoints = new ArrayList();
    pAt = new double[MAX_POINTS];
  }

  /**
   * Report that the result at (0-based) rank <code>n</code> is relevant.
   * Must be called with strictly increasing ranks.
   *
   * @param n 0-based rank of the relevant document
   * @throws IllegalArgumentException if <code>n</code> is not greater than
   *         the previously reported rank (this includes negative ranks).
   */
  public void addRelevantDoc(int n) {
    // checked unconditionally: with a bare assert, a bad rank would silently
    // corrupt the statistics whenever assertions are disabled
    if (n <= lastPoint) {
      throw new IllegalArgumentException("cannot be called with "+n+" after "+lastPoint);
    }
    recallPoints.add(new Integer(n));
    nPoints+=1;
    double p = nPoints / (n+1);
    pSum += p;
    // Ranks between the previous relevant doc and n already hold the
    // precision for the old count. From n on, the new count applies, and it
    // keeps applying to all following ranks until another relevant doc is
    // reported - so precision past the last relevant doc is nPoints/rank
    // rather than a stale 0.
    for (int i=n; i<pAt.length; i++) {
      pAt[i] = nPoints / (i+1);
    }
    lastPoint = n;
    recall = maxRecall==0 ? 0 : nPoints/maxRecall;
  }

  /**
   * @param n 1-based rank, in the range [1,{@link #MAX_POINTS}]
   * @return precision over the top <code>n</code> results
   * @throws IllegalArgumentException if <code>n</code> is out of range
   */
  public double getPrecisionAt(int n) {
    if (n<1 || n>pAt.length) {
      throw new IllegalArgumentException("n="+n+" - but it must be in [1,"+pAt.length+"] range!");
    }
    return pAt[n-1];
  }

  /**
   * @return the sum of precisions at the relevant ranks divided by the
   *         number of relevant docs found, or 0 if none was found.
   */
  public double getAvp() {
    return nPoints>0 ? pSum/nPoints : 0;
  }

  /**
   * @return relevant docs found divided by <code>maxRecall</code>
   *         (0 when <code>maxRecall</code> is 0).
   */
  public double getRecall() {
    return recall;
  }

  /**
   * Print the statistics, one measure per line.
   * @param logger where to print
   * @param prefix prepended to every printed line
   */
  public void log(PrintWriter logger, String prefix) {
    logger.println(prefix+"nPoints: "+nPoints);
    logger.println(prefix+"Average Precision: "+getAvp());
    logger.println(prefix+"Total Recall: "+getRecall());
    for (int i=0; i<pAt.length; i++) {
      logger.println(prefix+"pAt_"+(i+1)+": "+pAt[i]);
    }
  }

  /**
   * Average the statistics of several queries. Queries with
   * <code>maxRecall</code> of 0 (no positive judgments) are ignored.
   * Each internal sum is averaged separately, so {@link #getAvp()} of the
   * result is the ratio of the averaged precision sum to the averaged
   * number of relevant docs found.
   *
   * @param stats per-query statistics
   * @return the averaged statistics
   * @throws IllegalArgumentException if no query has positive judgments
   */
  public static QualityStats average(QualityStats[] stats) {
    QualityStats avg = new QualityStats(0);
    int nGood = 0; // queries with positive judgements
    // aggregate
    for (int i=0; i<stats.length; i++) {
      if (stats[i].maxRecall>0) {
        nGood++;
        avg.nPoints += stats[i].nPoints;
        avg.pSum += stats[i].pSum;
        avg.recall += stats[i].recall;
        avg.maxRecall += stats[i].maxRecall;
        for (int j=0; j<avg.pAt.length; j++) {
          avg.pAt[j] += stats[i].pAt[j];
        }
      }
    }
    // fail with a clear message, also when assertions are disabled, instead
    // of running into an integer division by zero below
    if (nGood==0) {
      throw new IllegalArgumentException("Fishy: no \"good\" queries!");
    }
    // take average: all measures go by "good" queries only.
    avg.nPoints /= nGood;
    avg.pSum /= nGood;
    avg.recall /= nGood;
    avg.maxRecall /= nGood;
    for (int j=0; j<avg.pAt.length; j++) {
      avg.pAt[j] /= nGood;
    }

    return avg;
  }

}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.utils; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.benchmark.quality.QualityQuery; +import org.apache.lucene.benchmark.quality.QualityQueryParser; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Query; + +/** + * Simplistic quality query parser + */ +public class SimpleQQParser implements QualityQueryParser { + + private String qqField; + private String indexField; + ThreadLocal queryParser = new ThreadLocal(); + + /** + * Constructor + * @param qqField field of quality query to use for creating the query + * @param indexField corresponding index field + */ + public SimpleQQParser(String qqField, String indexField) { + this.qqField = qqField; + this.indexField = indexField; + } + + /* (non-Javadoc) + * @see org.apache.lucene.benchmark.quality.QualityQueryParser#parse(org.apache.lucene.benchmark.quality.QualityQuery) + */ + public Query parse(QualityQuery qq) throws ParseException { + QueryParser qp = (QueryParser) queryParser.get(); + if (qp==null) { + qp = new QueryParser(indexField, new StandardAnalyzer()); + queryParser.set(qp); + } + return qp.parse(qq.getField(qqField)); + } + +} Property 
changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionLogger.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionLogger.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionLogger.java (revision 0) @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.benchmark.quality.utils; + +import java.io.IOException; +import java.io.PrintWriter; +import java.text.NumberFormat; + +import org.apache.lucene.benchmark.quality.QualityQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TopDocs; + +/** + * Create a log ready for submission + */ +public class SubmissionLogger { + + NumberFormat nf; + private PrintWriter logger; + + public SubmissionLogger (PrintWriter logger) { + this.logger = logger; + nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(4); + nf.setMinimumFractionDigits(4); + } + + public void report(QualityQuery qq, TopDocs td, String docNameField, Searcher searcher) throws IOException { + if (logger==null) { + return; + } + ScoreDoc sd[] = td.scoreDocs; + String sep = " \t "; + DocNameExtractor xt = new DocNameExtractor(docNameField); + for (int i=0; i<sd.length; i++) { + String docName = xt.docName(searcher,sd[i].doc); + logger.println( + qq.getQueryID() + sep + + '0' + sep + + format(docName,20) + sep + + format(""+i,7) + sep + + nf.format(sd[i].score) + ); + } + } + + private static String padd = " "; + private String format(String s, int minLen) { + s = (s==null ? 
"" : s); + int n = Math.max(minLen,s.length()); + return (s+padd).substring(0,n); + } +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionLogger.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java (revision 0) @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.utils; + +import java.io.File; +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.PriorityQueue; + +/** + * Suggest Quality queries based on an index contents. 
+ */ +public class QualityQueriesFinder { + + private static final String newline = System.getProperty("line.separator"); + private Directory dir; + + public QualityQueriesFinder(Directory dir) { + this.dir = dir; + } + + /** + * @param args <index-dir> + * @throws IOException + */ + public static void main(String[] args) throws IOException { + if (args.length<1) { + System.err.println("Usage: java QualityQueriesFinder <index-dir>"); + System.exit(1); + } + QualityQueriesFinder qqf = new QualityQueriesFinder(FSDirectory.getDirectory(new File(args[0]))); + String q[] = qqf.bestQueries("body",20); + for (int i=0; i<q.length; i++) { + System.out.println(newline+formatQueryAsTrecTopic(i,q[i],null,null)); + } + } + + public String [] bestQueries(String field,int numQueries) throws IOException { + String words[] = bestTerms("body",4*numQueries); + int n = words.length; + int m = n/4; + String res[] = new String[m]; + for (int i=0; i<res.length; i++) { + res[i] = words[i] + " " + words[m+i]+ " " + words[n-1-m-i] + " " + words[n-1-i]; + //System.out.println("query["+i+"]: "+res[i]); + } + return res; + } + + public static String formatQueryAsTrecTopic (int qnum, String title, String description, String narrative) { + return + "<top>" + newline + + "<num> Number: " + qnum + newline + newline + + "<title> " + (title==null?"":title) + newline + newline + + "<desc> Description:" + newline + + (description==null?"":description) + newline + newline + + "<narr> Narrative:" + newline + + (narrative==null?"":narrative) + newline + newline + + "</top>"; + } + + public String [] bestTerms(String field,int numTerms) throws IOException { + PriorityQueue pq = new TermsDfQueue(numTerms); + IndexReader ir = IndexReader.open(dir); + try { + int threshold = ir.maxDoc() / 10; // ignore words too common. 
+ TermEnum terms = ir.terms(new Term(field,"")); + while (terms.next()) { + if (!field.equals(terms.term().field())) { + break; + } + int df = terms.docFreq(); + if (df<threshold) { + String ttxt = terms.term().text(); + pq.insert(new TermDf(ttxt,df)); + } + } + } finally { + ir.close(); + } + String res[] = new String[pq.size()]; + int i = 0; + while (pq.size()>0) { + TermDf tdf = (TermDf) pq.pop(); + res[i++] = tdf.word; + System.out.println(i+". word: "+tdf.df+" "+tdf.word); + } + return res; + } + + private static class TermDf { + String word; + int df; + TermDf (String word, int freq) { + this.word = word; + this.df = freq; + } + } + + private static class TermsDfQueue extends PriorityQueue { + TermsDfQueue (int maxSize) { + initialize(maxSize); + } + protected boolean lessThan(Object a, Object b) { + TermDf tf1 = (TermDf) a; + TermDf tf2 = (TermDf) b; + return tf1.df < tf2.df; + } + } + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueriesReader.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueriesReader.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueriesReader.java (revision 0) @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import java.io.BufferedReader; +import java.io.IOException; + + +/** + * Read Quality test queries. + */ +public interface QualityQueriesReader { + + public QualityQuery [] readQueries(BufferedReader reader) throws IOException; + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueriesReader.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java (revision 0) @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.Query; + +/** + * parse a QualityQuery into a Lucene query. + */ +public interface QualityQueryParser { + + public Query parse(QualityQuery qq) throws ParseException; + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityRun.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityRun.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityRun.java (revision 0) @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import java.io.IOException; +import java.io.PrintWriter; + +import org.apache.lucene.benchmark.quality.utils.DocNameExtractor; +import org.apache.lucene.benchmark.quality.utils.QualityStats; +import org.apache.lucene.benchmark.quality.utils.SubmissionLogger; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TopDocs; + +/** + * Run a quality test + * + */ +public class QualityRun { + + protected QualityQuery qualityQueries[]; + protected QualityQueryParser qqParser; + protected Searcher searcher; + protected String docNameField; + + public QualityRun(QualityQuery qqs[], QualityQueryParser qqParser, Searcher searcher, String docNameField) throws Exception { + this.qualityQueries = qqs; + this.qqParser = qqParser; + this.searcher = searcher; + this.docNameField = docNameField; + } + + public QualityStats [] runQualityTest(int maxResults, Judge judge, SubmissionLogger submitLog, PrintWriter qualityLog) throws Exception { + QualityStats stats[] = new QualityStats[qualityQueries.length]; + for (int i=0; i<qualityQueries.length; i++) { + QualityQuery qq = qualityQueries[i]; + // generate query + Query q = qqParser.parse(qq); + // search with this query + TopDocs td = searcher.search(q,null,maxResults); + //most likely we either submit or judge, but check both + if (judge!=null) { + stats[i] = analyzeQueryResults(qq, q, td, judge, qualityLog); + } + if (submitLog!=null) { + 
submitLog.report(qq,td,docNameField,searcher); + } + } + return stats; + } + + private QualityStats analyzeQueryResults(QualityQuery qq, Query q, TopDocs td, Judge judge, PrintWriter logger) throws IOException { + QualityStats stts = new QualityStats(judge.maxRecall(qq)); + ScoreDoc sd[] = td.scoreDocs; + DocNameExtractor xt = new DocNameExtractor(docNameField); + for (int i=0; i<sd.length; i++) { + String docName = xt.docName(searcher,sd[i].doc); + if (judge.isRelevant(docName,qq)) { + stts.addRelevantDoc(i); + } + } + if (logger!=null) { + logger.println(); + logger.println(qq.getQueryID()+" - "+q); + logger.println(); + stts.log(logger," "); + } + return stts; + } + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityRun.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TopicsReader.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TopicsReader.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TopicsReader.java (revision 0) @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.trec; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import org.apache.lucene.benchmark.quality.QualityQueriesReader; +import org.apache.lucene.benchmark.quality.QualityQuery; + +/** + * Read TREC topics. + * <p> + * Expects this topic format - + * <pre> + * <top> + * <num> Number: nnn + * + * <title> title of the topic + * + * <desc> Description: + * description of the topic + * + * <narr> Narrative: + * "story" composed by assessors. + * + * </top> + * </pre> + * Comment lines starting with '#' are ignored. + */ +public class TopicsReader implements QualityQueriesReader { + + private static final String newline = System.getProperty("line.separator"); + + /** + * Constructor for Trec's TopicsReader + */ + public TopicsReader() { + super(); + } + + /* (non-Javadoc) + * @see org.apache.lucene.benchmark.quality.utils.QualityQueriesReader#readQueries(java.io.BufferedReader) + */ + public QualityQuery[] readQueries(BufferedReader reader) throws IOException { + ArrayList res = new ArrayList(); + StringBuffer sb; + try { + while (null!=(sb=read(reader,"<top>",null,false,false))) { + HashMap fields = new HashMap(); + // id + sb = read(reader,"<num>",null,true,false); + int k = sb.indexOf(":"); + String id = sb.substring(k+1).trim(); + // title + sb = read(reader,"<title>",null,true,false); + k = sb.indexOf(">"); + String title = sb.substring(k+1).trim(); + // description + sb = read(reader,"<desc>",null,false,false); + sb = read(reader,"<narr>",null,false,true); + String descripion = sb.toString().trim(); + // we got a topic! 
+ fields.put("title",title); + fields.put("description",descripion); + QualityQuery topic = new QualityQuery(id,fields); + res.add(topic); + // skip narrative, get to end of doc + read(reader,"</top>",null,false,false); + } + } finally { + reader.close(); + } + // sort result array (by ID) + QualityQuery qq[] = (QualityQuery[]) res.toArray(new QualityQuery[0]); + Arrays.sort(qq); + return qq; + } + + // read until finding a line that starts with the specified prefix + private StringBuffer read (BufferedReader reader, String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws IOException { + sb = (sb==null ? new StringBuffer() : sb); + String sep = ""; + while (true) { + String line = reader.readLine(); + if (line==null) { + return null; + } + if (line.startsWith(prefix)) { + if (collectMatchLine) { + sb.append(sep+line); + sep = newline; + } + break; + } + if (collectAll) { + sb.append(sep+line); + sep = newline; + } + } + //System.out.println("read: "+sb); + return sb; + } +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TopicsReader.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/JudgeTrec.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/JudgeTrec.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/JudgeTrec.java (revision 0) @@ -0,0 +1,155 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality.trec; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.StringTokenizer; + +import org.apache.lucene.benchmark.quality.Judge; +import org.apache.lucene.benchmark.quality.QualityQuery; + +/** + * Judge if given document is relevant to given quality query. + */ +public class JudgeTrec implements Judge { + + HashMap judgements; + + /** + * Constructor from a reader. 
+ * <p> + * Expected input format: + * <pre> + * qnum 0 doc-name is-relevant + * </pre> + * Two sample lines: + * <pre> + * 19 0 doc303 1 + * 19 0 doc7295 0 + * </pre> + * @param reader + * @throws IOException + */ + public JudgeTrec (BufferedReader reader) throws IOException { + judgements = new HashMap(); + QRelJudgement curr = null; + String zero = "0"; + String line; + + try { + while (null!=(line=reader.readLine())) { + line = line.trim(); + if (line.length()==0 || '#'==line.charAt(0)) { + continue; + } + StringTokenizer st = new StringTokenizer(line); + String queryID = st.nextToken(); + st.nextToken(); + String docName = st.nextToken(); + boolean relevant = !zero.equals(st.nextToken()); + assert !st.hasMoreTokens() : "wrong format: "+line+" next: "+st.nextToken(); + if (relevant) { // only keep relevant docs + if (curr==null || !curr.queryID.equals(queryID)) { + curr = (QRelJudgement)judgements.get(queryID); + if (curr==null) { + curr = new QRelJudgement(queryID); + judgements.put(queryID,curr); + } + } + curr.addRelevandDoc(docName); + } + } + } finally { + reader.close(); + } + } + + public boolean isRelevant(String docName, QualityQuery query) { + QRelJudgement qrj = (QRelJudgement) judgements.get(query.getQueryID()); + return qrj!=null && qrj.isRelevant(docName); + } + + /** single Judgement of a trec quality query */ + private static class QRelJudgement { + private String queryID; + private HashMap relevantDocs; + + QRelJudgement(String queryID) { + this.queryID = queryID; + relevantDocs = new HashMap(); + } + + public void addRelevandDoc(String docName) { + relevantDocs.put(docName,docName); + } + + boolean isRelevant(String docName) { + return relevantDocs.containsKey(docName); + } + + public int maxRecall() { + return relevantDocs.size(); + } + } + + public boolean validateData(QualityQuery[] qq, PrintWriter logger) { + HashMap missingQueries = (HashMap) judgements.clone(); + ArrayList missingJudgements = new ArrayList(); + for (int i=0; i<qq.length; 
i++) { + String id = qq[i].getQueryID(); + if (missingQueries.containsKey(id)) { + missingQueries.remove(id); + } else { + missingJudgements.add(id); + } + } + boolean isValid = true; + if (missingJudgements.size()>0) { + isValid = false; + if (logger!=null) { + logger.println("WARNING: "+missingJudgements.size()+" queries have no judgments! - "); + for (int i=0; i<missingJudgements.size(); i++) { + logger.println(" "+(String)missingJudgements.get(i)); + } + } + } + if (missingQueries.size()>0) { + isValid = false; + if (logger!=null) { + logger.println("WARNING: "+missingQueries.size()+" judgments match no query! - "); + for (Iterator it = missingQueries.keySet().iterator(); it.hasNext();) { + String id = (String) it.next(); + logger.println(" "+id); + } + } + } + return isValid; + } + + public int maxRecall(QualityQuery query) { + QRelJudgement qrj = (QRelJudgement) judgements.get(query.getQueryID()); + if (qrj!=null) { + return qrj.maxRecall(); + } + return 0; + } +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/JudgeTrec.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java (revision 0) @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import java.io.PrintWriter; + +/** + * Judge if a document is relevant for a quality query. + */ +public interface Judge { + + public boolean isRelevant(String docName, QualityQuery query); + + public boolean validateData (QualityQuery qq[], PrintWriter logger); + + public int maxRecall (QualityQuery query); + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java (revision 0) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java (revision 0) @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.benchmark.quality; + +import java.util.Map; + +/** + * A QualityQuery has an ID and some fields. + * <p> + * The ID allows to map the quality query with its judgements. + * The fields are named parts of the quality query representation. + * <p> + * Borrowing from TREC notions, possible field names may be "title", + * "decription", etc., but anything goes, and so a quality query does + * not have to be a TREC query. + */ +public class QualityQuery implements Comparable { + private String queryID; + private Map queryFields; + + public QualityQuery(String queryID, Map queryFields) { + this.queryID = queryID; + this.queryFields = queryFields; + } + + public String[] getFieldNames() { + return (String[]) queryFields.keySet().toArray(new String[0]); + } + + public String getField(String fieldName) { + return (String) queryFields.get(fieldName); + } + + public String getQueryID() { + return queryID; + } + + // for a nicer sort of input queries before running them + public int compareTo(Object o) { + QualityQuery other = (QualityQuery) o; + try { + // compare as ints when ids ints + int n = Integer.parseInt(queryID); + int nOther = Integer.parseInt(other.queryID); + return n - nOther; + } catch (NumberFormatException e) { + // fall back to string comparison + return queryID.compareTo(other.queryID); + } + } + +} Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java ___________________________________________________________________ Name: svn:executable + * Name: svn:eol-style + 
native Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java =================================================================== --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (revision 551794) +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (working copy) @@ -22,6 +22,9 @@ import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; import java.io.File; import java.io.UnsupportedEncodingException; @@ -97,10 +100,10 @@ private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { int docid = incrNumDocsCreated(); Document doc = new Document(); - doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal)); + doc.add(new Field("docid", "doc"+docid, Store.YES, Index.NO, TermVector.NO)); if (docData.getName()!=null) { String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt); - doc.add(new Field("docname", name, storeVal, indexVal, termVecVal)); + doc.add(new Field("docname", name, Store.YES, Index.NO, TermVector.NO)); } if (docData.getDate()!=null) { String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);