Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 908159) +++ CHANGES.txt (working copy) @@ -27,6 +27,10 @@ * LUCENE-2158: At high indexing rates, NRT reader could temporarily lose deletions. (Mike McCandless) + * LUCENE-2257: Increase max number of unique terms in one segment to + termIndexInterval (default 128) * ~2.1 billion = ~274 billion. + (Tom Burton-West via Mike McCandless) + API Changes * LUCENE-2182: DEFAULT_ATTRIBUTE_FACTORY was failing to load Index: src/java/org/apache/lucene/index/SegmentTermEnum.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermEnum.java (revision 908410) +++ src/java/org/apache/lucene/index/SegmentTermEnum.java (working copy) @@ -108,7 +108,7 @@ return clone; } - final void seek(long pointer, int p, Term t, TermInfo ti) + final void seek(long pointer, long p, Term t, TermInfo ti) throws IOException { input.seek(pointer); position = p; Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 908410) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -170,7 +170,7 @@ private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], - (indexOffset * totalIndexInterval) - 1, + ((long) indexOffset * totalIndexInterval) - 1, indexTerms[indexOffset], indexInfos[indexOffset]); } @@ -241,28 +241,6 @@ return ti; } - /** Returns the nth term in the set. */ - final Term get(int position) throws IOException { - if (size == 0) return null; - - SegmentTermEnum enumerator = getThreadResources().termEnum; - if (enumerator.term() != null && - position >= enumerator.position && - position < (enumerator.position + totalIndexInterval)) - return scanEnum(enumerator, position); // can avoid seek - - seekEnum(enumerator, position/totalIndexInterval); // must seek - return scanEnum(enumerator, position); - } - - private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException { - while(enumerator.position < position) - if (!enumerator.next()) - return null; - - return enumerator.term(); - } - private void ensureIndexIsRead() { if (indexTerms == null) { throw new IllegalStateException("terms index was not loaded when this reader was created"); Index: src/site/src/documentation/content/xdocs/fileformats.xml =================================================================== --- src/site/src/documentation/content/xdocs/fileformats.xml (revision 908159) +++ src/site/src/documentation/content/xdocs/fileformats.xml (working copy) @@ -1845,11 +1845,12 @@

When referring to term numbers, Lucene's current - implementation uses a Java int, which means - the maximum number of unique terms in any single index - segment is 2,147,483,648. This is technically not a - limitation of the index file format, just of Lucene's - current implementation. + implementation uses a Java int to hold the + term index, which means the maximum number of unique + terms in any single index segment is ~2.1 billion times + the term index interval (default 128) = ~274 billion. + This is technically not a limitation of the index file + format, just of Lucene's current implementation.

Similarly, Lucene uses a Java int to refer Index: docs/fileformats.pdf =================================================================== --- docs/fileformats.pdf (revision 908159) +++ docs/fileformats.pdf (working copy) @@ -667,10 +667,10 @@ >> endobj 111 0 obj -<< /Length 1801 /Filter [ /ASCII85Decode /FlateDecode ] +<< /Length 1845 /Filter [ /ASCII85Decode /FlateDecode ] >> stream -Gatm<>E@c%'Ro4HjK13fG\lPL)aB'A"VfQS_N=$kG,NXdH;*]f';idMD;tC&-V:5k;U.'/tp@-$LTH7^Q,1p?N)YM*ArPAGr!8(0!,&.e*ZcV$F&^;F6Ba=Bj-erc@iWHmO",>o\fW2U#KJ8%,]-0<;4\qcMW)dW(W4r')L'%gC?hm*f`HWnV%dbVZG#-_jUj#'P\O_oZ'&jIQr]%54&Rn#K1_F/7O2B740fG=lc[4Ro3YmceBq7Jn539+CM0NV+[Zoel+/JaFIhGpmP4I%\.R[ZC5-ce+e*fLJ4fAT\@3QgGc1u-7W7h"cbCre?E8TM&At8O!?\Zsm0q0)\(>&=We,.h1o`pZE7qOk1rFOCj3(%1n;J"B0i&M[BePZMTY8(,j_P+]_P5I?\J/(K8Z6DjQ_^H`SEO/Y">"_S/9Z@,81;O;nec;K.Sh35ViII!^N'h`d&Fa!oTbnAF-)SW:(h*^RW!ZkFMnf4=9B6Til(FqYcoO?tniTMb>_+7;LbV\NtQAdON4g_="n/dHr.9:I`img>pmXGCoL.5dT;_%U,hKC-2."mArlK#2QtTjDcU^OZijUGX1BS&fM_IqtMc4YW2pchD7rNX.bkiX:CO+B#XE_l!(M&o8,(lC&:VFoWIf2JGO'0#O'r5)PBdF3_]8*R*2#c0Gb-m"MMVl"4r1ri%&PJjQZZUW)F>7c#SGN$u)DUp-W7j\O%/N"t%)+rC6*J5b&i/rg?gf)qQ>0oC<>?l$4CDGTap@Wn1#_`e#([onhm(7l/DLQ].K%&"i%@qog]SY54_K@#]&\CdSEi2a6_A-\ptml007OI2_pGssbFA@6[laqdrV2`R)'V"YX4fS*\3f>,V>4#L01afe)u\m0--fn,sQ)-]FFs,o^4H.qb\/+2kb`6l+lrN;Y4Gh8JS#)F1%g,CT;[NS:Jn>_(cEbKaGProLPjEZK-n>=?sW2b%F]e/'&0&"]e\7%IOAG78TS[DN8C+WmlWBfB"=LHK?[dUc'ujV5J\KMr?G`JG;6k+/'3XN&F5`o:W;koaM4seWJhHt0u4mEMOr5Bo>0`iiqk,6HmeKLq4s5I7W&`*]&'g1*_%_OUd!0NMK,:Za1Fo%!78.6-^dpe7i'q*bg[A:V/te//^*dZeQmd3b+3">0,"]'IfX.'6n&~> +Gatm<>BAg]'RnB3n/LcAW02O@J/Pq745JTKec65F#:5KGZ*oni-#N@,hi;M#d?,Yd>@9L+Eb+E>^2oQXAs9^1ai=_XQ0'^&@dco^idiB%[<2Vm-.F?@O`+.)LNo/%e)!DF@c9H;iPf\X6.,28n**]:&i5-/YXjesG6chX''KWD??0@/BgBgPg/i+("o=j2NX(P0'ODl8@)CV8'q/0JJ'qT,2dZj@eRhl2)eG5hql,[U'&GA5g]tBkWd"ht*C]:.X,IJD:$Y8*\o"9@*4#=aI`*i2[+QM-hWkQX:B7;*(I#aIE4Y-7]I@hlm_I=Ys^3AIVYCG^U?e;aZ/$Qjg2,^d?/3#mjXsjHcBla:tk5Zc1Wp9=8F?5!XlfGL:G5q0gNL#ZR+WL!h;L)l;MY.r[`SLg6)`G/k_:L[KZnU*72gF,.XiE3`m5#7f1@JS.WIo@efS#26'!,qgh`+lfY'KqWP4WnNG$TdmPA&P1bNXJEpZ\B5_g1KufB?AQgcV/oL3562M1$jDUZ1WCYN83`gae1`4ETh\sT\c[KpZmarCa$tR*@=Y;>DfdgNV-pfL=BX>945KHrX\,OeJpGQrNP66?Z4=7n(UJ]F!FJr.#+@(,[$AS,(A(1%\_5*=SV2gop=(.1,XC28DF:0,;X?n:1j]'Wb.SUFW*)2i<_uNVVVkC$3J`Vrg?RV1/(db)*gG\kW84OJT(0*+D9"Le6*KPe'UFCGc%8Uf"`,Uc=9Bb\NVp6YGICZ%"(U$^I)i[3O21F*H=B\^-/',Lfd)SVJd[/QeTj/7!KC?b2Sr8-R&2$!%f9\.Z;KjY/\pF]/`6V/E$iU0aau[6U,bNl>lkro4uNKp7S4\bE2qFEc1sV3VnHsGg=oj%ebEhM2anjEk9A5(cdng-*M"4t5uKquQQ_a69:iC8#Ds(]^mtB.WYGadC0Adc)WACId3qH"_<63NQG`$BeI4^mk/[+:Hd>#e@!eWJq1$^qlA*I$M'_Vg+Q_O?7#,E2C?@^ATK_ZF85?IOS2(1O_eSX_NLU4h_NH#b`4qeD:hJW!qA#HkL"n)KBH@-.LpP>=ki2,O,5a/`l#J;k\mu>9-Z2""?/*82.L2pc7M[%J+,^%=DgZ1dXbF=T#uc3+nuNVAp&*oI>jo`AQCR[td7;TqaI0,q(ah.jdG(2!m_[[pb3OpT`#>323Wj*LEW=Faj\2j3$aQbKW?iCt:Ss^4Pb5'-*c,geVDUOr:]L=E^(LLf endstream endobj 112 0 obj @@ -1206,80 +1206,80 @@ xref 0 153 0000000000 65535 f -0000054446 00000 n -0000054652 00000 n -0000054745 00000 n +0000054490 00000 n +0000054696 00000 n +0000054789 00000 n 0000000015 00000 n 0000000071 00000 n 0000001333 00000 n 0000001453 00000 n 0000001639 00000 n -0000054897 00000 n +0000054941 00000 n 0000001774 00000 n -0000054960 00000 n +0000055004 00000 n 0000001909 00000 n -0000055026 00000 n +0000055070 00000 n 0000002046 00000 n -0000055090 00000 n +0000055134 00000 n 0000002183 00000 n -0000055156 00000 n +0000055200 00000 n 0000002320 00000 n -0000055222 00000 n +0000055266 00000 n 0000002457 00000 n -0000055288 00000 n +0000055332 00000 n 0000002594 00000 n -0000055352 00000 n +0000055396 00000 n 0000002731 00000 n -0000055416 00000 n +0000055460 00000 n 0000002868 00000 n -0000055482 00000 n +0000055526 00000 n 0000003005 00000 n -0000055547 00000 n +0000055591 00000 n 0000003142 00000 n -0000055613 00000 n +0000055657 00000 n 0000003279 00000 n -0000055679 00000 n +0000055723 00000 n 0000003416 00000 n -0000055744 00000 n +0000055788 00000 n 0000003553 00000 n -0000055810 00000 n +0000055854 00000 n 0000003690 00000 n -0000055874 00000 n +0000055918 00000 n 0000003826 00000 n -0000055938 00000 n +0000055982 00000 n 0000003963 00000 n -0000056004 00000 n +0000056048 00000 n 0000004100 00000 n -0000056070 00000 n +0000056114 00000 n 0000004237 00000 n -0000056135 00000 n +0000056179 00000 n 0000004373 00000 n -0000056201 00000 n +0000056245 00000 n 0000004510 00000 n -0000056265 00000 n +0000056309 00000 n 0000004647 00000 n -0000056331 00000 n +0000056375 00000 n 0000004783 00000 n -0000056397 00000 n +0000056441 00000 n 0000004920 00000 n 0000005673 00000 n 0000005796 00000 n 0000005872 00000 n -0000056461 00000 n +0000056505 00000 n 0000006004 00000 n -0000056527 00000 n +0000056571 00000 n 0000006137 00000 n -0000056591 00000 n +0000056635 00000 n 0000006270 00000 n -0000056656 00000 n +0000056700 00000 n 0000006403 00000 n -0000056721 00000 n +0000056765 00000 n 0000006536 00000 n -0000056786 00000 n +0000056830 00000 n 0000006669 00000 n -0000056851 00000 n +0000056895 00000 n 0000006801 00000 n -0000056916 00000 n +0000056960 00000 n 0000006934 00000 n 0000009083 00000 n 0000009191 00000 n @@ -1317,47 +1317,47 @@ 0000043725 00000 n 0000045456 00000 n 0000045566 00000 n -0000047461 00000 n -0000056981 00000 n -0000047571 00000 n -0000047771 00000 n -0000047989 00000 n -0000048195 00000 n -0000048403 00000 n -0000048571 00000 n -0000048771 00000 n -0000048929 00000 n -0000049104 00000 n -0000049367 00000 n -0000049608 00000 n -0000049737 00000 n -0000049891 00000 n -0000050045 00000 n -0000050189 00000 n -0000050339 00000 n -0000050480 00000 n -0000050715 00000 n -0000050910 00000 n -0000051150 00000 n -0000051332 00000 n -0000051505 00000 n -0000051708 00000 n -0000051896 00000 n -0000052148 00000 n -0000052289 00000 n -0000052498 00000 n -0000052684 00000 n -0000052858 00000 n -0000053103 00000 n -0000053294 00000 n -0000053500 00000 n -0000053666 00000 n -0000053780 00000 n -0000053891 00000 n -0000054003 00000 n -0000054112 00000 n -0000054219 00000 n -0000054336 00000 n +0000047505 00000 n +0000057025 00000 n +0000047615 00000 n +0000047815 00000 n +0000048033 00000 n +0000048239 00000 n +0000048447 00000 n +0000048615 00000 n +0000048815 00000 n +0000048973 00000 n +0000049148 00000 n +0000049411 00000 n +0000049652 00000 n +0000049781 00000 n +0000049935 00000 n +0000050089 00000 n +0000050233 00000 n +0000050383 00000 n +0000050524 00000 n +0000050759 00000 n +0000050954 00000 n +0000051194 00000 n +0000051376 00000 n +0000051549 00000 n +0000051752 00000 n +0000051940 00000 n +0000052192 00000 n +0000052333 00000 n +0000052542 00000 n +0000052728 00000 n +0000052902 00000 n +0000053147 00000 n +0000053338 00000 n +0000053544 00000 n +0000053710 00000 n +0000053824 00000 n +0000053935 00000 n +0000054047 00000 n +0000054156 00000 n +0000054263 00000 n +0000054380 00000 n trailer << /Size 153 @@ -1365,5 +1365,5 @@ /Info 4 0 R >> startxref -57035 +57079 %%EOF Index: docs/fileformats.html =================================================================== --- docs/fileformats.html (revision 908159) +++ docs/fileformats.html (working copy) @@ -2547,11 +2547,12 @@

When referring to term numbers, Lucene's current - implementation uses a Java int, which means - the maximum number of unique terms in any single index - segment is 2,147,483,648. This is technically not a - limitation of the index file format, just of Lucene's - current implementation. + implementation uses a Java int to hold the + term index, which means the maximum number of unique + terms in any single index segment is ~2.1 billion times + the term index interval (default 128) = ~274 billion. + This is technically not a limitation of the index file + format, just of Lucene's current implementation.

Similarly, Lucene uses a Java int to refer