Property changes on: . ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/trunk:r1291418,1291430,1291506,1291826,1293148,1296268 Property changes on: solr ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/trunk/solr:r1291418,1291430,1291506,1291826,1293148,1296268 Property changes on: solr/core ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/trunk/solr/core:r1291418,1291430,1291506,1291826,1293148,1296268 Index: solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java =================================================================== --- solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java (revision 1297697) +++ solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java (working copy) @@ -32,6 +32,7 @@ import org.apache.lucene.search.suggest.FileDictionary; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.Lookup.LookupResult; +import org.apache.lucene.util.CharsRef; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; @@ -153,11 +154,6 @@ build(core, searcher); } - public void add(String query, int numHits) { - LOG.info("add " + query + ", " + numHits); - lookup.add(query, new Integer(numHits)); - } - static SpellingResult EMPTY_RESULT = new SpellingResult(); @Override @@ -173,9 +169,12 @@ return EMPTY_RESULT; } SpellingResult res = new SpellingResult(); + CharsRef scratch = new CharsRef(); for (Token t : options.tokens) { - String term = new String(t.buffer(), 0, t.length()); - List suggestions = lookup.lookup(term, + scratch.chars = t.buffer(); + scratch.offset = 0; + scratch.length = t.length(); + List suggestions = lookup.lookup(scratch, options.onlyMorePopular, options.count); if (suggestions == null) { continue; @@ -184,7 +183,7 @@ Collections.sort(suggestions); } for 
(LookupResult lr : suggestions) { - res.add(t, lr.key, ((Number)lr.value).intValue()); + res.add(t, lr.key.toString(), (int)lr.value); } } return res; Property changes on: lucene ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/trunk/lucene:r1291418,1291430,1291506,1291826,1293148,1296268 Index: lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java (revision 1297697) +++ lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java (working copy) @@ -270,6 +270,37 @@ } /** + * + */ + public final BytesRef copyFrom(final BytesRef bytes) { + final int length = bytes.length; + final int offset = bytes.offset; + bytes.offset = 0; + bytes.grow(length); + int bufferIndex = offset >> BYTE_BLOCK_SHIFT; + byte[] buffer = buffers[bufferIndex]; + int pos = offset & BYTE_BLOCK_MASK; + int overflow = (pos + length) - BYTE_BLOCK_SIZE; + do { + if (overflow <= 0) { + System.arraycopy(buffer, pos, bytes.bytes, bytes.offset, bytes.length); + bytes.length = length; + bytes.offset = 0; + break; + } else { + final int bytesToCopy = length - overflow; + System.arraycopy(buffer, pos, bytes.bytes, bytes.offset, bytesToCopy); + pos = 0; + bytes.length -= bytesToCopy; + bytes.offset += bytesToCopy; + buffer = buffers[++bufferIndex]; + overflow = overflow - BYTE_BLOCK_SIZE; + } + } while (true); + return bytes; + } + + /** * Writes the pools content to the given {@link DataOutput} */ public final void writePool(final DataOutput out) throws IOException { Index: lucene/core/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/BytesRef.java (revision 1297697) +++ lucene/core/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -271,13 +271,7 @@ final byte[] bBytes = 
b.bytes; int bUpto = b.offset; - final int aStop; - if (a.length < b.length) { - aStop = aUpto + a.length; - } else { - aStop = aUpto + b.length; - } - + final int aStop = aUpto + Math.min(a.length, b.length); while(aUpto < aStop) { int aByte = aBytes[aUpto++] & 0xff; int bByte = bBytes[bUpto++] & 0xff; Index: lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java (revision 1297697) +++ lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java (working copy) @@ -18,6 +18,7 @@ */ import java.io.IOException; +import java.util.Comparator; /** * A simple iterator interface for {@link BytesRef} iteration @@ -40,6 +41,14 @@ */ public BytesRef next() throws IOException; + /** + * Return the {@link BytesRef} Comparator used to sort terms provided by the + * iterator. This may return null if there are no items or the iterator is not + * sorted. Callers may invoke this method many times, so it's best to cache a + * single instance & reuse it. + */ + public Comparator getComparator(); + public final static class EmptyBytesRefIterator implements BytesRefIterator { @Override @@ -47,6 +56,10 @@ return null; } + public Comparator getComparator() { + return null; + } + } } Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1297697) +++ lucene/contrib/CHANGES.txt (working copy) @@ -10,6 +10,13 @@ * LUCENE-3626: The internal implementation classes in PKIndexSplitter and MultiPassIndexSplitter were made private as they now work per segment. (Uwe Schindler) + + * LUCENE-3807: Cleaned up Suggest / Lookup API. Term weights (freqs) are now + 64bit signed integers instead of 32bit floats. Sorting of terms is now a + disk based merge sort instead of an in-memory sort. 
The Lookup API now + accepts and returns CharSequence instead of String which should be converted + into a String before used in a datastructure that relies on hashCode / equals. + (Simon Willnauer) Changes in Runtime Behavior Property changes on: lucene/contrib/spellchecker ___________________________________________________________________ Added: svn:mergeinfo Merged /lucene/dev/trunk/contrib/spellchecker:r932749,1141465 Merged /lucene/dev/branches/preflexfixes/lucene/contrib/spellchecker:r967125-979432 Merged /lucene/java/branches/lucene_3_0/contrib/spellchecker:r880793,896906,1098765 Merged /lucene/java/trunk/contrib/spellchecker:r924483-925561 Merged /lucene/java/branches/flex_1458/contrib/spellchecker:r924791,924850,930201 Merged /lucene/java/branches/lucene_2_4/contrib/spellchecker:r748824 Merged /lucene/dev/trunk/modules/suggest:r1291418,1291430,1291506,1291826,1293148,1296268 Merged /lucene/dev/branches/lucene_solr_3_1/lucene/contrib/spellchecker:r1081856,1083239,1085499,1085511,1085532,1085809,1101103 Merged /lucene/java/branches/lucene_2_9/contrib/spellchecker:r817269-818600,825998,829134,829881,831036,896850,909334,948516 Merged /lucene/dev/branches/lucene_solr_3_2/lucene/contrib/spellchecker:r1128223,1128247,1129418,1129472 Merged /lucene/dev/branches/lucene_solr_3_3/lucene/contrib/spellchecker:r1138390,1138979,1139775 Merged 
/lucene/dev/trunk/lucene/contrib/spellchecker:r931298,931337,931502,932129-932131,932163,932304,932369,932374,932398,932417,932541,932576,932587,932698,932731-932749,932752,932773,932795,932828,932856-932857,932862,932864,932878,932963,932998-932999,933541-933575,933598,933613,933679,933879,934339,934954,935014-935048,935065,935186-935513,935521-935522,935553-935962,936522,936544,936605,936657-936726,937039,937360,938582-938646,938989,939111,939611,939649,940433,940447,940451-940452,940666,940699,940730,940878-940892,940994,941270,941363,941780,942166,942235,942288,942292,942676,942719,943142,943493,943931,945057,945090,945130,945245,945343,945420,946139,946330,946338,946599,948011,948082,948429,949156,949288,949311,949318,949445,949976,949997,950008,950042,950458,950467,950613,950667,951126,951355,951397,951521,953628,955547,955613,955615,955796-955797,955809-955996,956097,956125,956173,956316,956715,957465,957481,957486,957520,957634,957707,960367,960371,960374,960719,962555,963372,963654,963720,963781,963873,963906,963909,963920,964019,964054,964430,964459,964720,964753,964832,964856,965103,965110,965222,965230,965299,965327,965330,965585,966354,966878,967080,979453,979809,980369,980428,980436,980501,980909,980911,980917,981265,981550,981598,981650,981661,981857,981936,982073,982084,982201,982323,982725,982824,983100,983212,983216,983313,983328,983495,983500,983530,983622,983632,983778,984187,984202,984232,984510,984968,985453,985455,985672,985875,986158,986173,986612,987122,988087,988206,988216,988259,988346,988478,988527,988543,988592,988613,988688,988710,988736,988739,989004,989010,989013,989030,989035,989315,989321,989334,989785,990160-990161,990180,990189,990281,990301,990451,990459,990766,990781,990854,991053,991191,991310,991497,992424,992469,992567,992571,992623,993106,993194,993199,993287,993408,994935,994976,994979,995247,995250,995376,995607,995772,996268,996357,996416,996511,996611,996623,996647-996653,996720,996942,996961,996978,997180,997230,998055,
998505,998684,999016,999037,999137,999139,999152,999175,999223,999378,999409,999483,999545,999842,999984,1000000,1000424,1000428,1000581,1000597,1000675,1001006,1001010,1001129,1001318,1001420,1001661,1001796,1002002,1002739,1003107,1003291,1003614,1003631,1003645,1003841-1003852,1003873,1003877,1003906,1003938,1003954,1003978,1003990,1004038,1004082,1004179,1004200,1004215,1004241,1004335,1005310,1005356,1005363,1006146,1006280,1006290,1006324,1021340,1021357,1021360,1021439,1021449,1021969-1021971,1022165,1022191,1022632,1022708-1022710,1022730-1022735,1022748-1022755,1022762-1022793,1022798-1022802,1022805,1022826,1022927,1022939,1022956,1022989,1022998,1023006,1023009,1023022,1023040,1023106,1023235-1023246,1023250,1023264-1023265,1023312,1023329-1023330,1023346-1023347,1023355,1023493,1023509-1023511,1023518,1023520,1023535-1023536,1023562,1023579-1023588,1023594-1023595,1023600-1023602,1023606,1023621,1023635,1023637,1023711,1023845,1023870,1024196,1024219,1024233,1024238,1024256,1024292,1024305,1024338,1024395,1024402,1024408,1024475-1024476,1024486,1025545,1025547,1025570,1025579,1025597,1025669,1025929,1026044,1026058,1026129-1026130,1026167,1026336,1026431,1026446,1026456,1026460,1026592,1026606,1026610,1026738,1026841,1026868,1026882,1027743,1027788,1027998,1028039,1028386,1029096,1029325,1029333,1029345,1030012,1030019,1030073,1030078,1030754,1031076,1031219,1031460,1031467,1031474,1031480,1031496,1031686,1031689,1032570,1032776,1034007,1034011,1034017,1034342,1034361,1034763,1034921,1034975,1034977,1035096,1035103,1035194,1035205,1035214,1035395,1035397,1035420,1035535,1035651,1035996,1036088,1036970,1037077,1037154,1037223,1037406,1037429,1038562,1038785,1039068,1039314,1039688,1039737,1039759,1039773,1039778,1039868,1039911,1039917,1039962-1039967,1040064,1040290,1040390,1040447,1040463,1040608,1040815,1040935,1040940,1040982,1041844,1041914,1041954,1041963,1042008,1042185,1042213,1042315,1042359,1042373,1043071,1043114,1043148,1043277,1043693,1043749
,1044066-1044069,1044098,1044257,1044315,1044328,1044505,1044561,1044635,1044660,1044854,1044867,1045010,1045212,1045266,1045310,1045315,1045322-1045323,1049094,1049107,1049117,1049131-1049132,1049144,1049187,1049413,1049502,1049693,1049918,1050063,1050084,1050687,1050697-1050725,1050728,1050733,1050737,1050813,1050827,1051041,1051058,1051305,1051715,1051872,1051891,1052898,1052926,1052974,1052980,1052991,1053236,1053405,1053509,1053896,1054015,1054164,1054172,1054405-1054406,1055285,1055408,1055435,1055595,1055877,1055892-1055906,1056014,1056428,1056702,1056821,1056955,1057010,1057149,1057221,1057340,1058284-1058288,1058324,1058393,1058939,1059426,1059719,1059866,1060023,1060324,1060437,1060608,1060779,1060807,1060846,1060872,1060997,1061050,1061065,1061078,1061350,1061424,1061499,1061622,1062070,1062123,1062153,1062319,1062451,1062454,1062509,1062604,1062633,1062876,1062879,1063323,1063333,1063478,1063493,1063498,1063501,1063513,1063702,1063762,1063837,1063842,1063868-1063869,1063877,1063897,1063908,1063920,1064330,1064379,1064735,1064781,1064844,1064942,1065059,1065095-1065096,1065102,1065261,1065265,1065272,1065286,1065302,1065304,1065327,1065337,1065410,1065416,1065465,1065474,1065572,1065601,1065621,1065719,1065853,1065855,1065891,1066008,1066691,1066764,1066819,1066850,1066889,1067119,1067131,1067160,1067163,1067165,1067299,1067427,1067551,1068387,1068979,1069316,1069341,1069656,1070183,1070185,1070206,1070240,1070321,1070691,1070760,1070879,1071074,1071417,1071435,1071569,1071594,1071654-1071655,1071658,1072127,1072250,1072567,1072591,1072607,1072683,1073336,1073806,1073850,1073957,1074009,1074017,1074226,1074326,1074357,1074726,1074750,1074952,1075023-1075024,1075069,1075072,1075079,1075089,1075103,1075184,1075190-1075191,1075196,1075287,1075443,1075505,1075850,1076032,1076237,1076279,1076311,1076315,1076319,1076325,1076433,1076884,1077908,1077916,1078058,1078117,1078127,1078398,1078448,1078451,1078463,1078471,1078500-1078501,1078512-1078515,1078529,1078540
,1078553,1078563,1078570,1078580,1078599,1078614,1078639,1078659,1078670,1078681,1078770,1079707,1079786,1079949,1080038,1080258,1080424,1080443,1080445,1080647,1080665,1080691,1080762,1080970,1080979,1080985,1080988,1081012,1081017,1081777-1081778,1081790-1081791,1081795,1082186,1082514-1082516,1082601,1082687,1082720,1082730,1082776,1082865,1082919,1082926,1083010,1083213,1083447,1083459,1083991,1084045,1084210,1084247,1084273-1084274,1084327,1084544,1084549,1084566,1084929,1085004,1085089,1085224,1085241,1085423,1085515,1085530,1085689,1086276,1086584,1086629,1086821,1087319,1087426,1087722,1088021,1089335,1089813,1089815,1089906,1089918,1091132-1091159,1091499,1092105,1092136,1092328,1092396,1092812,1092848,1094014,1094214,1095120,1095260,1095432,1095517,1095861,1095937,1096073,1096077,1096178-1096183,1096194,1096249,1096301,1096315,1096334,1096339,1097187,1097216,1097627,1098303,1098357,1098367,1098375,1098532,1098633,1098730,1098740,1098800,1098860,1099041,1099340,1099529,1099582,1099745,1099999,1100435,1100437,1101047,1101056,1101072,1101088,1101539,1101572,1101574,1102058,1102120,1102290,1102377,1102658,1102718,1102785,1102817,1102827,1102907,1103024,1103048,1103077,1103102,1103120,1103155,1103979,1103983,1104421,1104432,1104452,1104519,1124160,1124266,1124293,1124307,1124316,1124330,1124366,1125006,1125150,1125165,1125376,1125932,1125972,1126022,1126091,1126280,1126284,1126487,1126573,1126642,1126645,1126761,1127156,1127247,1127301,1127436,1128105,1128246,1128253,1128549,1128830,1128844,1128854,1128856,1129398,1129403,1129413,1129427,1129450,1129453,1129456,1129459,1129465,1129645,1129656,1129694,1130039,1130042,1130052,1130063,1130150,1130439,1130527,1130547,1130648,1130852,1130858-1130859,1130861,1130954-1131005,1131150,1131158,1131371,1131395,1131401,1132391,1132517,1132620,1132729,1132806,1132855,1132969,1133021,1133136,1133187,1133330,1133383,1133385,1133486,1133553,1133565,1133599,1133616,1133631,1133646,1133839,1133937,1134163,1134328,1134515,1134592
,1134685,1134763,1134781,1134895,1134995,1134998,1135009,1135011,1135154,1135204,1135300,1135369,1135509,1135525,1135527,1135537,1135650,1135658,1135670,1135764,1135801,1135818,1135822,1135825,1135954,1136027,1136080,1136357,1136467,1136568,1136605,1136644,1136789,1136792,1137054,1137060,1137064,1137162,1137211,1137330,1137357,1137477,1137480,1137529,1137533,1137665,1137733,1137882,1138030,1138069,1138319,1138405,1138446,1138450,1138821,1138890,1139054,1139173,1139178,1139188,1139199,1139285,1139513,1139789,1139995,1140004,1140119,1140243,1140252,1140498,1140574,1140720,1140827,1140836,1140851,1141167,1141170,1141295,1141400,1141593,1141629,1141999,1142179,1143122,1143189,1143238,1143420,1143558,1143766,1143783,1143878,1144294,1144415,1144513,1144792,1144841,1145158,1145163,1145182,1145198,1145233,1145239,1145255,1145263,1145292,1145442,1145479,1145502,1145518,1145594,1145657,1145701,1145730,1145885,1145925,1145957,1146638,1146984,1147023,1147578,1147586,1147671,1147691,1147807,1147881,1148596,1148602,1148681,1148728,1148763,1148968,1149028,1149050,1149108,1149256,1149740,1149746,1150091,1150362,1150384,1150389,1150394,1150404-1150405,1150415,1150478,1150480,1150486-1150489,1150671,1150840,1151081,1151146,1151720,1151782,1151984,1151997,1152024,1152055,1152089,1152288,1152456,1152525,1152530,1152653,1152669,1152892,1153399,1153408,1153844,1154005,1154926,1154936,1155278,1156053,1156590-1156591,1157437,1158342,1158697,1158730,1158819,1158832,1159291,1159418,1159627,1160832,1161488,1161505,1161964,1161966,1161972,1161974,1162135,1162156,1162158,1162166,1162375,1162394,1162401,1163370,1163568,1163576,1163589,1163625,1164287,1164311,1164620,1164956,1165902,1165995,1166106,1166457,1166530,1166541,1166582,1166656,1166702,1166715,1166728,1166784,1166850,1166866,1166954,1167008,1167199,1167467,1169612,1169816,1169820,1170157,1170203,1170586,1170616,1170699,1170716,1170725,1170908,1171556,1171570,1171597,1171691,1171704,1171739,1172227,1173139,1173423,1173430,1173720,1173778
,1173961,1174377-1174407,1175300,1175376,1175385,1175397,1175413,1175425,1175475,1175529,1175579,1175650,1175696,1175699,1175956,1175975,1176097,1176114,1176478,1176772,1176774,1177048-1177049,1177723,1177940,1178612,1178923,1179315,1179677,1179762,1179956,1180124,1181265,1181268,1181299,1181659,1181664,1181760,1182982,1183458,1183464,1183582,1183738,1183753,1183756,1184753-1184754,1184761,1184822,1184851,1184877,1185120,1187900,1188597,1188777,1188975,1189039,1189160,1189186,1189655,1189903,1189958,1190029,1190107,1190410,1195082,1195101,1195275,1196228,1197469,1197690,1197742,1197879,1198009,1198024,1198039,1198089,1198134,1198332,1198371,1198636,1198777-1198778,1198911,1199405,1199832,1199837,1200007,1200051,1200080,1200274,1200440,1200480,1200854,1201036,1201165,1201191,1201329,1201375,1201855,1202152,1202657,1202754,1202969,1203114,1203206,1203756,1203966,1203970,1204416,1204453,1205021,1205152,1205342,1205360,1205366,1205430,1205774,1205954,1206017,1206033,1206070,1206143,1206229,1206436-1206437,1206452,1206707,1206767,1206789,1206996,1207070,1207103,1207291,1207577,1207718,1208032,1208118,1208509,1208525,1210020,1210054,1210469,1210714,1211710,1211827,1211887,1212894,1213013,1213016,1213020,1213033,1213044,1213106,1213329,1213704,1213706,1213800,1213803,1213824,1213826,1213910,1214012,1214376,1214413,1214540,1215018,1215349,1215352,1220426,1220458,1220555,1220705,1220795,1221195,1221368-1221369,1222367-1222368,1225120,1225211,1225231,1225233,1225433,1225920,1226417,1226455,1226793,1226821,1226871,1227439,1228650,1228704,1228727,1228928,1229519,1229523,1229602,1229713,1231223,1231367,1231512,1231514,1231665,1231788,1231795,1232470,1232491,1232769,1232943,1233381,1233583,1233696,1233708,1234396,1234452,1234546,1234598,1234652,1234687,1234850,1234867,1235187,1235228,1235753,1236429,1236431,1237497,1237500,1237506,1237528,1237809,1238832,1238851,1239040,1239052-1239056,1239061,1239316,1239658,1240034-1240081,1240655,1240980,1241355,1241588,1241596,1241598,1241741
,1241878,1241986,1242497,1242557,1242740,1242890,1242903,1243278,1243656,1244379,1244458,1244536,1244552,1245710,1245715,1245947,1291020,1291097,1291184,1291418,1291430,1291506,1291541,1291703,1291728,1291826,1292282,1292864,1292881,1293148,1293728,1293821-1293823,1294856,1294920,1295067,1296237,1296268,1297001,1297048,1297162-1297168,1297518 Merged /lucene/java/branches/lucene_2_9_back_compat_tests/contrib/spellchecker:r818601-821336 Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java (working copy) @@ -18,7 +18,6 @@ */ import java.io.IOException; -import java.util.Iterator; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -27,6 +26,8 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; /** @@ -40,7 +41,8 @@ private IndexReader indexReader = null; private LuceneDictionary ld; - private Iterator it; + private BytesRefIterator it; + private BytesRef spare = new BytesRef(); @Override public void setUp() throws Exception { @@ -59,11 +61,11 @@ writer.addDocument(doc); doc = new Document(); - doc.add(new Field("contents", "Tom", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("contents", "Tom", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); doc = new Document(); - doc.add(new Field("contents", "Jerry", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("contents", "Jerry", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); doc = 
new Document(); @@ -84,13 +86,12 @@ public void testFieldNonExistent() throws IOException { try { - indexReader = IndexReader.open(store, true); + indexReader = IndexReader.open(store); ld = new LuceneDictionary(indexReader, "nonexistent_field"); it = ld.getWordsIterator(); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNull("More elements than expected", spare = it.next()); } finally { if (indexReader != null) { indexReader.close(); } } @@ -98,15 +99,13 @@ public void testFieldAaa() throws IOException { try { - indexReader = IndexReader.open(store, true); + indexReader = IndexReader.open(store); ld = new LuceneDictionary(indexReader, "aaa"); it = ld.getWordsIterator(); - - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("foo")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNotNull("First element doesn't exist.", spare = it.next()); + assertTrue("First element isn't correct", spare.utf8ToString().equals("foo")); + assertNull("More elements than expected", it.next()); } finally { if (indexReader != null) { indexReader.close(); } } @@ -114,24 +113,22 @@ public void testFieldContents_1() throws IOException { try { - indexReader = IndexReader.open(store, true); + indexReader = IndexReader.open(store); ld = new LuceneDictionary(indexReader, "contents"); it = ld.getWordsIterator(); - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("Jerry")); - assertTrue("Second element doesn't exist.", it.hasNext()); - assertTrue("Second element isn't correct", it.next().equals("Tom")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNotNull("First element doesn't exist.", 
spare = it.next()); + assertTrue("First element isn't correct", spare.utf8ToString().equals("Jerry")); + assertNotNull("Second element doesn't exist.", spare = it.next()); + assertTrue("Second element isn't correct", spare.utf8ToString().equals("Tom")); + assertNull("More elements than expected", it.next()); ld = new LuceneDictionary(indexReader, "contents"); it = ld.getWordsIterator(); int counter = 2; - while (it.hasNext()) { - it.next(); + while (it.next() != null) { counter--; } @@ -144,30 +141,15 @@ public void testFieldContents_2() throws IOException { try { - indexReader = IndexReader.open(store, true); + indexReader = IndexReader.open(store); ld = new LuceneDictionary(indexReader, "contents"); it = ld.getWordsIterator(); - // hasNext() should have no side effects - assertTrue("First element isn't were it should be.", it.hasNext()); - assertTrue("First element isn't were it should be.", it.hasNext()); - assertTrue("First element isn't were it should be.", it.hasNext()); - // just iterate through words - assertTrue("First element isn't correct", it.next().equals("Jerry")); - assertTrue("Second element isn't correct", it.next().equals("Tom")); - assertTrue("Nonexistent element is really null", it.next() == null); - - // hasNext() should still have no side effects ... - assertFalse("There should be any more elements", it.hasNext()); - assertFalse("There should be any more elements", it.hasNext()); - assertFalse("There should be any more elements", it.hasNext()); - - // .. 
and there are really no more words - assertTrue("Nonexistent element is really null", it.next() == null); - assertTrue("Nonexistent element is really null", it.next() == null); - assertTrue("Nonexistent element is really null", it.next() == null); + assertEquals("First element isn't correct", "Jerry", it.next().utf8ToString()); + assertEquals("Second element isn't correct", "Tom", it.next().utf8ToString()); + assertNull("Nonexistent element is really null", it.next()); } finally { if (indexReader != null) { indexReader.close(); } @@ -176,15 +158,14 @@ public void testFieldZzz() throws IOException { try { - indexReader = IndexReader.open(store, true); + indexReader = IndexReader.open(store); ld = new LuceneDictionary(indexReader, "zzz"); it = ld.getWordsIterator(); - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("bar")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNotNull("First element doesn't exist.", spare = it.next()); + assertEquals("First element isn't correct", "bar", spare.utf8ToString()); + assertNull("More elements than expected", it.next()); } finally { if (indexReader != null) { indexReader.close(); } @@ -194,7 +175,7 @@ public void testSpellchecker() throws IOException { Directory dir = newDirectory(); SpellChecker sc = new SpellChecker(dir); - indexReader = IndexReader.open(store, true); + indexReader = IndexReader.open(store); sc.indexDictionary(new LuceneDictionary(indexReader, "contents"), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false); String[] suggestions = sc.suggestSimilar("Tam", 1); assertEquals(1, suggestions.length); Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java =================================================================== --- 
lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java (working copy) @@ -17,59 +17,117 @@ * the License. */ +import java.util.Comparator; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; public class TestTermFreqIterator extends LuceneTestCase { public void testEmpty() throws Exception { TermFreqArrayIterator iterator = new TermFreqArrayIterator(new TermFreq[0]); - TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(iterator); - assertFalse(wrapper.hasNext()); + TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator()); + assertNull(wrapper.next()); wrapper = new UnsortedTermFreqIteratorWrapper(iterator); - assertFalse(wrapper.hasNext()); + assertNull(wrapper.next()); } public void testTerms() throws Exception { int num = atLeast(10000); - TreeMap sorted = new TreeMap(); + Comparator comparator = random.nextBoolean() ? 
BytesRef.getUTF8SortedAsUnicodeComparator() : BytesRef.getUTF8SortedAsUTF16Comparator(); + TreeMap sorted = new TreeMap(comparator); TermFreq[] unsorted = new TermFreq[num]; for (int i = 0; i < num; i++) { - String key; + BytesRef key; do { - key = _TestUtil.randomUnicodeString(random); + key = new BytesRef(_TestUtil.randomUnicodeString(random)); } while (sorted.containsKey(key)); - float value = random.nextFloat(); + long value = random.nextLong(); sorted.put(key, value); unsorted[i] = new TermFreq(key, value); } // test the sorted iterator wrapper - TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted)); - Iterator> expected = sorted.entrySet().iterator(); + TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator); + Iterator> expected = sorted.entrySet().iterator(); while (expected.hasNext()) { - Map.Entry entry = expected.next(); + Map.Entry entry = expected.next(); - assertTrue(wrapper.hasNext()); assertEquals(entry.getKey(), wrapper.next()); - assertEquals(entry.getValue().floatValue(), wrapper.freq(), 0F); + assertEquals(entry.getValue().longValue(), wrapper.weight()); } - assertFalse(wrapper.hasNext()); + assertNull(wrapper.next()); // test the unsorted iterator wrapper wrapper = new UnsortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted)); - TreeMap actual = new TreeMap(); - while (wrapper.hasNext()) { - String key = wrapper.next(); - float value = wrapper.freq(); - actual.put(key, value); + TreeMap actual = new TreeMap(); + BytesRef key; + while ((key = wrapper.next()) != null) { + long value = wrapper.weight(); + actual.put(BytesRef.deepCopyOf(key), value); } assertEquals(sorted, actual); } + + + public void testRaw() throws Exception { + int num = atLeast(10000); + + Comparator comparator = BytesRef.getUTF8SortedAsUnicodeComparator(); + BytesRefHash sorted = new BytesRefHash(); + TermFreq[] unsorted = new TermFreq[num]; + byte[] buffer = new 
byte[0]; + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + + for (int i = 0; i < num; i++) { + BytesRef spare; + long weight; + do { + spare = new BytesRef(_TestUtil.randomUnicodeString(random)); + if (spare.length + 8 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 8); + } + output.reset(buffer); + output.writeBytes(spare.bytes, spare.offset, spare.length); + weight = random.nextLong(); + output.writeLong(weight); + + } while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0); + unsorted[i] = new TermFreq(spare, weight); + } + + // test the sorted iterator wrapper + TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true); + int[] sort = sorted.sort(comparator); + int size = sorted.size(); + BytesRef spare = new BytesRef(); + for (int i = 0; i < size; i++) { + sorted.get(sort[i], spare); + spare.length -= 8; // sub the long value + assertEquals(spare, wrapper.next()); + spare.offset = spare.offset + spare.length; + spare.length = 8; + assertEquals(asLong(spare), wrapper.weight()); + } + assertNull(wrapper.next()); + } + + public static long asLong(BytesRef b) { + return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b, + b.offset + 4) & 0xFFFFFFFFL); + } + + private static int asIntInternal(BytesRef b, int pos) { + return ((b.bytes[pos++] & 0xFF) << 24) | ((b.bytes[pos++] & 0xFF) << 16) + | ((b.bytes[pos++] & 0xFF) << 8) | (b.bytes[pos] & 0xFF); + } } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java (working copy) @@ -21,65 +21,87 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; + +import 
org.apache.lucene.search.suggest.BytesRefList; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; public class TestBytesRefList extends LuceneTestCase { - + public void testAppend() throws IOException { BytesRefList list = new BytesRefList(); List stringList = new ArrayList(); - int entries = atLeast(500); - BytesRef spare = new BytesRef(); - for (int i = 0; i < entries; i++) { - String randomRealisticUnicodeString = _TestUtil - .randomRealisticUnicodeString(random); - spare.copyChars(randomRealisticUnicodeString); - list.append(spare); - stringList.add(randomRealisticUnicodeString); - } - for (int i = 0; i < entries; i++) { - assertNotNull(list.get(spare, i)); - assertEquals("entry " + i + " doesn't match", stringList.get(i), - spare.utf8ToString()); - } - - // check random - for (int i = 0; i < entries; i++) { - int e = random.nextInt(entries); - assertNotNull(list.get(spare, e)); - assertEquals("entry " + i + " doesn't match", stringList.get(e), - spare.utf8ToString()); - } - for (int i = 0; i < 2; i++) { - - BytesRefIterator iterator = list.iterator(); - for (String string : stringList) { - assertEquals(string, iterator.next().utf8ToString()); + for (int j = 0; j < 2; j++) { + if (j > 0 && random.nextBoolean()) { + list.clear(); + stringList.clear(); } + int entries = atLeast(500); + BytesRef spare = new BytesRef(); + for (int i = 0; i < entries; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random); + spare.copyChars(randomRealisticUnicodeString); + list.append(spare); + stringList.add(randomRealisticUnicodeString); + } + for (int i = 0; i < entries; i++) { + assertNotNull(list.get(spare, i)); + assertEquals("entry " + i + " doesn't match", stringList.get(i), + spare.utf8ToString()); + } + + // check random + for (int i = 0; i < entries; i++) { + int e = random.nextInt(entries); + 
assertNotNull(list.get(spare, e)); + assertEquals("entry " + i + " doesn't match", stringList.get(e), + spare.utf8ToString()); + } + for (int i = 0; i < 2; i++) { + + BytesRefIterator iterator = list.iterator(); + for (String string : stringList) { + assertEquals(string, iterator.next().utf8ToString()); + } + } } } - - public void testSort() { + + public void testSort() throws IOException { BytesRefList list = new BytesRefList(); List stringList = new ArrayList(); - int entries = atLeast(500); - BytesRef spare = new BytesRef(); - for (int i = 0; i < entries; i++) { - String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random); - spare.copyChars(randomRealisticUnicodeString); - list.append(spare); - stringList.add(randomRealisticUnicodeString); + + for (int j = 0; j < 2; j++) { + if (j > 0 && random.nextBoolean()) { + list.clear(); + stringList.clear(); + } + int entries = atLeast(500); + BytesRef spare = new BytesRef(); + for (int i = 0; i < entries; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random); + spare.copyChars(randomRealisticUnicodeString); + list.append(spare); + stringList.add(randomRealisticUnicodeString); + } + + Collections.sort(stringList); + BytesRefIterator iter = list.iterator(BytesRef + .getUTF8SortedAsUTF16Comparator()); + int i = 0; + while ((spare = iter.next()) != null) { + assertEquals("entry " + i + " doesn't match", stringList.get(i), + spare.utf8ToString()); + i++; + } + assertNull(iter.next()); + assertEquals(i, stringList.size()); } - Collections.sort(stringList); - int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); - for (int i = 0; i < entries; i++) { - assertNotNull(list.get(spare, sortedOrds[i])); - assertEquals("entry " + i + " doesn't match", stringList.get(i), - spare.utf8ToString()); - } } + } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java 
=================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy) @@ -98,7 +98,7 @@ while ((line = br.readLine()) != null) { int tab = line.indexOf('|'); assertTrue("No | separator?: " + line, tab >= 0); - float weight = Float.parseFloat(line.substring(tab + 1)); + int weight = Integer.parseInt(line.substring(tab + 1)); String key = line.substring(0, tab); input.add(new TermFreq(key, weight)); } @@ -192,7 +192,7 @@ final List input = new ArrayList(benchmarkInput.size()); for (TermFreq tf : benchmarkInput) { - input.add(tf.term.substring(0, Math.min(tf.term.length(), + input.add(tf.term.utf8ToString().substring(0, Math.min(tf.term.length, minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)))); } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/PersistenceTest.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/PersistenceTest.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/PersistenceTest.java (working copy) @@ -17,13 +17,16 @@ package org.apache.lucene.search.suggest; import java.io.File; +import java.util.List; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; import org.apache.lucene.search.suggest.fst.FSTLookup; import org.apache.lucene.search.suggest.jaspell.JaspellLookup; import org.apache.lucene.search.suggest.tst.TSTLookup; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; public class PersistenceTest extends LuceneTestCase { public final String[] keys = new String[] { @@ -62,7 +65,7 @@ 
Lookup lookup = lookupClass.newInstance(); TermFreq[] keys = new TermFreq[this.keys.length]; for (int i = 0; i < keys.length; i++) - keys[i] = new TermFreq(this.keys[i], (float) i); + keys[i] = new TermFreq(this.keys[i], i); lookup.build(new TermFreqArrayIterator(keys)); // Store the suggester. @@ -74,16 +77,18 @@ lookup.load(storeDir); // Assert validity. - float previous = Float.NEGATIVE_INFINITY; + long previous = Long.MIN_VALUE; for (TermFreq k : keys) { - Float val = (Float) lookup.get(k.term); - assertNotNull(k.term, val); + List list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1); + assertEquals(1, list.size()); + LookupResult lookupResult = list.get(0); + assertNotNull(k.term.utf8ToString(), lookupResult.key); if (supportsExactWeights) { - assertEquals(k.term, Float.valueOf(k.v), val); + assertEquals(k.term.utf8ToString(), k.v, lookupResult.value); } else { - assertTrue(val + ">=" + previous, val >= previous); - previous = val.floatValue(); + assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous); + previous = lookupResult.value; } } } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TermFreq.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TermFreq.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TermFreq.java (working copy) @@ -1,5 +1,7 @@ package org.apache.lucene.search.suggest; +import org.apache.lucene.util.BytesRef; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -18,10 +20,14 @@ */ public final class TermFreq { - public final String term; - public final float v; + public final BytesRef term; + public final long v; - public TermFreq(String term, float v) { + public TermFreq(String term, long v) { + this(new BytesRef(term), v); + } + + public TermFreq(BytesRef term, long v) { this.term = term; this.v = v; } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestHighFrequencyDictionary.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestHighFrequencyDictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/TestHighFrequencyDictionary.java (working copy) @@ -17,14 +17,13 @@ * the License. */ -import java.util.Iterator; - import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.HighFrequencyDictionary; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; public class TestHighFrequencyDictionary extends LuceneTestCase { @@ -35,8 +34,9 @@ writer.close(); IndexReader ir = IndexReader.open(dir); Dictionary dictionary = new HighFrequencyDictionary(ir, "bogus", 0.1f); - Iterator tf = dictionary.getWordsIterator(); - assertFalse(tf.hasNext()); + BytesRefIterator tf = dictionary.getWordsIterator(); + assertNull(tf.getComparator()); + assertNull(tf.next()); dir.close(); } } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java (revision 1297697) +++ 
lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java (working copy) @@ -17,9 +17,8 @@ * limitations under the License. */ -import java.util.Iterator; - import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; @@ -31,7 +30,7 @@ @Test public void testInMemorySorter() throws Exception { - check(new InMemorySorter()); + check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator())); } private void check(BytesRefSorter sorter) throws Exception { @@ -42,8 +41,8 @@ } // Create two iterators and check that they're aligned with each other. - Iterator i1 = sorter.iterator(); - Iterator i2 = sorter.iterator(); + BytesRefIterator i1 = sorter.iterator(); + BytesRefIterator i2 = sorter.iterator(); // Verify sorter contract. try { @@ -52,10 +51,12 @@ } catch (IllegalStateException e) { // Expected. } - - while (i1.hasNext() && i2.hasNext()) { - assertEquals(i1.next(), i2.next()); + BytesRef spare1; + BytesRef spare2; + while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) { + assertEquals(spare1, spare2); } - assertEquals(i1.hasNext(), i2.hasNext()); + assertNull(i1.next()); + assertNull(i2.next()); } } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FSTLookupTest.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.fst.FSTLookup; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.search.suggest.LookupBenchmarkTest; import 
org.apache.lucene.search.suggest.TermFreq; @@ -37,7 +38,7 @@ */ @Deprecated public class FSTLookupTest extends LuceneTestCase { - public static TermFreq tf(String t, float v) { + public static TermFreq tf(String t, long v) { return new TermFreq(t, v); } @@ -52,73 +53,73 @@ private TermFreq[] evalKeys() { final TermFreq[] keys = new TermFreq[] { - tf("one", 0.5f), - tf("oneness", 1), - tf("onerous", 1), - tf("onesimus", 1), - tf("two", 1), - tf("twofold", 1), - tf("twonk", 1), - tf("thrive", 1), - tf("through", 1), - tf("threat", 1), - tf("three", 1), - tf("foundation", 1), - tf("fourblah", 1), - tf("fourteen", 1), - tf("four", 0.5f), - tf("fourier", 0.5f), - tf("fourty", 0.5f), + tf("one", 1), + tf("oneness", 2), + tf("onerous", 2), + tf("onesimus", 2), + tf("two", 2), + tf("twofold", 2), + tf("twonk", 2), + tf("thrive", 2), + tf("through", 2), + tf("threat", 2), + tf("three", 2), + tf("foundation", 2), + tf("fourblah", 2), + tf("fourteen", 2), + tf("four", 1), + tf("fourier", 1), + tf("fourty", 1), tf("xo", 1), }; return keys; } public void testExactMatchHighPriority() throws Exception { - assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0"); + assertMatchEquals(lookup.lookup("two", true, 1), "two/1"); } public void testExactMatchLowPriority() throws Exception { assertMatchEquals(lookup.lookup("one", true, 2), - "one/0.0", - "oneness/1.0"); + "one/0", + "oneness/1"); } public void testRequestedCount() throws Exception { // 'one' is promoted after collecting two higher ranking results. assertMatchEquals(lookup.lookup("one", true, 2), - "one/0.0", - "oneness/1.0"); + "one/0", + "oneness/1"); // 'one' is at the top after collecting all alphabetical results. assertMatchEquals(lookup.lookup("one", false, 2), - "one/0.0", - "oneness/1.0"); + "one/0", + "oneness/1"); // 'four' is collected in a bucket and then again as an exact match. 
assertMatchEquals(lookup.lookup("four", true, 2), - "four/0.0", - "fourblah/1.0"); + "four/0", + "fourblah/1"); // Check reordering of exact matches. assertMatchEquals(lookup.lookup("four", true, 4), - "four/0.0", - "fourblah/1.0", - "fourteen/1.0", - "fourier/0.0"); + "four/0", + "fourblah/1", + "fourteen/1", + "fourier/0"); lookup = new FSTLookup(10, false); lookup.build(new TermFreqArrayIterator(evalKeys())); // 'one' is not promoted after collecting two higher ranking results. assertMatchEquals(lookup.lookup("one", true, 2), - "oneness/1.0", - "onerous/1.0"); + "oneness/1", + "onerous/1"); // 'one' is at the top after collecting all alphabetical results. assertMatchEquals(lookup.lookup("one", false, 2), - "one/0.0", - "oneness/1.0"); + "one/0", + "oneness/1"); } public void testMiss() throws Exception { @@ -131,10 +132,10 @@ public void testFullMatchList() throws Exception { assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE), - "oneness/1.0", - "onerous/1.0", - "onesimus/1.0", - "one/0.0"); + "oneness/1", + "onerous/1", + "onesimus/1", + "one/0"); } public void testMultilingualInput() throws Exception { @@ -144,8 +145,8 @@ lookup.build(new TermFreqArrayIterator(input)); for (TermFreq tf : input) { - assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null); - assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key); + assertTrue("Not found: " + tf.term, lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)) != null); + assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString()); } } @@ -166,11 +167,11 @@ lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()]))); for (TermFreq tf : freqs) { - final String term = tf.term; + final CharSequence term = _TestUtil.bytesToCharSequence(tf.term, random); for (int i = 1; i < term.length(); i++) { - String prefix = term.substring(0, i); + CharSequence prefix = term.subSequence(0, i); for 
(LookupResult lr : lookup.lookup(prefix, true, 10)) { - assertTrue(lr.key.startsWith(prefix)); + assertTrue(lr.key.toString().startsWith(prefix.toString())); } } } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java (working copy) @@ -28,7 +28,7 @@ * Unit tests for {@link FSTCompletion}. */ public class FSTCompletionTest extends LuceneTestCase { - public static TermFreq tf(String t, float v) { + public static TermFreq tf(String t, int v) { return new TermFreq(t, v); } @@ -40,7 +40,7 @@ FSTCompletionBuilder builder = new FSTCompletionBuilder(); for (TermFreq tf : evalKeys()) { - builder.add(new BytesRef(tf.term), (int) tf.v); + builder.add(tf.term, (int) tf.v); } completion = builder.build(); completionAlphabetical = new FSTCompletion(completion.getFST(), false, true); @@ -62,28 +62,28 @@ tf("foundation", 1), tf("fourblah", 1), tf("fourteen", 1), - tf("four", 0f), - tf("fourier", 0f), - tf("fourty", 0f), + tf("four", 0), + tf("fourier", 0), + tf("fourty", 0), tf("xo", 1), }; return keys; } public void testExactMatchHighPriority() throws Exception { - assertMatchEquals(completion.lookup("two", 1), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("two", random), 1), "two/1.0"); } public void testExactMatchLowPriority() throws Exception { - assertMatchEquals(completion.lookup("one", 2), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("one", random), 2), "one/0.0", "oneness/1.0"); } public void testExactMatchReordering() throws Exception { // Check reordering of exact matches. 
- assertMatchEquals(completion.lookup("four", 4), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("four", random), 4), "four/0.0", "fourblah/1.0", "fourteen/1.0", @@ -92,49 +92,49 @@ public void testRequestedCount() throws Exception { // 'one' is promoted after collecting two higher ranking results. - assertMatchEquals(completion.lookup("one", 2), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("one", random), 2), "one/0.0", "oneness/1.0"); // 'four' is collected in a bucket and then again as an exact match. - assertMatchEquals(completion.lookup("four", 2), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("four", random), 2), "four/0.0", "fourblah/1.0"); // Check reordering of exact matches. - assertMatchEquals(completion.lookup("four", 4), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("four", random), 4), "four/0.0", "fourblah/1.0", "fourteen/1.0", "fourier/0.0"); // 'one' is at the top after collecting all alphabetical results. - assertMatchEquals(completionAlphabetical.lookup("one", 2), + assertMatchEquals(completionAlphabetical.lookup(_TestUtil.stringToCharSequence("one", random), 2), "one/0.0", "oneness/1.0"); // 'one' is not promoted after collecting two higher ranking results. FSTCompletion noPromotion = new FSTCompletion(completion.getFST(), true, false); - assertMatchEquals(noPromotion.lookup("one", 2), + assertMatchEquals(noPromotion.lookup(_TestUtil.stringToCharSequence("one", random), 2), "oneness/1.0", "onerous/1.0"); // 'one' is at the top after collecting all alphabetical results. 
- assertMatchEquals(completionAlphabetical.lookup("one", 2), + assertMatchEquals(completionAlphabetical.lookup(_TestUtil.stringToCharSequence("one", random), 2), "one/0.0", "oneness/1.0"); } public void testMiss() throws Exception { - assertMatchEquals(completion.lookup("xyz", 1)); + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("xyz", random), 1)); } public void testAlphabeticWithWeights() throws Exception { - assertEquals(0, completionAlphabetical.lookup("xyz", 1).size()); + assertEquals(0, completionAlphabetical.lookup(_TestUtil.stringToCharSequence("xyz", random), 1).size()); } public void testFullMatchList() throws Exception { - assertMatchEquals(completion.lookup("one", Integer.MAX_VALUE), + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("one", random), Integer.MAX_VALUE), "oneness/1.0", "onerous/1.0", "onesimus/1.0", @@ -148,7 +148,7 @@ builder.add(new BytesRef(key), 0); FSTCompletion lookup = builder.build(); - List result = lookup.lookup(key, 1); + List result = lookup.lookup(_TestUtil.stringToCharSequence(key, random), 1); assertEquals(1, result.size()); } @@ -158,16 +158,16 @@ Random r = random; List keys = new ArrayList(); for (int i = 0; i < 5000; i++) { - keys.add(new TermFreq(_TestUtil.randomSimpleString(r), -1.0f)); + keys.add(new TermFreq(_TestUtil.randomSimpleString(r), -1)); } lookup.build(new TermFreqArrayIterator(keys)); // All the weights were constant, so all returned buckets must be constant, whatever they // are. 
- Float previous = null; + Long previous = null; for (TermFreq tf : keys) { - Float current = lookup.get(tf.term); + Long current = ((Number)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))).longValue(); if (previous != null) { assertEquals(previous, current); } @@ -180,28 +180,27 @@ FSTCompletionLookup lookup = new FSTCompletionLookup(); lookup.build(new TermFreqArrayIterator(input)); - for (TermFreq tf : input) { - assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null); - assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key); + assertNotNull("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))); + assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString()); } - List result = lookup.lookup("wit", true, 5); + List result = lookup.lookup(_TestUtil.stringToCharSequence("wit", random), true, 5); assertEquals(5, result.size()); - assertTrue(result.get(0).key.equals("wit")); // exact match. - assertTrue(result.get(1).key.equals("with")); // highest count. + assertTrue(result.get(0).key.toString().equals("wit")); // exact match. + assertTrue(result.get(1).key.toString().equals("with")); // highest count. 
} public void testEmptyInput() throws Exception { completion = new FSTCompletionBuilder().build(); - assertMatchEquals(completion.lookup("", 10)); + assertMatchEquals(completion.lookup(_TestUtil.stringToCharSequence("", random), 10)); } public void testRandom() throws Exception { List freqs = new ArrayList(); Random rnd = random; for (int i = 0; i < 2500 + rnd.nextInt(2500); i++) { - float weight = rnd.nextFloat() * 100; + int weight = random.nextInt(100); freqs.add(new TermFreq("" + rnd.nextLong(), weight)); } @@ -209,11 +208,11 @@ lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()]))); for (TermFreq tf : freqs) { - final String term = tf.term; + final String term = tf.term.utf8ToString(); for (int i = 1; i < term.length(); i++) { String prefix = term.substring(0, i); - for (LookupResult lr : lookup.lookup(prefix, true, 10)) { - assertTrue(lr.key.startsWith(prefix)); + for (LookupResult lr : lookup.lookup(_TestUtil.stringToCharSequence(prefix, random), true, 10)) { + assertTrue(lr.key.toString().startsWith(prefix)); } } } Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java (revision 1297711) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java (working copy) @@ -1,140 +0,0 @@ -package org.apache.lucene.search.suggest.fst; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.*; - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; -import org.junit.Ignore; -import org.junit.Test; - -public class FloatMagicTest extends LuceneTestCase { - public void testFloatMagic() { - ArrayList floats = new ArrayList(Arrays.asList( - Float.intBitsToFloat(0x7f800001), // NaN (invalid combination). - Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination). - Float.intBitsToFloat(0xff800001), // NaN (invalid combination). - Float.intBitsToFloat(0xffffffff), // NaN (invalid combination). - Float.POSITIVE_INFINITY, - Float.MAX_VALUE, - 100f, - 0f, - 0.1f, - Float.MIN_VALUE, - Float.NaN, - -0.0f, - -Float.MIN_VALUE, - -0.1f, - -1f, - -10f, - Float.NEGATIVE_INFINITY)); - - // Sort them using juc. - Collections.sort(floats); - - // Convert to sortable int4 representation (as long to have an unsigned sort). - long [] int4 = new long [floats.size()]; - for (int i = 0; i < floats.size(); i++) { - int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL; - - /* - System.out.println( - String.format("raw %8s sortable %8s %8s numutils %8s %s", - Integer.toHexString(Float.floatToRawIntBits(floats.get(i))), - Integer.toHexString(FloatMagic.toSortable(floats.get(i))), - Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))), - Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))), - floats.get(i))); - */ - } - - // Sort and compare. Should be identical order. 
- Arrays.sort(int4); - ArrayList backFromFixed = new ArrayList(); - for (int i = 0; i < int4.length; i++) { - backFromFixed.add(FloatMagic.fromSortable((int) int4[i])); - } - - /* - for (int i = 0; i < int4.length; i++) { - System.out.println( - floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i])); - } - */ - - assertEquals(floats, backFromFixed); - } - - @Ignore("Once checked, valid forever?") @Test - public void testRoundTripFullRange() { - int i = 0; - do { - float f = Float.intBitsToFloat(i); - float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f)); - - if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) { - throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2); - } - - if ((i & 0xffffff) == 0) { - System.out.println(Integer.toHexString(i)); - } - - i++; - } while (i != 0); - } - - @Ignore("Once checked, valid forever?") @Test - public void testIncreasingFullRange() { - // -infinity ... -0.0 - for (int i = 0xff800000; i != 0x80000000; i--) { - checkSmaller(i, i - 1); - } - - // -0.0 +0.0 - checkSmaller(0x80000000, 0); - - // +0.0 ... +infinity - for (int i = 0; i != 0x7f800000; i++) { - checkSmaller(i, i + 1); - } - - // All other are NaNs and should be after positive infinity. 
- final long infinity = toSortableL(Float.POSITIVE_INFINITY); - for (int i = 0x7f800001; i != 0x7fffffff; i++) { - assertTrue(infinity < toSortableL(Float.intBitsToFloat(i))); - } - for (int i = 0xff800001; i != 0xffffffff; i++) { - assertTrue(infinity < toSortableL(Float.intBitsToFloat(i))); - } - } - - private long toSortableL(float f) { - return FloatMagic.toSortable(f) & 0xffffffffL; - } - - private void checkSmaller(int i1, int i2) { - float f1 = Float.intBitsToFloat(i1); - float f2 = Float.intBitsToFloat(i2); - if (f1 > f2) { - throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2); - } - assertTrue(toSortableL(f1) < toSortableL(f2)); - } -} Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/TestSort.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/TestSort.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/TestSort.java (working copy) @@ -20,6 +20,7 @@ import java.io.*; import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import org.apache.lucene.search.suggest.fst.Sort.BufferSize; import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; @@ -61,7 +62,7 @@ @Test public void testIntermediateMerges() throws Exception { // Sort 20 mb worth of data with 1mb buffer, binary merging. - SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2), + SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2), generateRandom(Sort.MB * 20)); assertTrue(info.mergeRounds > 10); } @@ -69,7 +70,7 @@ @Test public void testSmallRandom() throws Exception { // Sort 20 mb worth of data with 1mb buffer. 
- SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), + SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), generateRandom(Sort.MB * 20)); assertEquals(1, sortInfo.mergeRounds); } @@ -77,7 +78,7 @@ @Test @Nightly public void testLargerRandom() throws Exception { // Sort 100MB worth of data with 15mb buffer. - checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), + checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), generateRandom(Sort.MB * 100)); } @@ -92,14 +93,25 @@ byte [][] bytes = data.toArray(new byte[data.size()][]); return bytes; } - + + static final Comparator unsignedByteOrderComparator = new Comparator() { + public int compare(byte[] left, byte[] right) { + final int max = Math.min(left.length, right.length); + for (int i = 0, j = 0; i < max; i++, j++) { + int diff = (left[i] & 0xff) - (right[j] & 0xff); + if (diff != 0) + return diff; + } + return left.length - right.length; + } + }; /** * Check sorting data on an instance of {@link Sort}. 
*/ private SortInfo checkSort(Sort sort, byte[][] data) throws IOException { File unsorted = writeAll("unsorted", data); - Arrays.sort(data, Sort.unsignedByteOrderComparator); + Arrays.sort(data, unsignedByteOrderComparator); File golden = writeAll("golden", data); File sorted = new File(tempDir, "sorted"); Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java =================================================================== --- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java (revision 1297697) +++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java (working copy) @@ -45,33 +45,33 @@ suggester.build(new TermFreqArrayIterator(keys)); // top N of 2, but only foo is available - List results = suggester.lookup("f", false, 2); + List results = suggester.lookup(_TestUtil.stringToCharSequence("f", random), false, 2); assertEquals(1, results.size()); - assertEquals("foo", results.get(0).key); + assertEquals("foo", results.get(0).key.toString()); assertEquals(50, results.get(0).value, 0.01F); // top N of 1 for 'bar': we return this even though barbar is higher - results = suggester.lookup("bar", false, 1); + results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random), false, 1); assertEquals(1, results.size()); - assertEquals("bar", results.get(0).key); + assertEquals("bar", results.get(0).key.toString()); assertEquals(10, results.get(0).value, 0.01F); // top N Of 2 for 'b' - results = suggester.lookup("b", false, 2); + results = suggester.lookup(_TestUtil.stringToCharSequence("b", random), false, 2); assertEquals(2, results.size()); - assertEquals("barbar", results.get(0).key); + assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); - assertEquals("bar", results.get(1).key); + assertEquals("bar", results.get(1).key.toString()); assertEquals(10, results.get(1).value, 
0.01F); // top N of 3 for 'ba' - results = suggester.lookup("ba", false, 3); + results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random), false, 3); assertEquals(3, results.size()); - assertEquals("barbar", results.get(0).key); + assertEquals("barbar", results.get(0).key.toString()); assertEquals(12, results.get(0).value, 0.01F); - assertEquals("bar", results.get(1).key); + assertEquals("bar", results.get(1).key.toString()); assertEquals(10, results.get(1).value, 0.01F); - assertEquals("barbara", results.get(2).key); + assertEquals("barbara", results.get(2).key.toString()); assertEquals(6, results.get(2).value, 0.01F); } @@ -100,7 +100,7 @@ // we can probably do Integer.MAX_VALUE here, but why worry. int weight = random.nextInt(1<<24); slowCompletor.put(s, (long)weight); - keys[i] = new TermFreq(s, (float) weight); + keys[i] = new TermFreq(s, weight); } WFSTCompletionLookup suggester = new WFSTCompletionLookup(false); @@ -109,7 +109,7 @@ for (String prefix : allPrefixes) { final int topN = _TestUtil.nextInt(random, 1, 10); - List r = suggester.lookup(prefix, false, topN); + List r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random), false, topN); // 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion final List matches = new ArrayList(); @@ -117,7 +117,7 @@ // TODO: could be faster... 
but its slowCompletor for a reason for (Map.Entry e : slowCompletor.entrySet()) { if (e.getKey().startsWith(prefix)) { - matches.add(new LookupResult(e.getKey(), (float)e.getValue().longValue())); + matches.add(new LookupResult(e.getKey(), e.getValue().longValue())); } } @@ -126,7 +126,7 @@ public int compare(LookupResult left, LookupResult right) { int cmp = Float.compare(right.value, left.value); if (cmp == 0) { - return left.key.compareTo(right.key); + return left.compareTo(right); } else { return cmp; } @@ -140,7 +140,7 @@ for(int hit=0;hit i; private TermFreq current; + private final BytesRef spare = new BytesRef(); public TermFreqArrayIterator(Iterator i) { this.i = i; @@ -41,17 +45,22 @@ this(i.iterator()); } - public float freq() { + public long weight() { return current.v; } - - public boolean hasNext() { - return i.hasNext(); + + @Override + public BytesRef next() throws IOException { + if (i.hasNext()) { + current = i.next(); + spare.copyBytes(current.term); + return spare; + } + return null; } - - public String next() { - return (current = i.next()).term; + + @Override + public Comparator getComparator() { + return null; } - - public void remove() { throw new UnsupportedOperationException(); } } \ No newline at end of file Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TermFreqIterator.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TermFreqIterator.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TermFreqIterator.java (working copy) @@ -17,34 +17,34 @@ * limitations under the License. 
*/ -import java.util.Iterator; +import java.io.IOException; +import java.util.Comparator; -public interface TermFreqIterator extends Iterator { +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; - public float freq(); +public interface TermFreqIterator extends BytesRefIterator { + + public long weight(); public static class TermFreqIteratorWrapper implements TermFreqIterator { - private Iterator wrapped; + private BytesRefIterator wrapped; - public TermFreqIteratorWrapper(Iterator wrapped) { + public TermFreqIteratorWrapper(BytesRefIterator wrapped) { this.wrapped = wrapped; } - public float freq() { - return 1.0f; + public long weight() { + return 1; } - public boolean hasNext() { - return wrapped.hasNext(); + public BytesRef next() throws IOException { + return wrapped.next(); } - public String next() { - return wrapped.next().toString(); + @Override + public Comparator getComparator() { + return wrapped.getComparator(); } - - public void remove() { - throw new UnsupportedOperationException(); - } - } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java (working copy) @@ -18,14 +18,13 @@ */ import org.apache.lucene.index.IndexReader; - -import java.util.Iterator; - -import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.util.StringHelper; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import java.io.*; +import java.util.Comparator; /** * Lucene Dictionary: terms taken from the given field @@ -43,68 +42,57 @@ public LuceneDictionary(IndexReader reader, String field) { this.reader 
= reader; - this.field = StringHelper.intern(field); + this.field = field; } - public final Iterator getWordsIterator() { - return new LuceneIterator(); + public final BytesRefIterator getWordsIterator() throws IOException { + return new TermIterator(); } + + final class TermIterator implements TermFreqIterator { + private final BytesRef spare = new BytesRef(); + private final TermEnum termsEnum; + private long freq; + private final Comparator comp; - final class LuceneIterator implements Iterator { - private TermEnum termEnum; - private Term actualTerm; - private boolean hasNextCalled; - - LuceneIterator() { - try { - termEnum = reader.terms(new Term(field)); - } catch (IOException e) { - throw new RuntimeException(e); + TermIterator() throws IOException { + termsEnum = reader.terms(new Term(field, "")); + Term term = termsEnum.term(); + if (term == null || term.field() != field) { + comp = null; + } else { + comp = BytesRef.getUTF8SortedAsUnicodeComparator(); } } - public String next() { - if (!hasNextCalled) { - hasNext(); - } - hasNextCalled = false; - - try { - termEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - return (actualTerm != null) ? 
actualTerm.text() : null; + public long weight() { + return freq; } - public boolean hasNext() { - if (hasNextCalled) { - return actualTerm != null; - } - hasNextCalled = true; + @Override + public BytesRef next() throws IOException { + if (termsEnum != null) { + Term actualTerm; + do { + actualTerm = termsEnum.term(); + if (actualTerm == null || actualTerm.field() != field) { + return null; + } + freq = termsEnum.docFreq(); + spare.copyChars(actualTerm.text()); + termsEnum.next(); + return spare; + } while(termsEnum.next()); - actualTerm = termEnum.term(); - - // if there are no words return false - if (actualTerm == null) { - return false; } - - String currentField = actualTerm.field(); - - // if the next word doesn't have the same field return false - if (currentField != field) { - actualTerm = null; - return false; - } - - return true; + return null; } - public void remove() { - throw new UnsupportedOperationException(); + @Override + public Comparator getComparator() { + return comp; } } + } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java (working copy) @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; -import java.util.Iterator; import java.util.List; import org.apache.lucene.analysis.WhitespaceAnalyzer; @@ -40,6 +39,8 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.Version; @@ -580,10 +581,11 @@ boolean isEmpty = readers.isEmpty(); try { - Iterator 
iter = dict.getWordsIterator(); + BytesRefIterator iter = dict.getWordsIterator(); - terms: while (iter.hasNext()) { - String word = iter.next(); + BytesRef spare; + terms: while ((spare = iter.next()) != null) { + String word = spare.utf8ToString(); int len = word.length(); if (len < 3) { Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java (working copy) @@ -16,7 +16,8 @@ * limitations under the License. */ -import java.util.Iterator; +import java.io.IOException; +import org.apache.lucene.util.BytesRefIterator; /** * A simple interface representing a Dictionary. A Dictionary @@ -31,5 +32,5 @@ * Return all words present in the dictionary * @return Iterator */ - Iterator getWordsIterator(); + BytesRefIterator getWordsIterator() throws IOException; } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SortedIterator.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SortedIterator.java (revision 1297711) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SortedIterator.java (working copy) @@ -1,28 +0,0 @@ -package org.apache.lucene.search.spell; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Iterator; - -/** - * Marker interface to signal that elements coming from {@link Iterator} - * come in ascending lexicographic order. - */ -public interface SortedIterator { - -} Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java (working copy) @@ -18,22 +18,19 @@ package org.apache.lucene.search.spell; import java.io.IOException; -import java.util.Iterator; +import java.util.Comparator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; -import org.apache.lucene.search.spell.Dictionary; -import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.BytesRef; /** * HighFrequencyDictionary: terms taken from the given field * of a Lucene index, which appear in a number of documents * above a given threshold. * - * When using IndexReader.terms(Term) the code must not call next() on TermEnum - * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6 - * * Threshold is a value in [0..1] representing the minimum * number of documents (of the total) where a term should appear. 
* @@ -46,98 +43,63 @@ public HighFrequencyDictionary(IndexReader reader, String field, float thresh) { this.reader = reader; - this.field = StringHelper.intern(field); + this.field = field; this.thresh = thresh; } - public final Iterator getWordsIterator() { + public final BytesRefIterator getWordsIterator() throws IOException { return new HighFrequencyIterator(); } final class HighFrequencyIterator implements TermFreqIterator { - private TermEnum termEnum; - private Term actualTerm; - private int actualFreq; - private boolean hasNextCalled; + private final BytesRef spare = new BytesRef(); + private final TermEnum termsEnum; private int minNumDocs; + private long freq; + private final Comparator comp; - HighFrequencyIterator() { - try { - termEnum = reader.terms(new Term(field, "")); - minNumDocs = (int)(thresh * (float)reader.numDocs()); - } catch (IOException e) { - throw new RuntimeException(e); + HighFrequencyIterator() throws IOException { + termsEnum = reader.terms(new Term(field, "")); + minNumDocs = (int)(thresh * (float)reader.numDocs()); + Term term = termsEnum.term(); + if (term == null || term.field() != field) { + comp = null; + } else { + comp = BytesRef.getUTF8SortedAsUnicodeComparator(); } } - private boolean isFrequent(Term term) { - try { - return reader.docFreq(term) >= minNumDocs; - } catch (IOException e) { - throw new RuntimeException(e); - } + private boolean isFrequent(int freq) { + return freq >= minNumDocs; } - - public String next() { - if (!hasNextCalled) { - hasNext(); - } - hasNextCalled = false; - - try { - termEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - return (actualTerm != null) ? 
actualTerm.text() : null; + + public long weight() { + return freq; } - public float freq() { - return actualFreq; - } - - - public boolean hasNext() { - if (hasNextCalled) { - return actualTerm != null; + @Override + public BytesRef next() throws IOException { + if (termsEnum != null) { + Term actualTerm; + do { + actualTerm = termsEnum.term(); + if (actualTerm == null || actualTerm.field() != field) { + return null; + } + if (isFrequent(termsEnum.docFreq())) { + freq = termsEnum.docFreq(); + spare.copyChars(actualTerm.text()); + termsEnum.next(); + return spare; + } + } while(termsEnum.next()); } - hasNextCalled = true; - - do { - actualTerm = termEnum.term(); - actualFreq = termEnum.docFreq(); - - // if there are no words return false - if (actualTerm == null) { - return false; - } - - String currentField = actualTerm.field(); - - // if the next word doesn't have the same field return false - if (currentField != field) { // intern'd comparison - actualTerm = null; - return false; - } - - // got a valid term, does it pass the threshold? 
- if (isFrequent(actualTerm)) { - return true; - } - - // term not up to threshold - try { - termEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - } while (true); + return null; } - public void remove() { - throw new UnsupportedOperationException(); + @Override + public Comparator getComparator() { + return comp; } } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java (working copy) @@ -18,10 +18,14 @@ */ -import java.util.Iterator; +import java.util.Comparator; import java.io.*; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.IOUtils; + /** * Dictionary represented by a text file. 
* @@ -33,8 +37,6 @@ public class PlainTextDictionary implements Dictionary { private BufferedReader in; - private String line; - private boolean hasNextCalled; public PlainTextDictionary(File file) throws FileNotFoundException { in = new BufferedReader(new FileReader(file)); @@ -51,31 +53,42 @@ in = new BufferedReader(reader); } - public Iterator getWordsIterator() { - return new fileIterator(); + public BytesRefIterator getWordsIterator() throws IOException { + return new FileIterator(); } - final class fileIterator implements Iterator { - public String next() { - if (!hasNextCalled) { - hasNext(); + final class FileIterator implements BytesRefIterator { + private boolean done = false; + private final BytesRef spare = new BytesRef(); + @Override + public BytesRef next() throws IOException { + if (done) { + return null; } - hasNextCalled = false; - return line; - } - - public boolean hasNext() { - hasNextCalled = true; + boolean success = false; + BytesRef result; try { - line = in.readLine(); - } catch (IOException ex) { - throw new RuntimeException(ex); + String line; + if ((line = in.readLine()) != null) { + spare.copyChars(line); + result = spare; + } else { + done = true; + IOUtils.close(in); + result = null; + } + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(in); + } } - return (line != null) ? 
true : false; + return result; } - - public void remove() { - throw new UnsupportedOperationException(); + + @Override + public Comparator getComparator() { + return null; } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/BytesRefList.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/BytesRefList.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/BytesRefList.java (working copy) @@ -18,75 +18,113 @@ */ import java.io.IOException; +import java.util.Arrays; import java.util.Comparator; +import java.util.concurrent.atomic.AtomicLong; + import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.SorterTemplate; -final class BytesRefList { - +/** + * A simple append only random-access {@link BytesRef} array that stores full + * copies of the appended bytes in a {@link ByteBlockPool}. + * + * + * Note: This class is not Thread-Safe! 
+ * + * @lucene.internal + * @lucene.experimental + */ +public final class BytesRefList { + // TODO rename to BytesRefArray private final ByteBlockPool pool; private int[] offsets = new int[1]; - private int currentElement = 0; + private int lastElement = 0; private int currentOffset = 0; - + private final AtomicLong bytesUsed = new AtomicLong(); + + /** + * Creates a new {@link BytesRefList} + */ public BytesRefList() { - this(new ByteBlockPool(new ByteBlockPool.DirectAllocator())); - } - - public BytesRefList(ByteBlockPool pool) { - this.pool = pool; + this.pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator( + bytesUsed)); pool.nextBuffer(); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + RamUsageEstimator.NUM_BYTES_INT); } - + + /** + * Clears this {@link BytesRefList} + */ + public void clear() { + lastElement = 0; + currentOffset = 0; + Arrays.fill(offsets, 0); + pool.reset(); + } + + /** + * Appends a copy of the given {@link BytesRef} to this {@link BytesRefList}. + * @param bytes the bytes to append + * @return the ordinal of the appended bytes + */ public int append(BytesRef bytes) { - if (currentElement >= offsets.length) { + if (lastElement >= offsets.length) { + int oldLen = offsets.length; offsets = ArrayUtil.grow(offsets, offsets.length + 1); + bytesUsed.addAndGet((offsets.length - oldLen) + * RamUsageEstimator.NUM_BYTES_INT); } pool.copy(bytes); - offsets[currentElement++] = currentOffset; + offsets[lastElement++] = currentOffset; currentOffset += bytes.length; - return currentElement; + return lastElement; } - + + /** + * Returns the current size of this {@link BytesRefList} + * @return the current size of this {@link BytesRefList} + */ public int size() { - return currentElement; + return lastElement; } - - public BytesRef get(BytesRef bytes, int pos) { - if (currentElement > pos) { - bytes.offset = offsets[pos]; - bytes.length = pos == currentElement - 1 ? 
currentOffset - bytes.offset - : offsets[pos + 1] - bytes.offset; - pool.copyFrom(bytes); - return bytes; + + /** + * Returns the n'th element of this {@link BytesRefList} + * @param spare a spare {@link BytesRef} instance + * @param ord the elements ordinal to retrieve + * @return the n'th element of this {@link BytesRefList} + */ + public BytesRef get(BytesRef spare, int ord) { + if (lastElement > ord) { + spare.offset = offsets[ord]; + spare.length = ord == lastElement - 1 ? currentOffset - spare.offset + : offsets[ord + 1] - spare.offset; + pool.copyFrom(spare); + return spare; } - throw new IndexOutOfBoundsException("index " + pos - + " must be less than the size: " + currentElement); - - } - - public BytesRefIterator iterator() { - final int numElements = currentElement; + throw new IndexOutOfBoundsException("index " + ord + + " must be less than the size: " + lastElement); - return new BytesRefIterator() { - private final BytesRef spare = new BytesRef(); - private int pos = 0; - - @Override - public BytesRef next() throws IOException { - if (pos < numElements) { - get(spare, pos++); - return spare; - } - return null; - } - }; } - public int[] sort(final Comparator comp) { + /** + * Returns the number internally used bytes to hold the appended bytes in + * memory + * + * @return the number internally used bytes to hold the appended bytes in + * memory + */ + public long bytesUsed() { + return bytesUsed.get(); + } + + private int[] sort(final Comparator comp) { final int[] orderdEntries = new int[size()]; for (int i = 0; i < orderdEntries.length; i++) { orderdEntries[i] = i; @@ -104,22 +142,65 @@ final int ord1 = orderdEntries[i], ord2 = orderdEntries[j]; return comp.compare(get(scratch1, ord1), get(scratch2, ord2)); } - + @Override protected void setPivot(int i) { final int ord = orderdEntries[i]; get(pivot, ord); } - + @Override protected int comparePivot(int j) { final int ord = orderdEntries[j]; return comp.compare(pivot, get(scratch2, ord)); } - private 
final BytesRef pivot = new BytesRef(), - scratch1 = new BytesRef(), scratch2 = new BytesRef(); + private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(), + scratch2 = new BytesRef(); }.quickSort(0, size() - 1); return orderdEntries; } + + /** + * sugar for {@link #iterator(Comparator)} with a null comparator + */ + public BytesRefIterator iterator() { + return iterator(null); + } + + /** + *

+ * Returns a {@link BytesRefIterator} with point in time semantics. The + * iterator provides access to all so far appended {@link BytesRef} instances. + *

+ *

+ * If a non null {@link Comparator} is provided the iterator will + * iterate the byte values in the order specified by the comparator. Otherwise + * the order is the same as the values were appended. + *

+ *

+ * This is a non-destructive operation. + *

+ */ + public BytesRefIterator iterator(final Comparator comp) { + final BytesRef spare = new BytesRef(); + final int size = size(); + final int[] ords = comp == null ? null : sort(comp); + return new BytesRefIterator() { + int pos = 0; + + @Override + public BytesRef next() throws IOException { + if (pos < size) { + return get(spare, ords == null ? pos++ : ords[pos++]); + } + return null; + } + + @Override + public Comparator getComparator() { + return comp; + } + }; + } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java (working copy) @@ -17,19 +17,166 @@ * limitations under the License. */ -import java.util.Collections; +import java.io.File; +import java.io.IOException; +import java.util.Comparator; -import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.suggest.fst.Sort; +import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader; +import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; /** - * This wrapper buffers incoming elements and makes sure they are sorted in - * ascending lexicographic order. + * This wrapper buffers incoming elements and makes sure they are sorted based on given comparator. 
+ * @lucene.experimental */ -public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator { +public class SortedTermFreqIteratorWrapper implements TermFreqIterator { + + private final TermFreqIterator source; + private File tempInput; + private File tempSorted; + private final ByteSequencesReader reader; + private boolean done = false; + + private long weight; + private final BytesRef scratch = new BytesRef(); + private final Comparator comparator; + + public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comparator) throws IOException { + this(source, comparator, false); + } + + public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comparator, boolean compareRawBytes) throws IOException { + this.source = source; + this.comparator = comparator; + this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator)); + } + + @Override + public BytesRef next() throws IOException { + boolean success = false; + if (done) { + return null; + } + try { + ByteArrayDataInput input = new ByteArrayDataInput(); + if (reader.read(scratch)) { + weight = decode(scratch, input); + success = true; + return scratch; + } + close(); + success = done = true; + return null; + } finally { + if (!success) { + done = true; + close(); + } + } + } + + @Override + public Comparator getComparator() { + return comparator; + } + + @Override + public long weight() { + return weight; + } + + private Sort.ByteSequencesReader sort(Comparator comparator) throws IOException { + String prefix = getClass().getSimpleName(); + File directory = Sort.defaultTempDir(); + tempInput = File.createTempFile(prefix, ".input", directory); + tempSorted = File.createTempFile(prefix, ".sorted", directory); + + final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); + boolean success = false; + try { + BytesRef spare; + byte[] buffer = new byte[0]; + ByteArrayDataOutput output = new 
ByteArrayDataOutput(buffer); - public SortedTermFreqIteratorWrapper(TermFreqIterator source) { - super(source); - Collections.sort(entries); + while ((spare = source.next()) != null) { + encode(writer, output, buffer, spare, source.weight()); + } + writer.close(); + new Sort(comparator).sort(tempInput, tempSorted); + ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted); + success = true; + return reader; + + } finally { + if (success) { + IOUtils.close(writer); + } else { + try { + IOUtils.closeWhileHandlingException(writer); + } finally { + close(); + } + } + + } } + + private void close() throws IOException { + if (tempInput != null) { + tempInput.delete(); + } + if (tempSorted != null) { + tempSorted.delete(); + } + IOUtils.close(reader); + } + + private final static class BytesOnlyComparator implements Comparator { + + final Comparator other; + private final BytesRef leftScratch = new BytesRef(); + private final BytesRef rightScratch = new BytesRef(); + + public BytesOnlyComparator(Comparator other) { + this.other = other; + } + + @Override + public int compare(BytesRef left, BytesRef right) { + wrap(leftScratch, left); + wrap(rightScratch, right); + return other.compare(leftScratch, rightScratch); + } + + private void wrap(BytesRef wrapper, BytesRef source) { + wrapper.bytes = source.bytes; + wrapper.offset = source.offset; + wrapper.length = source.length - 8; + + } + } + + protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException { + if (spare.length + 8 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 8); + } + output.reset(buffer); + output.writeBytes(spare.bytes, spare.offset, spare.length); + output.writeLong(weight); + writer.write(buffer, 0, output.getPosition()); + } + + protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { + tmpInput.reset(scratch.bytes); + tmpInput.skipBytes(scratch.length - 8); // suggestion + 
separator + scratch.length -= 8; // sep + long + return tmpInput.readLong(); + } + } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java (working copy) @@ -17,18 +17,47 @@ * limitations under the License. */ -import java.util.Collections; +import java.io.IOException; +import java.util.Random; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; /** * This wrapper buffers the incoming elements and makes sure they are in * random order. + * @lucene.experimental */ public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { - - public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) { + // TODO keep this for now + private final int[] ords; + private int currentOrd = -1; + private final BytesRef spare = new BytesRef(); + public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException { super(source); - Collections.shuffle(entries); + ords = new int[entries.size()]; + Random random = new Random(); + for (int i = 0; i < ords.length; i++) { + ords[i] = i; + } + for (int i = 0; i < ords.length; i++) { + int randomPosition = random.nextInt(ords.length); + int temp = ords[i]; + ords[i] = ords[randomPosition]; + ords[randomPosition] = temp; + } } + + @Override + public long weight() { + return freqs[currentOrd]; + } + + @Override + public BytesRef next() throws IOException { + if (++curPos < entries.size()) { + return entries.get(spare, (currentOrd = ords[curPos])); + } + return null; + } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/FileDictionary.java 
=================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/FileDictionary.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/FileDictionary.java (working copy) @@ -19,9 +19,12 @@ import java.io.*; +import java.util.Comparator; import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; /** @@ -36,7 +39,7 @@ private BufferedReader in; private String line; - private boolean hasNextCalled; + private boolean done = false; public FileDictionary(InputStream dictFile) { in = new BufferedReader(new InputStreamReader(dictFile)); @@ -50,45 +53,49 @@ } public TermFreqIterator getWordsIterator() { - return new fileIterator(); + return new FileIterator(); } - final class fileIterator implements TermFreqIterator { - private float curFreq; + final class FileIterator implements TermFreqIterator { + private long curFreq; + private final BytesRef spare = new BytesRef(); - public String next() { - if (!hasNextCalled) { - hasNext(); - } - hasNextCalled = false; - return line; - } - - public float freq() { + + public long weight() { return curFreq; } - public boolean hasNext() { - hasNextCalled = true; - try { - line = in.readLine(); - if (line != null) { - String[] fields = line.split("\t"); - if (fields.length > 1) { - curFreq = Float.parseFloat(fields[1]); - line = fields[0]; - } else { - curFreq = 1; + @Override + public BytesRef next() throws IOException { + if (done) { + return null; + } + line = in.readLine(); + if (line != null) { + String[] fields = line.split("\t"); + if (fields.length > 1) { + // keep reading floats for bw compat + try { + curFreq = Long.parseLong(fields[1]); + } catch (NumberFormatException e) { + curFreq = (long)Double.parseDouble(fields[1]); } + spare.copyChars(fields[0]); + } else { + 
spare.copyChars(line); + curFreq = 1; } - } catch (IOException ex) { - throw new RuntimeException(ex); + return spare; + } else { + done = true; + IOUtils.close(in); + return null; } - return (line != null) ? true : false; } - public void remove() { - throw new UnsupportedOperationException(); + @Override + public Comparator getComparator() { + return null; } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java (working copy) @@ -17,65 +17,54 @@ * limitations under the License. */ -import java.util.ArrayList; -import java.util.List; - +import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; /** * This wrapper buffers incoming elements. + * @lucene.experimental */ public class BufferingTermFreqIteratorWrapper implements TermFreqIterator { - - /** Entry in the buffer. 
*/ - public static final class Entry implements Comparable { - String word; - float freq; - - public Entry(String word, float freq) { - this.word = word; - this.freq = freq; + // TODO keep this for now + protected BytesRefList entries = new BytesRefList(); + protected int curPos = -1; + protected long[] freqs = new long[1]; + private final BytesRef spare = new BytesRef(); + private final Comparator comp; + public BufferingTermFreqIteratorWrapper(TermFreqIterator source) throws IOException { + this.comp = source.getComparator(); + BytesRef spare; + int freqIndex = 0; + while((spare = source.next()) != null) { + entries.append(spare); + if (freqIndex >= freqs.length) { + freqs = ArrayUtil.grow(freqs, freqs.length+1); + } + freqs[freqIndex++] = source.weight(); } - - public int compareTo(Entry o) { - return word.compareTo(o.word); - } + } - protected ArrayList entries = new ArrayList(); - - protected int curPos; - protected Entry curEntry; - - public BufferingTermFreqIteratorWrapper(TermFreqIterator source) { - // read all source data into buffer - while (source.hasNext()) { - String w = source.next(); - Entry e = new Entry(w, source.freq()); - entries.add(e); - } - curPos = 0; + public long weight() { + return freqs[curPos]; } - public float freq() { - return curEntry.freq; + @Override + public BytesRef next() throws IOException { + if (++curPos < entries.size()) { + entries.get(spare, curPos); + return spare; + } + return null; } - public boolean hasNext() { - return curPos < entries.size(); + @Override + public Comparator getComparator() { + return comp; } - public String next() { - curEntry = entries.get(curPos); - curPos++; - return curEntry.word; - } - - public void remove() { - throw new UnsupportedOperationException("remove is not supported"); - } - - public List entries() { - return entries; - } + } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/Lookup.java =================================================================== --- 
lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/Lookup.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/Lookup.java (working copy) @@ -19,22 +19,29 @@ import java.io.File; import java.io.IOException; -import java.util.Iterator; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Comparator; import java.util.List; import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.PriorityQueue; +/** + * Simple Lookup interface for {@link CharSequence} suggestions. + * @lucene.experimental + */ public abstract class Lookup { /** * Result of a lookup. */ public static final class LookupResult implements Comparable { - public final String key; - public final float value; + public final CharSequence key; + public final long value; - public LookupResult(String key, float value) { + public LookupResult(CharSequence key, long value) { this.key = key; this.value = value; } @@ -46,10 +53,32 @@ /** Compare alphabetically. */ public int compareTo(LookupResult o) { - return this.key.compareTo(o.key); + return CHARSEQUENCE_COMPARATOR.compare(key, o.key); } } + public static final Comparator CHARSEQUENCE_COMPARATOR = new CharSequenceComparator(); + + private static class CharSequenceComparator implements Comparator { + + @Override + public int compare(CharSequence o1, CharSequence o2) { + final int l1 = o1.length(); + final int l2 = o2.length(); + + final int aStop = Math.min(l1, l2); + for (int i = 0; i < aStop; i++) { + int diff = o1.charAt(i) - o2.charAt(i); + if (diff != 0) { + return diff; + } + } + // One is a prefix of the other, or, they are equal: + return l1 - l2; + } + + } + public static final class LookupPriorityQueue extends PriorityQueue { public LookupPriorityQueue(int size) { @@ -77,7 +106,7 @@ * {@link UnsortedTermFreqIteratorWrapper} in such case. 
*/ public void build(Dictionary dict) throws IOException { - Iterator it = dict.getWordsIterator(); + BytesRefIterator it = dict.getWordsIterator(); TermFreqIterator tfit; if (it instanceof TermFreqIterator) { tfit = (TermFreqIterator)it; @@ -87,48 +116,55 @@ build(tfit); } + /** + * Builds up a new internal {@link Lookup} representation based on the given {@link TermFreqIterator}. + * The implementation might re-sort the data internally. + */ public abstract void build(TermFreqIterator tfit) throws IOException; /** + * Look up a key and return possible completion for this key. + * @param key lookup key. Depending on the implementation this may be + * a prefix, misspelling, or even infix. + * @param onlyMorePopular return only more popular results + * @param num maximum number of results to return + * @return a list of possible completions, with their relative weight (e.g. popularity) + */ + public abstract List lookup(CharSequence key, boolean onlyMorePopular, int num); + + + /** * Persist the constructed lookup data to a directory. Optional operation. - * @param storeDir directory where data can be stored. + * @param output {@link OutputStream} to write the data to. * @return true if successful, false if unsuccessful or not supported. * @throws IOException when fatal IO error occurs. */ - public abstract boolean store(File storeDir) throws IOException; + public abstract boolean store(OutputStream output) throws IOException; /** * Discard current lookup data and load it from a previously saved copy. * Optional operation. - * @param storeDir directory where lookup data was stored. + * @param input the {@link InputStream} to load the lookup data. * @return true if completed successfully, false if unsuccessful or not supported. * @throws IOException when fatal IO error occurs. 
*/ - public abstract boolean load(File storeDir) throws IOException; + public abstract boolean load(InputStream input) throws IOException; /** - * Look up a key and return possible completion for this key. - * @param key lookup key. Depending on the implementation this may be - * a prefix, misspelling, or even infix. - * @param onlyMorePopular return only more popular results - * @param num maximum number of results to return - * @return a list of possible completions, with their relative weight (e.g. popularity) + * Persist the constructed lookup data to a directory. Optional operation. + * @param storeDir directory where data can be stored. + * @return true if successful, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. */ - public abstract List lookup(String key, boolean onlyMorePopular, int num); + public abstract boolean store(File storeDir) throws IOException; /** - * Modify the lookup data by recording additional data. Optional operation. - * @param key new lookup key - * @param value value to associate with this key - * @return true if new key is added, false if it already exists or operation - * is not supported. + * Discard current lookup data and load it from a previously saved copy. + * Optional operation. + * @param storeDir directory where lookup data was stored. + * @return true if completed successfully, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. */ - public abstract boolean add(String key, Object value); + public abstract boolean load(File storeDir) throws IOException; - /** - * Get value associated with a specific key. 
- * @param key lookup key - * @return associated value - */ - public abstract Object get(String key); } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java (working copy) @@ -23,14 +23,19 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper; import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.UnicodeUtil; public class JaspellLookup extends Lookup { JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie(); @@ -39,36 +44,39 @@ @Override public void build(TermFreqIterator tfit) throws IOException { - if (tfit instanceof SortedIterator) { + if (tfit.getComparator() != null) { // make sure it's unsorted + // WTF - this could result in yet another sorted iteration.... 
tfit = new UnsortedTermFreqIteratorWrapper(tfit); } trie = new JaspellTernarySearchTrie(); trie.setMatchAlmostDiff(editDistance); - while (tfit.hasNext()) { - String key = tfit.next(); - float freq = tfit.freq(); - if (key.length() == 0) { + BytesRef spare; + final CharsRef charsSpare = new CharsRef(); + + while ((spare = tfit.next()) != null) { + final long weight = tfit.weight(); + if (spare.length == 0) { continue; } - trie.put(key, new Float(freq)); + charsSpare.grow(spare.length); + UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); + trie.put(charsSpare.toString(), Long.valueOf(weight)); } } - @Override - public boolean add(String key, Object value) { + public boolean add(CharSequence key, Object value) { trie.put(key, value); // XXX return false; } - @Override - public Object get(String key) { + public Object get(CharSequence key) { return trie.get(key); } @Override - public List lookup(String key, boolean onlyMorePopular, int num) { + public List lookup(CharSequence key, boolean onlyMorePopular, int num) { List res = new ArrayList(); List list; int count = onlyMorePopular ? 
num * 2 : num; @@ -85,8 +93,8 @@ if (onlyMorePopular) { LookupPriorityQueue queue = new LookupPriorityQueue(num); for (String s : list) { - float freq = (Float)trie.get(s); - queue.insertWithOverflow(new LookupResult(s, freq)); + long freq = ((Number)trie.get(s)).longValue(); + queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq)); } for (LookupResult lr : queue.getResults()) { res.add(lr); @@ -94,8 +102,8 @@ } else { for (int i = 0; i < maxCnt; i++) { String s = list.get(i); - float freq = (Float)trie.get(s); - res.add(new LookupResult(s, freq)); + long freq = ((Number)trie.get(s)).longValue(); + res.add(new LookupResult(new CharsRef(s), freq)); } } return res; @@ -114,22 +122,14 @@ if (!data.exists() || !data.canRead()) { return false; } - DataInputStream in = new DataInputStream(new FileInputStream(data)); - TSTNode root = trie.new TSTNode('\0', null); - try { - readRecursively(in, root); - trie.setRoot(root); - } finally { - in.close(); - } - return true; + return load(new FileInputStream(data)); } private void readRecursively(DataInputStream in, TSTNode node) throws IOException { node.splitchar = in.readChar(); byte mask = in.readByte(); if ((mask & HAS_VALUE) != 0) { - node.data = new Float(in.readFloat()); + node.data = Long.valueOf(in.readLong()); } if ((mask & LO_KID) != 0) { TSTNode kid = trie.new TSTNode('\0', node); @@ -153,19 +153,8 @@ if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { return false; } - TSTNode root = trie.getRoot(); - if (root == null) { // empty tree - return false; - } File data = new File(storeDir, FILENAME); - DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); - try { - writeRecursively(out, root); - out.flush(); - } finally { - out.close(); - } - return true; + return store(new FileOutputStream(data)); } private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException { @@ -180,10 +169,39 @@ if (node.data != null) mask |= HAS_VALUE; 
out.writeByte(mask); if (node.data != null) { - out.writeFloat((Float)node.data); + out.writeLong(((Number)node.data).longValue()); } writeRecursively(out, node.relatives[TSTNode.LOKID]); writeRecursively(out, node.relatives[TSTNode.EQKID]); writeRecursively(out, node.relatives[TSTNode.HIKID]); } + + @Override + public boolean store(OutputStream output) throws IOException { + TSTNode root = trie.getRoot(); + if (root == null) { // empty tree + return false; + } + DataOutputStream out = new DataOutputStream(output); + try { + writeRecursively(out, root); + out.flush(); + } finally { + IOUtils.close(out); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + DataInputStream in = new DataInputStream(input); + TSTNode root = trie.new TSTNode('\0', null); + try { + readRecursively(in, root); + trie.setRoot(root); + } finally { + IOUtils.close(in); + } + return true; + } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/jaspell/JaspellTernarySearchTrie.java (working copy) @@ -368,8 +368,8 @@ * A String index. *@return The object retrieved from the Ternary Search Trie. */ - public Object get(String key) { - TSTNode node = getNode(key.trim().toLowerCase()); + public Object get(CharSequence key) { + TSTNode node = getNode(key); if (node == null) { return null; } @@ -435,7 +435,7 @@ *@return The node object indexed by key. This object is an instance of an * inner class named TernarySearchTrie.TSTNode. 
*/ - public TSTNode getNode(String key) { + public TSTNode getNode(CharSequence key) { return getNode(key, rootNode); } @@ -443,15 +443,14 @@ * Returns the node indexed by key, or null if that node doesn't * exist. The search begins at root node. * - *@param key2 + *@param key * A String that indexes the node that is returned. *@param startNode * The top node defining the subtrie to be searched. *@return The node object indexed by key. This object is an instance of an * inner class named TernarySearchTrie.TSTNode. */ - protected TSTNode getNode(String key2, TSTNode startNode) { - String key = key2.trim().toLowerCase(); + protected TSTNode getNode(CharSequence key, TSTNode startNode) { if (key == null || startNode == null || key.length() == 0) { return null; } @@ -490,7 +489,7 @@ *@exception IllegalArgumentException * If the key is an empty String. */ - protected TSTNode getOrCreateNode(String key) throws NullPointerException, + protected TSTNode getOrCreateNode(CharSequence key) throws NullPointerException, IllegalArgumentException { if (key == null) { throw new NullPointerException( @@ -568,7 +567,7 @@ * The maximum number of values returned by this method. *@return A List with the results */ - public List matchAlmost(String key, int numReturnValues) { + public List matchAlmost(CharSequence key, int numReturnValues) { return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key, ((numReturnValues < 0) ? -1 : numReturnValues), new Vector(), false); } @@ -598,7 +597,7 @@ *@return A List with the results. */ private List matchAlmostRecursion(TSTNode currentNode, int charIndex, - int d, String matchAlmostKey, int matchAlmostNumReturnValues, + int d, CharSequence matchAlmostKey, int matchAlmostNumReturnValues, List matchAlmostResult2, boolean upTo) { if ((currentNode == null) || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.size() >= matchAlmostNumReturnValues) @@ -658,7 +657,7 @@ * The maximum number of values returned from this method. 
*@return A List with the results */ - public List matchPrefix(String prefix, int numReturnValues) { + public List matchPrefix(CharSequence prefix, int numReturnValues) { Vector sortKeysResult = new Vector(); TSTNode startNode = getNode(prefix); if (startNode == null) { @@ -722,8 +721,8 @@ *@param value * The object to be stored in the Trie. */ - public void put(String key, Object value) { - getOrCreateNode(key.trim().toLowerCase()).data = value; + public void put(CharSequence key, Object value) { + getOrCreateNode(key).data = value; } /** Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java (working copy) @@ -23,13 +23,18 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; -import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.UnicodeUtil; public class TSTLookup extends Lookup { TernaryTreeNode root = new TernaryTreeNode(); @@ -39,43 +44,58 @@ public void build(TermFreqIterator tfit) throws IOException { root = new TernaryTreeNode(); // buffer first - if (!(tfit instanceof SortedIterator)) { - // make sure it's sorted - tfit = new SortedTermFreqIteratorWrapper(tfit); + if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) { + // make sure it's sorted and the 
comparator uses UTF16 sort order + tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator()); } ArrayList tokens = new ArrayList(); - ArrayList vals = new ArrayList(); - while (tfit.hasNext()) { - tokens.add(tfit.next()); - vals.add(new Float(tfit.freq())); + ArrayList vals = new ArrayList(); + BytesRef spare; + CharsRef charsSpare = new CharsRef(); + while ((spare = tfit.next()) != null) { + charsSpare.grow(spare.length); + UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); + tokens.add(charsSpare.toString()); + vals.add(Long.valueOf(tfit.weight())); } autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); } - @Override - public boolean add(String key, Object value) { + public boolean add(CharSequence key, Object value) { autocomplete.insert(root, key, value, 0); // XXX we don't know if a new node was created return true; } - @Override - public Object get(String key) { + public Object get(CharSequence key) { List list = autocomplete.prefixCompletion(root, key, 0); if (list == null || list.isEmpty()) { return null; } for (TernaryTreeNode n : list) { - if (n.token.equals(key)) { + if (charSeqEquals(n.token, key)) { return n.val; } } return null; } + + private static boolean charSeqEquals(CharSequence left, CharSequence right) { + int len = left.length(); + if (len != right.length()) { + return false; + } + for (int i = 0; i < len; i++) { + if (left.charAt(i) != right.charAt(i)) { + return false; + } + } + return true; + } @Override - public List lookup(String key, boolean onlyMorePopular, int num) { + public List lookup(CharSequence key, boolean onlyMorePopular, int num) { List list = autocomplete.prefixCompletion(root, key, 0); List res = new ArrayList(); if (list == null || list.size() == 0) { @@ -85,7 +105,7 @@ if (onlyMorePopular) { LookupPriorityQueue queue = new LookupPriorityQueue(num); for (TernaryTreeNode ttn : list) { - queue.insertWithOverflow(new 
LookupResult(ttn.token, (Float)ttn.val)); + queue.insertWithOverflow(new LookupResult(ttn.token, ((Number)ttn.val).longValue())); } for (LookupResult lr : queue.getResults()) { res.add(lr); @@ -93,7 +113,7 @@ } else { for (int i = 0; i < maxCnt; i++) { TernaryTreeNode ttn = list.get(i); - res.add(new LookupResult(ttn.token, (Float)ttn.val)); + res.add(new LookupResult(ttn.token, ((Number)ttn.val).longValue())); } } return res; @@ -113,14 +133,7 @@ if (!data.exists() || !data.canRead()) { return false; } - DataInputStream in = new DataInputStream(new FileInputStream(data)); - root = new TernaryTreeNode(); - try { - readRecursively(in, root); - } finally { - in.close(); - } - return true; + return load(new FileInputStream(data)); } // pre-order traversal @@ -131,7 +144,7 @@ node.token = in.readUTF(); } if ((mask & HAS_VALUE) != 0) { - node.val = new Float(in.readFloat()); + node.val = Long.valueOf(in.readLong()); } if ((mask & LO_KID) != 0) { node.loKid = new TernaryTreeNode(); @@ -153,14 +166,7 @@ return false; } File data = new File(storeDir, FILENAME); - DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); - try { - writeRecursively(out, root); - out.flush(); - } finally { - out.close(); - } - return true; + return store(new FileOutputStream(data)); } // pre-order traversal @@ -176,7 +182,7 @@ if (node.val != null) mask |= HAS_VALUE; out.writeByte(mask); if (node.token != null) out.writeUTF(node.token); - if (node.val != null) out.writeFloat((Float)node.val); + if (node.val != null) out.writeLong(((Number)node.val).longValue()); // recurse and write kids if (node.loKid != null) { writeRecursively(out, node.loKid); @@ -188,4 +194,28 @@ writeRecursively(out, node.hiKid); } } + + @Override + public synchronized boolean store(OutputStream output) throws IOException { + DataOutputStream out = new DataOutputStream(output); + try { + writeRecursively(out, root); + out.flush(); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override 
+ public synchronized boolean load(InputStream input) throws IOException { + DataInputStream in = new DataInputStream(input); + root = new TernaryTreeNode(); + try { + readRecursively(in, root); + } finally { + IOUtils.close(in); + } + return true; + } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/tst/TSTAutocomplete.java (working copy) @@ -57,7 +57,7 @@ * index of character in key to be inserted currently. * @return currentNode The new reference to root node of TST */ - public TernaryTreeNode insert(TernaryTreeNode currentNode, String s, + public TernaryTreeNode insert(TernaryTreeNode currentNode, CharSequence s, Object val, int x) { if (s == null || s.length() <= x) { return currentNode; @@ -69,7 +69,7 @@ if (x < s.length() - 1) { currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1); } else { - currentNode.token = s; + currentNode.token = s.toString(); currentNode.val = val; return currentNode; } @@ -79,7 +79,7 @@ if (x < s.length() - 1) { currentNode.eqKid = insert(currentNode.eqKid, s, val, x + 1); } else { - currentNode.token = s; + currentNode.token = s.toString(); currentNode.val = val; return currentNode; } @@ -104,7 +104,7 @@ * @return suggest list of auto-completed keys for the given prefix query. 
*/ public ArrayList prefixCompletion(TernaryTreeNode root, - String s, int x) { + CharSequence s, int x) { TernaryTreeNode p = root; ArrayList suggest = new ArrayList(); Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java (working copy) @@ -19,6 +19,8 @@ import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.List; @@ -29,6 +31,8 @@ import org.apache.lucene.search.suggest.tst.TSTLookup; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.*; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.NoOutputs; @@ -37,7 +41,7 @@ * An adapter from {@link Lookup} API to {@link FSTCompletion}. * *

This adapter differs from {@link FSTCompletion} in that it attempts - * to discretize any "weights" as passed from in {@link TermFreqIterator#freq()} + * to discretize any "weights" as passed from in {@link TermFreqIterator#weight()} * to match the number of buckets. For the rationale for bucketing, see * {@link FSTCompletion}. * @@ -55,6 +59,7 @@ * use {@link FSTCompletion} directly or {@link TSTLookup}, for example. * * @see FSTCompletion + * @lucene.experimental */ public class FSTCompletionLookup extends Lookup { /** @@ -158,20 +163,17 @@ // If negative floats are allowed some trickery needs to be done to find their byte order. boolean success = false; try { - BytesRef tmp1 = new BytesRef(); byte [] buffer = new byte [0]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); - while (tfit.hasNext()) { - String key = tfit.next(); - UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1); - - if (tmp1.length + 4 >= buffer.length) { - buffer = ArrayUtil.grow(buffer, tmp1.length + 4); + BytesRef spare; + while ((spare = tfit.next()) != null) { + if (spare.length + 4 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 4); } output.reset(buffer); - output.writeInt(FloatMagic.toSortable(tfit.freq())); - output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length); + output.writeInt(encodeWeight(tfit.weight())); + output.writeBytes(spare.bytes, spare.offset, spare.length); writer.write(buffer, 0, output.getPosition()); } writer.close(); @@ -187,12 +189,13 @@ reader = new Sort.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; - float previousScore = 0; + int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); + BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.read(tmp1)) { input.reset(tmp1.bytes); - float currentScore = FloatMagic.fromSortable(input.readInt()); + int currentScore = input.readInt(); int bucket; if (line > 0 && currentScore == previousScore) { @@ -228,9 +231,17 @@ 
tempSorted.delete(); } } + + /** weight -> cost */ + private static int encodeWeight(long value) { + if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) { + throw new UnsupportedOperationException("cannot encode value: " + value); + } + return (int)value; + } @Override - public List lookup(String key, boolean higherWeightsFirst, int num) { + public List lookup(CharSequence key, boolean higherWeightsFirst, int num) { final List completions; if (higherWeightsFirst) { completions = higherWeightsCompletion.lookup(key, num); @@ -239,27 +250,20 @@ } final ArrayList results = new ArrayList(completions.size()); + CharsRef spare = new CharsRef(); for (Completion c : completions) { - results.add(new LookupResult(c.utf8.utf8ToString(), c.bucket)); + spare.grow(c.utf8.length); + UnicodeUtil.UTF8toUTF16(c.utf8, spare); + results.add(new LookupResult(spare.toString(), c.bucket)); } return results; } - @Override - public boolean add(String key, Object value) { - // Not supported. - return false; + public Object get(CharSequence key) { + final int bucket = normalCompletion.getBucket(key); + return bucket == -1 ? null : Long.valueOf(bucket); } - @Override - public Float get(String key) { - Integer bucket = normalCompletion.getBucket(key); - if (bucket == null) - return null; - else - return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount(); - } - /** * Deserialization from disk. 
*/ @@ -293,4 +297,30 @@ normalCompletion.getFST().save(new File(storeDir, FILENAME)); return true; } + + @Override + public synchronized boolean store(OutputStream output) throws IOException { + + if (this.normalCompletion == null) + return false; + try { + normalCompletion.getFST().save(new OutputStreamDataOutput(output)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public synchronized boolean load(InputStream input) throws IOException { + try { + this.higherWeightsCompletion = new FSTCompletion(new FST( + new InputStreamDataInput(input), NoOutputs.getSingleton())); + this.normalCompletion = new FSTCompletion( + higherWeightsCompletion.getFST(), false, exactMatchFirst); + } finally { + IOUtils.close(input); + } + return true; + } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java (working copy) @@ -18,13 +18,16 @@ */ import java.io.IOException; -import java.util.Iterator; +import java.util.Comparator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; /** * Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations - * of this interface will be called in a single-threaded scenario. + * of this interface will be called in a single-threaded scenario. + * @lucene.experimental + * @lucene.internal */ public interface BytesRefSorter { /** @@ -42,5 +45,7 @@ * * @throws IOException If an I/O exception occurs. 
*/ - Iterator iterator() throws IOException; + BytesRefIterator iterator() throws IOException; + + Comparator getComparator(); } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java (working copy) @@ -13,6 +13,7 @@ import java.util.Comparator; import java.util.List; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; @@ -156,12 +157,13 @@ // Buffer the input because we will need it twice: for calculating // weights distribution and for the actual automata building. List entries = new ArrayList(); - while (tfit.hasNext()) { - String term = tfit.next(); + BytesRef spare; + while ((spare = tfit.next()) != null) { + String term = spare.utf8ToString(); char [] termChars = new char [term.length() + 1]; // add padding for weight. for (int i = 0; i < term.length(); i++) termChars[i + 1] = term.charAt(i); - entries.add(new Entry(termChars, tfit.freq())); + entries.add(new Entry(termChars, tfit.weight())); } // Distribute weights into at most N buckets. This is a form of discretization to @@ -203,15 +205,6 @@ } } - /** - * Not implemented. - */ - @Override - public boolean add(String key, Object value) { - // This implementation does not support ad-hoc additions (all input - // must be sorted for the builder). - return false; - } /** * Get the (approximated) weight of a single key (if there is a perfect match @@ -220,8 +213,7 @@ * @return Returns the approximated weight of the input key or null * if not found. 
*/ - @Override - public Float get(String key) { + public Float get(CharSequence key) { return getExactMatchStartingFromRootArc(0, key); } @@ -232,7 +224,7 @@ * @param i The first root arc index in {@link #rootArcs} to consider when * matching. */ - private Float getExactMatchStartingFromRootArc(int i, String key) { + private Float getExactMatchStartingFromRootArc(int i, CharSequence key) { // Get the UTF-8 bytes representation of the input key. try { final FST.Arc scratch = new FST.Arc(); @@ -269,7 +261,7 @@ * and then alphabetically (utf16 codepoint order). */ @Override - public List lookup(String key, boolean onlyMorePopular, int num) { + public List lookup(CharSequence key, boolean onlyMorePopular, int num) { if (key.length() == 0 || automaton == null) { // Keep the result an ArrayList to keep calls monomorphic. return EMPTY_RESULT; @@ -294,7 +286,7 @@ * Lookup suggestions sorted alphabetically if weights are not constant. This * is a workaround: in general, use constant weights for alphabetically sorted result. */ - private List lookupSortedAlphabetically(String key, int num) throws IOException { + private List lookupSortedAlphabetically(CharSequence key, int num) throws IOException { // Greedily get num results from each weight branch. List res = lookupSortedByWeight(key, num, true); @@ -302,7 +294,7 @@ Collections.sort(res, new Comparator() { // not till java6 @Override public int compare(LookupResult o1, LookupResult o2) { - return o1.key.compareTo(o2.key); + return Lookup.CHARSEQUENCE_COMPARATOR.compare(o1.key, o2.key); } }); if (res.size() > num) { @@ -318,7 +310,7 @@ * suggestions have been collected. If false, it will collect suggestions from * all weight arcs (needed for {@link #lookupSortedAlphabetically}. 
*/ - private ArrayList lookupSortedByWeight(String key, int num, boolean collectAll) throws IOException { + private ArrayList lookupSortedByWeight(CharSequence key, int num, boolean collectAll) throws IOException { // Don't overallocate the results buffers. This also serves the purpose of allowing // the user of this class to request all matches using Integer.MAX_VALUE as the number // of results. @@ -339,18 +331,18 @@ // of the key prefix. The arc we're at is the last key's byte, // so we will collect it too. output.setLength(matchLength); - if (collect(res, num, weight, output, arc) && !collectAll) { + if (collect(res, num, (long) weight, output, arc) && !collectAll) { // We have enough suggestions to return immediately. Keep on looking for an // exact match, if requested. if (exactMatchFirst) { if (!checkExistingAndReorder(res, key)) { - Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key); + Number exactMatchWeight = getExactMatchStartingFromRootArc(i, key); if (exactMatchWeight != null) { // Insert as the first result and truncate at num. while (res.size() >= num) { res.remove(res.size() - 1); } - res.add(0, new LookupResult(key, exactMatchWeight)); + res.add(0, new LookupResult(key, exactMatchWeight.longValue())); } } } @@ -367,7 +359,7 @@ * * @return Returns true if and only if list contained key. */ - private boolean checkExistingAndReorder(ArrayList list, String key) { + private boolean checkExistingAndReorder(ArrayList list, CharSequence key) { // We assume list does not have duplicates (because of how the FST is created). for (int i = list.size(); --i >= 0;) { - if (key.equals(list.get(i).key)) { + if (Lookup.CHARSEQUENCE_COMPARATOR.compare(key, list.get(i).key) == 0) { @@ -390,7 +382,7 @@ * last byte of utf8. false is returned if no such * prefix utf8 exists. 
*/ - private boolean descendWithPrefix(Arc arc, String term) throws IOException { + private boolean descendWithPrefix(Arc arc, CharSequence term) throws IOException { final int max = term.length(); final FST.BytesReader fstReader = automaton.getBytesReader(0); @@ -410,7 +402,7 @@ * @param num Maximum number of results needed (early termination). * @param weight Weight of all results found during this collection. */ - private boolean collect(List res, int num, float weight, StringBuilder output, Arc arc) throws IOException { + private boolean collect(List res, int num, long weight, StringBuilder output, Arc arc) throws IOException { output.append((char) arc.label); automaton.readFirstTargetArc(arc, arc); @@ -535,14 +527,7 @@ return false; } - InputStream is = new BufferedInputStream(new FileInputStream(data)); - try { - this.automaton = new FST(new InputStreamDataInput(is), NoOutputs.getSingleton()); - cacheRootArcs(); - } finally { - IOUtils.close(is); - } - return true; + return load(new FileInputStream(data)); } /** @@ -558,13 +543,30 @@ return false; File data = new File(storeDir, FILENAME); - OutputStream os = new BufferedOutputStream(new FileOutputStream(data)); + return store(new FileOutputStream(data)); + } + + @Override + public boolean store(OutputStream output) throws IOException { + OutputStream os = new BufferedOutputStream(output); try { this.automaton.save(new OutputStreamDataOutput(os)); } finally { IOUtils.close(os); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + InputStream is = new BufferedInputStream(input); + try { + this.automaton = new FST(new InputStreamDataInput(is), NoOutputs.getSingleton()); + cacheRootArcs(); + } finally { + IOUtils.close(is); } return true; } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java =================================================================== ---
lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java (working copy) @@ -28,6 +28,7 @@ * Finite state automata based implementation of "autocomplete" functionality. * * @see FSTCompletionBuilder + * @lucene.experimental */ // TODO: we could store exact weights as outputs from the FST (int4 encoded @@ -159,10 +160,10 @@ * @param utf8 * The sequence of utf8 bytes to follow. * - * @return Returns the bucket number of the match or null if no + * @return Returns the bucket number of the match or -1 if no * match was found. */ - private Integer getExactMatchStartingFromRootArc( + private int getExactMatchStartingFromRootArc( int rootArcIndex, BytesRef utf8) { // Get the UTF-8 bytes representation of the input key. try { @@ -186,7 +187,7 @@ } // No match. - return null; + return -1; } /** @@ -199,7 +200,7 @@ * @return Returns the suggestions, sorted by their approximated weight first * (decreasing) and then alphabetically (UTF-8 codepoint order). */ - public List lookup(String key, int num) { + public List lookup(CharSequence key, int num) { if (key.length() == 0 || automaton == null) { return EMPTY_RESULT; } @@ -273,8 +274,8 @@ // exact match, if requested. if (exactFirst) { if (!checkExistingAndReorder(res, key)) { - Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key); - if (exactMatchBucket != null) { + int exactMatchBucket = getExactMatchStartingFromRootArc(i, key); + if (exactMatchBucket != -1) { // Insert as the first result and truncate at num. while (res.size() >= num) { res.remove(res.size() - 1); @@ -385,10 +386,10 @@ } /** - * Returns the bucket assigned to a given key (if found) or null if + * Returns the bucket assigned to a given key (if found) or -1 if * no exact match exists. 
*/ - public Integer getBucket(String key) { + public int getBucket(CharSequence key) { return getExactMatchStartingFromRootArc(0, new BytesRef(key)); } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java (revision 1297711) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java (working copy) @@ -1,75 +0,0 @@ -package org.apache.lucene.search.suggest.fst; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.NumericUtils; - -/** - * Converts normalized float representations ({@link Float#floatToIntBits(float)}) - * into integers that are directly sortable in int4 representation (or unsigned values or - * after promoting to a long with higher 32-bits zeroed). - */ -class FloatMagic { - /** - * Convert a float to a directly sortable unsigned integer. For sortable signed - * integers, see {@link NumericUtils#floatToSortableInt(float)}. 
- */ - public static int toSortable(float f) { - return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f)); - } - - /** - * Back from {@link #toSortable(float)} to float. - */ - public static float fromSortable(int v) { - return Float.intBitsToFloat(unsignedOrderedToFloatBits(v)); - } - - /** - * Convert float bits to directly sortable bits. - * Normalizes all NaNs to canonical form. - */ - static int floatBitsToUnsignedOrdered(int v) { - // Canonicalize NaN ranges. I assume this check will be faster here than - // (v == v) == false on the FPU? We don't distinguish between different - // flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess - // in Java this doesn't matter much anyway. - if ((v & 0x7fffffff) > 0x7f800000) { - // Apply the logic below to a canonical "quiet NaN" - return 0x7fc00000 ^ 0x80000000; - } - - if (v < 0) { - // Reverse the order of negative values and push them before positive values. - return ~v; - } else { - // Shift positive values after negative, but before NaNs, they're sorted already. - return v ^ 0x80000000; - } - } - - /** - * Back from {@link #floatBitsToUnsignedOrdered(int)}. - */ - static int unsignedOrderedToFloatBits(int v) { - if (v < 0) - return v & ~0x80000000; - else - return ~v; - } -} Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/Sort.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/Sort.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/Sort.java (working copy) @@ -20,15 +20,10 @@ import java.io.*; import java.util.*; +import org.apache.lucene.search.suggest.BytesRefList; import org.apache.lucene.util.*; import org.apache.lucene.util.PriorityQueue; -// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate -// memory (alignments) and make GC very happy. 
-// -// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this -// yields any improvement first. - /** * On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following * fields: @@ -38,6 +33,8 @@ * * * @see #sort(File, File) + * @lucene.experimental + * @lucene.internal */ public final class Sort { public final static int MB = 1024 * 1024; @@ -59,11 +56,6 @@ */ public final static int MAX_TEMPFILES = 128; - /** - * Minimum slot buffer expansion. - */ - private final static int MIN_EXPECTED_GROWTH = 1000; - /** * A bit more descriptive unit for constructors. * @@ -112,21 +104,6 @@ } /** - * byte[] in unsigned byte order. - */ - static final Comparator unsignedByteOrderComparator = new Comparator() { - public int compare(byte[] left, byte[] right) { - final int max = Math.min(left.length, right.length); - for (int i = 0, j = 0; i < max; i++, j++) { - int diff = (left[i] & 0xff) - (right[j] & 0xff); - if (diff != 0) - return diff; - } - return left.length - right.length; - } - }; - - /** * Sort info (debugging mostly). */ public class SortInfo { @@ -149,14 +126,15 @@ } } - private final static byte [][] EMPTY = new byte [0][]; - private final BufferSize ramBufferSize; private final File tempDirectory; - - private byte [][] buffer = new byte [0][]; + + private final BytesRefList buffer = new BytesRefList(); private SortInfo sortInfo; private int maxTempFiles; + private final Comparator comparator; + + public static final Comparator DEFAULT_COMPARATOR = BytesRef.getUTF8SortedAsUnicodeComparator(); /** * Defaults constructor. 
@@ -165,13 +143,17 @@ * @see BufferSize#automatic() */ public Sort() throws IOException { - this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); + this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); } + + public Sort(Comparator comparator) throws IOException { + this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); + } /** * All-details constructor. */ - public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) { + public Sort(Comparator comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) { if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) { throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes); } @@ -183,6 +165,7 @@ this.ramBufferSize = ramBufferSize; this.tempDirectory = tempDirectory; this.maxTempFiles = maxTempfiles; + this.comparator = comparator; } /** @@ -283,23 +266,25 @@ /** Sort a single partition in-memory. */ protected File sortPartition(int len) throws IOException { - byte [][] data = this.buffer; + BytesRefList data = this.buffer; File tempFile = File.createTempFile("sort", "partition", tempDirectory); long start = System.currentTimeMillis(); - Arrays.sort(data, 0, len, unsignedByteOrderComparator); sortInfo.sortTime += (System.currentTimeMillis() - start); - ByteSequencesWriter out = new ByteSequencesWriter(tempFile); + final ByteSequencesWriter out = new ByteSequencesWriter(tempFile); + BytesRef spare; try { - for (int i = 0; i < len; i++) { - assert data[i].length <= Short.MAX_VALUE; - out.write(data[i]); + BytesRefIterator iter = buffer.iterator(comparator); + while((spare = iter.next()) != null) { + assert spare.length <= Short.MAX_VALUE; + out.write(spare); } + out.close(); // Clean up the buffer for the next partition. 
- this.buffer = EMPTY; + data.clear(); return tempFile; } finally { IOUtils.close(out); @@ -317,7 +302,7 @@ initialize(size); } protected boolean lessThan(FileAndTop a, FileAndTop b) { - return a.current.compareTo(b.current) < 0; + return comparator.compare(a.current, b.current) < 0; } }; @@ -362,33 +347,18 @@ /** Read in a single partition of data */ int readPartition(ByteSequencesReader reader) throws IOException { long start = System.currentTimeMillis(); - - // We will be reallocating from scratch. - Arrays.fill(this.buffer, null); - - int bytesLimit = this.ramBufferSize.bytes; - byte [][] data = this.buffer; - byte[] line; - int linesRead = 0; - while ((line = reader.read()) != null) { - if (linesRead + 1 >= data.length) { - byte[][] newData = new byte[ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][]; - System.arraycopy(data, 0, newData, 0, data.length); - data = newData; - } - data[linesRead++] = line; - + final BytesRef scratch = new BytesRef(); + while ((scratch.bytes = reader.read()) != null) { + scratch.length = scratch.bytes.length; + buffer.append(scratch); // Account for the created objects. // (buffer slots do not account to buffer size.) - bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER; - if (bytesLimit < 0) { + if (ramBufferSize.bytes < buffer.bytesUsed()) { break; } } - this.buffer = data; - sortInfo.readTime += (System.currentTimeMillis() - start); - return linesRead; + return buffer.size(); } static class FileAndTop { @@ -438,6 +408,7 @@ /** * Closes the provided {@link DataOutput} if it is {@link Closeable}. */ + @Override public void close() throws IOException { if (os instanceof Closeable) { ((Closeable) os).close(); @@ -511,10 +482,15 @@ /** * Closes the provided {@link DataInput} if it is {@link Closeable}. 
*/ + @Override public void close() throws IOException { if (is instanceof Closeable) { ((Closeable) is).close(); } } + } + + public Comparator getComparator() { + return comparator; } } \ No newline at end of file Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (working copy) @@ -19,6 +19,8 @@ import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -26,10 +28,15 @@ import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; +import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.UnicodeUtil; @@ -97,74 +104,27 @@ @Override public void build(TermFreqIterator iterator) throws IOException { - String prefix = getClass().getSimpleName(); - File directory = Sort.defaultTempDir(); - File tempInput = File.createTempFile(prefix, ".input", directory); - File tempSorted = File.createTempFile(prefix, ".sorted", directory); - - Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); - Sort.ByteSequencesReader 
reader = null; BytesRef scratch = new BytesRef(); - - boolean success = false; - try { - byte [] buffer = new byte [0]; - ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); - while (iterator.hasNext()) { - String key = iterator.next(); - UnicodeUtil.UTF16toUTF8(key, 0, key.length(), scratch); - - if (scratch.length + 5 >= buffer.length) { - buffer = ArrayUtil.grow(buffer, scratch.length + 5); - } - - output.reset(buffer); - output.writeBytes(scratch.bytes, scratch.offset, scratch.length); - output.writeByte((byte)0); // separator: not used, just for sort order - output.writeInt((int)encodeWeight(iterator.freq())); - writer.write(buffer, 0, output.getPosition()); - } - writer.close(); - new Sort().sort(tempInput, tempSorted); - reader = new Sort.ByteSequencesReader(tempSorted); + TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, + BytesRef.getUTF8SortedAsUnicodeComparator()); + IntsRef scratchInts = new IntsRef(); + BytesRef previous = null; + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + while ((scratch = iter.next()) != null) { + long cost = iter.weight(); - PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - BytesRef previous = null; - BytesRef suggestion = new BytesRef(); - IntsRef scratchInts = new IntsRef(); - ByteArrayDataInput input = new ByteArrayDataInput(); - while (reader.read(scratch)) { - suggestion.bytes = scratch.bytes; - suggestion.offset = scratch.offset; - suggestion.length = scratch.length - 5; // int + separator - - input.reset(scratch.bytes); - input.skipBytes(suggestion.length + 1); // suggestion + separator - long cost = input.readInt(); - - if (previous == null) { - previous = new BytesRef(); - } else if (suggestion.equals(previous)) { - continue; // for duplicate suggestions, the best weight is actually added - } - Util.toIntsRef(suggestion, 
scratchInts); - builder.add(scratchInts, cost); - previous.copyBytes(suggestion); + if (previous == null) { + previous = new BytesRef(); + } else if (scratch.equals(previous)) { + continue; // for duplicate suggestions, the best weight is actually + // added } - fst = builder.finish(); - success = true; - } finally { - if (success) { - IOUtils.close(reader, writer); - } else { - IOUtils.closeWhileHandlingException(reader, writer); - } - - tempInput.delete(); - tempSorted.delete(); + Util.toIntsRef(scratch, scratchInts); + builder.add(scratchInts, cost); + previous.copyBytes(scratch); } + fst = builder.finish(); } @Override @@ -178,9 +138,29 @@ this.fst = FST.read(new File(storeDir, FILENAME), PositiveIntOutputs.getSingleton(true)); return true; } + + @Override + public boolean store(OutputStream output) throws IOException { + try { + fst.save(new OutputStreamDataOutput(output)); + } finally { + IOUtils.close(output); + } + return true; + } @Override - public List lookup(String key, boolean onlyMorePopular, int num) { + public boolean load(InputStream input) throws IOException { + try { + this.fst = new FST(new InputStreamDataInput(input), PositiveIntOutputs.getSingleton(true)); + } finally { + IOUtils.close(input); + } + return true; + } + + @Override + public List lookup(CharSequence key, boolean onlyMorePopular, int num) { assert num > 0; BytesRef scratch = new BytesRef(key); int prefixLength = scratch.length; @@ -197,8 +177,11 @@ } List results = new ArrayList(num); + CharsRef spare = new CharsRef(); if (exactFirst && arc.isFinal()) { - results.add(new LookupResult(scratch.utf8ToString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); + spare.grow(scratch.length); + UnicodeUtil.UTF8toUTF16(scratch, spare); + results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); if (--num == 0) { return results; // that was quick } @@ -216,8 +199,9 @@ // append suffix Util.toBytesRef(completion.input, suffix); 
scratch.append(suffix); - - results.add(new LookupResult(scratch.utf8ToString(), decodeWeight(prefixOutput + completion.output))); + spare.grow(scratch.length); + UnicodeUtil.UTF8toUTF16(scratch, spare); + results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output))); } return results; } @@ -243,17 +227,11 @@ return output; } - @Override - public boolean add(String key, Object value) { - return false; // Not supported. - } - /** * Returns the weight associated with an input string, * or null if it does not exist. */ - @Override - public Float get(String key) { + public Object get(CharSequence key) { Arc arc = new Arc(); Long result = null; try { @@ -262,23 +240,51 @@ if (result == null || !arc.isFinal()) { return null; } else { - return decodeWeight(result + arc.nextFinalOutput); + return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput)); } } /** cost -> weight */ - private static float decodeWeight(long encoded) { - return Integer.MAX_VALUE - encoded; + private static int decodeWeight(long encoded) { + return (int)(Integer.MAX_VALUE - encoded); } /** weight -> cost */ - private static long encodeWeight(float value) { - if (Float.isNaN(value) || Float.isInfinite(value) || value < 0 || value > Integer.MAX_VALUE) { + private static int encodeWeight(long value) { + if (value < 0 || value > Integer.MAX_VALUE) { throw new UnsupportedOperationException("cannot encode value: " + value); } return Integer.MAX_VALUE - (int)value; } + private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper { + + WFSTTermFreqIteratorWrapper(TermFreqIterator source, + Comparator comparator) throws IOException { + super(source, comparator, true); + } + + @Override + protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException { + if (spare.length + 5 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 5); + } + 
output.reset(buffer); + output.writeBytes(spare.bytes, spare.offset, spare.length); + output.writeByte((byte)0); // separator: not used, just for sort order + output.writeInt(encodeWeight(weight)); + writer.write(buffer, 0, output.getPosition()); + } + + @Override + protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { + tmpInput.reset(scratch.bytes); + tmpInput.skipBytes(scratch.length - 4); // suggestion + separator + scratch.length -= 5; // sep + int (1 + 4 bytes) + return tmpInput.readInt(); + } + } + static final Comparator weightComparator = new Comparator () { public int compare(Long left, Long right) { return left.compareTo(right); Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (working copy) @@ -19,9 +19,9 @@ import java.io.Closeable; import java.io.IOException; -import java.util.Iterator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.*; @@ -46,7 +46,7 @@ * * *

- * At runtime, in {@link FSTCompletion#lookup(String, int)}, + * At runtime, in {@link FSTCompletion#lookup(CharSequence, int)}, * the automaton is utilized as follows: *

    *
  • For each possible term weight encoded in the automaton (cached arcs from @@ -98,6 +98,7 @@ * change, requiring you to rebuild the FST suggest index. * * @see FSTCompletion + * @lucene.experimental */ public class FSTCompletionBuilder { /** @@ -143,10 +144,11 @@ /** * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match - * promoted to first position and {@link InMemorySorter}. + * promoted to first position and {@link InMemorySorter} with a comparator obtained from + * {@link BytesRef#getUTF8SortedAsUnicodeComparator()}. */ public FSTCompletionBuilder() { - this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE); + this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()), Integer.MAX_VALUE); } /** @@ -237,10 +239,12 @@ shareMaxTailLength, outputs, null, false); BytesRef scratch = new BytesRef(); + BytesRef entry; final IntsRef scratchIntsRef = new IntsRef(); int count = 0; - for (Iterator i = sorter.iterator(); i.hasNext(); count++) { - BytesRef entry = i.next(); + BytesRefIterator iter = sorter.iterator(); + while((entry = iter.next()) != null) { + count++; if (scratch.compareTo(entry) != 0) { builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java (working copy) @@ -18,60 +18,67 @@ */ import java.io.*; -import java.util.Iterator; -import java.util.NoSuchElementException; +import java.util.Comparator; import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; +import 
org.apache.lucene.util.IOUtils; /** * Builds and iterates over sequences stored on disk. + * @lucene.experimental + * @lucene.internal */ public class ExternalRefSorter implements BytesRefSorter, Closeable { private final Sort sort; private Sort.ByteSequencesWriter writer; private File input; - private File sorted; - + private File sorted; + /** * Will buffer all sequences to a temporary file and then sort (all on-disk). */ public ExternalRefSorter(Sort sort) throws IOException { this.sort = sort; - this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir()); + this.input = File.createTempFile("RefSorter-", ".raw", + Sort.defaultTempDir()); this.writer = new Sort.ByteSequencesWriter(input); } - + + @Override public void add(BytesRef utf8) throws IOException { - if (writer == null) - throw new IllegalStateException(); + if (writer == null) throw new IllegalStateException(); writer.write(utf8); } - - public Iterator iterator() throws IOException { + + public BytesRefIterator iterator() throws IOException { if (sorted == null) { closeWriter(); - - sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir()); + + sorted = File.createTempFile("RefSorter-", ".sorted", + Sort.defaultTempDir()); sort.sort(input, sorted); - + input.delete(); input = null; } - - return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted)); + + return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted), + sort.getComparator()); } - + private void closeWriter() throws IOException { if (writer != null) { writer.close(); writer = null; } } - + /** * Removes any written temporary files. */ + @Override public void close() throws IOException { try { closeWriter(); @@ -80,37 +87,54 @@ if (sorted != null) sorted.delete(); } } - + /** * Iterate over byte refs in a file. 
*/ - class ByteSequenceIterator implements Iterator { - private ByteSequencesReader reader; - private byte[] next; - - public ByteSequenceIterator(ByteSequencesReader reader) throws IOException { + class ByteSequenceIterator implements BytesRefIterator { + private final ByteSequencesReader reader; + private BytesRef scratch = new BytesRef(); + private final Comparator comparator; + + public ByteSequenceIterator(ByteSequencesReader reader, + Comparator comparator) { this.reader = reader; - this.next = reader.read(); + this.comparator = comparator; } - - public boolean hasNext() { - return next != null; - } - public BytesRef next() { - if (next == null) throw new NoSuchElementException(); - BytesRef r = new BytesRef(next); + @Override + public BytesRef next() throws IOException { + if (scratch == null) { + return null; + } + boolean success = false; try { - next = reader.read(); - if (next == null) { - reader.close(); + byte[] next = reader.read(); + if (next != null) { + scratch.bytes = next; + scratch.length = next.length; + scratch.offset = 0; + } else { + IOUtils.close(reader); + scratch = null; } - } catch (IOException e) { - throw new RuntimeException(e); + success = true; + return scratch; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(reader); + } } - return r; } + + @Override + public Comparator getComparator() { + return comparator; + } + } - public void remove() { throw new UnsupportedOperationException(); } + @Override + public Comparator getComparator() { + return sort.getComparator(); } } Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java =================================================================== --- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java (revision 1297697) +++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java (working copy) @@ -17,27 +17,40 @@ * limitations under the 
License. */ -import java.util.*; +import java.util.Comparator; +import org.apache.lucene.search.suggest.BytesRefList; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; /** * An {@link BytesRefSorter} that keeps all the entries in memory. + * @lucene.experimental + * @lucene.internal */ public final class InMemorySorter implements BytesRefSorter { - // TODO: use a single byte[] to back up all entries? - private final ArrayList refs = new ArrayList(); - + private final BytesRefList buffer = new BytesRefList(); private boolean closed = false; + private final Comparator comparator; + public InMemorySorter(Comparator comparator) { + this.comparator = comparator; + } + + @Override public void add(BytesRef utf8) { if (closed) throw new IllegalStateException(); - refs.add(BytesRef.deepCopyOf(utf8)); + buffer.append(utf8); } - public Iterator iterator() { + @Override + public BytesRefIterator iterator() { closed = true; - Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator()); - return Collections.unmodifiableCollection(refs).iterator(); + return buffer.iterator(comparator); } + + @Override + public Comparator getComparator() { + return comparator; + } } Index: lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (revision 1297697) +++ lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (working copy) @@ -26,6 +26,7 @@ import java.io.OutputStream; import java.io.PrintStream; import java.lang.reflect.Method; +import java.nio.CharBuffer; import java.util.Enumeration; import java.util.HashMap; import java.util.Map; @@ -680,4 +681,23 @@ return doc2; } + + public static CharSequence stringToCharSequence(String string, Random random) { + return bytesToCharSequence(new BytesRef(string), random); + } + + public static CharSequence bytesToCharSequence(BytesRef 
ref, Random random) { + switch(random.nextInt(5)) { + case 4: + CharsRef chars = new CharsRef(ref.length); + UnicodeUtil.UTF8toUTF16(ref.bytes, ref.offset, ref.length, chars); + return chars; + case 3: + return CharBuffer.wrap(ref.utf8ToString()); + default: + return ref.utf8ToString(); + } + + } + }