Property changes on: .
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /lucene/dev/trunk:r1296805

Property changes on: solr
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /lucene/dev/trunk/solr:r1296805

Property changes on: solr/core
___________________________________________________________________
Deleted: svn:mergeinfo
Index: solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java
===================================================================
--- solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java	(revision 1297338)
+++ solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java	(working copy)
@@ -50,7 +50,7 @@
     factory.inform(new SolrResourceLoader(null, null));
     TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
     assertTokenStreamContents(ts,
-        new String[] { "シニア", "ソフトウェア", "エンジニア" }
+        new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
     );
   }
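With LUCENE-3767 the factory's search-mode output gains the original compound: シニアソフトウェアエンジニア is now emitted alongside シニア / ソフトウェア / エンジニア, stacked at the position of its first part with position increment 0 (the analyzer tests later in this patch assert exactly that). A minimal sketch for eyeballing that graph; the class and method names here are mine, not part of the patch:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    class TokenGraphDumpSketch {
      // Print each token with its position increment; in search mode the
      // compound is expected to arrive with posInc=0, stacked on its first part.
      static void dump(TokenStream ts) throws IOException {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(termAtt + " (posInc=" + posIncAtt.getPositionIncrement() + ")");
        }
        ts.end();
        ts.close();
      }
    }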
Index: solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java	(revision 1297338)
+++ solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java	(working copy)
@@ -28,8 +28,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.util.IOUtils;
 import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@
   //@Override
   public Tokenizer create(Reader input) {
-    return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+    return new KuromojiTokenizer(input, userDictionary, true, mode);
   }
 
   private Mode getMode(Map args) {
@@ -96,7 +95,7 @@
     if (mode != null) {
       return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
     } else {
-      return Segmenter.DEFAULT_MODE;
+      return KuromojiTokenizer.DEFAULT_MODE;
     }
   }
 }

Property changes on: lucene
___________________________________________________________________
Modified: svn:mergeinfo
   Merged /lucene/dev/trunk/lucene:r1296805

Property changes on: lucene/core/src/test/org/apache/lucene/util/TestRollingCharBuffer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Property changes on: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java	(revision 1297338)
+++ lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java	(working copy)
@@ -54,9 +54,10 @@
    * @param positionIncrement the distance from the prior term
    */
   public void setPositionIncrement(int positionIncrement) {
-    if (positionIncrement < 0)
+    if (positionIncrement < 0) {
       throw new IllegalArgumentException
-        ("Increment must be zero or greater: " + positionIncrement);
+        ("Increment must be zero or greater: got " + positionIncrement);
+    }
     this.positionIncrement = positionIncrement;
   }
 
@@ -79,7 +80,8 @@
     }
 
     if (other instanceof PositionIncrementAttributeImpl) {
-      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+      PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+      return positionIncrement == _other.positionIncrement;
     }
 
     return false;
@@ -95,5 +97,4 @@
     PositionIncrementAttribute t = (PositionIncrementAttribute) target;
     t.setPositionIncrement(positionIncrement);
   }
-
 }

Property changes on: lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttribute.java
___________________________________________________________________
Added: svn:eol-style
   + native

Property changes on: lucene/core/src/java/org/apache/lucene/util/RollingCharBuffer.java
___________________________________________________________________
Added: svn:eol-style
   + native
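The eol-style properties above belong to the newly added PositionLengthAttribute and RollingCharBuffer files. Position length is the other half of the token-graph story: a compound token spans several positions, not one. A minimal consumer sketch, assuming only the standard attribute APIs (the class name is illustrative):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

    class PositionWalkSketch {
      // Walk a token stream tracking graph positions: posInc advances the
      // start position; posLength says how many positions the token spans
      // (1 for ordinary tokens, >1 for compounds such as 関西国際空港).
      static void walk(TokenStream ts) throws IOException {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
        int pos = -1;
        ts.reset();
        while (ts.incrementToken()) {
          pos += posIncAtt.getPositionIncrement();
          System.out.println(termAtt + ": " + pos + " -> " + (pos + posLenAtt.getPositionLength()));
        }
        ts.end();
        ts.close();
      }
    }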
Index: lucene/core/src/java/org/apache/lucene/util/fst/FST.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/FST.java	(revision 1297338)
+++ lucene/core/src/java/org/apache/lucene/util/fst/FST.java	(working copy)
@@ -840,6 +840,7 @@
   }
 
   public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
     final int address = getNodeAddress(node);
     in.pos = address;
     //System.out.println("  readFirstRealTargtArc address="
@@ -936,6 +937,7 @@
   /** Never returns null, but you should never call this if
    *  arc.isLast() is true. */
   public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
 
     // TODO: can't assert this because we call from readFirstArc
     // assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
     assert cachedRootArcs != null;
+    assert in.bytes == bytes;
 
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@
 
   /** Expert */
   public static abstract class BytesReader extends DataInput {
-    int pos;
+    protected int pos;
+    protected final byte[] bytes;
+    protected BytesReader(byte[] bytes, int pos) {
+      this.bytes = bytes;
+      this.pos = pos;
+    }
     abstract void skip(int byteCount);
     abstract void skip(int base, int byteCount);
   }
 
   final static class ReverseBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ReverseBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override
@@ -1262,11 +1268,9 @@
   // TODO: can we use just ByteArrayDataInput...?  need to
   // add a .skipBytes to DataInput.. hmm and .setPosition
   final static class ForwardBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ForwardBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override

Index: lucene/contrib/CHANGES.txt
===================================================================
--- lucene/contrib/CHANGES.txt	(revision 1297338)
+++ lucene/contrib/CHANGES.txt	(working copy)
@@ -42,6 +42,9 @@
  * LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
    heuristics. (Christian Moen via Robert Muir)
 
+ * LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
+   and the segmentation of that compound in Mode.SEARCH.
+   (Robert Muir, Mike McCandless via Christian Moen)
+
  * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
    BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do joins
    in both parent to child and child to parent directions.
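The FST change above hoists the backing byte[] and position into the abstract BytesReader, which is what lets readFirstRealTargetArc, readNextRealArc, and findTargetArc assert that the reader they were handed actually belongs to this FST. A minimal standalone sketch of the same refactor pattern (these class names are illustrative, not part of the patch):

    // Shared state moves into a protected base-class constructor; subclasses
    // keep only their direction-specific logic.
    abstract class ByteCursor {
      protected final byte[] bytes;
      protected int pos;
      protected ByteCursor(byte[] bytes, int pos) {
        this.bytes = bytes;
        this.pos = pos;
      }
      abstract int read();
    }

    final class ForwardCursor extends ByteCursor {
      ForwardCursor(byte[] bytes, int pos) { super(bytes, pos); }
      @Override int read() { return bytes[pos++] & 0xFF; }
    }

    final class ReverseCursor extends ByteCursor {
      ReverseCursor(byte[] bytes, int pos) { super(bytes, pos); }
      @Override int read() { return bytes[pos--] & 0xFF; }
    }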
Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt	(working copy)
@@ -4,3 +4,7 @@
 
 # Custom reading for sumo wrestler
 朝青龍,朝青龍,アサショウリュウ,カスタム人名
+
+# Silly entry:
+abcd,a b cd,foo1 foo2 foo3,bar
+abcdefg,ab cd efg,foo1 foo2 foo4,bar

Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(working copy)
@@ -23,29 +23,17 @@
 import java.io.Reader;
 import java.io.IOException;
 
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
 
 public class UserDictionaryTest extends LuceneTestCase {
-  private UserDictionary readDict() throws IOException {
-    InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
-    if (is == null)
-      throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
-    try {
-      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
-      return new UserDictionary(reader);
-    } finally {
-      is.close();
-    }
-  }
-
   @Test
   public void testLookup() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     String s = "関西国際空港に行った";
     int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
     // Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@
 
   @Test
   public void testReadings() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@
 
   @Test
   public void testPartOfSpeech() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@
 
   @Test
   public void testRead() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     assertNotNull(dictionary);
   }
 }
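From the entries above, each user-dictionary line is plain CSV: surface form, space-separated segmentation, space-separated readings, and a part-of-speech tag; the new abcd/abcdefg lines exist purely to exercise entries that split into several tokens. A loading sketch (the path handling is mine; the UserDictionary(Reader) constructor is the one the tests use):

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;

    class UserDictLoadSketch {
      // Each line: surface,seg1 seg2 ...,reading1 reading2 ...,posTag
      // e.g. abcd,a b cd,foo1 foo2 foo3,bar  (from the test entries above)
      static UserDictionary load(String path) throws IOException {
        Reader reader = new InputStreamReader(new FileInputStream(path), "UTF-8");
        try {
          return new UserDictionary(reader);
        } finally {
          reader.close();
        }
      }
    }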
Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java	(working copy)
@@ -29,7 +29,7 @@
   private Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.DEFAULT_MODE);
       return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
     }
   };

Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java	(working copy)
@@ -26,18 +26,17 @@
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestExtendedMode extends BaseTokenStreamTestCase {
-  private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
   private final Analyzer analyzer = new ReusableAnalyzerBase() {
 
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
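These call sites retire the two-step new KuromojiTokenizer(new Segmenter(dict, mode), reader) construction. From the usages throughout this patch, the new constructor appears to take (Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode): analyzers built with true drop punctuation tokens (see testOnlyPunctuation further down) while those built with false keep them. A hedged wiring sketch under that reading:

    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
    import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;

    class TokenizerWiringSketch {
      // userDict may be null (no user dictionary); true = discard punctuation,
      // inferred from the analyzerNoPunct/testOnlyPunctuation usage in this patch.
      static Tokenizer searchMode(Reader reader, UserDictionary userDict) {
        return new KuromojiTokenizer(reader, userDict, true, Mode.SEARCH);
      }

      static Tokenizer extendedMode(Reader reader) {
        return new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
      }
    }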
Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java	(working copy)
@@ -18,8 +18,11 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 
 public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
@@ -41,20 +44,103 @@
         new int[] { 1, 2, 2, 2 }
     );
   }
-
+
   /**
    * Test that search mode is enabled and working by default
    */
   public void testDecomposition() throws IOException {
-    assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "シニアソフトウェアエンジニア",
-      new String[] { "シニア", "ソフトウェア", "エンジニア" }
-    );
+
+    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
+                                            KuromojiAnalyzer.getDefaultStopSet(),
+                                            KuromojiAnalyzer.getDefaultStopTags());
+
+    /*
+    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
+    TokenStream ts = a.tokenStream("foo", new StringReader("�?>-->;"));
+    ts.reset();
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+    while(ts.incrementToken()) {
+      System.out.println("  " + termAtt.toString());
+    }
+    System.out.println("DONE PARSE\n\n");
+    */
+
+    // Senior software engineer:
+    assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
+                              new String[] { "シニア",
+                                             "シニアソフトウェアエンジニア",
+                                             "ソフトウェア",
+                                             "エンジニア" },
+                              new int[] { 1, 0, 1, 1},
+                              new int[] { 1, 3, 1, 1}
+                              );
+
+    // Kansai International Airport:
+    assertAnalyzesToPositions(a, "関西国際空港",
+                              new String[] { "関西",
+                                             "関西国際空港", // zero pos inc
+                                             "国際",
+                                             "空港" },
+                              new int[] {1, 0, 1, 1},
+                              new int[] {1, 3, 1, 1}
+                              );
+
+    // Konika Minolta Holdings; not quite the right
+    // segmentation (see LUCENE-3726):
+    assertAnalyzesToPositions(a, "コニカミノルタホールディングス",
+                              new String[] { "コニカ",
+                                             "コニカミノルタホールディングス", // zero pos inc
+                                             "ミノルタ",
+                                             "ホールディングス"},
+                              new int[] {1, 0, 1, 1},
+                              new int[] {1, 3, 1, 1}
+                              );
+
+    // Narita Airport
+    assertAnalyzesToPositions(a, "成田空港",
+                              new String[] { "成田",
+                                             "成田空港",
+                                             "空港" },
+                              new int[] {1, 0, 1},
+                              new int[] {1, 2, 1}
+                              );
+
+    // Kyoto University Baseball Club
+    assertAnalyzesToPositions(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "京都大学硬式野球部",
+                              new String[] { "京都大",
+                                             "学",
+                                             "硬式",
+                                             "野球",
+                                             "部" },
+                              new int[] {1, 1, 1, 1, 1},
+                              new int[] {1, 1, 1, 1, 1});
+    // toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
   }
+
   /**
    * blast random strings against the analyzer
    */
   public void testRandom() throws IOException {
-    checkRandomData(random, new KuromojiAnalyzer(TEST_VERSION_CURRENT), atLeast(10000));
+    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
+                                            KuromojiAnalyzer.getDefaultStopSet(),
+                                            KuromojiAnalyzer.getDefaultStopTags());
+    checkRandomData(random, a, atLeast(10000));
   }
+
+  // Copied from TestKuromojiTokenizer, to make sure passing
+  // user dict to analyzer works:
+  public void testUserDict3() throws Exception {
+    // Test entry that breaks into multiple tokens:
+    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, TestKuromojiTokenizer.readDict(),
+                                            Mode.SEARCH,
+                                            KuromojiAnalyzer.getDefaultStopSet(),
+                                            KuromojiAnalyzer.getDefaultStopTags());
+    assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
+                              new String[] { "a", "b", "cd" },
+                              new int[] { 0, 1, 2 },
+                              new int[] { 1, 2, 4 },
+                              new Integer(4)
+    );
+  }
 }
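KuromojiAnalyzer picks up a matching constructor taking a user dictionary, mode, stop set, and stop tags, while the single-argument form keeps search mode as its default. A wiring sketch grounded in the test code above (nothing beyond those signatures is assumed):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
    import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
    import org.apache.lucene.util.Version;

    class AnalyzerWiringSketch {
      // Same argument pattern as the tests above; userDict may be null.
      static Analyzer searchModeAnalyzer(Version version, UserDictionary userDict) {
        return new KuromojiAnalyzer(version, userDict, Mode.SEARCH,
                                    KuromojiAnalyzer.getDefaultStopSet(),
                                    KuromojiAnalyzer.getDefaultStopTags());
      }
    }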
Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java	(revision 1297338)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java	(working copy)
@@ -1,231 +0,0 @@
-package org.apache.lucene.analysis.kuromoji;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.util.List;
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class SegmenterTest extends LuceneTestCase {
-
-  private static Segmenter segmenter;
-
-  @BeforeClass
-  public static void setUpBeforeClass() throws Exception {
-    segmenter = new Segmenter();
-  }
-
-  @AfterClass
-  public static void afterClass() throws Exception {
-    segmenter = null;
-  }
-
-  @Test
-  public void testSegmentation() {
-    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
-    // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
-    // String[] surfaceForms = {
-    //    "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
-    //    "スペース", "ステーション", "に", "行き", "ます", "。",
-    //    "うたがわしい", "。"
-    // };
-    String input = "スペースステーションに行きます。うたがわしい。";
-    String[] surfaceForms = {
-        "スペース", "ステーション", "に", "行き", "ます", "。",
-        "うたがわしい", "。"
-    };
-    List<Token> tokens = segmenter.tokenize(input);
-    assertTrue(tokens.size() == surfaceForms.length);
-    for (int i = 0; i < tokens.size(); i++) {
-      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
-    }
-  }
-
-  @Test
-  public void testReadings() {
-    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
-    assertEquals(6, tokens.size());
-    assertEquals("スシ", tokens.get(0).getReading());
-    assertEquals("ガ", tokens.get(1).getReading());
-    assertEquals("タベ", tokens.get(2).getReading());
-    assertEquals("タイ", tokens.get(3).getReading());
-    assertEquals("デス", tokens.get(4).getReading());
-    assertEquals("。", tokens.get(5).getReading());
-  }
-
-  @Test
-  public void testReadings2() {
-    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
-    assertEquals(9, tokens.size());
-    assertEquals("オオク", tokens.get(0).getReading());
-    assertEquals("ノ", tokens.get(1).getReading());
-    assertEquals("ガクセイ", tokens.get(2).getReading());
-    assertEquals("ガ", tokens.get(3).getReading());
-    assertEquals("シケン", tokens.get(4).getReading());
-    assertEquals("ニ", tokens.get(5).getReading());
-    assertEquals("オチ", tokens.get(6).getReading());
-    assertEquals("タ", tokens.get(7).getReading());
-    assertEquals("。", tokens.get(8).getReading());
-  }
-
-  @Test
-  public void testPronunciations() {
-    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
-    assertEquals(6, tokens.size());
-    assertEquals("スシ", tokens.get(0).getPronunciation());
-    assertEquals("ガ", tokens.get(1).getPronunciation());
-    assertEquals("タベ", tokens.get(2).getPronunciation());
-    assertEquals("タイ", tokens.get(3).getPronunciation());
-    assertEquals("デス", tokens.get(4).getPronunciation());
-    assertEquals("。", tokens.get(5).getPronunciation());
-  }
-
-  @Test
-  public void testPronunciations2() {
-    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
-    assertEquals(9, tokens.size());
-    // pronunciation differs from reading here
-    assertEquals("オーク", tokens.get(0).getPronunciation());
-    assertEquals("ノ", tokens.get(1).getPronunciation());
-    assertEquals("ガクセイ", tokens.get(2).getPronunciation());
-    assertEquals("ガ", tokens.get(3).getPronunciation());
-    assertEquals("シケン", tokens.get(4).getPronunciation());
-    assertEquals("ニ", tokens.get(5).getPronunciation());
-    assertEquals("オチ", tokens.get(6).getPronunciation());
-    assertEquals("タ", tokens.get(7).getPronunciation());
-    assertEquals("。", tokens.get(8).getPronunciation());
-  }
-
-  @Test
-  public void testBasicForms() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertNull(tokens.get(0).getBaseForm());
-    assertNull(tokens.get(1).getBaseForm());
-    assertNull(tokens.get(2).getBaseForm());
-    assertNull(tokens.get(3).getBaseForm());
-    assertNull(tokens.get(4).getBaseForm());
-    assertNull(tokens.get(5).getBaseForm());
-    assertEquals(tokens.get(6).getBaseForm(), "ある");
-    assertNull(tokens.get(7).getBaseForm());
-    assertNull(tokens.get(8).getBaseForm());
-  }
-
-  @Test
-  public void testInflectionTypes() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertNull(tokens.get(0).getInflectionType());
-    assertNull(tokens.get(1).getInflectionType());
-    assertNull(tokens.get(2).getInflectionType());
-    assertNull(tokens.get(3).getInflectionType());
-    assertNull(tokens.get(4).getInflectionType());
-    assertNull(tokens.get(5).getInflectionType());
-    assertEquals("五段・ラ行", tokens.get(6).getInflectionType());
-    assertEquals("特殊・マス", tokens.get(7).getInflectionType());
-    assertNull(tokens.get(8).getInflectionType());
-  }
-
-  @Test
-  public void testInflectionForms() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertNull(tokens.get(0).getInflectionForm());
-    assertNull(tokens.get(1).getInflectionForm());
-    assertNull(tokens.get(2).getInflectionForm());
-    assertNull(tokens.get(3).getInflectionForm());
-    assertNull(tokens.get(4).getInflectionForm());
-    assertNull(tokens.get(5).getInflectionForm());
-    assertEquals("連用形", tokens.get(6).getInflectionForm());
-    assertEquals("基本形", tokens.get(7).getInflectionForm());
-    assertNull(tokens.get(8).getInflectionForm());
-  }
-
-  @Test
-  public void testPartOfSpeech() {
-    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
-    assertEquals(9, tokens.size());
-    assertEquals("名詞-代名詞-一般", tokens.get(0).getPartOfSpeech());
-    assertEquals("助詞-係助詞", tokens.get(1).getPartOfSpeech());
-    assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
-    assertEquals("名詞-サ変接続", tokens.get(3).getPartOfSpeech());
-    assertEquals("名詞-一般", tokens.get(4).getPartOfSpeech());
-    assertEquals("助詞-格助詞-一般", tokens.get(5).getPartOfSpeech());
-    assertEquals("動詞-自立", tokens.get(6).getPartOfSpeech());
-    assertEquals("助動詞", tokens.get(7).getPartOfSpeech());
-    assertEquals("記号-句点", tokens.get(8).getPartOfSpeech());
-  }
-
-  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
-  // do we have a possibility to actually lookup the first and last word from dictionary?
-  public void testYabottai() {
-    List<Token> tokens = segmenter.tokenize("やぼったい");
-    assertEquals(1, tokens.size());
-    assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
-  }
-
-  public void testTsukitosha() {
-    List<Token> tokens = segmenter.tokenize("突き通しゃ");
-    assertEquals(1, tokens.size());
-    assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
-  }
-
-  public void testBocchan() throws Exception {
-    doTestBocchan(1);
-  }
-
-  @Test @Nightly
-  public void testBocchanBig() throws Exception {
-    doTestBocchan(100);
-  }
-
-  private void doTestBocchan(int numIterations) throws Exception {
-    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
-        this.getClass().getResourceAsStream("bocchan.utf-8")));
-
-    String line = reader.readLine();
-    reader.close();
-
-    if (VERBOSE) {
-      System.out.println("Test for Bocchan without pre-splitting sentences");
-    }
-    long totalStart = System.currentTimeMillis();
-    for (int i = 0; i < numIterations; i++){
-      segmenter.tokenize(line);
-    }
-    if (VERBOSE) {
-      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
-      System.out.println("Test for Bocchan with pre-splitting sentences");
-    }
-    String[] sentences = line.split("、|。");
-    totalStart = System.currentTimeMillis();
-    for (int i = 0; i < numIterations; i++) {
-      for (String sentence: sentences) {
-        segmenter.tokenize(sentence);
-      }
-    }
-    if (VERBOSE) {
-      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
-    }
-  }
-}
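SegmenterTest goes away with the Segmenter class itself; its segmentation, reading, pronunciation, inflection, part-of-speech, and Bocchan benchmark cases all reappear below in TestKuromojiTokenizer, driven through the tokenizer directly. That rewrite also adds a GraphvizFormatter debugging hook (see testLatticeToDot below); a minimal sketch of using it, assuming only the calls visible in this patch:

    import java.io.Reader;
    import org.apache.lucene.analysis.kuromoji.GraphvizFormatter;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
    import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
    import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;

    class LatticeDumpSketch {
      // Attach a GraphvizFormatter, consume the stream, then call finish()
      // to obtain the Viterbi lattice in dot format (as testLatticeToDot does).
      static String toDot(Reader input) throws Exception {
        GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());
        KuromojiTokenizer tok = new KuromojiTokenizer(input, null, true, Mode.SEARCH);
        tok.setGraphvizFormatter(gv);
        tok.reset();
        while (tok.incrementToken()) {}
        tok.end();
        tok.close();
        return gv.finish();
      }
    }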
Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java	(working copy)
@@ -17,7 +17,13 @@
  * limitations under the License.
  */
 
+import java.io.BufferedReader;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -26,21 +32,76 @@
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+
+  public static UserDictionary readDict() {
+    InputStream is = TestKuromojiTokenizer.class.getResourceAsStream("userdict.txt");
+    if (is == null) {
+      throw new RuntimeException("Cannot find userdict.txt in test classpath!");
+    }
+    try {
+      try {
+        Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
+        return new UserDictionary(reader);
+      } finally {
+        is.close();
+      }
+    } catch (IOException ioe) {
+      throw new RuntimeException(ioe);
+    }
+  }
+
   private Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-
+
+  private Analyzer analyzerNormal = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer analyzerNoPunct = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer extendedModeAnalyzerNoPunct = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  public void testNormalMode() throws Exception {
+    assertAnalyzesTo(analyzerNormal,
+                     "シニアソフトウェアエンジニア",
+                     new String[] {"シニアソフトウェアエンジニア"});
+  }
+
   public void testDecomposition1() throws Exception {
-    assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+    assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
                          "アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
       new String[] { "本来", "は", "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",
                      "提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある", "アメリカ",
@@ -56,7 +117,7 @@
   }
 
   public void testDecomposition2() throws Exception {
-    assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
+    assertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
       new String[] { "麻薬", "の", "密売", "は", "根こそぎ", "絶やさ", "なけれ", "ば", "なら", "ない" },
       new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
       new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
@@ -64,7 +125,7 @@
   }
 
   public void testDecomposition3() throws Exception {
-    assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
+    assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
       new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス" },
       new int[] { 0, 2, 3, 5, 10 },
       new int[] { 2, 3, 5, 9, 15 }
@@ -92,9 +153,32 @@
     ts.close();
   }
 
+  /*
+  // NOTE: intentionally fails!  Just trying to debug this
+  // one input...
+  public void testDecomposition6() throws Exception {
+    assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
+      new String[] { "これ", "は", "本", "で", "は", "ない" },
+      new int[] { 0, 2, 3, 4, 5, 6 },
+      new int[] { 2, 3, 4, 5, 6, 8 }
+    );
+  }
+  */
+
   /** Tests that sentence offset is incorporated into the resulting offsets */
   public void testTwoSentences() throws Exception {
-    assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
+    /*
+    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
+    TokenStream ts = analyzer.tokenStream("foo", new StringReader("�?>-->;"));
+    ts.reset();
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+    while(ts.incrementToken()) {
+      System.out.println("  " + termAtt.toString());
+    }
+    System.out.println("DONE PARSE\n\n");
+    */
+
+    assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
       new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス", "魔女", "狩", "大将", "マシュー", "ホプキンス" },
       new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
       new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
@@ -104,6 +188,7 @@
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER);
   }
 
   public void testLargeDocReliability() throws Exception {
@@ -126,6 +211,9 @@
   public void testSurrogates2() throws IOException {
     int numIterations = atLeast(10000);
     for (int i = 0; i < numIterations; i++) {
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + i);
+      }
       String s = _TestUtil.randomUnicodeString(random, 100);
       TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
@@ -135,22 +223,410 @@
       }
     }
   }
+
+  public void testOnlyPunctuation() throws IOException {
+    TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+    ts.reset();
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  public void testOnlyPunctuationExtended() throws IOException {
+    TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+    ts.reset();
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
 
   // note: test is kinda silly since kuromoji emits punctuation tokens.
   // but, when/if we filter these out it will be useful.
   public void testEnd() throws Exception {
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
       new String[] { "これ", "は", "本", "で", "は", "ない" },
       new int[] { 0, 2, 3, 4, 5, 6 },
       new int[] { 2, 3, 4, 5, 6, 8 },
       new Integer(8)
     );
-
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない ")),
+
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない ")),
       new String[] { "これ", "は", "本", "で", "は", "ない" },
       new int[] { 0, 2, 3, 4, 5, 6, 8 },
       new int[] { 2, 3, 4, 5, 6, 8, 9 },
       new Integer(12)
     );
   }
+
+  public void testUserDict() throws Exception {
+    // Not a great test because w/o userdict.txt the
+    // segmentation is the same:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+      new String[] { "関西", "国際", "空港", "に", "行っ", "た" },
+      new int[] { 0, 2, 4, 6, 7, 9 },
+      new int[] { 2, 4, 6, 7, 9, 10 },
+      new Integer(10)
+    );
+  }
+
+  public void testUserDict2() throws Exception {
+    // Better test: w/o userdict the segmentation is different:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+      new String[] { "朝青龍" },
+      new int[] { 0 },
+      new int[] { 3 },
+      new Integer(3)
+    );
+  }
+
+  public void testUserDict3() throws Exception {
+    // Test entry that breaks into multiple tokens:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+      new String[] { "a", "b", "cd" },
+      new int[] { 0, 1, 2 },
+      new int[] { 1, 2, 4 },
+      new Integer(4)
+    );
+  }
+
+  // HMM: fails (segments as a/b/cd/efghij)... because the
+  // two paths have exactly equal paths (1 KNOWN + 1
+  // UNKNOWN) and we don't seem to favor longer KNOWN /
+  // shorter UNKNOWN matches:
+
+  /*
+  public void testUserDict4() throws Exception {
+    // Test entry that has another entry as prefix
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+      new String[] { "ab", "cd", "efg", "hij" },
+      new int[] { 0, 2, 4, 7 },
+      new int[] { 2, 4, 7, 10 },
+      new Integer(10)
+    );
+  }
+  */
+
+  public void testSegmentation() throws Exception {
+    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+    //   String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+    //   String[] surfaceForms = {
+    //        "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+    //        "スペース", "ステーション", "に", "行き", "ます", "。",
+    //        "うたがわしい", "。"
+    //   };
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+  }
+
+  public void testLatticeToDot() throws Exception {
+    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+    final Analyzer analyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+        tokenizer.setGraphvizFormatter(gv2);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+
+    assertTrue(gv2.finish().indexOf("22.0") != -1);
+  }
+
+  private void assertReadings(String input, String... readings) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+    ts.reset();
+    for(String reading : readings) {
+      assertTrue(ts.incrementToken());
+      assertEquals(reading, readingAtt.getReading());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertPronunciations(String input, String... pronunciations) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+    ts.reset();
+    for(String pronunciation : pronunciations) {
+      assertTrue(ts.incrementToken());
+      assertEquals(pronunciation, readingAtt.getPronunciation());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertBaseForms(String input, String... baseForms) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
+    ts.reset();
+    for(String baseForm : baseForms) {
+      assertTrue(ts.incrementToken());
+      assertEquals(baseForm, baseFormAtt.getBaseForm());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+    ts.reset();
+    for(String inflectionType : inflectionTypes) {
+      assertTrue(ts.incrementToken());
+      assertEquals(inflectionType, inflectionAtt.getInflectionType());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+    ts.reset();
+    for(String inflectionForm : inflectionForms) {
+      assertTrue(ts.incrementToken());
+      assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
+    ts.reset();
+    for(String partOfSpeech : partsOfSpeech) {
+      assertTrue(ts.incrementToken());
+      assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  public void testReadings() throws Exception {
+    assertReadings("寿司が食べたいです。",
+                   "スシ",
+                   "ガ",
+                   "タベ",
+                   "タイ",
+                   "デス",
+                   "。");
+  }
+
+  public void testReadings2() throws Exception {
+    assertReadings("多くの学生が試験に落ちた。",
+                   "オオク",
+                   "ノ",
+                   "ガクセイ",
+                   "ガ",
+                   "シケン",
+                   "ニ",
+                   "オチ",
+                   "タ",
+                   "。");
+  }
+
+  public void testPronunciations() throws Exception {
+    assertPronunciations("寿司が食べたいです。",
+                         "スシ",
+                         "ガ",
+                         "タベ",
+                         "タイ",
+                         "デス",
+                         "。");
+  }
+
+  public void testPronunciations2() throws Exception {
+    // pronunciation differs from reading here
+    assertPronunciations("多くの学生が試験に落ちた。",
+                         "オーク",
+                         "ノ",
+                         "ガクセイ",
+                         "ガ",
+                         "シケン",
+                         "ニ",
+                         "オチ",
+                         "タ",
+                         "。");
+  }
+
+  public void testBasicForms() throws Exception {
+    assertBaseForms("それはまだ実験段階にあります。",
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    "ある",
+                    null,
+                    null);
+  }
+
+  public void testInflectionTypes() throws Exception {
+    assertInflectionTypes("それはまだ実験段階にあります。",
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          "五段・ラ行",
+                          "特殊・マス",
+                          null);
+  }
+
+  public void testInflectionForms() throws Exception {
+    assertInflectionForms("それはまだ実験段階にあります。",
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          "連用形",
+                          "基本形",
+                          null);
+  }
+
+  public void testPartOfSpeech() throws Exception {
+    assertPartsOfSpeech("それはまだ実験段階にあります。",
+                        "名詞-代名詞-一般",
+                        "助詞-係助詞",
+                        "副詞-助詞類接続",
+                        "名詞-サ変接続",
+                        "名詞-一般",
+                        "助詞-格助詞-一般",
+                        "動詞-自立",
+                        "助動詞",
+                        "記号-句点");
+  }
+
+  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+  // do we have a possibility to actually lookup the first and last word from dictionary?
+  public void testYabottai() throws Exception {
+    assertAnalyzesTo(analyzer, "やぼったい",
+                     new String[] {"やぼったい"});
+  }
+
+  public void testTsukitosha() throws Exception {
+    assertAnalyzesTo(analyzer, "突き通しゃ",
+                     new String[] {"突き通しゃ"});
+  }
+
+  public void testBocchan() throws Exception {
+    doTestBocchan(1);
+  }
+
+  @Nightly
+  public void testBocchanBig() throws Exception {
+    doTestBocchan(100);
+  }
+
+  /*
+  public void testWikipedia() throws Exception {
+    final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
+    final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
+
+    final long startTimeNS = System.nanoTime();
+    boolean done = false;
+    long compoundCount = 0;
+    long nonCompoundCount = 0;
+    long netOffset = 0;
+    while (!done) {
+      final TokenStream ts = analyzer.tokenStream("ignored", r);
+      ts.reset();
+      final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+      final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+      int count = 0;
+      while (true) {
+        if (!ts.incrementToken()) {
+          done = true;
+          break;
+        }
+        count++;
+        if (posIncAtt.getPositionIncrement() == 0) {
+          compoundCount++;
+        } else {
+          nonCompoundCount++;
+          if (nonCompoundCount % 1000000 == 0) {
+            System.out.println(String.format("%.2f msec [pos=%d, %d, %d]",
+                                             (System.nanoTime()-startTimeNS)/1000000.0,
+                                             netOffset + offsetAtt.startOffset(),
+                                             nonCompoundCount,
+                                             compoundCount));
+          }
+        }
+        if (count == 100000000) {
+          System.out.println("  again...");
+          break;
+        }
+      }
+      ts.end();
+      netOffset += offsetAtt.endOffset();
+    }
+    System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount);
+    r.close();
+  }
+  */
+
+  private void doTestBocchan(int numIterations) throws Exception {
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
+        this.getClass().getResourceAsStream("bocchan.utf-8")));
+    String line = reader.readLine();
+    reader.close();
+
+    if (VERBOSE) {
+      System.out.println("Test for Bocchan without pre-splitting sentences");
+    }
+
+    /*
+    if (numIterations > 1) {
+      // warmup
+      for (int i = 0; i < numIterations; i++) {
+        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+        ts.reset();
+        while(ts.incrementToken());
+      }
+    }
+    */
+
+    long totalStart = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+      ts.reset();
+      while(ts.incrementToken());
+    }
+    String[] sentences = line.split("、|。");
+    if (VERBOSE) {
+      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+      System.out.println("Test for Bocchan with pre-splitting sentences (" + sentences.length + " sentences)");
+    }
+    totalStart = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      for (String sentence: sentences) {
+        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+        ts.reset();
+        while(ts.incrementToken());
+      }
+    }
+    if (VERBOSE) {
+      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+    }
+  }
 }
Index: lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
===================================================================
--- lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java	(revision 1297369)
+++ lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java	(working copy)
@@ -28,20 +28,19 @@
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.util.IOUtils;
 
 public class TestSearchMode extends BaseTokenStreamTestCase {
   private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
-  private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
   private final Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-
+
   /** Test search mode segmentation */
   public void testSearchSegmentation() throws IOException {
     InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);
@@ -64,7 +63,18 @@
         String[] fields = line.split("\t", 2);
         String sourceText = fields[0];
         String[] expectedTokens = fields[1].split("\\s+");
-        assertAnalyzesTo(analyzer, sourceText, expectedTokens);
+        int[] expectedPosIncrs = new int[expectedTokens.length];
+        int[] expectedPosLengths = new int[expectedTokens.length];
+        for(int tokIDX=0;tokIDX<expectedTokens.length;tokIDX++) {

[...]

+   * tokens this is 1; for compound tokens it's > 1.
+   */
+  public void setPositionLength(int positionLength) {
+    this.positionLength = positionLength;
+  }
+
+  /**
+   * Get the length (in tokens) of this token.  For normal
+   * tokens this is 1; for compound tokens it's > 1.
+   * @return position length of token
+   */
+  public int getPositionLength() {
+    return positionLength;
+  }
 }
toIndexArray(result) : EMPTY_RESULT; } + public TokenInfoFST getFST() { + return fst; + } + private static final int[][] EMPTY_RESULT = new int[0][]; /** @@ -181,6 +186,10 @@ } return result.toArray(new int[result.size()][]); } + + public int[] lookupSegmentation(int phraseID) { + return segmentations[phraseID]; + } //@Override public int getLeftId(int wordId) { Index: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java =================================================================== --- lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (revision 1297338) +++ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (working copy) @@ -1,365 +0,0 @@ -package org.apache.lucene.analysis.kuromoji.viterbi; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.LinkedList; -import java.util.List; - -import org.apache.lucene.analysis.kuromoji.Segmenter.Mode; -import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition; -import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts; -import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary; -import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST; -import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary; -import org.apache.lucene.analysis.kuromoji.dict.UserDictionary; -import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.fst.FST; - -public class Viterbi { - - private final TokenInfoFST fst; - - private final TokenInfoDictionary dictionary; - - private final UnknownDictionary unkDictionary; - - private final ConnectionCosts costs; - - private final UserDictionary userDictionary; - - private final CharacterDefinition characterDefinition; - - private final boolean useUserDictionary; - - private final boolean searchMode; - - private final boolean extendedMode; - - private static final int DEFAULT_COST = 10000000; - - private static final int SEARCH_MODE_KANJI_LENGTH = 2; - - private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH - - private static final int SEARCH_MODE_KANJI_PENALTY = 3000; - - private static final int SEARCH_MODE_OTHER_PENALTY = 1700; - - private static final char[] BOS = "BOS".toCharArray(); - - private static final char[] EOS = "EOS".toCharArray(); - - /** - * Constructor - */ - public Viterbi(TokenInfoDictionary dictionary, - UnknownDictionary unkDictionary, - ConnectionCosts costs, - UserDictionary userDictionary, - Mode mode) { - this.dictionary = dictionary; - this.fst = dictionary.getFST(); - this.unkDictionary = unkDictionary; - 
this.costs = costs; - this.userDictionary = userDictionary; - if(userDictionary == null) { - this.useUserDictionary = false; - } else { - this.useUserDictionary = true; - } - - switch(mode){ - case SEARCH: - searchMode = true; - extendedMode = false; - break; - case EXTENDED: - searchMode = true; - extendedMode = true; - break; - default: - searchMode = false; - extendedMode = false; - break; - } - - this.characterDefinition = unkDictionary.getCharacterDefinition(); - } - - /** - * Find best path from input lattice. - * @param lattice the result of build method - * @return List of ViterbiNode which consist best path - */ - public List search(ViterbiNode[][][] lattice) { - ViterbiNode[][] startIndexArr = lattice[0]; - ViterbiNode[][] endIndexArr = lattice[1]; - - for (int i = 1; i < startIndexArr.length; i++){ - - if (startIndexArr[i] == null || endIndexArr[i] == null){ // continue since no array which contains ViterbiNodes exists. Or no previous node exists. - continue; - } - - for (ViterbiNode node : startIndexArr[i]) { - if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index - break; - } - - int backwardConnectionId = node.getLeftId(); - int wordCost = node.getWordCost(); - int leastPathCost = DEFAULT_COST; - for (ViterbiNode leftNode : endIndexArr[i]) { - if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index - break; - } - - int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost] - - // "Search mode". Add extra costs if it is long node. - if (searchMode) { - // System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false. - char[] surfaceForm = node.getSurfaceForm(); - int offset = node.getOffset(); - int length = node.getLength(); - if (length > SEARCH_MODE_KANJI_LENGTH) { - boolean allKanji = true; - // check if node consists of only kanji - for (int pos = 0; pos < length; pos++) { - if (!characterDefinition.isKanji(surfaceForm[offset+pos])){ - allKanji = false; - break; - } - } - - if (allKanji) { // Process only Kanji keywords - pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY; - } else if (length > SEARCH_MODE_OTHER_LENGTH) { - pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY; - } - } - } - - if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left). 
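// Worked example of the relaxation above, with illustrative numbers
// (not taken from the real connection-cost matrix): if the best path
// to the left node costs 500, costs.get(leftNode.getRightId(),
// backwardConnectionId) is 120, and this node's word cost is 300, then
// pathCost = 500 + 120 + 300 = 920, and the left node becomes the new
// back pointer only if 920 is lower than the least cost seen so far.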
- leastPathCost = pathCost; - node.setPathCost(leastPathCost); - node.setLeftNode(leftNode); - } - } - } - } - - // track best path - ViterbiNode node = endIndexArr[0][0]; // EOS - LinkedList result = new LinkedList(); - result.add(node); - while (true) { - ViterbiNode leftNode = node.getLeftNode(); - if (leftNode == null) { - break; - } - - // EXTENDED mode convert unknown word into unigram node - if (extendedMode && leftNode.getType() == Type.UNKNOWN) { - byte unigramWordId = CharacterDefinition.NGRAM; - int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required - int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required - int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required - char[] surfaceForm = leftNode.getSurfaceForm(); - int offset = leftNode.getOffset(); - int length = leftNode.getLength(); - for (int i = length - 1; i >= 0; i--) { - int charLen = 1; - if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) { - i--; - charLen = 2; - } - ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN); - result.addFirst(uniGramNode); - } - } else { - result.addFirst(leftNode); - } - node = leftNode; - } - - return result; - } - - /** - * Build lattice from input text - * @param text - */ - public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException { - ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS - ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS - int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr - int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr - FST.Arc arc = new FST.Arc(); - ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN); - addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr); - - final FST.BytesReader fstReader = fst.getBytesReader(0); - - // Process user dictionary; - if (useUserDictionary) { - processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr); - } - - int unknownWordEndIndex = -1; // index of the last character of unknown word - - final IntsRef wordIdRef = new IntsRef(); - - for (int startIndex = 0; startIndex < length; startIndex++) { - // If no token ends where current token starts, skip this index - if (endSizeArr[startIndex + 1] == 0) { - continue; - } - - int suffixStart = offset + startIndex; - int suffixLength = length - startIndex; - - boolean found = false; - arc = fst.getFirstArc(arc); - int output = 0; - for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) { - int ch = text[suffixStart + endIndex - 1]; - - if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) { - break; // continue to next position - } - output += arc.output.intValue(); - - if (arc.isFinal()) { - final int finalOutput = output + arc.nextFinalOutput.intValue(); - found = true; // Don't produce unknown word starting from this index - dictionary.lookupWordIds(finalOutput, wordIdRef); - for (int ofs = 0; ofs < wordIdRef.length; ofs++) { - final int wordId = wordIdRef.ints[wordIdRef.offset + ofs]; - ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, 
Type.KNOWN); - addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr); - } - } - } - - // In the case of normal mode, it doesn't process unknown word greedily. - if(!searchMode && unknownWordEndIndex > startIndex){ - continue; - } - - // Process Unknown Word: hmm what is this isInvoke logic (same no matter what) - int unknownWordLength = 0; - char firstCharacter = text[suffixStart]; - boolean isInvoke = characterDefinition.isInvoke(firstCharacter); - if (isInvoke){ // Process "invoke" - unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength); - } else if (found == false){ // Process not "invoke" - unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength); - } - - if (unknownWordLength > 0) { // found unknown word - final int characterId = characterDefinition.getCharacterClass(firstCharacter); - unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same - for (int ofs = 0; ofs < wordIdRef.length; ofs++) { - final int wordId = wordIdRef.ints[wordIdRef.offset + ofs]; - ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN); - addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr); - } - unknownWordEndIndex = startIndex + unknownWordLength; - } - } - - ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN); - addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0 - - ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr}; - - return result; - } - - /** - * Find token(s) in input text and set found token(s) in arrays as normal tokens - * @param text - * @param startIndexArr - * @param endIndexArr - * @param startSizeArr - * @param endSizeArr - */ - private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException { - int[][] result = userDictionary.lookup(text, offset, len); - for(int[] segmentation : result) { - int wordId = segmentation[0]; - int index = segmentation[1]; - int length = segmentation[2]; - ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER); - addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr); - } - } - - /** - * Add node to arrays and increment count in size array - * @param node - * @param startIndex - * @param endIndex - * @param startIndexArr - * @param endIndexArr - * @param startSizeArr - * @param endSizeArr - */ - private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) { - int startNodesCount = startSizeArr[startIndex]; - int endNodesCount = endSizeArr[endIndex]; - - if (startNodesCount == 0) { - startIndexArr[startIndex] = new ViterbiNode[10]; - } - - if (endNodesCount == 0) { - endIndexArr[endIndex] = new ViterbiNode[10]; - } - - if (startIndexArr[startIndex].length <= startNodesCount){ - startIndexArr[startIndex] = 
extendArray(startIndexArr[startIndex]); - } - - if (endIndexArr[endIndex].length <= endNodesCount){ - endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]); - } - - startIndexArr[startIndex][startNodesCount] = node; - endIndexArr[endIndex][endNodesCount] = node; - - startSizeArr[startIndex] = startNodesCount + 1; - endSizeArr[endIndex] = endNodesCount + 1; - } - - - /** - * Return twice as big array which contains value of input array - * @param array - * @return - */ - private ViterbiNode[] extendArray(ViterbiNode[] array) { - //extend array - ViterbiNode[] newArray = new ViterbiNode[array.length * 2]; - System.arraycopy(array, 0, newArray, 0, array.length); - return newArray; - } -} Index: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java =================================================================== --- lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (revision 1297338) +++ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (working copy) @@ -1,147 +0,0 @@ -package org.apache.lucene.analysis.kuromoji.viterbi; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -public final class ViterbiNode { - public enum Type { - KNOWN, - UNKNOWN, - USER - } - - private final int wordId; - - private final char[] surfaceForm; - private final int offset; - private final int length; - - private final int leftId; - - private final int rightId; - - /** word cost for this node */ - private final int wordCost; - - /** minimum path cost found thus far */ - private int pathCost; - - private ViterbiNode leftNode; - - private final Type type; - - private final int startIndex; - - public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) { - this.wordId = wordId; - this.surfaceForm = surfaceForm; - this.offset = offset; - this.length = length; - this.leftId = leftId; - this.rightId = rightId; - this.wordCost = wordCost; - this.startIndex = startIndex; - this.type = type; - } - - - /** - * @return the wordId - */ - public int getWordId() { - return wordId; - } - - /** - * @return the surfaceForm - */ - public char[] getSurfaceForm() { - return surfaceForm; - } - - /** - * @return start offset into surfaceForm - */ - public int getOffset() { - return offset; - } - - /** - * @return length of surfaceForm - */ - public int getLength() { - return length; - } - - /** - * @return the surfaceForm as a String - */ - public String getSurfaceFormString() { - return new String(surfaceForm, offset, length); - } - - /** - * @return the leftId - */ - public int getLeftId() { - return leftId; - } - - /** - * @return the rightId - */ - public int getRightId() { - return rightId; - } - - /** - * @return the cost - */ - public int getWordCost() { - return wordCost; - } - - /** - * @return the cost - */ - public int getPathCost() { - return pathCost; - } - - /** - * param cost minimum path cost found this far - */ - public void setPathCost(int pathCost) { - this.pathCost = pathCost; - } - - public void setLeftNode(ViterbiNode node) { - leftNode = node; - } - - public ViterbiNode getLeftNode() { - return leftNode; - } - - public int getStartIndex() { - return startIndex; - } - - public Type getType() { - return type; - } -} Index: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java =================================================================== --- lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (revision 1297338) +++ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (working copy) @@ -1,226 +0,0 @@ -package org.apache.lucene.analysis.kuromoji.viterbi; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts; -import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type; - -public class GraphvizFormatter { - - private final static String BOS_LABEL = "BOS"; - - private final static String EOS_LABEL = "EOS"; - - private final static String FONT_NAME = "Helvetica"; - - private ConnectionCosts costs; - - private Map nodeMap; - - private Map bestPathMap; - - private boolean foundBOS; - - public GraphvizFormatter(ConnectionCosts costs) { - this.costs = costs; - this.nodeMap = new HashMap(); - this.bestPathMap = new HashMap(); - } - - public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) { - initBestPathMap(null); - - StringBuilder sb = new StringBuilder(); - sb.append(formatHeader()); - sb.append(formatNodes(startsArray, endsArray)); - sb.append(formatTrailer()); - return sb.toString(); - } - - public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List bestPath) { - - // List bestPathWithBOSAndEOS = new ArrayList(bastPath); - initBestPathMap(bestPath); - - StringBuilder sb = new StringBuilder(); - sb.append(formatHeader()); - sb.append(formatNodes(startsArray, endsArray)); - sb.append(formatTrailer()); - return sb.toString(); - - } - - private void initBestPathMap(List bestPath) { - this.bestPathMap.clear(); - - if (bestPath == null){ - return; - } - for (int i = 0; i < bestPath.size() - 1; i++) { - ViterbiNode from = bestPath.get(i); - ViterbiNode to = bestPath.get(i + 1); - - String fromId = getNodeId(from); - String toId = getNodeId(to); - - assert this.bestPathMap.containsKey(fromId) == false; - assert this.bestPathMap.containsValue(toId) == false; - this.bestPathMap.put(fromId, toId); - } - } - - private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) { - this.nodeMap.clear(); - this.foundBOS = false; - - StringBuilder sb = new StringBuilder(); - for (int i = 1; i < endsArray.length; i++) { - if(endsArray[i] == null || startsArray[i] == null) { - continue; - } - for (int j = 0; j < endsArray[i].length; j++) { - ViterbiNode from = endsArray[i][j]; - if(from == null){ - continue; - } - sb.append(formatNodeIfNew(from)); - for (int k = 0; k < startsArray[i].length; k++) { - ViterbiNode to = startsArray[i][k]; - if(to == null){ - break; - } - sb.append(formatNodeIfNew(to)); - sb.append(formatEdge(from, to)); - } - } - } - return sb.toString(); - } - - private String formatNodeIfNew(ViterbiNode node) { - String nodeId = getNodeId(node); - if (! 
this.nodeMap.containsKey(nodeId)) { - this.nodeMap.put(nodeId, node); - return formatNode(node); - } else { - return ""; - } - } - - private String formatHeader() { - StringBuilder sb = new StringBuilder(); - sb.append("digraph viterbi {\n"); - sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n"); - sb.append("# A2 paper size\n"); - sb.append("size = \"34.4,16.5\";\n"); - sb.append("# try to fill paper\n"); - sb.append("ratio = fill;\n"); - sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n"); - sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n"); - - return sb.toString(); - } - - private String formatTrailer() { - return "}"; - } - - - private String formatEdge(ViterbiNode from, ViterbiNode to) { - if (this.bestPathMap.containsKey(getNodeId(from)) && - this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) { - return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 "); - - } else { - return formatEdge(from, to, ""); - } - } - - - private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) { - StringBuilder sb = new StringBuilder(); - sb.append(getNodeId(from)); - sb.append(" -> "); - sb.append(getNodeId(to)); - sb.append(" [ "); - sb.append("label=\""); - sb.append(getCost(from, to)); - sb.append("\""); - sb.append(" "); - sb.append(attributes); - sb.append(" "); - sb.append(" ]"); - sb.append("\n"); - return sb.toString(); - } - - private String formatNode(ViterbiNode node) { - StringBuilder sb = new StringBuilder(); - sb.append("\""); - sb.append(getNodeId(node)); - sb.append("\""); - sb.append(" [ "); - sb.append("label="); - sb.append(formatNodeLabel(node)); - sb.append(" ]"); - return sb.toString(); - } - - private String formatNodeLabel(ViterbiNode node) { - StringBuilder sb = new StringBuilder(); - sb.append("<"); - sb.append(""); - sb.append(""); - // sb.append(""); - sb.append("
"); - sb.append(getNodeLabel(node)); - sb.append("
"); - sb.append(""); - sb.append(node.getWordCost()); - sb.append(""); - sb.append("
"); - // sb.append(this.dictionary.get(node.getWordId()).getPosInfo()); - // sb.append("
>"); - return sb.toString(); - } - - private String getNodeId(ViterbiNode node) { - return String.valueOf(node.hashCode()); - } - - private String getNodeLabel(ViterbiNode node) { - if (node.getType() == Type.KNOWN && node.getWordId() == 0) { - if (this.foundBOS) { - return EOS_LABEL; - } else { - this.foundBOS = true; - return BOS_LABEL; - } - } else { - return node.getSurfaceFormString(); - } - } - - private int getCost(ViterbiNode from, ViterbiNode to) { - return this.costs.get(from.getLeftId(), to.getRightId()); - } -} Index: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java =================================================================== --- lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (revision 1297338) +++ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (working copy) @@ -29,23 +29,27 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.StopwordAnalyzerBase; +import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode; +import org.apache.lucene.analysis.kuromoji.dict.UserDictionary; import org.apache.lucene.util.Version; public class KuromojiAnalyzer extends StopwordAnalyzerBase { - private final Segmenter segmenter; + private final Mode mode; private final Set stoptags; + private final UserDictionary userDict; public KuromojiAnalyzer(Version matchVersion) { - this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS); + this(matchVersion, null, KuromojiTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS); } - public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set stopwords, Set stoptags) { + public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set stoptags) { super(matchVersion, stopwords); - this.segmenter = segmenter; + this.userDict = userDict; + this.mode = mode; this.stoptags = stoptags; } - public static Set getDefaultStopSet(){ + public static CharArraySet getDefaultStopSet(){ return DefaultSetHolder.DEFAULT_STOP_SET; } @@ -58,7 +62,7 @@ * outer class accesses the static final set the first time. */ private static class DefaultSetHolder { - static final Set DEFAULT_STOP_SET; + static final CharArraySet DEFAULT_STOP_SET; static final Set DEFAULT_STOP_TAGS; static { @@ -79,7 +83,7 @@ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader); + Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, true, mode); TokenStream stream = new KuromojiBaseFormFilter(tokenizer); stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags); stream = new CJKWidthFilter(stream); Index: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java =================================================================== --- lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (revision 1297338) +++ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (working copy) @@ -1,214 +0,0 @@ -package org.apache.lucene.analysis.kuromoji; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; -import java.util.EnumMap; -import java.util.List; - -import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts; -import org.apache.lucene.analysis.kuromoji.dict.Dictionary; -import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary; -import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary; -import org.apache.lucene.analysis.kuromoji.dict.UserDictionary; -import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter; -import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi; -import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode; -import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type; - -/** - * Tokenizer main class. - * Thread safe. - */ -public class Segmenter { - public static enum Mode { - NORMAL, SEARCH, EXTENDED - } - - public static final Mode DEFAULT_MODE = Mode.SEARCH; - - private final Viterbi viterbi; - - private final EnumMap dictionaryMap = new EnumMap(Type.class); - - private final boolean split; - - public Segmenter() { - this(null, DEFAULT_MODE, false); - } - - public Segmenter(Mode mode) { - this(null, mode, false); - } - - public Segmenter(UserDictionary userDictionary) { - this(userDictionary, DEFAULT_MODE, false); - } - - public Segmenter(UserDictionary userDictionary, Mode mode) { - this(userDictionary, mode, false); - } - - public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) { - final TokenInfoDictionary dict = TokenInfoDictionary.getInstance(); - final UnknownDictionary unknownDict = UnknownDictionary.getInstance(); - this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode); - this.split = split; - - dictionaryMap.put(Type.KNOWN, dict); - dictionaryMap.put(Type.UNKNOWN, unknownDict); - dictionaryMap.put(Type.USER, userDictionary); - } - - /** - * Tokenize input text - * @param text - * @return list of Token - */ - public List tokenize(String text) { - - if (!split) { - return doTokenize(0, text); - } - - List splitPositions = getSplitPositions(text); - - if(splitPositions.size() == 0) { - return doTokenize(0, text); - } - - ArrayList result = new ArrayList(); - int offset = 0; - for(int position : splitPositions) { - result.addAll(doTokenize(offset, text.substring(offset, position + 1))); - offset = position + 1; - } - - if(offset < text.length()) { - result.addAll(doTokenize(offset, text.substring(offset))); - } - - return result; - } - - /** - * Split input text at 句読点, which is 。 and 、 - * @param text - * @return list of split position - */ - private List getSplitPositions(String text) { - ArrayList splitPositions = new ArrayList(); - - int position = 0; - int currentPosition = 0; - - while(true) { - int indexOfMaru = text.indexOf("。", currentPosition); - int indexOfTen = text.indexOf("、", 
currentPosition); - - if(indexOfMaru < 0 || indexOfTen < 0) { - position = Math.max(indexOfMaru, indexOfTen);; - } else { - position = Math.min(indexOfMaru, indexOfTen); - } - - if(position >= 0) { - splitPositions.add(position); - currentPosition = position + 1; - } else { - break; - } - } - - return splitPositions; - } - - private List doTokenize(int offset, String sentence) { - char text[] = sentence.toCharArray(); - return doTokenize(offset, text, 0, text.length, false); - } - - /** - * Tokenize input sentence. - * @param offset offset of sentence in original input text - * @param sentence sentence to tokenize - * @return list of Token - */ - public List doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) { - ArrayList result = new ArrayList(); - - ViterbiNode[][][] lattice; - try { - lattice = viterbi.build(sentence, sentenceOffset, sentenceLength); - } catch (IOException impossible) { - throw new RuntimeException(impossible); - } - List bestPath = viterbi.search(lattice); - for (ViterbiNode node : bestPath) { - int wordId = node.getWordId(); - if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS - continue; - } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) { - continue; // Do not emit punctuation - } - Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node - result.add(token); - } - - return result; - } - - /** returns a Graphviz String */ - public String debugTokenize(String text) { - ViterbiNode[][][] lattice; - try { - lattice = this.viterbi.build(text.toCharArray(), 0, text.length()); - } catch (IOException impossible) { - throw new RuntimeException(impossible); - } - List bestPath = this.viterbi.search(lattice); - - return new GraphvizFormatter(ConnectionCosts.getInstance()) - .format(lattice[0], lattice[1], bestPath); - } - - static final boolean isPunctuation(char ch) { - switch(Character.getType(ch)) { - case Character.SPACE_SEPARATOR: - case Character.LINE_SEPARATOR: - case Character.PARAGRAPH_SEPARATOR: - case Character.CONTROL: - case Character.FORMAT: - case Character.DASH_PUNCTUATION: - case Character.START_PUNCTUATION: - case Character.END_PUNCTUATION: - case Character.CONNECTOR_PUNCTUATION: - case Character.OTHER_PUNCTUATION: - case Character.MATH_SYMBOL: - case Character.CURRENCY_SYMBOL: - case Character.MODIFIER_SYMBOL: - case Character.OTHER_SYMBOL: - case Character.INITIAL_QUOTE_PUNCTUATION: - case Character.FINAL_QUOTE_PUNCTUATION: - return true; - default: - return false; - } - } -} Index: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java =================================================================== --- lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (revision 1297338) +++ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (working copy) @@ -17,67 +17,1133 @@ * limitations under the License. 
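A minimal before/after sketch of the API change: the deleted Segmenter above exposed a thread-safe tokenize(String) returning a token list, while the new KuromojiTokenizer below runs the same Viterbi search incrementally and is consumed as a TokenStream. The sample text is illustrative, not from this patch:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SegmenterMigration {
  public static void main(String[] args) throws Exception {
    // Before: List<Token> tokens = new Segmenter(Mode.SEARCH).tokenize("お寿司が食べたい。");
    // After: the tokenizer is pulled token by token as a TokenStream:
    Tokenizer tok = new KuromojiTokenizer(new StringReader("お寿司が食べたい。"),
                                          null,        // no user dictionary
                                          true,        // discardPunctuation
                                          Mode.SEARCH);
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString());
    }
    tok.end();
    tok.close();
  }
}
```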
*/ +import java.io.IOException; import java.io.Reader; -import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumMap; import java.util.List; -import java.util.Locale; -import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute; -import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute; -import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute; -import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition; +import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts; +import org.apache.lucene.analysis.kuromoji.dict.Dictionary; +import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary; +import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST; +import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary; +import org.apache.lucene.analysis.kuromoji.dict.UserDictionary; +import org.apache.lucene.analysis.kuromoji.tokenattributes.*; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.util.SegmentingTokenizerBase; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.RollingCharBuffer; +import org.apache.lucene.util.fst.FST; -public final class KuromojiTokenizer extends SegmentingTokenizerBase { - private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN); +// TODO: somehow factor out a reusable viterbi search here, +// so other decompounders/tokenizers can reuse... + +/* Uses a rolling Viterbi search to find the least cost + * segmentation (path) of the incoming characters. For + * tokens that appear to be compound (> length 2 for all + * Kanji, or > length 7 for non-Kanji), we see if there is a + * 2nd best segmentation of that token after applying + * penalties to the long tokens. If so, and the Mode is + * SEARCH_WITH_COMPOUND, we output the alternate + * segmentation as well. 
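 *
 * (Worked example, added for illustration using the constants just
 * below: an all-kanji candidate of length 5 is charged a penalty of
 * (5 - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY
 * = (5-2)*3000 = 9000, while a 9-character non-kanji candidate is
 * charged (9 - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY
 * = (9-7)*1700 = 3400; a 2nd-best segmentation is emitted only when
 * its path cost stays within that penalty of the best path.)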
*/ +public final class KuromojiTokenizer extends Tokenizer { + + public static enum Mode { + NORMAL, SEARCH, EXTENDED + } + + public static final Mode DEFAULT_MODE = Mode.SEARCH; + + enum Type { + KNOWN, + UNKNOWN, + USER + } + + private static final boolean VERBOSE = false; + + private static final int SEARCH_MODE_KANJI_LENGTH = 2; + + private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH + + private static final int SEARCH_MODE_KANJI_PENALTY = 3000; + + private static final int SEARCH_MODE_OTHER_PENALTY = 1700; + + // For safety: + private static final int MAX_UNKNOWN_WORD_LENGTH = 1024; + private static final int MAX_BACKTRACE_GAP = 1024; + + private final EnumMap dictionaryMap = new EnumMap(Type.class); + + private final TokenInfoFST fst; + private final TokenInfoDictionary dictionary; + private final UnknownDictionary unkDictionary; + private final ConnectionCosts costs; + private final UserDictionary userDictionary; + private final CharacterDefinition characterDefinition; + + private final FST.Arc arc = new FST.Arc(); + private final FST.BytesReader fstReader; + private final IntsRef wordIdRef = new IntsRef(); + + private final FST.BytesReader userFSTReader; + private final TokenInfoFST userFST; + + private final RollingCharBuffer buffer = new RollingCharBuffer(); + + private final WrappedPositionArray positions = new WrappedPositionArray(); + + private final boolean discardPunctuation; + private final boolean searchMode; + private final boolean extendedMode; + private final boolean outputCompounds; + + // Index of the last character of unknown word: + private int unknownWordEndIndex = -1; + + // True once we've hit the EOF from the input reader: + private boolean end; + + // Last absolute position we backtraced from: + private int lastBackTracePos; + + // Position of last token we returned; we use this to + // figure out whether to set posIncr to 0 or 1: + private int lastTokenPos; + + // Next absolute position to process: + private int pos; + + // Already parsed, but not yet passed to caller, tokens: + private final List pending = new ArrayList(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class); private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class); private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class); private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class); - private final Segmenter segmenter; - - private List tokens; - private int tokenIndex = 0; - private int sentenceStart = 0; - - public KuromojiTokenizer(Reader input) { - this(new Segmenter(), input); + + public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { + super(input); + dictionary = TokenInfoDictionary.getInstance(); + fst = dictionary.getFST(); + unkDictionary = UnknownDictionary.getInstance(); + characterDefinition = unkDictionary.getCharacterDefinition(); + this.userDictionary = userDictionary; + costs = ConnectionCosts.getInstance(); + fstReader = fst.getBytesReader(0); + if (userDictionary != null) { + userFST = 
userDictionary.getFST(); + userFSTReader = userFST.getBytesReader(0); + } else { + userFST = null; + userFSTReader = null; + } + this.discardPunctuation = discardPunctuation; + switch(mode){ + case SEARCH: + searchMode = true; + extendedMode = false; + outputCompounds = true; + break; + case EXTENDED: + searchMode = true; + extendedMode = true; + outputCompounds = false; + break; + default: + searchMode = false; + extendedMode = false; + outputCompounds = false; + break; + } + buffer.reset(input); + + resetState(); + + dictionaryMap.put(Type.KNOWN, dictionary); + dictionaryMap.put(Type.UNKNOWN, unkDictionary); + dictionaryMap.put(Type.USER, userDictionary); } - - public KuromojiTokenizer(Segmenter segmenter, Reader input) { - super(input, (BreakIterator) proto.clone()); - this.segmenter = segmenter; + + private GraphvizFormatter dotOut; + + /** Expert: set this to produce graphviz (dot) output of + * the Viterbi lattice */ + public void setGraphvizFormatter(GraphvizFormatter dotOut) { + this.dotOut = dotOut; } - + @Override - protected void setNextSentence(int sentenceStart, int sentenceEnd) { - this.sentenceStart = sentenceStart; - // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us? - tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true); - tokenIndex = 0; + public void reset(Reader input) throws IOException { + super.reset(input); + buffer.reset(input); } @Override - protected boolean incrementWord() { - if (tokenIndex == tokens.size()) { - return false; + public void reset() throws IOException { + super.reset(); + resetState(); + } + + private void resetState() { + positions.reset(); + unknownWordEndIndex = -1; + pos = 0; + end = false; + lastBackTracePos = 0; + lastTokenPos = -1; + pending.clear(); + + // Add BOS: + positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN); + } + + @Override + public void end() { + // Set final offset + offsetAtt.setOffset(correctOffset(pos), correctOffset(pos)); + } + + // Returns the added cost that a 2nd best segmentation is + // allowed to have. Ie, if we see path with cost X, + // ending in a compound word, and this method returns + // threshold > 0, then we will also find the 2nd best + // segmentation and if its path score is within this + // threshold of X, we'll include it in the output: + private int computeSecondBestThreshold(int pos, int length) throws IOException { + // TODO: maybe we do something else here, instead of just + // using the penalty...? EG we can be more aggressive on + // when to also test for 2nd best path + return computePenalty(pos, length); + } + + private int computePenalty(int pos, int length) throws IOException { + if (length > SEARCH_MODE_KANJI_LENGTH) { + boolean allKanji = true; + // check if node consists of only kanji + final int endPos = pos + length; + for (int pos2 = pos; pos2 < endPos; pos2++) { + if (!characterDefinition.isKanji((char) buffer.get(pos2))) { + allKanji = false; + break; + } + } + if (allKanji) { // Process only Kanji keywords + return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY; + } else if (length > SEARCH_MODE_OTHER_LENGTH) { + return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY; + } } - Token token = tokens.get(tokenIndex); + return 0; + } + + // Holds all back pointers arriving to this position: + final static class Position { + + int pos; + + int count; + + // maybe single int array * 5? 
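// Descriptive note: these are parallel arrays, one slot per arriving
// path (0..count-1). Per add() below, costs[i] holds the accumulated
// path cost up to this position, lastRightID[i] the right connection
// id of the token that arrived here, and backPos/backIndex/backID/
// backType[i] form the back pointer followed when backtracing.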
+ int[] costs = new int[8]; + int[] lastRightID = new int[8]; + int[] backPos = new int[8]; + int[] backIndex = new int[8]; + int[] backID = new int[8]; + Type[] backType = new Type[8]; + + // Only used when finding 2nd best segmentation under a + // too-long token: + int forwardCount; + int[] forwardPos = new int[8]; + int[] forwardID = new int[8]; + int[] forwardIndex = new int[8]; + Type[] forwardType = new Type[8]; + + public void grow() { + costs = ArrayUtil.grow(costs, 1+count); + lastRightID = ArrayUtil.grow(lastRightID, 1+count); + backPos = ArrayUtil.grow(backPos, 1+count); + backIndex = ArrayUtil.grow(backIndex, 1+count); + backID = ArrayUtil.grow(backID, 1+count); + + // NOTE: sneaky: grow separately because + // ArrayUtil.grow will otherwise pick a different + // length than the int[]s we just grew: + final Type[] newBackType = new Type[backID.length]; + System.arraycopy(backType, 0, newBackType, 0, backType.length); + backType = newBackType; + } + + public void growForward() { + forwardPos = ArrayUtil.grow(forwardPos, 1+forwardCount); + forwardID = ArrayUtil.grow(forwardID, 1+forwardCount); + forwardIndex = ArrayUtil.grow(forwardIndex, 1+forwardCount); + + // NOTE: sneaky: grow separately because + // ArrayUtil.grow will otherwise pick a different + // length than the int[]s we just grew: + final Type[] newForwardType = new Type[forwardPos.length]; + System.arraycopy(forwardType, 0, newForwardType, 0, forwardType.length); + forwardType = newForwardType; + } + + public void add(int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) { + // NOTE: this isn't quite a true Viterbit search, + // becase we should check if lastRightID is + // already present here, and only update if the new + // cost is less than the current cost, instead of + // simply appending. However, that will likely hurt + // performance (usually we add a lastRightID only once), + // and it means we actually create the full graph + // intersection instead of a "normal" Viterbi lattice: + if (count == costs.length) { + grow(); + } + this.costs[count] = cost; + this.lastRightID[count] = lastRightID; + this.backPos[count] = backPos; + this.backIndex[count] = backIndex; + this.backID[count] = backID; + this.backType[count] = backType; + count++; + } + + public void addForward(int forwardPos, int forwardIndex, int forwardID, Type forwardType) { + if (forwardCount == this.forwardID.length) { + growForward(); + } + this.forwardPos[forwardCount] = forwardPos; + this.forwardIndex[forwardCount] = forwardIndex; + this.forwardID[forwardCount] = forwardID; + this.forwardType[forwardCount] = forwardType; + forwardCount++; + } + + public void reset() { + count = 0; + // forwardCount naturally resets after it runs: + assert forwardCount == 0: "pos=" + pos + " forwardCount=" + forwardCount; + } + } + + private void add(Dictionary dict, Position fromPosData, int endPos, int wordID, Type type, boolean addPenalty) throws IOException { + final int wordCost = dict.getWordCost(wordID); + final int leftID = dict.getLeftId(wordID); + int leastCost = Integer.MAX_VALUE; + int leastIDX = -1; + assert fromPosData.count > 0; + for(int idx=0;idx lastTokenPos; + posIncAtt.setPositionIncrement(1); + posLengthAtt.setPositionLength(1); + } + if (VERBOSE) { + System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token); + } + lastTokenPos = token.getPosition(); return true; } + + // TODO: make generic'd version of this "circular array"? 
+ // It's a bit tricky because we do things to the Position + // (eg, set .pos = N on reuse)... + static final class WrappedPositionArray { + private Position[] positions = new Position[8]; + + public WrappedPositionArray() { + for(int i=0;i 0) { + if (nextWrite == -1) { + nextWrite = positions.length - 1; + } + positions[nextWrite--].reset(); + count--; + } + nextWrite = 0; + nextPos = 0; + count = 0; + } + + /** Get Position instance for this absolute position; + * this is allowed to be arbitrarily far "in the + * future" but cannot be before the last freeBefore. */ + public Position get(int pos) { + while(pos >= nextPos) { + //System.out.println("count=" + count + " vs len=" + positions.length); + if (count == positions.length) { + Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + //System.out.println("grow positions " + newPositions.length); + System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite); + System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite); + for(int i=positions.length;i= nextPos - count; + } + + private int getIndex(int pos) { + int index = nextWrite - (nextPos - pos); + if (index < 0) { + index += positions.length; + } + return index; + } + + public void freeBefore(int pos) { + final int toFree = count - (nextPos - pos); + assert toFree >= 0; + assert toFree <= count; + int index = nextWrite - count; + if (index < 0) { + index += positions.length; + } + for(int i=0;i lastBackTracePos && posData.count == 1 && isFrontier) { + // if (pos > lastBackTracePos && posData.count == 1 && isFrontier) { + // We are at a "frontier", and only one node is + // alive, so whatever the eventual best path is must + // come through this node. So we can safely commit + // to the prefix of the best path at this point: + backtrace(posData, 0); + + // Re-base cost so we don't risk int overflow: + posData.costs[0] = 0; + + if (pending.size() != 0) { + return; + } else { + // This means the backtrace only produced + // punctuation tokens, so we must keep parsing. + } + } + + if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) { + // Safety: if we've buffered too much, force a + // backtrace now: + int leastIDX = -1; + int leastCost = Integer.MAX_VALUE; + for(int idx=0;idx posData.pos) { + pos++; + continue; + } + + final char firstCharacter = (char) buffer.get(pos); + if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) { + + // Find unknown match: + final int characterId = characterDefinition.getCharacterClass(firstCharacter); + + // NOTE: copied from UnknownDictionary.lookup: + int unknownWordLength; + if (!characterDefinition.isGroup(firstCharacter)) { + unknownWordLength = 1; + } else { + // Extract unknown word. 
Characters with the same character class are considered to be part of unknown word + unknownWordLength = 1; + for (int posAhead=pos+1;unknownWordLength 0) { + + final Position endPosData = positions.get(pos); + int leastCost = Integer.MAX_VALUE; + int leastIDX = -1; + if (VERBOSE) { + System.out.println(" end: " + endPosData.count + " nodes"); + } + for(int idx=0;idx lastBackTracePos) { + //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX); + final Position posData = positions.get(pos); + assert bestIDX < posData.count; + + int backPos = posData.backPos[bestIDX]; + assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos; + int length = pos - backPos; + Type backType = posData.backType[bestIDX]; + int backID = posData.backID[bestIDX]; + int nextBestIDX = posData.backIndex[bestIDX]; + + if (outputCompounds && searchMode && altToken == null && backType != Type.USER) { + + // In searchMode, if best path had picked a too-long + // token, we use the "penalty" to compute the allowed + // max cost of an alternate back-trace. If we find an + // alternate back trace with cost below that + // threshold, we pursue it instead (but also output + // the long token). + //System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos); + + final int penalty = computeSecondBestThreshold(backPos, pos-backPos); + + if (penalty > 0) { + if (VERBOSE) { + System.out.println(" compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID); + } + + // Use the penalty to set maxCost on the 2nd best + // segmentation: + int maxCost = posData.costs[bestIDX] + penalty; + if (lastLeftWordID != -1) { + maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID); + } + + // Now, prune all too-long tokens from the graph: + pruneAndRescore(backPos, pos, + posData.backIndex[bestIDX]); + + // Finally, find 2nd best back-trace and resume + // backtrace there: + int leastCost = Integer.MAX_VALUE; + int leastIDX = -1; + for(int idx=0;idx " + cost); + } + //System.out.println("penalty " + posData.backPos[idx] + " to " + pos); + //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]); + if (cost < leastCost) { + //System.out.println(" ** "); + leastCost = cost; + leastIDX = idx; + } + } + //System.out.println(" leastIDX=" + leastIDX); + + if (VERBOSE) { + System.out.println(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID); + } + + if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) { + // We should have pruned the altToken from the graph: + assert posData.backPos[leastIDX] != backPos; + + // Save the current compound token, to output when + // this alternate path joins back: + altToken = new Token(backID, + fragment, + backPos - lastBackTracePos, + length, + backType, + backPos, + getDict(backType)); + + // Redirect our backtrace to 2nd best: + bestIDX = leastIDX; + nextBestIDX = posData.backIndex[bestIDX]; + + backPos = posData.backPos[bestIDX]; + length = pos - backPos; + backType = posData.backType[bestIDX]; + backID = posData.backID[bestIDX]; + backCount = 0; + //System.out.println(" do alt token!"); + + } else { + // I think in theory it's possible there is no + // 2nd best path, which is fine; in this case we + // only output the compound token: + 
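// (Annotation: when a cheap-enough alternate path *is* found, the
// long token is kept as altToken and later emitted as a compound
// spanning its sub-tokens, with positionLength set from backCount
// below; for example, 関西国際空港 could be emitted alongside
// 関西/国際/空港. The example word is illustrative, not from this
// patch.)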
//System.out.println(" no alt token! bestIDX=" + bestIDX); + } + } + } + + final int offset = backPos - lastBackTracePos; + assert offset >= 0; + + if (altToken != null && altToken.getPosition() >= backPos) { + + // We've backtraced to the position where the + // compound token starts; add it now: + + // The pruning we did when we created the altToken + // ensures that the back trace will align back with + // the start of the altToken: + // cannot assert... + //assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos; + + if (VERBOSE) { + System.out.println(" add altToken=" + altToken); + } + if (backCount > 0) { + backCount++; + altToken.setPositionLength(backCount); + pending.add(altToken); + } else { + // This means alt token was all punct tokens: + assert discardPunctuation; + } + altToken = null; + } + + final Dictionary dict = getDict(backType); + + if (backType == Type.USER) { + + // Expand the phraseID we recorded into the actual + // segmentation: + final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID); + int wordID = wordIDAndLength[0]; + int current = 0; + for(int j=1; j < wordIDAndLength.length; j++) { + final int len = wordIDAndLength[j]; + //System.out.println(" add user: len=" + len); + pending.add(new Token(wordID+j-1, + fragment, + current + offset, + len, + Type.USER, + current + backPos, + dict)); + if (VERBOSE) { + System.out.println(" add USER token=" + pending.get(pending.size()-1)); + } + current += len; + } + + // Reverse the tokens we just added, because when we + // serve them up from incrementToken we serve in + // reverse: + Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1), + pending.size())); + + backCount += wordIDAndLength.length-1; + } else { + + if (extendedMode && backType == Type.UNKNOWN) { + // In EXTENDED mode we convert unknown word into + // unigrams: + int unigramTokenCount = 0; + for(int i=length-1;i>=0;i--) { + int charLen = 1; + if (i > 0 && Character.isLowSurrogate(fragment[offset+i])) { + i--; + charLen = 2; + } + //System.out.println(" extended tok offset=" + //+ (offset + i)); + if (!discardPunctuation || !isPunctuation(fragment[offset+i])) { + pending.add(new Token(CharacterDefinition.NGRAM, + fragment, + offset + i, + charLen, + Type.UNKNOWN, + backPos + i, + unkDictionary)); + unigramTokenCount++; + } + } + backCount += unigramTokenCount; + + } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) { + pending.add(new Token(backID, + fragment, + offset, + length, + backType, + backPos, + dict)); + if (VERBOSE) { + System.out.println(" add token=" + pending.get(pending.size()-1)); + } + backCount++; + } else { + if (VERBOSE) { + System.out.println(" skip punctuation token=" + new String(fragment, offset, length)); + } + } + } + + lastLeftWordID = dict.getLeftId(backID); + pos = backPos; + bestIDX = nextBestIDX; + } + + lastBackTracePos = endPos; + + if (VERBOSE) { + System.out.println(" freeBefore pos=" + endPos); + } + // Notify the circular buffers that we are done with + // these positions: + buffer.freeBefore(endPos); + positions.freeBefore(endPos); + } + + Dictionary getDict(Type type) { + return dictionaryMap.get(type); + } + + private static boolean isPunctuation(char ch) { + switch(Character.getType(ch)) { + case Character.SPACE_SEPARATOR: + case Character.LINE_SEPARATOR: + case Character.PARAGRAPH_SEPARATOR: + case Character.CONTROL: + case Character.FORMAT: + case Character.DASH_PUNCTUATION: + case Character.START_PUNCTUATION: + 
case Character.END_PUNCTUATION: + case Character.CONNECTOR_PUNCTUATION: + case Character.OTHER_PUNCTUATION: + case Character.MATH_SYMBOL: + case Character.CURRENCY_SYMBOL: + case Character.MODIFIER_SYMBOL: + case Character.OTHER_SYMBOL: + case Character.INITIAL_QUOTE_PUNCTUATION: + case Character.FINAL_QUOTE_PUNCTUATION: + return true; + default: + return false; + } + } } Property changes on: lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java ___________________________________________________________________ Added: svn:eol-style + native Property changes on: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym ___________________________________________________________________ Added: svn:mergeinfo Merged /lucene/dev/branches/lucene_solr_3_2/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r1128223,1128247,1129418,1129472 Merged /lucene/dev/branches/lucene_solr_3_3/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r1138390,1138979,1139775 Merged /lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r931298,931337,931502,932129-932131,932163,932304,932369,932374,932398,932417,932541,932576,932587,932698,932731-932749,932752,932773,932795,932828,932856-932857,932862,932864,932878,932963,932998-932999,933541-933575,933598,933613,933679,933879,934339,934954,935014-935048,935065,935186-935513,935521-935522,935553-935962,936522,936544,936605,936657-936726,937039,937360,938582-938646,938989,939111,939611,939649,940433,940447,940451-940452,940666,940699,940730,940878-940892,940994,941270,941363,941780,942166,942235,942288,942292,942676,942719,943142,943493,943931,945057,945090,945130,945245,945343,945420,946139,946330,946338,946599,948011,948082,948429,949156,949288,949311,949318,949445,949976,949997,950008,950042,950458,950467,950613,950667,951126,951355,951397,951521,953628,955547,955613,955615,955796-955797,955809-955996,956097,956125,956173,956316,956715,957465,957481,957486,957520,957634,957707,960367,960371,960374,960719,962555,963372,963654,963720,963781,963873,963906,963909,963920,964019,964054,964430,964459,964720,964753,964832,964856,965103,965110,965222,965230,965299,965327,965330,965585,966354,966878,967080,979453,979809,980369,980428,980436,980501,980909,980911,980917,981265,981550,981598,981650,981661,981857,981936,982073,982084,982201,982323,982725,982824,983100,983212,983216,983313,983328,983495,983500,983530,983622,983632,983778,984187,984202,984232,984510,984968,985453,985455,985672,985875,986158,986173,986612,987122,988087,988206,988216,988259,988346,988478,988527,988543,988592,988613,988688,988710,988736,988739,989004,989010,989013,989030,989035,989315,989321,989334,989785,990160-990161,990180,990189,990281,990301,990451,990459,990766,990781,990854,991053,991191,991310,991497,992424,992469,992567,992571,992623,993106,993194,993199,993287,993408,994935,994976,994979,995247,995250,995376,995607,995772,996268,996357,996416,996511,996611,996623,996647-996653,996720,996942,996961,996978,997180,997230,998055,998505,998684,999016,999037,999137,999139,999152,999175,999223,999378,999409,999483,999545,999842,999984,1000000,1000424,1000428,1000581,1000597,1000675,1001006,1001010,1001129,1001318,1001420,1001661,1001796,1002002,1002739,1003107,1003291,1003614,1003631,1003645,1003841-1003852,1003873,1003877,1003906,1003938,1003954,1003978,1003990,1004038,1004082,1004179,1004200,1004215,1004241,1004335,1005310,1005356,1005363,100
6146,1006280,1006290,1006324,1021340,1021357,1021360,1021439,1021449,1021969-1021971,1022165,1022191,1022632,1022708-1022710,1022730-1022735,1022748-1022755,1022762-1022793,1022798-1022802,1022805,1022826,1022927,1022939,1022956,1022989,1022998,1023006,1023009,1023022,1023040,1023106,1023235-1023246,1023250,1023264-1023265,1023312,1023329-1023330,1023346-1023347,1023355,1023493,1023509-1023511,1023518,1023520,1023535-1023536,1023562,1023579-1023588,1023594-1023595,1023600-1023602,1023606,1023621,1023635,1023637,1023711,1023845,1023870,1024196,1024219,1024233,1024238,1024256,1024292,1024305,1024338,1024395,1024402,1024408,1024475-1024476,1024486,1025545,1025547,1025570,1025579,1025597,1025669,1025929,1026044,1026058,1026129-1026130,1026167,1026336,1026431,1026446,1026456,1026460,1026592,1026606,1026610,1026738,1026841,1026868,1026882,1027743,1027788,1027998,1028039,1028386,1029096,1029325,1029333,1029345,1030012,1030019,1030073,1030078,1030754,1031076,1031219,1031460,1031467,1031474,1031480,1031496,1031686,1031689,1032570,1032776,1034007,1034011,1034017,1034342,1034361,1034763,1034921,1034975,1034977,1035096,1035103,1035194,1035205,1035214,1035395,1035397,1035420,1035535,1035651,1035996,1036088,1036970,1037077,1037154,1037223,1037406,1037429,1038562,1038785,1039068,1039314,1039688,1039737,1039759,1039773,1039778,1039868,1039911,1039917,1039962-1039967,1040064,1040290,1040390,1040447,1040463,1040608,1040815,1040935,1040940,1040982,1041844,1041914,1041954,1041963,1042008,1042185,1042213,1042315,1042359,1042373,1043071,1043114,1043148,1043277,1043693,1043749,1044066-1044069,1044098,1044257,1044315,1044328,1044505,1044561,1044635,1044660,1044854,1044867,1045010,1045212,1045266,1045310,1045315,1045322-1045323,1049094,1049107,1049117,1049131-1049132,1049144,1049187,1049413,1049502,1049693,1049918,1050063,1050084,1050687,1050697-1050725,1050728,1050733,1050737,1050813,1050827,1051041,1051058,1051305,1051715,1051872,1051891,1052898,1052926,1052974,1052980,1052991,1053236,1053405,1053509,1053896,1054015,1054164,1054172,1054405-1054406,1055285,1055408,1055435,1055595,1055877,1055892-1055906,1056014,1056428,1056702,1056821,1056955,1057010,1057149,1057221,1057340,1058284-1058288,1058324,1058393,1058939,1059426,1059719,1059866,1060023,1060324,1060437,1060608,1060779,1060807,1060846,1060872,1060997,1061050,1061065,1061078,1061350,1061424,1061499,1061622,1062070,1062123,1062153,1062319,1062451,1062454,1062509,1062604,1062633,1062876,1062879,1063323,1063333,1063478,1063493,1063498,1063501,1063513,1063702,1063762,1063837,1063842,1063868-1063869,1063877,1063897,1063908,1063920,1064330,1064379,1064735,1064781,1064844,1064942,1065059,1065095-1065096,1065102,1065261,1065265,1065272,1065286,1065302,1065304,1065327,1065337,1065410,1065416,1065465,1065474,1065572,1065601,1065621,1065719,1065853,1065855,1065891,1066008,1066691,1066764,1066819,1066850,1066889,1067119,1067131,1067160,1067163,1067165,1067299,1067427,1067551,1068387,1068979,1069316,1069341,1069656,1070183,1070185,1070206,1070240,1070321,1070691,1070760,1070879,1071074,1071417,1071435,1071569,1071594,1071654-1071655,1071658,1072127,1072250,1072567,1072591,1072607,1072683,1073336,1073806,1073850,1073957,1074009,1074017,1074226,1074326,1074357,1074726,1074750,1074952,1075023-1075024,1075069,1075072,1075079,1075089,1075103,1075184,1075190-1075191,1075196,1075287,1075443,1075505,1075850,1076032,1076237,1076279,1076311,1076315,1076319,1076325,1076433,1076884,1077908,1077916,1078058,1078117,1078127,1078398,1078448,1078451,1078463,1078471,1078500-1078501,107851
2-1078515,1078529,1078540,1078553,1078563,1078570,1078580,1078599,1078614,1078639,1078659,1078670,1078681,1078770,1079707,1079786,1079949,1080038,1080258,1080424,1080443,1080445,1080647,1080665,1080691,1080762,1080970,1080979,1080985,1080988,1081012,1081017,1081777-1081778,1081790-1081791,1081795,1082186,1082514-1082516,1082601,1082687,1082720,1082730,1082776,1082865,1082919,1082926,1083010,1083213,1083447,1083459,1083991,1084045,1084210,1084247,1084273-1084274,1084327,1084544,1084549,1084566,1084929,1085004,1085089,1085224,1085241,1085423,1085515,1085530,1085689,1086276,1086584,1086629,1086821,1087319,1087426,1087722,1088021,1089335,1089813,1089815,1089906,1089918,1091132-1091159,1091499,1092105,1092136,1092328,1092396,1092812,1092848,1094014,1094214,1095120,1095260,1095432,1095517,1095861,1095937,1096073,1096077,1096178-1096183,1096194,1096249,1096301,1096315,1096334,1096339,1097187,1097216,1097627,1098303,1098357,1098367,1098375,1098532,1098633,1098730,1098740,1098800,1098860,1099041,1099340,1099529,1099582,1099745,1099999,1100435,1100437,1101047,1101056,1101072,1101088,1101539,1101572,1101574,1102058,1102120,1102290,1102377,1102658,1102718,1102785,1102817,1102827,1102907,1103024,1103048,1103077,1103102,1103120,1103155,1103979,1103983,1104421,1104432,1104452,1104519,1124160,1124266,1124293,1124307,1124316,1124330,1124366,1125006,1125150,1125165,1125376,1125932,1125972,1126022,1126091,1126280,1126284,1126487,1126573,1126642,1126645,1126761,1127156,1127247,1127301,1127436,1128105,1128246,1128253,1128549,1128830,1128844,1128854,1128856,1129398,1129403,1129413,1129427,1129450,1129453,1129456,1129459,1129465,1129645,1129656,1129694,1130039,1130042,1130052,1130063,1130150,1130439,1130527,1130547,1130648,1130852,1130858-1130859,1130861,1130954-1131005,1131150,1131158,1131371,1131395,1131401,1132391,1132517,1132620,1132729,1132806,1132855,1132969,1133021,1133136,1133187,1133330,1133383,1133385,1133486,1133553,1133565,1133599,1133616,1133631,1133646,1133839,1133937,1134163,1134328,1134515,1134592,1134685,1134763,1134781,1134895,1134995,1134998,1135009,1135011,1135154,1135204,1135300,1135369,1135509,1135525,1135527,1135537,1135650,1135658,1135670,1135764,1135801,1135818,1135822,1135825,1135954,1136027,1136080,1136357,1136467,1136568,1136605,1136644,1136789,1136792,1137054,1137060,1137064,1137162,1137211,1137330,1137357,1137477,1137480,1137529,1137533,1137665,1137733,1137882,1138030,1138069,1138319,1138405,1138446,1138450,1138821,1138890,1139054,1139173,1139178,1139188,1139199,1139285,1139513,1139789,1139995,1140004,1140119,1140243,1140252,1140498,1140574,1140720,1140827,1140836,1140851,1141167,1141170,1141295,1141400,1141593,1141629,1141999,1142179,1143122,1143189,1143238,1143420,1143558,1143766,1143783,1143878,1144294,1144415,1144513,1144792,1144841,1145158,1145163,1145182,1145198,1145233,1145239,1145255,1145263,1145292,1145442,1145479,1145502,1145518,1145594,1145657,1145701,1145730,1145885,1145925,1145957,1146638,1146984,1147023,1147578,1147586,1147671,1147691,1147807,1147881,1148596,1148602,1148681,1148728,1148763,1148968,1149028,1149050,1149108,1149256,1149740,1149746,1150091,1150362,1150384,1150389,1150394,1150404-1150405,1150415,1150478,1150480,1150486-1150489,1150671,1150840,1151081,1151146,1151720,1151782,1151984,1151997,1152024,1152055,1152089,1152288,1152456,1152525,1152530,1152653,1152669,1152892,1153399,1153408,1153844,1154005,1154926,1154936,1155278,1156053,1156590-1156591,1157437,1158342,1158697,1158730,1158819,1158832,1159291,1159418,1159627,1160832,1161488,1161505,1161964,1161966,1
161972,1161974,1162135,1162156,1162158,1162166,1162375,1162394,1162401,1163370,1163568,1163576,1163589,1163625,1164287,1164311,1164620,1164956,1165902,1165995,1166106,1166457,1166530,1166541,1166582,1166656,1166702,1166715,1166728,1166784,1166850,1166866,1166954,1167008,1167199,1167467,1169612,1169816,1169820,1170157,1170203,1170586,1170616,1170699,1170716,1170725,1170908,1171556,1171570,1171597,1171691,1171704,1171739,1172227,1173139,1173423,1173430,1173720,1173778,1173961,1174377-1174407,1175300,1175376,1175385,1175397,1175413,1175425,1175475,1175529,1175579,1175650,1175696,1175699,1175956,1175975,1176097,1176114,1176478,1176772,1176774,1177048-1177049,1177723,1177940,1178612,1178923,1179315,1179677,1179762,1179956,1180124,1181265,1181268,1181299,1181659,1181664,1181760,1182982,1183458,1183464,1183582,1183738,1183753,1183756,1184753-1184754,1184761,1184822,1184851,1184877,1185120,1187900,1188597,1188777,1188975,1189039,1189160,1189186,1189655,1189903,1189958,1190029,1190107,1190410,1195082,1195101,1195275,1196228,1197469,1197690,1197742,1197879,1198009,1198024,1198039,1198089,1198134,1198332,1198371,1198636,1198777-1198778,1198911,1199405,1199832,1199837,1200007,1200051,1200080,1200274,1200440,1200480,1200854,1201036,1201165,1201191,1201329,1201375,1201855,1202152,1202657,1202754,1202969,1203114,1203206,1203756,1203966,1203970,1204416,1204453,1205021,1205152,1205342,1205360,1205366,1205430,1205774,1205954,1206017,1206033,1206070,1206143,1206229,1206436-1206437,1206452,1206707,1206767,1206789,1206996,1207070,1207103,1207291,1207577,1207718,1208032,1208118,1208509,1208525,1210020,1210054,1210469,1210714,1211710,1211827,1211887,1212894,1213013,1213016,1213020,1213033,1213044,1213106,1213329,1213704,1213706,1213800,1213803,1213824,1213826,1213910,1214012,1214376,1214413,1214540,1215018,1215349,1215352,1220426,1220458,1220555,1220705,1220795,1221195,1221368-1221369,1222367-1222368,1225120,1225211,1225231,1225233,1225433,1225920,1226417,1226455,1226793,1226821,1226871,1227439,1228650,1228704,1228727,1228928,1229519,1229523,1229602,1229713,1231223,1231367,1231512,1231514,1231665,1231788,1231795,1232470,1232491,1232769,1232943,1233381,1233583,1233696,1233708,1234396,1234452,1234546,1234598,1234652,1234687,1234850,1234867,1235187,1235228,1235753,1236429,1236431,1237497,1237500,1237506,1237528,1237809,1238832,1238851,1239040,1239052-1239056,1239061,1239316,1239658,1240034-1240081,1240655,1240980,1241355,1241588,1241596,1241598,1241741,1241878,1241986,1242497,1242557,1242740,1242890,1242903,1243278,1243656,1244379,1244458,1244536,1244552,1245710,1245715,1245947,1291020,1291097,1291184,1291541,1291703,1291728,1292282,1292864,1292881,1293728,1293821-1293823,1294856,1294920,1295067,1296237,1296805,1297001,1297048,1297162-1297168
   Merged /lucene/java/branches/lucene_2_9_back_compat_tests/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r818601-821336
   Merged /lucene/dev/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r932749,1141465
   Merged /lucene/dev/branches/preflexfixes/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r967125-979432
   Merged /lucene/java/branches/lucene_3_0/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r880793,896906,1098765
   Merged /lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym:r1296805
   Merged /lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r924483-925561
   Merged /lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r924791,924850,930201
   Merged /lucene/java/branches/lucene_2_4/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r748824
   Merged /lucene/dev/branches/lucene_solr_3_1/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r1081856,1083239,1085499,1085511,1085532,1085809,1101103
   Merged /lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym:r817269-818600,825998,829134,829881,831036,896850,909334,948516
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java	(revision 1297357)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java	(working copy)
@@ -112,6 +112,8 @@
 
   private int captureCount;
 
+  // TODO: we should set PositionLengthAttr too...
+
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(revision 1297338)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java	(working copy)
@@ -17,13 +17,18 @@
  * limitations under the License.
  */
 
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
-import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
-
+ 
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
@@ -83,7 +88,7 @@
     }
   }
 
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -107,6 +112,12 @@
       assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
       posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
     }
+
+    PositionLengthAttribute posLengthAtt = null;
+    if (posLengths != null) {
+      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
+      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+    }
 
     ts.reset();
     for (int i = 0; i < output.length; i++) {
@@ -116,6 +127,7 @@
       if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
       if (typeAtt != null) typeAtt.setType("bogusType");
       if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
 
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
@@ -130,6 +142,8 @@
         assertEquals("type "+i, types[i], typeAtt.type());
       if (posIncrements != null)
         assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
+      if (posLengths != null)
+        assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
 
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
@@ -138,14 +152,18 @@
         assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
         if (finalOffset != null) {
           assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
-          assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
+          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
+                     offsetAtt.endOffset() <= finalOffset.intValue());
         }
       }
       if (posIncrAtt != null) {
         assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
       }
+      if (posLengthAtt != null) {
+        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+      }
     }
-    assertFalse("end of stream", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
     if (finalOffset != null)
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -155,65 +173,81 @@
     ts.close();
   }
 
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, null, null);
+    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, types, null, null);
+    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
  }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
   }
 
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
+  }
+
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
 
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
+  }
+
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, null);
+    assertAnalyzesTo(a, input, output, null, null, null, null, null);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, types, null);
+    assertAnalyzesTo(a, input, output, null, null, types, null, null);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
   }
+
+  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
+  }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
   }
 
   public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
 
   public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
@@ -326,7 +360,7 @@
     }
 
     if (VERBOSE) {
-      System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
     }
 
     int remainder = random.nextInt(10);
@@ -336,10 +370,12 @@
     CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
     OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
     PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+    PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
     TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
     List<String> tokens = new ArrayList<String>();
     List<String> types = new ArrayList<String>();
     List<Integer> positions = new ArrayList<Integer>();
+    List<Integer> positionLengths = new ArrayList<Integer>();
     List<Integer> startOffsets = new ArrayList<Integer>();
     List<Integer> endOffsets = new ArrayList<Integer>();
     ts.reset();
@@ -347,6 +383,7 @@
       tokens.add(termAtt.toString());
       if (typeAtt != null) types.add(typeAtt.type());
       if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
       if (offsetAtt != null) {
         startOffsets.add(offsetAtt.startOffset());
         endOffsets.add(offsetAtt.endOffset());
@@ -357,11 +394,21 @@
     // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
     if (!tokens.isEmpty()) {
       if (VERBOSE) {
-        System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
       }
       reader = new StringReader(text);
       ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-      if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+      if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength + type
+        assertTokenStreamContents(ts,
+          tokens.toArray(new String[tokens.size()]),
+          toIntArray(startOffsets),
+          toIntArray(endOffsets),
+          types.toArray(new String[types.size()]),
+          toIntArray(positions),
+          toIntArray(positionLengths),
+          text.length());
+      } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
        // offset + pos + type
         assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
@@ -369,7 +416,18 @@
           toIntArray(endOffsets),
           types.toArray(new String[types.size()]),
           toIntArray(positions),
+          null,
           text.length());
+      } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength
+        assertTokenStreamContents(ts,
+          tokens.toArray(new String[tokens.size()]),
+          toIntArray(startOffsets),
+          toIntArray(endOffsets),
+          null,
+          toIntArray(positions),
+          toIntArray(positionLengths),
+          text.length());
       } else if (posIncAtt != null && offsetAtt != null) {
         // offset + pos
         assertTokenStreamContents(ts,
@@ -378,6 +436,7 @@
           toIntArray(endOffsets),
           null,
           toIntArray(positions),
+          null,
           text.length());
       } else if (offsetAtt != null) {
         // offset
@@ -387,6 +446,7 @@
           toIntArray(endOffsets),
           null,
           null,
+          null,
           text.length());
       } else {
         // terms only
@@ -396,6 +456,22 @@
       }
     }
   }
+
+  protected String toDot(Analyzer a, String inputText) throws IOException {
+    final StringWriter sw = new StringWriter();
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
+    return sw.toString();
+  }
+
+  protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
+    Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8");
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
+    w.close();
+  }
 
   static int[] toIntArray(List<Integer> list) {
     int ret[] = new int[list.size()];

Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
___________________________________________________________________
Added: svn:eol-style
   + native
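
For illustration only, not part of the patch: a minimal sketch of how a test might use the new posLength-aware assertions and the toDot() helper added to BaseTokenStreamTestCase above. The analyzer, input, and expected arrays below are hypothetical; a real test would use an analyzer (for example one wrapping SynonymFilter, once it sets PositionLengthAttribute per the TODO above) whose multi-word expansions actually produce tokens with positionLength > 1.

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  import org.apache.lucene.analysis.MockAnalyzer;

  public class TestPositionLengthSketch extends BaseTokenStreamTestCase {

    public void testPositionLengths() throws Exception {
      // Hypothetical analyzer under test; MockAnalyzer stands in for one that
      // stacks multi-word synonyms (e.g. "dns" -> "domain name service").
      Analyzer a = new MockAnalyzer(random);

      // posIncrements of 0 mark tokens stacked at the same position;
      // posLengths say how many positions each token spans, so a "dns"
      // token spanning the three-word expansion would carry posLength=3.
      // These expected values are hypothetical:
      assertAnalyzesToPositions(a, "domain name service",
          new String[] {"dns", "domain", "name", "service"},
          new int[]    {1, 0, 1, 1},   // position increments
          new int[]    {3, 1, 1, 1});  // position lengths
    }

    public void testTokenGraphDump() throws Exception {
      Analyzer a = new MockAnalyzer(random);
      // Renders the token stream as a Graphviz dot graph for debugging:
      System.out.println(toDot(a, "some input text"));
    }
  }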