Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java =================================================================== --- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (revision 1306883) +++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java (working copy) @@ -584,7 +584,44 @@ } */ - + /* + public void testSegmentWikipedia() throws Exception { + final File i = new File("/Users/cm/Projects/jawiki/jawiki-20110522-pages-articles-100000.xml"); + final Reader r = new BufferedReader(new InputStreamReader(new FileInputStream(i), "UTF-8")); + +// final File o = new File("jawiki-tok-fast2.txt"); +// final Writer w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(o), "UTF-8")); + + final TokenStream ts = analyzer.tokenStream("ignored", r); + final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + + long startTime = System.currentTimeMillis(); + long tokens = 0; + + ts.reset(); + + System.out.println("Reading from " + i.getAbsolutePath()); +// System.out.println("Writing to " + o.getAbsolutePath()); + + while(ts.incrementToken()) { +// w.write(termAtt.toString()); +// w.write("\n"); + tokens++; + } + + long processingSecs = (System.currentTimeMillis() - startTime) / 1000; + + long minutes = processingSecs / 60; + long seconds = processingSecs % 60; + + System.out.println("Tokenized " + tokens + " tokens in total"); + System.out.println(String.format("Processing took %dm %ds (total seconds %d)", minutes, seconds, processingSecs)); + ts.end(); + r.close(); +// w.close(); + } + */ + private void doTestBocchan(int numIterations) throws Exception { LineNumberReader reader = new LineNumberReader(new InputStreamReader( this.getClass().getResourceAsStream("bocchan.utf-8"))); Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java 
=================================================================== --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java (revision 1306883) +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/ConnectionCosts.java (working copy) @@ -27,35 +27,38 @@ import org.apache.lucene.util.IOUtils; /** - * n-gram connection cost data + * Connection cost data */ public final class ConnectionCosts { public static final String FILENAME_SUFFIX = ".dat"; public static final String HEADER = "kuromoji_cc"; public static final int VERSION = 1; - - private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter. - + + // array is flattened row-major with backward IDs first (index = backwardId * forwardSize + forwardId) -- it gets called using the same backward id consecutively. maybe doesn't matter. + private final short[] costs; + private final int forwardSize; + private ConnectionCosts() throws IOException { IOException priorE = null; InputStream is = null; - short[][] costs = null; + short[] costs = null; + int forwardSize = 0; + int backwardSize = 0; try { is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX); is = new BufferedInputStream(is); final DataInput in = new InputStreamDataInput(is); CodecUtil.checkHeader(in, HEADER, VERSION, VERSION); - int forwardSize = in.readVInt(); - int backwardSize = in.readVInt(); - costs = new short[backwardSize][forwardSize]; + forwardSize = in.readVInt(); + backwardSize = in.readVInt(); + costs = new short[backwardSize * forwardSize]; int accum = 0; - for (int j = 0; j < costs.length; j++) { - final short[] a = costs[j]; - for (int i = 0; i < a.length; i++) { + for (int j = 0; j < backwardSize; j++) { + for (int i = 0; i < forwardSize; i++) { int raw = in.readVInt(); accum += (raw >>> 1) ^ -(raw & 1); - a[i] = (short)accum; + costs[j * forwardSize + i] = (short)accum; } } } catch (IOException ioe) { @@ -65,10 +68,13 @@ } this.costs = costs; + 
this.forwardSize = forwardSize; } public int get(int forwardId, int backwardId) { - return costs[backwardId][forwardId]; + // since this is called consecutively with the same backwards id, it seems like a + // good idea to have backwards id first to hint caching; each backward row holds forwardSize entries + return costs[backwardId * forwardSize + forwardId]; } public static ConnectionCosts getInstance() { @@ -84,6 +90,5 @@ throw new RuntimeException("Cannot load ConnectionCosts.", ioe); } } - } - + } }