Index: lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
===================================================================
--- lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java	(revision 1532951)
+++ lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java	(working copy)
@@ -402,8 +402,9 @@
         if(cnounMap.get(entry.getWord())!=null) continue;
          
         // 한글과 매치되는 한자를 짤라서 큐에 저장한다.           
-        morphQueue.add(new IndexWord(term.substring(offset,pos),offset));
-         
+        // nocommit: this is avoiding AIOOBE, original code:
+        // morphQueue.add(new IndexWord(term.substring(offset,pos),offset));
+        morphQueue.add(new IndexWord(term.substring(offset,Math.min(pos, term.length())),offset));
         cnounMap.put(entry.getWord(), entry.getWord());
          
         if(entry.getWord().length()<2) continue; //  한글은 2글자 이상만 저장한다.
Index: lucene/core/src/test/org/apache/lucene/HantecRel.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/HantecRel.java	(revision 0)
+++ lucene/core/src/test/org/apache/lucene/HantecRel.java	(working copy)
@@ -0,0 +1,292 @@
+package org.apache.lucene;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.benchmark.quality.Judge;
+import org.apache.lucene.benchmark.quality.QualityBenchmark;
+import org.apache.lucene.benchmark.quality.QualityQuery;
+import org.apache.lucene.benchmark.quality.QualityQueryParser;
+import org.apache.lucene.benchmark.quality.QualityStats;
+import org.apache.lucene.benchmark.quality.trec.TrecJudge;
+import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader;
+import org.apache.lucene.benchmark.quality.utils.SimpleQQParser;
+import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.similarities.AfterEffectL;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.BasicModelG;
+import org.apache.lucene.search.similarities.DFRSimilarity;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.NormalizationH2;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
+
+/** 
+ * Runs relevance tests against HANTEC-2.0 corpus.
+ * <p>
+ * This thing is a beast:
+ * <ul>
+ *   <li>8 different document collections (each in its own format!!!)
+ *      <ul>
+ *         <li>Korea Times News: 22,000 docs (hkib94)
+ *         <li>Web documents (gov and com domains): 18,000 docs (www)
+ *         <li>Korea Economic Daily Articles: 39,480 docs (ked94)
+ *         <li>Korea Women's Development Institute Papers: 110 docs (kwdi)
+ *         <li>Kyungbuk Congressional proceedings: 410 docs (kyungbuk)
+ *         <li>Korea Institute of Sci&amp;Tech research abstracts: 10,000 docs (krist)
+ *         <li>Science and Technology trend abstracts: 18,000 docs (trend)
+ *         <li>International East Asian research abstracts: 12,000 docs (saturn)
+ *      </ul>
+ *   <li><p>
+ *       8 different relevance files, from pooled assessment of the top 50 results from 41 runs.
+ *       Run configurations included various ranking models, query expansion, n-gram and morphological analysis techniques. 
+ *       Then 8 human assessors in pairs applied a funky verification process:
+ *       Documents were verified as 1-5 by both individuals, with 1 being irrelevant and 5 being most relevant.
+ *       Qrel files are marked as such, additionally with G (greatest) and L (lowest).
+ *       so G3 means greatest assessment was a 3, and L2 means the lowest assessment was a 2 and so on.
+ *       </p>
+ *   <li>documents and queries are in EUC-KR/cp949-ish encodings, but with "some problems" 
+ *   <li>any documentation on this monster is in Korean (so this entire javadoc header could easily be wrong)
+ * </ul>
+ * @see <a href="http://ir.kaist.ac.kr/anthology/2000.10-%EA%B9%80%EC%A7%80%EC%98%81.pdf">Extension and Validation of Hangul Test Collection (HANTEC)</a>
+ */
+public class HantecRel {
+  // constants you must set correctly: the paths below are machine-specific and are not read from args
+  
+  /** location of HANTEC-2 root; its DATA, query and relevance_file subdirectories are read */
+  public static final File ROOT = new File("/data/HANTEC-2.0");
+  /** location of trec_eval (good luck during government shutdown if you don't already have it) */
+  public static final File TREC_EVAL_EXE = new File("/home/rmuir/Downloads/trec_eval.8.1/trec_eval");
+  /** location to put the index; it is opened with OpenMode.CREATE, so existing contents are replaced */
+  public static final File INDEX = new File("/data/indices/hantec2");
+  
+  // analyzers to test; main() rebuilds the index once per entry
+  static final Analyzer[] analyzers = new Analyzer[] {
+    new StandardAnalyzer(Version.LUCENE_CURRENT),
+    new CJKAnalyzer(Version.LUCENE_CURRENT),
+    new KoreanAnalyzer(),
+    new MecabAnalyzer(), // NOTE(review): no import for MecabAnalyzer in this file; presumably same package -- confirm
+  };
+  
+  // ranking algorithms; main() evaluates each one against every analyzer's index
+  static final Similarity[] models = new Similarity[] {
+    new DefaultSimilarity(),
+    new BM25Similarity(),
+    new DFRSimilarity(new BasicModelG(), new AfterEffectL(), new NormalizationH2())
+  };
+  
+  /** For each analyzer: rebuild the index, then evaluate every ranking model on title queries against the L2 qrels. */
+  public static void main(String args[]) throws Exception {
+    for (final Analyzer analyzer : analyzers) {
+      final String analyzerName = analyzer.getClass().getSimpleName();
+      System.out.println("Analyzer: " + analyzerName);
+      doIndex(analyzer);
+      for (final Similarity model : models) { doTest(analyzer, model, "T", "L2"); }
+    }
+  }
+  
+  /** Rebuilds the index at INDEX from all eight collections using {@code analyzer}; prints build time and index size. */
+  static void doIndex(Analyzer analyzer) throws Exception {
+    INDEX.mkdirs();
+    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
+    iwc.setOpenMode(OpenMode.CREATE);
+    String[] corpora = { "hkib94", "ked94", "krist", "kwdi", "kyungbuk", "saturn", "trend", "www" };
+    long size = 0;
+    long startTime;
+    long endTime;
+    // try-with-resources: writer and directory are closed even when indexing throws
+    try (Directory dir = FSDirectory.open(INDEX)) {
+      try (IndexWriter writer = new IndexWriter(dir, iwc)) {
+        startTime = System.currentTimeMillis();
+        for (String corpus : corpora) {
+          index_corpus(writer, corpus);
+        }
+        endTime = System.currentTimeMillis();
+        writer.forceMerge(1); // to compare index size
+      }
+      for (String file : dir.listAll()) { // sized only after the writer has closed, as before
+        size += dir.fileLength(file);
+      }
+    }
+    System.out.println("index time (s): " + (endTime-startTime)/1000);
+    System.out.println("index size (MB): " + size/1048576);
+  }
+  
+  /**
+   * Indexes every file of ROOT/DATA/{@code dir} with {@code writer}.
+   * Parsing algo is simple: {@code <DOCID>} goes into docname field, everything else in body.
+   * @throws IllegalStateException if the collection directory cannot be listed
+   */
+  static void index_corpus(IndexWriter writer, String dir) throws Exception {
+    File d = new File(new File(ROOT, "DATA"), dir);
+    File files[] = d.listFiles();
+    if (files == null) { // listFiles() reports a missing/unreadable directory as null, not as an exception
+      throw new IllegalStateException("cannot list collection directory: " + d);
+    }
+    Charset cs = Charset.forName("cp949");
+    for (File f : files) {
+      // legacy charsets: bad bytes are replaced instead of aborting the run
+      CharsetDecoder decoder = cs.newDecoder()
+        .onMalformedInput(CodingErrorAction.REPLACE)
+        .onUnmappableCharacter(CodingErrorAction.REPLACE);
+      Document doc = new Document();
+      StringField idField = new StringField("docname", "", Field.Store.YES);
+      TextField bodyField = new TextField("body", "", Field.Store.NO);
+      doc.add(idField);
+      doc.add(bodyField);
+      // try-with-resources: the file handle is released even when indexing throws
+      try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(f), decoder))) {
+        String id = null;
+        StringBuilder body = new StringBuilder();
+        String line;
+        do { // the extra pass with line == null (EOF) flushes the last pending document
+          line = r.readLine();
+          if (line == null || line.startsWith("<DOCID>")) {
+            if (id != null) { // nothing is pending before the first <DOCID>, or in a file without one
+              idField.setStringValue(id);
+              bodyField.setStringValue(body.toString());
+              writer.addDocument(doc);
+            }
+            body.setLength(0);
+            id = (line == null) ? null : line.substring("<DOCID>".length()).trim();
+          } else {
+            // nuke any start tags
+            int end = line.startsWith("<") ? line.indexOf('>') : -1;
+            body.append(end > 0 && end < 10 ? line.substring(end+1) : line);
+            body.append(' ');
+          }
+        } while (line != null);
+      }
+    }
+  }
+  
+  /**
+   * Searches the index with {@code similarity}, writes the run to /tmp/submission.txt and prints trec_eval's "map" line.
+   * {@code fieldSpec} picks the query fields ('T' = title, 'D' = description); {@code measure} names the qrels file.
+   */
+  static void doTest(Analyzer analyzer, Similarity similarity, String fieldSpec, String measure) throws Exception {
+    File topicsFile = new File(new File(ROOT, "query"), "query_total.txt");
+    File qrelsFile = new File(new File(new File(ROOT, "relevance_file"), "full"), measure + ".rel");
+    File submissionFile = new File("/tmp/submission.txt");
+    QualityQuery qqs[] = readTopics(topicsFile);
+
+    Set<String> fieldSet = new HashSet<String>();
+    if (fieldSpec.indexOf('T') >= 0) fieldSet.add("title");
+    if (fieldSpec.indexOf('D') >= 0) fieldSet.add("description");
+    // set the parsing of quality queries into Lucene queries.
+    QualityQueryParser qqParser = new SimpleQQParser(fieldSet.toArray(new String[0]), "body", analyzer);
+
+    // everything is closed here, so the submission is complete on disk before trec_eval reads it
+    try (FSDirectory dir = FSDirectory.open(INDEX);
+         IndexReader reader = DirectoryReader.open(dir);
+         BufferedReader qrels = new BufferedReader(IOUtils.getDecodingReader(qrelsFile, IOUtils.CHARSET_UTF_8));
+         PrintWriter submission = new PrintWriter(submissionFile, "UTF-8")) {
+      IndexSearcher searcher = new IndexSearcher(reader);
+      searcher.setSimilarity(similarity);
+      // prepare judge, with trec utilities that read from a QRels file
+      Judge judge = new TrecJudge(qrels);
+      // validate topics & judgments match each other
+      judge.validateData(qqs, null);
+      // run the benchmark
+      QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, "docname");
+      qrun.setMaxResults(1000);
+      qrun.execute(judge, new SubmissionReport(submission, "lucene"), null);
+    }
+
+    // sorry Uwe: get unix already
+    ProcessBuilder pb = new ProcessBuilder(TREC_EVAL_EXE.getAbsolutePath(), qrelsFile.getAbsolutePath(), submissionFile.getPath());
+    pb.redirectErrorStream(true); // fold stderr into stdout so an undrained pipe cannot block trec_eval
+    Process p = pb.start();
+    try (BufferedReader br = new BufferedReader(new InputStreamReader(p.getInputStream(), Charset.defaultCharset()))) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("map")) {
+          String[] values = line.split("\\s+");
+          System.out.println(similarity.getClass().getSimpleName().replace("Similarity", "") + "\t" + measure + ": " + values[2]);
+        }
+      }
+    }
+    if (p.waitFor() != 0) {
+      throw new IllegalStateException("trec_eval exited with " + p.exitValue() + " for " + submissionFile);
+    }
+  }
+  
+  /** Parses the cp949 topics file ({@code <num>}, {@code <title>}, {@code <desc>} lines); fails unless it has exactly 50 topics. */
+  static QualityQuery[] readTopics(File f) throws Exception {
+    ArrayList<QualityQuery> qqs = new ArrayList<QualityQuery>();
+    CharsetDecoder decoder = Charset.forName("cp949").newDecoder() // legacy charsets
+      .onMalformedInput(CodingErrorAction.REPLACE)
+      .onUnmappableCharacter(CodingErrorAction.REPLACE);
+    // try-with-resources: the reader is closed even when parsing or the number conversion fails
+    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(f), decoder))) {
+      String line;
+      String num = null;
+      Map<String,String> map = new HashMap<>();
+      while ((line = r.readLine()) != null) {
+        if (line.startsWith("<num>")) {
+          if (num != null) {
+            qqs.add(new QualityQuery(num, map));
+            map = new HashMap<>();
+          }
+          num = Integer.toString(Integer.parseInt(line.substring("<num>".length()).trim()));
+        } else if (line.startsWith("<title>")) {
+          map.put("title", line.substring("<title>".length()));
+        } else if (line.startsWith("<desc>")) {
+          map.put("description", line.substring("<desc>".length()));
+        }
+      }
+      if (num != null) { qqs.add(new QualityQuery(num, map)); } // no <num> seen: nothing to add
+    }
+    if (qqs.size() != 50) {
+      throw new IllegalStateException("expected 50 topics in " + f + " but found " + qqs.size());
+    }
+    return qqs.toArray(new QualityQuery[qqs.size()]);
+  }
+}

Property changes on: lucene/core/src/test/org/apache/lucene/HantecRel.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java	(revision 1532951)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java	(working copy)
@@ -16,6 +16,7 @@
  */
 package org.apache.lucene.benchmark.quality.utils;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.benchmark.quality.QualityQuery;
 import org.apache.lucene.benchmark.quality.QualityQueryParser;
@@ -35,6 +36,7 @@
 
   private String qqNames[];
   private String indexField;
+  private Analyzer analyzer;
   ThreadLocal<QueryParser> queryParser = new ThreadLocal<QueryParser>();
 
   /**
@@ -43,8 +45,19 @@
    * @param indexField corresponding index field  
    */
   public SimpleQQParser(String qqNames[], String indexField) {
+    this(qqNames, indexField, new StandardAnalyzer(Version.LUCENE_CURRENT)); // same default analyzer parse() used before this patch
+  }
+  
+  /**
+   * Constructor of a simple qq parser that analyzes query text with the given analyzer.
+   * @param qqNames name-value pairs of quality query to use for creating the query
+   * @param indexField corresponding index field  
+   * @param analyzer analyzer handed to the QueryParser that parse() creates
+   */
+  public SimpleQQParser(String qqNames[], String indexField, Analyzer analyzer) {
     this.qqNames = qqNames;
     this.indexField = indexField;
+    this.analyzer = analyzer;
   }
 
   /**
@@ -63,7 +76,7 @@
   public Query parse(QualityQuery qq) throws ParseException {
     QueryParser qp = queryParser.get();
     if (qp==null) {
-      qp = new QueryParser(Version.LUCENE_CURRENT, indexField, new StandardAnalyzer(Version.LUCENE_CURRENT));
+      qp = new QueryParser(Version.LUCENE_CURRENT, indexField, analyzer);
       queryParser.set(qp);
     }
     BooleanQuery bq = new BooleanQuery();
