Index: modules/analysis/icu/src/java/overview.html
===================================================================
--- modules/analysis/icu/src/java/overview.html	(revision 1074125)
+++ modules/analysis/icu/src/java/overview.html	(working copy)
@@ -66,12 +66,12 @@

Example Usages

Tokenizing multilanguage text

-
+
   /**
    * This tokenizer will work well in general for most languages.
    */
   Tokenizer tokenizer = new ICUTokenizer(reader);
-
+
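
A minimal sketch of consuming the tokens produced above, using the attribute-based iteration that also appears later in this patch (the output handling is illustrative):

   // Sketch: iterate over the tokens emitted by the ICUTokenizer above.
   CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
   tokenizer.reset();
   while (tokenizer.incrementToken()) {
     System.out.println(termAtt.toString());
   }
   tokenizer.end();
   tokenizer.close();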

Collation

@@ -111,7 +111,7 @@

Example Usages

Farsi Range Queries

-
+
   Collator collator = Collator.getInstance(new Locale("ar"));
   ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
   RAMDirectory ramDir = new RAMDirectory();
@@ -138,10 +138,10 @@
   ScoreDoc[] result
     = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
   assertEquals("The index Term should not be included.", 0, result.length);
-
+

Danish Sorting

-
+
   Analyzer analyzer 
     = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
   RAMDirectory indexStore = new RAMDirectory();
@@ -166,10 +166,10 @@
     Document doc = searcher.doc(result[i].doc);
     assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
   }
-
+

Turkish Case Normalization

-
+
   Collator collator = Collator.getInstance(new Locale("tr", "TR"));
   collator.setStrength(Collator.PRIMARY);
   Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
@@ -185,7 +185,7 @@
   Query query = parser.parse("d\u0131gy");   // U+0131: dotless i
   ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
   assertEquals("The index Term should be included.", 1, result.length);
-
+

Caveats and Comparisons

@@ -245,7 +245,7 @@

Example Usages

Normalizing text to NFC

-
+
   /**
    * Normalizer2 objects are unmodifiable and immutable.
    */
@@ -254,7 +254,7 @@
    * This filter will normalize to NFC.
    */
   TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, normalizer);
-
+

Case Folding

@@ -284,12 +284,12 @@

Example Usages

Lowercasing text

-
+
   /**
    * This filter will case-fold and normalize to NFKC.
    */
   TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer);
-
+

Search Term Folding

@@ -311,13 +311,13 @@

Example Usages

Removing accents

-
+
   /**
    * This filter will case-fold, remove accents and other distinctions, and
    * normalize to NFKC.
    */
   TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
-
+

Text Transformation

@@ -341,19 +341,19 @@

Example Usages

Convert Traditional to Simplified

-
+
   /**
    * This filter will map Traditional Chinese to Simplified Chinese
    */
   TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
-
+

Transliterate Serbian Cyrillic to Serbian Latin

-
+  
   /**
    * This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
    */
   TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
-
+

Backwards Compatibility

@@ -365,7 +365,7 @@

Example Usages

Restricting normalization to Unicode 5.0

-
+
   /**
    * This filter will do NFC normalization, but will ignore any characters that
    * did not exist as of Unicode 5.0. Because of the normalization stability policy
@@ -377,6 +377,6 @@
     set.freeze(); 
     FilteredNormalizer2 unicode50 = new FilteredNormalizer2(normalizer, set);
     TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, unicode50);
-
+
Index: modules/analysis/common/src/java/org/apache/lucene/collation/package.html
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/collation/package.html	(revision 1074125)
+++ modules/analysis/common/src/java/org/apache/lucene/collation/package.html	(working copy)
@@ -52,7 +52,7 @@

Example Usages

Farsi Range Queries

-
+
   // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
   Collator collator = Collator.getInstance(new Locale("ar"));
   CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
@@ -80,10 +80,10 @@
   ScoreDoc[] result
     = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
   assertEquals("The index Term should not be included.", 0, result.length);
-
+

Danish Sorting

-
+
   Analyzer analyzer 
     = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
   RAMDirectory indexStore = new RAMDirectory();
@@ -108,10 +108,10 @@
     Document doc = searcher.doc(result[i].doc);
     assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
   }
-
+

Turkish Case Normalization

-
+
   Collator collator = Collator.getInstance(new Locale("tr", "TR"));
   collator.setStrength(Collator.PRIMARY);
   Analyzer analyzer = new CollationKeyAnalyzer(collator);
@@ -127,7 +127,7 @@
   Query query = parser.parse("d\u0131gy");   // U+0131: dotless i
   ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
   assertEquals("The index Term should be included.", 1, result.length);
-
+

Caveats and Comparisons

Index: lucene/src/java/org/apache/lucene/analysis/package.html
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/package.html	(revision 1074125)
+++ lucene/src/java/org/apache/lucene/analysis/package.html	(working copy)
@@ -130,7 +130,7 @@
 However an application might invoke Analysis of any text for testing or for any other purpose, something like:
-

+  
       Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
       TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
       while (ts.incrementToken()) {
@@ -182,7 +182,7 @@
   This allows phrase search and proximity search to seamlessly cross 
   boundaries between these "sections".
   In other words, if a certain field "f" is added like this:
-  
+  
       document.add(new Field("f","first ends",...);
       document.add(new Field("f","starts two",...);
       indexWriter.addDocument(document);
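
Then, assuming the default position increment gap of 0, a phrase query that crosses the boundary between the two values of "f" would match; a minimal sketch:

      // Sketch: "ends" and "starts" occupy adjacent positions, so this phrase matches.
      PhraseQuery pq = new PhraseQuery();
      pq.add(new Term("f", "ends"));
      pq.add(new Term("f", "starts"));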
@@ -191,7 +191,7 @@
   Where desired, this behavior can be modified by introducing a "position gap" between consecutive field "sections", 
   simply by overriding 
   {@link org.apache.lucene.analysis.Analyzer#getPositionIncrementGap(java.lang.String) Analyzer.getPositionIncrementGap(fieldName)}:
-  
+  
       Analyzer myAnalyzer = new StandardAnalyzer() {
          public int getPositionIncrementGap(String fieldName) {
            return 10;
@@ -220,7 +220,7 @@
    tokens following a removed stop word, using
    {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
    This can be done with something like:
-   
+   
       public TokenStream tokenStream(final String fieldName, Reader reader) {
         final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
         TokenStream res = new TokenStream() {
@@ -334,7 +334,7 @@
 Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which
 utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
 

Whitespace tokenization

-
+
 public class MyAnalyzer extends Analyzer {
 
   public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -381,7 +381,7 @@
 

Adding a LengthFilter

We want to suppress all tokens that have two or fewer characters. We can do that easily by adding a LengthFilter to the chain. Only the tokenStream() method in our analyzer needs to be changed:
-
+
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream stream = new WhitespaceTokenizer(reader);
     stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
@@ -398,7 +398,7 @@
 API
 
Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
-
+
 public final class LengthFilter extends TokenFilter {
 
   final int min;
@@ -448,7 +448,7 @@
 

Adding a custom Attribute

Now we're going to implement our own custom Attribute for part-of-speech tagging and, accordingly, call it PartOfSpeechAttribute. First we need to define the interface of the new Attribute:
-
+
   public interface PartOfSpeechAttribute extends Attribute {
     public static enum PartOfSpeech {
       Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
@@ -470,7 +470,7 @@
 Now here is the actual class that implements our new Attribute. Notice that the class has to extend
 {@link org.apache.lucene.util.AttributeImpl}:
 
-
+
 public final class PartOfSpeechAttributeImpl extends AttributeImpl 
                             implements PartOfSpeechAttribute{
   
@@ -513,7 +513,7 @@
 new AttributeImpl class and therefore implements its abstract methods clear(), copyTo(), equals(), hashCode().
 Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
 that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
-
+
   public static class PartOfSpeechTaggingFilter extends TokenFilter {
     PartOfSpeechAttribute posAtt;
     CharTermAttribute termAtt;
@@ -544,7 +544,7 @@
 stores references in instance variables. Notice how you only need to pass in the interface of the new
 Attribute; instantiating the correct class is taken care of automatically.
 Now we need to add the filter to the chain:
-
+
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream stream = new WhitespaceTokenizer(reader);
     stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
@@ -564,7 +564,7 @@
 Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
 affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer
 to make use of the new PartOfSpeechAttribute and print it out:
-
+
   public static void main(String[] args) throws IOException {
     // text to tokenize
     final String text = "This is a demo of the new TokenStream API";
@@ -606,7 +606,7 @@
 of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
 as nouns if they are not the first word of a sentence (we know this is still not correct behavior, but hey, it's a good exercise). 
 As a small hint, this is how the new Attribute class could begin:
-
+
   public class FirstTokenOfSentenceAttributeImpl extends Attribute
                    implements FirstTokenOfSentenceAttribute {
     
Index: lucene/src/java/org/apache/lucene/search/spans/package.html
===================================================================
--- lucene/src/java/org/apache/lucene/search/spans/package.html	(revision 1074125)
+++ lucene/src/java/org/apache/lucene/search/spans/package.html	(working copy)
@@ -59,7 +59,7 @@
 

 For example, a span query which matches "John Kerry" within ten words of "George Bush" within the first 100 words of the document could be constructed with:
-

+
 SpanQuery john   = new SpanTermQuery(new Term("content", "john"));
 SpanQuery kerry  = new SpanTermQuery(new Term("content", "kerry"));
 SpanQuery george = new SpanTermQuery(new Term("content", "george"));
@@ -82,7 +82,7 @@
 So, for example, the above query can be restricted to documents which
 also use the word "iraq" with:
 
-
+
 Query query = new BooleanQuery();
 query.add(johnKerryNearGeorgeBushAtStart, true, false);
 query.add(new TermQuery("content", "iraq"), true, false);
Index: lucene/src/java/org/apache/lucene/search/function/package.html
===================================================================
--- lucene/src/java/org/apache/lucene/search/function/package.html	(revision 1074125)
+++ lucene/src/java/org/apache/lucene/search/function/package.html	(working copy)
@@ -130,14 +130,14 @@
    Using field (byte) values as scores:
     

 Indexing:
-

+    
       f = new Field("score", "7", Field.Store.NO, Field.Index.UN_TOKENIZED);
       f.setOmitNorms(true);
       d1.add(f);
     

 Search:
-

+    
       Query q = new FieldScoreQuery("score", FieldScoreQuery.Type.BYTE);
     
 Document d1 above would get a score of 7.
@@ -148,7 +148,7 @@

 Dividing the original score of each document by a square root of its docid (just to demonstrate what it takes to manipulate scores this way)
-

+    
       Query q = queryParser.parse("my query text");
       CustomScoreQuery customQ = new CustomScoreQuery(q) {
         public float customScore(int doc, float subQueryScore, float valSrcScore) {
@@ -158,7 +158,7 @@
     

 For more informative debug info on the custom query, also override the name() method:
-

+        
       CustomScoreQuery customQ = new CustomScoreQuery(q) {
         public float customScore(int doc, float subQueryScore, float valSrcScore) {
           return subQueryScore / Math.sqrt(docid);
@@ -171,7 +171,7 @@
         

 Taking the square root of the original score and multiplying it by a "short field driven score", ie, the short value that was indexed for the scored doc in a certain field:
-

+        
       Query q = queryParser.parse("my query text");
       FieldScoreQuery qf = new FieldScoreQuery("shortScore", FieldScoreQuery.Type.SHORT);
       CustomScoreQuery customQ = new CustomScoreQuery(q,qf) {
Index: lucene/src/java/org/apache/lucene/messages/package.html
===================================================================
--- lucene/src/java/org/apache/lucene/messages/package.html	(revision 1074125)
+++ lucene/src/java/org/apache/lucene/messages/package.html	(working copy)
@@ -45,7 +45,7 @@
 

 Lazy loading of Message Strings
-

+
 	public class MessagesTestBundle extends NLS {
 	
 	  private static final String BUNDLE_NAME = MessagesTestBundle.class.getName();
@@ -85,7 +85,7 @@
 

 Normal loading of Message Strings
-

+
 	String message1 = NLS.getLocalizedMessage(MessagesTestBundle.Q0004E_INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION);
 	String message2 = NLS.getLocalizedMessage(MessagesTestBundle.Q0004E_INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION, Locale.JAPANESE);
 
Index: lucene/contrib/queryparser/src/java/overview.html
===================================================================
--- lucene/contrib/queryparser/src/java/overview.html	(revision 1074125)
+++ lucene/contrib/queryparser/src/java/overview.html	(working copy)
@@ -131,7 +131,7 @@
 {@link org.apache.lucene.queryParser.standard.StandardQueryParser} usage:
-
+
       StandardQueryParser qpHelper = new StandardQueryParser();
       StandardQueryConfigHandler config =  qpHelper.getQueryConfigHandler();
       config.setAllowLeadingWildcard(true);
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html	(revision 1074125)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html	(working copy)
@@ -26,7 +26,7 @@
 
 

Example Usage

-
+
   //... Above, create documents with two fields, one with term vectors (tv) and one without (notv)
   IndexSearcher searcher = new IndexSearcher(directory);
   QueryParser parser = new QueryParser("notv", analyzer);
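
A hedged sketch of how the highlighter itself might then be applied (the query text and stored field content below are illustrative):

   Query query = parser.parse("some query text");
   Highlighter highlighter = new Highlighter(new QueryScorer(query));
   // Highlight the stored text of the field indexed without term vectors.
   String fragment = highlighter.getBestFragment(analyzer, "notv", "stored text of the notv field");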