Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html =================================================================== --- lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html (revision 0) +++ lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html (revision 0) @@ -0,0 +1,8 @@ + + + + + + 汉语 + + Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\test-files\html\test1.html ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt =================================================================== --- lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt (revision 0) +++ lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt (revision 0) @@ -0,0 +1 @@ +contents:汉语 Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\test-files\queries.txt ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java =================================================================== --- lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java (revision 0) +++ lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java (revision 0) @@ -0,0 +1,126 @@ +package org.apache.lucene.demo.html; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.Properties; + +import org.apache.lucene.demo.html.HTMLParser; +import org.apache.lucene.util.LuceneTestCase; + +public class TestHtmlParser extends LuceneTestCase { + + public void testUnicode() throws Exception { + String text = "汉语"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("汉语", parser); + } + + public void testEntities() throws Exception { + String text = "汉语¥"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("汉语¥", parser); + } + + public void testComments() throws Exception { + String text = "foo"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("foo", parser); + } + + public void testScript() throws Exception { + String text = "foo"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("foo", parser); + } + + public void testStyle() throws Exception { + String text = "" + + "foo"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("foo", parser); + } + + public void testDoctype() throws Exception { + String text = "" + + "foo"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("foo", parser); + } + + public void testMeta() throws Exception { + String text = "" + + "" + + "" + + "" + + "" + + "foobar"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + Properties tags = parser.getMetaTags(); + assertEquals(4, tags.size()); + assertEquals("1", 
tags.get("a")); + assertEquals("2", tags.get("b")); + assertEquals("this is a test", tags.get("keywords")); + assertEquals("text/html;charset=utf-8", tags.get("content-type")); + } + + public void testTitle() throws Exception { + String text = "foobar"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertEquals("foo", parser.getTitle()); + } + + public void testSummary() throws Exception { + String text = "foo" + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + "Summarize me. Summarize me. Summarize me. Summarize me. " + + ""; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertEquals(200, parser.getSummary().length()); + } + + // LUCENE-2246 + public void testTurkish() throws Exception { + String text = "" + + "\"ş\"" + + ""; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertReadsTo("[ş]", parser); + } + + private void assertReadsTo(String expected, HTMLParser parser) throws IOException { + Reader reader = parser.getReader(); + StringBuilder builder = new StringBuilder(); + int ch = 0; + while ((ch = reader.read()) != -1) { + builder.append((char)ch); + } + assertEquals(expected, builder.toString()); + } +} Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\html\TestHtmlParser.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java =================================================================== --- lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java (revision 0) +++ 
lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java (revision 0) @@ -0,0 +1,46 @@ +package org.apache.lucene.demo; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; + +import org.apache.lucene.util.LuceneTestCase; + +public class TestDemo extends LuceneTestCase { + // LUCENE-589 + public void testUnicodeHtml() throws Exception { + File dir = getDataFile("test-files/html"); + File indexDir = new File(TEMP_DIR, "demoIndex"); + IndexHTML.main(new String[] { "-create", "-index", indexDir.getPath(), dir.getPath() }); + File queries = getDataFile("test-files/queries.txt"); + PrintStream outSave = System.out; + try { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + PrintStream fakeSystemOut = new PrintStream(bytes); + System.setOut(fakeSystemOut); + SearchFiles.main(new String[] { "-index", indexDir.getPath(), "-queries", queries.getPath()}); + fakeSystemOut.flush(); + String output = bytes.toString(); // intentionally use default encoding + assertTrue(output.contains("1 total matching documents")); + } finally { + System.setOut(outSave); + } + } +} Property changes on: 
lucene\contrib\demo\src\test\org\apache\lucene\demo\TestDemo.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java (revision 1031275) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java (working copy) @@ -64,7 +64,8 @@ doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED)); FileInputStream fis = new FileInputStream(f); - HTMLParser parser = new HTMLParser(fis); + InputStreamReader reader = new InputStreamReader(fis, "UTF-8"); + HTMLParser parser = new HTMLParser(reader); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java (revision 1031275) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java (working copy) @@ -2,6 +2,7 @@ package org.apache.lucene.demo.html; import java.io.*; +import java.util.Locale; import java.util.Properties; public class HTMLParser implements HTMLParserConstants { @@ -40,14 +41,6 @@ } } - /** - * @deprecated Use HTMLParser(FileInputStream) instead - */ - @Deprecated - public HTMLParser(File file) throws FileNotFoundException { - this(new FileInputStream(file)); - } - public String getTitle() throws IOException, InterruptedException { if (pipeIn == null) getReader(); // spawn parsing thread @@ -231,7 +224,7 @@ Token t1, t2; boolean inImg = false; t1 = jj_consume_token(TagName); - String tagName = t1.image.toLowerCase(); + String tagName = t1.image.toLowerCase(Locale.ENGLISH); if(Tags.WS_ELEMS.contains(tagName) ) { addSpace(); } @@ -268,7 
+261,7 @@ ) && t2 != null) { - currentMetaTag=t2.image.toLowerCase(); + currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH); if(currentMetaTag != null && currentMetaContent != null) { addMetaTag(); } @@ -276,7 +269,7 @@ if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null) { - currentMetaContent=t2.image.toLowerCase(); + currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH); if(currentMetaTag != null && currentMetaContent != null) { addMetaTag(); } @@ -454,18 +447,18 @@ finally { jj_save(1, xla); } } - private boolean jj_3_1() { - if (jj_scan_token(ArgQuote1)) return true; - if (jj_scan_token(CloseQuote1)) return true; - return false; - } - private boolean jj_3_2() { if (jj_scan_token(ArgQuote2)) return true; if (jj_scan_token(CloseQuote2)) return true; return false; } + private boolean jj_3_1() { + if (jj_scan_token(ArgQuote1)) return true; + if (jj_scan_token(CloseQuote1)) return true; + return false; + } + /** Generated Token Manager. */ public HTMLParserTokenManager token_source; SimpleCharStream jj_input_stream; Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj (revision 1031275) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj (working copy) @@ -19,9 +19,9 @@ options { STATIC = false; - OPTIMIZE_TOKEN_MANAGER = true; //DEBUG_LOOKAHEAD = true; //DEBUG_TOKEN_MANAGER = true; + UNICODE_INPUT = true; } PARSER_BEGIN(HTMLParser) @@ -29,6 +29,7 @@ package org.apache.lucene.demo.html; import java.io.*; +import java.util.Locale; import java.util.Properties; public class HTMLParser { @@ -67,14 +68,6 @@ } } - /** - * @deprecated Use HTMLParser(FileInputStream) instead - */ - @Deprecated - public HTMLParser(File file) throws FileNotFoundException { - this(new FileInputStream(file)); - } - public String getTitle() throws IOException, InterruptedException { if 
(pipeIn == null) getReader(); // spawn parsing thread @@ -231,7 +224,7 @@ } { t1= { - String tagName = t1.image.toLowerCase(); + String tagName = t1.image.toLowerCase(Locale.ENGLISH); if(Tags.WS_ELEMS.contains(tagName) ) { addSpace(); } @@ -253,7 +246,7 @@ ) && t2 != null) { - currentMetaTag=t2.image.toLowerCase(); + currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH); if(currentMetaTag != null && currentMetaContent != null) { addMetaTag(); } @@ -261,7 +254,7 @@ if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null) { - currentMetaContent=t2.image.toLowerCase(); + currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH); if(currentMetaTag != null && currentMetaContent != null) { addMetaTag(); } Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java (revision 1031275) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java (working copy) @@ -1,6 +1,7 @@ /* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */ package org.apache.lucene.demo.html; import java.io.*; +import java.util.Locale; import java.util.Properties; /** Token Manager. 
*/ @@ -218,6 +219,9 @@ return jjMoveNfa_0(state, pos + 1); } static final long[] jjbitVec0 = { + 0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL +}; +static final long[] jjbitVec2 = { 0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL }; private int jjMoveNfa_0(int startState, int curPos) @@ -460,6 +464,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -468,7 +475,7 @@ { case 22: case 23: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 2) kind = 2; @@ -476,7 +483,7 @@ break; case 26: case 27: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 3) kind = 3; @@ -562,6 +569,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -570,7 +580,7 @@ { case 1: case 0: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 25) kind = 25; @@ -660,6 +670,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -667,7 +680,7 @@ switch(jjstateSet[--i]) { case 0: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 29) kind = 29; @@ -753,6 +766,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -761,7 +777,7 @@ { case 1: case 0: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 23) kind = 23; @@ -876,6 +892,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 
0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -884,7 +903,7 @@ { case 0: case 1: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 19) kind = 19; @@ -1042,6 +1061,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -1050,7 +1072,7 @@ { case 1: case 0: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 27) kind = 27; @@ -1183,6 +1205,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -1191,14 +1216,14 @@ { case 1: case 0: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 14) kind = 14; jjCheckNAdd(0); break; case 3: - if ((jjbitVec0[i2] & l2) != 0L) + if (jjCanMove_0(hiByte, i1, i2, l1, l2)) jjAddStates(18, 19); break; default : break; @@ -1336,6 +1361,9 @@ } else { + int hiByte = (int)(curChar >> 8); + int i1 = hiByte >> 6; + long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; long l2 = 1L << (curChar & 077); do @@ -1344,7 +1372,7 @@ { case 0: case 1: - if ((jjbitVec0[i2] & l2) == 0L) + if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) break; if (kind > 16) kind = 16; @@ -1371,6 +1399,18 @@ 20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0, 4, 6, 3, 4, }; +private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) +{ + switch(hiByte) + { + case 0: + return ((jjbitVec2[i2] & l2) != 0L); + default : + if ((jjbitVec0[i1] & l1) != 0L) + return true; + return false; + } +} /** Token literal values. 
*/ public static final String[] jjstrLiteralImages = { Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java (revision 1031275) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java (working copy) @@ -19,6 +19,7 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; @@ -124,7 +125,7 @@ BufferedReader in = null; if (queries != null) { - in = new BufferedReader(new FileReader(queries)); + in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); } Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java =================================================================== --- lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java (revision 1031275) +++ lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java (working copy) @@ -18,7 +18,9 @@ */ import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; +import java.io.InputStreamReader; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; @@ -40,7 +42,7 @@ Reader field; */ public static Document Document(File f) - throws java.io.FileNotFoundException { + throws java.io.IOException { // make a new, empty document Document doc = new Document(); @@ -58,9 +60,9 @@ // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. - // Note that FileReader expects the file to be in the system's default encoding. + // Note that the file is expected to be in UTF-8 encoding. 
// If that's not the case searching for special characters will fail. - doc.add(new Field("contents", new FileReader(f))); + doc.add(new Field("contents", new InputStreamReader(new FileInputStream(f), "UTF-8"))); // return the document return doc; Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1031275) +++ lucene/contrib/CHANGES.txt (working copy) @@ -134,6 +134,12 @@ * LUCENE-2616: FastVectorHighlighter: out of alignment when the first value is empty in multiValued field (Koji Sekiguchi) + +* LUCENE-589: Fix contrib/demo for international documents. + (Curtis d'Entremont via Robert Muir) + +* LUCENE-2246: Fix contrib/demo for Turkish html documents. + (Selim Nadi via Robert Muir) API Changes