Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html
===================================================================
--- lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html (revision 0)
+++ lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html (revision 0)
@@ -0,0 +1,8 @@
+
+
+
+
+
+ 汉语
+
+
Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\test-files\html\test1.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt
===================================================================
--- lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt (revision 0)
+++ lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt (revision 0)
@@ -0,0 +1 @@
+contents:汉语
Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\test-files\queries.txt
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java
===================================================================
--- lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java (revision 0)
+++ lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java (revision 0)
@@ -0,0 +1,126 @@
+package org.apache.lucene.demo.html;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Properties;
+
+import org.apache.lucene.demo.html.HTMLParser;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestHtmlParser extends LuceneTestCase {
+
+ public void testUnicode() throws Exception {
+ String text = "汉语";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("汉语", parser);
+ }
+
+ public void testEntities() throws Exception {
+ String text = "汉语¥";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("汉语¥", parser);
+ }
+
+ public void testComments() throws Exception {
+ String text = "foo";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("foo", parser);
+ }
+
+ public void testScript() throws Exception {
+ String text = "foo";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("foo", parser);
+ }
+
+ public void testStyle() throws Exception {
+ String text = "" +
+ "foo";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("foo", parser);
+ }
+
+ public void testDoctype() throws Exception {
+ String text = "" +
+ "foo";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("foo", parser);
+ }
+
+ public void testMeta() throws Exception {
+ String text = "" +
+ "" +
+ "" +
+ "" +
+ "" +
+ "foobar";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ Properties tags = parser.getMetaTags();
+ assertEquals(4, tags.size());
+ assertEquals("1", tags.get("a"));
+ assertEquals("2", tags.get("b"));
+ assertEquals("this is a test", tags.get("keywords"));
+ assertEquals("text/html;charset=utf-8", tags.get("content-type"));
+ }
+
+ public void testTitle() throws Exception {
+ String text = "foobar";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertEquals("foo", parser.getTitle());
+ }
+
+ public void testSummary() throws Exception {
+ String text = "foo" +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "Summarize me. Summarize me. Summarize me. Summarize me. " +
+ "";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertEquals(200, parser.getSummary().length());
+ }
+
+ // LUCENE-2246
+ public void testTurkish() throws Exception {
+ String text = "" +
+ "
" +
+ "";
+ HTMLParser parser = new HTMLParser(new StringReader(text));
+ assertReadsTo("[ş]", parser);
+ }
+
+ private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
+ Reader reader = parser.getReader();
+ StringBuilder builder = new StringBuilder();
+ int ch = 0;
+ while ((ch = reader.read()) != -1) {
+ builder.append((char)ch);
+ }
+ assertEquals(expected, builder.toString());
+ }
+}
Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\html\TestHtmlParser.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java
===================================================================
--- lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java (revision 0)
+++ lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java (revision 0)
@@ -0,0 +1,46 @@
+package org.apache.lucene.demo;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDemo extends LuceneTestCase {
+ // LUCENE-589
+ public void testUnicodeHtml() throws Exception {
+ File dir = getDataFile("test-files/html");
+ File indexDir = new File(TEMP_DIR, "demoIndex");
+ IndexHTML.main(new String[] { "-create", "-index", indexDir.getPath(), dir.getPath() });
+ File queries = getDataFile("test-files/queries.txt");
+ PrintStream outSave = System.out;
+ try {
+ ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+ PrintStream fakeSystemOut = new PrintStream(bytes);
+ System.setOut(fakeSystemOut);
+ SearchFiles.main(new String[] { "-index", indexDir.getPath(), "-queries", queries.getPath()});
+ fakeSystemOut.flush();
+ String output = bytes.toString(); // intentionally use default encoding
+ assertTrue(output.contains("1 total matching documents"));
+ } finally {
+ System.setOut(outSave);
+ }
+ }
+}
Property changes on: lucene\contrib\demo\src\test\org\apache\lucene\demo\TestDemo.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java (revision 1031275)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java (working copy)
@@ -64,7 +64,8 @@
doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));
FileInputStream fis = new FileInputStream(f);
- HTMLParser parser = new HTMLParser(fis);
+ InputStreamReader reader = new InputStreamReader(fis, "UTF-8");
+ HTMLParser parser = new HTMLParser(reader);
// Add the tag-stripped contents as a Reader-valued Text field so it will
// get tokenized and indexed.
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java (revision 1031275)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java (working copy)
@@ -2,6 +2,7 @@
package org.apache.lucene.demo.html;
import java.io.*;
+import java.util.Locale;
import java.util.Properties;
public class HTMLParser implements HTMLParserConstants {
@@ -40,14 +41,6 @@
}
}
- /**
- * @deprecated Use HTMLParser(FileInputStream) instead
- */
- @Deprecated
- public HTMLParser(File file) throws FileNotFoundException {
- this(new FileInputStream(file));
- }
-
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
@@ -231,7 +224,7 @@
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
- String tagName = t1.image.toLowerCase();
+ String tagName = t1.image.toLowerCase(Locale.ENGLISH);
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
@@ -268,7 +261,7 @@
)
&& t2 != null)
{
- currentMetaTag=t2.image.toLowerCase();
+ currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
@@ -276,7 +269,7 @@
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
- currentMetaContent=t2.image.toLowerCase();
+ currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
@@ -454,18 +447,18 @@
finally { jj_save(1, xla); }
}
- private boolean jj_3_1() {
- if (jj_scan_token(ArgQuote1)) return true;
- if (jj_scan_token(CloseQuote1)) return true;
- return false;
- }
-
private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
+ private boolean jj_3_1() {
+ if (jj_scan_token(ArgQuote1)) return true;
+ if (jj_scan_token(CloseQuote1)) return true;
+ return false;
+ }
+
/** Generated Token Manager. */
public HTMLParserTokenManager token_source;
SimpleCharStream jj_input_stream;
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj (revision 1031275)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj (working copy)
@@ -19,9 +19,9 @@
options {
STATIC = false;
- OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_LOOKAHEAD = true;
//DEBUG_TOKEN_MANAGER = true;
+ UNICODE_INPUT = true;
}
PARSER_BEGIN(HTMLParser)
@@ -29,6 +29,7 @@
package org.apache.lucene.demo.html;
import java.io.*;
+import java.util.Locale;
import java.util.Properties;
public class HTMLParser {
@@ -67,14 +68,6 @@
}
}
- /**
- * @deprecated Use HTMLParser(FileInputStream) instead
- */
- @Deprecated
- public HTMLParser(File file) throws FileNotFoundException {
- this(new FileInputStream(file));
- }
-
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
@@ -231,7 +224,7 @@
}
{
t1= {
- String tagName = t1.image.toLowerCase();
+ String tagName = t1.image.toLowerCase(Locale.ENGLISH);
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
@@ -253,7 +246,7 @@
)
&& t2 != null)
{
- currentMetaTag=t2.image.toLowerCase();
+ currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
@@ -261,7 +254,7 @@
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
- currentMetaContent=t2.image.toLowerCase();
+ currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java (revision 1031275)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java (working copy)
@@ -1,6 +1,7 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */
package org.apache.lucene.demo.html;
import java.io.*;
+import java.util.Locale;
import java.util.Properties;
/** Token Manager. */
@@ -218,6 +219,9 @@
return jjMoveNfa_0(state, pos + 1);
}
static final long[] jjbitVec0 = {
+ 0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
+};
+static final long[] jjbitVec2 = {
0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL
};
private int jjMoveNfa_0(int startState, int curPos)
@@ -460,6 +464,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -468,7 +475,7 @@
{
case 22:
case 23:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 2)
kind = 2;
@@ -476,7 +483,7 @@
break;
case 26:
case 27:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 3)
kind = 3;
@@ -562,6 +569,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -570,7 +580,7 @@
{
case 1:
case 0:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 25)
kind = 25;
@@ -660,6 +670,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -667,7 +680,7 @@
switch(jjstateSet[--i])
{
case 0:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 29)
kind = 29;
@@ -753,6 +766,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -761,7 +777,7 @@
{
case 1:
case 0:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 23)
kind = 23;
@@ -876,6 +892,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -884,7 +903,7 @@
{
case 0:
case 1:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 19)
kind = 19;
@@ -1042,6 +1061,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -1050,7 +1072,7 @@
{
case 1:
case 0:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 27)
kind = 27;
@@ -1183,6 +1205,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -1191,14 +1216,14 @@
{
case 1:
case 0:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 14)
kind = 14;
jjCheckNAdd(0);
break;
case 3:
- if ((jjbitVec0[i2] & l2) != 0L)
+ if (jjCanMove_0(hiByte, i1, i2, l1, l2))
jjAddStates(18, 19);
break;
default : break;
@@ -1336,6 +1361,9 @@
}
else
{
+ int hiByte = (int)(curChar >> 8);
+ int i1 = hiByte >> 6;
+ long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@@ -1344,7 +1372,7 @@
{
case 0:
case 1:
- if ((jjbitVec0[i2] & l2) == 0L)
+ if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 16)
kind = 16;
@@ -1371,6 +1399,18 @@
20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0,
4, 6, 3, 4,
};
+private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
+{
+ switch(hiByte)
+ {
+ case 0:
+ return ((jjbitVec2[i2] & l2) != 0L);
+ default :
+ if ((jjbitVec0[i1] & l1) != 0L)
+ return true;
+ return false;
+ }
+}
/** Token literal values. */
public static final String[] jjstrLiteralImages = {
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java (revision 1031275)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java (working copy)
@@ -19,6 +19,7 @@
import java.io.BufferedReader;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
@@ -124,7 +125,7 @@
BufferedReader in = null;
if (queries != null) {
- in = new BufferedReader(new FileReader(queries));
+ in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
Index: lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java
===================================================================
--- lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java (revision 1031275)
+++ lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java (working copy)
@@ -18,7 +18,9 @@
*/
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileReader;
+import java.io.InputStreamReader;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
@@ -40,7 +42,7 @@
Reader field;
*/
public static Document Document(File f)
- throws java.io.FileNotFoundException {
+ throws java.io.IOException {
// make a new, empty document
Document doc = new Document();
@@ -58,9 +60,9 @@
// Add the contents of the file to a field named "contents". Specify a Reader,
// so that the text of the file is tokenized and indexed, but not stored.
- // Note that FileReader expects the file to be in the system's default encoding.
+ // Note that FileReader expects the file to be in UTF-8 encoding.
// If that's not the case searching for special characters will fail.
- doc.add(new Field("contents", new FileReader(f)));
+ doc.add(new Field("contents", new InputStreamReader(new FileInputStream(f), "UTF-8")));
// return the document
return doc;
Index: lucene/contrib/CHANGES.txt
===================================================================
--- lucene/contrib/CHANGES.txt (revision 1031275)
+++ lucene/contrib/CHANGES.txt (working copy)
@@ -134,6 +134,12 @@
* LUCENE-2616: FastVectorHighlighter: out of alignment when the first value is
empty in multiValued field (Koji Sekiguchi)
+
+* LUCENE-589: Fix contrib/demo for international documents.
+ (Curtis d'Entremont via Robert Muir)
+
+* LUCENE-2246: Fix contrib/demo for Turkish html documents.
+ (Selim Nadi via Robert Muir)
API Changes