(300);
- static final String[] encoder = new String[0x100];
-
- static final String decode(String entity) {
- if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon
- entity = entity.substring(0, entity.length()-1);
- if (entity.charAt(1) == '#') {
- int start = 2;
- int radix = 10;
- if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
- start++;
- radix = 16;
- }
- Character c =
- new Character((char)Integer.parseInt(entity.substring(start), radix));
- return c.toString();
- } else {
- String s = decoder.get(entity);
- if (s != null)
- return s;
- else return "";
- }
- }
-
- public static final String encode(String s) {
- int length = s.length();
- StringBuffer buffer = new StringBuffer(length * 2);
- for (int i = 0; i < length; i++) {
- int j = s.charAt(i);
- if (j < 0x100 && encoder[j] != null) {
- buffer.append(encoder[j]); // have a named encoding
- buffer.append(';');
- } else if (j < 0x80) {
- buffer.append((char) j); // use ASCII value
- } else {
- buffer.append(""); // use numeric encoding
- buffer.append(j).append(';');
- }
- }
- return buffer.toString();
- }
-
- static final void add(String entity, int value) {
- decoder.put(entity, (new Character((char)value)).toString());
- if (value < 0x100)
- encoder[value] = entity;
- }
-
- static {
- add(" ", 160);
- add("¡", 161);
- add("¢", 162);
- add("£", 163);
- add("¤", 164);
- add("¥", 165);
- add("¦", 166);
- add("§", 167);
- add("¨", 168);
- add("©", 169);
- add("ª", 170);
- add("«", 171);
- add("¬", 172);
- add("­", 173);
- add("®", 174);
- add("¯", 175);
- add("°", 176);
- add("±", 177);
- add("²", 178);
- add("³", 179);
- add("´", 180);
- add("µ", 181);
- add("¶", 182);
- add("·", 183);
- add("¸", 184);
- add("¹", 185);
- add("º", 186);
- add("»", 187);
- add("¼", 188);
- add("½", 189);
- add("¾", 190);
- add("¿", 191);
- add("À", 192);
- add("Á", 193);
- add("Â", 194);
- add("Ã", 195);
- add("Ä", 196);
- add("Å", 197);
- add("Æ", 198);
- add("Ç", 199);
- add("È", 200);
- add("É", 201);
- add("Ê", 202);
- add("Ë", 203);
- add("Ì", 204);
- add("Í", 205);
- add("Î", 206);
- add("Ï", 207);
- add("Ð", 208);
- add("Ñ", 209);
- add("Ò", 210);
- add("Ó", 211);
- add("Ô", 212);
- add("Õ", 213);
- add("Ö", 214);
- add("×", 215);
- add("Ø", 216);
- add("Ù", 217);
- add("Ú", 218);
- add("Û", 219);
- add("Ü", 220);
- add("Ý", 221);
- add("Þ", 222);
- add("ß", 223);
- add("à", 224);
- add("á", 225);
- add("â", 226);
- add("ã", 227);
- add("ä", 228);
- add("å", 229);
- add("æ", 230);
- add("ç", 231);
- add("è", 232);
- add("é", 233);
- add("ê", 234);
- add("ë", 235);
- add("ì", 236);
- add("í", 237);
- add("î", 238);
- add("ï", 239);
- add("ð", 240);
- add("ñ", 241);
- add("ò", 242);
- add("ó", 243);
- add("ô", 244);
- add("õ", 245);
- add("ö", 246);
- add("÷", 247);
- add("ø", 248);
- add("ù", 249);
- add("ú", 250);
- add("û", 251);
- add("ü", 252);
- add("ý", 253);
- add("þ", 254);
- add("ÿ", 255);
- add("&fnof", 402);
- add("&Alpha", 913);
- add("&Beta", 914);
- add("&Gamma", 915);
- add("&Delta", 916);
- add("&Epsilon",917);
- add("&Zeta", 918);
- add("&Eta", 919);
- add("&Theta", 920);
- add("&Iota", 921);
- add("&Kappa", 922);
- add("&Lambda", 923);
- add("&Mu", 924);
- add("&Nu", 925);
- add("&Xi", 926);
- add("&Omicron",927);
- add("&Pi", 928);
- add("&Rho", 929);
- add("&Sigma", 931);
- add("&Tau", 932);
- add("&Upsilon",933);
- add("&Phi", 934);
- add("&Chi", 935);
- add("&Psi", 936);
- add("&Omega", 937);
- add("&alpha", 945);
- add("&beta", 946);
- add("&gamma", 947);
- add("&delta", 948);
- add("&epsilon",949);
- add("&zeta", 950);
- add("&eta", 951);
- add("&theta", 952);
- add("&iota", 953);
- add("&kappa", 954);
- add("&lambda", 955);
- add("&mu", 956);
- add("&nu", 957);
- add("&xi", 958);
- add("&omicron",959);
- add("&pi", 960);
- add("&rho", 961);
- add("&sigmaf", 962);
- add("&sigma", 963);
- add("&tau", 964);
- add("&upsilon",965);
- add("&phi", 966);
- add("&chi", 967);
- add("&psi", 968);
- add("&omega", 969);
- add("&thetasym",977);
- add("&upsih", 978);
- add("&piv", 982);
- add("&bull", 8226);
- add("&hellip", 8230);
- add("&prime", 8242);
- add("&Prime", 8243);
- add("&oline", 8254);
- add("&frasl", 8260);
- add("&weierp", 8472);
- add("&image", 8465);
- add("&real", 8476);
- add("&trade", 8482);
- add("&alefsym",8501);
- add("&larr", 8592);
- add("&uarr", 8593);
- add("&rarr", 8594);
- add("&darr", 8595);
- add("&harr", 8596);
- add("&crarr", 8629);
- add("&lArr", 8656);
- add("&uArr", 8657);
- add("&rArr", 8658);
- add("&dArr", 8659);
- add("&hArr", 8660);
- add("&forall", 8704);
- add("&part", 8706);
- add("&exist", 8707);
- add("&empty", 8709);
- add("&nabla", 8711);
- add("&isin", 8712);
- add("¬in", 8713);
- add("&ni", 8715);
- add("&prod", 8719);
- add("&sum", 8721);
- add("&minus", 8722);
- add("&lowast", 8727);
- add("&radic", 8730);
- add("&prop", 8733);
- add("&infin", 8734);
- add("&ang", 8736);
- add("&and", 8743);
- add("&or", 8744);
- add("&cap", 8745);
- add("&cup", 8746);
- add("&int", 8747);
- add("&there4", 8756);
- add("&sim", 8764);
- add("&cong", 8773);
- add("&asymp", 8776);
- add("&ne", 8800);
- add("&equiv", 8801);
- add("&le", 8804);
- add("&ge", 8805);
- add("&sub", 8834);
- add("&sup", 8835);
- add("&nsub", 8836);
- add("&sube", 8838);
- add("&supe", 8839);
- add("&oplus", 8853);
- add("&otimes", 8855);
- add("&perp", 8869);
- add("&sdot", 8901);
- add("&lceil", 8968);
- add("&rceil", 8969);
- add("&lfloor", 8970);
- add("&rfloor", 8971);
- add("&lang", 9001);
- add("&rang", 9002);
- add("&loz", 9674);
- add("&spades", 9824);
- add("&clubs", 9827);
- add("&hearts", 9829);
- add("&diams", 9830);
- add(""", 34);
- add("&", 38);
- add("<", 60);
- add(">", 62);
- add("&OElig", 338);
- add("&oelig", 339);
- add("&Scaron", 352);
- add("&scaron", 353);
- add("&Yuml", 376);
- add("&circ", 710);
- add("&tilde", 732);
- add("&ensp", 8194);
- add("&emsp", 8195);
- add("&thinsp", 8201);
- add("&zwnj", 8204);
- add("&zwj", 8205);
- add("&lrm", 8206);
- add("&rlm", 8207);
- add("&ndash", 8211);
- add("&mdash", 8212);
- add("&lsquo", 8216);
- add("&rsquo", 8217);
- add("&sbquo", 8218);
- add("&ldquo", 8220);
- add("&rdquo", 8221);
- add("&bdquo", 8222);
- add("&dagger", 8224);
- add("&Dagger", 8225);
- add("&permil", 8240);
- add("&lsaquo", 8249);
- add("&rsaquo", 8250);
- add("&euro", 8364);
-
- }
-}
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (working copy)
@@ -1,123 +0,0 @@
-// FastCharStream.java
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-import java.io.*;
-
-/** An efficient implementation of JavaCC's CharStream interface. Note that
- * this does not do line-number counting, but instead keeps track of the
- * character position of the token in the input, as required by Lucene's {@link
- * org.apache.lucene.analysis.Token} API.
- * */
-public final class FastCharStream implements CharStream {
- char[] buffer = null;
-
- int bufferLength = 0; // end of valid chars
- int bufferPosition = 0; // next char to read
-
- int tokenStart = 0; // offset in buffer
- int bufferStart = 0; // position in file of buffer
-
- Reader input; // source of chars
-
- /** Constructs from a Reader. */
- public FastCharStream(Reader r) {
- input = r;
- }
-
- public final char readChar() throws IOException {
- if (bufferPosition >= bufferLength)
- refill();
- return buffer[bufferPosition++];
- }
-
- private final void refill() throws IOException {
- int newPosition = bufferLength - tokenStart;
-
- if (tokenStart == 0) { // token won't fit in buffer
- if (buffer == null) { // first time: alloc buffer
- buffer = new char[2048];
- } else if (bufferLength == buffer.length) { // grow buffer
- char[] newBuffer = new char[buffer.length*2];
- System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
- buffer = newBuffer;
- }
- } else { // shift token to front
- System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
- }
-
- bufferLength = newPosition; // update state
- bufferPosition = newPosition;
- bufferStart += tokenStart;
- tokenStart = 0;
-
- int charsRead = // fill space in buffer
- input.read(buffer, newPosition, buffer.length-newPosition);
- if (charsRead == -1)
- throw new IOException("read past eof");
- else
- bufferLength += charsRead;
- }
-
- public final char BeginToken() throws IOException {
- tokenStart = bufferPosition;
- return readChar();
- }
-
- public final void backup(int amount) {
- bufferPosition -= amount;
- }
-
- public final String GetImage() {
- return new String(buffer, tokenStart, bufferPosition - tokenStart);
- }
-
- public final char[] GetSuffix(int len) {
- char[] value = new char[len];
- System.arraycopy(buffer, bufferPosition - len, value, 0, len);
- return value;
- }
-
- public final void Done() {
- try {
- input.close();
- } catch (IOException e) {
- }
- }
-
- public final int getColumn() {
- return bufferStart + bufferPosition;
- }
- public final int getLine() {
- return 1;
- }
- public final int getEndColumn() {
- return bufferStart + bufferPosition;
- }
- public final int getEndLine() {
- return 1;
- }
- public final int getBeginColumn() {
- return bufferStart + tokenStart;
- }
- public final int getBeginLine() {
- return 1;
- }
-}
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java (working copy)
@@ -1,198 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
-/* JavaCCOptions:KEEP_LINE_COL=null */
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-/**
- * This exception is thrown when parse errors are encountered.
- * You can explicitly create objects of this exception type by
- * calling the method generateParseException in the generated
- * parser.
- *
- * You can modify this class to customize your error reporting
- * mechanisms so long as you retain the public fields.
- */
-public class ParseException extends Exception {
-
- /**
- * This constructor is used by the method "generateParseException"
- * in the generated parser. Calling this constructor generates
- * a new object of this type with the fields "currentToken",
- * "expectedTokenSequences", and "tokenImage" set. The boolean
- * flag "specialConstructor" is also set to true to indicate that
- * this constructor was used to create this object.
- * This constructor calls its super class with the empty string
- * to force the "toString" method of parent class "Throwable" to
- * print the error message in the form:
- * ParseException:
- */
- public ParseException(Token currentTokenVal,
- int[][] expectedTokenSequencesVal,
- String[] tokenImageVal
- )
- {
- super("");
- specialConstructor = true;
- currentToken = currentTokenVal;
- expectedTokenSequences = expectedTokenSequencesVal;
- tokenImage = tokenImageVal;
- }
-
- /**
- * The following constructors are for use by you for whatever
- * purpose you can think of. Constructing the exception in this
- * manner makes the exception behave in the normal way - i.e., as
- * documented in the class "Throwable". The fields "errorToken",
- * "expectedTokenSequences", and "tokenImage" do not contain
- * relevant information. The JavaCC generated code does not use
- * these constructors.
- */
-
- public ParseException() {
- super();
- specialConstructor = false;
- }
-
- /** Constructor with message. */
- public ParseException(String message) {
- super(message);
- specialConstructor = false;
- }
-
- /**
- * This variable determines which constructor was used to create
- * this object and thereby affects the semantics of the
- * "getMessage" method (see below).
- */
- protected boolean specialConstructor;
-
- /**
- * This is the last token that has been consumed successfully. If
- * this object has been created due to a parse error, the token
- * followng this token will (therefore) be the first error token.
- */
- public Token currentToken;
-
- /**
- * Each entry in this array is an array of integers. Each array
- * of integers represents a sequence of tokens (by their ordinal
- * values) that is expected at this point of the parse.
- */
- public int[][] expectedTokenSequences;
-
- /**
- * This is a reference to the "tokenImage" array of the generated
- * parser within which the parse error occurred. This array is
- * defined in the generated ...Constants interface.
- */
- public String[] tokenImage;
-
- /**
- * This method has the standard behavior when this object has been
- * created using the standard constructors. Otherwise, it uses
- * "currentToken" and "expectedTokenSequences" to generate a parse
- * error message and returns it. If this object has been created
- * due to a parse error, and you do not catch it (it gets thrown
- * from the parser), then this method is called during the printing
- * of the final stack trace, and hence the correct error message
- * gets displayed.
- */
- public String getMessage() {
- if (!specialConstructor) {
- return super.getMessage();
- }
- StringBuffer expected = new StringBuffer();
- int maxSize = 0;
- for (int i = 0; i < expectedTokenSequences.length; i++) {
- if (maxSize < expectedTokenSequences[i].length) {
- maxSize = expectedTokenSequences[i].length;
- }
- for (int j = 0; j < expectedTokenSequences[i].length; j++) {
- expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
- }
- if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
- expected.append("...");
- }
- expected.append(eol).append(" ");
- }
- String retval = "Encountered \"";
- Token tok = currentToken.next;
- for (int i = 0; i < maxSize; i++) {
- if (i != 0) retval += " ";
- if (tok.kind == 0) {
- retval += tokenImage[0];
- break;
- }
- retval += " " + tokenImage[tok.kind];
- retval += " \"";
- retval += add_escapes(tok.image);
- retval += " \"";
- tok = tok.next;
- }
- retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
- retval += "." + eol;
- if (expectedTokenSequences.length == 1) {
- retval += "Was expecting:" + eol + " ";
- } else {
- retval += "Was expecting one of:" + eol + " ";
- }
- retval += expected.toString();
- return retval;
- }
-
- /**
- * The end of line string for this machine.
- */
- protected String eol = System.getProperty("line.separator", "\n");
-
- /**
- * Used to convert raw characters to their escaped version
- * when these raw version cannot be used as part of an ASCII
- * string literal.
- */
- protected String add_escapes(String str) {
- StringBuffer retval = new StringBuffer();
- char ch;
- for (int i = 0; i < str.length(); i++) {
- switch (str.charAt(i))
- {
- case 0 :
- continue;
- case '\b':
- retval.append("\\b");
- continue;
- case '\t':
- retval.append("\\t");
- continue;
- case '\n':
- retval.append("\\n");
- continue;
- case '\f':
- retval.append("\\f");
- continue;
- case '\r':
- retval.append("\\r");
- continue;
- case '\"':
- retval.append("\\\"");
- continue;
- case '\'':
- retval.append("\\\'");
- continue;
- case '\\':
- retval.append("\\\\");
- continue;
- default:
- if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
- String s = "0000" + Integer.toString(ch, 16);
- retval.append("\\u" + s.substring(s.length() - 4, s.length()));
- } else {
- retval.append(ch);
- }
- continue;
- }
- }
- return retval.toString();
- }
-
-}
-/* JavaCC - OriginalChecksum=e449d0e43f3d85deb1260a88b7e90fcd (do not edit this line) */
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (working copy)
@@ -1,112 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */
-/* JavaCCOptions:STATIC=false */
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-/**
- * This interface describes a character stream that maintains line and
- * column number positions of the characters. It also has the capability
- * to backup the stream to some extent. An implementation of this
- * interface is used in the TokenManager implementation generated by
- * JavaCCParser.
- *
- * All the methods except backup can be implemented in any fashion. backup
- * needs to be implemented correctly for the correct operation of the lexer.
- * Rest of the methods are all used to get information like line number,
- * column number and the String that constitutes a token and are not used
- * by the lexer. Hence their implementation won't affect the generated lexer's
- * operation.
- */
-
-public interface CharStream {
-
- /**
- * Returns the next character from the selected input. The method
- * of selecting the input is the responsibility of the class
- * implementing this interface. Can throw any java.io.IOException.
- */
- char readChar() throws java.io.IOException;
-
- /**
- * Returns the column position of the character last read.
- * @deprecated
- * @see #getEndColumn
- */
- int getColumn();
-
- /**
- * Returns the line number of the character last read.
- * @deprecated
- * @see #getEndLine
- */
- int getLine();
-
- /**
- * Returns the column number of the last character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getEndColumn();
-
- /**
- * Returns the line number of the last character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getEndLine();
-
- /**
- * Returns the column number of the first character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getBeginColumn();
-
- /**
- * Returns the line number of the first character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getBeginLine();
-
- /**
- * Backs up the input stream by amount steps. Lexer calls this method if it
- * had already read some characters, but could not use them to match a
- * (longer) token. So, they will be used again as the prefix of the next
- * token and it is the implemetation's responsibility to do this right.
- */
- void backup(int amount);
-
- /**
- * Returns the next character that marks the beginning of the next token.
- * All characters must remain in the buffer between two successive calls
- * to this method to implement backup correctly.
- */
- char BeginToken() throws java.io.IOException;
-
- /**
- * Returns a string made up of characters from the marked token beginning
- * to the current buffer position. Implementations have the choice of returning
- * anything that they want to. For example, for efficiency, one might decide
- * to just return null, which is a valid implementation.
- */
- String GetImage();
-
- /**
- * Returns an array of characters that make up the suffix of length 'len' for
- * the currently matched token. This is used to build up the matched string
- * for use in actions in the case of MORE. A simple and inefficient
- * implementation of this is as follows :
- *
- * {
- * String t = GetImage();
- * return t.substring(t.length() - len, t.length()).toCharArray();
- * }
- */
- char[] GetSuffix(int len);
-
- /**
- * The lexer calls this function to indicate that it is done with the stream
- * and hence implementations can free any resources held by this class.
- * Again, the body of this function can be just empty and it will not
- * affect the lexer's operation.
- */
- void Done();
-
-}
-/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (working copy)
@@ -1,112 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */
-/* JavaCCOptions:STATIC=false */
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-/**
- * This interface describes a character stream that maintains line and
- * column number positions of the characters. It also has the capability
- * to backup the stream to some extent. An implementation of this
- * interface is used in the TokenManager implementation generated by
- * JavaCCParser.
- *
- * All the methods except backup can be implemented in any fashion. backup
- * needs to be implemented correctly for the correct operation of the lexer.
- * Rest of the methods are all used to get information like line number,
- * column number and the String that constitutes a token and are not used
- * by the lexer. Hence their implementation won't affect the generated lexer's
- * operation.
- */
-
-public interface CharStream {
-
- /**
- * Returns the next character from the selected input. The method
- * of selecting the input is the responsibility of the class
- * implementing this interface. Can throw any java.io.IOException.
- */
- char readChar() throws java.io.IOException;
-
- /**
- * Returns the column position of the character last read.
- * @deprecated
- * @see #getEndColumn
- */
- int getColumn();
-
- /**
- * Returns the line number of the character last read.
- * @deprecated
- * @see #getEndLine
- */
- int getLine();
-
- /**
- * Returns the column number of the last character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getEndColumn();
-
- /**
- * Returns the line number of the last character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getEndLine();
-
- /**
- * Returns the column number of the first character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getBeginColumn();
-
- /**
- * Returns the line number of the first character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getBeginLine();
-
- /**
- * Backs up the input stream by amount steps. Lexer calls this method if it
- * had already read some characters, but could not use them to match a
- * (longer) token. So, they will be used again as the prefix of the next
- * token and it is the implemetation's responsibility to do this right.
- */
- void backup(int amount);
-
- /**
- * Returns the next character that marks the beginning of the next token.
- * All characters must remain in the buffer between two successive calls
- * to this method to implement backup correctly.
- */
- char BeginToken() throws java.io.IOException;
-
- /**
- * Returns a string made up of characters from the marked token beginning
- * to the current buffer position. Implementations have the choice of returning
- * anything that they want to. For example, for efficiency, one might decide
- * to just return null, which is a valid implementation.
- */
- String GetImage();
-
- /**
- * Returns an array of characters that make up the suffix of length 'len' for
- * the currently matched token. This is used to build up the matched string
- * for use in actions in the case of MORE. A simple and inefficient
- * implementation of this is as follows :
- *
- * {
- * String t = GetImage();
- * return t.substring(t.length() - len, t.length()).toCharArray();
- * }
- */
- char[] GetSuffix(int len);
-
- /**
- * The lexer calls this function to indicate that it is done with the stream
- * and hence implementations can free any resources held by this class.
- * Again, the body of this function can be just empty and it will not
- * affect the lexer's operation.
- */
- void Done();
-
-}
-/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java (working copy)
@@ -1,330 +0,0 @@
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * Utility class for encoding and decoding HTML entities.
- */
-public class Entities {
- static final Map decoder = new HashMap(300);
- static final String[] encoder = new String[0x100];
-
- static final String decode(String entity) {
- if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon
- entity = entity.substring(0, entity.length()-1);
- if (entity.charAt(1) == '#') {
- int start = 2;
- int radix = 10;
- if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
- start++;
- radix = 16;
- }
- Character c =
- new Character((char)Integer.parseInt(entity.substring(start), radix));
- return c.toString();
- } else {
- String s = decoder.get(entity);
- if (s != null)
- return s;
- else return "";
- }
- }
-
- public static final String encode(String s) {
- int length = s.length();
- StringBuffer buffer = new StringBuffer(length * 2);
- for (int i = 0; i < length; i++) {
- int j = s.charAt(i);
- if (j < 0x100 && encoder[j] != null) {
- buffer.append(encoder[j]); // have a named encoding
- buffer.append(';');
- } else if (j < 0x80) {
- buffer.append((char) j); // use ASCII value
- } else {
- buffer.append(""); // use numeric encoding
- buffer.append(j).append(';');
- }
- }
- return buffer.toString();
- }
-
- static final void add(String entity, int value) {
- decoder.put(entity, (new Character((char)value)).toString());
- if (value < 0x100)
- encoder[value] = entity;
- }
-
- static {
- add(" ", 160);
- add("¡", 161);
- add("¢", 162);
- add("£", 163);
- add("¤", 164);
- add("¥", 165);
- add("¦", 166);
- add("§", 167);
- add("¨", 168);
- add("©", 169);
- add("ª", 170);
- add("«", 171);
- add("¬", 172);
- add("­", 173);
- add("®", 174);
- add("¯", 175);
- add("°", 176);
- add("±", 177);
- add("²", 178);
- add("³", 179);
- add("´", 180);
- add("µ", 181);
- add("¶", 182);
- add("·", 183);
- add("¸", 184);
- add("¹", 185);
- add("º", 186);
- add("»", 187);
- add("¼", 188);
- add("½", 189);
- add("¾", 190);
- add("¿", 191);
- add("À", 192);
- add("Á", 193);
- add("Â", 194);
- add("Ã", 195);
- add("Ä", 196);
- add("Å", 197);
- add("Æ", 198);
- add("Ç", 199);
- add("È", 200);
- add("É", 201);
- add("Ê", 202);
- add("Ë", 203);
- add("Ì", 204);
- add("Í", 205);
- add("Î", 206);
- add("Ï", 207);
- add("Ð", 208);
- add("Ñ", 209);
- add("Ò", 210);
- add("Ó", 211);
- add("Ô", 212);
- add("Õ", 213);
- add("Ö", 214);
- add("×", 215);
- add("Ø", 216);
- add("Ù", 217);
- add("Ú", 218);
- add("Û", 219);
- add("Ü", 220);
- add("Ý", 221);
- add("Þ", 222);
- add("ß", 223);
- add("à", 224);
- add("á", 225);
- add("â", 226);
- add("ã", 227);
- add("ä", 228);
- add("å", 229);
- add("æ", 230);
- add("ç", 231);
- add("è", 232);
- add("é", 233);
- add("ê", 234);
- add("ë", 235);
- add("ì", 236);
- add("í", 237);
- add("î", 238);
- add("ï", 239);
- add("ð", 240);
- add("ñ", 241);
- add("ò", 242);
- add("ó", 243);
- add("ô", 244);
- add("õ", 245);
- add("ö", 246);
- add("÷", 247);
- add("ø", 248);
- add("ù", 249);
- add("ú", 250);
- add("û", 251);
- add("ü", 252);
- add("ý", 253);
- add("þ", 254);
- add("ÿ", 255);
- add("&fnof", 402);
- add("&Alpha", 913);
- add("&Beta", 914);
- add("&Gamma", 915);
- add("&Delta", 916);
- add("&Epsilon",917);
- add("&Zeta", 918);
- add("&Eta", 919);
- add("&Theta", 920);
- add("&Iota", 921);
- add("&Kappa", 922);
- add("&Lambda", 923);
- add("&Mu", 924);
- add("&Nu", 925);
- add("&Xi", 926);
- add("&Omicron",927);
- add("&Pi", 928);
- add("&Rho", 929);
- add("&Sigma", 931);
- add("&Tau", 932);
- add("&Upsilon",933);
- add("&Phi", 934);
- add("&Chi", 935);
- add("&Psi", 936);
- add("&Omega", 937);
- add("&alpha", 945);
- add("&beta", 946);
- add("&gamma", 947);
- add("&delta", 948);
- add("&epsilon",949);
- add("&zeta", 950);
- add("&eta", 951);
- add("&theta", 952);
- add("&iota", 953);
- add("&kappa", 954);
- add("&lambda", 955);
- add("&mu", 956);
- add("&nu", 957);
- add("&xi", 958);
- add("&omicron",959);
- add("&pi", 960);
- add("&rho", 961);
- add("&sigmaf", 962);
- add("&sigma", 963);
- add("&tau", 964);
- add("&upsilon",965);
- add("&phi", 966);
- add("&chi", 967);
- add("&psi", 968);
- add("&omega", 969);
- add("&thetasym",977);
- add("&upsih", 978);
- add("&piv", 982);
- add("&bull", 8226);
- add("&hellip", 8230);
- add("&prime", 8242);
- add("&Prime", 8243);
- add("&oline", 8254);
- add("&frasl", 8260);
- add("&weierp", 8472);
- add("&image", 8465);
- add("&real", 8476);
- add("&trade", 8482);
- add("&alefsym",8501);
- add("&larr", 8592);
- add("&uarr", 8593);
- add("&rarr", 8594);
- add("&darr", 8595);
- add("&harr", 8596);
- add("&crarr", 8629);
- add("&lArr", 8656);
- add("&uArr", 8657);
- add("&rArr", 8658);
- add("&dArr", 8659);
- add("&hArr", 8660);
- add("&forall", 8704);
- add("&part", 8706);
- add("&exist", 8707);
- add("&empty", 8709);
- add("&nabla", 8711);
- add("&isin", 8712);
- add("¬in", 8713);
- add("&ni", 8715);
- add("&prod", 8719);
- add("&sum", 8721);
- add("&minus", 8722);
- add("&lowast", 8727);
- add("&radic", 8730);
- add("&prop", 8733);
- add("&infin", 8734);
- add("&ang", 8736);
- add("&and", 8743);
- add("&or", 8744);
- add("&cap", 8745);
- add("&cup", 8746);
- add("&int", 8747);
- add("&there4", 8756);
- add("&sim", 8764);
- add("&cong", 8773);
- add("&asymp", 8776);
- add("&ne", 8800);
- add("&equiv", 8801);
- add("&le", 8804);
- add("&ge", 8805);
- add("&sub", 8834);
- add("&sup", 8835);
- add("&nsub", 8836);
- add("&sube", 8838);
- add("&supe", 8839);
- add("&oplus", 8853);
- add("&otimes", 8855);
- add("&perp", 8869);
- add("&sdot", 8901);
- add("&lceil", 8968);
- add("&rceil", 8969);
- add("&lfloor", 8970);
- add("&rfloor", 8971);
- add("&lang", 9001);
- add("&rang", 9002);
- add("&loz", 9674);
- add("&spades", 9824);
- add("&clubs", 9827);
- add("&hearts", 9829);
- add("&diams", 9830);
- add(""", 34);
- add("&", 38);
- add("<", 60);
- add(">", 62);
- add("&OElig", 338);
- add("&oelig", 339);
- add("&Scaron", 352);
- add("&scaron", 353);
- add("&Yuml", 376);
- add("&circ", 710);
- add("&tilde", 732);
- add("&ensp", 8194);
- add("&emsp", 8195);
- add("&thinsp", 8201);
- add("&zwnj", 8204);
- add("&zwj", 8205);
- add("&lrm", 8206);
- add("&rlm", 8207);
- add("&ndash", 8211);
- add("&mdash", 8212);
- add("&lsquo", 8216);
- add("&rsquo", 8217);
- add("&sbquo", 8218);
- add("&ldquo", 8220);
- add("&rdquo", 8221);
- add("&bdquo", 8222);
- add("&dagger", 8224);
- add("&Dagger", 8225);
- add("&permil", 8240);
- add("&lsaquo", 8249);
- add("&rsaquo", 8250);
- add("&euro", 8364);
-
- }
-}
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (working copy)
@@ -1,123 +0,0 @@
-// FastCharStream.java
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-import java.io.*;
-
-/** An efficient implementation of JavaCC's CharStream interface. Note that
- * this does not do line-number counting, but instead keeps track of the
- * character position of the token in the input, as required by Lucene's {@link
- * org.apache.lucene.analysis.Token} API.
- * */
-public final class FastCharStream implements CharStream {
- char[] buffer = null;
-
- int bufferLength = 0; // end of valid chars
- int bufferPosition = 0; // next char to read
-
- int tokenStart = 0; // offset in buffer
- int bufferStart = 0; // position in file of buffer
-
- Reader input; // source of chars
-
- /** Constructs from a Reader. */
- public FastCharStream(Reader r) {
- input = r;
- }
-
- public final char readChar() throws IOException {
- if (bufferPosition >= bufferLength)
- refill();
- return buffer[bufferPosition++];
- }
-
- private final void refill() throws IOException {
- int newPosition = bufferLength - tokenStart;
-
- if (tokenStart == 0) { // token won't fit in buffer
- if (buffer == null) { // first time: alloc buffer
- buffer = new char[2048];
- } else if (bufferLength == buffer.length) { // grow buffer
- char[] newBuffer = new char[buffer.length*2];
- System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
- buffer = newBuffer;
- }
- } else { // shift token to front
- System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
- }
-
- bufferLength = newPosition; // update state
- bufferPosition = newPosition;
- bufferStart += tokenStart;
- tokenStart = 0;
-
- int charsRead = // fill space in buffer
- input.read(buffer, newPosition, buffer.length-newPosition);
- if (charsRead == -1)
- throw new IOException("read past eof");
- else
- bufferLength += charsRead;
- }
-
- public final char BeginToken() throws IOException {
- tokenStart = bufferPosition;
- return readChar();
- }
-
- public final void backup(int amount) {
- bufferPosition -= amount;
- }
-
- public final String GetImage() {
- return new String(buffer, tokenStart, bufferPosition - tokenStart);
- }
-
- public final char[] GetSuffix(int len) {
- char[] value = new char[len];
- System.arraycopy(buffer, bufferPosition - len, value, 0, len);
- return value;
- }
-
- public final void Done() {
- try {
- input.close();
- } catch (IOException e) {
- }
- }
-
- public final int getColumn() {
- return bufferStart + bufferPosition;
- }
- public final int getLine() {
- return 1;
- }
- public final int getEndColumn() {
- return bufferStart + bufferPosition;
- }
- public final int getEndLine() {
- return 1;
- }
- public final int getBeginColumn() {
- return bufferStart + tokenStart;
- }
- public final int getBeginLine() {
- return 1;
- }
-}
Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java
===================================================================
--- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java (revision 1361666)
+++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java (working copy)
@@ -1,722 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
-package org.apache.lucene.benchmark.byTask.feeds.demohtml;
-
-import java.io.*;
-import java.util.Locale;
-import java.util.Properties;
-
-/**
- * Basic html parser (for demo/testing purposes only!)
- */
-public class HTMLParser implements HTMLParserConstants {
- public static int SUMMARY_LENGTH = 200;
-
- StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
- StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
- Properties metaTags=new Properties();
- String currentMetaTag=null;
- String currentMetaContent=null;
- int length = 0;
- boolean titleComplete = false;
- boolean inTitle = false;
- boolean inMetaTag = false;
- boolean inStyle = false;
- boolean afterTag = false;
- boolean afterSpace = false;
- String eol = System.getProperty("line.separator");
- Reader pipeIn = null;
- Writer pipeOut;
- private MyPipedInputStream pipeInStream = null;
- private PipedOutputStream pipeOutStream = null;
-
- public HTMLParser(Reader reader) {
- this(new FastCharStream(reader));
- }
-
- private class MyPipedInputStream extends PipedInputStream{
-
- public MyPipedInputStream(){
- super();
- }
-
- public MyPipedInputStream(PipedOutputStream src) throws IOException{
- super(src);
- }
-
- public boolean full() throws IOException{
- return this.available() >= PipedInputStream.PIPE_SIZE;
- }
- }
-
- public String getTitle() throws IOException, InterruptedException {
- if (pipeIn == null)
- getReader(); // spawn parsing thread
- while (true) {
- synchronized(this) {
- if (titleComplete || pipeInStream.full())
- break;
- wait(10);
- }
- }
- return title.toString().trim();
- }
-
- public Properties getMetaTags() throws IOException,
-InterruptedException {
- if (pipeIn == null)
- getReader(); // spawn parsing thread
- while (true) {
- synchronized(this) {
- if (titleComplete || pipeInStream.full())
- break;
- wait(10);
- }
- }
- return metaTags;
- }
-
-
- public String getSummary() throws IOException, InterruptedException {
- if (pipeIn == null)
- getReader(); // spawn parsing thread
- while (true) {
- synchronized(this) {
- if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
- break;
- wait(10);
- }
- }
- if (summary.length() > SUMMARY_LENGTH)
- summary.setLength(SUMMARY_LENGTH);
-
- String sum = summary.toString().trim();
- String tit = getTitle();
- if (sum.equals(""))
- return tit;
- else
- return sum;
- }
-
- public Reader getReader() throws IOException {
- if (pipeIn == null) {
- pipeInStream = new MyPipedInputStream();
- pipeOutStream = new PipedOutputStream(pipeInStream);
- pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
- pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
-
- Thread thread = new ParserThread(this);
- thread.start(); // start parsing
- }
-
- return pipeIn;
- }
-
- void addToSummary(String text) {
- if (summary.length() < SUMMARY_LENGTH) {
- summary.append(text);
- if (summary.length() >= SUMMARY_LENGTH) {
- synchronized(this) {
- notifyAll();
- }
- }
- }
- }
-
- void addText(String text) throws IOException {
- if (inStyle)
- return;
- if (inTitle)
- title.append(text);
- else {
- addToSummary(text);
- if (!titleComplete && !(title.length() == 0)) { // finished title
- synchronized(this) {
- titleComplete = true; // tell waiting threads
- notifyAll();
- }
- }
- }
-
- length += text.length();
- pipeOut.write(text);
-
- afterSpace = false;
- }
-
- void addMetaTag() {
- metaTags.setProperty(currentMetaTag, currentMetaContent);
- currentMetaTag = null;
- currentMetaContent = null;
- return;
- }
-
- void addSpace() throws IOException {
- if (!afterSpace) {
- if (inTitle)
- title.append(" ");
- else
- addToSummary(" ");
-
- String space = afterTag ? eol : " ";
- length += space.length();
- pipeOut.write(space);
- afterSpace = true;
- }
- }
-
- final public void HTMLDocument() throws ParseException, IOException {
- Token t;
- label_1:
- while (true) {
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case ScriptStart:
- case TagName:
- case DeclName:
- case Comment1:
- case Comment2:
- case Word:
- case Entity:
- case Space:
- case Punct:
- ;
- break;
- default:
- jj_la1[0] = jj_gen;
- break label_1;
- }
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case TagName:
- Tag();
- afterTag = true;
- break;
- case DeclName:
- t = Decl();
- afterTag = true;
- break;
- case Comment1:
- case Comment2:
- CommentTag();
- afterTag = true;
- break;
- case ScriptStart:
- ScriptTag();
- afterTag = true;
- break;
- case Word:
- t = jj_consume_token(Word);
- addText(t.image); afterTag = false;
- break;
- case Entity:
- t = jj_consume_token(Entity);
- addText(Entities.decode(t.image)); afterTag = false;
- break;
- case Punct:
- t = jj_consume_token(Punct);
- addText(t.image); afterTag = false;
- break;
- case Space:
- jj_consume_token(Space);
- addSpace(); afterTag = false;
- break;
- default:
- jj_la1[1] = jj_gen;
- jj_consume_token(-1);
- throw new ParseException();
- }
- }
- jj_consume_token(0);
- }
-
- final public void Tag() throws ParseException, IOException {
- Token t1, t2;
- boolean inImg = false;
- t1 = jj_consume_token(TagName);
- String tagName = t1.image.toLowerCase(Locale.ROOT);
- if(Tags.WS_ELEMS.contains(tagName) ) {
- addSpace();
- }
- inTitle = tagName.equalsIgnoreCase("
- inMetaTag = tagName.equalsIgnoreCase("
- inStyle = tagName.equalsIgnoreCase("" +
- "foo