Index: dev-tools/eclipse/dot.classpath =================================================================== --- dev-tools/eclipse/dot.classpath (revision 1361666) +++ dev-tools/eclipse/dot.classpath (working copy) @@ -102,6 +102,7 @@ + Index: lucene/benchmark/build.xml =================================================================== --- lucene/benchmark/build.xml (revision 1361666) +++ lucene/benchmark/build.xml (working copy) @@ -155,6 +155,7 @@ + @@ -261,20 +262,6 @@ - - - - - - - - - - - - Index: lucene/benchmark/ivy.xml =================================================================== --- lucene/benchmark/ivy.xml (revision 1361666) +++ lucene/benchmark/ivy.xml (working copy) @@ -21,6 +21,7 @@ + Index: lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1 =================================================================== --- lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1 (revision 0) +++ lucene/benchmark/lib/nekohtml-1.9.15.jar.sha1 (working copy) @@ -0,0 +1 @@ +a45cd7b7401d9c2264d4908182380452c03ebf8f Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserTokenManager.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserTokenManager.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserTokenManager.java (working copy) @@ -1,1657 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; -import java.io.*; -import java.util.Locale; -import java.util.Properties; - -/** Token Manager. */ -public class HTMLParserTokenManager implements HTMLParserConstants -{ - - /** Debug output. */ - public java.io.PrintStream debugStream = System.out; - /** Set debug output. */ - public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; } -private final int jjStopStringLiteralDfa_0(int pos, long active0) -{ - switch (pos) - { - case 0: - if ((active0 & 0x32L) != 0L) - return 20; - return -1; - case 1: - if ((active0 & 0x2L) != 0L) - { - if (jjmatchedPos != 1) - { - jjmatchedKind = 2; - jjmatchedPos = 1; - } - return 22; - } - if ((active0 & 0x30L) != 0L) - return 25; - return -1; - case 2: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 2; - return 23; - } - return -1; - case 3: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 3; - return 23; - } - return -1; - case 4: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 4; - return 23; - } - return -1; - case 5: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 5; - return 23; - } - return -1; - default : - return -1; - } -} -private final int jjStartNfa_0(int pos, long active0) -{ - return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0), pos + 1); -} -private int jjStopAtPos(int pos, int kind) -{ - jjmatchedKind = kind; - jjmatchedPos = pos; - return pos + 1; -} -private int jjMoveStringLiteralDfa0_0() -{ - switch(curChar) - { - case 60: - return jjMoveStringLiteralDfa1_0(0x32L); - default : - return jjMoveNfa_0(11, 0); - } -} -private int jjMoveStringLiteralDfa1_0(long active0) -{ - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(0, active0); - return 1; - } - switch(curChar) - { - case 33: - if ((active0 & 0x20L) != 0L) - { - jjmatchedKind = 5; - jjmatchedPos = 1; - } - return jjMoveStringLiteralDfa2_0(active0, 0x10L); - case 115: - return jjMoveStringLiteralDfa2_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(0, active0); -} -private int jjMoveStringLiteralDfa2_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(0, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(1, active0); - return 2; - } - switch(curChar) - { - case 45: - return jjMoveStringLiteralDfa3_0(active0, 0x10L); - case 99: - return jjMoveStringLiteralDfa3_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(1, active0); -} -private int jjMoveStringLiteralDfa3_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(1, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(2, active0); - return 3; - } - switch(curChar) - { - case 45: - if ((active0 & 0x10L) != 0L) - return jjStopAtPos(3, 4); - break; - case 114: - return jjMoveStringLiteralDfa4_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(2, active0); -} -private int jjMoveStringLiteralDfa4_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(2, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(3, active0); - return 4; - } - switch(curChar) - { - case 105: - return jjMoveStringLiteralDfa5_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(3, active0); -} -private int jjMoveStringLiteralDfa5_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(3, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(4, active0); - return 5; - } - switch(curChar) - { - case 112: - return jjMoveStringLiteralDfa6_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(4, active0); -} -private int jjMoveStringLiteralDfa6_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(4, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(5, active0); - return 6; - } - switch(curChar) - { - case 116: - if ((active0 & 0x2L) != 0L) - return jjStartNfaWithStates_0(6, 1, 23); - break; - default : - break; - } - return jjStartNfa_0(5, active0); -} -private int jjStartNfaWithStates_0(int pos, int kind, int state) -{ - jjmatchedKind = kind; - jjmatchedPos = pos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return pos + 1; } - return jjMoveNfa_0(state, pos + 1); -} -static final long[] jjbitVec0 = { - 0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL -}; -static final long[] jjbitVec2 = { - 0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL -}; -private int jjMoveNfa_0(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 28; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 20: - if (curChar == 33) - jjstateSet[jjnewStateCnt++] = 25; - else if (curChar == 47) - jjCheckNAdd(21); - break; - case 11: - if ((0x3ff000000000000L & l) != 0L) - jjCheckNAddTwoStates(7, 2); - else if ((0x100002600L & l) != 0L) - { - if (kind > 11) - kind = 11; - jjCheckNAdd(10); - } - else if (curChar == 60) - jjCheckNAddStates(0, 2); - else if (curChar == 38) - jjAddStates(3, 5); - else if (curChar == 36) - jjstateSet[jjnewStateCnt++] = 1; - if ((0x3ff000000000000L & l) != 0L) - { - if (kind > 6) - kind = 6; - jjCheckNAddStates(6, 10); - } - break; - case 0: - if (curChar == 36) - jjstateSet[jjnewStateCnt++] = 1; - break; - case 1: - if ((0x3ff000000000000L & l) != 0L) - jjCheckNAdd(2); - break; - case 2: - if ((0x500000000000L & l) != 0L) - jjstateSet[jjnewStateCnt++] = 3; - break; - case 3: - case 9: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(11, 13); - break; - case 4: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(6, 10); - break; - case 5: - if ((0x880000000000L & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(14, 17); - break; - case 6: - if ((0x3ff000000000000L & l) != 0L) - jjCheckNAddTwoStates(7, 2); - break; - case 7: - if (curChar != 34) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(11, 13); - break; - case 8: - if ((0x208000000000L & l) != 0L) - jjstateSet[jjnewStateCnt++] = 9; - break; - case 10: - if ((0x100002600L & l) == 0L) - break; - kind = 11; - jjCheckNAdd(10); - break; - case 13: - if (curChar == 59 && kind > 10) - kind = 10; - break; - case 14: - if (curChar == 35) - jjCheckNAdd(15); - break; - case 15: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(15, 13); - break; - case 16: - if (curChar == 35) - jjstateSet[jjnewStateCnt++] = 17; - break; - case 18: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(18, 13); - break; - case 19: - if (curChar == 60) - jjCheckNAddStates(0, 2); - break; - case 22: - if ((0x9fffff7affffd9ffL & l) == 0L) - break; - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 23: - if ((0x9ffffffeffffd9ffL & l) == 0L) - break; - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 24: - if (curChar == 33) - jjstateSet[jjnewStateCnt++] = 25; - break; - case 26: - if ((0x9fffff7affffd9ffL & l) == 0L) - break; - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - case 27: - if ((0x9ffffffeffffd9ffL & l) == 0L) - break; - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 20: - case 21: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 2) - kind = 2; - jjstateSet[jjnewStateCnt++] = 22; - break; - case 11: - case 4: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(6, 10); - break; - case 9: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(11, 13); - break; - case 12: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(12, 13); - break; - case 17: - if ((0x100000001000000L & l) != 0L) - jjCheckNAdd(18); - break; - case 18: - if ((0x7e0000007eL & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(18, 13); - break; - case 22: - case 23: - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 25: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 3) - kind = 3; - jjstateSet[jjnewStateCnt++] = 26; - break; - case 26: - case 27: - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 22: - case 23: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 26: - case 27: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 28 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private int jjMoveStringLiteralDfa0_5() -{ - return jjMoveNfa_5(1, 0); -} -private int jjMoveNfa_5(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 2; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xfffffffbffffffffL & l) != 0L) - { - if (kind > 25) - kind = 25; - jjCheckNAdd(0); - } - else if (curChar == 34) - { - if (kind > 26) - kind = 26; - } - break; - case 0: - if ((0xfffffffbffffffffL & l) == 0L) - break; - kind = 25; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - kind = 25; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 25) - kind = 25; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_7(int pos, long active0) -{ - switch (pos) - { - default : - return -1; - } -} -private final int jjStartNfa_7(int pos, long active0) -{ - return jjMoveNfa_7(jjStopStringLiteralDfa_7(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_7() -{ - switch(curChar) - { - case 62: - return jjStopAtPos(0, 30); - default : - return jjMoveNfa_7(0, 0); - } -} -private int jjMoveNfa_7(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 1; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 0: - if ((0xbfffffffffffffffL & l) == 0L) - break; - kind = 29; - jjstateSet[jjnewStateCnt++] = 0; - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - kind = 29; - jjstateSet[jjnewStateCnt++] = 0; - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 29) - kind = 29; - jjstateSet[jjnewStateCnt++] = 0; - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 1 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private int jjMoveStringLiteralDfa0_4() -{ - return jjMoveNfa_4(1, 0); -} -private int jjMoveNfa_4(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 2; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xffffff7fffffffffL & l) != 0L) - { - if (kind > 23) - kind = 23; - jjCheckNAdd(0); - } - else if (curChar == 39) - { - if (kind > 24) - kind = 24; - } - break; - case 0: - if ((0xffffff7fffffffffL & l) == 0L) - break; - kind = 23; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - kind = 23; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 23) - kind = 23; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_3(int pos, long active0) -{ - switch (pos) - { - default : - return -1; - } -} -private final int jjStartNfa_3(int pos, long active0) -{ - return jjMoveNfa_3(jjStopStringLiteralDfa_3(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_3() -{ - switch(curChar) - { - case 34: - return jjStopAtPos(0, 21); - case 39: - return jjStopAtPos(0, 20); - default : - return jjMoveNfa_3(0, 0); - } -} -private int jjMoveNfa_3(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 3; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 0: - if ((0x9fffff7affffd9ffL & l) != 0L) - { - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - } - else if ((0x100002600L & l) != 0L) - { - if (kind > 22) - kind = 22; - jjCheckNAdd(2); - } - break; - case 1: - if ((0xbffffffeffffd9ffL & l) == 0L) - break; - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - break; - case 2: - if ((0x100002600L & l) == 0L) - break; - kind = 22; - jjCheckNAdd(2); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 3 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_6(int pos, long active0) -{ - switch (pos) - { - case 0: - if ((active0 & 0x10000000L) != 0L) - { - jjmatchedKind = 27; - return -1; - } - return -1; - case 1: - if ((active0 & 0x10000000L) != 0L) - { - if (jjmatchedPos == 0) - { - jjmatchedKind = 27; - jjmatchedPos = 0; - } - return -1; - } - return -1; - default : - return -1; - } -} -private final int jjStartNfa_6(int pos, long active0) -{ - return jjMoveNfa_6(jjStopStringLiteralDfa_6(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_6() -{ - switch(curChar) - { - case 45: - return jjMoveStringLiteralDfa1_6(0x10000000L); - default : - return jjMoveNfa_6(1, 0); - } -} -private int jjMoveStringLiteralDfa1_6(long active0) -{ - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_6(0, active0); - return 1; - } - switch(curChar) - { - case 45: - return jjMoveStringLiteralDfa2_6(active0, 0x10000000L); - default : - break; - } - return jjStartNfa_6(0, active0); -} -private int jjMoveStringLiteralDfa2_6(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_6(0, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_6(1, active0); - return 2; - } - switch(curChar) - { - case 62: - if ((active0 & 0x10000000L) != 0L) - return jjStopAtPos(2, 28); - break; - default : - break; - } - return jjStartNfa_6(1, active0); -} -private int jjMoveNfa_6(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 2; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xffffdfffffffffffL & l) != 0L) - { - if (kind > 27) - kind = 27; - jjCheckNAdd(0); - } - else if (curChar == 45) - { - if (kind > 27) - kind = 27; - } - break; - case 0: - if ((0xffffdfffffffffffL & l) == 0L) - break; - kind = 27; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - kind = 27; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 27) - kind = 27; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private int jjMoveStringLiteralDfa0_1() -{ - return jjMoveNfa_1(1, 0); -} -private int jjMoveNfa_1(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 12; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xafffffffffffffffL & l) != 0L) - { - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - } - else if ((0x5000000000000000L & l) != 0L) - { - if (kind > 14) - kind = 14; - } - if (curChar == 60) - jjstateSet[jjnewStateCnt++] = 10; - break; - case 0: - if ((0xafffffffffffffffL & l) == 0L) - break; - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - break; - case 3: - if ((0xafffffffffffffffL & l) != 0L) - jjAddStates(18, 19); - break; - case 4: - if (curChar == 62 && kind > 15) - kind = 15; - break; - case 10: - if (curChar == 47) - jjstateSet[jjnewStateCnt++] = 9; - break; - case 11: - if (curChar == 60) - jjstateSet[jjnewStateCnt++] = 10; - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - break; - case 2: - if (curChar == 116) - jjCheckNAddTwoStates(3, 4); - break; - case 3: - jjCheckNAddTwoStates(3, 4); - break; - case 5: - if (curChar == 112) - jjstateSet[jjnewStateCnt++] = 2; - break; - case 6: - if (curChar == 105) - jjstateSet[jjnewStateCnt++] = 5; - break; - case 7: - if (curChar == 114) - jjstateSet[jjnewStateCnt++] = 6; - break; - case 8: - if (curChar == 99) - jjstateSet[jjnewStateCnt++] = 7; - break; - case 9: - if (curChar == 115) - jjstateSet[jjnewStateCnt++] = 8; - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - break; - case 3: - if (jjCanMove_0(hiByte, i1, i2, l1, l2)) - jjAddStates(18, 19); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 12 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_2(int pos, long active0) -{ - switch (pos) - { - default : - return -1; - } -} -private final int jjStartNfa_2(int pos, long active0) -{ - return jjMoveNfa_2(jjStopStringLiteralDfa_2(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_2() -{ - switch(curChar) - { - case 34: - return jjStopAtPos(0, 21); - case 39: - return jjStopAtPos(0, 20); - case 61: - return jjStartNfaWithStates_2(0, 17, 3); - default : - return jjMoveNfa_2(0, 0); - } -} -private int jjStartNfaWithStates_2(int pos, int kind, int state) -{ - jjmatchedKind = kind; - jjmatchedPos = pos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return pos + 1; } - return jjMoveNfa_2(state, pos + 1); -} -private int jjMoveNfa_2(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 6; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 0: - if ((0x9fffff7affffd9ffL & l) != 0L) - { - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - } - else if ((0x100002600L & l) != 0L) - { - if (kind > 22) - kind = 22; - jjCheckNAdd(5); - } - else if (curChar == 61) - jjstateSet[jjnewStateCnt++] = 3; - else if (curChar == 62) - { - if (kind > 18) - kind = 18; - } - break; - case 1: - if ((0x9ffffffeffffd9ffL & l) == 0L) - break; - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - break; - case 2: - case 3: - if (curChar == 62 && kind > 18) - kind = 18; - break; - case 4: - if (curChar == 61) - jjstateSet[jjnewStateCnt++] = 3; - break; - case 5: - if ((0x100002600L & l) == 0L) - break; - kind = 22; - jjCheckNAdd(5); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 6 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -static final int[] jjnextStates = { - 20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0, - 4, 6, 3, 4, -}; -private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) -{ - switch(hiByte) - { - case 0: - return ((jjbitVec2[i2] & l2) != 0L); - default : - if ((jjbitVec0[i1] & l1) != 0L) - return true; - return false; - } -} - -/** Token literal values. */ -public static final String[] jjstrLiteralImages = { -"", "\74\163\143\162\151\160\164", null, null, "\74\41\55\55", "\74\41", null, -null, null, null, null, null, null, null, null, null, null, "\75", null, null, -"\47", "\42", null, null, null, null, null, null, "\55\55\76", null, "\76", }; - -/** Lexer state names. */ -public static final String[] lexStateNames = { - "DEFAULT", - "WithinScript", - "WithinTag", - "AfterEquals", - "WithinQuote1", - "WithinQuote2", - "WithinComment1", - "WithinComment2", -}; - -/** Lex State array. */ -public static final int[] jjnewLexState = { - -1, 1, 2, 2, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 3, 0, 2, 4, 5, -1, -1, 2, - -1, 2, -1, 0, -1, 0, -}; -static final long[] jjtoToken = { - 0x7fbfec7fL, -}; -static final long[] jjtoSkip = { - 0x400000L, -}; -protected CharStream input_stream; -private final int[] jjrounds = new int[28]; -private final int[] jjstateSet = new int[56]; -protected char curChar; -/** Constructor. */ -public HTMLParserTokenManager(CharStream stream){ - input_stream = stream; -} - -/** Constructor. */ -public HTMLParserTokenManager(CharStream stream, int lexState){ - this(stream); - SwitchTo(lexState); -} - -/** Reinitialise parser. */ -public void ReInit(CharStream stream) -{ - jjmatchedPos = jjnewStateCnt = 0; - curLexState = defaultLexState; - input_stream = stream; - ReInitRounds(); -} -private void ReInitRounds() -{ - int i; - jjround = 0x80000001; - for (i = 28; i-- > 0;) - jjrounds[i] = 0x80000000; -} - -/** Reinitialise parser. */ -public void ReInit(CharStream stream, int lexState) -{ - ReInit(stream); - SwitchTo(lexState); -} - -/** Switch to specified lex state. */ -public void SwitchTo(int lexState) -{ - if (lexState >= 8 || lexState < 0) - throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE); - else - curLexState = lexState; -} - -protected Token jjFillToken() -{ - final Token t; - final String curTokenImage; - final int beginLine; - final int endLine; - final int beginColumn; - final int endColumn; - String im = jjstrLiteralImages[jjmatchedKind]; - curTokenImage = (im == null) ? input_stream.GetImage() : im; - beginLine = input_stream.getBeginLine(); - beginColumn = input_stream.getBeginColumn(); - endLine = input_stream.getEndLine(); - endColumn = input_stream.getEndColumn(); - t = Token.newToken(jjmatchedKind, curTokenImage); - - t.beginLine = beginLine; - t.endLine = endLine; - t.beginColumn = beginColumn; - t.endColumn = endColumn; - - return t; -} - -int curLexState = 0; -int defaultLexState = 0; -int jjnewStateCnt; -int jjround; -int jjmatchedPos; -int jjmatchedKind; - -/** Get the next Token. */ -public Token getNextToken() -{ - Token matchedToken; - int curPos = 0; - - EOFLoop : - for (;;) - { - try - { - curChar = input_stream.BeginToken(); - } - catch(java.io.IOException e) - { - jjmatchedKind = 0; - matchedToken = jjFillToken(); - return matchedToken; - } - - switch(curLexState) - { - case 0: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_0(); - if (jjmatchedPos == 0 && jjmatchedKind > 13) - { - jjmatchedKind = 13; - } - break; - case 1: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_1(); - break; - case 2: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_2(); - break; - case 3: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_3(); - break; - case 4: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_4(); - break; - case 5: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_5(); - break; - case 6: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_6(); - break; - case 7: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_7(); - break; - } - if (jjmatchedKind != 0x7fffffff) - { - if (jjmatchedPos + 1 < curPos) - input_stream.backup(curPos - jjmatchedPos - 1); - if ((jjtoToken[jjmatchedKind >> 6] & (1L << (jjmatchedKind & 077))) != 0L) - { - matchedToken = jjFillToken(); - if (jjnewLexState[jjmatchedKind] != -1) - curLexState = jjnewLexState[jjmatchedKind]; - return matchedToken; - } - else - { - if (jjnewLexState[jjmatchedKind] != -1) - curLexState = jjnewLexState[jjmatchedKind]; - continue EOFLoop; - } - } - int error_line = input_stream.getEndLine(); - int error_column = input_stream.getEndColumn(); - String error_after = null; - boolean EOFSeen = false; - try { input_stream.readChar(); input_stream.backup(1); } - catch (java.io.IOException e1) { - EOFSeen = true; - error_after = curPos <= 1 ? "" : input_stream.GetImage(); - if (curChar == '\n' || curChar == '\r') { - error_line++; - error_column = 0; - } - else - error_column++; - } - if (!EOFSeen) { - input_stream.backup(1); - error_after = curPos <= 1 ? "" : input_stream.GetImage(); - } - throw new TokenMgrError(EOFSeen, curLexState, error_line, error_column, error_after, curChar, TokenMgrError.LEXICAL_ERROR); - } -} - -private void jjCheckNAdd(int state) -{ - if (jjrounds[state] != jjround) - { - jjstateSet[jjnewStateCnt++] = state; - jjrounds[state] = jjround; - } -} -private void jjAddStates(int start, int end) -{ - do { - jjstateSet[jjnewStateCnt++] = jjnextStates[start]; - } while (start++ != end); -} -private void jjCheckNAddTwoStates(int state1, int state2) -{ - jjCheckNAdd(state1); - jjCheckNAdd(state2); -} - -private void jjCheckNAddStates(int start, int end) -{ - do { - jjCheckNAdd(jjnextStates[start]); - } while (start++ != end); -} - -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Tags.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Tags.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Tags.java (working copy) @@ -1,67 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.HashSet; -import java.util.Set; - - -/** - * Utility class storing set of commonly-used html tags. - */ -public final class Tags { - - /** - * contains all tags for which whitespaces have to be inserted for proper tokenization - */ - public static final Set WS_ELEMS; - - static{ - WS_ELEMS = new HashSet(); - WS_ELEMS.add("" does not need to be listed explicitly - WS_ELEMS.add(" - - - -Example html parser based on JavaCC - - Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserConstants.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserConstants.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserConstants.java (working copy) @@ -1,124 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. HTMLParserConstants.java */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - - -/** - * Token literal values and constants. - * Generated by org.javacc.parser.OtherFilesGen#start() - */ -public interface HTMLParserConstants { - - /** End of File. */ - int EOF = 0; - /** RegularExpression Id. */ - int ScriptStart = 1; - /** RegularExpression Id. */ - int TagName = 2; - /** RegularExpression Id. */ - int DeclName = 3; - /** RegularExpression Id. */ - int Comment1 = 4; - /** RegularExpression Id. */ - int Comment2 = 5; - /** RegularExpression Id. */ - int Word = 6; - /** RegularExpression Id. */ - int LET = 7; - /** RegularExpression Id. */ - int NUM = 8; - /** RegularExpression Id. */ - int HEX = 9; - /** RegularExpression Id. */ - int Entity = 10; - /** RegularExpression Id. */ - int Space = 11; - /** RegularExpression Id. */ - int SP = 12; - /** RegularExpression Id. */ - int Punct = 13; - /** RegularExpression Id. */ - int ScriptText = 14; - /** RegularExpression Id. */ - int ScriptEnd = 15; - /** RegularExpression Id. */ - int ArgName = 16; - /** RegularExpression Id. */ - int ArgEquals = 17; - /** RegularExpression Id. */ - int TagEnd = 18; - /** RegularExpression Id. */ - int ArgValue = 19; - /** RegularExpression Id. */ - int ArgQuote1 = 20; - /** RegularExpression Id. */ - int ArgQuote2 = 21; - /** RegularExpression Id. */ - int Quote1Text = 23; - /** RegularExpression Id. */ - int CloseQuote1 = 24; - /** RegularExpression Id. */ - int Quote2Text = 25; - /** RegularExpression Id. */ - int CloseQuote2 = 26; - /** RegularExpression Id. */ - int CommentText1 = 27; - /** RegularExpression Id. */ - int CommentEnd1 = 28; - /** RegularExpression Id. */ - int CommentText2 = 29; - /** RegularExpression Id. */ - int CommentEnd2 = 30; - - /** Lexical state. */ - int DEFAULT = 0; - /** Lexical state. */ - int WithinScript = 1; - /** Lexical state. */ - int WithinTag = 2; - /** Lexical state. */ - int AfterEquals = 3; - /** Lexical state. */ - int WithinQuote1 = 4; - /** Lexical state. */ - int WithinQuote2 = 5; - /** Lexical state. */ - int WithinComment1 = 6; - /** Lexical state. */ - int WithinComment2 = 7; - - /** Literal token values. */ - String[] tokenImage = { - "", - "\"", - "", - "\"\"", - "", - "\">\"", - }; - -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/TokenMgrError.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/TokenMgrError.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/TokenMgrError.java (working copy) @@ -1,141 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 4.1 */ -/* JavaCCOptions: */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/** Token Manager Error. */ -@SuppressWarnings("serial") -public class TokenMgrError extends Error -{ - - /* - * Ordinals for various reasons why an Error of this type can be thrown. - */ - - /** - * Lexical error occurred. - */ - static final int LEXICAL_ERROR = 0; - - /** - * An attempt was made to create a second instance of a static token manager. - */ - static final int STATIC_LEXER_ERROR = 1; - - /** - * Tried to change to an invalid lexical state. - */ - static final int INVALID_LEXICAL_STATE = 2; - - /** - * Detected (and bailed out of) an infinite loop in the token manager. - */ - static final int LOOP_DETECTED = 3; - - /** - * Indicates the reason why the exception is thrown. It will have - * one of the above 4 values. - */ - int errorCode; - - /** - * Replaces unprintable characters by their escaped (or unicode escaped) - * equivalents in the given string - */ - protected static final String addEscapes(String str) { - StringBuffer retval = new StringBuffer(); - char ch; - for (int i = 0; i < str.length(); i++) { - switch (str.charAt(i)) - { - case 0 : - continue; - case '\b': - retval.append("\\b"); - continue; - case '\t': - retval.append("\\t"); - continue; - case '\n': - retval.append("\\n"); - continue; - case '\f': - retval.append("\\f"); - continue; - case '\r': - retval.append("\\r"); - continue; - case '\"': - retval.append("\\\""); - continue; - case '\'': - retval.append("\\\'"); - continue; - case '\\': - retval.append("\\\\"); - continue; - default: - if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { - String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u" + s.substring(s.length() - 4, s.length())); - } else { - retval.append(ch); - } - continue; - } - } - return retval.toString(); - } - - /** - * Returns a detailed message for the Error when it is thrown by the - * token manager to indicate a lexical error. - * Parameters : - * EOFSeen : indicates if EOF caused the lexical error - * curLexState : lexical state in which this error occurred - * errorLine : line number when the error occurred - * errorColumn : column number when the error occurred - * errorAfter : prefix that was seen before this error occurred - * curchar : the offending character - * Note: You can customize the lexical error message by modifying this method. - */ - protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) { - return("Lexical error at line " + - errorLine + ", column " + - errorColumn + ". Encountered: " + - (EOFSeen ? " " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") + - "after : \"" + addEscapes(errorAfter) + "\""); - } - - /** - * You can also modify the body of this method to customize your error messages. - * For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not - * of end-users concern, so you can return something like : - * - * "Internal Error : Please file a bug report .... " - * - * from this method for such cases in the release version of your parser. - */ - public String getMessage() { - return super.getMessage(); - } - - /* - * Constructors of various flavors follow. - */ - - /** No arg constructor. */ - public TokenMgrError() { - } - - /** Constructor with message and reason. */ - public TokenMgrError(String message, int reason) { - super(message); - errorCode = reason; - } - - /** Full Constructor. */ - public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) { - this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); - } -} -/* JavaCC - OriginalChecksum=538f0da130356fcc0bc7db621ab0389d (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Token.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Token.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Token.java (working copy) @@ -1,124 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */ -/* JavaCCOptions:TOKEN_EXTENDS=,KEEP_LINE_COL=null */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/** - * Describes the input token stream. - */ - -public class Token { - - /** - * An integer that describes the kind of this token. This numbering - * system is determined by JavaCCParser, and a table of these numbers is - * stored in the file ...Constants.java. - */ - public int kind; - - /** The line number of the first character of this Token. */ - public int beginLine; - /** The column number of the first character of this Token. */ - public int beginColumn; - /** The line number of the last character of this Token. */ - public int endLine; - /** The column number of the last character of this Token. */ - public int endColumn; - - /** - * The string image of the token. - */ - public String image; - - /** - * A reference to the next regular (non-special) token from the input - * stream. If this is the last token from the input stream, or if the - * token manager has not read tokens beyond this one, this field is - * set to null. This is true only if this token is also a regular - * token. Otherwise, see below for a description of the contents of - * this field. - */ - public Token next; - - /** - * This field is used to access special tokens that occur prior to this - * token, but after the immediately preceding regular (non-special) token. - * If there are no such special tokens, this field is set to null. - * When there are more than one such special token, this field refers - * to the last of these special tokens, which in turn refers to the next - * previous special token through its specialToken field, and so on - * until the first special token (whose specialToken field is null). - * The next fields of special tokens refer to other special tokens that - * immediately follow it (without an intervening regular token). If there - * is no such token, this field is null. - */ - public Token specialToken; - - /** - * An optional attribute value of the Token. - * Tokens which are not used as syntactic sugar will often contain - * meaningful values that will be used later on by the compiler or - * interpreter. This attribute value is often different from the image. - * Any subclass of Token that actually wants to return a non-null value can - * override this method as appropriate. - */ - public Object getValue() { - return null; - } - - /** - * No-argument constructor - */ - public Token() {} - - /** - * Constructs a new token for the specified Image. - */ - public Token(int kind) - { - this(kind, null); - } - - /** - * Constructs a new token for the specified Image and Kind. - */ - public Token(int kind, String image) - { - this.kind = kind; - this.image = image; - } - - /** - * Returns the image. - */ - public String toString() - { - return image; - } - - /** - * Returns a new Token object, by default. However, if you want, you - * can create and return subclass objects based on the value of ofKind. - * Simply add the cases to the switch for all those special cases. - * For example, if you have a subclass of Token called IDToken that - * you want to create if ofKind is ID, simply add something like : - * - * case MyParserConstants.ID : return new IDToken(ofKind, image); - * - * to the following switch statement. Then you can cast matchedToken - * variable to the appropriate type and use sit in your lexical actions. - */ - public static Token newToken(int ofKind, String image) - { - switch(ofKind) - { - default : return new Token(ofKind, image); - } - } - - public static Token newToken(int ofKind) - { - return newToken(ofKind, null); - } - -} -/* JavaCC - OriginalChecksum=24643dc85fd6daeec42ceba20b46ee61 (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java (working copy) @@ -1,722 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -import java.io.*; -import java.util.Locale; -import java.util.Properties; - -/** - * Basic html parser (for demo/testing purposes only!) - */ -public class HTMLParser implements HTMLParserConstants { - public static int SUMMARY_LENGTH = 200; - - StringBuffer title = new StringBuffer(SUMMARY_LENGTH); - StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); - Properties metaTags=new Properties(); - String currentMetaTag=null; - String currentMetaContent=null; - int length = 0; - boolean titleComplete = false; - boolean inTitle = false; - boolean inMetaTag = false; - boolean inStyle = false; - boolean afterTag = false; - boolean afterSpace = false; - String eol = System.getProperty("line.separator"); - Reader pipeIn = null; - Writer pipeOut; - private MyPipedInputStream pipeInStream = null; - private PipedOutputStream pipeOutStream = null; - - public HTMLParser(Reader reader) { - this(new FastCharStream(reader)); - } - - private class MyPipedInputStream extends PipedInputStream{ - - public MyPipedInputStream(){ - super(); - } - - public MyPipedInputStream(PipedOutputStream src) throws IOException{ - super(src); - } - - public boolean full() throws IOException{ - return this.available() >= PipedInputStream.PIPE_SIZE; - } - } - - public String getTitle() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return title.toString().trim(); - } - - public Properties getMetaTags() throws IOException, -InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return metaTags; - } - - - public String getSummary() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full()) - break; - wait(10); - } - } - if (summary.length() > SUMMARY_LENGTH) - summary.setLength(SUMMARY_LENGTH); - - String sum = summary.toString().trim(); - String tit = getTitle(); - if (sum.equals("")) - return tit; - else - return sum; - } - - public Reader getReader() throws IOException { - if (pipeIn == null) { - pipeInStream = new MyPipedInputStream(); - pipeOutStream = new PipedOutputStream(pipeInStream); - pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE"); - pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE"); - - Thread thread = new ParserThread(this); - thread.start(); // start parsing - } - - return pipeIn; - } - - void addToSummary(String text) { - if (summary.length() < SUMMARY_LENGTH) { - summary.append(text); - if (summary.length() >= SUMMARY_LENGTH) { - synchronized(this) { - notifyAll(); - } - } - } - } - - void addText(String text) throws IOException { - if (inStyle) - return; - if (inTitle) - title.append(text); - else { - addToSummary(text); - if (!titleComplete && !(title.length() == 0)) { // finished title - synchronized(this) { - titleComplete = true; // tell waiting threads - notifyAll(); - } - } - } - - length += text.length(); - pipeOut.write(text); - - afterSpace = false; - } - - void addMetaTag() { - metaTags.setProperty(currentMetaTag, currentMetaContent); - currentMetaTag = null; - currentMetaContent = null; - return; - } - - void addSpace() throws IOException { - if (!afterSpace) { - if (inTitle) - title.append(" "); - else - addToSummary(" "); - - String space = afterTag ? eol : " "; - length += space.length(); - pipeOut.write(space); - afterSpace = true; - } - } - - final public void HTMLDocument() throws ParseException, IOException { - Token t; - label_1: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ScriptStart: - case TagName: - case DeclName: - case Comment1: - case Comment2: - case Word: - case Entity: - case Space: - case Punct: - ; - break; - default: - jj_la1[0] = jj_gen; - break label_1; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case TagName: - Tag(); - afterTag = true; - break; - case DeclName: - t = Decl(); - afterTag = true; - break; - case Comment1: - case Comment2: - CommentTag(); - afterTag = true; - break; - case ScriptStart: - ScriptTag(); - afterTag = true; - break; - case Word: - t = jj_consume_token(Word); - addText(t.image); afterTag = false; - break; - case Entity: - t = jj_consume_token(Entity); - addText(Entities.decode(t.image)); afterTag = false; - break; - case Punct: - t = jj_consume_token(Punct); - addText(t.image); afterTag = false; - break; - case Space: - jj_consume_token(Space); - addSpace(); afterTag = false; - break; - default: - jj_la1[1] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - jj_consume_token(0); - } - - final public void Tag() throws ParseException, IOException { - Token t1, t2; - boolean inImg = false; - t1 = jj_consume_token(TagName); - String tagName = t1.image.toLowerCase(Locale.ROOT); - if(Tags.WS_ELEMS.contains(tagName) ) { - addSpace(); - } - inTitle = tagName.equalsIgnoreCase(" - inMetaTag = tagName.equalsIgnoreCase(" - inStyle = tagName.equalsIgnoreCase(" - inImg = tagName.equalsIgnoreCase(" - - label_2: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgName: - ; - break; - default: - jj_la1[2] = jj_gen; - break label_2; - } - t1 = jj_consume_token(ArgName); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgEquals: - jj_consume_token(ArgEquals); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgValue: - case ArgQuote1: - case ArgQuote2: - t2 = ArgValue(); - if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) - addText("[" + t2.image + "]"); - - if(inMetaTag && - ( t1.image.equalsIgnoreCase("name") || - t1.image.equalsIgnoreCase("HTTP-EQUIV") - ) - && t2 != null) - { - currentMetaTag=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != -null) - { - currentMetaContent=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - break; - default: - jj_la1[3] = jj_gen; - ; - } - break; - default: - jj_la1[4] = jj_gen; - ; - } - } - jj_consume_token(TagEnd); - } - - final public Token ArgValue() throws ParseException { - Token t = null; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgValue: - t = jj_consume_token(ArgValue); - {if (true) return t;} - break; - default: - jj_la1[5] = jj_gen; - if (jj_2_1(2)) { - jj_consume_token(ArgQuote1); - jj_consume_token(CloseQuote1); - {if (true) return t;} - } else { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgQuote1: - jj_consume_token(ArgQuote1); - t = jj_consume_token(Quote1Text); - jj_consume_token(CloseQuote1); - {if (true) return t;} - break; - default: - jj_la1[6] = jj_gen; - if (jj_2_2(2)) { - jj_consume_token(ArgQuote2); - jj_consume_token(CloseQuote2); - {if (true) return t;} - } else { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgQuote2: - jj_consume_token(ArgQuote2); - t = jj_consume_token(Quote2Text); - jj_consume_token(CloseQuote2); - {if (true) return t;} - break; - default: - jj_la1[7] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - } - } - } - throw new Error("Missing return statement in function"); - } - - final public Token Decl() throws ParseException { - Token t; - t = jj_consume_token(DeclName); - label_3: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgName: - case ArgEquals: - case ArgValue: - case ArgQuote1: - case ArgQuote2: - ; - break; - default: - jj_la1[8] = jj_gen; - break label_3; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgName: - jj_consume_token(ArgName); - break; - case ArgValue: - case ArgQuote1: - case ArgQuote2: - ArgValue(); - break; - case ArgEquals: - jj_consume_token(ArgEquals); - break; - default: - jj_la1[9] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - jj_consume_token(TagEnd); - {if (true) return t;} - throw new Error("Missing return statement in function"); - } - - final public void CommentTag() throws ParseException { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case Comment1: - jj_consume_token(Comment1); - label_4: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case CommentText1: - ; - break; - default: - jj_la1[10] = jj_gen; - break label_4; - } - jj_consume_token(CommentText1); - } - jj_consume_token(CommentEnd1); - break; - case Comment2: - jj_consume_token(Comment2); - label_5: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case CommentText2: - ; - break; - default: - jj_la1[11] = jj_gen; - break label_5; - } - jj_consume_token(CommentText2); - } - jj_consume_token(CommentEnd2); - break; - default: - jj_la1[12] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - - final public void ScriptTag() throws ParseException { - jj_consume_token(ScriptStart); - label_6: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ScriptText: - ; - break; - default: - jj_la1[13] = jj_gen; - break label_6; - } - jj_consume_token(ScriptText); - } - jj_consume_token(ScriptEnd); - } - - private boolean jj_2_1(int xla) { - jj_la = xla; jj_lastpos = jj_scanpos = token; - try { return !jj_3_1(); } - catch(LookaheadSuccess ls) { return true; } - finally { jj_save(0, xla); } - } - - private boolean jj_2_2(int xla) { - jj_la = xla; jj_lastpos = jj_scanpos = token; - try { return !jj_3_2(); } - catch(LookaheadSuccess ls) { return true; } - finally { jj_save(1, xla); } - } - - private boolean jj_3_2() { - if (jj_scan_token(ArgQuote2)) return true; - if (jj_scan_token(CloseQuote2)) return true; - return false; - } - - private boolean jj_3_1() { - if (jj_scan_token(ArgQuote1)) return true; - if (jj_scan_token(CloseQuote1)) return true; - return false; - } - - /** Generated Token Manager. */ - public HTMLParserTokenManager token_source; - /** Current token. */ - public Token token; - /** Next token. */ - public Token jj_nt; - private int jj_ntk; - private Token jj_scanpos, jj_lastpos; - private int jj_la; - private int jj_gen; - final private int[] jj_la1 = new int[14]; - static private int[] jj_la1_0; - static { - jj_la1_init_0(); - } - private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,}; - } - final private JJCalls[] jj_2_rtns = new JJCalls[2]; - private boolean jj_rescan = false; - private int jj_gc = 0; - - /** Constructor with user supplied CharStream. */ - public HTMLParser(CharStream stream) { - token_source = new HTMLParserTokenManager(stream); - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - /** Reinitialise. */ - public void ReInit(CharStream stream) { - token_source.ReInit(stream); - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - /** Constructor with generated Token Manager. */ - public HTMLParser(HTMLParserTokenManager tm) { - token_source = tm; - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - /** Reinitialise. */ - public void ReInit(HTMLParserTokenManager tm) { - token_source = tm; - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - private Token jj_consume_token(int kind) throws ParseException { - Token oldToken; - if ((oldToken = token).next != null) token = token.next; - else token = token.next = token_source.getNextToken(); - jj_ntk = -1; - if (token.kind == kind) { - jj_gen++; - if (++jj_gc > 100) { - jj_gc = 0; - for (int i = 0; i < jj_2_rtns.length; i++) { - JJCalls c = jj_2_rtns[i]; - while (c != null) { - if (c.gen < jj_gen) c.first = null; - c = c.next; - } - } - } - return token; - } - token = oldToken; - jj_kind = kind; - throw generateParseException(); - } - - static private final class LookaheadSuccess extends java.lang.Error { } - final private LookaheadSuccess jj_ls = new LookaheadSuccess(); - private boolean jj_scan_token(int kind) { - if (jj_scanpos == jj_lastpos) { - jj_la--; - if (jj_scanpos.next == null) { - jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken(); - } else { - jj_lastpos = jj_scanpos = jj_scanpos.next; - } - } else { - jj_scanpos = jj_scanpos.next; - } - if (jj_rescan) { - int i = 0; Token tok = token; - while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; } - if (tok != null) jj_add_error_token(kind, i); - } - if (jj_scanpos.kind != kind) return true; - if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls; - return false; - } - - -/** Get the next Token. */ - final public Token getNextToken() { - if (token.next != null) token = token.next; - else token = token.next = token_source.getNextToken(); - jj_ntk = -1; - jj_gen++; - return token; - } - -/** Get the specific Token. */ - final public Token getToken(int index) { - Token t = token; - for (int i = 0; i < index; i++) { - if (t.next != null) t = t.next; - else t = t.next = token_source.getNextToken(); - } - return t; - } - - private int jj_ntk() { - if ((jj_nt=token.next) == null) - return (jj_ntk = (token.next=token_source.getNextToken()).kind); - else - return (jj_ntk = jj_nt.kind); - } - - private java.util.List jj_expentries = new java.util.ArrayList(); - private int[] jj_expentry; - private int jj_kind = -1; - private int[] jj_lasttokens = new int[100]; - private int jj_endpos; - - private void jj_add_error_token(int kind, int pos) { - if (pos >= 100) return; - if (pos == jj_endpos + 1) { - jj_lasttokens[jj_endpos++] = kind; - } else if (jj_endpos != 0) { - jj_expentry = new int[jj_endpos]; - for (int i = 0; i < jj_endpos; i++) { - jj_expentry[i] = jj_lasttokens[i]; - } - jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) { - int[] oldentry = (int[])(it.next()); - if (oldentry.length == jj_expentry.length) { - for (int i = 0; i < jj_expentry.length; i++) { - if (oldentry[i] != jj_expentry[i]) { - continue jj_entries_loop; - } - } - jj_expentries.add(jj_expentry); - break jj_entries_loop; - } - } - if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind; - } - } - - /** Generate ParseException. */ - public ParseException generateParseException() { - jj_expentries.clear(); - boolean[] la1tokens = new boolean[31]; - if (jj_kind >= 0) { - la1tokens[jj_kind] = true; - jj_kind = -1; - } - for (int i = 0; i < 14; i++) { - if (jj_la1[i] == jj_gen) { - for (int j = 0; j < 32; j++) { - if ((jj_la1_0[i] & (1< jj_gen) { - jj_la = p.arg; jj_lastpos = jj_scanpos = p.first; - switch (i) { - case 0: jj_3_1(); break; - case 1: jj_3_2(); break; - } - } - p = p.next; - } while (p != null); - } catch(LookaheadSuccess ls) { } - } - jj_rescan = false; - } - - private void jj_save(int index, int xla) { - JJCalls p = jj_2_rtns[index]; - while (p.gen > jj_gen) { - if (p.next == null) { p = p.next = new JJCalls(); break; } - p = p.next; - } - p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla; - } - - static final class JJCalls { - int gen; - Token first; - int arg; - JJCalls next; - } - -// void handleException(Exception e) { -// System.out.println(e.toString()); // print the error message -// System.out.println("Skipping..."); -// Token t; -// do { -// t = getNextToken(); -// } while (t.kind != TagEnd); -// } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj (working copy) @@ -1,394 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// HTMLParser.jj - -options { - STATIC = false; - //DEBUG_LOOKAHEAD = true; - //DEBUG_TOKEN_MANAGER = true; - UNICODE_INPUT = true; - USER_CHAR_STREAM=true; -} - -PARSER_BEGIN(HTMLParser) - -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -import java.io.*; -import java.util.Locale; -import java.util.Properties; - -/** - * Basic html parser (for demo/testing purposes only!) - */ -public class HTMLParser { - public static int SUMMARY_LENGTH = 200; - - StringBuffer title = new StringBuffer(SUMMARY_LENGTH); - StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); - Properties metaTags=new Properties(); - String currentMetaTag=null; - String currentMetaContent=null; - int length = 0; - boolean titleComplete = false; - boolean inTitle = false; - boolean inMetaTag = false; - boolean inStyle = false; - boolean afterTag = false; - boolean afterSpace = false; - String eol = System.getProperty("line.separator"); - Reader pipeIn = null; - Writer pipeOut; - private MyPipedInputStream pipeInStream = null; - private PipedOutputStream pipeOutStream = null; - - public HTMLParser(Reader reader) { - this(new FastCharStream(reader)); - } - - private class MyPipedInputStream extends PipedInputStream{ - - public MyPipedInputStream(){ - super(); - } - - public MyPipedInputStream(PipedOutputStream src) throws IOException{ - super(src); - } - - public boolean full() throws IOException{ - return this.available() >= PipedInputStream.PIPE_SIZE; - } - } - - public String getTitle() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return title.toString().trim(); - } - - public Properties getMetaTags() throws IOException, -InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return metaTags; - } - - - public String getSummary() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full()) - break; - wait(10); - } - } - if (summary.length() > SUMMARY_LENGTH) - summary.setLength(SUMMARY_LENGTH); - - String sum = summary.toString().trim(); - String tit = getTitle(); - if (sum.equals("")) - return tit; - else - return sum; - } - - public Reader getReader() throws IOException { - if (pipeIn == null) { - pipeInStream = new MyPipedInputStream(); - pipeOutStream = new PipedOutputStream(pipeInStream); - pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE"); - pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE"); - - Thread thread = new ParserThread(this); - thread.start(); // start parsing - } - - return pipeIn; - } - - void addToSummary(String text) { - if (summary.length() < SUMMARY_LENGTH) { - summary.append(text); - if (summary.length() >= SUMMARY_LENGTH) { - synchronized(this) { - notifyAll(); - } - } - } - } - - void addText(String text) throws IOException { - if (inStyle) - return; - if (inTitle) - title.append(text); - else { - addToSummary(text); - if (!titleComplete && !(title.length() == 0)) { // finished title - synchronized(this) { - titleComplete = true; // tell waiting threads - notifyAll(); - } - } - } - - length += text.length(); - pipeOut.write(text); - - afterSpace = false; - } - - void addMetaTag() { - metaTags.setProperty(currentMetaTag, currentMetaContent); - currentMetaTag = null; - currentMetaContent = null; - return; - } - - void addSpace() throws IOException { - if (!afterSpace) { - if (inTitle) - title.append(" "); - else - addToSummary(" "); - - String space = afterTag ? eol : " "; - length += space.length(); - pipeOut.write(space); - afterSpace = true; - } - } - -// void handleException(Exception e) { -// System.out.println(e.toString()); // print the error message -// System.out.println("Skipping..."); -// Token t; -// do { -// t = getNextToken(); -// } while (t.kind != TagEnd); -// } -} - -PARSER_END(HTMLParser) - - -void HTMLDocument() throws IOException : -{ - Token t; -} -{ -// try { - ( Tag() { afterTag = true; } - | t=Decl() { afterTag = true; } - | CommentTag() { afterTag = true; } - | ScriptTag() { afterTag = true; } - | t= { addText(t.image); afterTag = false; } - | t= { addText(Entities.decode(t.image)); afterTag = false; } - | t= { addText(t.image); afterTag = false; } - | { addSpace(); afterTag = false; } - )* -// } catch (ParseException e) { -// handleException(e); -// } -} - -void Tag() throws IOException : -{ - Token t1, t2; - boolean inImg = false; -} -{ - t1= { - String tagName = t1.image.toLowerCase(Locale.ROOT); - if(Tags.WS_ELEMS.contains(tagName) ) { - addSpace(); - } - inTitle = tagName.equalsIgnoreCase(" - inMetaTag = tagName.equalsIgnoreCase(" - inStyle = tagName.equalsIgnoreCase(" - inImg = tagName.equalsIgnoreCase(" - } - (t1= - ( - (t2=ArgValue() // save ALT text in IMG tag - { - if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) - addText("[" + t2.image + "]"); - - if(inMetaTag && - ( t1.image.equalsIgnoreCase("name") || - t1.image.equalsIgnoreCase("HTTP-EQUIV") - ) - && t2 != null) - { - currentMetaTag=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != -null) - { - currentMetaContent=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - } - )? - )? - )* - -} - -Token ArgValue() : -{ - Token t = null; -} -{ - t= { return t; } -| LOOKAHEAD(2) - { return t; } -| t= { return t; } -| LOOKAHEAD(2) - { return t; } -| t= { return t; } -} - - -Token Decl() : -{ - Token t; -} -{ - t= ( | ArgValue() | )* - { return t; } -} - - -void CommentTag() : -{} -{ - ( ( )* ) - | - ( ( )* ) -} - -void ScriptTag() : -{} -{ - ( )* -} - - -TOKEN : -{ - < ScriptStart: " : WithinScript -| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] ()? > : WithinTag -| < DeclName: "<" "!" ["A"-"Z","a"-"z"] ()? > : WithinTag - -| < Comment1: "" > : DEFAULT -} - - TOKEN : -{ - < CommentText2: (~[">"])+ > -| < CommentEnd2: ">" > : DEFAULT -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParserThread.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParserThread.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParserThread.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.*; - -class ParserThread extends Thread { - HTMLParser parser; - - ParserThread(HTMLParser p) { - parser = p; - } - - @Override - public void run() { // convert pipeOut to pipeIn - try { - try { // parse document to pipeOut - parser.HTMLDocument(); - } catch (ParseException e) { - System.out.println("Parse Aborted: " + e.getMessage()); - } catch (TokenMgrError e) { - System.out.println("Parse Aborted: " + e.getMessage()); - } finally { - parser.pipeOut.close(); - synchronized (parser) { - parser.summary.setLength(HTMLParser.SUMMARY_LENGTH); - parser.titleComplete = true; - parser.notifyAll(); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java (working copy) @@ -1,330 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.HashMap; -import java.util.Map; - -/** - * Utility class for encoding and decoding HTML entities. - */ -public class Entities { - static final Map decoder = new HashMap(300); - static final String[] encoder = new String[0x100]; - - static final String decode(String entity) { - if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon - entity = entity.substring(0, entity.length()-1); - if (entity.charAt(1) == '#') { - int start = 2; - int radix = 10; - if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') { - start++; - radix = 16; - } - Character c = - new Character((char)Integer.parseInt(entity.substring(start), radix)); - return c.toString(); - } else { - String s = decoder.get(entity); - if (s != null) - return s; - else return ""; - } - } - - public static final String encode(String s) { - int length = s.length(); - StringBuffer buffer = new StringBuffer(length * 2); - for (int i = 0; i < length; i++) { - int j = s.charAt(i); - if (j < 0x100 && encoder[j] != null) { - buffer.append(encoder[j]); // have a named encoding - buffer.append(';'); - } else if (j < 0x80) { - buffer.append((char) j); // use ASCII value - } else { - buffer.append("&#"); // use numeric encoding - buffer.append(j).append(';'); - } - } - return buffer.toString(); - } - - static final void add(String entity, int value) { - decoder.put(entity, (new Character((char)value)).toString()); - if (value < 0x100) - encoder[value] = entity; - } - - static { - add(" ", 160); - add("¡", 161); - add("¢", 162); - add("£", 163); - add("¤", 164); - add("¥", 165); - add("¦", 166); - add("§", 167); - add("¨", 168); - add("©", 169); - add("ª", 170); - add("«", 171); - add("¬", 172); - add("­", 173); - add("®", 174); - add("¯", 175); - add("°", 176); - add("±", 177); - add("²", 178); - add("³", 179); - add("´", 180); - add("µ", 181); - add("¶", 182); - add("·", 183); - add("¸", 184); - add("¹", 185); - add("º", 186); - add("»", 187); - add("¼", 188); - add("½", 189); - add("¾", 190); - add("¿", 191); - add("À", 192); - add("Á", 193); - add("Â", 194); - add("Ã", 195); - add("Ä", 196); - add("Å", 197); - add("Æ", 198); - add("Ç", 199); - add("È", 200); - add("É", 201); - add("Ê", 202); - add("Ë", 203); - add("Ì", 204); - add("Í", 205); - add("Î", 206); - add("Ï", 207); - add("Ð", 208); - add("Ñ", 209); - add("Ò", 210); - add("Ó", 211); - add("Ô", 212); - add("Õ", 213); - add("Ö", 214); - add("×", 215); - add("Ø", 216); - add("Ù", 217); - add("Ú", 218); - add("Û", 219); - add("Ü", 220); - add("Ý", 221); - add("Þ", 222); - add("ß", 223); - add("à", 224); - add("á", 225); - add("â", 226); - add("ã", 227); - add("ä", 228); - add("å", 229); - add("æ", 230); - add("ç", 231); - add("è", 232); - add("é", 233); - add("ê", 234); - add("ë", 235); - add("ì", 236); - add("í", 237); - add("î", 238); - add("ï", 239); - add("ð", 240); - add("ñ", 241); - add("ò", 242); - add("ó", 243); - add("ô", 244); - add("õ", 245); - add("ö", 246); - add("÷", 247); - add("ø", 248); - add("ù", 249); - add("ú", 250); - add("û", 251); - add("ü", 252); - add("ý", 253); - add("þ", 254); - add("ÿ", 255); - add("&fnof", 402); - add("&Alpha", 913); - add("&Beta", 914); - add("&Gamma", 915); - add("&Delta", 916); - add("&Epsilon",917); - add("&Zeta", 918); - add("&Eta", 919); - add("&Theta", 920); - add("&Iota", 921); - add("&Kappa", 922); - add("&Lambda", 923); - add("&Mu", 924); - add("&Nu", 925); - add("&Xi", 926); - add("&Omicron",927); - add("&Pi", 928); - add("&Rho", 929); - add("&Sigma", 931); - add("&Tau", 932); - add("&Upsilon",933); - add("&Phi", 934); - add("&Chi", 935); - add("&Psi", 936); - add("&Omega", 937); - add("&alpha", 945); - add("&beta", 946); - add("&gamma", 947); - add("&delta", 948); - add("&epsilon",949); - add("&zeta", 950); - add("&eta", 951); - add("&theta", 952); - add("&iota", 953); - add("&kappa", 954); - add("&lambda", 955); - add("&mu", 956); - add("&nu", 957); - add("&xi", 958); - add("&omicron",959); - add("&pi", 960); - add("&rho", 961); - add("&sigmaf", 962); - add("&sigma", 963); - add("&tau", 964); - add("&upsilon",965); - add("&phi", 966); - add("&chi", 967); - add("&psi", 968); - add("&omega", 969); - add("&thetasym",977); - add("&upsih", 978); - add("&piv", 982); - add("&bull", 8226); - add("&hellip", 8230); - add("&prime", 8242); - add("&Prime", 8243); - add("&oline", 8254); - add("&frasl", 8260); - add("&weierp", 8472); - add("&image", 8465); - add("&real", 8476); - add("&trade", 8482); - add("&alefsym",8501); - add("&larr", 8592); - add("&uarr", 8593); - add("&rarr", 8594); - add("&darr", 8595); - add("&harr", 8596); - add("&crarr", 8629); - add("&lArr", 8656); - add("&uArr", 8657); - add("&rArr", 8658); - add("&dArr", 8659); - add("&hArr", 8660); - add("&forall", 8704); - add("&part", 8706); - add("&exist", 8707); - add("&empty", 8709); - add("&nabla", 8711); - add("&isin", 8712); - add("¬in", 8713); - add("&ni", 8715); - add("&prod", 8719); - add("&sum", 8721); - add("&minus", 8722); - add("&lowast", 8727); - add("&radic", 8730); - add("&prop", 8733); - add("&infin", 8734); - add("&ang", 8736); - add("&and", 8743); - add("&or", 8744); - add("&cap", 8745); - add("&cup", 8746); - add("&int", 8747); - add("&there4", 8756); - add("&sim", 8764); - add("&cong", 8773); - add("&asymp", 8776); - add("&ne", 8800); - add("&equiv", 8801); - add("&le", 8804); - add("&ge", 8805); - add("&sub", 8834); - add("&sup", 8835); - add("&nsub", 8836); - add("&sube", 8838); - add("&supe", 8839); - add("&oplus", 8853); - add("&otimes", 8855); - add("&perp", 8869); - add("&sdot", 8901); - add("&lceil", 8968); - add("&rceil", 8969); - add("&lfloor", 8970); - add("&rfloor", 8971); - add("&lang", 9001); - add("&rang", 9002); - add("&loz", 9674); - add("&spades", 9824); - add("&clubs", 9827); - add("&hearts", 9829); - add("&diams", 9830); - add(""", 34); - add("&", 38); - add("<", 60); - add(">", 62); - add("&OElig", 338); - add("&oelig", 339); - add("&Scaron", 352); - add("&scaron", 353); - add("&Yuml", 376); - add("&circ", 710); - add("&tilde", 732); - add("&ensp", 8194); - add("&emsp", 8195); - add("&thinsp", 8201); - add("&zwnj", 8204); - add("&zwj", 8205); - add("&lrm", 8206); - add("&rlm", 8207); - add("&ndash", 8211); - add("&mdash", 8212); - add("&lsquo", 8216); - add("&rsquo", 8217); - add("&sbquo", 8218); - add("&ldquo", 8220); - add("&rdquo", 8221); - add("&bdquo", 8222); - add("&dagger", 8224); - add("&Dagger", 8225); - add("&permil", 8240); - add("&lsaquo", 8249); - add("&rsaquo", 8250); - add("&euro", 8364); - - } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (working copy) @@ -1,123 +0,0 @@ -// FastCharStream.java -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -import java.io.*; - -/** An efficient implementation of JavaCC's CharStream interface.

Note that - * this does not do line-number counting, but instead keeps track of the - * character position of the token in the input, as required by Lucene's {@link - * org.apache.lucene.analysis.Token} API. - * */ -public final class FastCharStream implements CharStream { - char[] buffer = null; - - int bufferLength = 0; // end of valid chars - int bufferPosition = 0; // next char to read - - int tokenStart = 0; // offset in buffer - int bufferStart = 0; // position in file of buffer - - Reader input; // source of chars - - /** Constructs from a Reader. */ - public FastCharStream(Reader r) { - input = r; - } - - public final char readChar() throws IOException { - if (bufferPosition >= bufferLength) - refill(); - return buffer[bufferPosition++]; - } - - private final void refill() throws IOException { - int newPosition = bufferLength - tokenStart; - - if (tokenStart == 0) { // token won't fit in buffer - if (buffer == null) { // first time: alloc buffer - buffer = new char[2048]; - } else if (bufferLength == buffer.length) { // grow buffer - char[] newBuffer = new char[buffer.length*2]; - System.arraycopy(buffer, 0, newBuffer, 0, bufferLength); - buffer = newBuffer; - } - } else { // shift token to front - System.arraycopy(buffer, tokenStart, buffer, 0, newPosition); - } - - bufferLength = newPosition; // update state - bufferPosition = newPosition; - bufferStart += tokenStart; - tokenStart = 0; - - int charsRead = // fill space in buffer - input.read(buffer, newPosition, buffer.length-newPosition); - if (charsRead == -1) - throw new IOException("read past eof"); - else - bufferLength += charsRead; - } - - public final char BeginToken() throws IOException { - tokenStart = bufferPosition; - return readChar(); - } - - public final void backup(int amount) { - bufferPosition -= amount; - } - - public final String GetImage() { - return new String(buffer, tokenStart, bufferPosition - tokenStart); - } - - public final char[] GetSuffix(int len) { - char[] value = new char[len]; - System.arraycopy(buffer, bufferPosition - len, value, 0, len); - return value; - } - - public final void Done() { - try { - input.close(); - } catch (IOException e) { - } - } - - public final int getColumn() { - return bufferStart + bufferPosition; - } - public final int getLine() { - return 1; - } - public final int getEndColumn() { - return bufferStart + bufferPosition; - } - public final int getEndLine() { - return 1; - } - public final int getBeginColumn() { - return bufferStart + tokenStart; - } - public final int getBeginLine() { - return 1; - } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java (working copy) @@ -1,198 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */ -/* JavaCCOptions:KEEP_LINE_COL=null */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/** - * This exception is thrown when parse errors are encountered. - * You can explicitly create objects of this exception type by - * calling the method generateParseException in the generated - * parser. - * - * You can modify this class to customize your error reporting - * mechanisms so long as you retain the public fields. - */ -public class ParseException extends Exception { - - /** - * This constructor is used by the method "generateParseException" - * in the generated parser. Calling this constructor generates - * a new object of this type with the fields "currentToken", - * "expectedTokenSequences", and "tokenImage" set. The boolean - * flag "specialConstructor" is also set to true to indicate that - * this constructor was used to create this object. - * This constructor calls its super class with the empty string - * to force the "toString" method of parent class "Throwable" to - * print the error message in the form: - * ParseException: - */ - public ParseException(Token currentTokenVal, - int[][] expectedTokenSequencesVal, - String[] tokenImageVal - ) - { - super(""); - specialConstructor = true; - currentToken = currentTokenVal; - expectedTokenSequences = expectedTokenSequencesVal; - tokenImage = tokenImageVal; - } - - /** - * The following constructors are for use by you for whatever - * purpose you can think of. Constructing the exception in this - * manner makes the exception behave in the normal way - i.e., as - * documented in the class "Throwable". The fields "errorToken", - * "expectedTokenSequences", and "tokenImage" do not contain - * relevant information. The JavaCC generated code does not use - * these constructors. - */ - - public ParseException() { - super(); - specialConstructor = false; - } - - /** Constructor with message. */ - public ParseException(String message) { - super(message); - specialConstructor = false; - } - - /** - * This variable determines which constructor was used to create - * this object and thereby affects the semantics of the - * "getMessage" method (see below). - */ - protected boolean specialConstructor; - - /** - * This is the last token that has been consumed successfully. If - * this object has been created due to a parse error, the token - * followng this token will (therefore) be the first error token. - */ - public Token currentToken; - - /** - * Each entry in this array is an array of integers. Each array - * of integers represents a sequence of tokens (by their ordinal - * values) that is expected at this point of the parse. - */ - public int[][] expectedTokenSequences; - - /** - * This is a reference to the "tokenImage" array of the generated - * parser within which the parse error occurred. This array is - * defined in the generated ...Constants interface. - */ - public String[] tokenImage; - - /** - * This method has the standard behavior when this object has been - * created using the standard constructors. Otherwise, it uses - * "currentToken" and "expectedTokenSequences" to generate a parse - * error message and returns it. If this object has been created - * due to a parse error, and you do not catch it (it gets thrown - * from the parser), then this method is called during the printing - * of the final stack trace, and hence the correct error message - * gets displayed. - */ - public String getMessage() { - if (!specialConstructor) { - return super.getMessage(); - } - StringBuffer expected = new StringBuffer(); - int maxSize = 0; - for (int i = 0; i < expectedTokenSequences.length; i++) { - if (maxSize < expectedTokenSequences[i].length) { - maxSize = expectedTokenSequences[i].length; - } - for (int j = 0; j < expectedTokenSequences[i].length; j++) { - expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' '); - } - if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) { - expected.append("..."); - } - expected.append(eol).append(" "); - } - String retval = "Encountered \""; - Token tok = currentToken.next; - for (int i = 0; i < maxSize; i++) { - if (i != 0) retval += " "; - if (tok.kind == 0) { - retval += tokenImage[0]; - break; - } - retval += " " + tokenImage[tok.kind]; - retval += " \""; - retval += add_escapes(tok.image); - retval += " \""; - tok = tok.next; - } - retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn; - retval += "." + eol; - if (expectedTokenSequences.length == 1) { - retval += "Was expecting:" + eol + " "; - } else { - retval += "Was expecting one of:" + eol + " "; - } - retval += expected.toString(); - return retval; - } - - /** - * The end of line string for this machine. - */ - protected String eol = System.getProperty("line.separator", "\n"); - - /** - * Used to convert raw characters to their escaped version - * when these raw version cannot be used as part of an ASCII - * string literal. - */ - protected String add_escapes(String str) { - StringBuffer retval = new StringBuffer(); - char ch; - for (int i = 0; i < str.length(); i++) { - switch (str.charAt(i)) - { - case 0 : - continue; - case '\b': - retval.append("\\b"); - continue; - case '\t': - retval.append("\\t"); - continue; - case '\n': - retval.append("\\n"); - continue; - case '\f': - retval.append("\\f"); - continue; - case '\r': - retval.append("\\r"); - continue; - case '\"': - retval.append("\\\""); - continue; - case '\'': - retval.append("\\\'"); - continue; - case '\\': - retval.append("\\\\"); - continue; - default: - if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { - String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u" + s.substring(s.length() - 4, s.length())); - } else { - retval.append(ch); - } - continue; - } - } - return retval.toString(); - } - -} -/* JavaCC - OriginalChecksum=e449d0e43f3d85deb1260a88b7e90fcd (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (working copy) @@ -1,112 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */ -/* JavaCCOptions:STATIC=false */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/** - * This interface describes a character stream that maintains line and - * column number positions of the characters. It also has the capability - * to backup the stream to some extent. An implementation of this - * interface is used in the TokenManager implementation generated by - * JavaCCParser. - * - * All the methods except backup can be implemented in any fashion. backup - * needs to be implemented correctly for the correct operation of the lexer. - * Rest of the methods are all used to get information like line number, - * column number and the String that constitutes a token and are not used - * by the lexer. Hence their implementation won't affect the generated lexer's - * operation. - */ - -public interface CharStream { - - /** - * Returns the next character from the selected input. The method - * of selecting the input is the responsibility of the class - * implementing this interface. Can throw any java.io.IOException. - */ - char readChar() throws java.io.IOException; - - /** - * Returns the column position of the character last read. - * @deprecated - * @see #getEndColumn - */ - int getColumn(); - - /** - * Returns the line number of the character last read. - * @deprecated - * @see #getEndLine - */ - int getLine(); - - /** - * Returns the column number of the last character for current token (being - * matched after the last call to BeginTOken). - */ - int getEndColumn(); - - /** - * Returns the line number of the last character for current token (being - * matched after the last call to BeginTOken). - */ - int getEndLine(); - - /** - * Returns the column number of the first character for current token (being - * matched after the last call to BeginTOken). - */ - int getBeginColumn(); - - /** - * Returns the line number of the first character for current token (being - * matched after the last call to BeginTOken). - */ - int getBeginLine(); - - /** - * Backs up the input stream by amount steps. Lexer calls this method if it - * had already read some characters, but could not use them to match a - * (longer) token. So, they will be used again as the prefix of the next - * token and it is the implemetation's responsibility to do this right. - */ - void backup(int amount); - - /** - * Returns the next character that marks the beginning of the next token. - * All characters must remain in the buffer between two successive calls - * to this method to implement backup correctly. - */ - char BeginToken() throws java.io.IOException; - - /** - * Returns a string made up of characters from the marked token beginning - * to the current buffer position. Implementations have the choice of returning - * anything that they want to. For example, for efficiency, one might decide - * to just return null, which is a valid implementation. - */ - String GetImage(); - - /** - * Returns an array of characters that make up the suffix of length 'len' for - * the currently matched token. This is used to build up the matched string - * for use in actions in the case of MORE. A simple and inefficient - * implementation of this is as follows : - * - * { - * String t = GetImage(); - * return t.substring(t.length() - len, t.length()).toCharArray(); - * } - */ - char[] GetSuffix(int len); - - /** - * The lexer calls this function to indicate that it is done with the stream - * and hence implementations can free any resources held by this class. - * Again, the body of this function can be just empty and it will not - * affect the lexer's operation. - */ - void Done(); - -} -/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/CharStream.java (working copy) @@ -1,112 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 4.1 */ -/* JavaCCOptions:STATIC=false */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/** - * This interface describes a character stream that maintains line and - * column number positions of the characters. It also has the capability - * to backup the stream to some extent. An implementation of this - * interface is used in the TokenManager implementation generated by - * JavaCCParser. - * - * All the methods except backup can be implemented in any fashion. backup - * needs to be implemented correctly for the correct operation of the lexer. - * Rest of the methods are all used to get information like line number, - * column number and the String that constitutes a token and are not used - * by the lexer. Hence their implementation won't affect the generated lexer's - * operation. - */ - -public interface CharStream { - - /** - * Returns the next character from the selected input. The method - * of selecting the input is the responsibility of the class - * implementing this interface. Can throw any java.io.IOException. - */ - char readChar() throws java.io.IOException; - - /** - * Returns the column position of the character last read. - * @deprecated - * @see #getEndColumn - */ - int getColumn(); - - /** - * Returns the line number of the character last read. - * @deprecated - * @see #getEndLine - */ - int getLine(); - - /** - * Returns the column number of the last character for current token (being - * matched after the last call to BeginTOken). - */ - int getEndColumn(); - - /** - * Returns the line number of the last character for current token (being - * matched after the last call to BeginTOken). - */ - int getEndLine(); - - /** - * Returns the column number of the first character for current token (being - * matched after the last call to BeginTOken). - */ - int getBeginColumn(); - - /** - * Returns the line number of the first character for current token (being - * matched after the last call to BeginTOken). - */ - int getBeginLine(); - - /** - * Backs up the input stream by amount steps. Lexer calls this method if it - * had already read some characters, but could not use them to match a - * (longer) token. So, they will be used again as the prefix of the next - * token and it is the implemetation's responsibility to do this right. - */ - void backup(int amount); - - /** - * Returns the next character that marks the beginning of the next token. - * All characters must remain in the buffer between two successive calls - * to this method to implement backup correctly. - */ - char BeginToken() throws java.io.IOException; - - /** - * Returns a string made up of characters from the marked token beginning - * to the current buffer position. Implementations have the choice of returning - * anything that they want to. For example, for efficiency, one might decide - * to just return null, which is a valid implementation. - */ - String GetImage(); - - /** - * Returns an array of characters that make up the suffix of length 'len' for - * the currently matched token. This is used to build up the matched string - * for use in actions in the case of MORE. A simple and inefficient - * implementation of this is as follows : - * - * { - * String t = GetImage(); - * return t.substring(t.length() - len, t.length()).toCharArray(); - * } - */ - char[] GetSuffix(int len); - - /** - * The lexer calls this function to indicate that it is done with the stream - * and hence implementations can free any resources held by this class. - * Again, the body of this function can be just empty and it will not - * affect the lexer's operation. - */ - void Done(); - -} -/* JavaCC - OriginalChecksum=e26d9399cd34335f985e19c1fa86c11b (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Entities.java (working copy) @@ -1,330 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.HashMap; -import java.util.Map; - -/** - * Utility class for encoding and decoding HTML entities. - */ -public class Entities { - static final Map decoder = new HashMap(300); - static final String[] encoder = new String[0x100]; - - static final String decode(String entity) { - if (entity.charAt(entity.length()-1) == ';') // remove trailing semicolon - entity = entity.substring(0, entity.length()-1); - if (entity.charAt(1) == '#') { - int start = 2; - int radix = 10; - if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') { - start++; - radix = 16; - } - Character c = - new Character((char)Integer.parseInt(entity.substring(start), radix)); - return c.toString(); - } else { - String s = decoder.get(entity); - if (s != null) - return s; - else return ""; - } - } - - public static final String encode(String s) { - int length = s.length(); - StringBuffer buffer = new StringBuffer(length * 2); - for (int i = 0; i < length; i++) { - int j = s.charAt(i); - if (j < 0x100 && encoder[j] != null) { - buffer.append(encoder[j]); // have a named encoding - buffer.append(';'); - } else if (j < 0x80) { - buffer.append((char) j); // use ASCII value - } else { - buffer.append("&#"); // use numeric encoding - buffer.append(j).append(';'); - } - } - return buffer.toString(); - } - - static final void add(String entity, int value) { - decoder.put(entity, (new Character((char)value)).toString()); - if (value < 0x100) - encoder[value] = entity; - } - - static { - add(" ", 160); - add("¡", 161); - add("¢", 162); - add("£", 163); - add("¤", 164); - add("¥", 165); - add("¦", 166); - add("§", 167); - add("¨", 168); - add("©", 169); - add("ª", 170); - add("«", 171); - add("¬", 172); - add("­", 173); - add("®", 174); - add("¯", 175); - add("°", 176); - add("±", 177); - add("²", 178); - add("³", 179); - add("´", 180); - add("µ", 181); - add("¶", 182); - add("·", 183); - add("¸", 184); - add("¹", 185); - add("º", 186); - add("»", 187); - add("¼", 188); - add("½", 189); - add("¾", 190); - add("¿", 191); - add("À", 192); - add("Á", 193); - add("Â", 194); - add("Ã", 195); - add("Ä", 196); - add("Å", 197); - add("Æ", 198); - add("Ç", 199); - add("È", 200); - add("É", 201); - add("Ê", 202); - add("Ë", 203); - add("Ì", 204); - add("Í", 205); - add("Î", 206); - add("Ï", 207); - add("Ð", 208); - add("Ñ", 209); - add("Ò", 210); - add("Ó", 211); - add("Ô", 212); - add("Õ", 213); - add("Ö", 214); - add("×", 215); - add("Ø", 216); - add("Ù", 217); - add("Ú", 218); - add("Û", 219); - add("Ü", 220); - add("Ý", 221); - add("Þ", 222); - add("ß", 223); - add("à", 224); - add("á", 225); - add("â", 226); - add("ã", 227); - add("ä", 228); - add("å", 229); - add("æ", 230); - add("ç", 231); - add("è", 232); - add("é", 233); - add("ê", 234); - add("ë", 235); - add("ì", 236); - add("í", 237); - add("î", 238); - add("ï", 239); - add("ð", 240); - add("ñ", 241); - add("ò", 242); - add("ó", 243); - add("ô", 244); - add("õ", 245); - add("ö", 246); - add("÷", 247); - add("ø", 248); - add("ù", 249); - add("ú", 250); - add("û", 251); - add("ü", 252); - add("ý", 253); - add("þ", 254); - add("ÿ", 255); - add("&fnof", 402); - add("&Alpha", 913); - add("&Beta", 914); - add("&Gamma", 915); - add("&Delta", 916); - add("&Epsilon",917); - add("&Zeta", 918); - add("&Eta", 919); - add("&Theta", 920); - add("&Iota", 921); - add("&Kappa", 922); - add("&Lambda", 923); - add("&Mu", 924); - add("&Nu", 925); - add("&Xi", 926); - add("&Omicron",927); - add("&Pi", 928); - add("&Rho", 929); - add("&Sigma", 931); - add("&Tau", 932); - add("&Upsilon",933); - add("&Phi", 934); - add("&Chi", 935); - add("&Psi", 936); - add("&Omega", 937); - add("&alpha", 945); - add("&beta", 946); - add("&gamma", 947); - add("&delta", 948); - add("&epsilon",949); - add("&zeta", 950); - add("&eta", 951); - add("&theta", 952); - add("&iota", 953); - add("&kappa", 954); - add("&lambda", 955); - add("&mu", 956); - add("&nu", 957); - add("&xi", 958); - add("&omicron",959); - add("&pi", 960); - add("&rho", 961); - add("&sigmaf", 962); - add("&sigma", 963); - add("&tau", 964); - add("&upsilon",965); - add("&phi", 966); - add("&chi", 967); - add("&psi", 968); - add("&omega", 969); - add("&thetasym",977); - add("&upsih", 978); - add("&piv", 982); - add("&bull", 8226); - add("&hellip", 8230); - add("&prime", 8242); - add("&Prime", 8243); - add("&oline", 8254); - add("&frasl", 8260); - add("&weierp", 8472); - add("&image", 8465); - add("&real", 8476); - add("&trade", 8482); - add("&alefsym",8501); - add("&larr", 8592); - add("&uarr", 8593); - add("&rarr", 8594); - add("&darr", 8595); - add("&harr", 8596); - add("&crarr", 8629); - add("&lArr", 8656); - add("&uArr", 8657); - add("&rArr", 8658); - add("&dArr", 8659); - add("&hArr", 8660); - add("&forall", 8704); - add("&part", 8706); - add("&exist", 8707); - add("&empty", 8709); - add("&nabla", 8711); - add("&isin", 8712); - add("¬in", 8713); - add("&ni", 8715); - add("&prod", 8719); - add("&sum", 8721); - add("&minus", 8722); - add("&lowast", 8727); - add("&radic", 8730); - add("&prop", 8733); - add("&infin", 8734); - add("&ang", 8736); - add("&and", 8743); - add("&or", 8744); - add("&cap", 8745); - add("&cup", 8746); - add("&int", 8747); - add("&there4", 8756); - add("&sim", 8764); - add("&cong", 8773); - add("&asymp", 8776); - add("&ne", 8800); - add("&equiv", 8801); - add("&le", 8804); - add("&ge", 8805); - add("&sub", 8834); - add("&sup", 8835); - add("&nsub", 8836); - add("&sube", 8838); - add("&supe", 8839); - add("&oplus", 8853); - add("&otimes", 8855); - add("&perp", 8869); - add("&sdot", 8901); - add("&lceil", 8968); - add("&rceil", 8969); - add("&lfloor", 8970); - add("&rfloor", 8971); - add("&lang", 9001); - add("&rang", 9002); - add("&loz", 9674); - add("&spades", 9824); - add("&clubs", 9827); - add("&hearts", 9829); - add("&diams", 9830); - add(""", 34); - add("&", 38); - add("<", 60); - add(">", 62); - add("&OElig", 338); - add("&oelig", 339); - add("&Scaron", 352); - add("&scaron", 353); - add("&Yuml", 376); - add("&circ", 710); - add("&tilde", 732); - add("&ensp", 8194); - add("&emsp", 8195); - add("&thinsp", 8201); - add("&zwnj", 8204); - add("&zwj", 8205); - add("&lrm", 8206); - add("&rlm", 8207); - add("&ndash", 8211); - add("&mdash", 8212); - add("&lsquo", 8216); - add("&rsquo", 8217); - add("&sbquo", 8218); - add("&ldquo", 8220); - add("&rdquo", 8221); - add("&bdquo", 8222); - add("&dagger", 8224); - add("&Dagger", 8225); - add("&permil", 8240); - add("&lsaquo", 8249); - add("&rsaquo", 8250); - add("&euro", 8364); - - } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/FastCharStream.java (working copy) @@ -1,123 +0,0 @@ -// FastCharStream.java -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -import java.io.*; - -/** An efficient implementation of JavaCC's CharStream interface.

Note that - * this does not do line-number counting, but instead keeps track of the - * character position of the token in the input, as required by Lucene's {@link - * org.apache.lucene.analysis.Token} API. - * */ -public final class FastCharStream implements CharStream { - char[] buffer = null; - - int bufferLength = 0; // end of valid chars - int bufferPosition = 0; // next char to read - - int tokenStart = 0; // offset in buffer - int bufferStart = 0; // position in file of buffer - - Reader input; // source of chars - - /** Constructs from a Reader. */ - public FastCharStream(Reader r) { - input = r; - } - - public final char readChar() throws IOException { - if (bufferPosition >= bufferLength) - refill(); - return buffer[bufferPosition++]; - } - - private final void refill() throws IOException { - int newPosition = bufferLength - tokenStart; - - if (tokenStart == 0) { // token won't fit in buffer - if (buffer == null) { // first time: alloc buffer - buffer = new char[2048]; - } else if (bufferLength == buffer.length) { // grow buffer - char[] newBuffer = new char[buffer.length*2]; - System.arraycopy(buffer, 0, newBuffer, 0, bufferLength); - buffer = newBuffer; - } - } else { // shift token to front - System.arraycopy(buffer, tokenStart, buffer, 0, newPosition); - } - - bufferLength = newPosition; // update state - bufferPosition = newPosition; - bufferStart += tokenStart; - tokenStart = 0; - - int charsRead = // fill space in buffer - input.read(buffer, newPosition, buffer.length-newPosition); - if (charsRead == -1) - throw new IOException("read past eof"); - else - bufferLength += charsRead; - } - - public final char BeginToken() throws IOException { - tokenStart = bufferPosition; - return readChar(); - } - - public final void backup(int amount) { - bufferPosition -= amount; - } - - public final String GetImage() { - return new String(buffer, tokenStart, bufferPosition - tokenStart); - } - - public final char[] GetSuffix(int len) { - char[] value = new char[len]; - System.arraycopy(buffer, bufferPosition - len, value, 0, len); - return value; - } - - public final void Done() { - try { - input.close(); - } catch (IOException e) { - } - } - - public final int getColumn() { - return bufferStart + bufferPosition; - } - public final int getLine() { - return 1; - } - public final int getEndColumn() { - return bufferStart + bufferPosition; - } - public final int getEndLine() { - return 1; - } - public final int getBeginColumn() { - return bufferStart + tokenStart; - } - public final int getBeginLine() { - return 1; - } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.java (working copy) @@ -1,722 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -import java.io.*; -import java.util.Locale; -import java.util.Properties; - -/** - * Basic html parser (for demo/testing purposes only!) - */ -public class HTMLParser implements HTMLParserConstants { - public static int SUMMARY_LENGTH = 200; - - StringBuffer title = new StringBuffer(SUMMARY_LENGTH); - StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); - Properties metaTags=new Properties(); - String currentMetaTag=null; - String currentMetaContent=null; - int length = 0; - boolean titleComplete = false; - boolean inTitle = false; - boolean inMetaTag = false; - boolean inStyle = false; - boolean afterTag = false; - boolean afterSpace = false; - String eol = System.getProperty("line.separator"); - Reader pipeIn = null; - Writer pipeOut; - private MyPipedInputStream pipeInStream = null; - private PipedOutputStream pipeOutStream = null; - - public HTMLParser(Reader reader) { - this(new FastCharStream(reader)); - } - - private class MyPipedInputStream extends PipedInputStream{ - - public MyPipedInputStream(){ - super(); - } - - public MyPipedInputStream(PipedOutputStream src) throws IOException{ - super(src); - } - - public boolean full() throws IOException{ - return this.available() >= PipedInputStream.PIPE_SIZE; - } - } - - public String getTitle() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return title.toString().trim(); - } - - public Properties getMetaTags() throws IOException, -InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return metaTags; - } - - - public String getSummary() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full()) - break; - wait(10); - } - } - if (summary.length() > SUMMARY_LENGTH) - summary.setLength(SUMMARY_LENGTH); - - String sum = summary.toString().trim(); - String tit = getTitle(); - if (sum.equals("")) - return tit; - else - return sum; - } - - public Reader getReader() throws IOException { - if (pipeIn == null) { - pipeInStream = new MyPipedInputStream(); - pipeOutStream = new PipedOutputStream(pipeInStream); - pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE"); - pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE"); - - Thread thread = new ParserThread(this); - thread.start(); // start parsing - } - - return pipeIn; - } - - void addToSummary(String text) { - if (summary.length() < SUMMARY_LENGTH) { - summary.append(text); - if (summary.length() >= SUMMARY_LENGTH) { - synchronized(this) { - notifyAll(); - } - } - } - } - - void addText(String text) throws IOException { - if (inStyle) - return; - if (inTitle) - title.append(text); - else { - addToSummary(text); - if (!titleComplete && !(title.length() == 0)) { // finished title - synchronized(this) { - titleComplete = true; // tell waiting threads - notifyAll(); - } - } - } - - length += text.length(); - pipeOut.write(text); - - afterSpace = false; - } - - void addMetaTag() { - metaTags.setProperty(currentMetaTag, currentMetaContent); - currentMetaTag = null; - currentMetaContent = null; - return; - } - - void addSpace() throws IOException { - if (!afterSpace) { - if (inTitle) - title.append(" "); - else - addToSummary(" "); - - String space = afterTag ? eol : " "; - length += space.length(); - pipeOut.write(space); - afterSpace = true; - } - } - - final public void HTMLDocument() throws ParseException, IOException { - Token t; - label_1: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ScriptStart: - case TagName: - case DeclName: - case Comment1: - case Comment2: - case Word: - case Entity: - case Space: - case Punct: - ; - break; - default: - jj_la1[0] = jj_gen; - break label_1; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case TagName: - Tag(); - afterTag = true; - break; - case DeclName: - t = Decl(); - afterTag = true; - break; - case Comment1: - case Comment2: - CommentTag(); - afterTag = true; - break; - case ScriptStart: - ScriptTag(); - afterTag = true; - break; - case Word: - t = jj_consume_token(Word); - addText(t.image); afterTag = false; - break; - case Entity: - t = jj_consume_token(Entity); - addText(Entities.decode(t.image)); afterTag = false; - break; - case Punct: - t = jj_consume_token(Punct); - addText(t.image); afterTag = false; - break; - case Space: - jj_consume_token(Space); - addSpace(); afterTag = false; - break; - default: - jj_la1[1] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - jj_consume_token(0); - } - - final public void Tag() throws ParseException, IOException { - Token t1, t2; - boolean inImg = false; - t1 = jj_consume_token(TagName); - String tagName = t1.image.toLowerCase(Locale.ROOT); - if(Tags.WS_ELEMS.contains(tagName) ) { - addSpace(); - } - inTitle = tagName.equalsIgnoreCase(" - inMetaTag = tagName.equalsIgnoreCase(" - inStyle = tagName.equalsIgnoreCase(" - inImg = tagName.equalsIgnoreCase(" - - label_2: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgName: - ; - break; - default: - jj_la1[2] = jj_gen; - break label_2; - } - t1 = jj_consume_token(ArgName); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgEquals: - jj_consume_token(ArgEquals); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgValue: - case ArgQuote1: - case ArgQuote2: - t2 = ArgValue(); - if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) - addText("[" + t2.image + "]"); - - if(inMetaTag && - ( t1.image.equalsIgnoreCase("name") || - t1.image.equalsIgnoreCase("HTTP-EQUIV") - ) - && t2 != null) - { - currentMetaTag=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != -null) - { - currentMetaContent=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - break; - default: - jj_la1[3] = jj_gen; - ; - } - break; - default: - jj_la1[4] = jj_gen; - ; - } - } - jj_consume_token(TagEnd); - } - - final public Token ArgValue() throws ParseException { - Token t = null; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgValue: - t = jj_consume_token(ArgValue); - {if (true) return t;} - break; - default: - jj_la1[5] = jj_gen; - if (jj_2_1(2)) { - jj_consume_token(ArgQuote1); - jj_consume_token(CloseQuote1); - {if (true) return t;} - } else { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgQuote1: - jj_consume_token(ArgQuote1); - t = jj_consume_token(Quote1Text); - jj_consume_token(CloseQuote1); - {if (true) return t;} - break; - default: - jj_la1[6] = jj_gen; - if (jj_2_2(2)) { - jj_consume_token(ArgQuote2); - jj_consume_token(CloseQuote2); - {if (true) return t;} - } else { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgQuote2: - jj_consume_token(ArgQuote2); - t = jj_consume_token(Quote2Text); - jj_consume_token(CloseQuote2); - {if (true) return t;} - break; - default: - jj_la1[7] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - } - } - } - throw new Error("Missing return statement in function"); - } - - final public Token Decl() throws ParseException { - Token t; - t = jj_consume_token(DeclName); - label_3: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgName: - case ArgEquals: - case ArgValue: - case ArgQuote1: - case ArgQuote2: - ; - break; - default: - jj_la1[8] = jj_gen; - break label_3; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ArgName: - jj_consume_token(ArgName); - break; - case ArgValue: - case ArgQuote1: - case ArgQuote2: - ArgValue(); - break; - case ArgEquals: - jj_consume_token(ArgEquals); - break; - default: - jj_la1[9] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - jj_consume_token(TagEnd); - {if (true) return t;} - throw new Error("Missing return statement in function"); - } - - final public void CommentTag() throws ParseException { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case Comment1: - jj_consume_token(Comment1); - label_4: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case CommentText1: - ; - break; - default: - jj_la1[10] = jj_gen; - break label_4; - } - jj_consume_token(CommentText1); - } - jj_consume_token(CommentEnd1); - break; - case Comment2: - jj_consume_token(Comment2); - label_5: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case CommentText2: - ; - break; - default: - jj_la1[11] = jj_gen; - break label_5; - } - jj_consume_token(CommentText2); - } - jj_consume_token(CommentEnd2); - break; - default: - jj_la1[12] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - } - - final public void ScriptTag() throws ParseException { - jj_consume_token(ScriptStart); - label_6: - while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ScriptText: - ; - break; - default: - jj_la1[13] = jj_gen; - break label_6; - } - jj_consume_token(ScriptText); - } - jj_consume_token(ScriptEnd); - } - - private boolean jj_2_1(int xla) { - jj_la = xla; jj_lastpos = jj_scanpos = token; - try { return !jj_3_1(); } - catch(LookaheadSuccess ls) { return true; } - finally { jj_save(0, xla); } - } - - private boolean jj_2_2(int xla) { - jj_la = xla; jj_lastpos = jj_scanpos = token; - try { return !jj_3_2(); } - catch(LookaheadSuccess ls) { return true; } - finally { jj_save(1, xla); } - } - - private boolean jj_3_2() { - if (jj_scan_token(ArgQuote2)) return true; - if (jj_scan_token(CloseQuote2)) return true; - return false; - } - - private boolean jj_3_1() { - if (jj_scan_token(ArgQuote1)) return true; - if (jj_scan_token(CloseQuote1)) return true; - return false; - } - - /** Generated Token Manager. */ - public HTMLParserTokenManager token_source; - /** Current token. */ - public Token token; - /** Next token. */ - public Token jj_nt; - private int jj_ntk; - private Token jj_scanpos, jj_lastpos; - private int jj_la; - private int jj_gen; - final private int[] jj_la1 = new int[14]; - static private int[] jj_la1_0; - static { - jj_la1_init_0(); - } - private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x2c7e,0x2c7e,0x10000,0x380000,0x20000,0x80000,0x100000,0x200000,0x3b0000,0x3b0000,0x8000000,0x20000000,0x30,0x4000,}; - } - final private JJCalls[] jj_2_rtns = new JJCalls[2]; - private boolean jj_rescan = false; - private int jj_gc = 0; - - /** Constructor with user supplied CharStream. */ - public HTMLParser(CharStream stream) { - token_source = new HTMLParserTokenManager(stream); - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - /** Reinitialise. */ - public void ReInit(CharStream stream) { - token_source.ReInit(stream); - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - /** Constructor with generated Token Manager. */ - public HTMLParser(HTMLParserTokenManager tm) { - token_source = tm; - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - /** Reinitialise. */ - public void ReInit(HTMLParserTokenManager tm) { - token_source = tm; - token = new Token(); - jj_ntk = -1; - jj_gen = 0; - for (int i = 0; i < 14; i++) jj_la1[i] = -1; - for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); - } - - private Token jj_consume_token(int kind) throws ParseException { - Token oldToken; - if ((oldToken = token).next != null) token = token.next; - else token = token.next = token_source.getNextToken(); - jj_ntk = -1; - if (token.kind == kind) { - jj_gen++; - if (++jj_gc > 100) { - jj_gc = 0; - for (int i = 0; i < jj_2_rtns.length; i++) { - JJCalls c = jj_2_rtns[i]; - while (c != null) { - if (c.gen < jj_gen) c.first = null; - c = c.next; - } - } - } - return token; - } - token = oldToken; - jj_kind = kind; - throw generateParseException(); - } - - static private final class LookaheadSuccess extends java.lang.Error { } - final private LookaheadSuccess jj_ls = new LookaheadSuccess(); - private boolean jj_scan_token(int kind) { - if (jj_scanpos == jj_lastpos) { - jj_la--; - if (jj_scanpos.next == null) { - jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken(); - } else { - jj_lastpos = jj_scanpos = jj_scanpos.next; - } - } else { - jj_scanpos = jj_scanpos.next; - } - if (jj_rescan) { - int i = 0; Token tok = token; - while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; } - if (tok != null) jj_add_error_token(kind, i); - } - if (jj_scanpos.kind != kind) return true; - if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls; - return false; - } - - -/** Get the next Token. */ - final public Token getNextToken() { - if (token.next != null) token = token.next; - else token = token.next = token_source.getNextToken(); - jj_ntk = -1; - jj_gen++; - return token; - } - -/** Get the specific Token. */ - final public Token getToken(int index) { - Token t = token; - for (int i = 0; i < index; i++) { - if (t.next != null) t = t.next; - else t = t.next = token_source.getNextToken(); - } - return t; - } - - private int jj_ntk() { - if ((jj_nt=token.next) == null) - return (jj_ntk = (token.next=token_source.getNextToken()).kind); - else - return (jj_ntk = jj_nt.kind); - } - - private java.util.List jj_expentries = new java.util.ArrayList(); - private int[] jj_expentry; - private int jj_kind = -1; - private int[] jj_lasttokens = new int[100]; - private int jj_endpos; - - private void jj_add_error_token(int kind, int pos) { - if (pos >= 100) return; - if (pos == jj_endpos + 1) { - jj_lasttokens[jj_endpos++] = kind; - } else if (jj_endpos != 0) { - jj_expentry = new int[jj_endpos]; - for (int i = 0; i < jj_endpos; i++) { - jj_expentry[i] = jj_lasttokens[i]; - } - jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) { - int[] oldentry = (int[])(it.next()); - if (oldentry.length == jj_expentry.length) { - for (int i = 0; i < jj_expentry.length; i++) { - if (oldentry[i] != jj_expentry[i]) { - continue jj_entries_loop; - } - } - jj_expentries.add(jj_expentry); - break jj_entries_loop; - } - } - if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind; - } - } - - /** Generate ParseException. */ - public ParseException generateParseException() { - jj_expentries.clear(); - boolean[] la1tokens = new boolean[31]; - if (jj_kind >= 0) { - la1tokens[jj_kind] = true; - jj_kind = -1; - } - for (int i = 0; i < 14; i++) { - if (jj_la1[i] == jj_gen) { - for (int j = 0; j < 32; j++) { - if ((jj_la1_0[i] & (1< jj_gen) { - jj_la = p.arg; jj_lastpos = jj_scanpos = p.first; - switch (i) { - case 0: jj_3_1(); break; - case 1: jj_3_2(); break; - } - } - p = p.next; - } while (p != null); - } catch(LookaheadSuccess ls) { } - } - jj_rescan = false; - } - - private void jj_save(int index, int xla) { - JJCalls p = jj_2_rtns[index]; - while (p.gen > jj_gen) { - if (p.next == null) { p = p.next = new JJCalls(); break; } - p = p.next; - } - p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla; - } - - static final class JJCalls { - int gen; - Token first; - int arg; - JJCalls next; - } - -// void handleException(Exception e) { -// System.out.println(e.toString()); // print the error message -// System.out.println("Skipping..."); -// Token t; -// do { -// t = getNextToken(); -// } while (t.kind != TagEnd); -// } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj (working copy) @@ -1,394 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// HTMLParser.jj - -options { - STATIC = false; - //DEBUG_LOOKAHEAD = true; - //DEBUG_TOKEN_MANAGER = true; - UNICODE_INPUT = true; - USER_CHAR_STREAM=true; -} - -PARSER_BEGIN(HTMLParser) - -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -import java.io.*; -import java.util.Locale; -import java.util.Properties; - -/** - * Basic html parser (for demo/testing purposes only!) - */ -public class HTMLParser { - public static int SUMMARY_LENGTH = 200; - - StringBuffer title = new StringBuffer(SUMMARY_LENGTH); - StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2); - Properties metaTags=new Properties(); - String currentMetaTag=null; - String currentMetaContent=null; - int length = 0; - boolean titleComplete = false; - boolean inTitle = false; - boolean inMetaTag = false; - boolean inStyle = false; - boolean afterTag = false; - boolean afterSpace = false; - String eol = System.getProperty("line.separator"); - Reader pipeIn = null; - Writer pipeOut; - private MyPipedInputStream pipeInStream = null; - private PipedOutputStream pipeOutStream = null; - - public HTMLParser(Reader reader) { - this(new FastCharStream(reader)); - } - - private class MyPipedInputStream extends PipedInputStream{ - - public MyPipedInputStream(){ - super(); - } - - public MyPipedInputStream(PipedOutputStream src) throws IOException{ - super(src); - } - - public boolean full() throws IOException{ - return this.available() >= PipedInputStream.PIPE_SIZE; - } - } - - public String getTitle() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return title.toString().trim(); - } - - public Properties getMetaTags() throws IOException, -InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (titleComplete || pipeInStream.full()) - break; - wait(10); - } - } - return metaTags; - } - - - public String getSummary() throws IOException, InterruptedException { - if (pipeIn == null) - getReader(); // spawn parsing thread - while (true) { - synchronized(this) { - if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full()) - break; - wait(10); - } - } - if (summary.length() > SUMMARY_LENGTH) - summary.setLength(SUMMARY_LENGTH); - - String sum = summary.toString().trim(); - String tit = getTitle(); - if (sum.equals("")) - return tit; - else - return sum; - } - - public Reader getReader() throws IOException { - if (pipeIn == null) { - pipeInStream = new MyPipedInputStream(); - pipeOutStream = new PipedOutputStream(pipeInStream); - pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE"); - pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE"); - - Thread thread = new ParserThread(this); - thread.start(); // start parsing - } - - return pipeIn; - } - - void addToSummary(String text) { - if (summary.length() < SUMMARY_LENGTH) { - summary.append(text); - if (summary.length() >= SUMMARY_LENGTH) { - synchronized(this) { - notifyAll(); - } - } - } - } - - void addText(String text) throws IOException { - if (inStyle) - return; - if (inTitle) - title.append(text); - else { - addToSummary(text); - if (!titleComplete && !(title.length() == 0)) { // finished title - synchronized(this) { - titleComplete = true; // tell waiting threads - notifyAll(); - } - } - } - - length += text.length(); - pipeOut.write(text); - - afterSpace = false; - } - - void addMetaTag() { - metaTags.setProperty(currentMetaTag, currentMetaContent); - currentMetaTag = null; - currentMetaContent = null; - return; - } - - void addSpace() throws IOException { - if (!afterSpace) { - if (inTitle) - title.append(" "); - else - addToSummary(" "); - - String space = afterTag ? eol : " "; - length += space.length(); - pipeOut.write(space); - afterSpace = true; - } - } - -// void handleException(Exception e) { -// System.out.println(e.toString()); // print the error message -// System.out.println("Skipping..."); -// Token t; -// do { -// t = getNextToken(); -// } while (t.kind != TagEnd); -// } -} - -PARSER_END(HTMLParser) - - -void HTMLDocument() throws IOException : -{ - Token t; -} -{ -// try { - ( Tag() { afterTag = true; } - | t=Decl() { afterTag = true; } - | CommentTag() { afterTag = true; } - | ScriptTag() { afterTag = true; } - | t= { addText(t.image); afterTag = false; } - | t= { addText(Entities.decode(t.image)); afterTag = false; } - | t= { addText(t.image); afterTag = false; } - | { addSpace(); afterTag = false; } - )* -// } catch (ParseException e) { -// handleException(e); -// } -} - -void Tag() throws IOException : -{ - Token t1, t2; - boolean inImg = false; -} -{ - t1= { - String tagName = t1.image.toLowerCase(Locale.ROOT); - if(Tags.WS_ELEMS.contains(tagName) ) { - addSpace(); - } - inTitle = tagName.equalsIgnoreCase(" - inMetaTag = tagName.equalsIgnoreCase(" - inStyle = tagName.equalsIgnoreCase(" - inImg = tagName.equalsIgnoreCase(" - } - (t1= - ( - (t2=ArgValue() // save ALT text in IMG tag - { - if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) - addText("[" + t2.image + "]"); - - if(inMetaTag && - ( t1.image.equalsIgnoreCase("name") || - t1.image.equalsIgnoreCase("HTTP-EQUIV") - ) - && t2 != null) - { - currentMetaTag=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != -null) - { - currentMetaContent=t2.image.toLowerCase(Locale.ROOT); - if(currentMetaTag != null && currentMetaContent != null) { - addMetaTag(); - } - } - } - )? - )? - )* - -} - -Token ArgValue() : -{ - Token t = null; -} -{ - t= { return t; } -| LOOKAHEAD(2) - { return t; } -| t= { return t; } -| LOOKAHEAD(2) - { return t; } -| t= { return t; } -} - - -Token Decl() : -{ - Token t; -} -{ - t= ( | ArgValue() | )* - { return t; } -} - - -void CommentTag() : -{} -{ - ( ( )* ) - | - ( ( )* ) -} - -void ScriptTag() : -{} -{ - ( )* -} - - -TOKEN : -{ - < ScriptStart: " : WithinScript -| < TagName: "<" ("/")? ["A"-"Z","a"-"z"] ()? > : WithinTag -| < DeclName: "<" "!" ["A"-"Z","a"-"z"] ()? > : WithinTag - -| < Comment1: "" > : DEFAULT -} - - TOKEN : -{ - < CommentText2: (~[">"])+ > -| < CommentEnd2: ">" > : DEFAULT -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserConstants.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserConstants.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserConstants.java (working copy) @@ -1,124 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. HTMLParserConstants.java */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - - -/** - * Token literal values and constants. - * Generated by org.javacc.parser.OtherFilesGen#start() - */ -public interface HTMLParserConstants { - - /** End of File. */ - int EOF = 0; - /** RegularExpression Id. */ - int ScriptStart = 1; - /** RegularExpression Id. */ - int TagName = 2; - /** RegularExpression Id. */ - int DeclName = 3; - /** RegularExpression Id. */ - int Comment1 = 4; - /** RegularExpression Id. */ - int Comment2 = 5; - /** RegularExpression Id. */ - int Word = 6; - /** RegularExpression Id. */ - int LET = 7; - /** RegularExpression Id. */ - int NUM = 8; - /** RegularExpression Id. */ - int HEX = 9; - /** RegularExpression Id. */ - int Entity = 10; - /** RegularExpression Id. */ - int Space = 11; - /** RegularExpression Id. */ - int SP = 12; - /** RegularExpression Id. */ - int Punct = 13; - /** RegularExpression Id. */ - int ScriptText = 14; - /** RegularExpression Id. */ - int ScriptEnd = 15; - /** RegularExpression Id. */ - int ArgName = 16; - /** RegularExpression Id. */ - int ArgEquals = 17; - /** RegularExpression Id. */ - int TagEnd = 18; - /** RegularExpression Id. */ - int ArgValue = 19; - /** RegularExpression Id. */ - int ArgQuote1 = 20; - /** RegularExpression Id. */ - int ArgQuote2 = 21; - /** RegularExpression Id. */ - int Quote1Text = 23; - /** RegularExpression Id. */ - int CloseQuote1 = 24; - /** RegularExpression Id. */ - int Quote2Text = 25; - /** RegularExpression Id. */ - int CloseQuote2 = 26; - /** RegularExpression Id. */ - int CommentText1 = 27; - /** RegularExpression Id. */ - int CommentEnd1 = 28; - /** RegularExpression Id. */ - int CommentText2 = 29; - /** RegularExpression Id. */ - int CommentEnd2 = 30; - - /** Lexical state. */ - int DEFAULT = 0; - /** Lexical state. */ - int WithinScript = 1; - /** Lexical state. */ - int WithinTag = 2; - /** Lexical state. */ - int AfterEquals = 3; - /** Lexical state. */ - int WithinQuote1 = 4; - /** Lexical state. */ - int WithinQuote2 = 5; - /** Lexical state. */ - int WithinComment1 = 6; - /** Lexical state. */ - int WithinComment2 = 7; - - /** Literal token values. */ - String[] tokenImage = { - "", - "\"", - "", - "\"\"", - "", - "\">\"", - }; - -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserTokenManager.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserTokenManager.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParserTokenManager.java (working copy) @@ -1,1657 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; -import java.io.*; -import java.util.Locale; -import java.util.Properties; - -/** Token Manager. */ -public class HTMLParserTokenManager implements HTMLParserConstants -{ - - /** Debug output. */ - public java.io.PrintStream debugStream = System.out; - /** Set debug output. */ - public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; } -private final int jjStopStringLiteralDfa_0(int pos, long active0) -{ - switch (pos) - { - case 0: - if ((active0 & 0x32L) != 0L) - return 20; - return -1; - case 1: - if ((active0 & 0x2L) != 0L) - { - if (jjmatchedPos != 1) - { - jjmatchedKind = 2; - jjmatchedPos = 1; - } - return 22; - } - if ((active0 & 0x30L) != 0L) - return 25; - return -1; - case 2: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 2; - return 23; - } - return -1; - case 3: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 3; - return 23; - } - return -1; - case 4: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 4; - return 23; - } - return -1; - case 5: - if ((active0 & 0x2L) != 0L) - { - jjmatchedKind = 2; - jjmatchedPos = 5; - return 23; - } - return -1; - default : - return -1; - } -} -private final int jjStartNfa_0(int pos, long active0) -{ - return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0), pos + 1); -} -private int jjStopAtPos(int pos, int kind) -{ - jjmatchedKind = kind; - jjmatchedPos = pos; - return pos + 1; -} -private int jjMoveStringLiteralDfa0_0() -{ - switch(curChar) - { - case 60: - return jjMoveStringLiteralDfa1_0(0x32L); - default : - return jjMoveNfa_0(11, 0); - } -} -private int jjMoveStringLiteralDfa1_0(long active0) -{ - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(0, active0); - return 1; - } - switch(curChar) - { - case 33: - if ((active0 & 0x20L) != 0L) - { - jjmatchedKind = 5; - jjmatchedPos = 1; - } - return jjMoveStringLiteralDfa2_0(active0, 0x10L); - case 115: - return jjMoveStringLiteralDfa2_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(0, active0); -} -private int jjMoveStringLiteralDfa2_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(0, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(1, active0); - return 2; - } - switch(curChar) - { - case 45: - return jjMoveStringLiteralDfa3_0(active0, 0x10L); - case 99: - return jjMoveStringLiteralDfa3_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(1, active0); -} -private int jjMoveStringLiteralDfa3_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(1, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(2, active0); - return 3; - } - switch(curChar) - { - case 45: - if ((active0 & 0x10L) != 0L) - return jjStopAtPos(3, 4); - break; - case 114: - return jjMoveStringLiteralDfa4_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(2, active0); -} -private int jjMoveStringLiteralDfa4_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(2, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(3, active0); - return 4; - } - switch(curChar) - { - case 105: - return jjMoveStringLiteralDfa5_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(3, active0); -} -private int jjMoveStringLiteralDfa5_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(3, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(4, active0); - return 5; - } - switch(curChar) - { - case 112: - return jjMoveStringLiteralDfa6_0(active0, 0x2L); - default : - break; - } - return jjStartNfa_0(4, active0); -} -private int jjMoveStringLiteralDfa6_0(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_0(4, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_0(5, active0); - return 6; - } - switch(curChar) - { - case 116: - if ((active0 & 0x2L) != 0L) - return jjStartNfaWithStates_0(6, 1, 23); - break; - default : - break; - } - return jjStartNfa_0(5, active0); -} -private int jjStartNfaWithStates_0(int pos, int kind, int state) -{ - jjmatchedKind = kind; - jjmatchedPos = pos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return pos + 1; } - return jjMoveNfa_0(state, pos + 1); -} -static final long[] jjbitVec0 = { - 0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL -}; -static final long[] jjbitVec2 = { - 0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL -}; -private int jjMoveNfa_0(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 28; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 20: - if (curChar == 33) - jjstateSet[jjnewStateCnt++] = 25; - else if (curChar == 47) - jjCheckNAdd(21); - break; - case 11: - if ((0x3ff000000000000L & l) != 0L) - jjCheckNAddTwoStates(7, 2); - else if ((0x100002600L & l) != 0L) - { - if (kind > 11) - kind = 11; - jjCheckNAdd(10); - } - else if (curChar == 60) - jjCheckNAddStates(0, 2); - else if (curChar == 38) - jjAddStates(3, 5); - else if (curChar == 36) - jjstateSet[jjnewStateCnt++] = 1; - if ((0x3ff000000000000L & l) != 0L) - { - if (kind > 6) - kind = 6; - jjCheckNAddStates(6, 10); - } - break; - case 0: - if (curChar == 36) - jjstateSet[jjnewStateCnt++] = 1; - break; - case 1: - if ((0x3ff000000000000L & l) != 0L) - jjCheckNAdd(2); - break; - case 2: - if ((0x500000000000L & l) != 0L) - jjstateSet[jjnewStateCnt++] = 3; - break; - case 3: - case 9: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(11, 13); - break; - case 4: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(6, 10); - break; - case 5: - if ((0x880000000000L & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(14, 17); - break; - case 6: - if ((0x3ff000000000000L & l) != 0L) - jjCheckNAddTwoStates(7, 2); - break; - case 7: - if (curChar != 34) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(11, 13); - break; - case 8: - if ((0x208000000000L & l) != 0L) - jjstateSet[jjnewStateCnt++] = 9; - break; - case 10: - if ((0x100002600L & l) == 0L) - break; - kind = 11; - jjCheckNAdd(10); - break; - case 13: - if (curChar == 59 && kind > 10) - kind = 10; - break; - case 14: - if (curChar == 35) - jjCheckNAdd(15); - break; - case 15: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(15, 13); - break; - case 16: - if (curChar == 35) - jjstateSet[jjnewStateCnt++] = 17; - break; - case 18: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(18, 13); - break; - case 19: - if (curChar == 60) - jjCheckNAddStates(0, 2); - break; - case 22: - if ((0x9fffff7affffd9ffL & l) == 0L) - break; - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 23: - if ((0x9ffffffeffffd9ffL & l) == 0L) - break; - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 24: - if (curChar == 33) - jjstateSet[jjnewStateCnt++] = 25; - break; - case 26: - if ((0x9fffff7affffd9ffL & l) == 0L) - break; - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - case 27: - if ((0x9ffffffeffffd9ffL & l) == 0L) - break; - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 20: - case 21: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 2) - kind = 2; - jjstateSet[jjnewStateCnt++] = 22; - break; - case 11: - case 4: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(6, 10); - break; - case 9: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 6) - kind = 6; - jjCheckNAddStates(11, 13); - break; - case 12: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(12, 13); - break; - case 17: - if ((0x100000001000000L & l) != 0L) - jjCheckNAdd(18); - break; - case 18: - if ((0x7e0000007eL & l) == 0L) - break; - if (kind > 10) - kind = 10; - jjCheckNAddTwoStates(18, 13); - break; - case 22: - case 23: - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 25: - if ((0x7fffffe07fffffeL & l) == 0L) - break; - if (kind > 3) - kind = 3; - jjstateSet[jjnewStateCnt++] = 26; - break; - case 26: - case 27: - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 22: - case 23: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 2) - kind = 2; - jjCheckNAdd(23); - break; - case 26: - case 27: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 3) - kind = 3; - jjCheckNAdd(27); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 28 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private int jjMoveStringLiteralDfa0_5() -{ - return jjMoveNfa_5(1, 0); -} -private int jjMoveNfa_5(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 2; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xfffffffbffffffffL & l) != 0L) - { - if (kind > 25) - kind = 25; - jjCheckNAdd(0); - } - else if (curChar == 34) - { - if (kind > 26) - kind = 26; - } - break; - case 0: - if ((0xfffffffbffffffffL & l) == 0L) - break; - kind = 25; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - kind = 25; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 25) - kind = 25; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_7(int pos, long active0) -{ - switch (pos) - { - default : - return -1; - } -} -private final int jjStartNfa_7(int pos, long active0) -{ - return jjMoveNfa_7(jjStopStringLiteralDfa_7(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_7() -{ - switch(curChar) - { - case 62: - return jjStopAtPos(0, 30); - default : - return jjMoveNfa_7(0, 0); - } -} -private int jjMoveNfa_7(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 1; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 0: - if ((0xbfffffffffffffffL & l) == 0L) - break; - kind = 29; - jjstateSet[jjnewStateCnt++] = 0; - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - kind = 29; - jjstateSet[jjnewStateCnt++] = 0; - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 29) - kind = 29; - jjstateSet[jjnewStateCnt++] = 0; - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 1 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private int jjMoveStringLiteralDfa0_4() -{ - return jjMoveNfa_4(1, 0); -} -private int jjMoveNfa_4(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 2; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xffffff7fffffffffL & l) != 0L) - { - if (kind > 23) - kind = 23; - jjCheckNAdd(0); - } - else if (curChar == 39) - { - if (kind > 24) - kind = 24; - } - break; - case 0: - if ((0xffffff7fffffffffL & l) == 0L) - break; - kind = 23; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - kind = 23; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 23) - kind = 23; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_3(int pos, long active0) -{ - switch (pos) - { - default : - return -1; - } -} -private final int jjStartNfa_3(int pos, long active0) -{ - return jjMoveNfa_3(jjStopStringLiteralDfa_3(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_3() -{ - switch(curChar) - { - case 34: - return jjStopAtPos(0, 21); - case 39: - return jjStopAtPos(0, 20); - default : - return jjMoveNfa_3(0, 0); - } -} -private int jjMoveNfa_3(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 3; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 0: - if ((0x9fffff7affffd9ffL & l) != 0L) - { - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - } - else if ((0x100002600L & l) != 0L) - { - if (kind > 22) - kind = 22; - jjCheckNAdd(2); - } - break; - case 1: - if ((0xbffffffeffffd9ffL & l) == 0L) - break; - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - break; - case 2: - if ((0x100002600L & l) == 0L) - break; - kind = 22; - jjCheckNAdd(2); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 19) - kind = 19; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 3 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_6(int pos, long active0) -{ - switch (pos) - { - case 0: - if ((active0 & 0x10000000L) != 0L) - { - jjmatchedKind = 27; - return -1; - } - return -1; - case 1: - if ((active0 & 0x10000000L) != 0L) - { - if (jjmatchedPos == 0) - { - jjmatchedKind = 27; - jjmatchedPos = 0; - } - return -1; - } - return -1; - default : - return -1; - } -} -private final int jjStartNfa_6(int pos, long active0) -{ - return jjMoveNfa_6(jjStopStringLiteralDfa_6(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_6() -{ - switch(curChar) - { - case 45: - return jjMoveStringLiteralDfa1_6(0x10000000L); - default : - return jjMoveNfa_6(1, 0); - } -} -private int jjMoveStringLiteralDfa1_6(long active0) -{ - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_6(0, active0); - return 1; - } - switch(curChar) - { - case 45: - return jjMoveStringLiteralDfa2_6(active0, 0x10000000L); - default : - break; - } - return jjStartNfa_6(0, active0); -} -private int jjMoveStringLiteralDfa2_6(long old0, long active0) -{ - if (((active0 &= old0)) == 0L) - return jjStartNfa_6(0, old0); - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { - jjStopStringLiteralDfa_6(1, active0); - return 2; - } - switch(curChar) - { - case 62: - if ((active0 & 0x10000000L) != 0L) - return jjStopAtPos(2, 28); - break; - default : - break; - } - return jjStartNfa_6(1, active0); -} -private int jjMoveNfa_6(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 2; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xffffdfffffffffffL & l) != 0L) - { - if (kind > 27) - kind = 27; - jjCheckNAdd(0); - } - else if (curChar == 45) - { - if (kind > 27) - kind = 27; - } - break; - case 0: - if ((0xffffdfffffffffffL & l) == 0L) - break; - kind = 27; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - kind = 27; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 27) - kind = 27; - jjCheckNAdd(0); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private int jjMoveStringLiteralDfa0_1() -{ - return jjMoveNfa_1(1, 0); -} -private int jjMoveNfa_1(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 12; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 1: - if ((0xafffffffffffffffL & l) != 0L) - { - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - } - else if ((0x5000000000000000L & l) != 0L) - { - if (kind > 14) - kind = 14; - } - if (curChar == 60) - jjstateSet[jjnewStateCnt++] = 10; - break; - case 0: - if ((0xafffffffffffffffL & l) == 0L) - break; - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - break; - case 3: - if ((0xafffffffffffffffL & l) != 0L) - jjAddStates(18, 19); - break; - case 4: - if (curChar == 62 && kind > 15) - kind = 15; - break; - case 10: - if (curChar == 47) - jjstateSet[jjnewStateCnt++] = 9; - break; - case 11: - if (curChar == 60) - jjstateSet[jjnewStateCnt++] = 10; - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - break; - case 2: - if (curChar == 116) - jjCheckNAddTwoStates(3, 4); - break; - case 3: - jjCheckNAddTwoStates(3, 4); - break; - case 5: - if (curChar == 112) - jjstateSet[jjnewStateCnt++] = 2; - break; - case 6: - if (curChar == 105) - jjstateSet[jjnewStateCnt++] = 5; - break; - case 7: - if (curChar == 114) - jjstateSet[jjnewStateCnt++] = 6; - break; - case 8: - if (curChar == 99) - jjstateSet[jjnewStateCnt++] = 7; - break; - case 9: - if (curChar == 115) - jjstateSet[jjnewStateCnt++] = 8; - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 1: - case 0: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 14) - kind = 14; - jjCheckNAdd(0); - break; - case 3: - if (jjCanMove_0(hiByte, i1, i2, l1, l2)) - jjAddStates(18, 19); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 12 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -private final int jjStopStringLiteralDfa_2(int pos, long active0) -{ - switch (pos) - { - default : - return -1; - } -} -private final int jjStartNfa_2(int pos, long active0) -{ - return jjMoveNfa_2(jjStopStringLiteralDfa_2(pos, active0), pos + 1); -} -private int jjMoveStringLiteralDfa0_2() -{ - switch(curChar) - { - case 34: - return jjStopAtPos(0, 21); - case 39: - return jjStopAtPos(0, 20); - case 61: - return jjStartNfaWithStates_2(0, 17, 3); - default : - return jjMoveNfa_2(0, 0); - } -} -private int jjStartNfaWithStates_2(int pos, int kind, int state) -{ - jjmatchedKind = kind; - jjmatchedPos = pos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return pos + 1; } - return jjMoveNfa_2(state, pos + 1); -} -private int jjMoveNfa_2(int startState, int curPos) -{ - int startsAt = 0; - jjnewStateCnt = 6; - int i = 1; - jjstateSet[0] = startState; - int kind = 0x7fffffff; - for (;;) - { - if (++jjround == 0x7fffffff) - ReInitRounds(); - if (curChar < 64) - { - long l = 1L << curChar; - do - { - switch(jjstateSet[--i]) - { - case 0: - if ((0x9fffff7affffd9ffL & l) != 0L) - { - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - } - else if ((0x100002600L & l) != 0L) - { - if (kind > 22) - kind = 22; - jjCheckNAdd(5); - } - else if (curChar == 61) - jjstateSet[jjnewStateCnt++] = 3; - else if (curChar == 62) - { - if (kind > 18) - kind = 18; - } - break; - case 1: - if ((0x9ffffffeffffd9ffL & l) == 0L) - break; - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - break; - case 2: - case 3: - if (curChar == 62 && kind > 18) - kind = 18; - break; - case 4: - if (curChar == 61) - jjstateSet[jjnewStateCnt++] = 3; - break; - case 5: - if ((0x100002600L & l) == 0L) - break; - kind = 22; - jjCheckNAdd(5); - break; - default : break; - } - } while(i != startsAt); - } - else if (curChar < 128) - { - long l = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - else - { - int hiByte = (int)(curChar >> 8); - int i1 = hiByte >> 6; - long l1 = 1L << (hiByte & 077); - int i2 = (curChar & 0xff) >> 6; - long l2 = 1L << (curChar & 077); - do - { - switch(jjstateSet[--i]) - { - case 0: - case 1: - if (!jjCanMove_0(hiByte, i1, i2, l1, l2)) - break; - if (kind > 16) - kind = 16; - jjCheckNAdd(1); - break; - default : break; - } - } while(i != startsAt); - } - if (kind != 0x7fffffff) - { - jjmatchedKind = kind; - jjmatchedPos = curPos; - kind = 0x7fffffff; - } - ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 6 - (jjnewStateCnt = startsAt))) - return curPos; - try { curChar = input_stream.readChar(); } - catch(java.io.IOException e) { return curPos; } - } -} -static final int[] jjnextStates = { - 20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0, - 4, 6, 3, 4, -}; -private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) -{ - switch(hiByte) - { - case 0: - return ((jjbitVec2[i2] & l2) != 0L); - default : - if ((jjbitVec0[i1] & l1) != 0L) - return true; - return false; - } -} - -/** Token literal values. */ -public static final String[] jjstrLiteralImages = { -"", "\74\163\143\162\151\160\164", null, null, "\74\41\55\55", "\74\41", null, -null, null, null, null, null, null, null, null, null, null, "\75", null, null, -"\47", "\42", null, null, null, null, null, null, "\55\55\76", null, "\76", }; - -/** Lexer state names. */ -public static final String[] lexStateNames = { - "DEFAULT", - "WithinScript", - "WithinTag", - "AfterEquals", - "WithinQuote1", - "WithinQuote2", - "WithinComment1", - "WithinComment2", -}; - -/** Lex State array. */ -public static final int[] jjnewLexState = { - -1, 1, 2, 2, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 3, 0, 2, 4, 5, -1, -1, 2, - -1, 2, -1, 0, -1, 0, -}; -static final long[] jjtoToken = { - 0x7fbfec7fL, -}; -static final long[] jjtoSkip = { - 0x400000L, -}; -protected CharStream input_stream; -private final int[] jjrounds = new int[28]; -private final int[] jjstateSet = new int[56]; -protected char curChar; -/** Constructor. */ -public HTMLParserTokenManager(CharStream stream){ - input_stream = stream; -} - -/** Constructor. */ -public HTMLParserTokenManager(CharStream stream, int lexState){ - this(stream); - SwitchTo(lexState); -} - -/** Reinitialise parser. */ -public void ReInit(CharStream stream) -{ - jjmatchedPos = jjnewStateCnt = 0; - curLexState = defaultLexState; - input_stream = stream; - ReInitRounds(); -} -private void ReInitRounds() -{ - int i; - jjround = 0x80000001; - for (i = 28; i-- > 0;) - jjrounds[i] = 0x80000000; -} - -/** Reinitialise parser. */ -public void ReInit(CharStream stream, int lexState) -{ - ReInit(stream); - SwitchTo(lexState); -} - -/** Switch to specified lex state. */ -public void SwitchTo(int lexState) -{ - if (lexState >= 8 || lexState < 0) - throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE); - else - curLexState = lexState; -} - -protected Token jjFillToken() -{ - final Token t; - final String curTokenImage; - final int beginLine; - final int endLine; - final int beginColumn; - final int endColumn; - String im = jjstrLiteralImages[jjmatchedKind]; - curTokenImage = (im == null) ? input_stream.GetImage() : im; - beginLine = input_stream.getBeginLine(); - beginColumn = input_stream.getBeginColumn(); - endLine = input_stream.getEndLine(); - endColumn = input_stream.getEndColumn(); - t = Token.newToken(jjmatchedKind, curTokenImage); - - t.beginLine = beginLine; - t.endLine = endLine; - t.beginColumn = beginColumn; - t.endColumn = endColumn; - - return t; -} - -int curLexState = 0; -int defaultLexState = 0; -int jjnewStateCnt; -int jjround; -int jjmatchedPos; -int jjmatchedKind; - -/** Get the next Token. */ -public Token getNextToken() -{ - Token matchedToken; - int curPos = 0; - - EOFLoop : - for (;;) - { - try - { - curChar = input_stream.BeginToken(); - } - catch(java.io.IOException e) - { - jjmatchedKind = 0; - matchedToken = jjFillToken(); - return matchedToken; - } - - switch(curLexState) - { - case 0: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_0(); - if (jjmatchedPos == 0 && jjmatchedKind > 13) - { - jjmatchedKind = 13; - } - break; - case 1: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_1(); - break; - case 2: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_2(); - break; - case 3: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_3(); - break; - case 4: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_4(); - break; - case 5: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_5(); - break; - case 6: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_6(); - break; - case 7: - jjmatchedKind = 0x7fffffff; - jjmatchedPos = 0; - curPos = jjMoveStringLiteralDfa0_7(); - break; - } - if (jjmatchedKind != 0x7fffffff) - { - if (jjmatchedPos + 1 < curPos) - input_stream.backup(curPos - jjmatchedPos - 1); - if ((jjtoToken[jjmatchedKind >> 6] & (1L << (jjmatchedKind & 077))) != 0L) - { - matchedToken = jjFillToken(); - if (jjnewLexState[jjmatchedKind] != -1) - curLexState = jjnewLexState[jjmatchedKind]; - return matchedToken; - } - else - { - if (jjnewLexState[jjmatchedKind] != -1) - curLexState = jjnewLexState[jjmatchedKind]; - continue EOFLoop; - } - } - int error_line = input_stream.getEndLine(); - int error_column = input_stream.getEndColumn(); - String error_after = null; - boolean EOFSeen = false; - try { input_stream.readChar(); input_stream.backup(1); } - catch (java.io.IOException e1) { - EOFSeen = true; - error_after = curPos <= 1 ? "" : input_stream.GetImage(); - if (curChar == '\n' || curChar == '\r') { - error_line++; - error_column = 0; - } - else - error_column++; - } - if (!EOFSeen) { - input_stream.backup(1); - error_after = curPos <= 1 ? "" : input_stream.GetImage(); - } - throw new TokenMgrError(EOFSeen, curLexState, error_line, error_column, error_after, curChar, TokenMgrError.LEXICAL_ERROR); - } -} - -private void jjCheckNAdd(int state) -{ - if (jjrounds[state] != jjround) - { - jjstateSet[jjnewStateCnt++] = state; - jjrounds[state] = jjround; - } -} -private void jjAddStates(int start, int end) -{ - do { - jjstateSet[jjnewStateCnt++] = jjnextStates[start]; - } while (start++ != end); -} -private void jjCheckNAddTwoStates(int state1, int state2) -{ - jjCheckNAdd(state1); - jjCheckNAdd(state2); -} - -private void jjCheckNAddStates(int start, int end) -{ - do { - jjCheckNAdd(jjnextStates[start]); - } while (start++ != end); -} - -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/package.html =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/package.html (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/package.html (working copy) @@ -1,22 +0,0 @@ - - - - -Example html parser based on JavaCC - - Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParseException.java (working copy) @@ -1,198 +0,0 @@ -/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */ -/* JavaCCOptions:KEEP_LINE_COL=null */ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/** - * This exception is thrown when parse errors are encountered. - * You can explicitly create objects of this exception type by - * calling the method generateParseException in the generated - * parser. - * - * You can modify this class to customize your error reporting - * mechanisms so long as you retain the public fields. - */ -public class ParseException extends Exception { - - /** - * This constructor is used by the method "generateParseException" - * in the generated parser. Calling this constructor generates - * a new object of this type with the fields "currentToken", - * "expectedTokenSequences", and "tokenImage" set. The boolean - * flag "specialConstructor" is also set to true to indicate that - * this constructor was used to create this object. - * This constructor calls its super class with the empty string - * to force the "toString" method of parent class "Throwable" to - * print the error message in the form: - * ParseException: - */ - public ParseException(Token currentTokenVal, - int[][] expectedTokenSequencesVal, - String[] tokenImageVal - ) - { - super(""); - specialConstructor = true; - currentToken = currentTokenVal; - expectedTokenSequences = expectedTokenSequencesVal; - tokenImage = tokenImageVal; - } - - /** - * The following constructors are for use by you for whatever - * purpose you can think of. Constructing the exception in this - * manner makes the exception behave in the normal way - i.e., as - * documented in the class "Throwable". The fields "errorToken", - * "expectedTokenSequences", and "tokenImage" do not contain - * relevant information. The JavaCC generated code does not use - * these constructors. - */ - - public ParseException() { - super(); - specialConstructor = false; - } - - /** Constructor with message. */ - public ParseException(String message) { - super(message); - specialConstructor = false; - } - - /** - * This variable determines which constructor was used to create - * this object and thereby affects the semantics of the - * "getMessage" method (see below). - */ - protected boolean specialConstructor; - - /** - * This is the last token that has been consumed successfully. If - * this object has been created due to a parse error, the token - * followng this token will (therefore) be the first error token. - */ - public Token currentToken; - - /** - * Each entry in this array is an array of integers. Each array - * of integers represents a sequence of tokens (by their ordinal - * values) that is expected at this point of the parse. - */ - public int[][] expectedTokenSequences; - - /** - * This is a reference to the "tokenImage" array of the generated - * parser within which the parse error occurred. This array is - * defined in the generated ...Constants interface. - */ - public String[] tokenImage; - - /** - * This method has the standard behavior when this object has been - * created using the standard constructors. Otherwise, it uses - * "currentToken" and "expectedTokenSequences" to generate a parse - * error message and returns it. If this object has been created - * due to a parse error, and you do not catch it (it gets thrown - * from the parser), then this method is called during the printing - * of the final stack trace, and hence the correct error message - * gets displayed. - */ - public String getMessage() { - if (!specialConstructor) { - return super.getMessage(); - } - StringBuffer expected = new StringBuffer(); - int maxSize = 0; - for (int i = 0; i < expectedTokenSequences.length; i++) { - if (maxSize < expectedTokenSequences[i].length) { - maxSize = expectedTokenSequences[i].length; - } - for (int j = 0; j < expectedTokenSequences[i].length; j++) { - expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' '); - } - if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) { - expected.append("..."); - } - expected.append(eol).append(" "); - } - String retval = "Encountered \""; - Token tok = currentToken.next; - for (int i = 0; i < maxSize; i++) { - if (i != 0) retval += " "; - if (tok.kind == 0) { - retval += tokenImage[0]; - break; - } - retval += " " + tokenImage[tok.kind]; - retval += " \""; - retval += add_escapes(tok.image); - retval += " \""; - tok = tok.next; - } - retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn; - retval += "." + eol; - if (expectedTokenSequences.length == 1) { - retval += "Was expecting:" + eol + " "; - } else { - retval += "Was expecting one of:" + eol + " "; - } - retval += expected.toString(); - return retval; - } - - /** - * The end of line string for this machine. - */ - protected String eol = System.getProperty("line.separator", "\n"); - - /** - * Used to convert raw characters to their escaped version - * when these raw version cannot be used as part of an ASCII - * string literal. - */ - protected String add_escapes(String str) { - StringBuffer retval = new StringBuffer(); - char ch; - for (int i = 0; i < str.length(); i++) { - switch (str.charAt(i)) - { - case 0 : - continue; - case '\b': - retval.append("\\b"); - continue; - case '\t': - retval.append("\\t"); - continue; - case '\n': - retval.append("\\n"); - continue; - case '\f': - retval.append("\\f"); - continue; - case '\r': - retval.append("\\r"); - continue; - case '\"': - retval.append("\\\""); - continue; - case '\'': - retval.append("\\\'"); - continue; - case '\\': - retval.append("\\\\"); - continue; - default: - if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) { - String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u" + s.substring(s.length() - 4, s.length())); - } else { - retval.append(ch); - } - continue; - } - } - return retval.toString(); - } - -} -/* JavaCC - OriginalChecksum=e449d0e43f3d85deb1260a88b7e90fcd (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParserThread.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParserThread.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/ParserThread.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.*; - -class ParserThread extends Thread { - HTMLParser parser; - - ParserThread(HTMLParser p) { - parser = p; - } - - @Override - public void run() { // convert pipeOut to pipeIn - try { - try { // parse document to pipeOut - parser.HTMLDocument(); - } catch (ParseException e) { - System.out.println("Parse Aborted: " + e.getMessage()); - } catch (TokenMgrError e) { - System.out.println("Parse Aborted: " + e.getMessage()); - } finally { - parser.pipeOut.close(); - synchronized (parser) { - parser.summary.setLength(HTMLParser.SUMMARY_LENGTH); - parser.titleComplete = true; - parser.notifyAll(); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - } -} Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Tags.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Tags.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/Tags.java (working copy) @@ -1,67 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.HashSet; -import java.util.Set; - - -/** - * Utility class storing set of commonly-used html tags. - */ -public final class Tags { - - /** - * contains all tags for which whitespaces have to be inserted for proper tokenization - */ - public static final Set WS_ELEMS; - - static{ - WS_ELEMS = new HashSet(); - WS_ELEMS.add("" does not need to be listed explicitly - WS_ELEMS.add(" 0x7e) { - String s = "0000" + Integer.toString(ch, 16); - retval.append("\\u" + s.substring(s.length() - 4, s.length())); - } else { - retval.append(ch); - } - continue; - } - } - return retval.toString(); - } - - /** - * Returns a detailed message for the Error when it is thrown by the - * token manager to indicate a lexical error. - * Parameters : - * EOFSeen : indicates if EOF caused the lexical error - * curLexState : lexical state in which this error occurred - * errorLine : line number when the error occurred - * errorColumn : column number when the error occurred - * errorAfter : prefix that was seen before this error occurred - * curchar : the offending character - * Note: You can customize the lexical error message by modifying this method. - */ - protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) { - return("Lexical error at line " + - errorLine + ", column " + - errorColumn + ". Encountered: " + - (EOFSeen ? " " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") + - "after : \"" + addEscapes(errorAfter) + "\""); - } - - /** - * You can also modify the body of this method to customize your error messages. - * For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not - * of end-users concern, so you can return something like : - * - * "Internal Error : Please file a bug report .... " - * - * from this method for such cases in the release version of your parser. - */ - public String getMessage() { - return super.getMessage(); - } - - /* - * Constructors of various flavors follow. - */ - - /** No arg constructor. */ - public TokenMgrError() { - } - - /** Constructor with message and reason. */ - public TokenMgrError(String message, int reason) { - super(message); - errorCode = reason; - } - - /** Full Constructor. */ - public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) { - this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); - } -} -/* JavaCC - OriginalChecksum=538f0da130356fcc0bc7db621ab0389d (do not edit this line) */ Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (working copy) @@ -19,50 +19,203 @@ import java.io.IOException; import java.io.Reader; +import java.io.StringReader; import java.text.DateFormat; import java.text.ParseException; +import java.util.Collections; import java.util.Date; +import java.util.HashSet; +import java.util.Locale; import java.util.Properties; +import java.util.Set; +import org.cyberneko.html.parsers.SAXParser; + +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + /** - * HTML Parser that is based on Lucene's demo HTML parser. + * Simple HTML Parser extracting title, meta tags, and body text + * that is based on NekoHTML. */ -public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser { +public class DemoHTMLParser implements HTMLParser { + + /** The actual parser to read HTML documents */ + public static final class DocumentParser { + + // TODO: remove the Turkish workaround once this is fixed in NekoHTML: + // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178 + + // BEGIN: workaround + static final String convertTurkish(String s) { + return s.replace('i', 'ı'); + } + + static final boolean equalsIgnoreTurkish(String s1, String s2) { + final int len1 = s1.length(), len2 = s2.length(); + if (len1 != len2) + return false; + for (int i = 0; i < len1; i++) { + char ch1 = s1.charAt(i), ch2 = s2.charAt(i); + if (ch1 == 'ı') ch1 = 'i'; + if (ch2 == 'ı') ch2 = 'i'; + if (ch1 != ch2) + return false; + } + return true; + } + // END: workaround + + static final Set createElementNameSet(String... names) { + final HashSet set = new HashSet(); + for (final String name : names) { + set.add(name); + set.add(convertTurkish(name)); + } + return Collections.unmodifiableSet(set); + } + + /** HTML elements that cause a line break (they are block-elements) */ + static final Set ENDLINE_ELEMENTS = createElementNameSet( + "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", + "pre", "hr", "blockquote", "address", "fieldset", "table", "form", + "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option" + ); - public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { - org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader); + /** HTML elements with contents that are ignored */ + static final Set SUPPRESS_ELEMENTS = createElementNameSet( + "style", "script" + ); + + public final Properties metaTags = new Properties(); + public final String title, body; + public DocumentParser(Reader reader) throws IOException, SAXException { + final SAXParser parser = new SAXParser(); + parser.setFeature("http://xml.org/sax/features/namespaces", true); + parser.setFeature("http://cyberneko.org/html/features/balance-tags", true); + parser.setFeature("http://cyberneko.org/html/features/report-errors", false); + parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); + parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); + + final StringBuilder title = new StringBuilder(), body = new StringBuilder(); + final DefaultHandler handler = new DefaultHandler() { + private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0; + + @Override + public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { + if (inHEAD > 0) { + if (equalsIgnoreTurkish("title", localName)) { + inTITLE++; + } else { + if (equalsIgnoreTurkish("meta", localName)) { + String name = atts.getValue("name"); + if (name == null) { + name = atts.getValue("http-equiv"); + } + final String val = atts.getValue("content"); + if (name != null && val != null) { + metaTags.setProperty(name.toLowerCase(Locale.ROOT), val); + } + } + } + } else if (inBODY > 0) { + if (SUPPRESS_ELEMENTS.contains(localName)) { + suppressed++; + } else if (equalsIgnoreTurkish("img", localName)) { + // the original javacc-based parser preserved ... + // attribute as body text in [] parenthesis: + final String alt = atts.getValue("alt"); + if (alt != null) { + body.append('[').append(alt).append(']'); + } + } + } else if (equalsIgnoreTurkish("body", localName)) { + inBODY++; + } else if (equalsIgnoreTurkish("head", localName)) { + inHEAD++; + } else if (equalsIgnoreTurkish("frameset", localName)) { + throw new SAXException("This parser does not support HTML framesets."); + } + } + + @Override + public void endElement(String namespaceURI, String localName, String qName) throws SAXException { + if (equalsIgnoreTurkish("body", localName)) { + inBODY--; + } else if (equalsIgnoreTurkish("head", localName)) { + inHEAD--; + } else if (inBODY > 0) { + if (ENDLINE_ELEMENTS.contains(localName)) { + body.append('\n'); + } else if (SUPPRESS_ELEMENTS.contains(localName)) { + suppressed--; + } + } else if (inHEAD > 0) { + if (equalsIgnoreTurkish("title", localName)) { + inTITLE--; + } + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inBODY > 0 && suppressed == 0) { + body.append(ch, start, length); + } else if (inTITLE > 0) { + title.append(ch, start, length); + } + } + + @Override + public InputSource resolveEntity(String publicId, String systemId) { + // disable network access caused by DTDs + return new InputSource(new StringReader("")); + } + }; + parser.setContentHandler(handler); + parser.setErrorHandler(handler); + parser.parse(new InputSource(reader)); + + // javacc-based parser trimmed title (which should be done for HTML): + this.title = title.toString().trim(); + + // assign body text + this.body = body.toString(); + } + } + + @Override + public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException { + final DocumentParser p; + try { + p = new DocumentParser(reader); + } catch (SAXException saxe) { + throw new IOException("SAX exception occurred while parsing HTML document.", saxe); + } + // title if (title==null) { - title = p.getTitle(); + title = p.title; } // properties - Properties props = p.getMetaTags(); - // body - Reader r = p.getReader(); - char c[] = new char[1024]; - StringBuilder bodyBuf = new StringBuilder(); - int n; - while ((n = r.read(c)) >= 0) { - if (n>0) { - bodyBuf.append(c,0,n); - } - } - r.close(); - if (date == null && props.getProperty("date")!=null) { + final Properties props = p.metaTags; + if (date == null && props.getProperty("date") != null) { try { date = dateFormat.parse(props.getProperty("date").trim()); } catch (ParseException e) { // do not fail test just because a date could not be parsed - System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date")); - date = new Date(); // now + System.out.println("ignoring date parse exception for: "+props.getProperty("date")); + date = null; } } docData.clear(); docData.setName(name); - docData.setBody(bodyBuf.toString()); + docData.setBody(p.body); docData.setTitle(title); docData.setProps(props); docData.setDate(date); Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (working copy) @@ -41,6 +41,6 @@ * @throws IOException * @throws InterruptedException */ - public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException; + public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException; } Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (working copy) @@ -22,7 +22,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.Reader; import java.text.DateFormat; import java.text.ParsePosition; import java.text.SimpleDateFormat; @@ -33,7 +32,6 @@ import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.StreamUtils; -import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader; import org.apache.lucene.util.ThreadInterruptedException; /** @@ -83,13 +81,10 @@ }; private ThreadLocal dateFormats = new ThreadLocal(); - private ThreadLocal trecDocReader = new ThreadLocal(); private ThreadLocal trecDocBuffer = new ThreadLocal(); private File dataDir = null; private ArrayList inputFiles = new ArrayList(); private int nextFile = 0; - private int rawDocSize = 0; - // Use to synchronize threads on reading from the TREC documents. private Object lock = new Object(); @@ -126,17 +121,6 @@ return sb; } - Reader getTrecDocReader(StringBuilder docBuffer) { - StringBuilderReader r = trecDocReader.get(); - if (r == null) { - r = new StringBuilderReader(docBuffer); - trecDocReader.set(r); - } else { - r.set(docBuffer); - } - return r; - } - HTMLParser getHtmlParser() { return htmlParser; } @@ -161,7 +145,7 @@ continue; } - rawDocSize += line.length(); + line.length(); if (lineStart!=null && line.startsWith(lineStart)) { if (collectMatchLine) { Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (working copy) @@ -18,7 +18,7 @@ */ import java.io.IOException; -import java.io.Reader; +import java.io.StringReader; import java.util.Date; /** @@ -29,6 +29,9 @@ private static final String DATE = "Date: "; private static final String DATE_END = TrecContentSource.NEW_LINE; + private static final String DOCSTART = ""; + private static final String DOCEND = ""; + private static final String DOCHDR = ""; private static final String TERMINATING_DOCHDR = ""; private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length(); @@ -36,24 +39,28 @@ @Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { - // Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf - Reader r = trecSrc.getTrecDocReader(docBuf); - - // skip some of the text, optionally set date + // skip some of the non-html text, optionally set date Date date = null; - int h1 = docBuf.indexOf(DOCHDR); - if (h1>=0) { - int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1); + int h1, start = 0, end = docBuf.length(); + h1 = docBuf.indexOf(DOCSTART); + if (h1 >= 0) { + start = h1; + } + h1 = docBuf.indexOf(DOCHDR); + if (h1 >= 0) { + int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1); String dateStr = extract(docBuf, DATE, DATE_END, h2, null); if (dateStr != null) { date = trecSrc.parseDate(dateStr); } - r.mark(h2+TERMINATING_DOCHDR_LENGTH); + start = h2 + TERMINATING_DOCHDR_LENGTH; } - - r.reset(); - HTMLParser htmlParser = trecSrc.getHtmlParser(); - return htmlParser.parse(docData, name, date, null, r, null); + h1 = docBuf.lastIndexOf(DOCEND); + if (h1 >= 0 && h1 >= start) { + end = h1; + } + String html = docBuf.substring(start, end); + return trecSrc.getHtmlParser().parse(docData, name, date, null, new StringReader(html), null); } } Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java =================================================================== --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java (revision 1361666) +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java (working copy) @@ -1,181 +0,0 @@ -package org.apache.lucene.benchmark.byTask.utils; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; - -/** - * Implements a {@link Reader} over a {@link StringBuilder} instance. Although - * one can use {@link java.io.StringReader} by passing it - * {@link StringBuilder#toString()}, it is better to use this class, as it - * doesn't mark the passed-in {@link StringBuilder} as shared (which will cause - * inner char[] allocations at the next append() attempt).
- * Notes: - *

    - *
  • This implementation assumes the underlying {@link StringBuilder} is not - * changed during the use of this {@link Reader} implementation. - *
  • This implementation is thread-safe. - *
  • The implementation looks very much like {@link java.io.StringReader} (for - * the right reasons). - *
  • If one wants to reuse that instance, then the following needs to be done: - *
    - * StringBuilder sb = new StringBuilder("some text");
    - * Reader reader = new StringBuilderReader(sb);
    - * ... read from reader - don't close it ! ...
    - * sb.setLength(0);
    - * sb.append("some new text");
    - * reader.reset();
    - * ... read the new string from the reader ...
    - * 
    - *
- */ -public class StringBuilderReader extends Reader { - - // The StringBuilder to read from. - private StringBuilder sb; - - // The length of 'sb'. - private int length; - - // The next position to read from the StringBuilder. - private int next = 0; - - // The mark position. The default value 0 means the start of the text. - private int mark = 0; - - public StringBuilderReader(StringBuilder sb) { - set(sb); - } - - /** Check to make sure that the stream has not been closed. */ - private void ensureOpen() throws IOException { - if (sb == null) { - throw new IOException("Stream has already been closed"); - } - } - - @Override - public void close() { - synchronized (lock) { - sb = null; - } - } - - /** - * Mark the present position in the stream. Subsequent calls to reset() will - * reposition the stream to this point. - * - * @param readAheadLimit Limit on the number of characters that may be read - * while still preserving the mark. Because the stream's input comes - * from a StringBuilder, there is no actual limit, so this argument - * must not be negative, but is otherwise ignored. - * @exception IllegalArgumentException If readAheadLimit is < 0 - * @exception IOException If an I/O error occurs - */ - @Override - public void mark(int readAheadLimit) throws IOException { - if (readAheadLimit < 0){ - throw new IllegalArgumentException("Read-ahead limit cannpt be negative: " + readAheadLimit); - } - synchronized (lock) { - ensureOpen(); - mark = next; - } - } - - @Override - public boolean markSupported() { - return true; - } - - @Override - public int read() throws IOException { - synchronized (lock) { - ensureOpen(); - return next >= length ? -1 : sb.charAt(next++); - } - } - - @Override - public int read(char cbuf[], int off, int len) throws IOException { - synchronized (lock) { - ensureOpen(); - - // Validate parameters - if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) { - throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length); - } - - if (len == 0) { - return 0; - } - - if (next >= length) { - return -1; - } - - int n = Math.min(length - next, len); - sb.getChars(next, next + n, cbuf, off); - next += n; - return n; - } - } - - @Override - public boolean ready() throws IOException { - synchronized (lock) { - ensureOpen(); - return true; - } - } - - @Override - public void reset() throws IOException { - synchronized (lock) { - ensureOpen(); - next = mark; - length = sb.length(); - } - } - - public void set(StringBuilder sb) { - synchronized (lock) { - this.sb = sb; - length = sb.length(); - next = mark = 0; - } - } - - @Override - public long skip(long ns) throws IOException { - synchronized (lock) { - ensureOpen(); - if (next >= length) { - return 0; - } - - // Bound skip by beginning and end of the source - long n = Math.min(length - next, ns); - n = Math.max(-next, n); - next += n; - return n; - } - } - -} Index: lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/TestHtmlParser.java =================================================================== --- lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/TestHtmlParser.java (revision 1361666) +++ lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/TestHtmlParser.java (working copy) @@ -1,132 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.Properties; - -import org.apache.lucene.util.LuceneTestCase; - -public class TestHtmlParser extends LuceneTestCase { - - public void testUnicode() throws Exception { - String text = "汉语"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("汉语", parser); - } - - public void testEntities() throws Exception { - String text = "汉语¥"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("汉语¥", parser); - } - - public void testComments() throws Exception { - String text = "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testScript() throws Exception { - String text = "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testStyle() throws Exception { - String text = "" + - "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testDoctype() throws Exception { - String text = "" + - "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testMeta() throws Exception { - String text = "" + - "" + - "" + - "" + - "" + - "foobar"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - Properties tags = parser.getMetaTags(); - assertEquals(4, tags.size()); - assertEquals("1", tags.get("a")); - assertEquals("2", tags.get("b")); - assertEquals("this is a test", tags.get("keywords")); - assertEquals("text/html;charset=utf-8", tags.get("content-type")); - } - - public void testTitle() throws Exception { - String text = "foobar"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertEquals("foo", parser.getTitle()); - } - - public void testSummary() throws Exception { - String text = "foo" + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - ""; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertEquals(200, parser.getSummary().length()); - } - - // LUCENE-590 - public void testSummaryTitle() throws Exception { - String text = "SummarySummary of the document"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertEquals("Summary of the document", parser.getSummary()); - } - - // LUCENE-2246 - public void testTurkish() throws Exception { - String text = "" + - "\"ş\"" + - ""; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("[ş]", parser); - } - - private void assertReadsTo(String expected, HTMLParser parser) throws IOException { - Reader reader = parser.getReader(); - StringBuilder builder = new StringBuilder(); - int ch = 0; - while ((ch = reader.read()) != -1) { - builder.append((char)ch); - } - assertEquals(expected, builder.toString()); - } -} Index: lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/TestHtmlParser.java =================================================================== --- lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/TestHtmlParser.java (revision 1361666) +++ lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/demohtml/TestHtmlParser.java (working copy) @@ -1,132 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds.demohtml; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.Properties; - -import org.apache.lucene.util.LuceneTestCase; - -public class TestHtmlParser extends LuceneTestCase { - - public void testUnicode() throws Exception { - String text = "汉语"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("汉语", parser); - } - - public void testEntities() throws Exception { - String text = "汉语¥"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("汉语¥", parser); - } - - public void testComments() throws Exception { - String text = "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testScript() throws Exception { - String text = "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testStyle() throws Exception { - String text = "" + - "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testDoctype() throws Exception { - String text = "" + - "foo"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("foo", parser); - } - - public void testMeta() throws Exception { - String text = "" + - "" + - "" + - "" + - "" + - "foobar"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - Properties tags = parser.getMetaTags(); - assertEquals(4, tags.size()); - assertEquals("1", tags.get("a")); - assertEquals("2", tags.get("b")); - assertEquals("this is a test", tags.get("keywords")); - assertEquals("text/html;charset=utf-8", tags.get("content-type")); - } - - public void testTitle() throws Exception { - String text = "foobar"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertEquals("foo", parser.getTitle()); - } - - public void testSummary() throws Exception { - String text = "foo" + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - "Summarize me. Summarize me. Summarize me. Summarize me. " + - ""; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertEquals(200, parser.getSummary().length()); - } - - // LUCENE-590 - public void testSummaryTitle() throws Exception { - String text = "SummarySummary of the document"; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertEquals("Summary of the document", parser.getSummary()); - } - - // LUCENE-2246 - public void testTurkish() throws Exception { - String text = "" + - "\"ş\"" + - ""; - HTMLParser parser = new HTMLParser(new StringReader(text)); - assertReadsTo("[ş]", parser); - } - - private void assertReadsTo(String expected, HTMLParser parser) throws IOException { - Reader reader = parser.getReader(); - StringBuilder builder = new StringBuilder(); - int ch = 0; - while ((ch = reader.read()) != -1) { - builder.append((char)ch); - } - assertEquals(expected, builder.toString()); - } -} Index: lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java =================================================================== --- lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java (revision 0) +++ lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java (working copy) @@ -0,0 +1,142 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.Locale; +import java.util.Properties; + +import org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser.DocumentParser; +import org.apache.lucene.util.LuceneTestCase; + +public class TestHtmlParser extends LuceneTestCase { + + public void testUnicode() throws Exception { + String text = "汉语"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("汉语", parser.body); + } + + public void testEntities() throws Exception { + String text = "汉语¥"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("汉语¥", parser.body); + } + + public void testComments() throws Exception { + String text = "foo"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("foo", parser.body); + } + + public void testScript() throws Exception { + String text = "foo"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("foo", parser.body); + } + + public void testStyle() throws Exception { + String text = "" + + "foo"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("foo", parser.body); + } + + public void testDoctype() throws Exception { + String text = "" + + "foo"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("foo", parser.body); + } + + public void testMeta() throws Exception { + String text = "" + + "" + + "" + + "" + + "" + + "foobar"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + Properties tags = parser.metaTags; + assertEquals(4, tags.size()); + assertEquals("1", tags.get("a")); + assertEquals("2", tags.get("b")); + assertEquals("this is a test", tags.get("keywords")); + assertEquals("text/html;charset=UTF-8", tags.get("content-type")); + } + + public void testTitle() throws Exception { + String text = "foobar"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("foo", parser.title); + } + + // LUCENE-2246 + public void testTurkish() throws Exception { + final Locale saved = Locale.getDefault(); + try { + Locale.setDefault(new Locale("tr", "TR")); + String text = "ııı" + + "\"ş\"" + + ""; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("ııı", parser.title); + assertEquals("[ş]", parser.body); + } finally { + Locale.setDefault(saved); + } + } + + public void testSampleTRECDoc() throws Exception { + String text = "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 title\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "\r\n" + + "TEST-000 text\r\n" + + "\r\n" + + "\r\n" + + "\r\n"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("TEST-000 title", parser.title); + assertEquals("TEST-000 text", parser.body.trim()); + } + + public void testNoHTML() throws Exception { + String text = "hallo"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("", parser.title); + assertEquals("hallo", parser.body); + } + + public void testivalid() throws Exception { + String text = "foobar"; + DocumentParser parser = new DocumentParser(new StringReader(text)); + assertEquals("foo", parser.title); + assertEquals("bar", parser.body); + } + +} Index: lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java =================================================================== --- lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java (revision 0) +++ lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java (working copy) Property changes on: lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TestHtmlParser.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java =================================================================== --- lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (revision 1361666) +++ lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (working copy) @@ -334,6 +334,7 @@ // Don't test that NoMoreDataException is thrown, since the forever flag is // turned on. + source.close(); } /**