Index: src/demo/org/apache/lucene/demo/html/HTMLParser.jj =================================================================== retrieving revision 1.5 diff -u -r1.5 HTMLParser.jj --- src/demo/org/apache/lucene/demo/html/HTMLParser.jj 23 Nov 2003 18:37:32 -0000 1.5 +++ src/demo/org/apache/lucene/demo/html/HTMLParser.jj 10 Mar 2004 03:13:25 -0000 @@ -87,8 +87,12 @@ Writer pipeOut; private MyPipedInputStream pipeInStream = null; private PipedOutputStream pipeOutStream = null; + private boolean doneReading = false; + private String sourceName = null; private class MyPipedInputStream extends PipedInputStream{ + + ParserThread parserThread; public MyPipedInputStream(){ super(); @@ -98,15 +102,67 @@ super(src); } + public MyPipedInputStream(ParserThread parserThread) { + super(); + this.parserThread = parserThread; + } + + public MyPipedInputStream(ParserThread parserThread, PipedOutputStream src) throws IOException{ + super(src); + this.parserThread = parserThread; + } + public boolean full() throws IOException{ return this.available() >= PipedInputStream.PIPE_SIZE; } + + public void close() throws IOException { + if (!doneReading) { + // the parser thread is blocked on PipedInputStream.receive + // interrupt the parser thread if the parser is not finished + // so we can avoid the Pipe Closed IOException that is caused + // when the reader is closed when the parser is not finished. + parserThread.interrupt(); + } + super.close(); + } + + protected synchronized void receive(int b) throws IOException { + // Don't enter receive if we are done reading since receive will block + // and hang the thread. + if (!doneReading) { + try { + super.receive(b); + } + catch (IOException ioe) { + // receive recasts an InterruptedException to a java.io.InterruptedIOException + // but the method only throws generic IOExceptions. + // IOException can happen here if the thread is interrupted or + // either end of the pipe is closed. 
+ throw new ParserInterruptedException(); + } + } + } } + public HTMLParser(InputStream stream, String sourceName) { + this(stream); + this.sourceName = sourceName; + } + + public HTMLParser(Reader stream, String sourceName) { + this(stream); + this.sourceName = sourceName; + } + public HTMLParser(File file) throws FileNotFoundException { - this(new FileInputStream(file)); + this(new FileInputStream(file), file.getPath()); } + public String getSourceName() { + return sourceName; + } + public String getTitle() throws IOException, InterruptedException { if (pipeIn == null) getReader(); // spawn parsing thread @@ -158,12 +214,12 @@ public Reader getReader() throws IOException { if (pipeIn == null) { - pipeInStream = new MyPipedInputStream(); + ParserThread thread = new ParserThread(this); + pipeInStream = new MyPipedInputStream(thread); pipeOutStream = new PipedOutputStream(pipeInStream); pipeIn = new InputStreamReader(pipeInStream); pipeOut = new OutputStreamWriter(pipeOutStream); - Thread thread = new ParserThread(this); thread.start(); // start parsing } @@ -252,6 +308,7 @@ // } catch (ParseException e) { // handleException(e); // } + { doneReading = true; } } void Tag() throws IOException : Index: src/demo/org/apache/lucene/demo/html/ParserThread.java =================================================================== retrieving revision 1.1 diff -u -r1.1 ParserThread.java --- src/demo/org/apache/lucene/demo/html/ParserThread.java 26 Jan 2002 15:01:31 -0000 1.1 +++ src/demo/org/apache/lucene/demo/html/ParserThread.java 10 Mar 2004 03:13:25 -0000 @@ -68,9 +68,11 @@ try { // parse document to pipeOut parser.HTMLDocument(); } catch (ParseException e) { + logAbortion(e.getMessage()); } catch (TokenMgrError e) { + logAbortion(e.getMessage()); + } catch (ParserInterruptedException e) { + logAbortion("IndexWriter.maxFieldLength exceeded."); } finally { parser.pipeOut.close(); synchronized (parser) { @@ -82,5 +84,18 @@ } catch (IOException e) { e.printStackTrace(); } + } 
+ + private void logAbortion(String message) { + // it would be nice if you could send this to a logging framework + // or control if/where this message should go. + System.out.print("Parse Aborted: " + message); + String sourceName = parser.getSourceName(); + if (sourceName == null) { + System.out.println(" Unknown Source"); + } + else { + System.out.println(" Source: " + sourceName); + } } } Index: src/demo/org/apache/lucene/demo/html/ParserInterruptedException.java =================================================================== RCS file: src/demo/org/apache/lucene/demo/html/ParserInterruptedException.java diff -N src/demo/org/apache/lucene/demo/html/ParserInterruptedException.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/demo/org/apache/lucene/demo/html/ParserInterruptedException.java 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,48 @@ +package org.apache.lucene.demo.html; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** + * An IOException thrown by the HTMLParser if it is interrupted before it is + * finished reading its input. This occurs when DocumentWriter.invertDocument + * closes its reader when IndexWriter.maxFieldLength tokens have been processed. + * + *

Created: Feb 3, 2004 4:46:55 PM + * + * @author Eric Isakson + * @since lucene 1.4 + * @version $Id:$ + */ +public class ParserInterruptedException extends IOException { + + /** + * Constructs a ParserInterruptedException with no detail message. + */ + public ParserInterruptedException() { + super(); + } + + /** + * @param message the detail message, passed through to IOException + */ + public ParserInterruptedException(String message) { + super(message); + } + +}