Index: modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java =================================================================== --- modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (revision 1306969) +++ modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (working copy) @@ -20,12 +20,15 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.util.ThreadInterruptedException; +import org.apache.lucene.util.IOUtils; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -172,7 +175,8 @@ while(true){ final InputStream localFileIS = is; try { - reader.parse(new InputSource(localFileIS)); + // To work around a bug in XERCES, we assume the XML is always UTF-8 and hand the parser a Reader rather than the raw InputStream. + reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, IOUtils.CHARSET_UTF_8)))); } catch (IOException ioe) { synchronized(EnwikiContentSource.this) { if (localFileIS != is) {