Description
HtmlParser's HtmlHandler does not check for "SCRIPT" in `startElement` while parsing <head> (i.e. when `bodyLevel == 0 && discardLevel == 0`). As a result, <script> elements found within <head> are dropped altogether. They should be handled in the same manner as "LINK" elements.
The following sample test case demonstrates the problem and can be run from within HtmlParserTest.java. It could also be generalized to check <a>, <link> and <img> links by using the LinkContentHandler.
@Test public void testScriptSrc() throws Exception { String url = "http://domain.com/logic.js"; String scriptInBody = "<html><body><script src=\"" + url + "\"></script></body></html>"; String scriptInHead = "<html><head><script src=\"" + url + "\"></script></head></html>"; assertScriptLink(scriptInBody, url); assertScriptLink(scriptInHead, url); } private void assertScriptLink(String html, String url) throws Exception { // IdentityHtmlMapper needed to extract <script> tags ParseContext context = new ParseContext(); context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "text/html"); final List<String> links = new ArrayList<String>(); new HtmlParser().parse( new ByteArrayInputStream(html.getBytes(UTF_8)), new DefaultHandler() { @Override public void startElement( String u, String l, String name, Attributes atts) { if (name.equals("script") && atts.getValue("", "src") != null) { links.add(atts.getValue("", "src")); } } }, metadata, context); assertEquals(1, links.size()); assertEquals(url, links.get(0)); }