Details
-
Improvement
-
Status: Resolved
-
Minor
-
Resolution: Fixed
-
1.12
-
None
Description
Many websites don't have valid data within the <head></head> tag. However, even if the header data is invalid (misplaced tags, etc.), we should still be able to get the title tag value if it is present.
Please find below a straightforward unit test to reproduce the problem. You will notice I have added an anchor between the <head></head> tags (<head><a></a></head>), which is not valid. If you remove it, the title value is found.
import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; import org.apache.hadoop.conf.Configuration; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.nutch.parse.html.DOMBuilder; import org.apache.nutch.parse.tika.DOMContentUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.w3c.dom.DocumentFragment; public class TestTikaGetTitleWithInvalidHeaders { private Configuration conf; static byte[] readFile(String path, Charset encoding) throws IOException { return Files.readAllBytes(Paths.get(path)); } private final static String WEBPAGE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa 1.0//EN\" \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\">" + "<html>" + "<head>" +"<a href=\"https://plus.google.com/113911985765464238166\" rel=\"publisher\">Google+</a> " + "<title>Welcome!</title>" + "</head>" + "<body>" + "content" + "</body>" + "</html>"; @Before public void setUp() throws Exception { conf = new Configuration(); } @Test public void testGetTitle() { HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); Parser parser = new org.apache.tika.parser.html.HtmlParser(); DOMBuilder domBuilder = new DOMBuilder(doc, root); try { parser.parse(new ByteArrayInputStream(WEBPAGE.getBytes()), domBuilder, new Metadata(), new ParseContext()); } catch (Exception e) { e.printStackTrace(); } StringBuffer sb = new StringBuffer(); new DOMContentUtils(conf).getTitle(sb, root); Assert.assertEquals("Welcome!", sb.toString()); } }
Attachments
Issue Links
- relates to
-
TIKA-1599 Switch from TagSoup to JSoup
- Resolved