Uploaded image for project: 'MyFaces Tomahawk'
  1. MyFaces Tomahawk
  2. TOMAHAWK-1458

ReducedHTMLParser: incorrect assumption about STATE_EXPECTING_ETAGO

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • 1.1.9
    • None
    • ExtensionsFilter
    • None

    Description

      ReducedHTMLParser assumes that <script> elements cannot contain "</" before </script>. This is not true.
      Raw example:

      <script>
      function foo()

      { var str = "</tr>; }

      "
      </script>

      In this case, ReducedHTMLParser switches to STATE_READY when "</tr>" is handled. But the <script> element is not closed here.

      This may result in errors, especially if the script code contains the strings like "<head>", "</head>", "<body>" or "</body>".

      Below, I provide a patch for the parse() method.
      Note that this patch still works incorrectly if the script contains the string "</script>", like in

      <script>
      function foo()

      { var str = "</script>; }

      "
      </script>

      Patch (my changes indicated by comments // L. Ulrich ...)

      void parse()
      {
      int state = STATE_READY;

      int currentTagStart = -1;
      String currentTagName = null;

      _lineNumber = 1;
      _offset = 0;
      int lastOffset = _offset -1;

      // L. Ulrich, 23.09.2009:
      // New helper variable which holds the tag name
      // in case of STATE_EXPECTING_ETAGO
      String currentEtagoTagName = null;

      while (_offset < _seq.length())
      {
      // Sanity check; each pass through this loop must increase the offset.
      // Failure to do this means a hang situation has occurred.
      if (_offset <= lastOffset)

      { // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser"); log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped."+ " Surroundings: '" + getTagSurroundings() +"'."); //return; }

      lastOffset = _offset;

      if (state == STATE_READY)
      {
      // in this state, nothing but "<" has any significance
      consumeExcept("<");
      if (isFinished())

      { break; }

      if (consumeMatch("<!--"))
      { // Note that whitespace is *not* permitted in <!-- state = STATE_IN_COMMENT; }
      else if (consumeMatch("<!["))
      { // Start of a "marked section", eg "<![CDATA" or // "<![INCLUDE" or "<![IGNORE". These always terminate // with "]]>" log.debug("Marked section found at line " + getCurrentLineNumber()+". "+ "Surroundings: '" + getTagSurroundings() +"'."); state = STATE_IN_MARKED_SECTION; }
      else if (consumeMatch("<!DOCTYPE"))
      { log.debug("DOCTYPE found at line " + getCurrentLineNumber()); // we don't need to actually do anything here; the // tag can't contain a bare "<", so the first "<" // indicates the start of the next real tag. // // TODO: Handle case where the DOCTYPE includes an internal DTD. In // that case there *will* be embedded < chars in the document. However // that's very unlikely to be used in a JSF page, so this is pretty low // priority. }
      else if (consumeMatch("<?"))
      { // xml processing instruction or <!DOCTYPE> tag // we don't need to actually do anything here; the // tag can't contain a bare "<", so the first "<" // indicates the start of the next real tag. log.debug("PI found at line " + getCurrentLineNumber()); }
      else if (consumeMatch("</"))
      {
      if (!processEndTag())
      { // message already logged return; }

      // stay in state READY
      state = STATE_READY;
      }
      else if (consumeMatch("<"))
      {
      // We can't tell the user that the tag has closed until after we have
      // processed any attributes and found the real end of the tag. So save
      // the current info until the end of this tag.
      currentTagStart = _offset - 1;
      currentTagName = consumeElementName();
      if (currentTagName == null)
      { log.warn("Invalid HTML; bare lessthan sign found at line " + getCurrentLineNumber() + ". "+ "Surroundings: '" + getTagSurroundings() +"'."); // remain in STATE_READY; this isn't really the start of // an xml element. }
      else
      { state = STATE_IN_TAG; }
      }
      else
      { // should never get here throw new Error("Internal error at line " + getCurrentLineNumber()); }

      continue;
      }

      if (state == STATE_IN_COMMENT)
      {
      // TODO: handle "-- >", which is a valid way to close a
      // comment according to the specs.

      // in this state, nothing but "--" has any significance
      consumeExcept("-");
      if (isFinished())
      { break; }

      if (consumeMatch("-->"))

      { state = STATE_READY; }
      else
      { // false call; hyphen is not end of comment consumeMatch("-"); }

      continue;
      }

      if (state == STATE_IN_TAG)
      {
      consumeWhitespace();

      if (consumeMatch("/>"))
      { // ok, end of element state = STATE_READY; closedTag(currentTagStart, _offset, currentTagName); // and reset vars just in case... currentTagStart = -1; currentTagName = null; }
      else if (consumeMatch(">"))
      {
      if (currentTagName.equalsIgnoreCase("script")
      || currentTagName.equalsIgnoreCase("style"))
      { // We've just started a special tag which can contain anything except // the ETAGO marker ("</"). See // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data state = STATE_EXPECTING_ETAGO; // L. Ulrich, 23.09.2009: // set currentEtagoTagName currentEtagoTagName = currentTagName; }
      else
      { state = STATE_READY; }

      // end of open tag, but not end of element
      openedTag(currentTagStart, _offset, currentTagName);

      // and reset vars just in case...
      currentTagStart = -1;
      currentTagName = null;
      }
      else
      {
      // xml attribute
      String attrName = consumeAttrName();
      if (attrName == null)
      { // Oops, we found something quite unexpected in this tag. // The best we can do is probably to drop back to looking // for "/>", though that does risk us misinterpreting the // contents of an attribute's associated string value. log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'" + " at line " + getCurrentLineNumber()+". "+ "Surroundings: '" + getTagSurroundings() +"'."); state = STATE_EXPECTING_ETAGO; // and consume one character ++_offset; }
      else
      {
      consumeWhitespace();

      // html can have "stand-alone" attributes with no following equals sign
      if (consumeMatch("="))
      { consumeAttrValue(); }
      }
      }

      continue;
      }

      if (state == STATE_IN_MARKED_SECTION)
      {
      // in this state, nothing but "]]>" has any significance
      consumeExcept("]");
      if (isFinished())
      { break; }

      if (consumeMatch("]]>"))
      { state = STATE_READY; }

      else

      { // false call; ] is not end of cdata section consumeMatch("]"); }

      continue;
      }

      if (state == STATE_EXPECTING_ETAGO)
      {
      // The term "ETAGO" is the official spec term for "</".
      consumeExcept("<");
      if (isFinished())

      { log.debug("Malformed input page; input terminated while tag not closed."); break; }

      if (consumeMatch("</"))
      {
      // L. Ulrich, 23.09.2009:
      // Workaround to skip other tags used within scripts:
      // Test if the closed tag refers to currentEtagoTagName.
      // Example:
      // <script> function foo()

      { var str = "</body>"; ... }

      </script>
      // => do not tread </body> as the script closing tag
      //
      // Note that this will still not work as expected
      // in case of recursive tags.
      // Example:
      // <script> function foo()

      { var str = "</script>"; ... }

      </script>
      CharSequence str = this._seq.subSequence(this._offset,
      this._offset + currentEtagoTagName.length());
      if (str.toString().equals(currentEtagoTagName))
      {
      if (!processEndTag())

      { return; }

      state = STATE_READY;
      currentEtagoTagName = null;
      }
      }
      else

      { // false call; < does not start an ETAGO consumeMatch("<"); }

      continue;
      }
      }
      }

      Attachments

        Activity

          People

            Unassigned Unassigned
            lutzulrich Lutz Ulruch
            Votes:
            1 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated: