Pig
  1. Pig
  2. PIG-3304

XMLLoader in piggybank does not work with inline closed tags

    Details

    • Type: Bug Bug
    • Status: Closed
    • Priority: Major Major
    • Resolution: Fixed
    • Affects Version/s: 0.11.1
    • Fix Version/s: 0.12.0
    • Component/s: piggybank
    • Labels:
    • Patch Info:
      Patch Available
    • Hadoop Flags:
      Reviewed
    • Tags:
      xml

      Description

      The XMLLoader fails to return elements when tags are closed inline such as
      <event id="342"/>

        Activity

        Hide
        Daniel Dai added a comment -

        Piggybank tests pass. Patch commmitted to trunk. Thanks Ahmed!

        Show
        Daniel Dai added a comment - Piggybank tests pass. Patch commmitted to trunk. Thanks Ahmed!
        Hide
        Ahmed Eldawy added a comment -

        An updated path that handles nested elements with inline close tag

        Show
        Ahmed Eldawy added a comment - An updated path that handles nested elements with inline close tag
        Hide
        Ahmed Eldawy added a comment -

        Found a bug with nested entities that are closed inline

        Show
        Ahmed Eldawy added a comment - Found a bug with nested entities that are closed inline
        Hide
        Ahmed Eldawy added a comment -

        A patch that solves the bug

        Show
        Ahmed Eldawy added a comment - A patch that solves the bug
        Hide
        Ahmed Eldawy added a comment -

        diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
        index 589a545..9daa1a4 100644
        — java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
        +++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
        @@ -212,10 +212,16 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre
        //startTag[tmp.length+1] = (byte)'>';

        + // Used to detect tags that are closed inline
        + byte[] inlineCloseTag =

        {'/', '>'}

        ;

        ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
        int idxTagChar = 0;
        int idxStartTagChar = 0;
        + int idxInlineCloseTagChar = 0;
        + // A flag to indicate that we are currently inside the tag to be matched
        + // Initially set to true as skipToTag has been called earlier
        + boolean insideMatchTag = true;
        boolean startTagMatched = false;
        /*

        • Read till an end tag is found.It need not check for any condition since it
          @@ -247,10 +253,18 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre

        if (b == startTag[idxStartTagChar])

        { ++idxStartTagChar; - if(idxStartTagChar == startTag.length) - startTagMatched = true ; // Set the flag as true if start tag matches - }

        else

        • idxStartTagChar = 0;
          + if(idxStartTagChar == startTag.length) { + startTagMatched = true ; // Set the flag as true if start tag matches + // We are currently inside the tag to be matched + insideMatchTag = true; + }

          + } else

          Unknown macro: {+ idxStartTagChar = 0;+ if (idxStartTagChar > 1) { + // Matched only a part of the start tag of some element + insideMatchTag = false; + }+ }

        @@ -268,6 +282,23 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre
        } else
        idxTagChar = 0;

        + if (b == inlineCloseTag[idxInlineCloseTagChar]) {
        + idxInlineCloseTagChar++;
        + if (idxInlineCloseTagChar == inlineCloseTag.length) {
        + idxInlineCloseTagChar = 0;
        + if (insideMatchTag) {
        + if(nestedTags==0) // Break the loop if there were no nested tags
        + break;
        + else

        { + --nestedTags; // Else decrement the count + idxInlineCloseTagChar = 0; // Reset the index + }

        + }
        + }
        + } else

        { + idxInlineCloseTagChar = 0; + }

        +
        }
        catch (IOException e) {
        this.setReadable(false);
        @@ -339,7 +370,7 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre
        break;
        case S_MATCH_PREFIX:
        // tag match iff next character is whitespaces or close tag mark

        • if (b == ' ' || b == '\t' || b == '>') {
          + if (Character.isWhitespace(b) || b == '/' || b == '>') { matchBuf.write((byte)(b)); state = S_MATCH_TAG; }

          else

          { @@ -355,7 +386,7 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre default: throw new IllegalArgumentException("Invalid state: " + state); }
        • if (state == S_MATCH_TAG && b == '>') {
          + if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) { break; }

          if (state != S_MATCH_TAG && this.getPosition() > limit) {
          @@ -406,6 +437,12 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre
          byte[] collectTag(String tagName, long limit) throws IOException {
          ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
          byte[] beginTag = skipToTag(tagName, limit);
          +
          + // Check if the tag is closed inline
          + if (beginTag.length > 2 && beginTag[beginTag.length - 2] == '/' &&
          + beginTag[beginTag.length-1] == '>')

          { + return beginTag; + }

        // No need to search for the end tag if the start tag is not found
        if(beginTag.length > 0 ){
        diff --git java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
        index 4adc9cd..f83f0d9 100644
        — java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
        +++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
        @@ -75,6 +75,15 @@ public class TestXMLLoader extends TestCase {
        nestedTags.add(new String[]

        { "</events>"});
        }

        + public static ArrayList<String[]> inlineClosedTags = new ArrayList<String[]>();
        + static {
        + inlineClosedTags.add(new String[] { "<events>"});
        + inlineClosedTags.add(new String[] { "<event id='3423'/>"});
        + inlineClosedTags.add(new String[] { "<event/>"});
        + inlineClosedTags.add(new String[] { "<event><event/></event>"});
        + inlineClosedTags.add(new String[] { "</events>"}

        );
        + }
        +
        public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception
        {
        String filename = TestHelper.createTempFile(data, "");
        @@ -333,5 +342,26 @@ public class TestXMLLoader extends TestCase

        { assertEquals(3, tupleCount); }
        • +
          + public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception {
          + String filename = TestHelper.createTempFile(inlineClosedTags, "");
          + PigServer pig = new PigServer(LOCAL);
          + filename = filename.replace("
          ", "\\\\");
          + patternString = patternString.replace("
          ", "\\\\");
          + String query = "A = LOAD 'file:" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
          + pig.registerQuery(query);
          + Iterator<?> it = pig.openIterator("A");
          + int tupleCount = 0;
          + while (it.hasNext()) {
          + Tuple tuple = (Tuple) it.next();
          + if (tuple == null)
          + break;
          + else

          Unknown macro: {+ if (tuple.size() > 0) { + tupleCount++; + }+ }

          + }
          + assertEquals(3, tupleCount);
          + }
          }

        Show
        Ahmed Eldawy added a comment - diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java index 589a545..9daa1a4 100644 — java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java +++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java @@ -212,10 +212,16 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre //startTag [tmp.length+1] = (byte)'>'; + // Used to detect tags that are closed inline + byte[] inlineCloseTag = {'/', '>'} ; ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024); int idxTagChar = 0; int idxStartTagChar = 0; + int idxInlineCloseTagChar = 0; + // A flag to indicate that we are currently inside the tag to be matched + // Initially set to true as skipToTag has been called earlier + boolean insideMatchTag = true; boolean startTagMatched = false; /* Read till an end tag is found.It need not check for any condition since it @@ -247,10 +253,18 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre if (b == startTag [idxStartTagChar] ) { ++idxStartTagChar; - if(idxStartTagChar == startTag.length) - startTagMatched = true ; // Set the flag as true if start tag matches - } else idxStartTagChar = 0; + if(idxStartTagChar == startTag.length) { + startTagMatched = true ; // Set the flag as true if start tag matches + // We are currently inside the tag to be matched + insideMatchTag = true; + } + } else Unknown macro: {+ idxStartTagChar = 0;+ if (idxStartTagChar > 1) { + // Matched only a part of the start tag of some element + insideMatchTag = false; + }+ } @@ -268,6 +282,23 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre } else idxTagChar = 0; + if (b == inlineCloseTag [idxInlineCloseTagChar] ) { + idxInlineCloseTagChar++; + if (idxInlineCloseTagChar == inlineCloseTag.length) { + idxInlineCloseTagChar = 0; + if (insideMatchTag) { + if(nestedTags==0) // Break the loop if there were no nested tags + break; + else { + --nestedTags; // Else decrement the count + idxInlineCloseTagChar = 0; // Reset the index + } + } + } + } else { + idxInlineCloseTagChar = 0; + } + } catch (IOException e) { this.setReadable(false); @@ -339,7 +370,7 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre break; case S_MATCH_PREFIX: // tag match iff next character is whitespaces or close tag mark if (b == ' ' || b == '\t' || b == '>') { + if (Character.isWhitespace(b) || b == '/' || b == '>') { matchBuf.write((byte)(b)); state = S_MATCH_TAG; } else { @@ -355,7 +386,7 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre default: throw new IllegalArgumentException("Invalid state: " + state); } if (state == S_MATCH_TAG && b == '>') { + if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) { break; } if (state != S_MATCH_TAG && this.getPosition() > limit) { @@ -406,6 +437,12 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre byte[] collectTag(String tagName, long limit) throws IOException { ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024); byte[] beginTag = skipToTag(tagName, limit); + + // Check if the tag is closed inline + if (beginTag.length > 2 && beginTag [beginTag.length - 2] == '/' && + beginTag [beginTag.length-1] == '>') { + return beginTag; + } // No need to search for the end tag if the start tag is not found if(beginTag.length > 0 ){ diff --git java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java index 4adc9cd..f83f0d9 100644 — java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java +++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java @@ -75,6 +75,15 @@ public class TestXMLLoader extends TestCase { nestedTags.add(new String[] { "</events>"}); } + public static ArrayList<String[]> inlineClosedTags = new ArrayList<String[]>(); + static { + inlineClosedTags.add(new String[] { "<events>"}); + inlineClosedTags.add(new String[] { "<event id='3423'/>"}); + inlineClosedTags.add(new String[] { "<event/>"}); + inlineClosedTags.add(new String[] { "<event><event/></event>"}); + inlineClosedTags.add(new String[] { "</events>"} ); + } + public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception { String filename = TestHelper.createTempFile(data, ""); @@ -333,5 +342,26 @@ public class TestXMLLoader extends TestCase { assertEquals(3, tupleCount); } + + public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception { + String filename = TestHelper.createTempFile(inlineClosedTags, ""); + PigServer pig = new PigServer(LOCAL); + filename = filename.replace(" ", "\\\\"); + patternString = patternString.replace(" ", "\\\\"); + String query = "A = LOAD 'file:" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);"; + pig.registerQuery(query); + Iterator<?> it = pig.openIterator("A"); + int tupleCount = 0; + while (it.hasNext()) { + Tuple tuple = (Tuple) it.next(); + if (tuple == null) + break; + else Unknown macro: {+ if (tuple.size() > 0) { + tupleCount++; + }+ } + } + assertEquals(3, tupleCount); + } }

          People

          • Assignee:
            Ahmed Eldawy
            Reporter:
            Ahmed Eldawy
          • Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

            Dates

            • Created:
              Updated:
              Resolved:

              Development