Uploaded image for project: 'IMPALA'
  1. IMPALA
  2. IMPALA-4436

StringValue::StringCompare bug for non-ascii string

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Critical
    • Resolution: Fixed
    • Impala 2.8.0
    • Impala 2.8.0
    • Backend

    Description

      StringCompare has a huge bug when string contains non-ascii char (>127,for example: Chinese )。

      Here is my test code:

        std::string a = "\xE9 00000000000000000";                                                                            
        std::string b = "\x32 00000000000000000";                                                                            
        std::string c = "\xE5 00";
        // return -73, a < b
        printf("%d\n", StringCompare(a.c_str(), a.size(), b.c_str(), b.size(), std::min(a.size(), b.size()))); 
       // return -179, b < c
        printf("%d\n", StringCompare(b.c_str(), b.size(), c.c_str(), c.size(), std::min(b.size(), c.size())));        
        // return 4, a > c ??  
        printf("%d\n", StringCompare(a.c_str(), a.size(), c.c_str(), c.size(), std::min(a.size(), c.size())));  
      

      Here is the function StringCompare in string-value.inline.h:

      static inline int StringCompare(const char* s1, int n1, const char* s2, int n2, int len) {
        DCHECK_EQ(len, std::min(n1, n2));
        if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
          while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
            __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
            __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
            int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
                SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1,
                SSEUtil::CHARS_PER_128_BIT_REGISTER);
            if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
              // BUG: use char perform subtraction
              return s1[chars_match] - s2[chars_match]
            }
            len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
            s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
            s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
          }
        }
        // TODO: for some reason memcmp is way slower than strncmp (2.5x)  why?
        int result = strncmp(s1, s2, len);
        if (result != 0) return result;
        return n1 - n2;
      }
      

      but in the code of strncmp:

      int
      strncmp(const char *s1, const char *s2, register size_t n)
      {
        register unsigned char u1, u2;
      
        while (n-- > 0)
          {
            u1 = (unsigned char) *s1++;
            u2 = (unsigned char) *s2++;
            if (u1 != u2)
              // here use unsigned char to perform subtraction
      	return u1 - u2;
            if (u1 == '\0')
      	return 0;
          }
        return 0;
      }
      

      Attachments

        Issue Links

          Activity

            People

              dhecht Daniel Hecht
              cfreely Fu Lili
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: