Index: lucene/core/src/test/org/apache/lucene/util/TestBitUtil.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/TestBitUtil.java	(révision 1411600)
+++ lucene/core/src/test/org/apache/lucene/util/TestBitUtil.java	(copie de travail)
@@ -1,133 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.util;
-
-public class TestBitUtil extends LuceneTestCase {
-
-  private static int slowNlz(long x) {
-    if (x == 0L) return 64;
-    int nlz = 0;
-    while ( ((x << nlz) & (1L << 63)) == 0) {
-      nlz++;
-    }
-    return nlz;
-  }
-
-  private void checkNlz(long x) {
-    assertEquals(slowNlz(x), BitUtil.nlz(x));
-    assertEquals(Long.numberOfLeadingZeros(x), BitUtil.nlz(x));
-  }
-  
-  public void testNlz() {
-    checkNlz(0L);
-    checkNlz(1L);
-    checkNlz(-1L);
-    for (int i = 1; i <= 63; i++) {
-      checkNlz(1L << i);
-      checkNlz((1L << i) + (1L << (i>>1)));
-    }
-  }
-
-  public void testBitUtils() {
-    long num = 100000;
-    assertEquals( 5, BitUtil.ntz(num) );
-    assertEquals( 5, BitUtil.ntz2(num) );
-    assertEquals( 5, BitUtil.ntz3(num) );
-    
-    num = 10;
-    assertEquals( 1, BitUtil.ntz(num) );
-    assertEquals( 1, BitUtil.ntz2(num) );
-    assertEquals( 1, BitUtil.ntz3(num) );
-
-    for (int i=0; i<64; i++) {
-      num = 1L << i;
-      assertEquals( i, BitUtil.ntz(num) );
-      assertEquals( i, BitUtil.ntz2(num) );
-      assertEquals( i, BitUtil.ntz3(num) );
-    }
-  }
-
-
-  private long testArg(int shift) {
-    return (1L << shift) + (1L << (shift>>1));
-  }
-  
-  private long nlzBitUtilBasicLoop(int iters) {
-    long sumRes = 0;
-    while (iters-- >= 0) {
-      for (int i = 1; i <= 63; i++) {
-        long a = testArg(i);
-        sumRes += BitUtil.nlz(a);
-        sumRes += BitUtil.nlz(a + 1);
-        sumRes += BitUtil.nlz(a - 1);
-        sumRes += BitUtil.nlz(a + 10);
-        sumRes += BitUtil.nlz(a - 10);
-      }
-    }
-    return sumRes;
-  }
-    
-  private long nlzLongBasicLoop(int iters) {
-    long sumRes = 0;
-    while (iters-- >= 0) {
-      for (int i = 1; i <= 63; i++) {
-        long a = testArg(i);
-        sumRes += Long.numberOfLeadingZeros(a);
-        sumRes += Long.numberOfLeadingZeros(a + 1);
-        sumRes += Long.numberOfLeadingZeros(a - 1);
-        sumRes += Long.numberOfLeadingZeros(a + 10);
-        sumRes += Long.numberOfLeadingZeros(a - 10);
-      }
-    }
-    return sumRes;
-  }
-
-  public void tstPerfNlz() { // See LUCENE-3197, prefer to use Long.numberOfLeadingZeros() over BitUtil.nlz().
-    final long measureMilliSecs = 2000;
-    final int basicIters = 100000;
-    long startTime;
-    long endTime;
-    long curTime;
-    long dummy = 0; // avoid optimizing away
-
-    dummy = 0;
-    int bitUtilLoops = 0;
-    startTime = System.currentTimeMillis();
-    endTime = startTime + measureMilliSecs;
-    do {
-      dummy += nlzBitUtilBasicLoop(basicIters);
-      bitUtilLoops++;
-      curTime = System.currentTimeMillis();
-    } while (curTime < endTime);
-    int bitUtilPsTime = (int) (1000000000 * (curTime - startTime) / (basicIters * 5 * 63 * (float) bitUtilLoops));
-    System.out.println("BitUtil nlz time: " + (bitUtilPsTime/1) + " picosec/call, dummy: " + dummy);
-
-
-    dummy = 0;
-    int longLoops = 0;
-    startTime = System.currentTimeMillis();
-    endTime = startTime + measureMilliSecs;
-    do {
-      dummy += nlzLongBasicLoop(basicIters);
-      longLoops++;
-      curTime = System.currentTimeMillis();
-    } while (curTime < endTime);
-    int longPsTime = (int) (1000000000 * (curTime - startTime) / (basicIters * 5 * 63 * (float) longLoops));
-    System.out.println("Long    nlz time: " + longPsTime + " picosec/call, dummy: " + dummy);
-  }
-}
Index: lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java	(révision 1411600)
+++ lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java	(copie de travail)
@@ -155,13 +155,13 @@
     long word = bits[i] >> subIndex;  // skip all the bits to the right of index
 
     if (word!=0) {
-      return (i<<6) + subIndex + BitUtil.ntz(word);
+      return (i<<6) + subIndex + Long.numberOfTrailingZeros(word);
     }
 
     while(++i < bits.length) {
       word = bits[i];
       if (word != 0) {
-        return (i<<6) + BitUtil.ntz(word);
+        return (i<<6) + Long.numberOfTrailingZeros(word);
       }
     }
 
Index: lucene/core/src/java/org/apache/lucene/util/OpenBitSet.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/OpenBitSet.java	(révision 1411600)
+++ lucene/core/src/java/org/apache/lucene/util/OpenBitSet.java	(copie de travail)
@@ -629,12 +629,12 @@
     long word = bits[i] >> subIndex;  // skip all the bits to the right of index
 
     if (word!=0) {
-      return (i<<6) + subIndex + BitUtil.ntz(word);
+      return (i<<6) + subIndex + Long.numberOfTrailingZeros(word);
     }
 
     while(++i < wlen) {
       word = bits[i];
-      if (word!=0) return (i<<6) + BitUtil.ntz(word);
+      if (word!=0) return (i<<6) + Long.numberOfTrailingZeros(word);
     }
 
     return -1;
@@ -650,12 +650,12 @@
     long word = bits[i] >>> subIndex;  // skip all the bits to the right of index
 
     if (word!=0) {
-      return (((long)i)<<6) + (subIndex + BitUtil.ntz(word));
+      return (((long)i)<<6) + (subIndex + Long.numberOfTrailingZeros(word));
     }
 
     while(++i < wlen) {
       word = bits[i];
-      if (word!=0) return (((long)i)<<6) + BitUtil.ntz(word);
+      if (word!=0) return (((long)i)<<6) + Long.numberOfTrailingZeros(word);
     }
 
     return -1;
Index: lucene/core/src/java/org/apache/lucene/util/BitUtil.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/BitUtil.java	(révision 1411600)
+++ lucene/core/src/java/org/apache/lucene/util/BitUtil.java	(copie de travail)
@@ -24,793 +24,63 @@
 
   private BitUtil() {} // no instance
 
-  /** Returns the number of bits set in the long */
-  public static int pop(long x) {
-  /* Hacker's Delight 32 bit pop function:
-   * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
-   *
-  int pop(unsigned x) {
-     x = x - ((x >> 1) & 0x55555555);
-     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-     x = (x + (x >> 4)) & 0x0F0F0F0F;
-     x = x + (x >> 8);
-     x = x + (x >> 16);
-     return x & 0x0000003F;
-    }
-  ***/
+  // The pop methods used to rely on bit-manipulation tricks for speed but it
+  // turns out that it is faster to use the Long.bitCount method (which is an
+  // intrinsic since Java 6u18) in a naive loop, see LUCENE-2221
 
-    // 64 bit java version of the C function from above
-    x = x - ((x >>> 1) & 0x5555555555555555L);
-    x = (x & 0x3333333333333333L) + ((x >>>2 ) & 0x3333333333333333L);
-    x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL;
-    x = x + (x >>> 8);
-    x = x + (x >>> 16);
-    x = x + (x >>> 32);
-    return ((int)x) & 0x7F;
-  }
-
   /*** Returns the number of set bits in an array of longs. */
-  public static long pop_array(long A[], int wordOffset, int numWords) {
-    /*
-    * Robert Harley and David Seal's bit counting algorithm, as documented
-    * in the revisions of Hacker's Delight
-    * http://www.hackersdelight.org/revisions.pdf
-    * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
-    *
-    * This function was adapted to Java, and extended to use 64 bit words.
-    * if only we had access to wider registers like SSE from java...
-    *
-    * This function can be transformed to compute the popcount of other functions
-    * on bitsets via something like this:
-    * sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
-    *
-    */
-    int n = wordOffset+numWords;
-    long tot=0, tot8=0;
-    long ones=0, twos=0, fours=0;
-
-    int i;
-    for (i = wordOffset; i <= n - 8; i+=8) {
-      /***  C macro from Hacker's Delight
-       #define CSA(h,l, a,b,c) \
-       {unsigned u = a ^ b; unsigned v = c; \
-       h = (a & b) | (u & v); l = u ^ v;}
-       ***/
-
-      long twosA,twosB,foursA,foursB,eights;
-
-      // CSA(twosA, ones, ones, A[i], A[i+1])
-      {
-        long b=A[i], c=A[i+1];
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, A[i+2], A[i+3])
-      {
-        long b=A[i+2], c=A[i+3];
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursA, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      //CSA(twosA, ones, ones, A[i+4], A[i+5])
-      {
-        long b=A[i+4], c=A[i+5];
-        long u=ones^b;
-        twosA=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, A[i+6], A[i+7])
-      {
-        long b=A[i+6], c=A[i+7];
-        long u=ones^b;
-        twosB=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursB, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursB=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-
-      //CSA(eights, fours, fours, foursA, foursB)
-      {
-        long u=fours^foursA;
-        eights=(fours&foursA)|(u&foursB);
-        fours=u^foursB;
-      }
-      tot8 += pop(eights);
+  public static long pop_array(long[] arr, int wordOffset, int numWords) {
+    long popCount = 0;
+    for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
+      popCount += Long.bitCount(arr[i]);
     }
-
-    // handle trailing words in a binary-search manner...
-    // derived from the loop above by setting specific elements to 0.
-    // the original method in Hackers Delight used a simple for loop:
-    //   for (i = i; i < n; i++)      // Add in the last elements
-    //  tot = tot + pop(A[i]);
-
-    if (i<=n-4) {
-      long twosA, twosB, foursA, eights;
-      {
-        long b=A[i], c=A[i+1];
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      {
-        long b=A[i+2], c=A[i+3];
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=4;
-    }
-
-    if (i<=n-2) {
-      long b=A[i], c=A[i+1];
-      long u=ones ^ b;
-      long twosA=(ones & b)|( u & c);
-      ones=u^c;
-
-      long foursA=twos&twosA;
-      twos=twos^twosA;
-
-      long eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=2;
-    }
-
-    if (i<n) {
-      tot += pop(A[i]);
-    }
-
-    tot += (pop(fours)<<2)
-            + (pop(twos)<<1)
-            + pop(ones)
-            + (tot8<<3);
-
-    return tot;
+    return popCount;
   }
 
   /** Returns the popcount or cardinality of the two sets after an intersection.
    * Neither array is modified.
    */
-  public static long pop_intersect(long A[], long B[], int wordOffset, int numWords) {
-    // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
-    int n = wordOffset+numWords;
-    long tot=0, tot8=0;
-    long ones=0, twos=0, fours=0;
-
-    int i;
-    for (i = wordOffset; i <= n - 8; i+=8) {
-      long twosA,twosB,foursA,foursB,eights;
-
-      // CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1]))
-      {
-        long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3]))
-      {
-        long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursA, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      //CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5]))
-      {
-        long b=(A[i+4] & B[i+4]), c=(A[i+5] & B[i+5]);
-        long u=ones^b;
-        twosA=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7]))
-      {
-        long b=(A[i+6] & B[i+6]), c=(A[i+7] & B[i+7]);
-        long u=ones^b;
-        twosB=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursB, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursB=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-
-      //CSA(eights, fours, fours, foursA, foursB)
-      {
-        long u=fours^foursA;
-        eights=(fours&foursA)|(u&foursB);
-        fours=u^foursB;
-      }
-      tot8 += pop(eights);
+  public static long pop_intersect(long[] arr1, long[] arr2, int wordOffset, int numWords) {
+    long popCount = 0;
+    for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
+      popCount += Long.bitCount(arr1[i] & arr2[i]);
     }
-
-
-    if (i<=n-4) {
-      long twosA, twosB, foursA, eights;
-      {
-        long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      {
-        long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=4;
-    }
-
-    if (i<=n-2) {
-      long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
-      long u=ones ^ b;
-      long twosA=(ones & b)|( u & c);
-      ones=u^c;
-
-      long foursA=twos&twosA;
-      twos=twos^twosA;
-
-      long eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=2;
-    }
-
-    if (i<n) {
-      tot += pop((A[i] & B[i]));
-    }
-
-    tot += (pop(fours)<<2)
-            + (pop(twos)<<1)
-            + pop(ones)
-            + (tot8<<3);
-
-    return tot;
+    return popCount;
   }
 
   /** Returns the popcount or cardinality of the union of two sets.
     * Neither array is modified.
     */
-   public static long pop_union(long A[], long B[], int wordOffset, int numWords) {
-     // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \| B[\1]\)/g'
-     int n = wordOffset+numWords;
-     long tot=0, tot8=0;
-     long ones=0, twos=0, fours=0;
-
-     int i;
-     for (i = wordOffset; i <= n - 8; i+=8) {
-       /***  C macro from Hacker's Delight
-        #define CSA(h,l, a,b,c) \
-        {unsigned u = a ^ b; unsigned v = c; \
-        h = (a & b) | (u & v); l = u ^ v;}
-        ***/
-
-       long twosA,twosB,foursA,foursB,eights;
-
-       // CSA(twosA, ones, ones, (A[i] | B[i]), (A[i+1] | B[i+1]))
-       {
-         long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
-         long u=ones ^ b;
-         twosA=(ones & b)|( u & c);
-         ones=u^c;
-       }
-       // CSA(twosB, ones, ones, (A[i+2] | B[i+2]), (A[i+3] | B[i+3]))
-       {
-         long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
-         long u=ones^b;
-         twosB =(ones&b)|(u&c);
-         ones=u^c;
-       }
-       //CSA(foursA, twos, twos, twosA, twosB)
-       {
-         long u=twos^twosA;
-         foursA=(twos&twosA)|(u&twosB);
-         twos=u^twosB;
-       }
-       //CSA(twosA, ones, ones, (A[i+4] | B[i+4]), (A[i+5] | B[i+5]))
-       {
-         long b=(A[i+4] | B[i+4]), c=(A[i+5] | B[i+5]);
-         long u=ones^b;
-         twosA=(ones&b)|(u&c);
-         ones=u^c;
-       }
-       // CSA(twosB, ones, ones, (A[i+6] | B[i+6]), (A[i+7] | B[i+7]))
-       {
-         long b=(A[i+6] | B[i+6]), c=(A[i+7] | B[i+7]);
-         long u=ones^b;
-         twosB=(ones&b)|(u&c);
-         ones=u^c;
-       }
-       //CSA(foursB, twos, twos, twosA, twosB)
-       {
-         long u=twos^twosA;
-         foursB=(twos&twosA)|(u&twosB);
-         twos=u^twosB;
-       }
-
-       //CSA(eights, fours, fours, foursA, foursB)
-       {
-         long u=fours^foursA;
-         eights=(fours&foursA)|(u&foursB);
-         fours=u^foursB;
-       }
-       tot8 += pop(eights);
+   public static long pop_union(long[] arr1, long[] arr2, int wordOffset, int numWords) {
+     long popCount = 0;
+     for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
+       popCount += Long.bitCount(arr1[i] | arr2[i]);
      }
-
-
-     if (i<=n-4) {
-       long twosA, twosB, foursA, eights;
-       {
-         long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
-         long u=ones ^ b;
-         twosA=(ones & b)|( u & c);
-         ones=u^c;
-       }
-       {
-         long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
-         long u=ones^b;
-         twosB =(ones&b)|(u&c);
-         ones=u^c;
-       }
-       {
-         long u=twos^twosA;
-         foursA=(twos&twosA)|(u&twosB);
-         twos=u^twosB;
-       }
-       eights=fours&foursA;
-       fours=fours^foursA;
-
-       tot8 += pop(eights);
-       i+=4;
-     }
-
-     if (i<=n-2) {
-       long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
-       long u=ones ^ b;
-       long twosA=(ones & b)|( u & c);
-       ones=u^c;
-
-       long foursA=twos&twosA;
-       twos=twos^twosA;
-
-       long eights=fours&foursA;
-       fours=fours^foursA;
-
-       tot8 += pop(eights);
-       i+=2;
-     }
-
-     if (i<n) {
-       tot += pop((A[i] | B[i]));
-     }
-
-     tot += (pop(fours)<<2)
-             + (pop(twos)<<1)
-             + pop(ones)
-             + (tot8<<3);
-
-     return tot;
+     return popCount;
    }
 
   /** Returns the popcount or cardinality of A & ~B
    * Neither array is modified.
    */
-  public static long pop_andnot(long A[], long B[], int wordOffset, int numWords) {
-    // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& ~B[\1]\)/g'
-    int n = wordOffset+numWords;
-    long tot=0, tot8=0;
-    long ones=0, twos=0, fours=0;
-
-    int i;
-    for (i = wordOffset; i <= n - 8; i+=8) {
-      /***  C macro from Hacker's Delight
-       #define CSA(h,l, a,b,c) \
-       {unsigned u = a ^ b; unsigned v = c; \
-       h = (a & b) | (u & v); l = u ^ v;}
-       ***/
-
-      long twosA,twosB,foursA,foursB,eights;
-
-      // CSA(twosA, ones, ones, (A[i] & ~B[i]), (A[i+1] & ~B[i+1]))
-      {
-        long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, (A[i+2] & ~B[i+2]), (A[i+3] & ~B[i+3]))
-      {
-        long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursA, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      //CSA(twosA, ones, ones, (A[i+4] & ~B[i+4]), (A[i+5] & ~B[i+5]))
-      {
-        long b=(A[i+4] & ~B[i+4]), c=(A[i+5] & ~B[i+5]);
-        long u=ones^b;
-        twosA=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, (A[i+6] & ~B[i+6]), (A[i+7] & ~B[i+7]))
-      {
-        long b=(A[i+6] & ~B[i+6]), c=(A[i+7] & ~B[i+7]);
-        long u=ones^b;
-        twosB=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursB, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursB=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-
-      //CSA(eights, fours, fours, foursA, foursB)
-      {
-        long u=fours^foursA;
-        eights=(fours&foursA)|(u&foursB);
-        fours=u^foursB;
-      }
-      tot8 += pop(eights);
+  public static long pop_andnot(long[] arr1, long[] arr2, int wordOffset, int numWords) {
+    long popCount = 0;
+    for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
+      popCount += Long.bitCount(arr1[i] & ~arr2[i]);
     }
-
-
-    if (i<=n-4) {
-      long twosA, twosB, foursA, eights;
-      {
-        long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      {
-        long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=4;
-    }
-
-    if (i<=n-2) {
-      long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
-      long u=ones ^ b;
-      long twosA=(ones & b)|( u & c);
-      ones=u^c;
-
-      long foursA=twos&twosA;
-      twos=twos^twosA;
-
-      long eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=2;
-    }
-
-    if (i<n) {
-      tot += pop((A[i] & ~B[i]));
-    }
-
-    tot += (pop(fours)<<2)
-            + (pop(twos)<<1)
-            + pop(ones)
-            + (tot8<<3);
-
-    return tot;
+    return popCount;
   }
 
-  public static long pop_xor(long A[], long B[], int wordOffset, int numWords) {
-    int n = wordOffset+numWords;
-    long tot=0, tot8=0;
-    long ones=0, twos=0, fours=0;
-
-    int i;
-    for (i = wordOffset; i <= n - 8; i+=8) {
-      /***  C macro from Hacker's Delight
-       #define CSA(h,l, a,b,c) \
-       {unsigned u = a ^ b; unsigned v = c; \
-       h = (a & b) | (u & v); l = u ^ v;}
-       ***/
-
-      long twosA,twosB,foursA,foursB,eights;
-
-      // CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i+1] ^ B[i+1]))
-      {
-        long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, (A[i+2] ^ B[i+2]), (A[i+3] ^ B[i+3]))
-      {
-        long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursA, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      //CSA(twosA, ones, ones, (A[i+4] ^ B[i+4]), (A[i+5] ^ B[i+5]))
-      {
-        long b=(A[i+4] ^ B[i+4]), c=(A[i+5] ^ B[i+5]);
-        long u=ones^b;
-        twosA=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      // CSA(twosB, ones, ones, (A[i+6] ^ B[i+6]), (A[i+7] ^ B[i+7]))
-      {
-        long b=(A[i+6] ^ B[i+6]), c=(A[i+7] ^ B[i+7]);
-        long u=ones^b;
-        twosB=(ones&b)|(u&c);
-        ones=u^c;
-      }
-      //CSA(foursB, twos, twos, twosA, twosB)
-      {
-        long u=twos^twosA;
-        foursB=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-
-      //CSA(eights, fours, fours, foursA, foursB)
-      {
-        long u=fours^foursA;
-        eights=(fours&foursA)|(u&foursB);
-        fours=u^foursB;
-      }
-      tot8 += pop(eights);
+  /** Returns the popcount or cardinality of A ^ B
+   * Neither array is modified.
+   */
+  public static long pop_xor(long[] arr1, long[] arr2, int wordOffset, int numWords) {
+    long popCount = 0;
+    for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
+      popCount += Long.bitCount(arr1[i] ^ arr2[i]);
     }
-
-
-    if (i<=n-4) {
-      long twosA, twosB, foursA, eights;
-      {
-        long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
-        long u=ones ^ b;
-        twosA=(ones & b)|( u & c);
-        ones=u^c;
-      }
-      {
-        long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
-        long u=ones^b;
-        twosB =(ones&b)|(u&c);
-        ones=u^c;
-      }
-      {
-        long u=twos^twosA;
-        foursA=(twos&twosA)|(u&twosB);
-        twos=u^twosB;
-      }
-      eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=4;
-    }
-
-    if (i<=n-2) {
-      long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
-      long u=ones ^ b;
-      long twosA=(ones & b)|( u & c);
-      ones=u^c;
-
-      long foursA=twos&twosA;
-      twos=twos^twosA;
-
-      long eights=fours&foursA;
-      fours=fours^foursA;
-
-      tot8 += pop(eights);
-      i+=2;
-    }
-
-    if (i<n) {
-      tot += pop((A[i] ^ B[i]));
-    }
-
-    tot += (pop(fours)<<2)
-            + (pop(twos)<<1)
-            + pop(ones)
-            + (tot8<<3);
-
-    return tot;
+    return popCount;
   }
 
-  /* python code to generate ntzTable
-  def ntz(val):
-    if val==0: return 8
-    i=0
-    while (val&0x01)==0:
-      i = i+1
-      val >>= 1
-    return i
-  print ','.join([ str(ntz(i)) for i in range(256) ])
-  ***/
-  /** table of number of trailing zeros in a byte */
-  public static final byte[] ntzTable = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
-
-
-  /** Returns number of trailing zeros in a 64 bit long value. */
-  public static int ntz(long val) {
-    // A full binary search to determine the low byte was slower than
-    // a linear search for nextSetBit().  This is most likely because
-    // the implementation of nextSetBit() shifts bits to the right, increasing
-    // the probability that the first non-zero byte is in the rhs.
-    //
-    // This implementation does a single binary search at the top level only
-    // so that all other bit shifting can be done on ints instead of longs to
-    // remain friendly to 32 bit architectures.  In addition, the case of a
-    // non-zero first byte is checked for first because it is the most common
-    // in dense bit arrays.
-
-    int lower = (int)val;
-    int lowByte = lower & 0xff;
-    if (lowByte != 0) return ntzTable[lowByte];
-
-    if (lower!=0) {
-      lowByte = (lower>>>8) & 0xff;
-      if (lowByte != 0) return ntzTable[lowByte] + 8;
-      lowByte = (lower>>>16) & 0xff;
-      if (lowByte != 0) return ntzTable[lowByte] + 16;
-      // no need to mask off low byte for the last byte in the 32 bit word
-      // no need to check for zero on the last byte either.
-      return ntzTable[lower>>>24] + 24;
-    } else {
-      // grab upper 32 bits
-      int upper=(int)(val>>32);
-      lowByte = upper & 0xff;
-      if (lowByte != 0) return ntzTable[lowByte] + 32;
-      lowByte = (upper>>>8) & 0xff;
-      if (lowByte != 0) return ntzTable[lowByte] + 40;
-      lowByte = (upper>>>16) & 0xff;
-      if (lowByte != 0) return ntzTable[lowByte] + 48;
-      // no need to mask off low byte for the last byte in the 32 bit word
-      // no need to check for zero on the last byte either.
-      return ntzTable[upper>>>24] + 56;
-    }
-  }
-
-  /** Returns number of trailing zeros in a 32 bit int value. */
-  public static int ntz(int val) {
-    // This implementation does a single binary search at the top level only.
-    // In addition, the case of a non-zero first byte is checked for first
-    // because it is the most common in dense bit arrays.
-
-    int lowByte = val & 0xff;
-    if (lowByte != 0) return ntzTable[lowByte];
-    lowByte = (val>>>8) & 0xff;
-    if (lowByte != 0) return ntzTable[lowByte] + 8;
-    lowByte = (val>>>16) & 0xff;
-    if (lowByte != 0) return ntzTable[lowByte] + 16;
-    // no need to mask off low byte for the last byte.
-    // no need to check for zero on the last byte either.
-    return ntzTable[val>>>24] + 24;
-  }
-
-  /** returns 0 based index of first set bit
-   * (only works for x!=0)
-   * <br/> This is an alternate implementation of ntz()
-   */
-  public static int ntz2(long x) {
-   int n = 0;
-   int y = (int)x;
-   if (y==0) {n+=32; y = (int)(x>>>32); }   // the only 64 bit shift necessary
-   if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; }
-   if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; }
-   return (ntzTable[ y & 0xff ]) + n;
-  }
-
-  /** returns 0 based index of first set bit
-   * <br/> This is an alternate implementation of ntz()
-   */
-  public static int ntz3(long x) {
-   // another implementation taken from Hackers Delight, extended to 64 bits
-   // and converted to Java.
-   // Many 32 bit ntz algorithms are at http://www.hackersdelight.org/HDcode/ntz.cc
-   int n = 1;
-
-   // do the first step as a long, all others as ints.
-   int y = (int)x;
-   if (y==0) {n+=32; y = (int)(x>>>32); }
-   if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; }
-   if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; }
-   if ((y & 0x0000000F) == 0) { n+=4; y>>>=4; }
-   if ((y & 0x00000003) == 0) { n+=2; y>>>=2; }
-   return n - (y & 1);
-  }
-
-  /** table of number of leading zeros in a byte */
-  public static final byte[] nlzTable = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-  /** Returns the number of leading zero bits.
-   */
-  public static int nlz(long x) {
-   int n = 0;
-   // do the first step as a long
-   int y = (int)(x>>>32);
-   if (y==0) {n+=32; y = (int)(x); }
-   if ((y & 0xFFFF0000) == 0) { n+=16; y<<=16; }
-   if ((y & 0xFF000000) == 0) { n+=8; y<<=8; }
-   return n + nlzTable[y >>> 24];
- /* implementation without table:
-   if ((y & 0xF0000000) == 0) { n+=4; y<<=4; }
-   if ((y & 0xC0000000) == 0) { n+=2; y<<=2; }
-   if ((y & 0x80000000) == 0) { n+=1; y<<=1; }
-   if ((y & 0x80000000) == 0) { n+=1;}
-   return n;
-  */
-  }
-
-
-  /** returns true if v is a power of two or zero*/
-  public static boolean isPowerOfTwo(int v) {
-    return ((v & (v-1)) == 0);
-  }
-
-  /** returns true if v is a power of two or zero*/
-  public static boolean isPowerOfTwo(long v) {
-    return ((v & (v-1)) == 0);
-  }
-
   /** returns the next highest power of two, or the current value if it's already a power of two or zero*/
   public static int nextHighestPowerOfTwo(int v) {
     v--;
