Uploaded image for project: 'Nutch'
  1. Nutch
  2. NUTCH-1269

Improve distribution of URLS with multi-segment generation

    XMLWordPrintableJSON

Details

    • Patch Available

    Description

      There are some problems with the current Generator when the maxNumSegments and maxHostCount options are used:
      1. The sizes of the generated segments differ.
      2. With the maxHostCount option, it is unclear whether the limit was actually applied.
      3. URLs from one host are distributed non-uniformly across the segments.
      We changed Generator.java as described below.
      In the Selector class:
      private int maxNumSegments;
      private int segmentSize;
      private int maxHostCount;
      public void config
      ...
      maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
      segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments;
      maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100);
      ...
      public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
      OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
      throws IOException {
      int limit2=(int)((limit*3)/2);
      while (values.hasNext()) {
      if(count == limit)
      break;
      if (count % segmentSize == 0 ) {
      if (currentsegmentnum < maxNumSegments-1)

      { currentsegmentnum++; }

      else
      currentsegmentnum=0;
      }

      boolean full=true;
      for(int jk=0;jk<maxNumSegments;jk++){
      if (segCounts[jk]<segmentSize)

      { full=false; }

      }
      if(full)

      { break; }

      SelectorEntry entry = values.next();
      Text url = entry.url;
      //logWrite("Generated3:"limit""count""+url.toString());
      String urlString = url.toString();
      URL u = null;
      String hostordomain = null;
      try {
      if (normalise && normalizers != null)

      { urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); }

      u = new URL(urlString);
      if (byDomain)

      { hostordomain = URLUtil.getDomainName(u); }

      else

      { hostordomain = new URL(urlString).getHost(); }

      hostordomain = hostordomain.toLowerCase();

      boolean countLimit=true;
      // only filter if we are counting hosts or domains
      int[] hostCount = hostCounts.get(hostordomain);
      // hostCount: {a,b,c,d} means that from this host there are a URLs in segment 0, b URLs in segment 1, and so on
      if (hostCount == null)

      { hostCount = new int[maxNumSegments]; for(int kl=0;kl<hostCount.length;kl++) hostCount[kl]=0; hostCounts.put(hostordomain, hostCount); }


      int selectedSeg=currentsegmentnum;
      int minCount=hostCount[selectedSeg];
      for(int jk=0;jk<maxNumSegments;jk++){
      if(hostCount[jk]<minCount)

      { minCount=hostCount[jk]; selectedSeg=jk; }

      }
      if(hostCount[selectedSeg]<=maxHostCount)

      { count++; entry.segnum = new IntWritable(selectedSeg); hostCount[selectedSeg]++; output.collect(key, entry); }

      } catch (Exception e)

      { LOG.warn("Malformed URL: '" + urlString + "', skipping (" logWrite("Generate-malform:"+hostordomain+"-"+url.toString()); + StringUtils.stringifyException(e) + ")"); //continue; }

      }
      }

      Attachments

        1. NUTCH-1269.patch
          6 kB
          behnam nikbakht
        2. NUTCH-1269-v.2.patch
          5 kB
          behnam nikbakht

        Activity

          People

            Unassigned Unassigned
            behnam.nikbakht behnam nikbakht
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

            Dates

              Created:
              Updated: