Details
-
Improvement
-
Status: Open
-
Major
-
Resolution: Unresolved
-
1.4
-
None
-
software
-
Patch Available
Description
There are some problems with the current Generate method when the maxNumSegments and maxHostCount options are used:
1. First, the sizes of the generated segments differ.
2. With the maxHostCount option, it is unclear whether the limit was applied or not.
3. URLs from one host are distributed non-uniformly between segments.
We changed Generator.java as described below:
In the Selector class:
private int maxNumSegments;
private int segmentSize;
private int maxHostCount;
public void config
...
maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments;
maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100);
...
public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
throws IOException {
int limit2=(int)((limit*3)/2);
while (values.hasNext()) {
if(count == limit)
break;
if (count % segmentSize == 0 ) {
if (currentsegmentnum < maxNumSegments-1)
else
currentsegmentnum=0;
}
boolean full=true;
for(int jk=0;jk<maxNumSegments;jk++){
if (segCounts[jk]<segmentSize)
}
if(full)
SelectorEntry entry = values.next();
Text url = entry.url;
//logWrite("Generated3:"limit""count""+url.toString());
String urlString = url.toString();
URL u = null;
String hostordomain = null;
try {
if (normalise && normalizers != null)
u = new URL(urlString);
if (byDomain)
else
{ hostordomain = new URL(urlString).getHost(); }hostordomain = hostordomain.toLowerCase();
boolean countLimit=true;
// only filter if we are counting hosts or domains
int[] hostCount = hostCounts.get(hostordomain);
// hostCount: an array [a, b, ...] means that from this host there are a URLs in segment 0, b URLs in segment 1, and so on.
if (hostCount == null)
int selectedSeg=currentsegmentnum;
int minCount=hostCount[selectedSeg];
for(int jk=0;jk<maxNumSegments;jk++){
if(hostCount[jk]<minCount)
}
if(hostCount[selectedSeg]<=maxHostCount)
} catch (Exception e)
{ LOG.warn("Malformed URL: '" + urlString + "', skipping (" logWrite("Generate-malform:"+hostordomain+"-"+url.toString()); + StringUtils.stringifyException(e) + ")"); //continue; } }
}