Use of com.tdunning.math.stats.MergingDigest in the Apache Nutch project.
Class CrawlDbReader, method processStatJob.
/**
 * Runs the CrawlDb statistics job and writes the aggregated figures to the
 * log at INFO level.
 *
 * <p>The score quantiles to report default to a fixed list and can be
 * overridden with the property {@code db.stats.score.quantiles}; entries that
 * are not parseable as a floating point number or lie outside
 * {@code [0.0, 1.0]} are skipped with a warning. The statistics themselves are
 * obtained from {@code processStatJobHelper}, which is always invoked, even if
 * INFO logging is disabled.
 *
 * @param crawlDb path of the CrawlDb, passed on to the statistics job
 * @param config configuration read for the quantile list and passed on to
 *          the statistics job
 * @param sort passed on unchanged to the statistics job
 * @throws IOException propagated from the statistics job
 * @throws InterruptedException propagated from the statistics job
 * @throws ClassNotFoundException propagated from the statistics job
 */
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException {
  double[] quantiles = { .01, .05, .1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .99 };
  // Guard against a null or empty array so the defaults stay in effect
  // instead of failing with a NullPointerException in the loop below.
  String[] configuredQuantiles = config.getStrings("db.stats.score.quantiles");
  if (configuredQuantiles != null && configuredQuantiles.length > 0) {
    List<Double> qs = new ArrayList<>();
    for (String s : configuredQuantiles) {
      try {
        double d = Double.parseDouble(s);
        if (d >= 0.0 && d <= 1.0) {
          qs.add(d);
        } else {
          LOG.warn("Skipping quantile {} not in range [0.0, 1.0] in db.stats.score.quantiles", s);
        }
      } catch (NumberFormatException e) {
        LOG.warn("Skipping bad floating point number {} in db.stats.score.quantiles: {}", s, e.getMessage());
      }
    }
    // Build and sort the array once, after all entries have been parsed.
    quantiles = new double[qs.size()];
    int i = 0;
    for (Double q : qs) {
      quantiles[i++] = q;
    }
    Arrays.sort(quantiles);
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb statistics start: " + crawlDb);
  }
  TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, config, sort);
  if (LOG.isInfoEnabled()) {
    LOG.info("Statistics for CrawlDb: " + crawlDb);
    // "T" holds the total count; it is absent when the job produced no
    // records, in which case the total is reported as 0.
    LongWritable totalCnt = (LongWritable) stats.remove("T");
    long total = (totalCnt == null) ? 0L : totalCnt.get();
    LOG.info("TOTAL urls:\t" + total);
    for (Map.Entry<String, Writable> entry : stats.entrySet()) {
      String k = entry.getKey();
      long value = 0;
      double fvalue = 0.0;
      Writable val = entry.getValue();
      if (val instanceof LongWritable) {
        value = ((LongWritable) val).get();
      } else if (val instanceof FloatWritable) {
        fvalue = ((FloatWritable) val).get();
      }
      if (k.equals("scn")) {
        LOG.info("min score:\t" + fvalue);
      } else if (k.equals("scx")) {
        LOG.info("max score:\t" + fvalue);
      } else if (k.equals("sct")) {
        LOG.info("avg score:\t" + (fvalue / total));
      } else if (k.equals("scNaN")) {
        LOG.info("score == NaN:\t" + value);
      } else if (k.equals("ftn")) {
        // the value is multiplied by 60 * 1000 to obtain the milliseconds for Date
        LOG.info("earliest fetch time:\t" + new Date(1000L * 60 * value));
      } else if (k.equals("ftx")) {
        LOG.info("latest fetch time:\t" + new Date(1000L * 60 * value));
      } else if (k.equals("ftt")) {
        // integer division: avoid ArithmeticException on a zero total
        long avgFetchTime = (total > 0) ? value / total : 0L;
        LOG.info("avg of fetch times:\t" + new Date(1000L * 60 * avgFetchTime));
      } else if (k.equals("fin")) {
        LOG.info("shortest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value));
      } else if (k.equals("fix")) {
        LOG.info("longest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value));
      } else if (k.equals("fit")) {
        long avgFetchInterval = (total > 0) ? value / total : 0L;
        LOG.info("avg fetch interval:\t{}", TimingUtil.secondsToDaysHMS(avgFetchInterval));
      } else if (k.startsWith("status")) {
        // key layout: "status <code>" optionally followed by a third token
        String[] st = k.split(" ");
        int code = Integer.parseInt(st[1]);
        if (st.length > 2) {
          LOG.info(" " + st[2] + " :\t" + val);
        } else {
          LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
        }
      } else if (k.equals("scd") && val instanceof BytesWritable) {
        // Only the first getLength() bytes of the backing array are valid.
        BytesWritable digestBytes = (BytesWritable) val;
        MergingDigest tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(digestBytes.getBytes(), 0, digestBytes.getLength()));
        for (double q : quantiles) {
          LOG.info("score quantile {}:\t{}", q, tdigest.quantile(q));
        }
      } else {
        LOG.info(k + ":\t" + val);
      }
    }
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb statistics: done");
  }
}
Use of com.tdunning.math.stats.MergingDigest in the Apache Nutch project.
Class CrawlDbReader, method processStatJobHelper.
/**
 * Runs the CrawlDb statistics job, reads its output from a temporary folder
 * below {@code crawlDb} and merges the records of all output files into one
 * sorted map.
 *
 * <p>Merging rules per key, applied from the second value of a key onwards
 * (the first value is stored as is):
 * <ul>
 * <li>{@code sc} (float) and {@code ft}, {@code fi} (long): minimum and
 * maximum are tracked under the key with suffix {@code n} resp. {@code x};
 * the unsuffixed keys are removed from the result;</li>
 * <li>{@code sct}: float values are summed up;</li>
 * <li>{@code scd}: serialized t-digests are merged;</li>
 * <li>any other key: long values are summed up.</li>
 * </ul>
 *
 * <p>The temporary folder is deleted before returning, and on a best-effort
 * basis if the job or the reading of its output fails.
 *
 * @param crawlDb path of the CrawlDb; {@code CrawlDb.CURRENT_NAME} below it
 *          is the job input
 * @param config configuration the job is created from
 * @param sort stored in the job configuration as
 *          {@code db.reader.stats.sort}
 * @return the merged statistics, sorted by key
 * @throws IOException if the job does not succeed, or if reading the output
 *           or deleting the temporary folder fails
 * @throws InterruptedException propagated from waiting for the job
 * @throws ClassNotFoundException propagated from waiting for the job
 */
private TreeMap<String, Writable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException {
  Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
  Job job = NutchJob.getInstance(config);
  config = job.getConfiguration();
  job.setJobName("stats " + crawlDb);
  config.setBoolean("db.reader.stats.sort", sort);
  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(CrawlDbStatMapper.class);
  job.setCombinerClass(CrawlDbStatReducer.class);
  job.setReducerClass(CrawlDbStatReducer.class);
  FileOutputFormat.setOutputPath(job, tmpFolder);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NutchWritable.class);
  // https://issues.apache.org/jira/browse/NUTCH-1029
  config.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  FileSystem fileSystem = tmpFolder.getFileSystem(config);
  TreeMap<String, Writable> stats = new TreeMap<>();
  try {
    boolean succeeded;
    try {
      succeeded = job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    if (!succeeded) {
      // Do not try to read the output of a job which did not succeed.
      String message = "CrawlDb statistics job did not complete successfully: " + job.getJobName();
      LOG.error(message);
      throw new IOException(message);
    }

    // reading the result
    SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(tmpFolder, config);
    Text key = new Text();
    NutchWritable value = new NutchWritable();
    int current = 0;
    try {
      for (; current < readers.length; current++) {
        try (SequenceFile.Reader reader = readers[current]) {
          while (reader.next(key, value)) {
            String k = key.toString();
            Writable val = stats.get(k);
            if (val == null) {
              // first value seen for this key
              stats.put(k, value.get());
              continue;
            }
            if (k.equals("sc")) {
              // seed min/max from the tracked values or, the first time,
              // from the initial value stored under "sc"
              float min = stats.containsKey("scn") ? ((FloatWritable) stats.get("scn")).get() : ((FloatWritable) val).get();
              float max = stats.containsKey("scx") ? ((FloatWritable) stats.get("scx")).get() : ((FloatWritable) val).get();
              float fvalue = ((FloatWritable) value.get()).get();
              if (min > fvalue) {
                min = fvalue;
              }
              if (max < fvalue) {
                max = fvalue;
              }
              stats.put("scn", new FloatWritable(min));
              stats.put("scx", new FloatWritable(max));
            } else if (k.equals("ft") || k.equals("fi")) {
              String minKey = k + "n";
              String maxKey = k + "x";
              long min = stats.containsKey(minKey) ? ((LongWritable) stats.get(minKey)).get() : ((LongWritable) val).get();
              long max = stats.containsKey(maxKey) ? ((LongWritable) stats.get(maxKey)).get() : ((LongWritable) val).get();
              long lvalue = ((LongWritable) value.get()).get();
              if (min > lvalue) {
                min = lvalue;
              }
              if (max < lvalue) {
                max = lvalue;
              }
              stats.put(minKey, new LongWritable(min));
              stats.put(maxKey, new LongWritable(max));
            } else if (k.equals("sct")) {
              FloatWritable fvalue = (FloatWritable) value.get();
              ((FloatWritable) val).set(((FloatWritable) val).get() + fvalue.get());
            } else if (k.equals("scd")) {
              // Only the first getLength() bytes of a BytesWritable's
              // backing array are valid.
              BytesWritable incoming = (BytesWritable) value.get();
              MergingDigest tdig = MergingDigest.fromBytes(ByteBuffer.wrap(incoming.getBytes(), 0, incoming.getLength()));
              MergingDigest tdigest;
              if (val instanceof BytesWritable) {
                BytesWritable merged = (BytesWritable) val;
                tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(merged.getBytes(), 0, merged.getLength()));
                tdigest.add(tdig);
              } else {
                tdigest = tdig;
              }
              ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize());
              tdigest.asSmallBytes(tdigestBytes);
              stats.put(k, new BytesWritable(tdigestBytes.array()));
            } else {
              LongWritable lvalue = (LongWritable) value.get();
              ((LongWritable) val).set(((LongWritable) val).get() + lvalue.get());
            }
          }
        }
      }
    } finally {
      // Only non-empty if reading was aborted by an exception: release the
      // readers which have not been processed (and closed) yet.
      for (int j = current + 1; j < readers.length; j++) {
        try {
          readers[j].close();
        } catch (IOException ignored) {
          // the failure which aborted reading is already being propagated
        }
      }
    }
  } catch (IOException | InterruptedException | ClassNotFoundException | RuntimeException e) {
    // best-effort removal of the tmp folder, keep the original failure
    try {
      fileSystem.delete(tmpFolder, true);
    } catch (IOException | RuntimeException cleanupFailure) {
      e.addSuppressed(cleanupFailure);
    }
    throw e;
  }
  // remove score, fetch interval, and fetch time
  // (used for min/max calculation)
  stats.remove("sc");
  stats.remove("fi");
  stats.remove("ft");
  // removing the tmp folder
  fileSystem.delete(tmpFolder, true);
  return stats;
}
Aggregations