Example 1 with MergingDigest

Use of com.tdunning.math.stats.MergingDigest in project nutch by apache.

The class CrawlDbReader, method processStatJob:

public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException {
    double[] quantiles = { .01, .05, .1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .99 };
    if (config.get("db.stats.score.quantiles") != null) {
        List<Double> qs = new ArrayList<>();
        for (String s : config.getStrings("db.stats.score.quantiles")) {
            try {
                double d = Double.parseDouble(s);
                if (d >= 0.0 && d <= 1.0) {
                    qs.add(d);
                } else {
                    LOG.warn("Skipping quantile {} not in range in db.stats.score.quantiles: {}", s);
                }
            } catch (NumberFormatException e) {
                LOG.warn("Skipping bad floating point number {} in db.stats.score.quantiles: {}", s, e.getMessage());
            }
        }
        // rebuild the quantile array once, after all configured values are parsed
        quantiles = new double[qs.size()];
        int i = 0;
        for (Double q : qs) {
            quantiles[i++] = q;
        }
        Arrays.sort(quantiles);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }
    TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, config, sort);
    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = ((LongWritable) stats.get("T"));
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, Writable> entry : stats.entrySet()) {
            String k = entry.getKey();
            long value = 0;
            double fvalue = 0.0;
            byte[] bytesValue = null;
            Writable val = entry.getValue();
            if (val instanceof LongWritable) {
                value = ((LongWritable) val).get();
            } else if (val instanceof FloatWritable) {
                fvalue = ((FloatWritable) val).get();
            } else if (val instanceof BytesWritable) {
                bytesValue = ((BytesWritable) val).getBytes();
            }
            if (k.equals("scn")) {
                LOG.info("min score:\t" + fvalue);
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + fvalue);
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (fvalue / totalCnt.get()));
            } else if (k.equals("scNaN")) {
                LOG.info("score == NaN:\t" + value);
            } else if (k.equals("ftn")) {
                LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * value));
            } else if (k.equals("ftx")) {
                LOG.info("latest fetch time:\t" + new Date(1000 * 60 * value));
            } else if (k.equals("ftt")) {
                LOG.info("avg of fetch times:\t" + new Date(1000 * 60 * (value / totalCnt.get())));
            } else if (k.equals("fin")) {
                LOG.info("shortest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value));
            } else if (k.equals("fix")) {
                LOG.info("longest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value));
            } else if (k.equals("fit")) {
                LOG.info("avg fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value / totalCnt.get()));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info("   " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else if (k.equals("scd")) {
                MergingDigest tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(bytesValue));
                for (double q : quantiles) {
                    LOG.info("score quantile {}:\t{}", q, tdigest.quantile(q));
                }
            } else {
                LOG.info(k + ":\t" + val);
            }
        }
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }
}
Also used : MergingDigest(com.tdunning.math.stats.MergingDigest) ArrayList(java.util.ArrayList) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Date(java.util.Date) Map(java.util.Map) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap)
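
To see the moving parts in isolation, the following is a minimal, self-contained sketch of the t-digest round trip that the "scd" branch above performs: build a digest, serialize it with the compact encoding, restore it with MergingDigest.fromBytes, and query quantiles. The class name, the compression value of 100, and the synthetic scores are illustrative assumptions, not values taken from Nutch.

import java.nio.ByteBuffer;
import java.util.Random;

import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;

public class ScoreQuantileSketch {
    public static void main(String[] args) {
        // build a digest over synthetic scores; compression 100 is an assumed
        // value (higher compression keeps more centroids and improves accuracy)
        TDigest digest = TDigest.createMergingDigest(100);
        Random random = new Random(42);
        for (int i = 0; i < 10_000; i++) {
            digest.add(random.nextDouble());
        }

        // serialize with the compact encoding, as the stat job's reducer does
        ByteBuffer buf = ByteBuffer.allocate(digest.smallByteSize());
        digest.asSmallBytes(buf);

        // restore and query, mirroring the "scd" branch of processStatJob
        MergingDigest restored = MergingDigest.fromBytes(ByteBuffer.wrap(buf.array()));
        for (double q : new double[] { 0.01, 0.25, 0.5, 0.75, 0.99 }) {
            System.out.printf("score quantile %s:\t%s%n", q, restored.quantile(q));
        }
    }
}

On the configuration side, db.stats.score.quantiles is read with Configuration.getStrings, so the quantile list can be set programmatically with conf.setStrings("db.stats.score.quantiles", "0.5", "0.9", "0.99") or as a comma-separated value in nutch-site.xml.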

Example 2 with MergingDigest

Use of com.tdunning.math.stats.MergingDigest in project nutch by apache.

The class CrawlDbReader, method processStatJobHelper:

private TreeMap<String, Writable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException {
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
    Job job = NutchJob.getInstance(config);
    config = job.getConfiguration();
    job.setJobName("stats " + crawlDb);
    config.setBoolean("db.reader.stats.sort", sort);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatReducer.class);
    job.setReducerClass(CrawlDbStatReducer.class);
    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);
    // https://issues.apache.org/jira/browse/NUTCH-1029
    config.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    try {
        // fail fast instead of silently discarding the job's exit status
        boolean success = job.waitForCompletion(true);
        if (!success) {
            throw new IOException("CrawlDb statistics job failed: " + job.getJobName());
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    // reading the result
    FileSystem fileSystem = tmpFolder.getFileSystem(config);
    SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(tmpFolder, config);
    Text key = new Text();
    NutchWritable value = new NutchWritable();
    TreeMap<String, Writable> stats = new TreeMap<>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            Writable val = stats.get(k);
            if (val == null) {
                stats.put(k, value.get());
                continue;
            }
            if (k.equals("sc")) {
                float min;
                float max;
                if (stats.containsKey("scn")) {
                    min = ((FloatWritable) stats.get("scn")).get();
                } else {
                    min = ((FloatWritable) stats.get("sc")).get();
                }
                if (stats.containsKey("scx")) {
                    max = ((FloatWritable) stats.get("scx")).get();
                } else {
                    max = ((FloatWritable) stats.get("sc")).get();
                }
                float fvalue = ((FloatWritable) value.get()).get();
                if (min > fvalue) {
                    min = fvalue;
                }
                if (max < fvalue) {
                    max = fvalue;
                }
                stats.put("scn", new FloatWritable(min));
                stats.put("scx", new FloatWritable(max));
            } else if (k.equals("ft") || k.equals("fi")) {
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                String minKey = k + "n";
                String maxKey = k + "x";
                if (stats.containsKey(minKey)) {
                    min = ((LongWritable) stats.get(minKey)).get();
                } else if (stats.containsKey(k)) {
                    min = ((LongWritable) stats.get(k)).get();
                }
                if (stats.containsKey(maxKey)) {
                    max = ((LongWritable) stats.get(maxKey)).get();
                } else if (stats.containsKey(k)) {
                    max = ((LongWritable) stats.get(k)).get();
                }
                long lvalue = ((LongWritable) value.get()).get();
                if (min > lvalue) {
                    min = lvalue;
                }
                if (max < lvalue) {
                    max = lvalue;
                }
                stats.put(k + "n", new LongWritable(min));
                stats.put(k + "x", new LongWritable(max));
            } else if (k.equals("sct")) {
                FloatWritable fvalue = (FloatWritable) value.get();
                ((FloatWritable) val).set(((FloatWritable) val).get() + fvalue.get());
            } else if (k.equals("scd")) {
                MergingDigest tdigest = null;
                MergingDigest tdig = MergingDigest.fromBytes(ByteBuffer.wrap(((BytesWritable) value.get()).getBytes()));
                if (val instanceof BytesWritable) {
                    tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(((BytesWritable) val).getBytes()));
                    tdigest.add(tdig);
                } else {
                    tdigest = tdig;
                }
                ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize());
                tdigest.asSmallBytes(tdigestBytes);
                stats.put(k, new BytesWritable(tdigestBytes.array()));
            } else {
                LongWritable lvalue = (LongWritable) value.get();
                ((LongWritable) val).set(((LongWritable) val).get() + lvalue.get());
            }
        }
        reader.close();
    }
    // remove score, fetch interval, and fetch time
    // (used for min/max calculation)
    stats.remove("sc");
    stats.remove("fi");
    stats.remove("ft");
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    return stats;
}
Also used : Path(org.apache.hadoop.fs.Path) MergingDigest(com.tdunning.math.stats.MergingDigest) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Text(org.apache.hadoop.io.Text) TreeMap(java.util.TreeMap) ByteBuffer(java.nio.ByteBuffer) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) NutchJob(org.apache.nutch.util.NutchJob) Job(org.apache.hadoop.mapreduce.Job)
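
The combiner/reducer path merges partial digests by deserializing each incoming sketch and folding it into the accumulated one with add, exactly the shape of the "scd" branch above. Below is a minimal sketch of that merge; the class name, compression value, and input data are illustrative assumptions.

import java.nio.ByteBuffer;

import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;

public class DigestMergeSketch {

    // serialize a digest with the compact encoding used by the stat job
    static byte[] toBytes(TDigest digest) {
        ByteBuffer buf = ByteBuffer.allocate(digest.smallByteSize());
        digest.asSmallBytes(buf);
        return buf.array();
    }

    public static void main(String[] args) {
        TDigest a = TDigest.createMergingDigest(100);
        TDigest b = TDigest.createMergingDigest(100);
        for (int i = 0; i < 1000; i++) {
            // scores in [0, 1)
            a.add(i / 1000.0);
            // scores in [1, 2)
            b.add(1.0 + i / 1000.0);
        }

        // round-trip through bytes, as the reducer receives BytesWritable values
        MergingDigest merged = MergingDigest.fromBytes(ByteBuffer.wrap(toBytes(a)));
        merged.add(MergingDigest.fromBytes(ByteBuffer.wrap(toBytes(b))));

        // the merged digest covers both halves, so the median is near 1.0
        System.out.println("median: " + merged.quantile(0.5));
    }
}

Merging via the serialized form keeps the reducer decoupled from the digest implementation: only a BytesWritable travels between combiner and reducer, and MergingDigest.fromBytes detects the encoding from the serialized header.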

Aggregations

MergingDigest (com.tdunning.math.stats.MergingDigest): 2
TreeMap (java.util.TreeMap): 2
BytesWritable (org.apache.hadoop.io.BytesWritable): 2
FloatWritable (org.apache.hadoop.io.FloatWritable): 2
LongWritable (org.apache.hadoop.io.LongWritable): 2
Writable (org.apache.hadoop.io.Writable): 2
ByteBuffer (java.nio.ByteBuffer): 1
ArrayList (java.util.ArrayList): 1
Date (java.util.Date): 1
HashMap (java.util.HashMap): 1
Map (java.util.Map): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Path (org.apache.hadoop.fs.Path): 1
SequenceFile (org.apache.hadoop.io.SequenceFile): 1
Text (org.apache.hadoop.io.Text): 1
Job (org.apache.hadoop.mapreduce.Job): 1
NutchJob (org.apache.nutch.util.NutchJob): 1