Example 86 with BytesWritable

use of org.apache.hadoop.io.BytesWritable in project metron by apache.

the class PartitionHDFSWriter method handle.

public void handle(long ts, byte[] value) throws IOException {
    turnoverIfNecessary(ts);
    BytesWritable bw = new BytesWritable(value);
    try {
        writer.append(new LongWritable(ts), bw);
    } catch (ArrayIndexOutOfBoundsException aioobe) {
        LOG.warn("This appears to be HDFS-7765 (https://issues.apache.org/jira/browse/HDFS-7765), " + "which is an issue with syncing and not problematic: {}", aioobe.getMessage(), aioobe);
    }
    numWritten++;
    if (numWritten % config.getSyncEvery() == 0) {
        syncHandler.sync(outputStream);
    }
}
Also used : BytesWritable(org.apache.hadoop.io.BytesWritable) LongWritable(org.apache.hadoop.io.LongWritable)
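As a usage note, each call to handle appends one (LongWritable timestamp, BytesWritable packet) record to the underlying SequenceFile writer. A minimal sketch of reading those records back with the plain Hadoop SequenceFile API (the class and method names and the file path are placeholders, not part of PartitionHDFSWriter):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;

public class PacketFileDumper {
    public static void dump(Configuration conf, Path packetFile) throws IOException {
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(packetFile))) {
            LongWritable ts = new LongWritable();
            BytesWritable packet = new BytesWritable();
            while (reader.next(ts, packet)) {
                // copyBytes() trims the BytesWritable backing array to the record's actual length
                System.out.println(ts.get() + "\t" + packet.copyBytes().length + " bytes");
            }
        }
    }
}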

Example 87 with BytesWritable

use of org.apache.hadoop.io.BytesWritable in project metron by apache.

the class PcapHelperTest method readSamplePackets.

public static List<byte[]> readSamplePackets(String pcapLoc) throws IOException {
    List<byte[]> ret = new ArrayList<>();
    // try-with-resources ensures the reader is closed even if an I/O error occurs mid-read
    try (SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(), SequenceFile.Reader.file(new Path(pcapLoc)))) {
        IntWritable key = new IntWritable();
        BytesWritable value = new BytesWritable();
        while (reader.next(key, value)) {
            byte[] pcapWithHeader = value.copyBytes();
            ret.add(pcapWithHeader);
        }
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) BytesWritable(org.apache.hadoop.io.BytesWritable) IntWritable(org.apache.hadoop.io.IntWritable)
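Conversely, a test fixture file that readSamplePackets can consume would be a SequenceFile with IntWritable keys and BytesWritable values. A minimal sketch using the standard SequenceFile.Writer options (the class and method names are hypothetical, not part of PcapHelperTest):

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

public class SamplePacketWriter {
    public static void writeSamplePackets(String pcapLoc, List<byte[]> packets) throws IOException {
        Configuration conf = new Configuration();
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path(pcapLoc)),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(BytesWritable.class))) {
            int i = 0;
            for (byte[] packet : packets) {
                // keys are simple sequence numbers; readSamplePackets ignores them anyway
                writer.append(new IntWritable(i++), new BytesWritable(packet));
            }
        }
    }
}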

Example 88 with BytesWritable

use of org.apache.hadoop.io.BytesWritable in project nutch by apache.

the class CrawlDbReader method processStatJob.

public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException {
    double[] quantiles = { .01, .05, .1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .99 };
    if (config.get("db.stats.score.quantiles") != null) {
        List<Double> qs = new ArrayList<>();
        for (String s : config.getStrings("db.stats.score.quantiles")) {
            try {
                double d = Double.parseDouble(s);
                if (d >= 0.0 && d <= 1.0) {
                    qs.add(d);
                } else {
                    LOG.warn("Skipping quantile {} not in range in db.stats.score.quantiles: {}", s);
                }
            } catch (NumberFormatException e) {
                LOG.warn("Skipping bad floating point number {} in db.stats.score.quantiles: {}", s, e.getMessage());
            }
        }
        quantiles = new double[qs.size()];
        int i = 0;
        for (Double q : qs) {
            quantiles[i++] = q;
        }
        Arrays.sort(quantiles);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }
    TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, config, sort);
    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = ((LongWritable) stats.get("T"));
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, Writable> entry : stats.entrySet()) {
            String k = entry.getKey();
            long value = 0;
            double fvalue = 0.0;
            byte[] bytesValue = null;
            Writable val = entry.getValue();
            if (val instanceof LongWritable) {
                value = ((LongWritable) val).get();
            } else if (val instanceof FloatWritable) {
                fvalue = ((FloatWritable) val).get();
            } else if (val instanceof BytesWritable) {
                bytesValue = ((BytesWritable) val).getBytes();
            }
            if (k.equals("scn")) {
                LOG.info("min score:\t" + fvalue);
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + fvalue);
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (fvalue / totalCnt.get()));
            } else if (k.equals("scNaN")) {
                LOG.info("score == NaN:\t" + value);
            } else if (k.equals("ftn")) {
                LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * value));
            } else if (k.equals("ftx")) {
                LOG.info("latest fetch time:\t" + new Date(1000 * 60 * value));
            } else if (k.equals("ftt")) {
                LOG.info("avg of fetch times:\t" + new Date(1000 * 60 * (value / totalCnt.get())));
            } else if (k.equals("fin")) {
                LOG.info("shortest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value));
            } else if (k.equals("fix")) {
                LOG.info("longest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value));
            } else if (k.equals("fit")) {
                LOG.info("avg fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value / totalCnt.get()));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info("   " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else if (k.equals("scd")) {
                MergingDigest tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(bytesValue));
                for (double q : quantiles) {
                    LOG.info("score quantile {}:\t{}", q, tdigest.quantile(q));
                }
            } else {
                LOG.info(k + ":\t" + val);
            }
        }
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }
}
Also used : MergingDigest(com.tdunning.math.stats.MergingDigest) ArrayList(java.util.ArrayList) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Date(java.util.Date) Map(java.util.Map) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap)
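The "scd" entry above holds a t-digest of all page scores, serialized into a BytesWritable on the reduce side and read back here with MergingDigest.fromBytes. A minimal standalone sketch of that round trip (the input values and the compression factor are made up for illustration):

import java.nio.ByteBuffer;
import com.tdunning.math.stats.MergingDigest;

public class ScoreQuantileSketch {
    public static void main(String[] args) {
        // compression factor of 100 is an arbitrary choice for this sketch
        MergingDigest digest = new MergingDigest(100.0);
        for (double score : new double[] { 0.1, 0.5, 0.9, 1.5, 2.0 }) {
            digest.add(score);
        }
        // serialize the digest the same way the "scd" BytesWritable is produced
        ByteBuffer buf = ByteBuffer.allocate(digest.smallByteSize());
        digest.asSmallBytes(buf);
        // deserialize and query quantiles, mirroring the loop in processStatJob
        MergingDigest restored = MergingDigest.fromBytes(ByteBuffer.wrap(buf.array()));
        for (double q : new double[] { 0.25, 0.5, 0.75 }) {
            System.out.println("score quantile " + q + ":\t" + restored.quantile(q));
        }
    }
}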

Example 89 with BytesWritable

use of org.apache.hadoop.io.BytesWritable in project nutch by apache.

the class CrawlDbReader method processStatJobHelper.

private TreeMap<String, Writable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException {
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
    Job job = NutchJob.getInstance(config);
    config = job.getConfiguration();
    job.setJobName("stats " + crawlDb);
    config.setBoolean("db.reader.stats.sort", sort);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatReducer.class);
    job.setReducerClass(CrawlDbStatReducer.class);
    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);
    // https://issues.apache.org/jira/browse/NUTCH-1029
    config.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            LOG.error("CrawlDb statistics job did not succeed");
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    // reading the result
    FileSystem fileSystem = tmpFolder.getFileSystem(config);
    SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(tmpFolder, config);
    Text key = new Text();
    NutchWritable value = new NutchWritable();
    TreeMap<String, Writable> stats = new TreeMap<>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            Writable val = stats.get(k);
            if (val == null) {
                stats.put(k, value.get());
                continue;
            }
            if (k.equals("sc")) {
                float min = Float.MAX_VALUE;
                float max = Float.MIN_VALUE;
                if (stats.containsKey("scn")) {
                    min = ((FloatWritable) stats.get("scn")).get();
                } else {
                    min = ((FloatWritable) stats.get("sc")).get();
                }
                if (stats.containsKey("scx")) {
                    max = ((FloatWritable) stats.get("scx")).get();
                } else {
                    max = ((FloatWritable) stats.get("sc")).get();
                }
                float fvalue = ((FloatWritable) value.get()).get();
                if (min > fvalue) {
                    min = fvalue;
                }
                if (max < fvalue) {
                    max = fvalue;
                }
                stats.put("scn", new FloatWritable(min));
                stats.put("scx", new FloatWritable(max));
            } else if (k.equals("ft") || k.equals("fi")) {
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                String minKey = k + "n";
                String maxKey = k + "x";
                if (stats.containsKey(minKey)) {
                    min = ((LongWritable) stats.get(minKey)).get();
                } else if (stats.containsKey(k)) {
                    min = ((LongWritable) stats.get(k)).get();
                }
                if (stats.containsKey(maxKey)) {
                    max = ((LongWritable) stats.get(maxKey)).get();
                } else if (stats.containsKey(k)) {
                    max = ((LongWritable) stats.get(k)).get();
                }
                long lvalue = ((LongWritable) value.get()).get();
                if (min > lvalue) {
                    min = lvalue;
                }
                if (max < lvalue) {
                    max = lvalue;
                }
                stats.put(k + "n", new LongWritable(min));
                stats.put(k + "x", new LongWritable(max));
            } else if (k.equals("sct")) {
                FloatWritable fvalue = (FloatWritable) value.get();
                ((FloatWritable) val).set(((FloatWritable) val).get() + fvalue.get());
            } else if (k.equals("scd")) {
                MergingDigest tdigest = null;
                MergingDigest tdig = MergingDigest.fromBytes(ByteBuffer.wrap(((BytesWritable) value.get()).getBytes()));
                if (val instanceof BytesWritable) {
                    tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(((BytesWritable) val).getBytes()));
                    tdigest.add(tdig);
                } else {
                    tdigest = tdig;
                }
                ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize());
                tdigest.asSmallBytes(tdigestBytes);
                stats.put(k, new BytesWritable(tdigestBytes.array()));
            } else {
                LongWritable lvalue = (LongWritable) value.get();
                ((LongWritable) val).set(((LongWritable) val).get() + lvalue.get());
            }
        }
        reader.close();
    }
    // remove score, fetch interval, and fetch time
    // (used for min/max calculation)
    stats.remove("sc");
    stats.remove("fi");
    stats.remove("ft");
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    return stats;
}
Also used : Path(org.apache.hadoop.fs.Path) MergingDigest(com.tdunning.math.stats.MergingDigest) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Text(org.apache.hadoop.io.Text) TreeMap(java.util.TreeMap) ByteBuffer(java.nio.ByteBuffer) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) NutchJob(org.apache.nutch.util.NutchJob) Job(org.apache.hadoop.mapreduce.Job)
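SegmentReaderUtil.getReaders is a Nutch-internal helper; what this method needs from it is one SequenceFile.Reader per part file written by the stats job into the temporary folder. A rough standalone sketch of that step (the part-file naming convention is an assumption, and the actual helper may behave differently):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

public class PartFileReaders {
    public static SequenceFile.Reader[] open(Path dir, Configuration conf) throws IOException {
        FileSystem fs = dir.getFileSystem(conf);
        List<SequenceFile.Reader> readers = new ArrayList<>();
        for (FileStatus status : fs.listStatus(dir)) {
            // assumes reducer output follows the usual part-r-NNNNN naming
            if (status.isFile() && status.getPath().getName().startsWith("part-")) {
                readers.add(new SequenceFile.Reader(conf, SequenceFile.Reader.file(status.getPath())));
            }
        }
        return readers.toArray(new SequenceFile.Reader[0]);
    }
}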

Example 90 with BytesWritable

use of org.apache.hadoop.io.BytesWritable in project hive by apache.

the class TestDynamicSerDe method testStructsinStructs.

/**
 * Tests serialization and deserialization of a struct nested inside another struct.
 */
public void testStructsinStructs() throws Throwable {
    try {
        Properties schema = new Properties();
        // schema.setProperty(serdeConstants.SERIALIZATION_FORMAT,
        // org.apache.thrift.protocol.TJSONProtocol.class.getName());
        schema.setProperty(serdeConstants.SERIALIZATION_FORMAT, org.apache.thrift.protocol.TBinaryProtocol.class.getName());
        schema.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_NAME, "test");
        schema.setProperty(serdeConstants.SERIALIZATION_DDL, "struct inner { i32 field1, string field2 },struct  test {inner foo,  i32 hello, list<string> bye, map<string,i32> another}");
        schema.setProperty(serdeConstants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString());
        // 
        // construct object of above type
        // 
        // construct the inner struct
        ArrayList<Object> innerStruct = new ArrayList<Object>();
        innerStruct.add(Integer.valueOf(22));
        innerStruct.add("hello world");
        // construct outer struct
        ArrayList<String> bye = new ArrayList<String>();
        bye.add("firstString");
        bye.add("secondString");
        HashMap<String, Integer> another = new HashMap<String, Integer>();
        another.put("firstKey", 1);
        another.put("secondKey", 2);
        ArrayList<Object> struct = new ArrayList<Object>();
        struct.add(innerStruct);
        struct.add(Integer.valueOf(234));
        struct.add(bye);
        struct.add(another);
        DynamicSerDe serde = new DynamicSerDe();
        serde.initialize(new Configuration(), schema);
        ObjectInspector oi = serde.getObjectInspector();
        // Try to serialize
        BytesWritable bytes = (BytesWritable) serde.serialize(struct, oi);
        // Try to deserialize
        Object o = serde.deserialize(bytes);
        List<?> olist = (List<?>) o;
        assertEquals(4, olist.size());
        assertEquals(innerStruct, olist.get(0));
        assertEquals(Integer.valueOf(234), olist.get(1));
        assertEquals(bye, olist.get(2));
        assertEquals(another, olist.get(3));
    } catch (Throwable e) {
        e.printStackTrace();
        throw e;
    }
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) BytesWritable(org.apache.hadoop.io.BytesWritable) Properties(java.util.Properties) List(java.util.List)
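The same serialize/deserialize round trip works for a flat struct as well. A minimal sketch using only the DynamicSerDe calls shown above (the DDL string, table name, and field values are made up for illustration):

public void testFlatStructRoundTrip() throws Throwable {
    Properties schema = new Properties();
    schema.setProperty(serdeConstants.SERIALIZATION_FORMAT, org.apache.thrift.protocol.TBinaryProtocol.class.getName());
    schema.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_NAME, "flat_test");
    // struct name matches the table name, mirroring the nested-struct test above
    schema.setProperty(serdeConstants.SERIALIZATION_DDL, "struct flat_test { i32 id, string name }");
    schema.setProperty(serdeConstants.SERIALIZATION_LIB, new DynamicSerDe().getClass().toString());
    ArrayList<Object> row = new ArrayList<Object>();
    row.add(Integer.valueOf(7));
    row.add("seven");
    DynamicSerDe serde = new DynamicSerDe();
    serde.initialize(new Configuration(), schema);
    ObjectInspector oi = serde.getObjectInspector();
    // serialize to a BytesWritable, then deserialize and compare field by field
    BytesWritable bytes = (BytesWritable) serde.serialize(row, oi);
    List<?> back = (List<?>) serde.deserialize(bytes);
    assertEquals(2, back.size());
    assertEquals(Integer.valueOf(7), back.get(0));
    assertEquals("seven", back.get(1));
}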

Aggregations

BytesWritable (org.apache.hadoop.io.BytesWritable) 275
Text (org.apache.hadoop.io.Text) 73
LongWritable (org.apache.hadoop.io.LongWritable) 59
Test (org.junit.Test) 53
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) 46
IntWritable (org.apache.hadoop.io.IntWritable) 44
ArrayList (java.util.ArrayList) 39
Path (org.apache.hadoop.fs.Path) 38
IOException (java.io.IOException) 36
Configuration (org.apache.hadoop.conf.Configuration) 33
FloatWritable (org.apache.hadoop.io.FloatWritable) 33
Writable (org.apache.hadoop.io.Writable) 32
BooleanWritable (org.apache.hadoop.io.BooleanWritable) 31
List (java.util.List) 30
SequenceFile (org.apache.hadoop.io.SequenceFile) 27
Random (java.util.Random) 24
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable) 24
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable) 23
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable) 22
FileSystem (org.apache.hadoop.fs.FileSystem) 21