Use of org.apache.hadoop.io.DoubleWritable in project nifi by apache: class NiFiOrcUtils, method convertToORCObject.
public static Object convertToORCObject(TypeInfo typeInfo, Object o) {
    if (o != null) {
        if (typeInfo instanceof UnionTypeInfo) {
            OrcUnion union = new OrcUnion();
            // Need to find which of the union types correspond to the primitive object
            TypeInfo objectTypeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(
                    ObjectInspectorFactory.getReflectionObjectInspector(o.getClass(), ObjectInspectorFactory.ObjectInspectorOptions.JAVA));
            List<TypeInfo> unionTypeInfos = ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos();
            int index = 0;
            while (index < unionTypeInfos.size() && !unionTypeInfos.get(index).equals(objectTypeInfo)) {
                index++;
            }
            if (index < unionTypeInfos.size()) {
                union.set((byte) index, convertToORCObject(objectTypeInfo, o));
            } else {
                throw new IllegalArgumentException("Object Type for class " + o.getClass().getName() + " not in Union declaration");
            }
            return union;
        }
        if (o instanceof Integer) {
            return new IntWritable((int) o);
        }
        if (o instanceof Boolean) {
            return new BooleanWritable((boolean) o);
        }
        if (o instanceof Long) {
            return new LongWritable((long) o);
        }
        if (o instanceof Float) {
            return new FloatWritable((float) o);
        }
        if (o instanceof Double) {
            return new DoubleWritable((double) o);
        }
        if (o instanceof String || o instanceof Utf8 || o instanceof GenericData.EnumSymbol) {
            return new Text(o.toString());
        }
        if (o instanceof ByteBuffer) {
            return new BytesWritable(((ByteBuffer) o).array());
        }
        if (o instanceof int[]) {
            int[] intArray = (int[]) o;
            return Arrays.stream(intArray)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("int"), element))
                    .collect(Collectors.toList());
        }
        if (o instanceof long[]) {
            long[] longArray = (long[]) o;
            return Arrays.stream(longArray)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("bigint"), element))
                    .collect(Collectors.toList());
        }
        if (o instanceof float[]) {
            float[] floatArray = (float[]) o;
            return IntStream.range(0, floatArray.length)
                    .mapToDouble(i -> floatArray[i])
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("float"), (float) element))
                    .collect(Collectors.toList());
        }
        if (o instanceof double[]) {
            double[] doubleArray = (double[]) o;
            return Arrays.stream(doubleArray)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("double"), element))
                    .collect(Collectors.toList());
        }
        if (o instanceof boolean[]) {
            boolean[] booleanArray = (boolean[]) o;
            return IntStream.range(0, booleanArray.length)
                    .map(i -> booleanArray[i] ? 1 : 0)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("boolean"), element == 1))
                    .collect(Collectors.toList());
        }
        if (o instanceof GenericData.Array) {
            GenericData.Array array = ((GenericData.Array) o);
            // The type information in this case is interpreted as a List
            TypeInfo listTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo();
            return array.stream().map((element) -> convertToORCObject(listTypeInfo, element)).collect(Collectors.toList());
        }
        if (o instanceof List) {
            return o;
        }
        if (o instanceof Map) {
            Map map = new HashMap();
            TypeInfo keyInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
            TypeInfo valueInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
            // Unions are not allowed as key/value types, so if we convert the key and value objects,
            // they should return Writable objects
            ((Map) o).forEach((key, value) -> {
                Object keyObject = convertToORCObject(keyInfo, key);
                Object valueObject = convertToORCObject(valueInfo, value);
                if (keyObject == null) {
                    throw new IllegalArgumentException("Maps' key cannot be null");
                }
                map.put(keyObject, valueObject);
            });
            return map;
        }
        if (o instanceof GenericData.Record) {
            GenericData.Record record = (GenericData.Record) o;
            TypeInfo recordSchema = NiFiOrcUtils.getOrcField(record.getSchema());
            List<Schema.Field> recordFields = record.getSchema().getFields();
            if (recordFields != null) {
                Object[] fieldObjects = new Object[recordFields.size()];
                for (int i = 0; i < recordFields.size(); i++) {
                    Schema.Field field = recordFields.get(i);
                    Schema fieldSchema = field.schema();
                    Object fieldObject = record.get(field.name());
                    fieldObjects[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), fieldObject);
                }
                return NiFiOrcUtils.createOrcStruct(recordSchema, fieldObjects);
            }
        }
        throw new IllegalArgumentException("Error converting object of type " + o.getClass().getName() + " to ORC type " + typeInfo.getTypeName());
    } else {
        return null;
    }
}
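A minimal usage sketch, assuming the method above and Hive's TypeInfoFactory are on the classpath; the import of NiFiOrcUtils itself is omitted because its package is not shown here, and the values and class name are illustrative only, not taken from the NiFi sources.

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

public class ConvertToOrcSketch {
    public static void main(String[] args) {
        // A plain Java Double should come back wrapped as org.apache.hadoop.io.DoubleWritable.
        TypeInfo doubleType = TypeInfoFactory.getPrimitiveTypeInfo("double");
        Object orcDouble = NiFiOrcUtils.convertToORCObject(doubleType, 3.14d);
        System.out.println(orcDouble instanceof DoubleWritable); // expected: true

        // Strings (and Avro Utf8 values or enum symbols) are converted to Text.
        TypeInfo stringType = TypeInfoFactory.getPrimitiveTypeInfo("string");
        Object orcText = NiFiOrcUtils.convertToORCObject(stringType, "hello");
        System.out.println(orcText instanceof Text); // expected: true

        // A null input is passed through unchanged.
        System.out.println(NiFiOrcUtils.convertToORCObject(doubleType, null)); // expected: null
    }
}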
Use of org.apache.hadoop.io.DoubleWritable in project incubator-systemml by apache: class SamplingSortMRInputFormat, method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @return index value
* @throws IOException if something goes wrong
* @throws InstantiationException if InstantiationException occurs
* @throws IllegalAccessException if IllegalAccessException occurs
*/
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else {
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                }
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    if (totalcount == 0) // empty input files
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
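A minimal driver sketch of how this method might be invoked before submitting the sort job; the reducer count and partition-file path are illustrative assumptions, and the import of the SystemML-internal SamplingSortMRInputFormat is omitted because its package is not shown here.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class PartitionFileSketch {
    public static void main(String[] args) throws Exception {
        // Assumed: input paths and the SystemML input converter are already configured on this JobConf.
        JobConf conf = new JobConf();
        conf.setNumReduceTasks(4); // N reducers -> N-1 sampled DoubleWritable split points
        Path partFile = new Path("/tmp/sortinput/_partition.lst"); // illustrative location
        int index0 = SamplingSortMRInputFormat.writePartitionFile(conf, partFile);
        System.out.println("index of the first non-negative split point: " + index0);
    }
}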
Use of org.apache.hadoop.io.DoubleWritable in project cdap by caskdata: class AggregationFunctionsTest, method averageGenerateAggregationTest.
@Test
public void averageGenerateAggregationTest() throws Exception {
    DataQualityWritable val1 = new DataQualityWritable();
    val1.set(new DoubleWritable(2.0));
    DataQualityWritable val2 = new DataQualityWritable();
    val2.set(new DoubleWritable(2.0));
    Mean mean = new Mean();
    mean.add(val1);
    mean.add(val2);
    byte[] output = mean.aggregate();
    Assert.assertEquals(2.0, Bytes.toDouble(output), 0);
}
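For reference, a self-contained sketch of the org.apache.hadoop.io.DoubleWritable API that all of these snippets rely on; it is not specific to CDAP, and the class name is made up for illustration.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableSketch {
    public static void main(String[] args) throws Exception {
        DoubleWritable a = new DoubleWritable(2.0);
        DoubleWritable b = new DoubleWritable(3.5);
        System.out.println(a.get());            // 2.0
        System.out.println(a.compareTo(b) < 0); // true: comparison is by the wrapped double value

        // Writables serialize through write()/readFields(); SequenceFiles use this under the hood.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        a.write(new DataOutputStream(bos));
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy.get()); // 2.0
    }
}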
Use of org.apache.hadoop.io.DoubleWritable in project systemml by apache: class MapReduceTool, method pickValueWeight.
public static double[] pickValueWeight(String dir, MetaDataNumItemsByEachReducer metadata, double p, boolean average)
        throws IOException {
    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];
    long total = ranges[ranges.length - 1];
    // do averaging only if it is asked for; and sum_wt is even
    average = average && (total % 2 == 0);
    int currentPart = 0;
    double cum_weight = 0;
    long pos = (long) Math.ceil(total * p);
    while (ranges[currentPart] < pos) {
        currentPart++;
        cum_weight += ranges[currentPart];
    }
    int offset;
    if (currentPart > 0)
        offset = (int) (pos - ranges[currentPart - 1] - 1);
    else
        offset = (int) pos - 1;
    Path path = new Path(dir);
    FileSystem fs = IOUtilFunctions.getFileSystem(path);
    FileStatus[] files = fs.listStatus(path);
    Path fileToRead = null;
    for (FileStatus file : files) {
        if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
            fileToRead = file.getPath();
            break;
        }
    }
    if (fileToRead == null)
        throw new RuntimeException("cannot read partition " + currentPart);
    int buffsz = 64 * 1024;
    DoubleWritable readKey = new DoubleWritable();
    IntWritable readValue = new IntWritable();
    FSDataInputStream currentStream = null;
    double ret = -1;
    try {
        currentStream = fs.open(fileToRead, buffsz);
        boolean contain0s = false;
        long numZeros = 0;
        if (currentPart == metadata.getPartitionOfZero()) {
            contain0s = true;
            numZeros = metadata.getNumberOfZero();
        }
        ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);
        int numRead = 0;
        while (numRead <= offset) {
            reader.readNextKeyValuePairs(readKey, readValue);
            numRead += readValue.get();
            cum_weight += readValue.get();
        }
        ret = readKey.get();
        if (average) {
            if (numRead <= offset + 1) {
                reader.readNextKeyValuePairs(readKey, readValue);
                cum_weight += readValue.get();
                ret = (ret + readKey.get()) / 2;
            }
        }
    } finally {
        IOUtilFunctions.closeSilently(currentStream);
    }
    return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
}
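A minimal sketch of how this helper might be called to read a quantile from the sorted output; the wrapper method and directory name are illustrative, and imports of the SystemML-internal MapReduceTool and MetaDataNumItemsByEachReducer classes are omitted because their packages are not shown here.

public class PickValueWeightSketch {
    // metadata comes from the completed sort job; how it is obtained is elided here.
    public static double median(String sortedDir, MetaDataNumItemsByEachReducer metadata) throws Exception {
        // p = 0.5 selects the median; average = true averages the two middle values when the total weight is even.
        double[] result = MapReduceTool.pickValueWeight(sortedDir, metadata, 0.5, true);
        // result[0] is the picked value; result[1] and result[2] are -1 when averaging is applied.
        return result[0];
    }
}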