Use of org.apache.hadoop.io.DoubleWritable in project nifi by apache: class NiFiOrcUtils, method convertToORCObject.
public static Object convertToORCObject(TypeInfo typeInfo, Object o) {
    if (o != null) {
        if (typeInfo instanceof UnionTypeInfo) {
            OrcUnion union = new OrcUnion();
            // Need to find which of the union types correspond to the primitive object
            TypeInfo objectTypeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(
                    ObjectInspectorFactory.getReflectionObjectInspector(o.getClass(), ObjectInspectorFactory.ObjectInspectorOptions.JAVA));
            List<TypeInfo> unionTypeInfos = ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos();
            int index = 0;
            while (index < unionTypeInfos.size() && !unionTypeInfos.get(index).equals(objectTypeInfo)) {
                index++;
            }
            if (index < unionTypeInfos.size()) {
                union.set((byte) index, convertToORCObject(objectTypeInfo, o));
            } else {
                throw new IllegalArgumentException("Object Type for class " + o.getClass().getName() + " not in Union declaration");
            }
            return union;
        }
        if (o instanceof Integer) {
            return new IntWritable((int) o);
        }
        if (o instanceof Boolean) {
            return new BooleanWritable((boolean) o);
        }
        if (o instanceof Long) {
            return new LongWritable((long) o);
        }
        if (o instanceof Float) {
            return new FloatWritable((float) o);
        }
        if (o instanceof Double) {
            return new DoubleWritable((double) o);
        }
        if (o instanceof String || o instanceof Utf8 || o instanceof GenericData.EnumSymbol) {
            return new Text(o.toString());
        }
        if (o instanceof ByteBuffer) {
            return new BytesWritable(((ByteBuffer) o).array());
        }
        if (o instanceof int[]) {
            int[] intArray = (int[]) o;
            return Arrays.stream(intArray)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("int"), element))
                    .collect(Collectors.toList());
        }
        if (o instanceof long[]) {
            long[] longArray = (long[]) o;
            return Arrays.stream(longArray)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("bigint"), element))
                    .collect(Collectors.toList());
        }
        if (o instanceof float[]) {
            float[] floatArray = (float[]) o;
            return IntStream.range(0, floatArray.length)
                    .mapToDouble(i -> floatArray[i])
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("float"), (float) element))
                    .collect(Collectors.toList());
        }
        if (o instanceof double[]) {
            double[] doubleArray = (double[]) o;
            return Arrays.stream(doubleArray)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("double"), element))
                    .collect(Collectors.toList());
        }
        if (o instanceof boolean[]) {
            boolean[] booleanArray = (boolean[]) o;
            return IntStream.range(0, booleanArray.length)
                    .map(i -> booleanArray[i] ? 1 : 0)
                    .mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("boolean"), element == 1))
                    .collect(Collectors.toList());
        }
        if (o instanceof GenericData.Array) {
            GenericData.Array array = ((GenericData.Array) o);
            // The type information in this case is interpreted as a List
            TypeInfo listTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo();
            return array.stream().map((element) -> convertToORCObject(listTypeInfo, element)).collect(Collectors.toList());
        }
        if (o instanceof List) {
            return o;
        }
        if (o instanceof Map) {
            Map map = new HashMap();
            TypeInfo keyInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
            TypeInfo valueInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
            // Unions are not allowed as key/value types, so if we convert the key and value objects,
            // they should return Writable objects
            ((Map) o).forEach((key, value) -> {
                Object keyObject = convertToORCObject(keyInfo, key);
                Object valueObject = convertToORCObject(valueInfo, value);
                if (keyObject == null) {
                    throw new IllegalArgumentException("Maps' key cannot be null");
                }
                map.put(keyObject, valueObject);
            });
            return map;
        }
        if (o instanceof GenericData.Record) {
            GenericData.Record record = (GenericData.Record) o;
            TypeInfo recordSchema = NiFiOrcUtils.getOrcField(record.getSchema());
            List<Schema.Field> recordFields = record.getSchema().getFields();
            if (recordFields != null) {
                Object[] fieldObjects = new Object[recordFields.size()];
                for (int i = 0; i < recordFields.size(); i++) {
                    Schema.Field field = recordFields.get(i);
                    Schema fieldSchema = field.schema();
                    Object fieldObject = record.get(field.name());
                    fieldObjects[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), fieldObject);
                }
                return NiFiOrcUtils.createOrcStruct(recordSchema, fieldObjects);
            }
        }
        throw new IllegalArgumentException("Error converting object of type " + o.getClass().getName() + " to ORC type " + typeInfo.getTypeName());
    } else {
        return null;
    }
}
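A minimal usage sketch, assuming the method above and Hive's TypeInfoFactory are on the classpath; the import of NiFiOrcUtils itself is omitted because its package is not shown here, and the values and class name are illustrative only, not taken from the NiFi sources.

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

public class ConvertToOrcSketch {
    public static void main(String[] args) {
        // A plain Java Double should come back wrapped as org.apache.hadoop.io.DoubleWritable.
        TypeInfo doubleType = TypeInfoFactory.getPrimitiveTypeInfo("double");
        Object orcDouble = NiFiOrcUtils.convertToORCObject(doubleType, 3.14d);
        System.out.println(orcDouble instanceof DoubleWritable); // expected: true

        // Strings (and Avro Utf8 values or enum symbols) are converted to Text.
        TypeInfo stringType = TypeInfoFactory.getPrimitiveTypeInfo("string");
        Object orcText = NiFiOrcUtils.convertToORCObject(stringType, "hello");
        System.out.println(orcText instanceof Text); // expected: true

        // A null input is passed through unchanged.
        System.out.println(NiFiOrcUtils.convertToORCObject(doubleType, null)); // expected: null
    }
}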
Use of org.apache.hadoop.io.DoubleWritable in project incubator-systemml by apache: class SamplingSortMRInputFormat, method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @return index value
* @throws IOException if something goes wrong
* @throws InstantiationException if InstantiationException occurs
* @throws IllegalAccessException if IllegalAccessException occurs
*/
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else {
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                }
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    if (totalcount == 0) // empty input files
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
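A minimal driver sketch of how this method might be invoked before submitting the sort job; the reducer count and partition-file path are illustrative assumptions, and the import of the SystemML-internal SamplingSortMRInputFormat is omitted because its package is not shown here.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class PartitionFileSketch {
    public static void main(String[] args) throws Exception {
        // Assumed: input paths and the SystemML input converter are already configured on this JobConf.
        JobConf conf = new JobConf();
        conf.setNumReduceTasks(4); // N reducers -> N-1 sampled DoubleWritable split points
        Path partFile = new Path("/tmp/sortinput/_partition.lst"); // illustrative location
        int index0 = SamplingSortMRInputFormat.writePartitionFile(conf, partFile);
        System.out.println("index of the first non-negative split point: " + index0);
    }
}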
Use of org.apache.hadoop.io.DoubleWritable in project cdap by caskdata: class AggregationFunctionsTest, method averageGenerateAggregationTest.
@Test
public void averageGenerateAggregationTest() throws Exception {
    DataQualityWritable val1 = new DataQualityWritable();
    val1.set(new DoubleWritable(2.0));
    DataQualityWritable val2 = new DataQualityWritable();
    val2.set(new DoubleWritable(2.0));
    Mean mean = new Mean();
    mean.add(val1);
    mean.add(val2);
    byte[] output = mean.aggregate();
    Assert.assertEquals(2.0, Bytes.toDouble(output), 0);
}
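For reference, a self-contained sketch of the org.apache.hadoop.io.DoubleWritable API that all of these snippets rely on; it is not specific to CDAP, and the class name is made up for illustration.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableSketch {
    public static void main(String[] args) throws Exception {
        DoubleWritable a = new DoubleWritable(2.0);
        DoubleWritable b = new DoubleWritable(3.5);
        System.out.println(a.get());            // 2.0
        System.out.println(a.compareTo(b) < 0); // true: comparison is by the wrapped double value

        // Writables serialize through write()/readFields(); SequenceFiles use this under the hood.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        a.write(new DataOutputStream(bos));
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy.get()); // 2.0
    }
}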
Use of org.apache.hadoop.io.DoubleWritable in project systemml by apache: class MapReduceTool, method pickValueWeight.
public static double[] pickValueWeight(String dir, MetaDataNumItemsByEachReducer metadata, double p, boolean average)
        throws IOException {
    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];
    long total = ranges[ranges.length - 1];
    // do averaging only if it is asked for; and sum_wt is even
    average = average && (total % 2 == 0);
    int currentPart = 0;
    double cum_weight = 0;
    long pos = (long) Math.ceil(total * p);
    while (ranges[currentPart] < pos) {
        currentPart++;
        cum_weight += ranges[currentPart];
    }
    int offset;
    if (currentPart > 0)
        offset = (int) (pos - ranges[currentPart - 1] - 1);
    else
        offset = (int) pos - 1;
    Path path = new Path(dir);
    FileSystem fs = IOUtilFunctions.getFileSystem(path);
    FileStatus[] files = fs.listStatus(path);
    Path fileToRead = null;
    for (FileStatus file : files) {
        if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
            fileToRead = file.getPath();
            break;
        }
    }
    if (fileToRead == null)
        throw new RuntimeException("cannot read partition " + currentPart);
    int buffsz = 64 * 1024;
    DoubleWritable readKey = new DoubleWritable();
    IntWritable readValue = new IntWritable();
    FSDataInputStream currentStream = null;
    double ret = -1;
    try {
        currentStream = fs.open(fileToRead, buffsz);
        boolean contain0s = false;
        long numZeros = 0;
        if (currentPart == metadata.getPartitionOfZero()) {
            contain0s = true;
            numZeros = metadata.getNumberOfZero();
        }
        ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);
        int numRead = 0;
        while (numRead <= offset) {
            reader.readNextKeyValuePairs(readKey, readValue);
            numRead += readValue.get();
            cum_weight += readValue.get();
        }
        ret = readKey.get();
        if (average) {
            if (numRead <= offset + 1) {
                reader.readNextKeyValuePairs(readKey, readValue);
                cum_weight += readValue.get();
                ret = (ret + readKey.get()) / 2;
            }
        }
    } finally {
        IOUtilFunctions.closeSilently(currentStream);
    }
    return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
}
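A minimal sketch of how this helper might be called to read a quantile from the sorted output; the wrapper method and directory name are illustrative, and imports of the SystemML-internal MapReduceTool and MetaDataNumItemsByEachReducer classes are omitted because their packages are not shown here.

public class PickValueWeightSketch {
    // metadata comes from the completed sort job; how it is obtained is elided here.
    public static double median(String sortedDir, MetaDataNumItemsByEachReducer metadata) throws Exception {
        // p = 0.5 selects the median; average = true averages the two middle values when the total weight is even.
        double[] result = MapReduceTool.pickValueWeight(sortedDir, metadata, 0.5, true);
        // result[0] is the picked value; result[1] and result[2] are -1 when averaging is applied.
        return result[0];
    }
}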