Use of org.apache.hadoop.io.NullWritable in project hive by apache.
The class ParquetRecordReaderWrapper, method next.
@Override
public boolean next(final NullWritable key, final ArrayWritable value) throws IOException {
  if (eof) {
    return false;
  }
  try {
    if (firstRecord) {
      // key & value are already read.
      firstRecord = false;
    } else if (!realReader.nextKeyValue()) {
      // strictly not required, just for consistency
      eof = true;
      return false;
    }
    final ArrayWritable tmpCurValue = realReader.getCurrentValue();
    if (value != tmpCurValue) {
      final Writable[] arrValue = value.get();
      final Writable[] arrCurrent = tmpCurValue.get();
      if (value != null && arrValue.length == arrCurrent.length) {
        System.arraycopy(arrCurrent, 0, arrValue, 0, arrCurrent.length);
      } else {
        if (arrValue.length != arrCurrent.length) {
          throw new IOException("DeprecatedParquetHiveInput : size of object differs. Value"
              + " size : " + arrValue.length + ", Current Object size : " + arrCurrent.length);
        } else {
          throw new IOException("DeprecatedParquetHiveInput can not support RecordReaders that"
              + " don't return same key & value & value is null");
        }
      }
    }
    return true;
  } catch (final InterruptedException e) {
    throw new IOException(e);
  }
}
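For context, a minimal sketch of how a caller typically drives an old-API RecordReader like this one, reusing the key and value objects across calls. The reader construction and the process() consumer are assumptions for illustration, not taken from the Hive source:

// Minimal usage sketch; "reader" is assumed to be a
// RecordReader<NullWritable, ArrayWritable> obtained from the wrapping InputFormat.
NullWritable key = reader.createKey();       // NullWritable is a stateless singleton
ArrayWritable value = reader.createValue();  // reused and overwritten by each next() call
while (reader.next(key, value)) {
  process(value);                            // hypothetical consumer of the current row
}
reader.close();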
Use of org.apache.hadoop.io.NullWritable in project hive by apache.
The class TestFileSinkOperator, method confirmOutput.
private void confirmOutput(DataFormat rType) throws IOException, SerDeException, CloneNotSupportedException {
  Path[] paths = findFilesInBasePath();
  TFSOInputFormat input = new TFSOInputFormat(rType);
  FileInputFormat.setInputPaths(jc, paths);
  InputSplit[] splits = input.getSplits(jc, 1);
  RecordReader<NullWritable, Row> reader = input.getRecordReader(splits[0], jc, Mockito.mock(Reporter.class));
  NullWritable key = reader.createKey();
  Row value = reader.createValue();
  List<Row> results = new ArrayList<Row>(rows.size());
  List<Row> sortedRows = new ArrayList<Row>(rows.size());
  for (int i = 0; i < rows.size(); i++) {
    Assert.assertTrue(reader.next(key, value));
    results.add(value.clone());
    sortedRows.add(rows.get(i));
  }
  Assert.assertFalse(reader.next(key, value));
  Collections.sort(results);
  Collections.sort(sortedRows);
  for (int i = 0; i < rows.size(); i++) {
    Assert.assertTrue(sortedRows.get(i).equals(results.get(i)));
  }
}
Use of org.apache.hadoop.io.NullWritable in project hadoop-book by elephantscale.
The class TeraInputFormat, method writePartitionFile.
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i + 1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for (Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
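As a complement, a hedged sketch of reading such a partition file back with the standard SequenceFile API: the keys are the Text partition boundaries and the values are NullWritable placeholders. This reader loop is not part of TeraInputFormat; it is only an assumption for illustration:

Configuration conf = new Configuration();
FileSystem fs = partFile.getFileSystem(conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
Text key = new Text();
NullWritable value = NullWritable.get();    // zero-byte placeholder, nothing to deserialize
while (reader.next(key, value)) {
  System.out.println("partition boundary: " + key);
}
reader.close();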
Use of org.apache.hadoop.io.NullWritable in project pinot by linkedin.
The class TopkPhaseTest, method testTopKColumnTransformationPhase.
@Test
public void testTopKColumnTransformationPhase() throws Exception {
  int recordCount = 0;
  List<GenericRecord> inputRecords = generateTestMapperData();
  for (GenericRecord record : inputRecords) {
    AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
    inKey.datum(record);
    mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
    recordCount++;
  }
  List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run();
  // for each record, we emit 2 records per dimension:
  // once for actual value of dimension, once for ALL,ALL
  Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size());
  Map<String, Integer> counts = new HashMap<>();
  for (Pair<BytesWritable, BytesWritable> pair : result) {
    TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes());
    String dimensionName = key.getDimensionName();
    Integer count = counts.get(dimensionName);
    if (count == null) {
      count = 0;
    }
    counts.put(dimensionName, count + 1);
  }
  Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1"));
  Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2"));
  Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3"));
  Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0"));
  List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result);
  reduceDriver.addAll(reduceInput);
  reduceDriver.run();
  File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE);
  Assert.assertTrue("Topk file failed to generate!", topKFile.exists());
  TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile), TopKDimensionValues.class);
  Map<String, Set<String>> topkMap = topk.getTopKDimensions();
  Assert.assertEquals("Incorrect topk object", topkMap.size(), 1);
  Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2"));
  Assert.assertEquals("Incorrect whitelist values in topk object", null, topkMap.get("d3"));
}
Use of org.apache.hadoop.io.NullWritable in project pinot by linkedin.
The class DerivedColumnNoTransformationTest, method testTopKColumnTransformationPhase.
@Test
public void testTopKColumnTransformationPhase() throws Exception {
  int recordCount = 0;
  List<GenericRecord> inputRecords = generateTestData();
  for (GenericRecord record : inputRecords) {
    AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
    inKey.datum(record);
    mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
    recordCount++;
  }
  resetAvroSerialization();
  List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
  Assert.assertEquals(recordCount, result.size());
  for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
    GenericRecord datum = pair.getFirst().datum();
    System.out.println(datum.getSchema().getFields().size());
    Assert.assertEquals("Input records must contain same number of fields as output record, when schemas are not transformed",
        datum.getSchema().getFields().size(), 6);
  }
}
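Both Pinot tests feed (AvroKey, NullWritable) pairs into an MRUnit-style map driver. A minimal sketch of the mapper shape such a driver exercises, where NullWritable.get() serves as the zero-byte value; the class name and the pass-through logic below are hypothetical, not taken from the Pinot code:

// Assumes org.apache.avro.mapred.AvroKey, org.apache.avro.generic.GenericRecord,
// org.apache.hadoop.io.NullWritable, org.apache.hadoop.mapreduce.Mapper, java.io.IOException.
public class PassThroughAvroMapper
    extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, NullWritable> {
  @Override
  protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
      throws IOException, InterruptedException {
    // NullWritable.get() always returns the same shared instance and serializes to no bytes,
    // so it is the conventional stand-in when only the key carries data.
    context.write(key, NullWritable.get());
  }
}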