Use of org.apache.hadoop.io.NullWritable in project hive by apache.
The class TestOrcRawRecordMerger, method testRecordReaderOldBaseAndDelta.
/**
* Test the OrcRecordUpdater with the OrcRawRecordMerger when there is
* a base and a delta.
* @throws Exception
*/
@Test
public void testRecordReaderOldBaseAndDelta() throws Exception {
  final int BUCKET = 10;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf);
  Path root = new Path(tmpDir, "testOldBaseAndDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // write the base
  MemoryManager mgr = new MemoryManager(conf) {
    int rowsAddedSinceCheck = 0;

    @Override
    public synchronized void addedRow(int rows) throws IOException {
      rowsAddedSinceCheck += rows;
      if (rowsAddedSinceCheck >= 2) {
        notifyWriters();
        rowsAddedSinceCheck = 0;
      }
    }
  };
  // make 5 stripes with 2 rows each
  Writer writer = OrcFile.createWriter(new Path(root, "0000010_0"),
      OrcFile.writerOptions(conf).inspector(inspector).fileSystem(fs)
          .blockPadding(false).bufferSize(10000).compress(CompressionKind.NONE)
          .stripeSize(1).memory(mgr).batchSize(2).version(OrcFile.Version.V_0_11));
  String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1",
      "3.0", "ignore.4", "ignore.5", "ignore.6" };
  for (int i = 0; i < values.length; ++i) {
    writer.addRow(new BigRow(i, i, values[i], i, i));
  }
  writer.close();
  // write a delta
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).writingBase(false)
      .minimumTransactionId(1).maximumTransactionId(1).bucket(BUCKET).inspector(inspector)
      .filesystem(fs).recordIdColumn(5).finalDestination(root);
  RecordUpdater ru = of.getRecordUpdater(root, options);
  values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
    }
  }
  ru.delete(100, new BigRow(9, 0, BUCKET));
  ru.close(false);
  // write a delta
  options = options.minimumTransactionId(2).maximumTransactionId(2);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(2, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
    }
  }
  ru.delete(100, new BigRow(8, 0, BUCKET));
  ru.close(false);
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  job.set("mapred.min.split.size", "1");
  job.set("mapred.max.split.size", "2");
  job.set("mapred.input.dir", root.toString());
  InputSplit[] splits = inf.getSplits(job, 5);
  assertEquals(5, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
  // loop through the first four splits and read each one
  for (int i = 0; i < 4; ++i) {
    System.out.println("starting split " + i);
    rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    // there should be exactly two rows per split
    for (int j = 0; j < 2; ++j) {
      System.out.println("i = " + i + ", j = " + j);
      assertEquals(true, rr.next(key, value));
      System.out.println("record = " + value);
      assertEquals(i + "." + j, value.getFieldValue(2).toString());
    }
    assertEquals(false, rr.next(key, value));
  }
  rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
  assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
}
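The reading loop above follows the standard pattern for the old mapred API, where the key type is NullWritable and only the value carries data. Below is a minimal, generic sketch of that pattern; the class and method names RecordReaderDrain and drain are hypothetical and not part of the Hive test.

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public class RecordReaderDrain {
  /** Reads every record from an old-API reader whose key type is NullWritable. */
  public static <V> long drain(RecordReader<NullWritable, V> reader) throws IOException {
    NullWritable key = reader.createKey();   // placeholder key; NullWritable carries no data
    V value = reader.createValue();          // reusable value buffer
    long count = 0;
    while (reader.next(key, value)) {        // next() refills the same value object
      count++;
    }
    reader.close();
    return count;
  }
}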
Use of org.apache.hadoop.io.NullWritable in project hive by apache.
The class LlapInputFormat, method getRecordReader.
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  RecordReader<NullWritable, VectorizedRowBatch> noLlap = checkLlapSplit(split, job, reporter);
  if (noLlap != null) {
    return noLlap;
  }
  boolean isVectorized = Utilities.getUseVectorizedInputFileFormat(job);
  FileSplit fileSplit = (FileSplit) split;
  reporter.setStatus(fileSplit.toString());
  try {
    List<Integer> includedCols = ColumnProjectionUtils.isReadAllColumns(job)
        ? null : ColumnProjectionUtils.getReadColumnIDs(job);
    LlapRecordReader rr = new LlapRecordReader(job, fileSplit, includedCols, hostName, cvp, executor,
        sourceInputFormat, sourceSerDe, reporter);
    if (!rr.init()) {
      return sourceInputFormat.getRecordReader(split, job, reporter);
    }
    return wrapLlapReader(isVectorized, includedCols, rr, split, job, reporter);
  } catch (Exception ex) {
    throw new IOException(ex);
  }
}
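A caller of this method receives a RecordReader<NullWritable, VectorizedRowBatch>, so rows arrive in batches rather than one at a time. The following is a rough sketch, not taken from Hive, of how such a reader could be consumed; BatchCounter and countRows are made-up names.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public class BatchCounter {
  /** Counts the rows delivered by a vectorized reader keyed by NullWritable. */
  public static long countRows(RecordReader<NullWritable, VectorizedRowBatch> reader) throws IOException {
    NullWritable key = reader.createKey();
    VectorizedRowBatch batch = reader.createValue();
    long rows = 0;
    while (reader.next(key, batch)) {
      rows += batch.size;   // size is the number of valid rows in the current batch
    }
    reader.close();
    return rows;
  }
}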
Use of org.apache.hadoop.io.NullWritable in project crunch by cloudera.
The class WritablesTest, method testNulls.
@Test
public void testNulls() throws Exception {
  Void n = null;
  NullWritable nw = NullWritable.get();
  testInputOutputFn(Writables.nulls(), n, nw);
}
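The Crunch test maps a Java null (typed as Void) to NullWritable, which works because NullWritable is Hadoop's placeholder type: NullWritable.get() returns a singleton, and its write() method emits no bytes. A small stand-alone sketch illustrating both properties; the class name NullWritableDemo is invented here.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;

public class NullWritableDemo {
  public static void main(String[] args) throws IOException {
    NullWritable a = NullWritable.get();
    NullWritable b = NullWritable.get();
    System.out.println(a == b);              // true: NullWritable is a singleton
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    a.write(new DataOutputStream(bytes));    // write() emits nothing
    System.out.println(bytes.size());        // 0: the value occupies no space on the wire
  }
}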
Use of org.apache.hadoop.io.NullWritable in project hadoop-book by elephantscale.
The class TeraInputFormat, method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @throws IOException if something goes wrong
*/
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i + 1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile,
      Text.class, NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for (Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
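Because the partition file stores Text keys with NullWritable values, a reader only needs the keys. The following is a sketch of reading the cut points back with a SequenceFile.Reader; PartitionFileReader and readCutPoints are illustrative names, not part of the TeraSort code.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class PartitionFileReader {
  /** Reads back the Text/NullWritable partition boundaries written above. */
  public static List<Text> readCutPoints(Configuration conf, Path partFile) throws IOException {
    FileSystem fs = partFile.getFileSystem(conf);
    List<Text> cutPoints = new ArrayList<>();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
    try {
      Text key = new Text();
      NullWritable value = NullWritable.get();
      while (reader.next(key, value)) {      // the value carries no data; only the keys matter
        cutPoints.add(new Text(key));        // copy, because next() reuses the key buffer
      }
    } finally {
      reader.close();
    }
    return cutPoints;
  }
}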
Use of org.apache.hadoop.io.NullWritable in project Gaffer by gchq.
The class GetJavaRDDOfElementsHandler, method doOperation.
private JavaRDD<Element> doOperation(final GetJavaRDDOfElements operation, final Context context,
    final AccumuloStore accumuloStore) throws OperationException {
  final JavaSparkContext sparkContext = operation.getJavaSparkContext();
  final Configuration conf = getConfiguration(operation);
  // Use the batch scan option when performing a seeded operation
  InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
  addIterators(accumuloStore, conf, context.getUser(), operation);
  addRanges(accumuloStore, conf, operation);
  final JavaPairRDD<Element, NullWritable> pairRDD = sparkContext.newAPIHadoopRDD(conf,
      ElementInputFormat.class, Element.class, NullWritable.class);
  final JavaRDD<Element> rdd = pairRDD.map(new FirstElement());
  return rdd;
}
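FirstElement itself is not shown in this snippet; conceptually it extracts the Element key from each (Element, NullWritable) pair and drops the placeholder value. Below is a minimal sketch of a function with that behaviour; the class name FirstElementSketch is invented for illustration, and the Element import is assumed from Gaffer's public API.

import org.apache.hadoop.io.NullWritable;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;
import uk.gov.gchq.gaffer.data.element.Element;

public class FirstElementSketch implements Function<Tuple2<Element, NullWritable>, Element> {
  @Override
  public Element call(final Tuple2<Element, NullWritable> tuple) {
    return tuple._1();   // keep the Element key, discard the NullWritable placeholder
  }
}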