
Example 51 with NullWritable

use of org.apache.hadoop.io.NullWritable in project hive by apache.

the class TestOrcRawRecordMerger method testRecordReaderOldBaseAndDelta.

/**
   * Test the OrcRecordUpdater with the OrcRawRecordMerger when there is
   * a base and a delta.
   * @throws Exception
   */
@Test
public void testRecordReaderOldBaseAndDelta() throws Exception {
    final int BUCKET = 10;
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testOldBaseAndDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write the base
    MemoryManager mgr = new MemoryManager(conf) {

        int rowsAddedSinceCheck = 0;

        @Override
        public synchronized void addedRow(int rows) throws IOException {
            rowsAddedSinceCheck += rows;
            if (rowsAddedSinceCheck >= 2) {
                notifyWriters();
                rowsAddedSinceCheck = 0;
            }
        }
    };
    // make 5 stripes with 2 rows each
    Writer writer = OrcFile.createWriter(new Path(root, "0000010_0"), OrcFile.writerOptions(conf).inspector(inspector).fileSystem(fs).blockPadding(false).bufferSize(10000).compress(CompressionKind.NONE).stripeSize(1).memory(mgr).batchSize(2).version(OrcFile.Version.V_0_11));
    String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1", "3.0", "ignore.4", "ignore.5", "ignore.6" };
    for (int i = 0; i < values.length; ++i) {
        writer.addRow(new BigRow(i, i, values[i], i, i));
    }
    writer.close();
    // write a delta
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).writingBase(false).minimumTransactionId(1).maximumTransactionId(1).bucket(BUCKET).inspector(inspector).filesystem(fs).recordIdColumn(5).finalDestination(root);
    RecordUpdater ru = of.getRecordUpdater(root, options);
    values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
    for (int i = 0; i < values.length; ++i) {
        if (values[i] != null) {
            ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
        }
    }
    ru.delete(100, new BigRow(9, 0, BUCKET));
    ru.close(false);
    // write a second delta
    options = options.minimumTransactionId(2).maximumTransactionId(2);
    ru = of.getRecordUpdater(root, options);
    values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
    for (int i = 0; i < values.length; ++i) {
        if (values[i] != null) {
            ru.update(2, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
        }
    }
    ru.delete(100, new BigRow(8, 0, BUCKET));
    ru.close(false);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(5, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    // read the first 4 splits; the 5th (handled below) should be empty
    for (int i = 0; i < 4; ++i) {
        System.out.println("starting split " + i);
        rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
        NullWritable key = rr.createKey();
        OrcStruct value = rr.createValue();
        // there should be exactly two rows per split
        for (int j = 0; j < 2; ++j) {
            System.out.println("i = " + i + ", j = " + j);
            assertEquals(true, rr.next(key, value));
            System.out.println("record = " + value);
            assertEquals(i + "." + j, value.getFieldValue(2).toString());
        }
        assertEquals(false, rr.next(key, value));
    }
    rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
    assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) MemoryManager(org.apache.orc.impl.MemoryManager) NullWritable(org.apache.hadoop.io.NullWritable) InputFormat(org.apache.hadoop.mapred.InputFormat) Test(org.junit.Test)
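
The read side of this test follows the standard mapred pattern: NullWritable serves as a dummy key while OrcStruct carries the row. A minimal sketch of that pattern on its own, assuming a JobConf already configured as in the test (the helper class is illustrative, not part of Hive):

import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Illustrative helper, not taken from the test above: read every row from every split.
// Assumes 'job' is already configured with the schema, ACID flags and input directory.
public class OrcSplitDumpSketch {
    public static void dumpAllRows(JobConf job) throws Exception {
        OrcInputFormat inf = new OrcInputFormat();
        InputSplit[] splits = inf.getSplits(job, 1);
        for (InputSplit split : splits) {
            RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(split, job, Reporter.NULL);
            NullWritable key = rr.createKey();   // always the same shared instance
            OrcStruct value = rr.createValue();
            while (rr.next(key, value)) {        // the key carries no data; only the value matters
                System.out.println(value);
            }
            rr.close();
        }
    }
}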

Example 52 with NullWritable

use of org.apache.hadoop.io.NullWritable in project hive by apache.

the class LlapInputFormat method getRecordReader.

@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    RecordReader<NullWritable, VectorizedRowBatch> noLlap = checkLlapSplit(split, job, reporter);
    if (noLlap != null)
        return noLlap;
    boolean isVectorized = Utilities.getUseVectorizedInputFileFormat(job);
    FileSplit fileSplit = (FileSplit) split;
    reporter.setStatus(fileSplit.toString());
    try {
        List<Integer> includedCols = ColumnProjectionUtils.isReadAllColumns(job) ? null : ColumnProjectionUtils.getReadColumnIDs(job);
        LlapRecordReader rr = new LlapRecordReader(job, fileSplit, includedCols, hostName, cvp, executor, sourceInputFormat, sourceSerDe, reporter);
        if (!rr.init()) {
            return sourceInputFormat.getRecordReader(split, job, reporter);
        }
        return wrapLlapReader(isVectorized, includedCols, rr, split, job, reporter);
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapred.FileSplit) NullWritable(org.apache.hadoop.io.NullWritable) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
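
The returned reader is consumed batch by batch; as in the other examples, the NullWritable key is only a placeholder. A minimal sketch of the consuming side, assuming a reader produced by the method above (the helper class and method names are illustrative):

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public class LlapBatchCountSketch {
    // Assumes 'reader' was obtained from LlapInputFormat.getRecordReader(...).
    public static long countRows(RecordReader<NullWritable, VectorizedRowBatch> reader) throws IOException {
        NullWritable key = reader.createKey();
        VectorizedRowBatch batch = reader.createValue();
        long rows = 0;
        while (reader.next(key, batch)) {
            rows += batch.size;   // batch.size is the number of valid rows in this batch
        }
        reader.close();
        return rows;
    }
}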

Example 53 with NullWritable

use of org.apache.hadoop.io.NullWritable in project crunch by cloudera.

the class WritablesTest method testNulls.

@Test
public void testNulls() throws Exception {
    Void n = null;
    NullWritable nw = NullWritable.get();
    testInputOutputFn(Writables.nulls(), n, nw);
}
Also used : NullWritable(org.apache.hadoop.io.NullWritable) Test(org.junit.Test)
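
The Crunch test depends on NullWritable being a stateless singleton: NullWritable.get() always returns the same instance, and it serializes to zero bytes. A small standalone check of those two properties (not taken from the Crunch project):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.NullWritable;

public class NullWritableProperties {
    public static void main(String[] args) throws Exception {
        NullWritable a = NullWritable.get();
        NullWritable b = NullWritable.get();
        System.out.println(a == b);            // true: a single shared instance

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        a.write(new DataOutputStream(bytes));  // write() is a no-op
        System.out.println(bytes.size());      // 0: nothing is serialized
    }
}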

Example 54 with NullWritable

use of org.apache.hadoop.io.NullWritable in project hadoop-book by elephantscale.

the class TeraInputFormat method writePartitionFile.

/**
     * Use the input splits to take samples of the input and generate sample
     * keys. By default reads 100,000 keys from 10 locations in the input, sorts
     * them and picks N-1 keys to generate N equally sized partitions.
     *
     * @param conf the job to sample
     * @param partFile where to write the output file to
     * @throws IOException if something goes wrong
     */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        while (reader.next(key, value)) {
            sampler.addKey(key);
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
Also used : SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) Text(org.apache.hadoop.io.Text) InputSplit(org.apache.hadoop.mapred.InputSplit) NullWritable(org.apache.hadoop.io.NullWritable)
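
The partition file only carries the sampled Text keys; the NullWritable values exist to satisfy the SequenceFile key/value contract. A minimal sketch of reading those keys back, assuming the same Hadoop APIs (the helper name is illustrative; TeraSort's own partitioner does the equivalent):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class PartitionFileReaderSketch {
    // Illustrative helper: load the cut-point keys written by writePartitionFile.
    public static List<Text> readPartitionKeys(FileSystem fs, Path partFile, Configuration conf) throws IOException {
        List<Text> keys = new ArrayList<Text>();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
        try {
            Text key = new Text();
            // only the keys are read; the NullWritable values are ignored
            while (reader.next(key)) {
                keys.add(new Text(key));
            }
        } finally {
            reader.close();
        }
        return keys;
    }
}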

Example 55 with NullWritable

use of org.apache.hadoop.io.NullWritable in project Gaffer by gchq.

the class GetJavaRDDOfElementsHandler method doOperation.

private JavaRDD<Element> doOperation(final GetJavaRDDOfElements operation, final Context context, final AccumuloStore accumuloStore) throws OperationException {
    final JavaSparkContext sparkContext = operation.getJavaSparkContext();
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final JavaPairRDD<Element, NullWritable> pairRDD = sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    final JavaRDD<Element> rdd = pairRDD.map(new FirstElement());
    return rdd;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Element(uk.gov.gchq.gaffer.data.element.Element) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) NullWritable(org.apache.hadoop.io.NullWritable)
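
The FirstElement map function used above is not reproduced on this page; its job is simply to drop the NullWritable half of each pair. A hedged sketch of that shape (the class name here is illustrative; the real Gaffer implementation may differ):

import org.apache.hadoop.io.NullWritable;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;
import uk.gov.gchq.gaffer.data.element.Element;

// Illustrative stand-in for Gaffer's FirstElement: keep the Element, discard the NullWritable.
public class FirstElementSketch implements Function<Tuple2<Element, NullWritable>, Element> {
    private static final long serialVersionUID = 1L;

    @Override
    public Element call(final Tuple2<Element, NullWritable> tuple) throws Exception {
        return tuple._1();
    }
}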

Aggregations

NullWritable (org.apache.hadoop.io.NullWritable): 99
Test (org.junit.Test): 65
Path (org.apache.hadoop.fs.Path): 44
Configuration (org.apache.hadoop.conf.Configuration): 41
File (java.io.File): 29
FileSystem (org.apache.hadoop.fs.FileSystem): 26
SequenceFile (org.apache.hadoop.io.SequenceFile): 22
JobConf (org.apache.hadoop.mapred.JobConf): 21
RouteBuilder (org.apache.camel.builder.RouteBuilder): 18
MockEndpoint (org.apache.camel.component.mock.MockEndpoint): 18
ArrayFile (org.apache.hadoop.io.ArrayFile): 18
LongWritable (org.apache.hadoop.io.LongWritable): 15
Text (org.apache.hadoop.io.Text): 15
InputSplit (org.apache.hadoop.mapred.InputSplit): 15
IntWritable (org.apache.hadoop.io.IntWritable): 10
Writer (org.apache.hadoop.io.SequenceFile.Writer): 9
CharacteristicSetWritable (org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable): 8
IOException (java.io.IOException): 7
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 7
FloatWritable (org.apache.hadoop.io.FloatWritable): 7