
Example 46 with NullWritable

Use of org.apache.hadoop.io.NullWritable in project hive by apache.

The class ParquetRecordReaderWrapper, method next.

@Override
public boolean next(final NullWritable key, final ArrayWritable value) throws IOException {
    if (eof) {
        return false;
    }
    try {
        if (firstRecord) {
            // key & value are already read.
            firstRecord = false;
        } else if (!realReader.nextKeyValue()) {
            // strictly not required, just for consistency
            eof = true;
            return false;
        }
        final ArrayWritable tmpCurValue = realReader.getCurrentValue();
        if (value != tmpCurValue) {
            final Writable[] arrValue = value.get();
            final Writable[] arrCurrent = tmpCurValue.get();
            if (value != null && arrValue.length == arrCurrent.length) {
                System.arraycopy(arrCurrent, 0, arrValue, 0, arrCurrent.length);
            } else {
                if (arrValue.length != arrCurrent.length) {
                    throw new IOException("DeprecatedParquetHiveInput : size of object differs. Value" + " size :  " + arrValue.length + ", Current Object size : " + arrCurrent.length);
                } else {
                    throw new IOException("DeprecatedParquetHiveInput can not support RecordReaders that" + " don't return same key & value & value is null");
                }
            }
        }
        return true;
    } catch (final InterruptedException e) {
        throw new IOException(e);
    }
}
Also used : ArrayWritable(org.apache.hadoop.io.ArrayWritable) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException)
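
NullWritable is only a placeholder key here; callers drive the wrapper through the standard mapred RecordReader contract. A minimal sketch of that loop, assuming the reader has already been obtained from the input format (ReadAllRows and countRows are illustrative names, not Hive classes):

import java.io.IOException;

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public final class ReadAllRows {

    // Drains a NullWritable-keyed reader such as ParquetRecordReaderWrapper and counts rows.
    public static long countRows(RecordReader<NullWritable, ArrayWritable> reader) throws IOException {
        // The key carries no data; the value object is reused and overwritten by next().
        NullWritable key = reader.createKey();
        ArrayWritable value = reader.createValue();
        long rows = 0;
        while (reader.next(key, value)) {
            // next() copies the current row into the caller-supplied value, as shown above.
            rows++;
        }
        reader.close();
        return rows;
    }
}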

Example 47 with NullWritable

Use of org.apache.hadoop.io.NullWritable in project hive by apache.

The class TestFileSinkOperator, method confirmOutput.

private void confirmOutput(DataFormat rType) throws IOException, SerDeException, CloneNotSupportedException {
    Path[] paths = findFilesInBasePath();
    TFSOInputFormat input = new TFSOInputFormat(rType);
    FileInputFormat.setInputPaths(jc, paths);
    InputSplit[] splits = input.getSplits(jc, 1);
    RecordReader<NullWritable, Row> reader = input.getRecordReader(splits[0], jc, Mockito.mock(Reporter.class));
    NullWritable key = reader.createKey();
    Row value = reader.createValue();
    List<Row> results = new ArrayList<Row>(rows.size());
    List<Row> sortedRows = new ArrayList<Row>(rows.size());
    for (int i = 0; i < rows.size(); i++) {
        Assert.assertTrue(reader.next(key, value));
        results.add(value.clone());
        sortedRows.add(rows.get(i));
    }
    Assert.assertFalse(reader.next(key, value));
    Collections.sort(results);
    Collections.sort(sortedRows);
    for (int i = 0; i < rows.size(); i++) {
        Assert.assertTrue(sortedRows.get(i).equals(results.get(i)));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Reporter(org.apache.hadoop.mapred.Reporter) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapred.InputSplit) NullWritable(org.apache.hadoop.io.NullWritable)
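
The test can reuse one key object across every next() call because NullWritable is a singleton with an empty serialized form. A small illustrative test (not part of the Hive sources) showing that property:

import org.apache.hadoop.io.NullWritable;
import org.junit.Assert;
import org.junit.Test;

public class NullWritableSingletonTest {

    @Test
    public void getReturnsSharedInstance() {
        // NullWritable has no public constructor; get() always hands back the same instance.
        NullWritable a = NullWritable.get();
        NullWritable b = NullWritable.get();
        Assert.assertSame(a, b);
        // All NullWritables compare equal, so reusing one key across next() calls is safe.
        Assert.assertEquals(0, a.compareTo(b));
    }
}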

Example 48 with NullWritable

Use of org.apache.hadoop.io.NullWritable in project hadoop-book by elephantscale.

The class TeraInputFormat, method writePartitionFile.

/**
     * Use the input splits to take samples of the input and generate sample
     * keys. By default reads 100,000 keys from 10 locations in the input, sorts
     * them and picks N-1 keys to generate N equally sized partitions.
     *
     * @param conf the job to sample
     * @param partFile where to write the output file to
     * @throws IOException if something goes wrong
     */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        RecordReader<Text, Text> reader = inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        while (reader.next(key, value)) {
            sampler.addKey(key);
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
Also used : SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) Text(org.apache.hadoop.io.Text) InputSplit(org.apache.hadoop.mapred.InputSplit) NullWritable(org.apache.hadoop.io.NullWritable)
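
The partition file written above stores Text cut points with NullWritable values, so a consumer only reads the key side. A minimal sketch of reading it back with SequenceFile.Reader (ReadPartitionFile and readCutPoints are illustrative names, not part of the project):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public final class ReadPartitionFile {

    // Reads the Text/NullWritable SequenceFile written by writePartitionFile()
    // and returns the partition cut points.
    public static List<Text> readCutPoints(Configuration conf, Path partFile) throws IOException {
        FileSystem fs = partFile.getFileSystem(conf);
        List<Text> cutPoints = new ArrayList<Text>();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
        try {
            Text key = new Text();
            NullWritable value = NullWritable.get(); // the value side carries no data
            while (reader.next(key, value)) {
                cutPoints.add(new Text(key)); // copy, because the reader reuses the key object
            }
        } finally {
            reader.close();
        }
        return cutPoints;
    }
}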

Example 49 with NullWritable

Use of org.apache.hadoop.io.NullWritable in project pinot by linkedin.

The class TopkPhaseTest, method testTopKColumnTransformationPhase.

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }
    List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run();
    // for each record, we emit 2 records per dimension:
    // once for actual value of dimension, once for ALL,ALL
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size());
    Map<String, Integer> counts = new HashMap<>();
    for (Pair<BytesWritable, BytesWritable> pair : result) {
        TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes());
        String dimensionName = key.getDimensionName();
        Integer count = counts.get(dimensionName);
        if (count == null) {
            count = 0;
        }
        counts.put(dimensionName, count + 1);
    }
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3"));
    Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0"));
    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result);
    reduceDriver.addAll(reduceInput);
    reduceDriver.run();
    File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE);
    Assert.assertTrue("Topk file failed to generate!", topKFile.exists());
    TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile), TopKDimensionValues.class);
    Map<String, Set<String>> topkMap = topk.getTopKDimensions();
    Assert.assertEquals("Incorrect topk object", topkMap.size(), 1);
    Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2"));
    Assert.assertEquals("Incorrect whitelist values in topk object", null, topkMap.get("d3"));
}
Also used : Set(java.util.Set) HashMap(java.util.HashMap) AvroKey(org.apache.avro.mapred.AvroKey) BytesWritable(org.apache.hadoop.io.BytesWritable) NullWritable(org.apache.hadoop.io.NullWritable) FileInputStream(java.io.FileInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Pair(org.apache.hadoop.mrunit.types.Pair) Test(org.junit.Test)
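
The driver is fed (AvroKey&lt;GenericRecord&gt;, NullWritable) pairs because the Avro input carries everything in the key. A hypothetical wiring sketch with a stand-in mapper (PassThroughMapper is not a Pinot class, and Avro serialization must still be registered on the driver's configuration, as the next example's resetAvroSerialization() suggests):

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;

public class TopKStyleDriverSketch {

    // Stand-in mapper with the same input shape as the test above; a real mapper would
    // serialize dimension and metric data into BytesWritable pairs.
    public static class PassThroughMapper
            extends Mapper<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> {
        @Override
        protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
                throws IOException, InterruptedException {
            // intentionally empty: this stub emits nothing
        }
    }

    public static MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable>
            newDriver(GenericRecord record) {
        MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> driver =
                MapDriver.newMapDriver(new PassThroughMapper());
        // NullWritable.get() supplies the conventional "no value" placeholder for Avro-keyed input.
        driver.withInput(new AvroKey<GenericRecord>(record), NullWritable.get());
        return driver;
    }
}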

Example 50 with NullWritable

Use of org.apache.hadoop.io.NullWritable in project pinot by linkedin.

The class DerivedColumnNoTransformationTest, method testTopKColumnTransformationPhase.

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
        inKey.datum(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }
    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());
    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        System.out.println(datum.getSchema().getFields().size());
        Assert.assertEquals("Input records must contain same number of fields as output record, when schemas are not transformed", datum.getSchema().getFields().size(), 6);
    }
}
Also used : AvroKey(org.apache.avro.mapred.AvroKey) GenericRecord(org.apache.avro.generic.GenericRecord) NullWritable(org.apache.hadoop.io.NullWritable) Pair(org.apache.hadoop.mrunit.types.Pair) Test(org.junit.Test)
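
The output pairs checked above have the same (AvroKey&lt;GenericRecord&gt;, NullWritable) shape as the input. A hypothetical identity mapper (not from the Pinot sources) that would produce exactly that shape:

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical no-transformation mapper: each record is forwarded unchanged, so the output
// schema keeps the same fields as the input, which is what the assertion above checks.
public class IdentityAvroMapper
        extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, NullWritable> {

    @Override
    protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}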

Aggregations

Types most often used together with NullWritable across the indexed projects (usage counts):

NullWritable (org.apache.hadoop.io.NullWritable): 101
Test (org.junit.Test): 65
Configuration (org.apache.hadoop.conf.Configuration): 41
Path (org.apache.hadoop.fs.Path): 41
File (java.io.File): 29
FileSystem (org.apache.hadoop.fs.FileSystem): 26
SequenceFile (org.apache.hadoop.io.SequenceFile): 22
JobConf (org.apache.hadoop.mapred.JobConf): 22
RouteBuilder (org.apache.camel.builder.RouteBuilder): 18
MockEndpoint (org.apache.camel.component.mock.MockEndpoint): 18
ArrayFile (org.apache.hadoop.io.ArrayFile): 18
Text (org.apache.hadoop.io.Text): 16
InputSplit (org.apache.hadoop.mapred.InputSplit): 16
LongWritable (org.apache.hadoop.io.LongWritable): 15
IntWritable (org.apache.hadoop.io.IntWritable): 10
Writer (org.apache.hadoop.io.SequenceFile.Writer): 9
CharacteristicSetWritable (org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable): 8
IOException (java.io.IOException): 7
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 7
FloatWritable (org.apache.hadoop.io.FloatWritable): 7