
Example 6 with RecordWriter

Use of org.apache.hadoop.mapred.RecordWriter in project hive by apache.

In the class TestInputOutputFormat, method testSplitElimination: the test writes three rows whose field values are all positive, pushes the predicate z < 0 down via the sarg.pushdown property, and expects the ORC statistics to let the input format eliminate every split.

@Test
public void testSplitElimination() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "z,r");
    properties.setProperty("columns.types", "int:struct<x:int,y:int>");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    AbstractSerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    conf.setInt("mapred.max.split.size", 50);
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
    writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .lessThan("z", PredicateLeaf.Type.LONG, 0L)
        .end()
        .build();
    conf.set("sarg.pushdown", toKryo(sarg));
    conf.set("hive.io.file.readcolumn.names", "z,r");
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(0, splits.length);
}
Also used : SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) Properties(java.util.Properties) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) RecordWriter(org.apache.hadoop.mapred.RecordWriter) InputSplit(org.apache.hadoop.mapred.InputSplit) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
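
The same Builder composes into larger predicates. As an illustrative sketch only (not part of the original test; it reuses the test's conf field and its toKryo helper), a compound filter such as z < 0 OR z > 100 could be pushed down like this:

    SearchArgument compound = SearchArgumentFactory.newBuilder()
        .startOr()
        .lessThan("z", PredicateLeaf.Type.LONG, 0L)
        // z > 100 expressed as NOT (z <= 100)
        .startNot()
        .lessThanEquals("z", PredicateLeaf.Type.LONG, 100L)
        .end()
        .end()
        .build();
    conf.set("sarg.pushdown", toKryo(compound));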

Example 7 with RecordWriter

Use of org.apache.hadoop.mapred.RecordWriter in project hive by apache.

In the class TestInputOutputFormat, method testSplitEliminationNullStats: every row written stores only a null value, so the statistics for column z carry no minimum or maximum; the test verifies that the pushed-down predicate z < "foo" still eliminates all splits.

@Test
public void testSplitEliminationNullStats() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector = createSoi();
    AbstractSerDe serde = new OrcSerde();
    OutputFormat<?, ?> outFormat = new OrcOutputFormat();
    conf.setInt("mapred.max.split.size", 50);
    RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
    writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
    writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
    writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
    writer.close(Reporter.NULL);
    serde = new OrcSerde();
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .lessThan("z", PredicateLeaf.Type.STRING, "foo")
        .end()
        .build();
    conf.set("sarg.pushdown", toKryo(sarg));
    conf.set("hive.io.file.readcolumn.names", "z");
    properties.setProperty("columns", "z");
    properties.setProperty("columns.types", "string");
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    inspector = (StructObjectInspector) serde.getObjectInspector();
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(0, splits.length);
}
Also used : SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) Properties(java.util.Properties) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) RecordWriter(org.apache.hadoop.mapred.RecordWriter) InputSplit(org.apache.hadoop.mapred.InputSplit) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
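
To see the statistics that drive this elimination, one could open the freshly written file and dump its column statistics. A minimal sketch, assuming the test's fs, conf and testFilePath fields and the Reader, OrcFile and ColumnStatistics classes from org.apache.hadoop.hive.ql.io.orc:

    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    for (ColumnStatistics stats : reader.getStatistics()) {
        // For the all-null column written above, getNumberOfValues() is 0 and there is
        // no minimum/maximum, so no row can satisfy z < "foo" and every split is dropped.
        System.out.println(stats.getNumberOfValues() + " values: " + stats);
    }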

Example 8 with RecordWriter

Use of org.apache.hadoop.mapred.RecordWriter in project hive by apache.

In the class DynamicPartitionFileRecordWriterContainer, method getLocalFileWriter: returns the LocalFileWriter for a record's dynamic-partition key, lazily creating and caching the base RecordWriter, SerDe, output committer and task context the first time a key is seen, and enforcing the configured maximum number of dynamic partitions.

@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }
    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, "Number of dynamic partitions being created " + "exceeds configured max allowable partitions[" + maxDynamicPartitions + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
        }
        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil.createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
        // Setup serDe.
        AbstractSerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }
        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.
        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf().getOutputCommitter();
        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);
        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(), currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir", new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath().toString());
        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);
        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
        RecordWriter baseRecordWriter = baseOF.getRecordWriter(parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(), childPath.toString(), InternalUtil.createReporter(currTaskContext));
        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }
    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey), baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
Also used : Path(org.apache.hadoop.fs.Path) FileOutputCommitter(org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter) ArrayList(java.util.ArrayList) HCatException(org.apache.hive.hcatalog.common.HCatException) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) RecordWriter(org.apache.hadoop.mapred.RecordWriter) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
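
Stripped of the HCatalog-specific setup, getLocalFileWriter is essentially a lazily populated cache holding one base writer per dynamic-partition key. A simplified, hypothetical sketch of that pattern (class and method names invented for illustration):

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import org.apache.hadoop.mapred.RecordWriter;

    // Hypothetical illustration of the per-partition writer cache; not HCatalog code.
    abstract class PartitionedWriterCache<K, V> {
        private final Map<String, RecordWriter<K, V>> writers = new HashMap<>();
        private final int maxPartitions;

        PartitionedWriterCache(int maxPartitions) {
            this.maxPartitions = maxPartitions;
        }

        // Expensive per-partition setup (serde, committer, task context) would go here.
        protected abstract RecordWriter<K, V> createWriter(String partitionKey) throws IOException;

        RecordWriter<K, V> writerFor(List<String> partitionValues) throws IOException {
            String key = partitionValues.toString();
            RecordWriter<K, V> writer = writers.get(key);
            if (writer == null) {
                if (maxPartitions != -1 && writers.size() >= maxPartitions) {
                    throw new IOException("Too many dynamic partitions: " + writers.size());
                }
                writer = createWriter(key); // created once, reused for later records with the same key
                writers.put(key, writer);
            }
            return writer;
        }
    }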

Example 9 with RecordWriter

Use of org.apache.hadoop.mapred.RecordWriter in project hadoop by apache.

In the class MultipleOutputFormat, method getRecordWriter: builds a composite RecordWriter that derives an output file name from each key/value pair, lazily creating and caching one underlying writer per file.

/**
   * Create a composite record writer that can write key/value data to different
   * output files
   * 
   * @param fs
   *          the file system to use
   * @param job
   *          the job conf for the job
   * @param name
   *          the leaf file name for the output file (such as "part-00000")
   * @param arg3
   *          a progressable for reporting progress.
   * @return a composite record writer
   * @throws IOException
   */
public RecordWriter<K, V> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3) throws IOException {
    final FileSystem myFS = fs;
    final String myName = generateLeafFileName(name);
    final JobConf myJob = job;
    final Progressable myProgressable = arg3;
    return new RecordWriter<K, V>() {

        // a cache storing the record writers for different output files.
        TreeMap<String, RecordWriter<K, V>> recordWriters = new TreeMap<String, RecordWriter<K, V>>();

        public void write(K key, V value) throws IOException {
            // get the file name based on the key
            String keyBasedPath = generateFileNameForKeyValue(key, value, myName);
            // get the file name based on the input file name
            String finalPath = getInputFileBasedOutputFileName(myJob, keyBasedPath);
            // get the actual key
            K actualKey = generateActualKey(key, value);
            V actualValue = generateActualValue(key, value);
            RecordWriter<K, V> rw = this.recordWriters.get(finalPath);
            if (rw == null) {
                // if we don't have the record writer yet for the final path, create
                // one
                // and add it to the cache
                rw = getBaseRecordWriter(myFS, myJob, finalPath, myProgressable);
                this.recordWriters.put(finalPath, rw);
            }
            rw.write(actualKey, actualValue);
        }

        public void close(Reporter reporter) throws IOException {
            Iterator<String> keys = this.recordWriters.keySet().iterator();
            while (keys.hasNext()) {
                RecordWriter<K, V> rw = this.recordWriters.get(keys.next());
                rw.close(reporter);
            }
            this.recordWriters.clear();
        }

    };
}
Also used : Progressable(org.apache.hadoop.util.Progressable) RecordWriter(org.apache.hadoop.mapred.RecordWriter) FileSystem(org.apache.hadoop.fs.FileSystem) Reporter(org.apache.hadoop.mapred.Reporter) TreeMap(java.util.TreeMap) JobConf(org.apache.hadoop.mapred.JobConf)
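
In practice this composite writer is driven through a subclass that overrides generateFileNameForKeyValue to decide where each record goes. A minimal sketch using the shipped MultipleTextOutputFormat (the class name and routing rule are illustrative):

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;

    // Routes each record to a file prefixed by its key, e.g. key "2017-01-01" and
    // leaf name "part-00000" produce "2017-01-01/part-00000".
    public class KeyBasedTextOutputFormat extends MultipleTextOutputFormat<Text, Text> {
        @Override
        protected String generateFileNameForKeyValue(Text key, Text value, String name) {
            return key.toString() + "/" + name;
        }
    }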

Aggregations

RecordWriter (org.apache.hadoop.mapred.RecordWriter) 9
AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe) 5
IOException (java.io.IOException) 3
Properties (java.util.Properties) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 3
Path (org.apache.hadoop.fs.Path) 3
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 3
InputSplit (org.apache.hadoop.mapred.InputSplit) 3
Reporter (org.apache.hadoop.mapred.Reporter) 3
Test (org.junit.Test) 3
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument) 2
SerDeException (org.apache.hadoop.hive.serde2.SerDeException) 2
JobConf (org.apache.hadoop.mapred.JobConf) 2
OutputStream (java.io.OutputStream) 1
ArrayList (java.util.ArrayList) 1
Map (java.util.Map) 1
TreeMap (java.util.TreeMap) 1
Schema (org.apache.avro.Schema) 1
AvroWrapper (org.apache.avro.mapred.AvroWrapper) 1
BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) 1