Search in sources:

Example 1 with FileOutputCommitter

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hbase by apache.

The class MultiHFileOutputFormat, method createMultiHFileRecordWriter.

static <V extends Cell> RecordWriter<ImmutableBytesWritable, V> createMultiHFileRecordWriter(final TaskAttemptContext context) throws IOException {
    // Get the path of the output directory
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputDir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputDir.getFileSystem(conf);
    // Map of tables to writers
    final Map<ImmutableBytesWritable, RecordWriter<ImmutableBytesWritable, V>> tableWriters = new HashMap<>();
    return new RecordWriter<ImmutableBytesWritable, V>() {

        @Override
        public void write(ImmutableBytesWritable tableName, V cell) throws IOException, InterruptedException {
            RecordWriter<ImmutableBytesWritable, V> tableWriter = tableWriters.get(tableName);
            // if this is a new table, create its output directory and a dedicated writer
            if (tableWriter == null) {
                // using table name as directory name
                final Path tableOutputDir = new Path(outputDir, Bytes.toString(tableName.copyBytes()));
                fs.mkdirs(tableOutputDir);
                LOG.info("Writing Table '" + tableName.toString() + "' data into following directory" + tableOutputDir.toString());
                // Create writer for one specific table
                tableWriter = new HFileOutputFormat2.HFileRecordWriter<>(context, tableOutputDir);
                // Put table into map
                tableWriters.put(tableName, tableWriter);
            }
            // Write the cell through the table's writer; the row key argument is unused, so pass null
            tableWriter.write(null, cell);
        }

        @Override
        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            for (RecordWriter<ImmutableBytesWritable, V> writer : tableWriters.values()) {
                writer.close(c);
            }
        }
    };
}
Also used: Path (org.apache.hadoop.fs.Path), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), RecordWriter (org.apache.hadoop.mapreduce.RecordWriter), FileSystem (org.apache.hadoop.fs.FileSystem), HFileOutputFormat2 (org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2)
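
For reference, a minimal, hypothetical driver sketch showing how an output format like the one above could be wired into a MapReduce job. The job name, output path, and the assumption that MultiHFileOutputFormat lives in org.apache.hadoop.hbase.mapreduce are illustrative, and the mapper that emits (table name, cell) pairs is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MultiHFileOutputFormat; // package assumed
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MultiHFileJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = Job.getInstance(conf, "multi-table-hfile-load"); // job name is illustrative
        // The writer above expects table names as keys and cells as values,
        // so the (omitted) mapper must emit records in that shape.
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);
        job.setOutputFormatClass(MultiHFileOutputFormat.class);
        // createMultiHFileRecordWriter() resolves its work directory from this path
        // via new FileOutputCommitter(outputPath, context).getWorkPath().
        FileOutputFormat.setOutputPath(job, new Path("/tmp/multi-hfile-out")); // assumed path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}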

Example 2 with FileOutputCommitter

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hive by apache.

The class HiveHFileOutputFormat, method getHiveRecordWriter.

@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, final Progressable progressable) throws IOException {
    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }
    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);
    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), progressable);
    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(tac);
    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }
    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the HFile(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = taskAttemptOutputdir;
                for (; ; ) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                    if (files[0].isFile()) {
                        throw new IOException("No family directories found in " + taskAttemptOutputdir + ". " + "The last component in hfile path should match column family name " + columnFamilyName);
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(taskAttemptOutputdir, true);
                fs.createNewFile(taskAttemptOutputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields using Hive's default
            // field delimiter, Ctrl-A (ASCII 0x01).
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}
Also used: InterruptedIOException (java.io.InterruptedIOException), KeyValue (org.apache.hadoop.hbase.KeyValue), FileStatus (org.apache.hadoop.fs.FileStatus), Writable (org.apache.hadoop.io.Writable), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), RecordWriter (org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter), FileSystem (org.apache.hadoop.fs.FileSystem), List (java.util.List), CellComparator (org.apache.hadoop.hbase.CellComparator), Job (org.apache.hadoop.mapreduce.Job), Cell (org.apache.hadoop.hbase.Cell), Path (org.apache.hadoop.fs.Path), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), TreeMap (java.util.TreeMap), Map (java.util.Map), SortedMap (java.util.SortedMap)
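
For context, the close() logic above combines two steps that are easy to miss: FileOutputCommitter turns the job output path into a task-scoped temporary work path, and the finished HFiles are renamed from there into the user's column-family directory. The following self-contained sketch reproduces just that pattern with made-up paths and a locally constructed task-attempt context.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class WorkPathPromoteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        TaskAttemptContextImpl context = new TaskAttemptContextImpl(
                conf, new TaskAttemptID("local", 1, TaskType.REDUCE, 0, 0)); // assumed attempt id
        Path outputDir = new Path("/tmp/hfile-out");      // assumed job output path
        Path familyDir = new Path("/tmp/hfile-final/cf"); // assumed final column-family directory
        // The work path is a task-attempt-scoped directory under the output path, e.g.
        // /tmp/hfile-out/_temporary/0/_temporary/attempt_local_0001_r_000000_0
        Path workDir = new FileOutputCommitter(outputDir, context).getWorkPath();
        FileSystem fs = workDir.getFileSystem(conf);
        fs.mkdirs(workDir);
        fs.mkdirs(familyDir);
        // Promote whatever the task wrote under the work directory to the final location.
        for (FileStatus file : fs.listStatus(workDir)) {
            fs.rename(file.getPath(), new Path(familyDir, file.getPath().getName()));
        }
    }
}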

Example 3 with FileOutputCommitter

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project flink by apache.

The class HadoopOutputFormatBase, method open.

/**
 * Create the temporary output file for the Hadoop RecordWriter.
 *
 * @param taskNumber The number of the parallel instance.
 * @param numTasks The number of parallel tasks.
 * @throws java.io.IOException
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    // enforce sequential open() calls
    synchronized (OPEN_MUTEX) {
        if (Integer.toString(taskNumber + 1).length() > 6) {
            throw new IOException("Task id too large.");
        }
        this.taskNumber = taskNumber + 1;
        // for hadoop 2.2
        this.configuration.set("mapreduce.output.basename", "tmp");
        TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_" + String.format("%" + (6 - Integer.toString(taskNumber + 1).length()) + "s", " ").replace(" ", "0") + Integer.toString(taskNumber + 1) + "_0");
        this.configuration.set("mapred.task.id", taskAttemptID.toString());
        this.configuration.setInt("mapred.task.partition", taskNumber + 1);
        // for hadoop 2.2
        this.configuration.set("mapreduce.task.attempt.id", taskAttemptID.toString());
        this.configuration.setInt("mapreduce.task.partition", taskNumber + 1);
        try {
            this.context = HadoopUtils.instantiateTaskAttemptContext(this.configuration, taskAttemptID);
            this.outputCommitter = this.mapreduceOutputFormat.getOutputCommitter(this.context);
            this.outputCommitter.setupJob(HadoopUtils.instantiateJobContext(this.configuration, new JobID()));
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        this.context.getCredentials().addAll(this.credentials);
        Credentials currentUserCreds = getCredentialsFromUGI(UserGroupInformation.getCurrentUser());
        if (currentUserCreds != null) {
            this.context.getCredentials().addAll(currentUserCreds);
        }
        // compatible for hadoop 2.2.0, the temporary output directory is different from hadoop 1.2.1
        if (outputCommitter instanceof FileOutputCommitter) {
            this.configuration.set("mapreduce.task.output.dir", ((FileOutputCommitter) this.outputCommitter).getWorkPath().toString());
        }
        try {
            this.recordWriter = this.mapreduceOutputFormat.getRecordWriter(this.context);
        } catch (InterruptedException e) {
            throw new IOException("Could not create RecordWriter.", e);
        }
    }
}
Also used: TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), IOException (java.io.IOException), JobID (org.apache.hadoop.mapreduce.JobID), Credentials (org.apache.hadoop.security.Credentials)
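
A short usage sketch, modelled on the Flink Hadoop-compatibility documentation, of the wrapper whose open() method appears above. The paths, toy DataSet, and job name are assumptions, and flink-hadoop-compatibility (plus the DataSet API) must be on the classpath for the Writable types to be handled.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class FlinkHadoopOutputSketch {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<Text, IntWritable>> result =
                env.fromElements(new Tuple2<>(new Text("hello"), new IntWritable(1))); // toy data
        Job job = Job.getInstance();
        // TextOutputFormat is a FileOutputFormat, so open() above will see a
        // FileOutputCommitter and point mapreduce.task.output.dir at its work path.
        HadoopOutputFormat<Text, IntWritable> hadoopOF =
                new HadoopOutputFormat<>(new TextOutputFormat<Text, IntWritable>(), job);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/flink-hadoop-out")); // assumed path
        result.output(hadoopOF);
        env.execute("hadoop output sketch"); // job name is illustrative
    }
}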

Example 4 with FileOutputCommitter

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hive by apache.

The class RCFileMapReduceOutputFormat, method getRecordWriter.

/* (non-Javadoc)
  * @see org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#getRecordWriter(org.apache.hadoop.mapreduce.TaskAttemptContext)
  */
@Override
public org.apache.hadoop.mapreduce.RecordWriter<WritableComparable<?>, BytesRefArrayWritable> getRecordWriter(TaskAttemptContext task) throws IOException, InterruptedException {
    // FileOutputFormat.getWorkOutputPath takes TaskInputOutputContext instead of
    // TaskAttemptContext, so it can't be used here
    FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(task);
    Path outputPath = committer.getWorkPath();
    FileSystem fs = outputPath.getFileSystem(task.getConfiguration());
    if (!fs.exists(outputPath)) {
        fs.mkdirs(outputPath);
    }
    Path file = getDefaultWorkFile(task, "");
    CompressionCodec codec = null;
    if (getCompressOutput(task)) {
        Class<?> codecClass = getOutputCompressorClass(task, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, task.getConfiguration());
    }
    final RCFile.Writer out = new RCFile.Writer(fs, task.getConfiguration(), file, task, codec);
    return new RecordWriter<WritableComparable<?>, BytesRefArrayWritable>() {

        /* (non-Javadoc)
      * @see org.apache.hadoop.mapreduce.RecordWriter#write(java.lang.Object, java.lang.Object)
      */
        @Override
        public void write(WritableComparable<?> key, BytesRefArrayWritable value) throws IOException {
            out.append(value);
        }

        /* (non-Javadoc)
      * @see org.apache.hadoop.mapreduce.RecordWriter#close(org.apache.hadoop.mapreduce.TaskAttemptContext)
      */
        @Override
        public void close(TaskAttemptContext task) throws IOException, InterruptedException {
            out.close();
        }
    };
}
Also used: Path (org.apache.hadoop.fs.Path), RCFile (org.apache.hadoop.hive.ql.io.RCFile), RecordWriter (org.apache.hadoop.mapreduce.RecordWriter), BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable), WritableComparable (org.apache.hadoop.io.WritableComparable), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), FileSystem (org.apache.hadoop.fs.FileSystem), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
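
The compression branch above reads settings that a driver puts in place through FileOutputFormat's static helpers; a minimal, hedged configuration sketch (output path and codec chosen arbitrarily) looks like this.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RcfileCompressionConfigSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // Read back by getRecordWriter() via FileOutputFormat.getOutputPath()/getDefaultWorkFile().
        FileOutputFormat.setOutputPath(job, new Path("/tmp/rcfile-out")); // assumed path
        // Read back via getCompressOutput(task) and getOutputCompressorClass(task, DefaultCodec.class).
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        // The RCFile output format class itself would also be registered here, e.g.
        // job.setOutputFormatClass(RCFileMapReduceOutputFormat.class); its package is not
        // shown in the snippet, so the import is omitted.
    }
}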

Example 5 with FileOutputCommitter

Use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hive by apache.

The class DynamicPartitionFileRecordWriterContainer, method getLocalFileWriter.

@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }
    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, "Number of dynamic partitions being created " + "exceeds configured max allowable partitions[" + maxDynamicPartitions + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
        }
        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil.createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
        // Setup serDe.
        AbstractSerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }
        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
        // We skip calling checkOutputSpecs() for each partition because it can throw a
        // FileAlreadyExistsException when more than one mapper is writing to a partition
        // (see HCATALOG-490), and to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.
        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf().getOutputCommitter();
        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);
        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(), currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir", new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath().toString());
        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);
        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
        RecordWriter baseRecordWriter = baseOF.getRecordWriter(parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(), childPath.toString(), InternalUtil.createReporter(currTaskContext));
        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }
    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey), baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
Also used: Path (org.apache.hadoop.fs.Path), FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter), ArrayList (java.util.ArrayList), HCatException (org.apache.hive.hcatalog.common.HCatException), IOException (java.io.IOException), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), RecordWriter (org.apache.hadoop.mapred.RecordWriter), SerDeException (org.apache.hadoop.hive.serde2.SerDeException)
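
Two smaller pieces of the method above are worth isolating: the per-partition temporary directory comes from a FileOutputCommitter built over the partition's location, and the part-file name comes from FileOutputFormat.getUniqueFile(). A self-contained sketch with made-up paths and task-attempt IDs:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class PartitionWorkPathSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        TaskAttemptContextImpl ctx = new TaskAttemptContextImpl(
                conf, new TaskAttemptID("local", 1, TaskType.REDUCE, 3, 0)); // assumed attempt id
        Path partitionLocation = new Path("/warehouse/t/dt=2017-01-01");     // assumed partition dir
        // Equivalent of the "mapred.work.output.dir" value set above for this partition.
        Path workDir = new FileOutputCommitter(partitionLocation, ctx).getWorkPath();
        // Equivalent of the child path; "part" is the default basename used above.
        String fileName = FileOutputFormat.getUniqueFile(ctx, "part", "");
        System.out.println(new Path(workDir, fileName)); // e.g. .../_temporary/.../part-r-00003
    }
}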

Aggregations

FileOutputCommitter (org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter): 8
Path (org.apache.hadoop.fs.Path): 7
IOException (java.io.IOException): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 4
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 4
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 4
Configuration (org.apache.hadoop.conf.Configuration): 3
Map (java.util.Map): 2
TreeMap (java.util.TreeMap): 2
KeyValue (org.apache.hadoop.hbase.KeyValue): 2
ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable): 2
Job (org.apache.hadoop.mapreduce.Job): 2
JobID (org.apache.hadoop.mapreduce.JobID): 2
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 2
InterruptedIOException (java.io.InterruptedIOException): 1
ArrayList (java.util.ArrayList): 1
HashMap (java.util.HashMap): 1
List (java.util.List): 1
SortedMap (java.util.SortedMap): 1
FileStatus (org.apache.hadoop.fs.FileStatus): 1