
Example 1 with StatsProvidingRecordWriter

Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter in project hive by apache.

From the class FileSinkOperator, method closeOp:

@Override
public void closeOp(boolean abort) throws HiveException {
    row_count.set(numRows);
    LOG.info(toString() + ": records written - " + numRows);
    if (!bDynParts && !filesCreated) {
        boolean skipFiles = "tez".equalsIgnoreCase(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE));
        if (skipFiles) {
            Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
            skipFiles = !StreamingOutputFormat.class.isAssignableFrom(clazz);
        }
        if (!skipFiles) {
            createBucketFiles(fsp);
        }
    }
    lastProgressReport = System.currentTimeMillis();
    if (!abort) {
        // If the serializer is ThriftJDBCBinarySerDe, it buffers rows and serializes the whole
        // batch only when the buffer is full, so serialize any remaining buffered rows here
        // (the size of the buffer is kept track of in the ThriftJDBCBinarySerDe).
        if (conf.isUsingThriftJDBCBinarySerDe()) {
            try {
                recordValue = serializer.serialize(null, inputObjInspectors[0]);
                if (null != fpaths) {
                    rowOutWriters = fpaths.outWriters;
                    rowOutWriters[0].write(recordValue);
                }
            } catch (SerDeException | IOException e) {
                throw new HiveException(e);
            }
        }
        List<Path> commitPaths = new ArrayList<>();
        for (FSPaths fsp : valToPaths.values()) {
            fsp.closeWriters(abort);
            // accumulate statistics from the record writers; these will be aggregated in case of spray writers
            if (conf.isGatherStats() && isCollectRWStats) {
                if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable()) {
                    for (int idx = 0; idx < fsp.outWriters.length; idx++) {
                        RecordWriter outWriter = fsp.outWriters[idx];
                        if (outWriter != null) {
                            SerDeStats stats = ((StatsProvidingRecordWriter) outWriter).getStats();
                            if (stats != null) {
                                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
                            }
                        }
                    }
                } else {
                    for (int i = 0; i < fsp.updaters.length; i++) {
                        if (fsp.updaters[i] != null) {
                            SerDeStats stats = fsp.updaters[i].getStats();
                            if (stats != null) {
                                fsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                                fsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
                            }
                        }
                    }
                }
            }
            if (isNativeTable()) {
                fsp.commit(fs, commitPaths);
            }
        }
        if (conf.isMmTable()) {
            Utilities.writeMmCommitManifest(commitPaths, specPath, fs, taskId, conf.getTableWriteId(), conf.getStatementId(), unionPath, conf.getInsertOverwrite());
        }
        // Only publish stats if this operator's flag was set to gather stats
        if (conf.isGatherStats()) {
            publishStats();
        }
    } else {
        // Hadoop always calls close() even if an Exception was thrown in map() or reduce().
        for (FSPaths fsp : valToPaths.values()) {
            fsp.abortWriters(fs, abort, !autoDelete && isNativeTable() && !conf.isMmTable());
        }
    }
    fsp = prevFsp = null;
    super.closeOp(abort);
}
Also used : Path(org.apache.hadoop.fs.Path) SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) StatsProvidingRecordWriter(org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
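
To have FileSinkOperator collect statistics this way, a record writer only needs to implement the StatsProvidingRecordWriter interface (write, close, getStats). Below is a minimal sketch of such a writer; the CountingStatsRecordWriter class and its delegate wiring are hypothetical, only the interface methods and the SerDeStats accessors come from Hive.

import java.io.IOException;

import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.io.Writable;

// Hypothetical writer: counts rows and a rough raw data size while delegating the
// actual persistence to a wrapped writer supplied by the caller.
public class CountingStatsRecordWriter implements StatsProvidingRecordWriter {

    private final FileSinkOperator.RecordWriter delegate;
    private final SerDeStats stats = new SerDeStats();
    private long rowCount;
    private long rawDataSize;

    public CountingStatsRecordWriter(FileSinkOperator.RecordWriter delegate) {
        this.delegate = delegate;
    }

    @Override
    public void write(Writable w) throws IOException {
        delegate.write(w);
        rowCount++;
        // crude size estimate, for illustration only
        rawDataSize += w.toString().length();
    }

    @Override
    public void close(boolean abort) throws IOException {
        delegate.close(abort);
    }

    @Override
    public SerDeStats getStats() {
        // closeOp() above reads these values and adds them to the FSPaths stats
        // under StatsSetupConst.ROW_COUNT and StatsSetupConst.RAW_DATA_SIZE.
        stats.setRowCount(rowCount);
        stats.setRawDataSize(rawDataSize);
        return stats;
    }
}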

Example 2 with StatsProvidingRecordWriter

Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter in project hive by apache.

From the class FileSinkOperator, method getDynOutPaths:

protected FSPaths getDynOutPaths(List<String> row, String lbDirName) throws HiveException {
    FSPaths fp;
    // get the path corresponding to the dynamic partition columns,
    String dpDir = getDynPartDirectory(row, dpColNames);
    String pathKey = null;
    if (dpDir != null) {
        dpDir = appendToSource(lbDirName, dpDir);
        pathKey = dpDir;
        if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
            String buckNum = row.get(row.size() - 1);
            taskId = Utilities.replaceTaskIdFromFilename(taskId, buckNum);
            pathKey = appendToSource(taskId, dpDir);
        }
        FSPaths fsp2 = valToPaths.get(pathKey);
        if (fsp2 == null) {
            // check the number of dynamic partitions created so far on this node
            // (see also the CREATED_DYNAMIC_PARTITION counter)
            if (valToPaths.size() > maxPartitions) {
                // we cannot proceed and need to tell the hive client that retries won't succeed either
                throw new HiveFatalException(ErrorMsg.DYNAMIC_PARTITIONS_TOO_MANY_PER_NODE_ERROR.getErrorCodedMsg() + "Maximum was set to " + maxPartitions + " partitions per node" + ", number of dynamic partitions on this node: " + valToPaths.size());
            }
            if (!conf.getDpSortState().equals(DPSortState.NONE) && prevFsp != null) {
                // close the previous fsp as it is no longer needed
                prevFsp.closeWriters(false);
                // since we are closing the previous fsp's record writers, get the
                // stats from the record writer and store in the previous fsp that is cached
                if (conf.isGatherStats() && isCollectRWStats) {
                    SerDeStats stats = null;
                    if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable()) {
                        RecordWriter outWriter = prevFsp.outWriters[0];
                        if (outWriter != null) {
                            stats = ((StatsProvidingRecordWriter) outWriter).getStats();
                        }
                    } else if (prevFsp.updaters[0] != null) {
                        stats = prevFsp.updaters[0].getStats();
                    }
                    if (stats != null) {
                        prevFsp.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
                        prevFsp.addToStat(StatsSetupConst.ROW_COUNT, stats.getRowCount());
                    }
                }
                // let writers release the memory for garbage collection
                prevFsp.outWriters[0] = null;
                prevFsp = null;
            }
            fsp2 = createNewPaths(dpDir);
            if (prevFsp == null) {
                prevFsp = fsp2;
            }
            if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
                createBucketForFileIdx(fsp2, 0);
                valToPaths.put(pathKey, fsp2);
            }
        }
        fp = fsp2;
    } else {
        fp = fsp;
    }
    return fp;
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) StatsProvidingRecordWriter(org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter) HiveFatalException(org.apache.hadoop.hive.ql.metadata.HiveFatalException)
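
The branching above (record writer stats for non-ACID/MM writes, RecordUpdater stats otherwise) can be factored into a small helper. The RotationStats class and its harvest method below are hypothetical; RecordUpdater.getStats() and the StatsProvidingRecordWriter cast follow the same calls used in getDynOutPaths.

import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter;
import org.apache.hadoop.hive.serde2.SerDeStats;

// Hypothetical helper mirroring the branching above: for non-ACID/MM writes the stats
// come from the record writer, for ACID writes from the RecordUpdater.
public final class RotationStats {

    private RotationStats() {
    }

    // Returns the stats of whichever handle is in use, or null if none are available.
    public static SerDeStats harvest(boolean useWriterStats, RecordWriter writer, RecordUpdater updater) {
        if (useWriterStats) {
            if (writer instanceof StatsProvidingRecordWriter) {
                return ((StatsProvidingRecordWriter) writer).getStats();
            }
            return null;
        }
        return updater != null ? updater.getStats() : null;
    }
}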

Example 3 with StatsProvidingRecordWriter

Use of org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter in project hive by apache.

From the class FileSinkOperator, method createBucketForFileIdx:

protected void createBucketForFileIdx(FSPaths fsp, int filesIdx) throws HiveException {
    try {
        fsp.initializeBucketPaths(filesIdx, taskId, isNativeTable(), isSkewedStoredAsSubDirectories);
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("createBucketForFileIdx " + filesIdx + ": final path " + fsp.finalPaths[filesIdx] + "; out path " + fsp.outPaths[filesIdx] + " (spec path " + specPath + ", tmp path " + fsp.getTmpPath() + ", task " + taskId + ")");
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("New Final Path: FS " + fsp.finalPaths[filesIdx]);
        }
        if (isNativeTable() && !conf.isMmTable()) {
            // in recent hadoop versions, use deleteOnExit to clean tmp files.
            autoDelete = fs.deleteOnExit(fsp.outPaths[filesIdx]);
        }
        updateDPCounters(fsp, filesIdx);
        Utilities.copyTableJobPropertiesToConf(conf.getTableInfo(), jc);
        // If MM wants to create a new base for IOW (instead of delta dir), it should specify it here
        if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable()) {
            Path outPath = fsp.outPaths[filesIdx];
            if (conf.isMmTable() && !FileUtils.mkdir(fs, outPath.getParent(), hconf)) {
                LOG.warn("Unable to create directory with inheritPerms: " + outPath);
            }
            fsp.outWriters[filesIdx] = HiveFileFormatUtils.getHiveRecordWriter(jc, conf.getTableInfo(), outputClass, conf, outPath, reporter);
            // If the record writer provides stats, get it from there instead of the serde
            statsFromRecordWriter[filesIdx] = fsp.outWriters[filesIdx] instanceof StatsProvidingRecordWriter;
        // increment the CREATED_FILES counter
        } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
            // Only set up the updater for insert.  For update and delete we don't know until we see
            // the row.
            ObjectInspector inspector = bDynParts ? subSetOI : outputObjInspector;
            int acidBucketNum = Integer.parseInt(Utilities.getTaskIdFromFilename(taskId));
            fsp.updaters[filesIdx] = HiveFileFormatUtils.getAcidRecordUpdater(jc, conf.getTableInfo(), acidBucketNum, conf, fsp.outPaths[filesIdx], inspector, reporter, -1);
        }
        if (reporter != null) {
            reporter.incrCounter(counterGroup, Operator.HIVE_COUNTER_CREATED_FILES, 1);
        }
    } catch (IOException e) {
        throw new HiveException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SubStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) StatsProvidingRecordWriter(org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter) IOException(java.io.IOException)
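
The statsFromRecordWriter flag set here is just an instanceof check that is consulted later, at close time. Below is a minimal sketch of the same pattern outside FileSinkOperator; the StatsAwareSink class is hypothetical, while the RecordWriter and StatsProvidingRecordWriter calls are the ones used in the examples above.

import java.io.IOException;

import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter;
import org.apache.hadoop.hive.serde2.SerDeStats;

// Hypothetical sink: remember at creation time whether the writer can provide stats,
// then harvest them exactly once when the writer is closed.
public class StatsAwareSink {

    private final RecordWriter writer;
    private final boolean statsFromRecordWriter;

    public StatsAwareSink(RecordWriter writer) {
        this.writer = writer;
        // same check createBucketForFileIdx uses to populate statsFromRecordWriter[filesIdx]
        this.statsFromRecordWriter = writer instanceof StatsProvidingRecordWriter;
    }

    public void close() throws IOException {
        writer.close(false);
        if (statsFromRecordWriter) {
            SerDeStats stats = ((StatsProvidingRecordWriter) writer).getStats();
            if (stats != null) {
                System.out.println("rows=" + stats.getRowCount() + ", rawDataSize=" + stats.getRawDataSize());
            }
        }
    }
}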

Aggregations

StatsProvidingRecordWriter (org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter): 3
IOException (java.io.IOException): 2
Path (org.apache.hadoop.fs.Path): 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2
SerDeStats (org.apache.hadoop.hive.serde2.SerDeStats): 2
ArrayList (java.util.ArrayList): 1
HiveFatalException (org.apache.hadoop.hive.ql.metadata.HiveFatalException): 1
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 1
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 1
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 1
SubStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector): 1
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 1