
Example 1 with SerDeStats

Use of org.apache.hadoop.hive.serde2.SerDeStats in project phoenix by apache.

From the class PhoenixRecordUpdater, the method getStats:

/* (non-Javadoc)
 * @see org.apache.hadoop.hive.ql.io.RecordUpdater#getStats()
 */
@Override
public SerDeStats getStats() {
    if (LOG.isDebugEnabled()) {
        LOG.debug("getStats called");
    }
    SerDeStats stats = new SerDeStats();
    stats.setRowCount(rowCountDelta);
    // Raw data size is not set here: computing it would require finding the row
    // that is being updated or deleted, which would be a mess.
    return stats;
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats)
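
For context, a caller can aggregate these per-updater deltas after a batch of writes. The helper below is only a sketch of that pattern and is not part of Phoenix or Hive; the class name UpdaterStatsAggregator and its method are illustrative, and it assumes only the RecordUpdater.getStats() contract referenced in the Javadoc above.

import java.util.List;

import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.serde2.SerDeStats;

/**
 * Hypothetical helper (not Phoenix or Hive code): sums the row-count deltas
 * reported by a set of RecordUpdaters, e.g. after a batch of inserts and deletes.
 */
public final class UpdaterStatsAggregator {

    private UpdaterStatsAggregator() {
    }

    /** Returns the total row-count delta across all updaters; null stats are skipped. */
    public static long totalRowCountDelta(List<RecordUpdater> updaters) {
        long total = 0L;
        for (RecordUpdater updater : updaters) {
            SerDeStats stats = updater.getStats();
            if (stats != null) {
                total += stats.getRowCount();
            }
        }
        return total;
    }
}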

Example 2 with SerDeStats

Use of org.apache.hadoop.hive.serde2.SerDeStats in project phoenix by apache.

From the class PhoenixRecordWriter, the method getStats:

@Override
public SerDeStats getStats() {
    if (LOG.isDebugEnabled()) {
        LOG.debug("getStats called");
    }
    SerDeStats stats = new SerDeStats();
    stats.setRowCount(rowCountDelta);
    // Raw data size is not set here: computing it would require finding the row
    // that is being updated or deleted, which would be a mess.
    return stats;
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats)
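
The rowCountDelta returned above is whatever the writer has accumulated while processing rows. The snippet below is a hypothetical sketch of that bookkeeping, not the actual Phoenix internals (the class RowCountTrackingWriter is invented for illustration): inserts increment the delta, deletes decrement it, and getStats() exposes the running total in the same way as the examples above.

import org.apache.hadoop.hive.serde2.SerDeStats;

/**
 * Illustrative sketch only: how a record writer might maintain the
 * rowCountDelta that getStats() reports.
 */
public class RowCountTrackingWriter {

    private long rowCountDelta = 0L;

    /** Called once per inserted row. */
    public void recordInsert() {
        rowCountDelta++;
    }

    /** Called once per deleted row. */
    public void recordDelete() {
        rowCountDelta--;
    }

    /** Mirrors the getStats() pattern shown above: only the row count is populated. */
    public SerDeStats getStats() {
        SerDeStats stats = new SerDeStats();
        stats.setRowCount(rowCountDelta);
        return stats;
    }
}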

Example 3 with SerDeStats

Use of org.apache.hadoop.hive.serde2.SerDeStats in project parquet-mr by apache.

From the class ParquetHiveSerDe, the method initialize:

@Override
public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
    final TypeInfo rowTypeInfo;
    final List<String> columnNames;
    final List<TypeInfo> columnTypes;
    // Get column names and sort order
    final String columnNameProperty = tbl.getProperty(IOConstants.COLUMNS);
    final String columnTypeProperty = tbl.getProperty(IOConstants.COLUMNS_TYPES);
    if (columnNameProperty.length() == 0) {
        columnNames = new ArrayList<String>();
    } else {
        columnNames = Arrays.asList(columnNameProperty.split(","));
    }
    if (columnTypeProperty.length() == 0) {
        columnTypes = new ArrayList<TypeInfo>();
    } else {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }
    if (columnNames.size() != columnTypes.size()) {
        throw new IllegalArgumentException("ParquetHiveSerde initialization failed. Number of column " + "name and column type differs. columnNames = " + columnNames + ", columnTypes = " + columnTypes);
    }
    // Create row related objects
    rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
    // Stats part
    stats = new SerDeStats();
    serializedSize = 0;
    deserializedSize = 0;
    status = LAST_OPERATION.UNKNOWN;
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)
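
The column handling in initialize() boils down to splitting two table properties into parallel name/type lists. The standalone sketch below assumes the standard Hive property names "columns" and "columns.types" (the values behind IOConstants.COLUMNS and IOConstants.COLUMNS_TYPES) and shows how TypeInfoUtils parses the colon-separated type string; the class name is hypothetical.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

/**
 * Standalone sketch of the column-parsing step: split "columns" on commas and
 * let TypeInfoUtils parse the "columns.types" string, which may contain nested types.
 */
public class ColumnPropertyParsingDemo {

    public static void main(String[] args) {
        Properties tbl = new Properties();
        tbl.setProperty("columns", "id,name,scores");
        tbl.setProperty("columns.types", "int:string:array<double>");

        List<String> columnNames = Arrays.asList(tbl.getProperty("columns").split(","));
        List<TypeInfo> columnTypes =
            TypeInfoUtils.getTypeInfosFromTypeString(tbl.getProperty("columns.types"));

        // The SerDe requires the two lists to stay in lockstep, hence the size check above.
        System.out.println("names = " + columnNames);
        System.out.println("types = " + columnTypes);
    }
}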

Example 4 with SerDeStats

Use of org.apache.hadoop.hive.serde2.SerDeStats in project hive by apache.

From the class FileSinkOperator, the method process:

@Override
public void process(Object row, int tag) throws HiveException {
    runTimeNumRows++;
    /* Create list bucketing sub-directory only if stored-as-directories is on. */
    String lbDirName = null;
    lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
    if (!bDynParts && (!filesCreated || conf.isCompactionTable())) {
        if (lbDirName != null) {
            if (valToPaths.get(lbDirName) == null) {
                createNewPaths(null, lbDirName);
            }
        } else if (conf.isCompactionTable()) {
            int bucketProperty = getBucketProperty(row);
            bucketId = BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty);
            if (!filesCreatedPerBucket.get(bucketId)) {
                createBucketFilesForCompaction(fsp);
            }
        } else {
            createBucketFiles(fsp);
        }
    }
    try {
        updateProgress();
        // if DP is enabled, get the final output writers and prepare the real output row
        assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT : "input object inspector is not struct";
        if (bDynParts) {
            // we need to read bucket number which is the last column in value (after partition columns)
            if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
                numDynParts += 1;
            }
            // copy the DP column values from the input row to dpVals
            dpVals.clear();
            dpWritables.clear();
            ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts, (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
            // pass the null value along to the escaping process to determine what the dir should be
            for (Object o : dpWritables) {
                if (o == null || o.toString().length() == 0) {
                    dpVals.add(dpCtx.getDefaultPartitionName());
                } else {
                    dpVals.add(o.toString());
                }
            }
            String invalidPartitionVal;
            if ((invalidPartitionVal = HiveStringUtils.getPartitionValWithInvalidCharacter(dpVals, dpCtx.getWhiteListPattern())) != null) {
                throw new HiveFatalException("Partition value '" + invalidPartitionVal + "' contains a character not matched by whitelist pattern '" + dpCtx.getWhiteListPattern().toString() + "'.  " + "(configure with " + HiveConf.ConfVars.METASTORE_PARTITION_NAME_WHITELIST_PATTERN.varname + ")");
            }
            fpaths = getDynOutPaths(dpVals, lbDirName);
            dynamicPartitionSpecs.add(fpaths.dpDirForCounters);
            // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
            recordValue = serializer.serialize(row, subSetOI);
        } else {
            if (lbDirName != null) {
                fpaths = valToPaths.get(lbDirName);
                if (fpaths == null) {
                    fpaths = createNewPaths(null, lbDirName);
                }
            } else {
                fpaths = fsp;
            }
            recordValue = serializer.serialize(row, inputObjInspectors[0]);
            // a buffering SerDe may return null until its batch is full (the buffer size
            // is kept track of in the SerDe); in that case there is nothing to write yet
            if (recordValue == null) {
                return;
            }
        }
        rowOutWriters = fpaths.outWriters;
        // check if all record writers implement statistics. if at least one RW
        // doesn't implement the stats interface we will fall back to the conventional
        // way of gathering stats
        isCollectRWStats = areAllTrue(statsFromRecordWriter);
        if (conf.isGatherStats() && !isCollectRWStats) {
            SerDeStats stats = serializer.getSerDeStats();
            if (stats != null) {
                fpaths.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
            }
            fpaths.addToStat(StatsSetupConst.ROW_COUNT, 1);
        }
        if ((++numRows == cntr) && LOG.isInfoEnabled()) {
            cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
            if (cntr < 0 || numRows < 0) {
                cntr = 0;
                numRows = 1;
            }
            LOG.info(toString() + ": records written - " + numRows);
        }
        int writerOffset;
        // for ACID writes the RecordUpdater expects the actual row, not a serialized
        // version of it, so below we pass the row rather than recordValue.
        if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable() || conf.isCompactionTable()) {
            writerOffset = bucketId;
            if (!conf.isCompactionTable()) {
                writerOffset = findWriterOffset(row);
            }
            rowOutWriters[writerOffset].write(recordValue);
        } else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
            fpaths.updaters[findWriterOffset(row)].insert(conf.getTableWriteId(), row);
        } else {
            // TODO I suspect we could skip much of the stuff above this in the function in the case
            // of update and delete.  But I don't understand all of the side effects of the above
            // code and don't want to skip over it yet.
            // Find the bucket id, and switch buckets if need to
            ObjectInspector rowInspector = bDynParts ? subSetOI : outputObjInspector;
            Object recId = ((StructObjectInspector) rowInspector).getStructFieldData(row, recIdField);
            int bucketProperty = bucketInspector.get(recIdInspector.getStructFieldData(recId, bucketField));
            int bucketNum = BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty);
            writerOffset = 0;
            if (multiFileSpray) {
                // see bucket_num_reducers_acid.q, TestTxnCommands.testMoreBucketsThanReducers()
                if (!bucketMap.containsKey(bucketNum)) {
                    String extraMsg = "  (no path info/)" + recId;
                    if (fpaths != null && fpaths.finalPaths != null && fpaths.finalPaths.length > 0) {
                        extraMsg = "  (finalPaths[0]=" + fpaths.finalPaths[0] + ")/" + recId;
                    }
                    throw new IllegalStateException("Found bucketNum=" + bucketNum + " from data but no mapping in 'bucketMap'." + extraMsg);
                }
                writerOffset = bucketMap.get(bucketNum);
            } else if (!isBucketed) {
                writerOffset = fpaths.createDynamicBucket(bucketNum);
            }
            if (fpaths.updaters[writerOffset] == null) {
                Integer attemptId = getAttemptIdFromTaskId(taskId);
                fpaths.updaters[writerOffset] = HiveFileFormatUtils.getAcidRecordUpdater(jc, conf.getTableInfo(), bucketNum, conf, fpaths.outPaths[writerOffset], rowInspector, reporter, 0, attemptId);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Created updater for bucket number " + bucketNum + " using file " + fpaths.outPaths[writerOffset]);
                }
            }
            if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
                fpaths.updaters[writerOffset].update(conf.getTableWriteId(), row);
            } else if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
                fpaths.updaters[writerOffset].delete(conf.getTableWriteId(), row);
            } else {
                throw new HiveException("Unknown write type " + conf.getWriteType().toString());
            }
        }
    } catch (IOException e) {
        LOG.error("Trying to close the writers as an IOException occurred: " + e.getMessage());
        closeWriters(true);
        throw new HiveException(e);
    } catch (SerDeException e) {
        closeWriters(true);
        throw new HiveException(e);
    }
}
Also used : SubStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HiveFatalException(org.apache.hadoop.hive.ql.metadata.HiveFatalException) IOException(java.io.IOException)
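
The stats branch in process() only consults the serializer when the record writers cannot report statistics themselves. The condensed sketch below (a hypothetical PerRowStatsCollector, not Hive code) isolates that fallback pattern: after each serialize call, raw data size is pulled from SerDeStats and the row count is bumped.

import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.Serializer;

/**
 * Hypothetical sketch of the fallback stats path: accumulate raw data size and
 * row count per serialized row when the writers do not collect stats.
 */
public class PerRowStatsCollector {

    private long rawDataSize = 0L;
    private long rowCount = 0L;

    /** Call once after each serializer.serialize(row, oi) invocation. */
    public void afterSerialize(Serializer serializer) {
        SerDeStats stats = serializer.getSerDeStats();
        if (stats != null) {
            rawDataSize += stats.getRawDataSize();
        }
        rowCount++;
    }

    public long getRawDataSize() {
        return rawDataSize;
    }

    public long getRowCount() {
        return rowCount;
    }
}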

Example 5 with SerDeStats

Use of org.apache.hadoop.hive.serde2.SerDeStats in project hive by apache.

From the class MapOperator, the method populateVirtualColumnValues:

public static Object[] populateVirtualColumnValues(ExecMapperContext ctx, List<VirtualColumn> vcs, Object[] vcValues, Deserializer deserializer) {
    if (vcs == null) {
        return vcValues;
    }
    if (vcValues == null) {
        vcValues = new Object[vcs.size()];
    }
    for (int i = 0; i < vcs.size(); i++) {
        switch(vcs.get(i)) {
            case FILENAME:
                if (ctx.inputFileChanged()) {
                    vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
                }
                break;
            case BLOCKOFFSET:
                {
                    long current = ctx.getIoCxt().getCurrentBlockStart();
                    LongWritable old = (LongWritable) vcValues[i];
                    if (old == null) {
                        old = new LongWritable(current);
                        vcValues[i] = old;
                        continue;
                    }
                    if (current != old.get()) {
                        old.set(current);
                    }
                }
                break;
            case ROWOFFSET:
                {
                    long current = ctx.getIoCxt().getCurrentRow();
                    LongWritable old = (LongWritable) vcValues[i];
                    if (old == null) {
                        old = new LongWritable(current);
                        vcValues[i] = old;
                        continue;
                    }
                    if (current != old.get()) {
                        old.set(current);
                    }
                }
                break;
            case RAWDATASIZE:
                long current = 0L;
                SerDeStats stats = deserializer.getSerDeStats();
                if (stats != null) {
                    current = stats.getRawDataSize();
                }
                LongWritable old = (LongWritable) vcValues[i];
                if (old == null) {
                    old = new LongWritable(current);
                    vcValues[i] = old;
                    continue;
                }
                if (current != old.get()) {
                    old.set(current);
                }
                break;
            case ROWID:
                if (ctx.getIoCxt().getRecordIdentifier() == null) {
                    vcValues[i] = null;
                } else {
                    if (vcValues[i] == null) {
                        vcValues[i] = new Object[RecordIdentifier.Field.values().length];
                    }
                    RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(), (Object[]) vcValues[i]);
                    // clear the record identifier so we don't accidentally cache the value; shouldn't
                    // happen since the IO layer either knows how to produce ROW__ID or not - but to be safe
                    ctx.getIoCxt().setRecordIdentifier(null);
                }
                break;
            case ROWISDELETED:
                vcValues[i] = new BooleanWritable(ctx.getIoCxt().isDeletedRecord());
                break;
        }
    }
    return vcValues;
}
Also used : SerDeStats(org.apache.hadoop.hive.serde2.SerDeStats) BooleanWritable(org.apache.hadoop.io.BooleanWritable) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable)
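
The BLOCKOFFSET, ROWOFFSET and RAWDATASIZE cases all follow the same writable-reuse pattern: mutate the existing LongWritable instead of allocating a new object per row. The helper below isolates that pattern; the class and method names are illustrative only.

import org.apache.hadoop.io.LongWritable;

/**
 * Minimal illustration of the writable-reuse pattern used for the long-valued
 * virtual columns above.
 */
public final class WritableReuse {

    private WritableReuse() {
    }

    /** Returns a writable holding {@code current}, reusing {@code old} when possible. */
    public static LongWritable setOrReuse(LongWritable old, long current) {
        if (old == null) {
            return new LongWritable(current);
        }
        if (current != old.get()) {
            old.set(current);
        }
        return old;
    }
}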

Aggregations

SerDeStats (org.apache.hadoop.hive.serde2.SerDeStats): 10 usages
IOException (java.io.IOException): 2 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 1 usage
HiveFatalException (org.apache.hadoop.hive.ql.metadata.HiveFatalException): 1 usage
LazyObjectInspectorParametersImpl (org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyObjectInspectorParametersImpl): 1 usage
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 1 usage
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 1 usage
SubStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector): 1 usage
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 1 usage
StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo): 1 usage
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 1 usage
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 1 usage
LongWritable (org.apache.hadoop.io.LongWritable): 1 usage
Text (org.apache.hadoop.io.Text): 1 usage
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 1 usage
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 1 usage