Use of org.apache.hadoop.hive.serde2.SerDeStats in project phoenix by apache.
The class PhoenixRecordUpdater, method getStats.
/* (non-Javadoc)
* @see org.apache.hadoop.hive.ql.io.RecordUpdater#getStats()
*/
@Override
public SerDeStats getStats() {
if (LOG.isDebugEnabled()) {
LOG.debug("getStats called");
}
SerDeStats stats = new SerDeStats();
stats.setRowCount(rowCountDelta);
// Raw data size is left unset; calculating it would require finding the row we are updating or deleting, which would be a mess.
return stats;
}
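For context, a minimal sketch of how a caller might consume the counters returned above. The logUpdaterStats helper and the updater variable are illustrative, not part of the Phoenix code; SerDeStats itself only exposes simple getters and setters for row count and raw data size.
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.serde2.SerDeStats;
// hypothetical helper: report the delta an updater has accumulated so far
static void logUpdaterStats(RecordUpdater updater) {
    SerDeStats stats = updater.getStats();
    if (stats != null) {
        // PhoenixRecordUpdater fills rowCount from rowCountDelta; raw data size stays at its default
        System.out.println("rows changed: " + stats.getRowCount()
            + ", raw data size: " + stats.getRawDataSize());
    }
}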
Use of org.apache.hadoop.hive.serde2.SerDeStats in project phoenix by apache.
The class PhoenixRecordWriter, method getStats.
@Override
public SerDeStats getStats() {
if (LOG.isDebugEnabled()) {
LOG.debug("getStats called");
}
SerDeStats stats = new SerDeStats();
stats.setRowCount(rowCountDelta);
// Raw data size is left unset; calculating it would require finding the row we are updating or deleting, which would be a mess.
return stats;
}
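Both Phoenix classes follow the same pattern: a counter is incremented as rows are written and handed back through a fresh SerDeStats. A hedged sketch of that pattern, with illustrative names rather than the actual Phoenix fields:
import org.apache.hadoop.hive.serde2.SerDeStats;
// illustrative pattern only: count rows as they are written and report the
// running delta via getStats()
class CountingWriter {
    private long rowCountDelta = 0L;

    void write(Object row) {
        // ... hand the row to the underlying Phoenix upsert statement (omitted) ...
        rowCountDelta++;
    }

    SerDeStats getStats() {
        SerDeStats stats = new SerDeStats();
        stats.setRowCount(rowCountDelta);
        return stats;
    }
}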
Use of org.apache.hadoop.hive.serde2.SerDeStats in project parquet-mr by apache.
The class ParquetHiveSerDe, method initialize.
@Override
public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
final TypeInfo rowTypeInfo;
final List<String> columnNames;
final List<TypeInfo> columnTypes;
// Get column names and sort order
final String columnNameProperty = tbl.getProperty(IOConstants.COLUMNS);
final String columnTypeProperty = tbl.getProperty(IOConstants.COLUMNS_TYPES);
if (columnNameProperty.length() == 0) {
columnNames = new ArrayList<String>();
} else {
columnNames = Arrays.asList(columnNameProperty.split(","));
}
if (columnTypeProperty.length() == 0) {
columnTypes = new ArrayList<TypeInfo>();
} else {
columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
}
if (columnNames.size() != columnTypes.size()) {
throw new IllegalArgumentException("ParquetHiveSerde initialization failed. Number of column " + "name and column type differs. columnNames = " + columnNames + ", columnTypes = " + columnTypes);
}
// Create row related objects
rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
// Stats part
stats = new SerDeStats();
serializedSize = 0;
deserializedSize = 0;
status = LAST_OPERATION.UNKNOWN;
}
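The fields set up here (stats, serializedSize, deserializedSize, status) are typically reported back through getSerDeStats(). A hedged sketch of that companion method, following the usual Hive SerDe convention and reusing the fields from the snippet above; the exact parquet-mr implementation may differ:
@Override
public SerDeStats getSerDeStats() {
    // stats are only meaningful after a serialize() or deserialize() call
    assert (status != LAST_OPERATION.UNKNOWN);
    if (status == LAST_OPERATION.SERIALIZE) {
        stats.setRawDataSize(serializedSize);
    } else {
        stats.setRawDataSize(deserializedSize);
    }
    return stats;
}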
Use of org.apache.hadoop.hive.serde2.SerDeStats in project hive by apache.
The class FileSinkOperator, method process.
@Override
public void process(Object row, int tag) throws HiveException {
runTimeNumRows++;
/* Create list bucketing sub-directory only if stored-as-directories is on. */
String lbDirName = null;
lbDirName = (lbCtx == null) ? null : generateListBucketingDirName(row);
if (!bDynParts && (!filesCreated || conf.isCompactionTable())) {
if (lbDirName != null) {
if (valToPaths.get(lbDirName) == null) {
createNewPaths(null, lbDirName);
}
} else if (conf.isCompactionTable()) {
int bucketProperty = getBucketProperty(row);
bucketId = BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty);
if (!filesCreatedPerBucket.get(bucketId)) {
createBucketFilesForCompaction(fsp);
}
} else {
createBucketFiles(fsp);
}
}
try {
updateProgress();
// if DP is enabled, get the final output writers and prepare the real output row
assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT : "input object inspector is not struct";
if (bDynParts) {
// we need to read bucket number which is the last column in value (after partition columns)
if (conf.getDpSortState().equals(DPSortState.PARTITION_BUCKET_SORTED)) {
numDynParts += 1;
}
// copy the DP column values from the input row to dpVals
dpVals.clear();
dpWritables.clear();
ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts, (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
// pass the null value along to the escaping process to determine what the dir should be
for (Object o : dpWritables) {
if (o == null || o.toString().length() == 0) {
dpVals.add(dpCtx.getDefaultPartitionName());
} else {
dpVals.add(o.toString());
}
}
String invalidPartitionVal;
if ((invalidPartitionVal = HiveStringUtils.getPartitionValWithInvalidCharacter(dpVals, dpCtx.getWhiteListPattern())) != null) {
throw new HiveFatalException("Partition value '" + invalidPartitionVal + "' contains a character not matched by whitelist pattern '" + dpCtx.getWhiteListPattern().toString() + "'. " + "(configure with " + HiveConf.ConfVars.METASTORE_PARTITION_NAME_WHITELIST_PATTERN.varname + ")");
}
fpaths = getDynOutPaths(dpVals, lbDirName);
dynamicPartitionSpecs.add(fpaths.dpDirForCounters);
// use SubStructObjectInspector to serialize the non-partitioning columns in the input row
recordValue = serializer.serialize(row, subSetOI);
} else {
if (lbDirName != null) {
fpaths = valToPaths.get(lbDirName);
if (fpaths == null) {
fpaths = createNewPaths(null, lbDirName);
}
} else {
fpaths = fsp;
}
recordValue = serializer.serialize(row, inputObjInspectors[0]);
// some SerDes buffer rows internally and return null until the buffer is full (the buffer size is kept track of in the SerDe)
if (recordValue == null) {
return;
}
}
rowOutWriters = fpaths.outWriters;
// check if all record writers implement statistics. if at least one RW
// doesn't implement the stats interface we will fall back to the conventional way
// of gathering stats
isCollectRWStats = areAllTrue(statsFromRecordWriter);
if (conf.isGatherStats() && !isCollectRWStats) {
SerDeStats stats = serializer.getSerDeStats();
if (stats != null) {
fpaths.addToStat(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize());
}
fpaths.addToStat(StatsSetupConst.ROW_COUNT, 1);
}
if ((++numRows == cntr) && LOG.isInfoEnabled()) {
cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
if (cntr < 0 || numRows < 0) {
cntr = 0;
numRows = 1;
}
LOG.info(toString() + ": records written - " + numRows);
}
int writerOffset;
// RecordUpdater expects the actual row, not a serialized version of it, so the ACID branches below pass the row rather than recordValue.
if (conf.getWriteType() == AcidUtils.Operation.NOT_ACID || conf.isMmTable() || conf.isCompactionTable()) {
writerOffset = bucketId;
if (!conf.isCompactionTable()) {
writerOffset = findWriterOffset(row);
}
rowOutWriters[writerOffset].write(recordValue);
} else if (conf.getWriteType() == AcidUtils.Operation.INSERT) {
fpaths.updaters[findWriterOffset(row)].insert(conf.getTableWriteId(), row);
} else {
// TODO I suspect we could skip much of the stuff above this in the function in the case
// of update and delete. But I don't understand all of the side effects of the above
// code and don't want to skip over it yet.
// Find the bucket id, and switch buckets if need to
ObjectInspector rowInspector = bDynParts ? subSetOI : outputObjInspector;
Object recId = ((StructObjectInspector) rowInspector).getStructFieldData(row, recIdField);
int bucketProperty = bucketInspector.get(recIdInspector.getStructFieldData(recId, bucketField));
int bucketNum = BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty);
writerOffset = 0;
if (multiFileSpray) {
// bucket_num_reducers_acid.q, TestTxnCommands.testMoreBucketsThanReducers()
if (!bucketMap.containsKey(bucketNum)) {
String extraMsg = " (no path info/)" + recId;
if (fpaths != null && fpaths.finalPaths != null && fpaths.finalPaths.length > 0) {
extraMsg = " (finalPaths[0]=" + fpaths.finalPaths[0] + ")/" + recId;
}
throw new IllegalStateException("Found bucketNum=" + bucketNum + " from data but no mapping in 'bucketMap'." + extraMsg);
}
writerOffset = bucketMap.get(bucketNum);
} else if (!isBucketed) {
writerOffset = fpaths.createDynamicBucket(bucketNum);
}
if (fpaths.updaters[writerOffset] == null) {
Integer attemptId = getAttemptIdFromTaskId(taskId);
fpaths.updaters[writerOffset] = HiveFileFormatUtils.getAcidRecordUpdater(jc, conf.getTableInfo(), bucketNum, conf, fpaths.outPaths[writerOffset], rowInspector, reporter, 0, attemptId);
if (LOG.isDebugEnabled()) {
LOG.debug("Created updater for bucket number " + bucketNum + " using file " + fpaths.outPaths[writerOffset]);
}
}
if (conf.getWriteType() == AcidUtils.Operation.UPDATE) {
fpaths.updaters[writerOffset].update(conf.getTableWriteId(), row);
} else if (conf.getWriteType() == AcidUtils.Operation.DELETE) {
fpaths.updaters[writerOffset].delete(conf.getTableWriteId(), row);
} else {
throw new HiveException("Unknown write type " + conf.getWriteType().toString());
}
}
} catch (IOException e) {
LOG.error("Trying to close the writers as an IOException occurred: " + e.getMessage());
closeWriters(true);
throw new HiveException(e);
} catch (SerDeException e) {
closeWriters(true);
throw new HiveException(e);
}
}
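The isCollectRWStats branch above falls back to pulling counters from the serializer when the record writers cannot report statistics themselves. A minimal sketch of that fallback idea in isolation; gatherRowStats and the statMap parameter are illustrative, while StatsSetupConst and Serializer.getSerDeStats() are the Hive APIs used in the method above:
import java.util.Map;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.Serializer;
// illustrative only: accumulate per-path raw data size and row count from the SerDe
static void gatherRowStats(Serializer serializer, Map<String, Long> statMap) {
    SerDeStats stats = serializer.getSerDeStats();
    if (stats != null) {
        statMap.merge(StatsSetupConst.RAW_DATA_SIZE, stats.getRawDataSize(), Long::sum);
    }
    statMap.merge(StatsSetupConst.ROW_COUNT, 1L, Long::sum);
}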
Use of org.apache.hadoop.hive.serde2.SerDeStats in project hive by apache.
The class MapOperator, method populateVirtualColumnValues.
public static Object[] populateVirtualColumnValues(ExecMapperContext ctx, List<VirtualColumn> vcs, Object[] vcValues, Deserializer deserializer) {
if (vcs == null) {
return vcValues;
}
if (vcValues == null) {
vcValues = new Object[vcs.size()];
}
for (int i = 0; i < vcs.size(); i++) {
switch(vcs.get(i)) {
case FILENAME:
if (ctx.inputFileChanged()) {
vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
}
break;
case BLOCKOFFSET:
{
long current = ctx.getIoCxt().getCurrentBlockStart();
LongWritable old = (LongWritable) vcValues[i];
if (old == null) {
old = new LongWritable(current);
vcValues[i] = old;
continue;
}
if (current != old.get()) {
old.set(current);
}
}
break;
case ROWOFFSET:
{
long current = ctx.getIoCxt().getCurrentRow();
LongWritable old = (LongWritable) vcValues[i];
if (old == null) {
old = new LongWritable(current);
vcValues[i] = old;
continue;
}
if (current != old.get()) {
old.set(current);
}
}
break;
case RAWDATASIZE:
long current = 0L;
SerDeStats stats = deserializer.getSerDeStats();
if (stats != null) {
current = stats.getRawDataSize();
}
LongWritable old = (LongWritable) vcValues[i];
if (old == null) {
old = new LongWritable(current);
vcValues[i] = old;
continue;
}
if (current != old.get()) {
old.set(current);
}
break;
case ROWID:
if (ctx.getIoCxt().getRecordIdentifier() == null) {
vcValues[i] = null;
} else {
if (vcValues[i] == null) {
vcValues[i] = new Object[RecordIdentifier.Field.values().length];
}
RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(), (Object[]) vcValues[i]);
// clear the record identifier so we don't accidentally cache the value; shouldn't
// happen since the IO layer either knows how to produce ROW__ID or not - but to be safe
ctx.getIoCxt().setRecordIdentifier(null);
}
break;
case ROWISDELETED:
vcValues[i] = new BooleanWritable(ctx.getIoCxt().isDeletedRecord());
break;
}
}
return vcValues;
}
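The RAWDATASIZE case above relies on the deserializer tracking how many bytes the last record occupied and surfacing that through getSerDeStats(). A hedged sketch of that producer side; SizeTrackingDeserializer is an illustrative class, not a real Hive SerDe:
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
// illustrative only: record the raw size of each deserialized blob so the
// RAWDATASIZE virtual column can pick it up via getSerDeStats()
class SizeTrackingDeserializer {
    private final SerDeStats stats = new SerDeStats();

    Object deserialize(Writable blob) {
        if (blob instanceof BytesWritable) {
            stats.setRawDataSize(((BytesWritable) blob).getLength());
        }
        // ... actual decoding of the row would happen here ...
        return null;
    }

    SerDeStats getSerDeStats() {
        return stats;
    }
}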