Example 26 with RecordIdentifier

use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

the class BucketIdResolverImpl method attachBucketIdToRecord.

@Override
public Object attachBucketIdToRecord(Object record) {
    int bucketId = computeBucketId(record);
    int bucketProperty = BucketCodec.V1.encode(new AcidOutputFormat.Options(null).bucket(bucketId));
    RecordIdentifier recordIdentifier = new RecordIdentifier(INVALID_TRANSACTION_ID, bucketProperty, INVALID_ROW_ID);
    structObjectInspector.setStructFieldData(record, recordIdentifierField, recordIdentifier);
    return record;
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier)
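For orientation, here is a minimal sketch, not taken from the Hive sources, of how RecordIdentifier keys like the one attached above are ordered. The class name and the numeric values are illustrative only; the constructor is the same (writeId, bucketProperty, rowId) one used in attachBucketIdToRecord.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Sketch only: demonstrates RecordIdentifier ordering (writeId, then bucketProperty, then rowId).
public class RecordIdentifierOrderingSketch {
    public static void main(String[] args) {
        // Arbitrary illustrative values; bucketProperty would normally come from BucketCodec encoding.
        RecordIdentifier a = new RecordIdentifier(1L, 0, 0L);
        RecordIdentifier b = new RecordIdentifier(1L, 0, 5L);
        RecordIdentifier c = new RecordIdentifier(2L, 0, 0L);

        // RecordIdentifier is comparable: within the same write id and bucket, a smaller row id
        // sorts first; a smaller write id sorts first overall.
        System.out.println(a.compareTo(b) < 0);  // expected: true
        System.out.println(b.compareTo(c) < 0);  // expected: true
    }
}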

Example 27 with RecordIdentifier

use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

the class OrcInputFormat method getReader.

@Override
public RowReader<OrcStruct> getReader(InputSplit inputSplit, Options options) throws IOException {
    final OrcSplit split = (OrcSplit) inputSplit;
    // Retrieve the acidOperationalProperties for the table, initialized in HiveInputFormat.
    AcidUtils.AcidOperationalProperties acidOperationalProperties = AcidUtils.getAcidOperationalProperties(options.getConfiguration());
    if (!acidOperationalProperties.isSplitUpdate()) {
        throw new IllegalStateException("Expected SpliUpdate table: " + split.getPath());
    }
    Map<String, AcidInputFormat.DeltaMetaData> pathToDeltaMetaData = new HashMap<>();
    final Path[] deltas = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(split, pathToDeltaMetaData);
    final Configuration conf = options.getConfiguration();
    final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, split);
    OrcRawRecordMerger.Options mergerOptions = new OrcRawRecordMerger.Options().isCompacting(false);
    mergerOptions.rootPath(split.getRootDir());
    mergerOptions.bucketPath(split.getPath());
    final int bucket;
    if (split.hasBase()) {
        AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf);
        if (acidIOOptions.getBucketId() < 0) {
            LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring");
        }
        bucket = acidIOOptions.getBucketId();
    } else {
        bucket = (int) split.getStart();
        assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath();
    }
    // todo: createOptionsForReader() assumes it's !isOriginal.... why?
    final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf);
    readOptions.range(split.getStart(), split.getLength());
    String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
    ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
    if (LOG.isDebugEnabled()) {
        LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN));
        LOG.debug("Creating merger for {} and {}", split.getPath(), Arrays.toString(deltas));
    }
    boolean fetchDeletedRows = acidOperationalProperties.isFetchDeletedRows();
    Map<String, Integer> deltaToAttemptId = AcidUtils.getDeltaToAttemptIdMap(pathToDeltaMetaData, deltas, bucket);
    final OrcRawRecordMerger records;
    if (!fetchDeletedRows) {
        records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions, deltaToAttemptId);
    } else {
        records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions, deltaToAttemptId) {

            @Override
            protected boolean collapse(RecordIdentifier recordIdentifier) {
                ((ReaderKey) recordIdentifier).setValues(prevKey.getCurrentWriteId(), prevKey.getBucketProperty(), prevKey.getRowId(), prevKey.getCurrentWriteId(), true);
                return false;
            }
        };
    }
    return new OrcRowReader(records, readOptions);
}
Also used :
Configuration (org.apache.hadoop.conf.Configuration)
HashMap (java.util.HashMap)
StatsProvidingRecordReader (org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader)
BatchToRowReader (org.apache.hadoop.hive.ql.io.BatchToRowReader)
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat)
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier)
Path (org.apache.hadoop.fs.Path)
AtomicInteger (java.util.concurrent.atomic.AtomicInteger)
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList)
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList)
AcidOperationalProperties (org.apache.hadoop.hive.ql.io.AcidUtils.AcidOperationalProperties)
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)
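A hedged sketch of how the RowReader<OrcStruct> returned by getReader might be consumed. It assumes the standard org.apache.hadoop.mapred.RecordReader contract (createKey/createValue/next/close) that AcidInputFormat.RowReader extends; the countRows method name is illustrative and not part of Hive.

import java.io.IOException;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;

public class RowReaderUsageSketch {
    // Drains a RowReader obtained from OrcInputFormat#getReader as in the example above.
    static long countRows(AcidInputFormat.RowReader<OrcStruct> rowReader) throws IOException {
        RecordIdentifier key = rowReader.createKey();   // key type is RecordIdentifier
        OrcStruct value = rowReader.createValue();
        long rows = 0;
        while (rowReader.next(key, value)) {
            // key holds (writeId, bucketProperty, rowId) for the current row,
            // merged across the base and delete deltas by OrcRawRecordMerger.
            rows++;
        }
        rowReader.close();
        return rows;
    }
}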

Example 28 with RecordIdentifier

use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

the class VectorizedOrcAcidRowBatchReader method getKeyInterval.

/**
 * Calculates the min/max record key for this split.
 * The ACID row structure in the data is:
 * <op, owid, writerId, rowid, cwid, <f1, ... fn>>
 * The +1 in the indices below accounts for the top level struct, which has its own
 * ColumnStatistics object in colStats.  The top level struct is normally
 * dropped by the Reader (presumably because of orc.impl.SchemaEvolution).
 * @param colStats The statistics array
 * @return The min/max record keys as a KeyInterval
 */
private static OrcRawRecordMerger.KeyInterval getKeyInterval(ColumnStatistics[] colStats) {
    IntegerColumnStatistics origWriteId = (IntegerColumnStatistics) colStats[OrcRecordUpdater.ORIGINAL_WRITEID + 1];
    IntegerColumnStatistics bucketProperty = (IntegerColumnStatistics) colStats[OrcRecordUpdater.BUCKET + 1];
    IntegerColumnStatistics rowId = (IntegerColumnStatistics) colStats[OrcRecordUpdater.ROW_ID + 1];
    // These asserts justify the following casts to int.
    assert bucketProperty.getMaximum() <= Integer.MAX_VALUE : "was bucketProperty (max) changed to a long (" + bucketProperty.getMaximum() + ")?!";
    assert bucketProperty.getMinimum() <= Integer.MAX_VALUE : "was bucketProperty (min) changed to a long (" + bucketProperty.getMinimum() + ")?!";
    RecordIdentifier maxKey = new RecordIdentifier(origWriteId.getMaximum(), (int) bucketProperty.getMaximum(), rowId.getMaximum());
    RecordIdentifier minKey = new RecordIdentifier(origWriteId.getMinimum(), (int) bucketProperty.getMinimum(), rowId.getMinimum());
    return new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics)
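A hedged sketch of how a KeyInterval like the one computed above could be used to decide whether a candidate RecordIdentifier can possibly occur in the split. It assumes KeyInterval exposes getMinKey()/getMaxKey() accessors with null meaning unbounded, and treats both bounds as inclusive for simplicity; the mayContain helper is illustrative, not part of Hive.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger;

public class KeyIntervalSketch {
    // Returns true when 'key' could fall inside 'interval'.
    static boolean mayContain(OrcRawRecordMerger.KeyInterval interval, RecordIdentifier key) {
        RecordIdentifier min = interval.getMinKey();
        RecordIdentifier max = interval.getMaxKey();
        boolean aboveMin = (min == null) || key.compareTo(min) >= 0;  // null lower bound: unbounded below
        boolean belowMax = (max == null) || key.compareTo(max) <= 0;  // null upper bound: unbounded above
        return aboveMin && belowMax;
    }
}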

Example 29 with RecordIdentifier

use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

the class OrcRawRecordMerger method discoverKeyBounds.

/**
 * Find the key range for the split (of the base).  These bounds are used to filter delta files,
 * since both the base and the deltas are sorted by key.
 * @param reader the reader for the base file
 * @param options the read options, which carry the split's offset and length
 * @return the min/max keys for the split; a null bound means the range is open on that side
 * @throws IOException if the stripe information cannot be read
 */
private KeyInterval discoverKeyBounds(Reader reader, Reader.Options options) throws IOException {
    RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
    long offset = options.getOffset();
    long maxOffset = options.getMaxOffset();
    int firstStripe = 0;
    int stripeCount = 0;
    boolean isTail = true;
    RecordIdentifier minKey = null;
    RecordIdentifier maxKey = null;
    List<StripeInformation> stripes = reader.getStripes();
    for (StripeInformation stripe : stripes) {
        if (offset > stripe.getOffset()) {
            firstStripe += 1;
        } else if (maxOffset > stripe.getOffset()) {
            stripeCount += 1;
        } else {
            isTail = false;
            break;
        }
    }
    if (firstStripe != 0) {
        minKey = keyIndex[firstStripe - 1];
    }
    if (!isTail) {
        maxKey = keyIndex[firstStripe + stripeCount - 1];
    }
    return new KeyInterval(minKey, maxKey);
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) StripeInformation(org.apache.orc.StripeInformation)
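The stripe-selection loop above can be hard to follow in isolation. Below is a minimal self-contained sketch, in plain Java with no Hive types, of the same firstStripe/stripeCount/isTail bookkeeping, under the assumption that stripe offsets are sorted in ascending order; the class and method names are illustrative.

public class StripeRangeSketch {
    // Mirrors the loop in discoverKeyBounds: counts leading stripes that start before
    // 'offset' (firstStripe) and stripes whose start falls in [offset, maxOffset) (stripeCount).
    // isTail stays true only when the split reaches the end of the file.
    static int[] selectStripes(long[] stripeOffsets, long offset, long maxOffset) {
        int firstStripe = 0;
        int stripeCount = 0;
        boolean isTail = true;
        for (long stripeOffset : stripeOffsets) {
            if (offset > stripeOffset) {
                firstStripe += 1;          // stripe belongs to an earlier split
            } else if (maxOffset > stripeOffset) {
                stripeCount += 1;          // stripe starts inside this split
            } else {
                isTail = false;            // there are stripes after this split
                break;
            }
        }
        return new int[] { firstStripe, stripeCount, isTail ? 1 : 0 };
    }
}

With firstStripe greater than zero, the previous stripe's last key (keyIndex[firstStripe - 1]) becomes the lower bound, and when isTail is false the last selected stripe's key becomes the upper bound, exactly as in the method above.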

Example 30 with RecordIdentifier

use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

the class TestFileSinkOperator method setupData.

private void setupData(DataFormat format) {
    Class<?> rType;
    switch(format) {
        case WITH_PARTITION_VALUE:
            rType = RowWithPartVal.class;
            break;
        case WITH_RECORD_ID:
            rType = RowWithRecID.class;
            break;
        case WITH_RECORD_ID_AND_PARTITION_VALUE:
            rType = RowWithPartNRecID.class;
            break;
        default:
            throw new RuntimeException("Unknown type");
    }
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(rType, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    rows = new ArrayList<Row>();
    Row r;
    for (int i = 0; i < 10; i++) {
        switch(format) {
            case WITH_PARTITION_VALUE:
                r = new RowWithPartVal(new Text("mary had a little lamb"), (i < 5) ? new Text("Monday") : new Text("Tuesday"));
                break;
            case WITH_RECORD_ID:
                r = new RowWithRecID(new RecordIdentifier(1, 1, i), (i < 5) ? new Text("Monday") : new Text("Tuesday"));
                break;
            case WITH_RECORD_ID_AND_PARTITION_VALUE:
                r = new RowWithPartNRecID(new Text("its fleect was white as snow"), (i < 5) ? new Text("Monday") : new Text("Tuesday"), new RecordIdentifier(1, 1, i));
                break;
            default:
                throw new RuntimeException("Unknown data format");
        }
        rows.add(r);
    }
}
Also used : RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) Text(org.apache.hadoop.io.Text)
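As a usage note for the test data above, here is a hedged sketch of a bean-style row type carrying a RecordIdentifier in the spirit of RowWithRecID; the class and field names are illustrative and not the ones defined in TestFileSinkOperator.

import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.io.Text;

// Illustrative only: a simple row holding a RecordIdentifier plus a payload column,
// shaped like the WITH_RECORD_ID rows built in setupData.
public class SimpleRowWithRecId {
    public RecordIdentifier recId;
    public Text value;

    public SimpleRowWithRecId(RecordIdentifier recId, Text value) {
        this.recId = recId;
        this.value = value;
    }

    public static SimpleRowWithRecId sample(int i) {
        // Same shape as the test data: write id 1, bucket 1, row id i.
        return new SimpleRowWithRecId(new RecordIdentifier(1, 1, i),
                new Text(i < 5 ? "Monday" : "Tuesday"));
    }
}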

Aggregations

RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 40
Test (org.junit.Test): 13
Path (org.apache.hadoop.fs.Path): 9
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 9
StripeInformation (org.apache.orc.StripeInformation): 9
Configuration (org.apache.hadoop.conf.Configuration): 7
BitSet (java.util.BitSet): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 5
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 5
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 5
RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 5
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4
Table (org.apache.hadoop.hive.metastore.api.Table): 4
VectorizedRowBatchCtx (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx): 4
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 4
ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey): 4
OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 4
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 4
ArrayList (java.util.ArrayList): 3
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 3