Example 1 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class OrcRecordUpdater, method parseKeyIndex.

static RecordIdentifier[] parseKeyIndex(Reader reader) {
    String[] stripes;
    try {
        // The key index is stored as UTF-8 file metadata; utf8Decoder is a
        // static CharsetDecoder field of OrcRecordUpdater.
        ByteBuffer val = reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME).duplicate();
        stripes = utf8Decoder.decode(val).toString().split(";");
    } catch (CharacterCodingException e) {
        throw new IllegalArgumentException("Bad string encoding for " + OrcRecordUpdater.ACID_KEY_INDEX_NAME, e);
    }
    RecordIdentifier[] result = new RecordIdentifier[stripes.length];
    for (int i = 0; i < stripes.length; ++i) {
        if (stripes[i].length() != 0) {
            // One "transactionId,bucketId,rowId" triple per stripe.
            String[] parts = stripes[i].split(",");
            result[i] = new RecordIdentifier();
            result[i].setValues(Long.parseLong(parts[0]), Integer.parseInt(parts[1]), Long.parseLong(parts[2]));
        }
    }
    return result;
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), CharacterCodingException (java.nio.charset.CharacterCodingException), ByteBuffer (java.nio.ByteBuffer)
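
As a hedged usage sketch, the method above could be driven from a small harness like the following. The file path is made up, and parseKeyIndex is package-private, so this class is assumed to sit in the org.apache.hadoop.hive.ql.io.orc package; treat it as an illustration, not the project's own code.

package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class KeyIndexDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical ACID delta file; replace with a real bucket file path.
        Path orcFile = new Path("/warehouse/t/delta_0000001_0000001/bucket_00000");
        Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(new Configuration()));
        // One RecordIdentifier per stripe: the last key written in that stripe.
        RecordIdentifier[] lastKeys = OrcRecordUpdater.parseKeyIndex(reader);
        for (RecordIdentifier key : lastKeys) {
            System.out.println(key);
        }
    }
}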

Example 2 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class StreamingAssert, method readRecords.

List<Record> readRecords() throws Exception {
    if (currentDeltas.isEmpty()) {
        throw new AssertionError("No data");
    }
    InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionLocation.toString());
    // "bucket_count" is the same key as hive_metastoreConstants.BUCKET_COUNT (cf. Example 5).
    job.set("bucket_count", Integer.toString(table.getSd().getNumBuckets()));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    // Enable the ACID read path and supply the valid-transaction snapshot.
    job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
    job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    assertEquals(1, splits.length);
    final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = recordReader.createKey();
    OrcStruct value = recordReader.createValue();
    List<Record> records = new ArrayList<>();
    while (recordReader.next(key, value)) {
        // AcidRecordReader exposes the (transactionId, bucketId, rowId) key
        // of the row that was just read.
        RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
        Record record = new Record(new RecordIdentifier(recordIdentifier.getTransactionId(), recordIdentifier.getBucketId(), recordIdentifier.getRowId()), value.toString());
        System.out.println(record);
        records.add(record);
    }
    recordReader.close();
    return records;
}
Also used: ArrayList (java.util.ArrayList), AcidRecordReader (org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader), NullWritable (org.apache.hadoop.io.NullWritable), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
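
The Record type used above is a helper defined by StreamingAssert itself and not shown in this excerpt. A minimal sketch of what it presumably looks like; the field and method names here are assumptions, not the project's actual code:

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Hypothetical value holder pairing an ACID row key with the row's text form.
class Record {
    private final RecordIdentifier recordIdentifier;
    private final String row;

    Record(RecordIdentifier recordIdentifier, String row) {
        this.recordIdentifier = recordIdentifier;
        this.row = row;
    }

    RecordIdentifier getRecordIdentifier() {
        return recordIdentifier;
    }

    String getRow() {
        return row;
    }

    @Override
    public String toString() {
        return recordIdentifier + " " + row;
    }
}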

Example 3 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class MutatorCoordinator, method reconfigureState.

private void reconfigureState(OperationType operationType, List<String> newPartitionValues, Object record) throws WorkerException {
    RecordIdentifier newRecordIdentifier = extractRecordIdentifier(operationType, newPartitionValues, record);
    int newBucketId = newRecordIdentifier.getBucketProperty();
    if (newPartitionValues == null) {
        newPartitionValues = Collections.emptyList();
    }
    try {
        if (partitionHasChanged(newPartitionValues)) {
            if (table.createPartitions() && operationType == OperationType.INSERT) {
                partitionHelper.createPartitionIfNotExists(newPartitionValues);
            }
            // Point the mutator at the new partition's path.
            Path newPartitionPath = partitionHelper.getPathForPartition(newPartitionValues);
            resetMutator(newBucketId, newPartitionValues, newPartitionPath);
        } else if (bucketIdHasChanged(newBucketId)) {
            // Same partition, different bucket: reuse the current path.
            resetMutator(newBucketId, partitionValues, partitionPath);
        } else {
            // Neither changed: records must still arrive in RecordIdentifier order.
            validateRecordSequence(operationType, newRecordIdentifier);
        }
    } catch (IOException e) {
        throw new WorkerException("Failed to reset mutator when performing " + operationType + " of record: " + record, e);
    }
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), Path (org.apache.hadoop.fs.Path), IOException (java.io.IOException)
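
The change-detection helpers called above are not part of this excerpt. A plausible sketch, assuming MutatorCoordinator tracks the previous partition and bucket in fields; the names below are guesses based on the calls in the method:

import java.util.Collections;
import java.util.List;

// Hypothetical stand-ins for the coordinator's tracking state and checks.
class ChangeTracking {
    private List<String> partitionValues = Collections.emptyList();
    private int bucketId = -1;

    boolean partitionHasChanged(List<String> newPartitionValues) {
        return !partitionValues.equals(newPartitionValues);
    }

    boolean bucketIdHasChanged(int newBucketId) {
        return bucketId != newBucketId;
    }

    // The real resetMutator would also record the new position, roughly:
    void track(int newBucketId, List<String> newPartitionValues) {
        this.bucketId = newBucketId;
        this.partitionValues = newPartitionValues;
    }
}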

Example 4 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class MutatorCoordinator, method extractRecordIdentifier.

private RecordIdentifier extractRecordIdentifier(OperationType operationType, List<String> newPartitionValues, Object record) throws BucketIdException {
    RecordIdentifier recordIdentifier = recordInspector.extractRecordIdentifier(record);
    // The bucket property packs a codec version, writer (bucket) id and
    // statement id into one int; decode the writer id for comparison.
    int bucketIdFromRecord = BucketCodec.determineVersion(recordIdentifier.getBucketProperty()).decodeWriterId(recordIdentifier.getBucketProperty());
    int computedBucketId = bucketIdResolver.computeBucketId(record);
    if (operationType != OperationType.DELETE && bucketIdFromRecord != computedBucketId) {
        throw new BucketIdException("RecordIdentifier.bucketId != computed bucketId (" + computedBucketId + ") for record " + recordIdentifier + " in partition " + newPartitionValues + ".");
    }
    return recordIdentifier;
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier)
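
To see the round trip, here is a minimal sketch that encodes a writer id into a bucket property and decodes it back the same way the method above does. It assumes the BucketCodec and AcidOutputFormat.Options APIs as found in Hive 3.x; the bucket and statement ids are arbitrary.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.BucketCodec;

public class BucketCodecDemo {
    public static void main(String[] args) {
        // Encode writer (bucket) id 3 and statement id 0 with the V1 codec.
        AcidOutputFormat.Options options =
                new AcidOutputFormat.Options(new Configuration()).bucket(3).statementId(0);
        int bucketProperty = BucketCodec.V1.encode(options);
        // Decode it back as extractRecordIdentifier does; prints 3.
        int writerId = BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty);
        System.out.println("bucketProperty=" + bucketProperty + ", writerId=" + writerId);
    }
}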

Example 5 with RecordIdentifier

Use of org.apache.hadoop.hive.ql.io.RecordIdentifier in project hive by apache.

From the class StreamingAssert, method readRecords (overload taking an expected split count).

/**
 * TODO: this would be more flexible done as a SQL select statement rather than using InputFormat directly;
 * see {@link org.apache.hive.hcatalog.streaming.TestStreaming#checkDataWritten2(Path, long, long, int, String, String...)}
 * @param numSplitsExpected the number of input splits the read is expected to produce
 * @return the records read from the current deltas
 * @throws Exception on any failure while reading
 */
List<Record> readRecords(int numSplitsExpected) throws Exception {
    if (currentDeltas.isEmpty()) {
        throw new AssertionError("No data");
    }
    InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionLocation.toString());
    job.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(table.getSd().getNumBuckets()));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    // Enable the ACID read path and supply the valid write-id snapshot
    // (per-table write ids replaced global transaction ids; cf. Example 2).
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    assertEquals(numSplitsExpected, splits.length);
    List<Record> records = new ArrayList<>();
    for (InputSplit is : splits) {
        final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(is, job, Reporter.NULL);
        NullWritable key = recordReader.createKey();
        OrcStruct value = recordReader.createValue();
        while (recordReader.next(key, value)) {
            // Note the newer accessors: getWriteId()/getBucketProperty()
            // rather than getTransactionId()/getBucketId() in Example 2.
            RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
            Record record = new Record(new RecordIdentifier(recordIdentifier.getWriteId(), recordIdentifier.getBucketProperty(), recordIdentifier.getRowId()), value.toString());
            System.out.println(record);
            records.add(record);
        }
        recordReader.close();
    }
    return records;
}
Also used: ArrayList (java.util.ArrayList), AcidRecordReader (org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader), NullWritable (org.apache.hadoop.io.NullWritable), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
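
For reference, the value set under ValidWriteIdList.VALID_WRITEIDS_KEY is the serialized form of a ValidReaderWriteIdList. A hedged sketch of wiring one up by hand; the table name and high-water mark are made up, and the exact serialized layout can vary across Hive versions:

import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.mapred.JobConf;

public class WriteIdSnapshotDemo {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        // Serialized ValidReaderWriteIdList, roughly:
        // "<db.table>:<highWatermark>:<minOpenWriteId>:<openWriteIds>:<abortedWriteIds>"
        String writeIds = "default.test_table:5:" + Long.MAX_VALUE + "::";
        job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds);
        System.out.println(job.get(ValidWriteIdList.VALID_WRITEIDS_KEY));
    }
}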

Aggregations

RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 40
Test (org.junit.Test): 13
Path (org.apache.hadoop.fs.Path): 9
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 9
StripeInformation (org.apache.orc.StripeInformation): 9
Configuration (org.apache.hadoop.conf.Configuration): 7
BitSet (java.util.BitSet): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 5
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 5
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 5
RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 5
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4
Table (org.apache.hadoop.hive.metastore.api.Table): 4
VectorizedRowBatchCtx (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx): 4
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 4
ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey): 4
OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 4
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 4
ArrayList (java.util.ArrayList): 3
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 3