Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project hive by apache.
From the class StreamingAssert, method readRecords.
List<Record> readRecords() throws Exception {
  if (currentDeltas.isEmpty()) {
    throw new AssertionError("No data");
  }
  InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", partitionLocation.toString());
  job.set("bucket_count", Integer.toString(table.getSd().getNumBuckets()));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
  job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
  job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
  InputSplit[] splits = inputFormat.getSplits(job, 1);
  assertEquals(1, splits.length);
  final AcidRecordReader<NullWritable, OrcStruct> recordReader =
      (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(splits[0], job, Reporter.NULL);
  NullWritable key = recordReader.createKey();
  OrcStruct value = recordReader.createValue();
  List<Record> records = new ArrayList<>();
  while (recordReader.next(key, value)) {
    RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
    Record record = new Record(
        new RecordIdentifier(recordIdentifier.getTransactionId(), recordIdentifier.getBucketId(), recordIdentifier.getRowId()),
        value.toString());
    System.out.println(record);
    records.add(record);
  }
  recordReader.close();
  return records;
}
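The same read pattern reduces to a small standalone helper. The fragment below is an illustrative sketch, not part of StreamingAssert: the id,msg column metadata is carried over from the snippet above, the validTxns string is a placeholder argument, and rows are collected as OrcStruct#toString strings instead of Record objects.

// Illustrative sketch; the column metadata and the validTxns argument are assumptions.
List<String> readAllRows(Path partitionLocation, String validTxns) throws Exception {
  InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", partitionLocation.toString());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
  job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
  job.set(ValidTxnList.VALID_TXNS_KEY, validTxns);
  List<String> rows = new ArrayList<>();
  for (InputSplit split : inputFormat.getSplits(job, 1)) {
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> reader =
        inputFormat.getRecordReader(split, job, Reporter.NULL);
    NullWritable key = reader.createKey();
    OrcStruct value = reader.createValue();
    while (reader.next(key, value)) {
      // OrcStruct#toString renders the row's fields as a string
      rows.add(value.toString());
    }
    reader.close();
  }
  return rows;
}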
Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project hive by apache.
From the class TestCompactor, method checkExpectedTxnsPresent.
private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty,
    String columnTypesProperty, int bucket, long min, long max) throws IOException {
  ValidTxnList txnList = new ValidTxnList() {

    @Override
    public boolean isTxnValid(long txnid) {
      return true;
    }

    @Override
    public RangeResponse isTxnRangeValid(long minTxnId, long maxTxnId) {
      return RangeResponse.ALL;
    }

    @Override
    public String writeToString() {
      return "";
    }

    @Override
    public void readFromString(String src) {
    }

    @Override
    public long getHighWatermark() {
      return Long.MAX_VALUE;
    }

    @Override
    public long[] getInvalidTransactions() {
      return new long[0];
    }

    @Override
    public boolean isValidBase(long txnid) {
      return true;
    }
  };
  OrcInputFormat aif = new OrcInputFormat();
  Configuration conf = new Configuration();
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNamesProperty);
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypesProperty);
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  AcidInputFormat.RawReader<OrcStruct> reader = aif.getRawReader(conf, false, bucket, txnList, base, deltas);
  RecordIdentifier identifier = reader.createKey();
  OrcStruct value = reader.createValue();
  long currentTxn = min;
  boolean seenCurrentTxn = false;
  while (reader.next(identifier, value)) {
    if (!seenCurrentTxn) {
      Assert.assertEquals(currentTxn, identifier.getTransactionId());
      seenCurrentTxn = true;
    }
    if (currentTxn != identifier.getTransactionId()) {
      Assert.assertEquals(currentTxn + 1, identifier.getTransactionId());
      currentTxn++;
    }
  }
  Assert.assertEquals(max, currentTxn);
}
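Because the anonymous ValidTxnList treats every transaction as valid, the raw reader returns rows from all supplied deltas, and the assertions check that transaction ids step from min to max without gaps. A call might look like the fragment below; the delta directory names, column metadata, bucket, and the 1..4 transaction range are hypothetical, and the null base means only the listed deltas are scanned.

// Hypothetical invocation; paths and values are illustrative, not taken from the test.
Path[] deltas = new Path[] {
  new Path("/warehouse/acidtbl/delta_0000001_0000002"),
  new Path("/warehouse/acidtbl/delta_0000003_0000004")
};
checkExpectedTxnsPresent(null, deltas, "a,b", "int:string", 0, 1L, 4L);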
Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project hive by apache.
From the class TestStreaming, method checkDataWritten.
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets,
    int numExpectedFiles, String... records) throws Exception {
  ValidTxnList txns = msClient.getValidTxns();
  AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, txns);
  Assert.assertEquals(0, dir.getObsolete().size());
  Assert.assertEquals(0, dir.getOriginalFiles().size());
  List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
  System.out.println("Files found: ");
  for (AcidUtils.ParsedDelta pd : current) {
    System.out.println(pd.getPath().toString());
  }
  Assert.assertEquals(numExpectedFiles, current.size());
  // find the minimum and maximum transactions across the current deltas
  long min = Long.MAX_VALUE;
  long max = Long.MIN_VALUE;
  for (AcidUtils.ParsedDelta pd : current) {
    if (pd.getMaxTransaction() > max)
      max = pd.getMaxTransaction();
    if (pd.getMinTransaction() < min)
      min = pd.getMinTransaction();
  }
  Assert.assertEquals(minTxn, min);
  Assert.assertEquals(maxTxn, max);
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", partitionPath.toString());
  job.set("bucket_count", Integer.toString(buckets));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
  job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
  job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
  InputSplit[] splits = inf.getSplits(job, buckets);
  Assert.assertEquals(buckets, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
      inf.getRecordReader(splits[0], job, Reporter.NULL);
  NullWritable key = rr.createKey();
  OrcStruct value = rr.createValue();
  for (int i = 0; i < records.length; i++) {
    Assert.assertEquals(true, rr.next(key, value));
    Assert.assertEquals(records[i], value.toString());
  }
  Assert.assertEquals(false, rr.next(key, value));
}
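A typical invocation could look like the fragment below. Everything here is illustrative: the partition path and transaction range are made up, and the record strings only assume OrcStruct#toString's "{field, field}" rendering of the id and msg columns.

// Hypothetical usage; the path, transaction range, and record strings are illustrative.
Path partitionPath = new Path("/warehouse/mytable/continent=Asia/country=India");
checkDataWritten(partitionPath, 1, 2, 1, 1,
    "{1, Hello streaming}",
    "{2, Welcome to streaming}");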
Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project druid by druid-io.
From the class DruidOrcInputFormatTest, method testRead.
@Test
public void testRead() throws IOException, InterruptedException {
  InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();
  reader.initialize(split, context);
  reader.nextKeyValue();
  OrcStruct data = (OrcStruct) reader.getCurrentValue();
  MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);
  Assert.assertTrue(row.getEvent().keySet().size() == 4);
  Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
  Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
  Assert.assertEquals(col1, row.getEvent().get("col1"));
  Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));
  reader.close();
}
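The test reads a single row; to drain the reader the same pattern can be looped. The helper below is a sketch that reuses the reader and parser built in the test; its name is made up, and it assumes this Druid version's parse(...) returns a single InputRow (which the cast to MapBasedInputRow above implies).

// Illustrative helper; assumes parse(...) returns one InputRow per OrcStruct.
private static List<InputRow> readAll(RecordReader reader, OrcHadoopInputRowParser parser)
    throws IOException, InterruptedException {
  List<InputRow> rows = new ArrayList<>();
  while (reader.nextKeyValue()) {
    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    rows.add(parser.parse(data));
  }
  return rows;
}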
Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project presto by prestodb.
From the class OrcFileRewriter, method uncompressedSize.
private static int uncompressedSize(Object object) throws IOException {
  if (object instanceof OrcStruct) {
    OrcStruct struct = (OrcStruct) object;
    int size = 0;
    for (int i = 0; i < struct.getNumFields(); i++) {
      size += uncompressedSize(getFieldValue(struct, i));
    }
    return size;
  }
  if ((object == null) || (object instanceof BooleanWritable)) {
    return SIZE_OF_BYTE;
  }
  if (object instanceof LongWritable) {
    return SIZE_OF_LONG;
  }
  if (object instanceof DoubleWritable) {
    return SIZE_OF_DOUBLE;
  }
  if (object instanceof Text) {
    return ((Text) object).getLength();
  }
  if (object instanceof BytesWritable) {
    return ((BytesWritable) object).getLength();
  }
  if (object instanceof List<?>) {
    int size = 0;
    for (Object element : (Iterable<?>) object) {
      size += uncompressedSize(element);
    }
    return size;
  }
  if (object instanceof Map<?, ?>) {
    int size = 0;
    for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
      size += uncompressedSize(entry.getKey());
      size += uncompressedSize(entry.getValue());
    }
    return size;
  }
  throw new IOException("Unhandled ORC object: " + object.getClass().getName());
}
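The accounting bottoms out at the Writable leaves; the lines below show what the method returns for a few simple inputs. The SIZE_OF_* values are static imports in the enclosing class (presumably io.airlift.slice.SizeOf, so the 1- and 8-byte figures below are assumptions).

// Illustrative leaf sizes, assuming SIZE_OF_BYTE = 1, SIZE_OF_LONG = 8, SIZE_OF_DOUBLE = 8.
uncompressedSize(null);                      // 1: nulls are charged one byte
uncompressedSize(new BooleanWritable(true)); // 1
uncompressedSize(new LongWritable(42L));     // 8
uncompressedSize(new DoubleWritable(1.5));   // 8
uncompressedSize(new Text("hello"));         // 5: UTF-8 byte length of the text
// An OrcStruct, List, or Map contributes the sum of its recursively sized children.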