use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project DataX by alibaba.
the class DFSUtil method orcFileStartRead.
public void orcFileStartRead(String sourceOrcFilePath, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
    LOG.info(String.format("Start Read orcfile [%s].", sourceOrcFilePath));
    List<ColumnEntry> column = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
    String nullFormat = readerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.NULL_FORMAT);
    StringBuilder allColumns = new StringBuilder();
    StringBuilder allColumnTypes = new StringBuilder();
    boolean isReadAllColumns = false;
    int columnIndexMax = -1;
    // Determine whether all columns should be read
    if (null == column || column.size() == 0) {
        int allColumnsCount = getAllColumnsCount(sourceOrcFilePath);
        columnIndexMax = allColumnsCount - 1;
        isReadAllColumns = true;
    } else {
        columnIndexMax = getMaxIndex(column);
    }
    for (int i = 0; i <= columnIndexMax; i++) {
        allColumns.append("col");
        allColumnTypes.append("string");
        if (i != columnIndexMax) {
            allColumns.append(",");
            allColumnTypes.append(":");
        }
    }
    if (columnIndexMax >= 0) {
        JobConf conf = new JobConf(hadoopConf);
        Path orcFilePath = new Path(sourceOrcFilePath);
        Properties p = new Properties();
        p.setProperty("columns", allColumns.toString());
        p.setProperty("columns.types", allColumnTypes.toString());
        try {
            OrcSerde serde = new OrcSerde();
            serde.initialize(conf, p);
            StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();
            InputFormat<?, ?> in = new OrcInputFormat();
            FileInputFormat.setInputPaths(conf, orcFilePath.toString());
            // If the network is disconnected, the client will retry 45 times, with a 20-second interval between retries
            // Each file is treated as a single split
            // TODO: multi-threaded reading
            InputSplit[] splits = in.getSplits(conf, 1);
            RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
            Object key = reader.createKey();
            Object value = reader.createValue();
            // Get the column information
            List<? extends StructField> fields = inspector.getAllStructFieldRefs();
            List<Object> recordFields;
            while (reader.next(key, value)) {
                recordFields = new ArrayList<Object>();
                for (int i = 0; i <= columnIndexMax; i++) {
                    Object field = inspector.getStructFieldData(value, fields.get(i));
                    recordFields.add(field);
                }
                transportOneRecord(column, recordFields, recordSender, taskPluginCollector, isReadAllColumns, nullFormat);
            }
            reader.close();
        } catch (Exception e) {
            // Message: "An exception occurred while reading data from orcfile path [%s]; please contact the system administrator."
            String message = String.format("从orcfile文件路径[%s]中读取数据发生异常,请联系系统管理员。", sourceOrcFilePath);
            LOG.error(message);
            throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
        }
    } else {
        // Message: "Please confirm that the column configuration is correct! columnIndexMax is less than 0, column: %s"
        String message = String.format("请确认您所读取的列配置正确!columnIndexMax 小于0,column:%s", JSON.toJSONString(column));
        throw DataXException.asDataXException(HdfsReaderErrorCode.BAD_CONFIG_VALUE, message);
    }
}
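For readers who want to reuse this pattern outside of DataX, the core of it is: initialize OrcSerde with "columns"/"columns.types" properties to obtain a StructObjectInspector, then drive the old mapred OrcInputFormat split by split. The sketch below is a minimal, standalone rendering of that idea. The file path and the two-column all-string schema are placeholder assumptions, and it targets the same older hive-exec API as the snippet above, where OrcSerde.initialize(Configuration, Properties) is still available.

import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcDumpSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical input file and schema; replace with real values.
        String orcFile = "/tmp/example.orc";
        JobConf conf = new JobConf();

        // Tell the SerDe which columns exist and what types they have.
        Properties props = new Properties();
        props.setProperty("columns", "col0,col1");
        props.setProperty("columns.types", "string:string");

        OrcSerde serde = new OrcSerde();
        serde.initialize(conf, props);
        StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();

        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, new Path(orcFile).toString());

        // Request one split for the file and iterate all rows in it.
        InputSplit[] splits = in.getSplits(conf, 1);
        RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();
        while (reader.next(key, value)) {
            for (StructField field : fields) {
                System.out.print(inspector.getStructFieldData(value, field) + "\t");
            }
            System.out.println();
        }
        reader.close();
    }
}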
use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project hive by apache.
the class TestCompactor method checkExpectedTxnsPresent.
private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty, String columnTypesProperty, int bucket, long min, long max, int numBuckets) throws IOException {
    ValidWriteIdList writeIdList = new ValidWriteIdList() {
        @Override
        public String getTableName() {
            return "AcidTable";
        }
        @Override
        public boolean isWriteIdValid(long writeid) {
            return true;
        }
        @Override
        public RangeResponse isWriteIdRangeValid(long minWriteId, long maxWriteId) {
            return RangeResponse.ALL;
        }
        @Override
        public String writeToString() {
            return "";
        }
        @Override
        public void readFromString(String src) {
        }
        @Override
        public Long getMinOpenWriteId() {
            return null;
        }
        @Override
        public long getHighWatermark() {
            return Long.MAX_VALUE;
        }
        @Override
        public long[] getInvalidWriteIds() {
            return new long[0];
        }
        @Override
        public boolean isValidBase(long writeid) {
            return true;
        }
        @Override
        public boolean isWriteIdAborted(long writeid) {
            return true;
        }
        @Override
        public RangeResponse isWriteIdRangeAborted(long minWriteId, long maxWriteId) {
            return RangeResponse.ALL;
        }
    };
    OrcInputFormat aif = new OrcInputFormat();
    Configuration conf = new Configuration();
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNamesProperty);
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypesProperty);
    conf.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(numBuckets));
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    AcidInputFormat.RawReader<OrcStruct> reader = aif.getRawReader(conf, true, bucket, writeIdList, base, deltas);
    RecordIdentifier identifier = reader.createKey();
    OrcStruct value = reader.createValue();
    long currentTxn = min;
    boolean seenCurrentTxn = false;
    while (reader.next(identifier, value)) {
        if (!seenCurrentTxn) {
            Assert.assertEquals(currentTxn, identifier.getWriteId());
            seenCurrentTxn = true;
        }
        if (currentTxn != identifier.getWriteId()) {
            Assert.assertEquals(currentTxn + 1, identifier.getWriteId());
            currentTxn++;
        }
    }
    Assert.assertEquals(max, currentTxn);
}
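The anonymous ValidWriteIdList above simply declares every write ID valid so that the raw reader returns all rows. Outside of a test, the same getRawReader entry point can be used to dump the synthetic ROW__ID of each ACID row. The sketch below is a rough illustration built from the calls already shown above; it assumes a Hive 3-style RecordIdentifier with getWriteId/getBucketProperty/getRowId accessors, and the class and method names are made up for the example.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;

public class AcidRowIdDumper {

    /** Prints the (writeId, bucketProperty, rowId) triple of every row visible through the raw reader. */
    public static void dumpRowIds(Path base, Path[] deltas, String columnNames, String columnTypes,
                                  int bucket, int numBuckets, ValidWriteIdList writeIds) throws IOException {
        Configuration conf = new Configuration();
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNames);
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypes);
        conf.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(numBuckets));
        HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);

        OrcInputFormat inputFormat = new OrcInputFormat();
        AcidInputFormat.RawReader<OrcStruct> reader =
            inputFormat.getRawReader(conf, true, bucket, writeIds, base, deltas);
        RecordIdentifier id = reader.createKey();
        OrcStruct row = reader.createValue();
        while (reader.next(id, row)) {
            // RecordIdentifier carries the synthetic ROW__ID assigned to each ACID row.
            System.out.println(id.getWriteId() + "/" + id.getBucketProperty() + "/" + id.getRowId() + " -> " + row);
        }
        reader.close();
    }
}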
use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project hive by apache.
the class TestStreaming method checkDataWritten.
/**
 * @deprecated use {@link #checkDataWritten2(Path, long, long, int, String, boolean, String...)} -
 * there is little value in using InputFormat directly
 */
@Deprecated
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, String... records) throws Exception {
    ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName));
    AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
    Assert.assertEquals(0, dir.getObsolete().size());
    Assert.assertEquals(0, dir.getOriginalFiles().size());
    List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
    System.out.println("Files found: ");
    for (AcidUtils.ParsedDelta pd : current) {
        System.out.println(pd.getPath().toString());
    }
    Assert.assertEquals(numExpectedFiles, current.size());
    // find the minimum and maximum write IDs across the current delta directories
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    for (AcidUtils.ParsedDelta pd : current) {
        if (pd.getMaxWriteId() > max) {
            max = pd.getMaxWriteId();
        }
        if (pd.getMinWriteId() < min) {
            min = pd.getMinWriteId();
        }
    }
    Assert.assertEquals(minTxn, min);
    Assert.assertEquals(maxTxn, max);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionPath.toString());
    job.set(BUCKET_COUNT, Integer.toString(buckets));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
    InputSplit[] splits = inf.getSplits(job, buckets);
    Assert.assertEquals(numExpectedFiles, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    for (String record : records) {
        Assert.assertEquals(true, rr.next(key, value));
        Assert.assertEquals(record, value.toString());
    }
    Assert.assertEquals(false, rr.next(key, value));
}
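Note that the helper only reads rows from splits[0] and checks them against the expected records in order. When more than one split comes back, a caller would normally walk all of them; the following is a minimal sketch of that loop, assuming job has already been configured with the same ACID and schema keys shown above.

// Walk every split produced by OrcInputFormat instead of just the first one.
// Assumes `job` is the JobConf configured exactly as in checkDataWritten above.
InputFormat inf = new OrcInputFormat();
for (InputSplit split : inf.getSplits(job, 1)) {
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(split, job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    while (rr.next(key, value)) {
        // Each row renders as a struct, e.g. "{1, hello}" for the "id,msg" / "bigint:string" schema.
        System.out.println(value);
    }
    rr.close();
}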
use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project pxf by greenplum-db.
the class ProfileFactoryTest method get.
@Test
public void get() {
    // if the user specified a vectorized ORC profile, the profile passed in as the parameter should be used, no matter what the input format is
    String profileName = ProfileFactory.get(new TextInputFormat(), false, "HiveVectorizedORC");
    assertEquals("HiveVectorizedORC", profileName);
    profileName = ProfileFactory.get(new TextInputFormat(), false, "hivevectorizedorc");
    assertEquals("hivevectorizedorc", profileName);
    // For TextInputFormat, when the table has no complex types, the HiveText profile should be used
    profileName = ProfileFactory.get(new TextInputFormat(), false);
    assertEquals("hive:text", profileName);
    // For TextInputFormat, when the table has complex types, the Hive profile should be used; HiveText doesn't support complex types yet
    profileName = ProfileFactory.get(new TextInputFormat(), true);
    assertEquals("hive", profileName);
    // For RCFileInputFormat, when the table has complex types, the HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), true);
    assertEquals("hive:rc", profileName);
    // For RCFileInputFormat, when the table has no complex types, the HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), false);
    assertEquals("hive:rc", profileName);
    // For OrcInputFormat, when the table has complex types, the HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), true);
    assertEquals("hive:orc", profileName);
    // For OrcInputFormat, when the table has no complex types, the HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), false);
    assertEquals("hive:orc", profileName);
    // For all other formats, the Hive profile should be used
    profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
    assertEquals("hive", profileName);
}
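The assertions above fully describe the selection behaviour: an explicitly requested vectorized ORC profile wins, TextInputFormat maps to hive:text unless complex types force the generic hive profile, RCFileInputFormat and OrcInputFormat map to hive:rc and hive:orc, and everything else falls back to hive. The following is a hypothetical reconstruction of that decision logic for illustration only; it is not PXF's actual ProfileFactory code, and the chooseProfile name is made up.

import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

public class ProfileSelectionSketch {

    // Hypothetical reconstruction of the rules the test asserts; the real ProfileFactory may differ.
    public static String chooseProfile(InputFormat<?, ?> format, boolean hasComplexTypes, String requestedProfile) {
        // An explicitly requested vectorized ORC profile is returned as-is, regardless of format.
        if ("HiveVectorizedORC".equalsIgnoreCase(requestedProfile)) {
            return requestedProfile;
        }
        if (format instanceof TextInputFormat) {
            // HiveText does not support complex types, so fall back to the generic Hive profile.
            return hasComplexTypes ? "hive" : "hive:text";
        }
        if (format instanceof RCFileInputFormat) {
            return "hive:rc";
        }
        if (format instanceof OrcInputFormat) {
            return "hive:orc";
        }
        return "hive";
    }
}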