Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.
The class ParquetPageSourceFactory, method createParquetPushDownPageSource.
public HivePushDownPageSource createParquetPushDownPageSource(Path path, long start, long length, com.huawei.boostkit.omnidata.model.Predicate predicate) {
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();

    // point the OmniData client at the configured push-down server
    Properties transProperties = new Properties();
    transProperties.put(OMNIDATA_CLIENT_TARGET_LIST, omniDataServerTarget);

    // describe the Parquet split that the OmniData server should read
    DataSource parquetPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource(path.toString(), start, length, false);
    TaskSource readTaskInfo = new TaskSource(parquetPushDownDataSource, predicate, TaskSource.ONE_MEGABYTES);

    // the DataReader streams back Pages deserialized with the openLooKeng deserializer
    DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
    return new HivePushDownPageSource(dataReader, systemMemoryUsage);
}
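For context, a page source produced by this factory would typically be drained like any other openLooKeng ConnectorPageSource. The following sketch is illustrative only: it assumes HivePushDownPageSource follows the usual getNextPage()/isFinished()/close() contract, and processPage is a hypothetical downstream consumer.

// Hypothetical consumption sketch (assumes the standard ConnectorPageSource contract).
try (HivePushDownPageSource pageSource = createParquetPushDownPageSource(path, start, length, predicate)) {
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage(); // each Page is streamed back from the OmniData server
        if (page != null) {
            processPage(page); // hypothetical downstream handler
        }
    }
}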
Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.
The class ParquetPageSourceFactory, method createParquetPageSource.
public static ParquetPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long fileSize,
        Properties schema,
        List<HiveColumnHandle> columns,
        boolean useParquetColumnNames,
        boolean failOnCorruptedParquetStatistics,
        DataSize maxReadBlockSize,
        TypeManager typeManager,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        FileFormatDataSourceStats stats,
        DateTimeZone timeZone) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    DateTimeZone readerTimeZone = timeZone;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.open(path));
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);

        // honor the writer's time zone when it differs from the reader's
        String writerTimeZoneId = fileMetaData.getKeyValueMetaData().get(WRITER_TIME_ZONE_KEY);
        if (writerTimeZoneId != null && !writerTimeZoneId.equalsIgnoreCase(readerTimeZone.getID())) {
            readerTimeZone = DateTimeZone.forID(writerTimeZoneId);
        }

        // project only the requested regular columns that exist in the file schema
        List<org.apache.parquet.schema.Type> fields = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getParquetType(column, fileSchema, useParquetColumnNames))
                .filter(Objects::nonNull)
                .collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

        // keep only the row groups whose first data page falls inside this split
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }

        // prune row groups that cannot match the effective predicate
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        for (BlockMetaData block : footerBlocks.build()) {
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) {
                blocks.add(block);
            }
        }

        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks.build(), dataSource, readerTimeZone, systemMemoryContext, maxReadBlockSize);
        return new ParquetPageSource(parquetReader, fileSchema, messageColumnIO, typeManager, schema, columns, effectivePredicate, useParquetColumnNames);
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
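The getParquetType call above resolves each Hive column against the file schema either by name or by ordinal position. The lookup below is a simplified sketch of that helper, written here as an assumption (it is close to the upstream openLooKeng/Presto behavior but ignores case-insensitive name matching):

// Simplified sketch of the getParquetType lookup used above (assumed behavior).
private static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType fileSchema, boolean useParquetColumnNames) {
    if (useParquetColumnNames) {
        // match by column name; return null when the file does not contain the column
        return fileSchema.containsField(column.getName()) ? fileSchema.getType(column.getName()) : null;
    }
    if (column.getHiveColumnIndex() < fileSchema.getFieldCount()) {
        // match by position for files whose column names cannot be trusted
        return fileSchema.getType(column.getHiveColumnIndex());
    }
    return null;
}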
Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.
The class OrcPageSourceFactory, method createOrcPushDownPageSource.
public HivePushDownPageSource createOrcPushDownPageSource(Path path, long start, long length, Predicate predicate) {
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();

    // point the OmniData client at the configured push-down server
    Properties transProperties = new Properties();
    transProperties.put(OMNIDATA_CLIENT_TARGET_LIST, omniDataServerTarget);

    // describe the ORC split that the OmniData server should read
    DataSource orcPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource(path.toString(), start, length, false);
    TaskSource readTaskInfo = new TaskSource(orcPushDownDataSource, predicate, TaskSource.ONE_MEGABYTES);

    // the DataReader streams back Pages deserialized with the openLooKeng deserializer
    DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
    return new HivePushDownPageSource(dataReader, systemMemoryUsage);
}
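Apart from constructing an HdfsOrcDataSource instead of an HdfsParquetDataSource, this method mirrors the Parquet variant above. A caller could therefore dispatch on the split's file format, much like DataIoAdapter#initDataSource further below; the variable names in this sketch (fileFormat, orcPageSourceFactory, parquetPageSourceFactory, path, start, length, predicate) are hypothetical.

// Hypothetical dispatch sketch: pick the push-down page source by file format.
HivePushDownPageSource pushDownSource;
if ("ORC".equalsIgnoreCase(fileFormat)) {
    pushDownSource = orcPageSourceFactory.createOrcPushDownPageSource(path, start, length, predicate);
} else if ("PARQUET".equalsIgnoreCase(fileFormat)) {
    pushDownSource = parquetPageSourceFactory.createParquetPushDownPageSource(path, start, length, predicate);
} else {
    throw new UnsupportedOperationException("unsupported push-down file format: " + fileFormat);
}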
Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.
The class DataIoAdapter, method getPageIterator.
/**
 * Contacts the OmniData server and fetches the pushed-down result pages.
 *
 * @param pageCandidate file split info
 * @param sparkOutPut data schema
 * @param partitionColumn partition columns
 * @param filterOutPut filter schema
 * @param pushDownOperators push-down expressions
 * @return iterator over WritableColumnVector[] result data
 * @throws TaskExecutionException when no OmniData server can be connected to
 * @notice third-party APIs throw the base Exception, so this method has to catch Exception
 */
public Iterator<WritableColumnVector[]> getPageIterator(
        PageCandidate pageCandidate,
        Seq<Attribute> sparkOutPut,
        Seq<Attribute> partitionColumn,
        Seq<Attribute> filterOutPut,
        PushDownInfo pushDownOperators) throws TaskExecutionException, UnknownHostException {
    // initialize column and filter candidates for this split
    initCandidates(pageCandidate, filterOutPut);

    // collect partition column names
    List<Attribute> partitionColumnBatch = JavaConverters.seqAsJavaList(partitionColumn);
    for (Attribute attribute : partitionColumnBatch) {
        partitionColumnName.add(attribute.name());
    }

    // aggregation push-down candidates; without aggregations, use the plain output schema
    List<AggExeInfo> aggExecutionList = JavaConverters.seqAsJavaList(pushDownOperators.aggExecutions());
    if (aggExecutionList.size() == 0) {
        initColumnInfo(sparkOutPut);
    }

    DataSource dataSource = initDataSource(pageCandidate);
    RowExpression rowExpression = initFilter(pushDownOperators.filterExecutions());
    Optional<RowExpression> prestoFilter = rowExpression == null ? Optional.empty() : Optional.of(rowExpression);
    Optional<AggregationInfo> aggregations = initAggAndGroupInfo(aggExecutionList);

    // convert the limit push-down info, if any
    OptionalLong limitLong = NdpUtils.convertLimitExeInfo(pushDownOperators.limitExecution());

    Predicate predicate = new Predicate(omnidataTypes, omnidataColumns, prestoFilter, omnidataProjections, ImmutableMap.of(), ImmutableMap.of(), aggregations, limitLong);
    TaskSource taskSource = new TaskSource(dataSource, predicate, 1048576);
    SparkDeserializer deserializer = initSparkDeserializer();

    WritableColumnVector[] page = null;
    int failedTimes = 0;

    // try a randomly chosen push-down host first, then fall back to the remaining hosts on failure
    String[] sdiHostArray = pageCandidate.getSdiHosts().split(",");
    int randomIndex = (int) (Math.random() * sdiHostArray.length);
    Iterator<String> sdiHosts = Arrays.stream(sdiHostArray).iterator();
    Set<String> sdiHostSet = new HashSet<>();
    sdiHostSet.add(sdiHostArray[randomIndex]);
    while (sdiHosts.hasNext()) {
        String sdiHost;
        if (failedTimes == 0) {
            sdiHost = sdiHostArray[randomIndex];
        } else {
            sdiHost = sdiHosts.next();
            if (sdiHostSet.contains(sdiHost)) {
                continue;
            }
        }
        String ipAddress = InetAddress.getByName(sdiHost).getHostAddress();
        Properties properties = new Properties();
        properties.put("omnidata.client.target.list", ipAddress);
        LOG.info("Push down node info: [hostname :{} ,ip :{}]", sdiHost, ipAddress);
        try {
            orcDataReader = new DataReaderImpl<SparkDeserializer>(properties, taskSource, deserializer);
            hasNextPage = true;
            page = (WritableColumnVector[]) orcDataReader.getNextPageBlocking();
            if (orcDataReader.isFinished()) {
                orcDataReader.close();
                hasNextPage = false;
            }
            break;
        } catch (OmniDataException omniDataException) {
            OmniErrorCode errorCode = omniDataException.getErrorCode();
            switch (errorCode) {
                case OMNIDATA_INSUFFICIENT_RESOURCES:
                    LOG.warn("OMNIDATA_INSUFFICIENT_RESOURCES: OmniData-server's push down queue is full, begin to find next OmniData-server");
                    break;
                case OMNIDATA_UNSUPPORTED_OPERATOR:
                    LOG.warn("OMNIDATA_UNSUPPORTED_OPERATOR: OmniDataException: exist unsupported operator");
                    break;
                case OMNIDATA_GENERIC_ERROR:
                    LOG.warn("OMNIDATA_GENERIC_ERROR: Current OmniData-server unavailable, begin to find next OmniData-server");
                    break;
                case OMNIDATA_NOT_FOUND:
                    LOG.warn("OMNIDATA_NOT_FOUND: Current OmniData-Server not found, begin to find next OmniData-server");
                    break;
                case OMNIDATA_INVALID_ARGUMENT:
                    LOG.warn("OMNIDATA_INVALID_ARGUMENT: INVALID_ARGUMENT, exist unsupported operator or dataType");
                    break;
                case OMNIDATA_IO_ERROR:
                    LOG.warn("OMNIDATA_IO_ERROR: Current OmniData-Server io exception, begin to find next OmniData-server");
                    break;
                default:
                    LOG.warn("OmniDataException: OMNIDATA_ERROR.");
            }
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress);
            ++failedTimes;
        } catch (Exception e) {
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress, e);
            ++failedTimes;
        }
    }

    // give up once every reachable host has been tried
    int retryTime = Math.min(TASK_FAILED_TIMES, sdiHostArray.length);
    if (failedTimes >= retryTime) {
        LOG.warn("No Omni-data-server to Connect, Task has tried {} times.", retryTime);
        throw new TaskExecutionException("No Omni-data-server to Connect");
    }

    List<WritableColumnVector[]> l = new ArrayList<>();
    l.add(page);
    return l.iterator();
}
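The retry loop above attempts one randomly chosen push-down host first and only then walks the remaining hosts, skipping the one already tried. The self-contained sketch below restates just that ordering logic; the helper name is illustrative and not part of DataIoAdapter.

// Illustrative restatement of the host-selection order used in getPageIterator.
static List<String> pushDownHostOrder(String[] sdiHostArray) {
    int randomIndex = (int) (Math.random() * sdiHostArray.length);
    String firstHost = sdiHostArray[randomIndex];
    List<String> order = new ArrayList<>();
    order.add(firstHost);               // first attempt: a randomly chosen host
    for (String host : sdiHostArray) {
        if (!host.equals(firstHost)) {  // then every other host, in declaration order
            order.add(host);
        }
    }
    return order;
}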
Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.
The class DataIoAdapter, method initDataSource.
private DataSource initDataSource(PageCandidate pageCandidate) throws UnsupportedOperationException {
    DataSource dataSource;
    String fileFormat = pageCandidate.getFileFormat();
    Long fileStartPos = pageCandidate.getStartPos();
    Long fileLen = pageCandidate.getSplitLen();
    if ("ORC".equalsIgnoreCase(fileFormat)) {
        dataSource = new HdfsOrcDataSource(filePath, fileStartPos, fileLen, false);
    } else if ("PARQUET".equalsIgnoreCase(fileFormat)) {
        dataSource = new HdfsParquetDataSource(filePath, fileStartPos, fileLen, false);
    } else {
        throw new UnsupportedOperationException("unsupported data format : " + fileFormat);
    }
    return dataSource;
}
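For reference, getPageIterator above uses this helper roughly as follows; the snippet only restates lines already shown there and assumes the same pageCandidate and predicate values.

// Usage as in getPageIterator: unsupported formats surface as UnsupportedOperationException.
DataSource dataSource = initDataSource(pageCandidate);
TaskSource taskSource = new TaskSource(dataSource, predicate, 1048576); // 1 MiB page size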