Use of com.huawei.boostkit.omnidata.model.TaskSource in project boostkit-bigdata by kunpengcompute.
Class ParquetPageSourceFactory, method createParquetPushDownPageSource:
public HivePushDownPageSource createParquetPushDownPageSource(
        Path path, long start, long length, com.huawei.boostkit.omnidata.model.Predicate predicate) {
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    Properties transProperties = new Properties();
    transProperties.put(OMNIDATA_CLIENT_TARGET_LIST, omniDataServerTarget);
    DataSource parquetPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource(
            path.toString(), start, length, false);
    TaskSource readTaskInfo = new TaskSource(parquetPushDownDataSource, predicate, TaskSource.ONE_MEGABYTES);
    DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
    return new HivePushDownPageSource(dataReader, systemMemoryUsage);
}
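The returned HivePushDownPageSource wraps the DataReader; a caller drains it until the reader reports it is finished. A minimal sketch of that loop, assuming the reader exposes getNextPageBlocking()/isFinished()/close() as DataReaderImpl does in DataIoAdapter.getPageIterator further below (consumePage is a hypothetical handler, not project code):

// Minimal sketch: drain a push-down DataReader<Page>.
// getNextPageBlocking()/isFinished()/close() mirror the DataReaderImpl
// usage shown in getPageIterator below; consumePage is hypothetical.
DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
try {
    while (!dataReader.isFinished()) {
        Page page = (Page) dataReader.getNextPageBlocking();
        if (page != null) {
            consumePage(page); // hypothetical page handler
        }
    }
} finally {
    dataReader.close();
}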
Use of com.huawei.boostkit.omnidata.model.TaskSource in project boostkit-bigdata by kunpengcompute.
Class OrcPageSourceFactory, method createOrcPushDownPageSource:
public HivePushDownPageSource createOrcPushDownPageSource(Path path, long start, long length, Predicate predicate) {
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    Properties transProperties = new Properties();
    transProperties.put(OMNIDATA_CLIENT_TARGET_LIST, omniDataServerTarget);
    DataSource orcPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource(
            path.toString(), start, length, false);
    TaskSource readTaskInfo = new TaskSource(orcPushDownDataSource, predicate, TaskSource.ONE_MEGABYTES);
    DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
    return new HivePushDownPageSource(dataReader, systemMemoryUsage);
}
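The ORC factory mirrors the Parquet one, differing only in the HdfsOrcDataSource. A hedged sketch of how a caller might pick between the two (the isOrcFormat flag and the factory fields are illustrative wiring, not code from the project):

// Hypothetical dispatch sketch: select the push-down factory by file
// format; both factory methods match the signatures shown above.
HivePushDownPageSource pageSource = isOrcFormat
        ? orcPageSourceFactory.createOrcPushDownPageSource(path, start, length, predicate)
        : parquetPageSourceFactory.createParquetPushDownPageSource(path, start, length, predicate);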
Use of com.huawei.boostkit.omnidata.model.TaskSource in project boostkit-bigdata by kunpengcompute.
Class DataIoAdapter, method getPageIterator:
/**
 * Contacts the OmniData server and fetches the pushed-down result pages.
 *
 * @param pageCandidate file split info
 * @param sparkOutPut data schema
 * @param partitionColumn partition columns
 * @param filterOutPut filter schema
 * @param pushDownOperators push-down expressions
 * @return iterator over WritableColumnVector[] result pages
 * @throws TaskExecutionException if connecting to the OmniData server fails
 * @notice third-party APIs throw the raw Exception type, so this method has to catch the base Exception
 */
public Iterator<WritableColumnVector[]> getPageIterator(
        PageCandidate pageCandidate,
        Seq<Attribute> sparkOutPut,
        Seq<Attribute> partitionColumn,
        Seq<Attribute> filterOutPut,
        PushDownInfo pushDownOperators) throws TaskExecutionException, UnknownHostException {
    // initialize push-down candidates from the file split and the filter schema
    initCandidates(pageCandidate, filterOutPut);
    // collect the partition column names
    List<Attribute> partitionColumnBatch = JavaConverters.seqAsJavaList(partitionColumn);
    for (Attribute attribute : partitionColumnBatch) {
        partitionColumnName.add(attribute.name());
    }
    // without aggregation push-down, the output schema is taken from sparkOutPut
    List<AggExeInfo> aggExecutionList = JavaConverters.seqAsJavaList(pushDownOperators.aggExecutions());
    if (aggExecutionList.isEmpty()) {
        initColumnInfo(sparkOutPut);
    }
    DataSource dataSource = initDataSource(pageCandidate);
    RowExpression rowExpression = initFilter(pushDownOperators.filterExecutions());
    Optional<RowExpression> prestoFilter = rowExpression == null ? Optional.empty() : Optional.of(rowExpression);
    Optional<AggregationInfo> aggregations = initAggAndGroupInfo(aggExecutionList);
    OptionalLong limitLong = NdpUtils.convertLimitExeInfo(pushDownOperators.limitExecution());
    // bundle filter, projections, aggregations, and limit into one push-down predicate
    Predicate predicate = new Predicate(omnidataTypes, omnidataColumns, prestoFilter, omnidataProjections,
            ImmutableMap.of(), ImmutableMap.of(), aggregations, limitLong);
    TaskSource taskSource = new TaskSource(dataSource, predicate, 1048576); // 1 MiB, matching TaskSource.ONE_MEGABYTES above
    SparkDeserializer deserializer = initSparkDeserializer();
    WritableColumnVector[] page = null;
    int failedTimes = 0;
    // try one randomly chosen host first, then fall back to the remaining hosts
    String[] sdiHostArray = pageCandidate.getSdiHosts().split(",");
    int randomIndex = (int) (Math.random() * sdiHostArray.length);
    Iterator<String> sdiHosts = Arrays.stream(sdiHostArray).iterator();
    Set<String> sdiHostSet = new HashSet<>();
    sdiHostSet.add(sdiHostArray[randomIndex]);
    while (sdiHosts.hasNext()) {
        String sdiHost;
        if (failedTimes == 0) {
            sdiHost = sdiHostArray[randomIndex];
        } else {
            sdiHost = sdiHosts.next();
            if (sdiHostSet.contains(sdiHost)) {
                // skip the randomly chosen host, which was already tried
                continue;
            }
        }
        String ipAddress = InetAddress.getByName(sdiHost).getHostAddress();
        Properties properties = new Properties();
        properties.put("omnidata.client.target.list", ipAddress);
        LOG.info("Push down node info: [hostname :{} ,ip :{}]", sdiHost, ipAddress);
        try {
            orcDataReader = new DataReaderImpl<SparkDeserializer>(properties, taskSource, deserializer);
            hasNextPage = true;
            page = (WritableColumnVector[]) orcDataReader.getNextPageBlocking();
            if (orcDataReader.isFinished()) {
                orcDataReader.close();
                hasNextPage = false;
            }
            break;
        } catch (OmniDataException omniDataException) {
            OmniErrorCode errorCode = omniDataException.getErrorCode();
            switch (errorCode) {
                case OMNIDATA_INSUFFICIENT_RESOURCES:
                    LOG.warn("OMNIDATA_INSUFFICIENT_RESOURCES: the OmniData server's push-down queue is full, trying the next OmniData server");
                    break;
                case OMNIDATA_UNSUPPORTED_OPERATOR:
                    LOG.warn("OMNIDATA_UNSUPPORTED_OPERATOR: the request contains an unsupported operator");
                    break;
                case OMNIDATA_GENERIC_ERROR:
                    LOG.warn("OMNIDATA_GENERIC_ERROR: the current OmniData server is unavailable, trying the next OmniData server");
                    break;
                case OMNIDATA_NOT_FOUND:
                    LOG.warn("OMNIDATA_NOT_FOUND: the current OmniData server was not found, trying the next OmniData server");
                    break;
                case OMNIDATA_INVALID_ARGUMENT:
                    LOG.warn("OMNIDATA_INVALID_ARGUMENT: the request contains an unsupported operator or data type");
                    break;
                case OMNIDATA_IO_ERROR:
                    LOG.warn("OMNIDATA_IO_ERROR: I/O exception on the current OmniData server, trying the next OmniData server");
                    break;
                default:
                    LOG.warn("OmniDataException: OMNIDATA_ERROR.");
            }
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress);
            ++failedTimes;
        } catch (Exception e) {
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress, e);
            ++failedTimes;
        }
    }
    // give up once every reachable host has failed
    int retryTime = Math.min(TASK_FAILED_TIMES, sdiHostArray.length);
    if (failedTimes >= retryTime) {
        LOG.warn("No OmniData server to connect to; the task has tried {} times.", retryTime);
        throw new TaskExecutionException("No OmniData server to connect to");
    }
    List<WritableColumnVector[]> resultPages = new ArrayList<>();
    resultPages.add(page);
    return resultPages.iterator();
}
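On the Spark side, each WritableColumnVector[] page can be wrapped into a ColumnarBatch. A minimal consumption sketch, assuming every vector in a page carries the same row count (ColumnarBatch and getElementsAppended() are standard Spark APIs; the surrounding wiring is hypothetical):

// Hypothetical consumption sketch: turn each page into a Spark
// ColumnarBatch, taking the row count from the first vector.
Iterator<WritableColumnVector[]> pages = dataIoAdapter.getPageIterator(
        pageCandidate, sparkOutPut, partitionColumn, filterOutPut, pushDownOperators);
while (pages.hasNext()) {
    WritableColumnVector[] vectors = pages.next();
    if (vectors == null) {
        continue; // no page was fetched
    }
    int numRows = vectors.length > 0 ? vectors[0].getElementsAppended() : 0;
    ColumnarBatch batch = new ColumnarBatch(vectors, numRows);
    // hand the batch to the downstream operator here
}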