Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project phoenix-connectors by apache.
The class PhoenixDataSourceReader, method planInputPartitions() (a sketch of the split-by-stats branching follows the listing):
@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
    Optional<String> currentScnValue = options.get(PhoenixConfigurationUtil.CURRENT_SCN_VALUE);
    Optional<String> tenantId = options.get(PhoenixConfigurationUtil.MAPREDUCE_TENANT_ID);
    // Generate splits based off statistics, or just region splits?
    boolean splitByStats = options.getBoolean(PhoenixConfigurationUtil.MAPREDUCE_SPLIT_BY_STATS,
            PhoenixConfigurationUtil.DEFAULT_SPLIT_BY_STATS);
    if (currentScnValue.isPresent()) {
        overriddenProps.put(PhoenixRuntime.CURRENT_SCN_ATTRIB, currentScnValue.get());
    }
    if (tenantId.isPresent()) {
        overriddenProps.put(PhoenixRuntime.TENANT_ID_ATTRIB, tenantId.get());
    }
    try (Connection conn = DriverManager.getConnection(
            JDBC_PROTOCOL + JDBC_PROTOCOL_SEPARATOR + zkUrl, overriddenProps)) {
        List<ColumnInfo> columnInfos = PhoenixRuntime.generateColumnInfo(conn, tableName,
                new ArrayList<>(Arrays.asList(schema.names())));
        final Statement statement = conn.createStatement();
        final String selectStatement = QueryUtil.constructSelectStatement(tableName, columnInfos, whereClause);
        if (selectStatement == null) {
            throw new NullPointerException();
        }
        final PhoenixStatement pstmt = statement.unwrap(PhoenixStatement.class);
        // Optimize the query plan so that we potentially use secondary indexes
        final QueryPlan queryPlan = pstmt.optimizeQuery(selectStatement);
        final Scan scan = queryPlan.getContext().getScan();
        // setting the snapshot configuration
        Optional<String> snapshotName = options.get(PhoenixConfigurationUtil.SNAPSHOT_NAME_KEY);
        if (snapshotName.isPresent()) {
            PhoenixConfigurationUtil.setSnapshotNameKey(
                    queryPlan.getContext().getConnection().getQueryServices().getConfiguration(),
                    snapshotName.get());
        }
        // Initialize the query plan so it sets up the parallel scans
        queryPlan.iterator(MapReduceParallelScanGrouper.getInstance());
        List<KeyRange> allSplits = queryPlan.getSplits();
        // Get the RegionSizeCalculator
        PhoenixConnection phxConn = conn.unwrap(PhoenixConnection.class);
        org.apache.hadoop.hbase.client.Connection connection =
                phxConn.getQueryServices().getAdmin().getConnection();
        RegionLocator regionLocator = connection.getRegionLocator(
                TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));
        final List<InputPartition<InternalRow>> partitions = new ArrayList<>(allSplits.size());
        for (List<Scan> scans : queryPlan.getScans()) {
            // Get the region location
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            // Get the region size
            long regionSize = CompatUtil.getSize(regionLocator, connection.getAdmin(), location);
            PhoenixDataSourceReadOptions phoenixDataSourceOptions = new PhoenixDataSourceReadOptions(
                    zkUrl, currentScnValue.orElse(null), tenantId.orElse(null), selectStatement, overriddenProps);
            if (splitByStats) {
                for (Scan aScan : scans) {
                    partitions.add(getInputPartition(phoenixDataSourceOptions,
                            new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation)));
                }
            } else {
                partitions.add(getInputPartition(phoenixDataSourceOptions,
                        new PhoenixInputSplit(scans, regionSize, regionLocation)));
            }
        }
        return partitions;
    } catch (Exception e) {
        throw new RuntimeException("Unable to plan query", e);
    }
}
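The splitByStats flag above decides how finely a region's scans become partitions: with statistics-derived guideposts each Scan gets its own PhoenixInputSplit, otherwise the whole per-region scan list is wrapped in a single split. A minimal sketch of just that branching, lifted out of the connector context (the helper name and standalone class are illustrative, not part of the connector):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.hbase.client.Scan;
import org.apache.phoenix.mapreduce.PhoenixInputSplit;

final class SplitStrategySketch {

    // Hypothetical helper mirroring the loop above: one split per scan when splitting by
    // statistics guideposts, otherwise a single split covering all of the region's scans.
    static List<PhoenixInputSplit> toSplits(List<Scan> regionScans, boolean splitByStats,
            long regionSize, String regionLocation) {
        List<PhoenixInputSplit> splits = new ArrayList<>();
        if (splitByStats) {
            for (Scan scan : regionScans) {
                splits.add(new PhoenixInputSplit(
                        Collections.singletonList(scan), regionSize, regionLocation));
            }
        } else {
            splits.add(new PhoenixInputSplit(regionScans, regionSize, regionLocation));
        }
        return splits;
    }
}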
Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project DataX by alibaba.
The class HbaseSQLReaderTask, method init() (a sketch of the split encoding follows the listing):
public void init() {
    LOG.info("reader table info: " + this.readerConfig.toString());
    try {
        this.getPColumns();
    } catch (SQLException e) {
        // Error message (translated): "Problem fetching the table's columns; retry, and if the
        // problem persists check the HBase cluster status."
        throw DataXException.asDataXException(HbaseSQLReaderErrorCode.GET_PHOENIX_CONNECTIONINFO_ERROR,
                "获取表的列出问题,重试,若还有问题请检查hbase集群状态," + e.getMessage());
    }
    this.phoenixInputFormat = new PhoenixInputFormat<PhoenixRecordWritable>();
    String splitBase64Str = this.readerConfig.getOriginalConfig().getString(Key.SPLIT_KEY);
    byte[] splitBytes = org.apache.commons.codec.binary.Base64.decodeBase64(splitBase64Str);
    TaskAttemptID attemptId = new TaskAttemptID();
    org.apache.hadoop.conf.Configuration conf = HbaseSQLHelper.generatePhoenixConf(this.readerConfig);
    this.hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId);
    // The split assigned to this task arrives as a Base64-encoded Writable; rebuild it here.
    this.phoenixInputSplit = new PhoenixInputSplit();
    try {
        HadoopSerializationUtil.deserialize(phoenixInputSplit, splitBytes);
        this.phoenixRecordReader = (PhoenixRecordReader) phoenixInputFormat.createRecordReader(
                phoenixInputSplit, hadoopAttemptContext);
    } catch (Exception e) {
        // Error message (translated): "A problem occurred creating the Phoenix reader; please retry,
        // and if the problem persists check the HBase cluster status."
        throw DataXException.asDataXException(HbaseSQLReaderErrorCode.PHOENIX_CREATEREADER_ERROR,
                "创建phoenix的reader出现问题,请重试,若还有问题请检查hbase集群状态," + e.getMessage());
    }
}
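On the task side, init() rebuilds its PhoenixInputSplit from a Base64 string stored under Key.SPLIT_KEY. The job-splitting side therefore has to produce that string by serializing the split's Writable form and Base64-encoding the bytes. A minimal sketch of that encoding leg, assuming only PhoenixInputSplit's Writable write()/readFields() contract and commons-codec; the encodeSplit helper is a hypothetical stand-in for DataX's HadoopSerializationUtil, which is not shown here:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.util.Collections;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.mapreduce.PhoenixInputSplit;

public final class SplitKeyEncodingSketch {

    // Hypothetical stand-in for the writer-side serialization: write the split's Writable
    // form into a byte array and Base64-encode it, mirroring what init() later decodes.
    static String encodeSplit(PhoenixInputSplit split) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(bytes)) {
            split.write(out);
        }
        return Base64.encodeBase64String(bytes.toByteArray());
    }

    public static void main(String[] args) throws Exception {
        Scan scan = new Scan().withStartRow(Bytes.toBytes("A")).withStopRow(Bytes.toBytes("Z"));
        PhoenixInputSplit split = new PhoenixInputSplit(Collections.singletonList(scan), -1L, "host1");
        // This string is the kind of value init() above decodes with Base64.decodeBase64
        // and then feeds into readFields(...) on an empty PhoenixInputSplit.
        System.out.println(encodeSplit(split));
    }
}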
Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project trino by trinodb.
The class PhoenixSplitManager, method generateSplits() (a sketch of the scan partitioning follows the listing):
// mostly copied from PhoenixInputFormat, but without the region size calculations
private List<InputSplit> generateSplits(QueryPlan queryPlan, List<KeyRange> splits, int maxScansPerSplit)
        throws IOException {
    requireNonNull(queryPlan, "queryPlan is null");
    requireNonNull(splits, "splits is null");
    try (org.apache.hadoop.hbase.client.Connection connection = phoenixClient.getHConnection()) {
        RegionLocator regionLocator = connection.getRegionLocator(
                TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));
        long regionSize = -1;
        List<InputSplit> inputSplits = new ArrayList<>(splits.size());
        for (List<Scan> scans : queryPlan.getScans()) {
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            if (log.isDebugEnabled()) {
                log.debug("Scan count[%d] : %s ~ %s", scans.size(),
                        Bytes.toStringBinary(scans.get(0).getStartRow()),
                        Bytes.toStringBinary(scans.get(scans.size() - 1).getStopRow()));
                log.debug("First scan : %swith scanAttribute : %s [scanCache, cacheBlock, scanBatch] : [%d, %s, %d] and regionLocation : %s",
                        scans.get(0), scans.get(0).getAttributesMap(), scans.get(0).getCaching(),
                        scans.get(0).getCacheBlocks(), scans.get(0).getBatch(), regionLocation);
                for (int i = 0, limit = scans.size(); i < limit; i++) {
                    log.debug("EXPECTED_UPPER_REGION_KEY[%d] : %s", i,
                            Bytes.toStringBinary(scans.get(i).getAttribute(EXPECTED_UPPER_REGION_KEY)));
                }
            }
            /*
             * Handle parallel execution explicitly in Trino rather than internally in Phoenix.
             * Each split is handled by a single ConcatResultIterator
             * (See PhoenixClient.getResultSet(...))
             */
            for (List<Scan> splitScans : Lists.partition(scans, maxScansPerSplit)) {
                inputSplits.add(new PhoenixInputSplit(splitScans, regionSize, regionLocation));
            }
        }
        return inputSplits;
    }
}
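The maxScansPerSplit cap is enforced with Guava's Lists.partition, which chops each region's scan list into consecutive chunks of at most that size; every chunk becomes one PhoenixInputSplit (with regionSize hard-coded to -1, since this copy drops the region size calculation). A small, self-contained illustration of the partitioning step, with strings standing in for Scan objects:

import java.util.List;

import com.google.common.collect.Lists;

public final class PartitionSketch {
    public static void main(String[] args) {
        // Strings stand in for a region's List<Scan>; the real code partitions scans the same way.
        List<String> scans = List.of("scan0", "scan1", "scan2", "scan3", "scan4");
        int maxScansPerSplit = 2;
        // Lists.partition yields consecutive sublists of at most maxScansPerSplit elements:
        // [scan0, scan1], [scan2, scan3], [scan4] -> three PhoenixInputSplits for this region.
        for (List<String> splitScans : Lists.partition(scans, maxScansPerSplit)) {
            System.out.println(splitScans);
        }
    }
}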
Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project trino by trinodb.
The class TestPhoenixSplit, method testPhoenixSplitJsonRoundtrip() (a sketch of the serialized-split wrapper follows the listing):
@Test
public void testPhoenixSplitJsonRoundtrip() throws Exception {
    List<HostAddress> addresses = ImmutableList.of(HostAddress.fromString("host:9000"));
    List<Scan> scans = ImmutableList.of(
            new Scan().withStartRow(Bytes.toBytes("A")).withStopRow(Bytes.toBytes("Z")));
    PhoenixInputSplit phoenixInputSplit = new PhoenixInputSplit(scans);
    PhoenixSplit expected = new PhoenixSplit(addresses, SerializedPhoenixInputSplit.serialize(phoenixInputSplit));
    assertTrue(objectMapper.canSerialize(PhoenixSplit.class));
    String json = objectMapper.writeValueAsString(expected);
    PhoenixSplit actual = objectMapper.readValue(json, PhoenixSplit.class);
    assertEquals(actual.getPhoenixInputSplit(), expected.getPhoenixInputSplit());
    assertEquals(actual.getAddresses(), expected.getAddresses());
}
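The JSON round trip works because PhoenixSplit does not try to serialize the Hadoop split directly; it carries a SerializedPhoenixInputSplit, presumably the split's serialized (Writable) bytes in a Jackson-friendly holder. A hedged sketch of that kind of wrapper, assuming only the Writable contract seen in the other listings; Trino's actual SerializedPhoenixInputSplit is not reproduced here:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.phoenix.mapreduce.PhoenixInputSplit;

// Illustrative Jackson-friendly holder for a split's serialized bytes.
public final class SerializedSplitSketch {
    private final byte[] bytes;

    @JsonCreator
    public SerializedSplitSketch(@JsonProperty("bytes") byte[] bytes) {
        this.bytes = bytes;
    }

    @JsonProperty
    public byte[] getBytes() {
        return bytes;
    }

    // Serialize via the Writable contract; Jackson encodes the byte[] as Base64 in the JSON.
    public static SerializedSplitSketch serialize(PhoenixInputSplit split) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (DataOutputStream dataOut = new DataOutputStream(out)) {
            split.write(dataOut);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return new SerializedSplitSketch(out.toByteArray());
    }

    // Rebuild the split from the stored bytes with readFields(...).
    public PhoenixInputSplit deserialize() {
        PhoenixInputSplit split = new PhoenixInputSplit();
        try (DataInputStream dataIn = new DataInputStream(new ByteArrayInputStream(bytes))) {
            split.readFields(dataIn);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return split;
    }
}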
Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project phoenix-connectors by apache.
The class PhoenixScan, method planInputPartitions():
@Override
public InputPartition[] planInputPartitions() {
    populateOverriddenProperties();
    try (Connection conn = DriverManager.getConnection(
            JDBC_PROTOCOL + JDBC_PROTOCOL_SEPARATOR + zkUrl, overriddenProps)) {
        List<ColumnInfo> columnInfos = PhoenixRuntime.generateColumnInfo(conn, tableName,
                new ArrayList<>(Arrays.asList(schema.names())));
        final Statement statement = conn.createStatement();
        final String selectStatement = QueryUtil.constructSelectStatement(tableName, columnInfos, whereClause);
        if (selectStatement == null) {
            throw new NullPointerException();
        }
        final PhoenixStatement pstmt = statement.unwrap(PhoenixStatement.class);
        // Optimize the query plan so that we potentially use secondary indexes
        final QueryPlan queryPlan = pstmt.optimizeQuery(selectStatement);
        final org.apache.hadoop.hbase.client.Scan scan = queryPlan.getContext().getScan();
        // Initialize the query plan so it sets up the parallel scans
        queryPlan.iterator(MapReduceParallelScanGrouper.getInstance());
        List<KeyRange> allSplits = queryPlan.getSplits();
        // Get the RegionSizeCalculator
        PhoenixConnection phxConn = conn.unwrap(PhoenixConnection.class);
        org.apache.hadoop.hbase.client.Connection connection =
                phxConn.getQueryServices().getAdmin().getConnection();
        RegionLocator regionLocator = connection.getRegionLocator(
                TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));
        final InputPartition[] partitions = new PhoenixInputPartition[allSplits.size()];
        int partitionCount = 0;
        for (List<org.apache.hadoop.hbase.client.Scan> scans : queryPlan.getScans()) {
            // Get the region location
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            // Get the region size
            long regionSize = CompatUtil.getSize(regionLocator, connection.getAdmin(), location);
            phoenixDataSourceOptions = new PhoenixDataSourceReadOptions(zkUrl, currentScnValue, tenantId,
                    selectStatement, overriddenProps);
            if (splitByStats) {
                for (org.apache.hadoop.hbase.client.Scan aScan : scans) {
                    partitions[partitionCount++] = new PhoenixInputPartition(
                            new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation));
                }
            } else {
                partitions[partitionCount++] = new PhoenixInputPartition(
                        new PhoenixInputSplit(scans, regionSize, regionLocation));
            }
        }
        return partitions;
    } catch (Exception e) {
        throw new RuntimeException("Unable to plan query", e);
    }
}