Example 6 with PhoenixInputSplit

Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project phoenix-connectors by apache.

The class PhoenixDataSourceReader, method planInputPartitions:

@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
    Optional<String> currentScnValue = options.get(PhoenixConfigurationUtil.CURRENT_SCN_VALUE);
    Optional<String> tenantId = options.get(PhoenixConfigurationUtil.MAPREDUCE_TENANT_ID);
    // Generate splits based off statistics, or just region splits?
    boolean splitByStats = options.getBoolean(PhoenixConfigurationUtil.MAPREDUCE_SPLIT_BY_STATS, PhoenixConfigurationUtil.DEFAULT_SPLIT_BY_STATS);
    if (currentScnValue.isPresent()) {
        overriddenProps.put(PhoenixRuntime.CURRENT_SCN_ATTRIB, currentScnValue.get());
    }
    if (tenantId.isPresent()) {
        overriddenProps.put(PhoenixRuntime.TENANT_ID_ATTRIB, tenantId.get());
    }
    try (Connection conn = DriverManager.getConnection(JDBC_PROTOCOL + JDBC_PROTOCOL_SEPARATOR + zkUrl, overriddenProps)) {
        List<ColumnInfo> columnInfos = PhoenixRuntime.generateColumnInfo(conn, tableName, new ArrayList<>(Arrays.asList(schema.names())));
        final Statement statement = conn.createStatement();
        final String selectStatement = QueryUtil.constructSelectStatement(tableName, columnInfos, whereClause);
        if (selectStatement == null) {
            throw new NullPointerException();
        }
        final PhoenixStatement pstmt = statement.unwrap(PhoenixStatement.class);
        // Optimize the query plan so that we potentially use secondary indexes
        final QueryPlan queryPlan = pstmt.optimizeQuery(selectStatement);
        final Scan scan = queryPlan.getContext().getScan();
        // setting the snapshot configuration
        Optional<String> snapshotName = options.get(PhoenixConfigurationUtil.SNAPSHOT_NAME_KEY);
        if (snapshotName.isPresent())
            PhoenixConfigurationUtil.setSnapshotNameKey(queryPlan.getContext().getConnection().getQueryServices().getConfiguration(), snapshotName.get());
        // Initialize the query plan so it sets up the parallel scans
        queryPlan.iterator(MapReduceParallelScanGrouper.getInstance());
        List<KeyRange> allSplits = queryPlan.getSplits();
        // Get the RegionSizeCalculator
        PhoenixConnection phxConn = conn.unwrap(PhoenixConnection.class);
        org.apache.hadoop.hbase.client.Connection connection = phxConn.getQueryServices().getAdmin().getConnection();
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));
        final List<InputPartition<InternalRow>> partitions = new ArrayList<>(allSplits.size());
        for (List<Scan> scans : queryPlan.getScans()) {
            // Get the region location
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            // Get the region size
            long regionSize = CompatUtil.getSize(regionLocator, connection.getAdmin(), location);
            PhoenixDataSourceReadOptions phoenixDataSourceOptions = new PhoenixDataSourceReadOptions(zkUrl, currentScnValue.orElse(null), tenantId.orElse(null), selectStatement, overriddenProps);
            if (splitByStats) {
                for (Scan aScan : scans) {
                    partitions.add(getInputPartition(phoenixDataSourceOptions, new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation)));
                }
            } else {
                partitions.add(getInputPartition(phoenixDataSourceOptions, new PhoenixInputSplit(scans, regionSize, regionLocation)));
            }
        }
        return partitions;
    } catch (Exception e) {
        throw new RuntimeException("Unable to plan query", e);
    }
}
Also used : PhoenixConnection(org.apache.phoenix.jdbc.PhoenixConnection) KeyRange(org.apache.phoenix.query.KeyRange) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.phoenix.util.ColumnInfo) QueryPlan(org.apache.phoenix.compile.QueryPlan) PhoenixStatement(org.apache.phoenix.jdbc.PhoenixStatement) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) PhoenixInputSplit(org.apache.phoenix.mapreduce.PhoenixInputSplit) InputPartition(org.apache.spark.sql.sources.v2.reader.InputPartition) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) Statement(java.sql.Statement) Connection(java.sql.Connection) SQLException(java.sql.SQLException) Scan(org.apache.hadoop.hbase.client.Scan)
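
The getInputPartition(...) helper called in the loop above is not part of this snippet. A minimal, hypothetical sketch of what it plausibly does, assuming a PhoenixInputPartition class that carries the read options, the Spark schema, and the split to the executors (the constructor shape here is an assumption, not taken from the connector source):

private InputPartition<InternalRow> getInputPartition(PhoenixDataSourceReadOptions readOptions, PhoenixInputSplit inputSplit) {
    // Pair the serialized read options and the schema with the PhoenixInputSplit; the
    // executor-side partition reader is expected to rebuild the Phoenix scan from these.
    return new PhoenixInputPartition(readOptions, schema, inputSplit);
}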

Example 7 with PhoenixInputSplit

Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project DataX by alibaba.

The class HbaseSQLReaderTask, method init:

public void init() {
    LOG.info("reader table info: " + this.readerConfig.toString());
    try {
        this.getPColumns();
    } catch (SQLException e) {
        throw DataXException.asDataXException(HbaseSQLReaderErrorCode.GET_PHOENIX_CONNECTIONINFO_ERROR, "Failed to get the table's columns; retry, and if the problem persists check the HBase cluster status. " + e.getMessage());
    }
    this.phoenixInputFormat = new PhoenixInputFormat<PhoenixRecordWritable>();
    String splitBase64Str = this.readerConfig.getOriginalConfig().getString(Key.SPLIT_KEY);
    byte[] splitBytes = org.apache.commons.codec.binary.Base64.decodeBase64(splitBase64Str);
    TaskAttemptID attemptId = new TaskAttemptID();
    org.apache.hadoop.conf.Configuration conf = HbaseSQLHelper.generatePhoenixConf(this.readerConfig);
    this.hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId);
    this.phoenixInputSplit = new PhoenixInputSplit();
    try {
        HadoopSerializationUtil.deserialize(phoenixInputSplit, splitBytes);
        this.phoenixRecordReader = (PhoenixRecordReader) phoenixInputFormat.createRecordReader(phoenixInputSplit, hadoopAttemptContext);
    } catch (Exception e) {
        throw DataXException.asDataXException(HbaseSQLReaderErrorCode.PHOENIX_CREATEREADER_ERROR, "Failed to create the Phoenix reader; please retry, and if the problem persists check the HBase cluster status. " + e.getMessage());
    }
}
Also used : TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) PhoenixRecordWritable(org.apache.phoenix.mapreduce.PhoenixRecordWritable) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) PhoenixInputSplit(org.apache.phoenix.mapreduce.PhoenixInputSplit) IOException(java.io.IOException) DataXException(com.alibaba.datax.common.exception.DataXException)
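
The split consumed by this task arrives as a Base64 string under Key.SPLIT_KEY and is turned back into a PhoenixInputSplit by HadoopSerializationUtil.deserialize. A minimal sketch of the producing side, assuming plain Hadoop Writable serialization plus commons-codec Base64 (an illustration, not the actual HbaseSQLHelper code):

import java.io.IOException;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.phoenix.mapreduce.PhoenixInputSplit;

public final class PhoenixSplitCodec {
    private PhoenixSplitCodec() {
    }

    // Serialize a PhoenixInputSplit through its Writable contract, then Base64-encode the
    // bytes so they can travel inside the per-task DataX configuration.
    public static String toBase64(PhoenixInputSplit split) throws IOException {
        DataOutputBuffer buffer = new DataOutputBuffer();
        split.write(buffer);
        byte[] bytes = new byte[buffer.getLength()];
        System.arraycopy(buffer.getData(), 0, bytes, 0, buffer.getLength());
        return Base64.encodeBase64String(bytes);
    }
}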

Example 8 with PhoenixInputSplit

Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project trino by trinodb.

The class PhoenixSplitManager, method generateSplits:

// mostly copied from PhoenixInputFormat, but without the region size calculations
private List<InputSplit> generateSplits(QueryPlan queryPlan, List<KeyRange> splits, int maxScansPerSplit) throws IOException {
    requireNonNull(queryPlan, "queryPlan is null");
    requireNonNull(splits, "splits is null");
    try (org.apache.hadoop.hbase.client.Connection connection = phoenixClient.getHConnection()) {
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));
        long regionSize = -1;
        List<InputSplit> inputSplits = new ArrayList<>(splits.size());
        for (List<Scan> scans : queryPlan.getScans()) {
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            if (log.isDebugEnabled()) {
                log.debug("Scan count[%d] : %s ~ %s", scans.size(), Bytes.toStringBinary(scans.get(0).getStartRow()), Bytes.toStringBinary(scans.get(scans.size() - 1).getStopRow()));
                log.debug("First scan : %swith scanAttribute : %s [scanCache, cacheBlock, scanBatch] : [%d, %s, %d] and  regionLocation : %s", scans.get(0), scans.get(0).getAttributesMap(), scans.get(0).getCaching(), scans.get(0).getCacheBlocks(), scans.get(0).getBatch(), regionLocation);
                for (int i = 0, limit = scans.size(); i < limit; i++) {
                    log.debug("EXPECTED_UPPER_REGION_KEY[%d] : %s", i, Bytes.toStringBinary(scans.get(i).getAttribute(EXPECTED_UPPER_REGION_KEY)));
                }
            }
            /*
                 * Handle parallel execution explicitly in Trino rather than internally in Phoenix.
                 * Each split is handled by a single ConcatResultIterator
                 * (See PhoenixClient.getResultSet(...))
                 */
            for (List<Scan> splitScans : Lists.partition(scans, maxScansPerSplit)) {
                inputSplits.add(new PhoenixInputSplit(splitScans, regionSize, regionLocation));
            }
        }
        return inputSplits;
    }
}
Also used : RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) ArrayList(java.util.ArrayList) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) PhoenixInputSplit(org.apache.phoenix.mapreduce.PhoenixInputSplit) Scan(org.apache.hadoop.hbase.client.Scan) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
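
The InputSplits produced here still have to be wrapped into Trino connector splits. A hedged sketch of that wrapping, modeled on the PhoenixSplit constructor exercised in the roundtrip test below; the method name and its regionLocation parameter are illustrative, not taken from PhoenixSplitManager:

private PhoenixSplit toPhoenixSplit(PhoenixInputSplit split, String regionLocation) {
    // Expose the region host as the preferred node and ship the split itself in its
    // Writable-serialized form, matching the JSON roundtrip test in the next example.
    List<HostAddress> addresses = ImmutableList.of(HostAddress.fromString(regionLocation));
    return new PhoenixSplit(addresses, SerializedPhoenixInputSplit.serialize(split));
}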

Example 9 with PhoenixInputSplit

Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project trino by trinodb.

The class TestPhoenixSplit, method testPhoenixSplitJsonRoundtrip:

@Test
public void testPhoenixSplitJsonRoundtrip() throws Exception {
    List<HostAddress> addresses = ImmutableList.of(HostAddress.fromString("host:9000"));
    List<Scan> scans = ImmutableList.of(new Scan().withStartRow(Bytes.toBytes("A")).withStopRow(Bytes.toBytes("Z")));
    PhoenixInputSplit phoenixInputSplit = new PhoenixInputSplit(scans);
    PhoenixSplit expected = new PhoenixSplit(addresses, SerializedPhoenixInputSplit.serialize(phoenixInputSplit));
    assertTrue(objectMapper.canSerialize(PhoenixSplit.class));
    String json = objectMapper.writeValueAsString(expected);
    PhoenixSplit actual = objectMapper.readValue(json, PhoenixSplit.class);
    assertEquals(actual.getPhoenixInputSplit(), expected.getPhoenixInputSplit());
    assertEquals(actual.getAddresses(), expected.getAddresses());
}
Also used : PhoenixInputSplit(org.apache.phoenix.mapreduce.PhoenixInputSplit) Scan(org.apache.hadoop.hbase.client.Scan) HostAddress(io.trino.spi.HostAddress) Test(org.testng.annotations.Test)

Example 10 with PhoenixInputSplit

Use of org.apache.phoenix.mapreduce.PhoenixInputSplit in project phoenix-connectors by apache.

The class PhoenixScan, method planInputPartitions:

@Override
public InputPartition[] planInputPartitions() {
    populateOverriddenProperties();
    try (Connection conn = DriverManager.getConnection(JDBC_PROTOCOL + JDBC_PROTOCOL_SEPARATOR + zkUrl, overriddenProps)) {
        List<ColumnInfo> columnInfos = PhoenixRuntime.generateColumnInfo(conn, tableName, new ArrayList<>(Arrays.asList(schema.names())));
        final Statement statement = conn.createStatement();
        final String selectStatement = QueryUtil.constructSelectStatement(tableName, columnInfos, whereClause);
        if (selectStatement == null) {
            throw new NullPointerException();
        }
        final PhoenixStatement pstmt = statement.unwrap(PhoenixStatement.class);
        // Optimize the query plan so that we potentially use secondary indexes
        final QueryPlan queryPlan = pstmt.optimizeQuery(selectStatement);
        final org.apache.hadoop.hbase.client.Scan scan = queryPlan.getContext().getScan();
        // Initialize the query plan so it sets up the parallel scans
        queryPlan.iterator(MapReduceParallelScanGrouper.getInstance());
        List<KeyRange> allSplits = queryPlan.getSplits();
        // Get the RegionSizeCalculator
        PhoenixConnection phxConn = conn.unwrap(PhoenixConnection.class);
        org.apache.hadoop.hbase.client.Connection connection = phxConn.getQueryServices().getAdmin().getConnection();
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(queryPlan.getTableRef().getTable().getPhysicalName().toString()));
        final InputPartition[] partitions = new PhoenixInputPartition[allSplits.size()];
        int partitionCount = 0;
        for (List<org.apache.hadoop.hbase.client.Scan> scans : queryPlan.getScans()) {
            // Get the region location
            HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
            String regionLocation = location.getHostname();
            // Get the region size
            long regionSize = CompatUtil.getSize(regionLocator, connection.getAdmin(), location);
            phoenixDataSourceOptions = new PhoenixDataSourceReadOptions(zkUrl, currentScnValue, tenantId, selectStatement, overriddenProps);
            if (splitByStats) {
                for (org.apache.hadoop.hbase.client.Scan aScan : scans) {
                    partitions[partitionCount++] = new PhoenixInputPartition(new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation));
                }
            } else {
                partitions[partitionCount++] = new PhoenixInputPartition(new PhoenixInputSplit(scans, regionSize, regionLocation));
            }
        }
        return partitions;
    } catch (Exception e) {
        throw new RuntimeException("Unable to plan query", e);
    }
}
Also used : PhoenixConnection(org.apache.phoenix.jdbc.PhoenixConnection) KeyRange(org.apache.phoenix.query.KeyRange) ColumnInfo(org.apache.phoenix.util.ColumnInfo) QueryPlan(org.apache.phoenix.compile.QueryPlan) PhoenixStatement(org.apache.phoenix.jdbc.PhoenixStatement) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) PhoenixInputSplit(org.apache.phoenix.mapreduce.PhoenixInputSplit) InputPartition(org.apache.spark.sql.connector.read.InputPartition) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) Statement(java.sql.Statement) Connection(java.sql.Connection) Scan(org.apache.spark.sql.connector.read.Scan)
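
populateOverriddenProperties() is invoked before the connection is opened but not shown in this snippet. Going by the equivalent inline code in Example 6, a plausible sketch is that it copies the optional SCN and tenant-id settings into the JDBC properties; the field names here are assumptions:

private void populateOverriddenProperties() {
    // Mirror of the option handling in Example 6: forward the optional SCN and tenant id
    // to the Phoenix JDBC connection as connection properties.
    if (currentScnValue != null) {
        overriddenProps.put(PhoenixRuntime.CURRENT_SCN_ATTRIB, currentScnValue);
    }
    if (tenantId != null) {
        overriddenProps.put(PhoenixRuntime.TENANT_ID_ATTRIB, tenantId);
    }
}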

Aggregations

PhoenixInputSplit (org.apache.phoenix.mapreduce.PhoenixInputSplit) 10
ArrayList (java.util.ArrayList) 5
Scan (org.apache.hadoop.hbase.client.Scan) 5
IOException (java.io.IOException) 4
HRegionLocation (org.apache.hadoop.hbase.HRegionLocation) 4
RegionLocator (org.apache.hadoop.hbase.client.RegionLocator) 4
InputSplit (org.apache.hadoop.mapreduce.InputSplit) 4
DataXException (com.alibaba.datax.common.exception.DataXException) 2
Configuration (com.alibaba.datax.common.util.Configuration) 2
HostAddress (io.trino.spi.HostAddress) 2
Connection (java.sql.Connection) 2
Statement (java.sql.Statement) 2
JobID (org.apache.hadoop.mapreduce.JobID) 2
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID) 2
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl) 2
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) 2
QueryPlan (org.apache.phoenix.compile.QueryPlan) 2
PhoenixConnection (org.apache.phoenix.jdbc.PhoenixConnection) 2
PhoenixStatement (org.apache.phoenix.jdbc.PhoenixStatement) 2
PhoenixInputFormat (org.apache.phoenix.mapreduce.PhoenixInputFormat) 2