Search in sources :

Example 1 with Partition

use of com.amazonaws.services.glue.model.Partition in project alluxio by Alluxio.

In the class GlueDatabase, the method batchGetPartitions.

/**
 * Fetches all partitions of a Glue table by following the pagination token returned by
 * each GetPartitions call until no further token is returned.
 *
 * @param glueClient the Glue client used to issue the GetPartitions requests
 * @param tableName the name of the table, looked up in the database {@code mGlueDbName}
 * @return the partitions accumulated from every page, in the order they were returned
 * @throws IOException if a request fails with an {@link AWSGlueException}; the original
 *         exception is attached as the cause
 */
private List<Partition> batchGetPartitions(AWSGlueAsync glueClient, String tableName) throws IOException {
    // TODO(shouwei): make getPartition multi-thread to accelerate the large table fetching
    // Both settings are loop-invariant, so read them once rather than on every page request.
    final String catalogId = mGlueConfiguration.get(Property.CATALOG_ID);
    final int maxResults = mGlueConfiguration.getInt(Property.MAX_GLUE_FETCH_PARTITIONS);
    List<Partition> partitions = new ArrayList<>();
    String nextToken = null;
    try {
        do {
            // A null token on the first iteration requests the first page.
            GetPartitionsRequest getPartitionsRequest = new GetPartitionsRequest()
                    .withCatalogId(catalogId)
                    .withDatabaseName(mGlueDbName)
                    .withTableName(tableName)
                    .withMaxResults(maxResults)
                    .withNextToken(nextToken);
            GetPartitionsResult getPartitionsResult = glueClient.getPartitions(getPartitionsRequest);
            partitions.addAll(getPartitionsResult.getPartitions());
            nextToken = getPartitionsResult.getNextToken();
            LOG.debug("Glue table {}.{} adding {} batch partitions with total {} partitions.", mGlueDbName, tableName, getPartitionsResult.getPartitions().size(), partitions.size());
        } while (nextToken != null);
        // The list is always non-null here, so no null guard is needed before reporting.
        LOG.info("Glue table {}.{} has {} partitions.", mGlueDbName, tableName, partitions.size());
        if (LOG.isDebugEnabled()) {
            for (Partition partition : partitions) {
                LOG.debug("Glue table {}.{} with partition: {}.", partition.getDatabaseName(), tableName, partition);
            }
        }
        return partitions;
    } catch (AWSGlueException e) {
        throw new IOException("Cannot get partition information for table: " + tableName + " in Database: " + mGlueDbName + "; Catalog ID: " + catalogId + ". error: " + e.getMessage(), e);
    }
}
Also used : UdbPartition(alluxio.table.common.UdbPartition) Partition(com.amazonaws.services.glue.model.Partition) GetPartitionsResult(com.amazonaws.services.glue.model.GetPartitionsResult) AWSGlueException(com.amazonaws.services.glue.model.AWSGlueException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) GetPartitionsRequest(com.amazonaws.services.glue.model.GetPartitionsRequest)

Example 2 with Partition

use of com.amazonaws.services.glue.model.Partition in project alluxio by Alluxio.

In the class GlueDatabase, the method mountAlluxioPaths.

/**
 * Builds the path translator for a Glue table, mounting the table location and the
 * locations of its partitions into Alluxio unless they are bypassed.
 *
 * <p>If the whole table is bypassed, the table location is mapped to itself and nothing is
 * mounted. Otherwise the table location is mounted, and every partition whose storage
 * location lies under the table location is either mapped to itself (when bypassed) or
 * mounted under the table's Alluxio location. Partitions without a storage descriptor or
 * location, or located outside the table location, are skipped.
 *
 * @param table the Glue table whose locations are mounted
 * @param partitions the partitions of the table
 * @param bypassSpec tells which tables and partitions are bypassed
 * @return the translator holding one mapping per handled table/partition location
 * @throws IOException if an {@link AlluxioException} is raised while mounting; the message
 *         names the UFS and Alluxio locations being processed when the failure occurred
 */
@VisibleForTesting
private PathTranslator mountAlluxioPaths(Table table, List<Partition> partitions, UdbBypassSpec bypassSpec) throws IOException {
    String tableName = table.getName();
    // These two are reassigned per partition so the error message below reports the
    // location that was being processed when the failure happened.
    AlluxioURI alluxioUri = mUdbContext.getTableLocation(tableName);
    String glueUfsUri = table.getStorageDescriptor().getLocation();
    try {
        PathTranslator pathTranslator = new PathTranslator();
        if (bypassSpec.hasFullTable(tableName)) {
            pathTranslator.addMapping(glueUfsUri, glueUfsUri);
            return pathTranslator;
        }
        AlluxioURI ufsUri = new AlluxioURI(glueUfsUri);
        pathTranslator.addMapping(UdbUtils.mountAlluxioPath(tableName, ufsUri, alluxioUri, mUdbContext, mGlueConfiguration), glueUfsUri);
        for (Partition partition : partitions) {
            if (partition.getStorageDescriptor() == null) {
                continue;
            }
            String partitionLocation = partition.getStorageDescriptor().getLocation();
            if (partitionLocation == null) {
                continue;
            }
            AlluxioURI partitionUri = new AlluxioURI(partitionLocation);
            if (!ufsUri.isAncestorOf(partitionUri)) {
                continue;
            }
            glueUfsUri = partitionLocation;
            // Fall back to the raw value list as the name if a proper name cannot be built.
            String partitionName = partition.getValues().toString();
            try {
                partitionName = GlueUtils.makePartitionName(table.getPartitionKeys(), partition.getValues());
            } catch (IOException e) {
                // Best effort: keep the fallback name, but record why naming failed.
                LOG.warn("Error making partition name for table {}, partition {} in database {} with CatalogID {}.", tableName, partition.getValues().toString(), mGlueDbName, mGlueConfiguration.get(Property.CATALOG_ID), e);
            }
            if (bypassSpec.hasPartition(tableName, partitionName)) {
                // NOTE(review): this maps the URI path component to itself, whereas the
                // full-table bypass maps the full location string - confirm this is intended.
                pathTranslator.addMapping(partitionUri.getPath(), partitionUri.getPath());
                continue;
            }
            alluxioUri = new AlluxioURI(PathUtils.concatPath(mUdbContext.getTableLocation(tableName).getPath(), partitionName));
            // mount partition path if it is not already mounted as part of the table path mount
            pathTranslator.addMapping(UdbUtils.mountAlluxioPath(tableName, partitionUri, alluxioUri, mUdbContext, mGlueConfiguration), glueUfsUri);
        }
        return pathTranslator;
    } catch (AlluxioException e) {
        throw new IOException("Failed to mount table location. tableName: " + tableName + " glueUfsLocation: " + glueUfsUri + " AlluxioLocation: " + alluxioUri + " error: " + e.getMessage(), e);
    }
}
Also used : UdbPartition(alluxio.table.common.UdbPartition) Partition(com.amazonaws.services.glue.model.Partition) PathTranslator(alluxio.table.common.udb.PathTranslator) IOException(java.io.IOException) AlluxioURI(alluxio.AlluxioURI) AlluxioException(alluxio.exception.AlluxioException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with Partition

use of com.amazonaws.services.glue.model.Partition in project presto by prestodb.

In the class TestGlueToPrestoConverter, the method testPartitionConversionMemoization.

/**
 * Converts two Glue partitions that are equal field by field but share no object
 * references through one {@code GluePartitionConverter}, then checks by reference which
 * parts of the two converted partitions are the same instance and which are distinct.
 */
@Test
public void testPartitionConversionMemoization() {
    String location = "s3://some-fake-location";
    testPartition.getStorageDescriptor().setLocation(location);

    // Build a twin partition whose values are equal to, but not the same objects as, the original's
    Partition twin = getGlueTestPartition(
            testPartition.getDatabaseName(),
            testPartition.getTableName(),
            new ArrayList<>(testPartition.getValues()));

    // Copy every storage field as well, so the storage data is equal without being aliased
    twin.getStorageDescriptor().setColumns(new ArrayList<>(testPartition.getStorageDescriptor().getColumns()));
    twin.getStorageDescriptor().setBucketColumns(new ArrayList<>(testPartition.getStorageDescriptor().getBucketColumns()));
    twin.getStorageDescriptor().setLocation("" + location);
    twin.getStorageDescriptor().setInputFormat("" + testPartition.getStorageDescriptor().getInputFormat());
    twin.getStorageDescriptor().setOutputFormat("" + testPartition.getStorageDescriptor().getOutputFormat());
    twin.getStorageDescriptor().setParameters(new HashMap<>(testPartition.getStorageDescriptor().getParameters()));

    GluePartitionConverter partitionConverter = new GluePartitionConverter(testDb.getName(), testTbl.getName());
    com.facebook.presto.hive.metastore.Partition converted = partitionConverter.apply(testPartition);
    com.facebook.presto.hive.metastore.Partition convertedTwin = partitionConverter.apply(twin);

    // Partition-level checks: distinct results, with these fields being the same instances
    assertNotSame(converted, convertedTwin);
    assertSame(convertedTwin.getDatabaseName(), converted.getDatabaseName());
    assertSame(convertedTwin.getTableName(), converted.getTableName());
    assertSame(convertedTwin.getColumns(), converted.getColumns());
    assertSame(convertedTwin.getParameters(), converted.getParameters());
    assertNotSame(convertedTwin.getValues(), converted.getValues());

    // Storage-level checks: format, bucketing and serde parameters same, location distinct
    Storage convertedStorage = converted.getStorage();
    Storage twinStorage = convertedTwin.getStorage();
    assertSame(twinStorage.getStorageFormat(), convertedStorage.getStorageFormat());
    assertSame(twinStorage.getBucketProperty(), convertedStorage.getBucketProperty());
    assertSame(twinStorage.getSerdeParameters(), convertedStorage.getSerdeParameters());
    assertNotSame(twinStorage.getLocation(), convertedStorage.getLocation());
}
Also used : TestingMetastoreObjects.getGlueTestPartition(com.facebook.presto.hive.metastore.glue.TestingMetastoreObjects.getGlueTestPartition) Partition(com.amazonaws.services.glue.model.Partition) Storage(com.facebook.presto.hive.metastore.Storage) GluePartitionConverter(com.facebook.presto.hive.metastore.glue.converter.GlueToPrestoConverter.GluePartitionConverter) Test(org.testng.annotations.Test)

Example 4 with Partition

use of com.amazonaws.services.glue.model.Partition in project alluxio by Alluxio.

In the class GlueDatabase, the method getTable.

/**
 * Loads a table from Glue and converts it into a {@link UdbTable}.
 *
 * <p>The method fetches the table and all of its partitions, mounts their locations via
 * {@code mountAlluxioPaths}, optionally fetches table-level and partition-level column
 * statistics (each controlled by its own configuration property), and builds one
 * {@link UdbPartition} per Glue partition. A table without partition keys is represented
 * by a single partition named after the table.
 *
 * @param tableName the name of the table in the database {@code mGlueDbName}
 * @param bypassSpec tells which tables and partitions are bypassed when mounting
 * @return the table together with its schema, statistics, partitions and layout
 * @throws IOException if fetching, mounting or converting fails; a
 *         {@code NotFoundException} is thrown when Glue reports that the entity does not exist
 */
@Override
public UdbTable getTable(String tableName, UdbBypassSpec bypassSpec) throws IOException {
    // Read once; used by every request and by every error message below.
    final String catalogId = mGlueConfiguration.get(Property.CATALOG_ID);
    try {
        GetTableRequest tableRequest = new GetTableRequest()
                .withCatalogId(catalogId)
                .withDatabaseName(mGlueDbName)
                .withName(tableName);
        Table table = getClient().getTable(tableRequest).getTable();
        List<Partition> partitions = batchGetPartitions(getClient(), tableName);
        PathTranslator pathTranslator = mountAlluxioPaths(table, partitions, bypassSpec);

        // Normalize absent partition keys / parameters to empty collections.
        List<Column> partitionColumns = table.getPartitionKeys() == null
                ? Collections.emptyList() : table.getPartitionKeys();
        Map<String, String> tableParameters = table.getParameters() == null
                ? Collections.emptyMap() : table.getParameters();

        List<Column> dataColumns = table.getStorageDescriptor().getColumns();
        List<String> columnNames = dataColumns.stream().map(Column::getName).collect(Collectors.toList());

        // Get column statistics info for table
        List<ColumnStatisticsInfo> columnStatisticsTableData = new ArrayList<>();
        if (mGlueConfiguration.getBoolean(Property.TABLE_COLUMN_STATISTICS_ENABLE)) {
            // Only build the request when table statistics are actually enabled.
            GetColumnStatisticsForTableRequest getColumnStatisticsForTableRequest = new GetColumnStatisticsForTableRequest()
                    .withCatalogId(catalogId)
                    .withDatabaseName(mGlueDbName)
                    .withTableName(tableName)
                    .withColumnNames(columnNames);
            columnStatisticsTableData = getTableColumnStatistics(mGlueDbName, tableName, getColumnStatisticsForTableRequest);
        }

        // Get column statistics info for partitions
        // potential expensive call
        Map<String, List<ColumnStatisticsInfo>> statsMap = new HashMap<>();
        if (mGlueConfiguration.getBoolean(Property.PARTITION_COLUMN_STATISTICS_ENABLE)) {
            for (Partition partition : partitions) {
                List<String> partitionValue = partition.getValues();
                if (partitionValue != null) {
                    GetColumnStatisticsForPartitionRequest getColumnStatisticsForPartitionRequest = new GetColumnStatisticsForPartitionRequest()
                            .withCatalogId(catalogId)
                            .withDatabaseName(mGlueDbName)
                            .withTableName(tableName)
                            .withColumnNames(columnNames)
                            .withPartitionValues(partitionValue);
                    String partName = GlueUtils.makePartitionName(partitionColumns, partitionValue);
                    statsMap.put(partName, getPartitionColumnStatistics(mGlueDbName, tableName, getColumnStatisticsForPartitionRequest));
                }
            }
        }

        // Table-level layout
        PartitionInfo partitionInfo = PartitionInfo.newBuilder()
                .setDbName(mGlueDbName)
                .setTableName(tableName)
                .addAllDataCols(GlueUtils.toProto(dataColumns))
                .setStorage(GlueUtils.toProto(table.getStorageDescriptor(), pathTranslator))
                .putAllParameters(tableParameters)
                .build();
        Layout layout = Layout.newBuilder()
                .setLayoutType(HiveLayout.TYPE)
                .setLayoutData(partitionInfo.toByteString())
                .build();

        List<UdbPartition> udbPartitions = new ArrayList<>();
        if (partitionColumns.isEmpty()) {
            // No partition keys: expose the table as a single partition named after the table.
            PartitionInfo.Builder partitionInfoBuilder = PartitionInfo.newBuilder()
                    .setDbName(mGlueDbName)
                    .setTableName(tableName)
                    .addAllDataCols(GlueUtils.toProto(dataColumns))
                    .setStorage(GlueUtils.toProto(table.getStorageDescriptor(), pathTranslator))
                    .setPartitionName(tableName)
                    .putAllParameters(tableParameters);
            udbPartitions.add(new GluePartition(new HiveLayout(partitionInfoBuilder.build(), Collections.emptyList())));
        } else {
            for (Partition partition : partitions) {
                String partName = GlueUtils.makePartitionName(partitionColumns, partition.getValues());
                PartitionInfo.Builder partitionInfoBuilder = PartitionInfo.newBuilder()
                        .setDbName(mGlueDbName)
                        .setTableName(tableName)
                        .addAllDataCols(GlueUtils.toProto(partition.getStorageDescriptor().getColumns()))
                        .setStorage(GlueUtils.toProto(partition.getStorageDescriptor(), pathTranslator))
                        .setPartitionName(partName)
                        .putAllParameters(partition.getParameters() == null ? Collections.emptyMap() : partition.getParameters());
                if (partition.getValues() != null) {
                    partitionInfoBuilder.addAllValues(partition.getValues());
                }
                // Partitions for which no statistics were collected get an empty list.
                udbPartitions.add(new GluePartition(new HiveLayout(partitionInfoBuilder.build(), statsMap.getOrDefault(partName, Collections.emptyList()))));
            }
        }
        return new GlueTable(this, pathTranslator, tableName, GlueUtils.toProtoSchema(dataColumns), columnStatisticsTableData,
                // Get FieldSchema from partition keys; use the normalized (never-null) list
                GlueUtils.toProto(partitionColumns), udbPartitions, layout, table);
    } catch (EntityNotFoundException e) {
        throw new NotFoundException("Table " + tableName + " does not exist in Database: " + mGlueDbName + "; Catalog ID: " + catalogId + ".", e);
    } catch (ValidationException e) {
        throw new IOException("Failed to get table: " + tableName + " in Database: " + mGlueDbName + "; Catalog ID: " + catalogId + " with validation error: " + e.getMessage(), e);
    } catch (GlueEncryptionException e) {
        throw new IOException("Failed to get table: " + tableName + " in Database: " + mGlueDbName + "; Catalog ID: " + catalogId + " error: " + e.getMessage(), e);
    }
}
Also used : HiveLayout(alluxio.table.common.layout.HiveLayout) ValidationException(com.amazonaws.services.glue.model.ValidationException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) EntityNotFoundException(com.amazonaws.services.glue.model.EntityNotFoundException) NotFoundException(alluxio.exception.status.NotFoundException) UdbPartition(alluxio.table.common.UdbPartition) Column(com.amazonaws.services.glue.model.Column) GlueEncryptionException(com.amazonaws.services.glue.model.GlueEncryptionException) List(java.util.List) ArrayList(java.util.ArrayList) PartitionInfo(alluxio.grpc.table.layout.hive.PartitionInfo) UdbPartition(alluxio.table.common.UdbPartition) Partition(com.amazonaws.services.glue.model.Partition) UdbTable(alluxio.table.common.udb.UdbTable) Table(com.amazonaws.services.glue.model.Table) GetColumnStatisticsForTableRequest(com.amazonaws.services.glue.model.GetColumnStatisticsForTableRequest) EntityNotFoundException(com.amazonaws.services.glue.model.EntityNotFoundException) IOException(java.io.IOException) GetTableRequest(com.amazonaws.services.glue.model.GetTableRequest) GetColumnStatisticsForPartitionRequest(com.amazonaws.services.glue.model.GetColumnStatisticsForPartitionRequest) PathTranslator(alluxio.table.common.udb.PathTranslator) Layout(alluxio.grpc.table.Layout) HiveLayout(alluxio.table.common.layout.HiveLayout) ColumnStatisticsInfo(alluxio.grpc.table.ColumnStatisticsInfo)

Aggregations

Partition (com.amazonaws.services.glue.model.Partition)4 UdbPartition (alluxio.table.common.UdbPartition)3 IOException (java.io.IOException)3 PathTranslator (alluxio.table.common.udb.PathTranslator)2 ArrayList (java.util.ArrayList)2 AlluxioURI (alluxio.AlluxioURI)1 AlluxioException (alluxio.exception.AlluxioException)1 NotFoundException (alluxio.exception.status.NotFoundException)1 ColumnStatisticsInfo (alluxio.grpc.table.ColumnStatisticsInfo)1 Layout (alluxio.grpc.table.Layout)1 PartitionInfo (alluxio.grpc.table.layout.hive.PartitionInfo)1 HiveLayout (alluxio.table.common.layout.HiveLayout)1 UdbTable (alluxio.table.common.udb.UdbTable)1 AWSGlueException (com.amazonaws.services.glue.model.AWSGlueException)1 Column (com.amazonaws.services.glue.model.Column)1 EntityNotFoundException (com.amazonaws.services.glue.model.EntityNotFoundException)1 GetColumnStatisticsForPartitionRequest (com.amazonaws.services.glue.model.GetColumnStatisticsForPartitionRequest)1 GetColumnStatisticsForTableRequest (com.amazonaws.services.glue.model.GetColumnStatisticsForTableRequest)1 GetPartitionsRequest (com.amazonaws.services.glue.model.GetPartitionsRequest)1 GetPartitionsResult (com.amazonaws.services.glue.model.GetPartitionsResult)1