use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.
the class DataLakeGen2RecordHandlerTest method buildSplitSqlNew.
@Test
public void buildSplitSqlNew() throws SQLException {
    TableName tableName = new TableName("testSchema", "testTable");
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    schemaBuilder.addField(FieldBuilder.newBuilder("testCol1", Types.MinorType.INT.getType()).build());
    schemaBuilder.addField(FieldBuilder.newBuilder("testCol2", Types.MinorType.DATEDAY.getType()).build());
    schemaBuilder.addField(FieldBuilder.newBuilder("testCol3", Types.MinorType.DATEMILLI.getType()).build());
    schemaBuilder.addField(FieldBuilder.newBuilder("testCol4", Types.MinorType.VARCHAR.getType()).build());
    Schema schema = schemaBuilder.build();
    Split split = Mockito.mock(Split.class);
    Mockito.when(split.getProperty(DataLakeGen2MetadataHandler.PARTITION_NUMBER)).thenReturn("0");
    ValueSet valueSet = getSingleValueSet("varcharTest");
    Constraints constraints = Mockito.mock(Constraints.class);
    Mockito.when(constraints.getSummary()).thenReturn(new ImmutableMap.Builder<String, ValueSet>().put("testCol4", valueSet).build());
    String expectedSql = "SELECT `testCol1`, `testCol2`, `testCol3`, `testCol4` FROM `testSchema`.`testTable` WHERE (`testCol4` = ?)";
    PreparedStatement expectedPreparedStatement = Mockito.mock(PreparedStatement.class);
    Mockito.when(this.connection.prepareStatement(Mockito.eq(expectedSql))).thenReturn(expectedPreparedStatement);
    PreparedStatement preparedStatement = this.dataLakeGen2RecordHandler.buildSplitSql(this.connection, "testCatalogName", tableName, schema, constraints, split);
    Assert.assertEquals(expectedPreparedStatement, preparedStatement);
    Mockito.verify(preparedStatement, Mockito.times(1)).setString(1, "varcharTest");
}
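The getSingleValueSet helper referenced above is not shown in this excerpt. A minimal sketch of such a helper, assuming the Mockito deep-stub pattern used by the project's other JDBC record handler tests and imports for Range, SortedRangeSet, and java.util.Collections; treat it as illustrative rather than the test's actual code.

private ValueSet getSingleValueSet(Object value) {
    // Stub a single-value Range whose low bound carries the literal value
    Range range = Mockito.mock(Range.class, Mockito.RETURNS_DEEP_STUBS);
    Mockito.when(range.isSingleValue()).thenReturn(true);
    Mockito.when(range.getLow().getValue()).thenReturn(value);
    // Wrap it in a SortedRangeSet so buildSplitSql sees exactly one ordered range
    ValueSet valueSet = Mockito.mock(SortedRangeSet.class, Mockito.RETURNS_DEEP_STUBS);
    Mockito.when(valueSet.getRanges().getOrderedRanges()).thenReturn(Collections.singletonList(range));
    return valueSet;
}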
use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.
the class ExampleMetadataHandler method doGetTable.
/**
 * Used to get the definition (field names, types, descriptions, etc...) of a Table.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details on who made the request and which Athena catalog, database, and table they are querying.
 * @return A GetTableResponse which primarily contains:
 * 1. An Apache Arrow Schema object describing the table's columns, types, and descriptions.
 * 2. A Set<String> of partition column names (or empty if the table isn't partitioned).
 * 3. A TableName object confirming the schema and table name the response is for.
 * 4. A catalog name corresponding to the Athena catalog that was queried.
 */
@Override
public GetTableResponse doGetTable(BlockAllocator allocator, GetTableRequest request) {
    logger.info("doGetTable: enter - " + request);
    Set<String> partitionColNames = new HashSet<>();
    /**
     * TODO: Add partition columns, example below.
     *
     * partitionColNames.add("year");
     * partitionColNames.add("month");
     * partitionColNames.add("day");
     */
    SchemaBuilder tableSchemaBuilder = SchemaBuilder.newBuilder();
    return new GetTableResponse(request.getCatalogName(), request.getTableName(), tableSchemaBuilder.build(), partitionColNames);
}
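As the TODO suggests, a real connector would register its columns on the builder before calling build(). A short sketch using the SDK's fluent convenience methods; the column names here are hypothetical placeholders, not part of the example connector.

// Hypothetical columns for illustration; a real connector derives these from its source.
tableSchemaBuilder.addIntField("year")
        .addIntField("month")
        .addIntField("day")
        .addStringField("account_id")
        // Optional per-column description surfaced through schema metadata
        .addMetadata("account_id", "The account that owns the row.");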
use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.
the class DDBTableUtils method peekTableForSchema.
/**
 * Derives an Arrow {@link Schema} for the given table by performing a small table scan and mapping the returned
 * attribute values' types to Arrow types. If the table is empty, only attributes found in the table's metadata
 * are added to the returned schema.
 *
 * @param tableName the table to derive a schema for
 * @param invoker the ThrottlingInvoker to call DDB with
 * @param ddbClient the DDB client to use
 * @return the table's derived schema
 */
public static Schema peekTableForSchema(String tableName, ThrottlingInvoker invoker, AmazonDynamoDB ddbClient) throws TimeoutException {
    ScanRequest scanRequest = new ScanRequest().withTableName(tableName).withLimit(SCHEMA_INFERENCE_NUM_RECORDS);
    ScanResult scanResult = invoker.invoke(() -> ddbClient.scan(scanRequest));
    List<Map<String, AttributeValue>> items = scanResult.getItems();
    Set<String> discoveredColumns = new HashSet<>();
    SchemaBuilder schemaBuilder = new SchemaBuilder();
    if (!items.isEmpty()) {
        for (Map<String, AttributeValue> item : items) {
            for (Map.Entry<String, AttributeValue> column : item.entrySet()) {
                if (!discoveredColumns.contains(column.getKey())) {
                    Field field = DDBTypeUtils.inferArrowField(column.getKey(), ItemUtils.toSimpleValue(column.getValue()));
                    if (field != null) {
                        schemaBuilder.addField(field);
                        discoveredColumns.add(column.getKey());
                    }
                }
            }
        }
    } else {
        // There are no items, so use any attributes defined in the table metadata
        DynamoDBTable table = getTable(tableName, invoker, ddbClient);
        for (AttributeDefinition attributeDefinition : table.getKnownAttributeDefinitions()) {
            schemaBuilder.addField(DDBTypeUtils.getArrowFieldFromDDBType(attributeDefinition.getAttributeName(), attributeDefinition.getAttributeType()));
        }
    }
    return schemaBuilder.build();
}
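A brief usage sketch for peekTableForSchema. The ThrottlingInvoker construction below is an assumption: it presumes newDefaultBuilder accepts an exception filter that flags DynamoDB throttling errors, and "my_table" is a placeholder table name.

// Back off and retry when DynamoDB signals throttling; the filter predicate is an assumption.
ThrottlingInvoker invoker = ThrottlingInvoker.newDefaultBuilder(
        (Exception ex) -> ex instanceof ProvisionedThroughputExceededException).build();
AmazonDynamoDB ddbClient = AmazonDynamoDBClientBuilder.defaultClient();
// A slow scan surfaces as the TimeoutException declared by peekTableForSchema.
Schema schema = DDBTableUtils.peekTableForSchema("my_table", invoker, ddbClient);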
use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.
the class GlueMetadataHandler method doGetTable.
/**
 * Attempts to retrieve a Table (columns and properties) from AWS Glue for the requested schema (aka database) and table
 * name with no filtering.
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details on who made the request and which Athena catalog, database, and table they are querying.
 * @param filter The TableFilter to apply to any matching table before generating the result.
 * @return A GetTableResponse mostly containing the columns, their types, and any table properties for the requested table.
 * @note This method throws a RuntimeException if no table matching the requested criteria (and filter) is found.
 */
protected GetTableResponse doGetTable(BlockAllocator blockAllocator, GetTableRequest request, TableFilter filter) throws Exception {
    TableName tableName = request.getTableName();
    com.amazonaws.services.glue.model.GetTableRequest getTableRequest = new com.amazonaws.services.glue.model.GetTableRequest();
    getTableRequest.setCatalogId(getCatalog(request));
    getTableRequest.setDatabaseName(tableName.getSchemaName());
    getTableRequest.setName(tableName.getTableName());
    GetTableResult result = awsGlue.getTable(getTableRequest);
    Table table = result.getTable();
    if (filter != null && !filter.filter(table)) {
        throw new RuntimeException("No matching table found " + request.getTableName());
    }
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    if (table.getParameters() != null) {
        table.getParameters().entrySet().forEach(next -> schemaBuilder.addMetadata(next.getKey(), next.getValue()));
    }
    // A column name mapping can be provided to get around restrictive Glue naming rules
    Map<String, String> columnNameMapping = getColumnNameMapping(table);
    Map<String, String> dateTimeFormatMapping = getDateTimeFormatMapping(table);
    Map<String, String> datetimeFormatMappingWithColumnName = new HashMap<>();
    Set<String> partitionCols = new HashSet<>();
    if (table.getPartitionKeys() != null) {
        partitionCols = table.getPartitionKeys().stream().map(next -> columnNameMapping.getOrDefault(next.getName(), next.getName())).collect(Collectors.toSet());
    }
    for (Column next : table.getStorageDescriptor().getColumns()) {
        String rawColumnName = next.getName();
        String mappedColumnName = columnNameMapping.getOrDefault(rawColumnName, rawColumnName);
        // Apply any type override provided in typeOverrideMapping from metadata;
        // this is currently only used for timestamp-with-timezone support.
        logger.info("Column {} with registered type {}", rawColumnName, next.getType());
        schemaBuilder.addField(convertField(mappedColumnName, next.getType()));
        // Add non-null, non-empty comments to metadata
        if (next.getComment() != null && !next.getComment().trim().isEmpty()) {
            schemaBuilder.addMetadata(mappedColumnName, next.getComment());
        }
        if (dateTimeFormatMapping.containsKey(rawColumnName)) {
            datetimeFormatMappingWithColumnName.put(mappedColumnName, dateTimeFormatMapping.get(rawColumnName));
        }
    }
    populateDatetimeFormatMappingIfAvailable(schemaBuilder, datetimeFormatMappingWithColumnName);
    populateSourceTableNameIfAvailable(table, schemaBuilder);
    if (table.getViewOriginalText() != null && !table.getViewOriginalText().isEmpty()) {
        schemaBuilder.addMetadata(VIEW_METADATA_FIELD, table.getViewOriginalText());
    }
    return new GetTableResponse(request.getCatalogName(), request.getTableName(), schemaBuilder.build(), partitionCols);
}
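Because TableFilter has a single method, a GlueMetadataHandler subclass can supply one as a lambda. A minimal sketch of a filter that only admits Glue tables tagged for this connector; the "classification" parameter name and "myconnector" value are hypothetical, not mandated by the SDK.

// Only admit tables explicitly tagged for this connector; the tag is illustrative.
TableFilter myConnectorFilter = (Table table) ->
        table.getParameters() != null
                && "myconnector".equals(table.getParameters().get("classification"));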
use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.
the class HbaseSchemaUtils method scanAndInferSchema.
/**
 * This helper method is used in conjunction with the scan facility provided by HBase to infer a table's
 * schema from a sample of its rows.
 *
 * @param scanner The HBase ResultScanner to read results from while inferring schema.
 * @return An Apache Arrow Schema representing the schema of the HBase table.
 * @note The resulting schema is a union of the schema of every row that is scanned. Any time two rows
 * have a field with the same name but different inferred types, the code defaults the type of
 * that field in the resulting schema to a VARCHAR. This approach is not perfect and can struggle
 * to produce a usable schema if the table has a significant mix of entities.
 */
private static Schema scanAndInferSchema(ResultScanner scanner) {
    Map<String, Map<String, ArrowType>> schemaInference = new HashMap<>();
    int rowCount = 0;
    int fieldCount = 0;
    for (Result result : scanner) {
        rowCount++;
        for (KeyValue keyValue : result.list()) {
            fieldCount++;
            String family = new String(keyValue.getFamily());
            String column = new String(keyValue.getQualifier());
            Map<String, ArrowType> schemaForFamily = schemaInference.get(family);
            if (schemaForFamily == null) {
                schemaForFamily = new HashMap<>();
                schemaInference.put(family, schemaForFamily);
            }
            // Get the previously inferred type for this column if we've seen it on a past row
            ArrowType prevInferredType = schemaForFamily.get(column);
            // Infer the type of the column from the value on the current row
            Types.MinorType inferredType = inferType(keyValue.getValue());
            // Check whether the previously and currently inferred types match
            if (prevInferredType != null && Types.getMinorTypeForArrowType(prevInferredType) != inferredType) {
                logger.info("inferSchema: Type change detected for field, using VARCHAR - family: {} col: {} previousType: {} newType: {}", family, column, prevInferredType, inferredType);
                schemaForFamily.put(column, Types.MinorType.VARCHAR.getType());
            } else {
                schemaForFamily.put(column, inferredType.getType());
            }
            logger.info("inferSchema: family: {} col: {} inferredType: {}", family, column, inferredType);
        }
    }
    logger.info("inferSchema: Evaluated {} field values across {} rows.", fieldCount, rowCount);
    // Use the union of all rows to produce the resulting Apache Arrow Schema.
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    for (Map.Entry<String, Map<String, ArrowType>> nextFamily : schemaInference.entrySet()) {
        String family = nextFamily.getKey();
        for (Map.Entry<String, ArrowType> nextCol : nextFamily.getValue().entrySet()) {
            schemaBuilder.addField(family + NAMESPACE_QUALIFIER + nextCol.getKey(), nextCol.getValue());
        }
    }
    Schema schema = schemaBuilder.build();
    if (schema.getFields().isEmpty()) {
        throw new RuntimeException("No columns found after scanning " + fieldCount + " values across " + rowCount + " rows. Please ensure the table is not empty and contains at least 1 supported column type.");
    }
    return schema;
}
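The inferType helper is not shown in this excerpt. One plausible implementation consistent with the VARCHAR fallback above, assuming HBase's Bytes utility and a parse order of BIGINT, then FLOAT8, then VARCHAR; the exact heuristics are an assumption, not the connector's confirmed code.

private static Types.MinorType inferType(byte[] value) {
    String strVal = Bytes.toString(value);
    try {
        // Values that parse as whole numbers are treated as BIGINT
        Long.valueOf(strVal);
        return Types.MinorType.BIGINT;
    } catch (NumberFormatException ex) {
        // not an integer; try the next type
    }
    try {
        // Values that parse as decimals are treated as FLOAT8
        Double.valueOf(strVal);
        return Types.MinorType.FLOAT8;
    } catch (NumberFormatException ex) {
        // not numeric at all
    }
    // Everything else falls back to a UTF-8 string
    return Types.MinorType.VARCHAR;
}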