Use of org.finra.herd.model.dto.HivePartitionDto in project herd by FINRAOS.
In the class Hive13DdlGenerator, the method processStorageUnitsForGenerateDdl:
/**
 * Adds the relative "alter table add partition" statements for each storage unit entity. Please note that each request partition value might result in
 * multiple available storage unit entities (sub-partitions).
 *
 * @param generateDdlRequest the generate DDL request
 * @param sb the string builder to be updated with the "alter table add partition" statements
 * @param replacements the hash map of string values to be used to substitute the custom DDL tokens with their actual values
 * @param businessObjectFormatEntity the business object format entity
 * @param businessObjectFormat the business object format
 * @param ifNotExistsOption specifies whether the generated DDL contains the "if not exists" option
 * @param storageUnitEntities the list of storage unit entities
 */
private void processStorageUnitsForGenerateDdl(GenerateDdlRequest generateDdlRequest, StringBuilder sb, HashMap<String, String> replacements,
    BusinessObjectFormatEntity businessObjectFormatEntity, BusinessObjectFormat businessObjectFormat, String ifNotExistsOption,
    List<StorageUnitEntity> storageUnitEntities)
{
    // If the flag is not set to suppress the scan for unregistered sub-partitions, retrieve all storage
    // file paths for the relative storage units loaded in a multi-valued map for easy access.
    MultiValuedMap<Integer, String> storageUnitIdToStorageFilePathsMap =
        BooleanUtils.isTrue(generateDdlRequest.suppressScanForUnregisteredSubPartitions) ? new ArrayListValuedHashMap<>() :
            storageFileDao.getStorageFilePathsByStorageUnitIds(storageUnitHelper.getStorageUnitIds(storageUnitEntities));

    // Process all available business object data instances.
    for (StorageUnitEntity storageUnitEntity : storageUnitEntities)
    {
        // Get the business object data key and the S3 key prefix for this business object data.
        BusinessObjectDataKey businessObjectDataKey = businessObjectDataHelper.getBusinessObjectDataKey(storageUnitEntity.getBusinessObjectData());
        String s3KeyPrefix = s3KeyPrefixHelper.buildS3KeyPrefix(storageUnitEntity.getStorage(),
            storageUnitEntity.getBusinessObjectData().getBusinessObjectFormat(), businessObjectDataKey);

        // If the flag is set to suppress the scan for unregistered sub-partitions, use the directory path or the S3 key prefix
        // as the partition's location; otherwise, use storage files to discover all unregistered sub-partitions.
        Collection<String> storageFilePaths = new ArrayList<>();
        if (BooleanUtils.isTrue(generateDdlRequest.suppressScanForUnregisteredSubPartitions))
        {
            // Validate the directory path value if it is present.
            if (storageUnitEntity.getDirectoryPath() != null)
            {
                Assert.isTrue(storageUnitEntity.getDirectoryPath().equals(s3KeyPrefix), String.format(
                    "Storage directory path \"%s\" registered with business object data {%s} in \"%s\" storage does not match the expected " +
                        "S3 key prefix \"%s\".", storageUnitEntity.getDirectoryPath(),
                    businessObjectDataHelper.businessObjectDataEntityAltKeyToString(storageUnitEntity.getBusinessObjectData()),
                    storageUnitEntity.getStorage().getName(), s3KeyPrefix));
            }

            // Add the S3 key prefix to the list of storage files.
            // We add a trailing '/' character to the prefix, since it represents a directory.
            storageFilePaths.add(StringUtils.appendIfMissing(s3KeyPrefix, "/"));
        }
        else
        {
            // Retrieve storage file paths registered with this business object data in the specified storage.
            storageFilePaths = storageUnitIdToStorageFilePathsMap.containsKey(storageUnitEntity.getId()) ?
                storageUnitIdToStorageFilePathsMap.get(storageUnitEntity.getId()) : new ArrayList<>();

            // Validate storage file paths registered with this business object data in the specified storage.
            // The validation check below is required even if we have no storage files registered.
            storageFileHelper.validateStorageFilePaths(storageFilePaths, s3KeyPrefix, storageUnitEntity.getBusinessObjectData(),
                storageUnitEntity.getStorage().getName());

            // If there are no storage files registered for this storage unit, we should use the storage directory path value.
            if (storageFilePaths.isEmpty())
            {
                // Validate that the directory path value is present and that it matches the S3 key prefix.
                Assert.isTrue(storageUnitEntity.getDirectoryPath() != null && storageUnitEntity.getDirectoryPath().startsWith(s3KeyPrefix),
                    String.format("Storage directory path \"%s\" registered with business object data {%s} in \"%s\" storage does not match " +
                            "the expected S3 key prefix \"%s\".", storageUnitEntity.getDirectoryPath(),
                        businessObjectDataHelper.businessObjectDataEntityAltKeyToString(storageUnitEntity.getBusinessObjectData()),
                        storageUnitEntity.getStorage().getName(), s3KeyPrefix));

                // Add the storage directory path to the empty storage files list.
                // We add a trailing '/' character to the path, since it represents a directory.
                storageFilePaths.add(storageUnitEntity.getDirectoryPath() + "/");
            }
        }

        // Retrieve the S3 bucket name.
        String s3BucketName = getS3BucketName(storageUnitEntity.getStorage(), generateDdlRequest.s3BucketNames);

        // For a partitioned table, add the relative partitions to the generated DDL.
        if (generateDdlRequest.isPartitioned)
        {
            // If the flag is set to suppress the scan for unregistered sub-partitions, validate that the number of primary and sub-partition values
            // specified for the business object data equals the number of partition columns defined in schema for the format selected for DDL generation.
            if (BooleanUtils.isTrue(generateDdlRequest.suppressScanForUnregisteredSubPartitions))
            {
                int businessObjectDataRegisteredPartitions = 1 + CollectionUtils.size(businessObjectDataKey.getSubPartitionValues());
                Assert.isTrue(businessObjectFormat.getSchema().getPartitions().size() == businessObjectDataRegisteredPartitions, String.format(
                    "Number of primary and sub-partition values (%d) specified for the business object data is not equal to " +
                        "the number of partition columns (%d) defined in the schema of the business object format selected for DDL generation. " +
                        "Business object data: {%s}, business object format: {%s}", businessObjectDataRegisteredPartitions,
                    businessObjectFormat.getSchema().getPartitions().size(), businessObjectDataHelper.businessObjectDataKeyToString(businessObjectDataKey),
                    businessObjectFormatHelper.businessObjectFormatEntityAltKeyToString(businessObjectFormatEntity)));
            }
            // Otherwise, since the format version selected for DDL generation might not match the relative business object format version that business
            // object data is registered against, validate that the number of sub-partition values specified for the business object data is less than
            // the number of partition columns defined in schema for the format selected for DDL generation.
            else
            {
                Assert.isTrue(businessObjectFormat.getSchema().getPartitions().size() > CollectionUtils.size(businessObjectDataKey.getSubPartitionValues()),
                    String.format("Number of subpartition values specified for the business object data is greater than or equal to " +
                        "the number of partition columns defined in the schema of the business object format selected for DDL generation. " +
                        "Business object data: {%s}, business object format: {%s}",
                        businessObjectDataHelper.businessObjectDataKeyToString(businessObjectDataKey),
                        businessObjectFormatHelper.businessObjectFormatEntityAltKeyToString(businessObjectFormatEntity)));
            }

            // Get partition information. For multiple level partitioning, auto-discover sub-partitions (sub-directories) not already included into the
            // S3 key prefix. Each discovered partition requires a standalone "add partition" clause. Please note that, due to the above validation check,
            // there should be no auto-discoverable sub-partition columns when the flag is set to suppress the scan for unregistered sub-partitions.
            List<SchemaColumn> autoDiscoverableSubPartitionColumns = businessObjectFormat.getSchema().getPartitions()
                .subList(1 + CollectionUtils.size(businessObjectDataKey.getSubPartitionValues()), businessObjectFormat.getSchema().getPartitions().size());

            for (HivePartitionDto hivePartition : getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, s3KeyPrefix,
                storageFilePaths, storageUnitEntity.getBusinessObjectData(), storageUnitEntity.getStorage().getName()))
            {
                sb.append(String.format("ALTER TABLE `%s` ADD %sPARTITION (", generateDdlRequest.tableName, ifNotExistsOption));

                // Specify all partition column values.
                List<String> partitionKeyValuePairs = new ArrayList<>();
                for (int i = 0; i < businessObjectFormat.getSchema().getPartitions().size(); i++)
                {
                    String partitionColumnName = businessObjectFormat.getSchema().getPartitions().get(i).getName();
                    String partitionValue = hivePartition.getPartitionValues().get(i);
                    partitionKeyValuePairs.add(String.format("`%s`='%s'", partitionColumnName, partitionValue));
                }
                sb.append(StringUtils.join(partitionKeyValuePairs, ", "));
                sb.append(String.format(") LOCATION 's3n://%s/%s%s';\n", s3BucketName, s3KeyPrefix,
                    StringUtils.isNotBlank(hivePartition.getPath()) ? hivePartition.getPath() : ""));
            }
        }
        // This is a non-partitioned table.
        else
        {
            // Get the location for this non-partitioned table.
            String tableLocation = String.format("s3n://%s/%s", s3BucketName, s3KeyPrefix);

            if (generateDdlRequest.customDdlEntity == null)
            {
                // Since custom DDL was not specified and this table is not partitioned, add a LOCATION clause.
                // This is the last line in the non-partitioned table DDL.
                sb.append(String.format("LOCATION '%s';", tableLocation));
            }
            else
            {
                // Since custom DDL was used for a non-partitioned table, substitute the relative custom DDL token with the actual table location.
                replacements.put(NON_PARTITIONED_TABLE_LOCATION_CUSTOM_DDL_TOKEN, tableLocation);
            }
        }
    }
}
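To make the emitted DDL concrete: for a hypothetical table named `example_table` with partition columns `business_date` and `region` (illustrative names, not taken from the source), each Hive partition returned by getHivePartitions appends a clause of the form:

ALTER TABLE `example_table` ADD IF NOT EXISTS PARTITION (`business_date`='2017-08-01', `region`='us-east') LOCATION 's3n://example-bucket/example-prefix/region=us-east';

The location is always the S3 bucket name, followed by the S3 key prefix, followed by the discovered sub-partition path, and the "IF NOT EXISTS " text appears only when the ifNotExistsOption argument carries it.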
Use of org.finra.herd.model.dto.HivePartitionDto in project herd by FINRAOS.
In the class Hive13DdlGeneratorTest, the method testGetHivePartitions:
@Test
public void testGetHivePartitions()
{
    // Create a test business object data entity.
    BusinessObjectDataEntity businessObjectDataEntity = businessObjectDataDaoTestHelper
        .createBusinessObjectDataEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, PARTITION_VALUE, DATA_VERSION, true,
            BDATA_STATUS);

    List<SchemaColumn> autoDiscoverableSubPartitionColumns;
    List<String> storageFilePaths;
    List<HivePartitionDto> expectedHivePartitions;
    List<HivePartitionDto> resultHivePartitions;

    // Get the business object data key.
    BusinessObjectDataKey businessObjectDataKey = businessObjectDataHelper.getBusinessObjectDataKey(businessObjectDataEntity);

    // No storage files.
    autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
    storageFilePaths = new ArrayList<>();
    expectedHivePartitions = new ArrayList<>();
    resultHivePartitions = hive13DdlGenerator
        .getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity,
            STORAGE_NAME);
    assertEquals(expectedHivePartitions, resultHivePartitions);

    // Single level partitioning.
    autoDiscoverableSubPartitionColumns = new ArrayList<>();
    storageFilePaths = getStorageFilePaths(Arrays.asList("/file1.dat", "/file2.dat"));
    expectedHivePartitions = Arrays.asList(HivePartitionDto.builder().withPath("").withPartitionValues(Arrays.asList(PARTITION_VALUE)).build());
    resultHivePartitions = hive13DdlGenerator
        .getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity,
            STORAGE_NAME);
    assertEquals(expectedHivePartitions, resultHivePartitions);

    // Test that we match column names in storage file paths ignoring the case.
    autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
    storageFilePaths = getStorageFilePaths(Arrays.asList("/COLUMN1=111/COLUMN2=222/file.dat", "/column1=aa/column2=bb/"));
    expectedHivePartitions = Arrays.asList(
        HivePartitionDto.builder().withPath("/COLUMN1=111/COLUMN2=222").withPartitionValues(Arrays.asList(PARTITION_VALUE, "111", "222")).build(),
        HivePartitionDto.builder().withPath("/column1=aa/column2=bb").withPartitionValues(Arrays.asList(PARTITION_VALUE, "aa", "bb")).build());
    resultHivePartitions = hive13DdlGenerator
        .getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity,
            STORAGE_NAME);
    assertEquals(expectedHivePartitions, resultHivePartitions);
}
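The getPartitionColumns and getStorageFilePaths helpers used by this test are not shown on this page. A minimal sketch of what they plausibly look like, assuming a SchemaColumn only needs its name populated and each storage file path is simply TEST_S3_KEY_PREFIX plus the relative path (both are assumptions for illustration, not necessarily the actual herd test helpers):

// Hypothetical sketch: builds SchemaColumn instances from a list of column names.
private List<SchemaColumn> getPartitionColumns(List<String> columnNames)
{
    List<SchemaColumn> partitionColumns = new ArrayList<>();
    for (String columnName : columnNames)
    {
        SchemaColumn partitionColumn = new SchemaColumn();
        partitionColumn.setName(columnName);
        partitionColumns.add(partitionColumn);
    }
    return partitionColumns;
}

// Hypothetical sketch: prepends the test S3 key prefix to each relative file path.
private List<String> getStorageFilePaths(List<String> relativeFilePaths)
{
    List<String> storageFilePaths = new ArrayList<>();
    for (String relativeFilePath : relativeFilePaths)
    {
        storageFilePaths.add(TEST_S3_KEY_PREFIX + relativeFilePath);
    }
    return storageFilePaths;
}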
Use of org.finra.herd.model.dto.HivePartitionDto in project herd by FINRAOS.
In the class Hive13DdlGenerator, the method getHivePartitions:
/**
* Gets a list of Hive partitions. For single level partitioning, no auto-discovery of sub-partitions (sub-directories) is needed - the business object data
* will be represented by a single Hive partition instance. For multiple level partitioning, this method performs an auto-discovery of all sub-partitions
* (sub-directories) and creates a Hive partition object instance for each partition.
*
* @param businessObjectDataKey the business object data key.
* @param autoDiscoverableSubPartitionColumns the auto-discoverable sub-partition columns.
* @param s3KeyPrefix the S3 key prefix.
* @param storageFiles the storage files.
* @param businessObjectDataEntity the business object data entity.
* @param storageName the storage name.
*
* @return the list of Hive partitions
*/
public List<HivePartitionDto> getHivePartitions(BusinessObjectDataKey businessObjectDataKey, List<SchemaColumn> autoDiscoverableSubPartitionColumns,
    String s3KeyPrefix, Collection<String> storageFiles, BusinessObjectDataEntity businessObjectDataEntity, String storageName)
{
    // We are using a linked hash map to preserve the order of the discovered partitions.
    LinkedHashMap<List<String>, HivePartitionDto> linkedHashMap = new LinkedHashMap<>();

    Pattern pattern = getHivePathPattern(autoDiscoverableSubPartitionColumns);
    for (String storageFile : storageFiles)
    {
        // Remove the S3 key prefix from the file path. Please note that the storage files are already validated to start with the S3 key prefix.
        String relativeFilePath = storageFile.substring(s3KeyPrefix.length());

        // Try to match the relative file path to the expected sub-partition folders.
        Matcher matcher = pattern.matcher(relativeFilePath);
        Assert.isTrue(matcher.matches(), String.format("Registered storage file or directory does not match the expected Hive sub-directory pattern. " +
            "Storage: {%s}, file/directory: {%s}, business object data: {%s}, S3 key prefix: {%s}, pattern: {^%s$}", storageName, storageFile,
            businessObjectDataHelper.businessObjectDataEntityAltKeyToString(businessObjectDataEntity), s3KeyPrefix, pattern.pattern()));

        // Create a list of partition values.
        List<String> partitionValues = new ArrayList<>();

        // Add the partition values from the business object data key.
        partitionValues.add(businessObjectDataKey.getPartitionValue());
        partitionValues.addAll(businessObjectDataKey.getSubPartitionValues());

        // Extract the relative partition values.
        for (int i = 1; i <= matcher.groupCount(); i++)
        {
            partitionValues.add(matcher.group(i));
        }

        // Get the path for this partition by removing the trailing "/" plus an optional file name from the relative file path.
        String partitionPath = relativeFilePath.replaceAll("/[^/]*$", "");

        // Check if we have already discovered this partition - that would happen if the partition contains multiple data files.
        HivePartitionDto hivePartition = linkedHashMap.get(partitionValues);
        if (hivePartition != null)
        {
            // The partition is already discovered, so just validate that the relative paths match.
            Assert.isTrue(hivePartition.getPath().equals(partitionPath), String.format(
                "Found two different locations for the same Hive partition. Storage: {%s}, business object data: {%s}, " +
                    "S3 key prefix: {%s}, path[1]: {%s}, path[2]: {%s}", storageName,
                businessObjectDataHelper.businessObjectDataEntityAltKeyToString(businessObjectDataEntity), s3KeyPrefix, hivePartition.getPath(),
                partitionPath));
        }
        else
        {
            // Add this partition to the hash map of discovered partitions.
            linkedHashMap.put(partitionValues, new HivePartitionDto(partitionPath, partitionValues));
        }
    }

    List<HivePartitionDto> hivePartitions = new ArrayList<>();
    hivePartitions.addAll(linkedHashMap.values());
    return hivePartitions;
}
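getHivePathPattern is not shown on this page, but the matching logic above can be illustrated with a hand-written pattern. A minimal, hypothetical sketch for two sub-partition columns, assuming one capturing group per "<column>=<value>" folder plus an optional trailing file name (an illustration only, not the actual herd pattern):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HivePathPatternSketch
{
    public static void main(String[] args)
    {
        // Hypothetical stand-in for getHivePathPattern(columns): one group per sub-partition
        // column, each matching a "<column>=<value>" folder, followed by an optional file name.
        Pattern pattern = Pattern.compile("^/column1=([^/]+)/column2=([^/]+)(?:/[^/]*)?$", Pattern.CASE_INSENSITIVE);

        // "/COLUMN1=111/COLUMN2=222/file.dat" from the test above: the groups yield "111" and "222",
        // and stripping the trailing "/file.dat" leaves the partition path "/COLUMN1=111/COLUMN2=222".
        Matcher matcher = pattern.matcher("/COLUMN1=111/COLUMN2=222/file.dat");
        if (matcher.matches())
        {
            System.out.println(matcher.group(1)); // prints "111"
            System.out.println(matcher.group(2)); // prints "222"
        }
    }
}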