Search in sources :

Example 1 with HivePartitionDto

use of org.finra.herd.model.dto.HivePartitionDto in project herd by FINRAOS.

the class Hive13DdlGenerator method processStorageUnitsForGenerateDdl.

/**
 * Adds the relative "alter table add partition" statements for each storage unit entity. Please note that each request partition value might result in
 * multiple available storage unit entities (subpartitions).
 *
 * @param sb the string builder to be updated with the "alter table add partition" statements
 * @param replacements the hash map of string values to be used to substitute the custom DDL tokens with their actual values
 * @param businessObjectFormatEntity the business object format entity
 * @param businessObjectFormat the business object format
 * @param ifNotExistsOption specifies if generated DDL contains "if not exists" option
 * @param storageUnitEntities the list of storage unit entities
 */
private void processStorageUnitsForGenerateDdl(GenerateDdlRequest generateDdlRequest, StringBuilder sb, HashMap<String, String> replacements, BusinessObjectFormatEntity businessObjectFormatEntity, BusinessObjectFormat businessObjectFormat, String ifNotExistsOption, List<StorageUnitEntity> storageUnitEntities) {
    // If flag is not set to suppress scan for unregistered sub-partitions, retrieve all storage
    // file paths for the relative storage units loaded in a multi-valued map for easy access.
    MultiValuedMap<Integer, String> storageUnitIdToStorageFilePathsMap = BooleanUtils.isTrue(generateDdlRequest.suppressScanForUnregisteredSubPartitions) ? new ArrayListValuedHashMap<>() : storageFileDao.getStorageFilePathsByStorageUnitIds(storageUnitHelper.getStorageUnitIds(storageUnitEntities));
    // Process all available business object data instances.
    for (StorageUnitEntity storageUnitEntity : storageUnitEntities) {
        // Get business object data key and S3 key prefix for this business object data.
        BusinessObjectDataKey businessObjectDataKey = businessObjectDataHelper.getBusinessObjectDataKey(storageUnitEntity.getBusinessObjectData());
        String s3KeyPrefix = s3KeyPrefixHelper.buildS3KeyPrefix(storageUnitEntity.getStorage(), storageUnitEntity.getBusinessObjectData().getBusinessObjectFormat(), businessObjectDataKey);
        // If flag is set to suppress scan for unregistered sub-partitions, use the directory path or the S3 key prefix
        // as the partition's location, otherwise, use storage files to discover all unregistered sub-partitions.
        Collection<String> storageFilePaths = new ArrayList<>();
        if (BooleanUtils.isTrue(generateDdlRequest.suppressScanForUnregisteredSubPartitions)) {
            // Validate the directory path value if it is present.
            if (storageUnitEntity.getDirectoryPath() != null) {
                Assert.isTrue(storageUnitEntity.getDirectoryPath().equals(s3KeyPrefix), String.format("Storage directory path \"%s\" registered with business object data {%s} " + "in \"%s\" storage does not match the expected S3 key prefix \"%s\".", storageUnitEntity.getDirectoryPath(), businessObjectDataHelper.businessObjectDataEntityAltKeyToString(storageUnitEntity.getBusinessObjectData()), storageUnitEntity.getStorage().getName(), s3KeyPrefix));
            }
            // Add the S3 key prefix to the list of storage files.
            // We add a trailing '/' character to the prefix, since it represents a directory.
            storageFilePaths.add(StringUtils.appendIfMissing(s3KeyPrefix, "/"));
        } else {
            // Retrieve storage file paths registered with this business object data in the specified storage.
            storageFilePaths = storageUnitIdToStorageFilePathsMap.containsKey(storageUnitEntity.getId()) ? storageUnitIdToStorageFilePathsMap.get(storageUnitEntity.getId()) : new ArrayList<>();
            // Validate storage file paths registered with this business object data in the specified storage.
            // The validation check below is required even if we have no storage files registered.
            storageFileHelper.validateStorageFilePaths(storageFilePaths, s3KeyPrefix, storageUnitEntity.getBusinessObjectData(), storageUnitEntity.getStorage().getName());
            // If there are no storage files registered for this storage unit, we should use the storage directory path value.
            if (storageFilePaths.isEmpty()) {
                // Validate that directory path value is present and it matches the S3 key prefix.
                Assert.isTrue(storageUnitEntity.getDirectoryPath() != null && storageUnitEntity.getDirectoryPath().startsWith(s3KeyPrefix), String.format("Storage directory path \"%s\" registered with business object data {%s} " + "in \"%s\" storage does not match the expected S3 key prefix \"%s\".", storageUnitEntity.getDirectoryPath(), businessObjectDataHelper.businessObjectDataEntityAltKeyToString(storageUnitEntity.getBusinessObjectData()), storageUnitEntity.getStorage().getName(), s3KeyPrefix));
                // Add storage directory path the empty storage files list.
                // We add a trailing '/' character to the path, since it represents a directory.
                storageFilePaths.add(storageUnitEntity.getDirectoryPath() + "/");
            }
        }
        // Retrieve the s3 bucket name.
        String s3BucketName = getS3BucketName(storageUnitEntity.getStorage(), generateDdlRequest.s3BucketNames);
        // For partitioned table, add the relative partitions to the generated DDL.
        if (generateDdlRequest.isPartitioned) {
            // the business object data equals to the number of partition columns defined in schema for the format selected for DDL generation.
            if (BooleanUtils.isTrue(generateDdlRequest.suppressScanForUnregisteredSubPartitions)) {
                int businessObjectDataRegisteredPartitions = 1 + CollectionUtils.size(businessObjectDataKey.getSubPartitionValues());
                Assert.isTrue(businessObjectFormat.getSchema().getPartitions().size() == businessObjectDataRegisteredPartitions, String.format("Number of primary and sub-partition values (%d) specified for the business object data is not equal to " + "the number of partition columns (%d) defined in the schema of the business object format selected for DDL generation. " + "Business object data: {%s},  business object format: {%s}", businessObjectDataRegisteredPartitions, businessObjectFormat.getSchema().getPartitions().size(), businessObjectDataHelper.businessObjectDataKeyToString(businessObjectDataKey), businessObjectFormatHelper.businessObjectFormatEntityAltKeyToString(businessObjectFormatEntity)));
            } else // Otherwise, since the format version selected for DDL generation might not match the relative business object format version that business
            // object data is registered against, validate that the number of sub-partition values specified for the business object data is less than
            // the number of partition columns defined in schema for the format selected for DDL generation.
            {
                Assert.isTrue(businessObjectFormat.getSchema().getPartitions().size() > CollectionUtils.size(businessObjectDataKey.getSubPartitionValues()), String.format("Number of subpartition values specified for the business object data is greater than or equal to " + "the number of partition columns defined in the schema of the business object format selected for DDL generation. " + "Business object data: {%s},  business object format: {%s}", businessObjectDataHelper.businessObjectDataKeyToString(businessObjectDataKey), businessObjectFormatHelper.businessObjectFormatEntityAltKeyToString(businessObjectFormatEntity)));
            }
            // Get partition information. For multiple level partitioning, auto-discover subpartitions (subdirectories) not already included into the S3 key
            // prefix. Each discovered partition requires a standalone "add partition" clause. Please note that due to the above validation check, there
            // should be no auto discoverable sub-partition columns, when flag is set to suppress scan for unregistered sub-partitions.
            List<SchemaColumn> autoDiscoverableSubPartitionColumns = businessObjectFormat.getSchema().getPartitions().subList(1 + CollectionUtils.size(businessObjectDataKey.getSubPartitionValues()), businessObjectFormat.getSchema().getPartitions().size());
            for (HivePartitionDto hivePartition : getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, s3KeyPrefix, storageFilePaths, storageUnitEntity.getBusinessObjectData(), storageUnitEntity.getStorage().getName())) {
                sb.append(String.format("ALTER TABLE `%s` ADD %sPARTITION (", generateDdlRequest.tableName, ifNotExistsOption));
                // Specify all partition column values.
                List<String> partitionKeyValuePairs = new ArrayList<>();
                for (int i = 0; i < businessObjectFormat.getSchema().getPartitions().size(); i++) {
                    String partitionColumnName = businessObjectFormat.getSchema().getPartitions().get(i).getName();
                    String partitionValue = hivePartition.getPartitionValues().get(i);
                    partitionKeyValuePairs.add(String.format("`%s`='%s'", partitionColumnName, partitionValue));
                }
                sb.append(StringUtils.join(partitionKeyValuePairs, ", "));
                sb.append(String.format(") LOCATION 's3n://%s/%s%s';\n", s3BucketName, s3KeyPrefix, StringUtils.isNotBlank(hivePartition.getPath()) ? hivePartition.getPath() : ""));
            }
        } else // This is a non-partitioned table.
        {
            // Get location for this non-partitioned table.
            String tableLocation = String.format("s3n://%s/%s", s3BucketName, s3KeyPrefix);
            if (generateDdlRequest.customDdlEntity == null) {
                // Since custom DDL was not specified and this table is not partitioned, add a LOCATION clause.
                // This is the last line in the non-partitioned table DDL.
                sb.append(String.format("LOCATION '%s';", tableLocation));
            } else {
                // Since custom DDL was used for a non-partitioned table, substitute the relative custom DDL token with the actual table location.
                replacements.put(NON_PARTITIONED_TABLE_LOCATION_CUSTOM_DDL_TOKEN, tableLocation);
            }
        }
    }
}
Also used : StorageUnitEntity(org.finra.herd.model.jpa.StorageUnitEntity) ArrayList(java.util.ArrayList) SchemaColumn(org.finra.herd.model.api.xml.SchemaColumn) BusinessObjectDataKey(org.finra.herd.model.api.xml.BusinessObjectDataKey) HivePartitionDto(org.finra.herd.model.dto.HivePartitionDto)

Example 2 with HivePartitionDto

use of org.finra.herd.model.dto.HivePartitionDto in project herd by FINRAOS.

the class Hive13DdlGeneratorTest method testGetHivePartitions.

@Test
public void testGetHivePartitions() {
    // Create a test business object data entity.
    BusinessObjectDataEntity businessObjectDataEntity = businessObjectDataDaoTestHelper.createBusinessObjectDataEntity(NAMESPACE, BDEF_NAME, FORMAT_USAGE_CODE, FORMAT_FILE_TYPE_CODE, FORMAT_VERSION, PARTITION_VALUE, DATA_VERSION, true, BDATA_STATUS);
    List<SchemaColumn> autoDiscoverableSubPartitionColumns;
    List<String> storageFilePaths;
    List<HivePartitionDto> expectedHivePartitions;
    List<HivePartitionDto> resultHivePartitions;
    // Get business object data key.
    BusinessObjectDataKey businessObjectDataKey = businessObjectDataHelper.getBusinessObjectDataKey(businessObjectDataEntity);
    // No storage files.
    autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
    storageFilePaths = new ArrayList<>();
    expectedHivePartitions = new ArrayList<>();
    resultHivePartitions = hive13DdlGenerator.getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity, STORAGE_NAME);
    assertEquals(expectedHivePartitions, resultHivePartitions);
    // Single level partitioning.
    autoDiscoverableSubPartitionColumns = new ArrayList<>();
    storageFilePaths = getStorageFilePaths(Arrays.asList("/file1.dat", "/file2.dat"));
    expectedHivePartitions = Arrays.asList(HivePartitionDto.builder().withPath("").withPartitionValues(Arrays.asList(PARTITION_VALUE)).build());
    resultHivePartitions = hive13DdlGenerator.getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity, STORAGE_NAME);
    assertEquals(expectedHivePartitions, resultHivePartitions);
    // Test that we match column names in storage file paths ignoring the case.
    autoDiscoverableSubPartitionColumns = getPartitionColumns(Arrays.asList("Column1", "column2"));
    storageFilePaths = getStorageFilePaths(Arrays.asList("/COLUMN1=111/COLUMN2=222/file.dat", "/column1=aa/column2=bb/"));
    expectedHivePartitions = Arrays.asList(HivePartitionDto.builder().withPath("/COLUMN1=111/COLUMN2=222").withPartitionValues(Arrays.asList(PARTITION_VALUE, "111", "222")).build(), HivePartitionDto.builder().withPath("/column1=aa/column2=bb").withPartitionValues(Arrays.asList(PARTITION_VALUE, "aa", "bb")).build());
    resultHivePartitions = hive13DdlGenerator.getHivePartitions(businessObjectDataKey, autoDiscoverableSubPartitionColumns, TEST_S3_KEY_PREFIX, storageFilePaths, businessObjectDataEntity, STORAGE_NAME);
    assertEquals(expectedHivePartitions, resultHivePartitions);
}
Also used : SchemaColumn(org.finra.herd.model.api.xml.SchemaColumn) HivePartitionDto(org.finra.herd.model.dto.HivePartitionDto) BusinessObjectDataEntity(org.finra.herd.model.jpa.BusinessObjectDataEntity) BusinessObjectDataKey(org.finra.herd.model.api.xml.BusinessObjectDataKey) Test(org.junit.Test) AbstractServiceTest(org.finra.herd.service.AbstractServiceTest)

Example 3 with HivePartitionDto

use of org.finra.herd.model.dto.HivePartitionDto in project herd by FINRAOS.

the class Hive13DdlGenerator method getHivePartitions.

/**
 * Gets a list of Hive partitions. For single level partitioning, no auto-discovery of sub-partitions (sub-directories) is needed - the business object data
 * will be represented by a single Hive partition instance. For multiple level partitioning, this method performs an auto-discovery of all sub-partitions
 * (sub-directories) and creates a Hive partition object instance for each partition.
 *
 * @param businessObjectDataKey the business object data key.
 * @param autoDiscoverableSubPartitionColumns the auto-discoverable sub-partition columns.
 * @param s3KeyPrefix the S3 key prefix.
 * @param storageFiles the storage files.
 * @param businessObjectDataEntity the business object data entity.
 * @param storageName the storage name.
 *
 * @return the list of Hive partitions
 */
public List<HivePartitionDto> getHivePartitions(BusinessObjectDataKey businessObjectDataKey, List<SchemaColumn> autoDiscoverableSubPartitionColumns, String s3KeyPrefix, Collection<String> storageFiles, BusinessObjectDataEntity businessObjectDataEntity, String storageName) {
    // We are using linked hash map to preserve the order of the discovered partitions.
    LinkedHashMap<List<String>, HivePartitionDto> linkedHashMap = new LinkedHashMap<>();
    Pattern pattern = getHivePathPattern(autoDiscoverableSubPartitionColumns);
    for (String storageFile : storageFiles) {
        // Remove S3 key prefix from the file path. Please note that the storage files are already validated to start with S3 key prefix.
        String relativeFilePath = storageFile.substring(s3KeyPrefix.length());
        // Try to match the relative file path to the expected subpartition folders.
        Matcher matcher = pattern.matcher(relativeFilePath);
        Assert.isTrue(matcher.matches(), String.format("Registered storage file or directory does not match the expected Hive sub-directory pattern. " + "Storage: {%s}, file/directory: {%s}, business object data: {%s}, S3 key prefix: {%s}, pattern: {^%s$}", storageName, storageFile, businessObjectDataHelper.businessObjectDataEntityAltKeyToString(businessObjectDataEntity), s3KeyPrefix, pattern.pattern()));
        // Create a list of partition values.
        List<String> partitionValues = new ArrayList<>();
        // Add partition values per business object data key.
        partitionValues.add(businessObjectDataKey.getPartitionValue());
        partitionValues.addAll(businessObjectDataKey.getSubPartitionValues());
        // Extract relative partition values.
        for (int i = 1; i <= matcher.groupCount(); i++) {
            partitionValues.add(matcher.group(i));
        }
        // Get path for this partition by removing trailing "/" plus an optional file name from the relative file path.
        String partitionPath = relativeFilePath.replaceAll("/[^/]*$", "");
        // Check if we already have that partition discovered - that would happen if partition contains multiple data files.
        HivePartitionDto hivePartition = linkedHashMap.get(partitionValues);
        if (hivePartition != null) {
            // Partition is already discovered, so just validate that the relative paths match.
            Assert.isTrue(hivePartition.getPath().equals(partitionPath), String.format("Found two different locations for the same Hive partition. Storage: {%s}, business object data: {%s}, " + "S3 key prefix: {%s}, path[1]: {%s}, path[2]: {%s}", storageName, businessObjectDataHelper.businessObjectDataEntityAltKeyToString(businessObjectDataEntity), s3KeyPrefix, hivePartition.getPath(), partitionPath));
        } else {
            // Add this partition to the hash map of discovered partitions.
            linkedHashMap.put(partitionValues, new HivePartitionDto(partitionPath, partitionValues));
        }
    }
    List<HivePartitionDto> hivePartitions = new ArrayList<>();
    hivePartitions.addAll(linkedHashMap.values());
    return hivePartitions;
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) HivePartitionDto(org.finra.herd.model.dto.HivePartitionDto) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) LinkedHashMap(java.util.LinkedHashMap)

Aggregations

HivePartitionDto (org.finra.herd.model.dto.HivePartitionDto)3 ArrayList (java.util.ArrayList)2 BusinessObjectDataKey (org.finra.herd.model.api.xml.BusinessObjectDataKey)2 SchemaColumn (org.finra.herd.model.api.xml.SchemaColumn)2 LinkedHashMap (java.util.LinkedHashMap)1 List (java.util.List)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 BusinessObjectDataEntity (org.finra.herd.model.jpa.BusinessObjectDataEntity)1 StorageUnitEntity (org.finra.herd.model.jpa.StorageUnitEntity)1 AbstractServiceTest (org.finra.herd.service.AbstractServiceTest)1 Test (org.junit.Test)1