Search in sources :

Example 1 with FileMetadata

use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.

In class DirectoryPartitioner, the method validateAndGetOriginalFilteredFiles:

/*
     * This method assumes the directory remains immutable between refreshes.
     * If the directory does change:
     * - new files showing up in the directory are ignored when an old version of the
     *   partition descriptor is supplied;
     * - a {@link org.apache.samza.SamzaException} is thrown if at least one old file
     *   no longer exists.
     */
private List<FileMetadata> validateAndGetOriginalFilteredFiles(List<FileMetadata> newFileList, Map<Partition, List<String>> existingPartitionDescriptor) {
    assert newFileList != null;
    assert existingPartitionDescriptor != null;
    // All file paths recorded in the existing (old) partition descriptor.
    Set<String> oldFileSet = new HashSet<>();
    existingPartitionDescriptor.values().forEach(oldFileSet::addAll);
    Set<String> newFileSet = new HashSet<>();
    newFileList.forEach(file -> newFileSet.add(file.getPath()));
    if (!newFileSet.containsAll(oldFileSet)) {
        // BUG FIX: Set.removeAll returns a boolean, so the previous message printed
        // "diff = true" instead of the actual missing files. Compute the set
        // difference explicitly so the exception is actionable.
        Set<String> missingFiles = new HashSet<>(oldFileSet);
        missingFiles.removeAll(newFileSet);
        throw new SamzaException("The list of new files is not a super set of the old files. diff = " + missingFiles);
    }
    // Keep only the files that were part of the old descriptor; newly added
    // files are intentionally ignored (directory assumed immutable).
    newFileList.removeIf(file -> !oldFileSet.contains(file.getPath()));
    return newFileList;
}
Also used : FileMetadata(org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata) SamzaException(org.apache.samza.SamzaException) HashSet(java.util.HashSet)

Example 2 with FileMetadata

use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.

In class TestDirectoryPartitioner, the method testValidDirectoryUpdating:

@Test
public void testValidDirectoryUpdating() {
    // A directory update is valid when files are only added to the directory;
    // none of the previously-seen files may change or disappear.
    String[] originalFiles = { "part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-004.avro", "part-006.avro" };
    long[] originalLengths = { 150582, 138132, 214005, 205738, 158273, 982345 };
    List<FileMetadata> fileList = new ArrayList<>();
    for (int i = 0; i < originalFiles.length; i++) {
        fileList.add(new FileMetadata(originalFiles[i], originalLengths[i]));
    }
    String whiteList = ".*";
    String blackList = "";
    String groupPattern = "";
    int expectedNumPartition = 6;
    int[][] expectedPartitioning = { { 0 }, { 1 }, { 2 }, { 3 }, { 4 }, { 5 } };
    DirectoryPartitioner partitioner = new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(fileList));
    Map<Partition, SystemStreamPartitionMetadata> metadataMap = partitioner.getPartitionMetadataMap("hdfs", null);
    Assert.assertEquals(expectedNumPartition, metadataMap.size());
    Map<Partition, List<String>> descriptorMap = partitioner.getPartitionDescriptor("hdfs");
    verifyPartitionDescriptor(originalFiles, expectedPartitioning, expectedNumPartition, descriptorMap);
    // Simulate a refresh of the same directory with one extra file (part-007).
    String[] updatedFiles = { "part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-004.avro", "part-007.avro", "part-006.avro" };
    long[] updatedLengths = { 150582, 138132, 214005, 205738, 158273, 2513454, 982345 };
    fileList.clear();
    for (int i = 0; i < updatedFiles.length; i++) {
        fileList.add(new FileMetadata(updatedFiles[i], updatedLengths[i]));
    }
    partitioner = new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(fileList));
    metadataMap = partitioner.getPartitionMetadataMap("hdfs", descriptorMap);
    // The new file must be ignored: still 6 partitions instead of 7.
    Assert.assertEquals(expectedNumPartition, metadataMap.size());
    Map<Partition, List<String>> updatedDescriptorMap = partitioner.getPartitionDescriptor("hdfs");
    // Descriptors must still only reference the original files.
    verifyPartitionDescriptor(originalFiles, expectedPartitioning, expectedNumPartition, updatedDescriptorMap);
}
Also used : Partition(org.apache.samza.Partition) FileMetadata(org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata) ArrayList(java.util.ArrayList) SystemStreamPartitionMetadata(org.apache.samza.system.SystemStreamMetadata.SystemStreamPartitionMetadata) List(java.util.List) ArrayList(java.util.ArrayList) Test(org.junit.Test)

Example 3 with FileMetadata

use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.

In class DirectoryPartitioner, the method generatePartitionGroups:

/*
   * Group files into partitions based on the group identifier extracted from each
   * file path. Groups are emitted in natural order of the group id, which
   * guarantees a consistent partition ordering across invocations.
   */
private List<List<FileMetadata>> generatePartitionGroups(List<FileMetadata> filteredFiles) {
    Map<String, List<FileMetadata>> map = new HashMap<>();
    for (FileMetadata fileMetadata : filteredFiles) {
        String groupId = extractGroupIdentifier(fileMetadata.getPath());
        // computeIfAbsent avoids the putIfAbsent + get double lookup.
        map.computeIfAbsent(groupId, key -> new ArrayList<>()).add(fileMetadata);
    }
    List<List<FileMetadata>> ret = new ArrayList<>();
    // sort the keys to guarantee consistent ordering (natural order)
    map.keySet().stream().sorted().forEach(key -> ret.add(map.get(key)));
    return ret;
}
Also used : HashMap(java.util.HashMap) FileMetadata(org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List)

Example 4 with FileMetadata

use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.

In class TestDirectoryPartitioner, the method testInvalidDirectoryUpdating:

@Test
public void testInvalidDirectoryUpdating() {
    // A directory update is invalid when at least one previously-seen file
    // has been removed from the directory.
    String[] originalFiles = { "part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-004.avro", "part-006.avro" };
    long[] originalLengths = { 150582, 138132, 214005, 205738, 158273, 982345 };
    List<FileMetadata> fileList = new ArrayList<>();
    for (int i = 0; i < originalFiles.length; i++) {
        fileList.add(new FileMetadata(originalFiles[i], originalLengths[i]));
    }
    String whiteList = ".*";
    String blackList = "";
    String groupPattern = "";
    int expectedNumPartition = 6;
    int[][] expectedPartitioning = { { 0 }, { 1 }, { 2 }, { 3 }, { 4 }, { 5 } };
    DirectoryPartitioner partitioner = new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(fileList));
    Map<Partition, SystemStreamPartitionMetadata> metadataMap = partitioner.getPartitionMetadataMap("hdfs", null);
    Assert.assertEquals(expectedNumPartition, metadataMap.size());
    Map<Partition, List<String>> descriptorMap = partitioner.getPartitionDescriptor("hdfs");
    verifyPartitionDescriptor(originalFiles, expectedPartitioning, expectedNumPartition, descriptorMap);
    // Simulate a refresh where part-004 disappeared and part-007 took its place.
    String[] updatedFiles = { "part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-007.avro", "part-006.avro" };
    long[] updatedLengths = { 150582, 138132, 214005, 205738, 158273, 982345 };
    fileList.clear();
    for (int i = 0; i < updatedFiles.length; i++) {
        fileList.add(new FileMetadata(updatedFiles[i], updatedLengths[i]));
    }
    partitioner = new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(fileList));
    try {
        partitioner.getPartitionMetadataMap("hdfs", descriptorMap);
        Assert.fail("Expect exception thrown from getting metadata. Should not reach this point.");
    } catch (SamzaException e) {
    // expected: an old file vanished, so the descriptor can no longer be honored
    }
}
Also used : Partition(org.apache.samza.Partition) FileMetadata(org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata) ArrayList(java.util.ArrayList) SystemStreamPartitionMetadata(org.apache.samza.system.SystemStreamMetadata.SystemStreamPartitionMetadata) SamzaException(org.apache.samza.SamzaException) List(java.util.List) ArrayList(java.util.ArrayList) Test(org.junit.Test)

Example 5 with FileMetadata

use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.

In class DirectoryPartitioner, the method getPartitionMetadataMap:

/**
 * Computes the partition metadata for a stream and records the resulting
 * partition-to-file mapping in {@code partitionDescriptorMap}.
 * @param streamName name of the stream; should contain the information about the path of the
 *                   root directory
 * @param existingPartitionDescriptorMap existing partition descriptor from a previous run,
 *                                       or {@code null}/empty for a fresh computation
 * @return map from each {@link Partition} to its {@link SystemStreamPartitionMetadata}
 */
public Map<Partition, SystemStreamPartitionMetadata> getPartitionMetadataMap(String streamName, @Nullable Map<Partition, List<String>> existingPartitionDescriptorMap) {
    LOG.info("Trying to obtain metadata for " + streamName);
    LOG.info("Existing partition descriptor: " + (MapUtils.isEmpty(existingPartitionDescriptorMap) ? "empty" : existingPartitionDescriptorMap));
    Map<Partition, SystemStreamPartitionMetadata> metadataMap = new HashMap<>();
    partitionDescriptorMap.putIfAbsent(streamName, new HashMap<>());
    List<FileMetadata> files = getFilteredFiles(streamName);
    if (!MapUtils.isEmpty(existingPartitionDescriptorMap)) {
        // Restrict to the files recorded in the old descriptor; this throws if
        // any previously-seen file has disappeared.
        files = validateAndGetOriginalFilteredFiles(files, existingPartitionDescriptorMap);
    }
    List<List<FileMetadata>> groups = generatePartitionGroups(files);
    // Partition ids are assigned by position in the (consistently ordered) group list.
    for (int partitionId = 0; partitionId < groups.size(); partitionId++) {
        Partition partition = new Partition(partitionId);
        List<String> paths = new ArrayList<>();
        List<String> lengths = new ArrayList<>();
        for (FileMetadata fileMetadata : groups.get(partitionId)) {
            paths.add(fileMetadata.getPath());
            lengths.add(String.valueOf(fileMetadata.getLen()));
        }
        // Oldest offset: start of the first file; newest: length of the last file.
        String oldestOffset = MultiFileHdfsReader.generateOffset(0, "0");
        String newestOffset = MultiFileHdfsReader.generateOffset(lengths.size() - 1, String.valueOf(lengths.get(lengths.size() - 1)));
        metadataMap.put(partition, new SystemStreamPartitionMetadata(oldestOffset, newestOffset, null));
        partitionDescriptorMap.get(streamName).put(partition, paths);
    }
    LOG.info("Obtained metadata map as: " + metadataMap);
    LOG.info("Computed partition description as: " + partitionDescriptorMap);
    return metadataMap;
}
Also used : Partition(org.apache.samza.Partition) HashMap(java.util.HashMap) FileMetadata(org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata) ArrayList(java.util.ArrayList) SystemStreamPartitionMetadata(org.apache.samza.system.SystemStreamMetadata.SystemStreamPartitionMetadata) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

FileMetadata (org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata)9 ArrayList (java.util.ArrayList)8 List (java.util.List)8 Partition (org.apache.samza.Partition)7 SystemStreamPartitionMetadata (org.apache.samza.system.SystemStreamMetadata.SystemStreamPartitionMetadata)7 Test (org.junit.Test)6 HashMap (java.util.HashMap)2 SamzaException (org.apache.samza.SamzaException)2 HashSet (java.util.HashSet)1