Use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.
The class DirectoryPartitioner, method validateAndGetOriginalFilteredFiles.
/*
 * This class holds the assumption that the directory remains immutable.
 * If the directory does change:
 * ignore new files showing up in the directory based on an old version of the partition descriptor;
 * throw {@link org.apache.samza.SamzaException} if at least one old file doesn't exist anymore
 */
private List<FileMetadata> validateAndGetOriginalFilteredFiles(List<FileMetadata> newFileList,
    Map<Partition, List<String>> existingPartitionDescriptor) {
  assert newFileList != null;
  assert existingPartitionDescriptor != null;
  Set<String> oldFileSet = new HashSet<>();
  existingPartitionDescriptor.values().forEach(oldFileSet::addAll);
  Set<String> newFileSet = new HashSet<>();
  newFileList.forEach(file -> newFileSet.add(file.getPath()));
  if (!newFileSet.containsAll(oldFileSet)) {
    // Set.removeAll returns a boolean, so compute the difference first and report the set itself.
    oldFileSet.removeAll(newFileSet);
    throw new SamzaException("The list of new files is not a superset of the old files. diff = " + oldFileSet);
  }
  // Drop files that are not part of the existing descriptor, so the original partitioning is preserved.
  Iterator<FileMetadata> iterator = newFileList.iterator();
  while (iterator.hasNext()) {
    FileMetadata file = iterator.next();
    if (!oldFileSet.contains(file.getPath())) {
      iterator.remove();
    }
  }
  return newFileList;
}
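To make the validation rule concrete, here is a standalone, illustrative sketch (the file names are made up) of the superset check and of computing the diff that belongs in the exception message:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Standalone illustration of the superset rule above (not Samza code):
// an update may add files, but every old file must still be present.
public class SupersetCheckDemo {
  public static void main(String[] args) {
    Set<String> oldFiles = new HashSet<>(Arrays.asList("part-001.avro", "part-002.avro"));
    Set<String> newFiles = new HashSet<>(Arrays.asList("part-001.avro", "part-003.avro"));
    if (!newFiles.containsAll(oldFiles)) {
      Set<String> missing = new HashSet<>(oldFiles);
      missing.removeAll(newFiles); // removeAll returns a boolean; the diff lives in "missing"
      System.out.println("Invalid update, old files gone: " + missing); // [part-002.avro]
    }
  }
}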
Use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.
The class TestDirectoryPartitioner, method testValidDirectoryUpdating.
@Test
public void testValidDirectoryUpdating() {
  // The update is valid when only new files are added to the directory
  // and none of the old files change.
  List<FileMetadata> testList = new ArrayList<>();
  int numInput = 6;
  String[] inputFiles = {"part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-004.avro", "part-006.avro"};
  long[] fileLength = {150582, 138132, 214005, 205738, 158273, 982345};
  for (int i = 0; i < numInput; i++) {
    testList.add(new FileMetadata(inputFiles[i], fileLength[i]));
  }
  String whiteList = ".*";
  String blackList = "";
  String groupPattern = "";
  int expectedNumPartition = 6;
  int[][] expectedPartitioning = {{0}, {1}, {2}, {3}, {4}, {5}};
  DirectoryPartitioner directoryPartitioner =
      new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(testList));
  Map<Partition, SystemStreamPartitionMetadata> metadataMap = directoryPartitioner.getPartitionMetadataMap("hdfs", null);
  Assert.assertEquals(expectedNumPartition, metadataMap.size());
  Map<Partition, List<String>> descriptorMap = directoryPartitioner.getPartitionDescriptor("hdfs");
  verifyPartitionDescriptor(inputFiles, expectedPartitioning, expectedNumPartition, descriptorMap);

  numInput = 7;
  String[] updatedInputFiles = {"part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-004.avro",
      "part-007.avro", // add a new file to the directory
      "part-006.avro"};
  long[] updatedFileLength = {150582, 138132, 214005, 205738, 158273, 2513454, 982345};
  testList.clear();
  for (int i = 0; i < numInput; i++) {
    testList.add(new FileMetadata(updatedInputFiles[i], updatedFileLength[i]));
  }
  directoryPartitioner = new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(testList));
  metadataMap = directoryPartitioner.getPartitionMetadataMap("hdfs", descriptorMap);
  // Still expect only 6 partitions instead of 7: the new file is ignored
  // because the existing partition descriptor takes precedence.
  Assert.assertEquals(expectedNumPartition, metadataMap.size());
  Map<Partition, List<String>> updatedDescriptorMap = directoryPartitioner.getPartitionDescriptor("hdfs");
  verifyPartitionDescriptor(inputFiles, expectedPartitioning, expectedNumPartition, updatedDescriptorMap);
}
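The TestFileSystemAdapter helper is not shown in this snippet. A minimal sketch of what it could look like, assuming FileSystemAdapter exposes a single getAllFiles(streamName) method that the partitioner calls when listing the directory (treat the exact interface as an assumption; the real test helper may differ):

// Minimal test adapter sketch: ignores the stream name and returns a canned file list.
public class TestFileSystemAdapter implements FileSystemAdapter {
  private final List<FileMetadata> expectedList;

  public TestFileSystemAdapter(List<FileMetadata> expectedList) {
    this.expectedList = expectedList;
  }

  @Override
  public List<FileMetadata> getAllFiles(String streamName) {
    // Return the pre-built list regardless of the stream, so tests can
    // simulate a directory's contents changing between partitioner instances.
    return expectedList;
  }
}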
Use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.
The class DirectoryPartitioner, method generatePartitionGroups.
/*
 * Group partitions based on the group identifier extracted from the file path
 */
private List<List<FileMetadata>> generatePartitionGroups(List<FileMetadata> filteredFiles) {
  Map<String, List<FileMetadata>> map = new HashMap<>();
  for (FileMetadata fileMetadata : filteredFiles) {
    String groupId = extractGroupIdentifier(fileMetadata.getPath());
    map.computeIfAbsent(groupId, key -> new ArrayList<>()).add(fileMetadata);
  }
  List<List<FileMetadata>> ret = new ArrayList<>();
  // sort the group identifiers to guarantee consistent ordering
  List<String> sortedKeys = new ArrayList<>(map.keySet());
  sortedKeys.sort(Comparator.naturalOrder());
  sortedKeys.forEach(key -> ret.add(map.get(key)));
  return ret;
}
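For illustration, here is a standalone sketch of the same group-then-sort idiom. The file names and the substring-based group id are made up (the real group id comes from extractGroupIdentifier and the configured group pattern); a TreeMap keeps the groups in natural key order, mirroring the explicit sort above:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;

// Illustrative only: group paths by a hypothetical id (first 8 chars, e.g. "part-001"),
// with groups emitted in sorted key order.
public class GroupSortDemo {
  public static void main(String[] args) {
    List<String> paths = Arrays.asList("part-002-R.avro", "part-001-L.avro", "part-001-R.avro");
    Map<String, List<String>> grouped = paths.stream()
        .collect(Collectors.groupingBy(p -> p.substring(0, 8), TreeMap::new, Collectors.toList()));
    grouped.forEach((id, files) -> System.out.println(id + " -> " + files));
    // part-001 -> [part-001-L.avro, part-001-R.avro]
    // part-002 -> [part-002-R.avro]
  }
}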
Use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.
The class TestDirectoryPartitioner, method testInvalidDirectoryUpdating.
@Test
public void testInvalidDirectoryUpdating() {
  // The update is invalid when at least one old file has been removed.
  List<FileMetadata> testList = new ArrayList<>();
  int numInput = 6;
  String[] inputFiles = {"part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro", "part-004.avro", "part-006.avro"};
  long[] fileLength = {150582, 138132, 214005, 205738, 158273, 982345};
  for (int i = 0; i < numInput; i++) {
    testList.add(new FileMetadata(inputFiles[i], fileLength[i]));
  }
  String whiteList = ".*";
  String blackList = "";
  String groupPattern = "";
  int expectedNumPartition = 6;
  int[][] expectedPartitioning = {{0}, {1}, {2}, {3}, {4}, {5}};
  DirectoryPartitioner directoryPartitioner =
      new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(testList));
  Map<Partition, SystemStreamPartitionMetadata> metadataMap = directoryPartitioner.getPartitionMetadataMap("hdfs", null);
  Assert.assertEquals(expectedNumPartition, metadataMap.size());
  Map<Partition, List<String>> descriptorMap = directoryPartitioner.getPartitionDescriptor("hdfs");
  verifyPartitionDescriptor(inputFiles, expectedPartitioning, expectedNumPartition, descriptorMap);

  String[] updatedInputFiles = {"part-001.avro", "part-002.avro", "part-003.avro", "part-005.avro",
      "part-007.avro", // part-004 has been removed and replaced with part-007
      "part-006.avro"};
  long[] updatedFileLength = {150582, 138132, 214005, 205738, 158273, 982345};
  testList.clear();
  for (int i = 0; i < numInput; i++) {
    testList.add(new FileMetadata(updatedInputFiles[i], updatedFileLength[i]));
  }
  directoryPartitioner = new DirectoryPartitioner(whiteList, blackList, groupPattern, new TestFileSystemAdapter(testList));
  try {
    directoryPartitioner.getPartitionMetadataMap("hdfs", descriptorMap);
    Assert.fail("Expected an exception from getting metadata; should not reach this point.");
  } catch (SamzaException e) {
    // expected: part-004 from the old descriptor no longer exists
  }
}
Use of org.apache.samza.system.hdfs.partitioner.FileSystemAdapter.FileMetadata in project samza by apache.
The class DirectoryPartitioner, method getPartitionMetadataMap.
/**
 * Get partition metadata for a stream
 * @param streamName name of the stream; should contain the information about the path of the
 *                   root directory
 * @param existingPartitionDescriptorMap map of the existing partition descriptor
 * @return map of SSP metadata
 */
public Map<Partition, SystemStreamPartitionMetadata> getPartitionMetadataMap(String streamName,
    @Nullable Map<Partition, List<String>> existingPartitionDescriptorMap) {
  LOG.info("Trying to obtain metadata for " + streamName);
  LOG.info("Existing partition descriptor: " + (MapUtils.isEmpty(existingPartitionDescriptorMap) ? "empty" : existingPartitionDescriptorMap));
  Map<Partition, SystemStreamPartitionMetadata> partitionMetadataMap = new HashMap<>();
  partitionDescriptorMap.putIfAbsent(streamName, new HashMap<>());
  List<FileMetadata> filteredFiles = getFilteredFiles(streamName);
  if (!MapUtils.isEmpty(existingPartitionDescriptorMap)) {
    filteredFiles = validateAndGetOriginalFilteredFiles(filteredFiles, existingPartitionDescriptorMap);
  }
  List<List<FileMetadata>> groupedPartitions = generatePartitionGroups(filteredFiles);
  int partitionId = 0;
  for (List<FileMetadata> fileGroup : groupedPartitions) {
    Partition partition = new Partition(partitionId);
    List<String> pathList = new ArrayList<>();
    List<String> lengthList = new ArrayList<>();
    fileGroup.forEach(fileMetadata -> {
      pathList.add(fileMetadata.getPath());
      lengthList.add(String.valueOf(fileMetadata.getLen()));
    });
    // The oldest offset points at the beginning of the first file in the group;
    // the newest offset points at the end (the length) of the last file.
    String oldestOffset = MultiFileHdfsReader.generateOffset(0, "0");
    String newestOffset = MultiFileHdfsReader.generateOffset(lengthList.size() - 1, lengthList.get(lengthList.size() - 1));
    SystemStreamPartitionMetadata metadata = new SystemStreamPartitionMetadata(oldestOffset, newestOffset, null);
    partitionMetadataMap.put(partition, metadata);
    partitionDescriptorMap.get(streamName).put(partition, pathList);
    partitionId++;
  }
  LOG.info("Obtained metadata map as: " + partitionMetadataMap);
  LOG.info("Computed partition description as: " + partitionDescriptorMap);
  return partitionMetadataMap;
}
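Putting it together, a usage sketch based on the tests above (the stream name "hdfs" and the test adapter are taken from those tests): the first call passes null to compute partitions from scratch, and later calls pass the persisted descriptor so the partitioning stays stable as the directory grows:

// Usage sketch (mirrors the tests above; illustrative, not the full production setup).
DirectoryPartitioner partitioner =
    new DirectoryPartitioner(".*", "", "", new TestFileSystemAdapter(testList));
// First call: no existing descriptor, partitions are computed from scratch.
Map<Partition, SystemStreamPartitionMetadata> metadata =
    partitioner.getPartitionMetadataMap("hdfs", null);
Map<Partition, List<String>> descriptor = partitioner.getPartitionDescriptor("hdfs");
// Later (e.g. after a job restart): pass the persisted descriptor, so new files
// are ignored and a missing old file raises a SamzaException.
metadata = partitioner.getPartitionMetadataMap("hdfs", descriptor);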