use of org.apache.hadoop.fs.BlockLocation in project asterixdb by apache.
the class HDFSUtils method getSplits.
/**
 * Instead of creating the splits using the input format, we build them manually.
 * This function returns one file split per HDFS file block, irrespective of the number of partitions,
 * and the produced splits cover only the intersection between the current files in HDFS and the
 * files stored internally in AsterixDB:
 * 1. NoOp means an appended file (only the appended delta is split)
 * 2. AddOp means a new file (the whole file is split)
 * 3. UpdateOp means the delta of a file
 *
 * @return one file split per qualifying HDFS block
 * @throws IOException if HDFS cannot be accessed
 */
public static InputSplit[] getSplits(JobConf conf, List<ExternalFile> files) throws IOException {
    // Create the file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<>();
    // Create the file splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // The file was deleted at some point; skip to the next file
            continue;
        }
        // Note: getLastModefiedTime() [sic] is the spelling used by the ExternalFile API
        if (file.getPendingOp() == ExternalFilePendingOp.ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get the block locations from the HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block, clamped to the recorded file size
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    long splitLength = (block.getOffset() + block.getLength() < file.getSize())
                            ? block.getLength() : file.getSize() - block.getOffset();
                    fileSplits.add(new FileSplit(filePath, block.getOffset(), splitLength, block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                // Compare file names with equals(), not reference equality
                if (files.get(i).getFileName().equals(file.getFileName())
                        && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }
            // Get the block locations from the HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block that overlaps the appended delta [oldSize, newSize)
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize && block.getOffset() < newSize) {
                    // The block intersects the delta -> create a split over the intersection
                    long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                    long endCut = (block.getOffset() + block.getLength() < newSize) ? 0L
                            : block.getOffset() + block.getLength() - newSize;
                    long splitLength = block.getLength() - startCut - endCut;
                    fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        }
    }
    fs.close();
    files.clear();
    files.addAll(orderedExternalFiles);
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}
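To make the NoOp (append) branch concrete, here is a standalone sketch of the same startCut/endCut clamping with made-up sizes (128 MB blocks, a file that grew from 300 MB to 400 MB). It only illustrates the arithmetic above and is not AsterixDB code:

public class DeltaSplitDemo {
    public static void main(String[] args) {
        long mb = 1024L * 1024L;
        long blockSize = 128 * mb; // assumed HDFS block size
        long oldSize = 300 * mb;   // size recorded internally before the append
        long newSize = 400 * mb;   // current size in HDFS after the append
        // Visit each block that starts before newSize and keep only the part inside [oldSize, newSize)
        for (long offset = 0; offset < newSize; offset += blockSize) {
            long length = blockSize; // a physical block may extend past newSize
            if (offset + length > oldSize && offset < newSize) {
                long startCut = (offset > oldSize) ? 0L : oldSize - offset;
                long endCut = (offset + length < newSize) ? 0L : offset + length - newSize;
                System.out.printf("split: start=%d length=%d%n", offset + startCut, length - startCut - endCut);
            }
        }
        // Prints splits covering [300 MB, 384 MB) and [384 MB, 400 MB): exactly the appended delta.
    }
}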
use of org.apache.hadoop.fs.BlockLocation in project drill by apache.
the class TestAffinityCalculator method testBuildRangeMap.
// @Test
// public void testSetEndpointBytes(@Injectable final FileSystem fs, @Injectable final FileStatus file) throws Throwable{
// final long blockSize = 256*1024*1024;
// LinkedList<ParquetGroupScan.RowGroupInfo> rowGroups = new LinkedList<>();
// int numberOfHosts = 4;
// int numberOfBlocks = 3;
// String port = "1234";
// String[] hosts = new String[numberOfHosts];
//
// final BlockLocation[] blockLocations = buildBlockLocations(hosts, blockSize);
// final LinkedList<CoordinationProtos.DrillbitEndpoint> endPoints = buildEndpoints(numberOfHosts);
// buildRowGroups(rowGroups, numberOfBlocks, blockSize, 3);
//
// new NonStrictExpectations() {{
// fs.getFileBlockLocations(file, 0, 3*blockSize); result = blockLocations;
// fs.getFileStatus(new Path(path)); result = file;
// file.getLen(); result = 3*blockSize;
// }};
//
//
// BlockMapBuilder ac = new BlockMapBuilder(fs, endPoints);
// for (ParquetGroupScan.RowGroupInfo rowGroup : rowGroups) {
// ac.setEndpointBytes(rowGroup);
// }
// ParquetGroupScan.RowGroupInfo rg = rowGroups.get(0);
// Long b = rg.getEndpointBytes().get(endPoints.get(0));
// assertEquals(blockSize,b.longValue());
// b = rg.getEndpointBytes().get(endPoints.get(3));
// assertNull(b);
//
// buildRowGroups(rowGroups, numberOfBlocks, blockSize, 2);
//
// ac = new BlockMapBuilder(fs, endPoints);
// for (ParquetGroupScan.RowGroupInfo rowGroup : rowGroups) {
// ac.setEndpointBytes(rowGroup);
// }
// rg = rowGroups.get(0);
// b = rg.getEndpointBytes().get(endPoints.get(0));
// assertEquals(blockSize*3/2,b.longValue());
// b = rg.getEndpointBytes().get(endPoints.get(3));
// assertEquals(blockSize / 2, b.longValue());
//
// buildRowGroups(rowGroups, numberOfBlocks, blockSize, 6);
//
// ac = new BlockMapBuilder(fs, endPoints);
// for (ParquetGroupScan.RowGroupInfo rowGroup : rowGroups) {
// ac.setEndpointBytes(rowGroup);
// }
// rg = rowGroups.get(0);
// b = rg.getEndpointBytes().get(endPoints.get(0));
// assertEquals(blockSize/2,b.longValue());
// b = rg.getEndpointBytes().get(endPoints.get(3));
// assertNull(b);
// }
@Test
public void testBuildRangeMap() {
    BlockLocation[] blocks = buildBlockLocations(new String[4], 256 * 1024 * 1024);
    long tA = System.nanoTime();
    ImmutableRangeMap.Builder<Long, BlockLocation> blockMapBuilder = new ImmutableRangeMap.Builder<Long, BlockLocation>();
    for (BlockLocation block : blocks) {
        long start = block.getOffset();
        long end = start + block.getLength();
        Range<Long> range = Range.closedOpen(start, end);
        blockMapBuilder = blockMapBuilder.put(range, block);
    }
    ImmutableRangeMap<Long, BlockLocation> map = blockMapBuilder.build();
    long tB = System.nanoTime();
    System.out.println(String.format("Took %f ms to build range map", (tB - tA) / 1e6));
}
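Building the map is only half the story; fast offset-to-block lookup is why Range.closedOpen is used. A brief sketch of querying the map built above (offsets are illustrative; Guava's ImmutableRangeMap.get returns null when no range contains the key):

long blockSize = 256L * 1024 * 1024;
BlockLocation first = map.get(0L);          // offset 0 falls in the first block's [start, end)
BlockLocation second = map.get(blockSize);  // closedOpen: a block's end offset belongs to the next block
BlockLocation none = map.get(-1L);          // no matching range -> null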
use of org.apache.hadoop.fs.BlockLocation in project ignite by apache.
the class IgniteHadoopFileSystem method getFileBlockLocations.
/** {@inheritDoc} */
@Override
public BlockLocation[] getFileBlockLocations(Path path, long start, long len) throws IOException {
    A.notNull(path, "path");
    IgfsPath igfsPath = convert(path);
    enterBusy();
    try {
        long now = System.currentTimeMillis();
        List<IgfsBlockLocation> affinity = new ArrayList<>(rmtClient.affinity(igfsPath, start, len));
        BlockLocation[] arr = new BlockLocation[affinity.size()];
        for (int i = 0; i < arr.length; i++)
            arr[i] = convert(affinity.get(i));
        if (LOG.isDebugEnabled())
            LOG.debug("Fetched file locations [path=" + path + ", fetchTime="
                + (System.currentTimeMillis() - now) + ", locations=" + Arrays.asList(arr) + ']');
        return arr;
    } finally {
        leaveBusy();
    }
}
use of org.apache.hadoop.fs.BlockLocation in project ignite by apache.
the class IgniteHadoopFileSystem method getFileBlockLocations.
/** {@inheritDoc} */
@Override
public BlockLocation[] getFileBlockLocations(FileStatus status, long start, long len) throws IOException {
    A.notNull(status, "status");
    enterBusy();
    try {
        IgfsPath path = convert(status.getPath());
        long now = System.currentTimeMillis();
        List<IgfsBlockLocation> affinity = new ArrayList<>(rmtClient.affinity(path, start, len));
        BlockLocation[] arr = new BlockLocation[affinity.size()];
        for (int i = 0; i < arr.length; i++)
            arr[i] = convert(affinity.get(i));
        if (LOG.isDebugEnabled())
            LOG.debug("Fetched file locations [path=" + path + ", fetchTime="
                + (System.currentTimeMillis() - now) + ", locations=" + Arrays.asList(arr) + ']');
        return arr;
    } catch (FileNotFoundException ignored) {
        return EMPTY_BLOCK_LOCATIONS;
    } finally {
        leaveBusy();
    }
}
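Both overloads return plain Hadoop BlockLocation objects, so callers can stay file-system agnostic. A minimal consumer sketch (the helper name and path handling are hypothetical; assumes imports from org.apache.hadoop.fs and java.util.Arrays):

public static void printBlockHosts(FileSystem fs, Path path) throws IOException {
    FileStatus status = fs.getFileStatus(path);
    // Print the hosts serving each block of the file; works against HDFS or the Ignite file system alike
    for (BlockLocation block : fs.getFileBlockLocations(status, 0, status.getLen())) {
        System.out.printf("offset=%d length=%d hosts=%s%n",
                block.getOffset(), block.getLength(), Arrays.toString(block.getHosts()));
    }
}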
use of org.apache.hadoop.fs.BlockLocation in project ignite by apache.
the class IgniteHadoopFileSystemAbstractSelfTest method testGetFileBlockLocationsIfFileStatusReferenceNotExistingPath.
/** @throws Exception If failed. */
public void testGetFileBlockLocationsIfFileStatusReferenceNotExistingPath() throws Exception {
    Path path = new Path("someFile");
    fs.create(path).close();
    final FileStatus status = fs.getFileStatus(path);
    fs.delete(path, true);
    BlockLocation[] locations = fs.getFileBlockLocations(status, 1, 2);
    assertEquals(0, locations.length);
}