Use of org.apache.hadoop.fs.LocatedFileStatus in project presto by prestodb.
From the class BackgroundHiveSplitLoader, method loadPartition:
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }
    // Some input formats supply their own splits; in that case, call getSplits()
    // on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }

    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }

    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }

    fileIterators.addLast(iterator);
}
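The bucketed branches above rely on a helper, listAndSortBucketFiles, whose body is not shown in this snippet. Purely as an illustration of the underlying LocatedFileStatus pattern (not Presto's actual implementation; the class and method below are hypothetical), listing a partition directory and ordering the files so that index i corresponds to bucket i could be sketched with the standard Hadoop FileSystem API like this:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public final class BucketFileLister {
    private BucketFileLister() {
    }

    /**
     * Lists the files directly under the partition path, skips hidden files,
     * and returns them sorted by file name so that index i maps to bucket i.
     */
    public static List<LocatedFileStatus> listAndSortBucketFiles(FileSystem fs, Path partitionPath, int expectedBucketCount) throws IOException {
        List<LocatedFileStatus> files = new ArrayList<>();
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(partitionPath, false);
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            String name = status.getPath().getName();
            // Skip hidden and metadata files such as ".foo" or "_SUCCESS".
            if (!name.startsWith(".") && !name.startsWith("_")) {
                files.add(status);
            }
        }
        if (files.size() != expectedBucketCount) {
            throw new IOException("Expected " + expectedBucketCount + " bucket files but found " + files.size());
        }
        files.sort(Comparator.comparing((LocatedFileStatus status) -> status.getPath().getName()));
        return files;
    }
}

Note that listFiles() returns LocatedFileStatus rather than plain FileStatus, which is what lets the caller hand file.getBlockLocations() to createHiveSplitIterator without an extra round trip to the NameNode.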
Use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.
From the class RollingSinkFaultToleranceITCase, method postSubmit:
@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)
    Pattern messageRegex = Pattern.compile("message (\\d*)");

    // Keep a set of the message IDs that we read. The size must equal the read count and
    // NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();
    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();

            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);

            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    uniqMessagesRead.add(line);
                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX) && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");
                        }
                    }
                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }
            br.close();
            inStreamReader.close();
            bais.close();
        }
    }

    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());
    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());
}
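The core idea of this test is the ".valid-length" convention: if a companion file named <part-file>.valid-length exists, only the first N bytes of the part file hold valid data and everything beyond that must be ignored. A minimal, self-contained sketch of that read path (the class and helper name below are hypothetical, not part of the Flink test) could look like this:

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;

public final class ValidLengthReader {
    private ValidLengthReader() {
    }

    /** Reads the lines of a part file, honoring a ".valid-length" companion file if one exists. */
    public static List<String> readValidLines(FileSystem fs, LocatedFileStatus file) throws IOException {
        // By default the whole file is valid.
        int validLength = (int) file.getLen();
        Path validLengthPath = file.getPath().suffix(".valid-length");
        if (fs.exists(validLengthPath)) {
            try (FSDataInputStream lengthStream = fs.open(validLengthPath)) {
                validLength = Integer.parseInt(lengthStream.readUTF());
            }
        }
        // Read only the valid prefix of the part file.
        byte[] buffer = new byte[validLength];
        try (FSDataInputStream in = fs.open(file.getPath())) {
            in.readFully(0, buffer, 0, validLength);
        }
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new ByteArrayInputStream(buffer), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }
}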
Use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.
From the class RollingSinkITCase, method testDateTimeRollingStringWriter:
/**
* This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to
* produce rolling files. The clock of DateTimeBucketer is set to
* {@link ModifyableClock} to keep the time in lockstep with the processing of elements using
* latches.
*/
@Test
public void testDateTimeRollingStringWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/rolling-out";
    DateTimeBucketer.setClock(new ModifyableClock());
    ModifyableClock.setCurrentTime(0);

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    DataStream<Tuple2<Integer, String>> source = env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();

    // the parallel flatMap is chained to the sink, so when it has seen 5 elements it can
    // fire the latch
    DataStream<String> mapped = source.flatMap(new RichFlatMapFunction<Tuple2<Integer, String>, String>() {

        private static final long serialVersionUID = 1L;

        int count = 0;

        @Override
        public void flatMap(Tuple2<Integer, String> value, Collector<String> out) throws Exception {
            out.collect(value.f1);
            count++;
            if (count >= 5) {
                if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                    latch1.trigger();
                } else {
                    latch2.trigger();
                }
                count = 0;
            }
        }
    });

    RollingSink<String> sink = new RollingSink<String>(outPath)
            .setBucketer(new DateTimeBucketer("ss"))
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");
    mapped.addSink(sink);

    env.execute("RollingSink String Write Test");

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    // we should have 8 rolling files, 4 time intervals and parallelism of 2
    int numFiles = 0;
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        numFiles++;
        if (file.getPath().toString().contains("rolling-out/00")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 0; i < 5; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/05")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 5; i < 10; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/10")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 10; i < 15; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/15")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 15; i < 20; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else {
            Assert.fail("File " + file + " does not match any expected roll pattern.");
        }
    }

    Assert.assertEquals(8, numFiles);
}
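One recurring snag in tests like the two Flink examples above is that RemoteIterator is Hadoop's own interface, not java.util.Iterator, so it cannot be used with for-each loops or streams. A small helper that drains it into a List keeps assertion code shorter; the class name RemoteIteratorUtils and the usage in main() are hypothetical, shown only as a sketch:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public final class RemoteIteratorUtils {
    private RemoteIteratorUtils() {
    }

    /** Materializes a RemoteIterator into a List so callers can use size(), streams, and for-each. */
    public static List<LocatedFileStatus> toList(RemoteIterator<LocatedFileStatus> it) throws IOException {
        List<LocatedFileStatus> result = new ArrayList<>();
        while (it.hasNext()) {
            result.add(it.next());
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        // Hypothetical usage: count all files under an output directory, recursively.
        FileSystem fs = FileSystem.get(new Configuration());
        List<LocatedFileStatus> files = toList(fs.listFiles(new Path("/tmp/rolling-out"), true));
        System.out.println("found " + files.size() + " files");
    }
}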
Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
From the class AbstractContractGetFileStatusTest, method testListFilesNoDir:
@Test
public void testListFilesNoDir() throws Throwable {
    describe("test the listFiles calls on a path which is not present");
    Path path = path("missing");
    try {
        RemoteIterator<LocatedFileStatus> iterator = getFileSystem().listFiles(path, false);
        fail("Expected an exception, got an iterator: " + iterator);
    } catch (FileNotFoundException expected) {
        // expected
    }
    try {
        RemoteIterator<LocatedFileStatus> iterator = getFileSystem().listFiles(path, true);
        fail("Expected an exception, got an iterator: " + iterator);
    } catch (FileNotFoundException expected) {
        // expected
    }
}
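The "listing a missing path must fail" check appears twice in this test (recursive and non-recursive), so it can be factored into a reusable assertion. A minimal JUnit 4 style sketch follows; the class and method names are hypothetical, not part of the Hadoop contract test suite:

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.junit.Assert;

public final class ListFilesAssertions {
    private ListFilesAssertions() {
    }

    /** Asserts that listFiles() on a missing path raises FileNotFoundException. */
    public static void assertListFilesNotFound(FileSystem fs, Path path, boolean recursive) throws IOException {
        try {
            RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(path, recursive);
            Assert.fail("Expected FileNotFoundException, got an iterator: " + iterator);
        } catch (FileNotFoundException expected) {
            // expected: the path does not exist
        }
    }
}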
Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
From the class AbstractContractGetFileStatusTest, method verifyFileStats:
/**
* Scan through a filestatus iterator, get the status of every element and
* verify core attributes. This should identify a situation where the
* attributes of a file/dir retrieved in a listing operation do not
* match the values individually retrieved. That is: the metadata returned
* in a directory listing is different from the explicitly retrieved data.
*
* Timestamps are not compared.
* @param results iterator to scan
* @return the number of entries in the result set
* @throws IOException any IO problem
*/
private int verifyFileStats(RemoteIterator<LocatedFileStatus> results) throws IOException {
    describe("verifying file statuses");
    int count = 0;
    while (results.hasNext()) {
        count++;
        LocatedFileStatus next = results.next();
        FileStatus fileStatus = getFileSystem().getFileStatus(next.getPath());
        assertEquals("isDirectory", fileStatus.isDirectory(), next.isDirectory());
        assertEquals("isFile", fileStatus.isFile(), next.isFile());
        assertEquals("getLen", fileStatus.getLen(), next.getLen());
        assertEquals("getOwner", fileStatus.getOwner(), next.getOwner());
    }
    return count;
}
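A typical way to use verifyFileStats is to create a small file tree, list it recursively, and compare the returned count with the expected number of files. The test body below is an illustrative sketch only (the test method name and file layout are assumptions, not taken from the Hadoop contract tests), written against the same base-class helpers the snippet above already relies on:

@Test
public void testVerifyFileStatsOnSmallTree() throws Throwable {
    // Illustrative only: create two files under a fresh directory, then verify that
    // the recursive listing reports metadata consistent with getFileStatus() for both.
    FileSystem fs = getFileSystem();
    Path dir = path("verify-file-stats");
    fs.mkdirs(dir);
    fs.create(new Path(dir, "file-1")).close();
    fs.create(new Path(dir, "file-2")).close();
    RemoteIterator<LocatedFileStatus> results = fs.listFiles(dir, true);
    assertEquals("file count from listFiles", 2, verifyFileStats(results));
}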