Example 21 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project presto by prestodb.

the class BackgroundHiveSplitLoader method loadPartition.

private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }
    // To support custom input formats, we want to call getSplits()
    // on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }
    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    fileIterators.addLast(iterator);
}
Also used : HiveBucket(com.facebook.presto.hive.HiveBucketing.HiveBucket) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) PrestoException(com.facebook.presto.spi.PrestoException) Properties(java.util.Properties) SymlinkTextInputFormat(org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) PeekingIterator(com.google.common.collect.PeekingIterator) Iterator(java.util.Iterator) AbstractIterator(com.google.common.collect.AbstractIterator) HiveFileIterator(com.facebook.presto.hive.util.HiveFileIterator) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat)
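
The helper listAndSortBucketFiles is referenced above but not shown. Below is a minimal sketch of what such a helper might do, assuming one file per bucket whose lexicographic name order matches the bucket order; the class name, error handling, and sort key here are reconstructions for illustration, not the verbatim Presto source.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.LocatedFileStatus;

final class BucketFileListing {

    // Sketch only: drain the listing, verify there is exactly one file per
    // bucket, and sort by file name so that index i corresponds to bucket i.
    static List<LocatedFileStatus> listAndSortBucketFiles(Iterator<LocatedFileStatus> files, int bucketCount) {
        List<LocatedFileStatus> list = new ArrayList<>(bucketCount);
        while (files.hasNext()) {
            list.add(files.next());
        }
        if (list.size() != bucketCount) {
            throw new IllegalStateException("expected " + bucketCount + " bucket files, found " + list.size());
        }
        // Assumption of this sketch: Hive writes one file per bucket with
        // names whose lexicographic order matches the bucket order.
        list.sort(Comparator.comparing(file -> file.getPath().getName()));
        return list;
    }
}

Sorting by name is what lets list.get(bucketNumber) in the loadPartition code above address a bucket's file directly.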

Example 22 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.

the class RollingSinkFaultToleranceITCase method postSubmit.

@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)
    Pattern messageRegex = Pattern.compile("message (\\d*)");
    // Keep a set of the message IDs that we read. Its size must equal NUM_STRINGS.
    // If we read more lines than the set contains, some elements were read twice.
    Set<Integer> readNumbers = Sets.newHashSet();
    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream validLengthStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = validLengthStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
                validLengthStream.close();
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();
            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);
            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    uniqMessagesRead.add(line);
                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX) && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");
                        }
                    }
                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }
            br.close();
            inStreamReader.close();
            bais.close();
        }
    }
    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());
    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());
}
Also used : Path(org.apache.hadoop.fs.Path) Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) Matcher(java.util.regex.Matcher) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) HashSet(java.util.HashSet)
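
As a usage note, the stream handling above can be tightened with try-with-resources, which closes the side-file stream even if readUTF() or parseInt() throws. A sketch of just the valid-length lookup in that style, assuming dfs and file are in scope as in the test above:

int validLength = (int) file.getLen();
Path validLengthPath = file.getPath().suffix(".valid-length");
if (dfs.exists(validLengthPath)) {
    // try-with-resources guarantees the stream is closed on every path
    try (FSDataInputStream in = dfs.open(validLengthPath)) {
        validLength = Integer.parseInt(in.readUTF());
    }
}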

Example 23 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.

the class RollingSinkITCase method testDateTimeRollingStringWriter.

/**
	 * This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to
	 * produce rolling files. The clock of DateTimeBucketer is set to
	 * {@link ModifyableClock} to keep the time in lockstep with the processing of elements using
	 * latches.
	 */
@Test
public void testDateTimeRollingStringWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/rolling-out";
    DateTimeBucketer.setClock(new ModifyableClock());
    ModifyableClock.setCurrentTime(0);
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    DataStream<Tuple2<Integer, String>> source = env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();
    // the parallel flatMap is chained to the sink, so when it has seen 5 elements it can
    // fire the latch
    DataStream<String> mapped = source.flatMap(new RichFlatMapFunction<Tuple2<Integer, String>, String>() {

        private static final long serialVersionUID = 1L;

        int count = 0;

        @Override
        public void flatMap(Tuple2<Integer, String> value, Collector<String> out) throws Exception {
            out.collect(value.f1);
            count++;
            if (count >= 5) {
                if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                    latch1.trigger();
                } else {
                    latch2.trigger();
                }
                count = 0;
            }
        }
    });
    RollingSink<String> sink = new RollingSink<String>(outPath).setBucketer(new DateTimeBucketer("ss")).setPartPrefix("part").setPendingPrefix("").setPendingSuffix("");
    mapped.addSink(sink);
    env.execute("RollingSink String Write Test");
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    // we should have 8 rolling files: 4 time intervals times a parallelism of 2
    int numFiles = 0;
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        numFiles++;
        if (file.getPath().toString().contains("rolling-out/00")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 0; i < 5; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/05")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 5; i < 10; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/10")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 10; i < 15; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/15")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 15; i < 20; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else {
            Assert.fail("File " + file + " does not match any expected roll pattern.");
        }
    }
    Assert.assertEquals(8, numFiles);
}
Also used : Path(org.apache.hadoop.fs.Path) InputStreamReader(java.io.InputStreamReader) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) Tuple2(org.apache.flink.api.java.tuple.Tuple2) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test)
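
ModifyableClock is not shown in the listing. A sketch of what such a settable test clock could look like, assuming the connector's Clock interface declares a single currentTimeMillis() method (an assumption to verify against the actual Flink interface):

import org.apache.flink.streaming.connectors.fs.Clock;

// Sketch: a clock whose time only moves when the test says so, letting
// the test align bucket rollovers with element processing via latches.
public class ModifyableClock implements Clock {

    private static volatile long currentTime = 0;

    public static void setCurrentTime(long time) {
        currentTime = time;
    }

    @Override
    public long currentTimeMillis() {
        return currentTime;
    }
}

Because time only advances when the test calls setCurrentTime, the DateTimeBucketer's "ss" pattern yields exactly the four 00/05/10/15 buckets the assertions expect.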

Example 24 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

the class AbstractContractGetFileStatusTest method testListFilesNoDir.

@Test
public void testListFilesNoDir() throws Throwable {
    describe("test the listFiles calls on a path which is not present");
    Path path = path("missing");
    try {
        RemoteIterator<LocatedFileStatus> iterator = getFileSystem().listFiles(path, false);
        fail("Expected an exception, got an iterator: " + iterator);
    } catch (FileNotFoundException expected) {
    // expected
    }
    try {
        RemoteIterator<LocatedFileStatus> iterator = getFileSystem().listFiles(path, true);
        fail("Expected an exception, got an iterator: " + iterator);
    } catch (FileNotFoundException expected) {
    // expected
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileNotFoundException(java.io.FileNotFoundException) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Test(org.junit.Test)
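
The two try/catch blocks differ only in the recursive flag, so they could share a helper inside the same test class; expectListFilesFailure below is a name invented for this sketch, not part of the Hadoop contract test class:

// Sketch: shared assertion that listFiles on a missing path fails fast.
private void expectListFilesFailure(Path path, boolean recursive) throws IOException {
    try {
        RemoteIterator<LocatedFileStatus> iterator = getFileSystem().listFiles(path, recursive);
        fail("Expected a FileNotFoundException, got an iterator: " + iterator);
    } catch (FileNotFoundException expected) {
        // expected: the contract requires a FileNotFoundException here
    }
}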

Example 25 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

the class AbstractContractGetFileStatusTest method verifyFileStats.

/**
   * Scan through a filestatus iterator, get the status of every element and
   * verify core attributes. This should identify a situation where the
   * attributes of a file/dir retrieved in a listing operation do not
   * match the values individually retrieved. That is: the metadata returned
   * in a directory listing is different from the explicitly retrieved data.
   *
   * Timestamps are not compared.
   * @param results iterator to scan
   * @return the number of entries in the result set
   * @throws IOException any IO problem
   */
private int verifyFileStats(RemoteIterator<LocatedFileStatus> results) throws IOException {
    describe("verifying file statuses");
    int count = 0;
    while (results.hasNext()) {
        count++;
        LocatedFileStatus next = results.next();
        FileStatus fileStatus = getFileSystem().getFileStatus(next.getPath());
        assertEquals("isDirectory", fileStatus.isDirectory(), next.isDirectory());
        assertEquals("isFile", fileStatus.isFile(), next.isFile());
        assertEquals("getLen", fileStatus.getLen(), next.getLen());
        assertEquals("getOwner", fileStatus.getOwner(), next.getOwner());
    }
    return count;
}
Also used : LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) FileStatus(org.apache.hadoop.fs.FileStatus)
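
The common thread across these examples is FileSystem.listFiles(path, recursive): it returns a RemoteIterator of LocatedFileStatus entries that already carry block locations, sparing callers a separate getFileStatus or getFileBlockLocations call per file. A self-contained sketch of the pattern, with a placeholder path:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public final class ListFilesExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Placeholder path; point this at a real directory.
        Path dir = new Path("/tmp/data");
        FileSystem fs = dir.getFileSystem(conf);
        long totalBytes = 0;
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(dir, true);
        while (files.hasNext()) {
            LocatedFileStatus file = files.next();
            // Block locations come back with the listing; no second RPC needed.
            System.out.println(file.getPath() + " blocks=" + file.getBlockLocations().length);
            totalBytes += file.getLen();
        }
        System.out.println("total bytes: " + totalBytes);
    }
}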

Aggregations

LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 139
Path (org.apache.hadoop.fs.Path): 104
FileSystem (org.apache.hadoop.fs.FileSystem): 55
ArrayList (java.util.ArrayList): 43
Test (org.junit.Test): 33
FileStatus (org.apache.hadoop.fs.FileStatus): 29
IOException (java.io.IOException): 27
Configuration (org.apache.hadoop.conf.Configuration): 20
File (java.io.File): 13
FileNotFoundException (java.io.FileNotFoundException): 11
HashSet (java.util.HashSet): 11
BlockLocation (org.apache.hadoop.fs.BlockLocation): 9
RemoteIterator (org.apache.hadoop.fs.RemoteIterator): 7
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 7
StocatorPath (com.ibm.stocator.fs.common.StocatorPath): 6
HashMap (java.util.HashMap): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
Map (java.util.Map): 5
Matcher (java.util.regex.Matcher): 5
BufferedReader (java.io.BufferedReader): 4