Search in sources :

Example 31 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project tez by apache.

the class TestMRInputHelpers method testOldSplitsGen.

/**
 * Generates a data source descriptor for {@code oldSplitsDir} via
 * {@code generateDataSourceDescriptorMapRed} and checks the result: both split resources must be
 * registered as additional local files, the directory listing must consist of exactly those two
 * non-empty files, and the task location hints must pass {@code verifyLocationHints}.
 */
@Test(timeout = 5000)
public void testOldSplitsGen() throws Exception {
    DataSourceDescriptor descriptor = generateDataSourceDescriptorMapRed(oldSplitsDir);
    Assert.assertTrue(descriptor.getAdditionalLocalFiles().containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
    Assert.assertTrue(descriptor.getAdditionalLocalFiles().containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));

    // Non-recursive listing: only direct children of the splits directory are inspected.
    RemoteIterator<LocatedFileStatus> listing = remoteFs.listFiles(oldSplitsDir, false);
    boolean sawSplits = false;
    boolean sawMetaInfo = false;
    int fileCount = 0;
    while (listing.hasNext()) {
        LocatedFileStatus entry = listing.next();
        String name = entry.getPath().getName();
        fileCount++;

        // Classify the entry; anything that is neither of the two expected resources fails the test.
        boolean isSplits = name.equals(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME);
        boolean isMetaInfo = !isSplits && name.equals(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME);
        if (!isSplits && !isMetaInfo) {
            Assert.fail("Found invalid file in splits dir, filename=" + name);
        }
        sawSplits = sawSplits || isSplits;
        sawMetaInfo = sawMetaInfo || isMetaInfo;

        // Every file that was written must have content.
        Assert.assertTrue(entry.getLen() > 0);
    }

    Assert.assertEquals(2, fileCount);
    Assert.assertTrue(sawSplits);
    Assert.assertTrue(sawMetaInfo);
    verifyLocationHints(oldSplitsDir, descriptor.getLocationHint().getTaskLocationHints());
}
Also used : LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) Test(org.junit.Test)

Example 32 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project tez by apache.

the class TestMRInputHelpers method testNewSplitsGen.

/**
 * Generates a data source descriptor for {@code newSplitsDir} via
 * {@code generateDataSourceDescriptorMapReduce} and checks the result: both split resources must
 * be registered as additional local files, the directory listing must consist of exactly those
 * two non-empty files, and the task location hints must pass {@code verifyLocationHints}.
 */
@Test(timeout = 5000)
public void testNewSplitsGen() throws Exception {
    DataSourceDescriptor descriptor = generateDataSourceDescriptorMapReduce(newSplitsDir);
    Assert.assertTrue(descriptor.getAdditionalLocalFiles().containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
    Assert.assertTrue(descriptor.getAdditionalLocalFiles().containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));

    // Non-recursive listing: only direct children of the splits directory are inspected.
    RemoteIterator<LocatedFileStatus> listing = remoteFs.listFiles(newSplitsDir, false);
    boolean sawSplits = false;
    boolean sawMetaInfo = false;
    int fileCount = 0;
    while (listing.hasNext()) {
        LocatedFileStatus entry = listing.next();
        String name = entry.getPath().getName();
        fileCount++;

        // Classify the entry; anything that is neither of the two expected resources fails the test.
        boolean isSplits = name.equals(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME);
        boolean isMetaInfo = !isSplits && name.equals(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME);
        if (!isSplits && !isMetaInfo) {
            Assert.fail("Found invalid file in splits dir, filename=" + name);
        }
        sawSplits = sawSplits || isSplits;
        sawMetaInfo = sawMetaInfo || isMetaInfo;

        // Every file that was written must have content.
        Assert.assertTrue(entry.getLen() > 0);
    }

    Assert.assertEquals(2, fileCount);
    Assert.assertTrue(sawSplits);
    Assert.assertTrue(sawMetaInfo);
    verifyLocationHints(newSplitsDir, descriptor.getLocationHint().getTaskLocationHints());
}
Also used : LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor) Test(org.junit.Test)

Example 33 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project cdap by caskdata.

the class DynamicPartitioningOutputCommitter method commitJob.

/**
 * Commits the job output for a dynamically partitioned output dataset.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>Look up the output dataset and the configured partition write option.</li>
 *   <li>Recursively list every file under each committed task path and record, per relative
 *       directory, the partition key to add (stored in {@code partitionsToAdd}). With the
 *       {@code CREATE} option, an already existing final directory aborts the commit.</li>
 *   <li>With the {@code CREATE_OR_OVERWRITE} option, drop every partition that already exists.</li>
 *   <li>Merge each committed task path into the qualified final output path.</li>
 *   <li>Add all collected partitions to the dataset, together with the configured metadata.</li>
 *   <li>Call {@code cleanupJob} and, if enabled in the configuration, write success marker files
 *       into every partition directory.</li>
 * </ol>
 *
 * @param context the job context supplying the configuration and the output path
 * @throws IOException if a file system operation fails; in particular a
 *         {@code FileAlreadyExistsException} is thrown when the write option is {@code CREATE}
 *         and a final output directory already exists
 */
@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    // NOTE: this local 'taskContext' shadows the field 'this.taskContext'; the field is passed to the provider
    // here and is used again below (via 'this.') to read the metadata configuration.
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);
    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    outputDataset = taskContext.getDataset(outputDatasetName);
    DynamicPartitioner.PartitionWriteOption partitionWriteOption = DynamicPartitioner.PartitionWriteOption.valueOf(configuration.get(PartitionedFileSetArguments.DYNAMIC_PARTITIONER_WRITE_OPTION));
    Partitioning partitioning = outputDataset.getPartitioning();
    // maps the relative directory of each partition to its partition key
    partitionsToAdd = new HashMap<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        // 'true' requests a recursive listing, so files in nested partition directories are visited as well
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);
            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path, relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            // CREATE must not write into an existing location: fail before anything is moved
            if (partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE) {
                if (fs.exists(finalDir)) {
                    throw new FileAlreadyExistsException("Final output path already exists: " + finalDir);
                }
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            // several files may share a directory; later puts simply replace the entry for the same key
            partitionsToAdd.put(relativeDir, partitionKey);
        }
    }
    // need to remove any existing partitions, before moving temporary content to final output
    if (partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_OVERWRITE) {
        for (Map.Entry<String, PartitionKey> entry : partitionsToAdd.entrySet()) {
            if (outputDataset.getPartition(entry.getValue()) != null) {
                // this allows reinstating the existing files if there's a rollback.
                // alternative is to simply remove the files within the partition's location
                // upside to that is easily avoiding explore operations. one downside is that metadata is not removed then
                outputDataset.dropPartition(entry.getValue());
            }
        }
    }
    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileContext fc = FileContext.getFileContext(configuration);
    // the finalOutput path doesn't have scheme or authority (but 'from' does)
    finalOutput = fc.makeQualified(finalOutput);
    // the committed task paths are listed a second time here rather than reusing 'allCommittedTaskPaths'
    for (FileStatus from : getAllCommittedTaskPaths(context)) {
        mergePaths(fc, from, finalOutput);
    }
    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(), PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);
    boolean allowAppend = partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_APPEND;
    // create all the necessary partitions
    for (Map.Entry<String, PartitionKey> entry : partitionsToAdd.entrySet()) {
        outputDataset.addPartition(entry.getValue(), entry.getKey(), metadata, true, allowAppend);
    }
    // delete the job-specific _temporary folder
    cleanupJob(context);
    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : partitionsToAdd.keySet()) {
            Path pathToMark = new Path(finalOutput, relativePath);
            createOrUpdate(fc, new Path(pathToMark, SUCCEEDED_FILE_NAME));
            // also create a _SUCCESS-<RunId>, if allowing append
            if (allowAppend) {
                createOrUpdate(fc, new Path(pathToMark, SUCCEEDED_FILE_NAME + "-" + taskContext.getProgramRunId().getRun()));
            }
        }
    }
}
Also used : BasicMapReduceTaskContext(co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext) Path(org.apache.hadoop.fs.Path) MapReduceClassLoader(co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) FileAlreadyExistsException(org.apache.hadoop.mapred.FileAlreadyExistsException) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Configuration(org.apache.hadoop.conf.Configuration) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) DynamicPartitioner(co.cask.cdap.api.dataset.lib.DynamicPartitioner) HashMap(java.util.HashMap) Map(java.util.Map) FileContext(org.apache.hadoop.fs.FileContext)

Example 34 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project incubator-crail by apache.

the class HdfsIOBenchmark method enumerateDir.

/**
 * Benchmarks directory enumeration on {@code path}.
 *
 * <p>Runs four measurement rounds. Each round performs {@code size} non-recursive listings of
 * {@code path}, draining the returned iterator every time, and then prints the total round time
 * in milliseconds and the average time per listing in microseconds.
 *
 * <p>The file system handle is closed when the method exits, including when a listing throws.
 *
 * @throws Exception if the file system cannot be obtained or a listing fails
 */
void enumerateDir() throws Exception {
    System.out.println("enumarate dir, path " + path);
    Configuration conf = new Configuration();
    // try-with-resources closes the handle on every exit path; previously an exception thrown
    // while listing skipped the trailing close() call and leaked the handle.
    try (FileSystem fs = FileSystem.get(conf)) {
        int repfactor = 4;
        for (int k = 0; k < repfactor; k++) {
            long start = System.currentTimeMillis();
            for (int i = 0; i < size; i++) {
                // single operation == one full listing, drained to the end
                RemoteIterator<LocatedFileStatus> iter = fs.listFiles(path, false);
                while (iter.hasNext()) {
                    iter.next();
                }
            }
            long end = System.currentTimeMillis();
            double executionTime = ((double) (end - start));
            // milliseconds for the whole round -> microseconds per listing
            double latency = executionTime * 1000.0 / ((double) size);
            System.out.println("execution time [ms] " + executionTime);
            System.out.println("latency [us] " + latency);
        }
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus)

Example 35 with LocatedFileStatus

use of org.apache.hadoop.fs.LocatedFileStatus in project drill by apache.

the class TemporaryTablesAutomaticDropTest method createAndCheckSessionTemporaryLocation.

/**
 * Creates a temporary table named {@code temporary_table_automatic_drop_<suffix>} in
 * {@code DFS_TMP_SCHEMA} and verifies the resulting on-disk layout under {@code schemaLocation}.
 *
 * <p>Checks that the session directory and the table directory exist with the temporary folder
 * permission, and that every file directly inside the table directory has the temporary file
 * permission.
 *
 * @param suffix suffix appended to the temporary table name
 * @param schemaLocation directory under which the session directory is resolved
 * @return the session temporary location
 * @throws Exception if the query or any file system access fails
 */
private File createAndCheckSessionTemporaryLocation(String suffix, File schemaLocation) throws Exception {
    String tableName = "temporary_table_automatic_drop_" + suffix;
    File sessionDir = schemaLocation.toPath().resolve(SESSION_UUID.toString()).toFile();
    test("create TEMPORARY table %s.%s as select 'A' as c1 from (values(1))", DFS_TMP_SCHEMA, tableName);

    FileSystem localFs = getLocalFileSystem();

    // Session-level directory.
    Path sessionDirPath = new Path(sessionDir.getAbsolutePath());
    assertTrue("Session temporary location should exist", localFs.exists(sessionDirPath));
    assertEquals("Directory permission should match",
        StorageStrategy.TEMPORARY.getFolderPermission(),
        localFs.getFileStatus(sessionDirPath).getPermission());

    // Table-level directory inside the session directory.
    Path tableDirPath = new Path(sessionDirPath, SESSION_UUID.toString());
    assertTrue("Temporary table location should exist", localFs.exists(tableDirPath));
    assertEquals("Directory permission should match",
        StorageStrategy.TEMPORARY.getFolderPermission(),
        localFs.getFileStatus(tableDirPath).getPermission());

    // Files directly inside the table directory (non-recursive listing).
    for (RemoteIterator<LocatedFileStatus> files = localFs.listFiles(tableDirPath, false); files.hasNext(); ) {
        assertEquals("File permission should match",
            StorageStrategy.TEMPORARY.getFilePermission(),
            files.next().getPermission());
    }
    return sessionDir;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) File(java.io.File)

Aggregations

LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus)139 Path (org.apache.hadoop.fs.Path)104 FileSystem (org.apache.hadoop.fs.FileSystem)55 ArrayList (java.util.ArrayList)43 Test (org.junit.Test)33 FileStatus (org.apache.hadoop.fs.FileStatus)29 IOException (java.io.IOException)27 Configuration (org.apache.hadoop.conf.Configuration)20 File (java.io.File)13 FileNotFoundException (java.io.FileNotFoundException)11 HashSet (java.util.HashSet)11 BlockLocation (org.apache.hadoop.fs.BlockLocation)9 RemoteIterator (org.apache.hadoop.fs.RemoteIterator)7 DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem)7 StocatorPath (com.ibm.stocator.fs.common.StocatorPath)6 HashMap (java.util.HashMap)6 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)6 Map (java.util.Map)5 Matcher (java.util.regex.Matcher)5 BufferedReader (java.io.BufferedReader)4