
Example 76 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.

The class BucketingSinkFaultToleranceITCase, method postSubmit.

@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)
    Pattern messageRegex = Pattern.compile("message (\\d*)");
    // Keep a set of the message IDs that we read. Its size must equal NUM_STRINGS;
    // if more lines were read than there are entries in the set, some elements were
    // read more than once.
    Set<Integer> readNumbers = Sets.newHashSet();
    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream validLengthStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = validLengthStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                // Close the companion stream before opening the part file itself.
                validLengthStream.close();
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();
            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);
            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    uniqMessagesRead.add(line);
                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX) && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");
                        }
                    }
                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }
            br.close();
            inStreamReader.close();
            bais.close();
        }
    }
    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());
    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());
}
Also used : Path(org.apache.hadoop.fs.Path) Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) Matcher(java.util.regex.Matcher) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) HashSet(java.util.HashSet)
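
The valid-length lookup above is what lets the test run on file systems without truncate(): a companion ".valid-length" file records how many bytes of the part file are trustworthy. A minimal sketch of that logic factored into a helper (the helper name is hypothetical; it assumes the same FileSystem handle and suffix convention as the test):

// Hypothetical helper: how many bytes of a part file are safe to read.
// If a ".valid-length" companion file exists, its recorded value wins;
// otherwise the full file length is used.
private static int readableLength(FileSystem fs, LocatedFileStatus file) throws IOException {
    Path validLengthPath = file.getPath().suffix(".valid-length");
    if (fs.exists(validLengthPath)) {
        try (FSDataInputStream in = fs.open(validLengthPath)) {
            // Read with readUTF(), matching how the test above parses it.
            return Integer.parseInt(in.readUTF());
        }
    }
    return (int) file.getLen();
}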

Example 77 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.

The class BucketingSinkTest, method testDateTimeRollingStringWriter.

/**
	 * This uses {@link DateTimeBucketer} to
	 * produce rolling files. We use {@link OneInputStreamOperatorTestHarness} to manually
	 * advance processing time.
	 */
@Test
public void testDateTimeRollingStringWriter() throws Exception {
    final int numElements = 20;
    final String outPath = hdfsURI + "/rolling-out";
    BucketingSink<String> sink = new BucketingSink<String>(outPath)
            .setBucketer(new DateTimeBucketer<String>("ss"))
            .setPartPrefix(PART_PREFIX)
            .setPendingPrefix("")
            .setPendingSuffix("");
    OneInputStreamOperatorTestHarness<String, Object> testHarness = createTestSink(sink, 1, 0);
    testHarness.setProcessingTime(0L);
    testHarness.setup();
    testHarness.open();
    for (int i = 0; i < numElements; i++) {
        // Every 5 elements, increase the clock time. We should end up with 5 elements per bucket.
        if (i % 5 == 0) {
            testHarness.setProcessingTime(i * 1000L);
        }
        testHarness.processElement(new StreamRecord<>("message #" + Integer.toString(i)));
    }
    testHarness.close();
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    // We should have 4 rolling files across 4 time intervals
    int numFiles = 0;
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        numFiles++;
        if (file.getPath().toString().contains("rolling-out/00")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 0; i < 5; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/05")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 5; i < 10; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/10")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 10; i < 15; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/15")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 15; i < 20; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else {
            Assert.fail("File " + file + " does not match any expected roll pattern.");
        }
    }
    Assert.assertEquals(4, numFiles);
}
Also used : Path(org.apache.hadoop.fs.Path) InputStreamReader(java.io.InputStreamReader) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) TypeHint(org.apache.flink.api.common.typeinfo.TypeHint) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Test(org.junit.Test)
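
The four branches above repeat the same read-and-assert loop with different index ranges. A minimal sketch of a helper that could replace them (the helper name is hypothetical; try-with-resources closes the reader and the underlying stream):

// Hypothetical helper: asserts that one bucket file contains exactly the
// messages with IDs in [from, to), one message per line.
private static void assertBucketContains(FileSystem fs, Path path, int from, int to) throws IOException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)))) {
        for (int i = from; i < to; i++) {
            Assert.assertEquals("message #" + i, br.readLine());
        }
    }
}

Each branch then reduces to a single call, for example assertBucketContains(dfs, file.getPath(), 0, 5) for the "rolling-out/00" bucket.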

Example 78 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project hbase by apache.

The class HRegion, method computeHDFSBlocksDistribution.

/**
   * This is a helper function to compute the HDFS block distribution on demand.
   * @param conf configuration
   * @param tableDescriptor HTableDescriptor of the table
   * @param regionInfo HRegionInfo describing the region
   * @param tablePath the table directory
   * @return the HDFS blocks distribution for the given region
   * @throws IOException if the region's store files cannot be listed or read
   */
public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf, final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo, Path tablePath) throws IOException {
    HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
    FileSystem fs = tablePath.getFileSystem(conf);
    HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
    for (HColumnDescriptor family : tableDescriptor.getFamilies()) {
        List<LocatedFileStatus> locatedFileStatusList = HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true);
        if (locatedFileStatusList == null) {
            continue;
        }
        for (LocatedFileStatus status : locatedFileStatusList) {
            Path p = status.getPath();
            if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) {
                // Only construct a StoreFileInfo object if it is not an HFile,
                // to save object creation.
                StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status);
                hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
            } else if (StoreFileInfo.isHFile(p)) {
                // If it is an HFile, just add its block locations to the distribution;
                // avoid creating more objects here, not even another HDFSBlocksDistribution.
                FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations());
            } else {
                throw new IOException("path=" + p + " doesn't look like a valid StoreFile");
            }
        }
    }
    return hdfsBlocksDistribution;
}
Also used : Path(org.apache.hadoop.fs.Path) HColumnDescriptor(org.apache.hadoop.hbase.HColumnDescriptor) FileSystem(org.apache.hadoop.fs.FileSystem) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) MultipleIOException(org.apache.hadoop.io.MultipleIOException) DoNotRetryIOException(org.apache.hadoop.hbase.DoNotRetryIOException) TimeoutIOException(org.apache.hadoop.hbase.exceptions.TimeoutIOException) HDFSBlocksDistribution(org.apache.hadoop.hbase.HDFSBlocksDistribution)
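
The point of iterating LocatedFileStatus here is that the block locations arrive with the listing, so no extra getFileBlockLocations() round trip is needed; HDFSBlocksDistribution then accumulates weight per host from those locations. A simplified fragment of that per-host aggregation using only the Hadoop API (assumes Java 8 and an enclosing method that declares throws IOException; HBase's real class tracks more than this, e.g. total weight across unique blocks):

// Sum the bytes stored on each host for one file, using the block
// locations that the listing already returned.
Map<String, Long> bytesPerHost = new HashMap<>();
for (BlockLocation block : status.getBlockLocations()) {
    for (String host : block.getHosts()) {
        bytesPerHost.merge(host, block.getLength(), Long::sum);
    }
}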

Example 79 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.

The class HadoopConverterJob, method run.

public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);
    // Map only. Number of map tasks determined by input format
    jobConf.setNumReduceTasks(0);
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));
    setJobName(jobConf, segments);
    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }
    final Job job = Job.getInstance(jobConf);
    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);
    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()), JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())), job);
    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();
        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList.copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {

            @Nullable
            @Override
            public DataSegment apply(final Path input) {
                try {
                    if (!fs.exists(input)) {
                        throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]", ConvertingOutputFormat.DATA_SUCCESS_KEY, ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                    }
                } catch (final IOException e) {
                    throw Throwables.propagate(e);
                }
                try (final InputStream stream = fs.open(input)) {
                    return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                } catch (final IOException e) {
                    throw Throwables.propagate(e);
                }
            }
        }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE("Tasks reported success but result length did not match! Expected %d found %d at path [%s]", segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
Also used : ArrayList(java.util.ArrayList) DataSegment(io.druid.timeline.DataSegment) WindowedDataSegment(io.druid.indexer.hadoop.WindowedDataSegment) Function(com.google.common.base.Function) FileSystem(org.apache.hadoop.fs.FileSystem) ISE(io.druid.java.util.common.ISE) Job(org.apache.hadoop.mapreduce.Job) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) TaskReport(org.apache.hadoop.mapreduce.TaskReport) InputStream(java.io.InputStream) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) IAE(io.druid.java.util.common.IAE) Map(java.util.Map) JobID(org.apache.hadoop.mapreduce.JobID)
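
The segment-loading transform above uses Guava's anonymous Function and Throwables.propagate. Assuming Java 8 is available, the same step can be written as a lambda with try-with-resources; a sketch that keeps the surrounding fs, goodPaths and error behavior but drops the explicit fs.exists() precheck for brevity:

// Sketch: the same transform as a lambda; fs.open() fails anyway if the
// data file is missing, so the separate existence check is omitted here.
final List<DataSegment> returnList = ImmutableList.copyOf(Lists.transform(goodPaths, input -> {
    try (InputStream stream = fs.open(input)) {
        return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}));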

Example 80 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.

The class DatasourceInputFormatTest, method setUp.

@Before
public void setUp() throws Exception {
    segments = ImmutableList.of(
            WindowedDataSegment.of(new DataSegment("test1", Interval.parse("2000/3000"), "ver",
                    ImmutableMap.<String, Object>of("type", "local", "path", "/tmp/index1.zip"),
                    ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
                    NoneShardSpec.instance(), 9, 2)),
            WindowedDataSegment.of(new DataSegment("test2", Interval.parse("2050/3000"), "ver",
                    ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp/index2.zip"),
                    ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
                    NoneShardSpec.instance(), 9, 11)),
            WindowedDataSegment.of(new DataSegment("test3", Interval.parse("2030/3000"), "ver",
                    ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp/index3.zip"),
                    ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
                    NoneShardSpec.instance(), 9, 4)));
    Path path1 = new Path(JobHelper.getURIFromSegment(segments.get(0).getSegment()));
    Path path2 = new Path(JobHelper.getURIFromSegment(segments.get(1).getSegment()));
    Path path3 = new Path(JobHelper.getURIFromSegment(segments.get(2).getSegment()));
    // dummy locations for test
    locations = ImmutableList.of(
            new LocatedFileStatus(1000, false, 0, 0, 0, 0, null, null, null, null, path1,
                    new BlockLocation[] {
                            new BlockLocation(null, new String[] { "s1", "s2" }, 0, 600),
                            new BlockLocation(null, new String[] { "s2", "s3" }, 600, 400) }),
            new LocatedFileStatus(4000, false, 0, 0, 0, 0, null, null, null, null, path2,
                    new BlockLocation[] {
                            new BlockLocation(null, new String[] { "s1", "s2" }, 0, 1000),
                            new BlockLocation(null, new String[] { "s1", "s3" }, 1000, 1200),
                            new BlockLocation(null, new String[] { "s2", "s3" }, 2200, 1100),
                            new BlockLocation(null, new String[] { "s1", "s2" }, 3300, 700) }),
            new LocatedFileStatus(500, false, 0, 0, 0, 0, null, null, null, null, path3,
                    new BlockLocation[] {
                            new BlockLocation(null, new String[] { "s2", "s3" }, 0, 500) }));
    config = new JobConf();
    config.set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, new DefaultObjectMapper().writeValueAsString(segments));
    context = EasyMock.createMock(JobContext.class);
    EasyMock.expect(context.getConfiguration()).andReturn(config);
    EasyMock.replay(context);
}
Also used : Path(org.apache.hadoop.fs.Path) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) DefaultObjectMapper(io.druid.jackson.DefaultObjectMapper) JobContext(org.apache.hadoop.mapreduce.JobContext) BlockLocation(org.apache.hadoop.fs.BlockLocation) DataSegment(io.druid.timeline.DataSegment) JobConf(org.apache.hadoop.mapred.JobConf) Before(org.junit.Before)
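
The LocatedFileStatus constructor takes twelve positional arguments, which makes the dummy statuses above hard to scan. Here is the first one annotated (parameter names paraphrased from the constructor as used above):

// The first dummy status, with the constructor arguments labeled.
LocatedFileStatus status1 = new LocatedFileStatus(
        1000,                 // length in bytes
        false,                // isdir
        0,                    // block replication
        0,                    // block size
        0,                    // modification time
        0,                    // access time
        null,                 // permission
        null,                 // owner
        null,                 // group
        null,                 // symlink
        path1,                // path
        new BlockLocation[] { // block locations: (names, hosts, offset, length)
                new BlockLocation(null, new String[] { "s1", "s2" }, 0, 600),
                new BlockLocation(null, new String[] { "s2", "s3" }, 600, 400) });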

Aggregations

LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 145 usages
Path (org.apache.hadoop.fs.Path): 105 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 54 usages
ArrayList (java.util.ArrayList): 48 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 34 usages
Test (org.junit.Test): 33 usages
IOException (java.io.IOException): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 20 usages
File (java.io.File): 13 usages
HashSet (java.util.HashSet): 12 usages
FileNotFoundException (java.io.FileNotFoundException): 11 usages
BlockLocation (org.apache.hadoop.fs.BlockLocation): 10 usages
RemoteIterator (org.apache.hadoop.fs.RemoteIterator): 8 usages
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 7 usages
StocatorPath (com.ibm.stocator.fs.common.StocatorPath): 6 usages
HashMap (java.util.HashMap): 6 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6 usages
Map (java.util.Map): 5 usages
Matcher (java.util.regex.Matcher): 5 usages
BufferedReader (java.io.BufferedReader): 4 usages
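
Taken together, the most frequent combination in this table is a recursive listing whose results already carry block locations. A minimal, self-contained sketch of that pattern (the path is a placeholder; run it against any Hadoop-compatible file system):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListWithLocations {

    public static void main(String[] args) throws IOException {
        // Placeholder: point this at any directory on the configured file system.
        Path root = new Path(args.length > 0 ? args[0] : "/tmp");
        FileSystem fs = root.getFileSystem(new Configuration());
        // listFiles(path, true) walks the tree recursively and, unlike listStatus,
        // returns LocatedFileStatus objects that already include block locations.
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
        while (it.hasNext()) {
            LocatedFileStatus status = it.next();
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            for (BlockLocation block : status.getBlockLocations()) {
                System.out.println("  offset " + block.getOffset()
                        + ", length " + block.getLength()
                        + ", hosts " + String.join(",", block.getHosts()));
            }
        }
    }
}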