Use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.
Class BucketingSinkFaultToleranceITCase, method postSubmit.
@Override
public void postSubmit() throws Exception {
    // We read the files and verify that we have read all the strings. If a valid-length
    // file exists we only read the file to that point. (This test should work with
    // FileSystems that support truncate() and with others as well.)
    Pattern messageRegex = Pattern.compile("message (\\d*)");
    // Keep a set of the message IDs that we read. The size must equal the read count and
    // the NUM_STRINGS. If numRead is bigger than the size of the set we have seen some
    // elements twice.
    Set<Integer> readNumbers = Sets.newHashSet();
    HashSet<String> uniqMessagesRead = new HashSet<>();
    HashSet<String> messagesInCommittedFiles = new HashSet<>();
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        if (!file.getPath().toString().endsWith(".valid-length")) {
            int validLength = (int) file.getLen();
            if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                FSDataInputStream inStream = dfs.open(file.getPath().suffix(".valid-length"));
                String validLengthString = inStream.readUTF();
                validLength = Integer.parseInt(validLengthString);
                System.out.println("VALID LENGTH: " + validLength);
            }
            FSDataInputStream inStream = dfs.open(file.getPath());
            byte[] buffer = new byte[validLength];
            inStream.readFully(0, buffer, 0, validLength);
            inStream.close();
            ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
            InputStreamReader inStreamReader = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(inStreamReader);
            String line = br.readLine();
            while (line != null) {
                Matcher matcher = messageRegex.matcher(line);
                if (matcher.matches()) {
                    uniqMessagesRead.add(line);
                    // check that in the committed files there are no duplicates
                    if (!file.getPath().toString().endsWith(IN_PROGRESS_SUFFIX)
                            && !file.getPath().toString().endsWith(PENDING_SUFFIX)) {
                        if (!messagesInCommittedFiles.add(line)) {
                            Assert.fail("Duplicate entry in committed bucket.");
                        }
                    }
                    int messageId = Integer.parseInt(matcher.group(1));
                    readNumbers.add(messageId);
                } else {
                    Assert.fail("Read line does not match expected pattern.");
                }
                line = br.readLine();
            }
            br.close();
            inStreamReader.close();
            bais.close();
        }
    }
    // Verify that we read all strings (at-least-once)
    Assert.assertEquals(NUM_STRINGS, readNumbers.size());
    // Verify that we don't have duplicates (boom!, exactly-once)
    Assert.assertEquals(NUM_STRINGS, uniqMessagesRead.size());
}
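Every example on this page follows the same listing idiom the test above starts from: FileSystem.listFiles(path, true) walks the directory tree recursively and hands back LocatedFileStatus entries, which bundle the usual file metadata with the file's block locations. A minimal sketch of just that idiom, assuming an already-initialized FileSystem fs and an output directory outPath (placeholder names, not taken from the test):

RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path(outPath), true);
while (files.hasNext()) {
    LocatedFileStatus file = files.next();
    // Each entry already carries its block locations next to path, length, etc.,
    // so no extra getFileBlockLocations() call is needed per file.
    System.out.println(file.getPath() + " (" + file.getLen() + " bytes, "
        + file.getBlockLocations().length + " blocks)");
}

Because the block locations come back with the listing, callers that care about data locality (see the HBase example below) avoid a separate namenode round trip for every file.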
Use of org.apache.hadoop.fs.LocatedFileStatus in project flink by apache.
Class BucketingSinkTest, method testDateTimeRollingStringWriter.
/**
* This uses {@link DateTimeBucketer} to
* produce rolling files. We use {@link OneInputStreamOperatorTestHarness} to manually
* advance processing time.
*/
@Test
public void testDateTimeRollingStringWriter() throws Exception {
    final int numElements = 20;
    final String outPath = hdfsURI + "/rolling-out";
    BucketingSink<String> sink = new BucketingSink<String>(outPath)
        .setBucketer(new DateTimeBucketer<String>("ss"))
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");
    OneInputStreamOperatorTestHarness<String, Object> testHarness = createTestSink(sink, 1, 0);
    testHarness.setProcessingTime(0L);
    testHarness.setup();
    testHarness.open();
    for (int i = 0; i < numElements; i++) {
        // Every 5 elements, increase the clock time. We should end up with 5 elements per bucket.
        if (i % 5 == 0) {
            testHarness.setProcessingTime(i * 1000L);
        }
        testHarness.processElement(new StreamRecord<>("message #" + Integer.toString(i)));
    }
    testHarness.close();
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);
    // We should have 4 rolling files across 4 time intervals
    int numFiles = 0;
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        numFiles++;
        if (file.getPath().toString().contains("rolling-out/00")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 0; i < 5; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/05")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 5; i < 10; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/10")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 10; i < 15; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else if (file.getPath().toString().contains("rolling-out/15")) {
            FSDataInputStream inStream = dfs.open(file.getPath());
            BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
            for (int i = 15; i < 20; i++) {
                String line = br.readLine();
                Assert.assertEquals("message #" + i, line);
            }
            inStream.close();
        } else {
            Assert.fail("File " + file + " does not match any expected roll pattern.");
        }
    }
    Assert.assertEquals(4, numFiles);
}
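The four assertions above hinge on how the "ss" date-time pattern maps processing time onto the bucket names 00, 05, 10 and 15. The snippet below is illustrative only (it is not taken from the Flink sources, which format with the bucketer's own clock and time zone), but it mirrors that mapping:

// Pin the time zone so the demo prints deterministically; the seconds field then
// reproduces the bucket ids the test expects: 0 s -> "00", 5 s -> "05", 10 s -> "10", 15 s -> "15".
SimpleDateFormat secondsFormat = new SimpleDateFormat("ss");
secondsFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
for (long processingTime : new long[] { 0L, 5000L, 10000L, 15000L }) {
    System.out.println(processingTime + " ms -> bucket " + secondsFormat.format(new Date(processingTime)));
}

(Uses java.text.SimpleDateFormat, java.util.Date and java.util.TimeZone.)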
Use of org.apache.hadoop.fs.LocatedFileStatus in project hbase by apache.
Class HRegion, method computeHDFSBlocksDistribution.
/**
 * This is a helper function to compute the HDFS block distribution on demand
 * @param conf configuration
 * @param tableDescriptor HTableDescriptor of the table
 * @param regionInfo HRegionInfo describing the region
 * @param tablePath the table directory
 * @return The HDFS blocks distribution for the given region.
 * @throws IOException
 */
public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
        final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo, Path tablePath)
        throws IOException {
    HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
    FileSystem fs = tablePath.getFileSystem(conf);
    HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
    for (HColumnDescriptor family : tableDescriptor.getFamilies()) {
        List<LocatedFileStatus> locatedFileStatusList =
            HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true);
        if (locatedFileStatusList == null) {
            continue;
        }
        for (LocatedFileStatus status : locatedFileStatusList) {
            Path p = status.getPath();
            if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) {
                // Only construct a StoreFileInfo object if it is not a plain HFile,
                // to save on object creation
                StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status);
                hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
            } else if (StoreFileInfo.isHFile(p)) {
                // If it is an HFile, just add it to the block distribution;
                // no need to create more objects here, not even another HDFSBlocksDistribution
                FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations());
            } else {
                throw new IOException("path=" + p + " doesn't look like a valid StoreFile");
            }
        }
    }
    return hdfsBlocksDistribution;
}
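A LocatedFileStatus exposes its BlockLocation array directly, and an HDFS blocks distribution is, at its core, a per-host byte count over those blocks. The helper below is a simplified stand-in for what gets accumulated above, not the HBase implementation:

// Sum, per host, the bytes of every block replica reported for the file.
static Map<String, Long> bytesPerHost(LocatedFileStatus status) throws IOException {
    Map<String, Long> weights = new HashMap<>();
    for (BlockLocation block : status.getBlockLocations()) {
        for (String host : block.getHosts()) {
            weights.merge(host, block.getLength(), Long::sum);
        }
    }
    return weights;
}

(Uses java.util.HashMap/Map and org.apache.hadoop.fs.BlockLocation; BlockLocation.getHosts() declares IOException, hence the throws clause.)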
Use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.
Class HadoopConverterJob, method run.
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);
    // Map only. Number of map tasks determined by input format
    jobConf.setNumReduceTasks(0);
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));
    setJobName(jobConf, segments);
    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }
    final Job job = Job.getInstance(jobConf);
    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);
    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
        JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
        job);
    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();
        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList.copyOf(
            Lists.transform(goodPaths, new Function<Path, DataSegment>() {

                @Nullable
                @Override
                public DataSegment apply(final Path input) {
                    try {
                        if (!fs.exists(input)) {
                            throw new ISE(
                                "Somehow [%s] was found but [%s] is missing at [%s]",
                                ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                ConvertingOutputFormat.DATA_FILE_KEY,
                                jobDir);
                        }
                    } catch (final IOException e) {
                        throw Throwables.propagate(e);
                    }
                    try (final InputStream stream = fs.open(input)) {
                        return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                    } catch (final IOException e) {
                        throw Throwables.propagate(e);
                    }
                }
            }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
Use of org.apache.hadoop.fs.LocatedFileStatus in project druid by druid-io.
Class DatasourceInputFormatTest, method setUp.
@Before
public void setUp() throws Exception {
    segments = ImmutableList.of(
        WindowedDataSegment.of(new DataSegment(
            "test1", Interval.parse("2000/3000"), "ver",
            ImmutableMap.<String, Object>of("type", "local", "path", "/tmp/index1.zip"),
            ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
            NoneShardSpec.instance(), 9, 2)),
        WindowedDataSegment.of(new DataSegment(
            "test2", Interval.parse("2050/3000"), "ver",
            ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp/index2.zip"),
            ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
            NoneShardSpec.instance(), 9, 11)),
        WindowedDataSegment.of(new DataSegment(
            "test3", Interval.parse("2030/3000"), "ver",
            ImmutableMap.<String, Object>of("type", "hdfs", "path", "/tmp/index3.zip"),
            ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"),
            NoneShardSpec.instance(), 9, 4)));
    Path path1 = new Path(JobHelper.getURIFromSegment(segments.get(0).getSegment()));
    Path path2 = new Path(JobHelper.getURIFromSegment(segments.get(1).getSegment()));
    Path path3 = new Path(JobHelper.getURIFromSegment(segments.get(2).getSegment()));
    // dummy locations for test
    locations = ImmutableList.of(
        new LocatedFileStatus(1000, false, 0, 0, 0, 0, null, null, null, null, path1,
            new BlockLocation[] {
                new BlockLocation(null, new String[] { "s1", "s2" }, 0, 600),
                new BlockLocation(null, new String[] { "s2", "s3" }, 600, 400)
            }),
        new LocatedFileStatus(4000, false, 0, 0, 0, 0, null, null, null, null, path2,
            new BlockLocation[] {
                new BlockLocation(null, new String[] { "s1", "s2" }, 0, 1000),
                new BlockLocation(null, new String[] { "s1", "s3" }, 1000, 1200),
                new BlockLocation(null, new String[] { "s2", "s3" }, 2200, 1100),
                new BlockLocation(null, new String[] { "s1", "s2" }, 3300, 700)
            }),
        new LocatedFileStatus(500, false, 0, 0, 0, 0, null, null, null, null, path3,
            new BlockLocation[] {
                new BlockLocation(null, new String[] { "s2", "s3" }, 0, 500)
            }));
    config = new JobConf();
    config.set(DatasourceInputFormat.CONF_INPUT_SEGMENTS,
        new DefaultObjectMapper().writeValueAsString(segments));
    context = EasyMock.createMock(JobContext.class);
    EasyMock.expect(context.getConfiguration()).andReturn(config);
    EasyMock.replay(context);
}
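These dummy locations are only useful if they stay consistent with the file lengths they advertise (600 + 400 = 1000, 1000 + 1200 + 1100 + 700 = 4000, and 500 = 500). A small illustrative check of that invariant, not part of the Druid test itself:

// Verify that the fixture's BlockLocations cover exactly the length the LocatedFileStatus reports.
static void assertBlocksCoverFile(LocatedFileStatus status) throws IOException {
    long covered = 0;
    for (BlockLocation block : status.getBlockLocations()) {
        covered += block.getLength();
    }
    Assert.assertEquals(status.getLen(), covered);
}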