Use of org.apache.hadoop.fs.PathFilter in project hadoop by apache.
Class HadoopArchiveLogsRunner, method runInternal().
private int runInternal() throws Exception {
    String remoteAppLogDir = remoteLogDir + File.separator + user + File.separator + suffix + File.separator + appId;
    // Run 'hadoop archives' command in local mode
    conf.set("mapreduce.framework.name", "local");
    // Set the umask so we get 640 files and 750 dirs
    conf.set("fs.permissions.umask-mode", "027");
    HadoopArchives ha = new HadoopArchives(conf);
    String[] haArgs = { "-archiveName", appId + ".har", "-p", remoteAppLogDir, "*", workingDir };
    StringBuilder sb = new StringBuilder("Executing 'hadoop archives'");
    for (String haArg : haArgs) {
        sb.append("\n\t").append(haArg);
    }
    LOG.info(sb.toString());
    ha.run(haArgs);
    FileSystem fs = null;
    // Move har file to correct location and delete original logs
    try {
        fs = FileSystem.get(conf);
        Path harDest = new Path(remoteAppLogDir, appId + ".har");
        LOG.info("Moving har to original location");
        fs.rename(new Path(workingDir, appId + ".har"), harDest);
        LOG.info("Deleting original logs");
        for (FileStatus original : fs.listStatus(new Path(remoteAppLogDir), new PathFilter() {

            @Override
            public boolean accept(Path path) {
                return !path.getName().endsWith(".har");
            }
        })) {
            fs.delete(original.getPath(), false);
        }
    } finally {
        if (fs != null) {
            fs.close();
        }
    }
    return 0;
}
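PathFilter declares a single accept(Path) method, so on Java 8 and later the anonymous class above can also be written as a lambda. A minimal, self-contained sketch of the same kind of non-.har listing (the directory path and class name below are illustrative, not values the excerpt computes at runtime):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class NonHarListing {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same predicate as the anonymous class: keep everything except .har entries.
        PathFilter nonHar = path -> !path.getName().endsWith(".har");
        try (FileSystem fs = FileSystem.get(conf)) {
            // Hypothetical directory; the real code builds remoteAppLogDir from job state.
            for (FileStatus status : fs.listStatus(new Path("/tmp/app-logs"), nonHar)) {
                System.out.println("Would delete: " + status.getPath());
            }
        }
    }
}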
Use of org.apache.hadoop.fs.PathFilter in project mongo-hadoop by mongodb.
Class BSONFileInputFormat, method getSplits().
@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(config);
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("skipping file %s not matched path filter.", file.getPath()));
            }
            continue;
        } else if (!isSplitable(context, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so " + "cannot be split.");
            splits.add(splitter.createFileSplit(file, FileSystem.get(file.getPath().toUri(), config), 0L, file.getLen()));
            continue;
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("processing file " + file.getPath());
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), config);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        splits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d found.", splits.size()));
    }
    return splits;
}
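Here getInputPathFilter(context) returns whatever filter class was registered on the job through FileInputFormat.setInputPathFilter, so callers can control which input files this method considers at all. A short sketch of registering a custom filter (the filter class and job name are made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical filter: only consider files with a .bson extension.
public class BsonOnlyFilter implements PathFilter {

    @Override
    public boolean accept(Path path) {
        return path.getName().endsWith(".bson");
    }

    // Registers the filter on the job so that getInputPathFilter(context) returns it.
    public static Job newJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "bson-example");
        FileInputFormat.setInputPathFilter(job, BsonOnlyFilter.class);
        return job;
    }
}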
Use of org.apache.hadoop.fs.PathFilter in project voldemort by voldemort.
Class HadoopStoreBuilder, method processCheckSumMetadataFile().
/**
 * For the given node, the following three actions are performed:
 *
 * 1. Computes the checksum of checksums
 *
 * 2. Computes the total data size
 *
 * 3. Computes the total index size
 *
 * Finally, the metadata file is updated with this information.
 *
 * @param directoryName
 * @param outputFs
 * @param checkSumGenerator
 * @param nodePath
 * @param metadata
 * @throws IOException
 */
private void processCheckSumMetadataFile(String directoryName,
                                         FileSystem outputFs,
                                         CheckSum checkSumGenerator,
                                         Path nodePath,
                                         ReadOnlyStorageMetadata metadata) throws IOException {
    long dataSizeInBytes = 0L;
    long indexSizeInBytes = 0L;
    FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

        @Override
        public boolean accept(Path arg0) {
            if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                return true;
            }
            return false;
        }
    });
    if (storeFiles != null && storeFiles.length > 0) {
        Arrays.sort(storeFiles, new IndexFileLastComparator());
        FSDataInputStream input = null;
        CheckSumMetadata checksumMetadata;
        for (FileStatus file : storeFiles) {
            try {
                // HDFS NameNodes can sometimes GC for extended periods
                // of time, hence the exponential back-off strategy below.
                // TODO: Refactor all BnP retry code into a pluggable mechanism
                int totalAttempts = 4;
                int attemptsRemaining = totalAttempts;
                while (attemptsRemaining > 0) {
                    try {
                        attemptsRemaining--;
                        input = outputFs.open(file.getPath());
                    } catch (Exception e) {
                        if (attemptsRemaining < 1) {
                            throw e;
                        }
                        // Exponential back-off sleep times: 5s, 20s, 45s
                        // (Math.pow is used because Java's '^' operator is XOR, not exponentiation).
                        int sleepTime = (int) Math.pow(totalAttempts - attemptsRemaining, 2) * 5;
                        logger.error("Error getting checksum file from HDFS. Retries left: " + attemptsRemaining + ". Back-off until next retry: " + sleepTime + " seconds.", e);
                        Thread.sleep(sleepTime * 1000);
                    }
                }
                checksumMetadata = new CheckSumMetadata(input);
                if (checkSumType != CheckSumType.NONE) {
                    byte[] fileChecksum = checksumMetadata.getCheckSum();
                    logger.debug("Checksum for file " + file.toString() + " - " + new String(Hex.encodeHex(fileChecksum)));
                    checkSumGenerator.update(fileChecksum);
                }
                /*
                 * If this is a 'data checksum' file, add the data file size
                 * to 'dataSizeInBytes'
                 */
                String dataFileSizeInBytes = (String) checksumMetadata.get(CheckSumMetadata.DATA_FILE_SIZE_IN_BYTES);
                if (dataFileSizeInBytes != null) {
                    dataSizeInBytes += Long.parseLong(dataFileSizeInBytes);
                }
                /*
                 * If this is an 'index checksum' file, add the index file
                 * size to 'indexSizeInBytes'
                 */
                String indexFileSizeInBytes = (String) checksumMetadata.get(CheckSumMetadata.INDEX_FILE_SIZE_IN_BYTES);
                if (indexFileSizeInBytes != null) {
                    indexSizeInBytes += Long.parseLong(indexFileSizeInBytes);
                }
            } catch (Exception e) {
                logger.error("Error getting checksum file from HDFS", e);
            } finally {
                if (input != null)
                    input.close();
            }
            outputFs.delete(file.getPath(), false);
        }
        // update metadata
        String checkSum = "NONE";
        if (checkSumType != CheckSumType.NONE) {
            metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
            checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
            metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
        }
        long diskSizeForNodeInBytes = dataSizeInBytes + indexSizeInBytes;
        logger.debug(directoryName + ": Checksum = " + checkSum + ", Size = " + (diskSizeForNodeInBytes / ByteUtils.BYTES_PER_KB) + " KB");
        metadata.add(ReadOnlyStorageMetadata.DISK_SIZE_IN_BYTES, Long.toString(diskSizeForNodeInBytes));
    }
}
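The filter above keeps files whose names end in "checksum" and skips hidden ("." prefixed) entries. The same listing can also be written with FileSystem.globStatus plus a lambda PathFilter; a minimal sketch, using a made-up node directory rather than Voldemort's actual layout:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ChecksumListing {

    public static void main(String[] args) throws Exception {
        try (FileSystem fs = FileSystem.get(new Configuration())) {
            // Glob for "*checksum" names; the filter additionally drops hidden files.
            PathFilter notHidden = p -> !p.getName().startsWith(".");
            FileStatus[] checksumFiles = fs.globStatus(new Path("/tmp/node-0/*checksum"), notHidden);
            if (checksumFiles != null) {
                for (FileStatus f : checksumFiles) {
                    System.out.println(f.getPath());
                }
            }
        }
    }
}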
Use of org.apache.hadoop.fs.PathFilter in project voldemort by voldemort.
Class AbstractHadoopJob, method prepareJobConf().
/**
 * @param conf Base {@link JobConf} to start from.
 * @return The prepared {@link JobConf}.
 * @throws IOException
 * @throws URISyntaxException
 */
public JobConf prepareJobConf(JobConf conf) throws IOException, URISyntaxException {
    conf.setJobName(getId());
    conf.setNumReduceTasks(0);
    String hadoop_ugi = props.getString("hadoop.job.ugi", null);
    if (hadoop_ugi != null) {
        conf.set("hadoop.job.ugi", hadoop_ugi);
    }
    if (props.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");
        info("Running locally, no hadoop jar set.");
    } else {
        // set custom class loader with custom find resource strategy.
        setClassLoaderAndJar(conf, getClass());
        info("Setting hadoop jar file for class:" + getClass() + " to " + conf.getJar());
        info("*************************************************************************");
        info(" Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ") ");
        info("*************************************************************************");
    }
    // set JVM options if present
    if (props.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", props.getString("mapred.child.java.opts"));
        info("mapred.child.java.opts set to " + props.getString("mapred.child.java.opts"));
    }
    // set input and output paths if they are present
    if (props.containsKey("input.paths")) {
        List<String> inputPaths = props.getList("input.paths");
        if (inputPaths.size() == 0)
            throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'");
        for (String path : inputPaths) {
            // Implied stuff, but good implied stuff
            if (path.endsWith(LATEST_SUFFIX)) {
                FileSystem fs = FileSystem.get(conf);
                PathFilter filter = new PathFilter() {

                    @Override
                    public boolean accept(Path arg0) {
                        return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                    }
                };
                String latestPath = path.substring(0, path.length() - LATEST_SUFFIX.length());
                FileStatus[] statuses = fs.listStatus(new Path(latestPath), filter);
                Arrays.sort(statuses);
                path = statuses[statuses.length - 1].getPath().toString();
                System.out.println("Using latest folder: " + path);
            }
            HadoopUtils.addAllSubPaths(conf, new Path(path));
        }
    }
    if (props.containsKey("output.path")) {
        String location = props.get("output.path");
        if (location.endsWith("#CURRENT")) {
            DateTimeFormatter format = DateTimeFormat.forPattern(COMMON_FILE_DATE_PATTERN);
            String destPath = format.print(new DateTime());
            location = location.substring(0, location.length() - "#CURRENT".length()) + destPath;
            System.out.println("Store location set to " + location);
        }
        FileOutputFormat.setOutputPath(conf, new Path(location));
        // For testing purposes only: remove the output path if it already exists
        if (props.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }
    // Adds external jars to the hadoop classpath
    String externalJarList = props.getString("hadoop.external.jarFiles", null);
    if (externalJarList != null) {
        String[] jarFiles = externalJarList.split(",");
        for (String jarFile : jarFiles) {
            info("Adding external jar file: " + jarFile);
            DistributedCache.addFileToClassPath(new Path(jarFile), conf);
        }
    }
    // Adds distributed cache files
    String cacheFileList = props.getString("hadoop.cache.files", null);
    if (cacheFileList != null) {
        String[] cacheFiles = cacheFileList.split(",");
        for (String cacheFile : cacheFiles) {
            info("Adding Distributed Cache File:" + cacheFile);
            DistributedCache.addCacheFile(new URI(cacheFile), conf);
        }
    }
    // Adds distributed cache archives
    String archiveFileList = props.getString("hadoop.cache.archives", null);
    if (archiveFileList != null) {
        String[] archiveFiles = archiveFileList.split(",");
        for (String archiveFile : archiveFiles) {
            info("Adding Distributed Cache Archive File:" + archiveFile);
            DistributedCache.addCacheArchive(new URI(archiveFile), conf);
        }
    }
    // this property can be set by azkaban to manage the voldemort lib path on hdfs
    addToDistributedCache(voldemortLibPath, conf);
    boolean isAddFiles = props.getBoolean("hdfs.default.classpath.dir.enable", false);
    if (isAddFiles) {
        addToDistributedCache(hadoopLibPath, conf);
    }
    // May want to add this to HadoopUtils, but will await refactoring
    for (String key : getProps().keySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, getProps().get(key));
        }
    }
    // Add properties specific to reducer output compression
    conf.setBoolean(VoldemortBuildAndPushJob.REDUCER_OUTPUT_COMPRESS, props.getBoolean(VoldemortBuildAndPushJob.REDUCER_OUTPUT_COMPRESS, false));
    if (props.containsKey(VoldemortBuildAndPushJob.REDUCER_OUTPUT_COMPRESS_CODEC)) {
        conf.set(VoldemortBuildAndPushJob.REDUCER_OUTPUT_COMPRESS_CODEC, props.get(VoldemortBuildAndPushJob.REDUCER_OUTPUT_COMPRESS_CODEC));
    }
    HadoopUtils.setPropsInJob(conf, getProps());
    // Pass along delegation tokens if the environment provides a token file.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        conf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }
    return conf;
}
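The anonymous filter in the *.latest handling above encodes the common Hadoop convention of ignoring names that start with "_" (e.g. _SUCCESS, _logs) or "." (hidden files). If that predicate is needed in several places, it can live in one reusable constant; a small sketch (the class and constant names are hypothetical):

import org.apache.hadoop.fs.PathFilter;

// Hypothetical helper holding the hidden-file convention as a reusable filter.
public final class PathFilters {

    public static final PathFilter VISIBLE = path ->
            !path.getName().startsWith("_") && !path.getName().startsWith(".");

    private PathFilters() {
        // no instances
    }
}

It would then be passed directly to listings, e.g. fs.listStatus(new Path(latestPath), PathFilters.VISIBLE).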
Use of org.apache.hadoop.fs.PathFilter in project voldemort by voldemort.
Class HadoopUtils, method getLatestVersionedPath().
/**
 * Looks for the latest (the alphabetically greatest) path contained in the
 * given directory that passes the specified regex pattern.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @return The latest matching path, or null if the directory has no matching children
 * @throws IOException
 */
public static Path getLatestVersionedPath(FileSystem fs, Path directory, String acceptRegex) throws IOException {
    final String pattern = acceptRegex != null ? acceptRegex : "\\S+";
    PathFilter filter = new PathFilter() {

        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(pattern, arg0.getName());
        }
    };
    FileStatus[] statuses = fs.listStatus(directory, filter);
    if (statuses == null || statuses.length == 0) {
        return null;
    }
    Arrays.sort(statuses);
    return statuses[statuses.length - 1].getPath();
}
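A hypothetical call site, assuming date-stamped version folders under a store directory (the path, regex, and class name below are illustrative, and importing Voldemort's HadoopUtils from its package is assumed):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LatestVersionExample {

    public static void main(String[] args) throws Exception {
        try (FileSystem fs = FileSystem.get(new Configuration())) {
            // Folders are assumed to be named like 2017-01-31; alphabetical order then
            // matches chronological order, so the last entry is the newest version.
            Path latest = HadoopUtils.getLatestVersionedPath(
                    fs, new Path("/data/my-store"), "\\d{4}-\\d{2}-\\d{2}");
            System.out.println(latest != null ? "Latest: " + latest : "No versions found");
        }
    }
}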