Use of org.apache.hadoop.fs.FsUrlStreamHandlerFactory in project datawave by NationalSecurityAgency: the class IngestJob, method run.
@Override
public int run(String[] args) throws Exception {
Logger.getLogger(TypeRegistry.class).setLevel(Level.ALL);
ca.setThreshold(Level.INFO);
log.addAppender(ca);
log.setLevel(Level.INFO);
// Initialize the markings file helper so we get the right markings file
MarkingFunctions.Factory.createMarkingFunctions();
TypeRegistry.reset();
// Parse the job arguments
Configuration conf = parseArguments(args, this.getConf());
if (conf == null) {
printUsage();
return -1;
}
updateConfWithOverrides(conf);
jobObservable = new JobObservable(srcHdfs != null ? getFileSystem(conf, srcHdfs) : null);
for (Observer observer : jobObservers) {
this.jobObservable.addObserver(observer);
if (observer instanceof Configurable) {
log.info("Applying configuration to observer");
((Configurable) observer).setConf(conf);
}
}
AccumuloHelper cbHelper = new AccumuloHelper();
cbHelper.setup(conf);
TypeRegistry.getInstance(conf);
log.info(conf.toString());
log.info(String.format("getStrings('%s') = %s", TypeRegistry.INGEST_DATA_TYPES, conf.get(TypeRegistry.INGEST_DATA_TYPES)));
log.info(String.format("getStrings('data.name') = %s", conf.get("data.name")));
int index = 0;
for (String name : TypeRegistry.getTypeNames()) {
log.info(String.format("name[%d] = '%s'", index++, name));
}
if (TypeRegistry.getTypes().isEmpty()) {
log.error("No data types were configured");
return -1;
}
TableConfigurationUtil tableConfigUtil = new TableConfigurationUtil(conf);
tableNames = tableConfigUtil.getTableNames();
if (createTables) {
boolean wasConfigureTablesSuccessful = tableConfigUtil.configureTables(conf);
if (!wasConfigureTablesSuccessful) {
return -1;
} else
log.info("Created tables: " + tableNames + " successfully!");
}
try {
tableConfigUtil.serializeAggregatorConfiguration(cbHelper, conf, log);
} catch (TableNotFoundException tnf) {
log.error("One or more configured DataWave tables are missing in Accumulo. If this is a new system or if new tables have recently been introduced, run a job using the '-createTables' flag before attempting to ingest more data", tnf);
return -1;
}
// get the source and output hadoop file systems
FileSystem inputFs = getFileSystem(conf, srcHdfs);
FileSystem outputFs = (writeDirectlyToDest ? getFileSystem(conf, destHdfs) : inputFs);
conf.set("output.fs.uri", outputFs.getUri().toString());
// get the qualified work directory path
Path unqualifiedWorkPath = Path.getPathWithoutSchemeAndAuthority(new Path(workDir));
conf.set("ingest.work.dir.unqualified", unqualifiedWorkPath.toString());
Path workDirPath = new Path(new Path(writeDirectlyToDest ? destHdfs : srcHdfs), unqualifiedWorkPath);
conf.set("ingest.work.dir.qualified", workDirPath.toString());
// Create the Job
Job job = Job.getInstance(conf);
// Job copies the configuration, so any changes made after this point don't get captured in the job.
// Use the job's configuration from this point.
conf = job.getConfiguration();
if (!useMapOnly || !outputMutations) {
// Unless this is a map-only job writing live mutations, set up the bulk partitioner
// and output formatter; the bulk output path depends on this.
try {
configureBulkPartitionerAndOutputFormatter(job, cbHelper, conf, outputFs);
} catch (Exception e) {
log.error(e);
log.info("Deleting orphaned directory: " + workDirPath);
try {
outputFs.delete(workDirPath, true);
} catch (Exception er) {
log.error("Unable to remove directory: " + workDirPath, er);
}
return -1;
}
}
job.setJarByClass(this.getClass());
for (Path inputPath : getFilesToProcess(inputFs, inputFileLists, inputFileListMarker, inputPaths)) {
FileInputFormat.addInputPath(job, inputPath);
}
for (Path dependency : jobDependencies) job.addFileToClassPath(dependency);
configureInputFormat(job, cbHelper, conf);
configureJob(job, conf, workDirPath, outputFs);
// Log configuration
log.info("Types: " + TypeRegistry.getTypeNames());
log.info("Tables: " + Arrays.toString(tableNames));
log.info("InputFormat: " + job.getInputFormatClass().getName());
log.info("Mapper: " + job.getMapperClass().getName());
log.info("Reduce tasks: " + (useMapOnly ? 0 : reduceTasks));
log.info("Split File: " + workDirPath + "/splits.txt");
// Note that if we run any other jobs in the same vm (such as a sampler), then we may
// need to catch and throw away an exception here
URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory(conf));
startDaemonProcesses(conf);
long start = System.currentTimeMillis();
job.submit();
JobID jobID = job.getJobID();
log.info("JOB ID: " + jobID);
createFileWithRetries(outputFs, new Path(workDirPath, jobID.toString()));
// Wait for reduce progress to pass the configured marker-file threshold and then
// kick off the next job of this type.
boolean done = false;
while (generateMarkerFile && !done && !job.isComplete()) {
if (job.reduceProgress() > markerFileReducePercentage) {
File flagDir = new File(flagFileDir);
if (flagDir.isDirectory()) {
// Find flag files that start with this datatype
RegexFileFilter filter;
if (flagFilePattern != null) {
filter = new RegexFileFilter(flagFilePattern);
} else {
filter = new RegexFileFilter(outputMutations ? ".*_(live|fivemin)_.*\\.flag" : ".*_(bulk|onehr)_.*\\.flag");
}
File[] flagFiles = flagDir.listFiles((FilenameFilter) filter);
if (flagFiles.length > 0) {
// Sort by last-modified time: oldest first when processing FIFO, newest first otherwise
Comparator<File> comparator = LastModifiedFileComparator.LASTMODIFIED_COMPARATOR;
if (!markerFileFIFO) {
comparator = LastModifiedFileComparator.LASTMODIFIED_REVERSE;
}
Arrays.sort(flagFiles, comparator);
// Just grab the first one and rename it to .marker
File flag = flagFiles[0];
File targetFile = new File(flag.getAbsolutePath() + (pipelineId == null ? "" : '.' + pipelineId) + ".marker");
if (!flag.renameTo(targetFile)) {
log.error("Unable to rename flag file: " + flag.getAbsolutePath());
continue;
}
log.info("Renamed flag file " + flag + " to " + targetFile);
} else {
log.info("No more flag files to process");
// + datatype);
}
} else {
log.error("Flag file directory does not exist: " + flagFileDir);
}
done = true;
} else {
try {
Thread.sleep(3000);
} catch (InterruptedException ie) {
// do nothing
}
}
}
job.waitForCompletion(true);
long stop = System.currentTimeMillis();
// output the counters to the log
Counters counters = job.getCounters();
log.info(counters);
try (JobClient jobClient = new JobClient((org.apache.hadoop.mapred.JobConf) job.getConfiguration())) {
RunningJob runningJob = jobClient.getJob(new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(), jobID.getId()));
// If the job failed, then don't bring the map files online.
if (!job.isSuccessful()) {
return jobFailed(job, runningJob, outputFs, workDirPath);
}
// determine if we had processing errors
if (counters.findCounter(IngestProcess.RUNTIME_EXCEPTION).getValue() > 0) {
eventProcessingError = true;
log.error("Found Runtime Exceptions in the counters");
long numExceptions = 0;
long numRecords = 0;
CounterGroup exceptionCounterGroup = counters.getGroup(IngestProcess.RUNTIME_EXCEPTION.name());
for (Counter exceptionC : exceptionCounterGroup) {
numExceptions += exceptionC.getValue();
}
CounterGroup recordCounterGroup = counters.getGroup(IngestOutput.EVENTS_PROCESSED.name());
for (Counter recordC : recordCounterGroup) {
numRecords += recordC.getValue();
}
// records that throw runtime exceptions are still counted as processed
float percentError = 100 * ((float) numExceptions / numRecords);
log.info(String.format("Percent Error: %.2f", percentError));
if (conf.getInt("job.percent.error.threshold", 101) <= percentError) {
return jobFailed(job, runningJob, outputFs, workDirPath);
}
}
}
if (counters.findCounter(IngestInput.EVENT_FATAL_ERROR).getValue() > 0) {
eventProcessingError = true;
log.error("Found Fatal Errors in the counters");
}
// separate process will bulk import the map files.
if (outputMutations) {
markFilesLoaded(inputFs, FileInputFormat.getInputPaths(job), job.getJobID());
boolean deleted = outputFs.delete(workDirPath, true);
if (!deleted) {
log.error("Unable to remove job working directory: " + workDirPath);
}
} else {
// now move the job directory over to the warehouse if needed
FileSystem destFs = getFileSystem(conf, destHdfs);
if (!inputFs.equals(destFs) && !writeDirectlyToDest) {
Configuration distCpConf = conf;
// use the distcp conf dir if one was specified, otherwise fall back to the current config.
if (distCpConfDir != null) {
distCpConf = new Configuration(false);
FilenameFilter ff = (dir, name) -> name.toLowerCase().endsWith("-site.xml");
for (String file : new File(distCpConfDir).list(ff)) {
Path path = new Path(distCpConfDir, file);
distCpConf.addResource(file.replace("-site", "-default"));
distCpConf.addResource(path);
}
}
log.info("Moving (using distcp) " + unqualifiedWorkPath + " from " + inputFs.getUri() + " to " + destFs.getUri());
try {
distCpDirectory(unqualifiedWorkPath, inputFs, destFs, distCpConf, deleteAfterDistCp);
} catch (Exception e) {
log.error("Failed to move job directory over to the warehouse.", e);
return -3;
}
}
Path destWorkDirPath = FileSystem.get(destHdfs, conf).makeQualified(unqualifiedWorkPath);
boolean marked = markJobComplete(destFs, destWorkDirPath);
if (!marked) {
log.error("Failed to create marker file indicating job completion.");
return -3;
}
}
if (metricsOutputEnabled) {
log.info("Writing Stats");
Path statsDir = new Path(unqualifiedWorkPath.getParent(), "IngestMetrics");
if (!writeStats(log, job, jobID, counters, start, stop, outputMutations, inputFs, statsDir, this.metricsLabelOverride)) {
log.warn("Failed to output statistics for the job");
return -5;
}
} else {
log.info("Ingest stats output disabled via 'ingestMetricsDisabled' flag");
}
if (eventProcessingError) {
log.warn("Job had processing errors. See counters for more information");
return -5;
}
return 0;
}
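The comment above the URL.setURLStreamHandlerFactory call is worth dwelling on: the JVM allows the stream handler factory to be set only once, and a second call throws an Error. A minimal sketch of the guard that comment alludes to, assuming a hypothetical helper class UrlHandlerInstaller that is not part of datawave, might look like this:

import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;

public final class UrlHandlerInstaller {

    private UrlHandlerInstaller() {}

    // Installs the HDFS-aware URL stream handler factory, swallowing the Error the
    // JVM throws if a factory was already registered (for example by another job,
    // such as a sampler, running in the same VM).
    public static void installHdfsUrlHandler(final Configuration conf) {
        try {
            URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory(conf));
        } catch (final Error alreadySet) {
            // A factory is already in place for this JVM; keep it and move on.
        }
    }
}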
Use of org.apache.hadoop.fs.FsUrlStreamHandlerFactory in project geowave by locationtech: the class SparkIngestDriver, method setHdfsURLStreamHandlerFactory.
public static void setHdfsURLStreamHandlerFactory() throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
final Field factoryField = URL.class.getDeclaredField("factory");
factoryField.setAccessible(true);
// HP Fortify "Access Control" false positive
// Changing the accessibility here is necessary and
// has been reviewed and judged to be safe
final URLStreamHandlerFactory urlStreamHandlerFactory = (URLStreamHandlerFactory) factoryField.get(null);
if (urlStreamHandlerFactory == null) {
URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
} else {
try {
factoryField.setAccessible(true);
// HP Fortify "Access Control" false positive
// Changing the accessibility here is necessary and
// has been reviewed and judged to be safe
factoryField.set(null, new FsUrlStreamHandlerFactory());
} catch (final IllegalAccessException e1) {
LOGGER.error("Could not access URLStreamHandler factory field on URL class: {}", e1);
throw new RuntimeException("Could not access URLStreamHandler factory field on URL class", e1);
}
}
}
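A brief usage sketch, not taken from geowave: once setHdfsURLStreamHandlerFactory() has run, plain java.net.URL calls can resolve hdfs:// locations. The namenode address and file path below are placeholders, and the SparkIngestDriver import is omitted since its package varies across GeoWave versions.

import java.io.InputStream;
import java.net.URL;

public class HdfsUrlExample {

    public static void main(final String[] args) throws Exception {
        // Register the HDFS-aware handler once for this JVM; the reflection-based
        // helper above also covers the case where another factory is already set.
        SparkIngestDriver.setHdfsURLStreamHandlerFactory();

        // Placeholder URI; point this at a real file in your cluster.
        final URL url = new URL("hdfs://namenode:8020/tmp/example.txt");
        try (InputStream in = url.openStream()) {
            final byte[] buffer = new byte[4096];
            long total = 0;
            for (int n; (n = in.read(buffer)) != -1;) {
                total += n;
            }
            System.out.println("Read " + total + " bytes from " + url);
        }
    }
}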