Example 1 with FsUrlStreamHandlerFactory

Use of org.apache.hadoop.fs.FsUrlStreamHandlerFactory in project datawave by NationalSecurityAgency.

The class IngestJob, method run:

@Override
public int run(String[] args) throws Exception {
    Logger.getLogger(TypeRegistry.class).setLevel(Level.ALL);
    ca.setThreshold(Level.INFO);
    log.addAppender(ca);
    log.setLevel(Level.INFO);
    // Initialize the markings file helper so we get the right markings file
    MarkingFunctions.Factory.createMarkingFunctions();
    TypeRegistry.reset();
    // Parse the job arguments
    Configuration conf = parseArguments(args, this.getConf());
    if (conf == null) {
        printUsage();
        return -1;
    }
    updateConfWithOverrides(conf);
    jobObservable = new JobObservable(srcHdfs != null ? getFileSystem(conf, srcHdfs) : null);
    for (Observer observer : jobObservers) {
        this.jobObservable.addObserver(observer);
        if (observer instanceof Configurable) {
            log.info("Applying configuration to observer");
            ((Configurable) observer).setConf(conf);
        }
    }
    AccumuloHelper cbHelper = new AccumuloHelper();
    cbHelper.setup(conf);
    TypeRegistry.getInstance(conf);
    log.info(conf.toString());
    log.info(String.format("getStrings('%s') = %s", TypeRegistry.INGEST_DATA_TYPES, conf.get(TypeRegistry.INGEST_DATA_TYPES)));
    log.info(String.format("getStrings('data.name') = %s", conf.get("data.name")));
    int index = 0;
    for (String name : TypeRegistry.getTypeNames()) {
        log.info(String.format("name[%d] = '%s'", index++, name));
    }
    if (TypeRegistry.getTypes().isEmpty()) {
        log.error("No data types were configured");
        return -1;
    }
    TableConfigurationUtil tableConfigUtil = new TableConfigurationUtil(conf);
    tableNames = tableConfigUtil.getTableNames();
    if (createTables) {
        boolean wasConfigureTablesSuccessful = tableConfigUtil.configureTables(conf);
        if (!wasConfigureTablesSuccessful) {
            return -1;
        } else {
            log.info("Created tables: " + Arrays.toString(tableNames) + " successfully!");
        }
    }
    try {
        tableConfigUtil.serializeAggregatorConfiguration(cbHelper, conf, log);
    } catch (TableNotFoundException tnf) {
        log.error("One or more configured DataWave tables are missing in Accumulo. If this is a new system or if new tables have recently been introduced, run a job using the '-createTables' flag before attempting to ingest more data", tnf);
        return -1;
    }
    // get the source and output hadoop file systems
    FileSystem inputFs = getFileSystem(conf, srcHdfs);
    FileSystem outputFs = (writeDirectlyToDest ? getFileSystem(conf, destHdfs) : inputFs);
    conf.set("output.fs.uri", outputFs.getUri().toString());
    // get the qualified work directory path
    Path unqualifiedWorkPath = Path.getPathWithoutSchemeAndAuthority(new Path(workDir));
    conf.set("ingest.work.dir.unqualified", unqualifiedWorkPath.toString());
    Path workDirPath = new Path(new Path(writeDirectlyToDest ? destHdfs : srcHdfs), unqualifiedWorkPath);
    conf.set("ingest.work.dir.qualified", workDirPath.toString());
    // Create the Job
    Job job = Job.getInstance(conf);
    // Job copies the configuration, so any changes made after this point don't get captured in the job.
    // Use the job's configuration from this point.
    conf = job.getConfiguration();
    if (!useMapOnly || !outputMutations) {
        // Configure the bulk partitioner and output formatter; the rest of the bulk output path depends on this.
        try {
            configureBulkPartitionerAndOutputFormatter(job, cbHelper, conf, outputFs);
        } catch (Exception e) {
            log.error(e);
            log.info("Deleting orphaned directory: " + workDirPath);
            try {
                outputFs.delete(workDirPath, true);
            } catch (Exception er) {
                log.error("Unable to remove directory: " + workDirPath, er);
            }
            return -1;
        }
    }
    job.setJarByClass(this.getClass());
    for (Path inputPath : getFilesToProcess(inputFs, inputFileLists, inputFileListMarker, inputPaths)) {
        FileInputFormat.addInputPath(job, inputPath);
    }
    for (Path dependency : jobDependencies) job.addFileToClassPath(dependency);
    configureInputFormat(job, cbHelper, conf);
    configureJob(job, conf, workDirPath, outputFs);
    // Log configuration
    log.info("Types: " + TypeRegistry.getTypeNames());
    log.info("Tables: " + Arrays.toString(tableNames));
    log.info("InputFormat: " + job.getInputFormatClass().getName());
    log.info("Mapper: " + job.getMapperClass().getName());
    log.info("Reduce tasks: " + (useMapOnly ? 0 : reduceTasks));
    log.info("Split File: " + workDirPath + "/splits.txt");
    // Note that if we run any other jobs in the same vm (such as a sampler), then we may
    // need to catch and throw away an exception here
    URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory(conf));
    startDaemonProcesses(conf);
    long start = System.currentTimeMillis();
    job.submit();
    JobID jobID = job.getJobID();
    log.info("JOB ID: " + jobID);
    createFileWithRetries(outputFs, new Path(workDirPath, jobID.toString()));
    // Wait for reduce progress to pass the configured markerFileReducePercentage threshold and then
    // kick off the next job of this type.
    boolean done = false;
    while (generateMarkerFile && !done && !job.isComplete()) {
        if (job.reduceProgress() > markerFileReducePercentage) {
            File flagDir = new File(flagFileDir);
            if (flagDir.isDirectory()) {
                // Find flag files that start with this datatype
                RegexFileFilter filter;
                if (flagFilePattern != null) {
                    filter = new RegexFileFilter(flagFilePattern);
                } else {
                    filter = new RegexFileFilter(outputMutations ? ".*_(live|fivemin)_.*\\.flag" : ".*_(bulk|onehr)_.*\\.flag");
                }
                File[] flagFiles = flagDir.listFiles((FilenameFilter) filter);
                if (flagFiles.length > 0) {
                    // Sort by last-modified time: oldest flag file first for FIFO processing, newest first otherwise
                    Comparator<File> comparator = LastModifiedFileComparator.LASTMODIFIED_COMPARATOR;
                    if (!markerFileFIFO) {
                        comparator = LastModifiedFileComparator.LASTMODIFIED_REVERSE;
                    }
                    Arrays.sort(flagFiles, comparator);
                    // Just grab the first one and rename it to .marker
                    File flag = flagFiles[0];
                    File targetFile = new File(flag.getAbsolutePath() + (pipelineId == null ? "" : '.' + pipelineId) + ".marker");
                    if (!flag.renameTo(targetFile)) {
                        log.error("Unable to rename flag file: " + flag.getAbsolutePath());
                        continue;
                    }
                    log.info("Renamed flag file " + flag + " to " + targetFile);
                } else {
                    log.info("No more flag files to process");
                }
            } else {
                log.error("Flag file directory does not exist: " + flagFileDir);
            }
            done = true;
        } else {
            try {
                Thread.sleep(3000);
            } catch (InterruptedException ie) {
                // do nothing
            }
        }
    }
    job.waitForCompletion(true);
    long stop = System.currentTimeMillis();
    // output the counters to the log
    Counters counters = job.getCounters();
    log.info(counters);
    try (JobClient jobClient = new JobClient((org.apache.hadoop.mapred.JobConf) job.getConfiguration())) {
        RunningJob runningJob = jobClient.getJob(new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(), jobID.getId()));
        // If the job failed, then don't bring the map files online.
        if (!job.isSuccessful()) {
            return jobFailed(job, runningJob, outputFs, workDirPath);
        }
        // determine if we had processing errors
        if (counters.findCounter(IngestProcess.RUNTIME_EXCEPTION).getValue() > 0) {
            eventProcessingError = true;
            log.error("Found Runtime Exceptions in the counters");
            long numExceptions = 0;
            long numRecords = 0;
            CounterGroup exceptionCounterGroup = counters.getGroup(IngestProcess.RUNTIME_EXCEPTION.name());
            for (Counter exceptionC : exceptionCounterGroup) {
                numExceptions += exceptionC.getValue();
            }
            CounterGroup recordCounterGroup = counters.getGroup(IngestOutput.EVENTS_PROCESSED.name());
            for (Counter recordC : recordCounterGroup) {
                numRecords += recordC.getValue();
            }
            // records that throw runtime exceptions are still counted as processed
            float percentError = 100 * ((float) numExceptions / numRecords);
            log.info(String.format("Percent Error: %.2f", percentError));
            if (conf.getInt("job.percent.error.threshold", 101) <= percentError) {
                return jobFailed(job, runningJob, outputFs, workDirPath);
            }
        }
    }
    if (counters.findCounter(IngestInput.EVENT_FATAL_ERROR).getValue() > 0) {
        eventProcessingError = true;
        log.error("Found Fatal Errors in the counters");
    }
    // In live (mutation output) mode, mark the inputs as loaded and clean up the work directory;
    // otherwise a separate process will bulk import the map files.
    if (outputMutations) {
        markFilesLoaded(inputFs, FileInputFormat.getInputPaths(job), job.getJobID());
        boolean deleted = outputFs.delete(workDirPath, true);
        if (!deleted) {
            log.error("Unable to remove job working directory: " + workDirPath);
        }
    } else {
        // now move the job directory over to the warehouse if needed
        FileSystem destFs = getFileSystem(conf, destHdfs);
        if (!inputFs.equals(destFs) && !writeDirectlyToDest) {
            Configuration distCpConf = conf;
            // If a separate distcp configuration directory was given, build the config from its *-site.xml
            // (and matching *-default.xml) resources; otherwise reuse the current config.
            if (distCpConfDir != null) {
                distCpConf = new Configuration(false);
                FilenameFilter ff = (dir, name) -> name.toLowerCase().endsWith("-site.xml");
                for (String file : new File(distCpConfDir).list(ff)) {
                    Path path = new Path(distCpConfDir, file);
                    distCpConf.addResource(file.replace("-site", "-default"));
                    distCpConf.addResource(path);
                }
            }
            log.info("Moving (using distcp) " + unqualifiedWorkPath + " from " + inputFs.getUri() + " to " + destFs.getUri());
            try {
                distCpDirectory(unqualifiedWorkPath, inputFs, destFs, distCpConf, deleteAfterDistCp);
            } catch (Exception e) {
                log.error("Failed to move job directory over to the warehouse.", e);
                return -3;
            }
        }
        Path destWorkDirPath = FileSystem.get(destHdfs, conf).makeQualified(unqualifiedWorkPath);
        boolean marked = markJobComplete(destFs, destWorkDirPath);
        if (!marked) {
            log.error("Failed to create marker file indicating job completion.");
            return -3;
        }
    }
    if (metricsOutputEnabled) {
        log.info("Writing Stats");
        Path statsDir = new Path(unqualifiedWorkPath.getParent(), "IngestMetrics");
        if (!writeStats(log, job, jobID, counters, start, stop, outputMutations, inputFs, statsDir, this.metricsLabelOverride)) {
            log.warn("Failed to output statistics for the job");
            return -5;
        }
    } else {
        log.info("Ingest stats output disabled via 'ingestMetricsDisabled' flag");
    }
    if (eventProcessingError) {
        log.warn("Job had processing errors.  See counters for more information");
        return -5;
    }
    return 0;
}
Also used : FsUrlStreamHandlerFactory(org.apache.hadoop.fs.FsUrlStreamHandlerFactory) Arrays(java.util.Arrays) BufferedInputStream(java.io.BufferedInputStream) DedupeContextWriter(datawave.ingest.mapreduce.job.writer.DedupeContextWriter) FileSystem(org.apache.hadoop.fs.FileSystem) Text(org.apache.hadoop.io.Text) FileStatus(org.apache.hadoop.fs.FileStatus) SequenceFile(org.apache.hadoop.io.SequenceFile) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) CounterGroup(org.apache.hadoop.mapreduce.CounterGroup) Configuration(org.apache.hadoop.conf.Configuration) Level(org.apache.log4j.Level) Map(java.util.Map) Counters(org.apache.hadoop.mapreduce.Counters) Set(java.util.Set) CounterToStatsDConfiguration(datawave.ingest.mapreduce.job.statsd.CounterToStatsDConfiguration) LastModifiedFileComparator(org.apache.commons.io.comparator.LastModifiedFileComparator) PasswordConverter(datawave.util.cli.PasswordConverter) BulkContextWriter(datawave.ingest.mapreduce.job.writer.BulkContextWriter) FilenameFilter(java.io.FilenameFilter) IngestOutput(datawave.ingest.metric.IngestOutput) StringUtils(datawave.util.StringUtils) TableExistsException(org.apache.accumulo.core.client.TableExistsException) SimpleDateFormat(java.text.SimpleDateFormat) BufferedOutputStream(java.io.BufferedOutputStream) ArrayList(java.util.ArrayList) ColumnUpdate(org.apache.accumulo.core.data.ColumnUpdate) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Key(org.apache.accumulo.core.data.Key) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) ConfigurationHelper(datawave.ingest.data.config.ConfigurationHelper) Counter(org.apache.hadoop.mapreduce.Counter) InputFormat(org.apache.hadoop.mapreduce.InputFormat) IOException(java.io.IOException) InputStreamReader(java.io.InputStreamReader) File(java.io.File) AggregatingContextWriter(datawave.ingest.mapreduce.job.writer.AggregatingContextWriter) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) TableConfigCache(datawave.ingest.config.TableConfigCache) ColumnVisibility(org.apache.accumulo.core.security.ColumnVisibility) JobContext(org.apache.hadoop.mapreduce.JobContext) Writer(org.apache.hadoop.io.SequenceFile.Writer) BufferedReader(java.io.BufferedReader) ClientConfiguration(org.apache.accumulo.core.client.ClientConfiguration) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType) Observer(java.util.Observer) AbstractContextWriter(datawave.ingest.mapreduce.job.writer.AbstractContextWriter) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) URL(java.net.URL) Date(java.util.Date) URISyntaxException(java.net.URISyntaxException) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Mutation(org.apache.accumulo.core.data.Mutation) ContextWriter(datawave.ingest.mapreduce.job.writer.ContextWriter) Logger(org.apache.log4j.Logger) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) MarkingFunctions(datawave.marking.MarkingFunctions) Path(org.apache.hadoop.fs.Path) KeyValue(org.apache.accumulo.core.data.KeyValue) Value(org.apache.accumulo.core.data.Value) Configurable(org.apache.hadoop.conf.Configurable) URI(java.net.URI) Method(java.lang.reflect.Method) IngestInput(datawave.ingest.metric.IngestInput) TaskInputOutputContext(org.apache.hadoop.mapreduce.TaskInputOutputContext) RunningJob(org.apache.hadoop.mapred.RunningJob) ChainedContextWriter(datawave.ingest.mapreduce.job.writer.ChainedContextWriter) LiveContextWriter(datawave.ingest.mapreduce.job.writer.LiveContextWriter) Tool(org.apache.hadoop.util.Tool) 
FileNotFoundException(java.io.FileNotFoundException) BulkIngestKeyDedupeCombiner(datawave.ingest.mapreduce.job.reduce.BulkIngestKeyDedupeCombiner) List(java.util.List) Mapper(org.apache.hadoop.mapreduce.Mapper) Job(org.apache.hadoop.mapreduce.Job) ConsoleAppender(org.apache.log4j.ConsoleAppender) EventSequenceFileInputFormat(datawave.ingest.input.reader.event.EventSequenceFileInputFormat) Entry(java.util.Map.Entry) DistCp(org.apache.hadoop.tools.DistCp) HashMap(java.util.HashMap) BulkIngestKeyAggregatingReducer(datawave.ingest.mapreduce.job.reduce.BulkIngestKeyAggregatingReducer) MultiTableRangePartitioner(datawave.ingest.mapreduce.partition.MultiTableRangePartitioner) HashSet(java.util.HashSet) JobStatus(org.apache.hadoop.mapred.JobStatus) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) TableCachingContextWriter(datawave.ingest.mapreduce.job.writer.TableCachingContextWriter) JobID(org.apache.hadoop.mapreduce.JobID) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) CounterStatsDClient(datawave.ingest.mapreduce.job.statsd.CounterStatsDClient) RegexFileFilter(org.apache.commons.io.filefilter.RegexFileFilter) OutputStream(java.io.OutputStream) PrintStream(java.io.PrintStream) ToolRunner(org.apache.hadoop.util.ToolRunner) FileInputStream(java.io.FileInputStream) Constants(org.apache.accumulo.core.Constants) AccumuloException(org.apache.accumulo.core.client.AccumuloException) EventMapper(datawave.ingest.mapreduce.EventMapper) TypeRegistry(datawave.ingest.data.TypeRegistry) AccumuloHelper(datawave.ingest.data.config.ingest.AccumuloHelper) PatternLayout(org.apache.log4j.PatternLayout) Comparator(java.util.Comparator) NumShards(datawave.ingest.mapreduce.handler.shard.NumShards) IngestProcess(datawave.ingest.metric.IngestProcess) Collections(java.util.Collections) JobClient(org.apache.hadoop.mapred.JobClient) InputStream(java.io.InputStream)
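
Note that java.net.URL allows a stream handler factory to be installed only once per JVM; a second call to URL.setURLStreamHandlerFactory throws an Error, which is what the comment above that call in run alludes to. The sketch below is a minimal guard under that assumption; the helper class HdfsUrlHandlerRegistrar is hypothetical and not part of DataWave.

import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;

// Hypothetical helper (not part of the DataWave code above): registers the Hadoop
// URL stream handler factory at most once per JVM and tolerates the Error that
// java.net.URL throws if a factory has already been installed elsewhere.
public final class HdfsUrlHandlerRegistrar {

    private static volatile boolean registered = false;

    private HdfsUrlHandlerRegistrar() {}

    public static synchronized void registerOnce(Configuration conf) {
        if (registered) {
            return;
        }
        try {
            // Makes hdfs:// (and other Hadoop filesystem) URLs resolvable via java.net.URL
            URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory(conf));
        } catch (Error e) {
            // Another component in this JVM already set a factory; nothing more to do here.
        }
        registered = true;
    }
}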

Example 2 with FsUrlStreamHandlerFactory

Use of org.apache.hadoop.fs.FsUrlStreamHandlerFactory in project geowave by locationtech.

The class SparkIngestDriver, method setHdfsURLStreamHandlerFactory:

public static void setHdfsURLStreamHandlerFactory() throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException {
    final Field factoryField = URL.class.getDeclaredField("factory");
    factoryField.setAccessible(true);
    // HP Fortify "Access Control" false positive.
    // Changing the accessibility here is necessary; it has been
    // reviewed and judged to be safe.
    final URLStreamHandlerFactory urlStreamHandlerFactory = (URLStreamHandlerFactory) factoryField.get(null);
    if (urlStreamHandlerFactory == null) {
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
    } else {
        try {
            factoryField.setAccessible(true);
            // HP Fortify "Access Control" false positive.
            // Changing the accessibility here is necessary; it has been
            // reviewed and judged to be safe.
            factoryField.set(null, new FsUrlStreamHandlerFactory());
        } catch (final IllegalAccessException e1) {
            LOGGER.error("Could not access URLStreamHandler factory field on URL class: {}", e1);
            throw new RuntimeException("Could not access URLStreamHandler factory field on URL class", e1);
        }
    }
}
Also used : Field(java.lang.reflect.Field) FsUrlStreamHandlerFactory(org.apache.hadoop.fs.FsUrlStreamHandlerFactory) URLStreamHandlerFactory(java.net.URLStreamHandlerFactory)
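
The reflection-based workaround above exists because java.net.URL only permits one factory per JVM; overwriting the private factory field sidesteps that restriction (on JDK 9 and later, such reflection may additionally require an --add-opens java.base/java.net=ALL-UNNAMED flag). Once a Hadoop factory is in place, whether via the public setter or the reflective override, hdfs:// URLs can be read through plain java.net.URL. The sketch below is illustrative only: the namenode host, port, and path are placeholders, and it assumes the Hadoop client jars and cluster configuration are available on the classpath.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;

public class HdfsUrlReadExample {
    public static void main(String[] args) throws Exception {
        // Register the Hadoop factory; legal only once per JVM (see the notes above).
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());

        // Placeholder location: adjust the namenode host/port and file path for a real cluster.
        URL file = new URL("hdfs://namenode:8020/tmp/example.txt");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(file.openStream(), StandardCharsets.UTF_8))) {
            reader.lines().forEach(System.out::println);
        }
    }
}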

Aggregations

FsUrlStreamHandlerFactory (org.apache.hadoop.fs.FsUrlStreamHandlerFactory): 2
TableConfigCache (datawave.ingest.config.TableConfigCache): 1
TypeRegistry (datawave.ingest.data.TypeRegistry): 1
ConfigurationHelper (datawave.ingest.data.config.ConfigurationHelper): 1
AccumuloHelper (datawave.ingest.data.config.ingest.AccumuloHelper): 1
EventSequenceFileInputFormat (datawave.ingest.input.reader.event.EventSequenceFileInputFormat): 1
EventMapper (datawave.ingest.mapreduce.EventMapper): 1
NumShards (datawave.ingest.mapreduce.handler.shard.NumShards): 1
BulkIngestKeyAggregatingReducer (datawave.ingest.mapreduce.job.reduce.BulkIngestKeyAggregatingReducer): 1
BulkIngestKeyDedupeCombiner (datawave.ingest.mapreduce.job.reduce.BulkIngestKeyDedupeCombiner): 1
CounterStatsDClient (datawave.ingest.mapreduce.job.statsd.CounterStatsDClient): 1
CounterToStatsDConfiguration (datawave.ingest.mapreduce.job.statsd.CounterToStatsDConfiguration): 1
AbstractContextWriter (datawave.ingest.mapreduce.job.writer.AbstractContextWriter): 1
AggregatingContextWriter (datawave.ingest.mapreduce.job.writer.AggregatingContextWriter): 1
BulkContextWriter (datawave.ingest.mapreduce.job.writer.BulkContextWriter): 1
ChainedContextWriter (datawave.ingest.mapreduce.job.writer.ChainedContextWriter): 1
ContextWriter (datawave.ingest.mapreduce.job.writer.ContextWriter): 1
DedupeContextWriter (datawave.ingest.mapreduce.job.writer.DedupeContextWriter): 1
LiveContextWriter (datawave.ingest.mapreduce.job.writer.LiveContextWriter): 1
TableCachingContextWriter (datawave.ingest.mapreduce.job.writer.TableCachingContextWriter): 1