Example 91 with Closer

Use of com.google.common.io.Closer in project incubator-gobblin by apache.

The class ValidationJob, method getValidationOutputFromHive.

/**
 * Execute Hive queries using {@link HiveJdbcConnector} and validate results.
 * @param queries Queries to execute.
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SQL_NONCONSTANT_STRING_PASSED_TO_EXECUTE", justification = "Temporary fix")
private List<Long> getValidationOutputFromHive(List<String> queries) throws IOException {
    if (null == queries || queries.isEmpty()) {
        log.warn("No queries specified to be executed");
        return Collections.emptyList();
    }
    List<Long> rowCounts = Lists.newArrayList();
    Closer closer = Closer.create();
    try {
        HiveJdbcConnector hiveJdbcConnector = closer.register(HiveJdbcConnector.newConnectorWithProps(props));
        for (String query : queries) {
            String hiveOutput = "hiveConversionValidationOutput_" + UUID.randomUUID().toString();
            Path hiveTempDir = new Path("/tmp" + Path.SEPARATOR + hiveOutput);
            query = "INSERT OVERWRITE DIRECTORY '" + hiveTempDir + "' " + query;
            log.info("Executing query: " + query);
            try {
                if (this.hiveSettings.size() > 0) {
                    hiveJdbcConnector.executeStatements(this.hiveSettings.toArray(new String[this.hiveSettings.size()]));
                }
                hiveJdbcConnector.executeStatements("SET hive.exec.compress.output=false", "SET hive.auto.convert.join=false", query);
                FileStatus[] fileStatusList = this.fs.listStatus(hiveTempDir);
                List<FileStatus> files = new ArrayList<>();
                for (FileStatus fileStatus : fileStatusList) {
                    if (fileStatus.isFile()) {
                        files.add(fileStatus);
                    }
                }
                if (files.size() > 1) {
                    log.warn("Found more than one output file. Should have been one.");
                } else if (files.size() == 0) {
                    log.warn("Found no output file. Should have been one.");
                } else {
                    String theString = IOUtils.toString(new InputStreamReader(this.fs.open(files.get(0).getPath()), Charsets.UTF_8));
                    log.info("Found row count: " + theString.trim());
                    if (StringUtils.isBlank(theString.trim())) {
                        rowCounts.add(0L);
                    } else {
                        try {
                            rowCounts.add(Long.parseLong(theString.trim()));
                        } catch (NumberFormatException e) {
                            throw new RuntimeException("Could not parse Hive output: " + theString.trim(), e);
                        }
                    }
                }
            } finally {
                if (this.fs.exists(hiveTempDir)) {
                    log.debug("Deleting temp dir: " + hiveTempDir);
                    this.fs.delete(hiveTempDir, true);
                }
            }
        }
    } catch (SQLException e) {
        log.warn("Execution failed for query set " + queries.toString(), e);
    } finally {
        try {
            closer.close();
        } catch (Exception e) {
            log.warn("Could not close HiveJdbcConnector", e);
        }
    }
    return rowCounts;
}
Also used: Closer (com.google.common.io.Closer), Path (org.apache.hadoop.fs.Path), HiveJdbcConnector (org.apache.gobblin.util.HiveJdbcConnector), FileStatus (org.apache.hadoop.fs.FileStatus), InputStreamReader (java.io.InputStreamReader), SQLException (java.sql.SQLException), ArrayList (java.util.ArrayList), ParseException (java.text.ParseException), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException), IOException (java.io.IOException)
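
Every example on this page follows the same shape. For reference, here is a minimal, self-contained sketch of the canonical Closer idiom as documented by Guava; the file-reading task is hypothetical and only illustrates the pattern:

import com.google.common.io.Closer;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class CloserIdiom {

    // Register each resource as it is opened; rethrow any failure through the
    // closer so that an exception thrown while closing is recorded as suppressed
    // instead of masking the original one.
    public static String readFirstLine(String path) throws IOException {
        Closer closer = Closer.create();
        try {
            BufferedReader reader = closer.register(new BufferedReader(new FileReader(path)));
            return reader.readLine();
        } catch (Throwable t) {
            throw closer.rethrow(t);
        } finally {
            closer.close();
        }
    }
}

ValidationJob above skips the catch/rethrow step and instead logs the SQLException itself, trading strict exception propagation for a best-effort validation run.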

Example 92 with Closer

Use of com.google.common.io.Closer in project incubator-gobblin by apache.

The class Kafka08DataWriterIntegrationTest, method testErrors.

@Test
public void testErrors() throws Exception {
    log.warn("Process id = " + ManagementFactory.getRuntimeMXBean().getName());
    int numRecordsPerExtract = 5;
    int numParallel = 2;
    int errorEvery = 2000;
    int totalRecords = numRecordsPerExtract * numParallel;
    int totalSuccessful = totalRecords / errorEvery + totalRecords % errorEvery;
    {
        Closer closer = Closer.create();
        try {
            kafkaTestHelper.provisionTopic(TOPIC);
            jobProps.setProperty("source.numRecordsPerExtract", "" + numRecordsPerExtract);
            jobProps.setProperty("source.numParallelism", "" + numParallel);
            jobProps.setProperty("writer.kafka.producerConfig.flaky.errorType", "regex");
            // all records from partition 0 will be dropped.
            jobProps.setProperty("writer.kafka.producerConfig.flaky.regexPattern", ":index:0.*");
            jobProps.setProperty("job.commit.policy", "partial");
            jobProps.setProperty("publish.at.job.level", "false");
            // number of records in partition 1
            totalSuccessful = 5;
            JobLauncher jobLauncher = closer.register(JobLauncherFactory.newJobLauncher(gobblinProps, jobProps));
            jobLauncher.launchJob(null);
        } catch (Exception e) {
            log.error("Failed to run job with exception ", e);
            Assert.fail("Should not throw exception on running the job");
        } finally {
            closer.close();
        }
        // test records written
        testRecordsWritten(totalSuccessful, TOPIC);
    }
    boolean trySecond = true;
    if (trySecond) {
        Closer closer = Closer.create();
        try {
            jobProps.setProperty("source.numRecordsPerExtract", "" + numRecordsPerExtract);
            jobProps.setProperty("source.numParallelism", "" + numParallel);
            jobProps.setProperty("writer.kafka.producerConfig.flaky.errorType", "nth");
            jobProps.setProperty("writer.kafka.producerConfig.flaky.errorEvery", "" + errorEvery);
            JobLauncher jobLauncher = closer.register(JobLauncherFactory.newJobLauncher(gobblinProps, jobProps));
            jobLauncher.launchJob(null);
            totalSuccessful = totalRecords / errorEvery + totalRecords % errorEvery;
        } catch (Exception e) {
            log.error("Failed to run job with exception ", e);
            Assert.fail("Should not throw exception on running the job");
        } finally {
            closer.close();
        }
    }
    // test records written
    testRecordsWritten(totalSuccessful, TOPIC);
}
Also used: Closer (com.google.common.io.Closer), JobLauncher (org.apache.gobblin.runtime.JobLauncher), IOException (java.io.IOException), UnsupportedEncodingException (java.io.UnsupportedEncodingException), Test (org.testng.annotations.Test)
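
The expected-count arithmetic in this test is easy to misread, so here is a quick worked check using the test's own values; this is commentary on the snippet, not additional test code:

// numRecordsPerExtract = 5, numParallel = 2
int totalRecords = 5 * 2;                  // 10 records across two partitions
int errorEvery = 2000;                     // the "nth record fails" interval
int expected = totalRecords / errorEvery   // 10 / 2000 = 0
             + totalRecords % errorEvery;  // 10 % 2000 = 10
// expected == 10: with only 10 records, the nth-record error never fires,
// so the second run should land all 10 records. The first run uses the
// regex error type to drop every record from partition 0 instead, which is
// why totalSuccessful is overridden to 5 there.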

Example 93 with Closer

Use of com.google.common.io.Closer in project incubator-gobblin by apache.

The class GobblinOrcWriterTest, method testWrite.

/**
 * A basic unit for trivial writer correctness.
 * TODO: Add a detailed test suite for the ORC writer covering different kinds of schemas.
 */
@Test
public void testWrite() throws Exception {
    Schema schema = new Schema.Parser().parse(this.getClass().getClassLoader().getResourceAsStream("orc_writer_test/schema.avsc"));
    List<GenericRecord> recordList = deserializeAvroRecords(this.getClass(), schema, "orc_writer_test/data.json");
    // Mock the WriterBuilder; these mocked behaviors work around precondition checks in the writer builder.
    FsDataWriterBuilder<Schema, GenericRecord> mockBuilder = (FsDataWriterBuilder<Schema, GenericRecord>) Mockito.mock(FsDataWriterBuilder.class);
    when(mockBuilder.getSchema()).thenReturn(schema);
    State dummyState = new WorkUnit();
    String stagingDir = Files.createTempDir().getAbsolutePath();
    String outputDir = Files.createTempDir().getAbsolutePath();
    dummyState.setProp(ConfigurationKeys.WRITER_STAGING_DIR, stagingDir);
    dummyState.setProp(ConfigurationKeys.WRITER_FILE_PATH, "simple");
    dummyState.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, outputDir);
    when(mockBuilder.getFileName(dummyState)).thenReturn("file");
    Path outputFilePath = new Path(outputDir, "simple/file");
    // Use a closer to manage the life-cycle of the writer object.
    // This also verifies that scenarios like double-close are survivable.
    Closer closer = Closer.create();
    GobblinOrcWriter orcWriter = closer.register(new GobblinOrcWriter(mockBuilder, dummyState));
    // Create one more writer to test fail-case.
    GobblinOrcWriter orcFailWriter = new GobblinOrcWriter(mockBuilder, dummyState);
    for (GenericRecord record : recordList) {
        orcWriter.write(record);
        orcFailWriter.write(record);
    }
    // Not yet flushed or reaching default batch size, no records should have been materialized.
    Assert.assertEquals(orcWriter.recordsWritten(), 0);
    Assert.assertEquals(orcFailWriter.recordsWritten(), 0);
    // Try to close; this should throw the expected CloseBeforeFlushException.
    try {
        orcFailWriter.close();
    } catch (CloseBeforeFlushException e) {
        Assert.assertEquals(e.datasetName, schema.getName());
    }
    orcWriter.commit();
    Assert.assertEquals(orcWriter.recordsWritten(), 2);
    // Verify ORC file contains correct records.
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Assert.assertTrue(fs.exists(outputFilePath));
    List<Writable> orcRecords = deserializeOrcRecords(outputFilePath, fs);
    Assert.assertEquals(orcRecords.size(), 2);
    // Closing again through the closer exercises the double-close path; set a breakpoint
    // in the close method implementation if you want to verify.
    try {
        closer.close();
    } catch (NullPointerException npe) {
        Assert.fail();
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Closer (com.google.common.io.Closer), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), Writable (org.apache.hadoop.io.Writable), State (org.apache.gobblin.configuration.State), FileSystem (org.apache.hadoop.fs.FileSystem), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.testng.annotations.Test)
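
The double-close concern this test exercises is typically handled with a close-once guard inside the writer. The class below is a hypothetical sketch of that pattern, not GobblinOrcWriter's actual implementation:

import java.io.Closeable;
import java.io.IOException;

class CloseOnceWriter implements Closeable {

    private boolean closed = false;

    @Override
    public void close() throws IOException {
        if (closed) {
            return; // a second close (e.g. from Closer after commit) is a no-op
        }
        closed = true;
        // ... flush buffered rows and release the underlying file handle here ...
    }
}

With a guard like this, the closer.close() at the end of the test cannot trigger a NullPointerException even if the writer has already released its resources during commit().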

Example 94 with Closer

Use of com.google.common.io.Closer in project incubator-gobblin by apache.

The class SafeDatasetCommit, method generateCommitSequenceBuilder.

@SuppressWarnings("unchecked")
private Optional<CommitSequence.Builder> generateCommitSequenceBuilder(JobState.DatasetState datasetState, Collection<TaskState> taskStates) {
    try (Closer closer = Closer.create()) {
        Class<? extends CommitSequencePublisher> dataPublisherClass = (Class<? extends CommitSequencePublisher>) Class.forName(datasetState.getProp(ConfigurationKeys.DATA_PUBLISHER_TYPE, ConfigurationKeys.DEFAULT_DATA_PUBLISHER_TYPE));
        CommitSequencePublisher publisher = (CommitSequencePublisher) closer.register(DataPublisher.getInstance(dataPublisherClass, this.jobContext.getJobState()));
        publisher.publish(taskStates);
        return publisher.getCommitSequenceBuilder();
    } catch (Throwable t) {
        log.error("Failed to generate commit sequence", t);
        setTaskFailureException(datasetState.getTaskStates(), t);
        throw Throwables.propagate(t);
    }
}
Also used: Closer (com.google.common.io.Closer), CommitSequencePublisher (org.apache.gobblin.publisher.CommitSequencePublisher)
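
Two details are worth noting here. First, Closer implements Closeable, so it composes directly with try-with-resources, as this example shows: every registered resource is closed automatically, in reverse registration order. A minimal sketch of that shape, using a hypothetical byte-counting task:

import com.google.common.io.Closer;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

public class CloserWithTryWithResources {

    // try-with-resources invokes closer.close(), which in turn closes every
    // registered resource in reverse registration order.
    static long countBytes(String path) throws IOException {
        try (Closer closer = Closer.create()) {
            InputStream in = closer.register(new FileInputStream(path));
            long n = 0;
            while (in.read() != -1) {
                n++;
            }
            return n;
        }
    }
}

Second, the Throwables.propagate call in the catch block is deprecated in recent Guava releases; the suggested replacement is Throwables.throwIfUnchecked(t) followed by throw new RuntimeException(t).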

Example 95 with Closer

Use of com.google.common.io.Closer in project incubator-gobblin by apache.

The class SafeDatasetCommit, method call.

@Override
public Void call() throws Exception {
    if (this.datasetState.getState() == JobState.RunningState.COMMITTED) {
        log.info(this.datasetUrn + " has been committed.");
        return null;
    }
    metricContext = Instrumented.getMetricContext(datasetState, SafeDatasetCommit.class);
    finalizeDatasetStateBeforeCommit(this.datasetState);
    Class<? extends DataPublisher> dataPublisherClass;
    try (Closer closer = Closer.create()) {
        dataPublisherClass = JobContext.getJobDataPublisherClass(this.jobContext.getJobState()).or((Class<? extends DataPublisher>) Class.forName(ConfigurationKeys.DEFAULT_DATA_PUBLISHER_TYPE));
        if (!canCommitDataset(datasetState)) {
            log.warn(String.format("Not committing dataset %s of job %s with commit policy %s and state %s", this.datasetUrn, this.jobContext.getJobId(), this.jobContext.getJobCommitPolicy(), this.datasetState.getState()));
            checkForUnpublishedWUHandling(this.datasetUrn, this.datasetState, dataPublisherClass, closer);
            throw new RuntimeException(String.format("Not committing dataset %s of job %s with commit policy %s and state %s", this.datasetUrn, this.jobContext.getJobId(), this.jobContext.getJobCommitPolicy(), this.datasetState.getState()));
        }
    } catch (ReflectiveOperationException roe) {
        log.error("Failed to instantiate data publisher for dataset %s of job %s.", this.datasetUrn, this.jobContext.getJobId(), roe);
        throw new RuntimeException(roe);
    } finally {
        maySubmitFailureEvent(datasetState);
    }
    if (this.isJobCancelled) {
        log.info("Executing commit steps although job is cancelled due to job commit policy: " + this.jobContext.getJobCommitPolicy());
    }
    Optional<CommitSequence.Builder> commitSequenceBuilder = Optional.absent();
    boolean canPersistStates = true;
    try (Closer closer = Closer.create()) {
        if (this.shouldCommitDataInJob) {
            log.info(String.format("Committing dataset %s of job %s with commit policy %s and state %s", this.datasetUrn, this.jobContext.getJobId(), this.jobContext.getJobCommitPolicy(), this.datasetState.getState()));
            ListMultimap<TaskFactoryWrapper, TaskState> taskStatesByFactory = groupByTaskFactory(this.datasetState);
            for (Map.Entry<TaskFactoryWrapper, Collection<TaskState>> entry : taskStatesByFactory.asMap().entrySet()) {
                TaskFactory taskFactory = entry.getKey().getTaskFactory();
                if (this.deliverySemantics == DeliverySemantics.EXACTLY_ONCE) {
                    if (taskFactory != null) {
                        throw new RuntimeException("Custom task factories do not support exactly once delivery semantics.");
                    }
                    commitSequenceBuilder = generateCommitSequenceBuilder(this.datasetState, entry.getValue());
                } else {
                    DataPublisher publisher;
                    if (taskFactory == null) {
                        publisher = DataPublisherFactory.get(dataPublisherClass.getName(), this.jobContext.getJobState(), this.jobContext.getJobBroker());
                        // A publisher that is not cacheable is owned by this commit and must be closed by the closer.
                        if (!DataPublisherFactory.isPublisherCacheable(publisher)) {
                            closer.register(publisher);
                        }
                    } else {
                        // NOTE: sharing of publishers is not supported when they are instantiated through the TaskFactory.
                        // This should be revisited if sharing is required.
                        publisher = taskFactory.createDataPublisher(this.datasetState);
                    }
                    if (this.isJobCancelled) {
                        if (publisher.canBeSkipped()) {
                            log.warn(publisher.getClass() + " will be skipped.");
                        } else {
                            canPersistStates = false;
                            throw new RuntimeException("Cannot persist state upon cancellation because publisher has unfinished work and cannot be skipped.");
                        }
                    } else if (this.isMultithreaded && !publisher.isThreadSafe()) {
                        log.warn(String.format("Gobblin is set up to parallelize publishing, however the publisher %s is not thread-safe. " + "Falling back to serial publishing.", publisher.getClass().getName()));
                        safeCommitDataset(entry.getValue(), publisher);
                    } else {
                        commitDataset(entry.getValue(), publisher);
                    }
                }
            }
            this.datasetState.setState(JobState.RunningState.COMMITTED);
        } else if (this.datasetState.getState() == JobState.RunningState.SUCCESSFUL) {
            this.datasetState.setState(JobState.RunningState.COMMITTED);
        }
    } catch (Throwable throwable) {
        log.error(String.format("Failed to commit dataset state for dataset %s of job %s", this.datasetUrn, this.jobContext.getJobId()), throwable);
        throw new RuntimeException(throwable);
    } finally {
        try {
            finalizeDatasetState(datasetState, datasetUrn);
            maySubmitFailureEvent(datasetState);
            maySubmitLineageEvent(datasetState);
            if (commitSequenceBuilder.isPresent()) {
                buildAndExecuteCommitSequence(commitSequenceBuilder.get(), datasetState, datasetUrn);
                datasetState.setState(JobState.RunningState.COMMITTED);
            } else if (canPersistStates) {
                persistDatasetState(datasetUrn, datasetState);
            }
        } catch (IOException | RuntimeException ioe) {
            log.error(String.format("Failed to persist dataset state for dataset %s of job %s", datasetUrn, this.jobContext.getJobId()), ioe);
            throw new RuntimeException(ioe);
        }
    }
    return null;
}
Also used: Closer (com.google.common.io.Closer), FailureEventBuilder (org.apache.gobblin.metrics.event.FailureEventBuilder), IOException (java.io.IOException), DataPublisher (org.apache.gobblin.publisher.DataPublisher), TaskFactory (org.apache.gobblin.runtime.task.TaskFactory), Collection (java.util.Collection), Map (java.util.Map)
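
The subtle point in this method is resource ownership: publishers obtained from DataPublisherFactory may be cached and shared across datasets, so only non-cacheable instances are registered with the closer. The sketch below isolates that ownership rule behind a hypothetical Publisher interface; it is not Gobblin's actual API:

import com.google.common.io.Closer;

import java.io.Closeable;
import java.io.IOException;

// Hypothetical stand-in for DataPublisher, to show the ownership rule alone.
interface Publisher extends Closeable {
    boolean isCacheable();
}

class PublisherOwnership {

    static void commitWith(Publisher publisher) throws IOException {
        try (Closer closer = Closer.create()) {
            if (!publisher.isCacheable()) {
                // Owned by this commit: the closer tears it down on exit.
                closer.register(publisher);
            }
            // ... publish task states; a cacheable publisher stays open for
            // the next dataset that borrows it from the factory's cache ...
        }
    }
}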

Aggregations

Closer (com.google.common.io.Closer): 213 usages
IOException (java.io.IOException): 95 usages
File (java.io.File): 26 usages
Test (org.testng.annotations.Test): 21 usages
Path (org.apache.hadoop.fs.Path): 18 usages
Test (org.junit.Test): 18 usages
Properties (java.util.Properties): 16 usages
Closer (org.apache.flink.shaded.guava30.com.google.common.io.Closer): 16 usages
FileOutputStream (java.io.FileOutputStream): 15 usages
ArrayList (java.util.ArrayList): 15 usages
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 13 usages
FileInputStream (java.io.FileInputStream): 12 usages
InputStream (java.io.InputStream): 12 usages
OutputStream (java.io.OutputStream): 12 usages
Map (java.util.Map): 12 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 10 usages
DataInputStream (java.io.DataInputStream): 10 usages
UncheckedIOException (java.io.UncheckedIOException): 10 usages
Configuration (org.apache.hadoop.conf.Configuration): 10 usages
Text (org.apache.hadoop.io.Text): 9 usages