Use of org.apache.flink.shaded.guava30.com.google.common.io.Closer in project incubator-gobblin by apache.
The class ValidationJob, method getValidationOutputFromHive.
/**
* Execute Hive queries using {@link HiveJdbcConnector} and validate results.
* @param queries Queries to execute.
*/
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SQL_NONCONSTANT_STRING_PASSED_TO_EXECUTE", justification = "Temporary fix")
private List<Long> getValidationOutputFromHive(List<String> queries) throws IOException {
if (null == queries || queries.size() == 0) {
log.warn("No queries specified to be executed");
return Collections.emptyList();
}
List<Long> rowCounts = Lists.newArrayList();
Closer closer = Closer.create();
try {
HiveJdbcConnector hiveJdbcConnector = closer.register(HiveJdbcConnector.newConnectorWithProps(props));
for (String query : queries) {
String hiveOutput = "hiveConversionValidationOutput_" + UUID.randomUUID().toString();
Path hiveTempDir = new Path("/tmp" + Path.SEPARATOR + hiveOutput);
query = "INSERT OVERWRITE DIRECTORY '" + hiveTempDir + "' " + query;
log.info("Executing query: " + query);
try {
if (this.hiveSettings.size() > 0) {
hiveJdbcConnector.executeStatements(this.hiveSettings.toArray(new String[this.hiveSettings.size()]));
}
hiveJdbcConnector.executeStatements("SET hive.exec.compress.output=false", "SET hive.auto.convert.join=false", query);
FileStatus[] fileStatusList = this.fs.listStatus(hiveTempDir);
List<FileStatus> files = new ArrayList<>();
for (FileStatus fileStatus : fileStatusList) {
if (fileStatus.isFile()) {
files.add(fileStatus);
}
}
if (files.size() > 1) {
log.warn("Found more than one output file. Should have been one.");
} else if (files.size() == 0) {
log.warn("Found no output file. Should have been one.");
} else {
String theString = IOUtils.toString(new InputStreamReader(this.fs.open(files.get(0).getPath()), Charsets.UTF_8));
log.info("Found row count: " + theString.trim());
if (StringUtils.isBlank(theString.trim())) {
rowCounts.add(0L);
} else {
try {
rowCounts.add(Long.parseLong(theString.trim()));
} catch (NumberFormatException e) {
throw new RuntimeException("Could not parse Hive output: " + theString.trim(), e);
}
}
}
} finally {
if (this.fs.exists(hiveTempDir)) {
log.debug("Deleting temp dir: " + hiveTempDir);
this.fs.delete(hiveTempDir, true);
}
}
}
} catch (SQLException e) {
log.warn("Execution failed for query set " + queries.toString(), e);
} finally {
try {
closer.close();
} catch (Exception e) {
log.warn("Could not close HiveJdbcConnector", e);
}
}
return rowCounts;
}
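For reference, the snippet above follows the standard Guava Closer idiom: create a Closer, register each resource as it is opened, and close everything in a finally block. Below is a minimal, self-contained sketch of that idiom, using a plain BufferedReader instead of HiveJdbcConnector and the unshaded com.google.common.io.Closer import (the shaded variant named in the title exposes the same API).

import com.google.common.io.Closer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

public class CloserIdiomSketch {

  // Count lines in a file, guaranteeing the reader is closed even on failure.
  public static long countLines(File file) throws IOException {
    Closer closer = Closer.create();
    try {
      BufferedReader reader = closer.register(new BufferedReader(new FileReader(file)));
      long count = 0;
      while (reader.readLine() != null) {
        count++;
      }
      return count;
    } catch (Throwable t) {
      // Records t as the primary exception so that any exception thrown while closing is suppressed.
      throw closer.rethrow(t);
    } finally {
      closer.close();
    }
  }
}

The rethrow call is optional (the method above simply logs SQLExceptions instead), but it is the documented way to keep the original failure primary when close() also throws.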
Use of org.apache.flink.shaded.guava30.com.google.common.io.Closer in project incubator-gobblin by apache.
The class Kafka08DataWriterIntegrationTest, method testErrors.
@Test
public void testErrors() throws Exception {
log.warn("Process id = " + ManagementFactory.getRuntimeMXBean().getName());
int numRecordsPerExtract = 5;
int numParallel = 2;
int errorEvery = 2000;
int totalRecords = numRecordsPerExtract * numParallel;
int totalSuccessful = totalRecords / errorEvery + totalRecords % errorEvery;
{
Closer closer = Closer.create();
try {
kafkaTestHelper.provisionTopic(TOPIC);
jobProps.setProperty("source.numRecordsPerExtract", "" + numRecordsPerExtract);
jobProps.setProperty("source.numParallelism", "" + numParallel);
jobProps.setProperty("writer.kafka.producerConfig.flaky.errorType", "regex");
// all records from partition 0 will be dropped.
jobProps.setProperty("writer.kafka.producerConfig.flaky.regexPattern", ":index:0.*");
jobProps.setProperty("job.commit.policy", "partial");
jobProps.setProperty("publish.at.job.level", "false");
// number of records in partition 1
totalSuccessful = 5;
JobLauncher jobLauncher = closer.register(JobLauncherFactory.newJobLauncher(gobblinProps, jobProps));
jobLauncher.launchJob(null);
} catch (Exception e) {
log.error("Failed to run job with exception ", e);
Assert.fail("Should not throw exception on running the job");
} finally {
closer.close();
}
// test records written
testRecordsWritten(totalSuccessful, TOPIC);
}
boolean trySecond = true;
if (trySecond) {
Closer closer = Closer.create();
try {
jobProps.setProperty("source.numRecordsPerExtract", "" + numRecordsPerExtract);
jobProps.setProperty("source.numParallelism", "" + numParallel);
jobProps.setProperty("writer.kafka.producerConfig.flaky.errorType", "nth");
jobProps.setProperty("writer.kafka.producerConfig.flaky.errorEvery", "" + errorEvery);
JobLauncher jobLauncher = closer.register(JobLauncherFactory.newJobLauncher(gobblinProps, jobProps));
jobLauncher.launchJob(null);
totalSuccessful = totalRecords / errorEvery + totalRecords % errorEvery;
} catch (Exception e) {
log.error("Failed to run job with exception ", e);
Assert.fail("Should not throw exception on running the job");
} finally {
closer.close();
}
}
// test records written
testRecordsWritten(totalSuccessful, TOPIC);
}
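A note on the expected count in the second run: with numRecordsPerExtract = 5 and numParallel = 2, totalRecords is 10, and since errorEvery is 2000, totalRecords / errorEvery + totalRecords % errorEvery = 0 + 10 = 10, i.e. no record is expected to fail. As for the Closer usage itself, here is a minimal sketch of the launch-and-always-close shape used in both runs, with a hypothetical Closeable launcher standing in for JobLauncher.

import com.google.common.io.Closer;

import java.io.Closeable;

public class LaunchWithCloserSketch {

  // Hypothetical stand-in for JobLauncher; only the Closeable contract matters here.
  interface FakeLauncher extends Closeable {
    void launch() throws Exception;
  }

  // Mirrors the test: register the launcher right after creation so that
  // closer.close() in finally releases it whether launch() succeeds or fails.
  static void runOnce(FakeLauncher launcher) throws Exception {
    Closer closer = Closer.create();
    try {
      closer.register(launcher);
      launcher.launch();
    } finally {
      closer.close();
    }
  }
}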
Use of org.apache.flink.shaded.guava30.com.google.common.io.Closer in project incubator-gobblin by apache.
The class GobblinOrcWriterTest, method testWrite.
/**
* A basic unit test for trivial writer correctness.
* TODO: Add a detailed test suite for the ORC writer covering different sorts of schemas.
*/
@Test
public void testWrite() throws Exception {
Schema schema = new Schema.Parser().parse(this.getClass().getClassLoader().getResourceAsStream("orc_writer_test/schema.avsc"));
List<GenericRecord> recordList = deserializeAvroRecords(this.getClass(), schema, "orc_writer_test/data.json");
// Mock the WriterBuilder; a bunch of mocking behaviors to work around precondition checks in the writer builder
FsDataWriterBuilder<Schema, GenericRecord> mockBuilder = (FsDataWriterBuilder<Schema, GenericRecord>) Mockito.mock(FsDataWriterBuilder.class);
when(mockBuilder.getSchema()).thenReturn(schema);
State dummyState = new WorkUnit();
String stagingDir = Files.createTempDir().getAbsolutePath();
String outputDir = Files.createTempDir().getAbsolutePath();
dummyState.setProp(ConfigurationKeys.WRITER_STAGING_DIR, stagingDir);
dummyState.setProp(ConfigurationKeys.WRITER_FILE_PATH, "simple");
dummyState.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, outputDir);
when(mockBuilder.getFileName(dummyState)).thenReturn("file");
Path outputFilePath = new Path(outputDir, "simple/file");
// Use a closer to manage the life-cycle of the writer object.
// This also verifies that scenarios like double-close are survived.
Closer closer = Closer.create();
GobblinOrcWriter orcWriter = closer.register(new GobblinOrcWriter(mockBuilder, dummyState));
// Create one more writer to test the failure case.
GobblinOrcWriter orcFailWriter = new GobblinOrcWriter(mockBuilder, dummyState);
for (GenericRecord record : recordList) {
orcWriter.write(record);
orcFailWriter.write(record);
}
// Since the writer has not been flushed and the default batch size has not been reached, no records should have been materialized yet.
Assert.assertEquals(orcWriter.recordsWritten(), 0);
Assert.assertEquals(orcFailWriter.recordsWritten(), 0);
// Try closing; the relevant CloseBeforeFlushException should be caught.
try {
orcFailWriter.close();
} catch (CloseBeforeFlushException e) {
Assert.assertEquals(e.datasetName, schema.getName());
}
orcWriter.commit();
Assert.assertEquals(orcWriter.recordsWritten(), 2);
// Verify ORC file contains correct records.
FileSystem fs = FileSystem.getLocal(new Configuration());
Assert.assertTrue(fs.exists(outputFilePath));
List<Writable> orcRecords = deserializeOrcRecords(outputFilePath, fs);
Assert.assertEquals(orcRecords.size(), 2);
// Closing again through the closer should be a no-op; see the close method implementation if you want to verify.
try {
closer.close();
} catch (NullPointerException npe) {
Assert.fail();
}
}
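The test relies on the writer tolerating a second close: orcWriter is committed (and therefore closed) explicitly, and then closed again when closer.close() runs. Below is a minimal sketch of an idempotent close() that makes such double-close safe; it is an illustrative pattern, not the actual GobblinOrcWriter implementation.

import java.io.Closeable;
import java.io.IOException;

public class IdempotentCloseSketch implements Closeable {

  private boolean closed = false;

  @Override
  public synchronized void close() throws IOException {
    if (closed) {
      // Second and subsequent closes are no-ops, so a Closer can safely close this object again.
      return;
    }
    closed = true;
    // Release underlying resources here (flush buffers, close file handles, ...).
  }
}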
Use of org.apache.flink.shaded.guava30.com.google.common.io.Closer in project incubator-gobblin by apache.
The class SafeDatasetCommit, method generateCommitSequenceBuilder.
@SuppressWarnings("unchecked")
private Optional<CommitSequence.Builder> generateCommitSequenceBuilder(JobState.DatasetState datasetState, Collection<TaskState> taskStates) {
try (Closer closer = Closer.create()) {
Class<? extends CommitSequencePublisher> dataPublisherClass = (Class<? extends CommitSequencePublisher>) Class.forName(datasetState.getProp(ConfigurationKeys.DATA_PUBLISHER_TYPE, ConfigurationKeys.DEFAULT_DATA_PUBLISHER_TYPE));
CommitSequencePublisher publisher = (CommitSequencePublisher) closer.register(DataPublisher.getInstance(dataPublisherClass, this.jobContext.getJobState()));
publisher.publish(taskStates);
return publisher.getCommitSequenceBuilder();
} catch (Throwable t) {
log.error("Failed to generate commit sequence", t);
setTaskFailureException(datasetState.getTaskStates(), t);
throw Throwables.propagate(t);
}
}
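Because Closer itself implements Closeable, it can sit directly in try-with-resources as above, and exceptions thrown by the implicit close are visible to the catch block. Here is a minimal generic sketch of the same shape: reflectively create a Closeable, register it, use it, and propagate any failure unchecked. The helper names are illustrative, not Gobblin API.

import com.google.common.base.Throwables;
import com.google.common.io.Closer;

import java.io.Closeable;
import java.util.function.Function;

public class TryWithResourcesCloserSketch {

  // Instantiate clazz reflectively, apply use() to it, and guarantee it is closed.
  static <T extends Closeable, R> R createUseAndClose(Class<T> clazz, Function<T, R> use) {
    try (Closer closer = Closer.create()) {
      T resource = closer.register(clazz.getDeclaredConstructor().newInstance());
      // The closer closes the resource after the return expression has been evaluated.
      return use.apply(resource);
    } catch (Throwable t) {
      throw Throwables.propagate(t);
    }
  }
}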
Use of org.apache.flink.shaded.guava30.com.google.common.io.Closer in project incubator-gobblin by apache.
The class SafeDatasetCommit, method call.
@Override
public Void call() throws Exception {
if (this.datasetState.getState() == JobState.RunningState.COMMITTED) {
log.info(this.datasetUrn + " has been committed.");
return null;
}
metricContext = Instrumented.getMetricContext(datasetState, SafeDatasetCommit.class);
finalizeDatasetStateBeforeCommit(this.datasetState);
Class<? extends DataPublisher> dataPublisherClass;
try (Closer closer = Closer.create()) {
dataPublisherClass = JobContext.getJobDataPublisherClass(this.jobContext.getJobState()).or((Class<? extends DataPublisher>) Class.forName(ConfigurationKeys.DEFAULT_DATA_PUBLISHER_TYPE));
if (!canCommitDataset(datasetState)) {
log.warn(String.format("Not committing dataset %s of job %s with commit policy %s and state %s", this.datasetUrn, this.jobContext.getJobId(), this.jobContext.getJobCommitPolicy(), this.datasetState.getState()));
checkForUnpublishedWUHandling(this.datasetUrn, this.datasetState, dataPublisherClass, closer);
throw new RuntimeException(String.format("Not committing dataset %s of job %s with commit policy %s and state %s", this.datasetUrn, this.jobContext.getJobId(), this.jobContext.getJobCommitPolicy(), this.datasetState.getState()));
}
} catch (ReflectiveOperationException roe) {
log.error("Failed to instantiate data publisher for dataset %s of job %s.", this.datasetUrn, this.jobContext.getJobId(), roe);
throw new RuntimeException(roe);
} finally {
maySubmitFailureEvent(datasetState);
}
if (this.isJobCancelled) {
log.info("Executing commit steps although job is cancelled due to job commit policy: " + this.jobContext.getJobCommitPolicy());
}
Optional<CommitSequence.Builder> commitSequenceBuilder = Optional.absent();
boolean canPersistStates = true;
try (Closer closer = Closer.create()) {
if (this.shouldCommitDataInJob) {
log.info(String.format("Committing dataset %s of job %s with commit policy %s and state %s", this.datasetUrn, this.jobContext.getJobId(), this.jobContext.getJobCommitPolicy(), this.datasetState.getState()));
ListMultimap<TaskFactoryWrapper, TaskState> taskStatesByFactory = groupByTaskFactory(this.datasetState);
for (Map.Entry<TaskFactoryWrapper, Collection<TaskState>> entry : taskStatesByFactory.asMap().entrySet()) {
TaskFactory taskFactory = entry.getKey().getTaskFactory();
if (this.deliverySemantics == DeliverySemantics.EXACTLY_ONCE) {
if (taskFactory != null) {
throw new RuntimeException("Custom task factories do not support exactly once delivery semantics.");
}
generateCommitSequenceBuilder(this.datasetState, entry.getValue());
} else {
DataPublisher publisher;
if (taskFactory == null) {
publisher = DataPublisherFactory.get(dataPublisherClass.getName(), this.jobContext.getJobState(), this.jobContext.getJobBroker());
// Publishers that are not cacheable are owned by this commit, so register them with the closer for cleanup.
if (!DataPublisherFactory.isPublisherCacheable(publisher)) {
closer.register(publisher);
}
} else {
// NOTE: sharing of publishers is not supported when they are instantiated through the TaskFactory.
// This should be revisited if sharing is required.
publisher = taskFactory.createDataPublisher(this.datasetState);
}
if (this.isJobCancelled) {
if (publisher.canBeSkipped()) {
log.warn(publisher.getClass() + " will be skipped.");
} else {
canPersistStates = false;
throw new RuntimeException("Cannot persist state upon cancellation because publisher has unfinished work and cannot be skipped.");
}
} else if (this.isMultithreaded && !publisher.isThreadSafe()) {
log.warn(String.format("Gobblin is set up to parallelize publishing, however the publisher %s is not thread-safe. " + "Falling back to serial publishing.", publisher.getClass().getName()));
safeCommitDataset(entry.getValue(), publisher);
} else {
commitDataset(entry.getValue(), publisher);
}
}
}
this.datasetState.setState(JobState.RunningState.COMMITTED);
} else {
if (this.datasetState.getState() == JobState.RunningState.SUCCESSFUL) {
this.datasetState.setState(JobState.RunningState.COMMITTED);
}
}
} catch (Throwable throwable) {
log.error(String.format("Failed to commit dataset state for dataset %s of job %s", this.datasetUrn, this.jobContext.getJobId()), throwable);
throw new RuntimeException(throwable);
} finally {
try {
finalizeDatasetState(datasetState, datasetUrn);
maySubmitFailureEvent(datasetState);
maySubmitLineageEvent(datasetState);
if (commitSequenceBuilder.isPresent()) {
buildAndExecuteCommitSequence(commitSequenceBuilder.get(), datasetState, datasetUrn);
datasetState.setState(JobState.RunningState.COMMITTED);
} else if (canPersistStates) {
persistDatasetState(datasetUrn, datasetState);
}
} catch (IOException | RuntimeException ioe) {
log.error(String.format("Failed to persist dataset state for dataset %s of job %s", datasetUrn, this.jobContext.getJobId()), ioe);
throw new RuntimeException(ioe);
}
}
return null;
}
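One detail worth calling out in the publisher handling above: only publishers obtained from DataPublisherFactory that are reported as non-cacheable get registered with the closer, presumably because cacheable publishers are shared and must outlive this commit. Below is a minimal sketch of that conditional-ownership pattern, with a hypothetical cache interface standing in for DataPublisherFactory.

import com.google.common.io.Closer;

import java.io.Closeable;

public class ConditionalRegistrationSketch {

  // Hypothetical cache; stands in for DataPublisherFactory in the method above.
  interface ResourceCache {
    Closeable get(String key);
    boolean isCacheable(Closeable resource);
  }

  // Register only resources this caller owns; shared (cacheable) ones are left open.
  static Closeable acquire(ResourceCache cache, String key, Closer closer) {
    Closeable resource = cache.get(key);
    if (!cache.isCacheable(resource)) {
      closer.register(resource);
    }
    return resource;
  }
}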