Use of com.metamx.common.lifecycle.Lifecycle in project hive by apache.
The class DruidStorageHandler, method commitCreateTable:
@Override
public void commitCreateTable(Table table) throws MetaException {
  if (MetaStoreUtils.isExternalTable(table)) {
    return;
  }
  Lifecycle lifecycle = new Lifecycle();
  LOG.info(String.format("Committing table [%s] to the druid metastore", table.getDbName()));
  final Path tableDir = getSegmentDescriptorDir();
  try {
    List<DataSegment> segmentList = DruidStorageHandlerUtils.getPublishedSegments(tableDir, getConf());
    LOG.info(String.format("Found [%d] segments under path [%s]", segmentList.size(), tableDir));
    druidSqlMetadataStorageUpdaterJobHandler.publishSegments(
        druidMetadataStorageTablesConfig.getSegmentsTable(), segmentList,
        DruidStorageHandlerUtils.JSON_MAPPER);
    final String coordinatorAddress = HiveConf.getVar(getConf(),
        HiveConf.ConfVars.HIVE_DRUID_COORDINATOR_DEFAULT_ADDRESS);
    int maxTries = HiveConf.getIntVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_MAX_TRIES);
    final String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
    LOG.info(String.format("checking load status from coordinator [%s]", coordinatorAddress));
    // Check that the coordinator is up
    httpClient = makeHttpClient(lifecycle);
    try {
      lifecycle.start();
    } catch (Exception e) {
      Throwables.propagate(e);
    }
    String coordinatorResponse = null;
    try {
      coordinatorResponse = RetryUtils.retry(new Callable<String>() {
        @Override
        public String call() throws Exception {
          return DruidStorageHandlerUtils.getURL(httpClient,
              new URL(String.format("http://%s/status", coordinatorAddress)));
        }
      }, new Predicate<Throwable>() {
        @Override
        public boolean apply(@Nullable Throwable input) {
          return input instanceof IOException;
        }
      }, maxTries);
    } catch (Exception e) {
      console.printInfo("Will skip waiting for data loading");
      return;
    }
    if (Strings.isNullOrEmpty(coordinatorResponse)) {
      console.printInfo("Will skip waiting for data loading");
      return;
    }
    console.printInfo(String.format("Waiting for the loading of [%s] segments", segmentList.size()));
    long passiveWaitTimeMs = HiveConf.getLongVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_PASSIVE_WAIT_TIME);
    ImmutableSet<URL> setOfUrls = FluentIterable.from(segmentList)
        .transform(new Function<DataSegment, URL>() {
          @Override
          public URL apply(DataSegment dataSegment) {
            try {
              // Need to make sure that we are using UTC since most Druid clusters use UTC by default
              return new URL(String.format("http://%s/druid/coordinator/v1/datasources/%s/segments/%s",
                  coordinatorAddress, dataSourceName,
                  DataSegment.makeDataSegmentIdentifier(dataSegment.getDataSource(),
                      new DateTime(dataSegment.getInterval().getStartMillis(), DateTimeZone.UTC),
                      new DateTime(dataSegment.getInterval().getEndMillis(), DateTimeZone.UTC),
                      dataSegment.getVersion(), dataSegment.getShardSpec())));
            } catch (MalformedURLException e) {
              Throwables.propagate(e);
            }
            return null;
          }
        }).toSet();
    int numRetries = 0;
    while (numRetries++ < maxTries && !setOfUrls.isEmpty()) {
      // Keep only the URLs whose segments are not reported as loaded yet
      setOfUrls = ImmutableSet.copyOf(Sets.filter(setOfUrls, new Predicate<URL>() {
        @Override
        public boolean apply(URL input) {
          try {
            String result = DruidStorageHandlerUtils.getURL(httpClient, input);
            LOG.debug(String.format("Checking segment [%s] response is [%s]", input, result));
            return Strings.isNullOrEmpty(result);
          } catch (IOException e) {
            LOG.error(String.format("Error while checking URL [%s]", input), e);
            return true;
          }
        }
      }));
      try {
        if (!setOfUrls.isEmpty()) {
          Thread.sleep(passiveWaitTimeMs);
        }
      } catch (InterruptedException e) {
        Thread.interrupted();
        Throwables.propagate(e);
      }
    }
    if (!setOfUrls.isEmpty()) {
      // We are not throwing an exception since it might be a transient issue that is blocking the loading
      console.printError(String.format(
          "Wait time exhausted and we have [%s] out of [%s] segments not loaded yet",
          setOfUrls.size(), segmentList.size()));
    }
  } catch (IOException e) {
    LOG.error("Exception while commit", e);
    Throwables.propagate(e);
  } finally {
    cleanWorkingDir();
    lifecycle.stop();
  }
}
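The helper makeHttpClient referenced above is not part of this snippet. As a rough, hypothetical sketch (not the actual Hive implementation), it could build a Lifecycle-managed client with the same HttpClientInit/HttpClientConfig API used in the other snippets on this page, reading the connection count and read timeout from HiveConf:

// Hypothetical sketch of makeHttpClient; the configuration variables mirror those
// used by the other snippets on this page, the exact body is an assumption.
private HttpClient makeHttpClient(Lifecycle lifecycle) {
  final int numConnection = HiveConf.getIntVar(getConf(),
      HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  final Period readTimeout = new Period(
      HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  // createClient registers the client's resources on the Lifecycle, so the surrounding
  // lifecycle.start()/stop() calls control when its connection pool is opened and released.
  return HttpClientInit.createClient(HttpClientConfig.builder()
      .withNumConnections(numConnection)
      .withReadTimeout(readTimeout.toStandardDuration())
      .build(), lifecycle);
}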
Use of com.metamx.common.lifecycle.Lifecycle in project hive by apache.
The class DruidQueryBasedInputFormat, method splitSelectQuery:
/* Method that splits the Select query depending on the threshold so the read can be
 * parallelized. We will only contact the Druid broker to obtain all results. */
private static HiveDruidSplit[] splitSelectQuery(Configuration conf, String address,
    SelectQuery query, Path dummyPath) throws IOException {
  final int selectThreshold = (int) HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_SELECT_THRESHOLD);
  final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
  if (isFetch) {
    // If it has a limit, we use it and we do not split the query
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath,
        new String[] { address }) };
  }
  // We do not have the number of rows, thus we need to execute a
  // Segment Metadata query to obtain the number of rows
  SegmentMetadataQueryBuilder metadataBuilder = new Druids.SegmentMetadataQueryBuilder();
  metadataBuilder.dataSource(query.getDataSource());
  metadataBuilder.intervals(query.getIntervals());
  metadataBuilder.merge(true);
  metadataBuilder.analysisTypes();
  SegmentMetadataQuery metadataQuery = metadataBuilder.build();
  Lifecycle lifecycle = new Lifecycle();
  HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder()
      .withNumConnections(numConnection)
      .withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
  try {
    lifecycle.start();
  } catch (Exception e) {
    LOG.error("Lifecycle start issue");
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  InputStream response;
  try {
    response = DruidStorageHandlerUtils.submitRequest(client,
        DruidStorageHandlerUtils.createRequest(address, metadataQuery));
  } catch (Exception e) {
    lifecycle.stop();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  // Retrieve results
  List<SegmentAnalysis> metadataList;
  try {
    metadataList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
        new TypeReference<List<SegmentAnalysis>>() {
        });
  } catch (Exception e) {
    response.close();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  } finally {
    lifecycle.stop();
  }
  if (metadataList == null) {
    throw new IOException("Connected to Druid but could not retrieve datasource information");
  }
  if (metadataList.isEmpty()) {
    // There are no rows for that time range, we can submit the query as it is
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath,
        new String[] { address }) };
  }
  if (metadataList.size() != 1) {
    throw new IOException("Information about segments should have been merged");
  }
  final long numRows = metadataList.get(0).getNumRows();
  query = query.withPagingSpec(PagingSpec.newSpec(Integer.MAX_VALUE));
  if (numRows <= selectThreshold) {
    // We are not going to split it
    return new HiveDruidSplit[] { new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath,
        new String[] { address }) };
  }
  // If the query does not specify a timestamp, we obtain the total time range using
  // a Time Boundary query. Then, we use that information to split the query
  // following the Select threshold configuration property
  final List<Interval> intervals = new ArrayList<>();
  if (query.getIntervals().size() == 1 && query.getIntervals().get(0)
      .withChronology(ISOChronology.getInstanceUTC()).equals(DruidTable.DEFAULT_INTERVAL)) {
    // Default max and min, we should execute a time boundary query to get a
    // more precise range
    TimeBoundaryQueryBuilder timeBuilder = new Druids.TimeBoundaryQueryBuilder();
    timeBuilder.dataSource(query.getDataSource());
    TimeBoundaryQuery timeQuery = timeBuilder.build();
    lifecycle = new Lifecycle();
    client = HttpClientInit.createClient(HttpClientConfig.builder()
        .withNumConnections(numConnection)
        .withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
    try {
      lifecycle.start();
    } catch (Exception e) {
      LOG.error("Lifecycle start issue");
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    try {
      response = DruidStorageHandlerUtils.submitRequest(client,
          DruidStorageHandlerUtils.createRequest(address, timeQuery));
    } catch (Exception e) {
      lifecycle.stop();
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    // Retrieve results
    List<Result<TimeBoundaryResultValue>> timeList;
    try {
      timeList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
          new TypeReference<List<Result<TimeBoundaryResultValue>>>() {
          });
    } catch (Exception e) {
      response.close();
      throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    } finally {
      lifecycle.stop();
    }
    if (timeList == null || timeList.isEmpty()) {
      throw new IOException("Connected to Druid but could not retrieve time boundary information");
    }
    if (timeList.size() != 1) {
      throw new IOException("We should obtain a single time boundary");
    }
    intervals.add(new Interval(timeList.get(0).getValue().getMinTime().getMillis(),
        timeList.get(0).getValue().getMaxTime().getMillis(), ISOChronology.getInstanceUTC()));
  } else {
    intervals.addAll(query.getIntervals());
  }
  // Create (numRows / selectThreshold) input splits
  int numSplits = (int) Math.ceil((double) numRows / selectThreshold);
  List<List<Interval>> newIntervals = createSplitsIntervals(intervals, numSplits);
  HiveDruidSplit[] splits = new HiveDruidSplit[numSplits];
  for (int i = 0; i < numSplits; i++) {
    // Create partial Select query
    final SelectQuery partialQuery = query.withQuerySegmentSpec(
        new MultipleIntervalSegmentSpec(newIntervals.get(i)));
    splits[i] = new HiveDruidSplit(
        DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery), dummyPath,
        new String[] { address });
  }
  return splits;
}
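The private helper createSplitsIntervals is not shown in this snippet. Below is a simplified, hypothetical sketch of one way such a helper could partition the time range; it assumes the input intervals are sorted and contiguous and simply slices the overall range into numSplits equal sub-ranges, which is an illustration rather than the exact Hive logic:

// Hypothetical sketch: evenly partitions the covered time range into numSplits
// consecutive sub-ranges, each returned as a single-interval list.
private static List<List<Interval>> createSplitsIntervals(List<Interval> intervals, int numSplits) {
  final long startTime = intervals.get(0).getStartMillis();
  final long endTime = intervals.get(intervals.size() - 1).getEndMillis();
  final long totalTime = endTime - startTime;
  List<List<Interval>> newIntervals = new ArrayList<>();
  for (int i = 0; i < numSplits; i++) {
    long splitStart = startTime + (totalTime * i) / numSplits;
    long splitEnd = startTime + (totalTime * (i + 1)) / numSplits;
    newIntervals.add(Collections.singletonList(
        new Interval(splitStart, splitEnd, ISOChronology.getInstanceUTC())));
  }
  return newIntervals;
}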
Use of com.metamx.common.lifecycle.Lifecycle in project hive by apache.
The class DruidQueryRecordReader, method initialize:
public void initialize(InputSplit split, Configuration conf) throws IOException {
  HiveDruidSplit hiveDruidSplit = (HiveDruidSplit) split;
  // Create query
  query = createQuery(hiveDruidSplit.getDruidQuery());
  // Execute query
  if (LOG.isInfoEnabled()) {
    LOG.info("Retrieving from druid using query:\n " + query);
  }
  final Lifecycle lifecycle = new Lifecycle();
  final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
  final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
  HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder()
      .withReadTimeout(readTimeout.toStandardDuration())
      .withNumConnections(numConnection).build(), lifecycle);
  try {
    lifecycle.start();
  } catch (Exception e) {
    LOG.error("Issues with lifecycle start", e);
  }
  InputStream response;
  try {
    response = DruidStorageHandlerUtils.submitRequest(client,
        DruidStorageHandlerUtils.createRequest(hiveDruidSplit.getLocations()[0], query));
  } catch (Exception e) {
    lifecycle.stop();
    throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  // Retrieve results
  List<R> resultsList;
  try {
    resultsList = createResultsList(response);
  } catch (IOException e) {
    response.close();
    throw e;
  } finally {
    lifecycle.stop();
  }
  if (resultsList == null || resultsList.isEmpty()) {
    return;
  }
  results = resultsList.iterator();
}
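Once results holds the iterator, the record reader's next-style methods simply drain it. A minimal hedged sketch of such a consumer is below; the method shape follows the Hadoop RecordReader convention, and currentRow is an assumed field name rather than the actual DruidQueryRecordReader API:

// Hypothetical sketch: advance to the next Druid result row, if any remain.
public boolean nextKeyValue() {
  if (results != null && results.hasNext()) {
    currentRow = results.next(); // assumed field holding the current result of type R
    return true;
  }
  return false;
}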