Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.
Class PartitionMonitorTest, method preparePartitionMonitor:
private void preparePartitionMonitor() {
    List<List<String>> seenPartitionsSinceOffset = new ArrayList<>();
    JobConf jobConf = new JobConf();
    Configuration configuration = new Configuration();
    ObjectPath tablePath = new ObjectPath("testDb", "testTable");
    configuration.setString("streaming-source.consume-order", "create-time");

    HiveContinuousPartitionContext<Partition, Long> fetcherContext =
            new HiveContinuousPartitionContext<Partition, Long>() {

                @Override
                public HiveTablePartition toHiveTablePartition(Partition partition) {
                    // the test encodes each partition value as a "key=value" string; split it
                    // back into a partition spec and pair it with the partition's StorageDescriptor
                    StorageDescriptor sd = partition.getSd();
                    Map<String, String> partitionColValues = new HashMap<>();
                    for (String partCol : partition.getValues()) {
                        String[] arr = partCol.split("=");
                        Asserts.check(arr.length == 2, "partition string should be key=value format");
                        partitionColValues.put(arr[0], arr[1]);
                    }
                    return new HiveTablePartition(sd, partitionColValues, new Properties());
                }

                @Override
                public ObjectPath getTablePath() {
                    return null;
                }

                @Override
                public TypeSerializer<Long> getTypeSerializer() {
                    return null;
                }

                @Override
                public Long getConsumeStartOffset() {
                    return null;
                }

                @Override
                public void open() throws Exception {}

                @Override
                public Optional<Partition> getPartition(List<String> partValues) throws Exception {
                    return Optional.empty();
                }

                @Override
                public List<ComparablePartitionValue> getComparablePartitionValueList() throws Exception {
                    return null;
                }

                @Override
                public void close() throws Exception {}
            };

    ContinuousPartitionFetcher<Partition, Long> continuousPartitionFetcher =
            new ContinuousPartitionFetcher<Partition, Long>() {

                private static final long serialVersionUID = 1L;

                @Override
                public List<Tuple2<Partition, Long>> fetchPartitions(
                        Context<Partition, Long> context, Long previousOffset) throws Exception {
                    // return every test partition created at or after the previous offset,
                    // using the partition's create time as the new offset
                    return testPartitionWithOffset.stream()
                            .filter(p -> (long) p.getCreateTime() >= previousOffset)
                            .map(p -> Tuple2.of(p, (long) p.getCreateTime()))
                            .collect(Collectors.toList());
                }

                @Override
                public List<Partition> fetch(PartitionFetcher.Context<Partition> context) throws Exception {
                    return null;
                }
            };

    partitionMonitor =
            new ContinuousHiveSplitEnumerator.PartitionMonitor<>(
                    0L,
                    seenPartitionsSinceOffset,
                    tablePath,
                    configuration,
                    jobConf,
                    continuousPartitionFetcher,
                    fetcherContext);
}
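The test above iterates over a list of metastore Partition objects (testPartitionWithOffset). A minimal sketch of how such a test partition could be built, assuming only the Thrift-generated Partition and StorageDescriptor setters plus java.util.Arrays (the helper name and arguments are illustrative, not part of the Flink test):

    // Hypothetical helper: builds a metastore Partition whose StorageDescriptor only
    // carries a location and whose values follow the "key=value" convention that
    // toHiveTablePartition() above expects.
    private static Partition createTestPartition(String location, int createTime, String... keyValuePairs) {
        StorageDescriptor sd = new StorageDescriptor();
        sd.setLocation(location);
        Partition partition = new Partition();
        partition.setSd(sd);
        partition.setValues(Arrays.asList(keyValuePairs));
        partition.setCreateTime(createTime);
        return partition;
    }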
Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.
Class HiveSourceFileEnumerator, method getNumFiles:
public static int getNumFiles(List<HiveTablePartition> partitions, JobConf jobConf) throws IOException {
    int numFiles = 0;
    for (HiveTablePartition partition : partitions) {
        StorageDescriptor sd = partition.getStorageDescriptor();
        org.apache.hadoop.fs.Path inputPath = new org.apache.hadoop.fs.Path(sd.getLocation());
        FileSystem fs = inputPath.getFileSystem(jobConf);
        // it's possible a partition exists in metastore but the data has been removed
        if (!fs.exists(inputPath)) {
            continue;
        }
        numFiles += fs.listStatus(inputPath).length;
    }
    return numFiles;
}
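A hedged usage sketch (not from the Flink sources): the method can be exercised with a single HiveTablePartition whose StorageDescriptor points at a known location. The path and partition spec below are made up, and java.util.Collections and java.util.Properties are assumed to be imported.

    StorageDescriptor sd = new StorageDescriptor();
    sd.setLocation("hdfs:///warehouse/testdb.db/testtable/dt=2021-01-01"); // illustrative path
    HiveTablePartition partition =
            new HiveTablePartition(sd, Collections.singletonMap("dt", "2021-01-01"), new Properties());
    // counts the files directly under the partition location, skipping partitions
    // whose data directory no longer exists
    int numFiles = HiveSourceFileEnumerator.getNumFiles(Collections.singletonList(partition), new JobConf());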
Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.
Class HiveTableSink, method consume:
private DataStreamSink<?> consume(
        ProviderContext providerContext,
        DataStream<RowData> dataStream,
        boolean isBounded,
        DataStructureConverter converter) {
    checkAcidTable(catalogTable.getOptions(), identifier.toObjectPath());
    try (HiveMetastoreClientWrapper client =
            HiveMetastoreClientFactory.create(HiveConfUtils.create(jobConf), hiveVersion)) {
        Table table = client.getTable(identifier.getDatabaseName(), identifier.getObjectName());
        StorageDescriptor sd = table.getSd();
        Class hiveOutputFormatClz = hiveShim.getHiveOutputFormatClass(Class.forName(sd.getOutputFormat()));
        boolean isCompressed = jobConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
        HiveWriterFactory writerFactory =
                new HiveWriterFactory(
                        jobConf,
                        hiveOutputFormatClz,
                        sd.getSerdeInfo(),
                        tableSchema,
                        getPartitionKeyArray(),
                        HiveReflectionUtils.getTableMetadata(hiveShim, table),
                        hiveShim,
                        isCompressed);
        String extension =
                Utilities.getFileExtension(
                        jobConf, isCompressed, (HiveOutputFormat<?, ?>) hiveOutputFormatClz.newInstance());
        OutputFileConfig.OutputFileConfigBuilder fileNamingBuilder =
                OutputFileConfig.builder()
                        .withPartPrefix("part-" + UUID.randomUUID().toString())
                        .withPartSuffix(extension == null ? "" : extension);
        final int parallelism =
                Optional.ofNullable(configuredParallelism).orElse(dataStream.getParallelism());
        if (isBounded) {
            OutputFileConfig fileNaming = fileNamingBuilder.build();
            return createBatchSink(dataStream, converter, sd, writerFactory, fileNaming, parallelism);
        } else {
            if (overwrite) {
                throw new IllegalStateException("Streaming mode not support overwrite.");
            }
            Properties tableProps = HiveReflectionUtils.getTableMetadata(hiveShim, table);
            return createStreamSink(
                    providerContext, dataStream, sd, tableProps, writerFactory, fileNamingBuilder, parallelism);
        }
    } catch (TException e) {
        throw new CatalogException("Failed to query Hive metaStore", e);
    } catch (IOException e) {
        throw new FlinkRuntimeException("Failed to create staging dir", e);
    } catch (ClassNotFoundException e) {
        throw new FlinkHiveException("Failed to get output format class", e);
    } catch (IllegalAccessException | InstantiationException e) {
        throw new FlinkHiveException("Failed to instantiate output format instance", e);
    }
}
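Stripped of the sink wiring, the metastore access above reduces to reading the table's StorageDescriptor. A simplified sketch, reusing the same jobConf and hiveVersion fields and with made-up database and table names:

    try (HiveMetastoreClientWrapper client =
            HiveMetastoreClientFactory.create(HiveConfUtils.create(jobConf), hiveVersion)) {
        Table table = client.getTable("testDb", "testTable"); // hypothetical database/table
        StorageDescriptor sd = table.getSd();
        String location = sd.getLocation();         // used to derive the staging directory
        String outputFormat = sd.getOutputFormat();  // used to resolve the Hive output format class
        SerDeInfo serdeInfo = sd.getSerdeInfo();     // passed to the HiveWriterFactory
    } catch (TException e) {
        throw new CatalogException("Failed to query Hive metaStore", e);
    }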
Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.
Class HiveTableSink, method createBatchSink:
private DataStreamSink<Row> createBatchSink(
        DataStream<RowData> dataStream, DataStructureConverter converter, StorageDescriptor sd,
        HiveWriterFactory recordWriterFactory, OutputFileConfig fileNaming, final int parallelism)
        throws IOException {
    FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
    builder.setPartitionComputer(new HiveRowPartitionComputer(
            hiveShim,
            JobConfUtils.getDefaultPartitionName(jobConf),
            tableSchema.getFieldNames(),
            tableSchema.getFieldDataTypes(),
            getPartitionKeyArray()));
    builder.setDynamicGrouped(dynamicGrouping);
    builder.setPartitionColumns(getPartitionKeyArray());
    builder.setFileSystemFactory(fsFactory());
    builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
    builder.setMetaStoreFactory(msFactory());
    builder.setOverwrite(overwrite);
    builder.setStaticPartitions(staticPartitionSpec);
    // the staging directory is derived from the location in the StorageDescriptor
    builder.setTempPath(new org.apache.flink.core.fs.Path(toStagingDir(sd.getLocation(), jobConf)));
    builder.setOutputFileConfig(fileNaming);
    return dataStream
            .map((MapFunction<RowData, Row>) value -> (Row) converter.toExternal(value))
            .writeUsingOutputFormat(builder.build())
            .setParallelism(parallelism);
}
Use of org.apache.hadoop.hive.metastore.api.StorageDescriptor in project flink by apache.
Class HiveCatalog, method instantiateHivePartition:
private Partition instantiateHivePartition(
        Table hiveTable, CatalogPartitionSpec partitionSpec, CatalogPartition catalogPartition)
        throws PartitionSpecInvalidException {
    List<String> partCols = getFieldNames(hiveTable.getPartitionKeys());
    List<String> partValues =
            getOrderedFullPartitionValues(
                    partitionSpec, partCols, new ObjectPath(hiveTable.getDbName(), hiveTable.getTableName()));
    // validate partition values
    for (int i = 0; i < partCols.size(); i++) {
        if (isNullOrWhitespaceOnly(partValues.get(i))) {
            throw new PartitionSpecInvalidException(
                    getName(),
                    partCols,
                    new ObjectPath(hiveTable.getDbName(), hiveTable.getTableName()),
                    partitionSpec);
        }
    }
    // TODO: handle GenericCatalogPartition
    StorageDescriptor sd = hiveTable.getSd().deepCopy();
    sd.setLocation(catalogPartition.getProperties().remove(SqlCreateHiveTable.TABLE_LOCATION_URI));
    Map<String, String> properties = new HashMap<>(catalogPartition.getProperties());
    String comment = catalogPartition.getComment();
    if (comment != null) {
        properties.put(HiveCatalogConfig.COMMENT, comment);
    }
    return HiveTableUtil.createHivePartition(
            hiveTable.getDbName(), hiveTable.getTableName(), partValues, sd, properties);
}