use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.
the class ExploreTableManager method generateFileSetCreateStatement.
/**
* Generate a create statement for a ((time-)partitioned) file set.
*
* @param dataset the instantiated dataset
* @param datasetId the dataset id
* @param properties the properties from dataset specification
* @param truncating whether this call to create() is part of a truncate() operation. The effect is:
* if possessExisting is true, then the truncate() has just dropped this dataset,
* which deleted the explore table, so it must be recreated here.
* @return a CREATE TABLE statement, or null if the dataset is not explorable
* @throws IllegalArgumentException if the schema cannot be parsed, or if shouldErrorOnMissingSchema is true and
* the dataset spec does not contain a schema.
*/
@Nullable
private String generateFileSetCreateStatement(DatasetId datasetId, Dataset dataset, Map<String, String> properties, boolean truncating) throws IllegalArgumentException, ExploreException {
String tableName = tableNaming.getTableName(datasetId, properties);
String databaseName = ExploreProperties.getExploreDatabaseName(properties);
Map<String, String> tableProperties = FileSetProperties.getTableProperties(properties);
// if this dataset reuses an existing table, do not attempt to create it
if (FileSetProperties.isUseExisting(tableProperties) || (FileSetProperties.isPossessExisting(tableProperties) && !truncating)) {
try {
exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
// table exists: do not attempt to create
return null;
} catch (TableNotFoundException e) {
throw new ExploreException(String.format("Dataset '%s' is configured to use an existing explore table, but table '%s' does not exist in database '%s'.", datasetId.getDataset(), tableName, databaseName));
}
}
Location baseLocation;
Partitioning partitioning = null;
if (dataset instanceof PartitionedFileSet) {
partitioning = ((PartitionedFileSet) dataset).getPartitioning();
baseLocation = ((PartitionedFileSet) dataset).getEmbeddedFileSet().getBaseLocation();
} else {
baseLocation = ((FileSet) dataset).getBaseLocation();
}
CreateStatementBuilder createStatementBuilder =
  new CreateStatementBuilder(datasetId.getDataset(), databaseName, tableName, shouldEscapeColumns)
    .setLocation(baseLocation)
    .setPartitioning(partitioning)
    .setTableProperties(tableProperties);
String schema = FileSetProperties.getExploreSchema(properties);
String format = FileSetProperties.getExploreFormat(properties);
if (format != null) {
if ("parquet".equals(format)) {
return createStatementBuilder.setSchema(FileSetProperties.getExploreSchema(properties)).buildWithFileFormat("parquet");
}
// for text and csv, we know what to do
Preconditions.checkArgument("text".equals(format) || "csv".equals(format), "Only text and csv are supported as native formats");
Preconditions.checkNotNull(schema, "for native formats, explore schema must be given in dataset properties");
String delimiter = null;
if ("text".equals(format)) {
delimiter = FileSetProperties.getExploreFormatProperties(properties).get("delimiter");
} else if ("csv".equals(format)) {
delimiter = ",";
}
return createStatementBuilder.setSchema(schema).setRowFormatDelimited(delimiter, null).buildWithFileFormat("TEXTFILE");
} else {
// no explicit schema is required here; for example, Avro tables can be created by setting the avro.schema.literal table property
if (schema != null) {
createStatementBuilder.setSchema(schema);
}
// format not given, look for serde, input format, etc.
String serde = FileSetProperties.getSerDe(properties);
String inputFormat = FileSetProperties.getExploreInputFormat(properties);
String outputFormat = FileSetProperties.getExploreOutputFormat(properties);
Preconditions.checkArgument(serde != null && inputFormat != null && outputFormat != null, "All of SerDe, InputFormat and OutputFormat must be given in dataset properties");
return createStatementBuilder.setRowFormatSerde(serde).buildWithFormats(inputFormat, outputFormat);
}
}
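For context, the properties this method inspects (explore format, schema, SerDe, input/output formats) are normally set when the file set itself is created. Below is a minimal sketch, assuming the FileSetProperties builder API inside an application's configure() method; the dataset name, base path, and schema string are illustrative only.
// Sketch only: dataset creation that would lead generateFileSetCreateStatement to emit
// a delimited-text (TEXTFILE) CREATE TABLE statement. Path and schema are examples.
createDataset("trades", FileSet.class, FileSetProperties.builder()
  .setBasePath("example/trades")
  .setEnableExploreOnCreate(true)
  .setExploreFormat("csv")
  .setExploreSchema("ticker string, num int, price double")
  .build());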
use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.
the class PartitioningTest method testFieldOrder.
@Test
public void testFieldOrder() {
Partitioning partitioning = Partitioning.builder().addIntField("1").addLongField("2").addStringField("3").build();
Iterator<Map.Entry<String, FieldType>> iterator = partitioning.getFields().entrySet().iterator();
Assert.assertEquals("1", iterator.next().getKey());
Assert.assertEquals("2", iterator.next().getKey());
Assert.assertEquals("3", iterator.next().getKey());
Assert.assertFalse(iterator.hasNext());
// the previous order may have been preserved by chance. Now try the reverse order
partitioning = Partitioning.builder().addIntField("3").addLongField("2").addStringField("1").build();
iterator = partitioning.getFields().entrySet().iterator();
Assert.assertEquals("3", iterator.next().getKey());
Assert.assertEquals("2", iterator.next().getKey());
Assert.assertEquals("1", iterator.next().getKey());
Assert.assertFalse(iterator.hasNext());
}
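The test above verifies that Partitioning preserves the insertion order of its fields. That order matters downstream, for example when partition keys are mapped to directory paths (see the commitJob section below). A minimal sketch of building a matching PartitionKey, reusing the test's field names with illustrative values:
// Sketch: a key whose fields correspond to the int/long/string partitioning above.
PartitionKey key = PartitionKey.builder()
  .addIntField("1", 2017)
  .addLongField("2", 42L)
  .addStringField("3", "blue")
  .build();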
use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.
the class ExploreExecutorHttpHandler method doDropPartition.
private void doDropPartition(HttpRequest request, HttpResponder responder, DatasetId datasetId) {
Dataset dataset;
try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
dataset = datasetInstantiator.getDataset(datasetId);
if (dataset == null) {
responder.sendString(HttpResponseStatus.NOT_FOUND, "Cannot load dataset " + datasetId);
return;
}
} catch (IOException e) {
String classNotFoundMessage = isClassNotFoundException(e);
if (classNotFoundMessage != null) {
JsonObject json = new JsonObject();
json.addProperty("handle", QueryHandle.NO_OP.getHandle());
responder.sendJson(HttpResponseStatus.OK, json);
return;
}
LOG.error("Exception instantiating dataset {}.", datasetId, e);
responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, "Exception instantiating dataset " + datasetId);
return;
}
try {
if (!(dataset instanceof PartitionedFileSet)) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
return;
}
Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
Reader reader = new InputStreamReader(new ChannelBufferInputStream(request.getContent()));
Map<String, String> properties = GSON.fromJson(reader, new TypeToken<Map<String, String>>() {
}.getType());
PartitionKey partitionKey;
try {
partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
} catch (Exception e) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
return;
}
if (partitionKey == null) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
return;
}
QueryHandle handle = exploreTableManager.dropPartition(datasetId, properties, partitionKey);
JsonObject json = new JsonObject();
json.addProperty("handle", handle.getHandle());
responder.sendJson(HttpResponseStatus.OK, json);
} catch (Throwable e) {
LOG.error("Got exception:", e);
responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
}
}
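The partition key is decoded from the JSON request body via PartitionedFileSetArguments.getOutputPartitionKey(). A minimal sketch of how a caller could produce a compatible map with the corresponding setter; the field names and values are hypothetical:
// Sketch: encode a partition key into the argument map that the handler above decodes.
Map<String, String> properties = new HashMap<>();
PartitionKey key = PartitionKey.builder()
  .addStringField("league", "nfl")
  .addIntField("season", 1980)
  .build();
PartitionedFileSetArguments.setOutputPartitionKey(properties, key);
// serialize "properties" as JSON and send it as the request body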
use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.
the class DynamicPartitioningOutputCommitter method commitJob.
@Override
public void commitJob(JobContext context) throws IOException {
Configuration configuration = context.getConfiguration();
MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);
String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
outputDataset = taskContext.getDataset(outputDatasetName);
Partitioning partitioning = outputDataset.getPartitioning();
Set<PartitionKey> partitionsToAdd = new HashSet<>();
relativePaths = new HashSet<>();
// Go over all files in the temporary directory and keep track of partitions to add for them
FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
for (FileStatus committedTaskPath : allCommittedTaskPaths) {
FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
while (fileIter.hasNext()) {
Path path = fileIter.next().getPath();
String relativePath = getRelative(committedTaskPath.getPath(), path);
int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
if (lastPathSepIdx == -1) {
// this shouldn't happen because each relative path should consist of at least one partition key and
// the output file name
LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path, relativePath);
continue;
}
// relativePath = "../key1/key2/part-m-00000"
// relativeDir = "../key1/key2"
// fileName = "part-m-00000"
String relativeDir = relativePath.substring(0, lastPathSepIdx);
String fileName = relativePath.substring(lastPathSepIdx + 1);
Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
if (fs.exists(finalDir)) {
throw new FileAlreadyExistsException("Final output path " + finalDir + " already exists");
}
PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
partitionsToAdd.add(partitionKey);
relativePaths.add(relativeDir);
}
}
// We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
// the original outputDir.
Path finalOutput = FileOutputFormat.getOutputPath(context);
FileSystem fs = finalOutput.getFileSystem(configuration);
for (FileStatus stat : getAllCommittedTaskPaths(context)) {
mergePaths(fs, stat, finalOutput);
}
// compute the metadata to be written to every output partition
Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(), PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);
// create all the necessary partitions
for (PartitionKey partitionKey : partitionsToAdd) {
PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
partitionOutput.setMetadata(metadata);
partitionOutput.addPartition();
}
// close the TaskContext, which flushes dataset operations
try {
taskContext.flushOperations();
} catch (Exception e) {
Throwables.propagateIfPossible(e, IOException.class);
throw new IOException(e);
}
// delete the job-specific _temporary folder and create a _done file in the output folder
cleanupJob(context);
// mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
for (String relativePath : relativePaths) {
Path pathToMark = new Path(finalOutput, relativePath);
Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
fs.createNewFile(markerPath);
}
}
}
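The getPartitionKey() helper used above is not shown in this snippet. Below is a minimal sketch of what such a mapping could look like, assuming (as the committer's path handling suggests) that each partition field contributes one path segment to the relative directory, in the field order of the Partitioning; this is an illustration, not the actual implementation.
// Sketch only: derive a PartitionKey from a relative directory such as "2017/7",
// assuming one path segment per partition field, in field order.
private static PartitionKey parseKey(Partitioning partitioning, String relativeDir) {
  String[] segments = relativeDir.split(Path.SEPARATOR);
  PartitionKey.Builder builder = PartitionKey.builder();
  int i = 0;
  for (Map.Entry<String, Partitioning.FieldType> field : partitioning.getFields().entrySet()) {
    String segment = segments[i++];
    switch (field.getValue()) {
      case INT:
        builder.addIntField(field.getKey(), Integer.parseInt(segment));
        break;
      case LONG:
        builder.addLongField(field.getKey(), Long.parseLong(segment));
        break;
      default:
        builder.addStringField(field.getKey(), segment);
        break;
    }
  }
  return builder.build();
}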
use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.
the class PartitionedFileSetDefinition method getDataset.
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec, Map<String, String> arguments, ClassLoader classLoader) throws IOException {
// properties must contain the partitioning
Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
// make any necessary updates to the arguments
arguments = updateArgumentsIfNeeded(arguments, partitioning);
FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), arguments, classLoader);
IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), arguments, classLoader);
return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table, spec, arguments, getExploreProvider());
}
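For context, the partitioning that getDataset() reads back from the spec is placed there when the dataset is created. A minimal sketch, assuming the PartitionedFileSetProperties builder inside an application's configure() method; the dataset name, field names, and Hadoop text formats are illustrative:
// Sketch: creating a partitioned file set whose spec carries the partitioning read back above.
createDataset("results", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder()
    .addStringField("league")
    .addIntField("season")
    .build())
  .setInputFormat(TextInputFormat.class)
  .setOutputFormat(TextOutputFormat.class)
  .build());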