Search in sources:

Example 1 with Partitioning

use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.

the class ExploreTableManager method generateFileSetCreateStatement.

/**
   * Generate a create statement for a ((time-)partitioned) file set.
   *
   * @param datasetId the dataset id
   * @param dataset the instantiated dataset
   * @param properties the properties from the dataset specification
   * @param truncating whether this call to create() is part of a truncate() operation. The effect is:
   *                   if possessExisting is true, then the truncate() has just dropped this
   *                   dataset, which deleted the explore table, so it must be recreated.
   *
   * @return a CREATE TABLE statement, or null if the dataset is not explorable
   * @throws IllegalArgumentException if the schema cannot be parsed, or if shouldErrorOnMissingSchema is true and
   *                                  the dataset spec does not contain a schema.
   */
@Nullable
private String generateFileSetCreateStatement(DatasetId datasetId, Dataset dataset, Map<String, String> properties, boolean truncating) throws IllegalArgumentException, ExploreException {
    String tableName = tableNaming.getTableName(datasetId, properties);
    String databaseName = ExploreProperties.getExploreDatabaseName(properties);
    Map<String, String> tableProperties = FileSetProperties.getTableProperties(properties);
    // if this dataset reuses an existing table, do not attempt to create it
    if (FileSetProperties.isUseExisting(tableProperties) || (FileSetProperties.isPossessExisting(tableProperties) && !truncating)) {
        try {
            exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
            // table exists: do not attempt to create
            return null;
        } catch (TableNotFoundException e) {
            throw new ExploreException(String.format("Dataset '%s' is configured to use an existing explore table, but table '%s' does not " + "exist in database '%s'. ", datasetId.getDataset(), tableName, databaseName));
        }
    }
    Location baseLocation;
    Partitioning partitioning = null;
    if (dataset instanceof PartitionedFileSet) {
        partitioning = ((PartitionedFileSet) dataset).getPartitioning();
        baseLocation = ((PartitionedFileSet) dataset).getEmbeddedFileSet().getBaseLocation();
    } else {
        baseLocation = ((FileSet) dataset).getBaseLocation();
    }
    CreateStatementBuilder createStatementBuilder =
        new CreateStatementBuilder(datasetId.getDataset(), databaseName, tableName, shouldEscapeColumns)
            .setLocation(baseLocation)
            .setPartitioning(partitioning)
            .setTableProperties(tableProperties);
    String schema = FileSetProperties.getExploreSchema(properties);
    String format = FileSetProperties.getExploreFormat(properties);
    if (format != null) {
        if ("parquet".equals(format)) {
            return createStatementBuilder.setSchema(schema).buildWithFileFormat("parquet");
        }
        // for text and csv, we know what to do
        Preconditions.checkArgument("text".equals(format) || "csv".equals(format), "Only text and csv are supported as native formats");
        Preconditions.checkNotNull(schema, "for native formats, explore schema must be given in dataset properties");
        String delimiter = null;
        if ("text".equals(format)) {
            delimiter = FileSetProperties.getExploreFormatProperties(properties).get("delimiter");
        } else if ("csv".equals(format)) {
            delimiter = ",";
        }
        return createStatementBuilder.setSchema(schema).setRowFormatDelimited(delimiter, null).buildWithFileFormat("TEXTFILE");
    } else {
        // without a native format, the schema is optional here: Avro tables, for example,
        // can be created by setting the avro.schema.literal table property
        if (schema != null) {
            createStatementBuilder.setSchema(schema);
        }
        // format not given, look for serde, input format, etc.
        String serde = FileSetProperties.getSerDe(properties);
        String inputFormat = FileSetProperties.getExploreInputFormat(properties);
        String outputFormat = FileSetProperties.getExploreOutputFormat(properties);
        Preconditions.checkArgument(serde != null && inputFormat != null && outputFormat != null, "All of SerDe, InputFormat and OutputFormat must be given in dataset properties");
        return createStatementBuilder.setRowFormatSerde(serde).buildWithFormats(inputFormat, outputFormat);
    }
}
Also used: Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) CreateStatementBuilder(co.cask.cdap.explore.table.CreateStatementBuilder) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) Location(org.apache.twill.filesystem.Location) Nullable(javax.annotation.Nullable)
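
For context, a file set only reaches the csv/text branch above if its properties were set up accordingly when the dataset was created. Below is a minimal sketch of such properties using the FileSetProperties builder; the base path, format, and schema values are illustrative, not taken from the example above.

import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.FileSetProperties;

// Properties that generateFileSetCreateStatement() would turn into a
// CREATE TABLE ... ROW FORMAT DELIMITED ... STORED AS TEXTFILE statement.
static DatasetProperties csvFileSetProperties() {
    return FileSetProperties.builder()
        // where the files live (illustrative)
        .setBasePath("myFiles")
        // register an explore table when the dataset is created
        .setEnableExploreOnCreate(true)
        // one of the natively supported formats: "text", "csv", or "parquet"
        .setExploreFormat("csv")
        // native formats require an explicit explore schema
        .setExploreSchema("ticker string, price double")
        .build();
}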

Example 2 with Partitioning

use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.

the class PartitioningTest method testFieldOrder.

@Test
public void testFieldOrder() {
    Partitioning partitioning = Partitioning.builder().addIntField("1").addLongField("2").addStringField("3").build();
    Iterator<Map.Entry<String, FieldType>> iterator = partitioning.getFields().entrySet().iterator();
    Assert.assertEquals("1", iterator.next().getKey());
    Assert.assertEquals("2", iterator.next().getKey());
    Assert.assertEquals("3", iterator.next().getKey());
    Assert.assertFalse(iterator.hasNext());
    // the previous order may have been preserved by chance. Now try the reverse order
    partitioning = Partitioning.builder().addIntField("3").addLongField("2").addStringField("1").build();
    iterator = partitioning.getFields().entrySet().iterator();
    Assert.assertEquals("3", iterator.next().getKey());
    Assert.assertEquals("2", iterator.next().getKey());
    Assert.assertEquals("1", iterator.next().getKey());
    Assert.assertFalse(iterator.hasNext());
}
Also used: Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) Test(org.junit.Test)
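
The insertion-order guarantee this test checks is what keeps partition columns stable in the backing table. As a quick illustration of the same API from the consumer side, here is a sketch that builds a Partitioning and a PartitionKey matching it field for field (the field names are made up for the example):

import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.Partitioning;

// Fields come back from getFields() in the order they were added.
Partitioning partitioning = Partitioning.builder()
    .addStringField("league")
    .addIntField("season")
    .build();

// A key for this partitioning supplies a correctly typed value per field.
PartitionKey key = PartitionKey.builder()
    .addStringField("league", "nfl")
    .addIntField("season", 2014)
    .build();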

Example 3 with Partitioning

use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.

the class ExploreExecutorHttpHandler method doDropPartition.

private void doDropPartition(HttpRequest request, HttpResponder responder, DatasetId datasetId) {
    Dataset dataset;
    try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
        dataset = datasetInstantiator.getDataset(datasetId);
        if (dataset == null) {
            responder.sendString(HttpResponseStatus.NOT_FOUND, "Cannot load dataset " + datasetId);
            return;
        }
    } catch (IOException e) {
        String classNotFoundMessage = isClassNotFoundException(e);
        if (classNotFoundMessage != null) {
            JsonObject json = new JsonObject();
            json.addProperty("handle", QueryHandle.NO_OP.getHandle());
            responder.sendJson(HttpResponseStatus.OK, json);
            return;
        }
        LOG.error("Exception instantiating dataset {}.", datasetId, e);
        responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, "Exception instantiating dataset " + datasetId);
        return;
    }
    try {
        if (!(dataset instanceof PartitionedFileSet)) {
            responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
            return;
        }
        Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
        Reader reader = new InputStreamReader(new ChannelBufferInputStream(request.getContent()));
        Map<String, String> properties = GSON.fromJson(reader, new TypeToken<Map<String, String>>() {
        }.getType());
        PartitionKey partitionKey;
        try {
            partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
        } catch (Exception e) {
            responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
            return;
        }
        if (partitionKey == null) {
            responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
            return;
        }
        QueryHandle handle = exploreTableManager.dropPartition(datasetId, properties, partitionKey);
        JsonObject json = new JsonObject();
        json.addProperty("handle", handle.getHandle());
        responder.sendJson(HttpResponseStatus.OK, json);
    } catch (Throwable e) {
        LOG.error("Got exception:", e);
        responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
    }
}
Also used: InputStreamReader(java.io.InputStreamReader) Dataset(co.cask.cdap.api.dataset.Dataset) JsonObject(com.google.gson.JsonObject) Reader(java.io.Reader) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) IOException(java.io.IOException) BadRequestException(co.cask.cdap.common.BadRequestException) ExploreException(co.cask.cdap.explore.service.ExploreException) SQLException(java.sql.SQLException) DatasetManagementException(co.cask.cdap.api.dataset.DatasetManagementException) JsonSyntaxException(com.google.gson.JsonSyntaxException) UnsupportedTypeException(co.cask.cdap.api.data.schema.UnsupportedTypeException) Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) SystemDatasetInstantiator(co.cask.cdap.data.dataset.SystemDatasetInstantiator) TypeToken(com.google.common.reflect.TypeToken) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) ChannelBufferInputStream(org.jboss.netty.buffer.ChannelBufferInputStream) QueryHandle(co.cask.cdap.proto.QueryHandle)
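
The request body that doDropPartition() parses is a plain string-to-string map in the PartitionedFileSetArguments encoding. A sketch of how a client could produce it with the setter that mirrors the getOutputPartitionKey() call above (the field names are illustrative):

import java.util.HashMap;
import java.util.Map;

import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;

// Build the map that, serialized as JSON, forms the drop-partition request body.
static Map<String, String> dropPartitionRequestBody() {
    Map<String, String> properties = new HashMap<>();
    PartitionKey key = PartitionKey.builder()
        .addStringField("league", "nfl")
        .addIntField("season", 2014)
        .build();
    // writes the key under the same argument names that
    // PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning) reads back
    PartitionedFileSetArguments.setOutputPartitionKey(properties, key);
    return properties;
}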

Example 4 with Partitioning

use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.

the class DynamicPartitioningOutputCommitter method commitJob.

@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);
    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();
    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);
            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path, relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);
            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            if (fs.exists(finalDir)) {
                throw new FileAlreadyExistsException("Final output path " + finalDir + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }
    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }
    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(), PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);
    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }
    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }
    // delete the job-specific _temporary folder and create a _done file in the output folder
    cleanupJob(context);
    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}
Also used: BasicMapReduceTaskContext(co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext) Path(org.apache.hadoop.fs.Path) MapReduceClassLoader(co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) FileAlreadyExistsException(org.apache.hadoop.mapred.FileAlreadyExistsException) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) HashSet(java.util.HashSet)
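
The getPartitionKey(partitioning, relativeDir) helper is not shown in this snippet; conceptually it has to map each path segment of a relative directory like "key1/key2" back to a typed partitioning field. Below is a hypothetical sketch of that conversion, written against the public Partitioning and PartitionKey APIs; parsePartitionKey is an invented name, not the committer's actual helper.

import java.util.Map;

import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.dataset.lib.Partitioning.FieldType;

// Hypothetical: convert "valueOfField1/valueOfField2/..." into a PartitionKey,
// relying on getFields() returning the fields in their declared order.
static PartitionKey parsePartitionKey(Partitioning partitioning, String relativeDir) {
    String[] segments = relativeDir.split("/");
    PartitionKey.Builder builder = PartitionKey.builder();
    int i = 0;
    for (Map.Entry<String, FieldType> field : partitioning.getFields().entrySet()) {
        String value = segments[i++];
        switch (field.getValue()) {
            case STRING:
                builder.addStringField(field.getKey(), value);
                break;
            case INT:
                builder.addIntField(field.getKey(), Integer.parseInt(value));
                break;
            case LONG:
                builder.addLongField(field.getKey(), Long.parseLong(value));
                break;
        }
    }
    return builder.build();
}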

Example 5 with Partitioning

use of co.cask.cdap.api.dataset.lib.Partitioning in project cdap by caskdata.

the class PartitionedFileSetDefinition method getDataset.

@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec, Map<String, String> arguments, ClassLoader classLoader) throws IOException {
    // properties must contain the partitioning
    Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
    // make any necessary updates to the arguments
    arguments = updateArgumentsIfNeeded(arguments, partitioning);
    FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), arguments, classLoader);
    IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), arguments, classLoader);
    return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table, spec, arguments, getExploreProvider());
}
Also used: Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) FileSet(co.cask.cdap.api.dataset.lib.FileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) IndexedTable(co.cask.cdap.api.dataset.lib.IndexedTable)
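
The partitioning read back here round-trips through the dataset spec: it must have been written into the properties when the dataset was created. A minimal sketch of that creating side, assuming it runs inside an application's configure() method (the dataset name, base path, and fields are illustrative):

import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;

// Stores the Partitioning in the spec's properties, where getDataset() later
// recovers it with PartitionedFileSetProperties.getPartitioning(spec.getProperties()).
createDataset("results", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder()
        .addStringField("league")
        .addIntField("season")
        .build())
    .setBasePath("results")
    .build());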

Aggregations

Partitioning (co.cask.cdap.api.dataset.lib.Partitioning) 13
Test (org.junit.Test) 5
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) 4
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) 3
IOException (java.io.IOException) 3
UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException) 2
Dataset (co.cask.cdap.api.dataset.Dataset) 2
DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException) 2
DatasetProperties (co.cask.cdap.api.dataset.DatasetProperties) 2
BadRequestException (co.cask.cdap.common.BadRequestException) 2
SystemDatasetInstantiator (co.cask.cdap.data.dataset.SystemDatasetInstantiator) 2
ExploreException (co.cask.cdap.explore.service.ExploreException) 2
QueryHandle (co.cask.cdap.proto.QueryHandle) 2
TypeToken (com.google.common.reflect.TypeToken) 2
JsonObject (com.google.gson.JsonObject) 2
JsonSyntaxException (com.google.gson.JsonSyntaxException) 2
InputStreamReader (java.io.InputStreamReader) 2
Reader (java.io.Reader) 2
SQLException (java.sql.SQLException) 2
ChannelBufferInputStream (org.jboss.netty.buffer.ChannelBufferInputStream) 2