Search in sources :

Example 41 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.

the class NaiveBayesTrainer method run.

/**
 * Trains a naive Bayes model on the incoming records, saves it under the configured FileSet,
 * then classifies the texts read from the TEXTS_TO_CLASSIFY dataset and writes each text with
 * its prediction to the CLASSIFIED_TEXTS dataset.
 *
 * @param sparkContext plugin context used to obtain the Spark context and to read/write datasets
 * @param input records used as training data
 * @throws Exception if any step of training, saving, or classification fails
 */
@Override
public void run(SparkExecutionPluginContext sparkContext, JavaRDD<StructuredRecord> input) throws Exception {
    // hashes whitespace-separated tokens into a feature vector (constructed with 100)
    final HashingTF hashingTF = new HashingTF(100);

    JavaRDD<LabeledPoint> labeledPoints = input.map(record -> {
        // should never happen, here to test app correctness in unit tests
        if (inputSchema != null && !inputSchema.equals(record.getSchema())) {
            throw new IllegalStateException("runtime schema does not match what was set at configure time.");
        }
        String text = record.get(config.fieldToClassify);
        double label = (Double) record.get(config.predictionField);
        return new LabeledPoint(label, hashingTF.transform(Lists.newArrayList(text.split(" "))));
    }).cache();

    final NaiveBayesModel trainedModel = NaiveBayes.train(labeledPoints.rdd(), 1.0);

    // save the model to a file in the output FileSet
    JavaSparkContext jsc = sparkContext.getSparkContext();
    FileSet modelFileSet = sparkContext.getDataset(config.fileSetName);
    trainedModel.save(JavaSparkContext.toSparkContext(jsc),
                      modelFileSet.getBaseLocation().append(config.path).toURI().getPath());

    // featurize the texts to classify the same way the training texts were featurized
    JavaRDD<String> texts = sparkContext.<LongWritable, Text>fromDataset(TEXTS_TO_CLASSIFY)
        .values()
        .map(Text::toString);
    JavaRDD<Vector> features = texts.map(line -> hashingTF.transform(Lists.newArrayList(line.split(" "))));
    JavaRDD<Double> predictions = trainedModel.predict(features);
    LOG.info("Predictions: {}", predictions.collect());

    // key the predictions with the message
    JavaPairRDD<String, Double> predictionByText = texts.zip(predictions);

    // convert to byte[],byte[] to write to data
    JavaPairRDD<byte[], byte[]> encoded = predictionByText.mapToPair(
        pair -> new Tuple2<byte[], byte[]>(Bytes.toBytes(pair._1()), Bytes.toBytes(pair._2())));
    sparkContext.saveAsDataset(encoded, CLASSIFIED_TEXTS);
}
Also used : LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) NaiveBayesModel(org.apache.spark.mllib.classification.NaiveBayesModel) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) HashingTF(org.apache.spark.mllib.feature.HashingTF) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) Vector(org.apache.spark.mllib.linalg.Vector) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) Text(org.apache.hadoop.io.Text) Tuple2(scala.Tuple2)

Example 42 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.

the class FileSetTest method testAbsolutePath.

/**
 * Checks that a file set created with an absolute base path creates that directory, resolves its
 * output location beneath it, and that writing to the output location produces a file there.
 */
@Test
public void testAbsolutePath() throws IOException, DatasetManagementException, UnauthorizedException {
    String basePath = tmpFolder.newFolder() + "/absolute/path";
    String expectedOutputPath = basePath + "/out";

    dsFrameworkUtil.createInstance("fileSet", testFileSetInstance3,
                                   FileSetProperties.builder().setBasePath(basePath).build());
    // creating the instance must have created the base directory
    Assert.assertTrue(new File(basePath).isDirectory());

    // obtain the file set with "out" as its runtime output path
    Map<String, String> runtimeArgs = Maps.newHashMap();
    FileSetArguments.setOutputPath(runtimeArgs, "out");
    FileSet outputFileSet = dsFrameworkUtil.getInstance(testFileSetInstance3, runtimeArgs);

    // the output location must resolve under the absolute base path
    Assert.assertEquals(expectedOutputPath, outputFileSet.getOutputLocation().toURI().getPath());

    // write a single byte to the output location
    try (OutputStream stream = outputFileSet.getOutputLocation().getOutputStream()) {
        stream.write(42);
    }

    // the write must have produced a regular file at the expected path
    Assert.assertTrue(new File(expectedOutputPath).isFile());
}
Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) OutputStream(java.io.OutputStream) File(java.io.File) Test(org.junit.Test)

Example 43 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.

the class FileSetTest method testInputOutputFormatClassAtRuntime.

/**
 * Checks that the input/output format class names reported by a file set come from the dataset
 * properties by default and can be overridden through runtime arguments.
 */
@Test
public void testInputOutputFormatClassAtRuntime() throws Exception {
    String textInput = TextInputFormat.class.getName();
    String textOutput = TextOutputFormat.class.getName();
    String combineTextInput = CombineTextInputFormat.class.getName();
    String nullOutput = NullOutputFormat.class.getName();

    // create a dataset with text input and output formats
    DatasetId runtimeFormatsId = OTHER_NAMESPACE.dataset("testRuntimeFormats");
    dsFrameworkUtil.createInstance("fileSet", runtimeFormatsId,
                                   FileSetProperties.builder()
                                       .setInputFormat(TextInputFormat.class)
                                       .setOutputFormat(TextOutputFormat.class)
                                       .build());

    // no runtime arguments: both format classes come from the dataset properties
    FileSet fromProperties = dsFrameworkUtil.getInstance(runtimeFormatsId);
    Assert.assertEquals(textInput, fromProperties.getInputFormatClassName());
    Assert.assertEquals(textOutput, fromProperties.getOutputFormatClassName());

    // runtime arguments override only the input format
    FileSet inputOverridden = dsFrameworkUtil.getInstance(
        runtimeFormatsId, ImmutableMap.of(FileSetProperties.INPUT_FORMAT, combineTextInput));
    Assert.assertEquals(combineTextInput, inputOverridden.getInputFormatClassName());
    Assert.assertEquals(textOutput, inputOverridden.getOutputFormatClassName());

    // runtime arguments override both the input and the output format
    FileSet bothOverridden = dsFrameworkUtil.getInstance(
        runtimeFormatsId, ImmutableMap.of(FileSetProperties.INPUT_FORMAT, combineTextInput,
                                          FileSetProperties.OUTPUT_FORMAT, nullOutput));
    Assert.assertEquals(combineTextInput, bothOverridden.getInputFormatClassName());
    Assert.assertEquals(nullOutput, bothOverridden.getOutputFormatClassName());
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) NullOutputFormat(org.apache.hadoop.mapreduce.lib.output.NullOutputFormat) DatasetId(io.cdap.cdap.proto.id.DatasetId) Test(org.junit.Test)

Example 44 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.

the class FileSetTest method testRollback.

/**
 * Checks that invoking {@code onFailure()} on the file set dataset removes an empty output
 * directory that was created beforehand.
 */
@Test
public void testRollback() throws IOException, TransactionFailureException, DatasetManagementException, UnauthorizedException {
    // test deletion of an empty output directory
    FileSet rollbackFileSet = createFileset(testFileSetInstance1);
    Location outputDir = rollbackFileSet.getOutputLocation();

    // the output directory does not exist until it is created explicitly
    Assert.assertFalse(outputDir.exists());
    Assert.assertTrue(outputDir.mkdirs());
    Assert.assertTrue(outputDir.exists());

    // signal a failure to the dataset; the directory must be gone afterwards
    FileSetDataset dataset = (FileSetDataset) rollbackFileSet;
    dataset.onFailure();
    Assert.assertFalse(outputDir.exists());
}
Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) FileSetDataset(io.cdap.cdap.data2.dataset2.lib.file.FileSetDataset) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 45 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.

the class FileSetTest method testExternalAbsolutePath.

/**
 * Checks that a file set declared as external over an absolute base path allows reading an
 * existing file, rejects requests for an output location, and leaves the existing file in place
 * when the dataset instance is deleted.
 */
@Test
public void testExternalAbsolutePath() throws IOException, DatasetManagementException, UnauthorizedException {
    // create an external dir and create a file in it; fail fast if the fixture cannot be set up
    String absolutePath = tmpFolder.newFolder() + "/absolute/path";
    File absoluteFile = new File(absolutePath);
    Assert.assertTrue("Failed to create directory " + absolutePath, absoluteFile.mkdirs());
    File someFile = new File(absoluteFile, "some.file");
    Assert.assertTrue("Failed to create file " + someFile, someFile.createNewFile());
    // create an external dataset
    dsFrameworkUtil.createInstance("fileSet", testFileSetInstance5, FileSetProperties.builder().setBasePath(absolutePath).setDataExternal(true).build());
    // instantiate the file set with an input and output path
    Map<String, String> fileArgs = Maps.newHashMap();
    FileSetArguments.setInputPath(fileArgs, "some.file");
    FileSetArguments.setOutputPath(fileArgs, "out");
    FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
    Assert.assertNotNull(fileSet);
    // read the existing file
    Location input = fileSet.getInputLocations().iterator().next();
    try (InputStream in = input.getInputStream()) {
        // successfully opening (and then closing) the stream is the check; nothing is read
    }
    // attempt to write an output file
    try {
        fileSet.getOutputLocation();
        Assert.fail("External file set should not allow writing output.");
    } catch (UnsupportedOperationException expected) {
        // expected: requesting the output location of an external file set is rejected
    }
    // delete the dataset and validate that the files are still there
    dsFrameworkUtil.deleteInstance(testFileSetInstance5);
    Assert.assertTrue(someFile.exists());
}
Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) InputStream(java.io.InputStream) File(java.io.File) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Aggregations

FileSet (io.cdap.cdap.api.dataset.lib.FileSet)90 Location (org.apache.twill.filesystem.Location)56 Test (org.junit.Test)44 PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet)26 HashMap (java.util.HashMap)26 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)24 DatasetId (io.cdap.cdap.proto.id.DatasetId)22 TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet)16 ApplicationManager (io.cdap.cdap.test.ApplicationManager)16 Table (io.cdap.cdap.api.dataset.table.Table)14 WorkflowManager (io.cdap.cdap.test.WorkflowManager)14 ColumnDesc (io.cdap.cdap.proto.ColumnDesc)12 QueryResult (io.cdap.cdap.proto.QueryResult)12 SparkManager (io.cdap.cdap.test.SparkManager)12 File (java.io.File)10 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)8 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)8 IOException (java.io.IOException)8 PrintStream (java.io.PrintStream)8 PrintWriter (java.io.PrintWriter)8