use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.
the class NaiveBayesTrainer method run.
/**
 * Trains a naive Bayes model from the incoming records, saves it under the configured FileSet,
 * then classifies the texts read from the {@code TEXTS_TO_CLASSIFY} dataset and writes the
 * resulting (text, prediction) pairs, encoded as bytes, to the {@code CLASSIFIED_TEXTS} dataset.
 *
 * @param sparkContext plugin context used to obtain the Spark context and to read and write datasets
 * @param input records to train on; the text is read from {@code config.fieldToClassify} and the
 *              label from {@code config.predictionField}
 * @throws Exception if any step of training, saving or dataset access fails
 */
@Override
public void run(SparkExecutionPluginContext sparkContext, JavaRDD<StructuredRecord> input) throws Exception {
  final HashingTF hashingTF = new HashingTF(100);

  // Turn every record into a labeled feature vector built from its whitespace-split text.
  JavaRDD<LabeledPoint> labeledPoints = input.map(record -> {
    // should never happen, here to test app correctness in unit tests
    if (inputSchema != null && !inputSchema.equals(record.getSchema())) {
      throw new IllegalStateException("runtime schema does not match what was set at configure time.");
    }
    String text = record.get(config.fieldToClassify);
    double label = (Double) record.get(config.predictionField);
    return new LabeledPoint(label, hashingTF.transform(Lists.newArrayList(text.split(" "))));
  }).cache();

  final NaiveBayesModel trainedModel = NaiveBayes.train(labeledPoints.rdd(), 1.0);

  // Persist the model to a file in the output FileSet.
  JavaSparkContext javaSparkContext = sparkContext.getSparkContext();
  FileSet modelFileSet = sparkContext.getDataset(config.fileSetName);
  SparkContext scalaContext = JavaSparkContext.toSparkContext(javaSparkContext);
  String modelPath = modelFileSet.getBaseLocation().append(config.path).toURI().getPath();
  trainedModel.save(scalaContext, modelPath);

  // Load the texts to classify and hash them with the same transformer used for training.
  JavaPairRDD<LongWritable, Text> rawTexts = sparkContext.<LongWritable, Text>fromDataset(TEXTS_TO_CLASSIFY);
  JavaRDD<String> textsToClassify = rawTexts.values().map(Text::toString);
  JavaRDD<Vector> featureVectors =
    textsToClassify.map(text -> hashingTF.transform(Lists.newArrayList(text.split(" "))));

  JavaRDD<Double> predictions = trainedModel.predict(featureVectors);
  LOG.info("Predictions: {}", predictions.collect());

  // Key each prediction with the message it belongs to.
  JavaPairRDD<String, Double> predictionsByText = textsToClassify.zip(predictions);

  // Encode both key and value as byte[] so they can be written to the dataset.
  JavaPairRDD<byte[], byte[]> encoded = predictionsByText.mapToPair(
    pair -> new Tuple2<byte[], byte[]>(Bytes.toBytes(pair._1()), Bytes.toBytes(pair._2())));

  sparkContext.saveAsDataset(encoded, CLASSIFIED_TEXTS);
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.
the class FileSetTest method testAbsolutePath.
/**
 * Creates a file set whose base path is an absolute path, checks that the base directory is
 * created, and checks that writing to the output location produces a file under that base path.
 */
@Test
public void testAbsolutePath() throws IOException, DatasetManagementException, UnauthorizedException {
  String basePath = tmpFolder.newFolder() + "/absolute/path";
  String expectedOutputPath = basePath + "/out";

  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance3,
                                 FileSetProperties.builder().setBasePath(basePath).build());

  // creating the instance must have created the base directory
  File baseDir = new File(basePath);
  Assert.assertTrue(baseDir.isDirectory());

  // obtain the file set with "out" configured as its output path
  Map<String, String> runtimeArgs = Maps.newHashMap();
  FileSetArguments.setOutputPath(runtimeArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance3, runtimeArgs);

  // the output location must resolve below the absolute base path
  Assert.assertEquals(expectedOutputPath, fileSet.getOutputLocation().toURI().getPath());

  // write a single byte to the output location
  try (OutputStream stream = fileSet.getOutputLocation().getOutputStream()) {
    stream.write(42);
  }

  // the write must have produced a regular file at the expected path
  File outputFile = new File(expectedOutputPath);
  Assert.assertTrue(outputFile.isFile());
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.
the class FileSetTest method testInputOutputFormatClassAtRuntime.
/**
 * Checks that the input and output format class names reported by a file set come from the
 * dataset properties by default, and that either or both can be overridden through the
 * arguments passed when the dataset is instantiated.
 */
@Test
public void testInputOutputFormatClassAtRuntime() throws Exception {
  String textInput = TextInputFormat.class.getName();
  String textOutput = TextOutputFormat.class.getName();
  String combineTextInput = CombineTextInputFormat.class.getName();
  String nullOutput = NullOutputFormat.class.getName();

  // create a dataset whose properties declare text input and output formats
  DatasetId datasetId = OTHER_NAMESPACE.dataset("testRuntimeFormats");
  dsFrameworkUtil.createInstance("fileSet", datasetId,
                                 FileSetProperties.builder()
                                   .setInputFormat(TextInputFormat.class)
                                   .setOutputFormat(TextOutputFormat.class)
                                   .build());

  // no runtime arguments: both formats are taken from the dataset properties
  FileSet fileSet = dsFrameworkUtil.getInstance(datasetId);
  Assert.assertEquals(textInput, fileSet.getInputFormatClassName());
  Assert.assertEquals(textOutput, fileSet.getOutputFormatClassName());

  // runtime arguments override only the input format; the output format is unchanged
  Map<String, String> inputOnly = ImmutableMap.of(FileSetProperties.INPUT_FORMAT, combineTextInput);
  fileSet = dsFrameworkUtil.getInstance(datasetId, inputOnly);
  Assert.assertEquals(combineTextInput, fileSet.getInputFormatClassName());
  Assert.assertEquals(textOutput, fileSet.getOutputFormatClassName());

  // runtime arguments override both the input and the output format
  Map<String, String> inputAndOutput = ImmutableMap.of(FileSetProperties.INPUT_FORMAT, combineTextInput,
                                                       FileSetProperties.OUTPUT_FORMAT, nullOutput);
  fileSet = dsFrameworkUtil.getInstance(datasetId, inputAndOutput);
  Assert.assertEquals(combineTextInput, fileSet.getInputFormatClassName());
  Assert.assertEquals(nullOutput, fileSet.getOutputFormatClassName());
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.
the class FileSetTest method testRollback.
/**
 * Checks that an empty output directory created for a file set no longer exists after
 * {@code onFailure()} has been invoked on the dataset.
 */
@Test
public void testRollback() throws IOException, TransactionFailureException, DatasetManagementException, UnauthorizedException {
  // test deletion of an empty output directory
  FileSet fileSet = createFileset(testFileSetInstance1);
  Location outputDir = fileSet.getOutputLocation();

  // the output location starts out absent and is then created explicitly
  Assert.assertFalse(outputDir.exists());
  Assert.assertTrue(outputDir.mkdirs());
  Assert.assertTrue(outputDir.exists());

  // signal failure to the dataset; afterwards the output directory must be gone
  FileSetDataset dataset = (FileSetDataset) fileSet;
  dataset.onFailure();
  Assert.assertFalse(outputDir.exists());
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by cdapio.
the class FileSetTest method testExternalAbsolutePath.
/**
 * Checks a file set declared as external over an existing absolute directory: an existing file
 * can be opened through the input location, requesting the output location is rejected with
 * {@link UnsupportedOperationException}, and deleting the dataset leaves the existing file in place.
 */
@Test
public void testExternalAbsolutePath() throws IOException, DatasetManagementException, UnauthorizedException {
  // create an external dir and create a file in it; fail fast if the fixture cannot be set up
  String absolutePath = tmpFolder.newFolder() + "/absolute/path";
  File absoluteFile = new File(absolutePath);
  Assert.assertTrue(absoluteFile.mkdirs());
  File someFile = new File(absoluteFile, "some.file");
  Assert.assertTrue(someFile.createNewFile());
  // create an external dataset
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance5,
                                 FileSetProperties.builder().setBasePath(absolutePath).setDataExternal(true).build());
  // instantiate the file set with an input and output path
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some.file");
  FileSetArguments.setOutputPath(fileArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
  Assert.assertNotNull(fileSet);
  // read the existing file; try-with-resources guarantees the stream is closed
  Location input = fileSet.getInputLocations().iterator().next();
  try (InputStream in = input.getInputStream()) {
    Assert.assertNotNull(in);
  }
  // attempt to write an output file
  try {
    fileSet.getOutputLocation();
    Assert.fail("External file set should not allow writing output.");
  } catch (UnsupportedOperationException e) {
    // expected
  }
  // delete the dataset and validate that the files are still there
  dsFrameworkUtil.deleteInstance(testFileSetInstance5);
  Assert.assertTrue(someFile.exists());
}
Aggregations