Search in sources :

Example 36 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

the class AkExample method main.

/**
 * Moves the iris data set through three storage hops: a CSV file read from a URL is
 * written as an Ak file to OSS, that Ak file is copied to HDFS, and the first ten rows
 * of the HDFS copy are printed.
 *
 * <p>The file system constructor arguments below are placeholders and must be replaced
 * with a real OSS / HDFS configuration before this example can run.
 *
 * @param args unused
 * @throws Exception if any of the batch jobs fails
 */
public static void main(String[] args) throws Exception {
    String irisCsvUrl = "https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/iris.csv";
    String irisSchema = "sepal_length double, sepal_width double, petal_length double, petal_width double, category string";
    // Name of the Ak file used on both OSS and HDFS.
    String akFileName = "iris";

    // Note: fill in the arguments below with the correct OSS configuration.
    BaseFileSystem<?> ossFileSystem = new OssFileSystem("OssVersion", "OssEndPoint", "OssBucket", "OssId", "OssKey");
    // Note: fill in the arguments below with the correct HDFS configuration.
    BaseFileSystem<?> hadoopFileSystem = new HadoopFileSystem("HadoopVersion", "HdfsFileSystemUri");

    // Step 1: CSV -> OSS
    CsvSourceBatchOp irisCsv = new CsvSourceBatchOp()
        .setFilePath(irisCsvUrl)
        .setSchemaStr(irisSchema);
    AkSinkBatchOp ossSink = new AkSinkBatchOp()
        .setFilePath(new FilePath(akFileName, ossFileSystem))
        .setOverwriteSink(true);
    irisCsv.link(ossSink);
    BatchOperator.execute();

    // Step 2: OSS -> HDFS
    AkSourceBatchOp ossSource = new AkSourceBatchOp()
        .setFilePath(new FilePath(akFileName, ossFileSystem));
    AkSinkBatchOp hdfsSink = new AkSinkBatchOp()
        .setFilePath(new FilePath(akFileName, hadoopFileSystem))
        .setOverwriteSink(true);
    ossSource.link(hdfsSink);
    BatchOperator.execute();

    // Step 3: HDFS -> stdout (first ten rows only)
    AkSourceBatchOp hdfsSource = new AkSourceBatchOp()
        .setFilePath(new FilePath(akFileName, hadoopFileSystem));
    hdfsSource.firstN(10).print();
}
Also used : FilePath(com.alibaba.alink.common.io.filesystem.FilePath) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) HadoopFileSystem(com.alibaba.alink.common.io.filesystem.HadoopFileSystem) OssFileSystem(com.alibaba.alink.common.io.filesystem.OssFileSystem) CsvSourceBatchOp(com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)

Example 37 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

the class TFTableModelClassificationFlatModelMapperTest method test.

/**
 * Runs {@code TFTableModelClassificationFlatModelMapper} over 1000 rows sampled from four
 * base rows and checks the output schema, the output row count, and that the reserved
 * columns {@code l} and {@code label} are passed through unchanged.
 *
 * <p>The model is copied from the classpath resource
 * {@code tf_table_model_binary_class_model.ak} to a temp file and read back through
 * {@code AkSourceBatchOp}.
 *
 * @throws Exception if the plugin download, model loading, or mapping fails
 */
@Category(DLTest.class)
@Test
public void test() throws Exception {
    AlinkGlobalConfiguration.setPrintProcessInfo(true);
    PluginDownloader pluginDownloader = AlinkGlobalConfiguration.getPluginDownloader();
    RegisterKey registerKey = TFPredictorClassLoaderFactory.getRegisterKey();
    pluginDownloader.downloadPlugin(registerKey.getName(), registerKey.getVersion());
    List<Row> baseData = Arrays.asList(Row.of((float) 1.2, 3.4, 10, 3L, "bad"), Row.of((float) 1.2, 3.4, 2, 5L, "good"), Row.of((float) 1.2, 3.4, 6, 8L, "bad"), Row.of((float) 1.2, 3.4, 3, 2L, "good"));
    String dataSchemaStr = "f float, d double, i int, l long, label string";
    // Sampling is unseeded; the assertions below are per-row and hold for any sample.
    Random random = new Random();
    List<Row> data = new ArrayList<>();
    for (int i = 0; i < 1000; i += 1) {
        data.add(baseData.get(random.nextInt(baseData.size())));
    }
    String modelPath = Files.createTempFile("tf_table_model_binary_class_model", ".ak").toString();
    File modelFile = new File(modelPath);
    // Do not leave the temp model behind after the test JVM exits.
    modelFile.deleteOnExit();
    // try-with-resources guarantees the stream is closed even when the copy fails.
    try (InputStream resourceAsStream = getClass().getClassLoader().getResourceAsStream("tf_table_model_binary_class_model.ak")) {
        // A JUnit assertion is used instead of the `assert` keyword so the check also runs without -ea.
        Assert.assertNotNull("classpath resource tf_table_model_binary_class_model.ak not found", resourceAsStream);
        FileUtils.copyInputStreamToFile(resourceAsStream, modelFile);
    }
    BatchOperator<?> modelOp = new AkSourceBatchOp().setFilePath(modelPath);
    List<Row> modelRows = modelOp.collect();
    Params params = new Params();
    params.set(HasPredictionCol.PREDICTION_COL, "pred");
    params.set(HasPredictionDetailCol.PREDICTION_DETAIL_COL, "pred_detail");
    params.set(HasReservedColsDefaultAsNull.RESERVED_COLS, new String[] { "l", "label" });
    TFTableModelClassificationFlatModelMapper mapper = new TFTableModelClassificationFlatModelMapper(modelOp.getSchema(), CsvUtil.schemaStr2Schema(dataSchemaStr), params);
    mapper.loadModel(modelRows);
    List<Row> list = new ArrayList<>();
    ListCollector<Row> collector = new ListCollector<>(list);
    mapper.open();
    try {
        for (Row row : data) {
            mapper.flatMap(row, collector);
        }
    } finally {
        // Close the mapper even if flatMap throws, so an opened mapper is never leaked.
        mapper.close();
    }
    Assert.assertEquals(TableSchema.builder().field("l", Types.LONG).field("label", Types.STRING).field("pred", Types.STRING).field("pred_detail", Types.STRING).build(), mapper.getOutputSchema());
    Assert.assertEquals(data.size(), list.size());
    for (int i = 0; i < data.size(); i += 1) {
        Assert.assertEquals(4, list.get(i).getArity());
        // Input columns 3 ("l") and 4 ("label") are the reserved columns, emitted first.
        Assert.assertEquals(data.get(i).getField(3), list.get(i).getField(0));
        Assert.assertEquals(data.get(i).getField(4), list.get(i).getField(1));
    }
}
Also used : InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) Params(org.apache.flink.ml.api.misc.param.Params) PluginDownloader(com.alibaba.alink.common.io.plugin.PluginDownloader) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) ListCollector(org.apache.flink.api.common.functions.util.ListCollector) Random(java.util.Random) Row(org.apache.flink.types.Row) RegisterKey(com.alibaba.alink.common.io.plugin.RegisterKey) File(java.io.File) Category(org.junit.experimental.categories.Category) Test(org.junit.Test) DLTest(com.alibaba.alink.testutil.categories.DLTest)

Example 38 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

the class TFTableModelClassificationModelMapperTest method test.

/**
 * Runs {@code TFTableModelClassificationModelMapper} over 1000 rows sampled from four base
 * rows and checks the output schema and that the reserved columns {@code l} and
 * {@code label} are passed through unchanged on every mapped row.
 *
 * <p>The model is copied from the classpath resource
 * {@code tf_table_model_binary_class_model.ak} to a temp file and read back through
 * {@code AkSourceBatchOp}.
 *
 * @throws Exception if the plugin download, model loading, or mapping fails
 */
@Category(DLTest.class)
@Test
public void test() throws Exception {
    AlinkGlobalConfiguration.setPrintProcessInfo(true);
    PluginDownloader pluginDownloader = AlinkGlobalConfiguration.getPluginDownloader();
    RegisterKey registerKey = TFPredictorClassLoaderFactory.getRegisterKey();
    pluginDownloader.downloadPlugin(registerKey.getName(), registerKey.getVersion());
    List<Row> baseData = Arrays.asList(Row.of((float) 1.2, 3.4, 10, 3L, "bad"), Row.of((float) 1.2, 3.4, 2, 5L, "good"), Row.of((float) 1.2, 3.4, 6, 8L, "bad"), Row.of((float) 1.2, 3.4, 3, 2L, "good"));
    String dataSchemaStr = "f float, d double, i int, l long, label string";
    // Sampling is unseeded; the assertions below are per-row and hold for any sample.
    Random random = new Random();
    List<Row> data = new ArrayList<>();
    for (int i = 0; i < 1000; i += 1) {
        data.add(baseData.get(random.nextInt(baseData.size())));
    }
    String modelPath = Files.createTempFile("tf_table_model_binary_class_model", ".ak").toString();
    File modelFile = new File(modelPath);
    // Do not leave the temp model behind after the test JVM exits.
    modelFile.deleteOnExit();
    // try-with-resources guarantees the stream is closed even when the copy fails.
    try (InputStream resourceAsStream = getClass().getClassLoader().getResourceAsStream("tf_table_model_binary_class_model.ak")) {
        // A JUnit assertion is used instead of the `assert` keyword so the check also runs without -ea.
        Assert.assertNotNull("classpath resource tf_table_model_binary_class_model.ak not found", resourceAsStream);
        FileUtils.copyInputStreamToFile(resourceAsStream, modelFile);
    }
    BatchOperator<?> modelOp = new AkSourceBatchOp().setFilePath(modelPath);
    List<Row> modelRows = modelOp.collect();
    Params params = new Params();
    params.set(HasPredictionCol.PREDICTION_COL, "pred");
    params.set(HasPredictionDetailCol.PREDICTION_DETAIL_COL, "pred_detail");
    params.set(HasReservedColsDefaultAsNull.RESERVED_COLS, new String[] { "l", "label" });
    TFTableModelClassificationModelMapper mapper = new TFTableModelClassificationModelMapper(modelOp.getSchema(), CsvUtil.schemaStr2Schema(dataSchemaStr), params);
    mapper.loadModel(modelRows);
    mapper.open();
    try {
        Assert.assertEquals(TableSchema.builder().field("l", Types.LONG).field("label", Types.STRING).field("pred", Types.STRING).field("pred_detail", Types.STRING).build(), mapper.getOutputSchema());
        for (Row row : data) {
            Row output = mapper.map(row);
            Assert.assertEquals(4, output.getArity());
            // Input columns 3 ("l") and 4 ("label") are the reserved columns, emitted first.
            Assert.assertEquals(row.getField(3), output.getField(0));
            Assert.assertEquals(row.getField(4), output.getField(1));
        }
    } finally {
        // Close the mapper even if an assertion or map() fails, so an opened mapper is never leaked.
        mapper.close();
    }
}
Also used : InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) Params(org.apache.flink.ml.api.misc.param.Params) PluginDownloader(com.alibaba.alink.common.io.plugin.PluginDownloader) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) Random(java.util.Random) Row(org.apache.flink.types.Row) RegisterKey(com.alibaba.alink.common.io.plugin.RegisterKey) File(java.io.File) Category(org.junit.experimental.categories.Category) Test(org.junit.Test) DLTest(com.alibaba.alink.testutil.categories.DLTest)

Example 39 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

the class TFTableModelRegressionModelMapperTest method test.

/**
 * Runs {@code TFTableModelRegressionModelMapper} over 1000 rows sampled from four base rows
 * and checks the output schema and that the reserved columns {@code s} and {@code label}
 * are passed through unchanged on every mapped row.
 *
 * <p>The model is copied from the classpath resource
 * {@code tf_table_model_regression_model.ak} to a temp file and read back through
 * {@code AkSourceBatchOp}.
 *
 * @throws Exception if the plugin download, model loading, or mapping fails
 */
@Category(DLTest.class)
@Test
public void test() throws Exception {
    AlinkGlobalConfiguration.setPrintProcessInfo(true);
    PluginDownloader pluginDownloader = AlinkGlobalConfiguration.getPluginDownloader();
    RegisterKey registerKey = TFPredictorClassLoaderFactory.getRegisterKey();
    pluginDownloader.downloadPlugin(registerKey.getName(), registerKey.getVersion());
    List<Row> baseData = Arrays.asList(Row.of(1.2, 3.4, 10L, 3L, "yes", 0.), Row.of(1.2, 3.4, 2L, 5L, "no", 0.2), Row.of(1.2, 3.4, 6L, 8L, "no", 0.4), Row.of(1.2, 3.4, 3L, 2L, "yes", 1.0));
    String dataSchemaStr = "f double, d double, i long, l long, s string, label double";
    // Sampling is unseeded; the assertions below are per-row and hold for any sample.
    Random random = new Random();
    List<Row> data = new ArrayList<>();
    for (int i = 0; i < 1000; i += 1) {
        data.add(baseData.get(random.nextInt(baseData.size())));
    }
    String modelPath = Files.createTempFile("tf_table_model_regression_model", ".ak").toString();
    File modelFile = new File(modelPath);
    // Do not leave the temp model behind after the test JVM exits.
    modelFile.deleteOnExit();
    // try-with-resources guarantees the stream is closed even when the copy fails.
    try (InputStream resourceAsStream = getClass().getClassLoader().getResourceAsStream("tf_table_model_regression_model.ak")) {
        // A JUnit assertion is used instead of the `assert` keyword so the check also runs without -ea.
        Assert.assertNotNull("classpath resource tf_table_model_regression_model.ak not found", resourceAsStream);
        FileUtils.copyInputStreamToFile(resourceAsStream, modelFile);
    }
    BatchOperator<?> modelOp = new AkSourceBatchOp().setFilePath(modelPath);
    List<Row> modelRows = modelOp.collect();
    Params params = new Params();
    params.set(HasPredictionCol.PREDICTION_COL, "pred");
    params.set(HasReservedColsDefaultAsNull.RESERVED_COLS, new String[] { "s", "label" });
    TFTableModelRegressionModelMapper mapper = new TFTableModelRegressionModelMapper(modelOp.getSchema(), CsvUtil.schemaStr2Schema(dataSchemaStr), params);
    mapper.loadModel(modelRows);
    mapper.open();
    try {
        Assert.assertEquals(TableSchema.builder().field("s", Types.STRING).field("label", Types.DOUBLE).field("pred", Types.DOUBLE).build(), mapper.getOutputSchema());
        for (Row row : data) {
            Row output = mapper.map(row);
            Assert.assertEquals(3, output.getArity());
            // Input columns 4 ("s") and 5 ("label") are the reserved columns, emitted first.
            Assert.assertEquals(row.getField(4), output.getField(0));
            Assert.assertEquals(row.getField(5), output.getField(1));
        }
    } finally {
        // Close the mapper even if an assertion or map() fails, so an opened mapper is never leaked.
        mapper.close();
    }
}
Also used : InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) Params(org.apache.flink.ml.api.misc.param.Params) PluginDownloader(com.alibaba.alink.common.io.plugin.PluginDownloader) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) Random(java.util.Random) Row(org.apache.flink.types.Row) RegisterKey(com.alibaba.alink.common.io.plugin.RegisterKey) File(java.io.File) Category(org.junit.experimental.categories.Category) Test(org.junit.Test) DLTest(com.alibaba.alink.testutil.categories.DLTest)

Example 40 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

the class Chap21 method c_7.

/**
 * LDA walkthrough on the text column of the source data.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Select the label and text columns, then link a {@code SegmentBatchOp} and a
 *       {@code StopWordsRemoverBatchOp} on the text column.</li>
 *   <li>If the LDA model file does not exist yet, train an LDA model and write the model
 *       and side output 0 to Ak files.</li>
 *   <li>Predict with the saved model and link the result into an
 *       {@code EvalClusterBatchOp}.</li>
 *   <li>Print a sample of the saved side-output table, and for each of the ten
 *       {@code topic_N} columns the 20 rows selected by {@code orderBy}.</li>
 * </ol>
 *
 * @throws Exception if a batch job fails
 */
private static void c_7() throws Exception {
    BatchOperator<?> docs = getSource()
        .select(LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .link(new SegmentBatchOp().setSelectedCol(TXT_COL_NAME))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(TXT_COL_NAME));
    docs.lazyPrint(10);

    String ldaModelPath = DATA_DIR + LDA_MODEL_FILE;
    boolean modelAlreadySaved = new File(ldaModelPath).exists();
    if (!modelAlreadySaved) {
        // Train once and persist both the model and side output 0.
        LdaTrainBatchOp trainer = new LdaTrainBatchOp()
            .setTopicNum(10)
            .setNumIter(200)
            .setVocabSize(20000)
            .setSelectedCol(TXT_COL_NAME)
            .setRandomSeed(123);
        docs.link(trainer);
        trainer.lazyPrintModelInfo();
        trainer.link(new AkSinkBatchOp().setFilePath(ldaModelPath));
        trainer.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(DATA_DIR + LDA_PWZ_FILE));
        BatchOperator.execute();
    }

    // Predict with the persisted model, then evaluate the predictions against the labels.
    LdaPredictBatchOp predictor = new LdaPredictBatchOp()
        .setSelectedCol(TXT_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol("predinfo");
    BatchOperator<?> predictions = predictor
        .linkFrom(new AkSourceBatchOp().setFilePath(ldaModelPath), docs)
        .lazyPrint(5);
    predictions.link(
        new EvalClusterBatchOp()
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics());

    AkSourceBatchOp pwz = new AkSourceBatchOp().setFilePath(DATA_DIR + LDA_PWZ_FILE);
    pwz.sample(0.001).lazyPrint(10);
    for (int topic = 0; topic < 10; topic++) {
        String topicCol = "topic_" + topic;
        pwz.select("word, " + topicCol)
            .orderBy(topicCol, 20, false)
            .lazyPrint(-1, "topic" + topic);
    }
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) LdaTrainBatchOp(com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp) LdaPredictBatchOp(com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Aggregations

- AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 66
- AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp): 20
- EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp): 18
- File (java.io.File): 16
- EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp): 10
- Pipeline (com.alibaba.alink.pipeline.Pipeline): 10
- LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression): 10
- EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp): 9
- Stopwatch (com.alibaba.alink.common.utils.Stopwatch): 8
- MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 7
- Row (org.apache.flink.types.Row): 6
- Test (org.junit.Test): 6
- BatchOperator (com.alibaba.alink.operator.batch.BatchOperator): 5
- CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp): 5
- PipelineModel (com.alibaba.alink.pipeline.PipelineModel): 5
- ArrayList (java.util.ArrayList): 4
- PluginDownloader (com.alibaba.alink.common.io.plugin.PluginDownloader): 3
- RegisterKey (com.alibaba.alink.common.io.plugin.RegisterKey): 3
- LogisticRegressionPredictBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionPredictBatchOp): 3
- LogisticRegressionTrainBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp): 3