Search in sources:

Example 41 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

Class Chap22, method c_2.

/**
 * Word-count and Word2Vec similarity demo over the text file at
 * {@code DATA_DIR + ORIGIN_FILE}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Segment the {@code text} column (using a user-defined dictionary of names), remove
 *       stop words, count words and print the result ordered by {@code cnt}.</li>
 *   <li>If {@code DATA_DIR + W2V_MODEL_FILE} does not exist, train a Word2Vec model on the
 *       segmented text (stop words passed explicitly via {@code setStopWords}) and sink it
 *       to that file.</li>
 *   <li>Load the model file and print the {@code setTopN(20)} nearest neighbours for a fixed
 *       set of words.</li>
 * </ol>
 *
 * @throws Exception if any of the invoked operators throws
 */
static void c_2() throws Exception {
    BatchOperator.setParallelism(1);

    TextSourceBatchOp corpus = new TextSourceBatchOp()
        .setFilePath(DATA_DIR + ORIGIN_FILE);
    corpus.lazyPrint(8);

    // Names handed to the segmenter as a user-defined dictionary.
    final String[] characterNames = {
        "曹操", "孔明", "玄德", "刘玄德", "刘备", "关羽", "张飞",
        "赵云", "曹孟德", "诸葛亮", "张郃", "孙权", "张辽", "鲁肃"
    };

    // Word counts; print() is an eager action, so the lazyPrint above is expected to be
    // flushed here as well (NOTE(review): confirm against Alink's lazy-print semantics).
    corpus
        .link(new SegmentBatchOp()
            .setSelectedCol("text")
            .setUserDefinedDict(characterNames))
        .link(new StopWordsRemoverBatchOp()
            .setSelectedCol("text"))
        .link(new WordCountBatchOp()
            .setSelectedCol("text"))
        .orderBy("cnt", 100, false)
        .print();

    String modelPath = DATA_DIR + W2V_MODEL_FILE;

    // Train only when no model file is present yet.
    if (!new File(modelPath).exists()) {
        corpus
            .link(new SegmentBatchOp()
                .setSelectedCol("text")
                .setUserDefinedDict(characterNames))
            .link(new StopWordsRemoverBatchOp()
                .setSelectedCol("text")
                .setStopWords(
                    "亦", "曰", "遂", "吾", "已", "去", "二人", "今", "使", "中",
                    "知", "不", "见", "都", "令", "却", "欲", "请", "人", "谓",
                    "不可", "闻", "前", "后", "皆", "便", "问", "日", "时", "耳",
                    "不敢", "问", "回", "才", "之事", "之人", "之时", "料", "今日", "令人",
                    "受", "说", "出", "已毕", "不得", "使人", "众", "何不", "不知", "再",
                    "处", "无", "即日", "诸", "此时", "只", "下", "还", "上", "杀",
                    "将军", "却说", "兵", "汝", "走", "言", "寨", "不能", "斩", "死",
                    "商议", "听", "军士", "军", "左右", "军马", "引兵", "次日", "二", "看",
                    "耶", "退", "更", "毕", "正", "一人", "原来", "大笑", "车胄", "口",
                    "引", "大喜", "其事", "助", "事", "未", "大", "至此", "讫", "心中",
                    "敢"))
            .link(new Word2VecTrainBatchOp()
                .setSelectedCol("text")
                .setMinCount(10)
                .setNumIter(50))
            .link(new AkSinkBatchOp()
                .setFilePath(modelPath));
        BatchOperator.execute();
    }

    AkSourceBatchOp wordVectors = new AkSourceBatchOp().setFilePath(modelPath);

    // Words whose neighbours are looked up in the trained vectors.
    String queryFilter = "word IN ('曹操', '操', '玄德', '刘备', '孔明', "
        + "'亮', '卧龙', '周瑜', '吕布', '貂蝉', '云长', '孙权')";

    new VectorNearestNeighbor()
        .setIdCol("word")
        .setSelectedCol("vec")
        .setTopN(20)
        .setOutputCol("similar_words")
        .fit(wordVectors)
        .transform(wordVectors.filter(queryFilter))
        .select("word, similar_words")
        .print();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) WordCountBatchOp(com.alibaba.alink.operator.batch.nlp.WordCountBatchOp) Word2VecTrainBatchOp(com.alibaba.alink.operator.batch.nlp.Word2VecTrainBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) VectorNearestNeighbor(com.alibaba.alink.pipeline.similarity.VectorNearestNeighbor) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) TextSourceBatchOp(com.alibaba.alink.operator.batch.source.TextSourceBatchOp) File(java.io.File)

Example 42 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

Class Chap23, method c_4.

/**
 * Trains (or reuses) a text-classification pipeline and applies it in batch, stream and
 * local-predictor form.
 *
 * <p>The pipeline is: regex tokenizer, word-count vectors for the tokens and for 2-grams and
 * 3-grams, a vector assembler, then logistic regression. It is fitted on
 * {@code DATA_DIR + TRAIN_FILE} and saved to {@code DATA_DIR + PIPELINE_MODEL} only when
 * that file does not exist yet.
 *
 * <p>Afterwards the saved model is loaded and used to
 * <ul>
 *   <li>evaluate binary-classification metrics on the batch test set,</li>
 *   <li>print a sample of predictions from the streamed test set, and</li>
 *   <li>predict one hard-coded review with two {@code LocalPredictor}s (one collected from
 *       the loaded model, one built directly from the model file).</li>
 * </ul>
 *
 * @throws Exception if any of the invoked operators throws
 */
static void c_4() throws Exception {
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    String modelPath = DATA_DIR + PIPELINE_MODEL;

    // Fit and persist the pipeline only when no saved model is available.
    if (!new File(modelPath).exists()) {
        new Pipeline()
            .add(new RegexTokenizer()
                .setPattern("\\W+")
                .setSelectedCol(TXT_COL_NAME))
            .add(new DocCountVectorizer()
                .setFeatureType("WORD_COUNT")
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol(VECTOR_COL_NAME))
            .add(new NGram()
                .setN(2)
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol("v_2"))
            .add(new DocCountVectorizer()
                .setFeatureType("WORD_COUNT")
                .setVocabSize(50000)
                .setSelectedCol("v_2")
                .setOutputCol("v_2"))
            .add(new NGram()
                .setN(3)
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol("v_3"))
            .add(new DocCountVectorizer()
                .setFeatureType("WORD_COUNT")
                .setVocabSize(10000)
                .setSelectedCol("v_3")
                .setOutputCol("v_3"))
            .add(new VectorAssembler()
                .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
                .setOutputCol(VECTOR_COL_NAME))
            .add(new LogisticRegression()
                .setMaxIter(30)
                .setVectorCol(VECTOR_COL_NAME)
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
            .fit(trainSet)
            .save(modelPath);
        BatchOperator.execute();
    }

    PipelineModel pipelineModel = PipelineModel.load(modelPath);

    // Batch evaluation on the test set.
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    pipelineModel
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2 and 3"));
    BatchOperator.execute();

    // Streaming prediction on the same test file; only a sample is printed.
    AkSourceStreamOp testStream = new AkSourceStreamOp().setFilePath(DATA_DIR + TEST_FILE);
    pipelineModel
        .transform(testStream)
        .sample(0.001)
        .select(PREDICTION_COL_NAME + ", " + LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .print();
    StreamOperator.execute();

    // Single hard-coded review used for the local (in-process) predictions below.
    String review =
        "Oh dear. good cast, but to write and direct is an art and to write wit and direct wit is a bit of a "
            + "task. Even doing good comedy you have to get the timing and moment right. Im not putting it all down "
            + "there were parts where i laughed loud but that was at very few times. The main focus to me was on the "
            + "fast free flowing dialogue, that made some people in the film annoying. It may sound great while "
            + "reading the script in your head but getting that out and to the camera is a different task. And the "
            + "hand held camera work does give energy to few parts of the film. Overall direction was good but the "
            + "script was not all that to me, but I'm sure you was reading the script in your head it would sound good"
            + ". Sorry.";

    // Predictor collected from the loaded pipeline model.
    LocalPredictor predictorFromModel = pipelineModel.collectLocalPredictor("review string");
    System.out.println(predictorFromModel.getOutputSchema());
    Row predictionFromModel = predictorFromModel.map(Row.of(review));
    // Field 4 of the output row is printed; see the schema printed above for its meaning.
    System.out.println(predictionFromModel.getField(4));

    // Predictor built directly from the saved model file.
    LocalPredictor predictorFromFile = new LocalPredictor(modelPath, "review string");
    System.out.println(predictorFromFile.getOutputSchema());
    Row predictionFromFile = predictorFromFile.map(Row.of(review));
    System.out.println(predictionFromFile.getField(4));
}
Also used : LocalPredictor(com.alibaba.alink.pipeline.LocalPredictor) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) NGram(com.alibaba.alink.pipeline.nlp.NGram) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) Pipeline(com.alibaba.alink.pipeline.Pipeline) PipelineModel(com.alibaba.alink.pipeline.PipelineModel) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) AkSourceStreamOp(com.alibaba.alink.operator.stream.source.AkSourceStreamOp) Row(org.apache.flink.types.Row) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) File(java.io.File)

Example 43 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

Class Chap24, method c_5.

/**
 * ItemCF "items per user" recommendation demo for user id 1.
 *
 * <p>Trains an ItemCF model from {@code getSourceRatings()} and sinks it to
 * {@code DATA_DIR + ITEMCF_MODEL_FILE} when that file does not exist. It then
 * <ul>
 *   <li>prints the batch recommendation for the single user,</li>
 *   <li>builds local predictors for the recommender and for an item-title lookup and prints
 *       their output schemas,</li>
 *   <li>prints the local recommendation result for user 1,</li>
 *   <li>lazily prints the titles of items that user 1 rated above 4, and</li>
 *   <li>prints a second local recommendation with {@code setExcludeKnown(true)}.</li>
 * </ul>
 *
 * @throws Exception if any of the invoked operators throws
 */
static void c_5() throws Exception {
    String modelPath = DATA_DIR + ITEMCF_MODEL_FILE;

    // Train and persist the model only when it has not been written before.
    if (!new File(modelPath).exists()) {
        getSourceRatings()
            .link(new ItemCfTrainBatchOp()
                .setUserCol(USER_COL)
                .setItemCol(ITEM_COL)
                .setRateCol(RATING_COL))
            .link(new AkSinkBatchOp()
                .setFilePath(modelPath));
        BatchOperator.execute();
    }

    // One-row input holding user id 1.
    MemSourceBatchOp singleUser = new MemSourceBatchOp(new Long[] { 1L }, "user_id");

    new ItemCfItemsPerUserRecommender()
        .setUserCol(USER_COL)
        .setRecommCol(RECOMM_COL)
        .setModelData(new AkSourceBatchOp().setFilePath(modelPath))
        .transform(singleUser)
        .print();

    LocalPredictor recommender = new ItemCfItemsPerUserRecommender()
        .setUserCol(USER_COL)
        .setRecommCol(RECOMM_COL)
        .setK(20)
        .setModelData(new AkSourceBatchOp().setFilePath(modelPath))
        .collectLocalPredictor("user_id long");
    System.out.println(recommender.getOutputSchema());

    LocalPredictor titleLookup = new Lookup()
        .setSelectedCols(ITEM_COL)
        .setOutputCols("item_name")
        .setModelData(getSourceItems())
        .setMapKeyCols("item_id")
        .setMapValueCols("title")
        .collectLocalPredictor("item_id long");
    System.out.println(titleLookup.getOutputSchema());

    // Field 1 of the predictor output is cast to MTable and printed.
    MTable recommended = (MTable) recommender.map(Row.of(1L)).getField(1);
    System.out.println(recommended);

    // Titles of the items user 1 rated above 4 (lazy; emitted by a later job execution).
    new Lookup()
        .setSelectedCols(ITEM_COL)
        .setOutputCols("item_name")
        .setModelData(getSourceItems())
        .setMapKeyCols("item_id")
        .setMapValueCols("title")
        .transform(getSourceRatings().filter("user_id=1 AND rating>4"))
        .select("item_name")
        .orderBy("item_name", 1000)
        .lazyPrint(-1);

    // Same recommender, this time with setExcludeKnown(true).
    LocalPredictor recommenderExcludingKnown = new ItemCfItemsPerUserRecommender()
        .setUserCol(USER_COL)
        .setRecommCol(RECOMM_COL)
        .setK(20)
        .setExcludeKnown(true)
        .setModelData(new AkSourceBatchOp().setFilePath(modelPath))
        .collectLocalPredictor("user_id long");

    MTable recommendedExcludingKnown =
        (MTable) recommenderExcludingKnown.map(Row.of(1L)).getField(1);
    System.out.println(recommendedExcludingKnown);
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) MTable(com.alibaba.alink.common.MTable) LocalPredictor(com.alibaba.alink.pipeline.LocalPredictor) Lookup(com.alibaba.alink.pipeline.dataproc.Lookup) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) ItemCfTrainBatchOp(com.alibaba.alink.operator.batch.recommendation.ItemCfTrainBatchOp)

Example 44 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

Class Chap24, method c_4.

/**
 * ALS rating-prediction demo.
 *
 * <p>Trains an ALS model on {@code DATA_DIR + RATING_TRAIN_FILE} and sinks it to
 * {@code DATA_DIR + ALS_MODEL_FILE} when that file does not exist. It then
 * <ol>
 *   <li>predicts ratings for the test rows of user 1, attaches item titles through a
 *       {@code Lookup} stage and prints them ordered by {@code rating, recomm};</li>
 *   <li>predicts ratings for the whole test set and prints regression metrics comparing
 *       {@code RATING_COL} with {@code RECOMM_COL}.</li>
 * </ol>
 *
 * @throws Exception if any of the invoked operators throws
 */
static void c_4() throws Exception {
    TsvSourceBatchOp trainSet = new TsvSourceBatchOp()
        .setFilePath(DATA_DIR + RATING_TRAIN_FILE)
        .setSchemaStr(RATING_SCHEMA_STRING);
    TsvSourceBatchOp testSet = new TsvSourceBatchOp()
        .setFilePath(DATA_DIR + RATING_TEST_FILE)
        .setSchemaStr(RATING_SCHEMA_STRING);

    String modelPath = DATA_DIR + ALS_MODEL_FILE;

    // Train and persist the model only when it has not been written before.
    if (!new File(modelPath).exists()) {
        trainSet
            .link(new AlsTrainBatchOp()
                .setUserCol(USER_COL)
                .setItemCol(ITEM_COL)
                .setRateCol(RATING_COL)
                .setLambda(0.1)
                .setRank(10)
                .setNumIter(10))
            .link(new AkSinkBatchOp()
                .setFilePath(modelPath));
        BatchOperator.execute();
    }

    // Two-stage model: rating prediction followed by item-title lookup, applied to user 1.
    new PipelineModel(
        new AlsRateRecommender()
            .setUserCol(USER_COL)
            .setItemCol(ITEM_COL)
            .setRecommCol(RECOMM_COL)
            .setModelData(new AkSourceBatchOp().setFilePath(modelPath)),
        new Lookup()
            .setSelectedCols(ITEM_COL)
            .setOutputCols("item_name")
            .setModelData(getSourceItems())
            .setMapKeyCols("item_id")
            .setMapValueCols("title"))
        .transform(testSet.filter("user_id=1"))
        .select("user_id, rating, recomm, item_name")
        .orderBy("rating, recomm", 1000)
        .lazyPrint(-1);
    BatchOperator.execute();

    // Regression metrics over the complete test set.
    new AlsRateRecommender()
        .setUserCol(USER_COL)
        .setItemCol(ITEM_COL)
        .setRecommCol(RECOMM_COL)
        .setModelData(new AkSourceBatchOp().setFilePath(modelPath))
        .transform(testSet)
        .link(new EvalRegressionBatchOp()
            .setLabelCol(RATING_COL)
            .setPredictionCol(RECOMM_COL)
            .lazyPrintMetrics());
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) AlsTrainBatchOp(com.alibaba.alink.operator.batch.recommendation.AlsTrainBatchOp) EvalRegressionBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalRegressionBatchOp) Lookup(com.alibaba.alink.pipeline.dataproc.Lookup) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) TsvSourceBatchOp(com.alibaba.alink.operator.batch.source.TsvSourceBatchOp) File(java.io.File) PipelineModel(com.alibaba.alink.pipeline.PipelineModel)

Example 45 with AkSourceBatchOp

use of com.alibaba.alink.operator.batch.source.AkSourceBatchOp in project Alink by alibaba.

Class Chap24, method c_8.

/**
 * Prints the users similar to user id 1 according to the UserCF model stored at
 * {@code DATA_DIR + USERCF_MODEL_FILE}, then prints the rows of {@code getSourceUsers()}
 * for a fixed list of user ids.
 *
 * @throws Exception if any of the invoked operators throws
 */
static void c_8() throws Exception {
    // One-row input holding user id 1.
    MemSourceBatchOp singleUser = new MemSourceBatchOp(new Long[] { 1L }, USER_COL);

    new UserCfSimilarUsersRecommender()
        .setUserCol(USER_COL)
        .setRecommCol(RECOMM_COL)
        .setModelData(new AkSourceBatchOp().setFilePath(DATA_DIR + USERCF_MODEL_FILE))
        .transform(singleUser)
        .print();

    // Details for user 1 and a hard-coded set of other user ids.
    getSourceUsers()
        .filter("user_id IN (1, 916,864,268,92,435,457,738,429,303,276)")
        .print();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp)

Aggregations

AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)66 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)20 EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)18 File (java.io.File)16 EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp)10 Pipeline (com.alibaba.alink.pipeline.Pipeline)10 LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression)10 EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)9 Stopwatch (com.alibaba.alink.common.utils.Stopwatch)8 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)7 Row (org.apache.flink.types.Row)6 Test (org.junit.Test)6 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)5 CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)5 PipelineModel (com.alibaba.alink.pipeline.PipelineModel)5 ArrayList (java.util.ArrayList)4 PluginDownloader (com.alibaba.alink.common.io.plugin.PluginDownloader)3 RegisterKey (com.alibaba.alink.common.io.plugin.RegisterKey)3 LogisticRegressionPredictBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionPredictBatchOp)3 LogisticRegressionTrainBatchOp (com.alibaba.alink.operator.batch.classification.LogisticRegressionTrainBatchOp)3