Search in sources :

Example 1 with StopWordsRemoverBatchOp

use of com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp in project Alink by alibaba.

the class StopWordsRemoverTest method testInitializer.

/**
 * Checks that {@code StopWordsRemover}, {@code StopWordsRemoverBatchOp} and
 * {@code StopWordsRemoverStreamOp} report an empty parameter set right after
 * construction, both via the no-arg constructor (batch/stream operators) and
 * via the constructor that receives a fresh, empty {@code Params}.
 */
@Test
public void testInitializer() {
    // JUnit's signature is assertEquals(expected, actual); keeping that order
    // makes a failure read "expected:<0> but was:<n>" instead of the reverse.
    StopWordsRemover op = new StopWordsRemover(new Params());
    Assert.assertEquals(0, op.getParams().size());

    BatchOperator<?> b = new StopWordsRemoverBatchOp();
    Assert.assertEquals(0, b.getParams().size());
    b = new StopWordsRemoverBatchOp(new Params());
    Assert.assertEquals(0, b.getParams().size());

    StreamOperator<?> s = new StopWordsRemoverStreamOp();
    Assert.assertEquals(0, s.getParams().size());
    s = new StopWordsRemoverStreamOp(new Params());
    Assert.assertEquals(0, s.getParams().size());
}
Also used : StopWordsRemoverStreamOp(com.alibaba.alink.operator.stream.nlp.StopWordsRemoverStreamOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) Params(org.apache.flink.ml.api.misc.param.Params) Test(org.junit.Test)

Example 2 with StopWordsRemoverBatchOp

use of com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp in project Alink by alibaba.

the class Chap21 method c_2_2.

/**
 * Runs four tokenization pipelines over two in-memory sample sentences.
 *
 * <p>Each pipeline links two operators onto the same source and ends in
 * {@code lazyPrint(-1)}; a single {@code BatchOperator.execute()} call at the
 * end triggers them.
 *
 * @throws Exception propagated from the Alink operators or from execution
 */
static void c_2_2() throws Exception {
    final String sentenceCol = "sentence";
    final String nonWordPattern = "\\W+";

    String[] samples = { "Hello!      This is Alink!", "Flink,Alink..AI#ML@2020" };
    MemSourceBatchOp sentences = new MemSourceBatchOp(samples, sentenceCol);

    // Pipeline 1: Tokenizer, then RegexTokenizer with its default settings.
    sentences
        .link(new TokenizerBatchOp().setSelectedCol(sentenceCol).setOutputCol("tokens"))
        .link(new RegexTokenizerBatchOp().setSelectedCol(sentenceCol).setOutputCol("regex_tokens"))
        .lazyPrint(-1);

    // Pipeline 2: pattern "\\W+" versus pattern "\\w+" with setGaps(false).
    sentences
        .link(new RegexTokenizerBatchOp()
            .setSelectedCol(sentenceCol)
            .setOutputCol("tokens_1")
            .setPattern(nonWordPattern))
        .link(new RegexTokenizerBatchOp()
            .setSelectedCol(sentenceCol)
            .setOutputCol("tokens_2")
            .setGaps(false)
            .setPattern("\\w+"))
        .lazyPrint(-1);

    // Pipeline 3: same pattern twice; the second operator sets setToLowerCase(false).
    sentences
        .link(new RegexTokenizerBatchOp()
            .setSelectedCol(sentenceCol)
            .setOutputCol("tokens_1")
            .setPattern(nonWordPattern))
        .link(new RegexTokenizerBatchOp()
            .setSelectedCol(sentenceCol)
            .setOutputCol("tokens_2")
            .setPattern(nonWordPattern)
            .setToLowerCase(false))
        .lazyPrint(-1);

    // Pipeline 4: regex tokenization, then StopWordsRemover on the token column.
    sentences
        .link(new RegexTokenizerBatchOp()
            .setSelectedCol(sentenceCol)
            .setOutputCol("tokens")
            .setPattern(nonWordPattern))
        .link(new StopWordsRemoverBatchOp().setSelectedCol("tokens").setOutputCol("left_tokens"))
        .lazyPrint(-1);

    BatchOperator.execute();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) RegexTokenizerBatchOp(com.alibaba.alink.operator.batch.nlp.RegexTokenizerBatchOp) RegexTokenizerBatchOp(com.alibaba.alink.operator.batch.nlp.RegexTokenizerBatchOp) TokenizerBatchOp(com.alibaba.alink.operator.batch.nlp.TokenizerBatchOp)

Example 3 with StopWordsRemoverBatchOp

use of com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp in project Alink by alibaba.

the class Chap21 method c_2_1.

/**
 * Segments three Chinese sample sentences with {@code SegmentBatchOp} in
 * several configurations and prints each result.
 *
 * <p>The configurations are: plain segmentation; segmentation with a
 * user-defined dictionary; that dictionary plus {@code StopWordsRemoverBatchOp};
 * and the same with extra stop words set via {@code setStopWords}. A final
 * pipeline segments the {@code news_title} column of {@code getSource()} and
 * prints the first 10 rows.
 *
 * @throws Exception propagated from the Alink operators or from printing
 */
static void c_2_1() throws Exception {
    BatchOperator.setParallelism(1);

    final String textCol = "sentence";
    final String wordsCol = "words";
    final String keptWordsCol = "left_words";

    String[] samples = { "大家好!我在学习、使用Alink。", "【流式计算和批式计算】、(Alink)", "《人工智能》,“机器学习”?2020" };
    MemSourceBatchOp sentences = new MemSourceBatchOp(samples, textCol);

    // Plain segmentation.
    sentences
        .link(new SegmentBatchOp().setSelectedCol(textCol).setOutputCol(wordsCol))
        .print();

    // Segmentation with a user-defined dictionary.
    sentences
        .link(new SegmentBatchOp()
            .setSelectedCol(textCol)
            .setOutputCol(wordsCol)
            .setUserDefinedDict("流式计算", "机器学习"))
        .print();

    // User-defined dictionary, then stop-word removal with no extra stop words set.
    sentences
        .link(new SegmentBatchOp()
            .setSelectedCol(textCol)
            .setOutputCol(wordsCol)
            .setUserDefinedDict("流式计算", "机器学习"))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(wordsCol).setOutputCol(keptWordsCol))
        .print();

    // User-defined dictionary, then stop-word removal with extra stop words supplied.
    sentences
        .link(new SegmentBatchOp()
            .setSelectedCol(textCol)
            .setOutputCol(wordsCol)
            .setUserDefinedDict("流式计算", "机器学习"))
        .link(new StopWordsRemoverBatchOp()
            .setSelectedCol(wordsCol)
            .setOutputCol(keptWordsCol)
            .setStopWords("计算", "2020"))
        .print();

    // Segment the news titles from the shared source and show the first 10 rows.
    getSource()
        .select("news_title")
        .link(new SegmentBatchOp().setSelectedCol("news_title").setOutputCol("segmented_title"))
        .firstN(10)
        .print();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp)

Example 4 with StopWordsRemoverBatchOp

use of com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp in project Alink by alibaba.

the class Chap21 method c_7.

/**
 * Trains an LDA model if none is stored yet, then predicts, evaluates and
 * prints per-topic word listings.
 *
 * <p>Steps, as wired below:
 * <ol>
 *   <li>Select the label and text columns from {@code getSource()}, then apply
 *       {@code SegmentBatchOp} and {@code StopWordsRemoverBatchOp} to the text column.</li>
 *   <li>If the model file does not exist, train with {@code LdaTrainBatchOp},
 *       write the model and side output 0 to their files, and execute.</li>
 *   <li>Load the stored model, predict on the documents and link the result
 *       to {@code EvalClusterBatchOp}.</li>
 *   <li>Load the PWZ file, print a sample, and for {@code t = 0..9} print the
 *       {@code word} column ordered by {@code topic_t}.</li>
 * </ol>
 *
 * @throws Exception propagated from the Alink operators or from execution
 */
private static void c_7() throws Exception {
    final String modelPath = DATA_DIR + LDA_MODEL_FILE;
    final String pwzPath = DATA_DIR + LDA_PWZ_FILE;

    BatchOperator<?> docs = getSource()
        .select(LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .link(new SegmentBatchOp().setSelectedCol(TXT_COL_NAME))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(TXT_COL_NAME));
    docs.lazyPrint(10);

    // Train only when there is no stored model on disk.
    boolean modelMissing = !new File(modelPath).exists();
    if (modelMissing) {
        LdaTrainBatchOp lda = new LdaTrainBatchOp()
            .setTopicNum(10)
            .setNumIter(200)
            .setVocabSize(20000)
            .setSelectedCol(TXT_COL_NAME)
            .setRandomSeed(123);
        docs.link(lda);
        lda.lazyPrintModelInfo();
        lda.link(new AkSinkBatchOp().setFilePath(modelPath));
        // Side output 0 is stored in the PWZ file that is read back further down.
        lda.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(pwzPath));
        BatchOperator.execute();
    }

    new LdaPredictBatchOp()
        .setSelectedCol(TXT_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol("predinfo")
        .linkFrom(new AkSourceBatchOp().setFilePath(modelPath), docs)
        .lazyPrint(5)
        .link(new EvalClusterBatchOp()
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
            .lazyPrintMetrics());

    AkSourceBatchOp pwz = new AkSourceBatchOp().setFilePath(pwzPath);
    pwz.sample(0.001).lazyPrint(10);

    // One listing per topic column topic_0 .. topic_9.
    for (int topic = 0; topic < 10; topic++) {
        String topicCol = "topic_" + topic;
        pwz.select("word, " + topicCol)
            .orderBy(topicCol, 20, false)
            .lazyPrint(-1, "topic" + topic);
    }

    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) LdaTrainBatchOp(com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp) LdaPredictBatchOp(com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Example 5 with StopWordsRemoverBatchOp

use of com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp in project Alink by alibaba.

the class Chap21 method c_5_2.

/**
 * Extracts keywords from two inputs and prints the results.
 *
 * <p>First, one long in-memory document is passed through
 * {@code SegmentBatchOp}, {@code StopWordsRemoverBatchOp} and
 * {@code KeywordsExtractionBatchOp} with {@code Method.TEXT_RANK}. Second, the
 * {@code news_title} column of {@code getSource()} goes through the same
 * stages with {@code Method.TF_IDF} and {@code setTopN(5)}, and the first 10
 * rows are printed.
 *
 * @throws Exception propagated from the Alink operators or from printing
 */
static void c_5_2() throws Exception {
    BatchOperator.setParallelism(1);

    // A single document, written as one literal fragment per line.
    final String menu =
        "蒸羊羔、蒸熊掌、蒸鹿尾儿、烧花鸭、烧雏鸡、烧子鹅、卤猪、卤鸭、酱鸡、腊肉、松花小肚儿、晾肉、香肠儿、什锦苏盘、熏鸡白肚儿、清蒸八宝猪、江米酿鸭子、罐儿野鸡、罐儿鹌鹑。"
        + "卤什件儿、卤子鹅、山鸡、兔脯、菜蟒、银鱼、清蒸哈什蚂、烩鸭丝、烩鸭腰、烩鸭条、清拌鸭丝、黄心管儿、焖白鳝、焖黄鳝、豆豉鲇鱼、锅烧鲤鱼、烀烂甲鱼、抓炒鲤鱼、抓炒对儿虾。"
        + "软炸里脊、软炸鸡、什锦套肠儿、卤煮寒鸦儿、麻酥油卷儿、熘鲜蘑、熘鱼脯、熘鱼肚、熘鱼片儿、醋熘肉片儿、烩三鲜、烩白蘑、烩鸽子蛋、炒银丝、烩鳗鱼、炒白虾、炝青蛤、炒面鱼。"
        + "炒竹笋、芙蓉燕菜、炒虾仁儿、烩虾仁儿、烩腰花儿、烩海参、炒蹄筋儿、锅烧海参、锅烧白菜、炸木耳、炒肝尖儿、桂花翅子、清蒸翅子、炸飞禽、炸汁儿、炸排骨、清蒸江瑶柱。"
        + "糖熘芡仁米、拌鸡丝、拌肚丝、什锦豆腐、什锦丁儿、糟鸭、糟熘鱼片儿、熘蟹肉、炒蟹肉、烩蟹肉、清拌蟹肉、蒸南瓜、酿倭瓜、炒丝瓜、酿冬瓜、烟鸭掌儿、焖鸭掌儿、焖笋、炝茭白。"
        + "茄子晒炉肉、鸭羹、蟹肉羹、鸡血汤、三鲜木樨汤、红丸子、白丸子、南煎丸子、四喜丸子、三鲜丸子、氽丸子、鲜虾丸子、鱼脯丸子、饹炸丸子、豆腐丸子、樱桃肉、马牙肉、米粉肉。"
        + "一品肉、栗子肉、坛子肉、红焖肉、黄焖肉、酱豆腐肉、晒炉肉、炖肉、黏糊肉、烀肉、扣肉、松肉、罐儿肉、烧肉、大肉、烤肉、白肉、红肘子、白肘子、熏肘子、水晶肘子、蜜蜡肘子。"
        + "锅烧肘子、扒肘条、炖羊肉、酱羊肉、烧羊肉、烤羊肉、清羔羊肉、五香羊肉、氽三样儿、爆三样儿、炸卷果儿、烩散丹、烩酸燕儿、烩银丝、烩白杂碎、氽节子、烩节子、炸绣球。"
        + "三鲜鱼翅、栗子鸡、氽鲤鱼、酱汁鲫鱼、活钻鲤鱼、板鸭、筒子鸡、烩脐肚、烩南荠、爆肚仁儿、盐水肘花儿、锅烧猪蹄儿、拌稂子、炖吊子、烧肝尖儿、烧肥肠儿、烧心、烧肺。"
        + "烧紫盖儿、烧连帖、烧宝盖儿、油炸肺、酱瓜丝儿、山鸡丁儿、拌海蜇、龙须菜、炝冬笋、玉兰片、烧鸳鸯、烧鱼头、烧槟子、烧百合、炸豆腐、炸面筋、炸软巾、糖熘饹儿。"
        + "拔丝山药、糖焖莲子、酿山药、杏仁儿酪、小炒螃蟹、氽大甲、炒荤素儿、什锦葛仙米、鳎目鱼、八代鱼、海鲫鱼、黄花鱼、鲥鱼、带鱼、扒海参、扒燕窝、扒鸡腿儿、扒鸡块儿。"
        + "扒肉、扒面筋、扒三样儿、油泼肉、酱泼肉、炒虾黄、熘蟹黄、炒子蟹、炸子蟹、佛手海参、炸烹儿、炒芡子米、奶汤、翅子汤、三丝汤、熏斑鸠、卤斑鸠、海白米、烩腰丁儿。"
        + "火烧茨菰、炸鹿尾儿、焖鱼头、拌皮渣儿、氽肥肠儿、炸紫盖儿、鸡丝豆苗、十二台菜、汤羊、鹿肉、驼峰、鹿大哈、插根儿、炸花件儿,清拌粉皮儿、炝莴笋、烹芽韭、木樨菜。"
        + "烹丁香、烹大肉、烹白肉、麻辣野鸡、烩酸蕾、熘脊髓、咸肉丝儿、白肉丝儿、荸荠一品锅、素炝春不老、清焖莲子、酸黄菜、烧萝卜、脂油雪花儿菜、烩银耳、炒银枝儿。"
        + "八宝榛子酱、黄鱼锅子、白菜锅子、什锦锅子、汤圆锅子、菊花锅子、杂烩锅子、煮饽饽锅子、肉丁辣酱、炒肉丝、炒肉片儿、烩酸菜、烩白菜、烩豌豆、焖扁豆、氽毛豆、炒豇豆,外加腌苤蓝丝儿。";
    String[] documents = { menu };

    // TextRank keywords for the single in-memory document.
    new MemSourceBatchOp(documents, "doc")
        .link(new SegmentBatchOp().setSelectedCol("doc").setOutputCol("words"))
        .link(new StopWordsRemoverBatchOp().setSelectedCol("words"))
        .link(new KeywordsExtractionBatchOp()
            .setMethod(Method.TEXT_RANK)
            .setSelectedCol("words")
            .setOutputCol("extract_keywords"))
        .select("extract_keywords")
        .print();

    // TF-IDF keywords (setTopN(5)) for the news titles; first 10 rows printed.
    getSource()
        .link(new SegmentBatchOp().setSelectedCol("news_title").setOutputCol("segmented_title"))
        .link(new StopWordsRemoverBatchOp().setSelectedCol("segmented_title"))
        .link(new KeywordsExtractionBatchOp()
            .setTopN(5)
            .setMethod(Method.TF_IDF)
            .setSelectedCol("segmented_title")
            .setOutputCol("extract_keywords"))
        .select("news_title, extract_keywords")
        .firstN(10)
        .print();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) KeywordsExtractionBatchOp(com.alibaba.alink.operator.batch.nlp.KeywordsExtractionBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp)

Aggregations

StopWordsRemoverBatchOp (com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp): 6, SegmentBatchOp (com.alibaba.alink.operator.batch.nlp.SegmentBatchOp): 4, MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp): 3, AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp): 2, AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp): 2, File (java.io.File): 2, LdaPredictBatchOp (com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp): 1, LdaTrainBatchOp (com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp): 1, EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp): 1, KeywordsExtractionBatchOp (com.alibaba.alink.operator.batch.nlp.KeywordsExtractionBatchOp): 1, RegexTokenizerBatchOp (com.alibaba.alink.operator.batch.nlp.RegexTokenizerBatchOp): 1, TokenizerBatchOp (com.alibaba.alink.operator.batch.nlp.TokenizerBatchOp): 1, Word2VecTrainBatchOp (com.alibaba.alink.operator.batch.nlp.Word2VecTrainBatchOp): 1, WordCountBatchOp (com.alibaba.alink.operator.batch.nlp.WordCountBatchOp): 1, TextSourceBatchOp (com.alibaba.alink.operator.batch.source.TextSourceBatchOp): 1, StopWordsRemoverStreamOp (com.alibaba.alink.operator.stream.nlp.StopWordsRemoverStreamOp): 1, VectorNearestNeighbor (com.alibaba.alink.pipeline.similarity.VectorNearestNeighbor): 1, Params (org.apache.flink.ml.api.misc.param.Params): 1, Test (org.junit.Test): 1