Search in sources :

Example 1 with SegmentBatchOp

use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.

the class SegmentTest method testInitializer.

/**
 * Checks that Segment, SegmentBatchOp and SegmentStreamOp each report an empty
 * parameter set right after construction, both for the no-arg constructors and
 * for the constructors that receive a fresh {@code Params}.
 */
@Test
public void testInitializer() {
    // JUnit's signature is assertEquals(expected, actual); keeping the literal
    // first makes a failure read "expected 0 but was N" rather than the reverse.
    Segment op = new Segment(new Params());
    Assert.assertEquals(0, op.getParams().size());

    BatchOperator<?> b = new SegmentBatchOp();
    Assert.assertEquals(0, b.getParams().size());
    b = new SegmentBatchOp(new Params());
    Assert.assertEquals(0, b.getParams().size());

    StreamOperator<?> s = new SegmentStreamOp();
    Assert.assertEquals(0, s.getParams().size());
    s = new SegmentStreamOp(new Params());
    Assert.assertEquals(0, s.getParams().size());
}
Also used : SegmentStreamOp(com.alibaba.alink.operator.stream.nlp.SegmentStreamOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) Params(org.apache.flink.ml.api.misc.param.Params) Test(org.junit.Test)

Example 2 with SegmentBatchOp

use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.

the class Chap21 method c_3.

/**
 * Runs SegmentBatchOp over the "news_title" column of the first 10 source rows,
 * then links the result into WordCountBatchOp and DocWordCountBatchOp and
 * registers a lazy print for each before executing the batch job.
 *
 * @throws Exception propagated from the operators or from {@code BatchOperator.execute()}
 */
static void c_3() throws Exception {
    BatchOperator.setParallelism(1);

    // Declared with a wildcard rather than as a raw BatchOperator so the generic
    // signatures of link(...) stay checked, consistent with the other examples.
    BatchOperator<?> titles = getSource()
        .firstN(10)
        .select("news_title")
        .link(
            new SegmentBatchOp()
                .setSelectedCol("news_title")
                .setOutputCol("segmented_title")
                // Empty reserved-column list: presumably only the output column is kept.
                .setReservedCols(new String[] {})
        );

    titles
        .link(new WordCountBatchOp().setSelectedCol("segmented_title"))
        .orderBy("cnt", 100, false)
        .lazyPrint(-1, "WordCount");

    // The segmented title is passed as both the document id and the content column.
    titles
        .link(
            new DocWordCountBatchOp()
                .setDocIdCol("segmented_title")
                .setContentCol("segmented_title")
        )
        .lazyPrint(-1, "DocWordCount");

    BatchOperator.execute();
}
Also used : WordCountBatchOp(com.alibaba.alink.operator.batch.nlp.WordCountBatchOp) DocWordCountBatchOp(com.alibaba.alink.operator.batch.nlp.DocWordCountBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) BatchOperator(com.alibaba.alink.operator.batch.BatchOperator)

Example 3 with SegmentBatchOp

use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.

the class Chap21 method c_2_1.

/**
 * Prints the output of SegmentBatchOp on three in-memory sentences in four
 * configurations (plain, with a user-defined dictionary, followed by
 * StopWordsRemoverBatchOp, and followed by StopWordsRemoverBatchOp with extra
 * stop words), then prints the first 10 segmented "news_title" rows of the source.
 *
 * @throws Exception propagated from the operators' {@code print()} calls
 */
static void c_2_1() throws Exception {
    BatchOperator.setParallelism(1);

    final String sentenceCol = "sentence";
    final String wordsCol = "words";
    final String leftWordsCol = "left_words";

    String[] sentences = new String[] {
        "大家好!我在学习、使用Alink。",
        "【流式计算和批式计算】、(Alink)",
        "《人工智能》,“机器学习”?2020"
    };
    MemSourceBatchOp source = new MemSourceBatchOp(sentences, sentenceCol);

    // 1) Segmentation with default settings.
    source
        .link(new SegmentBatchOp().setSelectedCol(sentenceCol).setOutputCol(wordsCol))
        .print();

    // 2) Segmentation with two user-defined dictionary entries.
    source
        .link(
            new SegmentBatchOp()
                .setSelectedCol(sentenceCol)
                .setOutputCol(wordsCol)
                .setUserDefinedDict("流式计算", "机器学习")
        )
        .print();

    // 3) Same segmentation, followed by stop-word removal.
    source
        .link(
            new SegmentBatchOp()
                .setSelectedCol(sentenceCol)
                .setOutputCol(wordsCol)
                .setUserDefinedDict("流式计算", "机器学习")
        )
        .link(
            new StopWordsRemoverBatchOp()
                .setSelectedCol(wordsCol)
                .setOutputCol(leftWordsCol)
        )
        .print();

    // 4) Same as 3), with two additional stop words supplied explicitly.
    source
        .link(
            new SegmentBatchOp()
                .setSelectedCol(sentenceCol)
                .setOutputCol(wordsCol)
                .setUserDefinedDict("流式计算", "机器学习")
        )
        .link(
            new StopWordsRemoverBatchOp()
                .setSelectedCol(wordsCol)
                .setOutputCol(leftWordsCol)
                .setStopWords("计算", "2020")
        )
        .print();

    // 5) Segment the news titles from the shared source and show the first 10 rows.
    getSource()
        .select("news_title")
        .link(
            new SegmentBatchOp()
                .setSelectedCol("news_title")
                .setOutputCol("segmented_title")
        )
        .firstN(10)
        .print();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp)

Example 4 with SegmentBatchOp

use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.

the class Chap21 method c_7.

/**
 * Segments the text column and removes stop words, trains an LDA model only
 * when the model file is not yet on disk, then predicts with the stored model,
 * evaluates the prediction against the label column and prints the top rows of
 * each topic column from the stored side-output file.
 *
 * @throws Exception propagated from the operators or from {@code BatchOperator.execute()}
 */
private static void c_7() throws Exception {
    final String modelPath = DATA_DIR + LDA_MODEL_FILE;
    final String pwzPath = DATA_DIR + LDA_PWZ_FILE;

    BatchOperator<?> docs = getSource()
        .select(LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .link(new SegmentBatchOp().setSelectedCol(TXT_COL_NAME))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(TXT_COL_NAME));
    docs.lazyPrint(10);

    // Train and persist the model (plus side output 0) only on the first run.
    boolean modelMissing = !new File(modelPath).exists();
    if (modelMissing) {
        LdaTrainBatchOp lda = new LdaTrainBatchOp()
            .setTopicNum(10)
            .setNumIter(200)
            .setVocabSize(20000)
            .setSelectedCol(TXT_COL_NAME)
            .setRandomSeed(123);
        docs.link(lda);
        lda.lazyPrintModelInfo();
        lda.link(new AkSinkBatchOp().setFilePath(modelPath));
        lda.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(pwzPath));
        BatchOperator.execute();
    }

    // Predict with the stored model and evaluate against the label column.
    AkSourceBatchOp modelSource = new AkSourceBatchOp().setFilePath(modelPath);
    new LdaPredictBatchOp()
        .setSelectedCol(TXT_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol("predinfo")
        .linkFrom(modelSource, docs)
        .lazyPrint(5)
        .link(
            new EvalClusterBatchOp()
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .lazyPrintMetrics()
        );

    AkSourceBatchOp pwz = new AkSourceBatchOp().setFilePath(pwzPath);
    pwz.sample(0.001).lazyPrint(10);

    // One lazy print per topic column topic_0 .. topic_9.
    for (int topicIdx = 0; topicIdx < 10; topicIdx++) {
        String topicCol = "topic_" + topicIdx;
        pwz.select("word, " + topicCol)
            .orderBy(topicCol, 20, false)
            .lazyPrint(-1, "topic" + topicIdx);
    }

    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) StopWordsRemoverBatchOp(com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) LdaTrainBatchOp(com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp) LdaPredictBatchOp(com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) File(java.io.File) EvalClusterBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)

Example 5 with SegmentBatchOp

use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.

the class Chap21 method c_6_1.

/**
 * Builds five two-column string pairs and appends one output column per metric
 * (LEVENSHTEIN, LEVENSHTEIN_SIM, LCS, LCS_SIM, JACCARD_SIM) using, in turn,
 * StringSimilarityPairwiseBatchOp, TextSimilarityPairwiseBatchOp (after
 * SegmentBatchOp on both columns) and StringSimilarityPairwiseStreamOp.
 *
 * @throws Exception propagated from the batch or stream execution
 */
static void c_6_1() throws Exception {
    // Metric names double as the output column names; order defines the link order.
    final String[] metrics = { "LEVENSHTEIN", "LEVENSHTEIN_SIM", "LCS", "LCS_SIM", "JACCARD_SIM" };
    final String[] colNames = new String[] { "col1", "col2" };

    Row[] rows = new Row[] {
        Row.of("机器学习", "机器学习"),
        Row.of("批式计算", "流式计算"),
        Row.of("Machine Learning", "ML"),
        Row.of("Flink", "Alink"),
        Row.of("Good Morning!", "Good Evening!")
    };

    MemSourceBatchOp source = new MemSourceBatchOp(rows, colNames);
    source.lazyPrint(-1);

    // Batch, string-level similarity: one linked operator per metric.
    BatchOperator<?> stringChain = source;
    for (String metric : metrics) {
        stringChain = stringChain.link(
            new StringSimilarityPairwiseBatchOp()
                .setSelectedCols("col1", "col2")
                .setMetric(metric)
                .setOutputCol(metric)
        );
    }
    stringChain.lazyPrint(-1, "\n## StringSimilarityPairwiseBatchOp ##");

    // Batch, text-level similarity: both columns go through SegmentBatchOp first.
    BatchOperator<?> textChain = source
        .link(new SegmentBatchOp().setSelectedCol("col1"))
        .link(new SegmentBatchOp().setSelectedCol("col2"));
    for (String metric : metrics) {
        textChain = textChain.link(
            new TextSimilarityPairwiseBatchOp()
                .setSelectedCols("col1", "col2")
                .setMetric(metric)
                .setOutputCol(metric)
        );
    }
    textChain.lazyPrint(-1, "\n## TextSimilarityPairwiseBatchOp ##");

    BatchOperator.execute();

    // Stream counterpart of the string-level chain, printed directly.
    MemSourceStreamOp streamSource = new MemSourceStreamOp(rows, colNames);
    StreamOperator<?> streamChain = streamSource;
    for (String metric : metrics) {
        streamChain = streamChain.link(
            new StringSimilarityPairwiseStreamOp()
                .setSelectedCols("col1", "col2")
                .setMetric(metric)
                .setOutputCol(metric)
        );
    }
    streamChain.print();

    StreamOperator.execute();
}
Also used : MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) TextSimilarityPairwiseBatchOp(com.alibaba.alink.operator.batch.similarity.TextSimilarityPairwiseBatchOp) SegmentBatchOp(com.alibaba.alink.operator.batch.nlp.SegmentBatchOp) StringSimilarityPairwiseBatchOp(com.alibaba.alink.operator.batch.similarity.StringSimilarityPairwiseBatchOp) Row(org.apache.flink.types.Row) StringSimilarityPairwiseStreamOp(com.alibaba.alink.operator.stream.similarity.StringSimilarityPairwiseStreamOp)

Aggregations

SegmentBatchOp (com.alibaba.alink.operator.batch.nlp.SegmentBatchOp)8 StopWordsRemoverBatchOp (com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp)4 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)3 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)2 WordCountBatchOp (com.alibaba.alink.operator.batch.nlp.WordCountBatchOp)2 AkSinkBatchOp (com.alibaba.alink.operator.batch.sink.AkSinkBatchOp)2 AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)2 File (java.io.File)2 LdaPredictBatchOp (com.alibaba.alink.operator.batch.clustering.LdaPredictBatchOp)1 LdaTrainBatchOp (com.alibaba.alink.operator.batch.clustering.LdaTrainBatchOp)1 EvalClusterBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalClusterBatchOp)1 DocWordCountBatchOp (com.alibaba.alink.operator.batch.nlp.DocWordCountBatchOp)1 KeywordsExtractionBatchOp (com.alibaba.alink.operator.batch.nlp.KeywordsExtractionBatchOp)1 Word2VecTrainBatchOp (com.alibaba.alink.operator.batch.nlp.Word2VecTrainBatchOp)1 StringSimilarityPairwiseBatchOp (com.alibaba.alink.operator.batch.similarity.StringSimilarityPairwiseBatchOp)1 TextSimilarityPairwiseBatchOp (com.alibaba.alink.operator.batch.similarity.TextSimilarityPairwiseBatchOp)1 TextSourceBatchOp (com.alibaba.alink.operator.batch.source.TextSourceBatchOp)1 SegmentStreamOp (com.alibaba.alink.operator.stream.nlp.SegmentStreamOp)1 StringSimilarityPairwiseStreamOp (com.alibaba.alink.operator.stream.similarity.StringSimilarityPairwiseStreamOp)1 MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp)1