use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.
the class SegmentTest method testInitializer.
/**
 * Verifies that the segment mapper and the batch/stream segment operators report an empty
 * parameter set right after construction, both with the no-argument constructors and with a
 * freshly created {@code Params} instance.
 */
@Test
public void testInitializer() {
    // JUnit's assertEquals signature is (expected, actual); keeping that order makes a
    // failure message report the expected and the observed size correctly.
    Segment op = new Segment(new Params());
    Assert.assertEquals(0, op.getParams().size());

    BatchOperator<?> b = new SegmentBatchOp();
    Assert.assertEquals(0, b.getParams().size());
    b = new SegmentBatchOp(new Params());
    Assert.assertEquals(0, b.getParams().size());

    StreamOperator<?> s = new SegmentStreamOp();
    Assert.assertEquals(0, s.getParams().size());
    s = new SegmentStreamOp(new Params());
    Assert.assertEquals(0, s.getParams().size());
}
use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.
the class Chap21 method c_3.
/**
 * Segments the {@code news_title} column of the first 10 source rows into
 * {@code segmented_title}, then registers two lazy printouts on the result: a word count
 * ordered by {@code cnt} and a per-document word count. Both are produced by the single
 * {@code BatchOperator.execute()} call at the end.
 *
 * @throws Exception if building or executing the batch job fails
 */
static void c_3() throws Exception {
    BatchOperator.setParallelism(1);

    // Wildcard type instead of the raw BatchOperator, matching how the other examples
    // declare their operator variables.
    BatchOperator<?> titles = getSource()
        .firstN(10)
        .select("news_title")
        .link(
            new SegmentBatchOp()
                .setSelectedCol("news_title")
                .setOutputCol("segmented_title")
                // An empty reserved-column list is passed explicitly.
                // NOTE(review): presumably this drops the input columns from the output - confirm.
                .setReservedCols(new String[] {})
        );

    // NOTE(review): orderBy("cnt", 100, false) presumably keeps the top 100 rows by descending
    // count - confirm against the BatchOperator API.
    titles
        .link(new WordCountBatchOp().setSelectedCol("segmented_title"))
        .orderBy("cnt", 100, false)
        .lazyPrint(-1, "WordCount");

    // The segmented title is used both as the document id and as the content column.
    titles
        .link(
            new DocWordCountBatchOp()
                .setDocIdCol("segmented_title")
                .setContentCol("segmented_title")
        )
        .lazyPrint(-1, "DocWordCount");

    BatchOperator.execute();
}
use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.
the class Chap21 method c_2_1.
/**
 * Demonstrates word segmentation on three in-memory sentences and prints each variant:
 * plain segmentation, segmentation with a user-defined dictionary, the same followed by
 * stop-word removal, and stop-word removal with extra custom stop words. Finally prints
 * the segmented {@code news_title} column for the first 10 rows of the shared source.
 *
 * @throws Exception if any of the printed batch jobs fails
 */
static void c_2_1() throws Exception {
    BatchOperator.setParallelism(1);

    String[] sentences = new String[] { "大家好!我在学习、使用Alink。", "【流式计算和批式计算】、(Alink)", "《人工智能》,“机器学习”?2020" };
    MemSourceBatchOp source = new MemSourceBatchOp(sentences, "sentence");

    // 1. Plain segmentation.
    SegmentBatchOp plainWords = source.link(
        new SegmentBatchOp()
            .setSelectedCol("sentence")
            .setOutputCol("words")
    );
    plainWords.print();

    // 2. Segmentation with a user-defined dictionary.
    SegmentBatchOp dictWords = source.link(
        new SegmentBatchOp()
            .setSelectedCol("sentence")
            .setOutputCol("words")
            .setUserDefinedDict("流式计算", "机器学习")
    );
    dictWords.print();

    // 3. Dictionary segmentation followed by stop-word removal.
    //    A fresh segment operator is built for every pipeline, as each one is linked once.
    SegmentBatchOp wordsForRemover = source.link(
        new SegmentBatchOp()
            .setSelectedCol("sentence")
            .setOutputCol("words")
            .setUserDefinedDict("流式计算", "机器学习")
    );
    wordsForRemover
        .link(
            new StopWordsRemoverBatchOp()
                .setSelectedCol("words")
                .setOutputCol("left_words")
        )
        .print();

    // 4. Same as 3, with additional custom stop words.
    SegmentBatchOp wordsForCustomRemover = source.link(
        new SegmentBatchOp()
            .setSelectedCol("sentence")
            .setOutputCol("words")
            .setUserDefinedDict("流式计算", "机器学习")
    );
    wordsForCustomRemover
        .link(
            new StopWordsRemoverBatchOp()
                .setSelectedCol("words")
                .setOutputCol("left_words")
                .setStopWords("计算", "2020")
        )
        .print();

    // 5. Segment the news titles of the shared source and show the first 10 rows.
    getSource()
        .select("news_title")
        .link(
            new SegmentBatchOp()
                .setSelectedCol("news_title")
                .setOutputCol("segmented_title")
        )
        .firstN(10)
        .print();
}
use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.
the class Chap21 method c_7.
/**
 * LDA example: segments the text column and removes stop words, trains an LDA model and
 * writes it (plus side output 0) to files when the model file does not exist yet, then
 * predicts with the stored model, evaluates the prediction as a clustering, and prints the
 * {@code word}/{@code topic_N} columns of the second stored file for topics 0 to 9.
 *
 * @throws Exception if building or executing any of the batch jobs fails
 */
private static void c_7() throws Exception {
    final String modelPath = DATA_DIR + LDA_MODEL_FILE;
    final String pwzPath = DATA_DIR + LDA_PWZ_FILE;

    // Pre-processing: keep label and text, segment the text, drop stop words (in place).
    BatchOperator<?> docs = getSource()
        .select(LABEL_COL_NAME + ", " + TXT_COL_NAME)
        .link(new SegmentBatchOp().setSelectedCol(TXT_COL_NAME))
        .link(new StopWordsRemoverBatchOp().setSelectedCol(TXT_COL_NAME));
    docs.lazyPrint(10);

    // Train only when no model file is present; both output files are written in one job.
    boolean modelExists = new File(modelPath).exists();
    if (!modelExists) {
        LdaTrainBatchOp lda = new LdaTrainBatchOp()
            .setTopicNum(10)
            .setNumIter(200)
            .setVocabSize(20000)
            .setSelectedCol(TXT_COL_NAME)
            .setRandomSeed(123);
        docs.link(lda);
        lda.lazyPrintModelInfo();

        lda.link(new AkSinkBatchOp().setFilePath(modelPath));
        lda.getSideOutput(0).link(new AkSinkBatchOp().setFilePath(pwzPath));

        BatchOperator.execute();
    }

    // Prediction with the stored model, followed by cluster evaluation.
    LdaPredictBatchOp predictor = new LdaPredictBatchOp()
        .setSelectedCol(TXT_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol("predinfo");
    AkSourceBatchOp storedModel = new AkSourceBatchOp().setFilePath(modelPath);
    predictor
        .linkFrom(storedModel, docs)
        .lazyPrint(5)
        .link(
            new EvalClusterBatchOp()
                .setLabelCol(LABEL_COL_NAME)
                .setPredictionCol(PREDICTION_COL_NAME)
                .lazyPrintMetrics()
        );

    // Inspect the stored side output: a small sample first, then one ranking per topic column.
    AkSourceBatchOp pwz = new AkSourceBatchOp().setFilePath(pwzPath);
    pwz.sample(0.001).lazyPrint(10);

    int topicIdx = 0;
    while (topicIdx < 10) {
        String topicCol = "topic_" + topicIdx;
        pwz.select("word, " + topicCol)
            .orderBy(topicCol, 20, false)
            .lazyPrint(-1, "topic" + topicIdx);
        topicIdx++;
    }

    BatchOperator.execute();
}
use of com.alibaba.alink.operator.batch.nlp.SegmentBatchOp in project Alink by alibaba.
the class Chap21 method c_6_1.
/**
 * Compares pairs of strings from two columns with five similarity metrics. In batch mode the
 * metrics are computed once on the raw strings and once after segmenting both columns; in
 * stream mode only the raw-string variant is computed and printed. Each metric writes to an
 * output column named after the metric.
 *
 * @throws Exception if the batch or the stream job fails
 */
static void c_6_1() throws Exception {
    // Metric names, applied in this order; each one is also used as its output column name.
    final String[] metrics = { "LEVENSHTEIN", "LEVENSHTEIN_SIM", "LCS", "LCS_SIM", "JACCARD_SIM" };

    Row[] rows = new Row[] { Row.of("机器学习", "机器学习"), Row.of("批式计算", "流式计算"), Row.of("Machine Learning", "ML"), Row.of("Flink", "Alink"), Row.of("Good Morning!", "Good Evening!") };

    // ---- batch ----
    MemSourceBatchOp source = new MemSourceBatchOp(rows, new String[] { "col1", "col2" });
    source.lazyPrint(-1);

    // String-level similarity on the raw columns.
    BatchOperator<?> stringSim = source;
    for (String metric : metrics) {
        stringSim = stringSim.link(
            new StringSimilarityPairwiseBatchOp()
                .setSelectedCols("col1", "col2")
                .setMetric(metric)
                .setOutputCol(metric)
        );
    }
    stringSim.lazyPrint(-1, "\n## StringSimilarityPairwiseBatchOp ##");

    // Text-level similarity: both columns are segmented first.
    BatchOperator<?> textSim = source
        .link(new SegmentBatchOp().setSelectedCol("col1"))
        .link(new SegmentBatchOp().setSelectedCol("col2"));
    for (String metric : metrics) {
        textSim = textSim.link(
            new TextSimilarityPairwiseBatchOp()
                .setSelectedCols("col1", "col2")
                .setMetric(metric)
                .setOutputCol(metric)
        );
    }
    textSim.lazyPrint(-1, "\n## TextSimilarityPairwiseBatchOp ##");

    BatchOperator.execute();

    // ---- stream ----
    MemSourceStreamOp streamSource = new MemSourceStreamOp(rows, new String[] { "col1", "col2" });

    StreamOperator<?> streamSim = streamSource;
    for (String metric : metrics) {
        streamSim = streamSim.link(
            new StringSimilarityPairwiseStreamOp()
                .setSelectedCols("col1", "col2")
                .setMetric(metric)
                .setOutputCol(metric)
        );
    }
    streamSim.print();

    StreamOperator.execute();
}
Aggregations