Example use of com.alibaba.alink.pipeline.similarity.StringNearestNeighbor in the Alink project by Alibaba.
Taken from the class Chap21, method c_6_2.
private static void c_6_2() throws Exception {
BatchOperator.setParallelism(2);
Row[] rows = new Row[] { Row.of("林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣"), Row.of("发酵床的垫料种类有哪些?哪种更好?"), Row.of("京城最值得你来场文化之旅的博物馆"), Row.of("什么是超写实绘画?") };
MemSourceBatchOp target = new MemSourceBatchOp(rows, new String[] { TXT_COL_NAME });
BatchOperator<?> source = getSource();
for (String metric : new String[] { "LEVENSHTEIN", "LCS", "SSK", "COSINE" }) {
new StringNearestNeighbor().setMetric(metric).setSelectedCol(TXT_COL_NAME).setIdCol(TXT_COL_NAME).setTopN(5).setOutputCol("similar_titles").fit(source).transform(target).lazyPrint(-1, "StringNeareastNeighbor + " + metric.toString());
BatchOperator.execute();
}
for (String metric : new String[] { "LEVENSHTEIN", "LCS", "SSK", "COSINE" }) {
new Pipeline().add(new Segment().setSelectedCol(TXT_COL_NAME).setOutputCol("segmented_title")).add(new TextNearestNeighbor().setMetric(metric).setSelectedCol("segmented_title").setIdCol(TXT_COL_NAME).setTopN(5).setOutputCol("similar_titles")).fit(source).transform(target).lazyPrint(-1, "TextNeareastNeighbor + " + metric.toString());
BatchOperator.execute();
}
for (String metric : new String[] { "JACCARD_SIM", "MINHASH_JACCARD_SIM", "SIMHASH_HAMMING_SIM" }) {
new StringApproxNearestNeighbor().setMetric(metric).setSelectedCol(TXT_COL_NAME).setIdCol(TXT_COL_NAME).setTopN(5).setOutputCol("similar_titles").fit(source).transform(target).lazyPrint(-1, "StringApproxNeareastNeighbor + " + metric.toString());
BatchOperator.execute();
}
for (String metric : new String[] { "JACCARD_SIM", "MINHASH_JACCARD_SIM", "SIMHASH_HAMMING_SIM" }) {
new Pipeline().add(new Segment().setSelectedCol(TXT_COL_NAME).setOutputCol("segmented_title")).add(new TextApproxNearestNeighbor().setMetric(metric).setSelectedCol("segmented_title").setIdCol(TXT_COL_NAME).setTopN(5).setOutputCol("similar_titles")).fit(source).transform(target).lazyPrint(-1, "TextApproxNeareastNeighbor + " + metric.toString());
BatchOperator.execute();
}
Pipeline snn = new Pipeline().add(new StringNearestNeighbor().setMetric("LEVENSHTEIN").setSelectedCol(TXT_COL_NAME).setIdCol(TXT_COL_NAME).setTopN(5).setOutputCol("similar_titles"));
Pipeline approx_snn = new Pipeline().add(new StringApproxNearestNeighbor().setMetric("JACCARD_SIM").setSelectedCol(TXT_COL_NAME).setIdCol(TXT_COL_NAME).setTopN(5).setOutputCol("similar_titles"));
Stopwatch sw = new Stopwatch();
if (!new File(DATA_DIR + SNN_MODEL_FILE).exists()) {
sw.reset();
sw.start();
snn.fit(source).save(DATA_DIR + SNN_MODEL_FILE);
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
}
if (!new File(DATA_DIR + APPROX_SNN_MODEL_FILE).exists()) {
sw.reset();
sw.start();
approx_snn.fit(source).save(DATA_DIR + APPROX_SNN_MODEL_FILE);
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
}
BatchOperator<?> target_stock = source.filter("category_name = 'stock'");
BatchOperator<?> target_news_story = source.filter("category_name = 'news_story'");
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + SNN_MODEL_FILE).transform(target_stock).lazyPrint(10, "StringNeareastNeighbor + LEVENSHTEIN");
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + APPROX_SNN_MODEL_FILE).transform(target_stock).lazyPrint(10, "JACCARD_SIM + stock");
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + APPROX_SNN_MODEL_FILE).transform(target_news_story).lazyPrint(10, "JACCARD_SIM + news_story");
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
StreamOperator.setParallelism(1);
StreamOperator<?> stream_target = new MemSourceStreamOp(rows, new String[] { TXT_COL_NAME });
PipelineModel.load(DATA_DIR + SNN_MODEL_FILE).transform(stream_target).print();
StreamOperator.execute();
StreamOperator<?> stream_target_stock = getStreamSource().filter("category_name = 'stock'");
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + APPROX_SNN_MODEL_FILE).transform(stream_target_stock).sample(0.02).print();
StreamOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
}
Aggregations