Search in sources :

Example 1 with TextApproxNearestNeighbor

Use of com.alibaba.alink.pipeline.similarity.TextApproxNearestNeighbor in project Alink by Alibaba.

In class Chap21, method c_6_2.

/**
 * Runs a series of title-similarity experiments against the data set returned by
 * {@code getSource()}.
 *
 * <p>The method proceeds in four stages:
 * <ol>
 *   <li>For each metric, fits a string/text nearest-neighbor estimator (exact and approximate
 *       variants) on the source and prints the top-5 matches for four hard-coded query titles.</li>
 *   <li>Fits and saves a LEVENSHTEIN pipeline and a JACCARD_SIM pipeline, each only when its
 *       model file does not exist yet, printing the elapsed time of each fit.</li>
 *   <li>Loads the saved models and times batch predictions on the rows whose
 *       {@code category_name} is {@code 'stock'} or {@code 'news_story'}.</li>
 *   <li>Repeats prediction in stream mode, first on the four query titles, then on a 2% sample
 *       of the stream-side {@code 'stock'} rows.</li>
 * </ol>
 *
 * <p>NOTE(review): the print titles spell "Neareast"; they are kept verbatim so the printed
 * output is unchanged.
 *
 * @throws Exception propagated from the Alink batch/stream execution and model I/O calls
 */
private static void c_6_2() throws Exception {
    BatchOperator.setParallelism(2);

    // Query titles that every estimator below is asked to find neighbors for.
    Row[] queryRows = new Row[] {
        Row.of("林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣"),
        Row.of("发酵床的垫料种类有哪些?哪种更好?"),
        Row.of("京城最值得你来场文化之旅的博物馆"),
        Row.of("什么是超写实绘画?")
    };
    MemSourceBatchOp queries = new MemSourceBatchOp(queryRows, new String[] { TXT_COL_NAME });
    BatchOperator<?> corpus = getSource();

    // Metric names shared by the string-level and the segmented-text estimators.
    String[] exactMetrics = { "LEVENSHTEIN", "LCS", "SSK", "COSINE" };
    String[] approxMetrics = { "JACCARD_SIM", "MINHASH_JACCARD_SIM", "SIMHASH_HAMMING_SIM" };

    // --- Exact search on the raw title string ---
    for (String metricName : exactMetrics) {
        new StringNearestNeighbor()
            .setMetric(metricName)
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles")
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "StringNeareastNeighbor + " + metricName);
        // One execute per metric, so each lazy print is run before the next fit is set up.
        BatchOperator.execute();
    }

    // --- Exact search on the segmented title ---
    for (String metricName : exactMetrics) {
        Pipeline segmentThenSearch = new Pipeline()
            .add(new Segment()
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol("segmented_title"))
            .add(new TextNearestNeighbor()
                .setMetric(metricName)
                .setSelectedCol("segmented_title")
                .setIdCol(TXT_COL_NAME)
                .setTopN(5)
                .setOutputCol("similar_titles"));
        segmentThenSearch
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "TextNeareastNeighbor + " + metricName);
        BatchOperator.execute();
    }

    // --- Approximate search on the raw title string ---
    for (String metricName : approxMetrics) {
        new StringApproxNearestNeighbor()
            .setMetric(metricName)
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles")
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "StringApproxNeareastNeighbor + " + metricName);
        BatchOperator.execute();
    }

    // --- Approximate search on the segmented title ---
    for (String metricName : approxMetrics) {
        Pipeline segmentThenApproxSearch = new Pipeline()
            .add(new Segment()
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol("segmented_title"))
            .add(new TextApproxNearestNeighbor()
                .setMetric(metricName)
                .setSelectedCol("segmented_title")
                .setIdCol(TXT_COL_NAME)
                .setTopN(5)
                .setOutputCol("similar_titles"));
        segmentThenApproxSearch
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "TextApproxNeareastNeighbor + " + metricName);
        BatchOperator.execute();
    }

    // Single-stage pipelines whose fitted models are persisted and reused below.
    Pipeline exactPipeline = new Pipeline()
        .add(new StringNearestNeighbor()
            .setMetric("LEVENSHTEIN")
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles"));
    Pipeline approxPipeline = new Pipeline()
        .add(new StringApproxNearestNeighbor()
            .setMetric("JACCARD_SIM")
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles"));

    Stopwatch timer = new Stopwatch();
    String exactModelPath = DATA_DIR + SNN_MODEL_FILE;
    String approxModelPath = DATA_DIR + APPROX_SNN_MODEL_FILE;

    // Fit and save the exact model only when its file is absent; report how long it took.
    if (!new File(exactModelPath).exists()) {
        timer.reset();
        timer.start();
        exactPipeline.fit(corpus).save(exactModelPath);
        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }

    // Same for the approximate model.
    if (!new File(approxModelPath).exists()) {
        timer.reset();
        timer.start();
        approxPipeline.fit(corpus).save(approxModelPath);
        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }

    // Category-restricted subsets of the source used as prediction input.
    BatchOperator<?> stockTitles = corpus.filter("category_name = 'stock'");
    BatchOperator<?> newsStoryTitles = corpus.filter("category_name = 'news_story'");

    // Timed batch prediction: exact model on 'stock'.
    timer.reset();
    timer.start();
    PipelineModel
        .load(exactModelPath)
        .transform(stockTitles)
        .lazyPrint(10, "StringNeareastNeighbor + LEVENSHTEIN");
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Timed batch prediction: approximate model on 'stock'.
    timer.reset();
    timer.start();
    PipelineModel
        .load(approxModelPath)
        .transform(stockTitles)
        .lazyPrint(10, "JACCARD_SIM + stock");
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Timed batch prediction: approximate model on 'news_story'.
    timer.reset();
    timer.start();
    PipelineModel
        .load(approxModelPath)
        .transform(newsStoryTitles)
        .lazyPrint(10, "JACCARD_SIM + news_story");
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Stream mode: exact model applied to the four query titles (not timed).
    StreamOperator.setParallelism(1);
    StreamOperator<?> streamQueries = new MemSourceStreamOp(queryRows, new String[] { TXT_COL_NAME });
    PipelineModel
        .load(exactModelPath)
        .transform(streamQueries)
        .print();
    StreamOperator.execute();

    // Stream mode: approximate model on the 'stock' rows, printing a 0.02 sample, timed.
    StreamOperator<?> streamStockTitles = getStreamSource().filter("category_name = 'stock'");
    timer.reset();
    timer.start();
    PipelineModel
        .load(approxModelPath)
        .transform(streamStockTitles)
        .sample(0.02)
        .print();
    StreamOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
Also used : MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) TextApproxNearestNeighbor(com.alibaba.alink.pipeline.similarity.TextApproxNearestNeighbor) StringApproxNearestNeighbor(com.alibaba.alink.pipeline.similarity.StringApproxNearestNeighbor) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) Segment(com.alibaba.alink.pipeline.nlp.Segment) Pipeline(com.alibaba.alink.pipeline.Pipeline) MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Row(org.apache.flink.types.Row) TextNearestNeighbor(com.alibaba.alink.pipeline.similarity.TextNearestNeighbor) File(java.io.File) StringNearestNeighbor(com.alibaba.alink.pipeline.similarity.StringNearestNeighbor)

Aggregations

Stopwatch (com.alibaba.alink.common.utils.Stopwatch)1 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)1 MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp)1 Pipeline (com.alibaba.alink.pipeline.Pipeline)1 Segment (com.alibaba.alink.pipeline.nlp.Segment)1 StringApproxNearestNeighbor (com.alibaba.alink.pipeline.similarity.StringApproxNearestNeighbor)1 StringNearestNeighbor (com.alibaba.alink.pipeline.similarity.StringNearestNeighbor)1 TextApproxNearestNeighbor (com.alibaba.alink.pipeline.similarity.TextApproxNearestNeighbor)1 TextNearestNeighbor (com.alibaba.alink.pipeline.similarity.TextNearestNeighbor)1 File (java.io.File)1 Row (org.apache.flink.types.Row)1