Search in sources :

Example 31 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class Chap21 method c_6_2.

/**
 * Runs a series of nearest-neighbor searches over text titles.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>string-level and (after segmentation) text-level exact nearest neighbor, once per metric;</li>
 *   <li>the approximate variants of both, once per metric;</li>
 *   <li>fit-and-save of two pipeline models, skipped when the model file already exists;</li>
 *   <li>timed batch predictions with the saved models on filtered subsets of the source;</li>
 *   <li>stream predictions with the saved models.</li>
 * </ol>
 * Elapsed times are written to standard output.
 *
 * @throws Exception if any batch/stream job or model load/save fails
 */
private static void c_6_2() throws Exception {
    BatchOperator.setParallelism(2);

    // Small in-memory set of query titles used for the per-metric demonstrations.
    Row[] queryRows = new Row[] {
        Row.of("林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣"),
        Row.of("发酵床的垫料种类有哪些?哪种更好?"),
        Row.of("京城最值得你来场文化之旅的博物馆"),
        Row.of("什么是超写实绘画?")
    };
    MemSourceBatchOp queries = new MemSourceBatchOp(queryRows, new String[] { TXT_COL_NAME });
    BatchOperator<?> corpus = getSource();

    String[] exactMetrics = { "LEVENSHTEIN", "LCS", "SSK", "COSINE" };
    String[] approxMetrics = { "JACCARD_SIM", "MINHASH_JACCARD_SIM", "SIMHASH_HAMMING_SIM" };

    // Exact search on the raw title strings.
    for (String metric : exactMetrics) {
        new StringNearestNeighbor()
            .setMetric(metric)
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles")
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "StringNeareastNeighbor + " + metric);
        BatchOperator.execute();
    }

    // Exact search on segmented titles.
    for (String metric : exactMetrics) {
        Pipeline segmentThenSearch = new Pipeline()
            .add(new Segment()
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol("segmented_title"))
            .add(new TextNearestNeighbor()
                .setMetric(metric)
                .setSelectedCol("segmented_title")
                .setIdCol(TXT_COL_NAME)
                .setTopN(5)
                .setOutputCol("similar_titles"));
        segmentThenSearch
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "TextNeareastNeighbor + " + metric);
        BatchOperator.execute();
    }

    // Approximate search on the raw title strings.
    for (String metric : approxMetrics) {
        new StringApproxNearestNeighbor()
            .setMetric(metric)
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles")
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "StringApproxNeareastNeighbor + " + metric);
        BatchOperator.execute();
    }

    // Approximate search on segmented titles.
    for (String metric : approxMetrics) {
        Pipeline segmentThenSearch = new Pipeline()
            .add(new Segment()
                .setSelectedCol(TXT_COL_NAME)
                .setOutputCol("segmented_title"))
            .add(new TextApproxNearestNeighbor()
                .setMetric(metric)
                .setSelectedCol("segmented_title")
                .setIdCol(TXT_COL_NAME)
                .setTopN(5)
                .setOutputCol("similar_titles"));
        segmentThenSearch
            .fit(corpus)
            .transform(queries)
            .lazyPrint(-1, "TextApproxNeareastNeighbor + " + metric);
        BatchOperator.execute();
    }

    Pipeline exactPipeline = new Pipeline()
        .add(new StringNearestNeighbor()
            .setMetric("LEVENSHTEIN")
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles"));
    Pipeline approxPipeline = new Pipeline()
        .add(new StringApproxNearestNeighbor()
            .setMetric("JACCARD_SIM")
            .setSelectedCol(TXT_COL_NAME)
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles"));

    String exactModelPath = DATA_DIR + SNN_MODEL_FILE;
    String approxModelPath = DATA_DIR + APPROX_SNN_MODEL_FILE;
    Stopwatch timer = new Stopwatch();

    // Train and persist each model only when its file is not there yet.
    if (!new File(exactModelPath).exists()) {
        timer.reset();
        timer.start();
        exactPipeline.fit(corpus).save(exactModelPath);
        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }
    if (!new File(approxModelPath).exists()) {
        timer.reset();
        timer.start();
        approxPipeline.fit(corpus).save(approxModelPath);
        BatchOperator.execute();
        timer.stop();
        System.out.println(timer.getElapsedTimeSpan());
    }

    BatchOperator<?> stockTitles = corpus.filter("category_name = 'stock'");
    BatchOperator<?> newsStoryTitles = corpus.filter("category_name = 'news_story'");

    // Timed batch prediction: exact model on the 'stock' subset.
    timer.reset();
    timer.start();
    PipelineModel.load(exactModelPath)
        .transform(stockTitles)
        .lazyPrint(10, "StringNeareastNeighbor + LEVENSHTEIN");
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Timed batch prediction: approximate model on the 'stock' subset.
    timer.reset();
    timer.start();
    PipelineModel.load(approxModelPath)
        .transform(stockTitles)
        .lazyPrint(10, "JACCARD_SIM + stock");
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Timed batch prediction: approximate model on the 'news_story' subset.
    timer.reset();
    timer.start();
    PipelineModel.load(approxModelPath)
        .transform(newsStoryTitles)
        .lazyPrint(10, "JACCARD_SIM + news_story");
    BatchOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());

    // Stream prediction with the exact model on the in-memory queries (not timed).
    StreamOperator.setParallelism(1);
    StreamOperator<?> streamQueries = new MemSourceStreamOp(queryRows, new String[] { TXT_COL_NAME });
    PipelineModel.load(exactModelPath).transform(streamQueries).print();
    StreamOperator.execute();

    // Timed stream prediction with the approximate model; only a 2% sample is printed.
    StreamOperator<?> streamStockTitles = getStreamSource().filter("category_name = 'stock'");
    timer.reset();
    timer.start();
    PipelineModel.load(approxModelPath)
        .transform(streamStockTitles)
        .sample(0.02)
        .print();
    StreamOperator.execute();
    timer.stop();
    System.out.println(timer.getElapsedTimeSpan());
}
Also used : MemSourceStreamOp(com.alibaba.alink.operator.stream.source.MemSourceStreamOp) TextApproxNearestNeighbor(com.alibaba.alink.pipeline.similarity.TextApproxNearestNeighbor) StringApproxNearestNeighbor(com.alibaba.alink.pipeline.similarity.StringApproxNearestNeighbor) Stopwatch(com.alibaba.alink.common.utils.Stopwatch) Segment(com.alibaba.alink.pipeline.nlp.Segment) Pipeline(com.alibaba.alink.pipeline.Pipeline) MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) Row(org.apache.flink.types.Row) TextNearestNeighbor(com.alibaba.alink.pipeline.similarity.TextNearestNeighbor) File(java.io.File) StringNearestNeighbor(com.alibaba.alink.pipeline.similarity.StringNearestNeighbor)

Example 32 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class Chap23 method c_2.

/**
 * Prepares the labeled train/test data files if needed, then trains and evaluates
 * two text classification pipelines that differ only in the vectorizer
 * ({@code DocCountVectorizer} vs. {@code DocHashCountVectorizer}).
 *
 * <p>Data preparation runs only when the train file does not exist yet: the
 * {@code pos} and {@code neg} subfolders of the {@code train} and {@code test}
 * folders under {@code ORIGIN_DATA_DIR} are read and written out as Ak files.
 *
 * @throws IllegalStateException if one of the expected label folders cannot be listed
 * @throws Exception if reading a file or running a batch job fails
 */
static void c_2() throws Exception {
    if (!new File(DATA_DIR + TRAIN_FILE).exists()) {
        ArrayList<Row> trainRows = loadLabeledRows("train");
        ArrayList<Row> testRows = loadLabeledRows("test");
        new MemSourceBatchOp(trainRows, COL_NAMES).link(new AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE));
        new MemSourceBatchOp(testRows, COL_NAMES).link(new AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE));
        BatchOperator.execute();
    }
    AkSourceBatchOp train_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp test_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
    train_set.lazyPrint(2);

    // Pipeline 1: tokenizer -> word-count vectors -> logistic regression.
    new Pipeline()
        .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
        .add(new DocCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME).enableLazyPrintTransformData(1))
        .add(new LogisticRegression().setMaxIter(30).setVectorCol(VECTOR_COL_NAME).setLabelCol(LABEL_COL_NAME).setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME))
        .fit(train_set)
        .transform(test_set)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("pos").setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME).lazyPrintMetrics("DocCountVectorizer"));
    BatchOperator.execute();

    // Pipeline 2: same stages, with the hashing vectorizer instead.
    new Pipeline()
        .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol(TXT_COL_NAME))
        .add(new DocHashCountVectorizer().setFeatureType("WORD_COUNT").setSelectedCol(TXT_COL_NAME).setOutputCol(VECTOR_COL_NAME).enableLazyPrintTransformData(1))
        .add(new LogisticRegression().setMaxIter(30).setVectorCol(VECTOR_COL_NAME).setLabelCol(LABEL_COL_NAME).setPredictionCol(PREDICTION_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME))
        .fit(train_set)
        .transform(test_set)
        .link(new EvalBinaryClassBatchOp().setPositiveLabelValueString("pos").setLabelCol(LABEL_COL_NAME).setPredictionDetailCol(PRED_DETAIL_COL_NAME).lazyPrintMetrics("DocHashCountVectorizer"));
    BatchOperator.execute();
}

/**
 * Reads every file in the {@code pos} and {@code neg} subfolders of
 * {@code ORIGIN_DATA_DIR + split} into (label, content) rows.
 *
 * @param split name of the folder under {@code ORIGIN_DATA_DIR} ("train" or "test")
 * @return one row per file, "pos" rows first, then "neg" rows
 * @throws IllegalStateException if a label folder is missing or cannot be listed
 * @throws Exception if reading a file fails
 */
private static ArrayList<Row> loadLabeledRows(String split) throws Exception {
    ArrayList<Row> rows = new ArrayList<>();
    for (String label : new String[] { "pos", "neg" }) {
        File subfolder = new File(ORIGIN_DATA_DIR + split + File.separator + label);
        // listFiles() returns null (not an empty array) when the path is not a
        // readable directory; fail with a clear message instead of an NPE.
        File[] files = subfolder.listFiles();
        if (files == null) {
            throw new IllegalStateException("Cannot list files in '" + subfolder.getAbsolutePath() + "': directory is missing or unreadable.");
        }
        for (File f : files) {
            rows.add(Row.of(label, readFileContent(f)));
        }
    }
    return rows;
}
Also used : ArrayList(java.util.ArrayList) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp) MemSourceBatchOp(com.alibaba.alink.operator.batch.source.MemSourceBatchOp) AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) Row(org.apache.flink.types.Row) AkSinkBatchOp(com.alibaba.alink.operator.batch.sink.AkSinkBatchOp) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) File(java.io.File) DocHashCountVectorizer(com.alibaba.alink.pipeline.nlp.DocHashCountVectorizer)

Example 33 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class Chap23 method c_3.

/**
 * Trains and evaluates two logistic-regression pipelines that extend the
 * word-count features with n-gram count features: one adds 2-grams, the other
 * adds both 2-grams and 3-grams. Binary classification metrics for each are
 * printed lazily under the titles "NGram 2" and "NGram 2 and 3".
 *
 * @throws Exception if a batch job fails
 */
static void c_3() throws Exception {
    AkSourceBatchOp trainSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
    AkSourceBatchOp testSet = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);

    // ---- word counts + 2-gram counts ----
    Pipeline bigramPipeline = new Pipeline();
    bigramPipeline.add(new RegexTokenizer()
        .setPattern("\\W+")
        .setSelectedCol(TXT_COL_NAME));
    bigramPipeline.add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol(VECTOR_COL_NAME));
    bigramPipeline.add(new NGram()
        .setN(2)
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol("v_2")
        .enableLazyPrintTransformData(1, "2-gram"));
    // The 2-gram column is vectorized in place ("v_2" -> "v_2").
    bigramPipeline.add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol("v_2")
        .setOutputCol("v_2"));
    bigramPipeline.add(new VectorAssembler()
        .setSelectedCols(VECTOR_COL_NAME, "v_2")
        .setOutputCol(VECTOR_COL_NAME));
    bigramPipeline.add(new LogisticRegression()
        .setMaxIter(30)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    bigramPipeline
        .fit(trainSet)
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2"));
    BatchOperator.execute();

    // ---- word counts + 2-gram counts + 3-gram counts ----
    Pipeline trigramPipeline = new Pipeline();
    trigramPipeline.add(new RegexTokenizer()
        .setPattern("\\W+")
        .setSelectedCol(TXT_COL_NAME));
    trigramPipeline.add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol(VECTOR_COL_NAME));
    trigramPipeline.add(new NGram()
        .setN(2)
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol("v_2"));
    trigramPipeline.add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol("v_2")
        .setOutputCol("v_2"));
    trigramPipeline.add(new NGram()
        .setN(3)
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol("v_3"));
    // Only the 3-gram vectorizer sets an explicit vocabulary size.
    trigramPipeline.add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setVocabSize(10000)
        .setSelectedCol("v_3")
        .setOutputCol("v_3"));
    trigramPipeline.add(new VectorAssembler()
        .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
        .setOutputCol(VECTOR_COL_NAME));
    trigramPipeline.add(new LogisticRegression()
        .setMaxIter(30)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME));
    trigramPipeline
        .fit(trainSet)
        .transform(testSet)
        .link(new EvalBinaryClassBatchOp()
            .setPositiveLabelValueString("pos")
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
            .lazyPrintMetrics("NGram 2 and 3"));
    BatchOperator.execute();
}
Also used : AkSourceBatchOp(com.alibaba.alink.operator.batch.source.AkSourceBatchOp) VectorAssembler(com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler) RegexTokenizer(com.alibaba.alink.pipeline.nlp.RegexTokenizer) NGram(com.alibaba.alink.pipeline.nlp.NGram) DocCountVectorizer(com.alibaba.alink.pipeline.nlp.DocCountVectorizer) LogisticRegression(com.alibaba.alink.pipeline.classification.LogisticRegression) Pipeline(com.alibaba.alink.pipeline.Pipeline) EvalBinaryClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)

Example 34 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class Chap25 method softmax.

/**
 * Fits a single-stage Softmax pipeline on {@code train_set}, predicts on
 * {@code test_set}, and lazily prints multi-class evaluation metrics.
 * Batch parallelism is set to 1 before the job is built.
 *
 * @param train_set data used to fit the model; read through columns "vec" and "label"
 * @param test_set  data to predict on; predictions are written to column "pred"
 * @throws Exception if the batch job fails
 */
public static void softmax(BatchOperator<?> train_set, BatchOperator<?> test_set) throws Exception {
    BatchOperator.setParallelism(1);

    Pipeline softmaxPipeline = new Pipeline();
    softmaxPipeline.add(new Softmax()
        .setVectorCol("vec")
        .setLabelCol("label")
        .setPredictionCol("pred"));

    softmaxPipeline
        .fit(train_set)
        .transform(test_set)
        .link(new EvalMultiClassBatchOp()
            .setLabelCol("label")
            .setPredictionCol("pred")
            .lazyPrintMetrics());
    BatchOperator.execute();
}
Also used : EvalMultiClassBatchOp(com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp) Softmax(com.alibaba.alink.pipeline.classification.Softmax) Pipeline(com.alibaba.alink.pipeline.Pipeline)

Example 35 with Pipeline

use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.

the class BaseTuning method findBestTVSplit.

/**
 * Picks the best candidate pipeline using a single train/validation split.
 *
 * <p>The input is shuffled and split by {@code ratio}; every candidate is fit on
 * the split's main output and scored by {@code tuningEvaluator} on its side
 * output. Candidates whose evaluation throws, or whose metric is NaN, are
 * recorded in the report and skipped.
 *
 * @param in         data to split into training and validation parts
 * @param ratio      fraction passed to the split operator
 * @param candidates source of candidate pipelines
 * @return the best candidate pipeline together with the report of all candidates
 * @throws RuntimeException if no candidate produced a usable metric, or if a
 *                          candidate could not be cloned
 */
protected Tuple2<Pipeline, Report> findBestTVSplit(BatchOperator<?> in, double ratio, PipelineCandidatesBase candidates) {
    int candidateCount = candidates.size();

    SplitBatchOp sbo = new SplitBatchOp()
        .setFraction(ratio)
        .setMLEnvironmentId(getMLEnvironmentId())
        .linkFrom(
            new TableSourceBatchOp(
                DataSetConversionUtil.toTable(in.getMLEnvironmentId(), shuffle(in.getDataSet()), in.getSchema()))
                .setMLEnvironmentId(getMLEnvironmentId()));

    int bestIdx = -1;
    double bestMetric = 0.;
    // One score per candidate, in index order; NaN marks a failed or unusable run.
    ArrayList<Double> scoreHistory = new ArrayList<>(candidateCount);
    List<Report.ReportElement> reportElements = new ArrayList<>();

    for (int idx = 0; idx < candidateCount; idx++) {
        Tuple2<Pipeline, List<Tuple3<Integer, ParamInfo, Object>>> candidate;
        try {
            candidate = candidates.get(idx, scoreHistory);
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }

        double metric = Double.NaN;
        try {
            metric = tuningEvaluator.evaluate(candidate.f0.fit(sbo).transform(sbo.getSideOutput(0)));
        } catch (Exception ex) {
            // A failing candidate must not abort the search: log it, record NaN, move on.
            String stackTrace = ExceptionUtils.stringifyException(ex);
            if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
                System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f, exception: %s", idx, bestMetric, metric, stackTrace));
            }
            scoreHistory.add(idx, metric);
            reportElements.add(new Report.ReportElement(candidate.f0, candidate.f1, metric, stackTrace));
            continue;
        }

        scoreHistory.add(idx, metric);

        if (Double.isNaN(metric)) {
            reportElements.add(new Report.ReportElement(candidate.f0, candidate.f1, metric, "Metric is nan."));
            continue;
        }
        reportElements.add(new Report.ReportElement(candidate.f0, candidate.f1, metric));

        // The first usable metric always wins; later ones must be strictly better.
        boolean improved = bestIdx == -1
            || (tuningEvaluator.isLargerBetter() ? bestMetric < metric : bestMetric > metric);
        if (improved) {
            bestMetric = metric;
            bestIdx = idx;
        }

        if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
            System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f", idx, bestMetric, metric));
        }
    }

    if (bestIdx < 0) {
        throw new RuntimeException("Can not find a best model. Report: " + new Report(tuningEvaluator, reportElements).toPrettyJson());
    }

    try {
        Pipeline bestPipeline = candidates.get(bestIdx, scoreHistory).f0;
        return Tuple2.of(bestPipeline, new Report(tuningEvaluator, reportElements));
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
}
Also used : ArrayList(java.util.ArrayList) TableSourceBatchOp(com.alibaba.alink.operator.batch.source.TableSourceBatchOp) SplitBatchOp(com.alibaba.alink.operator.batch.dataproc.SplitBatchOp) Pipeline(com.alibaba.alink.pipeline.Pipeline) ArrayList(java.util.ArrayList) List(java.util.List) ParamInfo(org.apache.flink.ml.api.misc.param.ParamInfo)

Aggregations

Pipeline (com.alibaba.alink.pipeline.Pipeline)63 Test (org.junit.Test)38 PipelineModel (com.alibaba.alink.pipeline.PipelineModel)34 LogisticRegression (com.alibaba.alink.pipeline.classification.LogisticRegression)20 Row (org.apache.flink.types.Row)18 BatchOperator (com.alibaba.alink.operator.batch.BatchOperator)16 MemSourceBatchOp (com.alibaba.alink.operator.batch.source.MemSourceBatchOp)16 VectorAssembler (com.alibaba.alink.pipeline.dataproc.vector.VectorAssembler)11 AkSourceBatchOp (com.alibaba.alink.operator.batch.source.AkSourceBatchOp)10 CollectSinkStreamOp (com.alibaba.alink.operator.stream.sink.CollectSinkStreamOp)9 EvalBinaryClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalBinaryClassBatchOp)8 MemSourceStreamOp (com.alibaba.alink.operator.stream.source.MemSourceStreamOp)7 File (java.io.File)5 ArrayList (java.util.ArrayList)5 EvalMultiClassBatchOp (com.alibaba.alink.operator.batch.evaluation.EvalMultiClassBatchOp)4 StandardScaler (com.alibaba.alink.pipeline.dataproc.StandardScaler)4 Stopwatch (com.alibaba.alink.common.utils.Stopwatch)3 CsvSourceBatchOp (com.alibaba.alink.operator.batch.source.CsvSourceBatchOp)3 KMeans (com.alibaba.alink.pipeline.clustering.KMeans)3 VectorToTensor (com.alibaba.alink.pipeline.dataproc.VectorToTensor)3