Use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.
Class Chap21, method c_6_2: builds exact and approximate nearest-neighbor searches over short Chinese titles, times saving and loading the fitted models, and applies a saved model to both batch and stream sources.
private static void c_6_2() throws Exception {
BatchOperator.setParallelism(2);
// Sample Chinese titles used as the query set (the last one asks "What is hyperrealist painting?").
Row[] rows = new Row[] {
    Row.of("林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣"),
    Row.of("发酵床的垫料种类有哪些?哪种更好?"),
    Row.of("京城最值得你来场文化之旅的博物馆"),
    Row.of("什么是超写实绘画?")
};
MemSourceBatchOp target = new MemSourceBatchOp(rows, new String[] { TXT_COL_NAME });
BatchOperator<?> source = getSource();
for (String metric : new String[] { "LEVENSHTEIN", "LCS", "SSK", "COSINE" }) {
    new StringNearestNeighbor()
        .setMetric(metric)
        .setSelectedCol(TXT_COL_NAME)
        .setIdCol(TXT_COL_NAME)
        .setTopN(5)
        .setOutputCol("similar_titles")
        .fit(source)
        .transform(target)
        .lazyPrint(-1, "StringNearestNeighbor + " + metric);
    BatchOperator.execute();
}
for (String metric : new String[] { "LEVENSHTEIN", "LCS", "SSK", "COSINE" }) {
    new Pipeline()
        .add(new Segment()
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("segmented_title"))
        .add(new TextNearestNeighbor()
            .setMetric(metric)
            .setSelectedCol("segmented_title")
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles"))
        .fit(source)
        .transform(target)
        .lazyPrint(-1, "TextNearestNeighbor + " + metric);
    BatchOperator.execute();
}
for (String metric : new String[] { "JACCARD_SIM", "MINHASH_JACCARD_SIM", "SIMHASH_HAMMING_SIM" }) {
    new StringApproxNearestNeighbor()
        .setMetric(metric)
        .setSelectedCol(TXT_COL_NAME)
        .setIdCol(TXT_COL_NAME)
        .setTopN(5)
        .setOutputCol("similar_titles")
        .fit(source)
        .transform(target)
        .lazyPrint(-1, "StringApproxNearestNeighbor + " + metric);
    BatchOperator.execute();
}
for (String metric : new String[] { "JACCARD_SIM", "MINHASH_JACCARD_SIM", "SIMHASH_HAMMING_SIM" }) {
    new Pipeline()
        .add(new Segment()
            .setSelectedCol(TXT_COL_NAME)
            .setOutputCol("segmented_title"))
        .add(new TextApproxNearestNeighbor()
            .setMetric(metric)
            .setSelectedCol("segmented_title")
            .setIdCol(TXT_COL_NAME)
            .setTopN(5)
            .setOutputCol("similar_titles"))
        .fit(source)
        .transform(target)
        .lazyPrint(-1, "TextApproxNearestNeighbor + " + metric);
    BatchOperator.execute();
}
Pipeline snn = new Pipeline().add(
    new StringNearestNeighbor()
        .setMetric("LEVENSHTEIN")
        .setSelectedCol(TXT_COL_NAME)
        .setIdCol(TXT_COL_NAME)
        .setTopN(5)
        .setOutputCol("similar_titles"));
Pipeline approx_snn = new Pipeline().add(
    new StringApproxNearestNeighbor()
        .setMetric("JACCARD_SIM")
        .setSelectedCol(TXT_COL_NAME)
        .setIdCol(TXT_COL_NAME)
        .setTopN(5)
        .setOutputCol("similar_titles"));
Stopwatch sw = new Stopwatch();
if (!new File(DATA_DIR + SNN_MODEL_FILE).exists()) {
sw.reset();
sw.start();
snn.fit(source).save(DATA_DIR + SNN_MODEL_FILE);
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
}
if (!new File(DATA_DIR + APPROX_SNN_MODEL_FILE).exists()) {
sw.reset();
sw.start();
approx_snn.fit(source).save(DATA_DIR + APPROX_SNN_MODEL_FILE);
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
}
BatchOperator<?> target_stock = source.filter("category_name = 'stock'");
BatchOperator<?> target_news_story = source.filter("category_name = 'news_story'");
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + SNN_MODEL_FILE).transform(target_stock).lazyPrint(10, "StringNearestNeighbor + LEVENSHTEIN");
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + APPROX_SNN_MODEL_FILE).transform(target_stock).lazyPrint(10, "JACCARD_SIM + stock");
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + APPROX_SNN_MODEL_FILE).transform(target_news_story).lazyPrint(10, "JACCARD_SIM + news_story");
BatchOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
StreamOperator.setParallelism(1);
StreamOperator<?> stream_target = new MemSourceStreamOp(rows, new String[] { TXT_COL_NAME });
PipelineModel.load(DATA_DIR + SNN_MODEL_FILE).transform(stream_target).print();
StreamOperator.execute();
StreamOperator<?> stream_target_stock = getStreamSource().filter("category_name = 'stock'");
sw.reset();
sw.start();
PipelineModel.load(DATA_DIR + APPROX_SNN_MODEL_FILE).transform(stream_target_stock).sample(0.02).print();
StreamOperator.execute();
sw.stop();
System.out.println(sw.getElapsedTimeSpan());
}
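The exact metrics (LEVENSHTEIN, LCS, SSK, COSINE) score each query against every indexed row, while the approximate metrics work on compact signatures (MinHash and SimHash, as the metric names suggest), which is presumably why the example times both variants with the Stopwatch. For running the core pattern outside of Chap21, here is a self-contained sketch; the import paths follow Alink's usual package layout but should be checked against the version in use:

import org.apache.flink.types.Row;

import com.alibaba.alink.operator.batch.source.MemSourceBatchOp;
import com.alibaba.alink.pipeline.similarity.StringNearestNeighbor;

public class StringNnDemo {

    public static void main(String[] args) throws Exception {
        // A tiny in-memory corpus; the same rows serve as both index and queries.
        MemSourceBatchOp titles = new MemSourceBatchOp(
            new Row[] {
                Row.of("京城最值得你来场文化之旅的博物馆"),
                Row.of("什么是超写实绘画?")
            },
            new String[] { "title" });

        new StringNearestNeighbor()
            .setMetric("LEVENSHTEIN")
            .setSelectedCol("title")
            .setIdCol("title")
            .setTopN(2)
            .setOutputCol("similar_titles")
            .fit(titles)         // builds the similarity index
            .transform(titles)   // queries it with the same rows
            .print();            // print() triggers the batch job
    }
}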
Use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.
Class Chap23, method c_2: loads labeled pos/neg text files into train and test sets, then compares DocCountVectorizer and DocHashCountVectorizer features for a logistic-regression sentiment classifier.
static void c_2() throws Exception {
if (!new File(DATA_DIR + TRAIN_FILE).exists()) {
ArrayList<Row> trainRows = new ArrayList<>();
ArrayList<Row> testRows = new ArrayList<>();
for (String label : new String[] { "pos", "neg" }) {
File subfolder = new File(ORIGIN_DATA_DIR + "train" + File.separator + label);
for (File f : subfolder.listFiles()) {
trainRows.add(Row.of(label, readFileContent(f)));
}
}
for (String label : new String[] { "pos", "neg" }) {
File subfolder = new File(ORIGIN_DATA_DIR + "test" + File.separator + label);
for (File f : subfolder.listFiles()) {
testRows.add(Row.of(label, readFileContent(f)));
}
}
new MemSourceBatchOp(trainRows, COL_NAMES).link(new AkSinkBatchOp().setFilePath(DATA_DIR + TRAIN_FILE));
new MemSourceBatchOp(testRows, COL_NAMES).link(new AkSinkBatchOp().setFilePath(DATA_DIR + TEST_FILE));
BatchOperator.execute();
}
AkSourceBatchOp train_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
AkSourceBatchOp test_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
train_set.lazyPrint(2);
new Pipeline()
    .add(new RegexTokenizer()
        .setPattern("\\W+")
        .setSelectedCol(TXT_COL_NAME))
    .add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol(VECTOR_COL_NAME)
        .enableLazyPrintTransformData(1))
    .add(new LogisticRegression()
        .setMaxIter(30)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
    .fit(train_set)
    .transform(test_set)
    .link(new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("pos")
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics("DocCountVectorizer"));
BatchOperator.execute();
new Pipeline()
    .add(new RegexTokenizer()
        .setPattern("\\W+")
        .setSelectedCol(TXT_COL_NAME))
    .add(new DocHashCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol(VECTOR_COL_NAME)
        .enableLazyPrintTransformData(1))
    .add(new LogisticRegression()
        .setMaxIter(30)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
    .fit(train_set)
    .transform(test_set)
    .link(new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("pos")
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics("DocHashCountVectorizer"));
BatchOperator.execute();
}
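The two runs differ only in the vectorizer. DocCountVectorizer builds an explicit vocabulary from the training corpus; DocHashCountVectorizer instead hashes tokens into a fixed-width vector, which avoids the vocabulary pass and bounds model size at the cost of possible hash collisions. A minimal sketch of bounding each variant's dimensionality follows; setVocabSize is used in c_3 below, while setNumFeatures is our assumption for the hashed vectorizer and should be verified against the Alink docs:

new DocCountVectorizer()
    .setFeatureType("WORD_COUNT")
    .setVocabSize(20000)          // keep only the most frequent terms
    .setSelectedCol(TXT_COL_NAME)
    .setOutputCol(VECTOR_COL_NAME);

new DocHashCountVectorizer()
    .setFeatureType("WORD_COUNT")
    .setNumFeatures(1 << 18)      // fixed hash space, independent of corpus size (assumed setter)
    .setSelectedCol(TXT_COL_NAME)
    .setOutputCol(VECTOR_COL_NAME);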
Use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.
Class Chap23, method c_3: extends the bag-of-words pipeline with 2-gram and 3-gram count features, assembled into a single feature vector before logistic regression.
static void c_3() throws Exception {
AkSourceBatchOp train_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE);
AkSourceBatchOp test_set = new AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE);
new Pipeline()
    .add(new RegexTokenizer()
        .setPattern("\\W+")
        .setSelectedCol(TXT_COL_NAME))
    .add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol(VECTOR_COL_NAME))
    .add(new NGram()
        .setN(2)
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol("v_2")
        .enableLazyPrintTransformData(1, "2-gram"))
    .add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol("v_2")
        .setOutputCol("v_2"))
    .add(new VectorAssembler()
        .setSelectedCols(VECTOR_COL_NAME, "v_2")
        .setOutputCol(VECTOR_COL_NAME))
    .add(new LogisticRegression()
        .setMaxIter(30)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
    .fit(train_set)
    .transform(test_set)
    .link(new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("pos")
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics("NGram 2"));
BatchOperator.execute();
new Pipeline()
    .add(new RegexTokenizer()
        .setPattern("\\W+")
        .setSelectedCol(TXT_COL_NAME))
    .add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol(VECTOR_COL_NAME))
    .add(new NGram()
        .setN(2)
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol("v_2"))
    .add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setSelectedCol("v_2")
        .setOutputCol("v_2"))
    .add(new NGram()
        .setN(3)
        .setSelectedCol(TXT_COL_NAME)
        .setOutputCol("v_3"))
    .add(new DocCountVectorizer()
        .setFeatureType("WORD_COUNT")
        .setVocabSize(10000)
        .setSelectedCol("v_3")
        .setOutputCol("v_3"))
    .add(new VectorAssembler()
        .setSelectedCols(VECTOR_COL_NAME, "v_2", "v_3")
        .setOutputCol(VECTOR_COL_NAME))
    .add(new LogisticRegression()
        .setMaxIter(30)
        .setVectorCol(VECTOR_COL_NAME)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME))
    .fit(train_set)
    .transform(test_set)
    .link(new EvalBinaryClassBatchOp()
        .setPositiveLabelValueString("pos")
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
        .lazyPrintMetrics("NGram 2 and 3"));
BatchOperator.execute();
}
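NGram rewrites each token sequence into n-gram "words", so the downstream DocCountVectorizer counts phrase features exactly as it counts unigrams, and VectorAssembler concatenates the per-order vectors into one feature vector. A tiny standalone check of what the 2-gram step emits (column names are arbitrary; the exact output formatting is whatever the installed Alink version produces):

MemSourceBatchOp tiny = new MemSourceBatchOp(
    new Row[] { Row.of("the movie was great") },
    new String[] { "text" });

new Pipeline()
    .add(new RegexTokenizer().setPattern("\\W+").setSelectedCol("text"))
    .add(new NGram().setN(2).setSelectedCol("text").setOutputCol("bigrams"))
    .fit(tiny)
    .transform(tiny)
    .print();   // four tokens should yield three bigrams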
Use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.
Class Chap25, method softmax: fits a Softmax (multinomial logistic regression) model on a vector column and prints multiclass evaluation metrics.
public static void softmax(BatchOperator<?> train_set, BatchOperator<?> test_set) throws Exception {
BatchOperator.setParallelism(1);
new Pipeline()
    .add(new Softmax()
        .setVectorCol("vec")
        .setLabelCol("label")
        .setPredictionCol("pred"))
    .fit(train_set)
    .transform(test_set)
    .link(new EvalMultiClassBatchOp()
        .setLabelCol("label")
        .setPredictionCol("pred")
        .lazyPrintMetrics());
BatchOperator.execute();
}
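A hypothetical call site; the file names are placeholders, and the Ak files must already contain a vector column named "vec" and a label column named "label":

BatchOperator<?> train_set = new AkSourceBatchOp().setFilePath(DATA_DIR + "vec_train.ak");
BatchOperator<?> test_set = new AkSourceBatchOp().setFilePath(DATA_DIR + "vec_test.ak");
softmax(train_set, test_set);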
Use of com.alibaba.alink.pipeline.Pipeline in project Alink by alibaba.
Class BaseTuning, method findBestTVSplit: shuffles and splits the input at the given ratio, evaluates every candidate pipeline on the held-out part, and returns the best candidate together with a tuning report.
protected Tuple2<Pipeline, Report> findBestTVSplit(BatchOperator<?> in, double ratio, PipelineCandidatesBase candidates) {
int nIter = candidates.size();
SplitBatchOp sbo = new SplitBatchOp()
    .setFraction(ratio)
    .setMLEnvironmentId(getMLEnvironmentId())
    .linkFrom(
        new TableSourceBatchOp(
            DataSetConversionUtil.toTable(
                in.getMLEnvironmentId(), shuffle(in.getDataSet()), in.getSchema()))
            .setMLEnvironmentId(getMLEnvironmentId()));
int bestIdx = -1;
double bestMetric = 0.;
ArrayList<Double> experienceScores = new ArrayList<>(nIter);
List<Report.ReportElement> reportElements = new ArrayList<>();
for (int i = 0; i < nIter; i++) {
Tuple2<Pipeline, List<Tuple3<Integer, ParamInfo, Object>>> cur;
try {
cur = candidates.get(i, experienceScores);
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
double metric = Double.NaN;
try {
metric = tuningEvaluator.evaluate(cur.f0.fit(sbo).transform(sbo.getSideOutput(0)));
} catch (Exception ex) {
if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f, exception: %s", i, bestMetric, metric, ExceptionUtils.stringifyException(ex)));
}
experienceScores.add(i, metric);
reportElements.add(new Report.ReportElement(cur.f0, cur.f1, metric, ExceptionUtils.stringifyException(ex)));
continue;
}
experienceScores.add(i, metric);
if (Double.isNaN(metric)) {
reportElements.add(new Report.ReportElement(cur.f0, cur.f1, metric, "Metric is NaN."));
continue;
}
reportElements.add(new Report.ReportElement(cur.f0, cur.f1, metric));
if (bestIdx == -1) {
bestMetric = metric;
bestIdx = i;
} else {
if ((tuningEvaluator.isLargerBetter() && bestMetric < metric) || (!tuningEvaluator.isLargerBetter() && bestMetric > metric)) {
bestMetric = metric;
bestIdx = i;
}
}
if (AlinkGlobalConfiguration.isPrintProcessInfo()) {
System.out.println(String.format("BestTVSplit, i: %d, best: %f, metric: %f", i, bestMetric, metric));
}
}
if (bestIdx < 0) {
throw new RuntimeException("Cannot find a best model. Report: " + new Report(tuningEvaluator, reportElements).toPrettyJson());
}
try {
return Tuple2.of(candidates.get(bestIdx, experienceScores).f0, new Report(tuningEvaluator, reportElements));
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
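findBestTVSplit is the shared engine behind Alink's train-validation-split tuners, which supply the candidate pipelines via a parameter grid. The sketch below shows a plausible caller based on Alink's published tuning examples; the GridSearchTVSplit, ParamGrid, and BinaryClassificationTuningEvaluator APIs shown here are assumptions to verify against the version in use, and train_set/test_set stand for any BatchOperator<?> sources with the expected columns:

LogisticRegression lr = new LogisticRegression()
    .setVectorCol("vec")
    .setLabelCol("label")
    .setPredictionCol("pred")
    .setPredictionDetailCol("pred_detail");

// Candidate parameter values for the grid (ParamInfo-based, matching the
// Tuple3<Integer, ParamInfo, Object> candidates seen in findBestTVSplit).
ParamGrid grid = new ParamGrid()
    .addGrid(lr, LogisticRegression.MAX_ITER, new Integer[] { 10, 30, 100 });

GridSearchTVSplit tuner = new GridSearchTVSplit()
    .setEstimator(new Pipeline().add(lr))
    .setParamGrid(grid)
    .setTrainRatio(0.8)   // the `ratio` passed through to findBestTVSplit
    .setTuningEvaluator(new BinaryClassificationTuningEvaluator()
        .setLabelCol("label")
        .setPredictionDetailCol("pred_detail")
        .setTuningBinaryClassMetric(TuningBinaryClassMetric.AUC));

// fit() runs the split-and-evaluate loop over the grid and refits the winner.
tuner.fit(train_set).transform(test_set).print();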